Пример #1
0
 def upload_to_IA(self, library, Id):
     """Upload the book file to IA with the appropriate metadata.

     The target item is looked up from ``self.ia_identifier`` when that is
     already set; otherwise a new identifier is obtained and stored on the
     instance.  The path of the file to upload is read from redis under
     ``self.redis_output_file_key`` and is also stored in ``self.filename``.
     If the first upload attempt raises, a fallback identifier is requested
     with ``get_valid_identifier(primary=False)`` and the upload is retried
     once.  Afterwards the local file is removed on a best-effort basis.

     :param library: not referenced by this method (``self.library`` is used).
     :param Id: not referenced by this method (``self.Id`` is used).
     :returns: the value returned by the ``item.upload`` call that ran last.
     """
     if self.ia_identifier is None:
         item = self.get_valid_identifier()
         self.ia_identifier = item.identifier
     else:
         item = ia.get_item(self.ia_identifier)

     # A language stored in redis for this book takes precedence over the
     # one held on the instance.
     language_from_input = redis_py.get(self.book_key + ":language", True)
     if language_from_input in (None, ""):
         language = self.language
     else:
         language = language_from_input

     if self.publicDomain == True:
         licenseurl = "http://creativecommons.org/publicdomain/mark/1.0/"
     else:
         licenseurl = ""

     # Characters stripped from the free-text fields before upload.
     unsafe_chars = re.compile(r"""[!#\n|^\\\"~()\[\]:\-/]""")
     metadata = dict(
         mediatype="text",
         creator=self.author,
         title=unsafe_chars.sub('', self.title)[:330],
         publisher=self.publisher,
         description=unsafe_chars.sub('', self.description),
         source=self.infoLink,
         language=language,
         year=self.year,
         date=self.publishedDate,
         subject="bub_upload",
         licenseurl=licenseurl,
         scanner=self.scanner,
         sponsor=self.sponser,
         uploader="bub")
     # 'google-id' is not a valid keyword name, so it is set separately.
     metadata['google-id'] = self.Id if self.library == 'gb' else ""

     filename = redis_py.get(self.redis_output_file_key, True)
     self.filename = filename
     upload_options = dict(access_key=keys.S3_access_key,
                           secret_key=keys.S3_secret_key,
                           metadata=metadata)
     try:
         status = item.upload(filename, **upload_options)
     except Exception:
         # The first identifier was rejected or the upload failed: retry
         # once with a fallback identifier.  A second failure propagates.
         item = self.get_valid_identifier(primary=False)
         self.ia_identifier = item.identifier
         status = item.upload(filename, **upload_options)

     # Remove the local copy.  The path is passed as an argument list rather
     # than through a shell, so spaces or shell metacharacters in the file
     # name cannot alter the command.
     try:
         subprocess.check_call(["rm", "--", filename])
     except Exception:
         # Cleanup is best effort: a failure here must not mask the upload
         # result, so it is only logged.
         log.write("%s  Command rm %s failed" % (datetime.now(), filename))
         log.flush()
     return status
Пример #2
0
def verify_id(Id_string):
    """Verify the Id and check that the book page offers a PDF link.

    The library URL stored in redis for ``Id_string`` is fetched and the
    returned page is searched for a PDF link.

    :param Id_string: book Id as used in the redis key; it must be accepted
        by ``get_id_from_string(Id_string, 'desanitize')``.
    :returns: an integer status code:
        0  -- the page was fetched and a PDF link was found;
        1  -- the Id could not be desanitized, or the page returned HTTP 404;
        8  -- the page was fetched but no PDF link was found;
        10 -- the request raised, or returned a status other than 200/404.
    """
    Id = get_id_from_string(Id_string, 'desanitize')
    if Id is None:
        return 1
    book_key = "%s:%s:%s" % (keys.redis_key3, 'usp', Id_string)
    url = redis_py.get(book_key + ":library_url", True)
    url = get_absolute_url_of_book(url)

    try:
        r = requests.get(url)
    except Exception:
        return 10
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10

    soup = BeautifulSoup(r.text)
    # NOTE: a public-domain check based on the "DC.relation" metadata used
    # to live here but was disabled; every reachable book is accepted.
    tld = extract_base_domain(url)
    if tld[-1:] == '/':
        tld = tld[:-1]
    pdf_url = get_pdf_link(tld, soup)
    # Treat every falsy result (False, None, "") as "no link" so this check
    # cannot pass a book whose download would then fail for lack of a link.
    if not pdf_url:
        return 8
    return 0
Пример #3
0
    def upload_to_IA(self, library, Id):
        """Upload the book file to IA with the appropriate metadata.

        The target item is looked up from ``self.ia_identifier`` when that
        is already set; otherwise a new identifier is obtained and stored on
        the instance.  The path of the file to upload is read from redis
        under ``self.redis_output_file_key`` and is also stored in
        ``self.filename``.  If the first upload attempt raises, a fallback
        identifier is requested with ``get_valid_identifier(primary=False)``
        and the upload is retried once.  Afterwards the local file is
        removed on a best-effort basis.

        :param library: not referenced by this method (``self.library`` is
            used).
        :param Id: not referenced by this method (``self.Id`` is used).
        :returns: the value returned by the ``item.upload`` call that ran
            last.
        """
        if self.ia_identifier is None:
            item = self.get_valid_identifier()
            self.ia_identifier = item.identifier
        else:
            item = ia.get_item(self.ia_identifier)

        # A language stored in redis for this book takes precedence over
        # the one held on the instance.
        language_from_input = redis_py.get(self.book_key + ":language", True)
        if language_from_input in (None, ""):
            language = self.language
        else:
            language = language_from_input

        if self.publicDomain == True:
            licenseurl = "http://creativecommons.org/publicdomain/mark/1.0/"
        else:
            licenseurl = ""

        # Characters stripped from the free-text fields before upload.
        unsafe_chars = re.compile(r"""[!#\n|^\\\"~()\[\]:\-/]""")
        metadata = dict(
            mediatype="text",
            creator=self.author,
            title=unsafe_chars.sub('', self.title)[:330],
            publisher=self.publisher,
            description=unsafe_chars.sub('', self.description),
            source=self.infoLink,
            language=language,
            year=self.year,
            date=self.publishedDate,
            subject="bub_upload",
            licenseurl=licenseurl,
            scanner=self.scanner,
            sponsor=self.sponser,
            uploader="bub")
        # 'google-id' is not a valid keyword name, so it is set separately.
        metadata['google-id'] = self.Id if self.library == 'gb' else ""

        filename = redis_py.get(self.redis_output_file_key, True)
        self.filename = filename
        upload_options = dict(access_key=keys.S3_access_key,
                              secret_key=keys.S3_secret_key,
                              metadata=metadata)
        try:
            status = item.upload(filename, **upload_options)
        except Exception:
            # The first identifier was rejected or the upload failed: retry
            # once with a fallback identifier.  A second failure propagates.
            item = self.get_valid_identifier(primary=False)
            self.ia_identifier = item.identifier
            status = item.upload(filename, **upload_options)

        # Remove the local copy.  The path is passed as an argument list
        # rather than through a shell, so spaces or shell metacharacters in
        # the file name cannot alter the command.
        try:
            subprocess.check_call(["rm", "--", filename])
        except Exception:
            # Cleanup is best effort: a failure here must not mask the
            # upload result, so it is only logged.
            log.write("%s  Command rm %s failed" % (datetime.now(), filename))
            log.flush()
        return status
Пример #4
0
def metadata(Id):
    """Return book information and meta-data scraped from the book page.

    The library URL stored in redis for ``Id`` is fetched and the fields are
    read from the page through ``OAI_metadata_content``.

    :param Id: book Id used to build the redis key holding the library URL.
    :returns: a dict of book fields on success.  On failure an integer is
        returned instead: 1 when the request raised, the page returned HTTP
        404 or the page body contains the site's "page not found" text, and
        10 for any other non-200 status.
    """
    book_key = "%s:%s:%s" % (keys.redis_key3, 'usp', Id)
    url = redis_py.get(book_key + ":library_url", True)
    url = get_absolute_url_of_book(url)
    try:
        r = requests.get(url)
    except Exception:
        return 1
    if r.status_code == 404:
        return 1
    if r.status_code != 200:
        return 10

    source = r.text
    # The site can answer 200 with an error page; detect it by its text.
    if u"Página não encontrada" in source:
        return 1
    soup = BeautifulSoup(source)
    # NOTE: a public-domain check based on the "DC.relation" metadata used
    # to live here but was disabled; publicDomain is reported as True below.
    thumbnail_url = extract_thumbnail_url(soup, url)
    base_domain = extract_base_domain(url)
    return dict(
        image_url=thumbnail_url,
        thumbnail_url=thumbnail_url,
        printType="BOOK",
        title=OAI_metadata_content("DC.title", soup),
        subtitle="",
        author=OAI_metadata_content("DC.creator", soup),
        publisher=OAI_metadata_content("DC.publisher", soup),
        publishedDate=OAI_metadata_content("DCTERMS.issued", soup),
        description=OAI_metadata_content("DC.description", soup),
        infoLink=url,
        publicDomain=True,
        language=normalize_to_ascii(OAI_metadata_content("DC.language", soup)),
        scanner=base_domain,
        sponser=base_domain
    )
Пример #5
0
def download_book(Id, id_for_key):
    """Download the book's PDF into the scratch directory.

    The library URL stored in redis for ``Id`` is fetched, the PDF link is
    extracted from the page and the PDF is streamed to
    ``/data/scratch/BUB_downloads/bub_usp_<Id>.pdf``.  The output path is
    registered through ``store_output_file_name`` before writing starts.

    :param Id: book Id; used for the redis key and the output file name.
    :param id_for_key: identifier passed to ``store_output_file_name``
        together with the output path.
    :returns: 0 once the file has been written, 1 if no PDF link was found.
    """
    book_key = "%s:%s:%s" % (keys.redis_key3, 'usp', Id)
    url = redis_py.get(book_key + ":library_url", True)
    url = get_absolute_url_of_book(url)
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    tld = extract_base_domain(url)
    if tld[-1:] == '/':
        tld = tld[:-1]
    pdf_url = get_pdf_link(tld, soup)
    # Any falsy result (False, None, "") means no link was found.
    if not pdf_url:
        return 1
    # NOTE(review): the HTTP status of the PDF response is not checked, so
    # an error page would be saved as the PDF -- confirm whether callers
    # rely on that before adding a check.
    pdf = requests.get(pdf_url, stream=True)
    try:
        output_file = "/data/scratch/BUB_downloads/bub_usp_%s.pdf" % Id
        store_output_file_name(id_for_key, output_file)
        with open(output_file, 'wb') as f:
            for chunk in pdf.iter_content(1024):
                f.write(chunk)
    finally:
        # A streamed response holds its connection until it is closed;
        # release it even when writing the file fails part-way.
        pdf.close()
    return 0
Пример #6
0
 def stored_copy_check(self):
     """Tell whether the tool has already uploaded this book.

     Returns True when the ":upload_progress" value stored in redis for
     this book equals '1', and None otherwise.
     """
     progress = redis_py.get(self.book_key + ":upload_progress", True)
     return True if progress == '1' else None
Пример #7
0
 def __init__(self, value):
     """Assign instance attributes and load the book's cached metadata.

     :param value: either a number -- the ``sno`` of a row in the
         ``request`` table, from which ``library`` and ``book_id`` are
         read -- or a mapping with the keys ``'library'``, ``'Id'`` and
         ``'ia_identifier_suffix'``.

     The metadata is read as JSON from redis under
     ``<book_key>:meta_data`` and copied onto the instance.  The id of the
     job is also written to ``<redis_key3>:ongoing_job_identifier``.
     """
     redis_key3 = keys.redis_key3
     self.redis_key3 = redis_key3
     self.redis = redis_py.Redis()
     if isinstance(value, (int, long, float, complex)):
         db = mysql_py.Db()
         try:
             values = db.execute(
                 'select library, book_id from request where sno = %s;',
                 value)[0]
         finally:
             # Release the handle even when the query raises or no row
             # matches (the [0] lookup then fails).
             db.close()
         self.library = values[0]
         self.Id = values[1].encode('utf-8')
         self.ia_identifier = None
         self.id_for_key = self.Id
     else:
         self.library = value['library']
         self.Id = value['Id']
         self.id_for_key = value['ia_identifier_suffix']
         self.ia_identifier = "bub_" + self.library + "_" + self.id_for_key
     self.book_key = "%s:%s:%s" % (redis_key3, self.library,
                                   self.id_for_key)
     self.redis.set(redis_key3 + ":ongoing_job_identifier", self.id_for_key)

     # Ids containing '/' are replaced by their md5 hex digest in the
     # output-file key.
     if '/' in self.id_for_key:
         key_suffix = hashlib.md5(self.id_for_key).hexdigest()
     else:
         key_suffix = self.id_for_key
     self.redis_output_file_key = "%s:%s:%s:output_file" % (
         redis_key3, self.library, key_suffix)

     self.library_name = bridge.lib_module(self.library)[1]
     info = json.loads(redis_py.get(self.book_key + ":meta_data", True))
     try:
         self.title = info['title'].encode(
             "utf-8") + " " + info['subtitle'].encode("utf-8")
     except Exception:
         # Fallback when the subtitle cannot be encoded directly.
         self.title = str(info['title'].encode("utf-8")) + " " + str(
             info['subtitle'])
     self.author = info['author'].encode("utf-8")
     self.publisher = info['publisher'].encode("utf-8")
     self.description = info['description'].replace("\n",
                                                    "").encode("utf-8")
     self.printType = info['printType'].encode("utf-8")
     # Keep only digits and the separators '/', '.', '-'.
     # NOTE(review): the original then replaced 'x' and '?' with '0', but
     # those characters are already removed by this substitution, so that
     # step never had any effect and has been dropped.
     self.publishedDate = re.sub("[^0123456789/.-]", "",
                                 info['publishedDate'].encode("utf-8"))
     self.infoLink = info['infoLink']
     self.publicDomain = info['publicDomain']
     language_code = info['language'].encode("utf-8")

     # Parse the date once; year/month/day stay "" when there is no date or
     # it cannot be parsed.
     parsed_date = None
     if self.publishedDate:
         try:
             parsed_date = parser.parse(self.publishedDate)
         except Exception:
             parsed_date = None
     if parsed_date is None:
         self.year = ""
         self.month = ""
         self.day = ""
     else:
         self.year = parsed_date.year
         self.month = parsed_date.month
         self.day = parsed_date.day

     try:
         self.language = lang_code(language_code)
     except Exception:
         # Unknown language codes are uploaded as an empty language.
         self.language = ""
     self.pdf_path = "/data/scratch/BUB_downloads/bub_%s_%s.pdf" % (
         self.library, self.Id)
     self.scanner = info['scanner']
     self.sponser = info['sponser']
Пример #8
0
    def __init__(self, value):
        """Assign instance attributes and load the book's cached metadata.

        :param value: either a number -- the ``sno`` of a row in the
            ``request`` table, from which ``library`` and ``book_id`` are
            read -- or a mapping with the keys ``'library'``, ``'Id'`` and
            ``'ia_identifier_suffix'``.

        The metadata is read as JSON from redis under
        ``<book_key>:meta_data`` and copied onto the instance.  The id of
        the job is also written to ``<redis_key3>:ongoing_job_identifier``.
        """
        redis_key3 = keys.redis_key3
        self.redis_key3 = redis_key3
        self.redis = redis_py.Redis()
        if isinstance(value, (int, long, float, complex)):
            db = mysql_py.Db()
            try:
                values = db.execute(
                    'select library, book_id from request where sno = %s;',
                    value)[0]
            finally:
                # Release the handle even when the query raises or no row
                # matches (the [0] lookup then fails).
                db.close()
            self.library = values[0]
            self.Id = values[1].encode('utf-8')
            self.ia_identifier = None
            self.id_for_key = self.Id
        else:
            self.library = value['library']
            self.Id = value['Id']
            self.id_for_key = value['ia_identifier_suffix']
            self.ia_identifier = ("bub_" + self.library + "_" +
                                  self.id_for_key)
        self.book_key = "%s:%s:%s" % (redis_key3, self.library,
                                      self.id_for_key)
        self.redis.set(redis_key3 + ":ongoing_job_identifier",
                       self.id_for_key)

        # Ids containing '/' are replaced by their md5 hex digest in the
        # output-file key.
        if '/' in self.id_for_key:
            key_suffix = hashlib.md5(self.id_for_key).hexdigest()
        else:
            key_suffix = self.id_for_key
        self.redis_output_file_key = "%s:%s:%s:output_file" % (
            redis_key3, self.library, key_suffix)

        self.library_name = bridge.lib_module(self.library)[1]
        info = json.loads(redis_py.get(self.book_key + ":meta_data", True))
        try:
            self.title = (info['title'].encode("utf-8") + " " +
                          info['subtitle'].encode("utf-8"))
        except Exception:
            # Fallback when the subtitle cannot be encoded directly.
            self.title = (str(info['title'].encode("utf-8")) + " " +
                          str(info['subtitle']))
        self.author = info['author'].encode("utf-8")
        self.publisher = info['publisher'].encode("utf-8")
        self.description = info['description'].replace(
            "\n", "").encode("utf-8")
        self.printType = info['printType'].encode("utf-8")
        # Keep only digits and the separators '/', '.', '-'.
        # NOTE(review): the original then replaced 'x' and '?' with '0', but
        # those characters are already removed by this substitution, so that
        # step never had any effect and has been dropped.
        self.publishedDate = re.sub("[^0123456789/.-]", "",
                                    info['publishedDate'].encode("utf-8"))
        self.infoLink = info['infoLink']
        self.publicDomain = info['publicDomain']
        language_code = info['language'].encode("utf-8")

        # Parse the date once; year/month/day stay "" when there is no date
        # or it cannot be parsed.
        parsed_date = None
        if self.publishedDate:
            try:
                parsed_date = parser.parse(self.publishedDate)
            except Exception:
                parsed_date = None
        if parsed_date is None:
            self.year = ""
            self.month = ""
            self.day = ""
        else:
            self.year = parsed_date.year
            self.month = parsed_date.month
            self.day = parsed_date.day

        try:
            self.language = lang_code(language_code)
        except Exception:
            # Unknown language codes are uploaded as an empty language.
            self.language = ""
        self.pdf_path = "/data/scratch/BUB_downloads/bub_%s_%s.pdf" % (
            self.library, self.Id)
        self.scanner = info['scanner']
        self.sponser = info['sponser']
Пример #9
0
 def stored_copy_check(self):
     """Report whether this book was already uploaded by the tool.

     Reads the ":upload_progress" entry for the book from redis; yields
     True only when it holds '1', otherwise None.
     """
     progress_key = self.book_key + ":upload_progress"
     if redis_py.get(progress_key, True) != '1':
         return None
     return True