def nb_by_fmt(fmt):
    """Count the books that have at least one stored format matching ``fmt``.

    ``fmt`` is looked up in ``FORMAT_MATRIX`` to obtain the mime type to match.
    ``books`` is not a parameter: it is resolved from the surrounding scope,
    which is not visible from this definition.

    Returns the number of books for which the BookFormat/Book/Format join
    yields at least one row.
    """
    matching_books = 0
    for book in books:
        # One COUNT query per book: rows linking this book to the wanted mime.
        rows = (
            BookFormat.select(BookFormat, Book, Format)
            .join(Book)
            .switch(BookFormat)
            .join(Format)
            .where(Book.id == book.id)
            .where(Format.mime == FORMAT_MATRIX.get(fmt))
        )
        if rows.count():
            matching_books += 1
    return matching_books
def nb_by_fmt(fmt):
    """Return how many entries of ``books`` are available in format ``fmt``.

    A book counts when the join of BookFormat, Book and Format contains at
    least one row for that book whose mime equals ``FORMAT_MATRIX.get(fmt)``.
    ``books`` comes from the enclosing scope (not shown in this definition).
    """

    def has_format(book_id):
        # Truthy when at least one joined row exists for this book and mime.
        query = BookFormat.select(BookFormat, Book, Format)
        query = query.join(Book).switch(BookFormat).join(Format)
        query = query.where(Book.id == book_id)
        query = query.where(Format.mime == FORMAT_MATRIX.get(fmt))
        return query.count()

    return sum(1 for book in books if has_format(book.id))
def get_urls(book):
    """
    Build the candidate download URLs for a book.

    param: book: the book whose candidate URLs are wanted
    returns: the value produced by ``build_urls`` for the book's supported
             formats after they went through ``sort_by_mime_type``
    """

    def bare_mime(fmt):
        # Strip out the encoding of the file: keep only what precedes ';'.
        return fmt.mime.split(';')[0].strip()

    # Resolve every Format attached to this book first.
    book_formats = []
    for link in BookFormat.select().where(BookFormat.book == book):
        book_formats.append(link.format)

    # Keep only the formats whose bare mime type is listed in FORMAT_MATRIX,
    # each described as {file name: {'mime': ..., 'id': ...}}.
    available_formats = []
    for fmt in book_formats:
        mime = bare_mime(fmt)
        if mime not in FORMAT_MATRIX.values():
            continue
        file_name = fmt.pattern.format(id=book.id)
        available_formats.append({file_name: {'mime': mime, 'id': book.id}})

    return build_urls(sort_by_mime_type(available_formats))
def get_urls(book):
    """
    Collect the URLs that may serve the files of ``book``.

    param: book: the book to look URLs up for
    returns: whatever ``build_urls`` returns for the book's supported formats,
             once ordered by ``sort_by_mime_type``
    """

    def mime_without_encoding(book_format):
        # Strip out the encoding of the file (anything after the first ';').
        return book_format.mime.split(';')[0].strip()

    linked_formats = [
        entry.format
        for entry in BookFormat.select().where(BookFormat.book == book)
    ]

    # Only formats whose mime type appears among the FORMAT_MATRIX values are
    # kept; the file name is the format pattern filled in with the book id.
    supported = (
        fmt for fmt in linked_formats
        if mime_without_encoding(fmt) in FORMAT_MATRIX.values()
    )
    available_formats = [
        {
            fmt.pattern.format(id=book.id): {
                'mime': mime_without_encoding(fmt),
                'id': book.id,
            }
        }
        for fmt in supported
    ]

    files = sort_by_mime_type(available_formats)
    return build_urls(files)
def download_book(book, download_cache, languages, formats, force):
    """Download the content files of ``book`` into ``download_cache``.

    For each requested format the matching ``BookFormat`` row is looked up,
    its candidate URLs are tried in turn, and the first URL that downloads
    successfully is stored in ``downloaded_from``.

    Args:
        book: book record; ``id`` and ``title`` are read from it.
        download_cache: directory path the files are written to.
        languages: not used by this function; kept for interface stability.
        formats: format keys (keys of ``FORMAT_MATRIX``). A falsy value means
            "every known format". ``"html"`` is always added. The caller's
            object is never modified.
        force: when true, re-download even if the target file already exists
            or a working URL is already recorded.

    Returns:
        None. Failures are logged and the next format is attempted.
    """
    logger.info(
        "\tDownloading content files for Book #{id}".format(id=book.id))

    # apply filters
    # Work on a private list: appending to the caller's list would leak the
    # extra "html" entry to later calls, and a dict keys view has no append().
    formats = list(formats) if formats else list(FORMAT_MATRIX.keys())

    # HTML is our base for ZIM so add it if not present
    if "html" not in formats:
        formats.append("html")

    # File-name patterns accepted as the HTML rendition of a book.
    html_patterns = [
        "mnsrb10h.htm",
        "8ledo10h.htm",
        "tycho10f.htm",
        "8ledo10h.zip",
        "salme10h.htm",
        "8nszr10h.htm",
        "{id}-h.html",
        "{id}.html.gen",
        "{id}-h.htm",
        "8regr10h.zip",
        "{id}.html.noimages",
        "8lgme10h.htm",
        "tycho10h.htm",
        "tycho10h.zip",
        "8lgme10h.zip",
        "8indn10h.zip",
        "8resp10h.zip",
        "20004-h.htm",
        "8indn10h.htm",
        "8memo10h.zip",
        "fondu10h.zip",
        "{id}-h.zip",
        "8mort10h.zip",
    ]

    for book_format in formats:
        fpath = os.path.join(download_cache, fname_for(book, book_format))

        # check if already downloaded
        if path(fpath).exists() and not force:
            logger.debug("\t\t{fmt} already exists at {path}".format(
                fmt=book_format, path=fpath))
            continue

        # retrieve corresponding BookFormat
        bfs = BookFormat.filter(book=book)

        if book_format == "html":
            bfso = bfs
            bfs = bfs.join(Format).filter(Format.pattern << html_patterns)
            if not bfs.count():
                # Dump the filtered and unfiltered candidates for diagnosis.
                pp([(b.format.mime, b.format.images, b.format.pattern)
                    for b in bfs])
                pp([(b.format.mime, b.format.images, b.format.pattern)
                    for b in bfso])
                logger.error("html not found")
                continue
        else:
            bfs = bfs.filter(BookFormat.format << Format.filter(
                mime=FORMAT_MATRIX.get(book_format)))

        if not bfs.count():
            logger.debug("[{}] not avail. for #{}# {}".format(
                book_format, book.id, book.title).encode("utf-8"))
            continue

        if bfs.count() > 1:
            # Several candidates: prefer the one flagged as having images and
            # fall back to whatever row comes first when that lookup fails.
            try:
                bf = bfs.join(Format).filter(Format.images).get()
            except Exception:
                bf = bfs.get()
        else:
            bf = bfs.get()

        logger.debug("[{}] Requesting URLs for #{}# {}".format(
            book_format, book.id, book.title).encode("utf-8"))

        # retrieve list of URLs for format unless we have it in DB
        if bf.downloaded_from and not force:
            urls = [bf.downloaded_from]
        else:
            urld = get_urls(book)
            # No entry for this mime type means "no candidate URL", not a
            # crash in reversed().
            candidates = urld.get(FORMAT_MATRIX.get(book_format)) or []
            urls = list(reversed(candidates))

        # Snapshot of every candidate, kept for the failure report below.
        allurls = list(urls)

        while urls:
            url = urls.pop()

            # With a single candidate the existence probe is skipped and the
            # download is attempted directly.
            if len(allurls) != 1 and not resource_exists(url):
                continue

            # HTML files are *sometime* available as ZIP files
            if url.endswith(".zip"):
                zpath = "{}.zip".format(fpath)

                if not download_file(url, zpath):
                    logger.error("ZIP file donwload failed: {}".format(zpath))
                    continue

                # extract zipfile
                handle_zipped_epub(zippath=zpath, book=book,
                                   download_cache=download_cache)
            elif not download_file(url, fpath):
                logger.error("file donwload failed: {}".format(fpath))
                continue

            # store working URL in DB
            bf.downloaded_from = url
            bf.save()
            # Stop at the first working URL; the remaining candidates are
            # alternatives for the same file and must not overwrite it.
            break

        if not bf.downloaded_from:
            logger.error("NO FILE FOR #{}/{}".format(book.id, book_format))
            pp(allurls)
def download_book(book, download_cache, languages, formats, force,
                  s3_storage, optimizer_version):
    """Download the content files of ``book`` below ``download_cache``.

    Files are stored under ``<download_cache>/<book.id>/unoptimized``; when
    ``s3_storage`` is set, an already optimized copy is first looked for via
    ``download_from_cache`` and stored under ``.../optimized``. Formats that
    cannot be obtained have their ``BookFormat`` row deleted, and a book that
    fails for every format is deleted together with its directory. On success
    ``download_cover`` is called last.

    Args:
        book: book record; ``id``, ``title``, ``html_etag`` and ``epub_etag``
            are read or written.
        download_cache: root directory of the download cache.
        languages: not used by this function; kept for interface stability.
        formats: format keys (keys of ``FORMAT_MATRIX``). A falsy value means
            "every known format". ``"html"`` is always added. The caller's
            object is never modified.
        force: when true, delete existing files for the format and download
            again, ignoring any recorded working URL.
        s3_storage: optimization-cache handle passed to
            ``download_from_cache``; a falsy value disables the cache lookup.
        optimizer_version: passed through to ``download_from_cache`` and
            ``download_cover``.

    Returns:
        None.
    """
    logger.info(
        "\tDownloading content files for Book #{id}".format(id=book.id))

    # apply filters
    # Work on a private list: appending to the caller's list would leak the
    # extra "html" entry to later calls, and a dict keys view has no append().
    formats = list(formats) if formats else list(FORMAT_MATRIX.keys())

    # HTML is our base for ZIM so add it if not present
    if "html" not in formats:
        formats.append("html")

    book_dir = pathlib.Path(download_cache).joinpath(str(book.id))
    optimized_dir = book_dir.joinpath("optimized")
    unoptimized_dir = book_dir.joinpath("unoptimized")
    unsuccessful_formats = []

    html_suffixes = (".htm", ".html", ".html.utf8")
    # File-name patterns accepted as the HTML rendition of a book.
    html_patterns = [
        "mnsrb10h.htm",
        "8ledo10h.htm",
        "tycho10f.htm",
        "8ledo10h.zip",
        "salme10h.htm",
        "8nszr10h.htm",
        "{id}-h.html",
        "{id}.html.gen",
        "{id}-h.htm",
        "8regr10h.zip",
        "{id}.html.noimages",
        "8lgme10h.htm",
        "tycho10h.htm",
        "tycho10h.zip",
        "8lgme10h.zip",
        "8indn10h.zip",
        "8resp10h.zip",
        "20004-h.htm",
        "8indn10h.htm",
        "8memo10h.zip",
        "fondu10h.zip",
        "{id}-h.zip",
        "8mort10h.zip",
    ]

    for book_format in formats:
        unoptimized_fpath = unoptimized_dir.joinpath(
            fname_for(book, book_format))
        optimized_fpath = optimized_dir.joinpath(
            archive_name_for(book, book_format))

        # check if already downloaded
        if (unoptimized_fpath.exists()
                or optimized_fpath.exists()) and not force:
            logger.debug(
                f"\t\t{book_format} already exists for book #{book.id}")
            continue

        if force:
            if book_format == "html":
                # The book directory does not exist on a first forced run;
                # iterating it unguarded would raise FileNotFoundError.
                if book_dir.is_dir():
                    for fpath in book_dir.iterdir():
                        if fpath.is_file() and fpath.suffix not in [
                                ".pdf", ".epub"
                        ]:
                            fpath.unlink()
            else:
                if unoptimized_fpath.exists():
                    unoptimized_fpath.unlink()
                if optimized_fpath.exists():
                    optimized_fpath.unlink()
            # delete dirs which are empty
            for dir_name in [optimized_dir, unoptimized_dir]:
                if not dir_name.exists():
                    continue
                if not list(dir_name.iterdir()):
                    dir_name.rmdir()

        # retrieve corresponding BookFormat
        bfs = BookFormat.filter(book=book)

        if book_format == "html":
            bfso = bfs
            bfs = bfs.join(Format).filter(Format.pattern << html_patterns)
            if not bfs.count():
                # Dump the filtered and unfiltered candidates for diagnosis.
                pp([(b.format.mime, b.format.images, b.format.pattern)
                    for b in bfs])
                pp([(b.format.mime, b.format.images, b.format.pattern)
                    for b in bfso])
                logger.error("html not found")
                unsuccessful_formats.append(book_format)
                continue
        else:
            bfs = bfs.filter(BookFormat.format << Format.filter(
                mime=FORMAT_MATRIX.get(book_format)))

        if not bfs.count():
            logger.debug("[{}] not avail. for #{}# {}".format(
                book_format, book.id, book.title))
            unsuccessful_formats.append(book_format)
            continue

        if bfs.count() > 1:
            # Several candidates: prefer the one flagged as having images and
            # fall back to whatever row comes first when that lookup fails.
            try:
                bf = bfs.join(Format).filter(Format.images).get()
            except Exception:
                bf = bfs.get()
        else:
            bf = bfs.get()

        logger.debug("[{}] Requesting URLs for #{}# {}".format(
            book_format, book.id, book.title))

        # retrieve list of URLs for format unless we have it in DB
        if bf.downloaded_from and not force:
            urls = [bf.downloaded_from]
        else:
            urld = get_urls(book)
            # No entry for this mime type means "no candidate URL", not a
            # crash in reversed().
            candidates = urld.get(FORMAT_MATRIX.get(book_format)) or []
            urls = list(reversed(candidates))

        # Snapshot of every candidate, kept for the failure report below.
        allurls = list(urls)
        downloaded_from_cache = False

        while urls:
            url = urls.pop()

            # for development
            # if len(allurls) != 1:
            #     if not resource_exists(url):
            #         continue

            # HTML files are *sometime* available as ZIP files
            if url.endswith(".zip"):
                zpath = unoptimized_dir.joinpath(
                    f"{fname_for(book, book_format)}.zip")

                etag = get_etag_from_url(url)
                if s3_storage:
                    if download_from_cache(
                            book=book,
                            etag=etag,
                            book_format=book_format,
                            dest_dir=optimized_dir,
                            s3_storage=s3_storage,
                            optimizer_version=optimizer_version,
                    ):
                        downloaded_from_cache = True
                        break
                if not download_file(url, zpath):
                    logger.error("ZIP file donwload failed: {}".format(zpath))
                    continue
                # save etag
                book.html_etag = etag
                book.save()
                # extract zipfile
                handle_zipped_epub(zippath=zpath, book=book,
                                   dst_dir=unoptimized_dir)
            else:
                is_html = url.endswith(html_suffixes)
                is_epub = url.endswith(".epub")
                # Only html and epub files are looked up in the optimization
                # cache; `etag` is bound exactly when one of the flags is set.
                if is_html or is_epub:
                    etag = get_etag_from_url(url)
                    if s3_storage:
                        logger.info(
                            f"Trying to download {book.id} from optimization cache"
                        )
                        if download_from_cache(
                                book=book,
                                etag=etag,
                                book_format=book_format,
                                dest_dir=optimized_dir,
                                s3_storage=s3_storage,
                                optimizer_version=optimizer_version,
                        ):
                            downloaded_from_cache = True
                            break
                if not download_file(url, unoptimized_fpath):
                    logger.error(
                        "file donwload failed: {}".format(unoptimized_fpath))
                    continue

                # save etag if html or epub if download is successful
                if is_html:
                    logger.debug(f"Saving html ETag for {book.id}")
                    book.html_etag = etag
                    book.save()
                elif is_epub:
                    logger.debug(f"Saving epub ETag for {book.id}")
                    book.epub_etag = etag
                    book.save()

            # store working URL in DB
            bf.downloaded_from = url
            bf.save()
            # break as we got a working URL
            break

        if not bf.downloaded_from and not downloaded_from_cache:
            logger.error("NO FILE FOR #{}/{}".format(book.id, book_format))
            # delete instance from DB if download failed
            logger.info("Deleting instance from DB")
            bf.delete_instance()
            unsuccessful_formats.append(book_format)
            pp(allurls)

    # delete book from DB if not downloaded in any format
    if len(unsuccessful_formats) == len(formats):
        logger.debug(
            f"Book #{book.id} could not be downloaded in any format. Deleting from DB ..."
        )
        book.delete_instance()
        if book_dir.exists():
            shutil.rmtree(book_dir, ignore_errors=True)
        return
    download_cover(book, book_dir, s3_storage, optimizer_version)
def save_rdf_in_database(parser):
    """Store the author, book and file formats carried by ``parser``.

    The steps run in this order:
    1. fetch, create or update the ``Author`` row (author "216" is used when
       the parser has no author id),
    2. look the ``License`` up by name (``None`` when the lookup fails),
    3. create or update the ``Book`` row,
    4. make sure ``parser.file_types`` holds a PDF entry,
    5. create the ``Format`` and ``BookFormat`` rows for every file type that
       is not excluded through ``BAD_BOOKS_FORMATS``.

    ``parser.file_types`` is modified in place by step 4.
    """
    # Insert author, if it does not exist
    if not parser.author_id:
        # No author, set Anonymous
        author = Author.get(gut_id="216")
    else:
        try:
            author = Author.get(gut_id=parser.author_id)
        except Exception:
            try:
                author = Author.create(
                    gut_id=parser.author_id,
                    last_name=normalize(parser.last_name),
                    first_names=normalize(parser.first_name),
                    birth_year=parser.birth_year,
                    death_year=parser.death_year,
                )
            except peewee.IntegrityError:
                # Concurrent workers might collide here, so the lookup is
                # retried once when the insert is rejected.
                author = Author.get(gut_id=parser.author_id)
        else:
            # Known author: overwrite only the fields the parser provides.
            if parser.last_name:
                author.last_name = normalize(parser.last_name)
            if parser.first_name:
                author.first_names = normalize(parser.first_name)
            if parser.birth_year:
                author.birth_year = parser.birth_year
            if parser.death_year:
                author.death_year = parser.death_year
            author.save()

    # Get license
    try:
        book_license = License.get(name=parser.license)
    except Exception:
        book_license = None

    # Insert book
    try:
        book = Book.get(id=parser.gid)
    except Book.DoesNotExist:
        book = Book.create(
            id=parser.gid,
            title=normalize(parser.title.strip()),
            subtitle=normalize(parser.subtitle.strip()),
            author=author,  # foreign key
            license=book_license,  # foreign key
            language=parser.language.strip(),
            downloads=parser.downloads,
            bookshelf=parser.bookshelf,
            cover_page=parser.cover_image,
        )
    else:
        book.title = normalize(parser.title.strip())
        book.subtitle = normalize(parser.subtitle.strip())
        book.author = author  # foreign key
        book.license = book_license  # foreign key
        book.language = parser.language.strip()
        book.downloads = parser.downloads
        book.save()

    # insert pdf if not exists in parser.file_types
    # this is done as presence of PDF on server and RDF is inconsistent
    has_pdf = any(
        known_mime.startswith("application/pdf")
        for known_mime in parser.file_types.values()
    )
    if not has_pdf:
        parser.file_types.update({"{id}-pdf.pdf": "application/pdf"})

    # Insert formats
    for file_type, raw_mime in parser.file_types.items():
        # Sanitize MIME: the charset suffix is kept only for text/plain
        mime = raw_mime
        if not mime.startswith("text/plain"):
            mime = re.sub(r"; charset=[a-z0-9-]+", "", mime)

        # Insert format type: the book id inside the file name becomes the
        # "{id}" placeholder and only the last path component is kept
        pattern = re.sub(r"" + parser.gid, "{id}", file_type)
        pattern = pattern.split("/")[-1]

        book_id = int(book.id)
        if book_id in BAD_BOOKS_FORMATS:
            excluded_mimes = [
                FORMAT_MATRIX.get(fmt)
                for fmt in BAD_BOOKS_FORMATS.get(book_id)
            ]
            if mime in excluded_mimes:
                logger.error(
                    "\t**** EXCLUDING **** {} for book #{} from list.".format(
                        mime, book_id))
                continue

        with_images = (file_type.endswith(".images")
                       or raw_mime == "application/pdf")
        format_record, _ = Format.get_or_create(
            mime=mime,
            images=with_images,
            pattern=pattern,
        )

        # Insert book format (both arguments are foreign keys)
        BookFormat.get_or_create(book=book, format=format_record)
def save_rdf_in_database(parser):
    """Write the author, book and file formats read by ``parser`` to the DB.

    The ``Author`` row is fetched, created or updated first (author '216' is
    used when the parser has no author id). The ``License`` is then looked up
    by name (``None`` when the lookup fails), and the ``Book`` row is created
    or updated. Finally one ``Format`` and one ``BookFormat`` row is ensured
    for every entry of ``parser.file_types`` that is not excluded through
    ``BAD_BOOKS_FORMATS``.
    """
    # Insert author, if it does not exist
    if not parser.author_id:
        # No author, set Anonymous
        writer = Author.get(gut_id='216')
    else:
        try:
            writer = Author.get(gut_id=parser.author_id)
        except Exception:
            try:
                writer = Author.create(
                    gut_id=parser.author_id,
                    last_name=normalize(parser.last_name),
                    first_names=normalize(parser.first_name),
                    birth_year=parser.birth_year,
                    death_year=parser.death_year)
            except peewee.IntegrityError:
                # Concurrent workers might collide here, so the lookup is
                # retried once when the insert is rejected.
                writer = Author.get(gut_id=parser.author_id)
        else:
            # Known author: overwrite only the fields the parser provides.
            if parser.last_name:
                writer.last_name = normalize(parser.last_name)
            if parser.first_name:
                writer.first_names = normalize(parser.first_name)
            if parser.birth_year:
                writer.birth_year = parser.birth_year
            if parser.death_year:
                writer.death_year = parser.death_year
            writer.save()

    # Get license
    try:
        found_license = License.get(name=parser.license)
    except Exception:
        found_license = None

    # Insert book
    try:
        stored_book = Book.get(id=parser.gid)
    except Book.DoesNotExist:
        stored_book = Book.create(
            id=parser.gid,
            title=normalize(parser.title.strip()),
            subtitle=normalize(parser.subtitle.strip()),
            author=writer,  # foreign key
            license=found_license,  # foreign key
            language=parser.language.strip(),
            downloads=parser.downloads)
    else:
        stored_book.title = normalize(parser.title.strip())
        stored_book.subtitle = normalize(parser.subtitle.strip())
        stored_book.author = writer  # foreign key
        stored_book.license = found_license  # foreign key
        stored_book.language = parser.language.strip()
        stored_book.downloads = parser.downloads
        stored_book.save()

    # Insert formats
    for file_type, declared_mime in parser.file_types.items():
        # Sanitize MIME: the charset suffix is kept only for text/plain
        if declared_mime.startswith('text/plain'):
            mime = declared_mime
        else:
            mime = re.sub(r'; charset=[a-z0-9-]+', '', declared_mime)

        # Insert format type: the book id inside the file name becomes the
        # '{id}' placeholder and only the last path component is kept
        pattern = re.sub(r'' + parser.gid, '{id}', file_type).split('/')[-1]

        bid = int(stored_book.id)
        if bid in BAD_BOOKS_FORMATS:
            rejected = [
                FORMAT_MATRIX.get(fmt) for fmt in BAD_BOOKS_FORMATS.get(bid)
            ]
            if mime in rejected:
                logger.error(
                    "\t**** EXCLUDING **** {} for book #{} from list.".format(
                        mime, bid))
                continue

        carries_images = (file_type.endswith('.images')
                          or declared_mime == 'application/pdf')
        format_record, _ = Format.get_or_create(
            mime=mime,
            images=carries_images,
            pattern=pattern)

        # Insert book format (both arguments are foreign keys)
        BookFormat.get_or_create(book=stored_book, format=format_record)