def parse_and_fill(rdf_path, concurrency, only_books=[], force=False):
    logger.info("\tLooping through RDF files in {}".format(rdf_path))

    fpaths = []
    for root, dirs, files in os.walk(rdf_path):
        if root.endswith("999999"):
            continue

        # skip books outside of requested list
        if len(only_books) and path(root).basename() not in [
            str(bid) for bid in only_books
        ]:
            continue

        for fname in files:
            if fname in (".", "..", "pg0.rdf"):
                continue

            if not fname.endswith(".rdf"):
                continue

            fpaths.append(os.path.join(root, fname))

    fpaths = sorted(
        fpaths, key=lambda f: int(re.match(r".*/pg([0-9]+).rdf", f).groups()[0])
    )

    def ppf(x):
        return parse_and_process_file(x, force)

    Pool(concurrency).map(ppf, fpaths)
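# --- Hedged usage sketch (illustration, not part of the original source) ---
# Assuming the helpers in this file are importable together, a driver for the
# RDF pipeline could chain them as below. `refresh_catalog` and its argument
# values are hypothetical; only the called signatures come from this file
# (setup_database, download_rdf_file, extract_rdf_files, parse_and_fill).
def refresh_catalog(rdf_url, rdf_path, concurrency=4, force=False):
    setup_database(wipe=False)                  # ensure tables + fixtures exist
    rdf_tarball = download_rdf_file(rdf_url)    # fetch rdf-files.tar.bz2
    extract_rdf_files(rdf_tarball, rdf_path, force=force)
    parse_and_fill(rdf_path, concurrency, force=force)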
def copy_file(src, dst):
    logger.info("\t\tCopying {}".format(dst))
    try:
        shutil.copy2(src, dst)
    except IOError:
        logger.error("/!\\ Unable to copy missing file {}".format(src))
        return
def download_cover(book, book_dir, s3_storage, optimizer_version):
    has_cover = Book.select(Book.cover_page).where(Book.id == book.id)
    if has_cover:
        # try to download optimized cover from cache if s3_storage
        url = "{}{}/pg{}.cover.medium.jpg".format(IMAGE_BASE, book.id, book.id)
        etag = get_etag_from_url(url)
        downloaded_from_cache = False
        cover = "{}_cover_image.jpg".format(book.id)
        if (
            book_dir.joinpath("optimized").joinpath(cover).exists()
            or book_dir.joinpath("unoptimized").joinpath(cover).exists()
        ):
            logger.debug(f"Cover already exists for book #{book.id}")
            return
        if s3_storage:
            logger.info(
                f"Trying to download cover for {book.id} from optimization cache"
            )
            downloaded_from_cache = download_from_cache(
                book=book,
                etag=etag,
                book_format="cover",
                dest_dir=book_dir.joinpath("optimized"),
                s3_storage=s3_storage,
                optimizer_version=optimizer_version,
            )
        if not downloaded_from_cache:
            logger.debug("Downloading {}".format(url))
            if download_file(url, book_dir.joinpath("unoptimized").joinpath(cover)):
                book.cover_etag = etag
                book.save()
    else:
        logger.debug("No Book Cover found for Book #{}".format(book.id))
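# Worked example of the cover URL shape built above. The base URL used here is
# an assumption for illustration (the real IMAGE_BASE constant is defined
# elsewhere in the project); book id 1342 is arbitrary.
def example_cover_url(book_id, image_base="http://aleph.pglaf.org/cache/epub/"):
    return "{}{}/pg{}.cover.medium.jpg".format(image_base, book_id, book_id)

# example_cover_url(1342)
# -> "http://aleph.pglaf.org/cache/epub/1342/pg1342.cover.medium.jpg"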
def download_rdf_file(rdf_url):
    fname = "rdf-files.tar.bz2"

    if path(fname).exists():
        logger.info("\trdf-files.tar.bz2 already exists in {}".format(fname))
        return fname

    logger.info("\tDownloading {} into {}".format(rdf_url, fname))
    download_file(rdf_url, pathlib.Path(fname).resolve())

    return fname
def optimize_image(src, dst, force=False):
    if path(dst).exists() and not force:
        logger.info("\tSkipping image optimization for {}".format(dst))
        return dst
    logger.info("\tOptimizing image {}".format(dst))
    if path(src).ext == '.png':
        return optimize_png(src, dst)
    if path(src).ext in ('.jpg', '.jpeg'):
        return optimize_jpeg(src, dst)
    if path(src).ext == '.gif':
        return optimize_gif(src, dst)
    return dst
def symlink_from_cache(fname, dstfname=None):
    src = os.path.join(path(download_cache).abspath(), fname)
    if dstfname is None:
        dstfname = fname
    dst = os.path.join(path(static_folder).abspath(), dstfname)

    logger.info("\t\tSymlinking {}".format(dst))
    path(dst).unlink_p()
    try:
        path(src).link(dst)  # hard link
    except IOError:
        logger.error("/!\\ Unable to symlink missing file {}".format(src))
        return
def setup_database(wipe=False):
    logger.info("Setting up the database")

    for model in (License, Format, Author, Book, BookFormat, Url):
        if wipe:
            model.drop_table(fail_silently=True)
        if not model.table_exists():
            model.create_table()
            logger.debug("Created table for {}".format(model._meta.name))
            load_fixtures(model)
        else:
            logger.debug("{} table already exists.".format(model._meta.name))
def copy_from_cache(fname, dstfname=None):
    src = os.path.join(path(download_cache).abspath(), fname)
    if dstfname is None:
        dstfname = fname
    dst = os.path.join(path(static_folder).abspath(), dstfname)

    logger.info("\t\tCopying {}".format(dst))
    path(dst).unlink_p()
    try:
        path(src).copy(dst)
    except IOError:
        logger.error("/!\\ Unable to copy missing file {}".format(src))
        return
def optimize_image(src, dst, force=False):
    if dst.exists() and not force:
        logger.info("\tSkipping image optimization for {}".format(dst))
        return dst
    logger.info("\tOptimizing image {}".format(dst))
    if src.suffix == ".png":
        return optimize_png(str(src.resolve()), str(dst.resolve()))
    if src.suffix in (".jpg", ".jpeg"):
        return optimize_jpeg(str(src.resolve()), str(dst.resolve()))
    if src.suffix == ".gif":
        return optimize_gif(str(src.resolve()), str(dst.resolve()))
    return dst
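# Minimal sketch showing the pathlib-based variant above in use: unlike the
# earlier string/path() version, it dispatches on Path.suffix and passes
# resolved string paths to the external optimizer commands. File names here
# are hypothetical.
src = pathlib.Path("unoptimized/1234_cover_image.jpg")
dst = pathlib.Path("optimized/1234_cover_image.jpg")
optimize_image(src, dst, force=True)  # dispatches to optimize_jpeg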
def extract_rdf_files(rdf_tarball, rdf_path, force=False):
    if path(rdf_path).exists() and not force:
        logger.info("\tRDF-files folder already exists in {}".format(rdf_path))
        return

    logger.info("\tExtracting {} into {}".format(rdf_tarball, rdf_path))

    # create destdir if not exists
    dest = path(rdf_path)
    dest.mkdir_p()

    exec_cmd(
        ["tar", "-C", rdf_path, "--strip-components", "2", "-x", "-f", rdf_tarball]
    )
    return
def write_book_presentation_article(
    static_folder, book, force, project_id, title_search, add_bookshelves, books
):
    cover_fpath = static_folder.joinpath(article_name_for(book=book, cover=True))

    if not cover_fpath.exists() or force:
        logger.info("\t\tExporting to {}".format(cover_fpath))
        html = cover_html_content_for(
            book=book,
            static_folder=static_folder,
            books=books,
            project_id=project_id,
            title_search=title_search,
            add_bookshelves=add_bookshelves,
        )
        with open(cover_fpath, "w") as f:
            if six.PY2:
                f.write(html.encode(UTF8))
            else:
                f.write(html)
    else:
        logger.info("\t\tSkipping cover {}".format(cover_fpath))
def build_zimfile(static_folder, zim_path=None,
                  languages=[], formats=[],
                  title=None, description=None,
                  only_books=[],
                  create_index=True, force=False):

    # revert HTML/JS/CSS to zim-compatible versions
    export_skeleton(static_folder=static_folder, dev_mode=False,
                    languages=languages, formats=formats,
                    only_books=only_books)

    if not languages:
        languages = ['mul']

    languages.sort()
    formats.sort()

    if title is None:
        if len(languages) > 5:
            title = ("Project Gutenberg Library with {formats}"
                     .format(formats=",".join(formats)))
        else:
            title = ("Project Gutenberg Library ({langs}) with {formats}"
                     .format(langs=",".join(languages),
                             formats=",".join(formats)))

    logger.info("\tWriting ZIM for {}".format(title))

    if description is None:
        description = "The first producer of free ebooks"

    project_id = get_project_id(languages, formats, only_books)

    if zim_path is None:
        zim_path = "{}.zim".format(project_id)

    if path(zim_path).exists() and not force:
        logger.info("ZIM file `{}` already exists.".format(zim_path))
        return

    languages = [ISO_MATRIX.get(lang, lang) for lang in languages]
    languages.sort()

    cmd = ['zimwriterfs',
           '--welcome', "Home.html",
           '--favicon', "favicon.png",
           '--language', ','.join(languages),
           '--name', project_id,
           '--title', title,
           '--description', description,
           '--creator', "gutenberg.org",
           '--publisher', "Kiwix",
           static_folder, zim_path]

    if create_index:
        cmd.insert(1, '--withFullTextIndex')

    if exec_cmd(cmd) == 0:
        logger.info("Successfully created ZIM file at {}".format(zim_path))
    else:
        logger.error("Unable to create ZIM file :(")
def parse_and_process_file(rdf_file, force=False):
    if not path(rdf_file).exists():
        raise ValueError(rdf_file)

    gid = re.match(r".*/pg([0-9]+).rdf", rdf_file).groups()[0]

    if Book.get_or_none(id=int(gid)):
        logger.info("\tSkipping already parsed file {}".format(rdf_file))
        return

    logger.info("\tParsing file {}".format(rdf_file))
    with open(rdf_file, "r") as f:
        parser = RdfParser(f.read(), gid).parse()

    if parser.license == "None":
        logger.info("\tWARN: Unusable book without any information {}".format(gid))
    elif parser.title == "":
        logger.info("\tWARN: Unusable book without title {}".format(gid))
    else:
        save_rdf_in_database(parser)
def handle_companion_file(fname, dstfname=None, book=None,
                          force=False, as_ext=None):
    ext = path(fname).ext if as_ext is None else as_ext
    src = os.path.join(path(download_cache).abspath(), fname)
    if dstfname is None:
        dstfname = fname
    dst = os.path.join(path(static_folder).abspath(), dstfname)
    if path(dst).exists() and not force:
        logger.debug("\t\tSkipping existing companion {}".format(dstfname))
        return

    # optimization based on mime/extension
    if ext in (".png", ".jpg", ".jpeg", ".gif"):
        logger.info("\t\tCopying and optimizing image companion {}".format(fname))
        # copy_from_cache(src, dst)
        optimize_image(src, dst)
    elif ext == ".epub":
        logger.info("\t\tCreating optimized EPUB file {}".format(fname))
        tmp_epub = tempfile.NamedTemporaryFile(suffix=".epub", dir=TMP_FOLDER)
        tmp_epub.close()
        try:
            optimize_epub(src, tmp_epub.name)
        except zipfile.BadZipFile:
            logger.warning("\t\tBad zip file. "
                           "Copying as it might be working: {}".format(fname))
            handle_companion_file(fname, dstfname, book, force, as_ext=".zip")
        else:
            path(tmp_epub.name).move(dst)
    else:
        # excludes files created by Windows Explorer
        if src.endswith("_Thumbs.db"):
            return
        # copy otherwise (PDF mostly)
        logger.debug("\t\tshitty ext: {}".format(dst))
        logger.info("\t\tCopying companion file to {}".format(fname))
        copy_from_cache(src, dst)
def handle_companion_file(
    fname,
    dstfname=None,
    book=None,
    force=False,
    as_ext=None,
    html_file_list=None,
    s3_storage=None,
):
    ext = fname.suffix if as_ext is None else as_ext
    src = fname
    if dstfname is None:
        dstfname = fname.name
    dst = static_folder.joinpath(dstfname)
    if dst.exists() and not force:
        logger.debug("\t\tSkipping existing companion {}".format(dstfname))
        return

    # optimization based on mime/extension
    if ext in (".png", ".jpg", ".jpeg", ".gif"):
        logger.info("\t\tCopying and optimizing image companion {}".format(fname))
        optimize_image(src, dst)
        if dst.name == f"{book.id}_cover_image.jpg" and s3_storage:
            upload_to_cache(
                asset=dst,
                book_format="cover",
                book_id=book.id,
                etag=book.cover_etag,
                s3_storage=s3_storage,
                optimizer_version=optimizer_version,
            )
            update_download_cache(src, dst)
        elif html_file_list:
            html_file_list.append(dst)
            update_download_cache(src, dst)
    elif ext == ".epub":
        logger.info("\t\tCreating optimized EPUB file {}".format(fname))
        tmp_epub = tempfile.NamedTemporaryFile(suffix=".epub", dir=TMP_FOLDER)
        tmp_epub.close()
        try:
            optimize_epub(src, tmp_epub.name)
        except zipfile.BadZipFile:
            logger.warning(
                "\t\tBad zip file. Copying as it might be working: {}".format(fname)
            )
            handle_companion_file(fname, dstfname, book, force, as_ext=".zip")
        else:
            path(tmp_epub.name).move(dst)
            if s3_storage:
                upload_to_cache(
                    asset=dst,
                    book_format="epub",
                    book_id=book.id,
                    etag=book.epub_etag,
                    s3_storage=s3_storage,
                    optimizer_version=optimizer_version,
                )
            update_download_cache(src, dst)
    else:
        # excludes files created by Windows Explorer
        if src.name.endswith("_Thumbs.db"):
            return
        # copy otherwise (PDF mostly)
        logger.info("\t\tCopying companion file to {}".format(dst))
        copy_file(src, dst)
        if ext != ".pdf" and ext != ".zip" and html_file_list:
            html_file_list.append(dst)
            update_download_cache(src, dst)
def optimize_epub(src, dst):
    # NOTE: relies on `book` from the enclosing export function's scope
    logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
    zipped_files = []
    # create temp directory to extract to
    tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

    try:
        with zipfile.ZipFile(src, "r") as zf:
            zipped_files = zf.namelist()
            zf.extractall(tmpd)
    except zipfile.BadZipFile as exc:
        shutil.rmtree(tmpd)
        raise exc

    remove_cover = False
    # iterate over a copy: entries may be removed from zipped_files below
    for fname in list(zipped_files):
        fnp = os.path.join(tmpd, fname)
        if path(fname).ext in (".png", ".jpeg", ".jpg", ".gif"):
            # special case to remove ugly cover
            if fname.endswith("cover.jpg") and is_bad_cover(fnp):
                zipped_files.remove(fname)
                remove_cover = True
            else:
                optimize_image(pathlib.Path(fnp), pathlib.Path(fnp), force=True)

        if path(fname).ext in (".htm", ".html"):
            html_content, _ = read_file(fnp)
            html = update_html_for_static(
                book=book, html_content=html_content, epub=True
            )
            save_bs_output(html, fnp, UTF8)

        if path(fname).ext == ".ncx":
            pattern = "*** START: FULL LICENSE ***"
            ncx, _ = read_file(fnp)
            soup = BeautifulSoup(ncx, "lxml-xml")
            for tag in soup.findAll("text"):
                if pattern in tag.text:
                    s = tag.parent.parent
                    s.decompose()
                    for s in s.next_siblings:
                        s.decompose()
            save_bs_output(soup, fnp, UTF8)

    # delete {id}/cover.jpg if exist and update {id}/content.opf
    if remove_cover:
        # remove cover
        path(os.path.join(tmpd, text_type(book.id), "cover.jpg")).unlink_p()

        soup = None
        opff = os.path.join(tmpd, text_type(book.id), "content.opf")
        if os.path.exists(opff):
            opff_content, _ = read_file(opff)
            soup = BeautifulSoup(opff_content, "lxml-xml")

            for elem in soup.findAll():
                if getattr(elem, "attrs", {}).get("href") == "cover.jpg":
                    elem.decompose()

            save_bs_output(soup, opff, UTF8)

    # bundle epub as zip
    zip_epub(epub_fpath=dst, root_folder=tmpd, fpaths=zipped_files)

    path(tmpd).rmtree_p()
def export_book_to(book,
                   static_folder, download_cache, cached_files,
                   languages, formats, books, project_id, force=False):
    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html, encoding = html_content_for(book=book,
                                      static_folder=static_folder,
                                      download_cache=download_cache)
    if html:
        article_fpath = os.path.join(static_folder, article_name_for(book))
        if not path(article_fpath).exists() or force:
            logger.info("\t\tExporting to {}".format(article_fpath))
            try:
                new_html = update_html_for_static(book=book, html_content=html)
            except Exception:
                raise
            save_bs_output(new_html, article_fpath, UTF8)
        else:
            logger.info("\t\tSkipping HTML article {}".format(article_fpath))

    def symlink_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)

        logger.info("\t\tSymlinking {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).link(dst)  # hard link
        except IOError:
            logger.error("/!\\ Unable to symlink missing file {}".format(src))
            return

    def copy_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)

        logger.info("\t\tCopying {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).copy(dst)
        except IOError:
            logger.error("/!\\ Unable to copy missing file {}".format(src))
            return

    def optimize_image(src, dst, force=False):
        if path(dst).exists() and not force:
            logger.info("\tSkipping image optimization for {}".format(dst))
            return dst
        logger.info("\tOptimizing image {}".format(dst))
        if path(src).ext == '.png':
            return optimize_png(src, dst)
        if path(src).ext in ('.jpg', '.jpeg'):
            return optimize_jpeg(src, dst)
        if path(src).ext == '.gif':
            return optimize_gif(src, dst)
        return dst

    def optimize_gif(src, dst):
        exec_cmd(['gifsicle', '-O3', src, '-o', dst])

    def optimize_png(src, dst):
        exec_cmd(['pngquant', '--nofs', '--force', '--output', dst, src])
        exec_cmd(['advdef', '-z', '-4', '-i', '5', dst])

    def optimize_jpeg(src, dst):
        copy_from_cache(src, dst)
        exec_cmd(['jpegoptim', '--strip-all', '-m50', dst])

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

        with zipfile.ZipFile(src, 'r') as zf:
            zipped_files = zf.namelist()
            zf.extractall(tmpd)

        remove_cover = False
        # iterate over a copy: entries may be removed from zipped_files below
        for fname in list(zipped_files):
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):
                # special case to remove ugly cover
                if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(fnp, fnp)

            if path(fname).ext in ('.htm', '.html'):
                html_content, html_encoding = read_file(fnp)
                html = update_html_for_static(book=book,
                                              html_content=html_content,
                                              epub=True)
                save_bs_output(html, fnp, UTF8)

            if path(fname).ext == '.ncx':
                pattern = "*** START: FULL LICENSE ***"
                ncx, ncx_encoding = read_file(fnp)
                soup = BeautifulSoup(ncx, 'lxml-xml')
                for tag in soup.findAll('text'):
                    if pattern in tag.text:
                        s = tag.parent.parent
                        s.decompose()
                        for s in s.next_siblings:
                            s.decompose()
                save_bs_output(soup, fnp, UTF8)

        # delete {id}/cover.jpg if exist and update {id}/content.opf
        if remove_cover:
            # remove cover
            path(os.path.join(tmpd, text_type(book.id), 'cover.jpg')).unlink_p()

            soup = None
            opff = os.path.join(tmpd, text_type(book.id), 'content.opf')
            if os.path.exists(opff):
                opff_content, opff_encoding = read_file(opff)
                soup = BeautifulSoup(opff_content, 'lxml-xml')

                for elem in soup.findAll():
                    if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                        elem.decompose()

                save_bs_output(soup, opff, UTF8)

        # bundle epub as zip
        zip_epub(epub_fpath=dst, root_folder=tmpd, fpaths=zipped_files)

        path(tmpd).rmtree_p()

    def handle_companion_file(fname, dstfname=None, book=None,
                              force=False, as_ext=None):
        ext = path(fname).ext if as_ext is None else as_ext
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        if path(dst).exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in ('.png', '.jpg', '.jpeg', '.gif'):
            logger.info("\t\tCopying and optimizing image companion {}"
                        .format(fname))
            # copy_from_cache(src, dst)
            optimize_image(src, dst)
        elif ext == '.epub':
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix='.epub',
                                                   dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warning("\t\tBad zip file. "
                               "Copying as it might be working: {}".format(fname))
                handle_companion_file(fname, dstfname, book, force,
                                      as_ext='.zip')
            else:
                path(tmp_epub.name).move(dst)
        else:
            # excludes files created by Windows Explorer
            if src.endswith('_Thumbs.db'):
                return
            # copy otherwise (PDF mostly)
            logger.debug("\t\tshitty ext: {}".format(dst))
            logger.info("\t\tCopying companion file to {}".format(fname))
            copy_from_cache(src, dst)

    # associated files (images, etc)
    for fname in [fn for fn in cached_files
                  if fn.startswith("{}_".format(book.id))]:
        if path(fname).ext in ('.html', '.htm'):
            src = os.path.join(path(download_cache).abspath(), fname)
            dst = os.path.join(path(static_folder).abspath(), fname)
            if path(dst).exists() and not force:
                logger.debug("\t\tSkipping existing HTML {}".format(dst))
                continue

            logger.info("\t\tExporting HTML file to {}".format(dst))
            html, encoding = read_file(src)
            new_html = update_html_for_static(book=book, html_content=html)
            save_bs_output(new_html, dst, UTF8)
        else:
            try:
                handle_companion_file(fname, force=force)
            except Exception as e:
                logger.exception(e)
                logger.error("\t\tException while handling companion file: {}"
                             .format(e))

    # other formats
    for format in formats:
        if format not in book.formats() or format == 'html':
            continue
        try:
            handle_companion_file(fname_for(book, format),
                                  archive_name_for(book, format),
                                  force=force)
        except Exception as e:
            logger.exception(e)
            logger.error("\t\tException while handling companion file: {}"
                         .format(e))

    # book presentation article
    cover_fpath = os.path.join(static_folder,
                               article_name_for(book=book, cover=True))
    if not path(cover_fpath).exists() or force:
        logger.info("\t\tExporting to {}".format(cover_fpath))
        html = cover_html_content_for(book=book,
                                      static_folder=static_folder,
                                      books=books, project_id=project_id)
        with open(cover_fpath, 'w') as f:
            if six.PY2:
                f.write(html.encode(UTF8))
            else:
                f.write(html)
    else:
        logger.info("\t\tSkipping cover {}".format(cover_fpath))
def optimize_epub(src, dst):
    # NOTE: relies on `book` from the enclosing export function's scope
    logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
    zipped_files = []
    # create temp directory to extract to
    tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

    with zipfile.ZipFile(src, 'r') as zf:
        zipped_files = zf.namelist()
        zf.extractall(tmpd)

    remove_cover = False
    # iterate over a copy: entries may be removed from zipped_files below
    for fname in list(zipped_files):
        fnp = os.path.join(tmpd, fname)
        if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):
            # special case to remove ugly cover
            if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                zipped_files.remove(fname)
                remove_cover = True
            else:
                optimize_image(fnp, fnp)

        if path(fname).ext in ('.htm', '.html'):
            html_content, html_encoding = read_file(fnp)
            html = update_html_for_static(book=book,
                                          html_content=html_content,
                                          epub=True)
            save_bs_output(html, fnp, UTF8)

        if path(fname).ext == '.ncx':
            pattern = "*** START: FULL LICENSE ***"
            ncx, ncx_encoding = read_file(fnp)
            soup = BeautifulSoup(ncx, 'lxml-xml')
            for tag in soup.findAll('text'):
                if pattern in tag.text:
                    s = tag.parent.parent
                    s.decompose()
                    for s in s.next_siblings:
                        s.decompose()
            save_bs_output(soup, fnp, UTF8)

    # delete {id}/cover.jpg if exist and update {id}/content.opf
    if remove_cover:
        # remove cover
        path(os.path.join(tmpd, text_type(book.id), 'cover.jpg')).unlink_p()

        soup = None
        opff = os.path.join(tmpd, text_type(book.id), 'content.opf')
        if os.path.exists(opff):
            opff_content, opff_encoding = read_file(opff)
            soup = BeautifulSoup(opff_content, 'lxml-xml')

            for elem in soup.findAll():
                if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                    elem.decompose()

            save_bs_output(soup, opff, UTF8)

    # bundle epub as zip
    zip_epub(epub_fpath=dst, root_folder=tmpd, fpaths=zipped_files)

    path(tmpd).rmtree_p()
def export_to_json_helpers(books, static_folder,
                           languages, formats, project_id):

    def dumpjs(col, fn, var='json_data'):
        with open(os.path.join(static_folder, fn), 'w') as f:
            f.write("var {var} = ".format(var=var))
            f.write(json.dumps(col))
            f.write(";")
            # json.dump(col, f)

    # all books sorted by popularity
    logger.info("\t\tDumping full_by_popularity.js")
    dumpjs([book.to_array()
            for book in books.order_by(Book.downloads.desc())],
           'full_by_popularity.js')

    # all books sorted by title
    logger.info("\t\tDumping full_by_title.js")
    dumpjs([book.to_array()
            for book in books.order_by(Book.title.asc())],
           'full_by_title.js')

    avail_langs = get_langs_with_count(books=books)

    all_filtered_authors = []

    # language-specific collections
    for lang_name, lang, lang_count in avail_langs:
        lang_filtered_authors = list(
            set([book.author.gut_id for book in books.filter(language=lang)]))
        for aid in lang_filtered_authors:
            if aid not in all_filtered_authors:
                all_filtered_authors.append(aid)

        # by popularity
        logger.info("\t\tDumping lang_{}_by_popularity.js".format(lang))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.language == lang)
                              .order_by(Book.downloads.desc())],
            'lang_{}_by_popularity.js'.format(lang))

        # by title
        logger.info("\t\tDumping lang_{}_by_title.js".format(lang))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.language == lang)
                              .order_by(Book.title.asc())],
            'lang_{}_by_title.js'.format(lang))

        authors = authors_from_ids(lang_filtered_authors)
        logger.info("\t\tDumping authors_lang_{}.js".format(lang))
        dumpjs([author.to_array() for author in authors],
               'authors_lang_{}.js'.format(lang),
               'authors_json_data')

    # author specific collections
    authors = authors_from_ids(all_filtered_authors)
    for author in authors:
        # all_filtered_authors.remove(author.gut_id)

        # by popularity
        logger.info(
            "\t\tDumping auth_{}_by_popularity.js".format(author.gut_id))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.author == author)
                              .order_by(Book.downloads.desc())],
            'auth_{}_by_popularity.js'.format(author.gut_id))

        # by title
        logger.info("\t\tDumping auth_{}_by_title.js".format(author.gut_id))
        dumpjs(
            [book.to_array()
             for book in books.where(Book.author == author)
                              .order_by(Book.title.asc())],
            'auth_{}_by_title.js'.format(author.gut_id))

        # by language
        for lang_name, lang, lang_count in avail_langs:
            logger.info("\t\tDumping auth_{}_by_lang_{}.js"
                        .format(author.gut_id, lang))
            dumpjs(
                [book.to_array()
                 for book in books.where(Book.language == lang)
                                  .where(Book.author == author)
                                  .order_by(Book.downloads.desc())],
                'auth_{}_lang_{}_by_popularity.js'.format(author.gut_id, lang))
            dumpjs(
                [book.to_array()
                 for book in books.where(Book.language == lang)
                                  .where(Book.author == author)
                                  .order_by(Book.title.asc())],
                'auth_{}_lang_{}_by_title.js'.format(author.gut_id, lang))

        # author HTML redirect file
        save_author_file(author, static_folder, books, project_id, force=True)

    # authors list sorted by name
    logger.info("\t\tDumping authors.js")
    dumpjs([author.to_array() for author in authors],
           'authors.js', 'authors_json_data')

    # languages list sorted by code
    logger.info("\t\tDumping languages.js")
    dumpjs(avail_langs, 'languages.js', 'languages_json_data')

    # languages by weight
    main_languages, other_languages = get_lang_groups(books)
    logger.info("\t\tDumping main_languages.js")
    dumpjs(main_languages, 'main_languages.js', 'main_languages_json_data')
    dumpjs(other_languages, 'other_languages.js', 'other_languages_json_data')
def download_book(book, download_cache, languages, formats, force):
    logger.info(
        "\tDownloading content files for Book #{id}".format(id=book.id))

    # apply filters
    if not formats:
        formats = list(FORMAT_MATRIX.keys())

    # HTML is our base for ZIM so add it if not present
    if "html" not in formats:
        formats.append("html")

    for format in formats:

        fpath = os.path.join(download_cache, fname_for(book, format))

        # check if already downloaded
        if path(fpath).exists() and not force:
            logger.debug("\t\t{fmt} already exists at {path}".format(
                fmt=format, path=fpath))
            continue

        # retrieve corresponding BookFormat
        bfs = BookFormat.filter(book=book)

        if format == "html":
            patterns = [
                "mnsrb10h.htm", "8ledo10h.htm", "tycho10f.htm",
                "8ledo10h.zip", "salme10h.htm", "8nszr10h.htm",
                "{id}-h.html", "{id}.html.gen", "{id}-h.htm",
                "8regr10h.zip", "{id}.html.noimages", "8lgme10h.htm",
                "tycho10h.htm", "tycho10h.zip", "8lgme10h.zip",
                "8indn10h.zip", "8resp10h.zip", "20004-h.htm",
                "8indn10h.htm", "8memo10h.zip", "fondu10h.zip",
                "{id}-h.zip", "8mort10h.zip",
            ]
            bfso = bfs
            bfs = bfs.join(Format).filter(Format.pattern << patterns)
            if not bfs.count():
                pp(list([(b.format.mime, b.format.images, b.format.pattern)
                         for b in bfs]))
                pp(list([(b.format.mime, b.format.images, b.format.pattern)
                         for b in bfso]))
                logger.error("html not found")
                continue
        else:
            bfs = bfs.filter(BookFormat.format << Format.filter(
                mime=FORMAT_MATRIX.get(format)))

        if not bfs.count():
            logger.debug("[{}] not avail. for #{}# {}".format(
                format, book.id, book.title).encode("utf-8"))
            continue

        if bfs.count() > 1:
            try:
                bf = bfs.join(Format).filter(Format.images).get()
            except Exception:
                bf = bfs.get()
        else:
            bf = bfs.get()

        logger.debug("[{}] Requesting URLs for #{}# {}".format(
            format, book.id, book.title).encode("utf-8"))

        # retrieve list of URLs for format unless we have it in DB
        if bf.downloaded_from and not force:
            urls = [bf.downloaded_from]
        else:
            urld = get_urls(book)
            urls = list(reversed(urld.get(FORMAT_MATRIX.get(format))))

        import copy
        allurls = copy.copy(urls)
        while urls:
            url = urls.pop()

            if len(allurls) != 1:
                if not resource_exists(url):
                    continue

            # HTML files are *sometime* available as ZIP files
            if url.endswith(".zip"):
                zpath = "{}.zip".format(fpath)

                if not download_file(url, zpath):
                    logger.error("ZIP file download failed: {}".format(zpath))
                    continue

                # extract zipfile
                handle_zipped_epub(zippath=zpath, book=book,
                                   download_cache=download_cache)
            else:
                if not download_file(url, fpath):
                    logger.error("file download failed: {}".format(fpath))
                    continue

            # store working URL in DB
            bf.downloaded_from = url
            bf.save()

        if not bf.downloaded_from:
            logger.error("NO FILE FOR #{}/{}".format(book.id, format))
            pp(allurls)
            continue
def export_to_json_helpers(
    books, static_folder, languages, formats, project_id, title_search, add_bookshelves
):
    def dumpjs(col, fn, var="json_data"):
        with open(os.path.join(static_folder, fn), "w") as f:
            f.write("var {var} = ".format(var=var))
            f.write(json.dumps(col))
            f.write(";")
            # json.dump(col, f)

    # all books sorted by popularity
    logger.info("\t\tDumping full_by_popularity.js")
    dumpjs(
        [book.to_array() for book in books.order_by(Book.downloads.desc())],
        "full_by_popularity.js",
    )

    # all books sorted by title
    logger.info("\t\tDumping full_by_title.js")
    dumpjs(
        [book.to_array() for book in books.order_by(Book.title.asc())],
        "full_by_title.js",
    )

    avail_langs = get_langs_with_count(books=books)

    all_filtered_authors = []

    # language-specific collections
    for lang_name, lang, lang_count in avail_langs:
        lang_filtered_authors = list(
            set([book.author.gut_id for book in books.filter(language=lang)])
        )
        for aid in lang_filtered_authors:
            if aid not in all_filtered_authors:
                all_filtered_authors.append(aid)

        # by popularity
        logger.info("\t\tDumping lang_{}_by_popularity.js".format(lang))
        dumpjs(
            [
                book.to_array()
                for book in books.where(Book.language == lang).order_by(
                    Book.downloads.desc()
                )
            ],
            "lang_{}_by_popularity.js".format(lang),
        )

        # by title
        logger.info("\t\tDumping lang_{}_by_title.js".format(lang))
        dumpjs(
            [
                book.to_array()
                for book in books.where(Book.language == lang).order_by(
                    Book.title.asc()
                )
            ],
            "lang_{}_by_title.js".format(lang),
        )

        authors = authors_from_ids(lang_filtered_authors)
        logger.info("\t\tDumping authors_lang_{}.js".format(lang))
        dumpjs(
            [author.to_array() for author in authors],
            "authors_lang_{}.js".format(lang),
            "authors_json_data",
        )

    if add_bookshelves:
        bookshelves = bookshelf_list(books)
        for bookshelf in bookshelves:
            # exclude the books with no bookshelf data
            if bookshelf is None:
                continue

            # dumpjs for bookshelf by popularity
            # this will allow the popularity button to use this js on the
            # particular bookshelf page
            logger.info("\t\tDumping bookshelf_{}_by_popularity.js".format(bookshelf))
            dumpjs(
                [
                    book.to_array()
                    for book in books.select()
                    .where(Book.bookshelf == bookshelf)
                    .order_by(Book.downloads.desc())
                ],
                "bookshelf_{}_by_popularity.js".format(bookshelf),
            )

            # by title
            logger.info("\t\tDumping bookshelf_{}_by_title.js".format(bookshelf))
            dumpjs(
                [
                    book.to_array()
                    for book in books.select()
                    .where(Book.bookshelf == bookshelf)
                    .order_by(Book.title.asc())
                ],
                "bookshelf_{}_by_title.js".format(bookshelf),
            )

            # by language
            for lang_name, lang, lang_count in avail_langs:
                logger.info(
                    "\t\tDumping bookshelf_{}_by_lang_{}.js".format(bookshelf, lang)
                )
                dumpjs(
                    [
                        book.to_array()
                        for book in books.select()
                        .where(Book.language == lang)
                        .where(Book.bookshelf == bookshelf)
                        .order_by(Book.downloads.desc())
                    ],
                    "bookshelf_{}_lang_{}_by_popularity.js".format(bookshelf, lang),
                )
                dumpjs(
                    [
                        book.to_array()
                        for book in books.select()
                        .where(Book.language == lang)
                        .where(Book.bookshelf == bookshelf)
                        .order_by(Book.title.asc())
                    ],
                    "bookshelf_{}_lang_{}_by_title.js".format(bookshelf, lang),
                )

        # dump all bookshelves from any given language
        for lang_name, lang, lang_count in avail_langs:
            logger.info("\t\tDumping bookshelves_lang_{}.js".format(lang))
            temp = bookshelf_list_language(books, lang)
            dumpjs(temp, "bookshelves_lang_{}.js".format(lang))

        logger.info("\t\tDumping bookshelves.js")
        dumpjs(bookshelves, "bookshelves.js", "bookshelves_json_data")

        # Create the bookshelf home page
        context = get_default_context(project_id=project_id, books=books)
        context.update({"bookshelf_home": True, "add_bookshelves": True})
        template = jinja_env.get_template("bookshelf_home.html")
        rendered = template.render(**context)
        save_bs_output(
            rendered, os.path.join(static_folder, "bookshelf_home.html"), UTF8
        )

        # add individual bookshelf pages
        for bookshelf in bookshelves:
            if bookshelf is None:
                continue
            context["bookshelf"] = bookshelf
            context.update(
                {
                    "bookshelf_home": False,
                    "individual_book_shelf": True,
                    "no_filters": True,
                    "add_bookshelves": True,
                }
            )
            template = jinja_env.get_template("bookshelf.html")
            rendered = template.render(**context)
            savepath = os.path.join(static_folder, "{}.html".format(bookshelf))
            # logger.info("Saving {} to {}".format(bookshelf, savepath))
            save_bs_output(rendered, savepath, UTF8)

    # author specific collections
    authors = authors_from_ids(all_filtered_authors)
    for author in authors:
        # all_filtered_authors.remove(author.gut_id)

        # by popularity
        logger.info("\t\tDumping auth_{}_by_popularity.js".format(author.gut_id))
        dumpjs(
            [
                book.to_array()
                for book in books.where(Book.author == author).order_by(
                    Book.downloads.desc()
                )
            ],
            "auth_{}_by_popularity.js".format(author.gut_id),
        )

        # by title
        logger.info("\t\tDumping auth_{}_by_title.js".format(author.gut_id))
        dumpjs(
            [
                book.to_array()
                for book in books.where(Book.author == author).order_by(
                    Book.title.asc()
                )
            ],
            "auth_{}_by_title.js".format(author.gut_id),
        )

        # by language
        for lang_name, lang, lang_count in avail_langs:
            logger.info("\t\tDumping auth_{}_by_lang_{}.js".format(author.gut_id, lang))
            dumpjs(
                [
                    book.to_array()
                    for book in books.where(Book.language == lang)
                    .where(Book.author == author)
                    .order_by(Book.downloads.desc())
                ],
                "auth_{}_lang_{}_by_popularity.js".format(author.gut_id, lang),
            )
            dumpjs(
                [
                    book.to_array()
                    for book in books.where(Book.language == lang)
                    .where(Book.author == author)
                    .order_by(Book.title.asc())
                ],
                "auth_{}_lang_{}_by_title.js".format(author.gut_id, lang),
            )

        # author HTML redirect file
        save_author_file(author, static_folder, books, project_id, force=True)

    # authors list sorted by name
    logger.info("\t\tDumping authors.js")
    dumpjs([author.to_array() for author in authors], "authors.js", "authors_json_data")

    # languages list sorted by code
    logger.info("\t\tDumping languages.js")
    dumpjs(avail_langs, "languages.js", "languages_json_data")

    # languages by weight
    main_languages, other_languages = get_lang_groups(books)
    logger.info("\t\tDumping main_languages.js")
    dumpjs(main_languages, "main_languages.js", "main_languages_json_data")
    dumpjs(other_languages, "other_languages.js", "other_languages_json_data")
def build_zimfile(
    static_folder,
    output_folder,
    zim_name=None,
    languages=[],
    formats=[],
    title=None,
    description=None,
    only_books=[],
    create_index=True,
    force=False,
    title_search=False,
    add_bookshelves=False,
):
    # revert HTML/JS/CSS to zim-compatible versions
    export_skeleton(
        static_folder=static_folder,
        dev_mode=False,
        languages=languages,
        formats=formats,
        only_books=only_books,
        title_search=title_search,
        add_bookshelves=add_bookshelves,
    )

    if not languages:
        languages = ["mul"]

    languages.sort()
    formats.sort()

    if title is None:
        if len(languages) > 5:
            title = "Project Gutenberg Library"
        else:
            title = "Project Gutenberg Library ({langs})".format(
                langs=",".join(languages)
            )

        if len(formats) < len(FORMAT_MATRIX):
            title += " with {formats}".format(formats=",".join(formats))

    logger.info("\tWriting ZIM for {}".format(title))

    if description is None:
        description = "The first producer of free ebooks"

    project_id = get_project_id(languages, formats, only_books)

    if zim_name is None:
        zim_name = "{}.zim".format(project_id)
    zim_path = output_folder.joinpath(zim_name)

    if zim_path.exists() and not force:
        logger.info("ZIM file `{}` already exists.".format(zim_name))
        return

    languages = [ISO_MATRIX.get(lang, lang) for lang in languages]
    languages.sort()

    cmd = [
        "zimwriterfs",
        "--welcome", "Home.html",
        "--favicon", "favicon.png",
        "--language", ",".join(languages),
        "--name", project_id,
        "--title", title,
        "--description", description,
        "--creator", "gutenberg.org",
        "--tags", "gutenberg",
        "--publisher", "Kiwix",
        "--scraper", "gutenbergtozim-{v}".format(v=VERSION),
        static_folder,
        six.text_type(zim_path),
    ]

    if not create_index:
        cmd.insert(1, "--withoutFTIndex")

    if exec_cmd(cmd) == 0:
        logger.info("Successfully created ZIM file at {}".format(zim_path))
    else:
        logger.error("Unable to create ZIM file :(")
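# Hypothetical invocation of the newer build_zimfile signature above; folder
# names, languages and formats are placeholders. output_folder must be a
# pathlib.Path (joinpath is called on it).
build_zimfile(
    static_folder="static",
    output_folder=pathlib.Path("output"),
    languages=["en", "fr"],
    formats=["epub", "pdf"],
    create_index=True,
)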
def load_fixtures(model):
    logger.info("Loading fixtures for {}".format(model._meta.name))

    for fixture in getattr(model._meta, 'fixtures', []):
        f = model.create(**fixture)
        logger.debug("[fixtures] Created {}".format(f))
def handle_unoptimized_files(
    book,
    static_folder,
    src_dir,
    languages,
    formats,
    books,
    project_id,
    optimizer_version,
    force=False,
    title_search=False,
    add_bookshelves=False,
    s3_storage=None,
):
    def copy_file(src, dst):
        logger.info("\t\tCopying {}".format(dst))
        try:
            shutil.copy2(src, dst)
        except IOError:
            logger.error("/!\\ Unable to copy missing file {}".format(src))
            return

    def update_download_cache(unoptimized_file, optimized_file):
        book_dir = unoptimized_file.parents[1]
        optimized_dir = book_dir.joinpath("optimized")
        unoptimized_dir = book_dir.joinpath("unoptimized")
        if not optimized_dir.exists():
            optimized_dir.mkdir()
        dst = optimized_dir.joinpath(optimized_file.name)
        os.unlink(unoptimized_file)
        copy_file(optimized_file.resolve(), dst.resolve())
        if not [fpath for fpath in unoptimized_dir.iterdir()]:
            unoptimized_dir.rmdir()

    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html, _ = html_content_for(book=book, src_dir=src_dir)
    html_book_optimized_files = []
    if html:
        article_fpath = static_folder.joinpath(article_name_for(book))
        if not article_fpath.exists() or force:
            logger.info("\t\tExporting to {}".format(article_fpath))
            try:
                new_html = update_html_for_static(book=book, html_content=html)
            except Exception:
                raise
            save_bs_output(new_html, article_fpath, UTF8)
            html_book_optimized_files.append(article_fpath)
            update_download_cache(
                src_dir.joinpath(fname_for(book, "html")), article_fpath
            )
            if not src_dir.exists():
                return
        else:
            logger.info("\t\tSkipping HTML article {}".format(article_fpath))

    def optimize_image(src, dst, force=False):
        if dst.exists() and not force:
            logger.info("\tSkipping image optimization for {}".format(dst))
            return dst
        logger.info("\tOptimizing image {}".format(dst))
        if src.suffix == ".png":
            return optimize_png(str(src.resolve()), str(dst.resolve()))
        if src.suffix in (".jpg", ".jpeg"):
            return optimize_jpeg(str(src.resolve()), str(dst.resolve()))
        if src.suffix == ".gif":
            return optimize_gif(str(src.resolve()), str(dst.resolve()))
        return dst

    def optimize_gif(src, dst):
        exec_cmd(["gifsicle", "-O3", src, "-o", dst])

    def optimize_png(src, dst):
        exec_cmd(["pngquant", "--nofs", "--force", "--output", dst, src])
        exec_cmd(["advdef", "-z", "-4", "-i", "5", dst])

    def optimize_jpeg(src, dst):
        if src != dst:
            copy_file(src, dst)
        exec_cmd(["jpegoptim", "--strip-all", "-m50", dst])

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

        try:
            with zipfile.ZipFile(src, "r") as zf:
                zipped_files = zf.namelist()
                zf.extractall(tmpd)
        except zipfile.BadZipFile as exc:
            shutil.rmtree(tmpd)
            raise exc

        remove_cover = False
        # iterate over a copy: entries may be removed from zipped_files below
        for fname in list(zipped_files):
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in (".png", ".jpeg", ".jpg", ".gif"):
                # special case to remove ugly cover
                if fname.endswith("cover.jpg") and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(pathlib.Path(fnp), pathlib.Path(fnp), force=True)

            if path(fname).ext in (".htm", ".html"):
                html_content, _ = read_file(fnp)
                html = update_html_for_static(
                    book=book, html_content=html_content, epub=True
                )
                save_bs_output(html, fnp, UTF8)

            if path(fname).ext == ".ncx":
                pattern = "*** START: FULL LICENSE ***"
                ncx, _ = read_file(fnp)
                soup = BeautifulSoup(ncx, "lxml-xml")
                for tag in soup.findAll("text"):
                    if pattern in tag.text:
                        s = tag.parent.parent
                        s.decompose()
                        for s in s.next_siblings:
                            s.decompose()
                save_bs_output(soup, fnp, UTF8)

        # delete {id}/cover.jpg if exist and update {id}/content.opf
        if remove_cover:
            # remove cover
            path(os.path.join(tmpd, text_type(book.id), "cover.jpg")).unlink_p()

            soup = None
            opff = os.path.join(tmpd, text_type(book.id), "content.opf")
            if os.path.exists(opff):
                opff_content, _ = read_file(opff)
                soup = BeautifulSoup(opff_content, "lxml-xml")

                for elem in soup.findAll():
                    if getattr(elem, "attrs", {}).get("href") == "cover.jpg":
                        elem.decompose()

                save_bs_output(soup, opff, UTF8)

        # bundle epub as zip
        zip_epub(epub_fpath=dst, root_folder=tmpd, fpaths=zipped_files)

        path(tmpd).rmtree_p()

    def handle_companion_file(
        fname,
        dstfname=None,
        book=None,
        force=False,
        as_ext=None,
        html_file_list=None,
        s3_storage=None,
    ):
        ext = fname.suffix if as_ext is None else as_ext
        src = fname
        if dstfname is None:
            dstfname = fname.name
        dst = static_folder.joinpath(dstfname)
        if dst.exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in (".png", ".jpg", ".jpeg", ".gif"):
            logger.info("\t\tCopying and optimizing image companion {}".format(fname))
            optimize_image(src, dst)
            if dst.name == f"{book.id}_cover_image.jpg" and s3_storage:
                upload_to_cache(
                    asset=dst,
                    book_format="cover",
                    book_id=book.id,
                    etag=book.cover_etag,
                    s3_storage=s3_storage,
                    optimizer_version=optimizer_version,
                )
                update_download_cache(src, dst)
            elif html_file_list:
                html_file_list.append(dst)
                update_download_cache(src, dst)
        elif ext == ".epub":
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix=".epub", dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warning(
                    "\t\tBad zip file. Copying as it might be working: {}".format(fname)
                )
                handle_companion_file(fname, dstfname, book, force, as_ext=".zip")
            else:
                path(tmp_epub.name).move(dst)
                if s3_storage:
                    upload_to_cache(
                        asset=dst,
                        book_format="epub",
                        book_id=book.id,
                        etag=book.epub_etag,
                        s3_storage=s3_storage,
                        optimizer_version=optimizer_version,
                    )
                update_download_cache(src, dst)
        else:
            # excludes files created by Windows Explorer
            if src.name.endswith("_Thumbs.db"):
                return
            # copy otherwise (PDF mostly)
            logger.info("\t\tCopying companion file to {}".format(dst))
            copy_file(src, dst)
            if ext != ".pdf" and ext != ".zip" and html_file_list:
                html_file_list.append(dst)
                update_download_cache(src, dst)

    # associated files (images, etc)
    for fpath in src_dir.iterdir():
        if fpath.is_file() and fpath.name.startswith(f"{book.id}_"):
            if fpath.suffix in (".html", ".htm"):
                src = fpath
                dst = static_folder.joinpath(fpath.name)
                if dst.exists() and not force:
                    logger.debug("\t\tSkipping existing HTML {}".format(dst))
                    continue

                logger.info("\t\tExporting HTML file to {}".format(dst))
                html, _ = read_file(src)
                new_html = update_html_for_static(book=book, html_content=html)
                save_bs_output(new_html, dst, UTF8)
                html_book_optimized_files.append(dst)
                update_download_cache(src, dst)
            else:
                try:
                    handle_companion_file(
                        fpath,
                        force=force,
                        html_file_list=html_book_optimized_files,
                        s3_storage=s3_storage,
                        book=book,
                    )
                except Exception as e:
                    logger.exception(e)
                    logger.error(
                        "\t\tException while handling companion file: {}".format(e)
                    )

    if s3_storage and html_book_optimized_files:
        upload_to_cache(
            asset=html_book_optimized_files,
            book_format="html",
            etag=book.html_etag,
            book_id=book.id,
            s3_storage=s3_storage,
            optimizer_version=optimizer_version,
        )

    # other formats
    for format in formats:
        if format not in book.formats() or format == "html":
            continue
        book_file = src_dir.joinpath(fname_for(book, format))
        if book_file.exists():
            try:
                handle_companion_file(
                    book_file,
                    archive_name_for(book, format),
                    force=force,
                    book=book,
                    s3_storage=s3_storage,
                )
            except Exception as e:
                logger.exception(e)
                logger.error(
                    "\t\tException while handling companion file: {}".format(e)
                )
def download_book(book, download_cache, languages, formats, force,
                  s3_storage, optimizer_version):
    logger.info(
        "\tDownloading content files for Book #{id}".format(id=book.id))

    # apply filters
    if not formats:
        formats = list(FORMAT_MATRIX.keys())

    # HTML is our base for ZIM so add it if not present
    if "html" not in formats:
        formats.append("html")

    book_dir = pathlib.Path(download_cache).joinpath(str(book.id))
    optimized_dir = book_dir.joinpath("optimized")
    unoptimized_dir = book_dir.joinpath("unoptimized")
    unsuccessful_formats = []
    for book_format in formats:
        unoptimized_fpath = unoptimized_dir.joinpath(
            fname_for(book, book_format))
        optimized_fpath = optimized_dir.joinpath(
            archive_name_for(book, book_format))

        # check if already downloaded
        if (unoptimized_fpath.exists() or optimized_fpath.exists()) and not force:
            logger.debug(
                f"\t\t{book_format} already exists for book #{book.id}")
            continue

        if force:
            if book_format == "html":
                for fpath in book_dir.iterdir():
                    if fpath.is_file() and fpath.suffix not in [".pdf", ".epub"]:
                        fpath.unlink()
            else:
                if unoptimized_fpath.exists():
                    unoptimized_fpath.unlink()
                if optimized_fpath.exists():
                    optimized_fpath.unlink()
            # delete dirs which are empty
            for dir_name in [optimized_dir, unoptimized_dir]:
                if not dir_name.exists():
                    continue
                if not list(dir_name.iterdir()):
                    dir_name.rmdir()

        # retrieve corresponding BookFormat
        bfs = BookFormat.filter(book=book)

        if book_format == "html":
            patterns = [
                "mnsrb10h.htm", "8ledo10h.htm", "tycho10f.htm",
                "8ledo10h.zip", "salme10h.htm", "8nszr10h.htm",
                "{id}-h.html", "{id}.html.gen", "{id}-h.htm",
                "8regr10h.zip", "{id}.html.noimages", "8lgme10h.htm",
                "tycho10h.htm", "tycho10h.zip", "8lgme10h.zip",
                "8indn10h.zip", "8resp10h.zip", "20004-h.htm",
                "8indn10h.htm", "8memo10h.zip", "fondu10h.zip",
                "{id}-h.zip", "8mort10h.zip",
            ]
            bfso = bfs
            bfs = bfs.join(Format).filter(Format.pattern << patterns)
            if not bfs.count():
                pp(list([(b.format.mime, b.format.images, b.format.pattern)
                         for b in bfs]))
                pp(list([(b.format.mime, b.format.images, b.format.pattern)
                         for b in bfso]))
                logger.error("html not found")
                unsuccessful_formats.append(book_format)
                continue
        else:
            bfs = bfs.filter(BookFormat.format << Format.filter(
                mime=FORMAT_MATRIX.get(book_format)))

        if not bfs.count():
            logger.debug("[{}] not avail. for #{}# {}".format(
                book_format, book.id, book.title))
            unsuccessful_formats.append(book_format)
            continue

        if bfs.count() > 1:
            try:
                bf = bfs.join(Format).filter(Format.images).get()
            except Exception:
                bf = bfs.get()
        else:
            bf = bfs.get()

        logger.debug("[{}] Requesting URLs for #{}# {}".format(
            book_format, book.id, book.title))

        # retrieve list of URLs for format unless we have it in DB
        if bf.downloaded_from and not force:
            urls = [bf.downloaded_from]
        else:
            urld = get_urls(book)
            urls = list(reversed(urld.get(FORMAT_MATRIX.get(book_format))))

        import copy
        allurls = copy.copy(urls)
        downloaded_from_cache = False
        while urls:
            url = urls.pop()

            # for development
            # if len(allurls) != 1:
            #     if not resource_exists(url):
            #         continue

            # HTML files are *sometime* available as ZIP files
            if url.endswith(".zip"):
                zpath = unoptimized_dir.joinpath(
                    f"{fname_for(book, book_format)}.zip")
                etag = get_etag_from_url(url)
                if s3_storage:
                    if download_from_cache(
                        book=book,
                        etag=etag,
                        book_format=book_format,
                        dest_dir=optimized_dir,
                        s3_storage=s3_storage,
                        optimizer_version=optimizer_version,
                    ):
                        downloaded_from_cache = True
                        break
                if not download_file(url, zpath):
                    logger.error("ZIP file download failed: {}".format(zpath))
                    continue
                # save etag
                book.html_etag = etag
                book.save()
                # extract zipfile
                handle_zipped_epub(zippath=zpath, book=book,
                                   dst_dir=unoptimized_dir)
            else:
                if (url.endswith(".htm") or url.endswith(".html")
                        or url.endswith(".html.utf8") or url.endswith(".epub")):
                    etag = get_etag_from_url(url)
                    if s3_storage:
                        logger.info(
                            f"Trying to download {book.id} from optimization cache"
                        )
                        if download_from_cache(
                            book=book,
                            etag=etag,
                            book_format=book_format,
                            dest_dir=optimized_dir,
                            s3_storage=s3_storage,
                            optimizer_version=optimizer_version,
                        ):
                            downloaded_from_cache = True
                            break
                if not download_file(url, unoptimized_fpath):
                    logger.error(
                        "file download failed: {}".format(unoptimized_fpath))
                    continue
                # save etag if html or epub if download is successful
                if (url.endswith(".htm") or url.endswith(".html")
                        or url.endswith(".html.utf8")):
                    logger.debug(f"Saving html ETag for {book.id}")
                    book.html_etag = etag
                    book.save()
                elif url.endswith(".epub"):
                    logger.debug(f"Saving epub ETag for {book.id}")
                    book.epub_etag = etag
                    book.save()

            # store working URL in DB
            bf.downloaded_from = url
            bf.save()
            # break as we got a working URL
            break

        if not bf.downloaded_from and not downloaded_from_cache:
            logger.error("NO FILE FOR #{}/{}".format(book.id, book_format))
            # delete instance from DB if download failed
            logger.info("Deleting instance from DB")
            bf.delete_instance()
            unsuccessful_formats.append(book_format)
            pp(allurls)

    # delete book from DB if not downloaded in any format
    if len(unsuccessful_formats) == len(formats):
        logger.debug(
            f"Book #{book.id} could not be downloaded in any format. "
            "Deleting from DB ..."
        )
        book.delete_instance()
        if book_dir.exists():
            shutil.rmtree(book_dir, ignore_errors=True)
        return

    download_cover(book, book_dir, s3_storage, optimizer_version)