def check_dependencies():

    def bin_is_present(binary):
        try:
            subprocess.Popen(binary,
                             universal_newlines=True,
                             shell=False,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             bufsize=0)
        except OSError:
            return False
        else:
            return True

    all_bins = {
        'gifsicle': "GIF compression tool, part of `gifsicle` package",
        'pngquant': "PNG compression tool, part of `pngquant` package",
        'advdef': "PNG compression tool, part of `advancecomp` package",
        'jpegoptim': "JPEG compression tool, part of `jpegoptim` package",
        'zip': "ZIP file packager for ePub",
        'tar': "TAR archive extractor",
        'curl': "Files downloader, part of `curl` package",
        'zimwriterfs': "ZIM file writer, available on kiwix-other repository",
    }

    all_good = True
    has_zimwriter = True
    for bin_name, msg in all_bins.items():
        if bin_name == 'zimwriterfs':
            if not bin_is_present(bin_name):
                has_zimwriter = False
                continue
        if not bin_is_present(bin_name):
            logger.error("\t*{}* binary missing. {}".format(bin_name, msg))
            all_good = False

    return all_good, has_zimwriter

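# A minimal usage sketch (illustrative, mirroring how main() below consumes
# the return value): the first flag gates every run, the second one only
# matters when a ZIM file is to be written.
#
#   all_good, has_zimwriter = check_dependencies()
#   if not all_good:
#       sys.exit(1)
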
def download_all_books(url_mirror, download_cache,
                       languages=[], formats=[],
                       only_books=[], force=False):

    available_books = get_list_of_filtered_books(
        languages=languages, formats=formats, only_books=only_books)

    # ensure the cache dir exists
    path(download_cache).mkdir_p()

    for book in available_books:
        logger.info("\tDownloading content files for Book #{id}"
                    .format(id=book.id))

        # apply filters (rebind rather than mutate the caller's list)
        if not formats:
            formats = list(FORMAT_MATRIX.keys())

        # HTML is our base for ZIM, so add it if not requested
        if 'html' not in formats:
            formats = formats + ['html']

        for format in formats:
            fpath = os.path.join(download_cache, fname_for(book, format))

            # check if already downloaded
            if path(fpath).exists() and not force:
                logger.debug("\t\t{fmt} already exists at {path}"
                             .format(fmt=format, path=fpath))
                continue

            # retrieve corresponding BookFormat
            bfs = BookFormat.filter(book=book)

            if format == 'html':
                patterns = ['mnsrb10h.htm', '8ledo10h.htm', 'tycho10f.htm',
                            '8ledo10h.zip', 'salme10h.htm', '8nszr10h.htm',
                            '{id}-h.html', '{id}.html.gen', '{id}-h.htm',
                            '8regr10h.zip', '{id}.html.noimages',
                            '8lgme10h.htm', 'tycho10h.htm', 'tycho10h.zip',
                            '8lgme10h.zip', '8indn10h.zip', '8resp10h.zip',
                            '20004-h.htm', '8indn10h.htm', '8memo10h.zip',
                            'fondu10h.zip', '{id}-h.zip', '8mort10h.zip']
                bfso = bfs
                bfs = bfs.join(Format).filter(Format.pattern << patterns)
                if not bfs.count():
                    # debug dump of the candidate formats we rejected
                    from pprint import pprint as pp
                    pp([(b.format.mime, b.format.images, b.format.pattern)
                        for b in bfs])
                    pp([(b.format.mime, b.format.images, b.format.pattern)
                        for b in bfso])
                    logger.error("html not found")
                    continue
            else:
                bfs = bfs.filter(
                    BookFormat.format
                    << Format.filter(mime=FORMAT_MATRIX.get(format)))

            if not bfs.count():
                logger.debug("[{}] not available for #{}# {}"
                             .format(format, book.id, book.title))
                continue

            if bfs.count() > 1:
                # prefer the variant with images when several match
                try:
                    bf = bfs.join(Format).filter(Format.images == True).get()
                except Exception:
                    bf = bfs.get()
            else:
                bf = bfs.get()

            logger.debug("[{}] Requesting URLs for #{}# {}"
                         .format(format, book.id, book.title))

            # retrieve list of URLs for format unless we have it in DB
            if bf.downloaded_from and not force:
                urls = [bf.downloaded_from]
            else:
                urld = get_urls(book)
                urls = list(reversed(urld.get(FORMAT_MATRIX.get(format))))

            import copy
            allurls = copy.copy(urls)

            while urls:
                url = urls.pop()

                if not resource_exists(url):
                    continue

                # HTML files are *sometimes* available as ZIP files
                if url.endswith('.zip'):
                    zpath = "{}.zip".format(fpath)

                    if not download_file(url, zpath):
                        logger.error("ZIP file download failed: {}"
                                     .format(zpath))
                        continue

                    # extract zipfile
                    handle_zipped_epub(zippath=zpath, book=book,
                                       download_cache=download_cache)
                else:
                    if not download_file(url, fpath):
                        logger.error("file download failed: {}".format(fpath))
                        continue

                # store working URL in DB
                bf.downloaded_from = url
                bf.save()

            if not bf.downloaded_from:
                logger.error("NO FILE FOR #{}/{}".format(book.id, format))
                from pprint import pprint as pp
                pp(allurls)
                continue

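# For orientation: FORMAT_MATRIX, defined elsewhere in the project, maps the
# user-facing format names used above to mime types. Its exact content is not
# shown in this file; an assumed shape, for illustration only:
#
#   FORMAT_MATRIX = {'html': 'text/html',
#                    'epub': 'application/epub+zip',
#                    'pdf': 'application/pdf'}
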
def export_book_to(book, static_folder, download_cache,
                   cached_files, languages, formats, books):
    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html = html_content_for(book=book,
                            static_folder=static_folder,
                            download_cache=download_cache)
    if html:
        article_fpath = os.path.join(static_folder, article_name_for(book))
        logger.info("\t\tExporting to {}".format(article_fpath))
        try:
            new_html = update_html_for_static(book=book, html_content=html)
        except Exception:
            new_html = html
        with open(article_fpath, 'w') as f:
            f.write(new_html)

    def symlink_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tSymlinking {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).link(dst)  # hard link
        except IOError:
            logger.error("/!\\ Unable to symlink missing file {}".format(src))
            return

    def copy_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tCopying {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).copy(dst)
        except IOError:
            logger.error("/!\\ Unable to copy missing file {}".format(src))
            return

    def optimize_image(fpath):
        if path(fpath).ext == '.png':
            return optimize_png(fpath)
        if path(fpath).ext in ('.jpg', '.jpeg'):
            return optimize_jpeg(fpath)
        if path(fpath).ext == '.gif':
            return optimize_gif(fpath)
        return fpath

    def optimize_gif(fpath):
        exec_cmd('gifsicle -O3 "{path}" -o "{path}"'.format(path=fpath))

    def optimize_png(fpath):
        pngquant = 'pngquant --nofs --force --ext=".png" "{path}"'
        advdef = 'advdef -z -4 -i 5 "{path}"'
        exec_cmd(pngquant.format(path=fpath))
        exec_cmd(advdef.format(path=fpath))

    def optimize_jpeg(fpath):
        exec_cmd('jpegoptim --strip-all -m50 "{path}"'.format(path=fpath))

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB at {}".format(dst))

        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

        with zipfile.ZipFile(src, 'r') as zf:
            zipped_files = zf.namelist()
            zf.extractall(tmpd)

        remove_cover = False
        # iterate over a copy: we remove entries from zipped_files below
        for fname in zipped_files[:]:
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):
                # special case to remove ugly cover
                if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(path_for_cmd(fnp))

            if path(fname).ext in ('.htm', '.html'):
                with open(fnp, 'r') as f:
                    html = update_html_for_static(book=book,
                                                  html_content=f.read(),
                                                  epub=True)
                with open(fnp, 'w') as f:
                    f.write(html)

            if path(fname).ext == '.ncx':
                pattern = "*** START: FULL LICENSE ***"
                with open(fnp, 'r') as f:
                    ncx = f.read()
                soup = BeautifulSoup(ncx, ["lxml", "xml"])
                for tag in soup.findAll('text'):
                    if pattern in tag.text:
                        s = tag.parent.parent
                        # drop the license navPoint and everything after it
                        for sibling in list(s.next_siblings):
                            sibling.decompose()
                        s.decompose()
                        break
                with open(fnp, 'w') as f:
                    f.write(soup.encode())

        # delete {id}/cover.jpg if it exists and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(os.path.join(tmpd, str(book.id), 'cover.jpg')).unlink_p()

            soup = None
            opff = os.path.join(tmpd, str(book.id), 'content.opf')
            if os.path.exists(opff):
                with open(opff, 'r') as fd:
                    soup = BeautifulSoup(fd.read(), ["lxml", "xml"])

                for elem in soup.findAll():
                    if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                        elem.decompose()

                with open(opff, 'w') as fd:
                    fd.write(soup.encode())

        # repackage as zip: "mimetype" goes in first, stored uncompressed
        with cd(tmpd):
            exec_cmd('zip -q0X "{dst}" mimetype'
                     .format(dst=path_for_cmd(dst)))
            exec_cmd('zip -qXr9D "{dst}" {files}'
                     .format(dst=path_for_cmd(dst),
                             files=" ".join([f for f in zipped_files
                                             if f != 'mimetype'])))
        path(tmpd).rmtree_p()

    def handle_companion_file(fname, dstfname=None, book=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)

        # optimization based on mime/extension
        if path(fname).ext in ('.png', '.jpg', '.jpeg', '.gif'):
            copy_from_cache(src, dst)
            optimize_image(path_for_cmd(dst))
        elif path(fname).ext == '.epub':
            tmp_epub = tempfile.NamedTemporaryFile(suffix='.epub',
                                                   dir=TMP_FOLDER)
            tmp_epub.close()
            optimize_epub(src, tmp_epub.name)
            path(tmp_epub.name).move(dst)
        else:
            # excludes files created by Windows Explorer
            if src.endswith('_Thumbs.db'):
                return
            # copy otherwise (PDF mostly)
            logger.debug("\t\tno optimizer for extension: {}".format(dst))
            copy_from_cache(src, dst)

    # associated files (images, etc)
    for fname in [fn for fn in cached_files
                  if fn.startswith("{}_".format(book.id))]:
        if path(fname).ext in ('.html', '.htm'):
            src = os.path.join(path(download_cache).abspath(), fname)
            dst = os.path.join(path(static_folder).abspath(), fname)
            logger.info("\t\tExporting HTML file to {}".format(dst))
            html = "CAN'T READ FILE"
            with open(src, 'r') as f:
                html = f.read()
            new_html = update_html_for_static(book=book, html_content=html)
            with open(dst, 'w') as f:
                f.write(new_html)
        else:
            logger.info("\t\tCopying companion file to {}".format(fname))
            try:
                handle_companion_file(fname)
            except Exception as e:
                logger.error("\t\tException while handling companion "
                             "file: {}".format(e))

    # other formats
    for format in formats:
        if format not in book.formats() or format == 'html':
            continue
        logger.info("\t\tCopying format file to {}"
                    .format(archive_name_for(book, format)))
        try:
            handle_companion_file(fname_for(book, format),
                                  archive_name_for(book, format))
        except Exception as e:
            logger.error("\t\tException while handling companion "
                         "file: {}".format(e))

    # book presentation article
    cover_fpath = os.path.join(static_folder,
                               article_name_for(book=book, cover=True))
    logger.info("\t\tExporting to {}".format(cover_fpath))
    html = cover_html_content_for(book=book, static_folder=static_folder,
                                  books=books)
    with open(cover_fpath, 'w') as f:
        f.write(html.encode('utf-8'))

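# Hedged aside: the two `zip` invocations in optimize_epub above follow the
# ePub (OCF) packaging rule that "mimetype" must be the archive's first
# entry, stored uncompressed (-0) and without extra fields (-X). A minimal
# self-check of that invariant; the helper name is ours, not the project's:
def epub_mimetype_ok(epub_fpath):
    """ whether epub_fpath starts with a stored, well-formed mimetype entry """
    with zipfile.ZipFile(epub_fpath, 'r') as zf:
        first = zf.infolist()[0]
        return (first.filename == 'mimetype'
                and first.compress_type == zipfile.ZIP_STORED
                and zf.read('mimetype').strip() == b'application/epub+zip')
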
def main(arguments):

    # action flags
    DO_PREPARE = arguments.get('--prepare', False)
    DO_PARSE = arguments.get('--parse', False)
    DO_DOWNLOAD = arguments.get('--download', False)
    DO_EXPORT = arguments.get('--export', False)
    DO_ZIM = arguments.get('--zim', False)
    DO_CHECKDEPS = arguments.get('--check', False)
    COMPLETE_DUMP = arguments.get('--complete', False)

    URL_MIRROR = arguments.get('--mirror') \
        or 'http://zimfarm.kiwix.org/gutenberg'
    RDF_FOLDER = arguments.get('--rdf-folder') or os.path.join('rdf-files')
    STATIC_FOLDER = arguments.get('--static-folder') or os.path.join('static')
    ZIM_FILE = arguments.get('--zim-file')
    WIPE_DB = not arguments.get('--keep-db')
    RDF_URL = arguments.get('--rdf-url') \
        or 'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
    DL_CACHE = arguments.get('--dl-folder') or os.path.join('dl-cache')
    BOOKS = arguments.get('--books') or ''
    ZTITLE = arguments.get('--zim-title')
    ZDESC = arguments.get('--zim-desc')

    # create tmp dir
    path('tmp').mkdir_p()

    LANGUAGES = [x.strip().lower()
                 for x in (arguments.get('--languages') or '').split(',')
                 if x.strip()]

    # special shortcut for "all" formats
    if arguments.get('--formats') in ['all', None]:
        FORMATS = ['epub', 'pdf']
    else:
        FORMATS = [x.strip().lower()
                   for x in (arguments.get('--formats') or '').split(',')
                   if x.strip()]

    try:
        # expand "1-10,30"-style ranges into a flat list of book ids
        def to_id_list(part):
            return [int(i) for i in part.split('-') if i.isdigit()]

        books = []
        for part in BOOKS.split(','):
            blst = to_id_list(part)
            if len(blst) > 1:
                blst = range(blst[0], blst[1] + 1)
            books.extend(blst)
        BOOKS = list(set(books))
    except Exception as e:
        logger.error(e)
        BOOKS = []

    # no action requested: default to --complete
    if not (DO_PREPARE + DO_PARSE + DO_DOWNLOAD + DO_EXPORT + DO_ZIM):
        COMPLETE_DUMP = True

    if COMPLETE_DUMP:
        DO_CHECKDEPS = DO_PREPARE = DO_PARSE = \
            DO_DOWNLOAD = DO_EXPORT = DO_ZIM = True

    if DO_CHECKDEPS:
        logger.info("CHECKING for dependencies on the system")
        if not check_dependencies()[0]:
            logger.error("Exiting...")
            sys.exit(1)

    if DO_PREPARE:
        logger.info("PREPARING rdf-files cache from {}".format(RDF_URL))
        setup_rdf_folder(rdf_url=RDF_URL, rdf_path=RDF_FOLDER)

    if DO_PARSE:
        logger.info("PARSING rdf-files in {}".format(RDF_FOLDER))
        setup_database(wipe=WIPE_DB)
        parse_and_fill(rdf_path=RDF_FOLDER, only_books=BOOKS)

    if DO_DOWNLOAD:
        logger.info("DOWNLOADING ebooks from mirror using filters")
        download_all_books(url_mirror=URL_MIRROR,
                           download_cache=DL_CACHE,
                           languages=LANGUAGES,
                           formats=FORMATS,
                           only_books=BOOKS)

    if DO_EXPORT:
        logger.info("EXPORTING ebooks to static folder (and JSON)")
        export_all_books(static_folder=STATIC_FOLDER,
                         download_cache=DL_CACHE,
                         languages=LANGUAGES,
                         formats=FORMATS,
                         only_books=BOOKS)

    if DO_ZIM:
        if not check_dependencies()[1]:
            logger.error("You don't have zimwriterfs installed.")
            sys.exit(1)
        logger.info("BUILDING ZIM off static folder {}".format(STATIC_FOLDER))
        build_zimfile(static_folder=STATIC_FOLDER,
                      zim_path=ZIM_FILE,
                      languages=LANGUAGES,
                      formats=FORMATS,
                      only_books=BOOKS,
                      title=ZTITLE,
                      description=ZDESC)

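# Illustration of the --books parsing above (hypothetical input; note that
# list(set(...)) makes no ordering promise):
#
#   --books "1-3,9"  ->  BOOKS == [1, 2, 3, 9]
#   --books "120"    ->  BOOKS == [120]
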
def build_zimfile(static_folder, zim_path=None,
                  languages=[], formats=[],
                  title=None, description=None,
                  only_books=[]):

    # sorted copies, so the caller's lists are left untouched
    languages = sorted(languages) if languages else ['mul']
    formats = sorted(formats)

    if title is None:
        if len(languages) > 5:
            title = ("Project Gutenberg Library with {formats}"
                     .format(formats=",".join(formats)))
        else:
            title = ("Project Gutenberg Library ({langs}) with {formats}"
                     .format(langs=",".join(languages),
                             formats=",".join(formats)))

    logger.info("\tWriting ZIM for {}".format(title))

    if description is None:
        description = "The first producer of free ebooks"

    if zim_path is None:
        if len(languages) > 1:
            zim_path = "gutenberg_all_{date}.zim".format(
                date=datetime.datetime.now().strftime('%m_%Y'))
        else:
            zim_path = "gutenberg_{lang}_all_{date}.zim".format(
                lang=languages[0],
                date=datetime.datetime.now().strftime('%Y-%m'))

    languages = sorted([ISO_MATRIX.get(lang, lang) for lang in languages])

    context = {
        'languages': ','.join(languages),
        'title': title,
        'description': description,
        'creator': 'gutenberg.org',
        'publisher': 'Kiwix',

        'home': 'Home.html',
        'favicon': 'favicon.png',

        'static': static_folder,
        'zim': zim_path
    }

    cmd = ('zimwriterfs --welcome=\\"{home}\\" --favicon=\\"{favicon}\\" '
           '--language=\\"{languages}\\" --title=\\"{title}\\" '
           '--description=\\"{description}\\" '
           '--creator=\\"{creator}\\" --publisher=\\"{publisher}\\" '
           '\\"{static}\\" \\"{zim}\\"'.format(**context))
    logger.debug("\t\t{}".format(re.sub(r'\\"', '"', cmd)))

    if exec_cmd(cmd):
        logger.info("Successfully created ZIM file at {}".format(zim_path))
    else:
        logger.error("Unable to create ZIM file :(")

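# For reference, build_zimfile assembles a command of the following shape
# (escaped quotes unescaped for readability; values hypothetical):
#
#   zimwriterfs --welcome="Home.html" --favicon="favicon.png" \
#       --language="eng" --title="Project Gutenberg Library (en) with epub" \
#       --description="The first producer of free ebooks" \
#       --creator="gutenberg.org" --publisher="Kiwix" \
#       "static" "gutenberg_en_all_2014-08.zim"
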
def save_rdf_in_database(parser):

    # insert author, or update it if it already exists
    if parser.author_id:
        try:
            author_record = Author.get(gut_id=parser.author_id)
            if parser.last_name:
                author_record.last_name = parser.last_name
            if parser.first_name:
                author_record.first_names = parser.first_name
            if parser.birth_year:
                author_record.birth_year = parser.birth_year
            if parser.death_year:
                author_record.death_year = parser.death_year
            author_record.save()
        except Exception:
            author_record = Author.create(
                gut_id=parser.author_id,
                last_name=parser.last_name,
                first_names=parser.first_name,
                birth_year=parser.birth_year,
                death_year=parser.death_year)
    else:
        # no author: use the pre-existing Anonymous record
        author_record = Author.get(gut_id='216')

    # get license
    try:
        license_record = License.get(name=parser.license)
    except Exception:
        license_record = None

    # insert book
    book_record = Book.create(
        id=parser.gid,
        title=parser.title.strip(),
        subtitle=parser.subtitle.strip(),
        author=author_record,  # foreign key
        license=license_record,  # foreign key
        language=parser.language.strip(),
        downloads=parser.downloads)

    # insert formats
    for file_type in parser.file_types:

        # sanitize the mime type (drop charset except for text/plain)
        mime = parser.file_types[file_type]
        if not mime.startswith('text/plain'):
            mime = re.sub(r'; charset=[a-z0-9-]+', '', mime)

        # turn the book-specific file name into a generic pattern
        pattern = re.sub(re.escape(parser.gid), '{id}', file_type)
        pattern = pattern.split('/')[-1]

        bid = int(book_record.id)
        if bid in BAD_BOOKS_FORMATS \
                and mime in [FORMAT_MATRIX.get(f)
                             for f in BAD_BOOKS_FORMATS.get(bid)]:
            logger.error("\t**** EXCLUDING **** {} for book #{} from list."
                         .format(mime, bid))
            continue

        format_record = Format.get_or_create(
            mime=mime,
            images=(file_type.endswith('.images')
                    or parser.file_types[file_type] == 'application/pdf'),
            pattern=pattern)

        # insert book format
        BookFormat.create(
            book=book_record,  # foreign key
            format=format_record)  # foreign key

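# Illustration of the pattern derivation above (hypothetical URI): with
# parser.gid == '2600' and a file_type ending in '2600/2600-h.zip',
#
#   re.sub(re.escape('2600'), '{id}',
#          'http://www.gutenberg.org/files/2600/2600-h.zip').split('/')[-1]
#   == '{id}-h.zip'
#
# which is exactly the form stored in Format.pattern and matched against the
# patterns list in download_all_books().
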