Example #1
def build_zimfile(static_folder, zim_path=None,
                  languages=[], formats=[],
                  title=None, description=None,
                  only_books=[],
                  create_index=True, force=False):

    # revert HTML/JS/CSS to zim-compatible versions
    export_skeleton(static_folder=static_folder, dev_mode=False,
                    languages=languages, formats=formats,
                    only_books=only_books)

    if not languages:
        languages = ['mul']

    languages.sort()
    formats.sort()

    if title is None:
        if len(languages) > 5:
            title = ("Project Gutenberg Library with {formats}"
                     .format(formats=",".join(formats)))
        else:
            title = ("Project Gutenberg Library ({langs}) with {formats}"
                     .format(langs=",".join(languages),
                             formats=",".join(formats)))

    logger.info("\tWritting ZIM for {}".format(title))

    if description is None:
        description = "The first producer of free ebooks"

    project_id = get_project_id(languages, formats, only_books)

    if zim_path is None:
        zim_path = "{}.zim".format(project_id)

    if path(zim_path).exists() and not force:
        logger.info("ZIM file `{}` already exist.".format(zim_path))
        return

    languages = [ISO_MATRIX.get(lang, lang) for lang in languages]
    languages.sort()

    cmd = ['zimwriterfs',
           '--welcome', "Home.html",
           '--favicon', "favicon.png",
           '--language', ','.join(languages),
           '--name', project_id,
           '--title', title,
           '--description', description,
           '--creator', "gutenberg.org",
           '--publisher', "Kiwix",
           static_folder, zim_path]

    if create_index:
        cmd.insert(1, '--withFullTextIndex')
    if exec_cmd(cmd) == 0:
        logger.info("Successfuly created ZIM file at {}".format(zim_path))
    else:
        logger.error("Unable to create ZIM file :(")
Example #2
def copy_file(src, dst):
    logger.info("\t\tCopying {}".format(dst))
    try:
        shutil.copy2(src, dst)
    except IOError:
        logger.error("/!\\ Unable to copy missing file {}".format(src))
        return
Example #3
def download_file(url, fname):
    fname.parent.mkdir(parents=True, exist_ok=True)
    try:
        save_large_file(url, fname)
        return True
    except Exception as exc:
        logger.error(f"Error while downloading from {url}: {exc}")
        return False
Example #4
def get_etag_from_url(url):
    try:
        response_headers = requests.head(url=url, allow_redirects=True).headers
    except Exception as e:
        logger.error(url + " > Problem while head request\n" + str(e) + "\n")
        return None
    else:
        return response_headers.get("Etag", None)
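
A short usage sketch (the URL is illustrative). In the download code of Example #13 the returned value is stored on the book record (html_etag / epub_etag) and reused as a cache key:

etag = get_etag_from_url("https://www.gutenberg.org/files/1342/1342-h.zip")
if etag is None:
    logger.warning("No ETag header; cache lookups for this file will miss")
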
Example #5
def download_file(url, fpath):
    fpath.parent.mkdir(parents=True, exist_ok=True)
    try:
        save_large_file(url, fpath)
        return True
    except Exception as exc:
        logger.error(f"Error while downloading from {url}: {exc}")
        if fpath.exists():
            os.unlink(fpath)
        return False
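
A usage sketch assuming pathlib paths, as the signature expects (URL and path are illustrative):

import pathlib

ok = download_file("https://www.gutenberg.org/ebooks/1342.epub",
                   pathlib.Path("dl-cache/1342/unoptimized/1342.epub"))
if not ok:
    logger.error("Giving up on this URL")
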
Example #6
def copy_from_cache(fname, dstfname=None):
    src = os.path.join(path(download_cache).abspath(), fname)
    if dstfname is None:
        dstfname = fname
    dst = os.path.join(path(static_folder).abspath(), dstfname)
    logger.info("\t\tCopying {}".format(dst))
    path(dst).unlink_p()
    try:
        path(src).copy(dst)
    except IOError:
        logger.error("/!\\ Unable to copy missing file {}".format(src))
        return
Example #7
def symlink_from_cache(fname, dstfname=None):
    src = os.path.join(path(download_cache).abspath(), fname)
    if dstfname is None:
        dstfname = fname
    dst = os.path.join(path(static_folder).abspath(), dstfname)
    logger.info("\t\tSymlinking {}".format(dst))
    path(dst).unlink_p()
    try:
        path(src).link(dst)  # hard link
    except IOError:
        logger.error("/!\\ Unable to symlink missing file {}".format(src))
        return
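
Despite its name, the function above creates a hard link: path(src).link(dst) wraps os.link. A standard-library sketch of the distinction (file names are illustrative):

import os

os.link("cache/1342.epub", "static/hard.epub")     # hard link: a second name for the same inode
os.symlink("cache/1342.epub", "static/soft.epub")  # symbolic link: a pointer to the source path
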
Example #8
def html_content_for(book, src_dir):

    html_fpath = src_dir.joinpath(fname_for(book, "html"))

    # is HTML file present?
    if not html_fpath.exists():
        logger.warn("Missing HTML content for #{} at {}".format(book.id, html_fpath))
        return None, None

    try:
        return read_file(html_fpath)
    except UnicodeDecodeError:
        logger.error("Unable to read HTML content: {}".format(html_fpath))
        raise
Example #9
def html_content_for(book, static_folder, download_cache):

    html_fpath = os.path.join(download_cache, fname_for(book, 'html'))

    # is HTML file present?
    if not path(html_fpath).exists():
        logger.warn("Missing HTML content for #{} at {}"
                    .format(book.id, html_fpath))
        return None, None

    try:
        return read_file(html_fpath)
    except UnicodeDecodeError:
        logger.error("Unable to read HTML content: {}".format(html_fpath))
        raise
Example #10
def check_dependencies():
    def bin_is_present(binary):
        try:
            subprocess.Popen(
                binary,
                universal_newlines=True,
                shell=False,
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                bufsize=0,
            )
        except OSError:
            return False
        else:
            return True

    all_bins = {
        "gifsicle": "GIF compression tool, part of `gifsicle` package",
        "pngquant": "PNG compression tool, part of `pngquant` package",
        "advdef": "PNG compression tool, part of `advancecomp` package",
        "jpegoptim": "JPEG compression tool, part of `jpegoptim` package",
        "zip": "ZIP file packager for ePub",
        "tar": "TAR archive extractor",
        "curl": "Files downloader, part of `curl` package",
        "zimwriterfs": "ZIM file writer, available on kiwix-other repository",
    }

    all_good = True
    has_zimwriter = True
    for bin, msg in all_bins.items():
        if bin == "zimwriterfs":
            if not bin_is_present(bin):
                has_zimwriter = False
                continue

        if not bin_is_present(bin):
            logger.error("\t*{}* binary missing. {}".format(bin, msg))
            all_good = False

    return all_good, has_zimwriter
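
A sketch of how a caller might consume the two returned flags (the exit handling is hypothetical, not from the source):

import sys

all_good, has_zimwriter = check_dependencies()
if not all_good:
    sys.exit("Missing mandatory binaries; see errors above.")
if not has_zimwriter:
    logger.warning("zimwriterfs missing: ZIM creation will be skipped")
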
Example #11
def save_rdf_in_database(parser):

    # Insert author if it does not exist
    if parser.author_id:
        try:
            author_record = Author.get(gut_id=parser.author_id)
        except Exception:
            try:
                author_record = Author.create(
                    gut_id=parser.author_id,
                    last_name=normalize(parser.last_name),
                    first_names=normalize(parser.first_name),
                    birth_year=parser.birth_year,
                    death_year=parser.death_year,
                )
            # concurrent workers might collide here so we retry once on IntegrityError
            except peewee.IntegrityError:
                author_record = Author.get(gut_id=parser.author_id)
        else:
            if parser.last_name:
                author_record.last_name = normalize(parser.last_name)
            if parser.first_name:
                author_record.first_names = normalize(parser.first_name)
            if parser.birth_year:
                author_record.birth_year = parser.birth_year
            if parser.death_year:
                author_record.death_year = parser.death_year
            author_record.save()
    else:
        # No author, set Anonymous
        author_record = Author.get(gut_id="216")

    # Get license
    try:
        license_record = License.get(name=parser.license)
    except Exception:
        license_record = None

    # Insert book

    try:
        book_record = Book.get(id=parser.gid)
    except Book.DoesNotExist:
        book_record = Book.create(
            id=parser.gid,
            title=normalize(parser.title.strip()),
            subtitle=normalize(parser.subtitle.strip()),
            author=author_record,  # foreign key
            license=license_record,  # foreign key
            language=parser.language.strip(),
            downloads=parser.downloads,
            bookshelf=parser.bookshelf,
            cover_page=parser.cover_image,
        )
    else:
        book_record.title = normalize(parser.title.strip())
        book_record.subtitle = normalize(parser.subtitle.strip())
        book_record.author = author_record  # foreign key
        book_record.license = license_record  # foreign key
        book_record.language = parser.language.strip()
        book_record.downloads = parser.downloads
        book_record.save()

    # insert pdf into parser.file_types if not already present;
    # needed because PDF presence on the server and in the RDF is inconsistent
    if not [
            key for key in parser.file_types
            if parser.file_types[key].startswith("application/pdf")
    ]:
        parser.file_types.update({"{id}-pdf.pdf": "application/pdf"})

    # Insert formats
    for file_type in parser.file_types:

        # Sanitize MIME
        mime = parser.file_types[file_type]
        if not mime.startswith("text/plain"):
            mime = re.sub(r"; charset=[a-z0-9-]+", "", mime)
        # else:
        #    charset = re.match(r'; charset=([a-z0-9-]+)', mime).groups()[0]

        # Insert format type
        pattern = re.sub(r"" + parser.gid, "{id}", file_type)
        pattern = pattern.split("/")[-1]

        bid = int(book_record.id)

        if bid in BAD_BOOKS_FORMATS.keys() and mime in [
                FORMAT_MATRIX.get(f) for f in BAD_BOOKS_FORMATS.get(bid)
        ]:
            logger.error(
                "\t**** EXCLUDING **** {} for book #{} from list.".format(
                    mime, bid))
            continue

        format_record, _ = Format.get_or_create(
            mime=mime,
            images=file_type.endswith(".images")
            or parser.file_types[file_type] == "application/pdf",
            pattern=pattern,
        )

        # Insert book format
        BookFormat.get_or_create(
            book=book_record,
            format=format_record  # foreign key
        )
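
One detail worth noting: peewee's get_or_create returns an (instance, created) tuple, which is why the second element is discarded above. A minimal sketch (field values are illustrative):

format_record, created = Format.get_or_create(
    mime="application/epub+zip",
    images=False,
    pattern="{id}.epub",
)
if created:
    logger.debug("Registered a new format")
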
Example #12
def build_zimfile(static_folder,
                  zim_path=None,
                  languages=[],
                  formats=[],
                  title=None,
                  description=None,
                  only_books=[],
                  create_index=True,
                  force=False):

    # revert HTML/JS/CSS to zim-compatible versions
    export_skeleton(static_folder=static_folder,
                    dev_mode=False,
                    languages=languages,
                    formats=formats,
                    only_books=only_books)

    if not languages:
        languages = ['mul']

    languages.sort()
    formats.sort()

    if title is None:
        if len(languages) > 5:
            title = ("Project Gutenberg Library with {formats}".format(
                formats=",".join(formats)))
        else:
            title = (
                "Project Gutenberg Library ({langs}) with {formats}".format(
                    langs=",".join(languages), formats=",".join(formats)))

    logger.info("\tWritting ZIM for {}".format(title))

    if description is None:
        description = "The first producer of free ebooks"

    project_id = get_project_id(languages, formats, only_books)

    if zim_path is None:
        zim_path = "{}.zim".format(project_id)

    if path(zim_path).exists() and not force:
        logger.info("ZIM file `{}` already exist.".format(zim_path))
        return

    languages = [ISO_MATRIX.get(lang, lang) for lang in languages]
    languages.sort()

    cmd = [
        'zimwriterfs', '--welcome', "Home.html", '--favicon', "favicon.png",
        '--language', ','.join(languages), '--name', project_id, '--title',
        title, '--description', description, '--creator', "gutenberg.org",
        '--publisher', "Kiwix", static_folder, zim_path
    ]

    if create_index:
        cmd.insert(1, '--withFullTextIndex')
    if exec_cmd(cmd) == 0:
        logger.info("Successfuly created ZIM file at {}".format(zim_path))
    else:
        logger.error("Unable to create ZIM file :(")
Example #13
def download_book(book, download_cache, languages, formats, force, s3_storage,
                  optimizer_version):
    logger.info(
        "\tDownloading content files for Book #{id}".format(id=book.id))

    # apply filters
    if not formats:
        formats = FORMAT_MATRIX.keys()

    # HTML is our base for the ZIM, so add it if not present
    if "html" not in formats:
        formats.append("html")

    book_dir = pathlib.Path(download_cache).joinpath(str(book.id))
    optimized_dir = book_dir.joinpath("optimized")
    unoptimized_dir = book_dir.joinpath("unoptimized")
    unsuccessful_formats = []
    for book_format in formats:

        unoptimized_fpath = unoptimized_dir.joinpath(
            fname_for(book, book_format))
        optimized_fpath = optimized_dir.joinpath(
            archive_name_for(book, book_format))

        # check if already downloaded
        if (unoptimized_fpath.exists()
                or optimized_fpath.exists()) and not force:
            logger.debug(
                f"\t\t{book_format} already exists for book #{book.id}")
            continue

        if force:
            if book_format == "html":
                for fpath in book_dir.iterdir():
                    if fpath.is_file() and fpath.suffix not in [
                            ".pdf", ".epub"
                    ]:
                        fpath.unlink()
            else:
                if unoptimized_fpath.exists():
                    unoptimized_fpath.unlink()
                if optimized_fpath.exists():
                    optimized_fpath.unlink()
            # delete dirs which are empty
            for dir_name in [optimized_dir, unoptimized_dir]:
                if not dir_name.exists():
                    continue
                if not list(dir_name.iterdir()):
                    dir_name.rmdir()

        # retrieve corresponding BookFormat
        bfs = BookFormat.filter(book=book)

        if book_format == "html":
            patterns = [
                "mnsrb10h.htm",
                "8ledo10h.htm",
                "tycho10f.htm",
                "8ledo10h.zip",
                "salme10h.htm",
                "8nszr10h.htm",
                "{id}-h.html",
                "{id}.html.gen",
                "{id}-h.htm",
                "8regr10h.zip",
                "{id}.html.noimages",
                "8lgme10h.htm",
                "tycho10h.htm",
                "tycho10h.zip",
                "8lgme10h.zip",
                "8indn10h.zip",
                "8resp10h.zip",
                "20004-h.htm",
                "8indn10h.htm",
                "8memo10h.zip",
                "fondu10h.zip",
                "{id}-h.zip",
                "8mort10h.zip",
            ]
            bfso = bfs
            bfs = bfs.join(Format).filter(Format.pattern << patterns)
            if not bfs.count():
                pp(
                    list([(b.format.mime, b.format.images, b.format.pattern)
                          for b in bfs]))
                pp(
                    list([(b.format.mime, b.format.images, b.format.pattern)
                          for b in bfso]))
                logger.error("html not found")
                unsuccessful_formats.append(book_format)
                continue
        else:
            bfs = bfs.filter(BookFormat.format << Format.filter(
                mime=FORMAT_MATRIX.get(book_format)))

        if not bfs.count():
            logger.debug("[{}] not avail. for #{}# {}".format(
                book_format, book.id, book.title))
            unsuccessful_formats.append(book_format)
            continue

        if bfs.count() > 1:
            try:
                bf = bfs.join(Format).filter(Format.images).get()
            except Exception:
                bf = bfs.get()
        else:
            bf = bfs.get()

        logger.debug("[{}] Requesting URLs for #{}# {}".format(
            book_format, book.id, book.title))

        # retrieve list of URLs for format unless we have it in DB
        if bf.downloaded_from and not force:
            urls = [bf.downloaded_from]
        else:
            urld = get_urls(book)
            urls = list(reversed(urld.get(FORMAT_MATRIX.get(book_format))))

        import copy

        allurls = copy.copy(urls)
        downloaded_from_cache = False

        while urls:
            url = urls.pop()

            # for development
            # if len(allurls) != 1:
            #     if not resource_exists(url):
            #         continue

            # HTML files are *sometimes* available as ZIP files
            if url.endswith(".zip"):
                zpath = unoptimized_dir.joinpath(
                    f"{fname_for(book, book_format)}.zip")

                etag = get_etag_from_url(url)
                if s3_storage:
                    if download_from_cache(
                            book=book,
                            etag=etag,
                            book_format=book_format,
                            dest_dir=optimized_dir,
                            s3_storage=s3_storage,
                            optimizer_version=optimizer_version,
                    ):
                        downloaded_from_cache = True
                        break
                if not download_file(url, zpath):
                    logger.error("ZIP file donwload failed: {}".format(zpath))
                    continue
                # save etag
                book.html_etag = etag
                book.save()
                # extract zipfile
                handle_zipped_epub(zippath=zpath,
                                   book=book,
                                   dst_dir=unoptimized_dir)
            else:
                if (url.endswith(".htm") or url.endswith(".html")
                        or url.endswith(".html.utf8")
                        or url.endswith(".epub")):
                    etag = get_etag_from_url(url)
                    if s3_storage:
                        logger.info(
                            f"Trying to download {book.id} from optimization cache"
                        )
                        if download_from_cache(
                                book=book,
                                etag=etag,
                                book_format=book_format,
                                dest_dir=optimized_dir,
                                s3_storage=s3_storage,
                                optimizer_version=optimizer_version,
                        ):
                            downloaded_from_cache = True
                            break
                if not download_file(url, unoptimized_fpath):
                    logger.error(
                        "file donwload failed: {}".format(unoptimized_fpath))
                    continue
                # save etag if html or epub if download is successful
                if (url.endswith(".htm") or url.endswith(".html")
                        or url.endswith(".html.utf8")):
                    logger.debug(f"Saving html ETag for {book.id}")
                    book.html_etag = etag
                    book.save()
                elif url.endswith(".epub"):
                    logger.debug(f"Saving epub ETag for {book.id}")
                    book.epub_etag = etag
                    book.save()

            # store working URL in DB
            bf.downloaded_from = url
            bf.save()
            # break as we got a working URL
            break

        if not bf.downloaded_from and not downloaded_from_cache:
            logger.error("NO FILE FOR #{}/{}".format(book.id, book_format))
            # delete instance from DB if download failed
            logger.info("Deleting instance from DB")
            bf.delete_instance()
            unsuccessful_formats.append(book_format)
            pp(allurls)

    # delete book from DB if not downloaded in any format
    if len(unsuccessful_formats) == len(formats):
        logger.debug(
            f"Book #{book.id} could not be downloaded in any format. Deleting from DB ..."
        )
        book.delete_instance()
        if book_dir.exists():
            shutil.rmtree(book_dir, ignore_errors=True)
        return
    download_cover(book, book_dir, s3_storage, optimizer_version)
Example #14
def download_book(book, download_cache, languages, formats, force):
    logger.info(
        "\tDownloading content files for Book #{id}".format(id=book.id))

    # apply filters
    if not formats:
        formats = FORMAT_MATRIX.keys()

    # HTML is our base for the ZIM, so add it if not present
    if "html" not in formats:
        formats.append("html")

    for format in formats:

        fpath = os.path.join(download_cache, fname_for(book, format))

        # check if already downloaded
        if path(fpath).exists() and not force:
            logger.debug("\t\t{fmt} already exists at {path}".format(
                fmt=format, path=fpath))
            continue

        # retrieve corresponding BookFormat
        bfs = BookFormat.filter(book=book)

        if format == "html":
            patterns = [
                "mnsrb10h.htm",
                "8ledo10h.htm",
                "tycho10f.htm",
                "8ledo10h.zip",
                "salme10h.htm",
                "8nszr10h.htm",
                "{id}-h.html",
                "{id}.html.gen",
                "{id}-h.htm",
                "8regr10h.zip",
                "{id}.html.noimages",
                "8lgme10h.htm",
                "tycho10h.htm",
                "tycho10h.zip",
                "8lgme10h.zip",
                "8indn10h.zip",
                "8resp10h.zip",
                "20004-h.htm",
                "8indn10h.htm",
                "8memo10h.zip",
                "fondu10h.zip",
                "{id}-h.zip",
                "8mort10h.zip",
            ]
            bfso = bfs
            bfs = bfs.join(Format).filter(Format.pattern << patterns)
            if not bfs.count():
                pp(
                    list([(b.format.mime, b.format.images, b.format.pattern)
                          for b in bfs]))
                pp(
                    list([(b.format.mime, b.format.images, b.format.pattern)
                          for b in bfso]))
                logger.error("html not found")
                continue
        else:
            bfs = bfs.filter(BookFormat.format << Format.filter(
                mime=FORMAT_MATRIX.get(format)))

        if not bfs.count():
            logger.debug("[{}] not avail. for #{}# {}".format(
                format, book.id, book.title).encode("utf-8"))
            continue

        if bfs.count() > 1:
            try:
                bf = bfs.join(Format).filter(Format.images).get()
            except Exception:
                bf = bfs.get()
        else:
            bf = bfs.get()

        logger.debug("[{}] Requesting URLs for #{}# {}".format(
            format, book.id, book.title).encode("utf-8"))

        # retrieve list of URLs for format unless we have it in DB
        if bf.downloaded_from and not force:
            urls = [bf.downloaded_from]
        else:
            urld = get_urls(book)
            urls = list(reversed(urld.get(FORMAT_MATRIX.get(format))))

        import copy

        allurls = copy.copy(urls)

        while urls:
            url = urls.pop()

            if len(allurls) != 1:
                if not resource_exists(url):
                    continue

            # HTML files are *sometimes* available as ZIP files
            if url.endswith(".zip"):
                zpath = "{}.zip".format(fpath)

                if not download_file(url, zpath):
                    logger.error("ZIP file donwload failed: {}".format(zpath))
                    continue

                # extract zipfile
                handle_zipped_epub(zippath=zpath,
                                   book=book,
                                   download_cache=download_cache)
            else:
                if not download_file(url, fpath):
                    logger.error("file donwload failed: {}".format(fpath))
                    continue

            # store working URL in DB
            bf.downloaded_from = url
            bf.save()
            # stop at the first working URL
            break

        if not bf.downloaded_from:
            logger.error("NO FILE FOR #{}/{}".format(book.id, format))
            pp(allurls)
            continue
Example #15
def build_zimfile(
    static_folder,
    output_folder,
    zim_name=None,
    languages=[],
    formats=[],
    title=None,
    description=None,
    only_books=[],
    create_index=True,
    force=False,
    title_search=False,
    add_bookshelves=False,
):

    # revert HTML/JS/CSS to zim-compatible versions
    export_skeleton(
        static_folder=static_folder,
        dev_mode=False,
        languages=languages,
        formats=formats,
        only_books=only_books,
        title_search=title_search,
        add_bookshelves=add_bookshelves,
    )

    if not languages:
        languages = ["mul"]

    languages.sort()
    formats.sort()

    if title is None:
        if len(languages) > 5:
            title = "Project Gutenberg Library"
        else:
            title = "Project Gutenberg Library ({langs})".format(
                langs=",".join(languages)
            )

        if len(formats) < len(FORMAT_MATRIX):
            title += " with {formats}".format(formats=",".join(formats))

    logger.info("\tWritting ZIM for {}".format(title))

    if description is None:
        description = "The first producer of free ebooks"

    project_id = get_project_id(languages, formats, only_books)

    if zim_name is None:
        zim_name = "{}.zim".format(project_id)
    zim_path = output_folder.joinpath(zim_name)

    if zim_path.exists() and not force:
        logger.info("ZIM file `{}` already exists.".format(zim_path))
        return

    languages = [ISO_MATRIX.get(lang, lang) for lang in languages]
    languages.sort()

    cmd = [
        "zimwriterfs",
        "--welcome",
        "Home.html",
        "--favicon",
        "favicon.png",
        "--language",
        ",".join(languages),
        "--name",
        project_id,
        "--title",
        title,
        "--description",
        description,
        "--creator",
        "gutenberg.org",
        "--tags",
        "gutenberg",
        "--publisher",
        "Kiwix",
        "--scraper",
        "gutengergtozim-{v}".format(v=VERSION),
        static_folder,
        six.text_type(zim_path),
    ]

    if not create_index:
        cmd.insert(1, "--withoutFTIndex")
    if exec_cmd(cmd) == 0:
        logger.info("Successfuly created ZIM file at {}".format(zim_path))
    else:
        logger.error("Unable to create ZIM file :(")
Example #16
def handle_unoptimized_files(
    book,
    static_folder,
    src_dir,
    languages,
    formats,
    books,
    project_id,
    optimizer_version,
    force=False,
    title_search=False,
    add_bookshelves=False,
    s3_storage=None,
):
    def copy_file(src, dst):
        logger.info("\t\tCopying {}".format(dst))
        try:
            shutil.copy2(src, dst)
        except IOError:
            logger.error("/!\\ Unable to copy missing file {}".format(src))
            return

    def update_download_cache(unoptimized_file, optimized_file):
        book_dir = unoptimized_file.parents[1]
        optimized_dir = book_dir.joinpath("optimized")
        unoptimized_dir = book_dir.joinpath("unoptimized")
        if not optimized_dir.exists():
            optimized_dir.mkdir()
        dst = optimized_dir.joinpath(optimized_file.name)
        os.unlink(unoptimized_file)
        copy_file(optimized_file.resolve(), dst.resolve())
        if not [fpath for fpath in unoptimized_dir.iterdir()]:
            unoptimized_dir.rmdir()

    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html, _ = html_content_for(book=book, src_dir=src_dir)
    html_book_optimized_files = []
    if html:
        article_fpath = static_folder.joinpath(article_name_for(book))
        if not article_fpath.exists() or force:
            logger.info("\t\tExporting to {}".format(article_fpath))
            try:
                new_html = update_html_for_static(book=book, html_content=html)
            except Exception:
                raise
            save_bs_output(new_html, article_fpath, UTF8)
            html_book_optimized_files.append(article_fpath)
            update_download_cache(
                src_dir.joinpath(fname_for(book, "html")), article_fpath
            )
            if not src_dir.exists():
                return
        else:
            logger.info("\t\tSkipping HTML article {}".format(article_fpath))

    def optimize_image(src, dst, force=False):
        if dst.exists() and not force:
            logger.info("\tSkipping image optimization for {}".format(dst))
            return dst
        logger.info("\tOptimizing image {}".format(dst))
        if src.suffix == ".png":
            return optimize_png(str(src.resolve()), str(dst.resolve()))
        if src.suffix in (".jpg", ".jpeg"):
            return optimize_jpeg(str(src.resolve()), str(dst.resolve()))
        if src.suffix == ".gif":
            return optimize_gif(str(src.resolve()), str(dst.resolve()))
        return dst

    def optimize_gif(src, dst):
        exec_cmd(["gifsicle", "-O3", src, "-o", dst])

    def optimize_png(src, dst):
        exec_cmd(["pngquant", "--nofs", "--force", "--output", dst, src])
        exec_cmd(["advdef", "-z", "-4", "-i", "5", dst])

    def optimize_jpeg(src, dst):
        if src != dst:
            copy_file(src, dst)
        exec_cmd(["jpegoptim", "--strip-all", "-m50", dst])

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

        try:
            with zipfile.ZipFile(src, "r") as zf:
                zipped_files = zf.namelist()
                zf.extractall(tmpd)
        except zipfile.BadZipFile as exc:
            shutil.rmtree(tmpd)
            raise exc

        remove_cover = False
        for fname in zipped_files:
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in (".png", ".jpeg", ".jpg", ".gif"):

                # special case to remove ugly cover
                if fname.endswith("cover.jpg") and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(pathlib.Path(fnp), pathlib.Path(fnp), force=True)

            if path(fname).ext in (".htm", ".html"):
                html_content, _ = read_file(fnp)
                html = update_html_for_static(
                    book=book, html_content=html_content, epub=True
                )
                save_bs_output(html, fnp, UTF8)

            if path(fname).ext == ".ncx":
                pattern = "*** START: FULL LICENSE ***"
                ncx, _ = read_file(fnp)
                soup = BeautifulSoup(ncx, "lxml-xml")
                for tag in soup.findAll("text"):
                    if pattern in tag.text:
                        s = tag.parent.parent
                        s.decompose()
                        for s in s.next_siblings:
                            s.decompose()

                save_bs_output(soup, fnp, UTF8)

        # delete {id}/cover.jpg if exist and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(os.path.join(tmpd, text_type(book.id), "cover.jpg")).unlink_p()

            soup = None
            opff = os.path.join(tmpd, text_type(book.id), "content.opf")
            if os.path.exists(opff):
                opff_content, _ = read_file(opff)
                soup = BeautifulSoup(opff_content, "lxml-xml")

                for elem in soup.findAll():
                    if getattr(elem, "attrs", {}).get("href") == "cover.jpg":
                        elem.decompose()

                save_bs_output(soup, opff, UTF8)

        # bundle epub as zip
        zip_epub(epub_fpath=dst, root_folder=tmpd, fpaths=zipped_files)

        path(tmpd).rmtree_p()

    def handle_companion_file(
        fname,
        dstfname=None,
        book=None,
        force=False,
        as_ext=None,
        html_file_list=None,
        s3_storage=None,
    ):
        ext = fname.suffix if as_ext is None else as_ext
        src = fname
        if dstfname is None:
            dstfname = fname.name
        dst = static_folder.joinpath(dstfname)
        if dst.exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in (".png", ".jpg", ".jpeg", ".gif"):
            logger.info("\t\tCopying and optimizing image companion {}".format(fname))
            optimize_image(src, dst)
            if dst.name == (f"{book.id}_cover_image.jpg") and s3_storage:
                upload_to_cache(
                    asset=dst,
                    book_format="cover",
                    book_id=book.id,
                    etag=book.cover_etag,
                    s3_storage=s3_storage,
                    optimizer_version=optimizer_version,
                )
                update_download_cache(src, dst)
            elif html_file_list:
                html_file_list.append(dst)
                update_download_cache(src, dst)
        elif ext == ".epub":
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix=".epub", dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warn(
                    "\t\tBad zip file. "
                    "Copying as it might be working {}".format(fname)
                )
                handle_companion_file(fname, dstfname, book, force, as_ext=".zip")
            else:
                path(tmp_epub.name).move(dst)
                if s3_storage:
                    upload_to_cache(
                        asset=dst,
                        book_format="epub",
                        book_id=book.id,
                        etag=book.epub_etag,
                        s3_storage=s3_storage,
                        optimizer_version=optimizer_version,
                    )
                    update_download_cache(src, dst)
        else:
            # excludes files created by Windows Explorer
            if src.name.endswith("_Thumbs.db"):
                return
            # copy otherwise (PDF mostly)
            logger.info("\t\tCopying companion file to {}".format(dst))
            copy_file(src, dst)
            if ext != ".pdf" and ext != ".zip" and html_file_list:
                html_file_list.append(dst)
                update_download_cache(src, dst)

    # associated files (images, etc)
    for fpath in src_dir.iterdir():
        if fpath.is_file() and fpath.name.startswith(f"{book.id}_"):
            if fpath.suffix in (".html", ".htm"):
                src = fpath
                dst = static_folder.joinpath(fpath.name)
                if dst.exists() and not force:
                    logger.debug("\t\tSkipping existing HTML {}".format(dst))
                    continue

                logger.info("\t\tExporting HTML file to {}".format(dst))
                html, _ = read_file(src)
                new_html = update_html_for_static(book=book, html_content=html)
                save_bs_output(new_html, dst, UTF8)
                html_book_optimized_files.append(dst)
                update_download_cache(src, dst)
            else:
                try:
                    handle_companion_file(
                        fpath,
                        force=force,
                        html_file_list=html_book_optimized_files,
                        s3_storage=s3_storage,
                        book=book,
                    )
                except Exception as e:
                    logger.exception(e)
                    logger.error(
                        "\t\tException while handling companion file: {}".format(e)
                    )
    if s3_storage and html_book_optimized_files:
        upload_to_cache(
            asset=html_book_optimized_files,
            book_format="html",
            etag=book.html_etag,
            book_id=book.id,
            s3_storage=s3_storage,
            optimizer_version=optimizer_version,
        )

    # other formats
    for format in formats:
        if format not in book.formats() or format == "html":
            continue
        book_file = src_dir.joinpath(fname_for(book, format))
        if book_file.exists():
            try:
                handle_companion_file(
                    book_file,
                    archive_name_for(book, format),
                    force=force,
                    book=book,
                    s3_storage=s3_storage,
                )
            except Exception as e:
                logger.exception(e)
                logger.error(
                    "\t\tException while handling companion file: {}".format(e)
                )
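
The image helpers above shell out via exec_cmd; a standalone sketch of the same two-step PNG pipeline using the standard library directly (file names are illustrative, and pngquant/advdef are assumed to be on PATH):

import subprocess

subprocess.run(["pngquant", "--nofs", "--force", "--output", "out.png", "in.png"], check=True)
subprocess.run(["advdef", "-z", "-4", "-i", "5", "out.png"], check=True)
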
Example #17
def save_rdf_in_database(parser):

    # Insert author if it does not exist
    if parser.author_id:
        try:
            author_record = Author.get(gut_id=parser.author_id)
        except Exception:
            try:
                author_record = Author.create(
                    gut_id=parser.author_id,
                    last_name=normalize(parser.last_name),
                    first_names=normalize(parser.first_name),
                    birth_year=parser.birth_year,
                    death_year=parser.death_year)
            # concurrent workers might collide here so we retry once on IntegrityError
            except peewee.IntegrityError:
                author_record = Author.get(gut_id=parser.author_id)
        else:
            if parser.last_name:
                author_record.last_name = normalize(parser.last_name)
            if parser.first_name:
                author_record.first_names = normalize(parser.first_name)
            if parser.birth_year:
                author_record.birth_year = parser.birth_year
            if parser.death_year:
                author_record.death_year = parser.death_year
            author_record.save()
    else:
        # No author, set Anonymous
        author_record = Author.get(gut_id='216')

    # Get license
    try:
        license_record = License.get(name=parser.license)
    except Exception:
        license_record = None

    # Insert book

    try:
        book_record = Book.get(id=parser.gid)
    except Book.DoesNotExist:
        book_record = Book.create(
            id=parser.gid,
            title=normalize(parser.title.strip()),
            subtitle=normalize(parser.subtitle.strip()),
            author=author_record,  # foreign key
            license=license_record,  # foreign key
            language=parser.language.strip(),
            downloads=parser.downloads)
    else:
        book_record.title = normalize(parser.title.strip())
        book_record.subtitle = normalize(parser.subtitle.strip())
        book_record.author = author_record  # foreign key
        book_record.license = license_record  # foreign key
        book_record.language = parser.language.strip()
        book_record.downloads = parser.downloads
        book_record.save()

    # Insert formats
    for file_type in parser.file_types:

        # Sanitize MIME
        mime = parser.file_types[file_type]
        if not mime.startswith('text/plain'):
            mime = re.sub(r'; charset=[a-z0-9-]+', '', mime)
        # else:
        #    charset = re.match(r'; charset=([a-z0-9-]+)', mime).groups()[0]

        # Insert format type
        pattern = re.sub(r'' + parser.gid, '{id}', file_type)
        pattern = pattern.split('/')[-1]

        bid = int(book_record.id)

        if bid in BAD_BOOKS_FORMATS.keys() \
            and mime in [FORMAT_MATRIX.get(f)
                         for f in BAD_BOOKS_FORMATS.get(bid)]:
            logger.error(
                "\t**** EXCLUDING **** {} for book #{} from list.".format(
                    mime, bid))
            continue

        format_record, _ = Format.get_or_create(
            mime=mime,
            images=file_type.endswith('.images')
            or parser.file_types[file_type] == 'application/pdf',
            pattern=pattern)

        # Insert book format
        BookFormat.get_or_create(
            book=book_record,  # foreign key
            format=format_record  # foreign key
        )
Example #18
def export_book_to(book,
                   static_folder, download_cache,
                   cached_files, languages, formats, books,
                   project_id, force=False):
    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html, encoding = html_content_for(book=book,
                                      static_folder=static_folder,
                                      download_cache=download_cache)
    if html:
        article_fpath = os.path.join(static_folder, article_name_for(book))
        if not path(article_fpath).exists() or force:
            logger.info("\t\tExporting to {}".format(article_fpath))
            try:
                new_html = update_html_for_static(book=book, html_content=html)
            except Exception:
                raise
            save_bs_output(new_html, article_fpath, UTF8)
        else:
            logger.info("\t\tSkipping HTML article {}".format(article_fpath))

    def symlink_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tSymlinking {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).link(dst)  # hard link
        except IOError:
            logger.error("/!\\ Unable to symlink missing file {}".format(src))
            return

    def copy_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tCopying {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).copy(dst)
        except IOError:
            logger.error("/!\\ Unable to copy missing file {}".format(src))
            return

    def optimize_image(src, dst, force=False):
        if path(dst).exists() and not force:
            logger.info("\tSkipping image optimization for {}".format(dst))
            return dst
        logger.info("\tOptimizing image {}".format(dst))
        if path(src).ext == '.png':
            return optimize_png(src, dst)
        if path(src).ext in ('.jpg', '.jpeg'):
            return optimize_jpeg(src, dst)
        if path(src).ext == '.gif':
            return optimize_gif(src, dst)
        return dst

    def optimize_gif(src, dst):
        exec_cmd(['gifsicle', '-O3', src, '-o', dst])

    def optimize_png(src, dst):
        exec_cmd(['pngquant', '--nofs', '--force',
                  '--output', dst, src])
        exec_cmd(['advdef', '-z', '-4', '-i', '5', dst])

    def optimize_jpeg(src, dst):
        copy_from_cache(src, dst)
        exec_cmd(['jpegoptim', '--strip-all', '-m50', dst])

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB off {} at {}".format(src, dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)

        with zipfile.ZipFile(src, 'r') as zf:
            zipped_files = zf.namelist()
            zf.extractall(tmpd)

        remove_cover = False
        for fname in zipped_files:
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):

                # special case to remove ugly cover
                if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(fnp, fnp)

            if path(fname).ext in ('.htm', '.html'):
                html_content, html_encoding = read_file(fnp)
                html = update_html_for_static(book=book,
                                              html_content=html_content,
                                              epub=True)
                save_bs_output(html, fnp, UTF8)

            if path(fname).ext == '.ncx':
                pattern = "*** START: FULL LICENSE ***"
                ncx, ncx_encoding = read_file(fnp)
                soup = BeautifulSoup(ncx, 'lxml-xml')
                for tag in soup.findAll('text'):
                    if pattern in tag.text:
                        s = tag.parent.parent
                        s.decompose()
                        for s in s.next_siblings:
                            s.decompose()

                save_bs_output(soup, fnp, UTF8)

        # delete {id}/cover.jpg if exist and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(
                os.path.join(tmpd, text_type(book.id), 'cover.jpg')).unlink_p()

            soup = None
            opff = os.path.join(tmpd, text_type(book.id), 'content.opf')
            if os.path.exists(opff):
                opff_content, opff_encoding = read_file(opff)
                soup = BeautifulSoup(opff_content, 'lxml-xml')

                for elem in soup.findAll():
                    if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                        elem.decompose()

                save_bs_output(soup, opff, UTF8)

        # bundle epub as zip
        zip_epub(epub_fpath=dst,
                 root_folder=tmpd,
                 fpaths=zipped_files)

        path(tmpd).rmtree_p()

    def handle_companion_file(fname, dstfname=None, book=None,
                              force=False, as_ext=None):
        ext = path(fname).ext if as_ext is None else as_ext
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        if path(dst).exists() and not force:
            logger.debug("\t\tSkipping existing companion {}".format(dstfname))
            return

        # optimization based on mime/extension
        if ext in ('.png', '.jpg', '.jpeg', '.gif'):
            logger.info("\t\tCopying and optimizing image companion {}"
                        .format(fname))
            # copy_from_cache(src, dst)
            optimize_image(src, dst)
        elif ext == '.epub':
            logger.info("\t\tCreating optimized EPUB file {}".format(fname))
            tmp_epub = tempfile.NamedTemporaryFile(suffix='.epub',
                                                   dir=TMP_FOLDER)
            tmp_epub.close()
            try:
                optimize_epub(src, tmp_epub.name)
            except zipfile.BadZipFile:
                logger.warn("\t\tBad zip file. "
                            "Copying as it might be working{}".format(fname))
                handle_companion_file(fname, dstfname, book, force,
                                      as_ext='zip')
            else:
                path(tmp_epub.name).move(dst)
        else:
            # excludes files created by Windows Explorer
            if src.endswith('_Thumbs.db'):
                return
            # copy otherwise (PDF mostly)
            logger.debug("\t\tshitty ext: {}".format(dst))
            logger.info("\t\tCopying companion file to {}".format(fname))
            copy_from_cache(src, dst)

    # associated files (images, etc)
    for fname in [fn for fn in cached_files
                  if fn.startswith("{}_".format(book.id))]:

        if path(fname).ext in ('.html', '.htm'):
            src = os.path.join(path(download_cache).abspath(), fname)
            dst = os.path.join(path(static_folder).abspath(), fname)

            if path(dst).exists() and not force:
                logger.debug("\t\tSkipping existing HTML {}".format(dst))
                continue

            logger.info("\t\tExporting HTML file to {}".format(dst))
            html, encoding = read_file(src)
            new_html = update_html_for_static(book=book, html_content=html)
            save_bs_output(new_html, dst, UTF8)
        else:
            try:
                handle_companion_file(fname, force=force)
            except Exception as e:
                logger.exception(e)
                logger.error("\t\tException while handling companion file: {}"
                             .format(e))

    # other formats
    for format in formats:
        if format not in book.formats() or format == 'html':
            continue
        try:
            handle_companion_file(fname_for(book, format),
                                  archive_name_for(book, format),
                                  force=force)
        except Exception as e:
            logger.exception(e)
            logger.error("\t\tException while handling companion file: {}"
                         .format(e))

    # book presentation article
    cover_fpath = os.path.join(static_folder,
                               article_name_for(book=book, cover=True))
    if not path(cover_fpath).exists() or force:
        logger.info("\t\tExporting to {}".format(cover_fpath))
        html = cover_html_content_for(book=book,
                                      static_folder=static_folder,
                                      books=books, project_id=project_id)
        with open(cover_fpath, 'w') as f:
            if six.PY2:
                f.write(html.encode(UTF8))
            else:
                f.write(html)
    else:
        logger.info("\t\tSkipping cover {}".format(cover_fpath))