Пример #1
0
def download_book(book, download_cache, languages, formats, force):
    logger.info(
        "\tDownloading content files for Book #{id}".format(id=book.id))

    # apply filters
    if not formats:
        formats = FORMAT_MATRIX.keys()

    # HTML is our base for ZIM for add it if not present
    if "html" not in formats:
        formats.append("html")

    for format in formats:

        fpath = os.path.join(download_cache, fname_for(book, format))

        # check if already downloaded
        if path(fpath).exists() and not force:
            logger.debug("\t\t{fmt} already exists at {path}".format(
                fmt=format, path=fpath))
            continue

        # retrieve corresponding BookFormat
        bfs = BookFormat.filter(book=book)

        if format == "html":
            patterns = [
                "mnsrb10h.htm",
                "8ledo10h.htm",
                "tycho10f.htm",
                "8ledo10h.zip",
                "salme10h.htm",
                "8nszr10h.htm",
                "{id}-h.html",
                "{id}.html.gen",
                "{id}-h.htm",
                "8regr10h.zip",
                "{id}.html.noimages",
                "8lgme10h.htm",
                "tycho10h.htm",
                "tycho10h.zip",
                "8lgme10h.zip",
                "8indn10h.zip",
                "8resp10h.zip",
                "20004-h.htm",
                "8indn10h.htm",
                "8memo10h.zip",
                "fondu10h.zip",
                "{id}-h.zip",
                "8mort10h.zip",
            ]
            bfso = bfs
            bfs = bfs.join(Format).filter(Format.pattern << patterns)
            if not bfs.count():
                pp(
                    list([(b.format.mime, b.format.images, b.format.pattern)
                          for b in bfs]))
                pp(
                    list([(b.format.mime, b.format.images, b.format.pattern)
                          for b in bfso]))
                logger.error("html not found")
                continue
        else:
            bfs = bfs.filter(BookFormat.format << Format.filter(
                mime=FORMAT_MATRIX.get(format)))

        if not bfs.count():
            logger.debug("[{}] not avail. for #{}# {}".format(
                format, book.id, book.title).encode("utf-8"))
            continue

        if bfs.count() > 1:
            try:
                bf = bfs.join(Format).filter(Format.images).get()
            except Exception:
                bf = bfs.get()
        else:
            bf = bfs.get()

        logger.debug("[{}] Requesting URLs for #{}# {}".format(
            format, book.id, book.title).encode("utf-8"))

        # retrieve list of URLs for format unless we have it in DB
        if bf.downloaded_from and not force:
            urls = [bf.downloaded_from]
        else:
            urld = get_urls(book)
            urls = list(reversed(urld.get(FORMAT_MATRIX.get(format))))

        import copy

        allurls = copy.copy(urls)

        while urls:
            url = urls.pop()

            if len(allurls) != 1:
                if not resource_exists(url):
                    continue

            # HTML files are *sometime* available as ZIP files
            if url.endswith(".zip"):
                zpath = "{}.zip".format(fpath)

                if not download_file(url, zpath):
                    logger.error("ZIP file donwload failed: {}".format(zpath))
                    continue

                # extract zipfile
                handle_zipped_epub(zippath=zpath,
                                   book=book,
                                   download_cache=download_cache)
            else:
                if not download_file(url, fpath):
                    logger.error("file donwload failed: {}".format(fpath))
                    continue

            # store working URL in DB
            bf.downloaded_from = url
            bf.save()

        if not bf.downloaded_from:
            logger.error("NO FILE FOR #{}/{}".format(book.id, format))
            pp(allurls)
            continue
Пример #2
0
def download_book(book, download_cache, languages, formats, force, s3_storage,
                  optimizer_version):
    logger.info(
        "\tDownloading content files for Book #{id}".format(id=book.id))

    # apply filters
    if not formats:
        formats = FORMAT_MATRIX.keys()

    # HTML is our base for ZIM for add it if not present
    if "html" not in formats:
        formats.append("html")

    book_dir = pathlib.Path(download_cache).joinpath(str(book.id))
    optimized_dir = book_dir.joinpath("optimized")
    unoptimized_dir = book_dir.joinpath("unoptimized")
    unsuccessful_formats = []
    for book_format in formats:

        unoptimized_fpath = unoptimized_dir.joinpath(
            fname_for(book, book_format))
        optimized_fpath = optimized_dir.joinpath(
            archive_name_for(book, book_format))

        # check if already downloaded
        if (unoptimized_fpath.exists()
                or optimized_fpath.exists()) and not force:
            logger.debug(
                f"\t\t{book_format} already exists for book #{book.id}")
            continue

        if force:
            if book_format == "html":
                for fpath in book_dir.iterdir():
                    if fpath.is_file() and fpath.suffix not in [
                            ".pdf", ".epub"
                    ]:
                        fpath.unlink()
            else:
                if unoptimized_fpath.exists():
                    unoptimized_fpath.unlink()
                if optimized_fpath.exists():
                    optimized_fpath.unlink()
            # delete dirs which are empty
            for dir_name in [optimized_dir, unoptimized_dir]:
                if not dir_name.exists():
                    continue
                if not list(dir_name.iterdir()):
                    dir_name.rmdir()

        # retrieve corresponding BookFormat
        bfs = BookFormat.filter(book=book)

        if book_format == "html":
            patterns = [
                "mnsrb10h.htm",
                "8ledo10h.htm",
                "tycho10f.htm",
                "8ledo10h.zip",
                "salme10h.htm",
                "8nszr10h.htm",
                "{id}-h.html",
                "{id}.html.gen",
                "{id}-h.htm",
                "8regr10h.zip",
                "{id}.html.noimages",
                "8lgme10h.htm",
                "tycho10h.htm",
                "tycho10h.zip",
                "8lgme10h.zip",
                "8indn10h.zip",
                "8resp10h.zip",
                "20004-h.htm",
                "8indn10h.htm",
                "8memo10h.zip",
                "fondu10h.zip",
                "{id}-h.zip",
                "8mort10h.zip",
            ]
            bfso = bfs
            bfs = bfs.join(Format).filter(Format.pattern << patterns)
            if not bfs.count():
                pp(
                    list([(b.format.mime, b.format.images, b.format.pattern)
                          for b in bfs]))
                pp(
                    list([(b.format.mime, b.format.images, b.format.pattern)
                          for b in bfso]))
                logger.error("html not found")
                unsuccessful_formats.append(book_format)
                continue
        else:
            bfs = bfs.filter(BookFormat.format << Format.filter(
                mime=FORMAT_MATRIX.get(book_format)))

        if not bfs.count():
            logger.debug("[{}] not avail. for #{}# {}".format(
                book_format, book.id, book.title))
            unsuccessful_formats.append(book_format)
            continue

        if bfs.count() > 1:
            try:
                bf = bfs.join(Format).filter(Format.images).get()
            except Exception:
                bf = bfs.get()
        else:
            bf = bfs.get()

        logger.debug("[{}] Requesting URLs for #{}# {}".format(
            book_format, book.id, book.title))

        # retrieve list of URLs for format unless we have it in DB
        if bf.downloaded_from and not force:
            urls = [bf.downloaded_from]
        else:
            urld = get_urls(book)
            urls = list(reversed(urld.get(FORMAT_MATRIX.get(book_format))))

        import copy

        allurls = copy.copy(urls)
        downloaded_from_cache = False

        while urls:
            url = urls.pop()

            # for development
            # if len(allurls) != 1:
            #     if not resource_exists(url):
            #         continue

            # HTML files are *sometime* available as ZIP files
            if url.endswith(".zip"):
                zpath = unoptimized_dir.joinpath(
                    f"{fname_for(book, book_format)}.zip")

                etag = get_etag_from_url(url)
                if s3_storage:
                    if download_from_cache(
                            book=book,
                            etag=etag,
                            book_format=book_format,
                            dest_dir=optimized_dir,
                            s3_storage=s3_storage,
                            optimizer_version=optimizer_version,
                    ):
                        downloaded_from_cache = True
                        break
                if not download_file(url, zpath):
                    logger.error("ZIP file donwload failed: {}".format(zpath))
                    continue
                # save etag
                book.html_etag = etag
                book.save()
                # extract zipfile
                handle_zipped_epub(zippath=zpath,
                                   book=book,
                                   dst_dir=unoptimized_dir)
            else:
                if (url.endswith(".htm") or url.endswith(".html")
                        or url.endswith(".html.utf8")
                        or url.endswith(".epub")):
                    etag = get_etag_from_url(url)
                    if s3_storage:
                        logger.info(
                            f"Trying to download {book.id} from optimization cache"
                        )
                        if download_from_cache(
                                book=book,
                                etag=etag,
                                book_format=book_format,
                                dest_dir=optimized_dir,
                                s3_storage=s3_storage,
                                optimizer_version=optimizer_version,
                        ):
                            downloaded_from_cache = True
                            break
                if not download_file(url, unoptimized_fpath):
                    logger.error(
                        "file donwload failed: {}".format(unoptimized_fpath))
                    continue
                # save etag if html or epub if download is successful
                if (url.endswith(".htm") or url.endswith(".html")
                        or url.endswith(".html.utf8")):
                    logger.debug(f"Saving html ETag for {book.id}")
                    book.html_etag = etag
                    book.save()
                elif url.endswith(".epub"):
                    logger.debug(f"Saving epub ETag for {book.id}")
                    book.epub_etag = etag
                    book.save()

            # store working URL in DB
            bf.downloaded_from = url
            bf.save()
            # break as we got a working URL
            break

        if not bf.downloaded_from and not downloaded_from_cache:
            logger.error("NO FILE FOR #{}/{}".format(book.id, book_format))
            # delete instance from DB if download failed
            logger.info("Deleting instance from DB")
            bf.delete_instance()
            unsuccessful_formats.append(book_format)
            pp(allurls)

    # delete book from DB if not downloaded in any format
    if len(unsuccessful_formats) == len(formats):
        logger.debug(
            f"Book #{book.id} could not be downloaded in any format. Deleting from DB ..."
        )
        book.delete_instance()
        if book_dir.exists():
            shutil.rmtree(book_dir, ignore_errors=True)
        return
    download_cover(book, book_dir, s3_storage, optimizer_version)