示例#1
0
def download_cover(book, book_dir, s3_storage, optimizer_version):
    """Fetch the medium cover image of ``book`` into ``book_dir``.

    The cover is looked up in this order:

    1. If a file named ``<id>_cover_image.jpg`` already exists under
       ``book_dir/optimized`` or ``book_dir/unoptimized``, nothing is done.
    2. If ``s3_storage`` is truthy, ``download_from_cache`` is asked to put
       the cover into ``book_dir/optimized``.
    3. Otherwise (or when the cache did not deliver), the image is downloaded
       from ``IMAGE_BASE`` into ``book_dir/unoptimized``; when
       ``download_file`` reports success the fetched ETag is stored on
       ``book.cover_etag`` and the book is saved.

    Args:
        book: Book record; ``book.id`` is used to build file names and URLs.
        book_dir: Directory object exposing ``joinpath``.
        s3_storage: Optimization cache handle, or a falsy value to skip it.
        optimizer_version: Forwarded unchanged to ``download_from_cache``.

    Returns:
        None.
    """
    # NOTE(review): ``has_cover`` is a query object, not the ``cover_page``
    # value itself; presumably it is truthy whenever the book row exists.
    # Confirm that this is the intended check.
    has_cover = Book.select(Book.cover_page).where(Book.id == book.id)
    if not has_cover:
        logger.debug("No Book Cover found for Book #{}".format(book.id))
        return

    cover = "{}_cover_image.jpg".format(book.id)
    optimized_dir = book_dir.joinpath("optimized")
    unoptimized_cover = book_dir.joinpath("unoptimized").joinpath(cover)

    # Check the local files first so that an already-present cover does not
    # cost a remote ETag lookup.
    if optimized_dir.joinpath(cover).exists() or unoptimized_cover.exists():
        logger.debug(f"Cover already exists for book #{book.id}")
        return

    url = "{}{}/pg{}.cover.medium.jpg".format(IMAGE_BASE, book.id, book.id)
    etag = get_etag_from_url(url)

    # try to download optimized cover from cache if s3_storage
    downloaded_from_cache = False
    if s3_storage:
        logger.info(
            f"Trying to download cover for {book.id} from optimization cache"
        )
        downloaded_from_cache = download_from_cache(
            book=book,
            etag=etag,
            book_format="cover",
            dest_dir=optimized_dir,
            s3_storage=s3_storage,
            optimizer_version=optimizer_version,
        )

    if not downloaded_from_cache:
        logger.debug("Downloading {}".format(url))
        if download_file(url, unoptimized_cover):
            # Remember which remote version was fetched.
            book.cover_etag = etag
            book.save()
示例#2
0
def get_list_of_filtered_books(languages, formats, only_books=None):
    """Build a Book query restricted by format, book id and language.

    Each filter is applied only when its argument is non-empty; an empty
    (or ``None``) argument means "do not filter on this criterion".

    Args:
        languages: Language values to keep (matched against ``Book.language``).
        formats: Format keys; each is translated through ``FORMAT_MATRIX``
            and matched against ``Format.mime``.
        only_books: Optional collection of book ids to keep.

    Returns:
        The (unexecuted) Book query with all requested filters applied.
    """
    if formats:
        wanted_mimes = [FORMAT_MATRIX.get(f) for f in formats]
        qs = (
            Book.select()
            .join(BookFormat)
            .join(Format)
            .where(Format.mime << wanted_mimes)
            # One row per book even when several of its formats match.
            .group_by(Book.id)
        )
    else:
        qs = Book.select()

    if only_books:
        qs = qs.where(Book.id << only_books)

    if languages:
        qs = qs.where(Book.language << languages)

    return qs
示例#3
0
文件: utils.py 项目: kiwix/gutenberg
def get_list_of_filtered_books(languages, formats, only_books=None):
    """Return a Book query narrowed down by the given criteria.

    A criterion that is empty or ``None`` is ignored.

    Args:
        languages: Values matched against ``Book.language``.
        formats: Format keys, mapped to MIME types via ``FORMAT_MATRIX`` and
            matched against ``Format.mime``.
        only_books: Optional collection of book ids to restrict the result to.

    Returns:
        The (unexecuted) Book query.
    """
    qs = Book.select()

    if formats:
        mimes = [FORMAT_MATRIX.get(f) for f in formats]
        # Join through BookFormat/Format and group so each book appears once.
        qs = (
            qs.join(BookFormat)
            .join(Format)
            .where(Format.mime << mimes)
            .group_by(Book.id)
        )

    if only_books:
        qs = qs.where(Book.id << only_books)

    if languages:
        qs = qs.where(Book.language << languages)

    return qs
示例#4
0
def download_covers(book, download_cache):
    """Download the medium cover of ``book`` into ``download_cache``.

    The image is written to ``<download_cache>/<id>_cover.jpg``.

    Args:
        book: Book record; only ``book.id`` is used.
        download_cache: Directory path the cover file is stored in.

    Returns:
        Always ``True``, whether or not a download was attempted.
    """
    fpath = os.path.join(download_cache, "{}_cover.jpg".format(book.id))

    has_cover = Book.select(Book.cover_page).where(Book.id == book.id)
    if not has_cover:
        logger.debug("No Book Cover found for Book #{}".format(book.id))
        return True

    url = "{}{}/pg{}.cover.medium.jpg".format(IMAGE_BASE, book.id, book.id)
    logger.debug("Downloading {}".format(url))
    download_file(url, fpath)
    return True
示例#5
0
def parse_and_process_file(rdf_file, force=False):
    """Parse one ``pg<id>.rdf`` file and store the book it describes.

    Files whose book id is already present in the Book table are skipped.
    Parsed records whose license is the string ``"None"`` or whose title is
    empty are reported and not saved.

    Args:
        rdf_file: Path of the RDF file; its name must look like
            ``.../pg<digits>.rdf``.
        force: Accepted for interface compatibility; not used by this
            function.

    Raises:
        ValueError: If ``rdf_file`` does not exist or its name does not
            contain a ``pg<digits>.rdf`` component.
    """
    if not path(rdf_file).exists():
        raise ValueError(rdf_file)

    # The dot is escaped so only a real ".rdf" suffix is accepted, and a
    # non-matching path is reported instead of crashing on ``None``.
    match = re.match(r".*/pg([0-9]+)\.rdf", rdf_file)
    if match is None:
        raise ValueError(rdf_file)
    gid = match.group(1)

    if Book.get_or_none(id=int(gid)):
        logger.info("\tSkipping already parsed file {}".format(rdf_file))
        return

    logger.info("\tParsing file {}".format(rdf_file))
    with open(rdf_file, "r") as f:
        parser = RdfParser(f.read(), gid).parse()

    if parser.license == "None":
        logger.info(
            "\tWARN: Unusable book without any information {}".format(gid))
    elif parser.title == "":
        logger.info("\tWARN: Unusable book without title {}".format(gid))
    else:
        save_rdf_in_database(parser)
示例#6
0
    etext_names = ["{0:0=2d}".format(i) for i in etext_nums]
    etext_urls = []
    for i in etext_names:
        etext_urls.append(os.path.join(u.build() + i, file_name))

    urls.extend([url_zip, url_htm, url_html, html_utf8])
    urls.extend(etext_urls)
    return list(set(urls))


def setup_urls():
    """Load the list of files available on the mirror into the Url table.

    Steps:

    1. ``rsync --list-only`` lists ``UrlBuilder.RSYNC`` and the listing is
       redirected into ``tmp/file_on_<SERVER_NAME>``.
    2. ``sed`` rewrites that file in place, keeping only what follows the
       last space on each line.
    3. The resulting file is loaded with ``load_csv`` into ``Url`` using the
       single column ``url``.
    """
    file_with_url = os.path.join("tmp",
                                 "file_on_{}".format(UrlBuilder.SERVER_NAME))
    cmd = [
        "bash", "-c",
        "rsync -a --list-only {} > {}".format(UrlBuilder.RSYNC, file_with_url)
    ]
    exec_cmd(cmd)

    # BSD sed (macOS) requires a backup suffix after -i.
    in_place_opt = ["-i", ".bak"] if platform.system() == "Darwin" else ["-i"]
    # In a raw string the back-reference must be written as a single "\1";
    # "\\1" would reach sed as an escaped backslash followed by "1".
    # Assumes exec_cmd hands the list to the process without going through a
    # shell that would re-interpret backslashes; confirm against exec_cmd.
    cmd = ["sed"] + in_place_opt + [r"s#.* \(.*\)$#\1#", file_with_url]
    exec_cmd(cmd)

    field_names = ['url']
    load_csv(Url, file_with_url, field_names=field_names)


# Manual check when run as a script: print the URLs computed for book #9.
if __name__ == '__main__':
    book = Book.get(id=9)
    print(get_urls(book))
示例#7
0
def get_list_of_all_languages():
    """Return the distinct ``language`` values of all Book rows.

    Returns:
        A list without duplicates; its order is not defined.
    """
    # A set comprehension de-duplicates directly, without the intermediate
    # lists the value would otherwise be copied through.
    return list({b.language for b in Book.select(Book.language)})
示例#8
0
文件: urls.py 项目: kiwix/gutenberg
    u.with_base(UrlBuilder.BASE_THREE)
    file_index = index_of_substring(files, ['html', 'htm'])
    file_name = files[file_index]['name']
    etext_nums = []
    etext_nums.extend(range(90, 100))
    etext_nums.extend(range(0, 6))
    etext_names = ["{0:0=2d}".format(i) for i in etext_nums]
    etext_urls = []
    for i in etext_names:
        etext_urls.append(os.path.join(u.build() + i, file_name))

    urls.extend([url_zip, url_htm, url_html, html_utf8])
    urls.extend(etext_urls)
    return list(set(urls))

def setup_urls():
    """Fill the Url table from an rsync listing of the mirror.

    The listing of ``UrlBuilder.RSYNC`` is written to
    ``tmp/file_on_<SERVER_NAME>``, reduced in place by ``sed`` to the text
    after the last space of each line, and then loaded with ``load_csv`` into
    ``Url`` as the single column ``url``.
    """
    # Local import: only needed here to pick the right sed flavour.
    import platform

    file_with_url = os.path.join(
        "tmp", "file_on_{}".format(UrlBuilder.SERVER_NAME))
    cmd = [
        "bash", "-c",
        "rsync -a --list-only {} > {}".format(UrlBuilder.RSYNC, file_with_url),
    ]
    exec_cmd(cmd)

    # BSD sed (macOS) needs an explicit backup suffix after -i, otherwise it
    # consumes the script as the suffix.
    in_place_opt = ["-i", ".bak"] if platform.system() == "Darwin" else ["-i"]
    # Raw string: same script text as before, without invalid "\(" escapes.
    cmd = ["sed"] + in_place_opt + [r"s#.* \(.*\)$#\1#", file_with_url]
    exec_cmd(cmd)

    field_names = ['url']
    load_csv(Url, file_with_url, field_names=field_names)


# Manual check when run as a script: print the URLs computed for book #9.
if __name__ == '__main__':
    book = Book.get(id=9)
    print(get_urls(book))
示例#9
0
文件: export.py 项目: kiwix/gutenberg
def get_list_of_all_languages():
    """Collect every distinct language referenced by a Book row.

    Returns:
        A list of unique ``Book.language`` values in unspecified order.
    """
    # Build the set directly instead of copying through two throwaway lists.
    languages = {b.language for b in Book.select(Book.language)}
    return list(languages)
示例#10
0
def save_rdf_in_database(parser):
    """Store the author, book and formats described by ``parser``.

    The author is looked up by ``parser.author_id`` and created or refreshed
    as needed (the author with ``gut_id`` "216" is used when there is no
    author id). The book row is created or updated, and one Format /
    BookFormat pair is ensured per entry of ``parser.file_types``.

    Side effect on the argument: when no entry of ``parser.file_types`` has a
    MIME type starting with ``application/pdf``, a ``"{id}-pdf.pdf"`` entry is
    added to that mapping.

    Args:
        parser: Parsed RDF record exposing the attributes read below
            (``author_id``, ``title``, ``file_types``, ...).
    """
    # Insert author, if it does not exist
    if not parser.author_id:
        # No author, set Anonymous
        author_record = Author.get(gut_id="216")
    else:
        try:
            author_record = Author.get(gut_id=parser.author_id)
        except Exception:
            try:
                author_record = Author.create(
                    gut_id=parser.author_id,
                    last_name=normalize(parser.last_name),
                    first_names=normalize(parser.first_name),
                    birth_year=parser.birth_year,
                    death_year=parser.death_year,
                )
            except peewee.IntegrityError:
                # concurrent workers might collide here, so the lookup is
                # retried once on IntegrityError
                author_record = Author.get(gut_id=parser.author_id)
        else:
            # Existing author: overwrite only the fields the parser provides.
            refreshed = (
                ("last_name", parser.last_name, normalize),
                ("first_names", parser.first_name, normalize),
                ("birth_year", parser.birth_year, None),
                ("death_year", parser.death_year, None),
            )
            for field, value, transform in refreshed:
                if value:
                    setattr(author_record, field,
                            transform(value) if transform else value)
            author_record.save()

    # Get license
    try:
        license_record = License.get(name=parser.license)
    except Exception:
        license_record = None

    # Insert or update book; these fields are written in both cases
    book_fields = {
        "title": normalize(parser.title.strip()),
        "subtitle": normalize(parser.subtitle.strip()),
        "author": author_record,  # foreign key
        "license": license_record,  # foreign key
        "language": parser.language.strip(),
        "downloads": parser.downloads,
    }
    try:
        book_record = Book.get(id=parser.gid)
    except Book.DoesNotExist:
        # bookshelf and cover_page are only set when the row is created
        book_record = Book.create(
            id=parser.gid,
            bookshelf=parser.bookshelf,
            cover_page=parser.cover_image,
            **book_fields
        )
    else:
        for field, value in book_fields.items():
            setattr(book_record, field, value)
        book_record.save()

    # insert pdf if not exists in parser.file_types
    # this is done as presence of PDF on server and RDF is inconsistent
    has_pdf = any(
        declared.startswith("application/pdf")
        for declared in parser.file_types.values()
    )
    if not has_pdf:
        parser.file_types["{id}-pdf.pdf"] = "application/pdf"

    # Insert formats
    for file_type, declared_mime in parser.file_types.items():

        # Sanitize MIME: the charset suffix is kept for text/plain only
        if declared_mime.startswith("text/plain"):
            mime = declared_mime
        else:
            mime = re.sub(r"; charset=[a-z0-9-]+", "", declared_mime)

        # Format pattern: last path component with the book id abstracted
        pattern = re.sub(parser.gid, "{id}", file_type).rsplit("/", 1)[-1]

        bid = int(book_record.id)

        excluded = bid in BAD_BOOKS_FORMATS and mime in [
            FORMAT_MATRIX.get(f) for f in BAD_BOOKS_FORMATS.get(bid)
        ]
        if excluded:
            logger.error(
                "\t**** EXCLUDING **** {} for book #{} from list.".format(
                    mime, bid))
            continue

        has_images = (file_type.endswith(".images")
                      or declared_mime == "application/pdf")
        format_record, _ = Format.get_or_create(
            mime=mime,
            images=has_images,
            pattern=pattern,
        )

        # Insert book format
        BookFormat.get_or_create(
            book=book_record,  # foreign key
            format=format_record,  # foreign key
        )
示例#11
0
def save_rdf_in_database(parser):
    """Write one parsed RDF record (author, book, formats) to the database.

    The author is fetched by ``parser.author_id``, created when the lookup
    fails, or refreshed when found; without an author id the author with
    ``gut_id`` '216' is used. The book row is created or updated, then a
    Format and a BookFormat row are ensured for every entry of
    ``parser.file_types``.

    Args:
        parser: Parsed RDF record exposing the attributes read below
            (``author_id``, ``title``, ``file_types``, ...).
    """
    # Insert author, if it does not exist
    if parser.author_id:
        author_id = parser.author_id
        try:
            author_record = Author.get(gut_id=author_id)
        except Exception:
            new_author = dict(
                gut_id=author_id,
                last_name=normalize(parser.last_name),
                first_names=normalize(parser.first_name),
                birth_year=parser.birth_year,
                death_year=parser.death_year,
            )
            try:
                author_record = Author.create(**new_author)
            except peewee.IntegrityError:
                # concurrent workers might collide here, so the lookup is
                # retried once on IntegrityError
                author_record = Author.get(gut_id=parser.author_id)
        else:
            # Known author: only overwrite what the parser actually provides.
            last_name, first_name = parser.last_name, parser.first_name
            if last_name:
                author_record.last_name = normalize(last_name)
            if first_name:
                author_record.first_names = normalize(first_name)
            for year_field in ("birth_year", "death_year"):
                year = getattr(parser, year_field)
                if year:
                    setattr(author_record, year_field, year)
            author_record.save()
    else:
        # No author, set Anonymous
        author_record = Author.get(gut_id='216')

    # Get license
    try:
        license_record = License.get(name=parser.license)
    except Exception:
        license_record = None

    # Insert or update book with the same set of fields
    book_fields = dict(
        title=normalize(parser.title.strip()),
        subtitle=normalize(parser.subtitle.strip()),
        author=author_record,  # foreign key
        license=license_record,  # foreign key
        language=parser.language.strip(),
        downloads=parser.downloads,
    )
    try:
        book_record = Book.get(id=parser.gid)
    except Book.DoesNotExist:
        book_record = Book.create(id=parser.gid, **book_fields)
    else:
        for field_name, field_value in book_fields.items():
            setattr(book_record, field_name, field_value)
        book_record.save()

    # Insert formats
    for file_type, declared_mime in parser.file_types.items():

        # Sanitize MIME: strip the charset suffix except for text/plain
        mime = declared_mime
        if not mime.startswith('text/plain'):
            mime = re.sub(r'; charset=[a-z0-9-]+', '', mime)

        # Format pattern: file name with the book id replaced by a placeholder
        pattern = re.sub(parser.gid, '{id}', file_type)
        pattern = pattern.rsplit('/', 1)[-1]

        bid = int(book_record.id)

        if bid in BAD_BOOKS_FORMATS:
            bad_mimes = [FORMAT_MATRIX.get(f)
                         for f in BAD_BOOKS_FORMATS.get(bid)]
            if mime in bad_mimes:
                logger.error(
                    "\t**** EXCLUDING **** {} for book #{} from list.".format(
                        mime, bid))
                continue

        with_images = (file_type.endswith('.images')
                       or declared_mime == 'application/pdf')
        format_record, _ = Format.get_or_create(
            mime=mime, images=with_images, pattern=pattern)

        # Insert book format
        BookFormat.get_or_create(
            book=book_record,  # foreign key
            format=format_record,  # foreign key
        )