Example #1
File: export.py Project: emijrp/gutenberg
def copy_from_cache(fname, dstfname=None):
    src = os.path.join(path(download_cache).abspath(), fname)
    if dstfname is None:
        dstfname = fname
    dst = os.path.join(path(static_folder).abspath(), dstfname)
    logger.info("\t\tCopying {}".format(dst))
    path(dst).unlink_p()
    try:
        path(src).copy(dst)
    except IOError:
        logger.error("/!\\ Unable to copy missing file {}".format(src))
        return
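This helper leans on the module-level `download_cache` and `static_folder` globals plus the path.py-style `path` wrapper. For comparison, a minimal self-contained sketch of the same copy-from-cache pattern using only the standard library (the signature and names here are illustrative, not the project's API):

import logging
import os
import shutil

logger = logging.getLogger(__name__)

def copy_from_cache(cache_dir, static_dir, fname, dstfname=None):
    # resolve the source inside the cache and the destination in the static dir
    src = os.path.join(os.path.abspath(cache_dir), fname)
    dst = os.path.join(os.path.abspath(static_dir), dstfname or fname)
    logger.info("Copying %s", dst)
    # drop any stale destination first, like path(dst).unlink_p()
    if os.path.lexists(dst):
        os.remove(dst)
    try:
        shutil.copy(src, dst)
    except IOError:
        logger.error("Unable to copy missing file %s", src)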
Example #2
def check_dependencies():

    def bin_is_present(binary):
        try:
            subprocess.Popen(binary,
                             universal_newlines=True,
                             shell=False,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             bufsize=0)
        except OSError:
            return False
        else:
            return True

    all_bins = {
        'gifsicle': "GIF compression tool, part of `gifsicle` package",
        'pngquant': "PNG compression tool, part of `pngquant` package",
        'advdef': "PNG compression tool, part of `advancecomp` package",
        'jpegoptim': "JPEG compression tool, part of `jpegoptim` package",
        'zip': "ZIP file packager for ePub",
        'tar': "TAR archive extractor",
        'curl': "Files downloader, part of `curl` package",
        'zimwriterfs': "ZIM file writer, available on kiwix-other repository",
    }

    all_good = True
    has_zimwriter = True
    for bin, msg in all_bins.items():
        if bin == 'zimwriterfs':
            if not bin_is_present(bin):
                has_zimwriter = False
                continue

        if not bin_is_present(bin):
            logger.error("\t*{}* binary missing. {}".format(bin, msg))
            all_good = False

    return all_good, has_zimwriter
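check_dependencies probes each binary by actually spawning it and treating OSError as "not installed", which works on both Python 2 and 3. On Python 3 the same check can be sketched with shutil.which instead of spawning processes (a hedged alternative, not the project's code):

import shutil

def check_dependencies(required):
    # required maps binary name -> human-readable description
    missing = [(name, desc) for name, desc in required.items()
               if shutil.which(name) is None]
    for name, desc in missing:
        print("*{}* binary missing. {}".format(name, desc))
    return not missing

# e.g. check_dependencies({'zip': "ZIP file packager for ePub"})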
Example #3
File: download.py Project: guaka/gutenberg
def download_all_books(url_mirror, download_cache,
                       languages=[], formats=[],
                       only_books=[], force=False):

    available_books = get_list_of_filtered_books(
        languages=languages,
        formats=formats,
        only_books=only_books)

    # ensure dir exists
    path(download_cache).mkdir_p()

    for book in available_books:

        logger.info("\tDownloading content files for Book #{id}"
                    .format(id=book.id))

        # apply filters
        if not formats:
            formats = list(FORMAT_MATRIX.keys())

        # HTML is our base for ZIM so add it if not present
        if 'html' not in formats:
            formats.append('html')

        for format in formats:

            fpath = os.path.join(download_cache, fname_for(book, format))

            # check if already downloaded
            if path(fpath).exists() and not force:
                logger.debug("\t\t{fmt} already exists at {path}"
                             .format(fmt=format, path=fpath))
                continue

            # retrieve corresponding BookFormat
            bfs = BookFormat.filter(book=book)

            if format == 'html':
                patterns = ['mnsrb10h.htm', '8ledo10h.htm', 'tycho10f.htm',
                            '8ledo10h.zip', 'salme10h.htm', '8nszr10h.htm',
                            '{id}-h.html', '{id}.html.gen', '{id}-h.htm',
                            '8regr10h.zip', '{id}.html.noimages',
                            '8lgme10h.htm', 'tycho10h.htm', 'tycho10h.zip',
                            '8lgme10h.zip', '8indn10h.zip', '8resp10h.zip',
                            '20004-h.htm', '8indn10h.htm', '8memo10h.zip',
                            'fondu10h.zip', '{id}-h.zip', '8mort10h.zip']
                bfso = bfs
                bfs = bfs.join(Format).filter(Format.pattern << patterns)
                if not bfs.count():
                    from pprint import pprint as pp
                    pp([(b.format.mime, b.format.images, b.format.pattern)
                        for b in bfs])
                    pp([(b.format.mime, b.format.images, b.format.pattern)
                        for b in bfso])
                    logger.error("html not found")
                    continue
            else:
                bfs = bfs.filter(BookFormat.format << Format.filter(mime=FORMAT_MATRIX.get(format)))

            if not bfs.count():
                logger.debug("[{}] not avail. for #{}# {}"
                             .format(format, book.id, book.title))
                continue

            if bfs.count() > 1:
                try:
                    bf = bfs.join(Format).filter(Format.images == True).get()
                except Exception:
                    bf = bfs.get()
            else:
                bf = bfs.get()

            logger.debug("[{}] Requesting URLs for #{}# {}"
                         .format(format, book.id, book.title))

            # retrieve list of URLs for format unless we have it in DB
            if bf.downloaded_from and not force:
                urls = [bf.downloaded_from]
            else:
                urld = get_urls(book)
                urls = list(reversed(urld.get(FORMAT_MATRIX.get(format))))

            allurls = list(urls)

            while urls:
                url = urls.pop()

                if not resource_exists(url):
                    continue

                # HTML files are *sometimes* available as ZIP files
                if url.endswith('.zip'):
                    zpath = "{}.zip".format(fpath)

                    if not download_file(url, zpath):
                        logger.error("ZIP file donwload failed: {}".format(zpath))
                        continue

                    # extract zipfile
                    handle_zipped_epub(zippath=zpath, book=book,
                                       download_cache=download_cache)
                else:
                    if not download_file(url, fpath):
                        logger.error("file donwload failed: {}".format(fpath))
                        continue

                # store working URL in DB
                bf.downloaded_from = url
                bf.save()

            if not bf.downloaded_from:
                logger.error("NO FILE FOR #{}/{}".format(book.id, format))
                from pprint import pprint as pp
                pp(allurls)
                continue
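The inner loop pops candidate URLs one by one, skips those that do not exist, downloads the rest, and records working URLs in `bf.downloaded_from` so later runs can reuse them. The try-each-URL pattern in isolation, assuming Python 3 and plain urllib in place of the project's `resource_exists`/`download_file` helpers:

import logging
import urllib.error
import urllib.request

logger = logging.getLogger(__name__)

def download_first_available(urls, dest):
    # try candidate URLs in order; return the first that works, else None
    for url in urls:
        try:
            urllib.request.urlretrieve(url, dest)
        except (urllib.error.URLError, IOError) as exc:
            logger.error("download failed for %s: %s", url, exc)
            continue
        return url  # worth persisting, like bf.downloaded_from
    return None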
Example #4
def export_book_to(book, static_folder, download_cache, cached_files,
                   languages, formats, books):
    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html = html_content_for(book=book,
                            static_folder=static_folder,
                            download_cache=download_cache)
    if html:
        article_fpath = os.path.join(static_folder, article_name_for(book))
        logger.info("\t\tExporting to {}".format(article_fpath))
        try:
            new_html = update_html_for_static(book=book, html_content=html)
        except Exception:
            new_html = html
        with open(article_fpath, 'w') as f:
            f.write(new_html)

    def symlink_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tSymlinking {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).link(dst)  # hard link
        except IOError:
            logger.error("/!\ Unable to symlink missing file {}".format(src))
            return

    def copy_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tCopying {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).copy(dst)
        except IOError:
            logger.error("/!\ Unable to copy missing file {}".format(src))
            return

    def optimize_image(fpath):
        if path(fpath).ext == '.png':
            return optimize_png(fpath)
        if path(fpath).ext in ('.jpg', '.jpeg'):
            return optimize_jpeg(fpath)
        if path(fpath).ext == '.gif':
            return optimize_gif(fpath)
        return fpath

    def optimize_gif(fpath):
        exec_cmd('gifsicle -O3 "{path}" -o "{path}"'.format(path=fpath))

    def optimize_png(fpath):
        pngquant = 'pngquant --nofs --force --ext=".png" "{path}"'
        advdef = 'advdef -z -4 -i 5 "{path}"'
        exec_cmd(pngquant.format(path=fpath))
        exec_cmd(advdef.format(path=fpath))

    def optimize_jpeg(fpath):
        exec_cmd('jpegoptim --strip-all -m50 "{path}"'.format(path=fpath))

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB at {}".format(dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)
        with zipfile.ZipFile(src, 'r') as zf:
            zipped_files = zf.namelist()
            zf.extractall(tmpd)

        remove_cover = False
        for fname in list(zipped_files):  # iterate a copy; entries may be removed
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):

                # special case to remove ugly cover
                if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(path_for_cmd(fnp))

            if path(fname).ext in ('.htm', '.html'):
                with open(fnp, 'r') as f:
                    html = update_html_for_static(book=book,
                                                  html_content=f.read(),
                                                  epub=True)
                with open(fnp, 'w') as f:
                    f.write(html)

            if path(fname).ext == '.ncx':
                pattern = "*** START: FULL LICENSE ***"
                with open(fnp, 'r') as f:
                    ncx = f.read()
                soup = BeautifulSoup(ncx, ["lxml", "xml"])
                for tag in soup.findAll('text'):
                    if pattern in tag.text:
                        # drop the license section and everything after it
                        s = tag.parent.parent
                        for sibling in list(s.next_siblings):
                            sibling.decompose()
                        s.decompose()

                with open(fnp, 'w') as f:
                    f.write(soup.encode())

        # delete {id}/cover.jpg if it exists and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(os.path.join(tmpd, str(book.id), 'cover.jpg')).unlink_p()

            soup = None
            opff = os.path.join(tmpd, str(book.id), 'content.opf')
            if os.path.exists(opff):
                with open(opff, 'r') as fd:
                    soup = BeautifulSoup(fd.read(), ["lxml", "xml"])

                for elem in soup.findAll():
                    if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                        elem.decompose()

                with open(opff, 'w') as fd:
                    fd.write(soup.encode())

        with cd(tmpd):
            exec_cmd('zip -q0X "{dst}" mimetype'.format(dst=path_for_cmd(dst)))
            exec_cmd('zip -qXr9D "{dst}" {files}'.format(
                dst=path_for_cmd(dst),
                files=" ".join(
                    [f for f in zipped_files if not f == 'mimetype'])))

        path(tmpd).rmtree_p()

    def handle_companion_file(fname, dstfname=None, book=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)

        # optimization based on mime/extension
        if path(fname).ext in ('.png', '.jpg', '.jpeg', '.gif'):
            copy_from_cache(src, dst)
            optimize_image(path_for_cmd(dst))
        elif path(fname).ext == '.epub':
            tmp_epub = tempfile.NamedTemporaryFile(suffix='.epub',
                                                   dir=TMP_FOLDER)
            tmp_epub.close()
            optimize_epub(src, tmp_epub.name)
            path(tmp_epub.name).move(dst)
        else:
            # excludes files created by Windows Explorer
            if src.endswith('_Thumbs.db'):
                return
            # copy otherwise (PDF mostly)
            logger.debug("\t\tshitty ext: {}".format(dst))
            copy_from_cache(src, dst)

    # associated files (images, etc)
    for fname in [
            fn for fn in cached_files if fn.startswith("{}_".format(book.id))
    ]:

        if path(fname).ext in ('.html', '.htm'):
            src = os.path.join(path(download_cache).abspath(), fname)
            dst = os.path.join(path(static_folder).abspath(), fname)

            logger.info("\t\tExporting HTML file to {}".format(dst))
            html = "CAN'T READ FILE"
            with open(src, 'r') as f:
                html = f.read()
            new_html = update_html_for_static(book=book, html_content=html)
            with open(dst, 'w') as f:
                f.write(new_html)
        else:
            logger.info("\t\tCopying companion file to {}".format(fname))
            try:
                handle_companion_file(fname)
            except Exception as e:
                logger.error(
                    "\t\tException while handling companion file: {}".format(
                        e))

    # other formats
    for format in formats:
        if format not in book.formats() or format == 'html':
            continue
        logger.info("\t\tCopying format file to {}".format(
            archive_name_for(book, format)))
        try:
            handle_companion_file(fname_for(book, format),
                                  archive_name_for(book, format))
        except Exception as e:
            logger.error(
                "\t\tException while handling companion file: {}".format(e))

    # book presentation article
    cover_fpath = os.path.join(static_folder,
                               article_name_for(book=book, cover=True))
    logger.info("\t\tExporting to {}".format(cover_fpath))
    html = cover_html_content_for(book=book,
                                  static_folder=static_folder,
                                  books=books)
    with open(cover_fpath, 'w') as f:
        f.write(html.encode('utf-8'))
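optimize_epub repacks the archive in two zip invocations: mimetype first, stored uncompressed (-0X), then everything else compressed; the EPUB (OCF) container format requires exactly that ordering. The same repack step sketched with Python's zipfile module, assuming the extracted book sits in src_dir:

import os
import zipfile

def repack_epub(src_dir, dst):
    with zipfile.ZipFile(dst, 'w') as zf:
        # OCF rule: 'mimetype' must be the first entry, stored uncompressed
        zf.write(os.path.join(src_dir, 'mimetype'), 'mimetype',
                 compress_type=zipfile.ZIP_STORED)
        for root, _dirs, files in os.walk(src_dir):
            for name in files:
                fpath = os.path.join(root, name)
                arcname = os.path.relpath(fpath, src_dir)
                if arcname == 'mimetype':
                    continue
                zf.write(fpath, arcname,
                         compress_type=zipfile.ZIP_DEFLATED)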
Example #5
def main(arguments):

    # actions constants
    DO_PREPARE = arguments.get('--prepare', False)
    DO_PARSE = arguments.get('--parse', False)
    DO_DOWNLOAD = arguments.get('--download', False)
    DO_EXPORT = arguments.get('--export', False)
    DO_ZIM = arguments.get('--zim', False)
    DO_CHECKDEPS = arguments.get('--check', False)
    COMPLETE_DUMP = arguments.get('--complete', False)

    URL_MIRROR = arguments.get('--mirror') or 'http://zimfarm.kiwix.org/gutenberg'
    RDF_FOLDER = arguments.get('--rdf-folder') or os.path.join('rdf-files')
    STATIC_FOLDER = arguments.get('--static-folder') or os.path.join('static')
    ZIM_FILE = arguments.get('--zim-file')
    WIPE_DB = not arguments.get('--keep-db')
    RDF_URL = arguments.get('--rdf-url') or 'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
    DL_CACHE = arguments.get('--dl-folder') or os.path.join('dl-cache')
    BOOKS = arguments.get('--books') or ''
    ZTITLE = arguments.get('--zim-title')
    ZDESC = arguments.get('--zim-desc')

    LANGUAGES = [x.strip().lower()
                 for x in (arguments.get('--languages') or '').split(',')
                 if x.strip()]
    # special shortcuts for "all"
    if arguments.get('--formats') in ['all', None]:
        FORMATS = ['epub', 'pdf']
    else:
        FORMATS = [x.strip().lower()
                   for x in (arguments.get('--formats') or '').split(',')
                   if x.strip()]

    try:
        BOOKS = BOOKS.split(',')
        f = lambda x: [int(i) for i in x.split('-') if i.isdigit()]
        books = []
        for i in BOOKS:
            blst = f(i)
            if len(blst) > 1:
                blst = range(blst[0], blst[1]+1)
            books.extend(blst)
        BOOKS = list(set(books))
    except Exception as e:
        logger.error(e)
        BOOKS = []

    # no arguments, default to --complete
    if not (DO_PREPARE + DO_PARSE + DO_DOWNLOAD + DO_EXPORT + DO_ZIM):
        COMPLETE_DUMP = True

    if COMPLETE_DUMP:
        DO_CHECKDEPS = DO_PREPARE = DO_PARSE = \
            DO_DOWNLOAD = DO_EXPORT = DO_ZIM = True

    if DO_CHECKDEPS:
        logger.info("CHECKING for dependencies on the system")
        if not check_dependencies()[0]:
            logger.error("Exiting...")
            sys.exit(1)

    if DO_PREPARE:
        logger.info("PREPARING rdf-files cache from {}".format(RDF_URL))
        setup_rdf_folder(rdf_url=RDF_URL, rdf_path=RDF_FOLDER)

    if DO_PARSE:
        logger.info("PARSING rdf-files in {}".format(RDF_FOLDER))
        setup_database(wipe=WIPE_DB)
        parse_and_fill(rdf_path=RDF_FOLDER, only_books=BOOKS)

    if DO_DOWNLOAD:
        logger.info("DOWNLOADING ebooks from mirror using filters")
        download_all_books(url_mirror=URL_MIRROR,
                           download_cache=DL_CACHE,
                           languages=LANGUAGES,
                           formats=FORMATS,
                           only_books=BOOKS)

    if DO_EXPORT:
        logger.info("EXPORTING ebooks to static folder (and JSON)")
        export_all_books(static_folder=STATIC_FOLDER,
                         download_cache=DL_CACHE,
                         languages=LANGUAGES,
                         formats=FORMATS,
                         only_books=BOOKS)

    if DO_ZIM:
        if not check_dependencies()[1]:
            logger.error("You don't have zimwriterfs installed.")
            sys.exit(1)
        logger.info("BUILDING ZIM off static folder {}".format(STATIC_FOLDER))
        build_zimfile(static_folder=STATIC_FOLDER, zim_path=ZIM_FILE,
                      languages=LANGUAGES, formats=FORMATS,
                      only_books=BOOKS,
                      title=ZTITLE, description=ZDESC)
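The try-block above expands a --books value such as "5,10-12" (comma-separated ids and dash ranges) into a flat list of ids. The same parsing isolated into a testable helper (an illustrative rewrite, not the project's function):

def parse_book_ids(spec):
    # expand "5,10-12" into [5, 10, 11, 12]; malformed chunks are skipped
    ids = set()
    for chunk in (spec or '').split(','):
        parts = [p for p in chunk.split('-') if p.strip().isdigit()]
        if len(parts) > 1:
            ids.update(range(int(parts[0]), int(parts[1]) + 1))
        elif parts:
            ids.add(int(parts[0]))
    return sorted(ids)

# parse_book_ids("5,10-12") -> [5, 10, 11, 12]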
Example #6
def build_zimfile(static_folder, zim_path=None,
                  languages=[], formats=[],
                  title=None, description=None,
                  only_books=[]):

    if not languages:
        languages = ['mul']

    languages.sort()
    formats.sort()

    if title is None:
        if len(languages) > 5:
            title = ("Project Gutenberg Library with {formats}"
                     .format(formats=",".join(formats)))
        else:
            title = ("Project Gutenberg Library ({langs}) with {formats}"
                     .format(langs=",".join(languages),
                             formats=",".join(formats)))

    logger.info("\tWritting ZIM for {}".format(title))

    if description is None:
        description = "The first producer of free ebooks"

    if zim_path is None:
        if len(languages) > 1:
            zim_path = "gutenberg_all_{date}.zim".format(
                    date=datetime.datetime.now().strftime('%m_%Y'))
        else:
            zim_path = "gutenberg_{lang}_all_{date}.zim".format(
                    lang=languages[0],
                    date=datetime.datetime.now().strftime('%Y-%m'))

    languages = [ISO_MATRIX.get(lang, lang) for lang in languages]
    languages.sort()

    context = {
        'languages': ','.join(languages),
        'title': title,
        'description': description,
        'creator': 'gutenberg.org',
        'publisher': 'Kiwix',

        'home': 'Home.html',
        'favicon': 'favicon.png',

        'static': static_folder,
        'zim': zim_path
    }

    cmd = ('zimwriterfs --welcome=\\"{home}\\" --favicon=\\"{favicon}\\" '
           '--language=\\"{languages}\\" --title=\\"{title}\\" '
           '--description=\\"{description}\\" '
           '--creator=\\"{creator}\\" --publisher=\\"{publisher}\\" \\"{static}\\" \\"{zim}\\"'
           .format(**context))

    logger.debug("\t\t{}".format(re.sub('\\\\"','"',cmd)))
    if exec_cmd(cmd):
        logger.info("Successfuly created ZIM file at {}".format(zim_path))
    else:
        logger.error("Unable to create ZIM file :(")
Example #7
def download_all_books(url_mirror,
                       download_cache,
                       languages=[],
                       formats=[],
                       only_books=[],
                       force=False):

    available_books = get_list_of_filtered_books(languages=languages,
                                                 formats=formats,
                                                 only_books=only_books)

    # ensure dir exists
    path(download_cache).mkdir_p()

    for book in available_books:

        logger.info(
            "\tDownloading content files for Book #{id}".format(id=book.id))

        # apply filters
        if not formats:
            formats = list(FORMAT_MATRIX.keys())

        # HTML is our base for ZIM so add it if not present
        if 'html' not in formats:
            formats.append('html')

        for format in formats:

            fpath = os.path.join(download_cache, fname_for(book, format))

            # check if already downloaded
            if path(fpath).exists() and not force:
                logger.debug("\t\t{fmt} already exists at {path}".format(
                    fmt=format, path=fpath))
                continue

            # retrieve corresponding BookFormat
            bfs = BookFormat.filter(book=book)

            if format == 'html':
                patterns = [
                    'mnsrb10h.htm', '8ledo10h.htm', 'tycho10f.htm',
                    '8ledo10h.zip', 'salme10h.htm', '8nszr10h.htm',
                    '{id}-h.html', '{id}.html.gen', '{id}-h.htm',
                    '8regr10h.zip', '{id}.html.noimages', '8lgme10h.htm',
                    'tycho10h.htm', 'tycho10h.zip', '8lgme10h.zip',
                    '8indn10h.zip', '8resp10h.zip', '20004-h.htm',
                    '8indn10h.htm', '8memo10h.zip', 'fondu10h.zip',
                    '{id}-h.zip', '8mort10h.zip'
                ]
                bfso = bfs
                bfs = bfs.join(Format).filter(Format.pattern << patterns)
                if not bfs.count():
                    from pprint import pprint as pp
                    pp([(b.format.mime, b.format.images,
                         b.format.pattern) for b in bfs])
                    pp([(b.format.mime, b.format.images,
                         b.format.pattern) for b in bfso])
                    logger.error("html not found")
                    continue
            else:
                bfs = bfs.filter(BookFormat.format << Format.filter(
                    mime=FORMAT_MATRIX.get(format)))

            if not bfs.count():
                logger.debug("[{}] not avail. for #{}# {}".format(
                    format, book.id, book.title))
                continue

            if bfs.count() > 1:
                try:
                    bf = bfs.join(Format).filter(Format.images == True).get()
                except Exception:
                    bf = bfs.get()
            else:
                bf = bfs.get()

            logger.debug("[{}] Requesting URLs for #{}# {}".format(
                format, book.id, book.title))

            # retrieve list of URLs for format unless we have it in DB
            if bf.downloaded_from and not force:
                urls = [bf.downloaded_from]
            else:
                urld = get_urls(book)
                urls = list(reversed(urld.get(FORMAT_MATRIX.get(format))))

            allurls = list(urls)

            while urls:
                url = urls.pop()

                if not resource_exists(url):
                    continue

                # HTML files are *sometimes* available as ZIP files
                if url.endswith('.zip'):
                    zpath = "{}.zip".format(fpath)

                    if not download_file(url, zpath):
                        logger.error(
                            "ZIP file download failed: {}".format(zpath))
                        continue

                    # extract zipfile
                    handle_zipped_epub(zippath=zpath,
                                       book=book,
                                       download_cache=download_cache)
                else:
                    if not download_file(url, fpath):
                        logger.error("file donwload failed: {}".format(fpath))
                        continue

                # store working URL in DB
                bf.downloaded_from = url
                bf.save()

            if not bf.downloaded_from:
                logger.error("NO FILE FOR #{}/{}".format(book.id, format))
                from pprint import pprint as pp
                pp(allurls)
                continue
Example #8
def main(arguments):

    # actions constants
    DO_PREPARE = arguments.get('--prepare', False)
    DO_PARSE = arguments.get('--parse', False)
    DO_DOWNLOAD = arguments.get('--download', False)
    DO_EXPORT = arguments.get('--export', False)
    DO_ZIM = arguments.get('--zim', False)
    DO_CHECKDEPS = arguments.get('--check', False)
    COMPLETE_DUMP = arguments.get('--complete', False)

    URL_MIRROR = arguments.get(
        '--mirror') or 'http://zimfarm.kiwix.org/gutenberg'
    RDF_FOLDER = arguments.get('--rdf-folder') or os.path.join('rdf-files')
    STATIC_FOLDER = arguments.get('--static-folder') or os.path.join('static')
    ZIM_FILE = arguments.get('--zim-file')
    WIPE_DB = not arguments.get('--keep-db')
    RDF_URL = arguments.get(
        '--rdf-url'
    ) or 'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
    DL_CACHE = arguments.get('--dl-folder') or os.path.join('dl-cache')
    BOOKS = arguments.get('--books') or ''
    ZTITLE = arguments.get('--zim-title')
    ZDESC = arguments.get('--zim-desc')

    # create tmp dir
    path('tmp').mkdir_p()

    LANGUAGES = [
        x.strip().lower()
        for x in (arguments.get('--languages') or '').split(',') if x.strip()
    ]
    # special shortcuts for "all"
    if arguments.get('--formats') in ['all', None]:
        FORMATS = ['epub', 'pdf']
    else:
        FORMATS = [
            x.strip().lower()
            for x in (arguments.get('--formats') or '').split(',')
            if x.strip()
        ]

    try:
        BOOKS = BOOKS.split(',')
        f = lambda x: [int(i) for i in x.split('-') if i.isdigit()]
        books = []
        for i in BOOKS:
            blst = f(i)
            if len(blst) > 1:
                blst = range(blst[0], blst[1] + 1)
            books.extend(blst)
        BOOKS = list(set(books))
    except Exception as e:
        logger.error(e)
        BOOKS = []

    # no arguments, default to --complete
    if not (DO_PREPARE + DO_PARSE + DO_DOWNLOAD + DO_EXPORT + DO_ZIM):
        COMPLETE_DUMP = True

    if COMPLETE_DUMP:
        DO_CHECKDEPS = DO_PREPARE = DO_PARSE = \
            DO_DOWNLOAD = DO_EXPORT = DO_ZIM = True

    if DO_CHECKDEPS:
        logger.info("CHECKING for dependencies on the system")
        if not check_dependencies()[0]:
            logger.error("Exiting...")
            sys.exit(1)

    if DO_PREPARE:
        logger.info("PREPARING rdf-files cache from {}".format(RDF_URL))
        setup_rdf_folder(rdf_url=RDF_URL, rdf_path=RDF_FOLDER)

    if DO_PARSE:
        logger.info("PARSING rdf-files in {}".format(RDF_FOLDER))
        setup_database(wipe=WIPE_DB)
        parse_and_fill(rdf_path=RDF_FOLDER, only_books=BOOKS)

    if DO_DOWNLOAD:
        logger.info("DOWNLOADING ebooks from mirror using filters")
        download_all_books(url_mirror=URL_MIRROR,
                           download_cache=DL_CACHE,
                           languages=LANGUAGES,
                           formats=FORMATS,
                           only_books=BOOKS)

    if DO_EXPORT:
        logger.info("EXPORTING ebooks to static folder (and JSON)")
        export_all_books(static_folder=STATIC_FOLDER,
                         download_cache=DL_CACHE,
                         languages=LANGUAGES,
                         formats=FORMATS,
                         only_books=BOOKS)

    if DO_ZIM:
        if not check_dependencies()[1]:
            logger.error("You don't have zimwriterfs installed.")
            sys.exit(1)
        logger.info("BUILDING ZIM off static folder {}".format(STATIC_FOLDER))
        build_zimfile(static_folder=STATIC_FOLDER,
                      zim_path=ZIM_FILE,
                      languages=LANGUAGES,
                      formats=FORMATS,
                      only_books=BOOKS,
                      title=ZTITLE,
                      description=ZDESC)
Example #9
File: export.py Project: emijrp/gutenberg
def export_book_to(book,
                   static_folder, download_cache,
                   cached_files, languages, formats, books):
    logger.info("\tExporting Book #{id}.".format(id=book.id))

    # actual book content, as HTML
    html = html_content_for(book=book,
                            static_folder=static_folder,
                            download_cache=download_cache)
    if html:
        article_fpath = os.path.join(static_folder, article_name_for(book))
        logger.info("\t\tExporting to {}".format(article_fpath))
        try:
            new_html = update_html_for_static(book=book, html_content=html)
        except Exception:
            new_html = html
        with open(article_fpath, 'w') as f:
            f.write(new_html)

    def symlink_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tSymlinking {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).link(dst)  # hard link
        except IOError:
            logger.error("/!\ Unable to symlink missing file {}".format(src))
            return

    def copy_from_cache(fname, dstfname=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)
        logger.info("\t\tCopying {}".format(dst))
        path(dst).unlink_p()
        try:
            path(src).copy(dst)
        except IOError:
            logger.error("/!\ Unable to copy missing file {}".format(src))
            return

    def optimize_image(fpath):
        if path(fpath).ext == '.png':
            return optimize_png(fpath)
        if path(fpath).ext in ('.jpg', '.jpeg'):
            return optimize_jpeg(fpath)
        if path(fpath).ext == '.gif':
            return optimize_gif(fpath)
        return fpath

    def optimize_gif(fpath):
        exec_cmd('gifsicle -O3 "{path}" -o "{path}"'.format(path=fpath))

    def optimize_png(fpath):
        pngquant = 'pngquant --nofs --force --ext=".png" "{path}"'
        advdef = 'advdef -z -4 -i 5 "{path}"'
        exec_cmd(pngquant.format(path=fpath))
        exec_cmd(advdef.format(path=fpath))

    def optimize_jpeg(fpath):
        exec_cmd('jpegoptim --strip-all -m50 "{path}"'.format(path=fpath))

    def optimize_epub(src, dst):
        logger.info("\t\tCreating ePUB at {}".format(dst))
        zipped_files = []
        # create temp directory to extract to
        tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)
        with zipfile.ZipFile(src, 'r') as zf:
            zipped_files = zf.namelist()
            zf.extractall(tmpd)

        remove_cover = False
        for fname in list(zipped_files):  # iterate a copy; entries may be removed
            fnp = os.path.join(tmpd, fname)
            if path(fname).ext in ('.png', '.jpeg', '.jpg', '.gif'):

                # special case to remove ugly cover
                if fname.endswith('cover.jpg') and is_bad_cover(fnp):
                    zipped_files.remove(fname)
                    remove_cover = True
                else:
                    optimize_image(path_for_cmd(fnp))

            if path(fname).ext in ('.htm', '.html'):
                with open(fnp, 'r') as f:
                    html = update_html_for_static(book=book,
                                                  html_content=f.read(),
                                                  epub=True)
                with open(fnp, 'w') as f:
                    f.write(html)

            if path(fname).ext == '.ncx':
                pattern = "*** START: FULL LICENSE ***"
                with open(fnp, 'r') as f:
                    ncx = f.read()
                soup = BeautifulSoup(ncx, ["lxml", "xml"])
                for tag in soup.findAll('text'):
                    if pattern in tag.text:
                        # drop the license section and everything after it
                        s = tag.parent.parent
                        for sibling in list(s.next_siblings):
                            sibling.decompose()
                        s.decompose()

                with open(fnp, 'w') as f:
                    f.write(soup.encode())

        # delete {id}/cover.jpg if it exists and update {id}/content.opf
        if remove_cover:

            # remove cover
            path(os.path.join(tmpd, str(book.id), 'cover.jpg')).unlink_p()

            soup = None
            opff = os.path.join(tmpd, str(book.id), 'content.opf')
            if os.path.exists(opff):
                with open(opff, 'r') as fd:
                    soup = BeautifulSoup(fd.read(), ["lxml", "xml"])

                for elem in soup.findAll():
                    if getattr(elem, 'attrs', {}).get('href') == 'cover.jpg':
                        elem.decompose()

                with open(opff, 'w') as fd:
                    fd.write(soup.encode())

        with cd(tmpd):
            exec_cmd('zip -q0X "{dst}" mimetype'.format(dst=path_for_cmd(dst)))
            exec_cmd('zip -qXr9D "{dst}" {files}'
                     .format(dst=path_for_cmd(dst),
                             files=" ".join([f for f in zipped_files
                                             if not f == 'mimetype'])))

        path(tmpd).rmtree_p()

    def handle_companion_file(fname, dstfname=None, book=None):
        src = os.path.join(path(download_cache).abspath(), fname)
        if dstfname is None:
            dstfname = fname
        dst = os.path.join(path(static_folder).abspath(), dstfname)

        # optimization based on mime/extension
        if path(fname).ext in ('.png', '.jpg', '.jpeg', '.gif'):
            copy_from_cache(src, dst)
            optimize_image(path_for_cmd(dst))
        elif path(fname).ext == '.epub':
            tmp_epub = tempfile.NamedTemporaryFile(suffix='.epub',
                                                   dir=TMP_FOLDER)
            tmp_epub.close()
            optimize_epub(src, tmp_epub.name)
            path(tmp_epub.name).move(dst)
        else:
            # excludes files created by Windows Explorer
            if src.endswith('_Thumbs.db'):
                return
            # copy otherwise (PDF mostly)
            logger.debug("\t\tshitty ext: {}".format(dst))
            copy_from_cache(src, dst)

    # associated files (images, etc)
    for fname in [fn for fn in cached_files
                  if fn.startswith("{}_".format(book.id))]:

        if path(fname).ext in ('.html', '.htm'):
            src = os.path.join(path(download_cache).abspath(), fname)
            dst = os.path.join(path(static_folder).abspath(), fname)

            logger.info("\t\tExporting HTML file to {}".format(dst))
            html = "CAN'T READ FILE"
            with open(src, 'r') as f:
                html = f.read()
            new_html = update_html_for_static(book=book, html_content=html)
            with open(dst, 'w') as f:
                f.write(new_html)
        else:
            logger.info("\t\tCopying companion file to {}".format(fname))
            try:
                handle_companion_file(fname)
            except Exception as e:
                logger.error("\t\tException while handling companion file: {}"
                             .format(e))

    # other formats
    for format in formats:
        if format not in book.formats() or format == 'html':
            continue
        logger.info("\t\tCopying format file to {}"
                    .format(archive_name_for(book, format)))
        try:
            handle_companion_file(fname_for(book, format),
                                  archive_name_for(book, format))
        except Exception as e:
            logger.error("\t\tException while handling companion file: {}"
                         .format(e))

    # book presentation article
    cover_fpath = os.path.join(static_folder,
                               article_name_for(book=book, cover=True))
    logger.info("\t\tExporting to {}".format(cover_fpath))
    html = cover_html_content_for(book=book,
                                  static_folder=static_folder,
                                  books=books)
    with open(cover_fpath, 'w') as f:
        f.write(html.encode('utf-8'))
Example #10
def save_rdf_in_database(parser):

    # Insert author if it does not exist
    if parser.author_id:
        try:
            author_record = Author.get(gut_id=parser.author_id)
            if parser.last_name:
                author_record.last_name = parser.last_name
            if parser.first_name:
                author_record.first_names = parser.first_name
            if parser.birth_year:
                author_record.birth_year = parser.birth_year
            if parser.death_year:
                author_record.death_year = parser.death_year
            author_record.save()
        except Exception:
            author_record = Author.create(
                gut_id=parser.author_id,
                last_name=parser.last_name,
                first_names=parser.first_name,
                birth_year=parser.birth_year,
                death_year=parser.death_year)
    else:
        # No author, set Anonymous
        author_record = Author.get(gut_id='216')

    # Get license
    try:
        license_record = License.get(name=parser.license)
    except Exception:
        license_record = None

    # Insert book
    book_record = Book.create(
        id=parser.gid,
        title=parser.title.strip(),
        subtitle=parser.subtitle.strip(),
        author=author_record,  # foreign key
        license=license_record,  # foreign key
        language=parser.language.strip(),
        downloads=parser.downloads
    )

    # Insert formats
    for file_type in parser.file_types:

        # Sanitize MIME
        mime = parser.file_types[file_type]
        if not mime.startswith('text/plain'):
            mime = re.sub(r'; charset=[a-z0-9-]+', '', mime)
        # else:
        #    charset = re.match(r'; charset=([a-z0-9-]+)', mime).groups()[0]

        # Insert format type
        pattern = re.sub(r'' + parser.gid, '{id}', file_type)
        pattern = pattern.split('/')[-1]

        bid = int(book_record.id)

        if bid in BAD_BOOKS_FORMATS.keys() \
            and mime in [FORMAT_MATRIX.get(f)
                         for f in BAD_BOOKS_FORMATS.get(bid)]:
            logger.error("\t**** EXCLUDING **** {} for book #{} from list."
                         .format(mime, bid))
            continue

        format_record = Format.get_or_create(
            mime=mime,
            images=(file_type.endswith('.images')
                    or parser.file_types[file_type] == 'application/pdf'),
            pattern=pattern)

        # Insert book format
        BookFormat.create(
            book=book_record,  # foreign key
            format=format_record  # foreign key
        )
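The MIME-sanitizing step strips charset parameters from everything except text/plain, so that format lookups compare bare types against FORMAT_MATRIX. Pulled out as a self-contained helper:

import re

def sanitize_mime(mime):
    # keep the charset on text/plain, strip it everywhere else
    if mime.startswith('text/plain'):
        return mime
    return re.sub(r'; charset=[a-z0-9-]+', '', mime)

# sanitize_mime('text/html; charset=utf-8') -> 'text/html'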