Exemplos de HTMLPreProcessor em Python, exemplos de calibre.ebooks.conversion.preprocess.HTMLPreProcessor em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: nonpdf.py Projeto: outrera/calibre-extract-isbn

def get_isbn_from_non_pdf(log, book_path):
    """Scan the spine files of a non-PDF book for an ISBN.

    The book is opened through ``EbookIterator``; each spine file is read,
    decoded as UTF-8 (undecodable bytes replaced), run through
    ``HTMLPreProcessor`` and handed to a ``BookScanner``. Files are visited
    in three passes -- the leading files, the trailing files in reverse
    order, then whatever lies in between -- and scanning stops as soon as
    the scanner reports an identifier.

    :param log: callable logger that also exposes ``error``.
    :param book_path: path of the book to open.
    :return: ``scanner.get_isbn_result()``, or ``None`` when the spine is
        empty.
    """
    scanner = BookScanner(log)
    iterator = EbookIterator(book_path)
    try:
        iterator.__enter__(only_input_plugin=True,
                           run_char_count=False,
                           read_anchor_map=False)
        count = len(iterator.spine)
        if count == 0:
            return
        preprocessor = HTMLPreProcessor()

        def _process_file(path, forward=True):
            # Feed one spine file to the scanner; missing files are logged
            # and skipped rather than aborting the whole scan.
            if not os.path.exists(path):
                log.error('  File does not exist:', path)
                return
            with open(path, 'rb') as f:
                html = f.read().decode('utf-8', 'replace')
            html = preprocessor(html, get_preprocess_html=True)
            scanner.look_for_identifiers_in_text([html], forward=forward)

        # For PDFs we scan the first 10 pages then the last 5
        # For other formats (all converted to ePub) there is no concept
        # of pages, only files in the spine (manifest).
        # So based on the size of the ePub, we will scan the first few
        # files, then the last few in reverse, then the rest of the content.

        # Fallback used when no EPUB_FILE_SCANS entry applies to this spine
        # size: scan every file front to back. Without these defaults the
        # three names would be unbound below and raise NameError.
        first_files = iterator.spine[:]
        last_files = []
        middle_files = []
        for min_files, front_count, rear_count in EPUB_FILE_SCANS:
            if count >= min_files:
                first_files = iterator.spine[:front_count]
                last_files = []
                # rear_count is used as a slice start; presumably it is zero
                # or negative in EPUB_FILE_SCANS -- TODO confirm.
                if rear_count != 0:
                    last_files = iterator.spine[rear_count:]
                middle_files = []
                if count - min_files > 0:
                    middle_files = iterator.spine[front_count:rear_count]
                break

        log('  Scanning first %d, then last %d, then remaining %d files' %
            (len(first_files), len(last_files), len(middle_files)))
        for path in first_files:
            _process_file(path, forward=True)
            if scanner.has_identifier():
                break

        if not scanner.has_identifier() and last_files:
            for path in reversed(last_files):
                _process_file(path, forward=False)
                if scanner.has_identifier():
                    break

        if not scanner.has_identifier() and middle_files:
            for path in middle_files:
                _process_file(path, forward=True)
                if scanner.has_identifier():
                    break
    finally:
        # Always release the iterator, including on the early return above.
        if iterator:
            iterator.__exit__()

    return scanner.get_isbn_result()

Exemplo n.º 2

0

Exibir arquivo

def CreateOeb(log, path_or_stream, opts, encoding='utf-8'):
    """Build an ``OEBBook`` wired to a fresh ``HTMLPreProcessor``.

    :param log: logger passed to both the preprocessor and the book.
    :param path_or_stream: accepted for interface compatibility; not used
        by this function.
    :param opts: options object; when falsy, pretty printing is disabled.
    :param encoding: input encoding; any falsy value is passed on as
        ``None``.
    :return: the newly constructed ``OEBBook``.
    """
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
    from calibre.ebooks.oeb.base import OEBBook

    preprocessor = HTMLPreProcessor(log, opts)
    # Collapse '' (or any other falsy value) to None.
    input_encoding = encoding or None
    if opts:
        want_pretty = opts.pretty_print
    else:
        want_pretty = False
    return OEBBook(
        log,
        preprocessor,
        pretty_print=want_pretty,
        input_encoding=input_encoding,
    )

Exemplo n.º 3

0

Exibir arquivo

Arquivo: makeoeb.py Projeto: youngshook/KindleEar

def CreateOeb(log, path_or_stream, opts, encoding='utf-8'):
    """Create an empty OEB book.

    :param log: logger passed to both the preprocessor and the book.
    :param path_or_stream: accepted for interface compatibility; not used
        by this function.
    :param opts: options object providing ``pretty_print``; may be falsy
        (e.g. ``None``), in which case pretty printing is disabled.
    :param encoding: input encoding; any falsy value is passed on as
        ``None``.
    :return: the newly constructed ``OEBBook``.
    """
    html_preprocessor = HTMLPreProcessor(log, opts)
    if not encoding:
        encoding = None
    # Guard against a missing options object instead of failing with
    # AttributeError on ``opts.pretty_print``.
    pretty_print = opts.pretty_print if opts else False
    return OEBBook(log,
                   html_preprocessor,
                   pretty_print=pretty_print,
                   input_encoding=encoding)

Exemplo n.º 4

0

Exibir arquivo

    def __init__(self, rootpath, opfpath, log, clone_data=None):
        """Set up the container's bookkeeping for an unpacked book.

        With ``clone_data`` the name/mime/encoding maps are copied from it
        and no filesystem walk takes place. Otherwise every file below the
        root is indexed, the OPF is located, and the mime map is refined
        with the media types declared in the OPF manifest.

        :param rootpath: root directory of the unpacked book (ignored when
            ``clone_data`` is given).
        :param opfpath: path of the OPF file.
        :param log: logger stored on the instance.
        :param clone_data: optional mapping with the state to clone.
        :raises InvalidBook: if the OPF file is not found below the root.
        """
        is_clone = clone_data is not None
        if is_clone:
            self.root = clone_data['root']
        else:
            self.root = os.path.abspath(rootpath)
        self.log = log
        self.html_preprocessor = HTMLPreProcessor()
        self.css_preprocessor = CSSPreProcessor()
        self.tweak_mode = False

        self.parsed_cache = {}
        self.mime_map = {}
        self.name_path_map = {}
        self.dirtied = set()
        self.encoding_map = {}
        self.pretty_print = set()
        self.cloned = is_clone
        self.cache_names = (
            'parsed_cache', 'mime_map', 'name_path_map',
            'encoding_map', 'dirtied', 'pretty_print',
        )

        if is_clone:
            # Adopt the cloned state verbatim; nothing is read from disk.
            cloned_attrs = (
                'name_path_map', 'opf_name', 'mime_map', 'pretty_print',
                'encoding_map', 'tweak_mode',
            )
            for attr in cloned_attrs:
                setattr(self, attr, clone_data[attr])
            opf_location = self.name_path_map[self.opf_name]
            self.opf_dir = os.path.dirname(opf_location)
            return

        # name_path_map goes from relative names ('/'-separated, relative to
        # the root of the unzipped ePub) to absolute filesystem paths using
        # the OS-specific separator.
        opf_abspath = os.path.abspath(os.path.realpath(opfpath))
        for folder, _subdirs, files in os.walk(self.root):
            for filename in files:
                full_path = join(folder, filename)
                name = self.abspath_to_name(full_path)
                self.name_path_map[name] = full_path
                self.mime_map[name] = guess_type(full_path)
                if full_path != opf_abspath:
                    continue
                # This is the OPF itself: remember it and force its mime
                # type to whatever an '.opf' file name maps to.
                self.opf_name = name
                self.opf_dir = os.path.dirname(full_path)
                self.mime_map[name] = guess_type('a.opf')

        if not hasattr(self, 'opf_name'):
            raise InvalidBook('Could not locate opf file: %r' % opfpath)

        # Media types declared in the OPF manifest override the guesses.
        manifest_items = self.opf_xpath(
            '//opf:manifest/opf:item[@href and @media-type]')
        for item in manifest_items:
            name = self.href_to_name(item.get('href'), self.opf_name)
            if name in self.mime_map:
                self.mime_map[name] = item.get('media-type')

Exemplo n.º 5

0

Exibir arquivo

Arquivo: __init__.py Projeto: syn-gowthamsrungarapu/calibre

def get_preprocess_html(path_to_ebook, output):
    """Write the preprocessed HTML of every spine file to ``output``.

    Each spine file is decoded as UTF-8 (undecodable bytes replaced), run
    through ``HTMLPreProcessor`` and written UTF-8 encoded, followed by a
    separator line of 80 dashes.

    :param path_to_ebook: path of the book to open with ``EbookIterator``.
    :param output: path of the file to write; opened in binary mode and
        overwritten.
    """
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
    iterator = EbookIterator(path_to_ebook)
    iterator.__enter__(only_input_plugin=True,
                       run_char_count=False,
                       read_anchor_map=False)
    try:
        preprocessor = HTMLPreProcessor(None, False)
        separator = b'\n\n' + b'-' * 80 + b'\n\n'
        with open(output, 'wb') as out:
            for path in iterator.spine:
                with open(path, 'rb') as f:
                    html = f.read().decode('utf-8', 'replace')
                html = preprocessor(html, get_preprocess_html=True)
                out.write(html.encode('utf-8'))
                out.write(separator)
    finally:
        # Pair the manual __enter__ with __exit__ so the iterator is
        # released even when reading or preprocessing fails.
        iterator.__exit__()

Exemplo n.º 6

0

Exibir arquivo

        if x > SCREEN_X or y > SCREEN_Y:
            xScale = float(x) / SCREEN_X
            yScale = float(y) / SCREEN_Y
            scale = max(xScale, yScale)
            # TODO : intelligent image rotation
            #     img = img.rotate(90)
            #     x,y = y,x
            img = resize_image(img, x // scale, y // scale)
        with lopen(imagePath, 'wb') as f:
            f.write(image_to_data(img, fmt=imagePath.rpartition('.')[-1]))


if __name__ == '__main__':
    # Manual smoke test: read an already-processed OEB directory and convert
    # it to an SNB file using the Hanlin V3 output profile.
    from calibre.ebooks.oeb.reader import OEBReader
    from calibre.ebooks.oeb.base import OEBBook
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
    from calibre.customize.profiles import HanlinV3Output
    from calibre.utils.logging import default_log

    class OptionValues:
        """Plain attribute holder standing in for the conversion options."""

    opts = OptionValues()
    opts.output_profile = HanlinV3Output(None)

    oeb = OEBBook(default_log, HTMLPreProcessor(None, None, opts))
    OEBReader()(oeb, '/tmp/bbb/processed/')

    snb_output = SNBOutput(None)
    snb_output.convert(oeb, '/tmp/test.snb', None, None, default_log)