def get_isbn_from_non_pdf(log, book_path):
    """Scan the spine files of a non-PDF book for an ISBN.

    The book is opened through ``EbookIterator``; each selected spine file
    is read, decoded as UTF-8 (undecodable bytes replaced), run through
    ``HTMLPreProcessor`` and handed to a ``BookScanner``.  Files are visited
    in up to three passes and scanning stops as soon as the scanner reports
    that it has an identifier.

    :param log: logger, called directly for the progress message and via
                ``log.error`` for missing files; also given to the scanner.
    :param book_path: path passed to ``EbookIterator``.
    :return: ``scanner.get_isbn_result()``, or ``None`` when the spine is
             empty.
    """
    scanner = BookScanner(log)
    iterator = EbookIterator(book_path)
    try:
        iterator.__enter__(only_input_plugin=True, run_char_count=False,
                           read_anchor_map=False)
        spine = iterator.spine
        total = len(spine)
        if total == 0:
            return

        preprocessor = HTMLPreProcessor()

        def _scan_file(path, forward=True):
            # Feed one spine file to the scanner; a missing file is only logged.
            if not os.path.exists(path):
                log.error(' File does not exist:', path)
                return
            with open(path, 'rb') as stream:
                raw = stream.read()
            html = preprocessor(raw.decode('utf-8', 'replace'),
                                get_preprocess_html=True)
            scanner.look_for_identifiers_in_text([html], forward=forward)

        def _scan_until_found(paths, forward):
            # Scan paths in order, stopping once an identifier has been found.
            for path in paths:
                _scan_file(path, forward=forward)
                if scanner.has_identifier():
                    break

        # PDFs are scanned page-wise (first 10 pages, then the last 5).
        # Other formats are all converted to ePub, which has no pages, only
        # files in the spine (manifest).  So, depending on how many files
        # there are, scan the first few, then the last few in reverse, then
        # whatever remains.  The first EPUB_FILE_SCANS entry whose minimum
        # file count is satisfied decides the split.
        for min_files, front_count, rear_count in EPUB_FILE_SCANS:
            if total < min_files:
                continue
            first_files = spine[:front_count]
            last_files = spine[rear_count:] if rear_count != 0 else []
            middle_files = spine[front_count:rear_count] if total > min_files else []
            break

        log(' Scanning first %d, then last %d, then remaining %d files' % (
            len(first_files), len(last_files), len(middle_files)))

        _scan_until_found(first_files, forward=True)
        if not scanner.has_identifier() and last_files:
            _scan_until_found(reversed(last_files), forward=False)
        if not scanner.has_identifier() and middle_files:
            _scan_until_found(middle_files, forward=True)
    finally:
        # Always leave the iterator's context, including on the early return.
        if iterator:
            iterator.__exit__()
    return scanner.get_isbn_result()
def CreateOeb(log, path_or_stream, opts, encoding='utf-8'):
    """Return a new ``OEBBook`` backed by a fresh ``HTMLPreProcessor``.

    :param log: logger passed to both the preprocessor and the book.
    :param path_or_stream: accepted for interface compatibility; not used
                           by this function.
    :param opts: conversion options; when falsy, pretty printing is off.
    :param encoding: input encoding for the book; any falsy value is
                     normalised to ``None``.
    :return: the constructed ``OEBBook``.
    """
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
    from calibre.ebooks.oeb.base import OEBBook

    preprocessor = HTMLPreProcessor(log, opts)
    input_encoding = encoding or None
    if opts:
        pretty = opts.pretty_print
    else:
        pretty = False
    return OEBBook(log, preprocessor,
                   pretty_print=pretty,
                   input_encoding=input_encoding)
def CreateOeb(log, path_or_stream, opts, encoding='utf-8'):
    """Create an empty OEB book.

    :param log: logger passed to both the preprocessor and the book.
    :param path_or_stream: accepted for interface compatibility; not used
                           by this function.
    :param opts: conversion options.  May be ``None`` (or otherwise falsy),
                 in which case pretty printing is disabled instead of
                 failing on the ``pretty_print`` attribute lookup.
    :param encoding: input encoding for the book; any falsy value is
                     normalised to ``None``.
    :return: the constructed ``OEBBook``.
    """
    html_preprocessor = HTMLPreProcessor(log, opts)
    if not encoding:
        encoding = None
    # Guard against missing options so callers without an options object
    # get a book with pretty printing off rather than an AttributeError.
    pretty_print = opts.pretty_print if opts else False
    return OEBBook(log, html_preprocessor,
                   pretty_print=pretty_print, input_encoding=encoding)
def __init__(self, rootpath, opfpath, log, clone_data=None):
    """Set up the container, either from disk or from cloned state.

    :param rootpath: root directory of the unpacked book; made absolute and
                     walked to discover files (ignored when cloning).
    :param opfpath: path to the OPF file; it must be found while walking
                    the root directory.
    :param log: logger stored on ``self.log``.
    :param clone_data: optional mapping providing ``root``,
                       ``name_path_map``, ``opf_name``, ``mime_map``,
                       ``pretty_print``, ``encoding_map`` and
                       ``tweak_mode``.  When given, state is copied from it
                       and the filesystem is not walked.
    :raises InvalidBook: if the OPF file is not found under the root.
    """
    is_clone = clone_data is not None
    if is_clone:
        self.root = clone_data['root']
    else:
        self.root = os.path.abspath(rootpath)
    self.log = log
    self.html_preprocessor = HTMLPreProcessor()
    self.css_preprocessor = CSSPreProcessor()
    self.tweak_mode = False

    self.parsed_cache = {}
    self.mime_map = {}
    self.name_path_map = {}
    self.dirtied = set()
    self.encoding_map = {}
    self.pretty_print = set()
    self.cloned = is_clone
    self.cache_names = ('parsed_cache', 'mime_map', 'name_path_map',
                        'encoding_map', 'dirtied', 'pretty_print')

    if is_clone:
        # Take over the state supplied by the clone source and stop here.
        for attr in ('name_path_map', 'opf_name', 'mime_map',
                     'pretty_print', 'encoding_map', 'tweak_mode'):
            setattr(self, attr, clone_data[attr])
        self.opf_dir = os.path.dirname(self.name_path_map[self.opf_name])
        return

    # name_path_map: relative paths with '/' separators from the root of
    # the unzipped ePub -> absolute filesystem paths with os-specific
    # separators.
    opf_abspath = os.path.abspath(os.path.realpath(opfpath))
    for folder, _subdirs, files in os.walk(self.root):
        for filename in files:
            fs_path = join(folder, filename)
            book_name = self.abspath_to_name(fs_path)
            self.name_path_map[book_name] = fs_path
            self.mime_map[book_name] = guess_type(fs_path)
            if fs_path != opf_abspath:
                continue
            # This is the OPF itself: remember it and force its mime type
            # to whatever an '.opf' file name maps to.
            self.opf_name = book_name
            self.opf_dir = os.path.dirname(fs_path)
            self.mime_map[book_name] = guess_type('a.opf')

    if not hasattr(self, 'opf_name'):
        raise InvalidBook('Could not locate opf file: %r' % opf_abspath)

    # Media types declared in the OPF manifest override the guessed ones,
    # but only for files that were actually found on disk.
    manifest_items = self.opf_xpath(
        '//opf:manifest/opf:item[@href and @media-type]')
    for entry in manifest_items:
        target = self.href_to_name(entry.get('href'), self.opf_name)
        if target in self.mime_map:
            self.mime_map[target] = entry.get('media-type')
def get_preprocess_html(path_to_ebook, output):
    """Write the preprocessed HTML of every spine file to ``output``.

    Each spine file is read, decoded as UTF-8 (undecodable bytes replaced),
    run through ``HTMLPreProcessor`` and written to ``output`` as UTF-8,
    followed by a separator made of 80 dashes surrounded by blank lines.

    :param path_to_ebook: path passed to ``EbookIterator``.
    :param output: path of the file to (over)write, opened in binary mode.
    """
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor

    separator = b'\n\n' + b'-' * 80 + b'\n\n'
    iterator = EbookIterator(path_to_ebook)
    iterator.__enter__(only_input_plugin=True, run_char_count=False,
                       read_anchor_map=False)
    try:
        preprocessor = HTMLPreProcessor(None, False)
        with open(output, 'wb') as out:
            for path in iterator.spine:
                with open(path, 'rb') as f:
                    html = f.read().decode('utf-8', 'replace')
                html = preprocessor(html, get_preprocess_html=True)
                out.write(html.encode('utf-8'))
                out.write(separator)
    finally:
        # The iterator was entered manually, so it must be exited manually
        # too, even when reading or writing fails part-way through.
        iterator.__exit__()
# NOTE(review): the statements down to the `with lopen(...)` block are the
# tail of a routine whose beginning lies outside this excerpt; x, y, img and
# imagePath are defined there (x and y are presumably the image width and
# height -- confirm against the preceding code).
if x > SCREEN_X or y > SCREEN_Y:
    # Shrink factor needed along each axis to fit within the screen size.
    xScale = float(x) / SCREEN_X
    yScale = float(y) / SCREEN_Y
    # Use the larger factor for both axes so both dimensions are divided by
    # the same amount.
    scale = max(xScale, yScale)
    # TODO : intelligent image rotation
    # img = img.rotate(90)
    # x,y = y,x
    img = resize_image(img, x // scale, y // scale)
# Write the image to imagePath; the output format is taken from the text
# after the last '.' in the path.
with lopen(imagePath, 'wb') as f:
    f.write(image_to_data(img, fmt=imagePath.rpartition('.')[-1]))

# Ad-hoc manual run: build an OEBBook, invoke an OEBReader instance on it with
# a hard-coded /tmp directory and convert the result to /tmp/test.snb.
if __name__ == '__main__':
    from calibre.ebooks.oeb.reader import OEBReader
    from calibre.ebooks.oeb.base import OEBBook
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
    from calibre.customize.profiles import HanlinV3Output

    # Bare attribute holder standing in for a real options object.
    class OptionValues:
        pass

    opts = OptionValues()
    opts.output_profile = HanlinV3Output(None)

    # NOTE(review): three positional arguments are passed here, whereas other
    # call sites pass at most (log, opts) -- confirm this matches the
    # HTMLPreProcessor signature in use.
    html_preprocessor = HTMLPreProcessor(None, None, opts)
    from calibre.utils.logging import default_log
    oeb = OEBBook(default_log, html_preprocessor)
    # OEBReader is instantiated and then called with the book and source dir.
    reader = OEBReader
    reader()(oeb, '/tmp/bbb/processed/')
    SNBOutput(None).convert(oeb, '/tmp/test.snb', None, None, default_log)