def get_metadata(stream):
    '''
    Parse an OPF document and return its metadata together with a few extras.

    ``stream`` is either a file-like object or raw ``bytes``; bytes are
    wrapped in a ``DummyFile`` before being handed to ``parse_opf``.

    Returns a 4-tuple ``(book metadata, OPF version, raster cover,
    first spine item)``.
    '''
    source = DummyFile(stream) if isinstance(stream, bytes) else stream
    root = parse_opf(source)
    ver = parse_opf_version(root.get('version'))
    # The tree is already parsed, so OPF gets no stream of its own.
    opf = OPF(None, preparsed_opf=root, read_toc=False)
    book_metadata = opf.to_book_metadata()
    raster_cover = opf.raster_cover
    first_item = opf.first_spine_item()
    return book_metadata, ver, raster_cover, first_item
def get_metadata(stream):
    '''
    Read the metadata, and a cover when one can be found, from a LIT file.

    The OPF embedded in the LIT container is parsed for the metadata. Every
    guide reference whose type contains "cover" is then matched against the
    container manifest and the matching file contents are collected.

    :param stream: the LIT file, passed straight to ``LitContainer``.
    :return: the metadata object produced by ``OPF.to_book_metadata()``, with
        ``cover_data`` set to ``('jpg', data)`` when at least one cover
        candidate could be extracted.
    '''
    from calibre.ebooks.lit.reader import LitContainer
    from calibre.utils.logging import Log
    litfile = LitContainer(stream, Log())
    src = litfile.get_metadata().encode('utf-8')
    litfile = litfile._litfile
    opf = OPF(cStringIO.StringIO(src), os.getcwdu())
    mi = opf.to_book_metadata()

    covers = []
    for ref in opf.iterguide():
        if 'cover' not in ref.get('type', '').lower():
            continue
        ctype = ref.get('type')
        href = ref.get('href', '')
        # The manifest path may carry the ampersand percent-encoded.
        candidates = [href, href.replace('&', '%26')]
        for entry in litfile.manifest.values():
            if entry.path in candidates:
                try:
                    covers.append(
                        (litfile.get_file('/data/' + entry.internal), ctype))
                except Exception:
                    # Best effort: an unreadable cover must not prevent the
                    # rest of the metadata from being returned.
                    pass
                break

    # Longest data first. A key function sorts the same way as the old
    # cmp-based call and also works on Python 3.
    covers.sort(key=lambda x: len(x[0]), reverse=True)
    if covers:
        idx = 0
        # Use the second candidate when its guide type is the first one's
        # type with a '-standard' suffix.
        if len(covers) > 1 and covers[1][1] == covers[0][1] + '-standard':
            idx = 1
        mi.cover_data = ('jpg', covers[idx][0])
    return mi
def read_embedded_metadata(self, root, elem, guide):
    '''
    Parse the OPF-style metadata embedded in the book markup.

    ``elem`` is serialized, wrapped in a ``<package>`` element and parsed
    with ``OPF``; the result is stored in ``self.embedded_mi``. When a guide
    is supplied, the first reference whose type contains "cover" is followed
    to its anchor, and the first ``img`` at or after that anchor (in
    document order) becomes the cover: its ``src`` is stored in
    ``self.embedded_mi.cover`` and the ``img`` element is removed from
    the tree.

    :param root: root element of the parsed book markup (lxml tree).
    :param elem: element holding the embedded metadata.
    :param guide: guide element, or ``None`` when there is none.
    '''
    raw = '<?xml version="1.0" encoding="utf-8" ?>\n<package>' + \
        html.tostring(elem, encoding='utf-8') + '</package>'
    stream = cStringIO.StringIO(raw)
    opf = OPF(stream)
    self.embedded_mi = opf.to_book_metadata()
    if guide is None:
        return

    for ref in guide.xpath('descendant::reference'):
        if 'cover' not in ref.get('type', '').lower():
            continue
        href = ref.get('href', '')
        if href.startswith('#'):
            href = href[1:]
        # Pass the anchor as an XPath variable rather than interpolating it
        # into the expression, so an id containing quotes cannot break the
        # query.
        anchors = root.xpath('//*[@id=$anchor_id]', anchor_id=href)
        if anchors:
            cpos = anchors[0]
            reached = False
            for node in root.iter():
                if node is cpos:
                    reached = True
                if reached and node.tag == 'img':
                    self.embedded_mi.cover = node.get('src', None)
                    node.getparent().remove(node)
                    break
        # Only the first cover reference is considered.
        break
def handle_zip_of_opf_files(self, stream):
    '''
    Given a zip up of a bunch of opf files, either merge them or add them
    to library.

    Every ``.opf`` member is parsed. If an id can be extracted from it and
    existing metadata is found for that id, the existing book is updated;
    if an id is extracted but nothing exists for it, the book is imported
    as new. ``.jpg``/``.png``/``.gif`` members whose base name (lower-cased,
    up to the first dot) is a key of ``self.book_map`` are set as the cover
    of the mapped book.

    :param stream: the zip archive, passed to ``ZipFile``.
    :return: dict with the number of books ``'updated'`` and ``'added'``.
    '''
    result = {'updated': 0, 'added': 0}
    with ZipFile(stream, 'r') as zf:
        self.start_applying_updates()
        for zi in zf.infolist():
            ext = zi.filename.rpartition('.')[-1].lower()
            if ext == 'opf':
                try:
                    raw = zf.open(zi)
                    opf = OPF(raw)
                    mi = opf.to_book_metadata()
                    casanova_id = self.extract_id(mi)
                    if casanova_id:
                        book_mi = self.get_casanova_metadata(casanova_id['id'])
                        if book_mi:
                            # Update an existing book's metadata!
                            result['updated'] += 1
                            self.apply_metadata_update(
                                casanova_id['id'], book_mi, mi)
                        else:
                            # Create a new book entry
                            result['added'] += 1
                            self.model.db.import_book(mi, [])
                except Exception:
                    # One bad OPF must not abort the whole batch, but the
                    # failure is reported instead of being swallowed.
                    import traceback
                    traceback.print_exc()
            elif ext in {'jpg', 'png', 'gif'}:
                # try and handle the cover
                casanova_id = zi.filename.partition('.')[0].lower()
                if casanova_id in self.book_map:
                    book_id = self.book_map[casanova_id]
                    raw = zf.open(zi)
                    self.db.set_cover(book_id, raw)
        self.finish_applying_updates()
    return result
def process_dir(self, dirpath, filenames, book_id):
    '''
    Record what is found in one book directory.

    The ebook files in ``filenames`` are listed as (FORMAT, size, name)
    triples and ``metadata.opf`` in ``dirpath`` is parsed. If the OPF's
    application id equals ``book_id``, a record is appended to
    ``self.books``; otherwise ``dirpath`` is appended to
    ``self.mismatched_dirs``. In both cases the author links from the
    metadata are merged into ``self.authors_links``.
    '''
    book_id = int(book_id)

    format_records = []
    for fname in filenames:
        if not self.is_ebook_file(fname):
            continue
        stem, ext = os.path.splitext(fname)
        size = os.path.getsize(os.path.join(dirpath, fname))
        format_records.append((ext[1:].upper(), size, stem))

    opf_path = os.path.join(dirpath, 'metadata.opf')
    parsed_opf = OPF(opf_path, basedir=dirpath)
    mi = parsed_opf.to_book_metadata()
    annotations = tuple(parsed_opf.read_annotations())
    opf_mtime = os.path.getmtime(opf_path)
    relative = os.path.relpath(dirpath, self.src_library_path)
    path = relative.replace(os.sep, '/')

    if int(mi.application_id) != book_id:
        self.mismatched_dirs.append(dirpath)
    else:
        self.books.append({
            'mi': mi,
            'timestamp': opf_mtime,
            'formats': format_records,
            'id': book_id,
            'dirpath': dirpath,
            'path': path,
            'annotations': annotations,
        })

    author_links = mi.get('author_link_map', {})
    for author, link in iteritems(author_links):
        known_link, known_stamp = self.authors_links.get(author, (None, None))
        # Take the link when none is recorded yet, or when it differs and
        # the recorded timestamp is older than this book's timestamp.
        if known_link is None or (
                known_link != link and known_stamp < mi.timestamp):
            self.authors_links[author] = (link, mi.timestamp)
def get_metadata(stream, extract_cover=True):
    '''
    Return metadata as a L{MetaInfo} object

    The first OPF found in the zip archive ``stream`` is parsed. When
    ``extract_cover`` is true the cover is looked up through
    ``opf.raster_cover`` and, failing that, through the first
    ``<meta name="cover">`` whose content ends in a jpeg/jpg/png extension.

    Any failure results in whatever metadata was gathered so far being
    returned (the "Unknown" placeholder if the OPF itself could not be read).
    '''
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

    try:
        with ZipFile(stream) as zf:
            opf_name = get_first_opf_name(zf)
            opf_stream = StringIO(zf.read(opf_name))
            opf = OPF(opf_stream)
            mi = opf.to_book_metadata()
            if extract_cover:
                cover_href = opf.raster_cover
                if not cover_href:
                    for meta in opf.metadata.xpath('//*[local-name()="meta" and @name="cover"]'):
                        # A <meta name="cover"> without a content attribute
                        # yields None; treat it as empty so the remaining
                        # candidates are still examined.
                        val = meta.get('content') or ''
                        if val.rpartition('.')[2].lower() in {'jpeg', 'jpg', 'png'}:
                            cover_href = val
                            break
                if cover_href:
                    try:
                        # The format keeps the leading dot that
                        # os.path.splitext returns.
                        mi.cover_data = (os.path.splitext(cover_href)[1], zf.read(cover_href))
                    except Exception:
                        # A missing/unreadable cover is not fatal.
                        pass
    except Exception:
        return mi
    return mi
def get_metadata(stream):
    '''
    Read the metadata, and a cover when one can be found, from a LIT file.

    The OPF embedded in the LIT container is parsed for the metadata. Every
    guide reference whose type contains "cover" is then matched against the
    container manifest and the matching file contents are collected.

    :param stream: the LIT file, passed straight to ``LitContainer``.
    :return: the metadata object produced by ``OPF.to_book_metadata()``, with
        ``cover_data`` set to ``('jpg', data)`` when at least one cover
        candidate could be extracted.
    '''
    from calibre.ebooks.lit.reader import LitContainer
    from calibre.utils.logging import Log
    litfile = LitContainer(stream, Log())
    src = litfile.get_metadata().encode('utf-8')
    litfile = litfile._litfile
    opf = OPF(io.BytesIO(src), getcwd())
    mi = opf.to_book_metadata()

    covers = []
    for ref in opf.iterguide():
        if 'cover' not in ref.get('type', '').lower():
            continue
        ctype = ref.get('type')
        href = ref.get('href', '')
        # The manifest path may carry the ampersand percent-encoded.
        candidates = [href, href.replace('&', '%26')]
        for entry in litfile.manifest.values():
            if entry.path in candidates:
                try:
                    covers.append(
                        (litfile.get_file('/data/' + entry.internal), ctype))
                except Exception:
                    # Best effort: an unreadable cover must not prevent the
                    # rest of the metadata from being returned.
                    pass
                break

    # Longest data first.
    covers.sort(key=lambda x: len(x[0]), reverse=True)
    if covers:
        idx = 0
        # Use the second candidate when its guide type is the first one's
        # type with a '-standard' suffix.
        if len(covers) > 1 and covers[1][1] == covers[0][1] + '-standard':
            idx = 1
        mi.cover_data = ('jpg', covers[idx][0])
    return mi
def get_metadata(stream):
    '''
    Return ``(metadata, version, raster cover, first spine item)`` for an OPF.

    Raw ``bytes`` input is wrapped in a ``DummyFile`` first; anything else is
    passed to ``parse_opf`` unchanged.
    '''
    if isinstance(stream, bytes):
        opf_source = DummyFile(stream)
    else:
        opf_source = stream
    tree = parse_opf(opf_source)
    version = parse_opf_version(tree.get('version'))
    # Reuse the parsed tree instead of letting OPF parse a stream again.
    package = OPF(None, preparsed_opf=tree, read_toc=False)
    metadata = package.to_book_metadata()
    cover = package.raster_cover
    return metadata, version, cover, package.first_spine_item()
def get_metadata(stream, extract_cover=True):
    '''
    Return metadata as a L{MetaInfo} object

    The first OPF found in the zip archive ``stream`` is parsed; when
    ``extract_cover`` is true and the OPF names a raster cover, its bytes
    are read from the archive into ``cover_data``.

    Any failure results in whatever metadata was gathered so far being
    returned (the "Unknown" placeholder if the OPF itself could not be read).
    '''
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

    try:
        with ZipFile(stream) as zf:
            opf_name = get_first_opf_name(zf)
            opf_stream = StringIO(zf.read(opf_name))
            opf = OPF(opf_stream)
            mi = opf.to_book_metadata()
            if extract_cover:
                cover_href = opf.raster_cover
                if cover_href:
                    mi.cover_data = (os.path.splitext(cover_href)[1], zf.read(cover_href))
    except Exception:
        # Deliberately best effort, but KeyboardInterrupt/SystemExit are no
        # longer swallowed as they were by the bare ``except:``.
        return mi
    return mi
def opf_metadata(opfpath):
    '''
    Read book metadata (and the cover, if readable) from an OPF file.

    :param opfpath: path to the OPF file, or an open file-like object. For a
        file-like object its ``name`` attribute (falling back to the current
        directory) is used to locate the cover.
    :return: the metadata object, or ``None`` when the OPF has no
        application id or when reading failed (the traceback is printed).
    '''
    if hasattr(opfpath, 'read'):
        f = opfpath
        opfpath = getattr(f, 'name', os.getcwdu())
        opened_here = False
    else:
        f = open(opfpath, 'rb')
        opened_here = True
    try:
        opf = OPF(f, os.path.dirname(opfpath))
        if opf.application_id is not None:
            mi = opf.to_book_metadata()
            if hasattr(opf, 'cover') and opf.cover:
                cpath = os.path.join(os.path.dirname(opfpath), opf.cover)
                if os.access(cpath, os.R_OK):
                    fmt = cpath.rpartition('.')[-1]
                    with open(cpath, 'rb') as cover_file:
                        data = cover_file.read()
                    mi.cover_data = (fmt, data)
            return mi
    except Exception:
        import traceback
        traceback.print_exc()
    finally:
        # Only close what this function opened; a caller-supplied stream
        # remains the caller's responsibility.
        if opened_here:
            f.close()
    return None
def opf_metadata(opfpath):
    '''
    Read book metadata (and the cover, if readable) from an OPF file.

    :param opfpath: path to the OPF file, or an open file-like object. For a
        file-like object its ``name`` attribute (falling back to the current
        directory) is used to locate the cover.
    :return: the metadata object, or ``None`` when the OPF has no
        application id or when reading failed (the traceback is printed).
    '''
    if hasattr(opfpath, 'read'):
        f = opfpath
        opfpath = getattr(f, 'name', getcwd())
        opened_here = False
    else:
        f = open(opfpath, 'rb')
        opened_here = True
    try:
        opf = OPF(f, os.path.dirname(opfpath))
        if opf.application_id is not None:
            mi = opf.to_book_metadata()
            if hasattr(opf, 'cover') and opf.cover:
                cpath = os.path.join(os.path.dirname(opfpath), opf.cover)
                if os.access(cpath, os.R_OK):
                    fmt = cpath.rpartition('.')[-1]
                    with open(cpath, 'rb') as cover_file:
                        data = cover_file.read()
                    mi.cover_data = (fmt, data)
            return mi
    except Exception:
        import traceback
        traceback.print_exc()
    finally:
        # Only close what this function opened; a caller-supplied stream
        # remains the caller's responsibility.
        if opened_here:
            f.close()
    return None
def zip_opf_metadata(opfpath, zf):
    '''
    Read book metadata from an OPF and pull its cover out of a zip archive.

    :param opfpath: path to the OPF file, or an open file-like object.
    :param zf: open zip archive searched for the cover image.
    :return: the metadata object. If the metadata names a cover, ``cover``
        is reset to ``None`` and, when a member with the cover's base name
        exists in ``zf``, ``cover_data`` is set to ``(extension, bytes)``.
    '''
    from calibre.ebooks.metadata.opf2 import OPF
    if hasattr(opfpath, 'read'):
        f = opfpath
        opfpath = getattr(f, 'name', getcwd())
        opened_here = False
    else:
        f = open(opfpath, 'rb')
        opened_here = True
    try:
        opf = OPF(f, os.path.dirname(opfpath))
        mi = opf.to_book_metadata()
    finally:
        # Only close what this function opened; a caller-supplied stream
        # remains the caller's responsibility.
        if opened_here:
            f.close()
    # This is broken, in that it only works for
    # when both the OPF file and the cover file are in the root of the
    # zip file and the cover is an actual raster image, but I don't care
    # enough to make it more robust
    if getattr(mi, 'cover', None):
        covername = os.path.basename(mi.cover)
        mi.cover = None
        names = zf.namelist()
        if covername in names:
            fmt = covername.rpartition('.')[-1]
            data = zf.read(covername)
            mi.cover_data = (fmt, data)
    return mi
def handle_zip_of_opf_files(self, stream):
    '''
    Given a zip up of a bunch of opf files, either merge them or add them
    to library.

    Every ``.opf`` member is parsed. If an id can be extracted from it and
    existing metadata is found for that id, the existing book is updated;
    if an id is extracted but nothing exists for it, the book is imported
    as new. ``.jpg``/``.png``/``.gif`` members whose base name (lower-cased,
    up to the first dot) is a key of ``self.book_map`` are set as the cover
    of the mapped book.

    :param stream: the zip archive, passed to ``ZipFile``.
    :return: dict with the number of books ``'updated'`` and ``'added'``.
    '''
    result = {'updated': 0, 'added': 0}
    with ZipFile(stream, 'r') as zf:
        self.start_applying_updates()
        for zi in zf.infolist():
            ext = zi.filename.rpartition('.')[-1].lower()
            if ext == 'opf':
                try:
                    raw = zf.open(zi)
                    opf = OPF(raw)
                    mi = opf.to_book_metadata()
                    casanova_id = self.extract_id(mi)
                    if casanova_id:
                        book_mi = self.get_casanova_metadata(
                            casanova_id['id'])
                        if book_mi:
                            # Update an existing book's metadata!
                            result['updated'] += 1
                            self.apply_metadata_update(
                                casanova_id['id'], book_mi, mi)
                        else:
                            # Create a new book entry
                            result['added'] += 1
                            self.model.db.import_book(mi, [])
                except Exception:
                    # One bad OPF must not abort the whole batch, but the
                    # failure is reported instead of being swallowed.
                    import traceback
                    traceback.print_exc()
            elif ext in {'jpg', 'png', 'gif'}:
                # try and handle the cover
                casanova_id = zi.filename.partition('.')[0].lower()
                if casanova_id in self.book_map:
                    book_id = self.book_map[casanova_id]
                    raw = zf.open(zi)
                    self.db.set_cover(book_id, raw)
        self.finish_applying_updates()
    return result
def read_user_metadata(self):
    '''
    Read all metadata specified by the user. Command line options
    override metadata from a specified OPF file.

    The result is stored in ``self.user_metadata``. If the resulting
    metadata names a cover, the cover is downloaded first when it is an
    http(s) URL, then read into ``cover_data`` and ``cover`` is cleared.
    '''
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ebooks.metadata.opf2 import OPF
    mi = MetaInformation(None, [])
    if self.opts.read_metadata_from_opf is not None:
        self.opts.read_metadata_from_opf = os.path.abspath(
            self.opts.read_metadata_from_opf)
        # Keep the file open only as long as the OPF is being consumed.
        with open(self.opts.read_metadata_from_opf, 'rb') as opf_file:
            opf = OPF(opf_file,
                      os.path.dirname(self.opts.read_metadata_from_opf))
            mi = opf.to_book_metadata()
    self.opts_to_mi(mi)
    if mi.cover:
        if mi.cover.startswith(('http:', 'https:')):
            mi.cover = self.download_cover(mi.cover)
        ext = mi.cover.rpartition('.')[-1].lower().strip()
        if ext not in ('png', 'jpg', 'jpeg', 'gif'):
            # Unrecognised extensions are labelled as jpg.
            ext = 'jpg'
        with open(mi.cover, 'rb') as cover_file:
            mi.cover_data = (ext, cover_file.read())
        mi.cover = None
    self.user_metadata = mi
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    '''
    Write ``oeb_book`` to ``output_path`` as an HTMLZ archive.

    The book is rendered to a single HTML file (CSS handling chosen by
    ``opts.htmlz_css_type``), the referenced images, an optional cover and a
    ``metadata.opf`` are written next to it in a temporary directory, and
    that directory is zipped into ``output_path``.
    '''
    from lxml import etree
    from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
    from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
    from calibre.utils.zipfile import ZipFile
    from calibre.utils.filenames import ascii_filename

    # HTML
    if opts.htmlz_css_type == "inline":
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
        OEB2HTMLizer = OEB2HTMLInlineCSSizer
    elif opts.htmlz_css_type == "tag":
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
        OEB2HTMLizer = OEB2HTMLNoCSSizer
    else:
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

    with TemporaryDirectory("_htmlz_output") as tdir:
        htmlizer = OEB2HTMLizer(log)
        html = htmlizer.oeb2html(oeb_book, opts)

        fname = "index"
        if opts.htmlz_title_filename:
            from calibre.utils.filenames import shorten_components_to
            fname = shorten_components_to(100, (ascii_filename(unicode(oeb_book.metadata.title[0])),))[0]
        with open(os.path.join(tdir, fname + ".html"), "wb") as tf:
            if isinstance(html, unicode):
                html = html.encode("utf-8")
            tf.write(html)

        # CSS
        if opts.htmlz_css_type == "class" and opts.htmlz_class_style == "external":
            with open(os.path.join(tdir, "style.css"), "wb") as tf:
                tf.write(htmlizer.get_css(oeb_book))

        # Images
        images = htmlizer.images
        if images:
            if not os.path.exists(os.path.join(tdir, "images")):
                os.makedirs(os.path.join(tdir, "images"))
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES and item.href in images:
                    if item.media_type == SVG_MIME:
                        data = unicode(etree.tostring(item.data, encoding=unicode))
                    else:
                        data = item.data
                    # The file is opened in binary mode: text (the SVG
                    # serialization) must be encoded explicitly, otherwise
                    # non-ASCII content fails on the implicit ASCII encode.
                    if isinstance(data, unicode):
                        data = data.encode("utf-8")
                    fname = os.path.join(tdir, "images", images[item.href])
                    with open(fname, "wb") as img:
                        img.write(data)

        # Cover
        cover_path = None
        try:
            cover_data = None
            if oeb_book.metadata.cover:
                term = oeb_book.metadata.cover[0].term
                cover_data = oeb_book.guide[term].item.data
            if cover_data:
                from calibre.utils.magick.draw import save_cover_data_to
                cover_path = os.path.join(tdir, "cover.jpg")
                # Create the (empty) file before the cover is saved to it.
                with open(cover_path, "w") as cf:
                    cf.write("")
                save_cover_data_to(cover_data, cover_path)
        except Exception:
            # A broken cover must not abort the conversion.
            import traceback
            traceback.print_exc()

        # Metadata
        with open(os.path.join(tdir, "metadata.opf"), "wb") as mdataf:
            opf = OPF(StringIO(etree.tostring(oeb_book.metadata.to_opf1())))
            mi = opf.to_book_metadata()
            if cover_path:
                mi.cover = "cover.jpg"
            mdataf.write(metadata_to_opf(mi))

        # Close the archive explicitly so it is finalized before the
        # temporary directory goes away, rather than relying on garbage
        # collection.
        htmlz = ZipFile(output_path, "w")
        try:
            htmlz.add_dir(tdir)
        finally:
            htmlz.close()
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    '''
    Write ``oeb_book`` to ``output_path`` as an HTMLZ archive.

    The book is rendered to a single HTML file (CSS handling chosen by
    ``opts.htmlz_css_type``), the referenced images, an optional cover and a
    ``metadata.opf`` are written next to it in a temporary directory, and
    that directory is zipped into ``output_path``.
    '''
    from lxml import etree
    from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
    from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
    from calibre.utils.zipfile import ZipFile
    from calibre.utils.filenames import ascii_filename

    # HTML
    if opts.htmlz_css_type == 'inline':
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
        OEB2HTMLizer = OEB2HTMLInlineCSSizer
    elif opts.htmlz_css_type == 'tag':
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
        OEB2HTMLizer = OEB2HTMLNoCSSizer
    else:
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

    with TemporaryDirectory(u'_htmlz_output') as tdir:
        htmlizer = OEB2HTMLizer(log)
        html = htmlizer.oeb2html(oeb_book, opts)

        fname = u'index'
        if opts.htmlz_title_filename:
            from calibre.utils.filenames import shorten_components_to
            fname = shorten_components_to(100, (ascii_filename(
                unicode_type(oeb_book.metadata.title[0])), ))[0]
        with open(os.path.join(tdir, fname + u'.html'), 'wb') as tf:
            if isinstance(html, unicode_type):
                html = html.encode('utf-8')
            tf.write(html)

        # CSS
        if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
            with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
                tf.write(htmlizer.get_css(oeb_book))

        # Images
        images = htmlizer.images
        if images:
            if not os.path.exists(os.path.join(tdir, u'images')):
                os.makedirs(os.path.join(tdir, u'images'))
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES and item.href in images:
                    if item.media_type == SVG_MIME:
                        data = unicode_type(
                            etree.tostring(item.data, encoding=unicode_type))
                    else:
                        data = item.data
                    # The file is opened in binary mode: text (the SVG
                    # serialization) must be encoded before it is written.
                    if isinstance(data, unicode_type):
                        data = data.encode('utf-8')
                    fname = os.path.join(tdir, u'images', images[item.href])
                    with open(fname, 'wb') as img:
                        img.write(data)

        # Cover
        cover_path = None
        try:
            cover_data = None
            if oeb_book.metadata.cover:
                term = oeb_book.metadata.cover[0].term
                cover_data = oeb_book.guide[term].item.data
            if cover_data:
                from calibre.utils.img import save_cover_data_to
                cover_path = os.path.join(tdir, u'cover.jpg')
                # Create the (empty) file before the cover is saved to it.
                with lopen(cover_path, 'w') as cf:
                    cf.write('')
                save_cover_data_to(cover_data, cover_path)
        except Exception:
            # A broken cover must not abort the conversion.
            import traceback
            traceback.print_exc()

        # Metadata
        with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
            opf = OPF(
                io.BytesIO(
                    etree.tostring(oeb_book.metadata.to_opf1(),
                                   encoding='UTF-8')))
            mi = opf.to_book_metadata()
            if cover_path:
                mi.cover = u'cover.jpg'
            mdataf.write(metadata_to_opf(mi))

        # Close the archive explicitly so it is finalized before the
        # temporary directory goes away, rather than relying on garbage
        # collection.
        htmlz = ZipFile(output_path, 'w')
        try:
            htmlz.add_dir(tdir)
        finally:
            htmlz.close()
def convert(self, oeb_book, output, input_plugin, opts, log):
    '''
    Convert ``oeb_book`` to the output format by way of an intermediate EPUB.

    Steps: pick up the book title and (unless disabled by a tweak) an ASIN
    from the identifiers, work out the page count to pass on, run the EPUB
    output plugin with its recommended options plus a fixed set of
    overrides, optionally keep a copy of the EPUB, and finally hand
    everything to ``convert_using_previewer``.
    '''
    self.report_version(log)

    # Debug aid:
    # for mivals in oeb_book.metadata.items.values():
    #     for mival in mivals:
    #         log.info("metadata: %s" % repr(mival))

    try:
        book_name = str(oeb_book.metadata.title[0])
    except Exception:
        book_name = ""

    asin = None
    ignore_asin = tweaks.get("kfx_output_ignore_asin_metadata", False)
    if not ignore_asin:
        # Scheme patterns are tried in order of preference.
        for scheme_pattern in ["^mobi-asin$", "^amazon.*$", "^asin$"]:
            for identifier in oeb_book.metadata["identifier"]:
                scheme = identifier.get(OPFNS("scheme"), "").lower()
                if not re.match(scheme_pattern, scheme):
                    continue
                if not re.match(ASIN_RE, identifier.value):
                    continue
                asin = identifier.value
                log.info("Found ASIN metadata %s: %s" % (scheme, asin))
                break

            if asin:
                break

    # Debug aid:
    # with open(opts.read_metadata_from_opf, "rb") as opff:
    #     log.info("opf: %s" % opff.read())

    if not opts.approximate_pages:
        page_count = -1
    else:
        page_count = 0
        pages_field = opts.number_of_pages_field
        if pages_field and pages_field != AUTO_PAGES and opts.read_metadata_from_opf:
            # This OPF contains custom column metadata not present in the oeb_book OPF
            opf = OPF(opts.read_metadata_from_opf, populate_spine=False,
                      try_to_guess_cover=False, read_toc=False)
            mi = opf.to_book_metadata()
            page_count_str = mi.get(opts.number_of_pages_field, None)

            if page_count_str is None:
                log.warning("Book has no page count field %s" % opts.number_of_pages_field)
            else:
                try:
                    page_count = int(page_count_str)
                except Exception:
                    pass

                log.info("Page count value from field %s: %d ('%s')" % (
                    opts.number_of_pages_field, page_count, page_count_str))

    # Debug aids:
    # log.info("oeb_book contains %d pages" % len(oeb_book.pages.pages))
    # log.info("options: %s" % str(opts.__dict__))

    # set default values for options expected by the EPUB Output plugin
    for optrec in EPUBOutput.options:
        setattr(opts, optrec.option.name, optrec.recommended_value)

    # override currently known EPUB Output plugin options
    epub_overrides = (
        ("extract_to", None),
        ("dont_split_on_page_breaks", False),
        ("flow_size", 0),
        ("no_default_epub_cover", False),
        ("no_svg_cover", False),
        ("preserve_cover_aspect_ratio", True),
        ("epub_flatten", False),
        ("epub_inline_toc", False),
        ("epub_toc_at_end", False),
        ("toc_title", None),
    )
    for option_name, option_value in epub_overrides:
        setattr(opts, option_name, option_value)

    epub_filename = self.temporary_file(".epub").name
    # convert input format to EPUB
    self.epub_output_plugin.convert(oeb_book, epub_filename, input_plugin, opts, log)
    log.info("Successfully converted input format to EPUB")

    if PREPARED_FILE_SAVE_DIR:
        if not os.path.exists(PREPARED_FILE_SAVE_DIR):
            os.makedirs(PREPARED_FILE_SAVE_DIR)

        prepared_file_path = os.path.join(
            PREPARED_FILE_SAVE_DIR, os.path.basename(epub_filename))
        shutil.copyfile(epub_filename, prepared_file_path)
        log.warning("Saved conversion input file: %s" % prepared_file_path)

    self.convert_using_previewer(
        JobLog(log), book_name, epub_filename, asin, opts.cde_type_pdoc,
        page_count, opts.show_kpr_logs, False,
        TIMEOUT if opts.enable_timeout else None, output)
class EbookIterator(BookmarksMixin):
    '''
    Context manager that explodes an ebook into a temporary directory and
    exposes its spine, table of contents, metadata and page estimates.
    '''

    # Number of characters counted as one page when estimating page numbers.
    CHARACTERS_PER_PAGE = 1000

    def __init__(self, pathtoebook, log=None, copy_bookmarks_to_file=True, use_tdir_in_cache=False):
        '''
        :param pathtoebook: path of the ebook file (surrounding whitespace
            is stripped).
        :param log: logger to use; ``default_log`` when omitted.
        :param copy_bookmarks_to_file: forwarded to ``BookmarksMixin``.
        :param use_tdir_in_cache: create the working directory with
            ``tdir_in_cache`` instead of ``PersistentTemporaryDirectory``.
        '''
        BookmarksMixin.__init__(self, copy_bookmarks_to_file=copy_bookmarks_to_file)
        self.use_tdir_in_cache = use_tdir_in_cache
        self.log = log or default_log
        pathtoebook = pathtoebook.strip()
        self.pathtoebook = os.path.abspath(pathtoebook)
        self.config = DynamicConfig(name='iterator')
        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
        # Fold htm/html/xhtm/xhtml into 'html'.
        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
        self.ebook_ext = ext.replace('original_', '')

    def search(self, text, index, backwards=False):
        '''
        Case-insensitively look for ``text`` in the body text of the spine.

        Going forwards, spine items after ``index`` are examined in order;
        going backwards, items before ``index`` are examined from the
        nearest one down. Text inside ``script``, ``style`` and ``del``
        elements is ignored.

        :return: index of the first matching spine item, or ``None``.
        '''
        from calibre.ebooks.oeb.polish.parsing import parse
        pmap = list(enumerate(self.spine))
        if backwards:
            pmap.reverse()
        q = text.lower()
        skipped_tags = {'script', 'style', 'del'}
        for i, path in pmap:
            if (backwards and i < index) or (not backwards and i > index):
                with open(path, 'rb') as f:
                    raw = f.read().decode(path.encoding)
                root = parse(raw)
                fragments = []

                def serialize(elem):
                    # Emit text in document order: the element's own text,
                    # then its children, then its tail. Emitting the tail
                    # before the children scrambles the text whenever inline
                    # elements are nested, hiding phrases that span them.
                    if elem.text:
                        fragments.append(elem.text.lower())
                    for child in elem.iterchildren():
                        if hasattr(getattr(child, 'tag', None), 'rpartition') and \
                                child.tag.rpartition('}')[-1] not in skipped_tags:
                            serialize(child)
                        elif getattr(child, 'tail', None):
                            # Skipped node: only the text following it counts.
                            fragments.append(child.tail.lower())
                    if elem.tail:
                        fragments.append(elem.tail.lower())

                for body in root.xpath('//*[local-name() = "body"]'):
                    body.tail = None
                    serialize(body)

                if q in ''.join(fragments):
                    return i

    def __enter__(self, processed=False, only_input_plugin=False,
                  run_char_count=True, read_anchor_map=True, view_kepub=False,
                  read_links=True):
        '''
        Convert an ebook file into an exploded OEB book suitable for
        display in viewers/preprocessing etc.

        Populates ``self.opf``, ``self.mi``, ``self.language``,
        ``self.spine``, ``self.pages`` and ``self.toc``, then verifies
        links and reads bookmarks. Returns ``self``.
        '''
        self.delete_on_exit = []
        if self.use_tdir_in_cache:
            self._tdir = tdir_in_cache('ev')
        else:
            self._tdir = PersistentTemporaryDirectory('_ebook_iter')
        self.base = os.path.realpath(self._tdir)
        self.book_format, self.pathtoopf, input_fmt = run_extract_book(
            self.pathtoebook, self.base, only_input_plugin=only_input_plugin,
            view_kepub=view_kepub, processed=processed)
        self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
        self.mi = self.opf.to_book_metadata()
        self.language = None
        if self.mi.languages:
            self.language = self.mi.languages[0].lower()
        # Linear spine items first, non-linear ones after them.
        ordered = [i for i in self.opf.spine if i.is_linear] + \
                  [i for i in self.opf.spine if not i.is_linear]
        self.spine = []
        Spiny = partial(SpineItem, read_anchor_map=read_anchor_map,
                        read_links=read_links, run_char_count=run_char_count,
                        from_epub=self.book_format == 'EPUB')
        is_comic = input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'}
        for i in ordered:
            spath = i.path
            mt = None
            if i.idref is not None:
                mt = self.opf.manifest.type_for_id(i.idref)
            if mt is None:
                mt = guess_type(spath)[0]
            try:
                self.spine.append(Spiny(spath, mime_type=mt))
                if is_comic:
                    self.spine[-1].is_single_page = True
            except Exception:
                self.log.warn('Missing spine item:', repr(spath))

        cover = self.opf.cover
        if cover and self.ebook_ext in {'lit', 'mobi', 'prc', 'opf', 'fb2',
                                        'azw', 'azw3', 'docx', 'htmlz'}:
            # Generate a title page showing the cover and put it first.
            cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
            rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
            chtml = (TITLEPAGE%prepare_string_for_xml(rcpath, True)).encode('utf-8')
            with open(cfile, 'wb') as f:
                f.write(chtml)
            self.spine[0:0] = [Spiny(cfile, mime_type='application/xhtml+xml')]
            self.delete_on_exit.append(cfile)

        if self.opf.path_to_html_toc is not None and \
                self.opf.path_to_html_toc not in self.spine:
            try:
                self.spine.append(Spiny(self.opf.path_to_html_toc))
            except Exception:
                import traceback
                traceback.print_exc()

        sizes = [i.character_count for i in self.spine]
        self.pages = [math.ceil(i/float(self.CHARACTERS_PER_PAGE)) for i in sizes]
        for p, s in zip(self.pages, self.spine):
            s.pages = p
        start = 1

        for s in self.spine:
            s.start_page = start
            start += s.pages
            s.max_page = s.start_page + s.pages - 1
        self.toc = self.opf.toc
        if read_anchor_map:
            create_indexing_data(self.spine, self.toc)

        self.verify_links()

        self.read_bookmarks()

        return self

    def verify_links(self):
        '''
        Record, per spine item, the links that point into the spine.

        A link with neither scheme nor network location is resolved against
        the directory of the item containing it (an empty path means the
        item itself). It is added to ``item.verified_links`` as
        ``(target, fragment)`` when the target is a spine item and the
        fragment is empty or present in the target's ``anchor_map``.
        '''
        spine_paths = {s: s for s in self.spine}
        for item in self.spine:
            base = os.path.dirname(item)
            for link in item.all_links:
                try:
                    p = urlparse(urlunquote(link))
                except Exception:
                    continue
                if not p.scheme and not p.netloc:
                    path = os.path.abspath(os.path.join(base, p.path)) if p.path else item
                    try:
                        # Map the plain path to the spine item object.
                        path = spine_paths[path]
                    except Exception:
                        continue
                    if not p.fragment or p.fragment in path.anchor_map:
                        item.verified_links.add((path, p.fragment))

    def __exit__(self, *args):
        '''Remove the working directory and any extra generated files.'''
        remove_dir(self._tdir)
        for x in self.delete_on_exit:
            try:
                os.remove(x)
            except Exception:
                # The file may already be gone along with the directory.
                pass
def get_metadata2(root, ver):
    '''
    Build the metadata tuple for an already parsed OPF tree.

    ``root`` is the parsed OPF root element and ``ver`` is passed through
    unchanged. Returns ``(book metadata, ver, raster cover,
    first spine item)``.
    '''
    package = OPF(None, preparsed_opf=root, read_toc=False)
    book_metadata = package.to_book_metadata()
    raster_cover = package.raster_cover
    first_item = package.first_spine_item()
    return book_metadata, ver, raster_cover, first_item
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    '''
    Write ``oeb_book`` to ``output_path`` as an HTMLZ archive.

    The book is rendered to a single HTML file (CSS handling chosen by
    ``opts.htmlz_css_type``), the referenced images, an optional cover and a
    ``metadata.opf`` are written next to it in a temporary directory, and
    that directory is zipped into ``output_path``.
    '''
    from lxml import etree
    from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
    from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
    from calibre.utils.zipfile import ZipFile
    from calibre.utils.filenames import ascii_filename

    # HTML
    if opts.htmlz_css_type == 'inline':
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
        OEB2HTMLizer = OEB2HTMLInlineCSSizer
    elif opts.htmlz_css_type == 'tag':
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
        OEB2HTMLizer = OEB2HTMLNoCSSizer
    else:
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

    with TemporaryDirectory(u'_htmlz_output') as tdir:
        htmlizer = OEB2HTMLizer(log)
        html = htmlizer.oeb2html(oeb_book, opts)

        fname = u'index'
        if opts.htmlz_title_filename:
            from calibre.utils.filenames import shorten_components_to
            fname = shorten_components_to(100, (ascii_filename(unicode_type(oeb_book.metadata.title[0])),))[0]
        with open(os.path.join(tdir, fname+u'.html'), 'wb') as tf:
            if isinstance(html, unicode_type):
                html = html.encode('utf-8')
            tf.write(html)

        # CSS
        if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
            with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
                tf.write(htmlizer.get_css(oeb_book))

        # Images
        images = htmlizer.images
        if images:
            if not os.path.exists(os.path.join(tdir, u'images')):
                os.makedirs(os.path.join(tdir, u'images'))
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES and item.href in images:
                    if item.media_type == SVG_MIME:
                        data = etree.tostring(item.data, encoding='unicode')
                    else:
                        data = item.data
                    # The file is opened in binary mode: text (the SVG
                    # serialization) must be encoded before it is written.
                    if isinstance(data, unicode_type):
                        data = data.encode('utf-8')
                    fname = os.path.join(tdir, u'images', images[item.href])
                    with open(fname, 'wb') as img:
                        img.write(data)

        # Cover
        cover_path = None
        try:
            cover_data = None
            if oeb_book.metadata.cover:
                term = oeb_book.metadata.cover[0].term
                cover_data = oeb_book.guide[term].item.data
            if cover_data:
                from calibre.utils.img import save_cover_data_to
                cover_path = os.path.join(tdir, u'cover.jpg')
                # Create the (empty) file before the cover is saved to it.
                with lopen(cover_path, 'w') as cf:
                    cf.write('')
                save_cover_data_to(cover_data, cover_path)
        except Exception:
            # A broken cover must not abort the conversion.
            import traceback
            traceback.print_exc()

        # Metadata
        with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
            opf = OPF(io.BytesIO(etree.tostring(oeb_book.metadata.to_opf1(), encoding='UTF-8')))
            mi = opf.to_book_metadata()
            if cover_path:
                mi.cover = u'cover.jpg'
            mdataf.write(metadata_to_opf(mi))

        # Close the archive explicitly so it is finalized before the
        # temporary directory goes away, rather than relying on garbage
        # collection.
        htmlz = ZipFile(output_path, 'w')
        try:
            htmlz.add_dir(tdir)
        finally:
            htmlz.close()
def get_metadata2(root, ver):
    '''
    Return ``(metadata, ver, raster cover, first spine item)`` for ``root``.

    ``root`` is an already parsed OPF tree, so no stream is given to ``OPF``
    and the table of contents is not read. ``ver`` is returned as received.
    '''
    parsed = OPF(None, preparsed_opf=root, read_toc=False)
    metadata = parsed.to_book_metadata()
    cover = parsed.raster_cover
    return metadata, ver, cover, parsed.first_spine_item()
class EbookIterator(BookmarksMixin):
    '''
    Context manager that explodes an ebook into a temporary directory and
    exposes its spine, table of contents, metadata and page estimates.
    '''

    # Number of characters counted as one page when estimating page numbers.
    CHARACTERS_PER_PAGE = 1000

    def __init__(self,
                 pathtoebook,
                 log=None,
                 copy_bookmarks_to_file=True,
                 use_tdir_in_cache=False):
        '''
        :param pathtoebook: path of the ebook file (surrounding whitespace
            is stripped).
        :param log: logger to use; ``default_log`` when omitted.
        :param copy_bookmarks_to_file: forwarded to ``BookmarksMixin``.
        :param use_tdir_in_cache: create the working directory with
            ``tdir_in_cache`` instead of ``PersistentTemporaryDirectory``.
        '''
        BookmarksMixin.__init__(
            self, copy_bookmarks_to_file=copy_bookmarks_to_file)
        self.use_tdir_in_cache = use_tdir_in_cache
        self.log = log or default_log
        pathtoebook = pathtoebook.strip()
        self.pathtoebook = os.path.abspath(pathtoebook)
        self.config = DynamicConfig(name='iterator')
        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
        # Fold htm/html/xhtm/xhtml into 'html'.
        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
        self.ebook_ext = ext.replace('original_', '')

    def search(self, text, index, backwards=False):
        '''
        Case-insensitively look for ``text`` in the body text of the spine.

        Going forwards, spine items after ``index`` are examined in order;
        going backwards, items before ``index`` are examined from the
        nearest one down. Text inside ``script``, ``style`` and ``del``
        elements is ignored.

        :return: index of the first matching spine item, or ``None``.
        '''
        from calibre.ebooks.oeb.polish.parsing import parse
        pmap = list(enumerate(self.spine))
        if backwards:
            pmap.reverse()
        q = text.lower()
        skipped_tags = {'script', 'style', 'del'}
        for i, path in pmap:
            if (backwards and i < index) or (not backwards and i > index):
                with open(path, 'rb') as f:
                    raw = f.read().decode(path.encoding)
                root = parse(raw)
                fragments = []

                def serialize(elem):
                    # Emit text in document order: the element's own text,
                    # then its children, then its tail. Emitting the tail
                    # before the children scrambles the text whenever inline
                    # elements are nested, hiding phrases that span them.
                    if elem.text:
                        fragments.append(elem.text.lower())
                    for child in elem.iterchildren():
                        if hasattr(getattr(child, 'tag', None), 'rpartition') and \
                                child.tag.rpartition('}')[-1] not in skipped_tags:
                            serialize(child)
                        elif getattr(child, 'tail', None):
                            # Skipped node: only the text following it counts.
                            fragments.append(child.tail.lower())
                    if elem.tail:
                        fragments.append(elem.tail.lower())

                for body in root.xpath('//*[local-name() = "body"]'):
                    body.tail = None
                    serialize(body)

                if q in ''.join(fragments):
                    return i

    def __enter__(self,
                  processed=False,
                  only_input_plugin=False,
                  run_char_count=True,
                  read_anchor_map=True,
                  view_kepub=False,
                  read_links=True):
        '''
        Convert an ebook file into an exploded OEB book suitable for
        display in viewers/preprocessing etc.

        Populates ``self.opf``, ``self.mi``, ``self.language``,
        ``self.spine``, ``self.pages`` and ``self.toc``, then verifies
        links and reads bookmarks. Returns ``self``.
        '''
        self.delete_on_exit = []
        if self.use_tdir_in_cache:
            self._tdir = tdir_in_cache('ev')
        else:
            self._tdir = PersistentTemporaryDirectory('_ebook_iter')
        self.base = os.path.realpath(self._tdir)
        self.book_format, self.pathtoopf, input_fmt = run_extract_book(
            self.pathtoebook,
            self.base,
            only_input_plugin=only_input_plugin,
            view_kepub=view_kepub,
            processed=processed)
        self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
        self.mi = self.opf.to_book_metadata()
        self.language = None
        if self.mi.languages:
            self.language = self.mi.languages[0].lower()

        self.spine = []
        Spiny = partial(SpineItem,
                        read_anchor_map=read_anchor_map,
                        read_links=read_links,
                        run_char_count=run_char_count,
                        from_epub=self.book_format == 'EPUB')
        if input_fmt.lower() == 'htmlz':
            # For HTMLZ input the spine is the single index.html next to
            # the OPF; the OPF spine is not consulted.
            self.spine.append(
                Spiny(os.path.join(os.path.dirname(self.pathtoopf),
                                   'index.html'),
                      mime_type='text/html'))
        else:
            # Linear spine items first, non-linear ones after them.
            ordered = [i for i in self.opf.spine if i.is_linear] + \
                      [i for i in self.opf.spine if not i.is_linear]
            is_comic = input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'}
            for i in ordered:
                spath = i.path
                mt = None
                if i.idref is not None:
                    mt = self.opf.manifest.type_for_id(i.idref)
                if mt is None:
                    mt = guess_type(spath)[0]
                try:
                    self.spine.append(Spiny(spath, mime_type=mt))
                    if is_comic:
                        self.spine[-1].is_single_page = True
                except Exception:
                    self.log.warn('Missing spine item:', repr(spath))

        cover = self.opf.cover
        if cover and self.ebook_ext in {
                'lit', 'mobi', 'prc', 'opf', 'fb2', 'azw', 'azw3', 'docx',
                'htmlz'
        }:
            # Generate a title page showing the cover and put it first.
            cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
            rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
            chtml = (TITLEPAGE %
                     prepare_string_for_xml(rcpath, True)).encode('utf-8')
            with open(cfile, 'wb') as f:
                f.write(chtml)
            self.spine[0:0] = [Spiny(cfile, mime_type='application/xhtml+xml')]
            self.delete_on_exit.append(cfile)

        if self.opf.path_to_html_toc is not None and \
                self.opf.path_to_html_toc not in self.spine:
            try:
                self.spine.append(Spiny(self.opf.path_to_html_toc))
            except Exception:
                import traceback
                traceback.print_exc()

        sizes = [i.character_count for i in self.spine]
        self.pages = [
            math.ceil(i / float(self.CHARACTERS_PER_PAGE)) for i in sizes
        ]
        for p, s in zip(self.pages, self.spine):
            s.pages = p
        start = 1

        for s in self.spine:
            s.start_page = start
            start += s.pages
            s.max_page = s.start_page + s.pages - 1
        self.toc = self.opf.toc
        if read_anchor_map:
            create_indexing_data(self.spine, self.toc)

        self.verify_links()

        self.read_bookmarks()

        return self

    def verify_links(self):
        '''
        Record, per spine item, the links that point into the spine.

        A link with neither scheme nor network location is resolved against
        the directory of the item containing it (an empty path means the
        item itself). It is added to ``item.verified_links`` as
        ``(target, fragment)`` when the target is a spine item and the
        fragment is empty or present in the target's ``anchor_map``.
        '''
        spine_paths = {s: s for s in self.spine}
        for item in self.spine:
            base = os.path.dirname(item)
            for link in item.all_links:
                try:
                    p = urlparse(urlunquote(link))
                except Exception:
                    continue
                if not p.scheme and not p.netloc:
                    path = os.path.abspath(os.path.join(
                        base, p.path)) if p.path else item
                    try:
                        # Map the plain path to the spine item object.
                        path = spine_paths[path]
                    except Exception:
                        continue
                    if not p.fragment or p.fragment in path.anchor_map:
                        item.verified_links.add((path, p.fragment))

    def __exit__(self, *args):
        '''Remove the working directory and any extra generated files.'''
        remove_dir(self._tdir)
        for x in self.delete_on_exit:
            try:
                os.remove(x)
            except Exception:
                # The file may already be gone along with the directory.
                pass