def upshift_markup(self):  # {{{
    """Upgrade markup to comply with XHTML 1.1 where possible."""
    for spine_item in self.oeb.spine:
        root = spine_item.data
        # Mirror a plain lang attribute into xml:lang when the latter
        # is missing.
        lang = root.get('lang')
        if lang and not root.get(base.tag('xml', 'lang')):
            root.set(base.tag('xml', 'lang'), lang)
        body = base.XPath('//h:body')(root)
        if body:
            body = body[0]
        if not hasattr(body, 'xpath'):
            # No usable <body> element in this document, skip it.
            continue
        # <u> is not valid in XHTML 1.1; downgrade it to a plain <span>.
        for underline in base.XPath('//h:u')(root):
            underline.tag = 'span'
        # Drop duplicate id/name attributes, keeping the first occurrence
        # of each value.
        seen_ids, seen_names = set(), set()
        for elem in base.XPath('//*[@id or @name]')(root):
            eid = elem.get('id', None)
            name = elem.get('name', None)
            if eid:
                if eid in seen_ids:
                    del elem.attrib['id']
                else:
                    seen_ids.add(eid)
            if name:
                if name in seen_names:
                    del elem.attrib['name']
                else:
                    seen_names.add(name)
def workaround_webkit_quirks(self):  # {{{
    """Work around WebKit rendering quirks in the spine documents."""
    for item in self.oeb.spine:
        root = item.data
        body = base.XPath('//h:body')(root)
        if body:
            body = body[0]
        if not hasattr(body, 'xpath'):
            continue
        # WebKit mishandles completely empty <pre> elements; demote
        # them to <div>.
        for pre in base.XPath('//h:pre')(body):
            if not pre.text and len(pre) == 0:
                pre.tag = 'div'
def frag_is_at_top(root, frag):
    """Return True if the element addressed by *frag* (matched by id or
    name) is at the top of the document, as judged by item_at_top()."""
    matches = base.XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root)
    if not matches:
        return False
    return item_at_top(matches[0])
def verify_toc_destinations(container, toc):
    """For every node of *toc*, set ``dest_exists`` and, when the
    destination is missing, a human readable ``dest_error``."""
    anchor_xpath = base.XPath('//*/@id|//h:a/@name')
    anchors_by_file = {}
    for node in toc.iterdescendants():
        dest = node.dest
        if not dest:
            node.dest_exists = False
            node.dest_error = 'No file named %s exists' % dest
            continue
        try:
            root = container.parsed(dest)
        except KeyError:
            node.dest_exists = False
            node.dest_error = 'No file named %s exists' % dest
            continue
        if not hasattr(root, 'xpath'):
            # Parsed, but not an HTML document
            node.dest_exists = False
            node.dest_error = 'No HTML file named %s exists' % dest
            continue
        if not node.frag:
            # Whole-file destination, nothing more to check
            node.dest_exists = True
            continue
        # Cache the anchor set per file, computing it lazily
        try:
            anchors = anchors_by_file[dest]
        except KeyError:
            anchors = anchors_by_file[dest] = frozenset(anchor_xpath(root))
        node.dest_exists = node.frag in anchors
        if not node.dest_exists:
            node.dest_error = ('The anchor %(a)s does not exist in file '
                               '%(f)s' % dict(a=node.frag, f=dest))
def from_links(container):
    '''
    Generate a Table of Contents from links in the book.

    Every unique <a href> target in spine order becomes a flat ToC entry;
    entries whose destination cannot be verified are dropped.
    '''
    toc = TOC()
    link_path = base.XPath('//h:a[@href]')
    seen_titles, seen_dests = set(), set()
    for name, is_linear in container.spine_names:
        root = container.parsed(name)
        for a in link_path(root):
            href = a.get('href')
            if not href or not href.strip():
                continue
            frag = None
            if href.startswith('#'):
                # Same-file fragment link
                dest = name
                frag = href[1:]
            else:
                href, _, frag = href.partition('#')
                dest = container.href_to_name(href, base=name)
            frag = frag or None
            if (dest, frag) in seen_dests:
                continue
            seen_dests.add((dest, frag))
            text = elem_to_toc_text(a)
            if text in seen_titles:
                continue
            seen_titles.add(text)
            toc.add(text, dest, frag=frag)
    verify_toc_destinations(container, toc)
    # BUGFIX: iterate over a snapshot. Removing children while iterating
    # the live TOC skips the sibling that follows each removed node.
    for child in tuple(toc):
        if not child.dest_exists:
            toc.remove(child)
    return toc
def __call__(self, oeb, context):
    """Ensure the book has an in-line (HTML) Table of Contents.

    If the <guide> already points at a usable ToC document, make sure it
    is in the spine and return. Otherwise generate a ToC page from
    ``oeb.toc`` and add it to manifest, spine and guide.
    """
    has_toc = getattr(getattr(oeb, 'toc', False), 'nodes', False)
    if 'toc' in oeb.guide:
        # Ensure toc pointed to in <guide> is in spine
        from ebook_converter.ebooks.oeb.base import urlnormalize
        href = urlnormalize(oeb.guide['toc'].href)
        if href in oeb.manifest.hrefs:
            item = oeb.manifest.hrefs[href]
            if (hasattr(item.data, 'xpath') and
                    base.XPath('//h:a[@href]')(item.data)):
                # Existing guide ToC is a real HTML document with links:
                # just make sure it is in the spine, then we are done.
                if oeb.spine.index(item) < 0:
                    if self.position == 'end':
                        oeb.spine.add(item, linear=False)
                    else:
                        oeb.spine.insert(0, item, linear=True)
                return
            elif has_toc:
                # Unusable guide entry; drop it and fall through to
                # generating our own ToC page.
                oeb.guide.remove('toc')
        else:
            # Guide points at a non-existent file
            oeb.guide.remove('toc')
    if not has_toc:
        return
    oeb.logger.info('Generating in-line TOC...')
    title = self.title or oeb.translate(DEFAULT_TITLE)
    style = self.style
    if style not in STYLE_CSS:
        oeb.logger.error('Unknown TOC style %r', style)
        style = 'nested'
    # Stylesheet for the generated ToC page
    id, css_href = oeb.manifest.generate('tocstyle', 'tocstyle.css')
    oeb.manifest.add(id, css_href, base.CSS_MIME, data=STYLE_CSS[style])
    language = str(oeb.metadata.language[0])
    # Build the XHTML skeleton of the ToC page
    contents = base.element(None, base.tag('xhtml', 'html'),
                            nsmap={None: const.XHTML_NS},
                            attrib={base.tag('xml', 'lang'): language})
    head = base.element(contents, base.tag('xhtml', 'head'))
    htitle = base.element(head, base.tag('xhtml', 'title'))
    htitle.text = title
    base.element(head, base.tag('xhtml', 'link'), rel='stylesheet',
                 type=base.CSS_MIME, href=css_href)
    body = base.element(contents, base.tag('xhtml', 'body'),
                        attrib={'class': 'calibre_toc'})
    h1 = base.element(body, base.tag('xhtml', 'h2'),
                      attrib={'class': 'calibre_toc_header'})
    h1.text = title
    # Recursively render the ToC tree into the body
    self.add_toc_level(body, oeb.toc)
    id, href = oeb.manifest.generate('contents', 'contents.xhtml')
    item = oeb.manifest.add(id, href, base.XHTML_MIME, data=contents)
    if self.position == 'end':
        oeb.spine.add(item, linear=False)
    else:
        oeb.spine.insert(0, item, linear=True)
    oeb.guide.add('toc', 'Table of Contents', href)
def merge_css(container, names, master):
    """Merge the stylesheets *names* into *master* and repoint HTML links.

    :param container: the book container being edited
    :param names: iterable of stylesheet names to merge (*master* itself
                  is skipped)
    :param master: name of the stylesheet that receives all rules
    """
    p = container.parsed
    msheet = p(master)
    master_base = os.path.dirname(master)
    merged = set()

    for name in names:
        if name == master:
            continue
        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name,
                                                      master))
        sheet = p(name)
        # Remove charset rules. FIX: use a plain loop instead of a
        # side-effect list comprehension that built a throwaway list.
        cr = [r for r in sheet.cssRules if r.type == r.CHARSET_RULE]
        for r in cr:
            sheet.deleteRule(sheet.cssRules.index(r))
        for rule in sheet.cssRules:
            msheet.add(rule)
        container.remove_item(name)
        merged.add(name)

    # Remove links to merged stylesheets in the html files, replacing with
    # a link to the master sheet
    for name, mt in container.mime_map.items():
        if mt in base.OEB_DOCS:
            removed = False
            root = p(name)
            for link in base.XPath('//h:link[@href]')(root):
                q = container.href_to_name(link.get('href'), name)
                if q in merged:
                    container.remove_from_xml(link)
                    removed = True
            if removed:
                container.dirty(name)
            # If the file no longer links to the master sheet at all,
            # insert a fresh <link> into its <head>.
            if removed and master not in set(all_stylesheets(container,
                                                             name)):
                head = root.find('h:head', namespaces=const.XPNSMAP)
                if head is not None:
                    link = head.makeelement(
                        base.tag('xhtml', 'link'), type='text/css',
                        rel='stylesheet',
                        href=container.name_to_href(master, name))
                    container.insert_into_xml(head, link)
def process_item(self, item):
    """Convert one spine item, processing every <body> it contains."""
    self.current_item = item
    # Reuse the stylizer computed during SVG rasterization when available
    stylizer = self.svg_rasterizer.stylizer_cache.get(item)
    if stylizer is None:
        stylizer = Stylizer(item.data, item.href, self.oeb, self.opts,
                            profile=self.opts.output_profile,
                            base_css=self.base_css)
    self.abshref = self.images_manager.abshref = item.abshref
    # Fall back to the document language when the markup carries none
    self.current_lang = lang_for_tag(
        item.data) or self.styles_manager.document_lang
    for i, body in enumerate(base.XPath('//h:body')(item.data)):
        with self.blocks:
            # Bookmark marking the top of this body, used as a link target
            self.blocks.top_bookmark = \
                self.links_manager.bookmark_for_anchor(
                    self.links_manager.top_anchor, self.current_item, body)
            self.process_tag(body, stylizer, is_first_tag=i == 0)
def from_files(container):
    '''
    Generate a Table of Contents from files in the book.
    '''
    toc = TOC()
    for index, spinepath in enumerate(container.spine_items):
        name = container.abspath_to_name(spinepath)
        root = container.parsed(name)
        body = base.XPath('//h:body')(root)
        if not body:
            continue
        title = find_text(body[0])
        if not title:
            # Fall back to the file name; label the very first spine item
            # "Cover" when its name looks like a cover/title page.
            title = name.rpartition('/')[-1]
            stem = title.rpartition('.')[0].lower()
            if index == 0 and stem in {'titlepage', 'cover'}:
                title = 'Cover'
        toc.add(title, name)
    return toc
def item_at_top(elem):
    """Return True if *elem* is at the visual top of its document.

    Walks the document in order up to *elem* and returns False as soon
    as any preceding element would render content (an image, text, or a
    tail outside elem's ancestor chain).
    """
    try:
        body = base.XPath('//h:body')(elem.getroottree().getroot())[0]
    except (TypeError, IndexError, KeyError, AttributeError):
        return False
    tree = body.getroottree()
    path = tree.getpath(elem)
    for el in body.iterdescendants(etree.Element):
        epath = tree.getpath(el)
        if epath == path:
            # Reached elem itself; nothing rendered before it
            break
        try:
            if el.tag.endswith('}img') or (el.text and el.text.strip()):
                # Image or visible text before elem
                return False
        except Exception:
            return False
        if not path.startswith(epath):
            # Only check tail of non-parent elements
            if el.tail and el.tail.strip():
                return False
    return True
def _spine_add_extra(self):
    """Append to the spine any HTML files reachable via <a href> links
    from spine documents but not themselves listed in the spine.

    Performs a breadth-first closure over link targets, then adds the
    discovered documents as non-linear spine items.
    """
    manifest = self.oeb.manifest
    spine = self.oeb.spine
    unchecked = set(spine)
    selector = base.XPath('h:body//h:a/@href')
    extras = set()
    # Fixed-point iteration: newly found documents may link to more
    while unchecked:
        new = set()
        for item in unchecked:
            if item.media_type not in base.OEB_DOCS:
                # TODO: handle fallback chains
                continue
            for href in selector(item.data):
                href, _ = urllib.parse.urldefrag(href)
                if not href:
                    continue
                try:
                    href = item.abshref(base.urlnormalize(href))
                except ValueError:
                    # Malformed URL
                    continue
                if href not in manifest.hrefs:
                    continue
                found = manifest.hrefs[href]
                if found.media_type not in base.OEB_DOCS or \
                        found in spine or found in extras:
                    continue
                new.add(found)
        extras.update(new)
        unchecked = new
    version = int(self.oeb.version[0])
    removed_items_to_ignore = getattr(self.oeb,
                                      'removed_items_to_ignore', ())
    for item in sorted(extras):
        if item.href in removed_items_to_ignore:
            continue
        if version >= 2:
            # EPUB 2+ requires all content documents to be in the spine
            self.logger.warn('Spine-referenced file %r not in spine' %
                             item.href)
        spine.add(item, linear=False)
def all_stylesheets(container, name):
    """Yield the names of CSS stylesheets linked from the <head> of the
    HTML file *name*.

    :param container: the book container
    :param name: name of the HTML file whose links are resolved
    """
    for link in base.XPath('//h:head/h:link[@href]')(container.parsed(name)):
        # BUGFIX: resolve each href relative to the HTML file. The
        # original rebound ``name`` inside the loop, so every link after
        # the first was resolved against the previous stylesheet's name
        # instead of the document's.
        sheet_name = container.href_to_name(link.get('href'), name)
        typ = link.get('type', 'text/css')
        if typ == 'text/css':
            yield sheet_name
def from_xpaths(container, xpaths):
    '''
    Generate a Table of Contents from a list of XPath expressions. Each
    expression in the list corresponds to a level of the generate ToC. For
    example: :code:`['//h:h1', '//h:h2', '//h:h3']` will generate a three
    level Table of Contents from the ``<h1>``, ``<h2>`` and ``<h3>`` tags.
    '''
    tocroot = TOC()
    xpaths = [base.XPath(xp) for xp in xpaths]

    # Find those levels that have no elements in all spine items
    maps = collections.OrderedDict()
    empty_levels = {i + 1 for i, xp in enumerate(xpaths)}
    for spinepath in container.spine_items:
        name = container.abspath_to_name(spinepath)
        root = container.parsed(name)
        # Map ToC level -> elements matched in this file
        level_item_map = maps[name] = {
            i + 1: frozenset(xp(root))
            for i, xp in enumerate(xpaths)
        }
        for lvl, elems in level_item_map.items():
            if elems:
                empty_levels.discard(lvl)
    # Remove empty levels from all level_maps, renumbering the remaining
    # levels so they stay contiguous starting at 1
    if empty_levels:
        for name, lmap in tuple(maps.items()):
            lmap = {
                lvl: items
                for lvl, items in lmap.items() if lvl not in empty_levels
            }
            lmap = sorted(lmap.items(), key=operator.itemgetter(0))
            lmap = {i + 1: items for i, (l, items) in enumerate(lmap)}
            maps[name] = lmap

    # Track the level each generated ToC node sits at; the root is level 0
    node_level_map = {tocroot: 0}

    def parent_for_level(child_level):
        # Find the deepest, most recently added node whose level is
        # exactly child_level - 1, walking down the last-child chain
        limit = child_level - 1

        def process_node(node):
            child = node.last_child
            if child is None:
                return node
            lvl = node_level_map[child]
            return (node if lvl > limit else child
                    if lvl == limit else process_node(child))

        return process_node(tocroot)

    for name, level_item_map in maps.items():
        root = container.parsed(name)
        # Invert: element -> its ToC level
        item_level_map = {
            e: i
            for i, elems in level_item_map.items() for e in elems
        }
        item_dirtied = False
        all_ids = set(root.xpath('//*/@id'))

        # Document order guarantees correct nesting of ToC entries
        for item in root.iterdescendants(etree.Element):
            lvl = item_level_map.get(item, None)
            if lvl is None:
                continue
            text = elem_to_toc_text(item)
            parent = parent_for_level(lvl)
            if item_at_top(item):
                # Element is at the top of the file: link to the file
                # itself, no anchor needed
                dirtied, elem_id = False, None
            else:
                dirtied, elem_id = ensure_id(item, all_ids)
            item_dirtied = dirtied or item_dirtied
            toc = parent.add(text, name, elem_id)
            node_level_map[toc] = lvl
            toc.dest_exists = True

        if item_dirtied:
            container.commit_item(name, keep_parsed=True)

    return tocroot
def do_split(split_point, log, before=True):
    '''
    Split tree into a *before* and an *after* tree at ``split_point``.

    :param split_point: The Element at which to split
    :param before: If True tree is split before split_point, otherwise
                   after split_point
    :return: before_tree, after_tree
    '''
    if before:
        # We cannot adjust for after since moving an after split point to a
        # parent will cause breakage if the parent contains any content
        # after the original split point
        split_point = adjust_split_point(split_point, log)
    tree = split_point.getroottree()
    path = tree.getpath(split_point)

    # Work on two deep copies; locate the split point in each via its path
    tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree)
    root, root2 = tree.getroot(), tree2.getroot()
    body, body2 = map(get_body, (root, root2))
    split_point = root.xpath(path)[0]
    split_point2 = root2.xpath(path)[0]

    def nix_element(elem, top=True):
        # Remove elem unless top is False in which case replace elem by
        # its children
        parent = elem.getparent()
        if top:
            parent.remove(elem)
        else:
            index = parent.index(elem)
            parent[index:index + 1] = list(elem.iterchildren())

    # Tree 1: keep everything up to (and optionally including) split_point
    hit_split_point = False
    keep_descendants = False
    split_point_descendants = frozenset(split_point.iterdescendants())
    for elem in tuple(body.iterdescendants()):
        if elem is split_point:
            hit_split_point = True
            if before:
                nix_element(elem)
            else:
                # We want to keep the descendants of the split point in
                # Tree 1
                keep_descendants = True
                # We want the split point element, but not its tail
                elem.tail = '\n'
            continue
        if hit_split_point:
            if keep_descendants:
                if elem in split_point_descendants:
                    # elem is a descendant keep it
                    continue
                else:
                    # We are out of split_point, so prevent further set
                    # lookups of split_point_descendants
                    keep_descendants = False
            nix_element(elem)

    # Tree 2: drop everything before split_point, preserving ancestors
    ancestors = frozenset(base.XPath('ancestor::*')(split_point2))
    for elem in tuple(body2.iterdescendants()):
        if elem is split_point2:
            if not before:
                # Keep the split point element's tail, if it contains
                # non-whitespace text
                tail = elem.tail
                if tail and not tail.isspace():
                    parent = elem.getparent()
                    idx = parent.index(elem)
                    if idx == 0:
                        parent.text = (parent.text or '') + tail
                    else:
                        sib = parent[idx - 1]
                        sib.tail = (sib.tail or '') + tail
            # Remove the element itself
            nix_element(elem)
            break
        if elem in ancestors:
            # We have to preserve the ancestors as they could have CSS
            # styles that are inherited/applicable, like font or
            # width. So we only remove the text, if any.
            elem.text = '\n'
        else:
            nix_element(elem, top=False)
    body2.text = '\n'

    return tree, tree2
def flatten_node(self, node, stylizer, names, styles, pseudo_styles,
                 psize, item_id, recurse=True):
    """Flatten the computed CSS of *node* into generated class attributes.

    Legacy presentational attributes (align, font, color, ...) are folded
    into CSS, font sizes are rescaled, and the resulting declarations are
    deduplicated into shared classes stored in *styles*/*pseudo_styles*.
    ``psize`` is the parent font size used for em calculations.
    """
    if not isinstance(node.tag, (str, bytes)) \
            or parse_utils.namespace(node.tag) != const.XHTML_NS:
        # Not an XHTML element (comment, PI, foreign namespace): skip
        return
    tag = parse_utils.barename(node.tag)
    style = stylizer.style(node)
    cssdict = style.cssdict()
    try:
        font_size = style['font-size']
    except:
        font_size = self.sbase if self.sbase is not None else \
            self.context.source.fbase
    if tag == 'body' and isinstance(font_size, numbers.Number):
        stylizer.body_font_size = font_size
    # Fold the legacy align attribute into CSS
    if 'align' in node.attrib:
        if tag != 'img':
            cssdict['text-align'] = node.attrib['align']
            if cssdict['text-align'] == 'center':
                # align=center causes tables to be center aligned,
                # which text-align does not. And the ever trustworthy Word
                # uses this construct in its HTML output. See
                # https://bugs.launchpad.net/bugs/1569583
                if tag == 'table':
                    if 'margin-left' not in cssdict and \
                            'margin-right' not in cssdict:
                        cssdict['margin-left'] = cssdict[
                            'margin-right'] = 'auto'
                else:
                    for table in node.iterchildren(
                            base.tag('xhtml', "table")):
                        ts = stylizer.style(table)
                        if ts.get('margin-left') is None and ts.get(
                                'margin-right') is None:
                            ts.set('margin-left', 'auto')
                            ts.set('margin-right', 'auto')
        else:
            # On images, align maps to vertical-align or float
            val = node.attrib['align']
            if val in ('middle', 'bottom', 'top'):
                cssdict['vertical-align'] = val
            elif val in ('left', 'right'):
                cssdict['float'] = val
        del node.attrib['align']
    if 'valign' in node.attrib and tag == 'td':
        if cssdict.get('vertical-align') == 'inherit':
            cssdict['vertical-align'] = node.attrib['valign']
        del node.attrib['valign']
    # Replace the legacy <font> element with <div>/<span>
    if node.tag == base.tag('xhtml', 'font'):
        tags = [
            'descendant::h:%s' % x
            for x in ('p', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5',
                      'h6', 'ol', 'ul', 'dl', 'blockquote')
        ]
        # TODO(gryf): this will override tag from line 355. On purpose?
        tag = 'div' if base.XPath('|'.join(tags))(node) else 'span'
        node.tag = base.tag('xhtml', tag)
        if 'size' in node.attrib:

            def force_int(raw):
                return int(re.search(r'([0-9+-]+)', raw).group(1))

            size = node.attrib['size'].strip()
            if size:
                fnums = self.context.source.fnums
                if size[0] in ('+', '-'):
                    # Oh, the warcrimes
                    try:
                        esize = 3 + force_int(size)
                    except:
                        esize = 3
                    if esize < 1:
                        esize = 1
                    if esize > 7:
                        esize = 7
                    font_size = fnums[esize]
                else:
                    try:
                        font_size = fnums[force_int(size)]
                    except:
                        font_size = fnums[3]
                cssdict['font-size'] = '%.1fpt' % font_size
            del node.attrib['size']
        if 'face' in node.attrib:
            cssdict['font-family'] = node.attrib['face']
            del node.attrib['face']
    # Fold legacy color/bgcolor attributes into CSS
    if 'color' in node.attrib:
        try:
            cssdict['color'] = cp_css.Property(
                'color', node.attrib['color']).value
        except (ValueError, dom.SyntaxErr):
            pass
        del node.attrib['color']
    if 'bgcolor' in node.attrib:
        try:
            cssdict['background-color'] = cp_css.Property(
                'background-color', node.attrib['bgcolor']).value
        except (ValueError, dom.SyntaxErr):
            pass
        del node.attrib['bgcolor']
    if tag == 'ol' and 'type' in node.attrib:
        del node.attrib['type']
    if cssdict.get('font-weight', '').lower() == 'medium':
        cssdict[
            'font-weight'] = 'normal'  # ADE chokes on font-weight medium
    fsize = font_size
    # A single-character floated element with its own font-size is
    # treated as a drop cap and excluded from font rescaling
    is_drop_cap = (
        cssdict.get('float', None) == 'left' and 'font-size' in cssdict
        and len(node) == 0 and node.text and
        (len(node.text) == 1 or
         (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
    # Detect drop caps generated by the docx input plugin
    if node.tag and node.tag.endswith('}p') and len(node) == 0 and \
            node.text and len(node.text.strip()) == 1 and \
            not node.tail and 'line-height' in cssdict and \
            'font-size' in cssdict:
        dp = node.getparent()
        if dp.tag and dp.tag.endswith('}div') and len(
                dp) == 1 and not dp.text:
            if stylizer.style(dp).cssdict().get('float', None) == 'left':
                is_drop_cap = True
    if not self.context.disable_font_rescaling and not is_drop_cap:
        _sbase = self.sbase if self.sbase is not None else \
            self.context.source.fbase
        # data-calibre-rescale overrides the computed rescaling factor
        dyn_rescale = node.attrib.pop('data-calibre-rescale', None)
        if dyn_rescale is not None:
            try:
                dyn_rescale = float(dyn_rescale) / 100
            except Exception:
                dyn_rescale = 1
            fsize = self.fmap[_sbase]
            fsize *= dyn_rescale
            cssdict['font-size'] = '%0.5fem' % (fsize / psize)
            psize = fsize
        elif 'font-size' in cssdict or tag == 'body':
            fsize = self.fmap[font_size]
            try:
                cssdict['font-size'] = "%0.5fem" % (fsize / psize)
            except ZeroDivisionError:
                cssdict['font-size'] = '%.1fpt' % fsize
            psize = fsize
    # Enforce the configured minimum line height (skipped for drop caps)
    try:
        minlh = self.context.minimum_line_height / 100.
        slh = style['line-height']
        if not is_drop_cap and isinstance(
                slh, numbers.Number) and slh < minlh * fsize:
            cssdict['line-height'] = str(minlh)
    except Exception:
        self.oeb.logger.exception('Failed to set minimum line-height')
    if cssdict:
        # Drop filtered properties, but keep font-family when it matches
        # the configured body font family
        for x in self.filter_css:
            popval = cssdict.pop(x, None)
            if self.body_font_family and popval and x == 'font-family' \
                    and popval.partition(',')[0][1:-1] == \
                    self.body_font_family.partition(',')[0][1:-1]:
                cssdict[x] = popval
    if cssdict:
        if self.lineh and self.fbase and tag != 'body':
            self.clean_edges(cssdict, style, psize)
        if 'display' in cssdict and cssdict['display'] == 'in-line':
            cssdict['display'] = 'inline'
        if self.unfloat and 'float' in cssdict \
                and cssdict.get('display', 'none') != 'none':
            del cssdict['display']
        if self.untable and 'display' in cssdict \
                and cssdict['display'].startswith('table'):
            display = cssdict['display']
            if display == 'table-cell':
                cssdict['display'] = 'inline'
            else:
                cssdict['display'] = 'block'
        if 'vertical-align' in cssdict \
                and cssdict['vertical-align'] == 'sup':
            cssdict['vertical-align'] = 'super'
    if self.lineh and 'line-height' not in cssdict:
        lineh = self.lineh / psize
        cssdict['line-height'] = "%0.5fem" % lineh
    # Apply paragraph spacing / blank line insertion options
    if (self.context.remove_paragraph_spacing
            or self.context.insert_blank_line) and tag in ('p', 'div'):
        if item_id != 'calibre_jacket' or \
                self.context.output_profile.name == 'Kindle':
            for prop in ('margin', 'padding', 'border'):
                for edge in ('top', 'bottom'):
                    cssdict['%s-%s' % (prop, edge)] = '0pt'
            if self.context.insert_blank_line:
                cssdict['margin-top'] = cssdict['margin-bottom'] = \
                    '%fem'%self.context.insert_blank_line_size
            indent_size = self.context.remove_paragraph_spacing_indent_size
            keep_indents = indent_size < 0.0
            if (self.context.remove_paragraph_spacing and not keep_indents
                    and cssdict.get('text-align', None) not in
                    ('center', 'right')):
                cssdict['text-indent'] = "%1.1fem" % indent_size
    pseudo_classes = style.pseudo_classes(self.filter_css)
    if cssdict or pseudo_classes:
        keep_classes = set()
        if cssdict:
            # Deduplicate identical declaration blocks into one class
            items = sorted(cssdict.items())
            css = ';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            classes = node.get('class', '').strip() or 'calibre'
            classes_list = classes.split()
            # lower() because otherwise if the document uses the same class
            # name with different case, both cases will apply, leading
            # to incorrect results.
            klass = ascii_text(STRIPNUM.sub(
                '', classes_list[0])).lower().strip().replace(' ', '_')
            if css in styles:
                match = styles[css]
            else:
                match = klass + str(names[klass] or '')
                styles[css] = match
                names[klass] += 1
            node.attrib['class'] = match
            keep_classes.add(match)
        for psel, cssdict in pseudo_classes.items():
            items = sorted(cssdict.items())
            css = ';\n'.join('%s: %s' % (key, val) for key, val in items)
            pstyles = pseudo_styles[psel]
            if css in pstyles:
                match = pstyles[css]
            else:
                # We have to use a different class for each psel as
                # otherwise you can have incorrect styles for a situation
                # like: a:hover { color: red } a:link { color: blue }
                # a.x:hover { color: green }
                # If the pcalibre class for a:hover and a:link is the same,
                # then the class attribute for a.x tags will contain both
                # that class and the class for a.x:hover, which is wrong.
                klass = 'pcalibre'
                match = klass + str(names[klass] or '')
                pstyles[css] = match
                names[klass] += 1
            keep_classes.add(match)
        node.attrib['class'] = ' '.join(keep_classes)
    elif 'class' in node.attrib:
        del node.attrib['class']
    if 'style' in node.attrib:
        del node.attrib['style']
    if recurse:
        for child in node:
            self.flatten_node(child, stylizer, names, styles,
                              pseudo_styles, psize, item_id)
def read_inline_toc(self, href, frag):
    """Parse an in-book (HTML) Table of Contents into a TOC tree.

    :param href: path to the HTML file containing the ToC
    :param frag: optional fragment marking where the ToC starts
    :return: the populated TOC root
    """
    ans = TOC()
    base_href = '/'.join(href.split('/')[:-1])
    with open(href.replace('/', os.sep), 'rb') as f:
        raw = f.read().decode(self.header.codec)
    root = parse_html(raw, log=self.log)
    body = base.XPath('//h:body')(root)
    reached = False
    if body:
        start = body[0]
    else:
        # No body: collect links from the whole document
        start = None
        reached = True
    if frag:
        # Collecting starts at the fragment's element, if present
        elems = base.XPath('//*[@id="%s"]' % frag)(root)
        if elems:
            start = elems[0]

    def node_depth(elem):
        # Number of ancestors between elem and the document root
        ans = 0
        parent = elem.getparent()
        while parent is not None:
            parent = parent.getparent()
            ans += 1
        return ans

    # Layer the ToC based on nesting order in the source HTML
    current_depth = None
    parent = ans
    seen = set()
    links = []
    for elem in root.iterdescendants(etree.Element):
        if reached and elem.tag == base.tag('xhtml', 'a') and elem.get(
                'href', False):
            href = elem.get('href')
            href, frag = urllib.parse.urldefrag(href)
            href = base_href + '/' + href
            text = base.xml2text(elem).strip()
            if (text, href, frag) in seen:
                continue
            seen.add((text, href, frag))
            links.append((text, href, frag, node_depth(elem)))
        elif elem is start:
            reached = True

    # Normalize raw element depths to contiguous ToC levels
    depths = sorted(set(x[-1] for x in links))
    depth_map = {x: i for i, x in enumerate(depths)}
    for text, href, frag, depth in links:
        depth = depth_map[depth]
        if current_depth is None:
            # First entry goes at the top level
            current_depth = 0
            parent.add_item(href, frag, text)
        elif current_depth == depth:
            # Sibling of the previous entry
            parent.add_item(href, frag, text)
        elif current_depth < depth:
            # Deeper: nest under the last added entry (one level at most)
            parent = parent[-1] if len(parent) > 0 else parent
            parent.add_item(href, frag, text)
            current_depth += 1
        else:
            # Shallower: climb back up the required number of levels
            delta = current_depth - depth
            while delta > 0 and parent.parent is not None:
                parent = parent.parent
                delta -= 1
            parent.add_item(href, frag, text)
            current_depth = depth
    return ans
def workaround_ade_quirks(self):  # {{{
    """
    Perform various markup transforms to get the output to render
    correctly in the quirky ADE.
    """
    stylesheet = self.oeb.manifest.main_stylesheet

    # ADE cries big wet tears when it encounters an invalid fragment
    # identifier in the NCX toc.
    frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
    for node in self.oeb.toc.iter():
        href = getattr(node, 'href', None)
        if hasattr(href, 'partition'):
            _base, _, frag = href.partition('#')
            frag = urllib.parse.unquote(frag)
            if frag and frag_pat.match(frag) is None:
                self.log.warn('Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
                node.href = _base

    for x in self.oeb.spine:
        root = x.data
        body = base.XPath('//h:body')(root)
        if body:
            body = body[0]

        if hasattr(body, 'xpath'):
            # remove <img> tags with empty src elements
            bad = []
            for x in base.XPath('//h:img')(body):
                src = x.get('src', '').strip()
                if src in ('', '#') or src.startswith('http:'):
                    bad.append(x)
            for img in bad:
                img.getparent().remove(img)

            # Add id attribute to <a> tags that have name
            for x in base.XPath('//h:a[@name]')(body):
                if not x.get('id', False):
                    x.set('id', x.get('name'))
                # The delightful epubcheck has started complaining about
                # <a> tags that have name attributes.
                x.attrib.pop('name')

            # Replace <br> that are children of <body> as ADE doesn't
            # handle them
            for br in base.XPath('./h:br')(body):
                if br.getparent() is None:
                    continue
                try:
                    prior = next(br.itersiblings(preceding=True))
                    priortag = parse_utils.barename(prior.tag)
                    priortext = prior.tail
                except:
                    # No preceding sibling: br is the first child of body
                    priortag = 'body'
                    priortext = body.text
                if priortext:
                    priortext = priortext.strip()
                br.tag = base.tag('xhtml', 'p')
                br.text = '\u00a0'
                style = br.get('style', '').split(';')
                style = list(filter(None, map(lambda x: x.strip(), style)))
                style.append('margin:0pt; border:0pt')
                # If the prior tag is a block (including a <br> we
                # replaced) then this <br> replacement should have a
                # 1-line height. Otherwise it should have no height.
                if not priortext and priortag in block_level_tags:
                    style.append('height:1em')
                else:
                    style.append('height:0pt')
                br.set('style', '; '.join(style))

        # ADE cannot handle <embed> and non-SVG <object> elements
        for tag in base.XPath('//h:embed')(root):
            tag.getparent().remove(tag)
        for tag in base.XPath('//h:object')(root):
            if tag.get('type', '').lower().strip() in {
                    'image/svg+xml', 'application/svg+xml'}:
                continue
            tag.getparent().remove(tag)

        # Empty <title>/<style> elements and useless scripts confuse ADE
        for tag in base.XPath('//h:title|//h:style')(root):
            if not tag.text:
                tag.getparent().remove(tag)
        for tag in base.XPath('//h:script')(root):
            if (not tag.text and not tag.get('src', False) and
                    tag.get('type', None) != 'text/x-mathjax-config'):
                tag.getparent().remove(tag)
        for tag in base.XPath('//h:body/descendant::h:script')(root):
            tag.getparent().remove(tag)

        # Keep real forms (they have form controls); demote decorative
        # <form> wrappers to <div>
        formchildren = base.XPath('./h:input|./h:button|./h:textarea|'
                                  './h:label|./h:fieldset|./h:legend')
        for tag in base.XPath('//h:form')(root):
            if formchildren(tag):
                tag.getparent().remove(tag)
            else:
                # Not a real form
                tag.tag = base.tag('xhtml', 'div')

        for tag in base.XPath('//h:center')(root):
            tag.tag = base.tag('xhtml', 'div')
            tag.set('style', 'text-align:center')

        # ADE can't handle & in an img url
        for tag in base.XPath('//h:img[@src]')(root):
            tag.set('src', tag.get('src', '').replace('&', ''))

        # ADE whimpers in fright when it encounters a <td> outside a
        # <table>
        in_table = base.XPath('ancestor::h:table')
        for tag in base.XPath('//h:td|//h:tr|//h:th')(root):
            if not in_table(tag):
                tag.tag = base.tag('xhtml', 'div')

        # ADE fails to render non breaking hyphens/soft hyphens/zero
        # width spaces
        special_chars = re.compile('[\u200b\u00ad]')
        for elem in root.iterdescendants('*'):
            if elem.text:
                elem.text = special_chars.sub('', elem.text)
                elem.text = elem.text.replace('\u2011', '-')
            if elem.tail:
                elem.tail = special_chars.sub('', elem.tail)
                elem.tail = elem.tail.replace('\u2011', '-')

        if stylesheet is not None:
            # ADE doesn't render lists correctly if they have left margins
            from css_parser.css import CSSRule
            for lb in base.XPath('//h:ul[@class]|//h:ol[@class]')(root):
                sel = '.'+lb.get('class')
                for rule in stylesheet.data.cssRules.rulesOfType(
                        CSSRule.STYLE_RULE):
                    if sel == rule.selectorList.selectorText:
                        rule.style.removeProperty('margin-left')
                        # padding-left breaks rendering in webkit and gecko
                        rule.style.removeProperty('padding-left')

            # Change whitespace:pre to pre-wrap to accommodate readers
            # that cannot scroll horizontally
            for rule in stylesheet.data.cssRules.rulesOfType(
                    CSSRule.STYLE_RULE):
                style = rule.style
                ws = style.getPropertyValue('white-space')
                if ws == 'pre':
                    style.setProperty('white-space', 'pre-wrap')
def merge_html(container, names, master, insert_page_breaks=False):
    """Merge the HTML files *names* into *master*, appending their body
    content and renaming clashing anchors.

    :param insert_page_breaks: when True, force a page break before each
        merged file's content
    :return: mapping of merged file name -> id of its first element in
        the master file
    """
    p = container.parsed
    root = p(master)

    # Ensure master has a <head>
    head = root.find('h:head', namespaces=const.XPNSMAP)
    if head is None:
        head = root.makeelement(base.tag('xhtml', 'head'))
        container.insert_into_xml(root, head, 0)

    seen_anchors = all_anchors(root)
    seen_stylesheets = set(all_stylesheets(container, master))
    master_body = p(master).findall('h:body',
                                    namespaces=const.XPNSMAP)[-1]
    master_base = os.path.dirname(master)
    # old-anchor -> new-anchor mapping per merged file
    anchor_map = {n: {} for n in names if n != master}
    first_anchor_map = {}

    for name in names:
        if name == master:
            continue
        # Insert new stylesheets into master
        for sheet in all_stylesheets(container, name):
            if sheet not in seen_stylesheets:
                seen_stylesheets.add(sheet)
                link = head.makeelement(base.tag('xhtml', 'link'),
                                        rel='stylesheet', type='text/css',
                                        href=container.name_to_href(
                                            sheet, master))
                container.insert_into_xml(head, link)

        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name,
                                                      master))

        root = p(name)
        # Collect body content: leading text becomes a string entry
        children = []
        for body in p(name).findall('h:body', namespaces=const.XPNSMAP):
            children.append(
                body.text if body.text and body.text.strip() else '\n\n')
            children.extend(body)

        # Find the first element child, wrapping leading text in a <p>
        # when the body contains no elements at all
        first_child = ''
        for first_child in children:
            if not isinstance(first_child, (str, bytes)):
                break
        if isinstance(first_child, (str, bytes)):
            # body contained only text, no tags
            first_child = body.makeelement(base.tag('xhtml', 'p'))
            first_child.text, children[0] = children[0], first_child

        amap = anchor_map[name]
        remove_name_attributes(root)

        # Rename ids that clash with anchors already seen in master
        for elem in root.xpath('//*[@id]'):
            val = elem.get('id')
            if not val:
                continue
            if val in seen_anchors:
                nval = unique_anchor(seen_anchors, val)
                elem.set('id', nval)
                amap[val] = nval
            else:
                seen_anchors.add(val)

        # Guarantee an anchor on the first element so links to the whole
        # file can be redirected to it
        if 'id' not in first_child.attrib:
            first_child.set('id', unique_anchor(seen_anchors, 'top'))
            seen_anchors.add(first_child.get('id'))
        first_anchor_map[name] = first_child.get('id')

        if insert_page_breaks:
            first_child.set(
                'style',
                first_child.get('style', '') +
                '; page-break-before: always')

        # Links with an empty fragment map to the file's first element
        amap[''] = first_child.get('id')

        # Fix links that point to local changed anchors
        for a in base.XPath('//h:a[starts-with(@href, "#")]')(root):
            q = a.get('href')[1:]
            if q in amap:
                a.set('href', '#' + amap[q])

        # Append the collected content to master's body
        for child in children:
            if isinstance(child, (str, bytes)):
                add_text(master_body, child)
            else:
                master_body.append(copy.deepcopy(child))

        container.remove_item(name, remove_from_guide=False)

    # Fix all links in the container that point to merged files
    for fname, media_type in container.mime_map.items():
        repl = MergeLinkReplacer(fname, anchor_map, master, container)
        container.replace_links(fname, repl)

    return first_anchor_map
class OEBReader(object):
    """Read an OEBPS 1.x or OPF/OPS 2.0 file collection."""

    # XPath matching the first inline SVG in a document body (cover detection).
    COVER_SVG_XP = base.XPath('h:body//svg:svg[position() = 1]')
    # XPath matching the first <object data=...> in a body (cover detection).
    COVER_OBJECT_XP = base.XPath('h:body//h:object[@data][position() = 1]')

    Container = base.DirContainer
    """Container type used to access book files.  Override in sub-classes."""

    DEFAULT_PROFILE = 'PRS505'
    """Default renderer profile for content read with this Reader."""

    TRANSFORMS = []
    """List of transforms to apply to content read with this Reader."""

    @classmethod
    def config(cls, cfg):
        """Add any book-reading options to the :class:`Config` object
        :param:`cfg`.
        """
        return

    @classmethod
    def generate(cls, opts):
        """Generate a Reader instance from command-line options."""
        return cls()

    def __call__(self, oeb, path):
        """Read the book at :param:`path` into the :class:`OEBBook` object
        :param:`oeb` and return it.
        """
        self.oeb = oeb
        self.logger = self.log = oeb.logger
        oeb.container = self.Container(path, self.logger)
        oeb.container.log = oeb.log
        opf = self._read_opf()
        self._all_from_opf(opf)
        return oeb

    def _clean_opf(self, opf):
        """Normalize a parsed OPF tree to OPF 2.0 namespaces.

        Unqualified / OPF 1.x element tags are moved into the OPF 2.0
        namespace, Dublin Core metadata is normalized to the DC 1.1
        namespace, and a fresh <package> root is built containing the
        cleaned metadata plus the original manifest/spine/tours/guide.
        Returns the new root element.
        """
        nsmap = {}
        for elem in opf.iter(tag=etree.Element):
            nsmap.update(elem.nsmap)
        for elem in opf.iter(tag=etree.Element):
            if (parse_utils.namespace(elem.tag) in ('', const.OPF1_NS) and
                    ':' not in parse_utils.barename(elem.tag)):
                elem.tag = base.tag('opf', parse_utils.barename(elem.tag))
        nsmap.update(const.OPF2_NSMAP)
        attrib = dict(opf.attrib)
        nroot = etree.Element(base.tag('opf', 'package'),
                              nsmap={None: const.OPF2_NS}, attrib=attrib)
        metadata = etree.SubElement(nroot, base.tag('opf', 'metadata'),
                                    nsmap=nsmap)
        # OPF 1.x wrapper elements whose children we hoist directly.
        ignored = (base.tag('opf', 'dc-metadata'),
                   base.tag('opf', 'x-metadata'))
        for elem in base.xpath(opf, 'o2:metadata//*'):
            if elem.tag in ignored:
                continue
            if parse_utils.namespace(elem.tag) in const.DC_NSES:
                tag = parse_utils.barename(elem.tag).lower()
                elem.tag = '{%s}%s' % (const.DC11_NS, tag)
            # Handle literal 'dc:' prefixed tags (no namespace resolution).
            if elem.tag.startswith('dc:'):
                tag = elem.tag.partition(':')[-1].lower()
                elem.tag = '{%s}%s' % (const.DC11_NS, tag)
            metadata.append(elem)
        for element in base.xpath(opf, 'o2:metadata//o2:meta'):
            metadata.append(element)
        for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
            for element in base.xpath(opf, tag):
                nroot.append(element)
        return nroot

    def _read_opf(self):
        """Read, repair and parse the container's OPF document.

        Falls back through progressively more aggressive repairs: first
        replacing invalid HTML named entities, then stripping a broken
        <tours> section and declaring the dc namespace on <dc-metadata>.
        Raises :class:`base.OEBError` for an unrecognized root namespace.
        """
        data = self.oeb.container.read(None)
        data = self.oeb.decode(data)
        data = base.XMLDECL_RE.sub('', data)
        # Normalize the legacy OEB 1.0 namespace URI to OPF1_NS.
        data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
                      const.OPF1_NS, data)
        try:
            opf = etree.fromstring(data)
        except etree.XMLSyntaxError:
            data = xml_replace_entities(clean_xml_chars(data), encoding=None)
            try:
                opf = etree.fromstring(data)
                self.logger.warn('OPF contains invalid HTML named entities')
            except etree.XMLSyntaxError:
                data = re.sub(r'(?is)<tours>.+</tours>', '', data)
                data = data.replace(
                    '<dc-metadata>',
                    '<dc-metadata xmlns:dc="'
                    'http://purl.org/metadata/dublin_core">')
                opf = etree.fromstring(data)
                self.logger.warn('OPF contains invalid tours section')
        ns = parse_utils.namespace(opf.tag)
        if ns not in ('', const.OPF1_NS, const.OPF2_NS):
            raise base.OEBError('Invalid namespace %r for OPF document' % ns)
        opf = self._clean_opf(opf)
        return opf

    def _metadata_from_opf(self, opf):
        """Populate ``self.oeb.metadata`` from the cleaned OPF tree.

        Supplies defaults where the OPF is missing data: current locale as
        language, an appname/version book producer, a fresh uuid identifier,
        and 'Unknown' title/author.
        """
        from ebook_converter.ebooks.metadata.opf2 import OPF
        from ebook_converter.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        stream = io.BytesIO(
            etree.tostring(opf, xml_declaration=True, encoding='utf-8'))
        o = OPF(stream)
        pwm = o.primary_writing_mode
        if pwm:
            self.oeb.metadata.primary_writing_mode = pwm
        mi = o.to_book_metadata()
        if not mi.language:
            mi.language = get_lang().replace('_', '-')
        self.oeb.metadata.add('language', mi.language)
        if not mi.book_producer:
            mi.book_producer = ('%(a)s (%(v)s) [http://%(a)s-ebook.com]' %
                                dict(a=__appname__, v=__version__))
        meta_info_to_oeb_metadata(mi, self.oeb.metadata, self.logger)
        m = self.oeb.metadata
        m.add('identifier', str(uuid.uuid4()), id='uuid_id', scheme='uuid')
        # The freshly added uuid (last identifier) becomes the book uid.
        self.oeb.uid = self.oeb.metadata.identifier[-1]
        if not m.title:
            m.add('title', self.oeb.translate('Unknown'))
        has_aut = False
        for x in m.creator:
            if getattr(x, 'role', '').lower() in ('', 'aut'):
                has_aut = True
                break
        if not has_aut:
            m.add('creator', self.oeb.translate('Unknown'), role='aut')

    def _manifest_prune_invalid(self):
        '''
        Remove items from manifest that contain invalid data. This prevents
        catastrophic conversion failure, when a few files contain corrupted
        data.
        '''
        bad = []
        check = base.OEB_DOCS.union(base.OEB_STYLES)
        for item in list(self.oeb.manifest.values()):
            if item.media_type in check:
                try:
                    # Accessing .data triggers parsing; failure means the
                    # file content is unusable.
                    item.data
                except KeyboardInterrupt:
                    raise
                except Exception:
                    self.logger.exception('Failed to parse content in %s' %
                                          item.href)
                    bad.append(item)
                    self.oeb.manifest.remove(item)
        return bad

    def _manifest_add_missing(self, invalid):
        """Add referenced-but-unlisted files to the manifest.

        Walks every document/stylesheet, collects local hrefs it links to,
        and adds any existing file missing from the manifest (with a
        guessed media type).  Newly added documents are scanned too.
        Items whose data cannot be read are removed at the end.
        """
        import css_parser
        manifest = self.oeb.manifest
        known = set(manifest.hrefs)
        unchecked = set(manifest.values())
        cdoc = base.OEB_DOCS | base.OEB_STYLES
        # NOTE(review): this immediately shadows the `invalid` parameter
        # passed by _manifest_from_opf — confirm the argument is meant to
        # be ignored (its items were already removed by
        # _manifest_prune_invalid).
        invalid = set()
        while unchecked:
            new = set()
            for item in unchecked:
                data = None
                if (item.media_type in cdoc or
                        item.media_type[-4:] in ('/xml', '+xml')):
                    try:
                        data = item.data
                    except Exception:
                        self.oeb.log.exception('Failed to read from manifest '
                                               'entry with id: %s, ignoring'
                                               % item.id)
                        invalid.add(item)
                        continue
                if data is None:
                    continue
                if (item.media_type in base.OEB_DOCS or
                        item.media_type[-4:] in ('/xml', '+xml')):
                    hrefs = [r[2] for r in base.iterlinks(data)]
                    for href in hrefs:
                        if isinstance(href, bytes):
                            href = href.decode('utf-8')
                        href, _ = urllib.parse.urldefrag(href)
                        if not href:
                            continue
                        try:
                            href = item.abshref(base.urlnormalize(href))
                            scheme = urllib.parse.urlparse(href).scheme
                        except Exception:
                            self.oeb.log.exception('Skipping invalid href: '
                                                   '%r' % href)
                            continue
                        # Only local (scheme-less), unknown hrefs qualify.
                        if not scheme and href not in known:
                            new.add(href)
                elif item.media_type in base.OEB_STYLES:
                    try:
                        urls = list(css_parser.getUrls(data))
                    except Exception:
                        urls = []
                    for url in urls:
                        href, _ = urllib.parse.urldefrag(url)
                        href = item.abshref(base.urlnormalize(href))
                        scheme = urllib.parse.urlparse(href).scheme
                        if not scheme and href not in known:
                            new.add(href)
            unchecked.clear()
            warned = set()
            for href in new:
                known.add(href)
                # Skip hrefs belonging to items already deemed invalid.
                is_invalid = False
                for item in invalid:
                    if href == item.abshref(base.urlnormalize(href)):
                        is_invalid = True
                        break
                if is_invalid:
                    continue
                if not self.oeb.container.exists(href):
                    if href not in warned:
                        self.logger.warn('Referenced file %r not found' %
                                         href)
                        warned.add(href)
                    continue
                if href not in warned:
                    self.logger.warn('Referenced file %r not in manifest' %
                                     href)
                    warned.add(href)
                id, _ = manifest.generate(id='added')
                guessed = mimetypes.guess_type(href)[0]
                media_type = guessed or base.BINARY_MIME
                added = manifest.add(id, href, media_type)
                # Newly added items are scanned in the next round.
                unchecked.add(added)
        for item in invalid:
            self.oeb.manifest.remove(item)

    def _manifest_from_opf(self, opf):
        """Build the manifest from the OPF <manifest>, then prune invalid
        entries and add files referenced but not listed.
        """
        manifest = self.oeb.manifest
        for elem in base.xpath(opf, '/o2:package/o2:manifest/o2:item'):
            id = elem.get('id')
            href = elem.get('href')
            media_type = elem.get('media-type', None)
            if media_type is None:
                # OEB 1.x attribute name.
                media_type = elem.get('mediatype', None)
            if not media_type or media_type == 'text/xml':
                guessed = mimetypes.guess_type(href)[0]
                media_type = guessed or media_type or base.BINARY_MIME
            if hasattr(media_type, 'lower'):
                media_type = media_type.lower()
            fallback = elem.get('fallback')
            if href in manifest.hrefs:
                self.logger.warn('Duplicate manifest entry for %r' % href)
                continue
            if not self.oeb.container.exists(href):
                self.logger.warn('Manifest item %r not found' % href)
                continue
            if id in manifest.ids:
                self.logger.warn('Duplicate manifest id %r' % id)
                id, href = manifest.generate(id, href)
            manifest.add(id, href, media_type, fallback)
        invalid = self._manifest_prune_invalid()
        self._manifest_add_missing(invalid)

    def _spine_add_extra(self):
        """Append to the spine (as non-linear) any documents reachable via
        <a href> links from spine documents but not listed in the spine.
        """
        manifest = self.oeb.manifest
        spine = self.oeb.spine
        unchecked = set(spine)
        selector = base.XPath('h:body//h:a/@href')
        extras = set()
        while unchecked:
            new = set()
            for item in unchecked:
                if item.media_type not in base.OEB_DOCS:
                    # TODO: handle fallback chains
                    continue
                for href in selector(item.data):
                    href, _ = urllib.parse.urldefrag(href)
                    if not href:
                        continue
                    try:
                        href = item.abshref(base.urlnormalize(href))
                    except ValueError:  # Malformed URL
                        continue
                    if href not in manifest.hrefs:
                        continue
                    found = manifest.hrefs[href]
                    if found.media_type not in base.OEB_DOCS or \
                            found in spine or found in extras:
                        continue
                    new.add(found)
            extras.update(new)
            # Follow links transitively from the newly found documents.
            unchecked = new
        version = int(self.oeb.version[0])
        removed_items_to_ignore = getattr(self.oeb,
                                          'removed_items_to_ignore', ())
        for item in sorted(extras):
            if item.href in removed_items_to_ignore:
                continue
            if version >= 2:
                # Only warn for OPF >= 2.0; OEB 1.x commonly omitted these.
                self.logger.warn('Spine-referenced file %r not in spine' %
                                 item.href)
            spine.add(item, linear=False)

    def _spine_from_opf(self, opf):
        """Build the spine from the OPF <spine>, fixing mislabeled XHTML
        items, dropping non-XML items, and adding linked extras.
        Raises :class:`base.OEBError` if the resulting spine is empty.
        """
        spine = self.oeb.spine
        manifest = self.oeb.manifest
        for elem in base.xpath(opf, '/o2:package/o2:spine/o2:itemref'):
            idref = elem.get('idref')
            if idref not in manifest.ids:
                self.logger.warn('Spine item %r not found' % idref)
                continue
            item = manifest.ids[idref]
            if (item.media_type.lower() in base.OEB_DOCS and
                    hasattr(item.data, 'xpath') and
                    not getattr(item.data, 'tag', '').endswith('}ncx')):
                spine.add(item, elem.get('linear'))
            else:
                if (hasattr(item.data, 'tag') and item.data.tag and
                        item.data.tag.endswith('}html')):
                    # Parsed as (X)HTML despite its declared media type.
                    item.media_type = base.XHTML_MIME
                    spine.add(item, elem.get('linear'))
                else:
                    self.oeb.log.warn('The item %s is not a XML document.'
                                      ' Removing it from spine.'
                                      % item.href)
        if len(spine) == 0:
            raise base.OEBError("Spine is empty")
        self._spine_add_extra()
        for val in base.xpath(
                opf, '/o2:package/o2:spine/@page-progression-direction'):
            if val in {'ltr', 'rtl'}:
                spine.page_progression_direction = val

    def _guide_from_opf(self, opf):
        """Build the guide from OPF <guide> references, fixing hrefs whose
        case does not match the manifest.
        """
        guide = self.oeb.guide
        manifest = self.oeb.manifest
        for elem in base.xpath(opf, '/o2:package/o2:guide/o2:reference'):
            ref_href = elem.get('href')
            path = base.urlnormalize(urllib.parse.urldefrag(ref_href)[0])
            if path not in manifest.hrefs:
                # Retry with a case-insensitive match.
                corrected_href = None
                for href in manifest.hrefs:
                    if href.lower() == path.lower():
                        corrected_href = href
                        break
                if corrected_href is None:
                    self.logger.warn('Guide reference %r not found' %
                                     ref_href)
                    continue
                ref_href = corrected_href
            typ = elem.get('type')
            if typ not in guide:
                guide.add(typ, elem.get('title'), ref_href)

    def _find_ncx(self, opf):
        """Locate the NCX item (via spine@toc or by media type), remove it
        from the manifest, and return it; return None when absent.
        """
        result = base.xpath(opf, '/o2:package/o2:spine/@toc')
        if result:
            id = result[0]
            if id not in self.oeb.manifest.ids:
                return None
            item = self.oeb.manifest.ids[id]
            self.oeb.manifest.remove(item)
            return item
        for item in self.oeb.manifest.values():
            if item.media_type == base.NCX_MIME:
                self.oeb.manifest.remove(item)
                return item
        return None

    def _toc_from_navpoint(self, item, toc, navpoint):
        """Recursively convert <ncx:navPoint> children of ``navpoint`` into
        TOC nodes under ``toc``, including calibre author/description/
        thumbnail metadata when present.
        """
        children = base.xpath(navpoint, 'ncx:navPoint')
        for child in children:
            title = ''.join(base.xpath(child,
                                       'ncx:navLabel/ncx:text/text()'))
            title = base.COLLAPSE_RE.sub(' ', title.strip())
            href = base.xpath(child, 'ncx:content/@src')
            if not title:
                # Untitled node: splice its children directly into `toc`.
                self._toc_from_navpoint(item, toc, child)
                continue
            if (not href or not href[0]) and not base.xpath(child,
                                                            'ncx:navPoint'):
                # This node is useless
                continue
            if href and href[0]:
                href = item.abshref(base.urlnormalize(href[0]))
            else:
                href = ''
            path, _ = urllib.parse.urldefrag(href)
            if path and path not in self.oeb.manifest.hrefs:
                path = base.urlnormalize(path)
            if href and path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                gc = base.xpath(child, 'ncx:navPoint')
                if not gc:
                    # This node is useless
                    continue
            id = child.get('id')
            klass = child.get('class', 'chapter')
            try:
                po = int(child.get('playOrder',
                                   self.oeb.toc.next_play_order()))
            except Exception:
                po = self.oeb.toc.next_play_order()
            authorElement = base.xpath(
                child, 'descendant::calibre:meta[@name = "author"]')
            if authorElement:
                author = authorElement[0].text
            else:
                author = None
            descriptionElement = base.xpath(
                child, 'descendant::calibre:meta[@name = '
                '"description"]')
            if descriptionElement:
                description = etree.tostring(descriptionElement[0],
                                             method='text',
                                             encoding='unicode').strip()
                if not description:
                    description = None
            else:
                description = None
            index_image = base.xpath(
                child, 'descendant::calibre:meta[@name = '
                '"toc_thumbnail"]')
            toc_thumbnail = (index_image[0].text if index_image else None)
            if not toc_thumbnail or not toc_thumbnail.strip():
                toc_thumbnail = None
            node = toc.add(title, href, id=id, klass=klass,
                           play_order=po, description=description,
                           author=author, toc_thumbnail=toc_thumbnail)
            self._toc_from_navpoint(item, node, child)

    def _toc_from_ncx(self, item):
        """Build the TOC from an NCX item; return True on success."""
        if (item is None) or (item.data is None):
            return False
        self.log.debug('Reading TOC from NCX...')
        ncx = item.data
        title = ''.join(base.xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
        title = base.COLLAPSE_RE.sub(' ', title.strip())
        title = title or str(self.oeb.metadata.title[0])
        toc = self.oeb.toc
        toc.title = title
        navmaps = base.xpath(ncx, 'ncx:navMap')
        for navmap in navmaps:
            self._toc_from_navpoint(item, toc, navmap)
        return True

    def _toc_from_tour(self, opf):
        """Build the TOC from an OPF 1.x <tour>; return True on success."""
        result = base.xpath(opf, 'o2:tours/o2:tour')
        if not result:
            return False
        self.log.debug('Reading TOC from tour...')
        tour = result[0]
        toc = self.oeb.toc
        toc.title = tour.get('title')
        sites = base.xpath(tour, 'o2:site')
        for site in sites:
            title = site.get('title')
            href = site.get('href')
            if not title or not href:
                continue
            path, _ = urllib.parse.urldefrag(base.urlnormalize(href))
            if path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                continue
            id = site.get('id')
            toc.add(title, href, id=id)
        return True

    def _toc_from_html(self, opf):
        """Build the TOC from the guide's 'toc' HTML page; return True on
        success.  Multiple links to the same href have their titles joined.
        """
        if 'toc' not in self.oeb.guide:
            return False
        self.log.debug('Reading TOC from HTML...')
        itempath, frag = urllib.parse.urldefrag(self.oeb.guide['toc'].href)
        item = self.oeb.manifest.hrefs[itempath]
        html = item.data
        if frag:
            elems = base.xpath(html, './/*[@id="%s"]' % frag)
            if not elems:
                elems = base.xpath(html, './/*[@name="%s"]' % frag)
            elem = elems[0] if elems else html
            # Climb until we find an ancestor that actually contains links.
            while elem != html and not base.xpath(elem, './/h:a[@href]'):
                elem = elem.getparent()
            html = elem
        titles = collections.defaultdict(list)
        order = []
        for anchor in base.xpath(html, './/h:a[@href]'):
            href = anchor.attrib['href']
            href = item.abshref(base.urlnormalize(href))
            path, frag = urllib.parse.urldefrag(href)
            if path not in self.oeb.manifest.hrefs:
                continue
            title = base.xml2text(anchor)
            title = base.COLLAPSE_RE.sub(' ', title.strip())
            if href not in titles:
                order.append(href)
            titles[href].append(title)
        toc = self.oeb.toc
        for href in order:
            toc.add(' '.join(titles[href]), href)
        return True

    def _toc_from_spine(self, opf):
        """Generate a default flat TOC from the linear spine, preferring
        document <title>s, falling back to the first heading when titles
        are not unique.  Always returns True.
        """
        self.log.warn('Generating default TOC from spine...')
        toc = self.oeb.toc
        titles = []
        headers = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(base.xpath(html,
                                       '/h:html/h:head/h:title/text()'))
            title = base.COLLAPSE_RE.sub(' ', title.strip())
            if title:
                titles.append(title)
            # NOTE(review): '(unlabled)' looks like a typo for
            # '(unlabelled)' — runtime string, left as-is.
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                header = ''.join(base.xpath(html, expr % tag))
                header = base.COLLAPSE_RE.sub(' ', header.strip())
                if header:
                    headers[-1] = header
                    break
        use = titles
        if len(titles) > len(set(titles)):
            # Duplicate titles: headings are more distinctive.
            use = headers
        for title, item in zip(use, self.oeb.spine):
            if not item.linear:
                continue
            toc.add(title, item.href)
        return True

    def _toc_from_opf(self, opf, item):
        """Build the TOC, trying sources in order: NCX, HTML guide page,
        tour, and finally an auto-generated TOC from the spine.
        """
        self.oeb.auto_generated_toc = False
        if self._toc_from_ncx(item):
            return
        # Prefer HTML to tour based TOC, since several LIT files
        # have good HTML TOCs but bad tour based TOCs
        if self._toc_from_html(opf):
            return
        if self._toc_from_tour(opf):
            return
        self._toc_from_spine(opf)
        self.oeb.auto_generated_toc = True

    def _pages_from_ncx(self, opf, item):
        """Populate the page list from the NCX <pageList>; return True on
        success.
        """
        if item is None:
            return False
        ncx = item.data
        if ncx is None:
            return False
        ptargets = base.xpath(ncx, 'ncx:pageList/ncx:pageTarget')
        if not ptargets:
            return False
        pages = self.oeb.pages
        for ptarget in ptargets:
            name = ''.join(base.xpath(ptarget,
                                      'ncx:navLabel/ncx:text/text()'))
            name = base.COLLAPSE_RE.sub(' ', name.strip())
            href = base.xpath(ptarget, 'ncx:content/@src')
            if not href:
                continue
            href = item.abshref(base.urlnormalize(href[0]))
            id = ptarget.get('id')
            type = ptarget.get('type', 'normal')
            klass = ptarget.get('class')
            pages.add(name, href, type=type, id=id, klass=klass)
        return True

    def _find_page_map(self, opf):
        """Locate the Adobe page-map item (via spine@page-map or media
        type), remove it from the manifest, and return it; None if absent.
        """
        result = base.xpath(opf, '/o2:package/o2:spine/@page-map')
        if result:
            id = result[0]
            if id not in self.oeb.manifest.ids:
                return None
            item = self.oeb.manifest.ids[id]
            self.oeb.manifest.remove(item)
            return item
        for item in self.oeb.manifest.values():
            if item.media_type == base.PAGE_MAP_MIME:
                self.oeb.manifest.remove(item)
                return item
        return None

    def _pages_from_page_map(self, opf):
        """Populate the page list from an Adobe page-map; return True on
        success.  Page type: unnamed -> 'special', roman numerals ->
        'front', otherwise 'normal'.
        """
        item = self._find_page_map(opf)
        if item is None:
            return False
        pmap = item.data
        pages = self.oeb.pages
        for page in base.xpath(pmap, 'o2:page'):
            name = page.get('name', '')
            href = page.get('href')
            if not href:
                continue
            name = base.COLLAPSE_RE.sub(' ', name.strip())
            href = item.abshref(base.urlnormalize(href))
            type = 'normal'
            if not name:
                type = 'special'
            elif name.lower().strip('ivxlcdm') == '':
                # Composed solely of roman-numeral characters.
                type = 'front'
            pages.add(name, href, type=type)
        return True

    def _pages_from_opf(self, opf, item):
        """Populate the page list, preferring the NCX pageList over an
        Adobe page-map.
        """
        if self._pages_from_ncx(opf, item):
            return
        if self._pages_from_page_map(opf):
            return
        return

    def _cover_from_html(self, hcover):
        """Render the HTML cover page ``hcover`` to a JPEG, add it to the
        manifest as 'cover.jpg', and return the new manifest item.
        """
        from ebook_converter.ebooks import render_html_svg_workaround
        with TemporaryDirectory('_html_cover') as tdir:
            writer = OEBWriter()
            writer(self.oeb, tdir)
            path = os.path.join(tdir, unquote(hcover.href))
            data = render_html_svg_workaround(path, self.logger)
            if not data:
                data = b''
        id, href = self.oeb.manifest.generate('cover', 'cover.jpg')
        item = self.oeb.manifest.add(id, href, base.JPEG_MIME, data=data)
        return item

    def _locate_cover_image(self):
        """Find or create the cover image manifest item.

        Tries, in order: the metadata cover id, the guide 'cover'
        reference, the MS cover guide entry, an inline SVG or <object>
        image in the cover document, and finally rendering the cover HTML
        page itself.
        """
        if self.oeb.metadata.cover:
            id = str(self.oeb.metadata.cover[0])
            item = self.oeb.manifest.ids.get(id, None)
            if item is not None and item.media_type in base.OEB_IMAGES:
                return item
            else:
                self.logger.warn('Invalid cover image @id %r' % id)
        # Default HTML cover candidate: first spine document.
        hcover = self.oeb.spine[0]
        if 'cover' in self.oeb.guide:
            href = self.oeb.guide['cover'].href
            item = self.oeb.manifest.hrefs[href]
            media_type = item.media_type
            if media_type in base.OEB_IMAGES:
                return item
            elif media_type in base.OEB_DOCS:
                hcover = item
        html = hcover.data
        if base.MS_COVER_TYPE in self.oeb.guide:
            href = self.oeb.guide[base.MS_COVER_TYPE].href
            item = self.oeb.manifest.hrefs.get(href, None)
            if item is not None and item.media_type in base.OEB_IMAGES:
                return item
        if self.COVER_SVG_XP(html):
            svg = copy.deepcopy(self.COVER_SVG_XP(html)[0])
            href = os.path.splitext(hcover.href)[0] + '.svg'
            id, href = self.oeb.manifest.generate(hcover.id, href)
            item = self.oeb.manifest.add(id, href, base.SVG_MIME, data=svg)
            return item
        if self.COVER_OBJECT_XP(html):
            object = self.COVER_OBJECT_XP(html)[0]
            href = hcover.abshref(object.get('data'))
            item = self.oeb.manifest.hrefs.get(href, None)
            if item is not None and item.media_type in base.OEB_IMAGES:
                return item
        return self._cover_from_html(hcover)

    def _ensure_cover_image(self):
        """Guarantee the metadata has a cover entry pointing at a real
        cover image item.
        """
        cover = self._locate_cover_image()
        if self.oeb.metadata.cover:
            self.oeb.metadata.cover[0].value = cover.id
            return
        self.oeb.metadata.add('cover', cover.id)

    def _manifest_remove_duplicates(self):
        """Remove manifest items that duplicate another item's href,
        keeping any copy that is in the spine.
        """
        seen = set()
        dups = set()
        for item in self.oeb.manifest:
            if item.href in seen:
                dups.add(item.href)
            seen.add(item.href)
        for href in dups:
            items = [x for x in self.oeb.manifest if x.href == href]
            for x in items:
                if x not in self.oeb.spine:
                    self.oeb.log.warn(
                        'Removing duplicate manifest item with '
                        'id:', x.id)
                    self.oeb.manifest.remove_duplicate_item(x)

    def _all_from_opf(self, opf):
        """Populate the whole OEB book (metadata, manifest, spine, guide,
        TOC, pages) from the cleaned OPF tree.
        """
        self.oeb.version = opf.get('version', '1.2')
        self._metadata_from_opf(opf)
        self._manifest_from_opf(opf)
        self._spine_from_opf(opf)
        self._manifest_remove_duplicates()
        self._guide_from_opf(opf)
        item = self._find_ncx(opf)
        self._toc_from_opf(opf, item)
        self._pages_from_opf(opf, item)