Exemplo n.º 1
0
    def upshift_markup(self):  # {{{
        """Upgrade markup to comply with XHTML 1.1 where possible.

        For every spine item this:

        * copies a plain ``lang`` attribute of the root element to
          ``xml:lang`` when ``xml:lang`` is not already set,
        * retags every ``<u>`` element as ``span``,
        * removes ``id`` / ``name`` attributes whose value was already used
          by an earlier element of the same document.

        The last two steps are skipped for items without a usable ``<body>``.
        """
        for spine_item in self.oeb.spine:
            root = spine_item.data
            xml_lang = base.tag('xml', 'lang')
            if not root.get(xml_lang):
                plain_lang = root.get('lang')
                if plain_lang:
                    root.set(xml_lang, plain_lang)

            bodies = base.XPath('//h:body')(root)
            body = bodies[0] if bodies else bodies
            if not hasattr(body, 'xpath'):
                # No element-like body: nothing more to fix in this item
                continue

            for underline in base.XPath('//h:u')(root):
                underline.tag = 'span'

            # First occurrence of a value wins, later duplicates lose the
            # attribute. ids and names are tracked independently.
            seen_ids, seen_names = set(), set()
            trackers = (('id', seen_ids), ('name', seen_names))
            for elem in base.XPath('//*[@id or @name]')(root):
                for attr, seen in trackers:
                    value = elem.get(attr, None)
                    if not value:
                        continue
                    if value in seen:
                        del elem.attrib[attr]
                    else:
                        seen.add(value)
Exemplo n.º 2
0
    def workaround_webkit_quirks(self):  # {{{
        """Retag empty ``<pre>`` elements as ``div`` in every spine item.

        A ``<pre>`` counts as empty when it has no text and no child
        elements. Items without a usable ``<body>`` are skipped.
        """
        for spine_item in self.oeb.spine:
            bodies = base.XPath('//h:body')(spine_item.data)
            body = bodies[0] if bodies else bodies
            if not hasattr(body, 'xpath'):
                continue

            for pre in base.XPath('//h:pre')(body):
                is_empty = not pre.text and len(pre) == 0
                if is_empty:
                    pre.tag = 'div'
Exemplo n.º 3
0
 def frag_is_at_top(root, frag):
     """Tell whether the element addressed by ``frag`` is at the top of its body.

     The first element under ``root`` whose ``id`` or ``name`` equals ``frag``
     is looked up and handed to ``item_at_top``. Returns False when no such
     element exists.
     """
     matches = base.XPath('//*[@id="%s" or @name="%s"]' % (frag, frag))(root)
     if not matches:
         return False
     return item_at_top(matches[0])
Exemplo n.º 4
0
def verify_toc_destinations(container, toc):
    """Flag every descendant of ``toc`` with whether its destination exists.

    Each node gets ``dest_exists`` set to True or False. When it is False,
    ``dest_error`` is set to a human readable explanation. A destination is
    considered missing when the node has no ``dest``, when the container
    cannot parse a file of that name (``KeyError``), when the parsed object
    is not an XML tree, or when the node's ``frag`` matches no ``id``
    attribute / ``<a name>`` in the file.
    """
    find_anchors = base.XPath('//*/@id|//h:a/@name')
    anchors_by_file = {}

    def mark_missing(node, template, dest):
        node.dest_exists = False
        node.dest_error = template % dest

    for node in toc.iterdescendants():
        dest = node.dest
        if not dest:
            mark_missing(node, 'No file named %s exists', dest)
            continue
        try:
            root = container.parsed(dest)
        except KeyError:
            mark_missing(node, 'No file named %s exists', dest)
            continue
        if not hasattr(root, 'xpath'):
            mark_missing(node, 'No HTML file named %s exists', dest)
            continue
        if not node.frag:
            # Link to the file as a whole, no anchor to check
            node.dest_exists = True
            continue

        # Anchors are collected once per file and reused for later nodes
        if dest not in anchors_by_file:
            anchors_by_file[dest] = frozenset(find_anchors(root))
        found = node.frag in anchors_by_file[dest]
        node.dest_exists = found
        if not found:
            node.dest_error = ('The anchor %(a)s does not exist in file '
                               '%(f)s' % dict(a=node.frag, f=dest))
Exemplo n.º 5
0
def from_links(container):
    '''
    Generate a Table of Contents from links in the book.

    Every ``<a href>`` in the spine documents becomes a candidate entry. A
    candidate is skipped when its href is empty, when its (destination,
    fragment) pair was already seen, or when its text duplicates the text of
    an earlier entry. After ``verify_toc_destinations`` has run, top-level
    entries whose destination does not exist are removed.

    :param container: The book container; must provide ``spine_names``,
        ``parsed`` and ``href_to_name``
    :return: The generated TOC
    '''
    toc = TOC()
    link_path = base.XPath('//h:a[@href]')
    seen_titles, seen_dests = set(), set()
    for name, _is_linear in container.spine_names:
        root = container.parsed(name)
        for a in link_path(root):
            href = a.get('href')
            if not href or not href.strip():
                continue
            frag = None
            if href.startswith('#'):
                # Pure fragment link: points into the current file
                dest = name
                frag = href[1:]
            else:
                href, _, frag = href.partition('#')
                dest = container.href_to_name(href, base=name)
            # Normalize an empty fragment to None
            frag = frag or None
            if (dest, frag) in seen_dests:
                continue
            seen_dests.add((dest, frag))
            text = elem_to_toc_text(a)
            if text in seen_titles:
                continue
            seen_titles.add(text)
            toc.add(text, dest, frag=frag)
    verify_toc_destinations(container, toc)
    # Iterate over a snapshot: removing entries from ``toc`` while iterating
    # it directly can skip the entry that follows each removed one, leaving
    # broken entries behind.
    for child in tuple(toc):
        if not child.dest_exists:
            toc.remove(child)
    return toc
Exemplo n.º 6
0
    def __call__(self, oeb, context):
        """Make sure the book has an in-line (HTML) Table of Contents.

        If the guide already references a TOC document that is in the
        manifest and contains links, that document is put into the spine
        (when it is not there yet) and nothing else is done. A guide entry
        that points nowhere is dropped; one pointing to a document without
        links is dropped only when the book has TOC nodes. Otherwise, when
        the book has TOC nodes, a new ``contents.xhtml`` plus stylesheet are
        generated, added to the manifest, placed in the spine according to
        ``self.position`` and registered in the guide.

        :param oeb: The book being processed
        :param context: Not used by this method
        """
        def place_in_spine(entry):
            # 'end' appends as non-linear content, anything else puts the
            # document first in the reading order
            if self.position == 'end':
                oeb.spine.add(entry, linear=False)
            else:
                oeb.spine.insert(0, entry, linear=True)

        def xhtml(local_name):
            return base.tag('xhtml', local_name)

        has_toc = getattr(getattr(oeb, 'toc', False), 'nodes', False)

        if 'toc' in oeb.guide:
            # Ensure toc pointed to in <guide> is in spine
            from ebook_converter.ebooks.oeb.base import urlnormalize
            guide_href = urlnormalize(oeb.guide['toc'].href)
            if guide_href not in oeb.manifest.hrefs:
                oeb.guide.remove('toc')
            else:
                guide_item = oeb.manifest.hrefs[guide_href]
                usable = (hasattr(guide_item.data, 'xpath') and
                          base.XPath('//h:a[@href]')(guide_item.data))
                if usable:
                    if oeb.spine.index(guide_item) < 0:
                        place_in_spine(guide_item)
                    return
                if has_toc:
                    oeb.guide.remove('toc')

        if not has_toc:
            return

        oeb.logger.info('Generating in-line TOC...')
        title = self.title or oeb.translate(DEFAULT_TITLE)
        style = self.style
        if style not in STYLE_CSS:
            oeb.logger.error('Unknown TOC style %r', style)
            style = 'nested'

        # Stylesheet for the generated document
        css_id, css_href = oeb.manifest.generate('tocstyle', 'tocstyle.css')
        oeb.manifest.add(css_id, css_href, base.CSS_MIME,
                         data=STYLE_CSS[style])

        # Document skeleton: <html><head><title/><link/></head><body><h2/>
        language = str(oeb.metadata.language[0])
        contents = base.element(None, xhtml('html'),
                                nsmap={None: const.XHTML_NS},
                                attrib={base.tag('xml', 'lang'): language})
        head = base.element(contents, xhtml('head'))
        htitle = base.element(head, xhtml('title'))
        htitle.text = title
        base.element(head, xhtml('link'), rel='stylesheet',
                     type=base.CSS_MIME, href=css_href)
        body = base.element(contents, xhtml('body'),
                            attrib={'class': 'calibre_toc'})
        heading = base.element(body, xhtml('h2'),
                               attrib={'class': 'calibre_toc_header'})
        heading.text = title
        self.add_toc_level(body, oeb.toc)

        toc_id, toc_href = oeb.manifest.generate('contents', 'contents.xhtml')
        toc_item = oeb.manifest.add(toc_id, toc_href, base.XHTML_MIME,
                                    data=contents)
        place_in_spine(toc_item)
        oeb.guide.add('toc', 'Table of Contents', toc_href)
Exemplo n.º 7
0
def merge_css(container, names, master):
    """Merge the stylesheets ``names`` into the stylesheet ``master``.

    Each sheet other than ``master`` has its links rebased (when it lives in
    a different directory than ``master``), its charset rules dropped and its
    remaining rules added to the master sheet; the sheet is then removed from
    the container. Afterwards every document that linked to a merged sheet
    loses those ``<link>`` elements and, unless it already references
    ``master``, gets a new link to it in its ``<head>``.

    :param container: The book container
    :param names: Names of the stylesheets to merge
    :param master: Name of the stylesheet that receives the rules
    """
    parse = container.parsed
    master_sheet = parse(master)
    master_dir = os.path.dirname(master)
    merged = set()

    for sheet_name in names:
        if sheet_name == master:
            continue
        # Rebase links if master is in a different directory
        if os.path.dirname(sheet_name) != master_dir:
            container.replace_links(
                sheet_name, LinkRebaser(container, sheet_name, master))

        sheet = parse(sheet_name)

        # Remove charset rules. They are collected first because deleting
        # changes the rule list.
        charset_rules = [
            rule for rule in sheet.cssRules if rule.type == rule.CHARSET_RULE
        ]
        for rule in charset_rules:
            sheet.deleteRule(sheet.cssRules.index(rule))
        for rule in sheet.cssRules:
            master_sheet.add(rule)

        container.remove_item(sheet_name)
        merged.add(sheet_name)

    # Remove links to merged stylesheets in the html files, replacing with a
    # link to the master sheet
    for name, mime in container.mime_map.items():
        if mime not in base.OEB_DOCS:
            continue
        root = parse(name)
        removed = False
        for link in base.XPath('//h:link[@href]')(root):
            if container.href_to_name(link.get('href'), name) in merged:
                container.remove_from_xml(link)
                removed = True
        if not removed:
            continue
        container.dirty(name)
        if master in set(all_stylesheets(container, name)):
            continue
        head = root.find('h:head', namespaces=const.XPNSMAP)
        if head is None:
            continue
        master_link = head.makeelement(base.tag('xhtml', 'link'),
                                       type='text/css',
                                       rel='stylesheet',
                                       href=container.name_to_href(
                                           master, name))
        container.insert_into_xml(head, master_link)
Exemplo n.º 8
0
    def process_item(self, item):
        """Process every ``<body>`` of ``item`` through ``process_tag``.

        Records ``item`` as the current item, reuses the stylizer cached by
        the SVG rasterizer (building a new one when there is none), publishes
        the item's ``abshref`` to this object and the images manager, and
        determines the current language. Each body is processed inside the
        ``self.blocks`` context with a top bookmark created for it.
        """
        self.current_item = item

        stylizer = self.svg_rasterizer.stylizer_cache.get(item)
        if stylizer is None:
            stylizer = Stylizer(item.data, item.href, self.oeb, self.opts,
                                profile=self.opts.output_profile,
                                base_css=self.base_css)

        abshref = item.abshref
        self.abshref = abshref
        self.images_manager.abshref = abshref

        item_lang = lang_for_tag(item.data)
        self.current_lang = item_lang or self.styles_manager.document_lang

        bodies = base.XPath('//h:body')(item.data)
        for index, body in enumerate(bodies):
            with self.blocks:
                links = self.links_manager
                self.blocks.top_bookmark = links.bookmark_for_anchor(
                    links.top_anchor, self.current_item, body)
                self.process_tag(body, stylizer, is_first_tag=index == 0)
Exemplo n.º 9
0
def from_files(container):
    '''
    Generate a Table of Contents from files in the book.

    One entry is created per spine file that has a ``<body>``. The entry text
    is whatever ``find_text`` returns for the body; when that is empty the
    file name (without directories) is used instead, except that a first
    spine file called ``titlepage.*`` or ``cover.*`` is labelled ``Cover``.
    '''
    toc = TOC()
    for index, spinepath in enumerate(container.spine_items):
        name = container.abspath_to_name(spinepath)
        bodies = base.XPath('//h:body')(container.parsed(name))
        if not bodies:
            continue
        text = find_text(bodies[0])
        if not text:
            # Fall back to the bare file name
            text = name.rpartition('/')[-1]
            if index == 0 and text.rpartition('.')[0].lower() in {
                    'titlepage', 'cover'
            }:
                text = 'Cover'
        toc.add(text, name)
    return toc
Exemplo n.º 10
0
def item_at_top(elem):
    """Return True when nothing visible precedes ``elem`` inside its body.

    The descendants of the document's first ``<body>`` are walked in document
    order until ``elem`` is reached. The answer is False as soon as an
    earlier element is an image, has non-whitespace text, or (for elements
    that are not ancestors of ``elem``) has a non-whitespace tail. False is
    also returned when the body cannot be located.
    """
    try:
        root = elem.getroottree().getroot()
        body = base.XPath('//h:body')(root)[0]
    except (TypeError, IndexError, KeyError, AttributeError):
        return False

    tree = body.getroottree()
    target_path = tree.getpath(elem)
    for node in body.iterdescendants(etree.Element):
        node_path = tree.getpath(node)
        if node_path == target_path:
            break
        try:
            has_content = node.tag.endswith('}img') or (node.text and
                                                        node.text.strip())
        except Exception:
            return False
        if has_content:
            return False
        # Only check tail of non-parent elements
        is_ancestor = target_path.startswith(node_path)
        if not is_ancestor and node.tail and node.tail.strip():
            return False
    return True
Exemplo n.º 11
0
 def _spine_add_extra(self):
     """Add documents that are linked to, but not part of, the spine.

     Starting from the spine items, ``<a href>`` targets are followed
     transitively. Every manifest document reached this way that is in
     neither the spine nor the set found so far is collected, and finally
     appended to the spine as non-linear content (with a warning for books
     of version 2 or later). Items listed in the book's optional
     ``removed_items_to_ignore`` attribute are left out.
     """
     manifest = self.oeb.manifest
     spine = self.oeb.spine
     link_hrefs = base.XPath('h:body//h:a/@href')
     extras = set()
     pending = set(spine)
     while pending:
         discovered = set()
         for item in pending:
             if item.media_type not in base.OEB_DOCS:
                 # TODO: handle fallback chains
                 continue
             for raw_href in link_hrefs(item.data):
                 href, _ = urllib.parse.urldefrag(raw_href)
                 if not href:
                     continue
                 try:
                     href = item.abshref(base.urlnormalize(href))
                 except ValueError:  # Malformed URL
                     continue
                 if href not in manifest.hrefs:
                     continue
                 target = manifest.hrefs[href]
                 if (target.media_type in base.OEB_DOCS
                         and target not in spine and target not in extras):
                     discovered.add(target)
         extras.update(discovered)
         # Only the newly found documents need to be scanned next round
         pending = discovered

     version = int(self.oeb.version[0])
     ignored = getattr(self.oeb, 'removed_items_to_ignore', ())
     for item in sorted(extras):
         if item.href in ignored:
             continue
         if version >= 2:
             self.logger.warn('Spine-referenced file %r not in spine' %
                              item.href)
         spine.add(item, linear=False)
Exemplo n.º 12
0
def all_stylesheets(container, name):
    """Yield the names of the CSS stylesheets linked from the file ``name``.

    Every ``<link href>`` in the document's ``<head>`` whose ``type`` is
    ``text/css`` (or missing) is reported, with its href converted to a
    container name.

    :param container: The book container
    :param name: Name of the document whose stylesheet links are listed
    """
    for link in base.XPath('//h:head/h:link[@href]')(container.parsed(name)):
        # Always resolve relative to the document itself. Rebinding ``name``
        # here would resolve every later href against the previously found
        # stylesheet instead of the document.
        sheet_name = container.href_to_name(link.get('href'), name)
        typ = link.get('type', 'text/css')
        if typ == 'text/css':
            yield sheet_name
Exemplo n.º 13
0
def from_xpaths(container, xpaths):
    '''
    Generate a Table of Contents from a list of XPath expressions. Each
    expression in the list corresponds to a level of the generated ToC. For
    example: :code:`['//h:h1', '//h:h2', '//h:h3']` will generate a three level
    Table of Contents from the ``<h1>``, ``<h2>`` and ``<h3>`` tags.

    Levels whose expression matches nothing in any spine file are dropped and
    the remaining levels are renumbered consecutively. Elements that are not
    at the top of their file get an id (via ``ensure_id``) so that the entry
    can point at them; files modified that way are committed.
    '''
    tocroot = TOC()
    compiled = [base.XPath(xp) for xp in xpaths]

    # Find those levels that have no elements in all spine items
    maps = collections.OrderedDict()
    empty_levels = set(range(1, len(compiled) + 1))
    for spinepath in container.spine_items:
        name = container.abspath_to_name(spinepath)
        root = container.parsed(name)
        per_level = {}
        for lvl, xp in enumerate(compiled, 1):
            per_level[lvl] = frozenset(xp(root))
        maps[name] = per_level
        for lvl, elems in per_level.items():
            if elems:
                empty_levels.discard(lvl)

    # Remove empty levels from all level_maps, renumbering what is left from 1
    if empty_levels:
        for name in tuple(maps):
            lmap = maps[name]
            kept = [
                lmap[lvl] for lvl in sorted(lmap) if lvl not in empty_levels
            ]
            maps[name] = dict(enumerate(kept, 1))

    node_level_map = {tocroot: 0}

    def parent_for_level(child_level):
        # Walk down the chain of last children until the node that should
        # own a new entry of ``child_level`` is found
        limit = child_level - 1
        node = tocroot
        while True:
            child = node.last_child
            if child is None:
                return node
            lvl = node_level_map[child]
            if lvl > limit:
                return node
            if lvl == limit:
                return child
            node = child

    for name, level_item_map in maps.items():
        root = container.parsed(name)
        item_level_map = {}
        for lvl, elems in level_item_map.items():
            for elem in elems:
                item_level_map[elem] = lvl
        item_dirtied = False
        all_ids = set(root.xpath('//*/@id'))

        # Document order decides the order of the entries
        for item in root.iterdescendants(etree.Element):
            lvl = item_level_map.get(item, None)
            if lvl is None:
                continue
            text = elem_to_toc_text(item)
            parent = parent_for_level(lvl)
            if item_at_top(item):
                # The file itself is a good enough destination
                dirtied, elem_id = False, None
            else:
                dirtied, elem_id = ensure_id(item, all_ids)
            if dirtied:
                item_dirtied = True
            toc = parent.add(text, name, elem_id)
            node_level_map[toc] = lvl
            toc.dest_exists = True

        if item_dirtied:
            container.commit_item(name, keep_parsed=True)

    return tocroot
Exemplo n.º 14
0
def do_split(split_point, log, before=True):
    '''
    Split tree into a *before* and an *after* tree at ``split_point``.

    Both returned trees are deep copies of the tree that contains
    ``split_point``; the first copy is pruned to the content up to the split
    point and the second copy to the content from the split point on.

    :param split_point: The Element at which to split
    :param log: Passed to ``adjust_split_point`` (only used when ``before``
                is True)
    :param before: If True tree is split before split_point, otherwise after split_point
    :return: before_tree, after_tree
    '''
    if before:
        # We cannot adjust for after since moving an after split point to a
        # parent will cause breakage if the parent contains any content
        # after the original split point
        split_point = adjust_split_point(split_point, log)
    tree = split_point.getroottree()
    path = tree.getpath(split_point)

    # Work on two independent copies and locate the split point in each copy
    # through its path
    tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree)
    root, root2 = tree.getroot(), tree2.getroot()
    body, body2 = map(get_body, (root, root2))
    split_point = root.xpath(path)[0]
    split_point2 = root2.xpath(path)[0]

    def nix_element(elem, top=True):
        # Remove elem unless top is False in which case replace elem by its
        # children
        parent = elem.getparent()
        if top:
            parent.remove(elem)
        else:
            index = parent.index(elem)
            parent[index:index + 1] = list(elem.iterchildren())

    # Tree 1
    # Everything after the split point is removed (and the split point
    # itself when splitting before it)
    hit_split_point = False
    keep_descendants = False
    split_point_descendants = frozenset(split_point.iterdescendants())
    # Iterate over a snapshot, since elements are removed during the walk
    for elem in tuple(body.iterdescendants()):
        if elem is split_point:
            hit_split_point = True
            if before:
                nix_element(elem)
            else:
                # We want to keep the descendants of the split point in
                # Tree 1
                keep_descendants = True
                # We want the split point element, but not its tail
                elem.tail = '\n'

            continue
        if hit_split_point:
            if keep_descendants:
                if elem in split_point_descendants:
                    # elem is a descendant keep it
                    continue
                else:
                    # We are out of split_point, so prevent further set
                    # lookups of split_point_descendants
                    keep_descendants = False
            nix_element(elem)

    # Tree 2
    # Everything before the split point is stripped; the walk stops at the
    # split point, so later content is left untouched
    ancestors = frozenset(base.XPath('ancestor::*')(split_point2))
    for elem in tuple(body2.iterdescendants()):
        if elem is split_point2:
            if not before:
                # Keep the split point element's tail, if it contains non-whitespace
                # text
                tail = elem.tail
                if tail and not tail.isspace():
                    parent = elem.getparent()
                    idx = parent.index(elem)
                    # Re-attach the tail to whatever precedes the element:
                    # the parent's text or the previous sibling's tail
                    if idx == 0:
                        parent.text = (parent.text or '') + tail
                    else:
                        sib = parent[idx - 1]
                        sib.tail = (sib.tail or '') + tail
                # Remove the element itself
                nix_element(elem)
            break
        if elem in ancestors:
            # We have to preserve the ancestors as they could have CSS
            # styles that are inherited/applicable, like font or
            # width. So we only remove the text, if any.
            elem.text = '\n'
        else:
            # Not an ancestor: drop the element but keep its children in
            # place so they are visited in turn
            nix_element(elem, top=False)

    body2.text = '\n'

    return tree, tree2
Exemplo n.º 15
0
    def flatten_node(self,
                     node,
                     stylizer,
                     names,
                     styles,
                     pseudo_styles,
                     psize,
                     item_id,
                     recurse=True):
        """Fold the styling of ``node`` into a generated CSS class.

        Presentational attributes (``align``, ``valign``, ``size``, ``face``,
        ``color``, ``bgcolor``) are converted into CSS properties and removed,
        ``<font>`` elements are retagged as ``div`` or ``span``, font sizes are
        rescaled through ``self.fmap`` (unless disabled), and the resulting
        property set is registered in ``styles`` / ``pseudo_styles``. The node
        ends up with a ``class`` attribute naming the generated class(es) and
        without a ``style`` attribute. Nodes whose tag is not a string or is
        not in the XHTML namespace are left untouched.

        :param node: The element to process; it is modified in place
        :param stylizer: Provides ``style(node)``; its ``body_font_size`` is
            set when ``node`` is the body and has a numeric font size
        :param names: Mapping from class name stem to the number of classes
            generated for it. It is read for stems not seen before, so it is
            presumably a ``defaultdict(int)`` - confirm against the caller
        :param styles: Dict mapping serialized CSS text to class name;
            updated in place
        :param pseudo_styles: Mapping from pseudo selector to a dict of
            serialized CSS text to class name. It is indexed without a
            membership test, so it is presumably a ``defaultdict`` - confirm
            against the caller
        :param psize: Font size of the parent, used as the divisor when
            sizes are expressed in ``em``
        :param item_id: Id of the item being processed; only compared with
            ``'calibre_jacket'``
        :param recurse: If True the children of ``node`` are processed too
        """
        if not isinstance(node.tag, (str, bytes)) \
           or parse_utils.namespace(node.tag) != const.XHTML_NS:
            return
        tag = parse_utils.barename(node.tag)
        style = stylizer.style(node)
        cssdict = style.cssdict()
        try:
            font_size = style['font-size']
        except:
            # Fall back to the configured base size
            font_size = self.sbase if self.sbase is not None else \
                self.context.source.fbase
        if tag == 'body' and isinstance(font_size, numbers.Number):
            stylizer.body_font_size = font_size
        # Convert the align attribute to CSS. On images it means vertical
        # alignment or floating, elsewhere text alignment.
        if 'align' in node.attrib:
            if tag != 'img':
                cssdict['text-align'] = node.attrib['align']
                if cssdict['text-align'] == 'center':
                    # align=center causes tables to be center aligned,
                    # which text-align does not. And the ever trustworthy Word
                    # uses this construct in its HTML output. See
                    # https://bugs.launchpad.net/bugs/1569583
                    if tag == 'table':
                        if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
                            cssdict['margin-left'] = cssdict[
                                'margin-right'] = 'auto'
                    else:
                        for table in node.iterchildren(
                                base.tag('xhtml', "table")):
                            ts = stylizer.style(table)
                            if ts.get('margin-left') is None and ts.get(
                                    'margin-right') is None:
                                ts.set('margin-left', 'auto')
                                ts.set('margin-right', 'auto')
            else:
                val = node.attrib['align']
                if val in ('middle', 'bottom', 'top'):
                    cssdict['vertical-align'] = val
                elif val in ('left', 'right'):
                    cssdict['float'] = val
            del node.attrib['align']
        if 'valign' in node.attrib and tag == 'td':
            if cssdict.get('vertical-align') == 'inherit':
                cssdict['vertical-align'] = node.attrib['valign']
            del node.attrib['valign']
        # <font> becomes <div> when it contains block level descendants and
        # <span> otherwise; its size/face attributes become CSS
        if node.tag == base.tag('xhtml', 'font'):
            tags = [
                'descendant::h:%s' % x
                for x in ('p', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5',
                          'h6', 'ol', 'ul', 'dl', 'blockquote')
            ]
            # TODO(gryf): this rebinds ``tag`` (set from the node's barename
            # at the top of this method), so the checks on ``tag`` below see
            # 'div'/'span' instead of 'font'. On purpose?
            tag = 'div' if base.XPath('|'.join(tags))(node) else 'span'
            node.tag = base.tag('xhtml', tag)
            if 'size' in node.attrib:

                def force_int(raw):
                    return int(re.search(r'([0-9+-]+)', raw).group(1))

                size = node.attrib['size'].strip()
                if size:
                    fnums = self.context.source.fnums
                    if size[0] in ('+', '-'):
                        # Oh, the warcrimes
                        # Signed sizes are relative to size 3 and clamped to
                        # the range 1..7
                        try:
                            esize = 3 + force_int(size)
                        except:
                            esize = 3
                        if esize < 1:
                            esize = 1
                        if esize > 7:
                            esize = 7
                        font_size = fnums[esize]
                    else:
                        try:
                            font_size = fnums[force_int(size)]
                        except:
                            font_size = fnums[3]
                    cssdict['font-size'] = '%.1fpt' % font_size
                del node.attrib['size']
            if 'face' in node.attrib:
                cssdict['font-family'] = node.attrib['face']
                del node.attrib['face']
        # Colour attributes are dropped even when their value cannot be
        # parsed as a CSS property
        if 'color' in node.attrib:
            try:
                cssdict['color'] = cp_css.Property('color',
                                                   node.attrib['color']).value
            except (ValueError, dom.SyntaxErr):
                pass
            del node.attrib['color']
        if 'bgcolor' in node.attrib:
            try:
                cssdict['background-color'] = cp_css.Property(
                    'background-color', node.attrib['bgcolor']).value
            except (ValueError, dom.SyntaxErr):
                pass
            del node.attrib['bgcolor']
        if tag == 'ol' and 'type' in node.attrib:
            del node.attrib['type']
        if cssdict.get('font-weight', '').lower() == 'medium':
            cssdict[
                'font-weight'] = 'normal'  # ADE chokes on font-weight medium

        fsize = font_size
        # A drop cap is a left floated, childless node with an explicit font
        # size that holds a single character (optionally preceded by one
        # character in the range U+2000..U+206F)
        is_drop_cap = (
            cssdict.get('float', None) == 'left' and 'font-size' in cssdict
            and len(node) == 0 and node.text and
            (len(node.text) == 1 or
             (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
        # Detect drop caps generated by the docx input plugin
        if node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text and len(node.text.strip()) == 1 and \
                not node.tail and 'line-height' in cssdict and 'font-size' in cssdict:
            dp = node.getparent()
            if dp.tag and dp.tag.endswith('}div') and len(
                    dp) == 1 and not dp.text:
                if stylizer.style(dp).cssdict().get('float', None) == 'left':
                    is_drop_cap = True
        # Font rescaling: sizes are mapped through self.fmap and written
        # relative to the parent size (em). Drop caps keep their size.
        if not self.context.disable_font_rescaling and not is_drop_cap:
            _sbase = self.sbase if self.sbase is not None else \
                self.context.source.fbase
            dyn_rescale = node.attrib.pop('data-calibre-rescale', None)
            if dyn_rescale is not None:
                # The attribute value is a percentage of the mapped base size
                try:
                    dyn_rescale = float(dyn_rescale) / 100
                except Exception:
                    dyn_rescale = 1
                fsize = self.fmap[_sbase]
                fsize *= dyn_rescale
                cssdict['font-size'] = '%0.5fem' % (fsize / psize)
                psize = fsize
            elif 'font-size' in cssdict or tag == 'body':
                fsize = self.fmap[font_size]
                try:
                    cssdict['font-size'] = "%0.5fem" % (fsize / psize)
                except ZeroDivisionError:
                    cssdict['font-size'] = '%.1fpt' % fsize
                psize = fsize

        # Enforce the minimum line height; failures are logged, not raised
        try:
            minlh = self.context.minimum_line_height / 100.
            slh = style['line-height']
            if not is_drop_cap and isinstance(
                    slh, numbers.Number) and slh < minlh * fsize:
                cssdict['line-height'] = str(minlh)
        except Exception:
            self.oeb.logger.exception('Failed to set minimum line-height')

        # Drop the filtered properties. A filtered font-family is put back
        # when its first entry matches the first entry of the body font
        # family (compared without their first and last characters).
        if cssdict:
            for x in self.filter_css:
                popval = cssdict.pop(x, None)
                if self.body_font_family and popval and x == 'font-family' \
                    and popval.partition(',')[0][1:-1] == self.body_font_family.partition(',')[0][1:-1]:
                    cssdict[x] = popval

        # Normalize a few property values
        if cssdict:
            if self.lineh and self.fbase and tag != 'body':
                self.clean_edges(cssdict, style, psize)
            if 'display' in cssdict and cssdict['display'] == 'in-line':
                cssdict['display'] = 'inline'
            if self.unfloat and 'float' in cssdict \
               and cssdict.get('display', 'none') != 'none':
                del cssdict['display']
            if self.untable and 'display' in cssdict \
               and cssdict['display'].startswith('table'):
                display = cssdict['display']
                if display == 'table-cell':
                    cssdict['display'] = 'inline'
                else:
                    cssdict['display'] = 'block'
            if 'vertical-align' in cssdict \
               and cssdict['vertical-align'] == 'sup':
                cssdict['vertical-align'] = 'super'
        if self.lineh and 'line-height' not in cssdict:
            lineh = self.lineh / psize
            cssdict['line-height'] = "%0.5fem" % lineh

        # Paragraph spacing options: zero the vertical margin/padding/border,
        # optionally insert a blank line and set the text indent
        if (self.context.remove_paragraph_spacing
                or self.context.insert_blank_line) and tag in ('p', 'div'):
            if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
                for prop in ('margin', 'padding', 'border'):
                    for edge in ('top', 'bottom'):
                        cssdict['%s-%s' % (prop, edge)] = '0pt'
            if self.context.insert_blank_line:
                cssdict['margin-top'] = cssdict['margin-bottom'] = \
                    '%fem'%self.context.insert_blank_line_size
            indent_size = self.context.remove_paragraph_spacing_indent_size
            # A negative indent size means existing indents are kept
            keep_indents = indent_size < 0.0
            if (self.context.remove_paragraph_spacing and not keep_indents
                    and cssdict.get('text-align',
                                    None) not in ('center', 'right')):
                cssdict['text-indent'] = "%1.1fem" % indent_size

        # Register the final property set and assign the class name(s).
        # Identical CSS text always maps to the same class.
        pseudo_classes = style.pseudo_classes(self.filter_css)
        if cssdict or pseudo_classes:
            keep_classes = set()

            if cssdict:
                items = sorted(cssdict.items())
                css = ';\n'.join(u'%s: %s' % (key, val) for key, val in items)
                classes = node.get('class', '').strip() or 'calibre'
                classes_list = classes.split()
                # lower() because otherwise if the document uses the same class
                # name with different case, both cases will apply, leading
                # to incorrect results.
                klass = ascii_text(STRIPNUM.sub(
                    '', classes_list[0])).lower().strip().replace(' ', '_')
                if css in styles:
                    match = styles[css]
                else:
                    match = klass + str(names[klass] or '')
                    styles[css] = match
                    names[klass] += 1
                node.attrib['class'] = match
                keep_classes.add(match)

            # NOTE: this loop rebinds ``cssdict`` to each pseudo class dict
            for psel, cssdict in pseudo_classes.items():
                items = sorted(cssdict.items())
                css = ';\n'.join('%s: %s' % (key, val) for key, val in items)
                pstyles = pseudo_styles[psel]
                if css in pstyles:
                    match = pstyles[css]
                else:
                    # We have to use a different class for each psel as
                    # otherwise you can have incorrect styles for a situation
                    # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                    # If the pcalibre class for a:hover and a:link is the same,
                    # then the class attribute for a.x tags will contain both
                    # that class and the class for a.x:hover, which is wrong.
                    klass = 'pcalibre'
                    match = klass + str(names[klass] or '')
                    pstyles[css] = match
                    names[klass] += 1
                keep_classes.add(match)
                # keep_classes is a set, so the order of the names in the
                # attribute is not defined
                node.attrib['class'] = ' '.join(keep_classes)

        elif 'class' in node.attrib:
            del node.attrib['class']
        if 'style' in node.attrib:
            del node.attrib['style']
        if recurse:
            # Children inherit the (possibly updated) font size as psize
            for child in node:
                self.flatten_node(child, stylizer, names, styles,
                                  pseudo_styles, psize, item_id)
Exemplo n.º 16
0
    def read_inline_toc(self, href, frag):
        """Build a TOC from the links of the HTML file ``href``.

        The file is read and decoded with ``self.header.codec``. Links are
        collected from the elements that follow the start element, which is
        the element with id ``frag`` when that exists and the ``<body>``
        otherwise; when there is neither, all links are collected. Duplicate
        (text, href, fragment) combinations are ignored. The nesting of the
        resulting TOC follows how deeply the links are nested in the HTML.

        :param href: Path of the HTML file, with ``/`` as separator
        :param frag: Id of the element at which the TOC starts, may be empty
        :return: The TOC
        """
        ans = TOC()
        base_href = '/'.join(href.split('/')[:-1])
        with open(href.replace('/', os.sep), 'rb') as f:
            raw = f.read().decode(self.header.codec)
        root = parse_html(raw, log=self.log)

        bodies = base.XPath('//h:body')(root)
        start = bodies[0] if bodies else None
        # Without a body there is nothing to wait for
        reached = not bodies
        if frag:
            anchors = base.XPath('//*[@id="%s"]' % frag)(root)
            if anchors:
                start = anchors[0]

        def node_depth(elem):
            # Number of ancestors of elem
            depth = 0
            ancestor = elem.getparent()
            while ancestor is not None:
                depth += 1
                ancestor = ancestor.getparent()
            return depth

        # Collect the links that come after the start element
        seen = set()
        links = []
        for elem in root.iterdescendants(etree.Element):
            is_link = (reached and elem.tag == base.tag('xhtml', 'a')
                       and elem.get('href', False))
            if is_link:
                link_href, link_frag = urllib.parse.urldefrag(
                    elem.get('href'))
                link_href = base_href + '/' + link_href
                text = base.xml2text(elem).strip()
                key = (text, link_href, link_frag)
                if key in seen:
                    continue
                seen.add(key)
                links.append(key + (node_depth(elem), ))
            elif elem is start:
                reached = True

        # Layer the ToC based on nesting order in the source HTML
        distinct_depths = sorted({link[-1] for link in links})
        depth_map = {raw: rank for rank, raw in enumerate(distinct_depths)}
        current_depth = None
        parent = ans
        for text, link_href, link_frag, raw_depth in links:
            depth = depth_map[raw_depth]
            if current_depth is None:
                current_depth = 0
            elif current_depth < depth:
                # Deeper: nest below the most recent entry, one level at a
                # time
                if len(parent) > 0:
                    parent = parent[-1]
                current_depth += 1
            elif current_depth > depth:
                # Shallower: climb back up, at most to the root
                delta = current_depth - depth
                while delta > 0 and parent.parent is not None:
                    parent = parent.parent
                    delta -= 1
                current_depth = depth
            parent.add_item(link_href, link_frag, text)
        return ans
Exemplo n.º 17
0
    def workaround_ade_quirks(self):  # {{{
        """
        Perform various markup transforms to get the output to render correctly
        in the quirky ADE.

        The TOC, every spine document and (when present) the main stylesheet
        are modified in place. Elements that get dropped leave the text that
        followed them in the document.
        """

        def discard(elem):
            # lxml keeps the text that follows an element in ``elem.tail``,
            # so a plain remove() would silently delete that text as well.
            # Hand the tail over to the preceding sibling (or the parent)
            # before detaching the element.
            parent = elem.getparent()
            if parent is None:
                return
            if elem.tail:
                previous = elem.getprevious()
                if previous is not None:
                    previous.tail = (previous.tail or '') + elem.tail
                else:
                    parent.text = (parent.text or '') + elem.tail
            parent.remove(elem)

        stylesheet = self.oeb.manifest.main_stylesheet

        # ADE cries big wet tears when it encounters an invalid fragment
        # identifier in the NCX toc.
        frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
        for node in self.oeb.toc.iter():
            href = getattr(node, 'href', None)
            if hasattr(href, 'partition'):
                _base, _, frag = href.partition('#')
                frag = urllib.parse.unquote(frag)
                if frag and frag_pat.match(frag) is None:
                    self.log.warn(
                            'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
                    node.href = _base

        # These do not depend on the document, so build them only once.
        special_chars = re.compile('[\u200b\u00ad]')
        formchildren = base.XPath('./h:input|./h:button|./h:textarea|'
                './h:label|./h:fieldset|./h:legend')
        in_table = base.XPath('ancestor::h:table')

        for item in self.oeb.spine:
            root = item.data
            body = base.XPath('//h:body')(root)
            if body:
                body = body[0]

            if hasattr(body, 'xpath'):
                # remove <img> tags with empty src elements
                bad = []
                for img in base.XPath('//h:img')(body):
                    src = img.get('src', '').strip()
                    if src in ('', '#') or src.startswith('http:'):
                        bad.append(img)
                for img in bad:
                    discard(img)

                # Add id attribute to <a> tags that have name
                for anchor in base.XPath('//h:a[@name]')(body):
                    if not anchor.get('id', False):
                        anchor.set('id', anchor.get('name'))
                    # The delightful epubcheck has started complaining about <a> tags that
                    # have name attributes.
                    anchor.attrib.pop('name')

                # Replace <br> that are children of <body> as ADE doesn't handle them
                for br in base.XPath('./h:br')(body):
                    if br.getparent() is None:
                        continue
                    try:
                        prior = next(br.itersiblings(preceding=True))
                        priortag = parse_utils.barename(prior.tag)
                        priortext = prior.tail
                    except Exception:
                        # No preceding sibling (StopIteration), or a sibling
                        # whose tag could not be turned into a bare name:
                        # treat the <br> as the first thing in <body>.
                        priortag = 'body'
                        priortext = body.text
                    if priortext:
                        priortext = priortext.strip()
                    br.tag = base.tag('xhtml', 'p')
                    br.text = '\u00a0'
                    style = [part.strip()
                             for part in br.get('style', '').split(';')]
                    style = [part for part in style if part]
                    style.append('margin:0pt; border:0pt')
                    # If the prior tag is a block (including a <br> we replaced)
                    # then this <br> replacement should have a 1-line height.
                    # Otherwise it should have no height.
                    if not priortext and priortag in block_level_tags:
                        style.append('height:1em')
                    else:
                        style.append('height:0pt')
                    br.set('style', '; '.join(style))

            for tag in base.XPath('//h:embed')(root):
                discard(tag)
            for tag in base.XPath('//h:object')(root):
                if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
                    continue
                discard(tag)

            for tag in base.XPath('//h:title|//h:style')(root):
                if not tag.text:
                    discard(tag)
            for tag in base.XPath('//h:script')(root):
                if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'):
                    discard(tag)
            for tag in base.XPath('//h:body/descendant::h:script')(root):
                discard(tag)

            for tag in base.XPath('//h:form')(root):
                if formchildren(tag):
                    discard(tag)
                else:
                    # Not a real form
                    tag.tag = base.tag('xhtml', 'div')

            for tag in base.XPath('//h:center')(root):
                tag.tag = base.tag('xhtml', 'div')
                tag.set('style', 'text-align:center')
            # ADE can't handle &amp; in an img url
            for tag in base.XPath('//h:img[@src]')(root):
                tag.set('src', tag.get('src', '').replace('&', ''))

            # ADE whimpers in fright when it encounters a <td> outside a
            # <table>
            for tag in base.XPath('//h:td|//h:tr|//h:th')(root):
                if not in_table(tag):
                    tag.tag = base.tag('xhtml', 'div')

            # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
            for elem in root.iterdescendants('*'):
                if elem.text:
                    elem.text = special_chars.sub('', elem.text)
                    elem.text = elem.text.replace('\u2011', '-')
                if elem.tail:
                    elem.tail = special_chars.sub('', elem.tail)
                    elem.tail = elem.tail.replace('\u2011', '-')

            if stylesheet is not None:
                # ADE doesn't render lists correctly if they have left margins
                from css_parser.css import CSSRule
                for lb in base.XPath('//h:ul[@class]|//h:ol[@class]')(root):
                    sel = '.'+lb.get('class')
                    for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                        if sel == rule.selectorList.selectorText:
                            rule.style.removeProperty('margin-left')
                            # padding-left breaks rendering in webkit and gecko
                            rule.style.removeProperty('padding-left')
                # Change whitespace:pre to pre-wrap to accommodate readers that
                # cannot scroll horizontally
                for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                    style = rule.style
                    ws = style.getPropertyValue('white-space')
                    if ws == 'pre':
                        style.setProperty('white-space', 'pre-wrap')
Exemplo n.º 18
0
def merge_html(container, names, master, insert_page_breaks=False):
    """Merge the body content of several HTML files into ``master``.

    For every file in ``names`` other than ``master``: its stylesheets are
    linked from the master ``<head>``, its links are rebased when it lives in
    a different directory, ids that clash with ones already seen are renamed,
    its body content is appended to the last ``<body>`` of the master and the
    file is removed from the container. Afterwards links throughout the
    container that pointed at merged files are rewritten.

    :param container: container holding the files; modified in place.
    :param names: names of the files to merge (``master`` may be included).
    :param master: name of the file that receives the merged content.
    :param insert_page_breaks: when true, the first element of each merged
        file gets ``page-break-before: always`` appended to its style.
    :return: dict mapping each merged file name to the id of the element
        that marks the start of its content inside the master.
    """
    p = container.parsed
    master_root = p(master)

    # Ensure master has a <head>
    head = master_root.find('h:head', namespaces=const.XPNSMAP)
    if head is None:
        head = master_root.makeelement(base.tag('xhtml', 'head'))
        container.insert_into_xml(master_root, head, 0)

    seen_anchors = all_anchors(master_root)
    seen_stylesheets = set(all_stylesheets(container, master))
    master_body = p(master).findall('h:body', namespaces=const.XPNSMAP)[-1]
    master_base = os.path.dirname(master)
    anchor_map = {n: {} for n in names if n != master}
    first_anchor_map = {}

    for name in names:
        if name == master:
            continue
        # Insert new stylesheets into master
        for sheet in all_stylesheets(container, name):
            if sheet not in seen_stylesheets:
                seen_stylesheets.add(sheet)
                link = head.makeelement(base.tag('xhtml', 'link'),
                                        rel='stylesheet',
                                        type='text/css',
                                        href=container.name_to_href(
                                            sheet, master))
                container.insert_into_xml(head, link)

        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name, master))

        root = p(name)
        # Flat list of what gets moved: leading text (or a blank separator)
        # followed by the child elements of every <body> in the file.
        children = []
        for body in p(name).findall('h:body', namespaces=const.XPNSMAP):
            children.append(
                body.text if body.text and body.text.strip() else '\n\n')
            children.extend(body)

        first_child = next(
            (c for c in children if not isinstance(c, (str, bytes))), None)
        if first_child is None:
            # No element to hang the start-of-file anchor on: the file had
            # only text, or no <body> at all. Create a paragraph from the
            # document root rather than relying on the ``body`` loop variable,
            # which is undefined (or left over from the previous file) when
            # no <body> was found.
            first_child = root.makeelement(base.tag('xhtml', 'p'))
            if children:
                # body contained only text, no tags
                first_child.text, children[0] = children[0], first_child
            else:
                children.append(first_child)

        amap = anchor_map[name]
        remove_name_attributes(root)

        # Rename ids that clash with ones already present in the master
        for elem in root.xpath('//*[@id]'):
            val = elem.get('id')
            if not val:
                continue
            if val in seen_anchors:
                nval = unique_anchor(seen_anchors, val)
                elem.set('id', nval)
                amap[val] = nval
            else:
                seen_anchors.add(val)

        if 'id' not in first_child.attrib:
            first_child.set('id', unique_anchor(seen_anchors, 'top'))
            seen_anchors.add(first_child.get('id'))
        first_anchor_map[name] = first_child.get('id')

        if insert_page_breaks:
            first_child.set(
                'style',
                first_child.get('style', '') + '; page-break-before: always')

        # A link to the file without a fragment now targets its first element
        amap[''] = first_child.get('id')

        # Fix links that point to local changed anchors
        for a in base.XPath('//h:a[starts-with(@href, "#")]')(root):
            q = a.get('href')[1:]
            if q in amap:
                a.set('href', '#' + amap[q])

        for child in children:
            if isinstance(child, (str, bytes)):
                add_text(master_body, child)
            else:
                master_body.append(copy.deepcopy(child))

        container.remove_item(name, remove_from_guide=False)

    # Fix all links in the container that point to merged files
    for fname, _media_type in container.mime_map.items():
        repl = MergeLinkReplacer(fname, anchor_map, master, container)
        container.replace_links(fname, repl)

    return first_anchor_map
Exemplo n.º 19
0
class OEBReader(object):
    """Read an OEBPS 1.x or OPF/OPS 2.0 file collection."""

    # Pre-built XPath selectors for <svg:svg> and data-bearing <h:object>
    # elements below <h:body>. Going by the names they are used to locate a
    # cover; the code that uses them is not in this part of the class.
    COVER_SVG_XP = base.XPath('h:body//svg:svg[position() = 1]')
    COVER_OBJECT_XP = base.XPath('h:body//h:object[@data][position() = 1]')

    # Instantiated in __call__ as Container(path, logger).
    Container = base.DirContainer
    """Container type used to access book files.  Override in sub-classes."""

    DEFAULT_PROFILE = 'PRS505'
    """Default renderer profile for content read with this Reader."""

    TRANSFORMS = []
    """List of transforms to apply to content read with this Reader."""
    @classmethod
    def config(cls, cfg):
        """Hook for registering book-reading options on the :class:`Config`
        object ``cfg``.

        This reader defines no options, so ``cfg`` is left untouched and
        ``None`` is returned.
        """
        return None

    @classmethod
    def generate(cls, opts):
        """Create a Reader instance from command-line options.

        ``opts`` is not consulted here; a default-constructed instance of
        ``cls`` is returned.
        """
        reader = cls()
        return reader

    def __call__(self, oeb, path):
        """Populate the :class:`OEBBook` ``oeb`` from the book at ``path``.

        A container of type ``self.Container`` is attached to ``oeb``, the OPF
        is read and cleaned, and everything else is loaded from it. Returns
        ``oeb``.
        """
        logger = oeb.logger
        self.oeb = oeb
        self.logger = logger
        self.log = logger
        oeb.container = self.Container(path, logger)
        oeb.container.log = oeb.log
        self._all_from_opf(self._read_opf())
        return oeb

    def _clean_opf(self, opf):
        """Rebuild ``opf`` as a fresh package element in the OPF 2 namespace.

        Elements with no namespace or the OPF 1 namespace are moved into the
        ``opf`` namespace, Dublin Core elements are retagged into
        ``const.DC11_NS`` with lower-cased names, and all metadata
        descendants are collected flat under a single new ``metadata``
        element. The manifest, spine, tours and guide sections are moved
        over unchanged. Returns the new root element; ``opf`` itself is
        stripped of the moved elements.
        """
        # Collect every namespace mapping in the tree before any tag is
        # rewritten.
        collected_ns = {}
        for node in opf.iter(tag=etree.Element):
            collected_ns.update(node.nsmap)

        legacy_namespaces = ('', const.OPF1_NS)
        for node in opf.iter(tag=etree.Element):
            local_name = parse_utils.barename(node.tag)
            if (parse_utils.namespace(node.tag) in legacy_namespaces
                    and ':' not in local_name):
                node.tag = base.tag('opf', local_name)
        collected_ns.update(const.OPF2_NSMAP)

        package = etree.Element(base.tag('opf', 'package'),
                                nsmap={None: const.OPF2_NS},
                                attrib=dict(opf.attrib))
        metadata = etree.SubElement(package,
                                    base.tag('opf', 'metadata'),
                                    nsmap=collected_ns)

        # The legacy wrapper elements themselves are dropped; their children
        # are still picked up by the descendant query below.
        wrappers = (base.tag('opf', 'dc-metadata'),
                    base.tag('opf', 'x-metadata'))
        for node in base.xpath(opf, 'o2:metadata//*'):
            if node.tag in wrappers:
                continue
            if parse_utils.namespace(node.tag) in const.DC_NSES:
                dc_name = parse_utils.barename(node.tag).lower()
                node.tag = '{%s}%s' % (const.DC11_NS, dc_name)
            if node.tag.startswith('dc:'):
                dc_name = node.tag.partition(':')[-1].lower()
                node.tag = '{%s}%s' % (const.DC11_NS, dc_name)
            metadata.append(node)
        for node in base.xpath(opf, 'o2:metadata//o2:meta'):
            metadata.append(node)

        for section in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
            for node in base.xpath(opf, section):
                package.append(node)
        return package

    def _read_opf(self):
        """Read, parse and clean the OPF of the current container.

        Parsing is attempted up to three times: on the text as read, then
        after cleaning invalid characters and replacing named entities, and
        finally after also stripping the ``<tours>`` section and declaring
        the ``dc`` prefix on ``<dc-metadata>``. A failure of the last
        attempt propagates.

        Raises ``base.OEBError`` when the root element is in an unexpected
        namespace. Returns the element produced by ``_clean_opf``.
        """
        text = self.oeb.container.read(None)
        text = self.oeb.decode(text)
        text = base.XMLDECL_RE.sub('', text)
        text = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
                      const.OPF1_NS, text)
        try:
            opf = etree.fromstring(text)
        except etree.XMLSyntaxError:
            text = xml_replace_entities(clean_xml_chars(text), encoding=None)
            try:
                opf = etree.fromstring(text)
            except etree.XMLSyntaxError:
                text = re.sub(r'(?is)<tours>.+</tours>', '', text)
                text = text.replace(
                    '<dc-metadata>', '<dc-metadata xmlns:dc="'
                    'http://purl.org/metadata/dublin_core">')
                opf = etree.fromstring(text)
                self.logger.warn('OPF contains invalid tours section')
            else:
                self.logger.warn('OPF contains invalid HTML named entities')

        root_ns = parse_utils.namespace(opf.tag)
        if root_ns not in ('', const.OPF1_NS, const.OPF2_NS):
            raise base.OEBError(
                'Invalid namespace %r for OPF document' % root_ns)
        return self._clean_opf(opf)

    def _metadata_from_opf(self, opf):
        """Fill ``self.oeb.metadata`` from the cleaned OPF tree ``opf``.

        The tree is serialized and re-read through the ``OPF`` metadata
        class. Defaults are supplied for a missing language, book producer,
        title and author, and a fresh UUID identifier is added and recorded
        as ``self.oeb.uid``.
        """
        from ebook_converter.ebooks.metadata.opf2 import OPF
        from ebook_converter.ebooks.oeb.transforms.metadata import (
            meta_info_to_oeb_metadata)
        serialized = etree.tostring(opf, xml_declaration=True,
                                    encoding='utf-8')
        parsed = OPF(io.BytesIO(serialized))
        writing_mode = parsed.primary_writing_mode
        if writing_mode:
            self.oeb.metadata.primary_writing_mode = writing_mode

        info = parsed.to_book_metadata()
        if not info.language:
            info.language = get_lang().replace('_', '-')
        self.oeb.metadata.add('language', info.language)
        if not info.book_producer:
            producer_fields = {'a': __appname__, 'v': __version__}
            info.book_producer = (
                '%(a)s (%(v)s) [http://%(a)s-ebook.com]' % producer_fields)
        meta_info_to_oeb_metadata(info, self.oeb.metadata, self.logger)

        metadata = self.oeb.metadata
        metadata.add('identifier', str(uuid.uuid4()), id='uuid_id',
                     scheme='uuid')
        self.oeb.uid = self.oeb.metadata.identifier[-1]
        if not metadata.title:
            metadata.add('title', self.oeb.translate('Unknown'))
        # A creator with no role, or with the 'aut' role, counts as an author.
        has_author = any(
            getattr(creator, 'role', '').lower() in ('', 'aut')
            for creator in metadata.creator)
        if not has_author:
            metadata.add('creator', self.oeb.translate('Unknown'), role='aut')

    def _manifest_prune_invalid(self):
        '''
        Drop manifest entries whose content cannot be loaded, so that a few
        corrupted files do not cause a catastrophic conversion failure.

        Only documents and stylesheets are checked. Returns the list of
        entries that were removed.
        '''
        manifest = self.oeb.manifest
        checked_types = base.OEB_DOCS.union(base.OEB_STYLES)
        rejected = []
        for entry in list(manifest.values()):
            if entry.media_type not in checked_types:
                continue
            try:
                # Reading the attribute is the whole test; the value itself
                # is not needed here.
                entry.data
            except Exception:
                # KeyboardInterrupt is not an Exception subclass, so it
                # still propagates untouched.
                self.logger.exception('Failed to parse content in %s' %
                                      entry.href)
                rejected.append(entry)
                manifest.remove(entry)
        return rejected

    def _manifest_add_missing(self, invalid):
        """Add files that are referenced by manifest content but not listed.

        Starting from every manifest item, links are extracted from
        documents/XML (``base.iterlinks``) and stylesheets
        (``css_parser.getUrls``). Scheme-less targets that are not yet known
        and exist in the container are added to the manifest and scanned in
        turn, until no new files turn up. Items whose data cannot be read are
        removed from the manifest.

        NOTE(review): the ``invalid`` argument is overwritten below before it
        is ever read, so the items pruned by the caller play no role here --
        confirm whether that is intended.
        """
        import css_parser
        manifest = self.oeb.manifest
        known = set(manifest.hrefs)
        unchecked = set(manifest.values())
        cdoc = base.OEB_DOCS | base.OEB_STYLES
        # Shadows the parameter of the same name (see the note above).
        invalid = set()
        while unchecked:
            # hrefs discovered during this pass
            new = set()
            for item in unchecked:
                data = None
                # Only documents, stylesheets and XML-ish types are loaded.
                if (item.media_type in cdoc
                        or item.media_type[-4:] in ('/xml', '+xml')):
                    try:
                        data = item.data
                    except Exception:
                        self.oeb.log.exception('Failed to read from manifest '
                                               'entry with id: %s, ignoring' %
                                               item.id)
                        invalid.add(item)
                        continue
                if data is None:
                    continue

                if (item.media_type in base.OEB_DOCS
                        or item.media_type[-4:] in ('/xml', '+xml')):
                    hrefs = [r[2] for r in base.iterlinks(data)]
                    for href in hrefs:
                        if isinstance(href, bytes):
                            href = href.decode('utf-8')
                        href, _ = urllib.parse.urldefrag(href)
                        if not href:
                            continue
                        try:
                            href = item.abshref(base.urlnormalize(href))
                            scheme = urllib.parse.urlparse(href).scheme
                        except Exception:
                            self.oeb.log.exception('Skipping invalid href: '
                                                   '%r' % href)
                            continue
                        # A scheme marks an external link; only scheme-less
                        # references are candidates.
                        if not scheme and href not in known:
                            new.add(href)
                elif item.media_type in base.OEB_STYLES:
                    try:
                        urls = list(css_parser.getUrls(data))
                    except Exception:
                        urls = []
                    for url in urls:
                        href, _ = urllib.parse.urldefrag(url)
                        href = item.abshref(base.urlnormalize(href))
                        scheme = urllib.parse.urlparse(href).scheme
                        if not scheme and href not in known:
                            new.add(href)
            # Everything queued has been scanned; refill with what gets added
            # to the manifest below.
            unchecked.clear()
            warned = set()
            for href in new:
                known.add(href)
                is_invalid = False
                for item in invalid:
                    # NOTE(review): this compares href with itself resolved
                    # against the invalid item, not with item.href -- confirm
                    # this is the intended "refers to an invalid item" test.
                    if href == item.abshref(base.urlnormalize(href)):
                        is_invalid = True
                        break
                if is_invalid:
                    continue
                if not self.oeb.container.exists(href):
                    if href not in warned:
                        self.logger.warn('Referenced file %r not found' % href)
                        warned.add(href)
                    continue
                if href not in warned:
                    self.logger.warn('Referenced file %r not in manifest' %
                                     href)
                    warned.add(href)
                id, _ = manifest.generate(id='added')
                guessed = mimetypes.guess_type(href)[0]
                media_type = guessed or base.BINARY_MIME
                added = manifest.add(id, href, media_type)
                unchecked.add(added)

            # NOTE(review): this runs on every pass of the while loop and
            # ``invalid`` is never emptied, so an item can be handed to
            # remove() more than once -- verify that remove() tolerates it.
            for item in invalid:
                self.oeb.manifest.remove(item)

    def _manifest_from_opf(self, opf):
        """Populate ``self.oeb.manifest`` from the ``<item>`` entries of
        ``opf``.

        Entries without an href, duplicates of an href already in the
        manifest and entries whose file is missing from the container are
        skipped with a warning. A missing or ``text/xml`` media type is
        replaced by a guess based on the file name. Afterwards unparseable
        items are pruned and referenced-but-unlisted files are added.
        """
        manifest = self.oeb.manifest
        for elem in base.xpath(opf, '/o2:package/o2:manifest/o2:item'):
            item_id = elem.get('id')
            href = elem.get('href')
            if not href:
                # Nothing to load; mimetypes.guess_type() below would raise
                # TypeError if handed None.
                self.logger.warn('Manifest item %r has no href, ignoring' %
                                 item_id)
                continue
            media_type = elem.get('media-type', None)
            if media_type is None:
                # Tolerate the misspelled attribute name
                media_type = elem.get('mediatype', None)
            if not media_type or media_type == 'text/xml':
                guessed = mimetypes.guess_type(href)[0]
                media_type = guessed or media_type or base.BINARY_MIME
            if hasattr(media_type, 'lower'):
                media_type = media_type.lower()
            fallback = elem.get('fallback')
            if href in manifest.hrefs:
                self.logger.warn('Duplicate manifest entry for %r' % href)
                continue
            if not self.oeb.container.exists(href):
                self.logger.warn('Manifest item %r not found' % href)
                continue
            if item_id in manifest.ids:
                self.logger.warn('Duplicate manifest id %r' % item_id)
                item_id, href = manifest.generate(item_id, href)
            manifest.add(item_id, href, media_type, fallback)
        invalid = self._manifest_prune_invalid()
        self._manifest_add_missing(invalid)

    def _spine_add_extra(self):
        """Append documents that spine items link to but the spine omits.

        Links inside ``<body>`` are followed transitively from every spine
        document. Each linked manifest document that is not already in the
        spine is added as a non-linear entry, except for hrefs listed in
        ``self.oeb.removed_items_to_ignore``.
        """
        manifest = self.oeb.manifest
        spine = self.oeb.spine
        body_links = base.XPath('h:body//h:a/@href')
        extras = set()
        pending = set(spine)
        while pending:
            discovered = set()
            for doc in pending:
                if doc.media_type not in base.OEB_DOCS:
                    # TODO: handle fallback chains
                    continue
                for raw_href in body_links(doc.data):
                    target = urllib.parse.urldefrag(raw_href)[0]
                    if not target:
                        continue
                    try:
                        target = doc.abshref(base.urlnormalize(target))
                    except ValueError:  # Malformed URL
                        continue
                    if target not in manifest.hrefs:
                        continue
                    linked = manifest.hrefs[target]
                    if linked.media_type not in base.OEB_DOCS:
                        continue
                    if linked in spine or linked in extras:
                        continue
                    discovered.add(linked)
            extras.update(discovered)
            # Only the newly found documents need scanning on the next pass.
            pending = discovered

        major_version = int(self.oeb.version[0])
        ignorable = getattr(self.oeb, 'removed_items_to_ignore', ())
        for doc in sorted(extras):
            if doc.href in ignorable:
                continue
            if major_version >= 2:
                self.logger.warn('Spine-referenced file %r not in spine' %
                                 doc.href)
            spine.add(doc, linear=False)

    def _spine_from_opf(self, opf):
        """Populate ``self.oeb.spine`` from the ``<itemref>`` entries of
        ``opf``.

        Unknown idrefs are skipped with a warning. An item is accepted when
        it is a parsed, non-NCX document; an item with another media type is
        still accepted (and relabelled as XHTML) when its parsed root is an
        ``html`` element. Linked-but-unlisted documents are added afterwards
        and the page progression direction is recorded.

        Raises ``base.OEBError`` when no item ends up in the spine.
        """
        spine = self.oeb.spine
        manifest = self.oeb.manifest
        for ref in base.xpath(opf, '/o2:package/o2:spine/o2:itemref'):
            idref = ref.get('idref')
            if idref not in manifest.ids:
                self.logger.warn('Spine item %r not found' % idref)
                continue
            entry = manifest.ids[idref]
            is_document = (
                entry.media_type.lower() in base.OEB_DOCS
                and hasattr(entry.data, 'xpath')
                and not getattr(entry.data, 'tag', '').endswith('}ncx'))
            if not is_document:
                looks_like_html = (hasattr(entry.data, 'tag')
                                   and entry.data.tag
                                   and entry.data.tag.endswith('}html'))
                if not looks_like_html:
                    self.oeb.log.warn('The item %s is not a XML document.'
                                      ' Removing it from spine.' % entry.href)
                    continue
                # Mislabelled in the manifest but really an HTML document
                entry.media_type = base.XHTML_MIME
            spine.add(entry, ref.get('linear'))

        if len(spine) == 0:
            raise base.OEBError("Spine is empty")
        self._spine_add_extra()
        directions = base.xpath(
            opf, '/o2:package/o2:spine/@page-progression-direction')
        for direction in directions:
            if direction in {'ltr', 'rtl'}:
                spine.page_progression_direction = direction

    def _guide_from_opf(self, opf):
        """Populate ``self.oeb.guide`` from the ``<reference>`` entries of
        ``opf``.

        A reference whose path is not in the manifest is retried with a
        case-insensitive match; when that fails too it is skipped with a
        warning. Only the first reference of each type is kept.
        """
        guide = self.oeb.guide
        manifest = self.oeb.manifest
        for elem in base.xpath(opf, '/o2:package/o2:guide/o2:reference'):
            ref_href = elem.get('href')
            raw_path, frag = urllib.parse.urldefrag(ref_href)
            path = base.urlnormalize(raw_path)
            if path not in manifest.hrefs:
                wanted = path.lower()
                corrected_href = next(
                    (href for href in manifest.hrefs
                     if href.lower() == wanted), None)
                if corrected_href is None:
                    self.logger.warn('Guide reference %r not found' % ref_href)
                    continue
                # Only the path needed correcting; keep the fragment so the
                # reference still points at the same place in the document.
                ref_href = corrected_href
                if frag:
                    ref_href = corrected_href + '#' + frag
            typ = elem.get('type')
            if typ not in guide:
                guide.add(typ, elem.get('title'), ref_href)

    def _find_ncx(self, opf):
        """Locate the NCX item, remove it from the manifest and return it.

        The ``toc`` attribute of ``<spine>`` wins when present; if it names
        an unknown id, ``None`` is returned without looking any further.
        Otherwise the first manifest item with the NCX media type is used.
        Returns ``None`` when there is no NCX.
        """
        manifest = self.oeb.manifest
        declared = base.xpath(opf, '/o2:package/o2:spine/@toc')
        if declared:
            ncx_id = declared[0]
            if ncx_id in manifest.ids:
                found = manifest.ids[ncx_id]
                manifest.remove(found)
                return found
            return None

        found = next((entry for entry in manifest.values()
                      if entry.media_type == base.NCX_MIME), None)
        if found is not None:
            manifest.remove(found)
        return found

    def _toc_from_navpoint(self, item, toc, navpoint):
        """Recursively add the ``navPoint`` children of ``navpoint`` to
        ``toc``.

        ``item`` is the NCX manifest item; it is used to resolve relative
        ``src`` values. Untitled nav points are skipped and their children
        are attached to ``toc`` directly. Nav points that have neither a
        usable target nor children are dropped.
        """
        def meta_nodes(point, name):
            # calibre:meta descendants of ``point`` with the given name
            return base.xpath(
                point, 'descendant::calibre:meta[@name = "%s"]' % name)

        for child in base.xpath(navpoint, 'ncx:navPoint'):
            label = ''.join(base.xpath(child, 'ncx:navLabel/ncx:text/text()'))
            label = base.COLLAPSE_RE.sub(' ', label.strip())
            src = base.xpath(child, 'ncx:content/@src')
            if not label:
                # Untitled level: hoist its children into the current level
                self._toc_from_navpoint(item, toc, child)
                continue
            has_src = bool(src and src[0])
            if not has_src and not base.xpath(child, 'ncx:navPoint'):
                # This node is useless
                continue
            href = item.abshref(base.urlnormalize(src[0])) if has_src else ''
            path, _ = urllib.parse.urldefrag(href)
            if path and path not in self.oeb.manifest.hrefs:
                path = base.urlnormalize(path)
            if href and path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                if not base.xpath(child, 'ncx:navPoint'):
                    # This node is useless
                    continue

            try:
                play_order = int(child.get('playOrder',
                                           self.oeb.toc.next_play_order()))
            except Exception:
                play_order = self.oeb.toc.next_play_order()

            found = meta_nodes(child, 'author')
            author = found[0].text if found else None

            description = None
            found = meta_nodes(child, 'description')
            if found:
                description = etree.tostring(found[0],
                                             method='text',
                                             encoding='unicode').strip()
                description = description or None

            found = meta_nodes(child, 'toc_thumbnail')
            thumbnail = found[0].text if found else None
            if not thumbnail or not thumbnail.strip():
                thumbnail = None

            node = toc.add(label,
                           href,
                           id=child.get('id'),
                           klass=child.get('class', 'chapter'),
                           play_order=play_order,
                           description=description,
                           author=author,
                           toc_thumbnail=thumbnail)

            self._toc_from_navpoint(item, node, child)

    def _toc_from_ncx(self, item):
        """Fill ``self.oeb.toc`` from the NCX manifest item ``item``.

        The TOC title is the NCX ``docTitle``, falling back to the first
        book title. Returns ``False`` when there is no NCX item or it has no
        data, ``True`` otherwise.
        """
        if item is None or item.data is None:
            return False
        self.log.debug('Reading TOC from NCX...')
        ncx = item.data
        doc_title = ''.join(base.xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
        doc_title = base.COLLAPSE_RE.sub(' ', doc_title.strip())
        if not doc_title:
            doc_title = str(self.oeb.metadata.title[0])
        toc = self.oeb.toc
        toc.title = doc_title
        for navmap in base.xpath(ncx, 'ncx:navMap'):
            self._toc_from_navpoint(item, toc, navmap)
        return True

    def _toc_from_tour(self, opf):
        """Fill ``self.oeb.toc`` from the first ``<tour>`` of ``opf``.

        Sites lacking a title or href, and sites whose path is not in the
        manifest, are skipped. Returns ``False`` when the OPF has no tour,
        ``True`` otherwise.
        """
        tours = base.xpath(opf, 'o2:tours/o2:tour')
        if not tours:
            return False
        self.log.debug('Reading TOC from tour...')
        tour = tours[0]
        toc = self.oeb.toc
        toc.title = tour.get('title')
        for site in base.xpath(tour, 'o2:site'):
            title, href = site.get('title'), site.get('href')
            if not (title and href):
                continue
            path = urllib.parse.urldefrag(base.urlnormalize(href))[0]
            if path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                continue
            toc.add(title, href, id=site.get('id'))
        return True

    def _toc_from_html(self, opf):
        """Fill ``self.oeb.toc`` from the HTML page named by the guide's
        ``toc`` reference.

        When the reference carries a fragment, only links inside the nearest
        enclosing element that contains links are used. Links to the same
        target are merged into one entry whose title joins their texts;
        links outside the manifest are ignored. Returns ``False`` when the
        guide has no ``toc`` entry, ``True`` otherwise.
        """
        if 'toc' not in self.oeb.guide:
            return False
        self.log.debug('Reading TOC from HTML...')
        itempath, frag = urllib.parse.urldefrag(self.oeb.guide['toc'].href)
        item = self.oeb.manifest.hrefs[itempath]
        html = item.data
        if frag:
            matches = (base.xpath(html, './/*[@id="%s"]' % frag)
                       or base.xpath(html, './/*[@name="%s"]' % frag))
            scope = matches[0] if matches else html
            # Widen the scope until it contains at least one link
            while scope != html and not base.xpath(scope, './/h:a[@href]'):
                scope = scope.getparent()
            html = scope

        # href -> link texts, in order of first appearance
        labels_by_href = {}
        for anchor in base.xpath(html, './/h:a[@href]'):
            href = item.abshref(base.urlnormalize(anchor.attrib['href']))
            path = urllib.parse.urldefrag(href)[0]
            if path not in self.oeb.manifest.hrefs:
                continue
            label = base.COLLAPSE_RE.sub(' ', base.xml2text(anchor).strip())
            labels_by_href.setdefault(href, []).append(label)

        toc = self.oeb.toc
        for href, labels in labels_by_href.items():
            toc.add(' '.join(labels), href)
        return True

    def _toc_from_spine(self, opf):
        """Generate a fallback TOC with one entry per linear spine item.

        For every linear spine item two candidate labels are collected: the
        document ``<title>`` and the text of the first heading found among
        ``h1``..``h5`` and ``strong`` (``'(unlabled)'`` when none has text).
        Document titles are used only when every linear item has a non-empty
        title and all titles are distinct; otherwise the heading labels are
        used for all entries.

        The candidate lists are kept index-aligned with the list of linear
        items, so a non-linear spine item or a document without a title can
        no longer shift labels onto the wrong item or drop trailing entries.

        *opf* is accepted for signature parity with the other TOC builders
        and is not used. Always returns ``True``.
        """
        self.log.warn('Generating default TOC from spine...')
        toc = self.oeb.toc
        linear_items = []
        titles = []
        headers = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            linear_items.append(item)
            html = item.data

            title = ''.join(base.xpath(html, '/h:html/h:head/h:title/text()'))
            # Appended even when empty so that titles[i] always belongs to
            # linear_items[i].
            titles.append(base.COLLAPSE_RE.sub(' ', title.strip()))

            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                header = ''.join(base.xpath(html, expr % tag))
                header = base.COLLAPSE_RE.sub(' ', header.strip())
                if header:
                    headers[-1] = header
                    break

        # Titles are only trustworthy as labels when each document has one
        # and no two documents share the same one.
        titles_usable = all(titles) and len(set(titles)) == len(titles)
        labels = titles if titles_usable else headers
        for label, item in zip(labels, linear_items):
            toc.add(label, item.href)
        return True

    def _toc_from_opf(self, opf, item):
        self.oeb.auto_generated_toc = False
        if self._toc_from_ncx(item):
            return
        # Prefer HTML to tour based TOC, since several LIT files
        # have good HTML TOCs but bad tour based TOCs
        if self._toc_from_html(opf):
            return
        if self._toc_from_tour(opf):
            return
        self._toc_from_spine(opf)
        self.oeb.auto_generated_toc = True

    def _pages_from_ncx(self, opf, item):
        """Read page targets from the NCX ``pageList`` into the book's pages.

        Returns ``False`` when *item* is ``None``, has no data, or contains
        no ``pageTarget`` elements. Otherwise each target that has a
        ``content/@src`` is added with its whitespace-collapsed label, its
        href resolved relative to *item*, and its ``type`` (default
        ``'normal'``), ``id`` and ``class`` attributes; ``True`` is returned.
        *opf* is not used.
        """
        ncx = None if item is None else item.data
        if ncx is None:
            return False
        targets = base.xpath(ncx, 'ncx:pageList/ncx:pageTarget')
        if not targets:
            return False
        pages = self.oeb.pages
        for target in targets:
            label_parts = base.xpath(target, 'ncx:navLabel/ncx:text/text()')
            label = base.COLLAPSE_RE.sub(' ', ''.join(label_parts).strip())
            sources = base.xpath(target, 'ncx:content/@src')
            if not sources:
                # A page target without a destination cannot be linked.
                continue
            pages.add(
                label,
                item.abshref(base.urlnormalize(sources[0])),
                type=target.get('type', 'normal'),
                id=target.get('id'),
                klass=target.get('class'),
            )
        return True

    def _find_page_map(self, opf):
        """Locate the page-map item, remove it from the manifest, return it.

        If the spine declares a ``page-map`` attribute, only that id is
        considered: the matching manifest item is removed and returned, or
        ``None`` is returned when the id is unknown. Without the attribute,
        the first manifest item whose media type is ``base.PAGE_MAP_MIME``
        is removed and returned. Returns ``None`` when nothing is found.
        """
        manifest = self.oeb.manifest
        declared = base.xpath(opf, '/o2:package/o2:spine/@page-map')
        if declared:
            page_map_id = declared[0]
            if page_map_id in manifest.ids:
                found = manifest.ids[page_map_id]
                manifest.remove(found)
                return found
            return None

        # No explicit declaration: fall back to matching on media type.
        found = next(
            (candidate for candidate in manifest.values()
             if candidate.media_type == base.PAGE_MAP_MIME),
            None)
        if found is not None:
            manifest.remove(found)
        return found

    def _pages_from_page_map(self, opf):
        """Read pages from the page-map item into the book's pages.

        Returns ``False`` when ``_find_page_map`` finds no item. Otherwise
        every ``<page>`` with an href is added, its href resolved relative to
        the page-map item. The page type is ``'special'`` for an empty name,
        ``'front'`` for a name made up solely of the letters ``ivxlcdm``
        (case-insensitive) and ``'normal'`` otherwise. Returns ``True``.
        """
        item = self._find_page_map(opf)
        if item is None:
            return False
        page_map = item.data
        pages = self.oeb.pages
        for page in base.xpath(page_map, 'o2:page'):
            target = page.get('href')
            if not target:
                continue
            label = base.COLLAPSE_RE.sub(' ', page.get('name', '').strip())
            if not label:
                kind = 'special'
            elif not label.lower().strip('ivxlcdm'):
                # Nothing is left once roman-numeral letters are stripped.
                kind = 'front'
            else:
                kind = 'normal'
            pages.add(label, item.abshref(base.urlnormalize(target)),
                      type=kind)
        return True

    def _pages_from_opf(self, opf, item):
        if self._pages_from_ncx(opf, item):
            return
        if self._pages_from_page_map(opf):
            return
        return

    def _cover_from_html(self, hcover):
        """Render the HTML cover page to an image and add it to the manifest.

        The book is written to a temporary directory, the file for *hcover*
        is rendered with ``render_html_svg_workaround`` and the result
        (``b''`` when rendering yields nothing) is stored as a new manifest
        item with media type ``base.JPEG_MIME``. Returns that item.
        """
        from ebook_converter.ebooks import render_html_svg_workaround
        with TemporaryDirectory('_html_cover') as workdir:
            OEBWriter()(self.oeb, workdir)
            page_path = os.path.join(workdir, unquote(hcover.href))
            rendered = render_html_svg_workaround(page_path, self.logger)
            rendered = rendered or b''
        cover_id, cover_href = self.oeb.manifest.generate('cover',
                                                          'cover.jpg')
        return self.oeb.manifest.add(cover_id, cover_href, base.JPEG_MIME,
                                     data=rendered)

    def _locate_cover_image(self):
        """Find, or create, the manifest item holding the cover image.

        Candidates are tried in this order and the first hit is returned:

        1. the manifest item named by the ``cover`` metadata entry, if it is
           an image (a warning is logged otherwise);
        2. the guide's ``'cover'`` reference, if it is an image; if it is a
           document instead, that document replaces the first spine item as
           the cover page examined below;
        3. the guide's ``base.MS_COVER_TYPE`` reference, if it is an image;
        4. an SVG matched by ``COVER_SVG_XP`` in the cover page, copied into
           a new manifest item;
        5. an image referenced by an element matched by ``COVER_OBJECT_XP``
           in the cover page;
        6. a rendering of the cover page via ``_cover_from_html``.

        A ``'cover'`` guide reference that is missing from the manifest is
        logged and skipped, in the same way the other guide and object
        lookups tolerate a missing entry, instead of raising ``KeyError``.
        """
        manifest = self.oeb.manifest
        guide = self.oeb.guide

        if self.oeb.metadata.cover:
            cover_id = str(self.oeb.metadata.cover[0])
            item = manifest.ids.get(cover_id, None)
            if item is not None and item.media_type in base.OEB_IMAGES:
                return item
            self.logger.warn('Invalid cover image @id %r' % cover_id)

        hcover = self.oeb.spine[0]
        if 'cover' in guide:
            href = guide['cover'].href
            item = manifest.hrefs.get(href, None)
            if item is None:
                self.logger.warn('Cover guide reference %r not found' % href)
            elif item.media_type in base.OEB_IMAGES:
                return item
            elif item.media_type in base.OEB_DOCS:
                hcover = item
        html = hcover.data

        if base.MS_COVER_TYPE in guide:
            href = guide[base.MS_COVER_TYPE].href
            item = manifest.hrefs.get(href, None)
            if item is not None and item.media_type in base.OEB_IMAGES:
                return item

        svgs = self.COVER_SVG_XP(html)
        if svgs:
            # Deep-copy so the new manifest item does not share nodes with
            # the cover page's own tree.
            svg = copy.deepcopy(svgs[0])
            href = os.path.splitext(hcover.href)[0] + '.svg'
            svg_id, href = manifest.generate(hcover.id, href)
            return manifest.add(svg_id, href, base.SVG_MIME, data=svg)

        objects = self.COVER_OBJECT_XP(html)
        if objects:
            href = hcover.abshref(objects[0].get('data'))
            item = manifest.hrefs.get(href, None)
            if item is not None and item.media_type in base.OEB_IMAGES:
                return item

        return self._cover_from_html(hcover)

    def _ensure_cover_image(self):
        cover = self._locate_cover_image()
        if self.oeb.metadata.cover:
            self.oeb.metadata.cover[0].value = cover.id
            return
        self.oeb.metadata.add('cover', cover.id)

    def _manifest_remove_duplicates(self):
        seen = set()
        dups = set()
        for item in self.oeb.manifest:
            if item.href in seen:
                dups.add(item.href)
            seen.add(item.href)

        for href in dups:
            items = [x for x in self.oeb.manifest if x.href == href]
            for x in items:
                if x not in self.oeb.spine:
                    self.oeb.log.warn(
                        'Removing duplicate manifest item with '
                        'id:', x.id)
                    self.oeb.manifest.remove_duplicate_item(x)

    def _all_from_opf(self, opf):
        self.oeb.version = opf.get('version', '1.2')
        self._metadata_from_opf(opf)
        self._manifest_from_opf(opf)
        self._spine_from_opf(opf)
        self._manifest_remove_duplicates()
        self._guide_from_opf(opf)
        item = self._find_ncx(opf)
        self._toc_from_opf(opf, item)
        self._pages_from_opf(opf, item)