Example #1
File: toc.py  Project: BobPyron/calibre
    def __init__(self, oeb, opts):
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.title = opts.toc_title or DEFAULT_TITLE
        self.at_start = opts.mobi_toc_at_start
        self.generated_item = None
        self.added_toc_guide_entry = False
        self.has_toc = oeb.toc and oeb.toc.count() > 1

        if 'toc' in oeb.guide:
            # Remove spurious toc entry from guide if it is not in spine or it
            # does not have any hyperlinks
            href = urlnormalize(oeb.guide['toc'].href.partition('#')[0])
            if href in oeb.manifest.hrefs:
                item = oeb.manifest.hrefs[href]
                if (hasattr(item.data, 'xpath') and
                    XPath('//h:a[@href]')(item.data)):
                    if oeb.spine.index(item) < 0:
                        oeb.spine.add(item, linear=False)
                    return
                elif self.has_toc:
                    oeb.guide.remove('toc')
            else:
                oeb.guide.remove('toc')

        if (not self.has_toc or 'toc' in oeb.guide or opts.no_inline_toc or
            getattr(opts, 'mobi_passthrough', False)):
            return

        self.log('\tGenerating in-line ToC')

        embed_css = ''
        s = getattr(oeb, 'store_embed_font_rules', None)
        if getattr(s, 'body_font_family', None):
            css = [x.cssText for x in s.rules] + [
                    'body { font-family: %s }'%s.body_font_family]
            embed_css = '\n\n'.join(css)

        root = etree.fromstring(TEMPLATE.format(xhtmlns=XHTML_NS,
            title=self.title, embed_css=embed_css,
            extra_css=(opts.extra_css or '')))
        parent = XPath('//h:ul')(root)[0]
        parent.text = '\n\t'
        for child in self.oeb.toc:
            self.process_toc_node(child, parent)

        id, href = oeb.manifest.generate('contents', 'contents.xhtml')
        item = self.generated_item = oeb.manifest.add(id, href, XHTML_MIME,
                data=root)
        if self.at_start:
            oeb.spine.insert(0, item, linear=True)
        else:
            oeb.spine.add(item, linear=False)

        oeb.guide.add('toc', 'Table of Contents', href)
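
A minimal sketch of the hyperlink test used above, assuming XPath is the namespace-aware helper from calibre.ebooks.oeb.base (the same import used in Example #13 below), which binds the h: prefix to the XHTML namespace:

from lxml import etree
from calibre.ebooks.oeb.base import XPath

doc = etree.fromstring(
    b'<html xmlns="http://www.w3.org/1999/xhtml">'
    b'<body><a href="chapter1.xhtml">Chapter 1</a></body></html>')
# A guide 'toc' entry is only kept if its document contains at least one <a href>.
print(bool(XPath('//h:a[@href]')(doc)))  # True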
Example #2
File: toc.py  Project: Eksmo/calibre
    def __init__(self, oeb, opts):
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.title = opts.toc_title or DEFAULT_TITLE
        self.at_start = opts.mobi_toc_at_start
        self.generated_item = None
        self.added_toc_guide_entry = False
        self.has_toc = oeb.toc and oeb.toc.count() > 1

        if 'toc' in oeb.guide:
            # Remove spurious toc entry from guide if it is not in spine or it
            # does not have any hyperlinks
            href = urlnormalize(oeb.guide['toc'].href)
            if href in oeb.manifest.hrefs:
                item = oeb.manifest.hrefs[href]
                if (hasattr(item.data, 'xpath') and
                    XPath('//h:a[@href]')(item.data)):
                    if oeb.spine.index(item) < 0:
                        oeb.spine.add(item, linear=False)
                    return
                elif self.has_toc:
                    oeb.guide.remove('toc')
            else:
                oeb.guide.remove('toc')

        if not self.has_toc or 'toc' in oeb.guide or opts.no_inline_toc:
            return

        self.log('\tGenerating in-line ToC')

        root = etree.fromstring(TEMPLATE.format(xhtmlns=XHTML_NS,
            title=self.title))
        parent = XPath('//h:ul')(root)[0]
        parent.text = '\n\t'
        for child in self.oeb.toc:
            self.process_toc_node(child, parent)

        id, href = oeb.manifest.generate('contents', 'contents.xhtml')
        item = self.generated_item = oeb.manifest.add(id, href, XHTML_MIME,
                data=root)
        if self.at_start:
            oeb.spine.insert(0, item, linear=True)
        else:
            oeb.spine.add(item, linear=False)

        oeb.guide.add('toc', 'Table of Contents', href)
Example #3
def item_at_top(elem):
    try:
        body = XPath('//h:body')(elem.getroottree().getroot())[0]
    except (TypeError, IndexError, KeyError, AttributeError):
        return False
    tree = body.getroottree()
    path = tree.getpath(elem)
    for el in body.iterdescendants(etree.Element):
        epath = tree.getpath(el)
        if epath == path:
            break
        try:
            if el.tag.endswith('}img') or (el.text and el.text.strip()):
                return False
        except:
            return False
        if not path.startswith(epath):
            # Only check tail of non-parent elements
            if el.tail and el.tail.strip():
                return False
    return True
Example #4
def frag_is_at_top(root, frag):
    body = XPath('//h:body')(root)
    if body:
        body = body[0]
    else:
        return False
    tree = body.getroottree()
    elem = XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root)
    if elem:
        elem = elem[0]
    else:
        return False
    path = tree.getpath(elem)
    for el in body.iterdescendants():
        epath = tree.getpath(el)
        if epath == path:
            break
        if el.text and el.text.strip():
            return False
        if not path.startswith(epath):
            # Only check tail of non-parent elements
            if el.tail and el.tail.strip():
                return False
    return True
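
A hedged usage sketch for frag_is_at_top, assuming the calibre XPath helper (calibre.ebooks.oeb.base.XPath) is in scope, as the function above already requires:

from lxml import etree

root = etree.fromstring(
    b'<html xmlns="http://www.w3.org/1999/xhtml"><body>'
    b'<h1 id="start">Chapter 1</h1><p>Some text follows.</p></body></html>')
# No text, tails or images precede the #start anchor, so it counts as "at top".
print(frag_is_at_top(root, 'start'))  # True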
Example #5
File: reader.py  Project: pkuhzx/calibre
class OEBReader(object):
    """Read an OEBPS 1.x or OPF/OPS 2.0 file collection."""

    COVER_SVG_XP    = XPath('h:body//svg:svg[position() = 1]')
    COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')

    Container = DirContainer
    """Container type used to access book files.  Override in sub-classes."""

    DEFAULT_PROFILE = 'PRS505'
    """Default renderer profile for content read with this Reader."""

    TRANSFORMS = []
    """List of transforms to apply to content read with this Reader."""

    @classmethod
    def config(cls, cfg):
        """Add any book-reading options to the :class:`Config` object
        :param:`cfg`.
        """
        return

    @classmethod
    def generate(cls, opts):
        """Generate a Reader instance from command-line options."""
        return cls()

    def __call__(self, oeb, path):
        """Read the book at :param:`path` into the :class:`OEBBook` object
        :param:`oeb`.
        """
        self.oeb = oeb
        self.logger = self.log = oeb.logger
        oeb.container = self.Container(path, self.logger)
        oeb.container.log = oeb.log
        opf = self._read_opf()
        self._all_from_opf(opf)
        return oeb

    def _clean_opf(self, opf):
        nsmap = {}
        for elem in opf.iter(tag=etree.Element):
            nsmap.update(elem.nsmap)
        for elem in opf.iter(tag=etree.Element):
            if namespace(elem.tag) in ('', OPF1_NS) and ':' not in barename(elem.tag):
                elem.tag = OPF(barename(elem.tag))
        nsmap.update(OPF2_NSMAP)
        attrib = dict(opf.attrib)
        nroot = etree.Element(OPF('package'),
            nsmap={None: OPF2_NS}, attrib=attrib)
        metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap)
        ignored = (OPF('dc-metadata'), OPF('x-metadata'))
        for elem in xpath(opf, 'o2:metadata//*'):
            if elem.tag in ignored:
                continue
            if namespace(elem.tag) in DC_NSES:
                tag = barename(elem.tag).lower()
                elem.tag = '{%s}%s' % (DC11_NS, tag)
            if elem.tag.startswith('dc:'):
                tag = elem.tag.partition(':')[-1].lower()
                elem.tag = '{%s}%s' % (DC11_NS, tag)
            metadata.append(elem)
        for element in xpath(opf, 'o2:metadata//o2:meta'):
            metadata.append(element)
        for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
            for element in xpath(opf, tag):
                nroot.append(element)
        return nroot

    def _read_opf(self):
        data = self.oeb.container.read(None)
        data = self.oeb.decode(data)
        data = XMLDECL_RE.sub('', data)
        data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
                OPF1_NS, data)
        try:
            opf = etree.fromstring(data)
        except etree.XMLSyntaxError:
            data = xml_replace_entities(clean_xml_chars(data), encoding=None)
            try:
                opf = etree.fromstring(data)
                self.logger.warn('OPF contains invalid HTML named entities')
            except etree.XMLSyntaxError:
                data = re.sub(r'(?is)<tours>.+</tours>', '', data)
                data = data.replace('<dc-metadata>',
                    '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
                try:
                    opf = etree.fromstring(data)
                    self.logger.warn('OPF contains invalid tours section')
                except etree.XMLSyntaxError:
                    from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
                    opf = etree.fromstring(data, parser=RECOVER_PARSER)
                    self.logger.warn('OPF contains invalid markup, trying to parse it anyway')

        ns = namespace(opf.tag)
        if ns not in ('', OPF1_NS, OPF2_NS):
            raise OEBError('Invalid namespace %r for OPF document' % ns)
        opf = self._clean_opf(opf)
        return opf

    def _metadata_from_opf(self, opf):
        from calibre.ebooks.metadata.opf2 import OPF
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        stream = cStringIO.StringIO(etree.tostring(opf, xml_declaration=True, encoding='utf-8'))
        o = OPF(stream)
        pwm = o.primary_writing_mode
        if pwm:
            self.oeb.metadata.primary_writing_mode = pwm
        mi = o.to_book_metadata()
        if not mi.language:
            mi.language = get_lang().replace('_', '-')
        self.oeb.metadata.add('language', mi.language)
        if not mi.book_producer:
            mi.book_producer = '%(a)s (%(v)s) [http://%(a)s-ebook.com]'%\
                dict(a=__appname__, v=__version__)
        meta_info_to_oeb_metadata(mi, self.oeb.metadata, self.logger)
        m = self.oeb.metadata
        m.add('identifier', str(uuid.uuid4()), id='uuid_id', scheme='uuid')
        self.oeb.uid = self.oeb.metadata.identifier[-1]
        if not m.title:
            m.add('title', self.oeb.translate(__('Unknown')))
        has_aut = False
        for x in m.creator:
            if getattr(x, 'role', '').lower() in ('', 'aut'):
                has_aut = True
                break
        if not has_aut:
            m.add('creator', self.oeb.translate(__('Unknown')), role='aut')

    def _manifest_prune_invalid(self):
        '''
        Remove items from manifest that contain invalid data. This prevents
        catastrophic conversion failure when a few files contain corrupted
        data.
        '''
        bad = []
        check = OEB_DOCS.union(OEB_STYLES)
        for item in list(self.oeb.manifest.values()):
            if item.media_type in check:
                try:
                    item.data
                except KeyboardInterrupt:
                    raise
                except:
                    self.logger.exception('Failed to parse content in %s'%
                            item.href)
                    bad.append(item)
                    self.oeb.manifest.remove(item)
        return bad

    def _manifest_add_missing(self, invalid):
        import css_parser
        manifest = self.oeb.manifest
        known = set(manifest.hrefs)
        unchecked = set(manifest.values())
        cdoc = OEB_DOCS|OEB_STYLES
        invalid = set()
        while unchecked:
            new = set()
            for item in unchecked:
                data = None
                if (item.media_type in cdoc or item.media_type[-4:] in ('/xml', '+xml')):
                    try:
                        data = item.data
                    except:
                        self.oeb.log.exception(u'Failed to read from manifest '
                                u'entry with id: %s, ignoring'%item.id)
                        invalid.add(item)
                        continue
                if data is None:
                    continue

                if (item.media_type in OEB_DOCS or item.media_type[-4:] in ('/xml', '+xml')):
                    hrefs = [r[2] for r in iterlinks(data)]
                    for href in hrefs:
                        if isinstance(href, bytes):
                            href = href.decode('utf-8')
                        href, _ = urldefrag(href)
                        if not href:
                            continue
                        try:
                            href = item.abshref(urlnormalize(href))
                            scheme = urlparse(href).scheme
                        except:
                            self.oeb.log.exception(
                                'Skipping invalid href: %r'%href)
                            continue
                        if not scheme and href not in known:
                            new.add(href)
                elif item.media_type in OEB_STYLES:
                    try:
                        urls = list(css_parser.getUrls(data))
                    except:
                        urls = []
                    for url in urls:
                        href, _ = urldefrag(url)
                        href = item.abshref(urlnormalize(href))
                        scheme = urlparse(href).scheme
                        if not scheme and href not in known:
                            new.add(href)
            unchecked.clear()
            warned = set([])
            for href in new:
                known.add(href)
                is_invalid = False
                for item in invalid:
                    if href == item.abshref(urlnormalize(href)):
                        is_invalid = True
                        break
                if is_invalid:
                    continue
                if not self.oeb.container.exists(href):
                    if href not in warned:
                        self.logger.warn('Referenced file %r not found' % href)
                        warned.add(href)
                    continue
                if href not in warned:
                    self.logger.warn('Referenced file %r not in manifest' % href)
                    warned.add(href)
                id, _ = manifest.generate(id='added')
                guessed = guess_type(href)[0]
                media_type = guessed or BINARY_MIME
                added = manifest.add(id, href, media_type)
                unchecked.add(added)

            for item in invalid:
                self.oeb.manifest.remove(item)

    def _manifest_from_opf(self, opf):
        manifest = self.oeb.manifest
        for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
            id = elem.get('id')
            href = elem.get('href')
            media_type = elem.get('media-type', None)
            if media_type is None:
                media_type = elem.get('mediatype', None)
            if not media_type or media_type == 'text/xml':
                guessed = guess_type(href)[0]
                media_type = guessed or media_type or BINARY_MIME
            if hasattr(media_type, 'lower'):
                media_type = media_type.lower()
            fallback = elem.get('fallback')
            if href in manifest.hrefs:
                self.logger.warn(u'Duplicate manifest entry for %r' % href)
                continue
            if not self.oeb.container.exists(href):
                self.logger.warn(u'Manifest item %r not found' % href)
                continue
            if id in manifest.ids:
                self.logger.warn(u'Duplicate manifest id %r' % id)
                id, href = manifest.generate(id, href)
            manifest.add(id, href, media_type, fallback)
        invalid = self._manifest_prune_invalid()
        self._manifest_add_missing(invalid)

    def _spine_add_extra(self):
        manifest = self.oeb.manifest
        spine = self.oeb.spine
        unchecked = set(spine)
        selector = XPath('h:body//h:a/@href')
        extras = set()
        while unchecked:
            new = set()
            for item in unchecked:
                if item.media_type not in OEB_DOCS:
                    # TODO: handle fallback chains
                    continue
                for href in selector(item.data):
                    href, _ = urldefrag(href)
                    if not href:
                        continue
                    try:
                        href = item.abshref(urlnormalize(href))
                    except ValueError:  # Malformed URL
                        continue
                    if href not in manifest.hrefs:
                        continue
                    found = manifest.hrefs[href]
                    if found.media_type not in OEB_DOCS or \
                       found in spine or found in extras:
                        continue
                    new.add(found)
            extras.update(new)
            unchecked = new
        version = int(self.oeb.version[0])
        removed_items_to_ignore = getattr(self.oeb, 'removed_items_to_ignore', ())
        for item in sorted(extras):
            if item.href in removed_items_to_ignore:
                continue
            if version >= 2:
                self.logger.warn(
                    'Spine-referenced file %r not in spine' % item.href)
            spine.add(item, linear=False)

    def _spine_from_opf(self, opf):
        spine = self.oeb.spine
        manifest = self.oeb.manifest
        for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
            idref = elem.get('idref')
            if idref not in manifest.ids:
                self.logger.warn(u'Spine item %r not found' % idref)
                continue
            item = manifest.ids[idref]
            if item.media_type.lower() in OEB_DOCS and hasattr(item.data, 'xpath') and not getattr(item.data, 'tag', '').endswith('}ncx'):
                spine.add(item, elem.get('linear'))
            else:
                if hasattr(item.data, 'tag') and item.data.tag and item.data.tag.endswith('}html'):
                    item.media_type = XHTML_MIME
                    spine.add(item, elem.get('linear'))
                else:
                    self.oeb.log.warn('The item %s is not an XML document.'
                        ' Removing it from spine.'%item.href)
        if len(spine) == 0:
            raise OEBError("Spine is empty")
        self._spine_add_extra()
        for val in xpath(opf, '/o2:package/o2:spine/@page-progression-direction'):
            if val in {'ltr', 'rtl'}:
                spine.page_progression_direction = val

    def _guide_from_opf(self, opf):
        guide = self.oeb.guide
        manifest = self.oeb.manifest
        for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
            ref_href = elem.get('href')
            path = urlnormalize(urldefrag(ref_href)[0])
            if path not in manifest.hrefs:
                corrected_href = None
                for href in manifest.hrefs:
                    if href.lower() == path.lower():
                        corrected_href = href
                        break
                if corrected_href is None:
                    self.logger.warn(u'Guide reference %r not found' % ref_href)
                    continue
                ref_href = corrected_href
            typ = elem.get('type')
            if typ not in guide:
                guide.add(typ, elem.get('title'), ref_href)

    def _find_ncx(self, opf):
        result = xpath(opf, '/o2:package/o2:spine/@toc')
        if result:
            id = result[0]
            if id not in self.oeb.manifest.ids:
                return None
            item = self.oeb.manifest.ids[id]
            self.oeb.manifest.remove(item)
            return item
        for item in self.oeb.manifest.values():
            if item.media_type == NCX_MIME:
                self.oeb.manifest.remove(item)
                return item
        return None

    def _toc_from_navpoint(self, item, toc, navpoint):
        children = xpath(navpoint, 'ncx:navPoint')
        for child in children:
            title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
            title = COLLAPSE_RE.sub(' ', title.strip())
            href = xpath(child, 'ncx:content/@src')
            if not title:
                self._toc_from_navpoint(item, toc, child)
                continue
            if (not href or not href[0]) and not xpath(child, 'ncx:navPoint'):
                # This node is useless
                continue
            href = item.abshref(urlnormalize(href[0])) if href and href[0] else ''
            path, _ = urldefrag(href)
            if path and path not in self.oeb.manifest.hrefs:
                path = urlnormalize(path)
            if href and path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                gc = xpath(child, 'ncx:navPoint')
                if not gc:
                    # This node is useless
                    continue
            id = child.get('id')
            klass = child.get('class', 'chapter')

            try:
                po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
            except:
                po = self.oeb.toc.next_play_order()

            authorElement = xpath(child,
                    'descendant::calibre:meta[@name = "author"]')
            if authorElement:
                author = authorElement[0].text
            else:
                author = None

            descriptionElement = xpath(child,
                    'descendant::calibre:meta[@name = "description"]')
            if descriptionElement:
                description = etree.tostring(descriptionElement[0],
                method='text', encoding=unicode_type).strip()
                if not description:
                    description = None
            else:
                description = None

            index_image = xpath(child,
                    'descendant::calibre:meta[@name = "toc_thumbnail"]')
            toc_thumbnail = (index_image[0].text if index_image else None)
            if not toc_thumbnail or not toc_thumbnail.strip():
                toc_thumbnail = None

            node = toc.add(title, href, id=id, klass=klass,
                    play_order=po, description=description, author=author,
                           toc_thumbnail=toc_thumbnail)

            self._toc_from_navpoint(item, node, child)

    def _toc_from_ncx(self, item):
        if (item is None) or (item.data is None):
            return False
        self.log.debug('Reading TOC from NCX...')
        ncx = item.data
        title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
        title = COLLAPSE_RE.sub(' ', title.strip())
        title = title or unicode_type(self.oeb.metadata.title[0])
        toc = self.oeb.toc
        toc.title = title
        navmaps = xpath(ncx, 'ncx:navMap')
        for navmap in navmaps:
            self._toc_from_navpoint(item, toc, navmap)
        return True

    def _toc_from_tour(self, opf):
        result = xpath(opf, 'o2:tours/o2:tour')
        if not result:
            return False
        self.log.debug('Reading TOC from tour...')
        tour = result[0]
        toc = self.oeb.toc
        toc.title = tour.get('title')
        sites = xpath(tour, 'o2:site')
        for site in sites:
            title = site.get('title')
            href = site.get('href')
            if not title or not href:
                continue
            path, _ = urldefrag(urlnormalize(href))
            if path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                continue
            id = site.get('id')
            toc.add(title, href, id=id)
        return True

    def _toc_from_html(self, opf):
        if 'toc' not in self.oeb.guide:
            return False
        self.log.debug('Reading TOC from HTML...')
        itempath, frag = urldefrag(self.oeb.guide['toc'].href)
        item = self.oeb.manifest.hrefs[itempath]
        html = item.data
        if frag:
            elems = xpath(html, './/*[@id="%s"]' % frag)
            if not elems:
                elems = xpath(html, './/*[@name="%s"]' % frag)
            elem = elems[0] if elems else html
            while elem != html and not xpath(elem, './/h:a[@href]'):
                elem = elem.getparent()
            html = elem
        titles = defaultdict(list)
        order = []
        for anchor in xpath(html, './/h:a[@href]'):
            href = anchor.attrib['href']
            href = item.abshref(urlnormalize(href))
            path, frag = urldefrag(href)
            if path not in self.oeb.manifest.hrefs:
                continue
            title = xml2text(anchor)
            title = COLLAPSE_RE.sub(' ', title.strip())
            if href not in titles:
                order.append(href)
            titles[href].append(title)
        toc = self.oeb.toc
        for href in order:
            toc.add(' '.join(titles[href]), href)
        return True

    def _toc_from_spine(self, opf):
        self.log.warn('Generating default TOC from spine...')
        toc = self.oeb.toc
        titles = []
        headers = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = COLLAPSE_RE.sub(' ', title.strip())
            if title:
                titles.append(title)
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                header = ''.join(xpath(html, expr % tag))
                header = COLLAPSE_RE.sub(' ', header.strip())
                if header:
                    headers[-1] = header
                    break
        use = titles
        if len(titles) > len(set(titles)):
            use = headers
        for title, item in izip(use, self.oeb.spine):
            if not item.linear:
                continue
            toc.add(title, item.href)
        return True

    def _toc_from_opf(self, opf, item):
        self.oeb.auto_generated_toc = False
        if self._toc_from_ncx(item):
            return
        # Prefer HTML to tour based TOC, since several LIT files
        # have good HTML TOCs but bad tour based TOCs
        if self._toc_from_html(opf):
            return
        if self._toc_from_tour(opf):
            return
        self._toc_from_spine(opf)
        self.oeb.auto_generated_toc = True

    def _pages_from_ncx(self, opf, item):
        if item is None:
            return False
        ncx = item.data
        if ncx is None:
            return False
        ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget')
        if not ptargets:
            return False
        pages = self.oeb.pages
        for ptarget in ptargets:
            name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()'))
            name = COLLAPSE_RE.sub(' ', name.strip())
            href = xpath(ptarget, 'ncx:content/@src')
            if not href:
                continue
            href = item.abshref(urlnormalize(href[0]))
            id = ptarget.get('id')
            type = ptarget.get('type', 'normal')
            klass = ptarget.get('class')
            pages.add(name, href, type=type, id=id, klass=klass)
        return True

    def _find_page_map(self, opf):
        result = xpath(opf, '/o2:package/o2:spine/@page-map')
        if result:
            id = result[0]
            if id not in self.oeb.manifest.ids:
                return None
            item = self.oeb.manifest.ids[id]
            self.oeb.manifest.remove(item)
            return item
        for item in self.oeb.manifest.values():
            if item.media_type == PAGE_MAP_MIME:
                self.oeb.manifest.remove(item)
                return item
        return None

    def _pages_from_page_map(self, opf):
        item = self._find_page_map(opf)
        if item is None:
            return False
        pmap = item.data
        pages = self.oeb.pages
        for page in xpath(pmap, 'o2:page'):
            name = page.get('name', '')
            href = page.get('href')
            if not href:
                continue
            name = COLLAPSE_RE.sub(' ', name.strip())
            href = item.abshref(urlnormalize(href))
            type = 'normal'
            if not name:
                type = 'special'
            elif name.lower().strip('ivxlcdm') == '':
                type = 'front'
            pages.add(name, href, type=type)
        return True

    def _pages_from_opf(self, opf, item):
        if self._pages_from_ncx(opf, item):
            return
        if self._pages_from_page_map(opf):
            return
        return

    def _cover_from_html(self, hcover):
        from calibre.ebooks import render_html_svg_workaround
        with TemporaryDirectory('_html_cover') as tdir:
            writer = OEBWriter()
            writer(self.oeb, tdir)
            path = os.path.join(tdir, urlunquote(hcover.href))
            data = render_html_svg_workaround(path, self.logger)
            if not data:
                data = ''
        id, href = self.oeb.manifest.generate('cover', 'cover.jpg')
        item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data)
        return item

    def _locate_cover_image(self):
        if self.oeb.metadata.cover:
            id = unicode_type(self.oeb.metadata.cover[0])
            item = self.oeb.manifest.ids.get(id, None)
            if item is not None and item.media_type in OEB_IMAGES:
                return item
            else:
                self.logger.warn('Invalid cover image @id %r' % id)
        hcover = self.oeb.spine[0]
        if 'cover' in self.oeb.guide:
            href = self.oeb.guide['cover'].href
            item = self.oeb.manifest.hrefs[href]
            media_type = item.media_type
            if media_type in OEB_IMAGES:
                return item
            elif media_type in OEB_DOCS:
                hcover = item
        html = hcover.data
        if MS_COVER_TYPE in self.oeb.guide:
            href = self.oeb.guide[MS_COVER_TYPE].href
            item = self.oeb.manifest.hrefs.get(href, None)
            if item is not None and item.media_type in OEB_IMAGES:
                return item
        if self.COVER_SVG_XP(html):
            svg = copy.deepcopy(self.COVER_SVG_XP(html)[0])
            href = os.path.splitext(hcover.href)[0] + '.svg'
            id, href = self.oeb.manifest.generate(hcover.id, href)
            item = self.oeb.manifest.add(id, href, SVG_MIME, data=svg)
            return item
        if self.COVER_OBJECT_XP(html):
            object = self.COVER_OBJECT_XP(html)[0]
            href = hcover.abshref(object.get('data'))
            item = self.oeb.manifest.hrefs.get(href, None)
            if item is not None and item.media_type in OEB_IMAGES:
                return item
        return self._cover_from_html(hcover)

    def _ensure_cover_image(self):
        cover = self._locate_cover_image()
        if self.oeb.metadata.cover:
            self.oeb.metadata.cover[0].value = cover.id
            return
        self.oeb.metadata.add('cover', cover.id)

    def _manifest_remove_duplicates(self):
        seen = set()
        dups = set()
        for item in self.oeb.manifest:
            if item.href in seen:
                dups.add(item.href)
            seen.add(item.href)

        for href in dups:
            items = [x for x in self.oeb.manifest if x.href == href]
            for x in items:
                if x not in self.oeb.spine:
                    self.oeb.log.warn('Removing duplicate manifest item with id:', x.id)
                    self.oeb.manifest.remove_duplicate_item(x)

    def _all_from_opf(self, opf):
        self.oeb.version = opf.get('version', '1.2')
        self._metadata_from_opf(opf)
        self._manifest_from_opf(opf)
        self._spine_from_opf(opf)
        self._manifest_remove_duplicates()
        self._guide_from_opf(opf)
        item = self._find_ncx(opf)
        self._toc_from_opf(opf, item)
        self._pages_from_opf(opf, item)
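
A self-contained sketch of the page-name heuristic used in _pages_from_page_map above: an empty name is treated as 'special', a name made only of roman-numeral letters as front matter, and anything else as 'normal':

def classify_page_name(name):
    name = name.strip()
    if not name:
        return 'special'
    if name.lower().strip('ivxlcdm') == '':
        return 'front'
    return 'normal'

for n in ('', 'iv', 'XII', '7'):
    print(repr(n), '->', classify_page_name(n))
# '' -> special, 'iv' -> front, 'XII' -> front, '7' -> normal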
Example #6
def css_data(container, book_locale, result_data, *args):
    import tinycss
    from tinycss.css21 import RuleSet, ImportRule

    def css_rules(file_name, rules, sourceline=0):
        ans = []
        for rule in rules:
            if isinstance(rule, RuleSet):
                selector = rule.selector.as_css()
                ans.append(
                    CSSRule(
                        selector,
                        RuleLocation(file_name, sourceline + rule.line,
                                     rule.column)))
            elif isinstance(rule, ImportRule):
                import_name = safe_href_to_name(container, rule.uri, file_name)
                if import_name and container.exists(import_name):
                    ans.append(import_name)
            elif getattr(rule, 'rules', False):
                ans.extend(css_rules(file_name, rule.rules, sourceline))
        return ans

    parser = tinycss.make_full_parser()
    importable_sheets = {}
    html_sheets = {}
    spine_names = {name for name, is_linear in container.spine_names}
    style_path, link_path = XPath('//h:style'), XPath('//h:link/@href')

    for name, mt in iteritems(container.mime_map):
        if mt in OEB_STYLES:
            importable_sheets[name] = css_rules(
                name,
                parser.parse_stylesheet(container.raw_data(name)).rules)
        elif mt in OEB_DOCS and name in spine_names:
            html_sheets[name] = []
            for style in style_path(container.parsed(name)):
                if style.get('type', 'text/css') == 'text/css' and style.text:
                    html_sheets[name].append(
                        css_rules(
                            name,
                            parser.parse_stylesheet(
                                force_unicode(style.text, 'utf-8')).rules,
                            style.sourceline - 1))

    rule_map = defaultdict(lambda: defaultdict(list))

    def rules_in_sheet(sheet):
        for rule in sheet:
            if isinstance(rule, CSSRule):
                yield rule
            else:  # @import rule
                isheet = importable_sheets.get(rule)
                if isheet is not None:
                    yield from rules_in_sheet(isheet)

    def sheets_for_html(name, root):
        for href in link_path(root):
            tname = safe_href_to_name(container, href, name)
            sheet = importable_sheets.get(tname)
            if sheet is not None:
                yield sheet

    tt_cache = {}

    def tag_text(elem):
        ans = tt_cache.get(elem)
        if ans is None:
            tag = elem.tag.rpartition('}')[-1]
            if elem.attrib:
                attribs = ' '.join('{}="{}"'.format(
                    k, prepare_string_for_xml(elem.get(k, ''), True))
                                   for k in elem.keys())
                return f'<{tag} {attribs}>'
            ans = tt_cache[elem] = '<%s>' % tag
        return ans

    def matches_for_selector(selector, select, class_map, rule):
        lsel = selector.lower()
        try:
            matches = tuple(select(selector))
        except SelectorError:
            return ()
        seen = set()

        def get_elem_and_ancestors(elem):
            p = elem
            while p is not None:
                if p not in seen:
                    yield p
                    seen.add(p)
                p = p.getparent()

        for e in matches:
            for elem in get_elem_and_ancestors(e):
                for cls in elem.get('class', '').split():
                    if '.' + cls.lower() in lsel:
                        class_map[cls][elem].append(rule)

        return (MatchLocation(tag_text(elem), elem.sourceline)
                for elem in matches)

    class_map = defaultdict(lambda: defaultdict(list))

    for name, inline_sheets in iteritems(html_sheets):
        root = container.parsed(name)
        cmap = defaultdict(lambda: defaultdict(list))
        for elem in root.xpath('//*[@class]'):
            for cls in elem.get('class', '').split():
                cmap[cls][elem] = []
        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        for sheet in chain(sheets_for_html(name, root), inline_sheets):
            for rule in rules_in_sheet(sheet):
                rule_map[rule][name].extend(
                    matches_for_selector(rule.selector, select, cmap, rule))
        for cls, elem_map in iteritems(cmap):
            class_elements = class_map[cls][name]
            for elem, usage in iteritems(elem_map):
                class_elements.append(
                    ClassElement(name, elem.sourceline, elem.get('class'),
                                 tag_text(elem), tuple(usage)))

    result_data['classes'] = ans = []
    for cls, name_map in iteritems(class_map):
        la = tuple(
            ClassFileMatch(name, tuple(class_elements), numeric_sort_key(name))
            for name, class_elements in iteritems(name_map) if class_elements)
        num_of_matches = sum(
            sum(len(ce.matched_rules) for ce in cfm.class_elements)
            for cfm in la)
        ans.append(ClassEntry(cls, num_of_matches, la, numeric_sort_key(cls)))

    ans = []
    for rule, loc_map in iteritems(rule_map):
        la = tuple(
            CSSFileMatch(name, tuple(locations), numeric_sort_key(name))
            for name, locations in iteritems(loc_map) if locations)
        count = sum(len(fm.locations) for fm in la)
        ans.append(CSSEntry(rule, count, la, numeric_sort_key(rule.selector)))

    return ans
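
A minimal sketch of the class-attribution heuristic in matches_for_selector above: an element's class is credited with a matched rule whenever '.<class>' occurs, case-insensitively, in the rule's selector text:

def classes_hit_by_selector(selector, element_classes):
    lsel = selector.lower()
    return [cls for cls in element_classes if '.' + cls.lower() in lsel]

print(classes_hit_by_selector('div.Calibre1 > p.text', ['calibre1', 'text', 'other']))
# ['calibre1', 'text']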
Example #7
    def extract_css_into_flows(self):
        inlines = defaultdict(list)  # Ensure identical <style>s not repeated
        sheets = {}

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if not self.opts.expand_css and hasattr(item.data, 'cssText'):
                    condense_sheet(sheet)
                sheets[item.href] = len(self.flows)
                self.flows.append(sheet)

        def fix_import_rules(sheet):
            changed = False
            for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
                if rule.href:
                    href = item.abshref(rule.href)
                    idx = sheets.get(href, None)
                    if idx is not None:
                        idx = to_ref(idx)
                        rule.href = 'kindle:flow:%s?mime=text/css' % idx
                        changed = True
            return changed

        for item in self.oeb.spine:
            root = self.data(item)

            for link in XPath('//h:link[@href]')(root):
                href = item.abshref(link.get('href'))
                idx = sheets.get(href, None)
                if idx is not None:
                    idx = to_ref(idx)
                    link.set('href', 'kindle:flow:%s?mime=text/css' % idx)

            for tag in XPath('//h:style')(root):
                p = tag.getparent()
                idx = p.index(tag)
                raw = tag.text
                if not raw or not raw.strip():
                    extract(tag)
                    continue
                sheet = cssutils.parseString(raw, validate=False)
                if fix_import_rules(sheet):
                    raw = force_unicode(sheet.cssText, 'utf-8')

                repl = etree.Element(XHTML('link'),
                                     type='text/css',
                                     rel='stylesheet')
                repl.tail = '\n'
                p.insert(idx, repl)
                extract(tag)
                inlines[raw].append(repl)

        for raw, elems in inlines.iteritems():
            idx = to_ref(len(self.flows))
            self.flows.append(raw)
            for link in elems:
                link.set('href', 'kindle:flow:%s?mime=text/css' % idx)

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if hasattr(sheet, 'cssRules'):
                    fix_import_rules(sheet)

        for i, sheet in enumerate(tuple(self.flows)):
            if hasattr(sheet, 'cssText'):
                self.flows[i] = force_unicode(sheet.cssText, 'utf-8')
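
A small sketch of the de-duplication idea above, using page names in place of the actual <link> elements: identical <style> bodies are keyed by their raw CSS text, so each distinct stylesheet becomes a single flow shared by every document that inlined it:

from collections import defaultdict

inlines = defaultdict(list)   # raw CSS text -> pages that used it
for page, raw_css in (('a.xhtml', 'p{margin:0}'), ('b.xhtml', 'p{margin:0}')):
    inlines[raw_css].append(page)

flows = []
for raw_css, pages in inlines.items():
    idx = len(flows)          # index the kindle:flow URL would reference
    flows.append(raw_css)
    print('flow %d shared by: %s' % (idx, ', '.join(pages)))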
Example #8
File: mobi8.py  Project: tokot/calibre
    def read_inline_toc(self, href, frag):
        ans = TOC()
        base_href = '/'.join(href.split('/')[:-1])
        with open(href.replace('/', os.sep), 'rb') as f:
            raw = f.read().decode(self.header.codec)
        root = parse_html(raw, log=self.log)
        body = XPath('//h:body')(root)
        reached = False
        if body:
            start = body[0]
        else:
            start = None
            reached = True
        if frag:
            elems = XPath('//*[@id="%s"]'%frag)(root)
            if elems:
                start = elems[0]

        def node_depth(elem):
            ans = 0
            parent = elem.getparent()
            while parent is not None:
                parent = parent.getparent()
                ans += 1
            return ans

        # Layer the ToC based on nesting order in the source HTML
        current_depth = None
        parent = ans
        seen = set()
        links = []
        for elem in root.iterdescendants(etree.Element):
            if reached and elem.tag == XHTML('a') and elem.get('href',
                    False):
                href = elem.get('href')
                href, frag = urldefrag(href)
                href = base_href + '/' + href
                text = xml2text(elem).strip()
                if (text, href, frag) in seen:
                    continue
                seen.add((text, href, frag))
                links.append((text, href, frag, node_depth(elem)))
            elif elem is start:
                reached = True

        depths = sorted(set(x[-1] for x in links))
        depth_map = {x:i for i, x in enumerate(depths)}
        for text, href, frag, depth in links:
            depth = depth_map[depth]
            if current_depth is None:
                current_depth = 0
                parent.add_item(href, frag, text)
            elif current_depth == depth:
                parent.add_item(href, frag, text)
            elif current_depth < depth:
                parent = parent[-1] if len(parent) > 0 else parent
                parent.add_item(href, frag, text)
                current_depth += 1
            else:
                delta = current_depth - depth
                while delta > 0 and parent.parent is not None:
                    parent = parent.parent
                    delta -= 1
                parent.add_item(href, frag, text)
                current_depth = depth
        return ans
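
A minimal sketch of the depth normalisation used above: the distinct DOM depths at which ToC links were found are ranked, so sparse nesting depths (say 3, 5 and 9) collapse to consecutive ToC levels (0, 1 and 2):

depths_found = [5, 3, 9, 5]
depths = sorted(set(depths_found))
depth_map = {x: i for i, x in enumerate(depths)}
print([depth_map[d] for d in depths_found])  # [1, 0, 2, 1]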
Example #9
def merge_html(container, names, master):
    p = container.parsed
    root = p(master)

    # Ensure master has a <head>
    head = root.find('h:head', namespaces=XPNSMAP)
    if head is None:
        head = root.makeelement(XHTML('head'))
        container.insert_into_xml(root, head, 0)

    seen_anchors = all_anchors(root)
    seen_stylesheets = set(all_stylesheets(container, master))
    master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1]
    master_base = os.path.dirname(master)
    anchor_map = {n: {} for n in names if n != master}

    for name in names:
        if name == master:
            continue
        # Insert new stylesheets into master
        for sheet in all_stylesheets(container, name):
            if sheet not in seen_stylesheets:
                seen_stylesheets.add(sheet)
                link = head.makeelement(XHTML('link'),
                                        rel='stylesheet',
                                        type='text/css',
                                        href=container.name_to_href(
                                            sheet, master))
                container.insert_into_xml(head, link)

        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name, master))

        root = p(name)
        children = []
        for body in p(name).findall('h:body', namespaces=XPNSMAP):
            children.append(
                body.text if body.text and body.text.strip() else '\n\n')
            children.extend(body)

        first_child = ''
        for first_child in children:
            if not isinstance(first_child, basestring):
                break
        if isinstance(first_child, basestring):
            # body contained only text, no tags
            first_child = body.makeelement(XHTML('p'))
            first_child.text, children[0] = children[0], first_child

        amap = anchor_map[name]
        remove_name_attributes(root)

        for elem in root.xpath('//*[@id]'):
            val = elem.get('id')
            if not val:
                continue
            if val in seen_anchors:
                nval = unique_anchor(seen_anchors, val)
                elem.set('id', nval)
                amap[val] = nval
            else:
                seen_anchors.add(val)

        if 'id' not in first_child.attrib:
            first_child.set('id', unique_anchor(seen_anchors, 'top'))
            seen_anchors.add(first_child.get('id'))

        amap[''] = first_child.get('id')

        # Fix links that point to local changed anchors
        for a in XPath('//h:a[starts-with(@href, "#")]')(root):
            q = a.get('href')[1:]
            if q in amap:
                a.set('href', '#' + amap[q])

        for child in children:
            if isinstance(child, basestring):
                add_text(master_body, child)
            else:
                master_body.append(copy.deepcopy(child))

        container.remove_item(name, remove_from_guide=False)

    # Fix all links in the container that point to merged files
    for fname, media_type in container.mime_map.iteritems():
        repl = MergeLinkReplacer(fname, anchor_map, master, container)
        container.replace_links(fname, repl)
Example #10
    def virtualize_resources(self):

        changed = set()
        link_uid = self.book_render_data['link_uid']
        resource_template = link_uid + '|{}|'
        xlink_xpath = XPath('//*[@xl:href]')
        link_xpath = XPath('//h:a[@href]')
        res_link_xpath = XPath('//h:link[@href]')

        def link_replacer(base, url):
            if url.startswith('#'):
                frag = urlunquote(url[1:])
                if not frag:
                    return url
                changed.add(base)
                return resource_template.format(encode_url(base, frag))
            purl = urlparse(url)
            if purl.netloc or purl.query:
                return url
            if purl.scheme and purl.scheme != 'file':
                return url
            if not purl.path or purl.path.startswith('/'):
                return url
            url, frag = purl.path, purl.fragment
            name = self.href_to_name(url, base)
            if name:
                if self.has_name_and_is_not_empty(name):
                    frag = urlunquote(frag)
                    url = resource_template.format(encode_url(name, frag))
                else:
                    if isinstance(name, unicode_type):
                        name = name.encode('utf-8')
                    url = 'missing:' + force_unicode(quote(name), 'utf-8')
                changed.add(base)
            return url

        ltm = self.book_render_data['link_to_map']

        for name, mt in iteritems(self.mime_map):
            mt = mt.lower()
            if mt in OEB_STYLES:
                replaceUrls(self.parsed(name), partial(link_replacer, name))
                self.virtualized_names.add(name)
            elif mt in OEB_DOCS:
                self.virtualized_names.add(name)
                root = self.parsed(name)
                for link in res_link_xpath(root):
                    ltype = (link.get('type') or 'text/css').lower()
                    rel = (link.get('rel') or 'stylesheet').lower()
                    if ltype != 'text/css' or rel != 'stylesheet':
                        # This link will not be loaded by the browser anyway
                        # and will cause the resource load check to hang
                        link.attrib.clear()
                        changed.add(name)
                rewrite_links(root, partial(link_replacer, name))
                for a in link_xpath(root):
                    href = a.get('href')
                    if href.startswith(link_uid):
                        a.set('href', 'javascript:void(0)')
                        parts = decode_url(href.split('|')[1])
                        lname, lfrag = parts[0], parts[1]
                        ltm.setdefault(lname,
                                       {}).setdefault(lfrag or '',
                                                      set()).add(name)
                        a.set(
                            'data-' + link_uid,
                            json.dumps({
                                'name': lname,
                                'frag': lfrag
                            },
                                       ensure_ascii=False))
                    else:
                        a.set('target', '_blank')
                        a.set('rel', 'noopener noreferrer')
                    changed.add(name)
            elif mt == 'image/svg+xml':
                self.virtualized_names.add(name)
                changed.add(name)
                xlink = XLINK('href')
                for elem in xlink_xpath(self.parsed(name)):
                    elem.set(xlink, link_replacer(name, elem.get(xlink)))

        for name, amap in iteritems(ltm):
            for k, v in tuple(iteritems(amap)):
                amap[k] = tuple(v)  # needed for JSON serialization

        tuple(map(self.dirty, changed))
Example #11
def is_current_jacket(root):
    return len(
        XPath('//h:meta[@name="calibre-content" and @content="jacket"]')
        (root)) > 0
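
A hedged usage sketch, assuming the XPath helper from calibre.ebooks.oeb.base is in scope, as the function above requires:

from lxml import etree

root = etree.fromstring(
    b'<html xmlns="http://www.w3.org/1999/xhtml"><head>'
    b'<meta name="calibre-content" content="jacket"/></head><body/></html>')
print(is_current_jacket(root))  # True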
Example #12
def case_insensitive_element_names(test, parse_function):
    markup = '<HTML><P> </p>'
    root = parse_function(markup)
    err = 'case sensitive parsing, parsed markup:\n' + etree.tostring(
        root, encoding='unicode')
    test.assertEqual(len(XPath('//h:p')(root)), 1, err)
Example #13
File: input.py  Project: wh0197m/calibre
    def epubify_markup(self, root, log):
        from calibre.ebooks.oeb.base import XPath, XHTML
        # Fix empty title tags
        for t in XPath('//h:title')(root):
            if not t.text:
                t.text = u' '
        # Fix <p><div> constructs as the asinine epubchecker complains
        # about them
        pdiv = XPath('//h:p/h:div')
        for div in pdiv(root):
            div.getparent().tag = XHTML('div')

        # Remove the position:relative as it causes problems with some epub
        # renderers. Remove display: block on an image inside a div as it is
        # redundant and prevents text-align:center from working in ADE
        # Also ensure that the img is contained in its containing div
        imgpath = XPath('//h:div/h:img[@style]')
        for img in imgpath(root):
            div = img.getparent()
            if len(div) == 1:
                style = div.attrib.get('style', '')
                if style and not style.endswith(';'):
                    style = style + ';'
                style += 'position:static'  # Ensures position of containing div is static
                # Ensure that the img is always contained in its frame
                div.attrib['style'] = style
                img.attrib['style'] = 'max-width: 100%; max-height: 100%'

        # Handle anchored images. The default markup + CSS produced by
        # odf2xhtml works with WebKit but not with ADE. So we convert the
        # common cases of left/right/center aligned block images to work on
        # both webkit and ADE. We detect the case of setting the side margins
        # to auto and map it to an appropriate text-align directive, which
        # works in both WebKit and ADE.
        # https://bugs.launchpad.net/bugs/1063207
        # https://bugs.launchpad.net/calibre/+bug/859343
        imgpath = XPath('descendant::h:div/h:div/h:img')
        for img in imgpath(root):
            div2 = img.getparent()
            div1 = div2.getparent()
            if (len(div1), len(div2)) != (1, 1):
                continue
            cls = div1.get('class', '')
            first_rules = filter(
                None, [self.get_css_for_class(x) for x in cls.split()])
            has_align = False
            for r in first_rules:
                if r.style.getProperty(u'text-align') is not None:
                    has_align = True
            ml = mr = None
            if not has_align:
                aval = None
                cls = div2.get(u'class', u'')
                rules = filter(
                    None, [self.get_css_for_class(x) for x in cls.split()])
                for r in rules:
                    ml = r.style.getPropertyCSSValue(u'margin-left') or ml
                    mr = r.style.getPropertyCSSValue(u'margin-right') or mr
                    ml = getattr(ml, 'value', None)
                    mr = getattr(mr, 'value', None)
                if ml == mr == u'auto':
                    aval = u'center'
                elif ml == u'auto' and mr != u'auto':
                    aval = 'right'
                elif ml != u'auto' and mr == u'auto':
                    aval = 'left'
                if aval is not None:
                    style = div1.attrib.get('style', '').strip()
                    if style and not style.endswith(';'):
                        style = style + ';'
                    style += 'text-align:%s' % aval
                    has_align = True
                    div1.attrib['style'] = style

            if has_align:
                # This is needed for ADE, without it the text-align has no
                # effect
                style = div2.attrib['style']
                div2.attrib['style'] = 'display:inline;' + style
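
A self-contained sketch of the alignment mapping described in the comment block above: auto side margins on the inner div are translated into a text-align value for the outer div, so the anchored image renders the same way in WebKit and ADE:

def align_from_margins(ml, mr):
    if ml == mr == 'auto':
        return 'center'
    if ml == 'auto':
        return 'right'
    if mr == 'auto':
        return 'left'
    return None

print(align_from_margins('auto', 'auto'))  # center
print(align_from_margins('auto', '12pt'))  # right
print(align_from_margins('12pt', 'auto'))  # left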
Example #14
def from_xpaths(container, xpaths):
    '''
    Generate a Table of Contents from a list of XPath expressions. Each
    expression in the list corresponds to a level of the generated ToC. For
    example: :code:`['//h:h1', '//h:h2', '//h:h3']` will generate a three level
    Table of Contents from the ``<h1>``, ``<h2>`` and ``<h3>`` tags.
    '''
    tocroot = TOC()
    xpaths = [XPath(xp) for xp in xpaths]

    # Find those levels that have no elements in all spine items
    maps = OrderedDict()
    empty_levels = {i+1 for i, xp in enumerate(xpaths)}
    for spinepath in container.spine_items:
        name = container.abspath_to_name(spinepath)
        root = container.parsed(name)
        level_item_map = maps[name] = {i+1:frozenset(xp(root)) for i, xp in enumerate(xpaths)}
        for lvl, elems in level_item_map.items():
            if elems:
                empty_levels.discard(lvl)
    # Remove empty levels from all level_maps
    if empty_levels:
        for name, lmap in tuple(maps.items()):
            lmap = {lvl:items for lvl, items in lmap.items() if lvl not in empty_levels}
            lmap = sorted(iter(lmap.items()), key=itemgetter(0))
            lmap = {i+1:items for i, (l, items) in enumerate(lmap)}
            maps[name] = lmap

    node_level_map = {tocroot: 0}

    def parent_for_level(child_level):
        limit = child_level - 1

        def process_node(node):
            child = node.last_child
            if child is None:
                return node
            lvl = node_level_map[child]
            return node if lvl > limit else child if lvl == limit else process_node(child)

        return process_node(tocroot)

    for name, level_item_map in maps.items():
        root = container.parsed(name)
        item_level_map = {e:i for i, elems in level_item_map.items() for e in elems}
        item_dirtied = False
        all_ids = set(root.xpath('//*/@id'))

        for item in root.iterdescendants(etree.Element):
            lvl = item_level_map.get(item, None)
            if lvl is None:
                continue
            text = elem_to_toc_text(item)
            parent = parent_for_level(lvl)
            if item_at_top(item):
                dirtied, elem_id = False, None
            else:
                dirtied, elem_id = ensure_id(item, all_ids)
            item_dirtied = dirtied or item_dirtied
            toc = parent.add(text, name, elem_id)
            node_level_map[toc] = lvl
            toc.dest_exists = True

        if item_dirtied:
            container.commit_item(name, keep_parsed=True)

    return tocroot
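
As a quick illustration of the API described in the docstring, the sketch below drives from_xpaths() by hand. The book path is hypothetical, and it assumes calibre's polish container helper (calibre.ebooks.oeb.polish.container.get_container) plus TOC nodes exposing the (text, name, elem_id) triple passed to add() as title, dest and frag; treat it as a sketch, not calibre's documented API.

# Minimal usage sketch for from_xpaths(); the input path is hypothetical and
# the container/TOC attribute names are assumptions based on the code above.
from calibre.ebooks.oeb.polish.container import get_container

container = get_container('/path/to/book.epub')  # hypothetical book
toc_root = from_xpaths(container, ['//h:h1', '//h:h2', '//h:h3'])

def walk(node, depth=0):
    # Assumes a TOC node iterates over its children; title/dest/frag mirror
    # the (text, name, elem_id) arguments passed to parent.add() above.
    for child in node:
        print('  ' * depth + (child.title or ''), child.dest, child.frag)
        walk(child, depth + 1)

walk(toc_root)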
Example #15
def validate_xpath_selector(val):
    try:
        XPath(val)
    except Exception:
        return _('{} is not a valid XPath selector').format(val)
Example #16
File: toc.py Project: JapaChin/calibre
    def __init__(self, oeb, opts, replace_previous_inline_toc=False, ignore_existing_toc=False):
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.title = opts.toc_title or DEFAULT_TITLE
        self.at_start = opts.mobi_toc_at_start
        self.generated_item = None
        self.added_toc_guide_entry = False
        self.has_toc = oeb.toc and oeb.toc.count() > 1

        self.tocitem = tocitem = None
        if find_previous_calibre_inline_toc:
            tocitem = self.tocitem = find_previous_calibre_inline_toc(oeb)
        if ignore_existing_toc and "toc" in oeb.guide:
            oeb.guide.remove("toc")

        if "toc" in oeb.guide:
            # Remove spurious toc entry from guide if it is not in spine or it
            # does not have any hyperlinks
            href = urlnormalize(oeb.guide["toc"].href.partition("#")[0])
            if href in oeb.manifest.hrefs:
                item = oeb.manifest.hrefs[href]
                if hasattr(item.data, "xpath") and XPath("//h:a[@href]")(item.data):
                    if oeb.spine.index(item) < 0:
                        oeb.spine.add(item, linear=False)
                    return
                elif self.has_toc:
                    oeb.guide.remove("toc")
            else:
                oeb.guide.remove("toc")

        if not self.has_toc or "toc" in oeb.guide or opts.no_inline_toc or getattr(opts, "mobi_passthrough", False):
            return

        self.log("\tGenerating in-line ToC")

        embed_css = ""
        s = getattr(oeb, "store_embed_font_rules", None)
        if getattr(s, "body_font_family", None):
            css = [x.cssText for x in s.rules] + ["body { font-family: %s }" % s.body_font_family]
            embed_css = "\n\n".join(css)

        root = etree.fromstring(
            TEMPLATE.format(xhtmlns=XHTML_NS, title=self.title, embed_css=embed_css, extra_css=(opts.extra_css or ""))
        )
        parent = XPath("//h:ul")(root)[0]
        parent.text = "\n\t"
        for child in self.oeb.toc:
            self.process_toc_node(child, parent)

        if tocitem is not None:
            href = tocitem.href
            if oeb.spine.index(tocitem) > -1:
                oeb.spine.remove(tocitem)
            tocitem.data = root
        else:
            id, href = oeb.manifest.generate("contents", "contents.xhtml")
            tocitem = self.generated_item = oeb.manifest.add(id, href, XHTML_MIME, data=root)
        if self.at_start:
            oeb.spine.insert(0, tocitem, linear=True)
        else:
            oeb.spine.add(tocitem, linear=False)

        oeb.guide.add("toc", "Table of Contents", href)
Example #17
    def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize,
                     item_id):
        if not isinstance(node.tag, basestring) \
           or namespace(node.tag) != XHTML_NS:
            return
        tag = barename(node.tag)
        style = stylizer.style(node)
        cssdict = style.cssdict()
        try:
            font_size = style['font-size']
        except:
            font_size = self.sbase if self.sbase is not None else \
                self.context.source.fbase
        if tag == 'body' and isinstance(font_size, (int, float)):
            stylizer.body_font_size = font_size
        if 'align' in node.attrib:
            if tag != 'img':
                cssdict['text-align'] = node.attrib['align']
            else:
                val = node.attrib['align']
                if val in ('middle', 'bottom', 'top'):
                    cssdict['vertical-align'] = val
                elif val in ('left', 'right'):
                    cssdict['float'] = val
            del node.attrib['align']
        if node.tag == XHTML('font'):
            tags = [
                'descendant::h:%s' % x
                for x in ('p', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5',
                          'h6', 'ol', 'ul', 'dl', 'blockquote')
            ]
            tag = 'div' if XPath('|'.join(tags))(node) else 'span'
            node.tag = XHTML(tag)
            if 'size' in node.attrib:

                def force_int(raw):
                    return int(re.search(r'([0-9+-]+)', raw).group(1))

                size = node.attrib['size'].strip()
                if size:
                    fnums = self.context.source.fnums
                    if size[0] in ('+', '-'):
                        # Oh, the warcrimes
                        try:
                            esize = 3 + force_int(size)
                        except:
                            esize = 3
                        if esize < 1:
                            esize = 1
                        if esize > 7:
                            esize = 7
                        font_size = fnums[esize]
                    else:
                        try:
                            font_size = fnums[force_int(size)]
                        except:
                            font_size = fnums[3]
                    cssdict['font-size'] = '%.1fpt' % font_size
                del node.attrib['size']
            if 'face' in node.attrib:
                cssdict['font-family'] = node.attrib['face']
                del node.attrib['face']
        if 'color' in node.attrib:
            try:
                cssdict['color'] = Property('color',
                                            node.attrib['color']).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib['color']
        if 'bgcolor' in node.attrib:
            try:
                cssdict['background-color'] = Property(
                    'background-color', node.attrib['bgcolor']).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib['bgcolor']
        if cssdict.get('font-weight', '').lower() == 'medium':
            cssdict[
                'font-weight'] = 'normal'  # ADE chokes on font-weight medium

        fsize = font_size
        is_drop_cap = (cssdict.get('float', None) == 'left'
                       and 'font-size' in cssdict and len(node) == 0
                       and node.text and len(node.text) == 1)
        # Detect drop caps generated by the docx input plugin
        if (node.tag and node.tag.endswith('}p') and len(node) == 0
                and node.text and len(node.text.strip()) == 1 and not node.tail
                and 'line-height' in cssdict and 'font-size' in cssdict):
            dp = node.getparent()
            if dp.tag and dp.tag.endswith('}div') and len(
                    dp) == 1 and not dp.text:
                if stylizer.style(dp).cssdict().get('float', None) == 'left':
                    is_drop_cap = True
        if not self.context.disable_font_rescaling and not is_drop_cap:
            _sbase = self.sbase if self.sbase is not None else \
                self.context.source.fbase
            dyn_rescale = dynamic_rescale_factor(node)
            if dyn_rescale is not None:
                fsize = self.fmap[_sbase]
                fsize *= dyn_rescale
                cssdict['font-size'] = '%0.5fem' % (fsize / psize)
                psize = fsize
            elif 'font-size' in cssdict or tag == 'body':
                fsize = self.fmap[font_size]
                try:
                    cssdict['font-size'] = "%0.5fem" % (fsize / psize)
                except ZeroDivisionError:
                    cssdict['font-size'] = '%.1fpt' % fsize
                psize = fsize

        try:
            minlh = self.context.minimum_line_height / 100.
            if not is_drop_cap and style['line-height'] < minlh * fsize:
                cssdict['line-height'] = str(minlh)
        except:
            self.oeb.logger.exception('Failed to set minimum line-height')

        if cssdict:
            for x in self.filter_css:
                cssdict.pop(x, None)

        if cssdict:
            if self.lineh and self.fbase and tag != 'body':
                self.clean_edges(cssdict, style, psize)
            if 'display' in cssdict and cssdict['display'] == 'in-line':
                cssdict['display'] = 'inline'
            if self.unfloat and 'float' in cssdict \
               and cssdict.get('display', 'none') != 'none':
                del cssdict['display']
            if self.untable and 'display' in cssdict \
               and cssdict['display'].startswith('table'):
                display = cssdict['display']
                if display == 'table-cell':
                    cssdict['display'] = 'inline'
                else:
                    cssdict['display'] = 'block'
            if 'vertical-align' in cssdict \
               and cssdict['vertical-align'] == 'sup':
                cssdict['vertical-align'] = 'super'
        if self.lineh and 'line-height' not in cssdict:
            lineh = self.lineh / psize
            cssdict['line-height'] = "%0.5fem" % lineh

        if (self.context.remove_paragraph_spacing
                or self.context.insert_blank_line) and tag in ('p', 'div'):
            if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
                for prop in ('margin', 'padding', 'border'):
                    for edge in ('top', 'bottom'):
                        cssdict['%s-%s' % (prop, edge)] = '0pt'
            if self.context.insert_blank_line:
                cssdict['margin-top'] = cssdict['margin-bottom'] = \
                    '%fem'%self.context.insert_blank_line_size
            indent_size = self.context.remove_paragraph_spacing_indent_size
            keep_indents = indent_size < 0.0
            if (self.context.remove_paragraph_spacing and not keep_indents
                    and cssdict.get('text-align',
                                    None) not in ('center', 'right')):
                cssdict['text-indent'] = "%1.1fem" % indent_size

        pseudo_classes = style.pseudo_classes(self.filter_css)
        if cssdict or pseudo_classes:
            keep_classes = set()

            if cssdict:
                items = sorted(cssdict.items())
                css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
                classes = node.get('class', '').strip() or 'calibre'
                klass = ascii_text(
                    STRIPNUM.sub('',
                                 classes.split()[0].replace('_', '')))
                if css in styles:
                    match = styles[css]
                else:
                    match = klass + str(names[klass] or '')
                    styles[css] = match
                    names[klass] += 1
                node.attrib['class'] = match
                keep_classes.add(match)

            for psel, cssdict in pseudo_classes.iteritems():
                items = sorted(cssdict.iteritems())
                css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
                pstyles = pseudo_styles[psel]
                if css in pstyles:
                    match = pstyles[css]
                else:
                    # We have to use a different class for each psel as
                    # otherwise you can have incorrect styles for a situation
                    # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                    # If the pcalibre class for a:hover and a:link is the same,
                    # then the class attribute for a.x tags will contain both
                    # that class and the class for a.x:hover, which is wrong.
                    klass = 'pcalibre'
                    match = klass + str(names[klass] or '')
                    pstyles[css] = match
                    names[klass] += 1
                keep_classes.add(match)
                node.attrib['class'] = ' '.join(keep_classes)

        elif 'class' in node.attrib:
            del node.attrib['class']
        if 'style' in node.attrib:
            del node.attrib['style']
        for child in node:
            self.flatten_node(child, stylizer, names, styles, pseudo_styles,
                              psize, item_id)
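
The styles/names bookkeeping used above when assigning generated class names is easy to see in isolation. The following standalone sketch (not calibre code) reproduces just that de-duplication scheme:

# Standalone sketch of the class-name de-duplication performed by
# flatten_node(): identical CSS text always maps back to the same generated
# class, while the per-prefix counter keeps distinct CSS blocks apart.
from collections import defaultdict

styles = {}               # css text -> generated class name
names = defaultdict(int)  # class prefix -> next numeric suffix

def class_for(css, prefix='calibre'):
    if css in styles:
        return styles[css]
    match = prefix + str(names[prefix] or '')
    styles[css] = match
    names[prefix] += 1
    return match

print(class_for('font-size: 1em'))     # -> calibre
print(class_for('font-weight: bold'))  # -> calibre1
print(class_for('font-size: 1em'))     # -> calibre (reused)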
Example #18
 def __init__(self):
     self.html_tags = XPath('descendant::h:*')
Example #19
 def __call__(self, oeb, context):
     bx = XPath('//h:body')
     for x in oeb.manifest.items:
         if x.media_type in OEB_DOCS:
             for body in bx(x.data):
                 self.unsmarten(body)
Example #20
def attribute_replacement(test, parse_function):
    markup = '<html><body><svg viewbox="0"></svg><svg xmlns="%s" viewbox="1">' % SVG_NS
    root = parse_function(markup)
    err = 'SVG attributes not normalized, parsed markup:\n' + etree.tostring(
        root, encoding='unicode')
    test.assertEqual(len(XPath('//svg:svg[@viewBox]')(root)), 2, err)
Example #21
File: toc.py Project: pombreda/calibre-1
    def __init__(self,
                 oeb,
                 opts,
                 replace_previous_inline_toc=False,
                 ignore_existing_toc=False):
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.title = opts.toc_title or DEFAULT_TITLE
        self.at_start = opts.mobi_toc_at_start
        self.generated_item = None
        self.added_toc_guide_entry = False
        self.has_toc = oeb.toc and oeb.toc.count() > 1

        self.tocitem = tocitem = None
        if find_previous_calibre_inline_toc:
            tocitem = self.tocitem = find_previous_calibre_inline_toc(oeb)
        if ignore_existing_toc and 'toc' in oeb.guide:
            oeb.guide.remove('toc')

        if 'toc' in oeb.guide:
            # Remove spurious toc entry from guide if it is not in spine or it
            # does not have any hyperlinks
            href = urlnormalize(oeb.guide['toc'].href.partition('#')[0])
            if href in oeb.manifest.hrefs:
                item = oeb.manifest.hrefs[href]
                if (hasattr(item.data, 'xpath')
                        and XPath('//h:a[@href]')(item.data)):
                    if oeb.spine.index(item) < 0:
                        oeb.spine.add(item, linear=False)
                    return
                elif self.has_toc:
                    oeb.guide.remove('toc')
            else:
                oeb.guide.remove('toc')

        if (not self.has_toc or 'toc' in oeb.guide or opts.no_inline_toc
                or getattr(opts, 'mobi_passthrough', False)):
            return

        self.log('\tGenerating in-line ToC')

        embed_css = ''
        s = getattr(oeb, 'store_embed_font_rules', None)
        if getattr(s, 'body_font_family', None):
            css = [x.cssText for x in s.rules
                   ] + ['body { font-family: %s }' % s.body_font_family]
            embed_css = '\n\n'.join(css)

        root = etree.fromstring(
            TEMPLATE.format(xhtmlns=XHTML_NS,
                            title=self.title,
                            embed_css=embed_css,
                            extra_css=(opts.extra_css or '')))
        parent = XPath('//h:ul')(root)[0]
        parent.text = '\n\t'
        for child in self.oeb.toc:
            self.process_toc_node(child, parent)

        if tocitem is not None:
            href = tocitem.href
            if oeb.spine.index(tocitem) > -1:
                oeb.spine.remove(tocitem)
            tocitem.data = root
        else:
            id, href = oeb.manifest.generate('contents', 'contents.xhtml')
            tocitem = self.generated_item = oeb.manifest.add(id,
                                                             href,
                                                             XHTML_MIME,
                                                             data=root)
        if self.at_start:
            oeb.spine.insert(0, tocitem, linear=True)
        else:
            oeb.spine.add(tocitem, linear=False)

        oeb.guide.add('toc', 'Table of Contents', href)
Example #22
def comments(test, parse_function):
    markup = '<html><!-- -- ---><body/></html>'
    root = parse_function(markup)
    test.assertEqual(len(XPath('//h:body')(root)), 1,
                     'Failed to parse with comment containing dashes')
    test.assertEqual(len(tuple(root.iterdescendants(etree.Comment))), 1)
Example #23
def all_stylesheets(container, name):
    for link in XPath('//h:head/h:link[@href]')(container.parsed(name)):
        name = container.href_to_name(link.get('href'), name)
        typ = link.get('type', 'text/css')
        if typ == 'text/css':
            yield name
Example #24
def namespaces(test, parse_function):
    ae = test.assertEqual

    def match_and_prefix(root, xpath, prefix, err=''):
        matches = XPath(xpath)(root)
        ae(len(matches), 1, err)
        ae(matches[0].prefix, prefix, err)

    markup = ''' <html xmlns="{xhtml}"><head><body id="test"></html> '''.format(
        xhtml=XHTML_NS)
    root = parse_function(markup)
    ae(
        len(XPath('//h:body[@id="test"]')(root)), 1,
        'Incorrect parsing, parsed markup:\n' +
        etree.tostring(root, encoding='unicode'))
    match_and_prefix(root, '//h:body[@id="test"]', None)

    markup = '''
    <html xmlns="{xhtml}"><head><body id="test">
    <svg:svg xmlns:svg="{svg}"><svg:image xmlns:xlink="{xlink}" xlink:href="xxx"/></svg:svg>
    '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
    root = parse_function(markup)
    err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(
        root, encoding='unicode')
    match_and_prefix(root, '//h:body[@id="test"]', None, err)
    match_and_prefix(root, '//svg:svg', 'svg', err)
    match_and_prefix(root, '//svg:image[@xl:href]', 'svg', err)

    markup = '''
    <html xmlns="{xhtml}"><head><body id="test">
    <svg xmlns="{svg}" xmlns:xlink="{xlink}" ><image xlink:href="xxx"/></svg>
    '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
    root = parse_function(markup)
    err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(
        root, encoding='unicode')
    match_and_prefix(root, '//h:body[@id="test"]', None, err)
    match_and_prefix(root, '//svg:svg', None, err)
    match_and_prefix(root, '//svg:image[@xl:href]', None, err)

    markup = '<html><body><svg><image xlink:href="xxx"></svg>'
    root = parse_function(markup)
    err = 'Namespaces not created, parsed markup:\n' + etree.tostring(
        root, encoding='unicode')
    match_and_prefix(root, '//svg:svg', None, err)
    match_and_prefix(root, '//svg:image[@xl:href]', None, err)
    if parse_function is parse:
        image = XPath('//svg:image')(root)[0]
        ae(image.nsmap, {'xlink': XLINK_NS, None: SVG_NS})

    root = parse_function('<html id="a"><p><html xmlns:x="y" lang="en"><p>')
    err = 'Multiple HTML tags not handled, parsed markup:\n' + etree.tostring(
        root, encoding='unicode')
    match_and_prefix(root, '//h:html', None, err)
    match_and_prefix(root, '//h:html[@lang]', None, err)
    match_and_prefix(root, '//h:html[@id]', None, err)

    # if parse_function is not html5_parse:
    #     markup = '<html:html xmlns:html="{html}" id="a"><html:body><html:p></html:p></html:body></html>'.format(html=XHTML_NS)
    #     root = parse_function(markup)
    #     err = 'HTML namespace prefixed, parsed markup:\n' + etree.tostring(root, encoding='unicode')
    #     match_and_prefix(root, '//h:html', None, err)

    markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
    root = parse_function(markup)
    err = 'Arbitrary namespaces not preserved, parsed markup:\n' + etree.tostring(
        root, encoding='unicode')

    def xpath(expr):
        return etree.XPath(expr, namespaces={'ns1': 'NS', 'ns2': 'NS2'})(root)

    ae(len(xpath('//ns1:tag1')), 1, err)
    ae(len(xpath('//ns1:tag2')), 1, err)
    ae(len(xpath('//ns2:tag3')), 1, err)
    ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
    ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)
    # for tag in root.iter():
    #     if 'NS' in tag.tag:
    #         ae('ns1', tag.prefix)

    markup = '<html xml:lang="en"><body><p lang="de"><p xml:lang="es"><p lang="en" xml:lang="de">'
    root = parse_function(markup)
    err = 'xml:lang not converted to lang, parsed markup:\n' + etree.tostring(
        root, encoding='unicode')
    ae(len(root.xpath('//*[@lang="en"]')), 2, err)
    ae(len(root.xpath('//*[@lang="de"]')), 1, err)
    ae(len(root.xpath('//*[@lang="es"]')), 1, err)
Example #25
def do_split(split_point, log, before=True):
    '''
    Split tree into a *before* and an *after* tree at ``split_point``.

    :param split_point: The Element at which to split
    :param before: If True, the tree is split before ``split_point``; otherwise it is split after it
    :return: before_tree, after_tree
    '''
    if before:
        # We cannot adjust for after since moving an after split point to a
        # parent will cause breakage if the parent contains any content
        # after the original split point
        split_point = adjust_split_point(split_point, log)
    tree = split_point.getroottree()
    path = tree.getpath(split_point)

    tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree)
    root, root2 = tree.getroot(), tree2.getroot()
    body, body2 = map(get_body, (root, root2))
    split_point = root.xpath(path)[0]
    split_point2 = root2.xpath(path)[0]

    def nix_element(elem, top=True):
        # Remove elem unless top is False in which case replace elem by its
        # children
        parent = elem.getparent()
        if top:
            parent.remove(elem)
        else:
            index = parent.index(elem)
            parent[index:index + 1] = list(elem.iterchildren())

    # Tree 1
    hit_split_point = False
    keep_descendants = False
    split_point_descendants = frozenset(split_point.iterdescendants())
    for elem in tuple(body.iterdescendants()):
        if elem is split_point:
            hit_split_point = True
            if before:
                nix_element(elem)
            else:
                # We want to keep the descendants of the split point in
                # Tree 1
                keep_descendants = True
                # We want the split point element, but not its tail
                elem.tail = '\n'

            continue
        if hit_split_point:
            if keep_descendants:
                if elem in split_point_descendants:
                    # elem is a descendant keep it
                    continue
                else:
                    # We are out of split_point, so prevent further set
                    # lookups of split_point_descendants
                    keep_descendants = False
            nix_element(elem)

    # Tree 2
    ancestors = frozenset(XPath('ancestor::*')(split_point2))
    for elem in tuple(body2.iterdescendants()):
        if elem is split_point2:
            if not before:
                # Keep the split point element's tail, if it contains non-whitespace
                # text
                tail = elem.tail
                if tail and not tail.isspace():
                    parent = elem.getparent()
                    idx = parent.index(elem)
                    if idx == 0:
                        parent.text = (parent.text or '') + tail
                    else:
                        sib = parent[idx - 1]
                        sib.tail = (sib.tail or '') + tail
                # Remove the element itself
                nix_element(elem)
            break
        if elem in ancestors:
            # We have to preserve the ancestors as they could have CSS
            # styles that are inherited/applicable, like font or
            # width. So we only remove the text, if any.
            elem.text = '\n'
        else:
            nix_element(elem, top=False)

    body2.text = '\n'

    return tree, tree2
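
A hypothetical driver for do_split(), assuming the surrounding module's helpers (adjust_split_point, get_body and the namespace-aware XPath) are importable alongside it; the markup and split point below are made up for illustration:

# Hypothetical driver: split a small XHTML document at its first <h1>.
# With before=True the heading and everything after it end up in the second tree.
import logging

from lxml import etree

log = logging.getLogger('split')
root = etree.fromstring(
    '<html xmlns="http://www.w3.org/1999/xhtml"><body>'
    '<p>front matter</p><h1>Chapter 1</h1><p>text</p></body></html>')
split_point = root.xpath('//*[local-name()="h1"]')[0]

before_tree, after_tree = do_split(split_point, log, before=True)
# before_tree ends just before the <h1>; after_tree starts at it.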
Example #26
    def flatten_node(self,
                     node,
                     stylizer,
                     names,
                     styles,
                     pseudo_styles,
                     psize,
                     item_id,
                     recurse=True):
        if not isinstance(node.tag, string_or_bytes) \
           or namespace(node.tag) != XHTML_NS:
            return
        tag = barename(node.tag)
        style = stylizer.style(node)
        cssdict = style.cssdict()
        try:
            font_size = style['font-size']
        except:
            font_size = self.sbase if self.sbase is not None else \
                self.context.source.fbase
        if tag == 'body' and isinstance(font_size, numbers.Number):
            stylizer.body_font_size = font_size
        if 'align' in node.attrib:
            if tag != 'img':
                cssdict['text-align'] = node.attrib['align']
                if cssdict['text-align'] == 'center':
                    # align=center causes tables to be center aligned,
                    # which text-align does not. And the ever trustworthy Word
                    # uses this construct in its HTML output. See
                    # https://bugs.launchpad.net/bugs/1569583
                    if tag == 'table':
                        if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
                            cssdict['margin-left'] = cssdict[
                                'margin-right'] = 'auto'
                    else:
                        for table in node.iterchildren(XHTML("table")):
                            ts = stylizer.style(table)
                            if ts.get('margin-left') is None and ts.get(
                                    'margin-right') is None:
                                ts.set('margin-left', 'auto')
                                ts.set('margin-right', 'auto')
            else:
                val = node.attrib['align']
                if val in ('middle', 'bottom', 'top'):
                    cssdict['vertical-align'] = val
                elif val in ('left', 'right'):
                    cssdict['float'] = val
            del node.attrib['align']
        if 'valign' in node.attrib and tag == 'td':
            if cssdict.get('vertical-align') == 'inherit':
                cssdict['vertical-align'] = node.attrib['valign']
            del node.attrib['valign']
        if node.tag == XHTML('font'):
            tags = [
                'descendant::h:%s' % x
                for x in ('p', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5',
                          'h6', 'ol', 'ul', 'dl', 'blockquote')
            ]
            tag = 'div' if XPath('|'.join(tags))(node) else 'span'
            node.tag = XHTML(tag)
            if 'size' in node.attrib:

                def force_int(raw):
                    return int(re.search(r'([0-9+-]+)', raw).group(1))

                size = node.attrib['size'].strip()
                if size:
                    fnums = self.context.source.fnums
                    if size[0] in ('+', '-'):
                        # Oh, the warcrimes
                        try:
                            esize = 3 + force_int(size)
                        except:
                            esize = 3
                        if esize < 1:
                            esize = 1
                        if esize > 7:
                            esize = 7
                        font_size = fnums[esize]
                    else:
                        try:
                            font_size = fnums[force_int(size)]
                        except:
                            font_size = fnums[3]
                    cssdict['font-size'] = '%.1fpt' % font_size
                del node.attrib['size']
            if 'face' in node.attrib:
                cssdict['font-family'] = node.attrib['face']
                del node.attrib['face']
        if 'color' in node.attrib:
            try:
                cssdict['color'] = Property('color',
                                            node.attrib['color']).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib['color']
        if 'bgcolor' in node.attrib:
            try:
                cssdict['background-color'] = Property(
                    'background-color', node.attrib['bgcolor']).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib['bgcolor']
        if tag == 'ol' and 'type' in node.attrib:
            del node.attrib['type']
        if cssdict.get('font-weight', '').lower() == 'medium':
            cssdict[
                'font-weight'] = 'normal'  # ADE chokes on font-weight medium

        fsize = font_size
        is_drop_cap = (
            cssdict.get('float', None) == 'left' and 'font-size' in cssdict
            and len(node) == 0 and node.text and
            (len(node.text) == 1 or
             (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
        # Detect drop caps generated by the docx input plugin
        if node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text and len(node.text.strip()) == 1 and \
                not node.tail and 'line-height' in cssdict and 'font-size' in cssdict:
            dp = node.getparent()
            if dp.tag and dp.tag.endswith('}div') and len(
                    dp) == 1 and not dp.text:
                if stylizer.style(dp).cssdict().get('float', None) == 'left':
                    is_drop_cap = True
        if not self.context.disable_font_rescaling and not is_drop_cap:
            _sbase = self.sbase if self.sbase is not None else \
                self.context.source.fbase
            dyn_rescale = node.attrib.pop('data-calibre-rescale', None)
            if dyn_rescale is not None:
                try:
                    dyn_rescale = float(dyn_rescale) / 100
                except Exception:
                    dyn_rescale = 1
                fsize = self.fmap[_sbase]
                fsize *= dyn_rescale
                cssdict['font-size'] = '%0.5fem' % (fsize / psize)
                psize = fsize
            elif 'font-size' in cssdict or tag == 'body':
                fsize = self.fmap[font_size]
                try:
                    cssdict['font-size'] = "%0.5fem" % (fsize / psize)
                except ZeroDivisionError:
                    cssdict['font-size'] = '%.1fpt' % fsize
                psize = fsize

        try:
            minlh = self.context.minimum_line_height / 100.
            slh = style['line-height']
            if not is_drop_cap and isinstance(
                    slh, numbers.Number) and slh < minlh * fsize:
                cssdict['line-height'] = unicode_type(minlh)
        except Exception:
            self.oeb.logger.exception('Failed to set minimum line-height')

        if cssdict:
            for x in self.filter_css:
                popval = cssdict.pop(x, None)
                if self.body_font_family and popval and x == 'font-family' \
                    and popval.partition(',')[0][1:-1] == self.body_font_family.partition(',')[0][1:-1]:
                    cssdict[x] = popval

        if cssdict:
            if self.lineh and self.fbase and tag != 'body':
                self.clean_edges(cssdict, style, psize)
            if 'display' in cssdict and cssdict['display'] == 'in-line':
                cssdict['display'] = 'inline'
            if self.unfloat and 'float' in cssdict \
               and cssdict.get('display', 'none') != 'none':
                del cssdict['display']
            if self.untable and 'display' in cssdict \
               and cssdict['display'].startswith('table'):
                display = cssdict['display']
                if display == 'table-cell':
                    cssdict['display'] = 'inline'
                else:
                    cssdict['display'] = 'block'
            if 'vertical-align' in cssdict \
               and cssdict['vertical-align'] == 'sup':
                cssdict['vertical-align'] = 'super'
        if self.lineh and 'line-height' not in cssdict:
            lineh = self.lineh / psize
            cssdict['line-height'] = "%0.5fem" % lineh

        if (self.context.remove_paragraph_spacing
                or self.context.insert_blank_line) and tag in ('p', 'div'):
            if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
                for prop in ('margin', 'padding', 'border'):
                    for edge in ('top', 'bottom'):
                        cssdict['%s-%s' % (prop, edge)] = '0pt'
            if self.context.insert_blank_line:
                cssdict['margin-top'] = cssdict['margin-bottom'] = \
                    '%fem'%self.context.insert_blank_line_size
            indent_size = self.context.remove_paragraph_spacing_indent_size
            keep_indents = indent_size < 0.0
            if (self.context.remove_paragraph_spacing and not keep_indents
                    and cssdict.get('text-align',
                                    None) not in ('center', 'right')):
                cssdict['text-indent'] = "%1.1fem" % indent_size

        pseudo_classes = style.pseudo_classes(self.filter_css)
        if cssdict or pseudo_classes:
            keep_classes = set()

            if cssdict:
                items = sorted(iteritems(cssdict))
                css = ';\n'.join(u'%s: %s' % (key, val) for key, val in items)
                classes = node.get('class', '').strip() or 'calibre'
                classes_list = classes.split()
                # lower() because otherwise if the document uses the same class
                # name with different case, both cases will apply, leading
                # to incorrect results.
                klass = ascii_text(STRIPNUM.sub(
                    '', classes_list[0])).lower().strip().replace(' ', '_')
                if css in styles:
                    match = styles[css]
                else:
                    match = klass + unicode_type(names[klass] or '')
                    styles[css] = match
                    names[klass] += 1
                node.attrib['class'] = match
                keep_classes.add(match)

            for psel, cssdict in iteritems(pseudo_classes):
                items = sorted(iteritems(cssdict))
                css = ';\n'.join('%s: %s' % (key, val) for key, val in items)
                pstyles = pseudo_styles[psel]
                if css in pstyles:
                    match = pstyles[css]
                else:
                    # We have to use a different class for each psel as
                    # otherwise you can have incorrect styles for a situation
                    # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                    # If the pcalibre class for a:hover and a:link is the same,
                    # then the class attribute for a.x tags will contain both
                    # that class and the class for a.x:hover, which is wrong.
                    klass = 'pcalibre'
                    match = klass + unicode_type(names[klass] or '')
                    pstyles[css] = match
                    names[klass] += 1
                keep_classes.add(match)
                node.attrib['class'] = ' '.join(keep_classes)

        elif 'class' in node.attrib:
            del node.attrib['class']
        if 'style' in node.attrib:
            del node.attrib['style']
        if recurse:
            for child in node:
                self.flatten_node(child, stylizer, names, styles,
                                  pseudo_styles, psize, item_id)
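
The legacy align handling above (including the table workaround referenced by the Launchpad bug comment) reduces to a small attribute-to-CSS mapping. A simplified standalone sketch of just that mapping, independent of the stylizer machinery:

# Simplified sketch of the align -> CSS mapping in flatten_node(): images map
# align to vertical-align/float, tables with align=center also get auto
# margins (text-align alone does not centre a table), everything else becomes
# text-align.
def css_for_align(tag, val):
    css = {}
    if tag == 'img':
        if val in ('middle', 'bottom', 'top'):
            css['vertical-align'] = val
        elif val in ('left', 'right'):
            css['float'] = val
    else:
        css['text-align'] = val
        if tag == 'table' and val == 'center':
            css['margin-left'] = css['margin-right'] = 'auto'
    return css

print(css_for_align('table', 'center'))  # text-align plus auto margins
print(css_for_align('p', 'right'))       # {'text-align': 'right'}
print(css_for_align('img', 'left'))      # {'float': 'left'}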
Example #27
 def match_and_prefix(root, xpath, prefix, err=''):
     matches = XPath(xpath)(root)
     ae(len(matches), 1, err)
     ae(matches[0].prefix, prefix, err)
Example #28
def from_xpaths(container, xpaths):
    tocroot = TOC()
    xpaths = [XPath(xp) for xp in xpaths]
    level_prev = {i + 1: None for i in xrange(len(xpaths))}
    level_prev[0] = tocroot

    # Find those levels that have no elements in all spine items
    maps = OrderedDict()
    empty_levels = {i + 1 for i, xp in enumerate(xpaths)}
    for spinepath in container.spine_items:
        name = container.abspath_to_name(spinepath)
        root = container.parsed(name)
        level_item_map = maps[name] = {
            i + 1: frozenset(xp(root))
            for i, xp in enumerate(xpaths)
        }
        for lvl, elems in level_item_map.iteritems():
            if elems:
                empty_levels.discard(lvl)
    # Remove empty levels from all level_maps
    if empty_levels:
        for name, lmap in tuple(maps.iteritems()):
            lmap = {
                lvl: items
                for lvl, items in lmap.iteritems() if lvl not in empty_levels
            }
            lmap = sorted(lmap.iteritems(), key=itemgetter(0))
            lmap = {i + 1: items for i, (l, items) in enumerate(lmap)}
            maps[name] = lmap

    for name, level_item_map in maps.iteritems():
        root = container.parsed(name)
        item_level_map = {
            e: i
            for i, elems in level_item_map.iteritems() for e in elems
        }
        item_dirtied = False

        for item in root.iterdescendants(etree.Element):
            lvl = plvl = item_level_map.get(item, None)
            if lvl is None:
                continue
            parent = None
            while parent is None:
                plvl -= 1
                parent = level_prev[plvl]
            lvl = plvl + 1
            if item_at_top(item):
                dirtied, elem_id = False, None
            else:
                dirtied, elem_id = ensure_id(item)
            text = elem_to_toc_text(item)
            item_dirtied = dirtied or item_dirtied
            toc = parent.add(text, name, elem_id)
            toc.dest_exists = True
            level_prev[lvl] = toc
            for i in xrange(lvl + 1, len(xpaths) + 1):
                level_prev[i] = None

        if item_dirtied:
            container.commit_item(name, keep_parsed=True)

    return tocroot
Example #29
    def workaround_ade_quirks(self):  # {{{
        '''
        Perform various markup transforms to get the output to render correctly
        in the quirky ADE.
        '''
        from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote

        stylesheet = self.oeb.manifest.main_stylesheet

        # ADE cries big wet tears when it encounters an invalid fragment
        # identifier in the NCX toc.
        frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
        for node in self.oeb.toc.iter():
            href = getattr(node, 'href', None)
            if hasattr(href, 'partition'):
                base, _, frag = href.partition('#')
                frag = urlunquote(frag)
                if frag and frag_pat.match(frag) is None:
                    self.log.warn(
                        'Removing invalid fragment identifier %r from TOC' %
                        frag)
                    node.href = base

        for x in self.oeb.spine:
            root = x.data
            body = XPath('//h:body')(root)
            if body:
                body = body[0]

            if hasattr(body, 'xpath'):
                # remove <img> tags with empty src elements
                bad = []
                for x in XPath('//h:img')(body):
                    src = x.get('src', '').strip()
                    if src in ('', '#') or src.startswith('http:'):
                        bad.append(x)
                for img in bad:
                    img.getparent().remove(img)

                # Add id attribute to <a> tags that have name
                for x in XPath('//h:a[@name]')(body):
                    if not x.get('id', False):
                        x.set('id', x.get('name'))
                    # The delightful epubcheck has started complaining about <a> tags that
                    # have name attributes.
                    x.attrib.pop('name')

                # Replace <br> that are children of <body> as ADE doesn't handle them
                for br in XPath('./h:br')(body):
                    if br.getparent() is None:
                        continue
                    try:
                        prior = br.itersiblings(preceding=True).next()
                        priortag = barename(prior.tag)
                        priortext = prior.tail
                    except:
                        priortag = 'body'
                        priortext = body.text
                    if priortext:
                        priortext = priortext.strip()
                    br.tag = XHTML('p')
                    br.text = u'\u00a0'
                    style = br.get('style', '').split(';')
                    style = filter(None, map(lambda x: x.strip(), style))
                    style.append('margin:0pt; border:0pt')
                    # If the prior tag is a block (including a <br> we replaced)
                    # then this <br> replacement should have a 1-line height.
                    # Otherwise it should have no height.
                    if not priortext and priortag in block_level_tags:
                        style.append('height:1em')
                    else:
                        style.append('height:0pt')
                    br.set('style', '; '.join(style))

            for tag in XPath('//h:embed')(root):
                tag.getparent().remove(tag)
            for tag in XPath('//h:object')(root):
                if tag.get('type', '').lower().strip() in {
                        'image/svg+xml', 'application/svg+xml'
                }:
                    continue
                tag.getparent().remove(tag)

            for tag in XPath('//h:title|//h:style')(root):
                if not tag.text:
                    tag.getparent().remove(tag)
            for tag in XPath('//h:script')(root):
                if (not tag.text and not tag.get('src', False)
                        and tag.get('type', None) != 'text/x-mathjax-config'):
                    tag.getparent().remove(tag)
            for tag in XPath('//h:body/descendant::h:script')(root):
                tag.getparent().remove(tag)

            formchildren = XPath('./h:input|./h:button|./h:textarea|'
                                 './h:label|./h:fieldset|./h:legend')
            for tag in XPath('//h:form')(root):
                if formchildren(tag):
                    tag.getparent().remove(tag)
                else:
                    # Not a real form
                    tag.tag = XHTML('div')

            for tag in XPath('//h:center')(root):
                tag.tag = XHTML('div')
                tag.set('style', 'text-align:center')
            # ADE can't handle &amp; in an img url
            for tag in XPath('//h:img[@src]')(root):
                tag.set('src', tag.get('src', '').replace('&', ''))

            # ADE whimpers in fright when it encounters a <td> outside a
            # <table>
            in_table = XPath('ancestor::h:table')
            for tag in XPath('//h:td|//h:tr|//h:th')(root):
                if not in_table(tag):
                    tag.tag = XHTML('div')

            special_chars = re.compile(u'[\u200b\u00ad]')
            for elem in root.iterdescendants():
                if getattr(elem, 'text', False):
                    elem.text = special_chars.sub('', elem.text)
                    elem.text = elem.text.replace(u'\u2011', '-')
                if getattr(elem, 'tail', False):
                    elem.tail = special_chars.sub('', elem.tail)
                    elem.tail = elem.tail.replace(u'\u2011', '-')

            if stylesheet is not None:
                # ADE doesn't render lists correctly if they have left margins
                from cssutils.css import CSSRule
                for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
                    sel = '.' + lb.get('class')
                    for rule in stylesheet.data.cssRules.rulesOfType(
                            CSSRule.STYLE_RULE):
                        if sel == rule.selectorList.selectorText:
                            rule.style.removeProperty('margin-left')
                            # padding-left breaks rendering in webkit and gecko
                            rule.style.removeProperty('padding-left')
                # Change whitespace:pre to pre-wrap to accommodate readers that
                # cannot scroll horizontally
                for rule in stylesheet.data.cssRules.rulesOfType(
                        CSSRule.STYLE_RULE):
                    style = rule.style
                    ws = style.getPropertyValue('white-space')
                    if ws == 'pre':
                        style.setProperty('white-space', 'pre-wrap')
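
The NCX fragment check at the top of workaround_ade_quirks() can be exercised on its own; the sketch below reuses the same regular expression with a few made-up fragments:

# Same pattern as frag_pat above: ADE only tolerates fragment identifiers
# built from letters, digits and the characters - _ : .
import re

frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')

for frag in ('chapter-1', 'sec_2.1', 'bad fragment', '50%'):
    kept = frag_pat.match(frag) is not None
    print(frag, '-> kept' if kept else '-> stripped from the TOC href')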
Example #30
def subset_all_fonts(container, font_stats, report):
    remove = set()
    total_old = total_new = 0
    for name, mt in container.mime_map.iteritems():
        if mt in OEB_FONTS or name.rpartition('.')[-1].lower() in {
                'otf', 'ttf'
        }:
            chars = font_stats.get(name, set())
            path = container.name_path_map[name]
            total_old += os.path.getsize(path)
            if not chars:
                remove.add(name)
                report('Removed unused font: %s' % name)
                continue
            with open(path, 'r+b') as f:
                raw = f.read()
                font_name = get_font_names(raw)[-1]
                warnings = []
                container.log('Subsetting font: %s' % (font_name or name))
                try:
                    nraw, old_sizes, new_sizes = subset(raw,
                                                        chars,
                                                        warnings=warnings)
                except UnsupportedFont as e:
                    container.log.warning(
                        'Unsupported font: %s, ignoring.  Error: %s' %
                        (name, as_unicode(e)))
                    continue

                for w in warnings:
                    container.log.warn(w)
                olen = sum(old_sizes.itervalues())
                nlen = sum(new_sizes.itervalues())
                total_new += len(nraw)
                if nlen == olen:
                    report('The font %s was already subset' % font_name)
                else:
                    report(
                        'Decreased the font %s to %.1f%% of its original size'
                        % (font_name, nlen / olen * 100))
                f.seek(0), f.truncate(), f.write(nraw)

    for name in remove:
        container.remove_item(name)

    if remove:
        for name, mt in container.mime_map.iteritems():
            if mt in OEB_STYLES:
                sheet = container.parsed(name)
                if remove_font_face_rules(container, sheet, remove, name):
                    container.dirty(name)
            elif mt in OEB_DOCS:
                for style in XPath('//h:style')(container.parsed(name)):
                    if style.get('type',
                                 'text/css') == 'text/css' and style.text:
                        sheet = container.parse_css(style.text, name)
                        if remove_font_face_rules(container, sheet, remove,
                                                  name):
                            style.text = sheet.cssText
                            container.dirty(name)
    if total_old > 0:
        report('Reduced total font size to %.1f%% of original' %
               (total_new / total_old * 100))
    else:
        report('No embedded fonts found')