예제 #1
0
 def remove_first_image(self):
     """Remove the first image found in the spine.

     Spine documents matching ``JACKET_XPATH`` are skipped. Processing
     stops at the first document from which ``self.remove_images``
     reports a removal. If that document is left with a body that has
     no text and no ``img``/``svg`` elements, it is removed from the
     manifest, and the TOC entries and guide references pointing at it
     are removed as well. A warning is logged when no document yields
     an image.
     """
     emptied_doc = None
     for spine_item in self.oeb.spine:
         if XPath(JACKET_XPATH)(spine_item.data):
             continue
         if not self.remove_images(spine_item) > 0:
             continue
         self.log('Removed first image')
         bodies = XPath('//h:body')(spine_item.data)
         if bodies:
             leftover_text = xml2text(bodies[0]).strip()
             leftover_images = XPath('//h:img|//svg:svg')(spine_item.data)
             if not (leftover_text or leftover_images):
                 self.log('Removing %s as it has no content' %
                          spine_item.href)
                 self.oeb.manifest.remove(spine_item)
                 emptied_doc = spine_item
         # Only the first document that actually lost an image is handled
         break
     else:
         self.log.warn('Could not find first image to remove')

     if emptied_doc is None:
         return

     # Iterate over a copy because entries are removed while walking
     for toc_entry in list(self.oeb.toc):
         target = urllib.parse.urldefrag(toc_entry.href)[0]
         if target == emptied_doc.href:
             self.oeb.toc.remove(toc_entry)
     self.oeb.guide.remove_by_href(emptied_doc.href)
예제 #2
0
def linearize_jacket(oeb):
    """Flatten table markup in the jacket document.

    Only the first four spine items are examined and only the first one
    matching ``JACKET_XPATH`` is modified: its ``table``, ``tr`` and
    ``th`` elements are retagged as ``div`` and its ``td`` elements as
    ``span``.
    """
    jacket = next(
        (doc for doc in oeb.spine[:4] if XPath(JACKET_XPATH)(doc.data)),
        None)
    if jacket is None:
        return
    for node in XPath('//h:table|//h:tr|//h:th')(jacket.data):
        node.tag = base.tag('xhtml', 'div')
    for cell in XPath('//h:td')(jacket.data):
        cell.tag = base.tag('xhtml', 'span')
예제 #3
0
    def remove_old_cover(self, cover_item, new_cover_href=None):
        """Remove *cover_item* from the book and fix up references to it.

        :param cover_item: manifest item of the old cover; it is removed
            from the manifest.
        :param new_cover_href: when not None, every ``img``/``svg:image``
            reference to the old cover is re-pointed at this href (made
            relative to the referencing document). When None, the
            referencing element is removed instead.

        Spine documents that lost an element and end up with no text and
        no ``img``/``svg`` elements are removed from the spine, the
        manifest and the guide.
        """
        from ebook_converter.ebooks.oeb.base import XPath, XLINK
        from lxml import etree

        self.oeb.manifest.remove(cover_item)

        # Remove any references to the cover in the HTML
        image_refs = XPath('//h:img[@src]|//svg:image[@xl:href]')
        stripped_docs = set()
        for doc in self.oeb.spine:
            try:
                candidates = image_refs(doc.data)
            except Exception:
                candidates = ()
            for node in candidates:
                target = node.get('src') or node.get(XLINK('href'))
                try:
                    target = doc.abshref(target)
                except Exception:
                    continue  # Invalid URL, ignore
                if target != cover_item.href:
                    continue
                if new_cover_href is not None:
                    replacement = doc.relhref(new_cover_href)
                    attr_name = ('src' if node.tag.endswith('img')
                                 else XLINK('href'))
                    node.set(attr_name, replacement)
                    continue
                holder = node.getparent()
                if holder.tag.endswith('}svg'):
                    # Drop the whole <svg> wrapper, not just the image
                    holder.getparent().remove(holder)
                else:
                    holder.remove(node)
                stripped_docs.add(doc)

        # Check if the resulting HTML has no content, if so remove it
        for doc in stripped_docs:
            bodies = XPath('//h:body')(doc.data)
            if bodies:
                remaining = etree.tostring(bodies[0],
                                           method='text',
                                           encoding='unicode')
            else:
                remaining = ''
            remaining = re.sub(r'\s+', '', remaining)
            if remaining or XPath('//h:img|//svg:svg')(doc.data):
                continue
            self.log('Removing %s as it is a wrapper around the cover '
                     'image' % doc.href)
            self.oeb.spine.remove(doc)
            self.oeb.manifest.remove(doc)
            self.oeb.guide.remove_by_href(doc.href)
예제 #4
0
def remove_links_to(container, predicate):
    """Remove every link for which *predicate* returns True.

    *predicate* must be a function that takes the arguments
    ``(name, href, fragment=None)`` and returns True iff the link should
    be removed.

    Documents (``OEB_DOCS``) are scanned for element links, ``<style>``
    sheets and inline ``style`` attributes; stylesheets (``OEB_STYLES``)
    are scanned directly. Every modified file is marked dirty on the
    container.

    :return: the set of names of the files that were changed.
    """
    from ebook_converter.ebooks.oeb.base import (iterlinks, OEB_DOCS,
                                                 OEB_STYLES, XPath, XHTML)
    style_elements = XPath('//h:style')
    styled_elements = XPath('//*[@style]')
    changed = set()
    for name, media_type in container.mime_map.items():
        touched = False
        if media_type in OEB_DOCS:
            root = container.parsed(name)
            for el, attr, href, _pos in iterlinks(root,
                                                  find_links_in_css=False):
                target = container.href_to_name(href, name)
                fragment = href.partition('#')[-1]
                if not predicate(target, href, fragment):
                    continue
                if attr is None:
                    # No attribute carries the link, so the element text is
                    # cleared instead
                    el.text = None
                elif el.tag in (XHTML('link'), XHTML('img')):
                    # These elements are removed entirely
                    extract(el)
                else:
                    del el.attrib[attr]
                touched = True

            for node in style_elements(root):
                if not node.text:
                    continue
                if (node.get('type') or 'text/css').lower() != 'text/css':
                    continue
                sheet = container.parse_css(node.text)
                resolver = partial(container.href_to_name, base=name)
                if remove_links_in_sheet(resolver, sheet, predicate):
                    node.text = css_text(sheet)
                    touched = True

            for node in styled_elements(root):
                inline = node.get('style')
                if not inline:
                    continue
                declaration = container.parse_css(inline,
                                                  is_declaration=True)
                resolver = partial(container.href_to_name, base=name)
                if remove_links_in_declaration(resolver, declaration,
                                               predicate):
                    touched = True
                    node.set('style', css_text(declaration))
        elif media_type in OEB_STYLES:
            resolver = partial(container.href_to_name, base=name)
            touched = remove_links_in_sheet(resolver,
                                            container.parsed(name),
                                            predicate)
        if touched:
            changed.add(name)

    for name in changed:
        container.dirty(name)
    return changed
예제 #5
0
    def parse_html_toc(self, item):
        """Build a TOC from the nested ``div`` structure of *item*.

        The first match of ``//h:div[1]`` in ``item.data`` is the root.
        Each direct child ``div`` contributes one entry, taken from the
        text and ``href`` of its first direct ``a`` child; divs nested
        inside it become that entry's children.

        :return: the populated ``TOC`` object.
        """
        from ebook_converter.ebooks.oeb.base import TOC, XPath
        child_divs = XPath('./h:div')
        first_link = XPath('./h:a[1]')

        def populate(toc_node, container_div):
            # Depth-first walk that mirrors the div nesting in the TOC tree
            for entry_div in child_divs(container_div):
                link = first_link(entry_div)[0]
                entry = toc_node.add(link.text, link.attrib['href'])
                populate(entry, entry_div)

        result = TOC()
        top_div = XPath('//h:div[1]')(item.data)[0]
        populate(result, top_div)
        return result
예제 #6
0
 def __call__(self, oeb, opts):
     """Replace ``data:`` image URIs in spine documents with image hrefs.

     For every ``<img src="data:image/...">`` in a parsed spine document
     the payload is decoded (base64 or percent-encoding), its image
     format is sniffed with ``what``, and the ``src`` attribute is
     rewritten to the href returned by ``self.convert_image_data_uri``,
     made relative to the document. URIs that are not images, have no
     payload, fail base64 decoding or have an unrecognised format are
     left untouched.

     :param oeb: the book; its ``log`` is stored on ``self.log``.
     :param opts: not used here.
     """
     from ebook_converter.utils.imghdr import what
     self.log = oeb.log
     attr_path = XPath('//h:img[@src]')
     for item in oeb.spine:
         root = item.data
         if not hasattr(root, 'xpath'):
             # Not a parsed XML tree, nothing to scan
             continue
         for img in attr_path(root):
             raw = img.get('src', '')
             if not raw.startswith('data:'):
                 continue
             header, _, data = raw.partition(',')
             if not header.startswith('data:image/') or not data:
                 continue
             if ';base64' in header:
                 # Whitespace may be embedded in the encoded payload
                 data = re.sub(r'\s+', '', data)
                 try:
                     data = from_base64_bytes(data)
                 except Exception:
                     self.log.error('Found invalid base64 encoded data '
                                    'URI, ignoring it')
                     continue
             else:
                 # Decode percent-escapes straight to bytes. Going through
                 # unquote() would decode the escaped bytes as UTF-8 text
                 # (with replacement characters) and corrupt binary image
                 # data.
                 data = urllib.parse.unquote_to_bytes(data)
             data = as_bytes(data)
             fmt = what(None, data)
             if not fmt:
                 self.log.warn('Image encoded as data URL has unknown '
                               'format, ignoring')
                 continue
             img.set(
                 'src',
                 item.relhref(self.convert_image_data_uri(data, fmt, oeb)))
예제 #7
0
def referenced_images(root):
    """Yield ``(img, path)`` for images in *root* that point at local files.

    Only ``img`` elements whose ``src`` starts with ``file://`` are
    considered; the remainder of the URL is used as the filesystem path
    and the pair is yielded only when that path exists.
    """
    scheme = 'file://'
    for node in XPath('//h:img[@src]')(root):
        location = node.get('src')
        if not location.startswith(scheme):
            continue
        local_path = location[len(scheme):]
        if os.path.exists(local_path):
            yield node, local_path
예제 #8
0
 def remove_existing_jacket(self):
     """Remove a previously generated jacket document, if there is one.

     Only the first four spine items are examined. The first one that
     matches ``JACKET_XPATH`` has its images removed through
     ``self.remove_images`` (with no practical limit) and is then
     removed from the manifest.
     """
     jacket = next(
         (doc for doc in self.oeb.spine[:4]
          if XPath(JACKET_XPATH)(doc.data)),
         None)
     if jacket is None:
         return
     self.remove_images(jacket, limit=sys.maxsize)
     self.oeb.manifest.remove(jacket)
     self.log('Removed existing jacket')
예제 #9
0
 def specialize(self, oeb, opts, log, output_fmt):
     """Strip inline navigation bars when ``opts.no_inline_navbars`` is set.

     Every ``div`` whose ``class`` attribute contains ``calibre_navbar``
     is removed from each spine document. Nothing is done when the
     option is off. *log* and *output_fmt* are not used here.
     """
     if not opts.no_inline_navbars:
         return
     from ebook_converter.ebooks.oeb.base import XPath
     for item in oeb.spine:
         navbars = XPath(
             '//h:div[contains(@class, "calibre_navbar")]')(item.data)
         for navbar in navbars:
             navbar.getparent().remove(navbar)
예제 #10
0
    def find_levels(self):
        """Group ``p`` and ``div`` elements by tag name and nesting depth.

        Every ``p``/``div`` below the body of each spine document is
        appended to ``self.levels`` under the key ``'<tag>_<depth>'``,
        where depth 1 means a direct child of the body. Afterwards:

        * ``p`` groups with fewer than 25 members are dropped;
        * ``div`` groups at depth 3 or deeper with fewer than 25 members
          are dropped;
        * ``div`` groups at depth 1 or 2 are kept, but only members that
          contain at least 5 descendant ``p``/``div`` elements remain.
        """

        def depth_below(elem, body):
            # Number of parent hops needed to reach body
            depth = 1
            parent = elem.getparent()
            while parent is not body:
                depth += 1
                parent = parent.getparent()
            return depth

        block_path = XPath('descendant::h:p|descendant::h:div')

        for item in self.oeb.spine:
            bodies = XPath('//h:body')(item.data)
            if not bodies:
                continue
            body = bodies[0]

            for block in block_path(body):
                depth = depth_below(block, body)
                key = '%s_%d' % (parse_utils.barename(block.tag), depth)
                self.levels.setdefault(key, []).append(block)

        discard = set()
        for key, members in self.levels.items():
            count = len(members)
            self.log.debug('Found %d items of level:'%count, key)
            depth = int(key.rpartition('_')[2])
            tag = key.partition('_')[0]
            if tag == 'p':
                if count < 25:
                    discard.add(key)
            elif tag == 'div':
                if depth > 2:
                    if count < 25:
                        discard.add(key)
                else:
                    # Check each level < 3 element and only keep those
                    # that have many child paras. The list is filtered in
                    # place so self.levels keeps the same list object.
                    members[:] = [elem for elem in members
                                  if len(block_path(elem)) >= 5]

        for key in discard:
            self.levels.pop(key)
            self.log.debug('Ignoring level', key)
예제 #11
0
    def __call__(self, item, stylizer):
        """Drop the ``height`` property from the style of every body element.

        Items whose data has no ``xpath`` attribute are ignored.
        """
        if hasattr(item.data, 'xpath'):
            # The Kindle touch displays all black pages if the height is
            # set on body
            for body in XPath('//h:body')(item.data):
                stylizer.style(body).drop('height')
예제 #12
0
def extract_cover_from_embedded_svg(html, base, log):
    """Return the raster cover wrapped by a lone ``<svg>`` element, if any.

    The markup qualifies only when it contains exactly one ``svg:svg``
    element whose single child is an ``svg:image`` with a non-empty
    ``xlink:href``. The href is resolved against the directory *base*
    and handed to ``return_raster_image``. In every other case None is
    returned. *log* is not used here.
    """
    from ebook_converter.ebooks.oeb.base import XPath, SVG, XLINK
    root = etree.fromstring(html)

    wrappers = XPath('//svg:svg')(root)
    if len(wrappers) != 1:
        return None
    wrapper = wrappers[0]
    if len(wrapper) != 1:
        return None
    image = wrapper[0]
    if image.tag != SVG('image'):
        return None
    href = image.get(XLINK('href'), None)
    if not href:
        return None
    # hrefs use '/' separators; rebuild the path with the OS separator
    return return_raster_image(os.path.join(base, *href.split('/')))
예제 #13
0
 def postprocess_book(self, oeb, opts, log):
     """Clean up parsed spine documents after conversion.

     For every spine item that holds a parsed tree:

     * all ``metadata`` and ``guide`` elements are removed;
     * if the body consists of a single ``<pre>`` element, its text is
       run through the plain-text processor, optionally through
       punctuation smartening (``opts.smarten_punctuation``), and the
       ``<pre>`` is turned into a ``<div>`` that receives a copy of each
       resulting body element.

     Messages go to ``self.log``; the *log* argument is not used here.
     """
     from ebook_converter.ebooks.oeb.base import XPath, XHTML
     for item in oeb.spine:
         root = item.data
         if not hasattr(root, 'xpath'):
             continue
         for unwanted in ('metadata', 'guide'):
             for node in XPath('//h:'+unwanted)(root):
                 node.getparent().remove(node)

         bodies = XPath('//h:body')(root)
         if not bodies:
             continue
         body = bodies[0]
         if len(body) != 1 or body[0].tag != XHTML('pre'):
             continue

         pre = body[0]
         from ebook_converter.ebooks.txt.processor import \
             convert_basic, separate_paragraphs_single_line
         from ebook_converter.ebooks.chardet import xml_to_unicode
         self.log('LIT file with all text in singe <pre> tag '
                  'detected')
         markup = separate_paragraphs_single_line(pre.text)
         markup = convert_basic(markup)
         # Give the generated document the XHTML namespace so that the
         # h:-prefixed XPath below can find its body
         markup = markup.replace('<html>',
                                 '<html xmlns="%s">' % const.XHTML_NS)
         markup = xml_to_unicode(markup, strip_encoding_pats=True,
                                 resolve_entities=True)[0]
         if opts.smarten_punctuation:
             # SmartyPants skips text inside <pre> tags
             from ebook_converter.ebooks.conversion import \
                     preprocess
             markup = preprocess.smarten_punctuation(markup, self.log)
         converted_bodies = XPath('//h:body')(etree.fromstring(markup))
         pre.tag = XHTML('div')
         pre.text = ''
         for converted in converted_bodies:
             pre.append(copy.deepcopy(converted))
예제 #14
0
def link_stylesheets(container, names, sheets, remove=False, mtype='text/css'):
    """Make every file in *names* link to every stylesheet in *sheets*.

    :param container: book container used to parse, modify and mark
        files dirty.
    :param names: names of the files to update.
    :param sheets: names of the stylesheets to link; missing links are
        inserted in this order.
    :param remove: when true, existing ``<link>`` elements of type
        *mtype* are removed first.
    :param mtype: MIME type of the links handled; a ``<link>`` without a
        ``type`` (or with an empty one) is treated as being of this type.
    :return: the set of names of the files that were changed.

    A ``<head>`` element is created at the start of the document when
    none exists.
    """
    from ebook_converter.ebooks.oeb.base import XPath, XHTML
    changed_names = set()
    wanted = set(sheets)
    link_path = XPath('//h:link[@href]')
    head_path = XPath('//h:head')

    def is_sheet_link(link):
        # A missing or empty type attribute counts as mtype
        return (link.get('type', mtype) or mtype) == mtype

    for name in names:
        root = container.parsed(name)
        if remove:
            for link in link_path(root):
                if is_sheet_link(link):
                    container.remove_from_xml(link)
                    changed_names.add(name)
                    container.dirty(name)

        already_linked = set()
        for link in link_path(root):
            if is_sheet_link(link):
                already_linked.add(
                    container.href_to_name(link.get('href'), name))

        missing = wanted - already_linked
        if not missing:
            continue

        changed_names.add(name)
        try:
            head = head_path(root)[0]
        except (TypeError, IndexError):
            head = root.makeelement(XHTML('head'))
            container.insert_into_xml(root, head, index=0)
        for sheet in sheets:
            if sheet not in missing:
                continue
            new_link = head.makeelement(
                XHTML('link'),
                rel='stylesheet',
                type=mtype,
                href=container.name_to_href(sheet, name))
            container.insert_into_xml(head, new_link)
        container.dirty(name)

    return changed_names
예제 #15
0
 def remove_images(self, item, limit=1):
     """Remove up to *limit* images referenced by *item*.

     Only ``img`` elements whose ``src`` resolves to a manifest entry are
     handled; the lookup is retried with the normalized href when the
     raw one is not found. For each such image the manifest entry, any
     guide reference to it and the ``img`` element itself are removed.

     :return: the number of images removed.
     """
     taken = 0
     for img in XPath('//h:img[@src]')(item.data):
         if taken >= limit:
             break
         href = item.abshref(img.get('src'))
         entry = self.oeb.manifest.hrefs.get(href)
         if entry is None:
             href = urlnormalize(href)
             entry = self.oeb.manifest.hrefs.get(href)
         if entry is None:
             # Image is not part of the book, leave the element alone
             continue
         self.oeb.manifest.remove(entry)
         self.oeb.guide.remove_by_href(href)
         img.getparent().remove(img)
         taken += 1
     return taken
예제 #16
0
    def epubify_markup(self, root, log):
        """Adjust the markup in *root* so it behaves in EPUB renderers.

        The following fixes are applied in place:

        * empty ``<title>`` elements get a single space as text;
        * a ``<p>`` that directly contains a ``<div>`` is retagged as a
          ``<div>``;
        * a styled ``<img>`` that is the only child of a ``<div>`` gets a
          size-constraining style, and the div is given
          ``position:static``;
        * for ``div/div/img`` chains where each div has a single child,
          auto side margins on the inner div's CSS classes are mapped to
          a ``text-align`` on the outer div, and the inner div is made
          ``display:inline`` whenever an alignment is in effect.

        CSS rules are looked up with ``self.get_css_for_class``. *log* is
        not used here.
        """
        from ebook_converter.ebooks.oeb.base import XPath, XHTML
        # Fix empty title tags
        for t in XPath('//h:title')(root):
            if not t.text:
                t.text = ' '
        # Fix <p><div> constructs as epubcheck complains about them
        for div in XPath('//h:p/h:div')(root):
            div.getparent().tag = XHTML('div')

        # Remove the position:relative as it causes problems with some epub
        # renderers. Remove display: block on an image inside a div as it is
        # redundant and prevents text-align:center from working in ADE
        # Also ensure that the img is contained in its containing div
        for img in XPath('//h:div/h:img[@style]')(root):
            div = img.getparent()
            if len(div) == 1:
                style = div.attrib.get('style', '')
                if style and not style.endswith(';'):
                    style = style + ';'
                # Ensures position of containing div is static
                style += 'position:static'
                div.attrib['style'] = style
                # Ensure that the img is always contained in its frame
                img.attrib['style'] = 'max-width: 100%; max-height: 100%'

        # Handle anchored images. The default markup + CSS produced by
        # odf2xhtml works with WebKit but not with ADE. So we convert the
        # common cases of left/right/center aligned block images to work on
        # both webkit and ADE. We detect the case of setting the side margins
        # to auto and map it to an appropriate text-align directive, which
        # works in both WebKit and ADE.
        # https://bugs.launchpad.net/bugs/1063207
        # https://bugs.launchpad.net/calibre/+bug/859343
        for img in XPath('descendant::h:div/h:div/h:img')(root):
            div2 = img.getparent()
            div1 = div2.getparent()
            if (len(div1), len(div2)) != (1, 1):
                continue
            cls = div1.get('class', '')
            first_rules = list(
                filter(None, [self.get_css_for_class(x) for x in cls.split()]))
            has_align = any(
                r.style.getProperty('text-align') is not None
                for r in first_rules)
            if not has_align:
                aval = None
                ml = mr = None
                cls = div2.get('class', '')
                rules = list(
                    filter(None,
                           [self.get_css_for_class(x) for x in cls.split()]))
                for r in rules:
                    # A later rule overrides an earlier one; a rule that
                    # does not set the margin keeps the earlier value.
                    ml = r.style.getPropertyCSSValue('margin-left') or ml
                    mr = r.style.getPropertyCSSValue('margin-right') or mr
                # Resolve the values only after all rules were merged.
                # Doing it inside the loop discarded the value carried
                # over from an earlier rule.
                ml = getattr(ml, 'value', None)
                mr = getattr(mr, 'value', None)
                if ml == mr == 'auto':
                    aval = 'center'
                elif ml == 'auto' and mr != 'auto':
                    aval = 'right'
                elif ml != 'auto' and mr == 'auto':
                    aval = 'left'
                if aval is not None:
                    style = div1.attrib.get('style', '').strip()
                    if style and not style.endswith(';'):
                        style = style + ';'
                    style += 'text-align:%s' % aval
                    has_align = True
                    div1.attrib['style'] = style

            if has_align:
                # This is needed for ADE, without it the text-align has no
                # effect. The inner div may have no inline style at all.
                style = div2.attrib.get('style', '')
                div2.attrib['style'] = 'display:inline;' + style