def remove_first_image(self):
    """Remove the first removable image found in a non-jacket spine item.

    Spine items matching ``JACKET_XPATH`` are skipped. The search stops at
    the first item for which ``remove_images`` reports a removal; a warning
    is logged when no item yields one. If that item has a body that is left
    with no text and no ``img``/``svg`` elements, the item is removed from
    the manifest, matching entries yielded by iterating ``self.oeb.toc`` are
    removed, and guide references to its href are removed.
    """
    is_jacket = XPath(JACKET_XPATH)
    emptied = None
    for spine_item in self.oeb.spine:
        if is_jacket(spine_item.data):
            continue
        if not self.remove_images(spine_item) > 0:
            # Nothing removed here, keep looking in the next spine item.
            continue
        self.log('Removed first image')
        bodies = XPath('//h:body')(spine_item.data)
        if bodies:
            text = xml2text(bodies[0]).strip()
            pictures = XPath('//h:img|//svg:svg')(spine_item.data)
            if not (text or pictures):
                self.log(
                    'Removing %s as it has no content' % spine_item.href)
                self.oeb.manifest.remove(spine_item)
                emptied = spine_item
        break
    else:
        self.log.warn('Could not find first image to remove')

    if emptied is None:
        return

    # Drop TOC entries (compared without their fragment) and guide
    # references that still point at the removed document.
    for entry in list(self.oeb.toc):
        if urllib.parse.urldefrag(entry.href)[0] == emptied.href:
            self.oeb.toc.remove(entry)
    self.oeb.guide.remove_by_href(emptied.href)
def linearize_jacket(oeb):
    """Flatten the table markup of the jacket page, if one is found.

    Only the first four spine items are examined. In the first one that
    matches ``JACKET_XPATH``, ``table``/``tr``/``th`` elements are retagged
    as ``div`` and ``td`` elements as ``span``; later items are left alone.
    """
    retag_rules = (
        ('//h:table|//h:tr|//h:th', 'div'),
        ('//h:td', 'span'),
    )
    for candidate in oeb.spine[:4]:
        if not XPath(JACKET_XPATH)(candidate.data):
            continue
        for expression, new_name in retag_rules:
            for elem in XPath(expression)(candidate.data):
                elem.tag = base.tag('xhtml', new_name)
        break
def remove_old_cover(self, cover_item, new_cover_href=None):
    """Remove ``cover_item`` and fix up the markup that referenced it.

    ``cover_item`` is removed from the manifest. Every ``img``/``svg:image``
    in the spine whose resolved href equals ``cover_item.href`` is either
    re-pointed at ``new_cover_href`` (when given) or removed from its
    document. Documents that had an image removed and are left with no text
    and no ``img``/``svg`` elements are removed from the spine, manifest and
    guide.
    """
    from ebook_converter.ebooks.oeb.base import XPath, XLINK
    from lxml import etree

    self.oeb.manifest.remove(cover_item)

    # Remove any references to the cover in the HTML
    find_images = XPath('//h:img[@src]|//svg:image[@xl:href]')
    xlink_href = XLINK('href')
    stripped_items = set()
    for spine_item in self.oeb.spine:
        try:
            found = find_images(spine_item.data)
        except Exception:
            found = ()
        dropped = False
        for node in found:
            target = node.get('src') or node.get(xlink_href)
            try:
                target = spine_item.abshref(target)
            except Exception:
                # Invalid URL, ignore
                continue
            if target != cover_item.href:
                continue
            if new_cover_href is None:
                holder = node.getparent()
                if holder.tag.endswith('}svg'):
                    # The whole <svg> wrapper goes, not just the <image>.
                    holder.getparent().remove(holder)
                else:
                    holder.remove(node)
                dropped = True
            else:
                attr_name = 'src' if node.tag.endswith('img') else xlink_href
                node.set(attr_name, spine_item.relhref(new_cover_href))
        if dropped:
            stripped_items.add(spine_item)

    # Check if the resulting HTML has no content, if so remove it
    for spine_item in stripped_items:
        bodies = XPath('//h:body')(spine_item.data)
        if bodies:
            text = etree.tostring(bodies[0], method='text',
                                  encoding='unicode')
        else:
            text = ''
        if re.sub(r'\s+', '', text):
            continue
        if XPath('//h:img|//svg:svg')(spine_item.data):
            continue
        self.log('Removing %s as it is a wrapper around the cover image'
                 % spine_item.href)
        self.oeb.spine.remove(spine_item)
        self.oeb.manifest.remove(spine_item)
        self.oeb.guide.remove_by_href(spine_item.href)
def remove_links_to(container, predicate):
    """Remove all links in the container that ``predicate`` selects.

    ``predicate`` must be a function that takes the arguments
    ``(name, href, fragment=None)`` and returns True iff the link should be
    removed.

    Files whose mime type is in ``OEB_DOCS`` have their element links,
    ``<style>`` sheets and ``style`` attributes processed; files whose mime
    type is in ``OEB_STYLES`` are processed as stylesheets. Every changed
    file is marked dirty on the container and the set of changed names is
    returned.
    """
    from ebook_converter.ebooks.oeb.base import (
        iterlinks, OEB_DOCS, OEB_STYLES, XPath, XHTML)

    find_style_tags = XPath('//h:style')
    find_styled_elems = XPath('//*[@style]')
    changed = set()

    for name, mime in container.mime_map.items():
        resolve = partial(container.href_to_name, base=name)
        touched = False
        if mime in OEB_DOCS:
            root = container.parsed(name)
            for el, attr, href, pos in iterlinks(root,
                                                 find_links_in_css=False):
                target_name = container.href_to_name(href, name)
                fragment = href.partition('#')[-1]
                if not predicate(target_name, href, fragment):
                    continue
                if attr is None:
                    # The link lives in the element text, not an attribute.
                    el.text = None
                elif el.tag == XHTML('link') or el.tag == XHTML('img'):
                    extract(el)
                else:
                    del el.attrib[attr]
                touched = True

            for tag in find_style_tags(root):
                declared = (tag.get('type') or 'text/css').lower()
                if tag.text and declared == 'text/css':
                    sheet = container.parse_css(tag.text)
                    if remove_links_in_sheet(resolve, sheet, predicate):
                        tag.text = css_text(sheet)
                        touched = True

            for tag in find_styled_elems(root):
                inline = tag.get('style')
                if not inline:
                    continue
                declaration = container.parse_css(inline,
                                                  is_declaration=True)
                if remove_links_in_declaration(resolve, declaration,
                                               predicate):
                    touched = True
                    tag.set('style', css_text(declaration))
        elif mime in OEB_STYLES:
            touched = remove_links_in_sheet(
                resolve, container.parsed(name), predicate)
        if touched:
            changed.add(name)

    for name in changed:
        container.dirty(name)
    return changed
def parse_html_toc(self, item):
    """Build a ``TOC`` from the nested ``div``/``a`` markup in ``item.data``.

    Starting from the first match of ``//h:div[1]``, each direct child
    ``div`` contributes one entry taken from its first direct ``a`` child
    (text and ``href``); nested ``div`` children become child entries.
    Raises ``IndexError`` if the root ``div`` or an expected anchor is
    missing, and ``KeyError`` if an anchor has no ``href``.
    """
    from ebook_converter.ebooks.oeb.base import TOC, XPath

    child_divs = XPath('./h:div')
    first_anchor = XPath('./h:a[1]')

    def walk(toc_node, container_div):
        for entry_div in child_divs(container_div):
            anchor = first_anchor(entry_div)[0]
            child_node = toc_node.add(anchor.text, anchor.attrib['href'])
            walk(child_node, entry_div)

    result = TOC()
    top_div = XPath('//h:div[1]')(item.data)[0]
    walk(result, top_div)
    return result
def __call__(self, oeb, opts):
    """Replace ``data:`` image URIs in spine documents with image files.

    For every ``img`` whose ``src`` is a ``data:image/...`` URI the payload
    is decoded (base64 or percent-encoding), its image format is detected
    with ``what`` and the bytes are handed to
    ``self.convert_image_data_uri``; the ``src`` is then rewritten to the
    href that call returns, made relative to the document. URIs that are
    not images, have an empty payload, fail base64 decoding or have an
    unrecognised format are left untouched (the latter two are logged).

    ``opts`` is accepted for interface compatibility and is not used.
    """
    from ebook_converter.utils.imghdr import what

    self.log = oeb.log
    attr_path = XPath('//h:img[@src]')
    for item in oeb.spine:
        root = item.data
        if not hasattr(root, 'xpath'):
            # Not a parsed XML document, nothing to scan.
            continue
        for img in attr_path(root):
            raw = img.get('src', '')
            if not raw.startswith('data:'):
                continue
            header, _, data = raw.partition(',')
            if not header.startswith('data:image/') or not data:
                continue
            if ';base64' in header:
                data = re.sub(r'\s+', '', data)
                try:
                    data = from_base64_bytes(data)
                except Exception:
                    self.log.error('Found invalid base64 encoded data '
                                   'URI, ignoring it')
                    continue
            else:
                # Percent-decode straight to bytes. urllib.parse.unquote()
                # would decode the payload as UTF-8 text (replacing
                # undecodable sequences), which corrupts binary image data.
                data = urllib.parse.unquote_to_bytes(data)
            # NOTE(review): as_bytes is assumed to pass bytes through
            # unchanged, as it already must for the base64 branch - confirm.
            data = as_bytes(data)
            fmt = what(None, data)
            if not fmt:
                self.log.warn('Image encoded as data URL has unknown '
                              'format, ignoring')
                continue
            img.set(
                'src',
                item.relhref(self.convert_image_data_uri(data, fmt, oeb)))
def referenced_images(root):
    """Yield ``(img, path)`` for ``img`` elements pointing at existing files.

    Only ``src`` values starting with ``file://`` are considered; ``path``
    is the remainder of the value after that prefix and is yielded only
    when ``os.path.exists`` reports that it exists.
    """
    prefix = 'file://'
    for img in XPath('//h:img[@src]')(root):
        location = img.get('src')
        if not location.startswith(prefix):
            continue
        path = location[len(prefix):]
        if os.path.exists(path):
            yield img, path
def remove_existing_jacket(self):
    """Remove the first jacket page found among the first four spine items.

    The jacket is the first item matching ``JACKET_XPATH``. Its images are
    removed via ``remove_images`` without a practical limit, then the item
    itself is removed from the manifest. Nothing happens when no item
    matches.
    """
    jacket = None
    for candidate in self.oeb.spine[:4]:
        if XPath(JACKET_XPATH)(candidate.data):
            jacket = candidate
            break
    if jacket is None:
        return
    self.remove_images(jacket, limit=sys.maxsize)
    self.oeb.manifest.remove(jacket)
    self.log('Removed existing jacket')
def specialize(self, oeb, opts, log, output_fmt):
    """Strip inline navigation bars when ``opts.no_inline_navbars`` is set.

    Every ``div`` whose ``class`` attribute contains ``calibre_navbar`` is
    removed from each spine document. ``log`` and ``output_fmt`` are not
    used here.
    """
    if not opts.no_inline_navbars:
        return
    from ebook_converter.ebooks.oeb.base import XPath
    for spine_item in oeb.spine:
        navbars = XPath('//h:div[contains(@class, "calibre_navbar")]')(
            spine_item.data)
        for navbar in navbars:
            navbar.getparent().remove(navbar)
def find_levels(self):
    """Group ``p``/``div`` elements by tag and depth, then prune the groups.

    Every ``p`` and ``div`` below the body of each spine document is added
    to ``self.levels`` under the key ``'<tag>_<depth>'``, where depth is 1
    for direct children of the body. Afterwards:

    * ``p`` groups with fewer than 25 elements are dropped;
    * ``div`` groups deeper than level 2 with fewer than 25 elements are
      dropped;
    * ``div`` groups at level 1 or 2 keep only the elements that have at
      least 5 descendant ``p``/``div`` elements.
    """
    def depth_below(elem, body):
        depth = 1
        ancestor = elem.getparent()
        while ancestor is not body:
            depth += 1
            ancestor = ancestor.getparent()
        return depth

    paras = XPath('descendant::h:p|descendant::h:div')

    for spine_item in self.oeb.spine:
        bodies = XPath('//h:body')(spine_item.data)
        if not bodies:
            continue
        body = bodies[0]
        for elem in paras(body):
            key = '%s_%d' % (parse_utils.barename(elem.tag),
                             depth_below(elem, body))
            if key not in self.levels:
                self.levels[key] = []
            self.levels[key].append(elem)

    discard = set()
    for key, members in self.levels.items():
        count = len(members)
        self.log.debug('Found %d items of level:' % count, key)
        parts = key.split('_')
        tag = parts[0]
        depth = int(parts[-1])
        if tag == 'p' and count < 25:
            discard.add(key)
        if tag == 'div':
            if depth > 2 and count < 25:
                discard.add(key)
            elif depth < 3:
                # Shallow divs: keep only those containing enough
                # descendant p/div elements. Iterate over a copy because
                # the list is modified in place.
                for elem in list(members):
                    if len(paras(elem)) < 5:
                        members.remove(elem)

    for key in discard:
        self.levels.pop(key)
        self.log.debug('Ignoring level', key)
def __call__(self, item, stylizer):
    """Drop the ``height`` property from the style of every ``body``.

    Items whose ``data`` has no ``xpath`` attribute are ignored.
    """
    if hasattr(item.data, 'xpath'):
        # The Kindle touch displays all black pages if the height is set
        # on body
        for body in XPath('//h:body')(item.data):
            stylizer.style(body).drop('height')
def extract_cover_from_embedded_svg(html, base, log):
    """Return the raster cover wrapped by a single embedded SVG, if any.

    ``html`` is parsed as XML. When the document contains exactly one
    ``svg`` element whose only child is an ``image`` element with a
    non-empty ``xlink:href``, that href is resolved against the directory
    ``base`` (splitting on ``/``) and the result of
    ``return_raster_image(path)`` is returned. In every other case ``None``
    is returned. ``log`` is accepted for interface compatibility and is not
    used.
    """
    from ebook_converter.ebooks.oeb.base import XPath, SVG, XLINK

    # The markup comes from the book being converted, i.e. it is untrusted.
    # Parse it without entity expansion or network access so a crafted
    # DOCTYPE cannot pull in local files or remote resources (XXE).
    parser = etree.XMLParser(resolve_entities=False, no_network=True)
    root = etree.fromstring(html, parser=parser)

    svg = XPath('//svg:svg')(root)
    if len(svg) != 1 or len(svg[0]) != 1:
        return None
    image = svg[0][0]
    if image.tag != SVG('image'):
        return None
    href = image.get(XLINK('href'), None)
    if not href:
        return None
    path = os.path.join(base, *href.split('/'))
    return return_raster_image(path)
def postprocess_book(self, oeb, opts, log):
    """Clean up spine documents after conversion.

    For every spine item with parsed XML data:

    * stray ``metadata`` and ``guide`` elements are removed;
    * if the body consists of a single ``pre`` element, its text is turned
      into paragraph markup (optionally with smartened punctuation when
      ``opts.smarten_punctuation`` is set), the ``pre`` is retagged as a
      ``div`` and copies of the ``body`` elements of the generated markup
      are appended to it.

    Logging goes through ``self.log``; the ``log`` argument is not used.
    """
    from ebook_converter.ebooks.oeb.base import XPath, XHTML

    for item in oeb.spine:
        root = item.data
        if not hasattr(root, 'xpath'):
            continue

        for stray in ('metadata', 'guide'):
            for node in XPath('//h:' + stray)(root):
                node.getparent().remove(node)

        bodies = XPath('//h:body')(root)
        if not bodies:
            continue
        body = bodies[0]
        if len(body) != 1 or body[0].tag != XHTML('pre'):
            continue

        pre = body[0]
        from ebook_converter.ebooks.txt.processor import (
            convert_basic, separate_paragraphs_single_line)
        from ebook_converter.ebooks.chardet import xml_to_unicode
        self.log('LIT file with all text in singe <pre> tag detected')
        markup = separate_paragraphs_single_line(pre.text)
        markup = convert_basic(markup).replace(
            '<html>', '<html xmlns="%s">' % const.XHTML_NS)
        markup = xml_to_unicode(markup, strip_encoding_pats=True,
                                resolve_entities=True)[0]
        if opts.smarten_punctuation:
            # SmartyPants skips text inside <pre> tags
            from ebook_converter.ebooks.conversion import preprocess
            markup = preprocess.smarten_punctuation(markup, self.log)

        generated_root = etree.fromstring(markup)
        generated_bodies = XPath('//h:body')(generated_root)
        pre.tag = XHTML('div')
        pre.text = ''
        for elem in generated_bodies:
            pre.append(copy.deepcopy(elem))
def link_stylesheets(container, names, sheets, remove=False,
                     mtype='text/css'):
    """Link the stylesheets ``sheets`` into each of the files ``names``.

    A ``link`` element counts as a stylesheet link when its ``type`` is
    missing, empty or equal to ``mtype``. With ``remove`` set, all such
    links are removed first. Then a ``<link rel="stylesheet">`` is inserted
    into the ``head`` (created at index 0 of the root if absent) for every
    sheet not already linked, in the order given by ``sheets``.

    Returns the set of names that were changed; each is marked dirty.
    """
    from ebook_converter.ebooks.oeb.base import XPath, XHTML

    find_links = XPath('//h:link[@href]')
    find_head = XPath('//h:head')
    wanted = set(sheets)
    changed_names = set()

    def is_sheet_link(link):
        return (link.get('type', mtype) or mtype) == mtype

    for name in names:
        root = container.parsed(name)
        if remove:
            for link in find_links(root):
                if is_sheet_link(link):
                    container.remove_from_xml(link)
                    changed_names.add(name)
                    container.dirty(name)

        already_linked = set()
        for link in find_links(root):
            if is_sheet_link(link):
                already_linked.add(
                    container.href_to_name(link.get('href'), name))

        missing = wanted - already_linked
        if not missing:
            continue

        changed_names.add(name)
        try:
            parent = find_head(root)[0]
        except (TypeError, IndexError):
            parent = root.makeelement(XHTML('head'))
            container.insert_into_xml(root, parent, index=0)
        for sheet in sheets:
            if sheet not in missing:
                continue
            new_link = parent.makeelement(
                XHTML('link'), rel='stylesheet', type=mtype,
                href=container.name_to_href(sheet, name))
            container.insert_into_xml(parent, new_link)
        container.dirty(name)
    return changed_names
def remove_images(self, item, limit=1):
    """Remove up to ``limit`` manifest images referenced from ``item``.

    Each ``img[@src]`` in ``item.data`` is resolved with ``item.abshref``
    and looked up in ``self.oeb.manifest.hrefs`` (retrying with the
    ``urlnormalize``-d href). When a manifest entry is found, that entry is
    removed from the manifest, guide references to the href are removed and
    the ``img`` element is removed from the document. Images with no
    manifest entry are left alone and do not count towards ``limit``.

    Text that follows a removed ``img`` (its lxml *tail*) is kept in the
    document.

    Returns the number of images removed.
    """
    path = XPath('//h:img[@src]')
    removed = 0
    for img in path(item.data):
        if removed >= limit:
            break
        href = item.abshref(img.get('src'))
        image = self.oeb.manifest.hrefs.get(href)
        if image is None:
            href = urlnormalize(href)
            image = self.oeb.manifest.hrefs.get(href)
        if image is None:
            continue
        self.oeb.manifest.remove(image)
        self.oeb.guide.remove_by_href(href)

        parent = img.getparent()
        # lxml removes an element together with its tail text, so move the
        # tail onto the preceding sibling (or the parent's text) first;
        # otherwise text following an inline image would silently vanish.
        if img.tail:
            previous = img.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + img.tail
            else:
                parent.text = (parent.text or '') + img.tail
        parent.remove(img)
        removed += 1
    return removed
def epubify_markup(self, root, log):
    """Massage converted markup in ``root`` so it behaves in EPUB readers.

    The tree is modified in place:

    * empty ``title`` elements get a single space as text;
    * a ``p`` that directly contains a ``div`` is retagged as ``div``;
    * for a ``div`` whose only child is an ``img`` with a ``style``
      attribute, ``position:static`` is appended to the div's style and the
      image's style is replaced by ``max-width: 100%; max-height: 100%``;
    * for ``div/div/img`` chains where each div has exactly one child, a
      ``text-align`` is derived from the inner div's auto side margins when
      the outer div's classes do not already define one, and the inner div
      is made ``display:inline``.

    CSS rules are looked up with ``self.get_css_for_class``. ``log`` is not
    used here.
    """
    from ebook_converter.ebooks.oeb.base import XPath, XHTML

    # Fix empty title tags
    for t in XPath('//h:title')(root):
        if not t.text:
            t.text = ' '

    # Fix <p><div> constructs as the asinine epubchecker complains
    # about them
    pdiv = XPath('//h:p/h:div')
    for div in pdiv(root):
        div.getparent().tag = XHTML('div')

    # Remove the position:relative as it causes problems with some epub
    # renderers. Remove display: block on an image inside a div as it is
    # redundant and prevents text-align:center from working in ADE
    # Also ensure that the img is contained in its containing div
    imgpath = XPath('//h:div/h:img[@style]')
    for img in imgpath(root):
        div = img.getparent()
        if len(div) == 1:
            style = div.attrib.get('style', '')
            if style and not style.endswith(';'):
                style = style + ';'
            # Ensures position of containing div is static
            style += 'position:static'
            div.attrib['style'] = style
            # Ensure that the img is always contained in its frame
            img.attrib['style'] = 'max-width: 100%; max-height: 100%'

    # Handle anchored images. The default markup + CSS produced by
    # odf2xhtml works with WebKit but not with ADE. So we convert the
    # common cases of left/right/center aligned block images to work on
    # both webkit and ADE. We detect the case of setting the side margins
    # to auto and map it to an appropriate text-align directive, which
    # works in both WebKit and ADE.
    # https://bugs.launchpad.net/bugs/1063207
    # https://bugs.launchpad.net/calibre/+bug/859343
    imgpath = XPath('descendant::h:div/h:div/h:img')
    for img in imgpath(root):
        div2 = img.getparent()
        div1 = div2.getparent()
        if (len(div1), len(div2)) != (1, 1):
            continue
        cls = div1.get('class', '')
        first_rules = list(
            filter(None, [self.get_css_for_class(x) for x in cls.split()]))
        has_align = False
        for r in first_rules:
            if r.style.getProperty('text-align') is not None:
                has_align = True
        ml = mr = None
        if not has_align:
            aval = None
            cls = div2.get('class', '')
            rules = list(
                filter(None,
                       [self.get_css_for_class(x) for x in cls.split()]))
            for r in rules:
                ml = r.style.getPropertyCSSValue('margin-left') or ml
                mr = r.style.getPropertyCSSValue('margin-right') or mr
                ml = getattr(ml, 'value', None)
                mr = getattr(mr, 'value', None)
                if ml == mr == 'auto':
                    aval = 'center'
                elif ml == 'auto' and mr != 'auto':
                    aval = 'right'
                elif ml != 'auto' and mr == 'auto':
                    aval = 'left'
                if aval is not None:
                    style = div1.attrib.get('style', '').strip()
                    if style and not style.endswith(';'):
                        style = style + ';'
                    style += 'text-align:%s' % aval
                    has_align = True
                    div1.attrib['style'] = style

        if has_align:
            # This is needed for ADE, without it the text-align has no
            # effect. The inner div may have no inline style at all (the
            # alignment can come purely from class rules), so fall back to
            # an empty string instead of raising KeyError.
            style = div2.attrib.get('style', '')
            div2.attrib['style'] = 'display:inline;' + style