def _toc_from_html(self, opf): if 'toc' not in self.oeb.guide: return False self.log.debug('Reading TOC from HTML...') itempath, frag = urldefrag(self.oeb.guide['toc'].href) item = self.oeb.manifest.hrefs[itempath] html = item.data if frag: elems = xpath(html, './/*[@id="%s"]' % frag) if not elems: elems = xpath(html, './/*[@name="%s"]' % frag) elem = elems[0] if elems else html while elem != html and not xpath(elem, './/h:a[@href]'): elem = elem.getparent() html = elem titles = defaultdict(list) order = [] for anchor in xpath(html, './/h:a[@href]'): href = anchor.attrib['href'] href = item.abshref(urlnormalize(href)) path, frag = urldefrag(href) if path not in self.oeb.manifest.hrefs: continue title = xml2text(anchor) title = COLLAPSE_RE.sub(' ', title.strip()) if href not in titles: order.append(href) titles[href].append(title) toc = self.oeb.toc for href in order: toc.add(' '.join(titles[href]), href) return True
def _toc_from_spine(self, opf): self.log.warn('Generating default TOC from spine...') toc = self.oeb.toc titles = [] headers = [] for item in self.oeb.spine: if not item.linear: continue html = item.data title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) title = COLLAPSE_RE.sub(' ', title.strip()) if title: titles.append(title) headers.append('(unlabled)') for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): expr = '/h:html/h:body//h:%s[position()=1]/text()' header = ''.join(xpath(html, expr % tag)) header = COLLAPSE_RE.sub(' ', header.strip()) if header: headers[-1] = header break use = titles if len(titles) > len(set(titles)): use = headers for title, item in izip(use, self.oeb.spine): if not item.linear: continue toc.add(title, item.href) return True
def _clean_opf(self, opf): nsmap = {} for elem in opf.iter(tag=etree.Element): nsmap.update(elem.nsmap) for elem in opf.iter(tag=etree.Element): if namespace(elem.tag) in ('', OPF1_NS) and ':' not in barename(elem.tag): elem.tag = OPF(barename(elem.tag)) nsmap.update(OPF2_NSMAP) attrib = dict(opf.attrib) nroot = etree.Element(OPF('package'), nsmap={None: OPF2_NS}, attrib=attrib) metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap) ignored = (OPF('dc-metadata'), OPF('x-metadata')) for elem in xpath(opf, 'o2:metadata//*'): if elem.tag in ignored: continue if namespace(elem.tag) in DC_NSES: tag = barename(elem.tag).lower() elem.tag = '{%s}%s' % (DC11_NS, tag) if elem.tag.startswith('dc:'): tag = elem.tag.partition(':')[-1].lower() elem.tag = '{%s}%s' % (DC11_NS, tag) metadata.append(elem) for element in xpath(opf, 'o2:metadata//o2:meta'): metadata.append(element) for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'): for element in xpath(opf, tag): nroot.append(element) return nroot
def _toc_from_navpoint(self, item, toc, navpoint): children = xpath(navpoint, 'ncx:navPoint') for child in children: title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) title = COLLAPSE_RE.sub(' ', title.strip()) href = xpath(child, 'ncx:content/@src') if not title: self._toc_from_navpoint(item, toc, child) continue if not href: gc = xpath(child, 'ncx:navPoint') if not gc: # This node is useless continue href = 'missing.html' href = item.abshref(urlnormalize(href[0])) path, _ = urldefrag(href) if path not in self.oeb.manifest.hrefs: self.logger.warn('TOC reference %r not found' % href) gc = xpath(child, 'ncx:navPoint') if not gc: # This node is useless continue id = child.get('id') klass = child.get('class', 'chapter') try: po = int(child.get('playOrder', self.oeb.toc.next_play_order())) except: po = self.oeb.toc.next_play_order() authorElement = xpath(child, 'descendant::calibre:meta[@name = "author"]') if authorElement : author = authorElement[0].text else : author = None descriptionElement = xpath(child, 'descendant::calibre:meta[@name = "description"]') if descriptionElement: description = etree.tostring(descriptionElement[0], method='text', encoding=unicode).strip() if not description: description = None else : description = None index_image = xpath(child, 'descendant::calibre:meta[@name = "toc_thumbnail"]') toc_thumbnail = (index_image[0].text if index_image else None) if not toc_thumbnail or not toc_thumbnail.strip(): toc_thumbnail = None node = toc.add(title, href, id=id, klass=klass, play_order=po, description=description, author=author, toc_thumbnail=toc_thumbnail) self._toc_from_navpoint(item, node, child)
def _toc_from_ncx(self, item): if (item is None) or (item.data is None): return False self.log.debug('Reading TOC from NCX...') ncx = item.data title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) title = COLLAPSE_RE.sub(' ', title.strip()) title = title or unicode(self.oeb.metadata.title[0]) toc = self.oeb.toc toc.title = title navmaps = xpath(ncx, 'ncx:navMap') for navmap in navmaps: self._toc_from_navpoint(item, toc, navmap) return True
def _manifest_from_opf(self, opf): manifest = self.oeb.manifest for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): id = elem.get('id') href = elem.get('href') media_type = elem.get('media-type', None) if media_type is None: media_type = elem.get('mediatype', None) if not media_type or media_type == 'text/xml': guessed = guess_type(href)[0] media_type = guessed or media_type or BINARY_MIME if hasattr(media_type, 'lower'): media_type = media_type.lower() fallback = elem.get('fallback') if href in manifest.hrefs: self.logger.warn(u'Duplicate manifest entry for %r' % href) continue if not self.oeb.container.exists(href): self.logger.warn(u'Manifest item %r not found' % href) continue if id in manifest.ids: self.logger.warn(u'Duplicate manifest id %r' % id) id, href = manifest.generate(id, href) manifest.add(id, href, media_type, fallback) invalid = self._manifest_prune_invalid() self._manifest_add_missing(invalid)
def _manifest_from_opf(self, opf): manifest = self.oeb.manifest for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): id = elem.get('id') href = elem.get('href') media_type = elem.get('media-type', None) if media_type is None: media_type = elem.get('mediatype', None) if not media_type or media_type == 'text/xml': guessed = guess_type(href)[0] media_type = guessed or media_type or BINARY_MIME if hasattr(media_type, 'lower'): media_type = media_type.lower() fallback = elem.get('fallback') if href in manifest.hrefs: self.logger.warn('Duplicate manifest entry for %r' % href) continue if not self.oeb.container.exists(href): self.logger.warn('Manifest item %r not found' % href) continue if id in manifest.ids: self.logger.warn('Duplicate manifest id %r' % id) id, href = manifest.generate(id, href) manifest.add(id, href, media_type, fallback) invalid = self._manifest_prune_invalid() self._manifest_add_missing(invalid)
def _toc_from_navpoint(self, item, toc, navpoint): children = xpath(navpoint, 'ncx:navPoint') for child in children: title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) title = COLLAPSE_RE.sub(' ', title.strip()) href = xpath(child, 'ncx:content/@src') if not title: self._toc_from_navpoint(item, toc, child) continue if (not href or not href[0]) and not xpath(child, 'ncx:navPoint'): # This node is useless continue href = item.abshref(urlnormalize(href[0])) if href and href[0] else '' path, _ = urldefrag(href) if href and path not in self.oeb.manifest.hrefs: self.logger.warn('TOC reference %r not found' % href) gc = xpath(child, 'ncx:navPoint') if not gc: # This node is useless continue id = child.get('id') klass = child.get('class', 'chapter') try: po = int(child.get('playOrder', self.oeb.toc.next_play_order())) except: po = self.oeb.toc.next_play_order() authorElement = xpath(child, 'descendant::calibre:meta[@name = "author"]') if authorElement: author = authorElement[0].text else: author = None descriptionElement = xpath(child, 'descendant::calibre:meta[@name = "description"]') if descriptionElement: description = etree.tostring(descriptionElement[0], method='text', encoding=unicode).strip() if not description: description = None else: description = None index_image = xpath(child, 'descendant::calibre:meta[@name = "toc_thumbnail"]') toc_thumbnail = (index_image[0].text if index_image else None) if not toc_thumbnail or not toc_thumbnail.strip(): toc_thumbnail = None node = toc.add(title, href, id=id, klass=klass, play_order=po, description=description, author=author, toc_thumbnail=toc_thumbnail) self._toc_from_navpoint(item, node, child)
def rasterize_item(self, item): html = item.data hrefs = self.oeb.manifest.hrefs for elem in xpath(html, '//h:img[@src]'): src = urlnormalize(elem.attrib['src']) image = hrefs.get(item.abshref(src), None) if image and image.media_type == SVG_MIME: style = self.stylizer(item).style(elem) self.rasterize_external(elem, style, item, image) for elem in xpath(html, '//h:object[@type="%s" and @data]' % SVG_MIME): data = urlnormalize(elem.attrib['data']) image = hrefs.get(item.abshref(data), None) if image and image.media_type == SVG_MIME: style = self.stylizer(item).style(elem) self.rasterize_external(elem, style, item, image) for elem in xpath(html, '//svg:svg'): style = self.stylizer(item).style(elem) self.rasterize_inline(elem, style, item)
def collect_properties(container): for item in container.opf_xpath('//opf:manifest/opf:item[@href and @media-type]'): mt = item.get('media-type') or '' if mt.lower() not in OEB_DOCS: continue name = container.href_to_name(item.get('href'), container.opf_name) root = container.parsed(name) root = ensure_namespace_prefixes(root, {'epub': EPUB_NS}) properties = set() container.replace(name, root) # Ensure entities are converted if xpath(root, '//svg:svg'): properties.add('svg') if xpath(root, '//h:script'): properties.add('scripted') if xpath(root, '//mathml:math'): properties.add('mathml') if xpath(root, '//epub:switch'): properties.add('switch') if properties: add_properties(item, *tuple(properties))
def collect_properties(container): for item in container.opf_xpath( '//opf:manifest/opf:item[@href and @media-type]'): mt = item.get('media-type') or '' if mt.lower() not in OEB_DOCS: continue name = container.href_to_name(item.get('href'), container.opf_name) root = container.parsed(name) properties = set() container.dirty(name) # Ensure entities are converted if xpath(root, '//svg:svg'): properties.add('svg') if xpath(root, '//h:script'): properties.add('scripted') if xpath(root, '//mathml:math'): properties.add('mathml') if xpath(root, '//epub:switch'): properties.add('switch') if properties: add_properties(item, *tuple(properties))
def _spine_from_opf(self, opf): spine = self.oeb.spine manifest = self.oeb.manifest for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): idref = elem.get('idref') if idref not in manifest.ids: self.logger.warn(u'Spine item %r not found' % idref) continue item = manifest.ids[idref] if item.media_type.lower() in OEB_DOCS and hasattr(item.data, 'xpath'): spine.add(item, elem.get('linear')) else: self.oeb.log.warn('The item %s is not a XML document.' ' Removing it from spine.'%item.href) if len(spine) == 0: raise OEBError("Spine is empty") self._spine_add_extra() for val in xpath(opf, '/o2:package/o2:spine/@page-progression-direction'): if val in {'ltr', 'rtl'}: spine.page_progression_direction = val
def _spine_from_opf(self, opf): manifest = self.oeb.manifest for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): idref = elem.get('idref') if idref not in manifest.ids: continue item = manifest.ids[idref] if (item.media_type.lower() == 'application/xml' and hasattr(item.data, 'xpath') and item.data.xpath('/html')): item.media_type = 'application/xhtml+xml' item.data = item._parse_xhtml(etree.tostring(item.data)) super(LitReader, self)._spine_from_opf(opf)
def _toc_from_tour(self, opf): result = xpath(opf, 'o2:tours/o2:tour') if not result: return False self.log.debug('Reading TOC from tour...') tour = result[0] toc = self.oeb.toc toc.title = tour.get('title') sites = xpath(tour, 'o2:site') for site in sites: title = site.get('title') href = site.get('href') if not title or not href: continue path, _ = urldefrag(urlnormalize(href)) if path not in self.oeb.manifest.hrefs: self.logger.warn('TOC reference %r not found' % href) continue id = site.get('id') toc.add(title, href, id=id) return True
def _pages_from_ncx(self, opf, item): if item is None: return False ncx = item.data if ncx is None: return False ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget') if not ptargets: return False pages = self.oeb.pages for ptarget in ptargets: name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()')) name = COLLAPSE_RE.sub(' ', name.strip()) href = xpath(ptarget, 'ncx:content/@src') if not href: continue href = item.abshref(urlnormalize(href[0])) id = ptarget.get('id') type = ptarget.get('type', 'normal') klass = ptarget.get('class') pages.add(name, href, type=type, id=id, klass=klass) return True
def _find_page_map(self, opf): result = xpath(opf, '/o2:package/o2:spine/@page-map') if result: id = result[0] if id not in self.oeb.manifest.ids: return None item = self.oeb.manifest.ids[id] self.oeb.manifest.remove(item) return item for item in self.oeb.manifest.values(): if item.media_type == PAGE_MAP_MIME: self.oeb.manifest.remove(item) return item return None
def _find_ncx(self, opf): result = xpath(opf, '/o2:package/o2:spine/@toc') if result: id = result[0] if id not in self.oeb.manifest.ids: return None item = self.oeb.manifest.ids[id] self.oeb.manifest.remove(item) return item for item in list(self.oeb.manifest.values()): if item.media_type == NCX_MIME: self.oeb.manifest.remove(item) return item return None
def dataize_svg(self, item, svg=None): if svg is None: svg = item.data hrefs = self.oeb.manifest.hrefs for elem in xpath(svg, '//svg:*[@xl:href]'): href = urlnormalize(elem.attrib[XLINK('href')]) path = urldefrag(href)[0] if not path: continue abshref = item.abshref(path) if abshref not in hrefs: continue linkee = hrefs[abshref] data = base64.encodestring(str(linkee)) data = "data:%s;base64,%s" % (linkee.media_type, data) elem.attrib[XLINK('href')] = data return svg
def _spine_from_opf(self, opf): spine = self.oeb.spine manifest = self.oeb.manifest for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): idref = elem.get('idref') if idref not in manifest.ids: self.logger.warn(u'Spine item %r not found' % idref) continue item = manifest.ids[idref] spine.add(item, elem.get('linear')) for item in spine: if item.media_type.lower() not in OEB_DOCS: if not hasattr(item.data, 'xpath'): self.oeb.log.warn('The item %s is not a XML document.' ' Removing it from spine.'%item.href) spine.remove(item) if len(spine) == 0: raise OEBError("Spine is empty") self._spine_add_extra()
def _spine_from_opf(self, opf): spine = self.oeb.spine manifest = self.oeb.manifest for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): idref = elem.get('idref') if idref not in manifest.ids: self.logger.warn(u'Spine item %r not found' % idref) continue item = manifest.ids[idref] spine.add(item, elem.get('linear')) for item in spine: if item.media_type.lower() not in OEB_DOCS: if not hasattr(item.data, 'xpath'): self.oeb.log.warn('The item %s is not a XML document.' ' Removing it from spine.' % item.href) spine.remove(item) if len(spine) == 0: raise OEBError("Spine is empty") self._spine_add_extra()
def _guide_from_opf(self, opf): guide = self.oeb.guide manifest = self.oeb.manifest for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): ref_href = elem.get('href') path = urlnormalize(urldefrag(ref_href)[0]) if path not in manifest.hrefs: corrected_href = None for href in manifest.hrefs: if href.lower() == path.lower(): corrected_href = href break if corrected_href is None: self.logger.warn(u'Guide reference %r not found' % ref_href) continue ref_href = corrected_href typ = elem.get('type') if typ not in guide: guide.add(typ, elem.get('title'), ref_href)
def _pages_from_page_map(self, opf): item = self._find_page_map(opf) if item is None: return False pmap = item.data pages = self.oeb.pages for page in xpath(pmap, 'o2:page'): name = page.get('name', '') href = page.get('href') if not href: continue name = COLLAPSE_RE.sub(' ', name.strip()) href = item.abshref(urlnormalize(href)) type = 'normal' if not name: type = 'special' elif name.lower().strip('ivxlcdm') == '': type = 'front' pages.add(name, href, type=type) return True
def dataize_svg(self, item, svg=None): if svg is None: svg = item.data hrefs = self.oeb.manifest.hrefs for elem in xpath(svg, '//svg:*[@xl:href]'): href = urlnormalize(elem.attrib[XLINK('href')]) path = urldefrag(href)[0] if not path: continue abshref = item.abshref(path) if abshref not in hrefs: continue linkee = hrefs[abshref] data = str(linkee) ext = what(None, data) or 'jpg' with PersistentTemporaryFile(suffix='.'+ext) as pt: pt.write(data) self.temp_files.append(pt.name) elem.attrib[XLINK('href')] = pt.name return svg
def dataize_svg(self, item, svg=None): if svg is None: svg = item.data hrefs = self.oeb.manifest.hrefs for elem in xpath(svg, '//svg:*[@xl:href]'): href = urlnormalize(elem.attrib[XLINK('href')]) path = urldefrag(href)[0] if not path: continue abshref = item.abshref(path) if abshref not in hrefs: continue linkee = hrefs[abshref] data = str(linkee) ext = what(None, data) or 'jpg' with PersistentTemporaryFile(suffix='.' + ext) as pt: pt.write(data) self.temp_files.append(pt.name) elem.attrib[XLINK('href')] = pt.name return svg
def create_oebbook(self, htmlpath, basedir, opts, log, mi): import uuid from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.oeb.base import (DirContainer, rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES, xpath) from calibre import guess_type from calibre.ebooks.oeb.transforms.metadata import \ meta_info_to_oeb_metadata from calibre.ebooks.html.input import get_filelist from calibre.ebooks.metadata import string_to_authors from calibre.utils.localization import canonicalize_lang import cssutils, logging cssutils.log.setLevel(logging.WARN) self.OEB_STYLES = OEB_STYLES oeb = create_oebbook(log, None, opts, self, encoding=opts.input_encoding, populate=False) self.oeb = oeb metadata = oeb.metadata meta_info_to_oeb_metadata(mi, metadata, log) if not metadata.language: l = canonicalize_lang(getattr(opts, 'language', None)) if not l: oeb.logger.warn(u'Language not specified') l = get_lang().replace('_', '-') metadata.add('language', l) if not metadata.creator: a = getattr(opts, 'authors', None) if a: a = string_to_authors(a) if not a: oeb.logger.warn('Creator not specified') a = [self.oeb.translate(__('Unknown'))] for aut in a: metadata.add('creator', aut) if not metadata.title: oeb.logger.warn('Title not specified') metadata.add('title', self.oeb.translate(__('Unknown'))) bookid = str(uuid.uuid4()) metadata.add('identifier', bookid, id='uuid_id', scheme='uuid') for ident in metadata.identifier: if 'id' in ident.attrib: self.oeb.uid = metadata.identifier[0] break filelist = get_filelist(htmlpath, basedir, opts, log) filelist = [f for f in filelist if not f.is_binary] htmlfile_map = {} for f in filelist: path = f.path oeb.container = DirContainer(os.path.dirname(path), log, ignore_opf=True) bname = os.path.basename(path) id, href = oeb.manifest.generate(id='html', href=ascii_filename(bname)) htmlfile_map[path] = href item = oeb.manifest.add(id, href, 'text/html') item.html_input_href = bname oeb.spine.add(item, True) self.added_resources = {} self.log = log self.log('Normalizing filename cases') for path, href in htmlfile_map.items(): if not self.is_case_sensitive(path): path = path.lower() self.added_resources[path] = href self.urlnormalize, self.DirContainer = urlnormalize, DirContainer self.urldefrag = urldefrag self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME self.log('Rewriting HTML links') for f in filelist: path = f.path dpath = os.path.dirname(path) oeb.container = DirContainer(dpath, log, ignore_opf=True) item = oeb.manifest.hrefs[htmlfile_map[path]] rewrite_links(item.data, partial(self.resource_adder, base=dpath)) for item in oeb.manifest.values(): if item.media_type in self.OEB_STYLES: dpath = None for path, href in self.added_resources.items(): if href == item.href: dpath = os.path.dirname(path) break cssutils.replaceUrls(item.data, partial(self.resource_adder, base=dpath)) toc = self.oeb.toc self.oeb.auto_generated_toc = True titles = [] headers = [] for item in self.oeb.spine: if not item.linear: continue html = item.data title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) title = re.sub(r'\s+', ' ', title.strip()) if title: titles.append(title) headers.append('(unlabled)') for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): expr = '/h:html/h:body//h:%s[position()=1]/text()' header = ''.join(xpath(html, expr % tag)) header = re.sub(r'\s+', ' ', header.strip()) if header: headers[-1] = header break use = titles if len(titles) > len(set(titles)): use = headers for title, item in izip(use, self.oeb.spine): if not item.linear: continue toc.add(title, item.href) oeb.container = DirContainer(os.getcwdu(), oeb.log, ignore_opf=True) return oeb
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='', user_css=''): self.oeb, self.opts = oeb, opts self.profile = profile if self.profile is None: # Use the default profile. This should really be using # opts.output_profile, but I don't want to risk changing it, as # doing so might well have hard to debug font size effects. from calibre.customize.ui import output_profiles for x in output_profiles(): if x.short_name == 'default': self.profile = x break if self.profile is None: # Just in case the default profile is removed in the future :) self.profile = opts.output_profile self.logger = oeb.logger item = oeb.manifest.hrefs[path] basename = os.path.basename(path) cssname = os.path.splitext(basename)[0] + '.css' stylesheets = [html_css_stylesheet()] head = xpath(tree, '/h:html/h:head') if head: head = head[0] else: head = [] # Add cssutils parsing profiles from output_profile for profile in self.opts.output_profile.extra_css_modules: cssprofiles.addProfile(profile['name'], profile['props'], profile['macros']) parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css')) self.font_face_rules = [] for elem in head: if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES): text = elem.text if elem.text else u'' for x in elem: t = getattr(x, 'text', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') t = getattr(x, 'tail', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') if text: text = XHTML_CSS_NAMESPACE + text text = oeb.css_preprocessor(text) stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheet.namespaces['h'] = XHTML_NS stylesheets.append(stylesheet) # Make links to resources absolute, since these rules will # be folded into a stylesheet at the root replaceUrls(stylesheet, item.abshref, ignoreImportRules=True) elif elem.tag == XHTML('link') and elem.get('href') \ and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \ and elem.get('type', CSS_MIME).lower() in OEB_STYLES: href = urlnormalize(elem.attrib['href']) path = item.abshref(href) sitem = oeb.manifest.hrefs.get(path, None) if sitem is None: self.logger.warn( 'Stylesheet %r referenced by file %r not in manifest' % (path, item.href)) continue if not hasattr(sitem.data, 'cssRules'): self.logger.warn( 'Stylesheet %r referenced by file %r is not CSS' % (path, item.href)) continue stylesheets.append(sitem.data) csses = {'extra_css': extra_css, 'user_css': user_css} for w, x in csses.items(): if x: try: text = XHTML_CSS_NAMESPACE + x stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheet.namespaces['h'] = XHTML_NS stylesheets.append(stylesheet) except: self.logger.exception('Failed to parse %s, ignoring.' % w) self.logger.debug('Bad css: ') self.logger.debug(x) rules = [] index = 0 self.stylesheets = set() self.page_rule = {} for stylesheet in stylesheets: href = stylesheet.href self.stylesheets.add(href) for rule in stylesheet.cssRules: rules.extend(self.flatten_rule(rule, href, index)) index = index + 1 rules.sort() self.rules = rules self._styles = {} for _, _, cssdict, text, _ in rules: fl = ':first-letter' in text if fl: text = text.replace(':first-letter', '') selector = get_css_selector(text) matches = selector(tree, self.logger) if fl: from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS) for elem in matches: for x in elem.iter(): if x.text: punctuation_chars = [] text = unicode(x.text) while text: if not unicodedata.category( text[0]).startswith('P'): break punctuation_chars.append(text[0]) text = text[1:] special_text = u''.join(punctuation_chars) + \ (text[0] if text else u'') span = E.span(special_text) span.tail = text[1:] x.text = None x.insert(0, span) self.style(span)._update_cssdict(cssdict) break else: for elem in matches: self.style(elem)._update_cssdict(cssdict) for elem in xpath(tree, '//h:*[@style]'): self.style(elem)._apply_style_attr(url_replacer=item.abshref) num_pat = re.compile(r'\d+$') for elem in xpath(tree, '//h:img[@width or @height]'): style = self.style(elem) # Check if either height or width is not default is_styled = style._style.get('width', 'auto') != 'auto' or \ style._style.get('height', 'auto') != 'auto' if not is_styled: # Update img style dimension using width and height upd = {} for prop in ('width', 'height'): val = elem.get(prop, '').strip() try: del elem.attrib[prop] except: pass if val: if num_pat.match(val) is not None: val += 'px' upd[prop] = val if upd: style._update_cssdict(upd)
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='', user_css='', base_css=''): self.oeb, self.opts = oeb, opts self.profile = profile if self.profile is None: # Use the default profile. This should really be using # opts.output_profile, but I don't want to risk changing it, as # doing so might well have hard to debug font size effects. from calibre.customize.ui import output_profiles for x in output_profiles(): if x.short_name == 'default': self.profile = x break if self.profile is None: # Just in case the default profile is removed in the future :) self.profile = opts.output_profile self.body_font_size = self.profile.fbase self.logger = oeb.logger item = oeb.manifest.hrefs[path] basename = os.path.basename(path) cssname = os.path.splitext(basename)[0] + '.css' stylesheets = [html_css_stylesheet()] if base_css: stylesheets.append(parseString(base_css, validate=False)) style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]') # Add cssutils parsing profiles from output_profile for profile in self.opts.output_profile.extra_css_modules: cssprofiles.addProfile(profile['name'], profile['props'], profile['macros']) parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css')) self.font_face_rules = [] for elem in style_tags: if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES): text = elem.text if elem.text else u'' for x in elem: t = getattr(x, 'text', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') t = getattr(x, 'tail', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') if text: text = oeb.css_preprocessor(text) # We handle @import rules separately parser.setFetcher(lambda x: ('utf-8', b'')) stylesheet = parser.parseString(text, href=cssname, validate=False) parser.setFetcher(self._fetch_css_file) for rule in stylesheet.cssRules: if rule.type == rule.IMPORT_RULE: ihref = item.abshref(rule.href) if rule.media.mediaText == 'amzn-mobi': continue hrefs = self.oeb.manifest.hrefs if ihref not in hrefs: self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href) continue sitem = hrefs[ihref] if sitem.media_type not in OEB_STYLES: self.logger.warn('CSS @import of non-CSS file %r' % rule.href) continue stylesheets.append(sitem.data) for rule in tuple(stylesheet.cssRules.rulesOfType(CSSRule.PAGE_RULE)): stylesheet.cssRules.remove(rule) # Make links to resources absolute, since these rules will # be folded into a stylesheet at the root replaceUrls(stylesheet, item.abshref, ignoreImportRules=True) stylesheets.append(stylesheet) elif elem.tag == XHTML('link') and elem.get('href') \ and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \ and elem.get('type', CSS_MIME).lower() in OEB_STYLES: href = urlnormalize(elem.attrib['href']) path = item.abshref(href) sitem = oeb.manifest.hrefs.get(path, None) if sitem is None: self.logger.warn( 'Stylesheet %r referenced by file %r not in manifest' % (path, item.href)) continue if not hasattr(sitem.data, 'cssRules'): self.logger.warn( 'Stylesheet %r referenced by file %r is not CSS'%(path, item.href)) continue stylesheets.append(sitem.data) csses = {'extra_css':extra_css, 'user_css':user_css} for w, x in csses.items(): if x: try: text = x stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheets.append(stylesheet) except: self.logger.exception('Failed to parse %s, ignoring.'%w) self.logger.debug('Bad css: ') self.logger.debug(x) rules = [] index = 0 self.stylesheets = set() self.page_rule = {} for sheet_index, stylesheet in enumerate(stylesheets): href = stylesheet.href self.stylesheets.add(href) for rule in stylesheet.cssRules: if rule.type == rule.MEDIA_RULE: media = {rule.media.item(i) for i in xrange(rule.media.length)} if not media.intersection({'all', 'screen', 'amzn-kf8'}): continue for subrule in rule.cssRules: rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0)) index += 1 else: rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0)) index = index + 1 rules.sort() self.rules = rules self._styles = {} pseudo_pat = re.compile(ur':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I) select = Select(tree, ignore_inappropriate_pseudo_classes=True) for _, _, cssdict, text, _ in rules: fl = pseudo_pat.search(text) try: matches = tuple(select(text)) except SelectorError as err: self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err))) continue if fl is not None: fl = fl.group(1) if fl == 'first-letter' and getattr(self.oeb, 'plumber_output_format', '').lower() in {u'mobi', u'docx'}: # Fake first-letter from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS) for elem in matches: for x in elem.iter('*'): if x.text: punctuation_chars = [] text = unicode(x.text) while text: category = unicodedata.category(text[0]) if category[0] not in {'P', 'Z'}: break punctuation_chars.append(text[0]) text = text[1:] special_text = u''.join(punctuation_chars) + \ (text[0] if text else u'') span = E.span(special_text) span.set('data-fake-first-letter', '1') span.tail = text[1:] x.text = None x.insert(0, span) self.style(span)._update_cssdict(cssdict) break else: # Element pseudo-class for elem in matches: self.style(elem)._update_pseudo_class(fl, cssdict) else: for elem in matches: self.style(elem)._update_cssdict(cssdict) for elem in xpath(tree, '//h:*[@style]'): self.style(elem)._apply_style_attr(url_replacer=item.abshref) num_pat = re.compile(r'[0-9.]+$') for elem in xpath(tree, '//h:img[@width or @height]'): style = self.style(elem) # Check if either height or width is not default is_styled = style._style.get('width', 'auto') != 'auto' or \ style._style.get('height', 'auto') != 'auto' if not is_styled: # Update img style dimension using width and height upd = {} for prop in ('width', 'height'): val = elem.get(prop, '').strip() try: del elem.attrib[prop] except: pass if val: if num_pat.match(val) is not None: val += 'px' upd[prop] = val if upd: style._update_cssdict(upd)
def create_oebbook(self, htmlpath, basedir, opts, log, mi): import uuid from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.oeb.base import ( DirContainer, rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES, xpath, ) from calibre import guess_type from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata from calibre.ebooks.html.input import get_filelist import cssutils, logging cssutils.log.setLevel(logging.WARN) self.OEB_STYLES = OEB_STYLES oeb = create_oebbook(log, None, opts, self, encoding=opts.input_encoding, populate=False) self.oeb = oeb metadata = oeb.metadata meta_info_to_oeb_metadata(mi, metadata, log) if not metadata.language: oeb.logger.warn("Language not specified") metadata.add("language", get_lang().replace("_", "-")) if not metadata.creator: oeb.logger.warn("Creator not specified") metadata.add("creator", self.oeb.translate(__("Unknown"))) if not metadata.title: oeb.logger.warn("Title not specified") metadata.add("title", self.oeb.translate(__("Unknown"))) bookid = str(uuid.uuid4()) metadata.add("identifier", bookid, id="uuid_id", scheme="uuid") for ident in metadata.identifier: if "id" in ident.attrib: self.oeb.uid = metadata.identifier[0] break filelist = get_filelist(htmlpath, basedir, opts, log) filelist = [f for f in filelist if not f.is_binary] htmlfile_map = {} for f in filelist: path = f.path oeb.container = DirContainer(os.path.dirname(path), log, ignore_opf=True) bname = os.path.basename(path) id, href = oeb.manifest.generate(id="html", href=ascii_filename(bname)) htmlfile_map[path] = href item = oeb.manifest.add(id, href, "text/html") item.html_input_href = bname oeb.spine.add(item, True) self.added_resources = {} self.log = log self.log("Normalizing filename cases") for path, href in htmlfile_map.items(): if not self.is_case_sensitive(path): path = path.lower() self.added_resources[path] = href self.urlnormalize, self.DirContainer = urlnormalize, DirContainer self.urldefrag = urldefrag self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME self.log("Rewriting HTML links") for f in filelist: path = f.path dpath = os.path.dirname(path) oeb.container = DirContainer(dpath, log, ignore_opf=True) item = oeb.manifest.hrefs[htmlfile_map[path]] rewrite_links(item.data, partial(self.resource_adder, base=dpath)) for item in oeb.manifest.values(): if item.media_type in self.OEB_STYLES: dpath = None for path, href in self.added_resources.items(): if href == item.href: dpath = os.path.dirname(path) break cssutils.replaceUrls(item.data, partial(self.resource_adder, base=dpath)) toc = self.oeb.toc self.oeb.auto_generated_toc = True titles = [] headers = [] for item in self.oeb.spine: if not item.linear: continue html = item.data title = "".join(xpath(html, "/h:html/h:head/h:title/text()")) title = re.sub(r"\s+", " ", title.strip()) if title: titles.append(title) headers.append("(unlabled)") for tag in ("h1", "h2", "h3", "h4", "h5", "strong"): expr = "/h:html/h:body//h:%s[position()=1]/text()" header = "".join(xpath(html, expr % tag)) header = re.sub(r"\s+", " ", header.strip()) if header: headers[-1] = header break use = titles if len(titles) > len(set(titles)): use = headers for title, item in izip(use, self.oeb.spine): if not item.linear: continue toc.add(title, item.href) oeb.container = DirContainer(os.getcwdu(), oeb.log, ignore_opf=True) return oeb
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='', user_css='', base_css=''): self.oeb, self.opts = oeb, opts self.profile = profile if self.profile is None: # Use the default profile. This should really be using # opts.output_profile, but I don't want to risk changing it, as # doing so might well have hard to debug font size effects. from calibre.customize.ui import output_profiles for x in output_profiles(): if x.short_name == 'default': self.profile = x break if self.profile is None: # Just in case the default profile is removed in the future :) self.profile = opts.output_profile self.body_font_size = self.profile.fbase self.logger = oeb.logger item = oeb.manifest.hrefs[path] basename = os.path.basename(path) cssname = os.path.splitext(basename)[0] + '.css' stylesheets = [html_css_stylesheet()] if base_css: stylesheets.append(parseString(base_css, validate=False)) style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]') # Add cssutils parsing profiles from output_profile for profile in self.opts.output_profile.extra_css_modules: cssprofiles.addProfile(profile['name'], profile['props'], profile['macros']) parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css')) self.font_face_rules = [] for elem in style_tags: if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES and media_ok(elem.get('media'))): text = elem.text if elem.text else u'' for x in elem: t = getattr(x, 'text', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') t = getattr(x, 'tail', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') if text: text = oeb.css_preprocessor(text) # We handle @import rules separately parser.setFetcher(lambda x: ('utf-8', b'')) stylesheet = parser.parseString(text, href=cssname, validate=False) parser.setFetcher(self._fetch_css_file) for rule in stylesheet.cssRules: if rule.type == rule.IMPORT_RULE: ihref = item.abshref(rule.href) if not media_ok(rule.media.mediaText): continue hrefs = self.oeb.manifest.hrefs if ihref not in hrefs: self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href) continue sitem = hrefs[ihref] if sitem.media_type not in OEB_STYLES: self.logger.warn('CSS @import of non-CSS file %r' % rule.href) continue stylesheets.append(sitem.data) # Make links to resources absolute, since these rules will # be folded into a stylesheet at the root replaceUrls(stylesheet, item.abshref, ignoreImportRules=True) stylesheets.append(stylesheet) elif (elem.tag == XHTML('link') and elem.get('href') and elem.get( 'rel', 'stylesheet').lower() == 'stylesheet' and elem.get( 'type', CSS_MIME).lower() in OEB_STYLES and media_ok(elem.get('media')) ): href = urlnormalize(elem.attrib['href']) path = item.abshref(href) sitem = oeb.manifest.hrefs.get(path, None) if sitem is None: self.logger.warn( 'Stylesheet %r referenced by file %r not in manifest' % (path, item.href)) continue if not hasattr(sitem.data, 'cssRules'): self.logger.warn( 'Stylesheet %r referenced by file %r is not CSS'%(path, item.href)) continue stylesheets.append(sitem.data) csses = {'extra_css':extra_css, 'user_css':user_css} for w, x in csses.items(): if x: try: text = x stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheets.append(stylesheet) except: self.logger.exception('Failed to parse %s, ignoring.'%w) self.logger.debug('Bad css: ') self.logger.debug(x) rules = [] index = 0 self.stylesheets = set() self.page_rule = {} for sheet_index, stylesheet in enumerate(stylesheets): href = stylesheet.href self.stylesheets.add(href) for rule in stylesheet.cssRules: if rule.type == rule.MEDIA_RULE: if media_ok(rule.media.mediaText): for subrule in rule.cssRules: rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0)) index += 1 else: rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0)) index = index + 1 rules.sort() self.rules = rules self._styles = {} pseudo_pat = re.compile(ur':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I) select = Select(tree, ignore_inappropriate_pseudo_classes=True) for _, _, cssdict, text, _ in rules: fl = pseudo_pat.search(text) try: matches = tuple(select(text)) except SelectorError as err: self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err))) continue if fl is not None: fl = fl.group(1) if fl == 'first-letter' and getattr(self.oeb, 'plumber_output_format', '').lower() in {u'mobi', u'docx'}: # Fake first-letter for elem in matches: for x in elem.iter('*'): if x.text: punctuation_chars = [] text = unicode(x.text) while text: category = unicodedata.category(text[0]) if category[0] not in {'P', 'Z'}: break punctuation_chars.append(text[0]) text = text[1:] special_text = u''.join(punctuation_chars) + \ (text[0] if text else u'') span = x.makeelement('{%s}span' % XHTML_NS) span.text = special_text span.set('data-fake-first-letter', '1') span.tail = text[1:] x.text = None x.insert(0, span) self.style(span)._update_cssdict(cssdict) break else: # Element pseudo-class for elem in matches: self.style(elem)._update_pseudo_class(fl, cssdict) else: for elem in matches: self.style(elem)._update_cssdict(cssdict) for elem in xpath(tree, '//h:*[@style]'): self.style(elem)._apply_style_attr(url_replacer=item.abshref) num_pat = re.compile(r'[0-9.]+$') for elem in xpath(tree, '//h:img[@width or @height]'): style = self.style(elem) # Check if either height or width is not default is_styled = style._style.get('width', 'auto') != 'auto' or \ style._style.get('height', 'auto') != 'auto' if not is_styled: # Update img style dimension using width and height upd = {} for prop in ('width', 'height'): val = elem.get(prop, '').strip() try: del elem.attrib[prop] except: pass if val: if num_pat.match(val) is not None: val += 'px' upd[prop] = val if upd: style._update_cssdict(upd)
def create_oebbook(self, htmlpath, basedir, opts, log, mi): import uuid from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.oeb.base import (DirContainer, rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES, xpath, urlquote) from calibre import guess_type from calibre.ebooks.oeb.transforms.metadata import \ meta_info_to_oeb_metadata from calibre.ebooks.html.input import get_filelist from calibre.ebooks.metadata import string_to_authors from calibre.utils.localization import canonicalize_lang import css_parser, logging css_parser.log.setLevel(logging.WARN) self.OEB_STYLES = OEB_STYLES oeb = create_oebbook(log, None, opts, self, encoding=opts.input_encoding, populate=False) self.oeb = oeb metadata = oeb.metadata meta_info_to_oeb_metadata(mi, metadata, log) if not metadata.language: l = canonicalize_lang(getattr(opts, 'language', None)) if not l: oeb.logger.warn('Language not specified') l = get_lang().replace('_', '-') metadata.add('language', l) if not metadata.creator: a = getattr(opts, 'authors', None) if a: a = string_to_authors(a) if not a: oeb.logger.warn('Creator not specified') a = [self.oeb.translate(__('Unknown'))] for aut in a: metadata.add('creator', aut) if not metadata.title: oeb.logger.warn('Title not specified') metadata.add('title', self.oeb.translate(__('Unknown'))) bookid = unicode_type(uuid.uuid4()) metadata.add('identifier', bookid, id='uuid_id', scheme='uuid') for ident in metadata.identifier: if 'id' in ident.attrib: self.oeb.uid = metadata.identifier[0] break filelist = get_filelist(htmlpath, basedir, opts, log) filelist = [f for f in filelist if not f.is_binary] htmlfile_map = {} for f in filelist: path = f.path oeb.container = DirContainer(os.path.dirname(path), log, ignore_opf=True) bname = os.path.basename(path) id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname)) htmlfile_map[path] = href item = oeb.manifest.add(id, href, 'text/html') if path == htmlpath and '%' in path: bname = urlquote(bname) item.html_input_href = bname oeb.spine.add(item, True) self.added_resources = {} self.log = log self.log('Normalizing filename cases') for path, href in htmlfile_map.items(): if not self.is_case_sensitive(path): path = path.lower() self.added_resources[path] = href self.urlnormalize, self.DirContainer = urlnormalize, DirContainer self.urldefrag = urldefrag self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME self.log('Rewriting HTML links') for f in filelist: path = f.path dpath = os.path.dirname(path) oeb.container = DirContainer(dpath, log, ignore_opf=True) href = htmlfile_map[path] try: item = oeb.manifest.hrefs[href] except KeyError: item = oeb.manifest.hrefs[urlnormalize(href)] rewrite_links(item.data, partial(self.resource_adder, base=dpath)) for item in oeb.manifest.values(): if item.media_type in self.OEB_STYLES: dpath = None for path, href in self.added_resources.items(): if href == item.href: dpath = os.path.dirname(path) break css_parser.replaceUrls( item.data, partial(self.resource_adder, base=dpath)) toc = self.oeb.toc self.oeb.auto_generated_toc = True titles = [] headers = [] for item in self.oeb.spine: if not item.linear: continue html = item.data title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) title = re.sub(r'\s+', ' ', title.strip()) if title: titles.append(title) headers.append('(unlabled)') for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): expr = '/h:html/h:body//h:%s[position()=1]/text()' header = ''.join(xpath(html, expr % tag)) header = re.sub(r'\s+', ' ', header.strip()) if header: headers[-1] = header break use = titles if len(titles) > len(set(titles)): use = headers for title, item in zip(use, self.oeb.spine): if not item.linear: continue toc.add(title, item.href) oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True) return oeb
def _metadata_from_opf(self, opf): for e in xpath(opf, 'o2:metadata//o2:meta'): if e.attrib.get('name') == 'original-resolution': comic_book_exth_values['original-resolution'] = e.attrib.get('content', '660x800') return OEBReader._metadata_from_opf(self, opf)
def __init__(self, tree, path, oeb, opts, profile=None, extra_css="", user_css=""): self.oeb, self.opts = oeb, opts self.profile = profile if self.profile is None: # Use the default profile. This should really be using # opts.output_profile, but I don't want to risk changing it, as # doing so might well have hard to debug font size effects. from calibre.customize.ui import output_profiles for x in output_profiles(): if x.short_name == "default": self.profile = x break if self.profile is None: # Just in case the default profile is removed in the future :) self.profile = opts.output_profile self.body_font_size = self.profile.fbase self.logger = oeb.logger item = oeb.manifest.hrefs[path] basename = os.path.basename(path) cssname = os.path.splitext(basename)[0] + ".css" stylesheets = [html_css_stylesheet()] style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]') # Add cssutils parsing profiles from output_profile for profile in self.opts.output_profile.extra_css_modules: cssprofiles.addProfile(profile["name"], profile["props"], profile["macros"]) parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger("calibre.css")) self.font_face_rules = [] for elem in style_tags: if elem.tag == XHTML("style") and elem.get("type", CSS_MIME) in OEB_STYLES: text = elem.text if elem.text else u"" for x in elem: t = getattr(x, "text", None) if t: text += u"\n\n" + force_unicode(t, u"utf-8") t = getattr(x, "tail", None) if t: text += u"\n\n" + force_unicode(t, u"utf-8") if text: text = oeb.css_preprocessor(text, add_namespace=True) # We handle @import rules separately parser.setFetcher(lambda x: ("utf-8", b"")) stylesheet = parser.parseString(text, href=cssname, validate=False) parser.setFetcher(self._fetch_css_file) stylesheet.namespaces["h"] = XHTML_NS for rule in stylesheet.cssRules: if rule.type == rule.IMPORT_RULE: ihref = item.abshref(rule.href) if rule.media.mediaText == "amzn-mobi": continue hrefs = self.oeb.manifest.hrefs if ihref not in hrefs: self.logger.warn("Ignoring missing stylesheet in @import rule:", rule.href) continue sitem = hrefs[ihref] if sitem.media_type not in OEB_STYLES: self.logger.warn("CSS @import of non-CSS file %r" % rule.href) continue stylesheets.append(sitem.data) for rule in tuple(stylesheet.cssRules.rulesOfType(CSSRule.PAGE_RULE)): stylesheet.cssRules.remove(rule) # Make links to resources absolute, since these rules will # be folded into a stylesheet at the root replaceUrls(stylesheet, item.abshref, ignoreImportRules=True) stylesheets.append(stylesheet) elif ( elem.tag == XHTML("link") and elem.get("href") and elem.get("rel", "stylesheet").lower() == "stylesheet" and elem.get("type", CSS_MIME).lower() in OEB_STYLES ): href = urlnormalize(elem.attrib["href"]) path = item.abshref(href) sitem = oeb.manifest.hrefs.get(path, None) if sitem is None: self.logger.warn("Stylesheet %r referenced by file %r not in manifest" % (path, item.href)) continue if not hasattr(sitem.data, "cssRules"): self.logger.warn("Stylesheet %r referenced by file %r is not CSS" % (path, item.href)) continue stylesheets.append(sitem.data) csses = {"extra_css": extra_css, "user_css": user_css} for w, x in csses.items(): if x: try: text = XHTML_CSS_NAMESPACE + x stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheet.namespaces["h"] = XHTML_NS stylesheets.append(stylesheet) except: self.logger.exception("Failed to parse %s, ignoring." % w) self.logger.debug("Bad css: ") self.logger.debug(x) rules = [] index = 0 self.stylesheets = set() self.page_rule = {} for sheet_index, stylesheet in enumerate(stylesheets): href = stylesheet.href self.stylesheets.add(href) for rule in stylesheet.cssRules: if rule.type == rule.MEDIA_RULE: media = {rule.media.item(i) for i in xrange(rule.media.length)} if not media.intersection({"all", "screen", "amzn-kf8"}): continue for subrule in rule.cssRules: rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index == 0)) index += 1 else: rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index == 0)) index = index + 1 rules.sort() self.rules = rules self._styles = {} pseudo_pat = re.compile(ur":(first-letter|first-line|link|hover|visited|active|focus|before|after)", re.I) for _, _, cssdict, text, _ in rules: fl = pseudo_pat.search(text) if fl is not None: text = text.replace(fl.group(), "") selector = get_css_selector(text, self.oeb.log) matches = selector(tree, self.logger) if fl is not None: fl = fl.group(1) if fl == "first-letter" and getattr(self.oeb, "plumber_output_format", "").lower() == u"mobi": # Fake first-letter from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS) for elem in matches: for x in elem.iter(): if x.text: punctuation_chars = [] text = unicode(x.text) while text: category = unicodedata.category(text[0]) if category[0] not in {"P", "Z"}: break punctuation_chars.append(text[0]) text = text[1:] special_text = u"".join(punctuation_chars) + (text[0] if text else u"") span = E.span(special_text) span.tail = text[1:] x.text = None x.insert(0, span) self.style(span)._update_cssdict(cssdict) break else: # Element pseudo-class for elem in matches: self.style(elem)._update_pseudo_class(fl, cssdict) else: for elem in matches: self.style(elem)._update_cssdict(cssdict) for elem in xpath(tree, "//h:*[@style]"): self.style(elem)._apply_style_attr(url_replacer=item.abshref) num_pat = re.compile(r"[0-9.]+$") for elem in xpath(tree, "//h:img[@width or @height]"): style = self.style(elem) # Check if either height or width is not default is_styled = style._style.get("width", "auto") != "auto" or style._style.get("height", "auto") != "auto" if not is_styled: # Update img style dimension using width and height upd = {} for prop in ("width", "height"): val = elem.get(prop, "").strip() try: del elem.attrib[prop] except: pass if val: if num_pat.match(val) is not None: val += "px" upd[prop] = val if upd: style._update_cssdict(upd)
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='', user_css=''): self.oeb, self.opts = oeb, opts self.profile = profile if self.profile is None: self.profile = opts.output_profile self.logger = oeb.logger item = oeb.manifest.hrefs[path] basename = os.path.basename(path) cssname = os.path.splitext(basename)[0] + '.css' stylesheets = [html_css_stylesheet()] head = xpath(tree, '/h:html/h:head') if head: head = head[0] else: head = [] # Add cssutils parsing profiles from output_profile for profile in self.opts.output_profile.extra_css_modules: cssprofiles.addProfile(profile['name'], profile['props'], profile['macros']) parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css')) self.font_face_rules = [] for elem in head: if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES): text = elem.text if elem.text else u'' for x in elem: t = getattr(x, 'text', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') t = getattr(x, 'tail', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') if text: text = oeb.css_preprocessor(text, add_namespace=True) # We handle @import rules separately parser.setFetcher(lambda x: ('utf-8', b'')) stylesheet = parser.parseString(text, href=cssname, validate=False) parser.setFetcher(self._fetch_css_file) stylesheet.namespaces['h'] = XHTML_NS for rule in stylesheet.cssRules: if rule.type == rule.IMPORT_RULE: ihref = item.abshref(rule.href) if rule.media.mediaText == 'amzn-mobi': continue hrefs = self.oeb.manifest.hrefs if ihref not in hrefs: self.logger.warn( 'Ignoring missing stylesheet in @import rule:', rule.href) continue sitem = hrefs[ihref] if sitem.media_type not in OEB_STYLES: self.logger.warn( 'CSS @import of non-CSS file %r' % rule.href) continue stylesheets.append(sitem.data) # Make links to resources absolute, since these rules will # be folded into a stylesheet at the root replaceUrls(stylesheet, item.abshref, ignoreImportRules=True) stylesheets.append(stylesheet) elif elem.tag == XHTML('link') and elem.get('href') \ and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \ and elem.get('type', CSS_MIME).lower() in OEB_STYLES: href = urlnormalize(elem.attrib['href']) path = item.abshref(href) sitem = oeb.manifest.hrefs.get(path, None) if sitem is None: self.logger.warn( 'Stylesheet %r referenced by file %r not in manifest' % (path, item.href)) continue if not hasattr(sitem.data, 'cssRules'): self.logger.warn( 'Stylesheet %r referenced by file %r is not CSS' % (path, item.href)) continue stylesheets.append(sitem.data) csses = {'extra_css': extra_css, 'user_css': user_css} for w, x in csses.items(): if x: try: text = XHTML_CSS_NAMESPACE + x stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheet.namespaces['h'] = XHTML_NS stylesheets.append(stylesheet) except: self.logger.exception('Failed to parse %s, ignoring.' % w) self.logger.debug('Bad css: ') self.logger.debug(x) rules = [] index = 0 self.stylesheets = set() self.page_rule = {} for stylesheet in stylesheets: href = stylesheet.href self.stylesheets.add(href) for rule in stylesheet.cssRules: if rule.type == rule.MEDIA_RULE: media = { rule.media.item(i) for i in xrange(rule.media.length) } if not media.intersection({'all', 'screen', 'amzn-kf8'}): continue for subrule in rule.cssRules: rules.extend(self.flatten_rule(subrule, href, index)) index += 1 else: rules.extend(self.flatten_rule(rule, href, index)) index = index + 1 rules.sort() self.rules = rules self._styles = {} pseudo_pat = re.compile( ur':(first-letter|first-line|link|hover|visited|active|focus|before|after)', re.I) for _, _, cssdict, text, _ in rules: fl = pseudo_pat.search(text) if fl is not None: text = text.replace(fl.group(), '') selector = get_css_selector(text, self.oeb.log) matches = selector(tree, self.logger) if fl is not None: fl = fl.group(1) if fl == 'first-letter' and getattr(self.oeb, 'plumber_output_format', '').lower() == u'mobi': # Fake first-letter from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS) for elem in matches: for x in elem.iter(): if x.text: punctuation_chars = [] text = unicode(x.text) while text: category = unicodedata.category(text[0]) if category[0] not in {'P', 'Z'}: break punctuation_chars.append(text[0]) text = text[1:] special_text = u''.join(punctuation_chars) + \ (text[0] if text else u'') span = E.span(special_text) span.tail = text[1:] x.text = None x.insert(0, span) self.style(span)._update_cssdict(cssdict) break else: # Element pseudo-class for elem in matches: self.style(elem)._update_pseudo_class(fl, cssdict) else: for elem in matches: self.style(elem)._update_cssdict(cssdict) for elem in xpath(tree, '//h:*[@style]'): self.style(elem)._apply_style_attr(url_replacer=item.abshref) num_pat = re.compile(r'\d+$') for elem in xpath(tree, '//h:img[@width or @height]'): style = self.style(elem) # Check if either height or width is not default is_styled = style._style.get('width', 'auto') != 'auto' or \ style._style.get('height', 'auto') != 'auto' if not is_styled: # Update img style dimension using width and height upd = {} for prop in ('width', 'height'): val = elem.get(prop, '').strip() try: del elem.attrib[prop] except: pass if val: if num_pat.match(val) is not None: val += 'px' upd[prop] = val if upd: style._update_cssdict(upd)
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='', user_css=''): self.oeb, self.opts = oeb, opts self.profile = profile if self.profile is None: self.profile = opts.output_profile self.logger = oeb.logger item = oeb.manifest.hrefs[path] basename = os.path.basename(path) cssname = os.path.splitext(basename)[0] + '.css' stylesheets = [html_css_stylesheet()] head = xpath(tree, '/h:html/h:head') if head: head = head[0] else: head = [] # Add cssutils parsing profiles from output_profile for profile in self.opts.output_profile.extra_css_modules: cssprofiles.addProfile(profile['name'], profile['props'], profile['macros']) parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css')) self.font_face_rules = [] for elem in head: if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES): text = elem.text if elem.text else u'' for x in elem: t = getattr(x, 'text', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') t = getattr(x, 'tail', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') if text: text = oeb.css_preprocessor(text, add_namespace=True) # We handle @import rules separately parser.setFetcher(lambda x: ('utf-8', b'')) stylesheet = parser.parseString(text, href=cssname, validate=False) parser.setFetcher(self._fetch_css_file) stylesheet.namespaces['h'] = XHTML_NS for rule in stylesheet.cssRules: if rule.type == rule.IMPORT_RULE: ihref = item.abshref(rule.href) if rule.media.mediaText == 'amzn-mobi': continue hrefs = self.oeb.manifest.hrefs if ihref not in hrefs: self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href) continue sitem = hrefs[ihref] if sitem.media_type not in OEB_STYLES: self.logger.warn('CSS @import of non-CSS file %r' % rule.href) continue stylesheets.append(sitem.data) # Make links to resources absolute, since these rules will # be folded into a stylesheet at the root replaceUrls(stylesheet, item.abshref, ignoreImportRules=True) stylesheets.append(stylesheet) elif elem.tag == XHTML('link') and elem.get('href') \ and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \ and elem.get('type', CSS_MIME).lower() in OEB_STYLES: href = urlnormalize(elem.attrib['href']) path = item.abshref(href) sitem = oeb.manifest.hrefs.get(path, None) if sitem is None: self.logger.warn( 'Stylesheet %r referenced by file %r not in manifest' % (path, item.href)) continue if not hasattr(sitem.data, 'cssRules'): self.logger.warn( 'Stylesheet %r referenced by file %r is not CSS'%(path, item.href)) continue stylesheets.append(sitem.data) csses = {'extra_css':extra_css, 'user_css':user_css} for w, x in csses.items(): if x: try: text = XHTML_CSS_NAMESPACE + x stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheet.namespaces['h'] = XHTML_NS stylesheets.append(stylesheet) except: self.logger.exception('Failed to parse %s, ignoring.'%w) self.logger.debug('Bad css: ') self.logger.debug(x) rules = [] index = 0 self.stylesheets = set() self.page_rule = {} for stylesheet in stylesheets: href = stylesheet.href self.stylesheets.add(href) for rule in stylesheet.cssRules: if rule.type == rule.MEDIA_RULE: media = {rule.media.item(i) for i in xrange(rule.media.length)} if not media.intersection({'all', 'screen', 'amzn-kf8'}): continue for subrule in rule.cssRules: rules.extend(self.flatten_rule(subrule, href, index)) index += 1 else: rules.extend(self.flatten_rule(rule, href, index)) index = index + 1 rules.sort() self.rules = rules self._styles = {} pseudo_pat = re.compile(ur':(first-letter|first-line|link|hover|visited|active|focus|before|after)', re.I) for _, _, cssdict, text, _ in rules: fl = pseudo_pat.search(text) if fl is not None: text = text.replace(fl.group(), '') selector = get_css_selector(text, self.oeb.log) matches = selector(tree, self.logger) if fl is not None: fl = fl.group(1) if fl == 'first-letter' and getattr(self.oeb, 'plumber_output_format', '').lower() == u'mobi': # Fake first-letter from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS) for elem in matches: for x in elem.iter(): if x.text: punctuation_chars = [] text = unicode(x.text) while text: category = unicodedata.category(text[0]) if category[0] not in {'P', 'Z'}: break punctuation_chars.append(text[0]) text = text[1:] special_text = u''.join(punctuation_chars) + \ (text[0] if text else u'') span = E.span(special_text) span.tail = text[1:] x.text = None x.insert(0, span) self.style(span)._update_cssdict(cssdict) break else: # Element pseudo-class for elem in matches: self.style(elem)._update_pseudo_class(fl, cssdict) else: for elem in matches: self.style(elem)._update_cssdict(cssdict) for elem in xpath(tree, '//h:*[@style]'): self.style(elem)._apply_style_attr(url_replacer=item.abshref) num_pat = re.compile(r'\d+$') for elem in xpath(tree, '//h:img[@width or @height]'): style = self.style(elem) # Check if either height or width is not default is_styled = style._style.get('width', 'auto') != 'auto' or \ style._style.get('height', 'auto') != 'auto' if not is_styled: # Update img style dimension using width and height upd = {} for prop in ('width', 'height'): val = elem.get(prop, '').strip() try: del elem.attrib[prop] except: pass if val: if num_pat.match(val) is not None: val += 'px' upd[prop] = val if upd: style._update_cssdict(upd)
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='', user_css=''): self.oeb, self.opts = oeb, opts self.profile = profile if self.profile is None: # Use the default profile. This should really be using # opts.output_profile, but I don't want to risk changing it, as # doing so might well have hard to debug font size effects. from calibre.customize.ui import output_profiles for x in output_profiles(): if x.short_name == 'default': self.profile = x break if self.profile is None: # Just in case the default profile is removed in the future :) self.profile = opts.output_profile self.logger = oeb.logger item = oeb.manifest.hrefs[path] basename = os.path.basename(path) cssname = os.path.splitext(basename)[0] + '.css' stylesheets = [html_css_stylesheet()] head = xpath(tree, '/h:html/h:head') if head: head = head[0] else: head = [] # Add cssutils parsing profiles from output_profile for profile in self.opts.output_profile.extra_css_modules: cssprofiles.addProfile(profile['name'], profile['props'], profile['macros']) parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css')) self.font_face_rules = [] for elem in head: if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES): text = elem.text if elem.text else u'' for x in elem: t = getattr(x, 'text', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') t = getattr(x, 'tail', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') if text: text = XHTML_CSS_NAMESPACE + text text = oeb.css_preprocessor(text) stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheet.namespaces['h'] = XHTML_NS stylesheets.append(stylesheet) # Make links to resources absolute, since these rules will # be folded into a stylesheet at the root replaceUrls(stylesheet, item.abshref, ignoreImportRules=True) elif elem.tag == XHTML('link') and elem.get('href') \ and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \ and elem.get('type', CSS_MIME).lower() in OEB_STYLES: href = urlnormalize(elem.attrib['href']) path = item.abshref(href) sitem = oeb.manifest.hrefs.get(path, None) if sitem is None: self.logger.warn( 'Stylesheet %r referenced by file %r not in manifest' % (path, item.href)) continue if not hasattr(sitem.data, 'cssRules'): self.logger.warn( 'Stylesheet %r referenced by file %r is not CSS'%(path, item.href)) continue stylesheets.append(sitem.data) csses = {'extra_css':extra_css, 'user_css':user_css} for w, x in csses.items(): if x: try: text = XHTML_CSS_NAMESPACE + x stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheet.namespaces['h'] = XHTML_NS stylesheets.append(stylesheet) except: self.logger.exception('Failed to parse %s, ignoring.'%w) self.logger.debug('Bad css: ') self.logger.debug(x) rules = [] index = 0 self.stylesheets = set() self.page_rule = {} for stylesheet in stylesheets: href = stylesheet.href self.stylesheets.add(href) for rule in stylesheet.cssRules: rules.extend(self.flatten_rule(rule, href, index)) index = index + 1 rules.sort() self.rules = rules self._styles = {} for _, _, cssdict, text, _ in rules: fl = ':first-letter' in text if fl: text = text.replace(':first-letter', '') selector = get_css_selector(text) matches = selector(tree, self.logger) if fl: from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS) for elem in matches: for x in elem.iter(): if x.text: punctuation_chars = [] text = unicode(x.text) while text: if not unicodedata.category(text[0]).startswith('P'): break punctuation_chars.append(text[0]) text = text[1:] special_text = u''.join(punctuation_chars) + \ (text[0] if text else u'') span = E.span(special_text) span.tail = text[1:] x.text = None x.insert(0, span) self.style(span)._update_cssdict(cssdict) break else: for elem in matches: self.style(elem)._update_cssdict(cssdict) for elem in xpath(tree, '//h:*[@style]'): self.style(elem)._apply_style_attr(url_replacer=item.abshref) num_pat = re.compile(r'\d+$') for elem in xpath(tree, '//h:img[@width or @height]'): style = self.style(elem) # Check if either height or width is not default is_styled = style._style.get('width', 'auto') != 'auto' or \ style._style.get('height', 'auto') != 'auto' if not is_styled: # Update img style dimension using width and height upd = {} for prop in ('width', 'height'): val = elem.get(prop, '').strip() try: del elem.attrib[prop] except: pass if val: if num_pat.match(val) is not None: val += 'px' upd[prop] = val if upd: style._update_cssdict(upd)