def _toc_from_html(self, opf):
    '''
    Build the table of contents from the links found in the HTML document
    that the guide's 'toc' reference points to.

    When the reference carries a fragment, only the nearest enclosing
    element (starting at the fragment's target) that contains links is
    scanned. Links to the same target are merged into a single entry whose
    title is the space-joined text of all of them.

    Returns False when the guide has no 'toc' reference, True otherwise.
    ``opf`` is not used by this method.
    '''
    guide = self.oeb.guide
    if 'toc' not in guide:
        return False
    self.log.debug('Reading TOC from HTML...')
    known_hrefs = self.oeb.manifest.hrefs
    itempath, frag = urldefrag(guide['toc'].href)
    item = known_hrefs[itempath]
    scope = item.data
    if frag:
        # Locate the fragment target by id first, then by name
        matches = xpath(scope, './/*[@id="%s"]' % frag)
        if not matches:
            matches = xpath(scope, './/*[@name="%s"]' % frag)
        node = matches[0] if matches else scope
        # Walk upwards until an element that actually contains links
        while node != scope and not xpath(node, './/h:a[@href]'):
            node = node.getparent()
        scope = node
    # Insertion-ordered mapping: target href -> list of link texts
    grouped = {}
    for anchor in xpath(scope, './/h:a[@href]'):
        target = item.abshref(urlnormalize(anchor.attrib['href']))
        path, _ = urldefrag(target)
        if path not in known_hrefs:
            continue
        label = COLLAPSE_RE.sub(' ', xml2text(anchor).strip())
        grouped.setdefault(target, []).append(label)
    toc = self.oeb.toc
    for target, labels in grouped.items():
        toc.add(' '.join(labels), target)
    return True
def serialize_href(self, href, base=None):
    '''
    Serialize the href attribute of an <a> or <reference> tag. It is
    serialized as filepos="0000000000" (ten zero digits) and a pointer to
    its location is stored in self.href_offsets so that the correct value
    can be filled in at the end.

    :param href: The link to serialize.
    :param base: Optional manifest item used to resolve relative paths and
        as the target of links that carry no path of their own.
    :return: True if a placeholder was written, False if the link cannot be
        serialized (unparsable URL, target not in the manifest, target not
        in the spine, or a path-less link with no ``base`` to resolve it).
    '''
    hrefs = self.oeb.manifest.hrefs
    try:
        path, frag = urldefrag(urlnormalize(href))
    except ValueError:
        # Unparsable URL
        return False
    if path and base:
        path = base.abshref(path)
    if path and path not in hrefs:
        return False
    item = hrefs[path] if path else None
    if item and item.spine_position is None:
        return False
    if item:
        path = item.href
    elif base is not None:
        path = base.href
    else:
        # A link without a path (e.g. '#frag') cannot be resolved without
        # a base item; previously this raised AttributeError on None.
        return False
    href = '#'.join((path, frag)) if frag else path
    buf = self.buf
    buf.write(b'filepos=')
    # Remember where the ten placeholder digits start
    self.href_offsets[href].append(buf.tell())
    buf.write(b'0000000000')
    return True
def __call__(self, oeb, opts):
    '''
    Rewrite every link in the book according to self.rename_map.

    Links inside parsed markup and stylesheets are passed through
    self.url_replacer; guide references are remapped directly and the TOC
    is handled by self.fix_toc_entry.
    '''
    import css_parser
    self.log = oeb.logger
    self.opts = opts
    self.oeb = oeb

    for entry in oeb.manifest.items:
        # url_replacer resolves links relative to the current item
        self.current_item = entry
        if etree.iselement(entry.data):
            rewrite_links(self.current_item.data, self.url_replacer)
        elif hasattr(entry.data, 'cssText'):
            css_parser.replaceUrls(entry.data, self.url_replacer)

    guide = self.oeb.guide
    if guide:
        for ref in guide.values():
            path, frag = urldefrag(urlnormalize(ref.href))
            renamed = self.rename_map.get(path, None)
            if renamed is None:
                continue
            ref.href = (renamed + '#' + frag) if frag else renamed

    toc = self.oeb.toc
    if toc:
        self.fix_toc_entry(toc)
def serialize_href(self, href, base=None):
    '''
    Serialize the href attribute of an <a> or <reference> tag. It is
    serialized as filepos="0000000000" (ten zero digits) and a pointer to
    its location is stored in self.href_offsets so that the correct value
    can be filled in at the end.

    :param href: The link to serialize.
    :param base: Optional manifest item used to resolve relative paths and
        as the target of links that carry no path of their own.
    :return: True if a placeholder was written, False if the link cannot be
        serialized (unparseable URL, target not in the manifest, target not
        in the spine, or a path-less link with no ``base`` to resolve it).
    '''
    hrefs = self.oeb.manifest.hrefs
    try:
        path, frag = urldefrag(urlnormalize(href))
    except ValueError:
        # Unparseable URL
        return False
    if path and base:
        path = base.abshref(path)
    if path and path not in hrefs:
        return False
    item = hrefs[path] if path else None
    if item and item.spine_position is None:
        return False
    if item:
        path = item.href
    elif base is not None:
        path = base.href
    else:
        # Without a path and without a base item there is nothing to link
        # to; the previous code raised AttributeError on None here.
        return False
    href = '#'.join((path, frag)) if frag else path
    buf = self.buf
    buf.write(b'filepos=')
    # Record the position of the ten placeholder digits
    self.href_offsets[href].append(buf.tell())
    buf.write(b'0000000000')
    return True
def serialize_guide(self):
    '''
    The Kindle decides where to open a book based on the presence of
    an item in the guide that looks like
    <reference type="text" title="Start" href="chapter-one.xhtml"/>

    Similarly an item with type="toc" controls where the Goto Table of
    Contents operation on the kindle goes.

    Only references whose target is a manifest item with a document media
    type are written.
    '''
    out = self.buf
    manifest_hrefs = self.oeb.manifest.hrefs
    out.write(b'<guide>')
    for ref in self.oeb.guide.values():
        target = urldefrag(ref.href)[0]
        if target not in manifest_hrefs:
            continue
        if manifest_hrefs[target].media_type not in OEB_DOCS:
            continue
        out.write(b'<reference type="')
        kind = ref.type
        if kind.startswith('other.'):
            kind = kind.replace('other.', '')
        self.serialize_text(kind, quot=True)
        out.write(b'" ')
        if ref.title is not None:
            out.write(b'title="')
            self.serialize_text(ref.title, quot=True)
            out.write(b'" ')
        if is_guide_ref_start(ref):
            # Remembered so that the start offset can be resolved later
            self._start_href = ref.href
        self.serialize_href(ref.href)
        # Space required or won't work, I kid you not
        out.write(b' />')
    out.write(b'</guide>')
def map_resources(self, oeb_book):
    '''
    Assign output file names to the book's images and register link ids
    for every spine item and every link target found in spine documents.

    Image names are the running image count plus the original extension,
    left-padded with zeros to ten characters.
    '''
    for item in oeb_book.manifest:
        if item.media_type in OEB_IMAGES and item.href not in self.images:
            extension = os.path.splitext(item.href)[1]
            numbered = '%s%s' % (len(self.images), extension)
            self.images[item.href] = numbered.zfill(10)

        if item not in oeb_book.spine:
            continue

        self.get_link_id(item.href)
        body = item.data.find(XHTML('body'))
        link_attrs = set(html.defs.link_attrs)
        link_attrs.add(XLINK('href'))
        for el in body.iter():
            attribs = el.attrib
            # Skip nodes whose tag is not a string (or cannot be read)
            try:
                is_element = isinstance(el.tag, string_or_bytes)
            except:
                continue
            if not is_element:
                continue
            for attr in attribs:
                if attr not in link_attrs:
                    continue
                target, frag = urldefrag(item.abshref(attribs[attr]))
                if target in self.base_hrefs:
                    self.get_link_id(target, frag)
def serialize_guide(self):
    '''
    The Kindle decides where to open a book based on the presence of
    an item in the guide that looks like
    <reference type="text" title="Start" href="chapter-one.xhtml"/>

    Similarly an item with type="toc" controls where the Goto Table of
    Contents operation on the kindle goes.

    References that do not resolve to a document in the manifest are
    skipped.
    '''
    write = self.buf.write
    docs_by_href = self.oeb.manifest.hrefs
    write(b'<guide>')
    for ref in self.oeb.guide.values():
        path = urldefrag(ref.href)[0]
        resolvable = (
            path in docs_by_href
            and docs_by_href[path].media_type in OEB_DOCS)
        if not resolvable:
            continue
        write(b'<reference type="')
        is_other = ref.type.startswith('other.')
        type_text = ref.type.replace('other.', '') if is_other else ref.type
        self.serialize_text(type_text, quot=True)
        write(b'" ')
        if ref.title is not None:
            write(b'title="')
            self.serialize_text(ref.title, quot=True)
            write(b'" ')
        if is_guide_ref_start(ref):
            # Kept so the start position can be looked up afterwards
            self._start_href = ref.href
        self.serialize_href(ref.href)
        # Space required or won't work, I kid you not
        write(b' />')
    write(b'</guide>')
def _toc_from_navpoint(self, item, toc, navpoint):
    '''
    Recursively convert the ncx:navPoint children of ``navpoint`` into
    entries of ``toc``.

    Untitled nav points are skipped but their children are attached to the
    current ``toc`` node. Nav points without a usable target are dropped
    unless they have children of their own.

    :param item: Manifest item of the NCX, used to resolve relative hrefs.
    :param toc: TOC node that receives the new entries.
    :param navpoint: Element whose direct navPoint children are processed.
    '''
    for child in xpath(navpoint, 'ncx:navPoint'):
        label = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
        label = COLLAPSE_RE.sub(' ', label.strip())
        src = xpath(child, 'ncx:content/@src')
        if not label:
            # No title: promote the children to the current level
            self._toc_from_navpoint(item, toc, child)
            continue
        has_src = bool(src and src[0])
        if not has_src and not xpath(child, 'ncx:navPoint'):
            # This node is useless
            continue
        href = item.abshref(urlnormalize(src[0])) if has_src else ''
        path, _ = urldefrag(href)
        if path and path not in self.oeb.manifest.hrefs:
            path = urlnormalize(path)
        if href and path not in self.oeb.manifest.hrefs:
            self.logger.warn('TOC reference %r not found' % href)
            if not xpath(child, 'ncx:navPoint'):
                # This node is useless
                continue

        node_id = child.get('id')
        klass = child.get('class', 'chapter')

        try:
            po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
        except:
            po = self.oeb.toc.next_play_order()

        author_meta = xpath(child, 'descendant::calibre:meta[@name = "author"]')
        author = author_meta[0].text if author_meta else None

        description = None
        description_meta = xpath(child, 'descendant::calibre:meta[@name = "description"]')
        if description_meta:
            # An empty description is treated the same as a missing one
            description = etree.tostring(
                description_meta[0], method='text', encoding='unicode').strip() or None

        thumb_meta = xpath(child, 'descendant::calibre:meta[@name = "toc_thumbnail"]')
        toc_thumbnail = thumb_meta[0].text if thumb_meta else None
        if not toc_thumbnail or not toc_thumbnail.strip():
            toc_thumbnail = None

        node = toc.add(
            label, href, id=node_id, klass=klass, play_order=po,
            description=description, author=author,
            toc_thumbnail=toc_thumbnail)

        self._toc_from_navpoint(item, node, child)
def write_opf(self, guide, toc, spine, resource_map):
    '''
    Write metadata.opf and toc.ncx into the current working directory and
    return the name of the OPF file ('metadata.opf').

    If the supplied ``toc`` has fewer than two entries, an inline ToC named
    by a guide reference of type 'toc' is read instead when its file
    exists. The manifest is built from the files in the working directory.
    '''
    exth = self.header.exth
    mi = exth.mi
    cover_offset = self.cover_offset
    if cover_offset is not None and cover_offset < len(resource_map):
        mi.cover = resource_map[cover_offset]

    if len(list(toc)) < 2:
        self.log.warn('KF8 has no metadata Table of Contents')

        for ref in guide:
            if ref.type != 'toc':
                continue
            href, frag = urldefrag(ref.href())
            if not os.path.exists(href.replace('/', os.sep)):
                continue
            try:
                toc = self.read_inline_toc(href, frag)
            except:
                self.log.exception('Failed to read inline ToC')

    opf = OPFCreator(os.getcwd(), mi)
    opf.guide = guide

    def exclude(path):
        return os.path.basename(path) == 'debug-raw.html'

    # If there are no images then the azw3 input plugin dumps all
    # binary records as .unknown images, remove them
    if self.for_tweak and os.path.exists('images') and os.path.isdir('images'):
        files = os.listdir('images')
        unknown = [name for name in files if name.endswith('.unknown')]
        if len(files) == len(unknown):
            for name in files:
                os.remove('images/' + name)

    if self.for_tweak:
        try:
            os.remove('debug-raw.html')
        except:
            pass

    opf.create_manifest_from_files_in([os.getcwd()], exclude=exclude)
    for entry in opf.manifest:
        if entry.mime_type == 'text/html':
            entry.mime_type = 'application/xhtml+xml'
    opf.create_spine(spine)
    opf.set_toc(toc)

    ppd = getattr(exth, 'page_progression_direction', None)
    if ppd in {'ltr', 'rtl', 'default'}:
        opf.page_progression_direction = ppd
    pwm = getattr(exth, 'primary_writing_mode', None)
    if pwm is not None:
        opf.primary_writing_mode = pwm

    with open('metadata.opf', 'wb') as of, open('toc.ncx', 'wb') as ncx:
        opf.render(of, ncx, 'toc.ncx')
    return 'metadata.opf'
def __call__(self, oeb, context):
    '''
    Remove from the manifest every item that is not reachable from the
    metadata, the guide, the spine, or (transitively) from links inside
    items that are.
    '''
    import css_parser
    oeb.logger.info('Trimming unused files from manifest...')
    self.opts = context
    manifest = oeb.manifest
    used = set()

    # Seed set: items named by metadata values, guide targets and the spine
    for term in oeb.metadata:
        for entry in oeb.metadata[term]:
            if entry.value in manifest.hrefs:
                used.add(manifest.hrefs[entry.value])
            elif entry.value in manifest.ids:
                used.add(manifest.ids[entry.value])
    for ref in oeb.guide.values():
        path, _ = urldefrag(ref.href)
        if path in manifest.hrefs:
            used.add(manifest.hrefs[path])
    # TOC items are required to be in the spine
    used.update(oeb.spine)

    def collect(target, bucket):
        # Queue the manifest item for target unless it is already known
        if target in manifest.hrefs:
            found = manifest.hrefs[target]
            if found not in used:
                bucket.add(found)

    # Follow links outwards until no new items turn up
    unchecked = used
    while unchecked:
        new = set()
        for item in unchecked:
            markup_like = (
                item.media_type in OEB_DOCS
                or item.media_type[-4:] in ('/xml', '+xml'))
            if markup_like and item.data is not None:
                links = [r[2] for r in iterlinks(item.data)]
                for link in links:
                    if isinstance(link, bytes):
                        link = link.decode('utf-8')
                    try:
                        link = item.abshref(urlnormalize(link))
                    except:
                        continue
                    collect(link, new)
            elif item.media_type == CSS_MIME:
                for link in css_parser.getUrls(item.data):
                    collect(item.abshref(urlnormalize(link)), new)
        used.update(new)
        unchecked = new

    for item in oeb.manifest.values():
        if item in used:
            continue
        oeb.logger.info('Trimming %r from manifest' % item.href)
        oeb.manifest.remove(item)
def fix_toc_entry(self, toc):
    '''
    Apply self.rename_map to the href of ``toc`` and, recursively, to all
    of its child entries. Fragments are preserved.
    '''
    if toc.href:
        path, frag = urldefrag(urlnormalize(toc.href))
        renamed = self.rename_map.get(path, None)
        if renamed is not None:
            toc.href = '#'.join((renamed, frag)) if frag else renamed

    for child in toc:
        self.fix_toc_entry(child)
def url_replacer(self, orig_url):
    '''
    Return the replacement for a link found in self.current_item.

    URLs with a scheme are returned untouched. Local URLs are resolved
    against the item's original location, looked up in self.rename_map and
    made relative to the current item again; any fragment is re-attached.
    '''
    normalized = urlnormalize(orig_url)
    if urlparse(normalized).scheme:
        # Only rewrite local URLs
        return orig_url
    path, frag = urldefrag(normalized)
    current = self.current_item
    source_item = current
    if self.renamed_items_map:
        source_item = self.renamed_items_map.get(current.href, current)
    absolute = source_item.abshref(path)
    rewritten = self.current_item.relhref(
        self.rename_map.get(absolute, absolute))
    return (rewritten + '#' + frag) if frag else rewritten
def url_replacer(self, orig_url):
    '''
    Map a link inside self.current_item to its post-rename location.

    Non-local URLs (those with a scheme) are returned as they came in. For
    local URLs the path is made absolute using the item's pre-rename
    identity (when self.renamed_items_map has one), translated through
    self.rename_map and expressed relative to the current item; the
    fragment, if any, is appended unchanged.
    '''
    url = urlnormalize(orig_url)
    is_remote = bool(urlparse(url).scheme)
    if is_remote:
        # Only rewrite local URLs
        return orig_url
    path, frag = urldefrag(url)
    if not self.renamed_items_map:
        origin = self.current_item
    else:
        origin = self.renamed_items_map.get(
            self.current_item.href, self.current_item)
    old_href = origin.abshref(path)
    new_href = self.rename_map.get(old_href, old_href)
    result = self.current_item.relhref(new_href)
    if frag:
        result = result + ('#' + frag)
    return result
def _guide_from_opf(self, opf):
    '''
    Populate the book's guide from the <reference> elements of the OPF.

    A reference whose path is not in the manifest is matched
    case-insensitively against the manifest hrefs; if that fails too it is
    skipped with a warning. When a case-insensitive match is used, the
    reference is replaced by the bare manifest href, so any fragment on
    the original is not carried over. Only the first reference of each
    type is kept.
    '''
    guide = self.oeb.guide
    manifest = self.oeb.manifest
    for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
        ref_href = elem.get('href')
        path = urlnormalize(urldefrag(ref_href)[0])
        if path not in manifest.hrefs:
            wanted = path.lower()
            corrected_href = next(
                (candidate for candidate in manifest.hrefs
                 if candidate.lower() == wanted),
                None)
            if corrected_href is None:
                self.logger.warn(u'Guide reference %r not found' % ref_href)
                continue
            ref_href = corrected_href
        typ = elem.get('type')
        if typ in guide:
            continue
        guide.add(typ, elem.get('title'), ref_href)
def _guide_from_opf(self, opf):
    '''
    Read the guide references from the OPF into self.oeb.guide.

    References are validated against the manifest, falling back to a
    case-insensitive match (in which case the manifest href replaces the
    reference, without its fragment). Unresolvable references are logged
    and skipped, and a type that is already in the guide is not replaced.
    '''
    guide = self.oeb.guide
    manifest = self.oeb.manifest

    def match_ignoring_case(path):
        # First manifest href equal to path when case is ignored, or None
        lowered = path.lower()
        for candidate in manifest.hrefs:
            if candidate.lower() == lowered:
                return candidate
        return None

    for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
        ref_href = elem.get('href')
        path = urlnormalize(urldefrag(ref_href)[0])
        if path not in manifest.hrefs:
            corrected_href = match_ignoring_case(path)
            if corrected_href is None:
                self.logger.warn('Guide reference %r not found' % ref_href)
                continue
            ref_href = corrected_href
        typ = elem.get('type')
        if typ not in guide:
            guide.add(typ, elem.get('title'), ref_href)
def dataize_svg(self, item, svg=None):
    '''
    Replace xlink:href references inside an SVG tree with the names of
    temporary files holding the referenced manifest items.

    :param item: Manifest item the SVG belongs to; used to resolve links.
    :param svg: Tree to process; defaults to ``item.data``.
    :return: The (modified in place) SVG tree.

    The created file names are appended to self.temp_files.
    '''
    if svg is None:
        svg = item.data
    hrefs = self.oeb.manifest.hrefs
    xlink_href = XLINK('href')
    for elem in xpath(svg, '//svg:*[@xl:href]'):
        path = urldefrag(urlnormalize(elem.attrib[xlink_href]))[0]
        if not path:
            continue
        abshref = item.abshref(path)
        if abshref not in hrefs:
            continue
        # NOTE(review): str(linkee) produces text, which is then written to
        # the temporary file - confirm the file accepts str and that this
        # is the intended serialization of the item.
        data = str(hrefs[abshref])
        ext = what(None, data) or 'jpg'
        with PersistentTemporaryFile(suffix='.' + ext) as pt:
            pt.write(data)
            self.temp_files.append(pt.name)
        elem.attrib[xlink_href] = pt.name
    return svg
def dataize_svg(self, item, svg=None):
    '''
    Point every xlink:href in the SVG at a temporary on-disk copy of the
    manifest item it references.

    Links without a path, or whose target is not in the manifest, are left
    untouched. Each temporary file name is recorded in self.temp_files.
    Returns the SVG tree (``item.data`` when ``svg`` is None).
    '''
    tree = item.data if svg is None else svg
    manifest_hrefs = self.oeb.manifest.hrefs
    for node in xpath(tree, '//svg:*[@xl:href]'):
        link = urlnormalize(node.attrib[XLINK('href')])
        path = urldefrag(link)[0]
        if path:
            target = item.abshref(path)
            if target in manifest_hrefs:
                linkee = manifest_hrefs[target]
                # NOTE(review): the item is serialized with str() before
                # being written out - verify this matches the temp file's
                # mode.
                payload = str(linkee)
                extension = what(None, payload) or 'jpg'
                with PersistentTemporaryFile(suffix='.' + extension) as pt:
                    pt.write(payload)
                    self.temp_files.append(pt.name)
                node.attrib[XLINK('href')] = pt.name
    return tree
def _toc_from_tour(self, opf):
    '''
    Build the TOC from the first <tour> element of the OPF.

    Each <site> with both a title and an href that resolves to a manifest
    item becomes a TOC entry; unresolved ones are logged and skipped.
    Returns False when the OPF has no tour, True otherwise.
    '''
    tours = xpath(opf, 'o2:tours/o2:tour')
    if not tours:
        return False
    self.log.debug('Reading TOC from tour...')
    tour = tours[0]
    toc = self.oeb.toc
    toc.title = tour.get('title')
    for site in xpath(tour, 'o2:site'):
        title, href = site.get('title'), site.get('href')
        if not (title and href):
            continue
        path, _ = urldefrag(urlnormalize(href))
        if path not in self.oeb.manifest.hrefs:
            self.logger.warn('TOC reference %r not found' % href)
            continue
        toc.add(title, href, id=site.get('id'))
    return True
def fixup_links(self):
    '''
    Fill in the correct values for all filepos="..." links with the offsets
    of the linked to content (as stored in id_offsets).

    A link whose exact target is unknown is logged and redirected to the
    target without its fragment; if that is unknown as well, the
    placeholder is left as is. The offset of the link equal to
    self._start_href (if set) is stored in self.start_offset.
    '''
    out = self.buf
    known = self.id_offsets
    start_href = getattr(self, '_start_href', None)
    for target, positions in self.href_offsets.items():
        points_at_start = (target and target == start_href)
        if target not in known:
            self.logger.warn('Hyperlink target %r not found' % target)
            # Link to the top of the document, better than just ignoring
            target, _ = urldefrag(target)
        if target not in self.id_offsets:
            continue
        offset = self.id_offsets[target]
        if points_at_start:
            self.start_offset = offset
        # Overwrite every placeholder that refers to this target
        for position in positions:
            out.seek(position)
            out.write(('%010d' % offset).encode('utf-8'))
def _spine_add_extra(self):
    '''
    Add to the spine, as non-linear items, all documents that are linked
    to (directly or transitively) from spine documents but are not in the
    spine themselves.

    Items listed in the book's optional ``removed_items_to_ignore``
    attribute are left out. For books of version 2 or later a warning is
    logged for every item added. Extras are added in sorted order so the
    resulting spine does not depend on set iteration order.
    '''
    manifest = self.oeb.manifest
    spine = self.oeb.spine
    unchecked = set(spine)
    selector = XPath('h:body//h:a/@href')
    extras = set()
    while unchecked:
        new = set()
        for item in unchecked:
            if item.media_type not in OEB_DOCS:
                # TODO: handle fallback chains
                continue
            for href in selector(item.data):
                try:
                    # urldefrag() itself can raise ValueError on a
                    # malformed URL, so it must be inside the try as well
                    href, _ = urldefrag(href)
                    if not href:
                        continue
                    href = item.abshref(urlnormalize(href))
                except ValueError:
                    # Malformed URL
                    continue
                if href not in manifest.hrefs:
                    continue
                found = manifest.hrefs[href]
                if found.media_type not in OEB_DOCS or \
                        found in spine or found in extras:
                    continue
                new.add(found)
        extras.update(new)
        unchecked = new
    version = int(self.oeb.version[0])
    removed_items_to_ignore = getattr(self.oeb, 'removed_items_to_ignore', ())
    for item in sorted(extras):
        if item.href in removed_items_to_ignore:
            continue
        if version >= 2:
            self.logger.warn(
                'Spine-referenced file %r not in spine' % item.href)
        spine.add(item, linear=False)
def _spine_add_extra(self):
    '''
    Add to the spine, as non-linear items, all documents that are linked
    to (directly or transitively) from spine documents but are not in the
    spine themselves.

    Items listed in the book's optional ``removed_items_to_ignore``
    attribute are left out. For books of version 2 or later a warning is
    logged for every item added. Extras are added in sorted order.
    '''
    manifest = self.oeb.manifest
    spine = self.oeb.spine
    unchecked = set(spine)
    selector = XPath('h:body//h:a/@href')
    extras = set()
    while unchecked:
        new = set()
        for item in unchecked:
            if item.media_type not in OEB_DOCS:
                # TODO: handle fallback chains
                continue
            for href in selector(item.data):
                try:
                    # Splitting off the fragment parses the URL and can
                    # raise ValueError too, so it belongs inside the try
                    href, _ = urldefrag(href)
                    if not href:
                        continue
                    href = item.abshref(urlnormalize(href))
                except ValueError:
                    # Malformed URL
                    continue
                if href not in manifest.hrefs:
                    continue
                found = manifest.hrefs[href]
                if found.media_type not in OEB_DOCS or \
                        found in spine or found in extras:
                    continue
                new.add(found)
        extras.update(new)
        unchecked = new
    version = int(self.oeb.version[0])
    removed_items_to_ignore = getattr(self.oeb, 'removed_items_to_ignore', ())
    for item in sorted(extras):
        if item.href in removed_items_to_ignore:
            continue
        if version >= 2:
            self.logger.warn(
                'Spine-referenced file %r not in spine' % item.href)
        spine.add(item, linear=False)
def _manifest_add_missing(self, invalid):
    '''
    Add to the manifest every local file that is referenced from a
    manifest document or stylesheet but is not itself in the manifest.

    Newly added items are scanned in turn until nothing new is found.
    Items whose data cannot be read are removed from the manifest at the
    end. Links that cannot be parsed are logged and skipped.

    :param invalid: Not consulted: the name is rebound to a fresh set
        before use. It is kept so that the signature stays unchanged.
    '''
    import css_parser
    manifest = self.oeb.manifest
    known = set(manifest.hrefs)
    unchecked = set(manifest.values())
    cdoc = OEB_DOCS | OEB_STYLES
    invalid = set()
    while unchecked:
        new = set()
        for item in unchecked:
            data = None
            if (item.media_type in cdoc or
                    item.media_type[-4:] in ('/xml', '+xml')):
                try:
                    data = item.data
                except Exception:
                    self.oeb.log.exception('Failed to read from manifest '
                            'entry with id: %s, ignoring' % item.id)
                    invalid.add(item)
                    continue
            if data is None:
                continue

            if (item.media_type in OEB_DOCS or
                    item.media_type[-4:] in ('/xml', '+xml')):
                hrefs = [r[2] for r in iterlinks(data)]
                for href in hrefs:
                    # Decoding and fragment splitting can fail on bad
                    # input as well, so they are covered by the same try
                    try:
                        if isinstance(href, bytes):
                            href = href.decode('utf-8')
                        href, _ = urldefrag(href)
                        if not href:
                            continue
                        href = item.abshref(urlnormalize(href))
                        scheme = urlparse(href).scheme
                    except Exception:
                        self.oeb.log.exception(
                            'Skipping invalid href: %r' % href)
                        continue
                    if not scheme and href not in known:
                        new.add(href)
            elif item.media_type in OEB_STYLES:
                try:
                    urls = list(css_parser.getUrls(data))
                except Exception:
                    urls = []
                for url in urls:
                    try:
                        href, _ = urldefrag(url)
                        href = item.abshref(urlnormalize(href))
                        scheme = urlparse(href).scheme
                    except ValueError:
                        # One malformed URL must not abort the whole scan
                        self.oeb.log.exception(
                            'Skipping invalid href: %r' % url)
                        continue
                    if not scheme and href not in known:
                        new.add(href)
        unchecked.clear()
        warned = set()
        for href in new:
            known.add(href)
            is_invalid = False
            for item in invalid:
                if href == item.abshref(urlnormalize(href)):
                    is_invalid = True
                    break
            if is_invalid:
                continue
            if not self.oeb.container.exists(href):
                if href not in warned:
                    self.logger.warn('Referenced file %r not found' % href)
                    warned.add(href)
                continue
            if href not in warned:
                self.logger.warn('Referenced file %r not in manifest' % href)
                warned.add(href)
            new_id, _ = manifest.generate(id='added')
            guessed = guess_type(href)[0]
            media_type = guessed or BINARY_MIME
            added = manifest.add(new_id, href, media_type)
            # Scan the new item for further references on the next pass
            unchecked.add(added)

    for item in invalid:
        self.oeb.manifest.remove(item)
def _manifest_add_missing(self, invalid):
    '''
    Add to the manifest every local file that is referenced from a
    manifest document or stylesheet but is not itself in the manifest.

    Newly added items are scanned in turn until nothing new is found.
    Items whose data cannot be read are removed from the manifest at the
    end. Links that cannot be parsed are logged and skipped.

    :param invalid: Not consulted: the name is rebound to a fresh set
        before use. It is kept so that the signature stays unchanged.
    '''
    import css_parser
    manifest = self.oeb.manifest
    known = set(manifest.hrefs)
    unchecked = set(manifest.values())
    cdoc = OEB_DOCS | OEB_STYLES
    invalid = set()
    while unchecked:
        new = set()
        for item in unchecked:
            data = None
            if (item.media_type in cdoc or
                    item.media_type[-4:] in ('/xml', '+xml')):
                try:
                    data = item.data
                except Exception:
                    self.oeb.log.exception(u'Failed to read from manifest '
                            u'entry with id: %s, ignoring' % item.id)
                    invalid.add(item)
                    continue
            if data is None:
                continue

            if (item.media_type in OEB_DOCS or
                    item.media_type[-4:] in ('/xml', '+xml')):
                hrefs = [r[2] for r in iterlinks(data)]
                for href in hrefs:
                    # Keep decoding and fragment splitting inside the try:
                    # both can fail on malformed input
                    try:
                        if isinstance(href, bytes):
                            href = href.decode('utf-8')
                        href, _ = urldefrag(href)
                        if not href:
                            continue
                        href = item.abshref(urlnormalize(href))
                        scheme = urlparse(href).scheme
                    except Exception:
                        self.oeb.log.exception(
                            'Skipping invalid href: %r' % href)
                        continue
                    if not scheme and href not in known:
                        new.add(href)
            elif item.media_type in OEB_STYLES:
                try:
                    urls = list(css_parser.getUrls(data))
                except Exception:
                    urls = []
                for url in urls:
                    try:
                        href, _ = urldefrag(url)
                        href = item.abshref(urlnormalize(href))
                        scheme = urlparse(href).scheme
                    except ValueError:
                        # Skip the bad URL rather than abort the scan
                        self.oeb.log.exception(
                            'Skipping invalid href: %r' % url)
                        continue
                    if not scheme and href not in known:
                        new.add(href)
        unchecked.clear()
        warned = set()
        for href in new:
            known.add(href)
            is_invalid = False
            for item in invalid:
                if href == item.abshref(urlnormalize(href)):
                    is_invalid = True
                    break
            if is_invalid:
                continue
            if not self.oeb.container.exists(href):
                if href not in warned:
                    self.logger.warn('Referenced file %r not found' % href)
                    warned.add(href)
                continue
            if href not in warned:
                self.logger.warn('Referenced file %r not in manifest' % href)
                warned.add(href)
            new_id, _ = manifest.generate(id='added')
            guessed = guess_type(href)[0]
            media_type = guessed or BINARY_MIME
            added = manifest.add(new_id, href, media_type)
            # The added item is examined on the next pass of the loop
            unchecked.add(added)

    for item in invalid:
        self.oeb.manifest.remove(item)
def spine_item(tocitem):
    # Return the first spine item whose href equals the TOC entry's href
    # without its fragment, or None when there is no such item.
    wanted = urldefrag(tocitem.href)[0]
    return next(
        (candidate for candidate in self.oeb.spine
         if candidate.href == wanted),
        None)
def read_inline_toc(self, href, frag):
    '''
    Build a TOC from the links of an inline (HTML) table of contents.

    :param href: '/'-separated path of the HTML file, relative to the
        current working directory.
    :param frag: Optional id of the element after which links are
        collected; without a match collection starts after <body>.
    :return: A TOC whose nesting mirrors how deeply the links are nested
        in the source HTML.
    '''
    ans = TOC()
    base_href = href.rpartition('/')[0]
    with open(href.replace('/', os.sep), 'rb') as f:
        raw = f.read().decode(self.header.codec)
    root = parse_html(raw, log=self.log)

    body = XPath('//h:body')(root)
    if body:
        start, reached = body[0], False
    else:
        # No body to wait for: collect links from the very beginning
        start, reached = None, True
    if frag:
        elems = XPath('//*[@id="%s"]' % frag)(root)
        if elems:
            start = elems[0]

    def node_depth(node):
        # Number of ancestors of node
        depth = 0
        node = node.getparent()
        while node is not None:
            depth += 1
            node = node.getparent()
        return depth

    # Collect unique (text, href, frag) links found after the start element
    seen = set()
    links = []
    for elem in root.iterdescendants(etree.Element):
        if reached and elem.tag == XHTML('a') and elem.get('href', False):
            link_href, link_frag = urldefrag(elem.get('href'))
            link_href = base_href + '/' + link_href
            text = xml2text(elem).strip()
            key = (text, link_href, link_frag)
            if key in seen:
                continue
            seen.add(key)
            links.append(key + (node_depth(elem),))
        elif elem is start:
            reached = True

    # Layer the ToC based on nesting order in the source HTML
    depth_map = {
        raw_depth: rank
        for rank, raw_depth in enumerate(sorted({link[-1] for link in links}))}
    current_depth = None
    parent = ans
    for text, link_href, link_frag, raw_depth in links:
        depth = depth_map[raw_depth]
        if current_depth is None:
            current_depth = 0
        elif current_depth < depth:
            # Descend one level, into the most recently added entry
            if len(parent) > 0:
                parent = parent[-1]
            current_depth += 1
        elif current_depth > depth:
            # Climb back up as far as the tree allows
            delta = current_depth - depth
            while delta > 0 and parent.parent is not None:
                parent = parent.parent
                delta -= 1
            current_depth = depth
        parent.add_item(link_href, link_frag, text)
    return ans
def tree_to_binary(self, elem, nsrmap=NSRMAP, parents=None, inhead=False, preserve=False):
    '''
    Recursively write ``elem`` and its subtree to self.buf in binary form.

    :param elem: Element to serialize; nodes whose tag is not a string
        (comments, entities) are ignored.
    :param nsrmap: Mapping of namespace URI to prefix in effect for the
        parent; a copy is extended with the namespaces of ``elem``.
    :param parents: List of the tag offsets of the enclosing elements. It
        is extended while the children are written and restored afterwards.
        Defaults to a new empty list.
    :param inhead: True when ``elem`` is inside <head>.
    :param preserve: True when whitespace must be preserved.

    Side effects: records anchors in self.anchors and page break positions
    in self.page_breaks.
    '''
    if parents is None:
        # A fresh list per top-level call; a shared mutable default would
        # keep stale offsets if an exception interrupted the recursion.
        parents = []
    if not isinstance(elem.tag, string_or_bytes):
        # Don't emit any comments or raw entities
        return
    nsrmap = copy.copy(nsrmap)
    attrib = dict(elem.attrib)
    style = self.stylizer.style(elem) if self.stylizer else None
    # Emit xmlns declarations for namespaces not already in effect
    for key, value in elem.nsmap.items():
        if value not in nsrmap or nsrmap[value] != key:
            xmlns = ('xmlns:' + key) if key else 'xmlns'
            attrib[xmlns] = value
        nsrmap[value] = key
    tag = prefixname(elem.tag, nsrmap)
    tag_offset = self.buf.tell()
    if tag == 'head':
        inhead = True
    flags = FLAG_OPENING
    if not elem.text and len(elem) == 0:
        # Empty element: opened and closed in one go
        flags |= FLAG_CLOSING
    if inhead:
        flags |= FLAG_HEAD
    if style and self.is_block(style):
        flags |= FLAG_BLOCK
    self.write(0, flags)
    tattrs = self.tattrs[0]
    if tag in self.tags:
        index = self.tags[tag]
        self.write(index)
        if self.tattrs[index]:
            tattrs = self.tattrs[index]
    else:
        self.write(FLAG_CUSTOM, len(tag) + 1, tag)
    last_break = self.page_breaks[-1][0] if self.page_breaks else None
    if style and last_break != tag_offset \
            and style['page-break-before'] in PAGE_BREAKS:
        self.page_breaks.append((tag_offset, list(parents)))
    for attr, value in attrib.items():
        attr = prefixname(attr, nsrmap)
        if attr in ('href', 'src'):
            value = urlnormalize(value)
            path, frag = urldefrag(value)
            if self.item:
                path = self.item.abshref(path)
            # The prefix character differs for links that resolve to a
            # manifest item (2) and all other links (3)
            prefix = codepoint_to_chr(3)
            if path in self.manifest.hrefs:
                prefix = codepoint_to_chr(2)
                value = self.manifest.hrefs[path].id
                if frag:
                    value = '#'.join((value, frag))
            value = prefix + value
        elif attr in ('id', 'name'):
            self.anchors.append((value, tag_offset))
        elif attr.startswith('ms--'):
            attr = '%' + attr[4:]
        elif tag == 'link' and attr == 'type' and value in OEB_STYLES:
            value = CSS_MIME
        if attr in tattrs:
            self.write(tattrs[attr])
        else:
            self.write(FLAG_CUSTOM, len(attr) + 1, attr)
        # Integer values are written as numbers, everything else as text
        try:
            self.write(ATTR_NUMBER, int(value) + 1)
        except ValueError:
            self.write(len(value) + 1, value)
    self.write(0)
    old_preserve = preserve
    if style:
        preserve = (style['white-space'] in ('pre', 'pre-wrap'))
    xml_space = elem.get(XML('space'))
    if xml_space == 'preserve':
        preserve = True
    elif xml_space == 'normal':
        preserve = False
    if elem.text:
        if preserve:
            self.write(elem.text)
        elif len(elem) == 0 or not elem.text.isspace():
            self.write(COLLAPSE.sub(' ', elem.text))
        # else: de nada
    parents.append(tag_offset)
    child = cstyle = nstyle = None
    # Walk the children one step behind, so that each child is written
    # with knowledge of the style of the sibling that follows it
    for upcoming in chain(elem, [None]):
        if self.stylizer:
            nstyle = None if upcoming is None else self.stylizer.style(upcoming)
        if child is not None:
            if not preserve \
                    and (inhead or not nstyle or self.is_block(cstyle) or
                         self.is_block(nstyle)) \
                    and child.tail and child.tail.isspace():
                child.tail = None
            self.tree_to_binary(child, nsrmap, parents, inhead, preserve)
        child, cstyle = upcoming, nstyle
    parents.pop()
    preserve = old_preserve
    if not flags & FLAG_CLOSING:
        self.write(0, (flags & ~FLAG_OPENING) | FLAG_CLOSING, 0)
    if elem.tail and tag != 'html':
        tail = elem.tail
        if not preserve:
            tail = COLLAPSE.sub(' ', tail)
        self.write(tail)
    if style and style['page-break-after'] not in ('avoid', 'auto'):
        self.page_breaks.append((self.buf.tell(), list(parents)))
def binary_to_text_inner(self, bin, buf, stack):
    '''
    Run one frame of the state machine that turns binary markup into text.

    The frame on top of ``stack`` is popped and decoding of ``bin``
    continues at self.cpos. When an element with content is opened, two
    frames are pushed (one to close the tag later, one for its content)
    and the method returns; the caller is expected to call again while
    frames remain.

    :param bin: The binary markup being decoded.
    :param buf: Output buffer; receives encoded bytes.
    :param stack: List of frames of the form (depth, tag_name, current_map,
        dynamic_tag, errors, in_censorship, is_goingdown, state, flags).
    :raises LitError: On malformed input (unknown atoms or attributes,
        invalid character counts, unbalanced closing tags).
    '''
    (depth, tag_name, current_map, dynamic_tag, errors,
        in_censorship, is_goingdown, state, flags) = stack.pop()

    if state == 'close tag':
        if not tag_name:
            raise LitError('Tag ends before it begins.')
        buf.write(encode(''.join(('</', tag_name, '>'))))
        dynamic_tag = 0
        tag_name = None
        state = 'text'

    while self.cpos < len(bin):
        c, self.cpos = read_utf8_char(bin, self.cpos)
        oc = ord(c)

        if state == 'text':
            if oc == 0:
                state = 'get flags'
                continue
            elif c == '\v':
                c = '\n'
            elif c == '>':
                # Literal angle brackets are doubled in text content
                c = '>>'
            elif c == '<':
                c = '<<'
            buf.write(encode(c))

        elif state == 'get flags':
            if oc == 0:
                state = 'text'
                continue
            flags = oc
            state = 'get tag'

        elif state == 'get tag':
            state = 'text' if oc == 0 else 'get attr'
            if flags & FLAG_OPENING:
                tag = oc
                buf.write(b'<')
                if not (flags & FLAG_CLOSING):
                    is_goingdown = True
                if tag == 0x8000:
                    state = 'get custom length'
                    continue
                if flags & FLAG_ATOM:
                    if not self.tag_atoms or tag not in self.tag_atoms:
                        raise LitError(
                            "atom tag %d not in atom tag list" % tag)
                    tag_name = self.tag_atoms[tag]
                    current_map = self.attr_atoms
                elif tag < len(self.tag_map):
                    tag_name = self.tag_map[tag]
                    current_map = self.tag_to_attr_map[tag]
                else:
                    dynamic_tag += 1
                    errors += 1
                    tag_name = '?' + codepoint_to_chr(tag) + '?'
                    # NOTE(review): tag is beyond len(self.tag_map) here;
                    # confirm tag_to_attr_map can be indexed with it.
                    current_map = self.tag_to_attr_map[tag]
                    print('WARNING: tag %s unknown' % codepoint_to_chr(tag))
                buf.write(encode(tag_name))
            elif flags & FLAG_CLOSING:
                if depth == 0:
                    raise LitError('Extra closing tag %s at %d' % (
                        tag_name, self.cpos))
                break

        elif state == 'get attr':
            in_censorship = False
            if oc == 0:
                # End of the attribute list
                state = 'text'
                if not is_goingdown:
                    tag_name = None
                    dynamic_tag = 0
                    buf.write(b' />')
                else:
                    buf.write(b'>')
                    frame = (depth, tag_name, current_map,
                        dynamic_tag, errors, in_censorship, False,
                        'close tag', flags)
                    stack.append(frame)
                    frame = (depth + 1, None, None, 0, 0,
                        False, False, 'text', 0)
                    stack.append(frame)
                    break
            else:
                if oc == 0x8000:
                    state = 'get attr length'
                    continue
                attr = None
                if current_map and oc in current_map and current_map[oc]:
                    attr = current_map[oc]
                elif oc in self.attr_map:
                    attr = self.attr_map[oc]
                if not attr or not isinstance(attr, string_or_bytes):
                    raise LitError(
                        'Unknown attribute %d in tag %s' % (oc, tag_name))
                if attr.startswith('%'):
                    # Attributes starting with '%' are consumed but not
                    # written to the output
                    in_censorship = True
                    state = 'get value length'
                    continue
                buf.write(b' ' + encode(attr) + b'=')
                if attr in ['href', 'src']:
                    state = 'get href length'
                else:
                    state = 'get value length'

        elif state == 'get value length':
            if not in_censorship:
                buf.write(b'"')
            count = oc - 1
            if count == 0:
                if not in_censorship:
                    buf.write(b'"')
                in_censorship = False
                state = 'get attr'
                continue
            state = 'get value'
            if oc == 0xffff:
                continue
            if count < 0 or count > (len(bin) - self.cpos):
                raise LitError('Invalid character count %d' % count)

        elif state == 'get value':
            if count == 0xfffe:
                # Numeric value: the character itself encodes number + 1
                if not in_censorship:
                    buf.write(encode('%s"' % (oc - 1)))
                in_censorship = False
                state = 'get attr'
            elif count > 0:
                if not in_censorship:
                    # The value is emitted between double quotes, so these
                    # two characters must be escaped to keep the markup
                    # well-formed.
                    if c == '"':
                        c = '&quot;'
                    elif c == '<':
                        c = '&lt;'
                    if isinstance(c, unicode_type):
                        c = c.encode('ascii', 'xmlcharrefreplace')
                    buf.write(c)
                count -= 1
                if count == 0:
                    if not in_censorship:
                        buf.write(b'"')
                    in_censorship = False
                    state = 'get attr'

        elif state == 'get custom length':
            count = oc - 1
            if count <= 0 or count > len(bin) - self.cpos:
                raise LitError('Invalid character count %d' % count)
            dynamic_tag += 1
            state = 'get custom'
            tag_name = ''

        elif state == 'get custom':
            tag_name += c
            count -= 1
            if count == 0:
                buf.write(encode(tag_name))
                state = 'get attr'

        elif state == 'get attr length':
            count = oc - 1
            if count <= 0 or count > (len(bin) - self.cpos):
                raise LitError('Invalid character count %d' % count)
            buf.write(b' ')
            state = 'get custom attr'

        elif state == 'get custom attr':
            buf.write(encode(c))
            count -= 1
            if count == 0:
                buf.write(b'=')
                state = 'get value length'

        elif state == 'get href length':
            count = oc - 1
            if count <= 0 or count > (len(bin) - self.cpos):
                raise LitError('Invalid character count %d' % count)
            href = ''
            state = 'get href'

        elif state == 'get href':
            href += c
            count -= 1
            if count == 0:
                # The first character of the stored href is dropped
                # before the rest is resolved to a path
                doc, frag = urldefrag(href[1:])
                path = self.item_path(doc)
                if frag:
                    path = '#'.join((path, frag))
                path = urlnormalize(path)
                buf.write(encode('"%s"' % path))
                state = 'get attr'
def binary_to_text_inner(self, bin, buf, stack):
    '''
    Run one frame of the state machine that turns binary markup into text.

    The frame on top of ``stack`` is popped and decoding of ``bin``
    continues at self.cpos. When an element with content is opened, two
    frames are pushed (one to close the tag later, one for its content)
    and the method returns; the caller is expected to call again while
    frames remain.

    :param bin: The binary markup being decoded.
    :param buf: Output buffer; receives encoded bytes.
    :param stack: List of frames of the form (depth, tag_name, current_map,
        dynamic_tag, errors, in_censorship, is_goingdown, state, flags).
    :raises LitError: On malformed input (unknown atoms or attributes,
        invalid character counts, unbalanced closing tags).
    '''
    (depth, tag_name, current_map, dynamic_tag, errors,
        in_censorship, is_goingdown, state, flags) = stack.pop()

    if state == 'close tag':
        if not tag_name:
            raise LitError('Tag ends before it begins.')
        buf.write(encode(u''.join(('</', tag_name, '>'))))
        dynamic_tag = 0
        tag_name = None
        state = 'text'

    while self.cpos < len(bin):
        c, self.cpos = read_utf8_char(bin, self.cpos)
        oc = ord(c)

        if state == 'text':
            if oc == 0:
                state = 'get flags'
                continue
            elif c == '\v':
                c = '\n'
            elif c == '>':
                # Literal angle brackets are doubled in text content
                c = '>>'
            elif c == '<':
                c = '<<'
            buf.write(encode(c))

        elif state == 'get flags':
            if oc == 0:
                state = 'text'
                continue
            flags = oc
            state = 'get tag'

        elif state == 'get tag':
            state = 'text' if oc == 0 else 'get attr'
            if flags & FLAG_OPENING:
                tag = oc
                buf.write(b'<')
                if not (flags & FLAG_CLOSING):
                    is_goingdown = True
                if tag == 0x8000:
                    state = 'get custom length'
                    continue
                if flags & FLAG_ATOM:
                    if not self.tag_atoms or tag not in self.tag_atoms:
                        raise LitError(
                            "atom tag %d not in atom tag list" % tag)
                    tag_name = self.tag_atoms[tag]
                    current_map = self.attr_atoms
                elif tag < len(self.tag_map):
                    tag_name = self.tag_map[tag]
                    current_map = self.tag_to_attr_map[tag]
                else:
                    dynamic_tag += 1
                    errors += 1
                    tag_name = '?' + codepoint_to_chr(tag) + '?'
                    # NOTE(review): tag is beyond len(self.tag_map) here;
                    # confirm tag_to_attr_map can be indexed with it.
                    current_map = self.tag_to_attr_map[tag]
                    print('WARNING: tag %s unknown' % codepoint_to_chr(tag))
                buf.write(encode(tag_name))
            elif flags & FLAG_CLOSING:
                if depth == 0:
                    raise LitError('Extra closing tag %s at %d' % (
                        tag_name, self.cpos))
                break

        elif state == 'get attr':
            in_censorship = False
            if oc == 0:
                # End of the attribute list
                state = 'text'
                if not is_goingdown:
                    tag_name = None
                    dynamic_tag = 0
                    buf.write(b' />')
                else:
                    buf.write(b'>')
                    frame = (depth, tag_name, current_map,
                        dynamic_tag, errors, in_censorship, False,
                        'close tag', flags)
                    stack.append(frame)
                    frame = (depth + 1, None, None, 0, 0,
                        False, False, 'text', 0)
                    stack.append(frame)
                    break
            else:
                if oc == 0x8000:
                    state = 'get attr length'
                    continue
                attr = None
                if current_map and oc in current_map and current_map[oc]:
                    attr = current_map[oc]
                elif oc in self.attr_map:
                    attr = self.attr_map[oc]
                if not attr or not isinstance(attr, string_or_bytes):
                    raise LitError(
                        'Unknown attribute %d in tag %s' % (oc, tag_name))
                if attr.startswith('%'):
                    # Attributes starting with '%' are consumed but not
                    # written to the output
                    in_censorship = True
                    state = 'get value length'
                    continue
                buf.write(b' ' + encode(attr) + b'=')
                if attr in ['href', 'src']:
                    state = 'get href length'
                else:
                    state = 'get value length'

        elif state == 'get value length':
            if not in_censorship:
                buf.write(b'"')
            count = oc - 1
            if count == 0:
                if not in_censorship:
                    buf.write(b'"')
                in_censorship = False
                state = 'get attr'
                continue
            state = 'get value'
            if oc == 0xffff:
                continue
            if count < 0 or count > (len(bin) - self.cpos):
                raise LitError('Invalid character count %d' % count)

        elif state == 'get value':
            if count == 0xfffe:
                # Numeric value: the character itself encodes number + 1
                if not in_censorship:
                    buf.write(encode('%s"' % (oc - 1)))
                in_censorship = False
                state = 'get attr'
            elif count > 0:
                if not in_censorship:
                    # Escape the characters that would otherwise end the
                    # double-quoted value or start a tag inside it.
                    if c == '"':
                        c = '&quot;'
                    elif c == '<':
                        c = '&lt;'
                    if isinstance(c, unicode_type):
                        c = c.encode('ascii', 'xmlcharrefreplace')
                    buf.write(c)
                count -= 1
                if count == 0:
                    if not in_censorship:
                        buf.write(b'"')
                    in_censorship = False
                    state = 'get attr'

        elif state == 'get custom length':
            count = oc - 1
            if count <= 0 or count > len(bin) - self.cpos:
                raise LitError('Invalid character count %d' % count)
            dynamic_tag += 1
            state = 'get custom'
            tag_name = ''

        elif state == 'get custom':
            tag_name += c
            count -= 1
            if count == 0:
                buf.write(encode(tag_name))
                state = 'get attr'

        elif state == 'get attr length':
            count = oc - 1
            if count <= 0 or count > (len(bin) - self.cpos):
                raise LitError('Invalid character count %d' % count)
            buf.write(b' ')
            state = 'get custom attr'

        elif state == 'get custom attr':
            buf.write(encode(c))
            count -= 1
            if count == 0:
                buf.write(b'=')
                state = 'get value length'

        elif state == 'get href length':
            count = oc - 1
            if count <= 0 or count > (len(bin) - self.cpos):
                raise LitError('Invalid character count %d' % count)
            href = ''
            state = 'get href'

        elif state == 'get href':
            href += c
            count -= 1
            if count == 0:
                # The first character of the stored href is dropped
                # before the rest is resolved to a path
                doc, frag = urldefrag(href[1:])
                path = self.item_path(doc)
                if frag:
                    path = '#'.join((path, frag))
                path = urlnormalize(path)
                buf.write(encode(u'"%s"' % path))
                state = 'get attr'