import collections
import functools
import os
import re
import urllib.parse

from lxml import etree

# Internal helpers used below (base.xml2text, XPath, TOC, parse_html,
# create_ncx, find_existing_ncx_toc, get_lang, canonicalize_lang,
# lang_as_iso639_1, JACKET_XPATH, at_start, ...) are assumed to be provided
# by the surrounding package and are not imported here.


def commit_ncx_toc(container, toc, lang=None, uid=None):
    tocname = find_existing_ncx_toc(container)
    if tocname is None:
        # No NCX in the book yet: create one and point the spine at it
        item = container.generate_item('toc.ncx', id_prefix='toc')
        tocname = container.href_to_name(item.get('href'),
                                         base=container.opf_name)
        ncx_id = item.get('id')
        for s in container.opf_xpath('//opf:spine'):
            s.set('toc', ncx_id)
    if not lang:
        lang = get_lang()
        for _l in container.opf_xpath('//dc:language'):
            _l = canonicalize_lang(base.xml2text(_l).strip())
            if _l:
                lang = _l
                break
    # Prefer the two-letter ISO 639-1 form when one exists
    lang = lang_as_iso639_1(lang) or lang
    if not uid:
        uid = base.uuid_id()
        eid = container.opf.get('unique-identifier', None)
        if eid:
            m = container.opf_xpath('//*[@id="%s"]' % eid)
            if m:
                uid = base.xml2text(m[0])
    title = 'Table of Contents'
    m = container.opf_xpath('//dc:title')
    if m:
        x = base.xml2text(m[0]).strip()
        title = x or title
    to_href = functools.partial(container.name_to_href, base=tocname)
    root = create_ncx(toc, to_href, title, lang, uid)
    container.replace(tocname, root)
    container.pretty_print.add(tocname)
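# Hedged sketch of the language normalisation above, assuming the helpers
# behave like calibre's calibre.utils.localization functions:
#
#   canonicalize_lang('English')  # -> 'eng' (canonical ISO 639-2 code)
#   lang_as_iso639_1('eng')       # -> 'en'  (2-letter form, when one exists)
#   lang_as_iso639_1('und')       # -> None, so 'or lang' keeps the original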
def remove_first_image(self):
    deleted_item = None
    for item in self.oeb.spine:
        if XPath(JACKET_XPATH)(item.data):
            continue  # Never strip images from the metadata jacket
        removed = self.remove_images(item)
        if removed > 0:
            self.log('Removed first image')
            body = XPath('//h:body')(item.data)
            if body:
                raw = xml2text(body[0]).strip()
                imgs = XPath('//h:img|//svg:svg')(item.data)
                if not raw and not imgs:
                    # The item held nothing but the image: drop it entirely
                    self.log('Removing %s as it has no content' % item.href)
                    self.oeb.manifest.remove(item)
                    deleted_item = item
            break
    else:  # no spine item contained an image
        self.log.warn('Could not find first image to remove')
    if deleted_item is not None:
        # Scrub ToC and guide references to the deleted item
        for item in list(self.oeb.toc):
            href = urllib.parse.urldefrag(item.href)[0]
            if href == deleted_item.href:
                self.oeb.toc.remove(item)
        self.oeb.guide.remove_by_href(deleted_item.href)
def _toc_from_html(self, opf):
    if 'toc' not in self.oeb.guide:
        return False
    self.log.debug('Reading TOC from HTML...')
    itempath, frag = urllib.parse.urldefrag(self.oeb.guide['toc'].href)
    item = self.oeb.manifest.hrefs[itempath]
    html = item.data
    if frag:
        elems = base.xpath(html, './/*[@id="%s"]' % frag)
        if not elems:
            elems = base.xpath(html, './/*[@name="%s"]' % frag)
        elem = elems[0] if elems else html
        while elem != html and not base.xpath(elem, './/h:a[@href]'):
            elem = elem.getparent()
        html = elem
    titles = collections.defaultdict(list)
    order = []
    for anchor in base.xpath(html, './/h:a[@href]'):
        href = anchor.attrib['href']
        href = item.abshref(base.urlnormalize(href))
        path, frag = urllib.parse.urldefrag(href)
        if path not in self.oeb.manifest.hrefs:
            continue
        title = base.xml2text(anchor)
        title = base.COLLAPSE_RE.sub(' ', title.strip())
        if href not in titles:
            order.append(href)
        titles[href].append(title)
    toc = self.oeb.toc
    for href in order:
        toc.add(' '.join(titles[href]), href)
    return True
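# Hedged illustration of the title merging above: if the HTML ToC contains
#   <a href="ch1.html">Chapter</a> ... <a href="ch1.html">One</a>
# both anchors share one href, so a single entry titled 'Chapter One' is
# added, and 'order' preserves the first-seen document order of the hrefs.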
def create_toc_from_links(self):
    num = 0
    for item in self.oeb.spine:
        for a in XPath('//h:a[@href]')(item.data):
            href = a.get('href')
            try:
                purl = urllib.parse.urlparse(href)
            except ValueError:
                self.log.warning('Ignoring malformed URL:', href)
                continue
            # Only book-internal links (no scheme, or file:) become entries
            if not purl[0] or purl[0] == 'file':
                href, frag = purl.path, purl.fragment
                href = item.abshref(href)
                if frag:
                    href = '#'.join((href, frag))
                if not self.oeb.toc.has_href(href):
                    text = base.xml2text(a)
                    text = text[:100].strip()
                    if (not self.opts.duplicate_links_in_toc and
                            self.oeb.toc.has_text(text)):
                        continue
                    try:
                        self.oeb.toc.add(
                            text, href,
                            play_order=self.oeb.toc.next_play_order())
                        num += 1
                    except ValueError:
                        # Most likely an incorrectly URL encoded link
                        self.oeb.log.exception(
                            'Failed to process link: %r' % href)
                        continue
                    if (self.opts.max_toc_links > 0 and
                            num >= self.opts.max_toc_links):
                        self.log('Maximum TOC links reached, stopping.')
                        return
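# Hedged note on the scheme check above: for book-internal links
# urllib.parse.urlparse returns an empty scheme, e.g.
#
#   import urllib.parse
#   p = urllib.parse.urlparse('chap1.html#sec2')
#   (p.scheme, p.path, p.fragment)  # -> ('', 'chap1.html', 'sec2')
#
# so purl[0] is falsy and the link is kept; http/https links are skipped.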
def elem_to_toc_text(elem):
    text = base.xml2text(elem).strip()
    if not text:
        text = elem.get('title', '')
    if not text:
        text = elem.get('alt', '')
    text = re.sub(r'\s+', ' ', text.strip())
    text = text[:1000].strip()
    if not text:
        text = '(Untitled)'
    return text
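# Hedged usage sketch (lxml and a namespace-free document assumed, for
# illustration only):
#
#   from lxml import etree
#   elem_to_toc_text(etree.fromstring('<h1> Chapter\n One </h1>'))
#   # -> 'Chapter One' (whitespace collapsed)
#   elem_to_toc_text(etree.fromstring('<img alt="Cover"/>'))
#   # -> 'Cover' (falls back to title, then alt, then '(Untitled)')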
def detect_chapters(self):
    self.detected_chapters = []
    self.chapter_title_attribute = None

    def find_matches(expr, doc):
        try:
            ans = XPath(expr)(doc)
            len(ans)  # a non-node result (e.g. count()) raises TypeError
            return ans
        except Exception:
            self.log.warn('Invalid chapter expression, ignoring: %s' % expr)
            return []

    if self.opts.chapter:
        chapter_path, title_attribute = self.get_toc_parts_for_xpath(
            self.opts.chapter)
        self.chapter_title_attribute = title_attribute
        for item in self.oeb.spine:
            for x in find_matches(chapter_path, item.data):
                self.detected_chapters.append((item, x))

        chapter_mark = self.opts.chapter_mark
        page_break_before = 'display: block; page-break-before: always'
        page_break_after = 'display: block; page-break-after: always'
        c = collections.Counter()
        for item, elem in self.detected_chapters:
            c[item] += 1
            text = base.xml2text(elem).strip()
            text = re.sub(r'\s+', ' ', text.strip())
            self.log('\tDetected chapter:', text[:50])
            if chapter_mark == 'none':
                continue
            if chapter_mark == 'rule':
                mark = elem.makeelement(base.tag('xhtml', 'hr'))
            elif chapter_mark == 'pagebreak':
                if c[item] < 3 and at_start(elem):
                    # For the first two elements in this item, check if
                    # they are at the start of the file, in which case
                    # inserting a page break is unnecessary and can lead
                    # to extra blank pages in the PDF Output plugin. We
                    # need to use two as feedbooks epubs match both a
                    # heading tag and its containing div with the default
                    # chapter expression.
                    continue
                mark = elem.makeelement(base.tag('xhtml', 'div'),
                                        style=page_break_after)
            else:  # chapter_mark == 'both'
                mark = elem.makeelement(base.tag('xhtml', 'hr'),
                                        style=page_break_before)
            try:
                elem.addprevious(mark)
            except TypeError:
                self.log.exception('Failed to mark chapter')
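# Hedged summary of the chapter_mark modes handled above:
#   'none'      -> log the detection only, insert nothing
#   'rule'      -> insert a plain <hr/> before the chapter element
#   'pagebreak' -> insert a <div> styled 'page-break-after: always'
#   'both'      -> insert an <hr/> styled 'page-break-before: always'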
def find_text(node):
    LIMIT = 200
    pat = re.compile(r'\s+')
    for child in node:
        if isinstance(child, etree._Element):
            text = base.xml2text(child).strip()
            text = pat.sub(' ', text)
            if len(text) < 1:
                continue
            if len(text) > LIMIT:
                # Look for less text in a child of this node, recursively
                ntext = find_text(child)
                return ntext or (text[:LIMIT] + '...')
            else:
                return text
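# Hedged sketch of find_text's recursion: a child whose collapsed text fits
# within LIMIT is returned as-is; an over-long child is descended into, and
# only if no shorter descendant text exists is it truncated, e.g.
#   <div><p>short</p></div>                   -> 'short'
#   <div><p>{300 chars, no children}</p></div> -> first 200 chars + '...'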
def elem_to_link(self, item, elem, title_attribute, counter):
    text = ''
    if title_attribute is not None:
        text = elem.get(title_attribute, '')
    if not text:
        text = base.xml2text(elem).strip()
    if not text:
        text = elem.get('title', '')
    if not text:
        text = elem.get('alt', '')
    text = re.sub(r'\s+', ' ', text.strip())
    text = text[:1000].strip()
    # Ensure the element has an id we can link to
    elem_id = elem.get('id', 'calibre_toc_%d' % counter)
    elem.set('id', elem_id)
    href = '#'.join((item.href, elem_id))
    return text, href
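# Hedged example: for <h2 id="c3">Three</h2> inside a file at 'chap.html',
# elem_to_link returns ('Three', 'chap.html#c3'); an element with no id is
# assigned a synthetic one such as 'calibre_toc_7' when counter == 7.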
def read_inline_toc(self, href, frag):
    ans = TOC()
    base_href = '/'.join(href.split('/')[:-1])
    with open(href.replace('/', os.sep), 'rb') as f:
        raw = f.read().decode(self.header.codec)
    root = parse_html(raw, log=self.log)
    body = base.XPath('//h:body')(root)
    reached = False
    if body:
        start = body[0]
    else:
        start = None
        reached = True
    if frag:
        elems = base.XPath('//*[@id="%s"]' % frag)(root)
        if elems:
            start = elems[0]

    def node_depth(elem):
        ans = 0
        parent = elem.getparent()
        while parent is not None:
            parent = parent.getparent()
            ans += 1
        return ans

    # Layer the ToC based on nesting order in the source HTML
    current_depth = None
    parent = ans
    seen = set()
    links = []
    for elem in root.iterdescendants(etree.Element):
        if reached and elem.tag == base.tag('xhtml', 'a') and elem.get(
                'href', False):
            href = elem.get('href')
            href, frag = urllib.parse.urldefrag(href)
            href = base_href + '/' + href
            text = base.xml2text(elem).strip()
            if (text, href, frag) in seen:
                continue
            seen.add((text, href, frag))
            links.append((text, href, frag, node_depth(elem)))
        elif elem is start:
            # Only anchors after the body (or the fragment target) count
            reached = True

    # Remap raw DOM depths onto contiguous ToC levels
    depths = sorted(set(x[-1] for x in links))
    depth_map = {x: i for i, x in enumerate(depths)}
    for text, href, frag, depth in links:
        depth = depth_map[depth]
        if current_depth is None:
            current_depth = 0
            parent.add_item(href, frag, text)
        elif current_depth == depth:
            parent.add_item(href, frag, text)
        elif current_depth < depth:
            # Descend one level: nest under the most recent sibling
            parent = parent[-1] if len(parent) > 0 else parent
            parent.add_item(href, frag, text)
            current_depth += 1
        else:
            # Climb back up by the depth difference
            delta = current_depth - depth
            while delta > 0 and parent.parent is not None:
                parent = parent.parent
                delta -= 1
            parent.add_item(href, frag, text)
            current_depth = depth
    return ans
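# Hedged sketch of the depth normalisation used above: raw DOM depths are
# remapped onto contiguous ToC levels before nesting, e.g.
#
#   depths = sorted({4, 7, 9})
#   depth_map = {x: i for i, x in enumerate(depths)}
#   # depth_map == {4: 0, 7: 1, 9: 2}
#
# so anchors at uneven nesting depths still produce adjacent ToC levels.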