def commit_toc(container, toc, lang=None, uid=None):
    """Serialize ``toc`` as an NCX document and store it in ``container``.

    The document is written to the name reported by ``find_existing_toc``;
    when that is ``None`` a new ``toc.ncx`` item is generated first.

    :param container: Book container giving access to the OPF and its files.
    :param toc: Table of contents, passed through to ``create_ncx``.
    :param lang: Language for the NCX.  When falsy, the first non-empty
        ``dc:language`` entry of the OPF is used, with ``get_lang()`` as the
        fallback.
    :param uid: Identifier for the NCX.  When falsy, the text of the OPF
        element referenced by ``unique-identifier`` is used, with
        ``uuid_id()`` as the fallback.
    """
    ncx_name = find_existing_toc(container)
    if ncx_name is None:
        new_item = container.generate_item("toc.ncx", id_prefix="toc")
        ncx_name = container.href_to_name(
            new_item.get("href"), base=container.opf_name
        )

    if not lang:
        lang = get_lang()
        for lang_elem in container.opf_xpath("//dc:language"):
            declared = canonicalize_lang(xml2text(lang_elem).strip())
            if declared:
                # Prefer the two-letter code when one exists.
                lang = lang_as_iso639_1(declared) or declared
                break
    lang = lang_as_iso639_1(lang) or lang

    if not uid:
        uid = uuid_id()
        identifier_ref = container.opf.get("unique-identifier", None)
        if identifier_ref:
            referenced = container.opf_xpath('//*[@id="%s"]' % identifier_ref)
            if referenced:
                uid = xml2text(referenced[0])

    title = _("Table of Contents")
    title_elems = container.opf_xpath("//dc:title")
    if title_elems:
        # An empty dc:title keeps the default title.
        title = xml2text(title_elems[0]).strip() or title

    # Links inside the NCX are written relative to the NCX file itself.
    name_to_ncx_href = partial(container.name_to_href, base=ncx_name)
    ncx_root = create_ncx(toc, name_to_ncx_href, title, lang, uid)
    container.replace(ncx_name, ncx_root)
    container.pretty_print.add(ncx_name)
def commit_ncx_toc(container, toc, lang=None, uid=None):
    """Serialize ``toc`` as an NCX document and store it in ``container``.

    The document is written to the name reported by
    ``find_existing_ncx_toc``.  When that is ``None`` a new ``toc.ncx`` item
    is generated and every ``opf:spine`` element gets its ``toc`` attribute
    pointed at the new item's id.

    :param container: Book container giving access to the OPF and its files.
    :param toc: Table of contents, passed through to ``create_ncx``.
    :param lang: Language for the NCX.  When falsy, the first non-empty
        ``dc:language`` entry of the OPF is used, with ``get_lang()`` as the
        fallback.
    :param uid: Identifier for the NCX.  When falsy, the text of the OPF
        element referenced by ``unique-identifier`` is used, with
        ``uuid_id()`` as the fallback.
    """
    tocname = find_existing_ncx_toc(container)
    if tocname is None:
        item = container.generate_item('toc.ncx', id_prefix='toc')
        tocname = container.href_to_name(item.get('href'), base=container.opf_name)
        ncx_id = item.get('id')
        # Plain loop: this is done for its side effect, not to build a list.
        for spine in container.opf_xpath('//opf:spine'):
            spine.set('toc', ncx_id)

    if not lang:
        lang = get_lang()
        for lang_elem in container.opf_xpath('//dc:language'):
            declared = canonicalize_lang(xml2text(lang_elem).strip())
            if declared:
                # Prefer the two-letter code when one exists.
                lang = lang_as_iso639_1(declared) or declared
                break
    lang = lang_as_iso639_1(lang) or lang

    if not uid:
        uid = uuid_id()
        eid = container.opf.get('unique-identifier', None)
        if eid:
            m = container.opf_xpath('//*[@id="%s"]' % eid)
            if m:
                uid = xml2text(m[0])

    title = _('Table of Contents')
    m = container.opf_xpath('//dc:title')
    if m:
        # An empty dc:title keeps the default title.
        title = xml2text(m[0]).strip() or title

    # Links inside the NCX are written relative to the NCX file itself.
    to_href = partial(container.name_to_href, base=tocname)
    root = create_ncx(toc, to_href, title, lang, uid)
    container.replace(tocname, root)
    container.pretty_print.add(tocname)
def detect_chapters(self):
    """Locate chapter elements in the spine and insert marks before them.

    Every element matched by the XPath expression ``self.opts.chapter`` is
    appended to ``self.detected_chapters`` as an ``(item, element)`` pair.
    Depending on ``self.opts.chapter_mark`` (``'none'``, ``'rule'``,
    ``'pagebreak'`` or anything else, treated as ``'both'``) a mark element
    is inserted before each detected element.
    """
    self.detected_chapters = []

    def find_matches(expr, doc):
        # Best effort: a bad expression is reported and ignored.  Only
        # ordinary errors are swallowed, so KeyboardInterrupt and SystemExit
        # still propagate.
        try:
            return XPath(expr)(doc)
        except Exception:
            self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
            return []

    if not self.opts.chapter:
        return

    for item in self.oeb.spine:
        for x in find_matches(self.opts.chapter, item.data):
            self.detected_chapters.append((item, x))

    chapter_mark = self.opts.chapter_mark
    page_break_before = 'display: block; page-break-before: always'
    page_break_after = 'display: block; page-break-after: always'
    for item, elem in self.detected_chapters:
        # Collapse whitespace so the log shows a single readable line.
        text = re.sub(r'\s+', ' ', xml2text(elem).strip())
        self.log('\tDetected chapter:', text[:50])
        if chapter_mark == 'none':
            continue
        elif chapter_mark == 'rule':
            mark = etree.Element(XHTML('hr'))
        elif chapter_mark == 'pagebreak':
            mark = etree.Element(XHTML('div'), style=page_break_after)
        else:  # chapter_mark == 'both':
            mark = etree.Element(XHTML('hr'), style=page_break_before)
        try:
            elem.addprevious(mark)
        except TypeError:
            self.log.exception('Failed to mark chapter')
def remove_first_image(self):
    """Remove the first image found in the spine, skipping jacket pages.

    Spine items matching ``JACKET_XPATH`` are skipped.  For the first other
    item from which ``self.remove_images`` removes something, the search
    stops.  If that item is then left with no body text and no images, it
    is removed from the manifest, and the TOC entries and guide references
    pointing at it are removed as well.  A warning is logged when no image
    could be removed from any item.
    """
    emptied_item = None
    for item in self.oeb.spine:
        if XPath(JACKET_XPATH)(item.data):
            continue
        removed = self.remove_images(item)
        if not removed > 0:
            continue
        self.log('Removed first image')
        bodies = XPath('//h:body')(item.data)
        if bodies:
            remaining_text = xml2text(bodies[0]).strip()
            remaining_images = XPath('//h:img|//svg:svg')(item.data)
            if not (remaining_text or remaining_images):
                self.log('Removing %s as it has no content'%item.href)
                self.oeb.manifest.remove(item)
                emptied_item = item
        break
    else:
        self.log.warn('Could not find first image to remove')

    if emptied_item is None:
        return

    # Iterate over a copy because entries are removed while looping.
    for entry in list(self.oeb.toc):
        if urldefrag(entry.href)[0] == emptied_item.href:
            self.oeb.toc.remove(entry)
    self.oeb.guide.remove_by_href(emptied_item.href)
def create_toc_from_links(self):
    """Populate the TOC from the hyperlinks found in the spine documents.

    Only links without a scheme, or with the ``file`` scheme, are
    considered.  A link is skipped when its target is already in the TOC,
    or when its text is already in the TOC and
    ``self.opts.duplicate_links_in_toc`` is false.  Processing stops once
    ``self.opts.max_toc_links`` entries have been added (when that limit is
    positive).
    """
    added = 0
    for item in self.oeb.spine:
        for anchor in XPath('//h:a[@href]')(item.data):
            target = anchor.get('href')
            try:
                parsed = urlparse(target)
            except ValueError:
                self.log.warning('Ignoring malformed URL:', target)
                continue

            scheme = parsed[0]
            if scheme and scheme != 'file':
                continue

            target = item.abshref(parsed.path)
            if parsed.fragment:
                target = '#'.join((target, parsed.fragment))
            if self.oeb.toc.has_href(target):
                continue

            label = xml2text(anchor)[:100].strip()
            if not self.opts.duplicate_links_in_toc and self.oeb.toc.has_text(label):
                continue

            try:
                self.oeb.toc.add(label, target,
                    play_order=self.oeb.toc.next_play_order())
                added += 1
            except ValueError:
                self.oeb.log.exception('Failed to process link: %r' % target)
                continue  # Most likely an incorrectly URL encoded link

            if self.opts.max_toc_links > 0 and added >= self.opts.max_toc_links:
                self.log('Maximum TOC links reached, stopping.')
                return
def _toc_from_html(self, opf):
    """Build the TOC from the HTML page referenced by the guide's ``toc``.

    When the guide reference carries a fragment, the search is narrowed to
    the element with that ``id`` (or, failing that, ``name``), widened to
    its nearest ancestor that contains links.  Links whose target file is
    not in the manifest are ignored; the texts of links sharing the same
    target are joined with spaces into a single entry.

    :param opf: Unused by this method.
    :return: ``False`` when the guide has no ``toc`` entry, else ``True``.
    """
    if 'toc' not in self.oeb.guide:
        return False
    self.log.debug('Reading TOC from HTML...')

    itempath, frag = urldefrag(self.oeb.guide['toc'].href)
    item = self.oeb.manifest.hrefs[itempath]
    scope = item.data
    if frag:
        targets = (xpath(scope, './/*[@id="%s"]' % frag) or
                   xpath(scope, './/*[@name="%s"]' % frag))
        node = targets[0] if targets else scope
        # Climb until the node contains at least one link.
        while node != scope and not xpath(node, './/h:a[@href]'):
            node = node.getparent()
        scope = node

    titles_by_href = defaultdict(list)
    first_seen_order = []
    for anchor in xpath(scope, './/h:a[@href]'):
        href = item.abshref(urlnormalize(anchor.attrib['href']))
        path = urldefrag(href)[0]
        if path not in self.oeb.manifest.hrefs:
            continue
        title = COLLAPSE_RE.sub(' ', xml2text(anchor).strip())
        if href not in titles_by_href:
            first_seen_order.append(href)
        titles_by_href[href].append(title)

    toc = self.oeb.toc
    for href in first_seen_order:
        toc.add(' '.join(titles_by_href[href]), href)
    return True
def detect_chapters(self):
    """Locate chapter elements in the spine and insert marks before them.

    Every element matched by the XPath expression ``self.opts.chapter`` is
    appended to ``self.detected_chapters`` as an ``(item, element)`` pair.
    Depending on ``self.opts.chapter_mark`` (``'none'``, ``'rule'``,
    ``'pagebreak'`` or anything else, treated as ``'both'``) a mark element
    is inserted before each detected element.

    An expression that cannot be compiled or evaluated is reported with a
    warning and ignored instead of aborting the whole run.
    """
    self.detected_chapters = []
    if not self.opts.chapter:
        return

    expr = self.opts.chapter
    try:
        chapter_xpath = XPath(expr)
    except Exception:
        self.log.warn('Invalid chapter expression, ignoring: %s' % expr)
        chapter_xpath = None

    if chapter_xpath is not None:
        for item in self.oeb.spine:
            try:
                # list() also rejects results that cannot be iterated.
                matches = list(chapter_xpath(item.data))
            except Exception:
                self.log.warn('Invalid chapter expression, ignoring: %s' % expr)
                continue
            for x in matches:
                self.detected_chapters.append((item, x))

    chapter_mark = self.opts.chapter_mark
    page_break_before = 'display: block; page-break-before: always'
    page_break_after = 'display: block; page-break-after: always'
    for item, elem in self.detected_chapters:
        # Collapse whitespace so the log shows a single readable line.
        text = re.sub(r'\s+', ' ', xml2text(elem).strip())
        self.log('\tDetected chapter:', text[:50])
        if chapter_mark == 'none':
            continue
        elif chapter_mark == 'rule':
            mark = etree.Element(XHTML('hr'))
        elif chapter_mark == 'pagebreak':
            mark = etree.Element(XHTML('div'), style=page_break_after)
        else:  # chapter_mark == 'both':
            mark = etree.Element(XHTML('hr'), style=page_break_before)
        try:
            elem.addprevious(mark)
        except TypeError:
            self.log.exception('Failed to mark chapter')
def description_for_anchor(elem):
    """Return a short description (at most 30 characters) for ``elem``.

    Candidates are tried in order: the ``title`` attribute, the element's
    own text, and the text of its first child.  Each must be at least four
    characters long after stripping.  As a last resort, when the element
    has no more than six descendant elements, its full text is used if it
    is non-empty.  Returns ``None`` when nothing qualifies.
    """
    def usable(candidate, min_len=4):
        if not candidate:
            return None
        candidate = candidate.strip()
        if len(candidate) < min_len:
            return None
        return candidate[:30]

    def cheap_candidates():
        yield elem.get('title')
        yield elem.text
        if len(elem) > 0:
            yield elem[0].text

    for candidate in cheap_candidates():
        desc = usable(candidate)
        if desc is not None:
            return desc

    # Get full text for tags that have only a few descendants
    descendant_count = 0
    for _descendant in elem.iterdescendants('*'):
        descendant_count += 1
        if descendant_count > 6:
            return None
    return usable(xml2text(elem), min_len=1)
def elem_to_toc_text(elem):
    """Return the text to show in the TOC for ``elem``.

    The element's text content is preferred, then its ``title`` attribute,
    then its ``alt`` attribute.  Whitespace runs are collapsed to single
    spaces and the result is limited to 1000 characters.  When nothing is
    left, the translated ``(Untitled)`` placeholder is returned.
    """
    label = xml2text(elem).strip() or elem.get('title', '') or elem.get('alt', '')
    label = re.sub(r'\s+', ' ', label.strip())[:1000].strip()
    return label if label else _('(Untitled)')
def elem_to_toc_text(elem):
    """Return the text to show in the TOC for ``elem``.

    The element's text content is preferred, then its ``title`` attribute,
    then its ``alt`` attribute.  Whitespace runs are collapsed to single
    spaces and the result is limited to 1000 characters.  When nothing is
    left, the translated ``(Untitled)`` placeholder is returned.
    """
    label = xml2text(elem).strip()
    for attribute in ("title", "alt"):
        if label:
            break
        label = elem.get(attribute, "")
    collapsed = re.sub(r"\s+", " ", label.strip())
    trimmed = collapsed[:1000].strip()
    if trimmed:
        return trimmed
    return _("(Untitled)")
def elem_to_link(self, item, elem, counter):
    """Return ``(text, href)`` for a TOC entry pointing at ``elem``.

    The text is the element's content, falling back to its ``title`` and
    then ``alt`` attribute, with whitespace collapsed and limited to 1000
    characters.  The element is given an ``id`` (``calibre_toc_<counter>``
    when it has none) and the href is ``item.href`` plus that id as the
    fragment.
    """
    label = xml2text(elem).strip() or elem.get('title', '') or elem.get('alt', '')
    label = re.sub(r'\s+', ' ', label.strip())[:1000].strip()

    anchor_id = elem.get('id', 'calibre_toc_%d'%counter)
    elem.set('id', anchor_id)
    return label, '#'.join((item.href, anchor_id))
def elem_to_link(self, item, elem, counter):
    """Return ``(text, href)`` for a TOC entry pointing at ``elem``.

    The text is the element's content, falling back to its ``title`` and
    then ``alt`` attribute, with whitespace collapsed and limited to 1000
    characters.  The element is given an ``id`` (``calibre_toc_<counter>``
    when it has none) and the href is ``item.href`` plus that id as the
    fragment.
    """
    label = xml2text(elem).strip()
    for attribute in ("title", "alt"):
        if label:
            break
        label = elem.get(attribute, "")
    collapsed = re.sub(r"\s+", " ", label.strip())
    label = collapsed[:1000].strip()

    anchor_id = elem.get("id", "calibre_toc_%d" % counter)
    elem.set("id", anchor_id)
    target = "#".join((item.href, anchor_id))
    return label, target
def detect_chapters(self):
    """Locate chapter elements in the spine and insert marks before them.

    ``self.opts.chapter`` is split by ``self.get_toc_parts_for_xpath`` into
    an XPath expression and a title attribute.  The attribute is stored on
    ``self.chapter_title_attribute`` and every match is appended to
    ``self.detected_chapters`` as an ``(item, element)`` pair.  Depending
    on ``self.opts.chapter_mark`` (``'none'``, ``'rule'``, ``'pagebreak'``
    or anything else, treated as ``'both'``) a mark element is inserted
    before each detected element.
    """
    self.detected_chapters = []
    self.chapter_title_attribute = None

    def find_matches(expr, doc):
        # Best effort: a bad expression is reported and ignored.  Only
        # ordinary errors are swallowed, so KeyboardInterrupt and SystemExit
        # still propagate.
        try:
            ans = XPath(expr)(doc)
            # A result without a length is treated as invalid as well.
            len(ans)
            return ans
        except Exception:
            self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
            return []

    if not self.opts.chapter:
        return

    chapter_path, title_attribute = self.get_toc_parts_for_xpath(self.opts.chapter)
    self.chapter_title_attribute = title_attribute
    for item in self.oeb.spine:
        for x in find_matches(chapter_path, item.data):
            self.detected_chapters.append((item, x))

    chapter_mark = self.opts.chapter_mark
    page_break_before = 'display: block; page-break-before: always'
    page_break_after = 'display: block; page-break-after: always'
    c = Counter()
    for item, elem in self.detected_chapters:
        c[item] += 1
        # Collapse whitespace so the log shows a single readable line.
        text = re.sub(r'\s+', ' ', xml2text(elem).strip())
        self.log('\tDetected chapter:', text[:50])
        if chapter_mark == 'none':
            continue
        if chapter_mark == 'rule':
            mark = elem.makeelement(XHTML('hr'))
        elif chapter_mark == 'pagebreak':
            if c[item] < 3 and at_start(elem):
                # For the first two elements in this item, check if they
                # are at the start of the file, in which case inserting a
                # page break is unnecessary and can lead to extra blank
                # pages in the PDF Output plugin. We need to use two as
                # feedbooks epubs match both a heading tag and its
                # containing div with the default chapter expression.
                continue
            mark = elem.makeelement(XHTML('div'), style=page_break_after)
        else:  # chapter_mark == 'both':
            mark = elem.makeelement(XHTML('hr'), style=page_break_before)
        try:
            elem.addprevious(mark)
        except TypeError:
            self.log.exception('Failed to mark chapter')
def find_text(node):
    """Return a short piece of text from the first child of ``node`` that has any.

    Children that are not ``etree._Element`` instances, or that contain no
    text, are skipped.  Whitespace runs are collapsed to single spaces.
    When the first child with text holds more than 200 characters, a
    shorter text is looked for recursively inside it; failing that, its
    first 200 characters followed by ``...`` are returned.  Returns
    ``None`` when no child has text.
    """
    LIMIT = 200
    whitespace = re.compile(r'\s+')
    for child in node:
        if not isinstance(child, etree._Element):
            continue
        text = whitespace.sub(' ', xml2text(child).strip())
        if not text:
            continue
        if len(text) <= LIMIT:
            return text
        # Look for less text in a child of this node, recursively
        return find_text(child) or (text[:LIMIT] + '...')
def elem_to_link(self, item, elem, title_attribute, counter):
    """Return ``(text, href)`` for a TOC entry pointing at ``elem``.

    When ``title_attribute`` is not ``None`` and the element has a
    non-empty value for it, that value is the text.  Otherwise the
    element's content is used, falling back to its ``title`` and then
    ``alt`` attribute.  Whitespace is collapsed and the text is limited to
    1000 characters.  The element is given an ``id``
    (``calibre_toc_<counter>`` when it has none) and the href is
    ``item.href`` plus that id as the fragment.
    """
    label = elem.get(title_attribute, '') if title_attribute is not None else ''
    label = (label or xml2text(elem).strip() or
             elem.get('title', '') or elem.get('alt', ''))
    label = re.sub(r'\s+', ' ', label.strip())[:1000].strip()

    anchor_id = elem.get('id', 'calibre_toc_%d' % counter)
    elem.set('id', anchor_id)
    return label, '#'.join((item.href, anchor_id))
def detect_chapters(self):
    """Locate chapter elements in the spine and insert marks before them.

    Every element matched by the XPath expression ``self.opts.chapter`` is
    appended to ``self.detected_chapters`` as an ``(item, element)`` pair.
    Depending on ``self.opts.chapter_mark`` (``"none"``, ``"rule"``,
    ``"pagebreak"`` or anything else, treated as ``"both"``) a mark element
    is inserted before each detected element.
    """
    self.detected_chapters = []

    def find_matches(expr, doc):
        # Best effort: a bad expression is reported and ignored.  Only
        # ordinary errors are swallowed, so KeyboardInterrupt and SystemExit
        # still propagate.
        try:
            ans = XPath(expr)(doc)
            # A result without a length is treated as invalid as well.
            len(ans)
            return ans
        except Exception:
            self.log.warn("Invalid chapter expression, ignoring: %s" % expr)
            return []

    if not self.opts.chapter:
        return

    for item in self.oeb.spine:
        for x in find_matches(self.opts.chapter, item.data):
            self.detected_chapters.append((item, x))

    chapter_mark = self.opts.chapter_mark
    page_break_before = "display: block; page-break-before: always"
    page_break_after = "display: block; page-break-after: always"
    c = Counter()
    for item, elem in self.detected_chapters:
        c[item] += 1
        # Collapse whitespace so the log shows a single readable line.
        text = re.sub(r"\s+", " ", xml2text(elem).strip())
        self.log("\tDetected chapter:", text[:50])
        if chapter_mark == "none":
            continue
        if chapter_mark == "rule":
            mark = etree.Element(XHTML("hr"))
        elif chapter_mark == "pagebreak":
            if c[item] < 3 and at_start(elem):
                # For the first two elements in this item, check if they
                # are at the start of the file, in which case inserting a
                # page break is unnecessary and can lead to extra blank
                # pages in the PDF Output plugin. We need to use two as
                # feedbooks epubs match both a heading tag and its
                # containing div with the default chapter expression.
                continue
            mark = etree.Element(XHTML("div"), style=page_break_after)
        else:  # chapter_mark == 'both':
            mark = etree.Element(XHTML("hr"), style=page_break_before)
        try:
            elem.addprevious(mark)
        except TypeError:
            self.log.exception("Failed to mark chapter")
def find_cover_image_in_page(container, cover_page):
    """Return the name of the single image that makes up ``cover_page``.

    The page qualifies only when it has exactly one ``body``, no
    non-whitespace text, and exactly one image reference (an ``img`` with
    ``src`` or an SVG ``image``) that has a non-empty href.  In every
    other case ``None`` is returned.

    :param container: Book container used to parse the page and resolve
        hrefs to names.
    :param cover_page: Name of the cover page file inside the container.
    """
    root = container.parsed(cover_page)
    bodies = XPath('//h:body')(root)
    if len(bodies) != 1:
        return None
    body = bodies[0]

    image_elems = XPath('descendant::h:img[@src]|descendant::svg:svg/descendant::svg:image')(body)
    hrefs = (img.get('src') or img.get(XLINK('href')) for img in image_elems)
    image_names = [
        container.href_to_name(href, base=cover_page) for href in hrefs if href
    ]

    visible_text = re.sub(r'\s+', '', xml2text(body))
    if visible_text or len(image_names) != 1:
        # Document has more content than a single image (or has no image)
        return None
    return image_names[0]
def remove_first_image(self):
    """Remove the first image found in the spine.

    The search stops at the first spine item from which
    ``self.remove_images`` removes something.  If that item is then left
    with no body text and no images, it is removed from the manifest and
    the TOC entries pointing at it are removed as well.
    """
    emptied_item = None
    for item in self.oeb.spine:
        removed = self.remove_images(item)
        if not removed > 0:
            continue
        self.log('Removed first image')
        bodies = XPath('//h:body')(item.data)
        if bodies:
            remaining_text = xml2text(bodies[0]).strip()
            remaining_images = XPath('//h:img|//svg:svg')(item.data)
            if not (remaining_text or remaining_images):
                self.log('Removing %s as it has no content'%item.href)
                self.oeb.manifest.remove(item)
                emptied_item = item
        break

    if emptied_item is None:
        return

    # Iterate over a copy because entries are removed while looping.
    for entry in list(self.oeb.toc):
        if urldefrag(entry.href)[0] == emptied_item.href:
            self.oeb.toc.remove(entry)
def create_toc_from_links(self):
    """Populate the TOC from the hyperlinks found in the spine documents.

    Only links without a scheme, or with the ``file`` scheme, are
    considered.  A link is skipped when its target is already in the TOC,
    or when its text is already in the TOC and
    ``self.opts.duplicate_links_in_toc`` is false.  Processing stops once
    ``self.opts.max_toc_links`` entries have been added (when that limit is
    positive).

    Malformed hrefs, and links the TOC rejects with ``ValueError``, are
    logged and skipped rather than aborting the whole run.
    """
    num = 0
    for item in self.oeb.spine:
        for a in XPath("//h:a[@href]")(item.data):
            href = a.get("href")
            try:
                purl = urlparse(href)
            except ValueError:
                # urlparse rejects some malformed URLs; one bad link must
                # not stop TOC generation.
                self.log.warning("Ignoring malformed URL:", href)
                continue
            if purl[0] and purl[0] != "file":
                continue

            href, frag = purl.path, purl.fragment
            href = item.abshref(href)
            if frag:
                href = "#".join((href, frag))
            if self.oeb.toc.has_href(href):
                continue

            text = xml2text(a)
            text = text[:100].strip()
            if not self.opts.duplicate_links_in_toc and self.oeb.toc.has_text(text):
                continue

            try:
                self.oeb.toc.add(text, href, play_order=self.oeb.toc.next_play_order())
            except ValueError:
                # Most likely an incorrectly URL encoded link
                self.log.exception("Failed to process link: %r" % href)
                continue
            # Count only the links that were actually added.
            num += 1
            if self.opts.max_toc_links > 0 and num >= self.opts.max_toc_links:
                self.log("Maximum TOC links reached, stopping.")
                return
def detect_chapters(self):
    """Locate chapter elements in the spine and insert marks before them.

    Every element matched by the XPath expression ``self.opts.chapter`` is
    appended to ``self.detected_chapters`` as an ``(item, element)`` pair.
    Depending on ``self.opts.chapter_mark`` (``'none'``, ``'rule'``,
    ``'pagebreak'`` or anything else, treated as ``'both'``) a mark element
    is inserted before each detected element.
    """
    self.detected_chapters = []

    def find_matches(expr, doc):
        # Best effort: a bad expression is reported and ignored.  Only
        # ordinary errors are swallowed, so KeyboardInterrupt and SystemExit
        # still propagate.
        try:
            ans = XPath(expr)(doc)
            # A result without a length is treated as invalid as well.
            len(ans)
            return ans
        except Exception:
            self.log.warn('Invalid chapter expression, ignoring: %s'%expr)
            return []

    if not self.opts.chapter:
        return

    for item in self.oeb.spine:
        for x in find_matches(self.opts.chapter, item.data):
            self.detected_chapters.append((item, x))

    chapter_mark = self.opts.chapter_mark
    page_break_before = 'display: block; page-break-before: always'
    page_break_after = 'display: block; page-break-after: always'
    for item, elem in self.detected_chapters:
        # Collapse whitespace so the log shows a single readable line.
        text = re.sub(r'\s+', ' ', xml2text(elem).strip())
        self.log('\tDetected chapter:', text[:50])
        if chapter_mark == 'none':
            continue
        elif chapter_mark == 'rule':
            mark = etree.Element(XHTML('hr'))
        elif chapter_mark == 'pagebreak':
            mark = etree.Element(XHTML('div'), style=page_break_after)
        else:  # chapter_mark == 'both':
            mark = etree.Element(XHTML('hr'), style=page_break_before)
        try:
            elem.addprevious(mark)
        except TypeError:
            self.log.exception('Failed to mark chapter')
def create_toc_from_links(self):
    """Populate the TOC from the hyperlinks found in the spine documents.

    Only links without a scheme, or with the ``file`` scheme, are
    considered.  A link is skipped when its target is already in the TOC,
    or when its text is already in the TOC and
    ``self.opts.duplicate_links_in_toc`` is false.  Processing stops once
    ``self.opts.max_toc_links`` entries have been added (when that limit is
    positive).

    Malformed hrefs, and links the TOC rejects with ``ValueError``, are
    logged and skipped rather than aborting the whole run.
    """
    num = 0
    for item in self.oeb.spine:
        for a in XPath('//h:a[@href]')(item.data):
            href = a.get('href')
            try:
                purl = urlparse(href)
            except ValueError:
                # urlparse rejects some malformed URLs; one bad link must
                # not stop TOC generation.
                self.log.warning('Ignoring malformed URL:', href)
                continue
            if purl[0] and purl[0] != 'file':
                continue

            href, frag = purl.path, purl.fragment
            href = item.abshref(href)
            if frag:
                href = '#'.join((href, frag))
            if self.oeb.toc.has_href(href):
                continue

            text = xml2text(a)
            text = text[:100].strip()
            if (not self.opts.duplicate_links_in_toc and
                    self.oeb.toc.has_text(text)):
                continue

            try:
                self.oeb.toc.add(text, href,
                    play_order=self.oeb.toc.next_play_order())
            except ValueError:
                # Most likely an incorrectly URL encoded link
                self.log.exception('Failed to process link: %r' % href)
                continue
            # Count only the links that were actually added.
            num += 1
            if self.opts.max_toc_links > 0 and \
                    num >= self.opts.max_toc_links:
                self.log('Maximum TOC links reached, stopping.')
                return
def read_inline_toc(self, href, frag):
    """Build a nested ``TOC`` from the links of an HTML file on disk.

    The file at ``href`` (slash separated, converted to the OS separator)
    is read, decoded with ``self.header.codec`` and parsed.  Links are
    collected in document order once the start element has been passed:
    the ``body``, or the element whose ``id`` equals ``frag`` when that
    exists.  Without a ``body`` collection starts immediately.  Duplicate
    ``(text, href, fragment)`` triples are ignored.  Nesting of the result
    follows how deeply each link sits in the HTML tree.

    :param href: Path of the HTML file; link targets are resolved against
        its directory part.
    :param frag: Optional id of the element after which links are read.
    :return: The populated ``TOC``.
    """
    toc_root = TOC()
    base_href = '/'.join(href.split('/')[:-1])
    with open(href.replace('/', os.sep), 'rb') as f:
        raw = f.read().decode(self.header.codec)
    root = parse_html(raw, log=self.log)

    bodies = XPath('//h:body')(root)
    start = bodies[0] if bodies else None
    # Without a body there is nothing to wait for.
    reached = not bodies
    if frag:
        frag_targets = XPath('//*[@id="%s"]' % frag)(root)
        if frag_targets:
            start = frag_targets[0]

    def node_depth(elem):
        # Number of ancestors between elem and the document root.
        depth = 0
        ancestor = elem.getparent()
        while ancestor is not None:
            depth += 1
            ancestor = ancestor.getparent()
        return depth

    seen = set()
    links = []
    for elem in root.iterdescendants(etree.Element):
        if reached and elem.tag == XHTML('a') and elem.get('href', False):
            link_href, link_frag = urldefrag(elem.get('href'))
            link_href = base_href + '/' + link_href
            text = xml2text(elem).strip()
            key = (text, link_href, link_frag)
            if key in seen:
                continue
            seen.add(key)
            links.append((text, link_href, link_frag, node_depth(elem)))
        elif elem is start:
            reached = True

    # Layer the ToC based on nesting order in the source HTML.  Raw tree
    # depths are mapped to consecutive ranks starting at 0.
    distinct_depths = sorted({entry[-1] for entry in links})
    depth_rank = {raw_depth: rank for rank, raw_depth in enumerate(distinct_depths)}

    current_depth = None
    parent = toc_root
    for text, link_href, link_frag, raw_depth in links:
        depth = depth_rank[raw_depth]
        if current_depth is None:
            current_depth = 0
        elif current_depth < depth:
            # Descend a single level, under the most recently added entry.
            if len(parent) > 0:
                parent = parent[-1]
            current_depth += 1
        elif current_depth > depth:
            # Climb back up, stopping at the root if it is reached first.
            for _step in range(current_depth - depth):
                if parent.parent is None:
                    break
                parent = parent.parent
            current_depth = depth
        parent.add_item(link_href, link_frag, text)
    return toc_root
def read_inline_toc(self, href, frag):
    """Build a nested ``TOC`` from the links of an HTML file on disk.

    The file at ``href`` (slash separated, converted to the OS separator)
    is read, decoded with ``self.header.codec`` and parsed.  Links are
    collected in document order once the start element has been passed:
    the ``body``, or the element whose ``id`` equals ``frag`` when that
    exists.  Without a ``body`` collection starts immediately.  Duplicate
    ``(text, href, fragment)`` triples are ignored.  Nesting of the result
    follows how deeply each link sits in the HTML tree.

    :param href: Path of the HTML file; link targets are resolved against
        its directory part.
    :param frag: Optional id of the element after which links are read.
    :return: The populated ``TOC``.
    """
    toc_root = TOC()
    base_href = '/'.join(href.split('/')[:-1])
    with open(href.replace('/', os.sep), 'rb') as f:
        raw = f.read().decode(self.header.codec)
    root = parse_html(raw, log=self.log)

    bodies = XPath('//h:body')(root)
    start = bodies[0] if bodies else None
    # Without a body there is nothing to wait for.
    reached = not bodies
    if frag:
        frag_targets = XPath('//*[@id="%s"]'%frag)(root)
        if frag_targets:
            start = frag_targets[0]

    def node_depth(elem):
        # Number of ancestors between elem and the document root.
        depth = 0
        ancestor = elem.getparent()
        while ancestor is not None:
            depth += 1
            ancestor = ancestor.getparent()
        return depth

    seen = set()
    links = []
    for elem in root.iterdescendants(etree.Element):
        if reached and elem.tag == XHTML('a') and elem.get('href', False):
            link_href, link_frag = urldefrag(elem.get('href'))
            link_href = base_href + '/' + link_href
            text = xml2text(elem).strip()
            key = (text, link_href, link_frag)
            if key in seen:
                continue
            seen.add(key)
            links.append((text, link_href, link_frag, node_depth(elem)))
        elif elem is start:
            reached = True

    # Layer the ToC based on nesting order in the source HTML.  Raw tree
    # depths are mapped to consecutive ranks starting at 0.
    distinct_depths = sorted({entry[-1] for entry in links})
    depth_rank = {raw_depth: rank for rank, raw_depth in enumerate(distinct_depths)}

    current_depth = None
    parent = toc_root
    for text, link_href, link_frag, raw_depth in links:
        depth = depth_rank[raw_depth]
        if current_depth is None:
            current_depth = 0
        elif current_depth < depth:
            # Descend a single level, under the most recently added entry.
            if len(parent) > 0:
                parent = parent[-1]
            current_depth += 1
        elif current_depth > depth:
            # Climb back up, stopping at the root if it is reached first.
            for _step in range(current_depth - depth):
                if parent.parent is None:
                    break
                parent = parent.parent
            current_depth = depth
        parent.add_item(link_href, link_frag, text)
    return toc_root