def get_header_text(header):
    """ Return the cleaned-up display text of a header element.

    The element's text content (without its tail) is serialized and
    passed through gg.normalize.  An explicit 'title' attribute on
    the element takes precedence over that text.  The chosen string
    is returned with leading and trailing whitespace stripped.

    """

    serialized = etree.tostring(
        header, method="text", encoding=six.text_type, with_tail=False)
    content = gg.normalize(serialized)

    # the title attribute, if present, overrides the textual content
    chosen = header.get('title', content)
    return chosen.strip()
def strip_pagenumbers(xhtml, strip_classes):
    """ Strip dp page numbers.

    Rationale: DP implements page numbers either with float or
    with absolute positioning. Float is not supported by Kindle.
    Absolute positioning is not allowed in epub.

    If we'd leave these in, they would show up as numbers in the
    middle of the text.

    To still keep links working, we replace all page number
    contraptions we can find with empty <a>'s.

    xhtml is the tree handed to xpath(); it is modified in place.
    strip_classes is an iterable of class names whose elements are
    to be rewritten.  Nothing is returned.

    """

    for class_ in strip_classes:
        # match the class name as a whole word inside @class
        query = "//xhtml:*[@class and contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % class_
        is_pageno_class = class_ in DP_PAGENUMBER_CLASSES
        rewritten = 0

        for elem in xpath(xhtml, query):
            # remember what the element displayed before we gut it
            serialized = etree.tostring(
                elem, method="text", encoding=six.text_type, with_tail=False)
            text = gg.normalize(serialized)

            if len(text) > 10:
                # too long to be a page number: leave it alone
                continue

            # no visible text: fall back to the title attribute (may be None)
            text = text or elem.get('title')

            # collect every id on the element or below it
            ids = elem.xpath(".//@id")

            # clear() also drops the tail, so keep it for later
            saved_tail = elem.tail
            elem.clear()
            elem.tag = NS.xhtml.a

            if ids:
                # some PPers put more than one page number into one
                # span. the last id wins because the others represent
                # empty pages.
                elem.set('id', ids[-1])

            if is_pageno_class:
                # flag the element as a rewritten page number. the
                # class is not used for styling since the element
                # is empty
                elem.set('class', 'x-ebookmaker-pageno')

            if text:
                elem.set('title', text)

            elem.tail = saved_tail
            rewritten += 1

            # The OPS Spec 2.0 demands XML processors, but many
            # browser-plugin ebook readers use the browser's HTML
            # parser, which doesn't grok the minimized form of empty
            # elements.  An empty text forces lxml to output the
            # non-minimized form <a></a>.
            elem.text = ''

        if rewritten:
            warning("%d elements having class %s have been rewritten." %
                    (rewritten, class_))
def make_toc(self, xhtml):
    """ Build a TOC from HTML headers.

    Candidates are <h1> to <h4>, any element whose class contains
    "pageno" (DP page numbers) and DocUtils <p class="topic-title">
    elements, in the order xpath() returns them.

    Return a list of [url, text, depth] lists, where url is
    self.attribs.url plus a fragment pointing at the element.
    Elements without an id get a generated one ('pgepubidNNNNN')
    set on them, so xhtml is modified in place.

    Page numbers are also inserted (with depth -1) because DTBook
    NCX needs the play_order to be sequential.

    """

    def id_generator(i=0):
        """ Generate an id for the TOC to link to. """
        while True:
            yield 'pgepubid%05d' % i
            i += 1

    idg = id_generator()

    def get_id(elem):
        """ Get the id of the element or generate and set one. """
        if not elem.get('id'):
            elem.set('id', six.next(idg))
        return elem.get('id')

    toc = []
    last_depth = 0

    for header in xpath(
            xhtml,
            '//xhtml:h1|//xhtml:h2|//xhtml:h3|//xhtml:h4|'
            # DP page number
            '//xhtml:*[contains (@class, "pageno")]|'
            # DocUtils contents header
            '//xhtml:p[contains (@class, "topic-title")]'):

        text = gg.normalize(
            etree.tostring(header, method="text",
                           encoding=six.text_type, with_tail=False))

        # an explicit title attribute overrides the textual content
        text = header.get('title', text).strip()
        if not text:
            # so <h2 title=""> may be used to suppress TOC entry
            continue

        if header.get('class', '').find('pageno') > -1:
            # page number entry: depth -1 marks it as not a real header
            toc.append(
                ["%s#%s" % (self.attribs.url, get_id(header)), text, -1])
            continue

        # header
        if text.lower().startswith('by '):
            # common error in PG: <h2>by Lewis Carroll</h2> should
            # yield no TOC entry
            continue

        try:
            # last character of the tag is the header level for h1..h4
            depth = int(header.tag[-1:])
        except ValueError:
            # not an <hN> (e.g. topic-title <p>)
            depth = 2  # avoid top level

        # fix bogus header numberings: never descend more than one
        # level at a time
        if depth > last_depth + 1:
            depth = last_depth + 1
        last_depth = depth

        # if <h*> is first element of a <div> use <div> instead.
        # parent.text is None when nothing at all precedes the header
        # (<div><h2>), which must count as "first" just like
        # whitespace-only text does.
        parent = header.getparent()
        if (parent is not None
                and parent.tag == NS.xhtml.div
                and parent[0] == header
                and not (parent.text or '').strip()):
            header = parent

        toc.append(
            ["%s#%s" % (self.attribs.url, get_id(header)), text, depth])

    return toc