Python GutenbergGlobals.normalize示例

        def get_header_text(header):
            """ Return the stripped display text of a header element.

            The element's ``title`` attribute is used when it is set;
            otherwise the element's own text content (tail excluded),
            passed through ``gg.normalize``, is used.

            """
            # Serialize only the textual content; the tail belongs to
            # whatever follows the element, not to the header itself.
            raw_text = etree.tostring(
                header,
                method="text",
                encoding=six.text_type,
                with_tail=False)
            fallback = gg.normalize(raw_text)

            title = header.get('title', fallback)
            return title.strip()

示例#2

显示文件

    def strip_pagenumbers(xhtml, strip_classes):
        """ Replace DP page number elements with empty anchors.

        DP marks up page numbers using either float or absolute
        positioning.  Kindle does not support float and epub does not
        allow absolute positioning, so if the markup stayed in place
        the numbers would surface in the middle of the running text.

        Every element carrying one of the classes in strip_classes is
        therefore turned into an empty <a> that keeps the id (so that
        links to the page still resolve) and remembers the page number
        in its title attribute.  Elements whose text is longer than 10
        characters are left untouched.

        xhtml         -- tree that is searched and modified in place
        strip_classes -- iterable of class names to look for

        """

        # matches any xhtml element having the class among the
        # whitespace-separated tokens of its class attribute
        xpath_template = "//xhtml:*[@class and contains(concat(' ', normalize-space(@class), ' '), ' %s ')]"

        for class_ in strip_classes:
            rewritten = 0

            for node in xpath(xhtml, xpath_template % class_):

                # remember what the element displayed
                label = gg.normalize(
                    etree.tostring(node,
                                   method="text",
                                   encoding=six.text_type,
                                   with_tail=False))

                # anything this long is probably not a page number,
                # better leave it alone
                if len(label) > 10:
                    continue

                # no text content: fall back to the title attribute
                label = label or node.get('title')

                # collect ids from the element and everything below it,
                # this must happen before the element is cleared
                ids = node.xpath(".//@id")

                # clear() also drops the tail, so keep it and put it
                # back afterwards
                saved_tail = node.tail
                node.clear()
                node.tag = NS.xhtml.a

                if ids:
                    # one span may hold several page numbers, the
                    # earlier ones stand for empty pages, so the last
                    # id is the one to keep
                    node.set('id', ids[-1])

                if class_ in DP_PAGENUMBER_CLASSES:
                    # flag the element as a rewritten page number. the
                    # class is not used for styling, the element is
                    # empty anyway
                    node.set('class', 'x-ebookmaker-pageno')

                if label:
                    node.set('title', label)

                # An empty string (instead of None) makes lxml write
                # <a ...></a> rather than the minimized <a .../>.
                # OPS 2.0 demands an XML processor, but many
                # browser-plugin readers use the browser's HTML parser,
                # which does not understand minimized empty elements.
                node.text = ''
                node.tail = saved_tail

                rewritten += 1

            if rewritten:
                warning("%d elements having class %s have been rewritten." %
                        (rewritten, class_))

示例#3

显示文件

文件： __init__.py 项目： verdetamadachi/ebookmaker

    def make_toc(self, xhtml):
        """ Build a TOC from HTML headers.

        Scans xhtml for h1-h4 elements, elements whose class contains
        "pageno" (DP page numbers) and <p> elements whose class
        contains "topic-title" (DocUtils contents header).

        Return a list of [url, text, depth] lists in the order in which
        xpath() yields the elements.  url is self.attribs.url plus a
        fragment pointing to the element's id; elements without an id
        get a generated one, so xhtml may be modified.  Page number
        entries have depth -1.

        Page numbers are also inserted because DTBook NCX needs the
        play_order to be sequential.

        """
        def id_generator(i=0):
            """ Generate an id for the TOC to link to. """
            while True:
                yield 'pgepubid%05d' % i
                i += 1

        idg = id_generator()

        def get_id(elem):
            """ Get the id of the element or generate and set one. """
            if not elem.get('id'):
                elem.set('id', six.next(idg))
            return elem.get('id')

        toc = []
        last_depth = 0

        for header in xpath(
                xhtml,
                '//xhtml:h1|//xhtml:h2|//xhtml:h3|//xhtml:h4|'
                # DP page number
                '//xhtml:*[contains (@class, "pageno")]|'
                # DocUtils contents header
                '//xhtml:p[contains (@class, "topic-title")]'):

            text = gg.normalize(
                etree.tostring(header,
                               method="text",
                               encoding=six.text_type,
                               with_tail=False))

            # a title attribute overrides the text content
            text = header.get('title', text).strip()

            if not text:
                # so <h2 title=""> may be used to suppress TOC entry
                continue

            if 'pageno' in header.get('class', ''):
                toc.append(
                    ["%s#%s" % (self.attribs.url, get_id(header)), text, -1])
                continue

            # header
            if text.lower().startswith('by '):
                # common error in PG: <h2>by Lewis Carroll</h2> should
                # yield no TOC entry
                continue

            try:
                # h1..h4 carry their level in the last char of the tag
                depth = int(header.tag[-1:])
            except ValueError:
                depth = 2  # avoid top level

            # fix bogus header numberings: never descend more than one
            # level at a time
            if depth > last_depth + 1:
                depth = last_depth + 1

            last_depth = depth

            # if <h*> is first element of a <div> use <div> instead.
            # "first" means nothing but whitespace precedes it inside
            # the <div>.  lxml reports .text as None when there is no
            # text at all (eg. <div><h2>), which must count as blank
            # too.  parent is None if header is the root element.
            parent = header.getparent()
            if (parent is not None
                    and parent.tag == NS.xhtml.div
                    and parent[0] == header
                    and not (parent.text or '').strip()):
                header = parent

            toc.append([
                "%s#%s" % (self.attribs.url, get_id(header)), text, depth
            ])

        return toc