示例#1
0
    def __invoke_callback(self, page_generator, element, callback):
        """Run ``callback(page_generator, element)`` and apply what it requests.

        The callback signals the desired outcome by raising one of the
        ``ReplaceWith*`` exceptions handled below:

        * ``ReplaceWithNothing`` -- ``element`` is removed from its parent.
        * ``ReplaceWithText`` -- ``element`` is swapped for a text node
          holding ``exc.text``.
        * ``ReplaceWithNode`` -- ``element`` is swapped for ``exc.node``
          (imported first when it belongs to another document); when
          ``exc.fix_namespaces`` is set, the inserted node additionally has
          its namespaces substituted and normalized.
        * ``ReplaceWithHTML`` -- ``exc.html`` is parsed with ``TagSoupToXml``
          and the children of the resulting ``<body>`` are inserted in place
          of ``element``.

        If the callback returns without raising, the tree is left untouched.
        Any other exception propagates to the caller.

        Raises:
            AssertionError: the parsed HTML has no ``<body>`` element, or
                that element is not in the empty namespace.
        """
        try:
            callback(page_generator, element)

        except ReplaceWithNothing:
            # Nothing takes the element's place: simply detach it.
            element.parentNode.removeChild(element)

        except ReplaceWithText as request:
            # Swap the element for a plain text node.
            replacement = element.ownerDocument.createTextNode(request.text)
            element.parentNode.replaceChild(replacement, element)

        except ReplaceWithNode as request:
            # Swap the element for the supplied node, importing it first if
            # it belongs to a different document.
            replacement = request.node
            same_document = replacement.ownerDocument is element.ownerDocument
            if not same_document:
                replacement = element.ownerDocument.importNode(replacement, True)
            element.parentNode.replaceChild(replacement, element)

            if request.fix_namespaces:
                # page_generator.content uses HTML without specifying a namespace
                substitute_namespaces(replacement, {XHTML_NAMESPACE: EMPTY_NAMESPACE})
                normalize_namespaces(replacement, strip_dups=True)

        except ReplaceWithHTML as request:
            # Turn the HTML fragment into a DOM document via TagSoupToXml.
            soup_parser = TagSoupToXml(omit_comments=request.omit_comments)
            soup_parser.feed(request.html)
            soup_parser.close()
            parsed = soup_parser.todocument()

            # Locate the first <body> element directly below the root.
            body = next(
                (
                    child
                    for child in parsed.documentElement.childNodes
                    if child.nodeType == child.ELEMENT_NODE
                    and child.localName == 'body'
                ),
                None,
            )
            if body is None:
                raise AssertionError("<body> element not found")

            # At this stage, HTML code doesn't have a namespace assigned yet.
            assert body.namespaceURI == EMPTY_NAMESPACE

            # Splice copies of the <body> children in front of the
            # placeholder, then drop the placeholder itself.
            parent = element.parentNode
            owner = element.ownerDocument
            for child in body.childNodes:
                parent.insertBefore(owner.importNode(child, True), element)
            parent.removeChild(element)
示例#2
0
    def load_content(self):
        """Parse the page's source file and store the DOM in ``self.content``.

        Steps, in order:

        1. Invoke the ``load_content:before`` filters.
        2. Read ``self.path_info.source_filename`` as UTF-8 text and feed it
           to ``TagSoupToXml`` (with comments omitted).
        3. Store the resulting document in ``self.content``.
        4. Substitute the XHTML namespace with the empty namespace on the
           document element and normalize the namespaces.
        5. Invoke the ``load_content:after`` filters.

        Nothing is returned; the result is available as ``self.content``.
        Errors from opening or decoding the file propagate to the caller.
        """
        self.invoke_filters('load_content:before')

        # Convert HTML tag soup to XML
        p = TagSoupToXml(omit_comments=True)  # Omit commented-out parts in the final output

        # Read through a context manager so the file handle is closed
        # promptly instead of being left to the garbage collector.
        # TODO: support other encodings? (is that safe?)
        with open(self.path_info.source_filename, "rt", encoding="UTF-8") as source_file:
            markup = source_file.read()
        p.feed(markup)
        p.close()

        # Keep the parsed result as a DOM document
        self.content = p.todocument()

        # Drop any "http://www.w3.org/1999/xhtml" namespace declarations
        substitute_namespaces(self.content.documentElement, {XHTML_NAMESPACE: EMPTY_NAMESPACE})
        normalize_namespaces(self.content.documentElement, strip_dups=True)

        self.invoke_filters('load_content:after')