Exemplo n.º 1
0
def _serialize(domtree):
    """Render a parsed tree back to an HTML string.

    The tree is walked with html5lib's ``etree`` tree walker and serialized
    with all attribute values quoted, attributes emitted in alphabetical
    order and no optional tags omitted.

    :param domtree: the tree to serialize; it is passed unchanged to the
        ``etree`` tree walker (presumably it was built by html5lib's etree
        treebuilder -- confirm against the callers).
    :returns: the result of ``HTMLSerializer.render`` for that tree.
    """
    walker = html5lib.treewalkers.getTreeWalker('etree')
    stream = walker(domtree)
    # Use 'always' rather than True: html5lib >= 1.0 only accepts 'always',
    # 'spec' or 'legacy' and raises ValueError otherwise, while older
    # releases merely test the option for truthiness, so 'always' keeps the
    # "quote every attribute" behaviour on both.
    serializer = HTMLSerializer(quote_attr_values='always',
                                alphabetical_attributes=True,
                                omit_optional_tags=False)
    return serializer.render(stream)
Exemplo n.º 2
0
 def serialize(self, output):
     """Write the HTML serialization of ``self.document`` to ``output``.

     The document is walked with html5lib's ``dom`` tree walker and
     serialized with optional tags omitted. Every chunk the serializer
     yields is encoded as UTF-8 and handed to ``output.write``.
     """
     dom_walker = html5lib.treewalkers.getTreeWalker("dom")
     token_stream = dom_walker(self.document)
     html_serializer = HTMLSerializer(omit_optional_tags=True)
     for chunk in html_serializer.serialize(token_stream):
         output.write(chunk.encode('utf-8'))
Exemplo n.º 3
0
def _describe_tag_for_log(elem, max_len=50):
    """Return a ``<tag attr="value">`` rendering of `elem`, abbreviated
    with a trailing ``...`` when longer than `max_len` characters.
    """
    parts = ["<%s" % elem.tag]
    parts.extend(' %s="%s"' % (n, v) for n, v in elem.attrib.items())
    parts.append(">")
    tag_str = ''.join(parts)
    if len(tag_str) > max_len:
        tag_str = tag_str[:max_len - 3] + '...'
    return tag_str


def app_filter_html_path_inplace(path, filters, log=None):
    """Filter the given HTML file (in-place) based on "app-*" class
    attributes.

    For example, the HTML might contain something like:
        <div class="app-ide">
            ...ide info...
        </div>
        <div class="app-edit">
            ...edit info...
        </div>
    If there are no filters, then the HTML is not changed. If the filters
    include "ide" but not "edit", then the ide div remains and the
    edit div is removed.

    `path` is the HTML file to parse and, if anything was removed, to
    overwrite. `filters` must be a set (asserted) unless it is empty/None,
    in which case the function returns immediately. `log`, if given, is
    called printf-style: ``log(format, *args)``.

    The decision whether an element is dropped is delegated to
    `_should_drop_elem(child, filters, "class", "app-")`.
    """
    if not filters:
        return
    if log:
        log("app-filter `%s'", path)

    # Parse the HTML file.
    with open(path) as f:
        tree = html5lib.parse(f, namespaceHTMLElements=False)

    # Filter out the unwanted elements.
    filtered = False
    assert isinstance(filters, set)
    # `Element.getiterator()`/`getchildren()` were removed in Python 3.9;
    # `iter()` and `list(elem)` are the supported spellings. The element
    # list is snapshotted up front (as `getiterator()` used to do) so that
    # deleting children below cannot disturb the traversal.
    for elem in list(tree.iter()):
        indices_to_drop = []
        for i, child in enumerate(list(elem)):
            if _should_drop_elem(child, filters, "class", "app-"):
                indices_to_drop.append(i)
                filtered = True
                if log:
                    log("... filter out %s", _describe_tag_for_log(child))
        # Delete from the highest index down so that earlier deletions do
        # not shift the positions of the ones still pending.
        for idx in reversed(indices_to_drop):
            del elem[idx]

    # Write out any changes.
    if filtered:
        walker = treewalkers.getTreeWalker("etree", ET)
        stream = walker(tree)
        content = ''.join(HTMLSerializer().serialize(stream))
        # The doctype is written by hand ahead of the serialized tree.
        # The `with` block guarantees the file is closed even if a write
        # fails.
        with open(path, 'w') as f:
            f.write("<!DOCTYPE html>\n")
            f.write(content)
Exemplo n.º 4
0
 def to_unicode(self):
     """Return the unicode serialization of myself.

     ``self._root`` is walked with the tree walker named by
     ``self.TREEBUILDER`` and rendered with all attribute values quoted and
     no optional tags omitted. The leading ``<CONTAINER_TAG>`` and trailing
     ``</CONTAINER_TAG>`` are then sliced off the rendered string
     (presumably ``self._root`` is that container element -- confirm).
     """
     opening_len = len(self.CONTAINER_TAG) + 2  # "<" + tag + ">"
     closing_len = opening_len + 1  # "</" + tag + ">"
     walker = getTreeWalker(self.TREEBUILDER)
     stream = walker(self._root)
     # 'always' instead of True: html5lib >= 1.0 rejects anything other
     # than 'always', 'spec' or 'legacy' with ValueError; older releases
     # only test the option for truthiness, so behaviour there is the same.
     serializer = HTMLSerializer(quote_attr_values='always',
                                 omit_optional_tags=False)
     rendered = serializer.render(stream)
     return rendered[opening_len:-closing_len]
Exemplo n.º 5
0
def clean_nl(string):
    """
    This will clean up newlines so that nl2br can properly be called on the
    cleaned text.

    The string is parsed as an HTML fragment, newlines around and directly
    inside block level elements (blockquote, ol, li, ul) are removed, and
    the tree is serialized back to HTML. Falsy input (``''``, ``None``) is
    returned unchanged without parsing.
    """
    if not string:
        return string

    html_blocks = (
        '{http://www.w3.org/1999/xhtml}blockquote',
        '{http://www.w3.org/1999/xhtml}ol',
        '{http://www.w3.org/1999/xhtml}li',
        '{http://www.w3.org/1999/xhtml}ul',
    )

    def parse_html(tree):
        # In etree, a tag may have:
        # - some text content (piece of text before its first child)
        # - a tail (piece of text just after the tag, and before a sibling)
        # - children
        # Eg: "<div>text <b>children's text</b> children's tail</div> tail".
        is_block = tree.tag in html_blocks

        if is_block:
            # Strip new lines directly inside the block: leading ones from
            # the text, and trailing ones from the text too when there are
            # no children (otherwise the children's tails are handled in
            # the loop below).
            if tree.text:
                tree.text = tree.text.lstrip('\n')
                if not len(tree):  # No children.
                    tree.text = tree.text.rstrip('\n')

            # Remove the first new line after a block level element.
            if tree.tail and tree.tail.startswith('\n'):
                tree.tail = tree.tail[1:]

        for child in tree:  # Recurse down the tree.
            # Inside a block level element, drop the trailing new lines of
            # each child's tail.
            if is_block and child.tail:
                child.tail = child.tail.rstrip('\n')
            parse_html(child)
        return tree

    parse = parse_html(html5lib.parseFragment(string))

    # Serialize the parsed tree back to html.
    walker = html5lib.treewalkers.getTreeWalker('etree')
    stream = walker(parse)
    # 'always' instead of True: html5lib >= 1.0 raises ValueError for any
    # value other than 'always', 'spec' or 'legacy' as soon as an attribute
    # with a non-empty value is serialized; older releases only check the
    # option for truthiness, so the output there is unchanged.
    serializer = HTMLSerializer(quote_attr_values='always',
                                omit_optional_tags=False)
    return serializer.render(stream)
Exemplo n.º 6
0
def sanitize_html(html):
    """Return a sanitized unicode rendering of the HTML fragment `html`.

    Falsy input yields an empty unicode string. Byte strings are decoded as
    UTF-8 (undecodable bytes are replaced) before parsing. The fragment is
    parsed with html5lib using ``HTMLSanitizer`` as tokenizer and the lxml
    tree builder, then serialized with quoted attribute values, attributes
    in their original order and no optional tags omitted.
    """
    if not html:
        return u''

    text = html.decode('utf-8', 'replace') if isinstance(html, bytes) else html

    # Imported lazily so the html5lib/lxml stack is only needed when there
    # is actually something to sanitize.
    import html5lib
    from html5lib.sanitizer import HTMLSanitizer
    from html5lib.serializer.htmlserializer import HTMLSerializer
    from html5lib.treebuilders.etree_lxml import TreeBuilder
    from html5lib.treewalkers.lxmletree import TreeWalker

    sanitizing_parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                                            tree=TreeBuilder)
    fragment = sanitizing_parser.parseFragment(text)
    renderer = HTMLSerializer(
        quote_attr_values=True,
        alphabetical_attributes=False,
        omit_optional_tags=False,
    )
    return renderer.render(TreeWalker(fragment))
Exemplo n.º 7
0
def clean_nl(string):
    """
    This will clean up newlines so that nl2br can properly be called on the
    cleaned text.

    The string is parsed as an HTML fragment, newlines directly inside and
    right after block level elements (blockquote, ol, li, ul) are removed
    from the text nodes, and the tree is serialized back to HTML.

    Falsy input is returned unchanged. An empty string is returned when the
    parsed fragment ends up with no child nodes.
    """

    # Names of the elements treated as block level for newline stripping.
    html_blocks = ['blockquote', 'ol', 'li', 'ul']

    if not string:
        return string

    def parse_html(tree):
        # Walks `tree.childNodes` recursively, rewriting text node values in
        # place, and returns the same `tree` object.
        # NOTE(review): relies on the node API (`childNodes`, `type`,
        # `value`, `parent`, `name`, `insertBefore`, `removeChild`) of the
        # tree returned by `html5lib.parseFragment` -- presumably the legacy
        # "simpletree" builder; confirm the html5lib version in use.
        prev_tag = ''
        for i, node in enumerate(tree.childNodes):
            if node.type == 4:  # Text node
                value = node.value

                # Strip new lines directly inside block level elements.
                # (Both leading and trailing new lines are stripped.)
                if node.parent.name in html_blocks:
                    value = value.strip('\n')

                # Remove the first new line after a block level element.
                if (prev_tag in html_blocks and value.startswith('\n')):
                    value = value[1:]

                tree.childNodes[i].value = value
            else:
                # Recurse into the child, then re-insert the (same) processed
                # node before itself and remove one occurrence of it.
                # NOTE(review): this mutates `childNodes` while it is being
                # enumerated; the net effect appears to be that the child
                # list is left unchanged, but that depends on the tree
                # implementation -- verify before restructuring.
                tree.insertBefore(parse_html(node), node)
                tree.removeChild(node)

            # Remember this node's name so the next sibling can tell whether
            # it directly follows a block level element.
            prev_tag = node.name
        return tree

    parse = parse_html(html5lib.parseFragment(string))
    if not parse.childNodes:
        # The parser couldn't make sense of the given html, eg bad markup.
        return ''

    # Serialize the cleaned tree back to html, quoting attribute values and
    # keeping optional tags.
    walker = html5lib.treewalkers.getTreeWalker('simpletree')
    stream = walker(parse)
    serializer = HTMLSerializer(quote_attr_values=True,
                                omit_optional_tags=False)
    return serializer.render(stream)
Exemplo n.º 8
0
def sanitize_html_fragment(fragment):
    """
    Sanitize an HTML ``fragment``, returning a copy of the fragment
    that has been cleaned up.

    A falsy ``fragment`` is returned as-is. Otherwise it is parsed by
    html5lib with ``HTMLSanitizer`` as tokenizer, walked with the ``etree``
    tree walker and rendered with quoted attribute values and no optional
    tags omitted.
    """
    if not fragment:
        return fragment

    # Imported lazily: html5lib is only needed when there is input to clean.
    import html5lib
    from html5lib.sanitizer import HTMLSanitizer
    from html5lib.serializer.htmlserializer import HTMLSerializer

    sanitizing_parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer)
    tree = sanitizing_parser.parseFragment(fragment)
    etree_walker = html5lib.treewalkers.getTreeWalker('etree')
    token_stream = etree_walker(tree)
    renderer = HTMLSerializer(quote_attr_values=True,
                              omit_optional_tags=False)
    return renderer.render(token_stream)
Exemplo n.º 9
0
def independentize_html_path(src, dst, css_dir=None, log=None):
    """Process the `src' HTML path to `dst' making it independent.

    - favicon links are removed
    - CSS references are updated (if `css_dir' is given), else removed.
    - Relative links are de-linkified.

    `src` is the HTML file to read and `dst` the file to write the
    re-serialized document to. When `css_dir` is given, each stylesheet
    href is replaced by `css_dir` joined (POSIX-style) with the href's
    basename. `log`, if given, must provide `info()` and `debug()` methods
    taking printf-style arguments.
    """
    if log:
        log.info("independentize %s %s", src, dst)

    # Parse the HTML file.
    with open(src) as f:
        tree = html5lib.parse(f, namespaceHTMLElements=False)

    # - Drop favicon links.
    # - Update or drop CSS links.
    # `list(head)` replaces `getchildren()` (removed in Python 3.9) and
    # gives a snapshot, so links can be removed while looping.
    head = tree.find("head")
    for link in list(head):
        if link.tag != "link":
            continue
        rel = link.get("rel", "").split()
        if "icon" in rel:  # this is a favicon link
            if log:
                log.debug("%s: remove <link rel='%s'/>", dst, link.get("rel"))
            head.remove(link)
            # The link is gone; falling through to the stylesheet branch
            # could try to remove it a second time (ValueError) when `rel`
            # also contains "stylesheet".
            continue
        if "stylesheet" not in rel:
            continue
        # this is a css ref
        if css_dir:  # update the css dir
            href = link.get("href")
            if href is None:
                # Nothing to rewrite; leave the link untouched rather than
                # failing in posixpath.basename(None).
                continue
            href = posixpath.join(css_dir, posixpath.basename(href))
            link.set("href", href)
            if log:
                log.debug("%s: update to <link href='%s'/>", dst, href)
        else:
            if log:
                log.debug("%s: remove <link href='%s'/>", dst,
                          link.get("href"))
            head.remove(link)

    # De-linkify local references within the full docset.
    # TODO: Eventually would like to normalize these to point
    # to online version of the docs.
    # `iter("a")` replaces `getiterator()` (removed in Python 3.9); the
    # list snapshot is taken before any tag is renamed.
    body = tree.find("body")
    for anchor in list(body.iter("a")):
        href = anchor.get("href")
        if not href:
            continue
        scheme, netloc, path = urlparse(href)[:3]
        if scheme or netloc:  # externals href
            continue
        if path:
            if log:
                log.debug("%s: de-linkify <a href='%s'>", dst, href)
            # Only the tag is changed; the element's attributes (including
            # href) are left as they are.
            anchor.tag = u"span"  # de-linkify

    # Write out massaged doc.
    walker = treewalkers.getTreeWalker("etree", ET)
    stream = walker(tree)
    content = ''.join(HTMLSerializer().serialize(stream))
    with open(dst, 'w') as f:
        f.write(content)