Example #1
    def serialize(self, output):
        # Walk the parsed DOM document and stream it through html5lib's
        # serializer, writing the result to `output` as UTF-8 bytes.
        walker = html5lib.treewalkers.getTreeWalker("dom")
        stream = walker(self.document)
        s = HTMLSerializer(omit_optional_tags=True)
        output_generator = s.serialize(stream)
        for item in output_generator:
            output.write(item.encode('utf-8'))
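The same walk-then-serialize pattern can be used standalone. A minimal self-contained sketch, assuming a recent html5lib release (the function name, sample markup, and output path are illustrative):

import html5lib
from html5lib import treewalkers
from html5lib.serializer import HTMLSerializer

def serialize_to_file(document, path):
    # `document` is a DOM tree produced by html5lib's "dom" treebuilder.
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(document)
    serializer = HTMLSerializer(omit_optional_tags=True)
    with open(path, "wb") as out:
        for chunk in serializer.serialize(stream):
            out.write(chunk.encode("utf-8"))

doc = html5lib.parse("<p>Hello <b>world</b></p>", treebuilder="dom")
serialize_to_file(doc, "out.html")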
Example #2
    def clean(self, value, model_instance):
        """
        Validates the given value using the provided HTMLCleaner
        and returns its "cleaned" value as a Python object.

        Raises ValidationError for any errors.
        """
        value = super(HTMLField, self).clean(value, model_instance)

        # Parse the fragment with the sanitizing tokenizer, then walk the
        # resulting DOM tree so it can be streamed through the serializer.
        parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                                     tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = parser.parseFragment(value)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)

        if self.use_imageproxy:
            # Wrap the token stream with the per-user image-proxy filter.
            from imageproxy import Proxy
            user = User.objects.get(pk=getattr(model_instance, self.user_field))
            proxy = Proxy(user)
            stream = ImageProxyFilter(stream, proxy)

        s = HTMLSerializer(omit_optional_tags=False)
        output_generator = s.serialize(stream)

        clean_value = ''.join(output_generator)

        return clean_value
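Recent html5lib releases drop the `tokenizer=HTMLSanitizer` argument in favour of a sanitizing tree-walker filter. A rough sketch of the parse/sanitize/serialize core of this clean() method under that newer API (the image-proxy step and Django plumbing are omitted; the function name is illustrative):

import html5lib
from html5lib import treewalkers
from html5lib.filters.sanitizer import Filter as SanitizerFilter
from html5lib.serializer import HTMLSerializer

def clean_html(value):
    # Parse the submitted markup as a fragment, sanitize the token stream,
    # and reserialize it back to a string.
    dom_tree = html5lib.parseFragment(value, treebuilder="dom")
    walker = treewalkers.getTreeWalker("dom")
    stream = SanitizerFilter(walker(dom_tree))
    serializer = HTMLSerializer(omit_optional_tags=False)
    return ''.join(serializer.serialize(stream))

clean_html('<p onmouseover="evil()">hi</p><script>x()</script>')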
Example #3
def app_filter_html_path_inplace(path, filters, log=None):
    """Filter the given HTML file (in-place) based on "app-*" class
    attributes.
    
    For example, the HTML might contain something like:
        <div class="app-ide">
            ...ide info...
        </div>
        <div class="app-edit">
            ...edit info...
        </div>
    If there are no filters, then the HTML is not changed. If the filters
    include "ide" but not "edit", then the ide div remains and the
    edit div is removed.
    """
    if not filters:
        return
    if log:
        log("app-filter `%s'", path)

    # Parse the HTML file.
    with open(path) as f:
        tree = html5lib.parse(f, namespaceHTMLElements=False)

    # Filter out the unwanted elements.
    filtered = False
    assert isinstance(filters, set)
    for elem in tree.iter():
        indices_to_drop = []
        for i, child in enumerate(list(elem)):
            if _should_drop_elem(child, filters, "class", "app-"):
                indices_to_drop.insert(0, i)
                filtered = True
                if log:
                    tag_str = "<%s" % child.tag
                    if child.attrib:
                        for n, v in child.attrib.items():
                            tag_str += ' %s="%s"' % (n, v)
                    tag_str += ">"
                    if len(tag_str) > 50:
                        tag_str = tag_str[:47] + '...'
                    log("... filter out %s", tag_str)
        for idx in indices_to_drop:
            del elem[idx]

    # Write out any changes.
    if filtered:
        walker = treewalkers.getTreeWalker("etree", ET)
        stream = walker(tree)
        s = HTMLSerializer()
        outputter = s.serialize(stream)
        content = ''.join(outputter)
        with open(path, 'w') as f:
            f.write("<!DOCTYPE html>\n")
            f.write(content)
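Called on a generated page, this keeps elements whose app-* class matches a requested filter and drops the rest (the decision itself is delegated to the _should_drop_elem helper, which is not shown here). A hypothetical invocation, with an illustrative file name and filter set:

# Keep "app-ide" blocks, drop "app-edit" (and other app-*) blocks, in place.
app_filter_html_path_inplace("build/quickstart.html", filters={"ide"},
                             log=lambda fmt, *args: print(fmt % args))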
Example #4
def independentize_html_path(src, dst, css_dir=None, log=None):
    """Process the `src' HTML path to `dst' making it independent.
    
    - favicon links are removed
    - CSS references are updated (if `css_dir' is given), else removed.
    - Relative links are de-linkified.
    """
    if log:
        log.info("independentize %s %s", src, dst)

    # Parse the HTML file.
    with open(src) as f:
        tree = html5lib.parse(f, namespaceHTMLElements=False)

    # - Drop favicon links.
    # - Update or drop CSS links.
    head = tree.find("head")
    for link in list(head):
        if link.tag != "link":
            continue
        rel = link.get("rel", "").split()
        if "icon" in rel:  # this is a favicon link
            if log:
                log.debug("%s: remove <link rel='%s'/>", dst, link.get("rel"))
            head.remove(link)
        if "stylesheet" in rel:  # this is a css ref
            if css_dir:  # update the css dir
                href = link.get("href")
                href = posixpath.join(css_dir, posixpath.basename(href))
                link.set("href", href)
                if log:
                    log.debug("%s: update to <link href='%s'/>", dst, href)
            else:
                if log:
                    log.debug("%s: remove <link href='%s'/>", dst,
                              link.get("href"))
                head.remove(link)

    # De-linkify local references within the full docset.
    # TODO: Eventually would like to normalize these to point
    # to online version of the docs.
    body = tree.find("body")
    for elem in body.iter():
        if elem.tag != "a":
            continue
        if not elem.get("href"):
            continue
        href = elem.get("href")
        scheme, netloc, path, params, query, fragment = urlparse(href)
        if scheme or netloc:  # externals href
            continue
        if path:
            if log:
                log.debug("%s: de-linkify <a href='%s'>", dst, href)
            elem.tag = u"span"  # de-linkify

    # Write out massaged doc.
    walker = treewalkers.getTreeWalker("etree", ET)
    stream = walker(tree)
    s = HTMLSerializer()
    outputter = s.serialize(stream)
    content = ''.join(outputter)
    with open(dst, 'w') as f:
        f.write(content)
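The de-linkify rule only inspects the parsed href: anything with a scheme or network location is treated as external and kept as a link, and only hrefs with a bare relative path are turned into <span> elements. A small standalone sketch of that check (Python 3 import shown; the snippet above does not show its own urlparse import and presumably targets Python 2's urlparse module):

from urllib.parse import urlparse

for href in ("http://example.com/x.html", "other-page.html#sec2", "#sec2"):
    scheme, netloc, path, params, query, fragment = urlparse(href)
    if scheme or netloc:
        print(href, "-> external, keep")
    elif path:
        print(href, "-> relative path, de-linkify")
    else:
        print(href, "-> same-page fragment, keep")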