Пример #1
0
def filter_html_cb(s, jsrewrite):
    """Sanitize an HTML string and rewrite its inline scripts via a callback.

    The markup is parsed with ``lxml.html.fromstring`` and passed through an
    ``lxml.html.clean.Cleaner`` configured with ``scripts = False`` (so
    ``<script>`` elements are not removed) and ``style = True``. Afterwards:

    * each ``<script>`` element gets its text replaced by
      ``jsrewrite(el.text)`` and loses all of its attributes;
    * each element that has an ``id`` attribute gets it prefixed with
      ``sandbox-``.

    Args:
        s: HTML source accepted by ``lxml.html.fromstring``.
        jsrewrite: Callable that receives a script element's ``text``
            (``None`` when the element has no text) and returns the
            replacement text.

    Returns:
        The result of ``lxml.html.tostring`` on the cleaned tree.
    """
    cleaner = lxml.html.clean.Cleaner()
    # Keep <script> elements: they are rewritten below instead of dropped.
    cleaner.scripts = False
    cleaner.style = True
    doc = lxml.html.fromstring(s)
    clean = cleaner.clean_html(doc)
    for el in clean.iter():
        if el.tag == 'script':
            el.text = jsrewrite(el.text)
            # Remove every attribute in one call rather than deleting keys
            # while iterating over the very mapping being modified.
            el.attrib.clear()
        if 'id' in el.attrib:
            el.attrib['id'] = 'sandbox-' + el.attrib['id']
    return lxml.html.tostring(clean)
Пример #2
0
def filter_html_cb(s, jsrewrite):
    """Sanitize an HTML string and rewrite its inline scripts via a callback.

    The markup is parsed with ``lxml.html.fromstring`` and passed through an
    ``lxml.html.clean.Cleaner`` configured with ``scripts = False`` (so
    ``<script>`` elements are not removed) and ``style = True``. Afterwards:

    * each ``<script>`` element gets its text replaced by
      ``jsrewrite(el.text)`` and loses all of its attributes;
    * each element that has an ``id`` attribute gets it prefixed with
      ``sandbox-``.

    Args:
        s: HTML source accepted by ``lxml.html.fromstring``.
        jsrewrite: Callable that receives a script element's ``text``
            (``None`` when the element has no text) and returns the
            replacement text.

    Returns:
        The result of ``lxml.html.tostring`` on the cleaned tree.
    """
    cleaner = lxml.html.clean.Cleaner()
    # Keep <script> elements: they are rewritten below instead of dropped.
    cleaner.scripts = False
    cleaner.style = True
    doc = lxml.html.fromstring(s)
    clean = cleaner.clean_html(doc)
    for el in clean.iter():
        if el.tag == 'script':
            el.text = jsrewrite(el.text)
            # Remove every attribute in one call rather than deleting keys
            # while iterating over the very mapping being modified.
            el.attrib.clear()
        if 'id' in el.attrib:
            el.attrib['id'] = 'sandbox-' + el.attrib['id']
    return lxml.html.tostring(clean)
Пример #3
0
def _clean_html(element):
    """Serialize *element* with its ids, classes and CSS selectors namespaced.

    Behaviour depends on the element's tag:

    * ``style``: the text content is parsed with ``cssutils``; in every
      selector ``.`` becomes ``.google-`` and ``#`` becomes ``#google-``,
      and each comma-separated part is prefixed with ``#resultsFrame``.
      The result is returned wrapped in ``<style>...</style>``.
    * ``script``: the element is serialized unchanged.
    * anything else: the element is run through ``lxml.html.clean.Cleaner``
      (``style = False``, ``safe_attrs_only = False``), then every ``id``
      and every class name is prefixed with ``google-``.

    Side effect: the non-style/non-script branch replaces the module-level
    ``lxml.html.clean._javascript_scheme_re`` on every call.

    Args:
        element: An lxml HTML element.

    Returns:
        The serialized markup: a ``%``-formatted ``<style>`` string for style
        elements, otherwise whatever ``lxml.html.tostring`` returns.
        NOTE(review): these two branches may yield different string types
        on Python 3 -- confirm callers cope with both.
    """
    if element.tag == "style":
        css = cssutils.parseString(element.text_content())
        for rule in css:
            try:
                # Rules without a selector list (nothing to rewrite) yield
                # an empty iteration instead of relying on an exception.
                for sel in getattr(rule, "selectorList", ()):
                    # prefix all id and class selectors with google-
                    sel.selectorText = sel.selectorText.replace(
                        ".", ".google-")
                    sel.selectorText = sel.selectorText.replace(
                        "#", "#google-")
                    # add #resultsFrame in front of every selector (beware of comma-separated parts)
                    selectors = sel.selectorText.split(",")
                    selectors = [
                        "#resultsFrame %s" % s.strip() for s in selectors
                    ]
                    sel.selectorText = ", ".join(selectors)
            except Exception:
                # Best effort: a rule whose selectors cannot be rewritten is
                # left as it is. Unlike a bare ``except`` this no longer
                # swallows KeyboardInterrupt/SystemExit.
                pass
        return "<style>%s</style>" % css.cssText
    elif element.tag == "script":
        return lxml.html.tostring(element)
    else:
        # monkeypatch from http://stackoverflow.com/questions/15386605/lxml-cleaner-to-ignore-base64-image
        # this prevents lxml from removing the data:image
        #
        # The original pattern used a character class,
        # ``data:[^(?:image/.+;base64)]+``, where a negative lookahead was
        # meant. That let through every data: URL whose first payload
        # character happened to be in the set (e.g. ``data:application/...``).
        # The lookahead below blocks all data: URLs except base64 images.
        # Raw strings keep ``\s`` from being an invalid string escape.
        # NOTE(review): this only has an effect on lxml versions whose
        # cleaner consults ``_javascript_scheme_re`` -- confirm against the
        # installed version.
        new_pattern = (
            r'\s*(?:javascript:|jscript:|livescript:|vbscript:'
            r'|data:(?!image/.+;base64)|about:|mocha:)'
        )
        lxml.html.clean._javascript_scheme_re = re.compile(new_pattern, re.I)

        cleaner = lxml.html.clean.Cleaner()
        cleaner.style = False
        cleaner.safe_attrs_only = False
        clean = cleaner.clean_html(element)
        for e in clean.iter():
            # prefix all ids and classes with google-
            if 'id' in e.attrib:
                e.attrib['id'] = 'google-' + e.attrib['id']
            if 'class' in e.attrib:
                # beware of elements with multiple classes: split() with no
                # argument handles runs of whitespace, tabs/newlines and
                # leading/trailing blanks without producing empty names
                # (which would otherwise become stray "google-" classes).
                classes = e.attrib['class'].split()
                e.attrib['class'] = " ".join(
                    "google-%s" % c for c in classes)
        return lxml.html.tostring(clean)
Пример #4
0
def _clean_html(element):
    """Serialize *element* with its ids, classes and CSS selectors namespaced.

    Behaviour depends on the element's tag:

    * ``style``: the text content is parsed with ``cssutils``; in every
      selector ``.`` becomes ``.google-`` and ``#`` becomes ``#google-``,
      and each comma-separated part is prefixed with ``#resultsFrame``.
      The result is returned wrapped in ``<style>...</style>``.
    * ``script``: the element is serialized unchanged.
    * anything else: the element is run through ``lxml.html.clean.Cleaner``
      (``style = False``, ``safe_attrs_only = False``), then every ``id``
      and every class name is prefixed with ``google-``.

    Side effect: the non-style/non-script branch replaces the module-level
    ``lxml.html.clean._javascript_scheme_re`` on every call.

    Args:
        element: An lxml HTML element.

    Returns:
        The serialized markup: a ``%``-formatted ``<style>`` string for style
        elements, otherwise whatever ``lxml.html.tostring`` returns.
        NOTE(review): these two branches may yield different string types
        on Python 3 -- confirm callers cope with both.
    """
    if element.tag == "style":
        css = cssutils.parseString(element.text_content())
        for rule in css:
            try:
                # Rules without a selector list (nothing to rewrite) yield
                # an empty iteration instead of relying on an exception.
                for sel in getattr(rule, "selectorList", ()):
                    # prefix all id and class selectors with google-
                    sel.selectorText = sel.selectorText.replace(".", ".google-")
                    sel.selectorText = sel.selectorText.replace("#", "#google-")
                    # add #resultsFrame in front of every selector (beware of comma-separated parts)
                    selectors = sel.selectorText.split(",")
                    selectors = ["#resultsFrame %s" % s.strip() for s in selectors]
                    sel.selectorText = ", ".join(selectors)
            except Exception:
                # Best effort: a rule whose selectors cannot be rewritten is
                # left as it is. Unlike a bare ``except`` this no longer
                # swallows KeyboardInterrupt/SystemExit.
                pass
        return "<style>%s</style>" % css.cssText
    elif element.tag == "script":
        return lxml.html.tostring(element)
    else:
        # monkeypatch from http://stackoverflow.com/questions/15386605/lxml-cleaner-to-ignore-base64-image
        # this prevents lxml from removing the data:image
        #
        # The original pattern used a character class,
        # ``data:[^(?:image/.+;base64)]+``, where a negative lookahead was
        # meant. That let through every data: URL whose first payload
        # character happened to be in the set (e.g. ``data:application/...``).
        # The lookahead below blocks all data: URLs except base64 images.
        # Raw strings keep ``\s`` from being an invalid string escape.
        # NOTE(review): this only has an effect on lxml versions whose
        # cleaner consults ``_javascript_scheme_re`` -- confirm against the
        # installed version.
        new_pattern = (
            r'\s*(?:javascript:|jscript:|livescript:|vbscript:'
            r'|data:(?!image/.+;base64)|about:|mocha:)'
        )
        lxml.html.clean._javascript_scheme_re = re.compile(new_pattern, re.I)

        cleaner = lxml.html.clean.Cleaner()
        cleaner.style = False
        cleaner.safe_attrs_only = False
        clean = cleaner.clean_html(element)
        for e in clean.iter():
            # prefix all ids and classes with google-
            if 'id' in e.attrib:
                e.attrib['id'] = 'google-' + e.attrib['id']
            if 'class' in e.attrib:
                # beware of elements with multiple classes: split() with no
                # argument handles runs of whitespace, tabs/newlines and
                # leading/trailing blanks without producing empty names
                # (which would otherwise become stray "google-" classes).
                classes = e.attrib['class'].split()
                e.attrib['class'] = " ".join("google-%s" % c for c in classes)
        return lxml.html.tostring(clean)