def filter_html_cb(s, jsrewrite):
    """Clean an HTML string, rewrite its scripts and namespace element ids.

    The markup is parsed with ``lxml.html.fromstring`` and run through an
    ``lxml.html.clean.Cleaner`` configured with ``scripts = False`` and
    ``style = True``.  Afterwards, for every element of the cleaned tree:

    * a ``<script>`` element gets its text replaced by
      ``jsrewrite(el.text)`` and loses all of its attributes;
    * an element carrying an ``id`` attribute gets that id prefixed with
      ``sandbox-``.

    Args:
        s: HTML source handed to ``lxml.html.fromstring``.
        jsrewrite: Callable invoked with a script element's ``text`` and
            returning the replacement text.  The text is passed through
            as-is; NOTE(review): lxml reports ``None`` for an element
            without text, so the callback presumably has to cope with
            that -- confirm against the callers.

    Returns:
        The cleaned tree serialized by ``lxml.html.tostring``.
    """
    cleaner = lxml.html.clean.Cleaner()
    cleaner.scripts = False
    cleaner.style = True

    cleaned = cleaner.clean_html(lxml.html.fromstring(s))

    for el in cleaned.iter():
        if el.tag == 'script':
            el.text = jsrewrite(el.text)
            # Drop every attribute in one call instead of deleting keys from
            # the mapping while iterating over it.
            el.attrib.clear()
        if 'id' in el.attrib:
            el.attrib['id'] = 'sandbox-' + el.attrib['id']

    return lxml.html.tostring(cleaned)
def _clean_html(element):
    """Sanitize one element and namespace its ids and classes with ``google-``.

    The behavior depends on ``element.tag``:

    * ``style`` -- the element's text is parsed with ``cssutils``.  In every
      selector ``.`` becomes ``.google-`` and ``#`` becomes ``#google-``, and
      each comma-separated part is prefixed with ``#resultsFrame``.  The
      result is ``"<style>%s</style>"`` filled with the sheet's ``cssText``.
    * ``script`` -- the element is serialized unchanged.
    * anything else -- the element is run through
      ``lxml.html.clean.Cleaner`` (``style`` and ``safe_attrs_only`` both
      disabled); then every ``id`` and every class name in the cleaned tree
      is prefixed with ``google-``.

    Side effect: the non-style, non-script branch replaces
    ``lxml.html.clean._javascript_scheme_re`` so that base64 ``data:image``
    URIs are not treated as script URLs.

    NOTE(review): this file contains a second, later definition of
    ``_clean_html`` which is the one that ends up bound to the name;
    consider removing one of the two.

    Args:
        element: An lxml HTML element.

    Returns:
        The serialized markup: the formatted ``<style>`` string for style
        elements, otherwise the output of ``lxml.html.tostring``.
    """
    if element.tag == "style":
        css = cssutils.parseString(element.text_content())
        for rule in css:
            # Rules without a selector list have nothing to rewrite.
            selector_list = getattr(rule, "selectorList", None)
            if selector_list is None:
                continue
            try:
                for sel in selector_list:
                    # Prefix all id and class selectors with google-.
                    text = sel.selectorText
                    text = text.replace(".", ".google-")
                    text = text.replace("#", "#google-")
                    # Add #resultsFrame in front of every selector (beware
                    # of comma-separated parts).
                    scoped = [
                        "#resultsFrame %s" % part.strip()
                        for part in text.split(",")
                    ]
                    sel.selectorText = ", ".join(scoped)
            except Exception:
                # Best effort: a selector that cannot be rewritten leaves the
                # rest of this rule as it is and processing moves on.  Unlike
                # a bare ``except`` this lets KeyboardInterrupt/SystemExit
                # propagate.
                continue
        # NOTE(review): cssutils documents ``cssText`` of a style sheet as a
        # byte string; on Python 3 this formatting would embed its repr --
        # confirm which interpreter this runs on.
        return "<style>%s</style>" % css.cssText

    if element.tag == "script":
        return lxml.html.tostring(element)

    # Monkeypatch based on
    # http://stackoverflow.com/questions/15386605/lxml-cleaner-to-ignore-base64-image
    # so that lxml keeps base64 data:image URIs.  The exemption is a negative
    # lookahead: the character class used previously only inspected the first
    # character after "data:", so URIs such as data:application/... were
    # exempted from stripping as well.
    scheme_pattern = (
        r'\s*(?:javascript:|jscript:|livescript:|vbscript:'
        r'|data:(?!image/[^,]+;base64,)|about:|mocha:)'
    )
    lxml.html.clean._javascript_scheme_re = re.compile(scheme_pattern, re.I)

    cleaner = lxml.html.clean.Cleaner()
    cleaner.style = False
    cleaner.safe_attrs_only = False
    cleaned = cleaner.clean_html(element)

    for node in cleaned.iter():
        # Prefix all ids and classes with google-.
        if 'id' in node.attrib:
            node.attrib['id'] = 'google-' + node.attrib['id']
        if 'class' in node.attrib:
            # split() without an argument collapses runs of whitespace, so
            # repeated, leading or trailing spaces no longer yield bare
            # "google-" class names.
            node.attrib['class'] = " ".join(
                "google-%s" % name for name in node.attrib['class'].split()
            )

    return lxml.html.tostring(cleaned)
def _clean_html(element):
    """Sanitize one element and namespace its ids and classes with ``google-``.

    The behavior depends on ``element.tag``:

    * ``style`` -- the element's text is parsed with ``cssutils``.  In every
      selector ``.`` becomes ``.google-`` and ``#`` becomes ``#google-``, and
      each comma-separated part is prefixed with ``#resultsFrame``.  The
      result is ``"<style>%s</style>"`` filled with the sheet's ``cssText``.
    * ``script`` -- the element is serialized unchanged.
    * anything else -- the element is run through
      ``lxml.html.clean.Cleaner`` (``style`` and ``safe_attrs_only`` both
      disabled); then every ``id`` and every class name in the cleaned tree
      is prefixed with ``google-``.

    Side effect: the non-style, non-script branch replaces
    ``lxml.html.clean._javascript_scheme_re`` so that base64 ``data:image``
    URIs are not treated as script URLs.

    NOTE(review): an earlier definition of ``_clean_html`` in this file is
    shadowed by this one; consider removing one of the two.

    Args:
        element: An lxml HTML element.

    Returns:
        The serialized markup: the formatted ``<style>`` string for style
        elements, otherwise the output of ``lxml.html.tostring``.
    """
    if element.tag == "style":
        css = cssutils.parseString(element.text_content())
        for rule in css:
            # Rules without a selector list have nothing to rewrite.
            selector_list = getattr(rule, "selectorList", None)
            if selector_list is None:
                continue
            try:
                for sel in selector_list:
                    # Prefix all id and class selectors with google-.
                    text = sel.selectorText
                    text = text.replace(".", ".google-")
                    text = text.replace("#", "#google-")
                    # Add #resultsFrame in front of every selector (beware
                    # of comma-separated parts).
                    scoped = [
                        "#resultsFrame %s" % part.strip()
                        for part in text.split(",")
                    ]
                    sel.selectorText = ", ".join(scoped)
            except Exception:
                # Best effort: a selector that cannot be rewritten leaves the
                # rest of this rule as it is and processing moves on.  Unlike
                # a bare ``except`` this lets KeyboardInterrupt/SystemExit
                # propagate.
                continue
        # NOTE(review): cssutils documents ``cssText`` of a style sheet as a
        # byte string; on Python 3 this formatting would embed its repr --
        # confirm which interpreter this runs on.
        return "<style>%s</style>" % css.cssText

    if element.tag == "script":
        return lxml.html.tostring(element)

    # Monkeypatch based on
    # http://stackoverflow.com/questions/15386605/lxml-cleaner-to-ignore-base64-image
    # so that lxml keeps base64 data:image URIs.  The exemption is a negative
    # lookahead: the character class used previously only inspected the first
    # character after "data:", so URIs such as data:application/... were
    # exempted from stripping as well.
    scheme_pattern = (
        r'\s*(?:javascript:|jscript:|livescript:|vbscript:'
        r'|data:(?!image/[^,]+;base64,)|about:|mocha:)'
    )
    lxml.html.clean._javascript_scheme_re = re.compile(scheme_pattern, re.I)

    cleaner = lxml.html.clean.Cleaner()
    cleaner.style = False
    cleaner.safe_attrs_only = False
    cleaned = cleaner.clean_html(element)

    for node in cleaned.iter():
        # Prefix all ids and classes with google-.
        if 'id' in node.attrib:
            node.attrib['id'] = 'google-' + node.attrib['id']
        if 'class' in node.attrib:
            # split() without an argument collapses runs of whitespace, so
            # repeated, leading or trailing spaces no longer yield bare
            # "google-" class names.
            node.attrib['class'] = " ".join(
                "google-%s" % name for name in node.attrib['class'].split()
            )

    return lxml.html.tostring(cleaned)