def clean_html(input, sanitize=False):
    """
    Run an HTML fragment through html5lib so that the result is well-formed.

    The fragment is parsed into a DOM tree, walked, and serialized back to a
    single string with optional tags kept in the output.

    :param input: The HTML fragment to process.
    :param sanitize: Remove unwanted HTML tags and attributes.
    :returns: The re-serialized fragment as one string.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    parser_options = {}
    serializer_options = {}
    if sanitize and HTMLSanitizer is None:
        # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016):
        # sanitizing is requested from the serializer.
        serializer_options["sanitize"] = True
    elif sanitize:
        # A sanitizer class is available: plug it in as the parser's tokenizer.
        parser_options["tokenizer"] = HTMLSanitizer

    dom_builder = treebuilders.getTreeBuilder("dom")
    fragment = HTMLParser(tree=dom_builder, **parser_options).parseFragment(input)
    make_walker = treewalkers.getTreeWalker("dom")
    tokens = make_walker(fragment)
    serializer = HTMLSerializer(omit_optional_tags=False, **serializer_options)
    return "".join(serializer.serialize(tokens))
def clean_html(input, sanitize=False):
    """
    Parse an HTML fragment with html5lib and re-serialize it as well-formed
    HTML.

    :param input: The HTML fragment to process.
    :param sanitize: Remove unwanted HTML tags and attributes.
    :returns: The serialized fragment, joined into a single string.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    parser_opts, serializer_opts = {}, {}
    if sanitize:
        if HTMLSanitizer is not None:
            # Sanitizer class present: it replaces the parser's tokenizer.
            parser_opts['tokenizer'] = HTMLSanitizer
        else:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_opts['sanitize'] = True

    html_parser = HTMLParser(
        tree=treebuilders.getTreeBuilder("dom"),
        **parser_opts
    )
    parsed = html_parser.parseFragment(input)
    token_stream = treewalkers.getTreeWalker("dom")(parsed)
    html_serializer = HTMLSerializer(omit_optional_tags=False, **serializer_opts)
    chunks = html_serializer.serialize(token_stream)
    return "".join(chunks)
def sanitize_html(input):
    """
    Removes any unwanted HTML tags and attributes, using html5lib.

    The fragment is parsed into a DOM tree, walked, and serialized back to a
    single string with optional tags kept in the output.

    Sanitizing is wired in one of two ways, depending on what the module
    managed to import:

    * if ``HTMLSanitizer`` is a class, it is used as the parser's tokenizer;
    * if ``HTMLSanitizer`` is ``None`` (html5lib 0.99999999/1.0b9 and later),
      the serializer is asked to sanitize via ``sanitize=True`` instead.

    :param input: The HTML fragment to sanitize.
    :returns: The sanitized fragment as one string.

    >>> sanitize_html("foobar<p>adf<i></p>abc</i>")
    u'foobar<p>adf<i></i></p><i>abc</i>'
    >>> sanitize_html('foobar<p style="color:red; remove:me; background-image: url(http://example.com/test.php?query_string=bad);">adf<script>alert("Uhoh!")</script><i></p>abc</i>')
    u'foobar<p style="color: red;">adf<script>alert("Uhoh!")</script><i></i></p><i>abc</i>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if HTMLSanitizer is None:
        # No tokenizer-style sanitizer available; passing tokenizer=None to
        # the parser would fail, so sanitize at serialization time instead.
        serializer_kwargs["sanitize"] = True
    else:
        parser_kwargs["tokenizer"] = HTMLSanitizer

    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(s.serialize(stream))
def sanitize_html(input):
    """
    Removes any unwanted HTML tags and attributes, using html5lib.

    Works with both sanitizer styles: when ``HTMLSanitizer`` is a class it is
    installed as the parser's tokenizer; when it is ``None`` (html5lib
    0.99999999/1.0b9 and later) the serializer is created with
    ``sanitize=True`` instead.

    :param input: The HTML fragment to sanitize.
    :returns: The sanitized, re-serialized fragment as a single string.

    >>> sanitize_html("foobar<p>adf<i></p>abc</i>")
    u'foobar<p>adf<i></i></p><i>abc</i>'
    >>> sanitize_html('foobar<p style="color:red; remove:me; background-image: url(http://example.com/test.php?query_string=bad);">adf<script>alert("Uhoh!")</script><i></p>abc</i>')
    u'foobar<p style="color: red;">adf<script>alert("Uhoh!")</script><i></i></p><i>abc</i>'
    """
    if HTMLSanitizer is not None:
        parser_kwargs = {"tokenizer": HTMLSanitizer}
        serializer_kwargs = {}
    else:
        # The parser cannot take tokenizer=None, so request sanitizing from
        # the serializer in that case.
        parser_kwargs = {}
        serializer_kwargs = {"sanitize": True}

    parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    dom_tree = parser.parseFragment(input)
    stream = treewalkers.getTreeWalker("dom")(dom_tree)
    serializer = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(serializer.serialize(stream))
def clean_html(input):
    """
    Run an HTML fragment through html5lib so that the result is well-formed.

    The fragment is parsed into a DOM tree, walked, and serialized back to a
    single string with optional tags kept in the output.

    :param input: The HTML fragment to process.
    :returns: The re-serialized fragment as one string.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    dom_builder = treebuilders.getTreeBuilder("dom")
    fragment = HTMLParser(tree=dom_builder).parseFragment(input)
    make_walker = treewalkers.getTreeWalker("dom")
    tokens = make_walker(fragment)
    serializer = HTMLSerializer(omit_optional_tags=False)
    return "".join(serializer.serialize(tokens))
def clean_html(input):
    """
    Parse an HTML fragment with html5lib and re-serialize it as well-formed
    HTML.

    :param input: The HTML fragment to process.
    :returns: The serialized fragment, joined into a single string.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    html_parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    parsed = html_parser.parseFragment(input)
    token_stream = treewalkers.getTreeWalker("dom")(parsed)
    html_serializer = HTMLSerializer(omit_optional_tags=False)
    chunks = html_serializer.serialize(token_stream)
    return "".join(chunks)
def cleanup_html(string, sanitize=True, fragment=True, stream=False,
                 filter_optional_tags=False, id_prefix=None,
                 update_anchor_links=True):
    """Clean up a piece of HTML and re-serialize it.

    The input is coerced with ``force_text``, optionally cleaned with
    ``lxml.html.clean.clean_html``, parsed with ``parse_html``, passed
    through ``CleanupFilter`` (and optionally ``OptionalTagsFilter``), and
    finally serialized with html5lib's ``HTMLSerializer`` using UTF-8.

    :param string: The HTML to clean up.
    :param sanitize: Run ``lxml.html.clean.clean_html`` on the input before
        parsing.
    :param fragment: Forwarded to ``parse_html`` as its second argument.
    :param stream: Return the serializer's output iterable as-is instead of
        joining it into one text value.
    :param filter_optional_tags: Wrap the token stream in
        ``OptionalTagsFilter`` before serializing.
    :param id_prefix: Forwarded to ``CleanupFilter``.
    :param update_anchor_links: Forwarded to ``CleanupFilter``.
    :returns: ``''`` when the input is empty or whitespace-only (regardless
        of ``stream``); otherwise the serializer's output iterable when
        ``stream`` is true, or the joined output passed through
        ``force_text``.
    """
    if not string.strip():
        return ''

    string = force_text(string)
    if sanitize:
        string = lxml.html.clean.clean_html(string)

    tree = parse_html(string, fragment)
    walker = treewalkers.getTreeWalker('lxml')(tree)
    walker = CleanupFilter(walker, id_prefix, update_anchor_links)
    if filter_optional_tags:
        walker = OptionalTagsFilter(walker)

    serializer = HTMLSerializer(
        # html5lib >= 0.99999999 only accepts 'legacy', 'spec' or 'always'
        # here and raises ValueError for a plain True.  'always' is truthy,
        # so older releases that treat this as a boolean behave the same.
        quote_attr_values='always',
        minimize_boolean_attributes=False,
        omit_optional_tags=False,
    )
    rv = serializer.serialize(walker, 'utf-8')
    if stream:
        return rv
    return force_text(b''.join(rv))