def parse_text(text):
    t1 = time.clock()
    parser = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder('etree'),
        tokenizer=MySanitiser)
    t2 = time.clock()
    text = text.replace('\r', '')
    text = text.replace('\n', '<br>')
    t3 = time.clock()
    for search, replace in SMILEY_REPLACEMENTS:
        text = text.replace(search, replace)
    for regex, replace in BBCODE_REGEXES:
        text = regex.sub(replace, text)
    for search, replace in BBCODE_REPLACEMENTS:
        text = text.replace(search, replace)
    t4 = time.clock()
    doc = parser.parse(text)
    t5 = time.clock()
    walker = treewalkers.getTreeWalker('etree')
    stream = walker(doc)
    s = serializer.htmlserializer.HTMLSerializer()
    output_generator = s.serialize(stream)
    t6 = time.clock()
    done = Markup(''.join(list(output_generator)))
    t7 = time.clock()
    print('Init:%f, BR:%f, Regex:%f, Parse:%f, Serial:%f, Join:%f, All:%f'
          % (t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5, t7 - t6, t7 - t1))
    return done
def filter_response(self, response, encoding=None):
    """
    Filter and fix-up the response object.
    """
    # Parse the response
    tree_type = settings.TREE_TYPE
    # Here we check for a TemplateResponse in the case we're being
    # used as a view decorator.
    if hasattr(response, 'render') and callable(response.render):
        response.render()
    tree = html5parser.parse(
        response.content,
        treebuilder=tree_type,
        encoding=encoding
    )

    # Build the serializer
    walker = treewalkers.getTreeWalker(tree_type)
    stream = walker(tree)
    options = self.get_serializer_options()
    serializer = htmlserializer.HTMLSerializer(**options)
    output = serializer.render(stream)
    output = output.encode(encoding)

    # Fix up the response
    response.content = output
    response['Content-Length'] = str(len(output))
    # Add a flag to prevent further filtering if the decorator is already
    # used on this response.
    setattr(response, settings.FILTERED_FLAG, True)
    return response
def get_favicon_url(self, html):
    """
    Parses *html* looking for a favicon URL.  Returns a tuple of:

        (<url>, <mimetype>)

    If no favicon can be found, returns:

        (None, None)
    """
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    fetch_url = None
    mimetype = None
    icon = False
    found_token = None
    for token in stream:
        if 'name' in token:
            if token['name'] == 'link':
                for attr in token['data']:
                    if attr[0] == 'rel':
                        if 'shortcut icon' in attr[1].lower():
                            found_token = token
                            icon = True
                    elif attr[0] == 'href':
                        fetch_url = attr[1]
                    elif attr[0] == 'type':
                        mimetype = attr[1]
    if fetch_url and icon:
        if not mimetype:
            mimetype = "image/x-icon"
        if mimetype in self.favicon_mimetypes:
            return (fetch_url, mimetype)
    return (None, None)
def html2list(payload):
    """This function reads a block of HTML and returns a cleaned list.

    :param payload: The HTML string to read.
    :type payload: str
    :returns: list -- The parsed output as a list of strings.
    """
    cleaned_output = []
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder('lxml'),
                            tokenizer=sanitizer.HTMLSanitizer)
    s = serializer.htmlserializer.HTMLSerializer(strip_whitespace=True,
                                                 omit_optional_tags=True)
    r = treewalkers.getTreeWalker('lxml')(p.parse(payload))
    for item in r.tree.elementtree.getiterator():
        if item.getparent() is not None:
            if item.getparent().tag.split('}')[-1] == 'html':
                item.text = ''
        else:
            item.text = ''
        for k in list(item.attrib):
            del item.attrib[k]
        if type(item.text) is str:
            for c in P['R']:
                item.text = re.sub(c, '', item.text)
    for tag in s.serialize(r):
        # Skip chunks that still look like markup; the serializer escapes
        # "<" and ">" in text content to "&lt;" and "&gt;".
        if not re.match(r"""(?:<|&lt;)/?\w+((\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?(?:>|&gt;)?""", tag):
            tag = tag.encode('ascii', 'ignore')
            split_tag = [x.strip() for x in
                         re.split('[|,;]|(?:=2C|=3B)', tag.replace('&amp;', '&'))]
            split_tag = [t for t in split_tag if t not in P['E']]
            if split_tag:
                cleaned_output += split_tag
    return cleaned_output
def get_toc(self, path):
    # Only have TOC on tutorial pages. Don't do work for others.
    if not (re.search('/tutorials', path) or re.search('/mobile', path)):
        return ''
    toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
    if toc is None or not self.request.cache:
        template_text = render_to_string(path, {})
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = parser.parse(template_text)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)
        toc = []
        current = None
        for element in stream:
            if element['type'] == 'StartTag':
                if element['name'] in ['h2', 'h3', 'h4']:
                    for attr in element['data']:
                        if attr[0] == 'id':
                            current = {
                                'level': int(element['name'][-1:]) - 1,
                                'id': attr[1]
                            }
            elif element['type'] == 'Characters' and current is not None:
                current['text'] = element['data']
            elif element['type'] == 'EndTag' and current is not None:
                toc.append(current)
                current = None
        memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path),
                     toc, 3600)
    return toc
def test_to_sax():
    handler = support.TracingSaxHandler()
    tree = html5lib.parse("""<html xml:lang="en">
        <title>Directory Listing</title>
        <a href="/"><b/></p>
    """, treebuilder="etree")
    walker = getTreeWalker("etree")
    sax.to_sax(walker(tree), handler)
    expected = [
        'startDocument',
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'html'),
         'html', {(None, 'xml:lang'): 'en'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'head'),
         'head', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'title'),
         'title', {}),
        ('characters', 'Directory Listing'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'),
        ('characters', '\n        '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'),
         'body', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'),
         'a', {(None, 'href'): '/'}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'),
         'b', {}),
        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'),
         'p', {}),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p'),
        ('characters', '\n    '),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body'),
        ('endElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html'),
        'endDocument',
    ]
    assert expected == handler.visited
def printOutput(parser, document, opts):
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)
    if opts.xml:
        sys.stdout.write(document.toxml("utf-8"))
    elif opts.tree:
        if not hasattr(document, '__getitem__'):
            document = [document]
        for fragment in document:
            sys.stdout.write(parser.tree.testSerializer(fragment))
            sys.stdout.write("\n")
    elif opts.hilite:
        sys.stdout.write(document.hilite("utf-8"))
    elif opts.html:
        kwargs = {}
        for opt in serializer.HTMLSerializer.options:
            kwargs[opt] = getattr(opts, opt)
        if not kwargs['quote_char']:
            del kwargs['quote_char']
        tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
        for text in serializer.HTMLSerializer(**kwargs).serialize(tokens):
            sys.stdout.write(text)
        if not text.endswith('\n'):
            sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i" % pos + " " +
                           constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
def _normalize(html):
    """
    Normalize the given string of HTML, collapsing whitespace.
    """
    # This is taken from the "Serialization of Streams" section of
    # http://code.google.com/p/html5lib/wiki/UserDocumentation.
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False)
    output_generator = s.serialize(stream)

    # TODO: We're not actually collapsing *all* whitespace; only
    # entire chunks of whitespace that the serializer gives us. Currently,
    # this seems "good enough" to pass our unit tests, which are
    # based on use cases of comparing pre-sanitized HTML to sanitized HTML,
    # but we may need to change this in the future.
    parts = []
    last_item_was_whitespace = False
    for item in output_generator:
        # Is it empty whitespace?
        if item.strip() != '':
            parts.append(item)
            last_item_was_whitespace = False
        elif not last_item_was_whitespace:
            # Collapse whitespace.
            parts.append(' ')
            last_item_was_whitespace = True
    return ''.join(parts)
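# A minimal usage sketch for _normalize() above (the demo function name is
# hypothetical, not part of the original module). Whitespace *between*
# elements reaches the serializer as its own chunk, so it collapses to a
# single space and differently indented markup compares equal:
def _normalize_demo():
    a = _normalize('<p>Hi</p>   \n  <p>there</p>')
    b = _normalize('<p>Hi</p> <p>there</p>')
    assert a == b  # inter-element whitespace chunks collapse to ' '
    return a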
def get_toc(self, path):
    toc = memcache.get('toc|%s' % path)
    if toc is None or self.request.cache == False:
        template_text = webapp.template.render(path, {})
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = parser.parse(template_text)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)
        toc = []
        current = None
        for element in stream:
            if element['type'] == 'StartTag':
                if element['name'] in ['h2', 'h3', 'h4']:
                    for attr in element['data']:
                        if attr[0] == 'id':
                            current = {
                                'level': int(element['name'][-1:]) - 1,
                                'id': attr[1]
                            }
            elif element['type'] == 'Characters' and current is not None:
                current['text'] = element['data']
            elif element['type'] == 'EndTag' and current is not None:
                toc.append(current)
                current = None
        memcache.set('toc|%s' % path, toc, 3600)
    return toc
def clean_html(buf):
    """Cleans HTML of dangerous tags and content."""
    buf = buf.strip()
    if not buf:
        return buf
    html_parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                                      tokenizer=HTMLSanitizer)
    dom_tree = html_parser.parseFragment(buf)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                 quote_attr_values=True)
    output = s.render(stream, 'utf-8')
    while 'toberemoved' in output:
        oldoutput = output
        matches = re.findall(r'<toberemoved.*?>.*?</toberemoved>',
                             output, re.DOTALL)
        for s in matches:
            output = output.replace(s, '')
        matches = re.findall(r'</toberemoved>', output, re.DOTALL)
        for s in matches:
            output = output.replace(s, '')
        matches = re.findall(r'<toberemoved.*?>', output, re.DOTALL)
        for s in matches:
            output = output.replace(s, '')
        if output == oldoutput:
            break
    return output
def sanitize(string, html_type):
    """
    >>> sanitize("\\t<p>a paragraph</p>", "html")
    u'\\t<p>a paragraph</p>'
    >>> sanitize("\\t<script>alert('evil script');</script>", "xhtml")
    u"\\t&lt;script&gt;alert('evil script');&lt;/script&gt;"
    """
    try:
        import html5lib
        from html5lib import sanitizer, serializer, treewalkers, treebuilders
    except ImportError:
        raise Exception("html5lib not available")
    p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    tree = p.parseFragment(string)
    walker = treewalkers.getTreeWalker("simpletree")
    stream = walker(tree)
    if html_type == 'xhtml':
        s = serializer.xhtmlserializer.XHTMLSerializer()
    else:
        s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                     quote_attr_values=True)
    return s.render(stream)
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib
    to ensure that the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_kwargs["sanitize"] = True
        else:
            parser_kwargs["tokenizer"] = HTMLSanitizer
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(s.serialize(stream))
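# clean_html() above relies on a version guard that is not shown in this
# snippet. A sketch of what it presumably looks like (an assumption based on
# the version comment in the function): html5lib moved sanitization from a
# tokenizer class to a serializer flag around 0.99999999/1.0b9.
try:
    from html5lib.sanitizer import HTMLSanitizer  # old html5lib (< 1.0b9)
except ImportError:
    HTMLSanitizer = None  # new html5lib: pass sanitize=True to the serializer
from html5lib import HTMLParser, treebuilders, treewalkers
from html5lib.serializer import HTMLSerializer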
def to_unicode(self):
    """Return the unicode serialization of myself."""
    container_len = len(self.CONTAINER_TAG) + 2  # 2 for the <>
    walker = getTreeWalker(self.TREEBUILDER)
    stream = walker(self._root)
    serializer = HTMLSerializer(quote_attr_values=True,
                                omit_optional_tags=False)
    return serializer.render(stream)[container_len:-container_len - 1]
def get_toc(self, path):
    # Only have TOC on tutorial pages. Don't do work for others.
    if not (re.search('/tutorials', path) or re.search('/mobile', path)):
        return ''
    toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
    if toc is None or not self.request.cache:
        template_text = render_to_string(path, {})
        parser = html5lib.HTMLParser(
            tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = parser.parse(template_text)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)
        toc = []
        current = None
        for element in stream:
            if element['type'] == 'StartTag':
                if element['name'] in ['h2', 'h3', 'h4']:
                    for attr in element['data']:
                        if attr[0] == 'id':
                            current = {
                                'level': int(element['name'][-1:]) - 1,
                                'id': attr[1]
                            }
            elif element['type'] == 'Characters' and current is not None:
                current['text'] = element['data']
            elif element['type'] == 'EndTag' and current is not None:
                toc.append(current)
                current = None
        memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path),
                     toc, 3600)
    return toc
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib
    to ensure that the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_kwargs['sanitize'] = True
        else:
            parser_kwargs['tokenizer'] = HTMLSanitizer
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(s.serialize(stream))
def __str__(self):
    """Return the unicode serialization of myself."""
    container_len = len(self.CONTAINER_TAG) + 2  # 2 for the <>
    walker = getTreeWalker(self.TREEBUILDER)
    stream = walker(self._root)
    serializer = HTMLSerializer(quote_attr_values='always',
                                omit_optional_tags=False)
    return serializer.render(stream)[container_len:-container_len - 1]
def search(self, term):
    # define link for search
    searchUrl = (self.baseUrl + r"/sc/search?&must=" + term +
                 r"&Type=Music&Type=&inandout=true&SRI=true&ND=-1")
    print(" --> searching on chemical for " + term)
    print(" --> with " + searchUrl)
    source = getWebAsStr(searchUrl)

    # create a parser, we use minidom
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(source)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    # now we can send the stream to our fetcher functions
    # find links on search result page
    l_hitLinks = self.fetch_HitLinks(stream)
    # find short info
    l_shortInfo = self.fetch_ShortInfo(stream)

    # create a two-dimensional list
    results = []
    for link, info in zip(l_hitLinks, l_shortInfo):
        results.append([link, info])
    return results
def get_toc(self, path):
    # Only have TOC on tutorial pages. Don't do work for others.
    if not (re.search("/tutorials", path) or re.search("/mobile", path)):
        return ""
    toc = memcache.get("%s|toc|%s" % (settings.MEMCACHE_KEY_PREFIX, path))
    if toc is None or not self.request.cache:
        template_text = render_to_string(path, {})
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = parser.parse(template_text)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)
        toc = []
        current = None
        for element in stream:
            if element["type"] == "StartTag":
                if element["name"] in ["h2", "h3", "h4"]:
                    for attr in element["data"]:
                        if attr[0] == "id":
                            current = {
                                "level": int(element["name"][-1:]) - 1,
                                "id": attr[1]
                            }
            elif element["type"] == "Characters" and current is not None:
                current["text"] = element["data"]
            elif element["type"] == "EndTag" and current is not None:
                toc.append(current)
                current = None
        memcache.set("%s|toc|%s" % (settings.MEMCACHE_KEY_PREFIX, path),
                     toc, 3600)
    return toc
def clean_html(buf):
    """Cleans HTML of dangerous tags and content."""
    buf = buf.strip()
    if not buf:
        return buf
    html_parser = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("dom"),
        tokenizer=HTMLSanitizer)
    dom_tree = html_parser.parseFragment(buf)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False, quote_attr_values=True)
    output = s.render(stream, 'utf-8')
    while 'toberemoved' in output:
        oldoutput = output
        matches = re.findall(
            r'<toberemoved.*?>.*?</toberemoved>', output, re.DOTALL)
        for s in matches:
            output = output.replace(s, '')
        matches = re.findall(r'</toberemoved>', output, re.DOTALL)
        for s in matches:
            output = output.replace(s, '')
        matches = re.findall(r'<toberemoved.*?>', output, re.DOTALL)
        for s in matches:
            output = output.replace(s, '')
        if output == oldoutput:
            break
    return output
def printOutput(parser, document, opts):
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)
    if opts.xml:
        sys.stdout.write(document.toxml("utf-8"))
    elif opts.tree:
        if not hasattr(document, '__getitem__'):
            document = [document]
        for fragment in document:
            sys.stdout.write(parser.tree.testSerializer(fragment))
            sys.stdout.write("\n")
    elif opts.hilite:
        sys.stdout.write(document.hilite("utf-8"))
    elif opts.html:
        kwargs = {}
        for opt in serializer.HTMLSerializer.options:
            kwargs[opt] = getattr(opts, opt)
        if not kwargs['quote_char']:
            del kwargs['quote_char']
        tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
        for text in serializer.HTMLSerializer(**kwargs).serialize(tokens):
            sys.stdout.write(text)
        if not text.endswith('\n'):
            sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i" % pos + " " +
                           constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
def html2text(html):
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html.decode("utf-8"))
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    in_script = False
    outbuf = []
    current_line = []
    for token in stream:
        token_name = token.get('name', "").lower()
        if token_name in ['script', 'style', 'noscript']:
            in_script = token.get('type', None) == 'StartTag'
        if in_script:
            continue
        if token_name in block_level_elements or token_name == "br":
            if current_line:
                outbuf.append(u"".join(current_line))
                current_line = []
        if token.get(u'type', None) == u'Characters':
            current_line.append(token['data'])
        if token.get(u'type', None) == u'SpaceCharacters':
            if current_line and current_line[-1] != u" ":
                current_line.append(u" ")
    if current_line:
        outbuf.append(u"".join(current_line))
    return clean_whitespace("\n".join(outbuf))
def clean(self, value, model_instance):
    """
    Validates the given value using the provided HTMLCleaner and returns
    its "cleaned" value as a Python object.  Raises ValidationError for
    any errors.
    """
    value = super(HTMLField, self).clean(value, model_instance)
    parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                                 tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = parser.parseFragment(value)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    if self.use_imageproxy:
        from imageproxy import Proxy
        user = User.objects.get(pk=getattr(model_instance, self.user_field))
        proxy = Proxy(user)
        stream = ImageProxyFilter(stream, proxy)
    s = HTMLSerializer(omit_optional_tags=False)
    output_generator = s.serialize(stream)
    clean_value = ''
    for item in output_generator:
        clean_value += item
    return clean_value
def test_lxml_xml():
    expected = [
        {'data': {}, 'name': 'div', 'namespace': None, 'type': 'StartTag'},
        {'data': {}, 'name': 'div', 'namespace': None, 'type': 'StartTag'},
        {'name': 'div', 'namespace': None, 'type': 'EndTag'},
        {'name': 'div', 'namespace': None, 'type': 'EndTag'},
    ]

    lxmltree = lxml.etree.fromstring('<div><div></div></div>')
    walker = treewalkers.getTreeWalker('lxml')
    output = Lint(walker(lxmltree))

    assert list(output) == expected
def app_filter_html_path_inplace(path, filters, log=None):
    """Filter the given HTML file (in-place) based on "app-*" class
    attributes.

    For example, the HTML might contain something like:

        <div class="app-ide">
            ...ide info...
        </div>
        <div class="app-edit">
            ...edit info...
        </div>

    If there are no filters, then the HTML is not changed. If the filters
    include "ide" but not "edit", then the ide div remains and the edit
    div is removed.
    """
    if not filters:
        return
    if log:
        log("app-filter `%s'", path)

    # Parse the HTML file.
    with open(path) as f:
        tree = html5lib.parse(f, namespaceHTMLElements=False)

    # Filter out the unwanted elements.
    filtered = False
    assert isinstance(filters, set)
    for elem in tree.getiterator():
        indices_to_drop = []
        for i, child in enumerate(elem.getchildren()):
            if _should_drop_elem(child, filters, "class", "app-"):
                indices_to_drop.insert(0, i)
                filtered = True
                if log:
                    tag_str = "<%s" % child.tag
                    if child.attrib:
                        for n, v in child.attrib.items():
                            tag_str += ' %s="%s"' % (n, v)
                    tag_str += ">"
                    if len(tag_str) > 50:
                        tag_str = tag_str[:47] + '...'
                    log("... filter out %s", tag_str)
        for idx in indices_to_drop:
            del elem[idx]

    # Write out any changes.
    if filtered:
        walker = treewalkers.getTreeWalker("etree", ET)
        stream = walker(tree)
        s = HTMLSerializer()
        outputter = s.serialize(stream)
        content = ''.join(list(outputter))
        f = open(path, 'w')
        f.write("""<!DOCTYPE html>
""")
        try:
            f.write(content)
        finally:
            f.close()
def SearchMovie(title, year):
    r = requests.post(DOMAIN_NAME + "/subtitles/searchbytitle",
                      data={"query": title, "l": ""})
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(r.text)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    return SearchTitleMatch(stream)
def run_sanitizer(html, sanitizer):
    parser = html5lib.HTMLParser(tokenizer=sanitizer,
                                 tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = parser.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.HTMLSerializer(omit_optional_tags=False,
                                  quote_attr_values=True)
    result = s.serialize(stream)
    return u"".join(result)
def sanitize_string(self, user_input):
    p = html5lib.HTMLParser(tokenizer=CommonsHTMLSanitizer,
                            tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(user_input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                 quote_attr_values=True)
    return u"".join(s.serialize(stream))
def cleanup_html(html):
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = parser.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                 quote_attr_values=True)
    result = s.render(stream)
    return u"".join(result)
def parse(f):
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    doc = p.parse(f)
    walker = treewalkers.getTreeWalker("dom")

    tokens = []
    bintokens = []

    waitfor = None
    for tok in walker(doc):
        # Skip everything inside link/script/style, including the end tag.
        if waitfor:
            if tok["type"] == waitfor[0] and tok["name"] == waitfor[1]:
                waitfor = None
            continue
        if tok["type"] == "StartTag" and tok["name"] in ("link", "script", "style"):
            waitfor = ("EndTag", tok["name"])

        if tok["type"] in ("EndTag", "StartTag", "EmptyTag", "Comment"):
            bintokens.append(1)
            tokens.append(tok)
        elif tok["type"] in ("Characters",):
            for tok1 in tok["data"].split():
                bintokens.append(0)
                tokens.append({"type": "Characters", "data": tok1})
        elif tok["type"] in ("SpaceCharacters", "Doctype"):
            pass
        else:
            raise ValueError("unrecognizable token type: %r" % tok)

    # Cumulative count of tag tokens up to each position.
    cumbintokens = [bintokens[0]]
    for tok in bintokens[1:]:
        cumbintokens.append(cumbintokens[-1] + tok)

    # Find the span [i, j] maximizing tags-before + tags-after + words-within.
    length = len(cumbintokens)
    midx = None
    m = None
    for i in range(length):
        for j in range(i + 1, length):
            end_tag = cumbintokens[-1] - cumbintokens[j]
            start_tag = cumbintokens[i]
            text_between = (j - i) - (cumbintokens[j] - cumbintokens[i])
            nm = end_tag + start_tag + text_between
            if not midx or nm > m:
                midx = i, j
                m = nm
    i, j = midx
    return serialize_tokens(tokens[i:j + 1])
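# parse() above ends by calling serialize_tokens(), which is defined
# elsewhere in the original module. A minimal sketch of what such a helper
# might look like (an assumption, not the original code): re-serialize the
# selected token span with html5lib's serializer.
from html5lib import serializer as _ser

def serialize_tokens(tokens):
    s = _ser.HTMLSerializer(omit_optional_tags=False)
    return u"".join(s.serialize(iter(tokens)))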
def get_toc(self, path):
    # Only have TOC on tutorial pages. Don't do work for others.
    if not (re.search('/tutorials', path) or re.search('/mobile', path) or
            re.search('style-guide', path)):
        return ''
    toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
    if toc is None or not self.request.cache:
        template_text = render_to_string(path, {})
        parser = html5lib.HTMLParser(
            tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = parser.parse(template_text)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)
        toc = []
        current = None
        innerTagCount = 0
        for element in stream:
            if element['type'] == 'StartTag':
                if element['name'] in ['h2']:
                    for attr in element['data']:
                        if attr[0] == 'id':
                            current = {
                                'level': int(element['name'][-1:]) - 1,
                                'id': attr[1],
                                'text': ''
                            }
                elif current is not None:
                    innerTagCount += 1
            elif element['type'] == 'Characters' and current is not None:
                # If we already have text, check:
                # - whether the last character is a < or a (
                # - whether the string being added starts with > or )
                # in which case do not add a space.
                if current['text'] != '':
                    if current['text'][-1] != '<' and not re.match(
                            r"^[\>\)]", element['data']):
                        current['text'] += ' '
                current['text'] = current['text'] + element['data']
            elif element['type'] == 'EndTag' and current is not None:
                if innerTagCount > 0:
                    innerTagCount -= 1
                else:
                    current['text'] = cgi.escape(current['text'])
                    toc.append(current)
                    current = None
        memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path),
                     toc, 3600)
    return toc
def serialize(input, tree="simpletree", format="html", encoding=None, **serializer_opts): # XXX: Should we cache this? walker = treewalkers.getTreeWalker(tree) if format == "html": s = HTMLSerializer(**serializer_opts) else: raise ValueError("type must be html") return s.render(walker(input), encoding)
def strip_tags(html):
    if html:
        builder = treebuilders.getTreeBuilder("dom")
        parser = html5lib.HTMLParser(tree=builder, tokenizer=StripTags)
        tree = parser.parseFragment(html)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(tree)
        serializer = HTMLSerializer()
        return serializer.render(stream)
def sanitize_html(html):
    """Sanitizes an HTML fragment."""
    p = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                            tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.HTMLSerializer(omit_optional_tags=False,
                                  quote_attr_values=True)
    output_generator = s.serialize(stream)
    return u"".join(output_generator)
def build_tree(f):
    html = []
    for line in f:
        line = line.replace("\t", " ")
        html.append(line)
    html = "".join(html)
    encoding = chardet.detect(html)
    # print "Detected encoding: ", encoding
    html = html.decode(encoding["encoding"])

    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    chars = ""
    root = MyDOM(u"root", None)
    node = root
    for token in stream:
        token_type = token.get("type", None)
        if token_type.endswith("Error"):
            return None
        if token_type == "Comment":
            # ignore comments for now
            continue
        if token_type.endswith("Characters"):
            chars += token.get("data", "")
            continue
        if chars.strip():
            node.addkid(chars, "chars")
        chars = ""
        tag_name = token.get("name", None)
        if token_type == "EmptyTag":
            continue
            # Dead code: the early continue above skips recording empty
            # tags and their attributes as child nodes.
            node.addkid(tag_name, "tag")
            for k, v in token.get("data", {}).iteritems():
                node.addkid("%s:%s" % (k[1], v), "meta")
            continue
        assert tag_name is not None, token
        tag_name = tag_name.upper()
        if token_type == "EndTag":
            assert MyDOM.get_label(node) == tag_name, token
            node = node.get_parent()
            assert node is not None, "Unbalanced Tree"
        if token_type == "StartTag":
            node = node.addkid(tag_name, "tag")
    return root
def tostring(lxmltree, options=None):
    options = options or {'omit_optional_tags': False}
    walker = treewalkers.getTreeWalker('lxml')
    stream = walker(lxmltree)
    s = serializer.HTMLSerializer(**options)
    output = s.render(stream)
    if not isinstance(output, str):
        # Python 2
        output = output.encode('utf-8')
    return output
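# A short usage sketch for tostring() above (illustrative only, not part of
# the original module): any lxml element tree can be rendered through the
# html5lib "lxml" tree walker.
import lxml.etree

def _tostring_demo():
    root = lxml.etree.fromstring('<div><p>hello</p></div>')
    return tostring(root)  # -> '<div><p>hello</p></div>'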
def writeHtml(writer, nodeList):
    from html5lib.treewalkers import getTreeWalker
    # from html5lib.serializer.htmlserializer import HTMLSerializer
    from html5lib.serializer.xhtmlserializer import XHTMLSerializer
    walker = getTreeWalker('dom')
    serializer = XHTMLSerializer()
    for node in nodeList:
        for item in serializer.serialize(walker(node)):
            writer.write(item)
def sanitize(content):
    parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                        tree=treebuilders.getTreeBuilder("dom"))
    dom = parser.parseFragment(content)
    tree_walker = treewalkers.getTreeWalker("dom")
    tree_stream = tree_walker(dom)
    serial = serializer.HTMLSerializer(omit_optional_tags=False,
                                       quote_attr_values=True)
    output = serial.serialize(tree_stream)
    return u''.join(output)
def sanitize_html(data, encoding=None):
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                                 tokenizer=sanitizer_factory)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(parser.parseFragment(data, encoding=encoding))
    slzr = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                    quote_attr_values=True,
                                                    use_trailing_solidus=True)
    html = slzr.render(stream, encoding)
    return html
def printOutput(parser, document, opts):
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)

    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            tb = opts.treebuilder.lower()
            if tb == "dom":
                document.writexml(sys.stdout, encoding="utf-8")
            elif tb == "lxml":
                import lxml.etree
                sys.stdout.write(
                    lxml.etree.tostring(document, encoding="unicode"))
            elif tb == "etree":
                sys.stdout.write(
                    _utils.default_etree.tostring(document, encoding="unicode"))
        elif opts.tree:
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.html:
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                try:
                    kwargs[opt] = getattr(opts, opt)
                except Exception:
                    pass
            if not kwargs['quote_char']:
                del kwargs['quote_char']
            if opts.sanitize:
                kwargs["sanitize"] = True
            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = "utf-8"
            for text in serializer.HTMLSerializer(**kwargs).serialize(
                    tokens, encoding=encoding):
                sys.stdout.write(text)
            if not text.endswith('\n'):
                sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append("Line %i Col %i" % pos + " " +
                           constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
def _get_event_description_old(self, div_tag):
    # TODO: strip tags?
    # <div class="info_text specHigh1"> \n\t foo <p> \n\t blah blah.</p><p>blub blub.</p>
    tag = self._get_tag(div_tag, 'div', 'class', 'info_text specHigh1')
    if tag:
        description = []
        for node in tag.childNodes:
            tokens = treewalkers.getTreeWalker("dom")(node)
            for text in serializer.HTMLSerializer(
                    omit_optional_tags=False).serialize(tokens):
                description.append(text.strip())
        return u''.join(description)
def sanitize_html(html):
    """Sanitizes an HTML fragment."""
    p = html5lib.HTMLParser(tokenizer=HTMLSanitizer,
                            tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.HTMLSerializer(omit_optional_tags=False,
                                  quote_attr_values=True)
    output_generator = s.serialize(stream)
    return u''.join(output_generator)
def render(self, dom_tree):
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    if self.method == "xhtml":
        Serializer = serializer.xhtmlserializer.XHTMLSerializer
    else:
        Serializer = serializer.htmlserializer.HTMLSerializer
    ser = Serializer(strip_whitespace=self.strip_whitespace,
                     quote_attr_values=True,
                     omit_optional_tags=False)
    return ser.render(stream)
def render(self, dom_tree):
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    if self.method == "xhtml":
        Serializer = serializer.xhtmlserializer.XHTMLSerializer
    else:
        Serializer = serializer.htmlserializer.HTMLSerializer
    ser = Serializer(
        strip_whitespace=self.strip_whitespace,
        quote_attr_values=True,
        omit_optional_tags=False)
    return ser.render(stream)
def get_toc(self, path):
    # Only have TOC on tutorial pages. Don't do work for others.
    if not (re.search('/tutorials', path) or re.search('/mobile', path) or
            re.search('style-guide', path)):
        return ''
    toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
    if toc is None or not self.request.cache:
        template_text = render_to_string(path, {})
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = parser.parse(template_text)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)
        toc = []
        current = None
        innerTagCount = 0
        for element in stream:
            if element['type'] == 'StartTag':
                if element['name'] in ['h2']:
                    for attr in element['data']:
                        if attr[0] == 'id':
                            current = {
                                'level': int(element['name'][-1:]) - 1,
                                'id': attr[1],
                                'text': ''
                            }
                elif current is not None:
                    innerTagCount += 1
            elif element['type'] == 'Characters' and current is not None:
                # If we already have text, check:
                # - whether the last character is a < or a (
                # - whether the string being added starts with > or )
                # in which case do not add a space.
                if current['text'] != '':
                    if current['text'][-1] != '<' and not re.match(
                            r"^[\>\)]", element['data']):
                        current['text'] += ' '
                current['text'] = current['text'] + element['data']
            elif element['type'] == 'EndTag' and current is not None:
                if innerTagCount > 0:
                    innerTagCount -= 1
                else:
                    current['text'] = cgi.escape(current['text'])
                    toc.append(current)
                    current = None
        memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path),
                     toc, 3600)
    return toc
def sanitize_html(value):
    '''A custom filter that sanitizes HTML output to make sure there is
    no bad stuff in it'''
    p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                            tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(value)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False)
    return "".join(s.serialize(stream))
def printOutput(parser, document, opts):
    if opts.encoding:
        print('Encoding:', parser.tokenizer.stream.charEncoding)

    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            tb = opts.treebuilder.lower()
            if tb == 'dom':
                document.writexml(sys.stdout, encoding='utf-8')
            elif tb == 'lxml':
                import lxml.etree
                sys.stdout.write(lxml.etree.tostring(document))
            elif tb == 'etree':
                sys.stdout.write(utils.default_etree.tostring(document))
        elif opts.tree:
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.hilite:
            sys.stdout.write(document.hilite('utf-8'))
        elif opts.html:
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                try:
                    kwargs[opt] = getattr(opts, opt)
                except:
                    pass
            if not kwargs['quote_char']:
                del kwargs['quote_char']
            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = 'utf-8'
            for text in serializer.HTMLSerializer(**kwargs).serialize(
                    tokens, encoding=encoding):
                sys.stdout.write(text)
            if not text.endswith('\n'):
                sys.stdout.write('\n')
    if opts.error:
        errList = []
        for pos, errorcode, datavars in parser.errors:
            errList.append('Line %i Col %i' % pos + ' ' +
                           constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
        sys.stdout.write('\nParse errors:\n' + '\n'.join(errList) + '\n')
def serialize(input, tree=u"simpletree", format=u"html", encoding=None, **serializer_opts): # XXX: Should we cache this? walker = treewalkers.getTreeWalker(tree) if format == u"html": s = HTMLSerializer(**serializer_opts) elif format == u"xhtml": s = XHTMLSerializer(**serializer_opts) else: raise ValueError(u"type must be either html or xhtml") return s.render(walker(input), encoding)
def clean(self, value):
    chars = super(HTMLField, self).clean(value)
    # chars = chars.encode('utf-8')  # should really find out where we have
    # decoded input to unicode and do it there instead
    p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                            tree=treebuilders.getTreeBuilder("dom"))
    # could use Beautiful Soup here instead
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                 quote_attr_values=True)
    dom_tree = p.parseFragment(chars)  # encoding="utf-8") - unicode input seems to work fine
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    gen = s.serialize(stream)
    out = ""
    for i in gen:
        out += i
    return out
def toString(tree, output_encoding="utf-8", serializer="html5lib", **kwargs):
    # Serialize to XML
    # if serializer == "lxml.etree":
    if False:
        rendered = etree.tostring(tree, encoding=output_encoding)
    # Serialize to HTML using lxml.html
    elif serializer == "lxml.html":
        rendered = lxml.html.tostring(tree, encoding=output_encoding)
    # Serialize to HTML using html5lib
    else:
        walker = treewalkers.getTreeWalker("lxml")
        s = htmlserializer.HTMLSerializer(**kwargs)
        rendered = s.render(walker(tree), encoding=output_encoding)
    return rendered
def clean_html(data, full=True, parser=DEFAULT_PARSER):
    """
    Cleans HTML from XSS vulnerabilities using html5lib.

    If full is False, only the contents inside <body> will be returned
    (without the <body> tags).
    """
    if full:
        dom_tree = parser.parse(data)
    else:
        dom_tree = parser.parseFragment(data)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                 quote_attr_values=True)
    return u''.join(s.serialize(stream))
def clean_html(buf):
    """Cleans HTML of dangerous tags and content."""
    buf = buf.strip()
    if not buf:
        return buf
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                            tokenizer=sanitizer_factory)
    dom_tree = p.parseFragment(buf)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                 quote_attr_values=True)
    return s.render(stream)
def serialize(self, **kwargs):
    """Return the unicode serialization of myself, with optional
    sanitization arguments."""
    container_len = len(self.CONTAINER_TAG) + 2  # 2 for the <>
    walker = getTreeWalker(self.TREEBUILDER)
    stream = walker(self._root)
    stream = sortAttributes(stream)
    serializer = HTMLSerializer(quote_attr_values="always",
                                omit_optional_tags=False)
    html = serializer.render(stream)[container_len:-container_len - 1]
    return bleach.clean(
        html,
        tags=kwargs.get("tags") or (ALLOWED_TAGS + ["for"]),
        attributes=kwargs.get("attributes") or ALLOWED_ATTRIBUTES,
        styles=kwargs.get("styles") or ALLOWED_STYLES,
        strip_comments=True,
    )
def GenshiAdapter(tree):
    text = None
    for token in treewalkers.getTreeWalker('dom')(tree):
        type = token['type']
        if type in ('Characters', 'SpaceCharacters'):
            if text is None:
                text = token['data']
            else:
                text += token['data']
        elif text is not None:
            yield TEXT, text, (None, -1, -1)
            text = None

        if type in ('StartTag', 'EmptyTag'):
            if token['namespace']:
                name = '{%s}%s' % (token['namespace'], token['name'])
            else:
                name = token['name']
            attrs = Attrs([
                (QName('{%s}%s' % attr if attr[0] is not None else attr[1]),
                 value)
                for attr, value in token['data'].items()
            ])
            yield (START, (QName(name), attrs), (None, -1, -1))
            if type == 'EmptyTag':
                type = 'EndTag'

        if type == 'EndTag':
            if token['namespace']:
                name = '{%s}%s' % (token['namespace'], token['name'])
            else:
                name = token['name']
            yield END, QName(name), (None, -1, -1)
        elif type == 'Comment':
            yield COMMENT, token['data'], (None, -1, -1)
        elif type == 'Doctype':
            yield DOCTYPE, (token['name'], token['publicId'],
                            token['systemId']), (None, -1, -1)
        else:
            pass  # FIXME: What to do?

    if text is not None:
        yield TEXT, text, (None, -1, -1)
def GenshiAdapter(tree):
    text = None
    for token in treewalkers.getTreeWalker("dom")(tree):
        type = token["type"]
        if type in ("Characters", "SpaceCharacters"):
            if text is None:
                text = token["data"]
            else:
                text += token["data"]
        elif text is not None:
            yield TEXT, text, (None, -1, -1)
            text = None

        if type in ("StartTag", "EmptyTag"):
            if token["namespace"]:
                name = "{%s}%s" % (token["namespace"], token["name"])
            else:
                name = token["name"]
            attrs = Attrs([
                (QName("{%s}%s" % attr if attr[0] is not None else attr[1]),
                 value)
                for attr, value in token["data"].items()
            ])
            yield (START, (QName(name), attrs), (None, -1, -1))
            if type == "EmptyTag":
                type = "EndTag"

        if type == "EndTag":
            if token["namespace"]:
                name = "{%s}%s" % (token["namespace"], token["name"])
            else:
                name = token["name"]
            yield END, QName(name), (None, -1, -1)
        elif type == "Comment":
            yield COMMENT, token["data"], (None, -1, -1)
        elif type == "Doctype":
            yield DOCTYPE, (token["name"], token["publicId"],
                            token["systemId"]), (None, -1, -1)
        else:
            pass  # FIXME: What to do?

    if text is not None:
        yield TEXT, text, (None, -1, -1)
def sanitize_html(input):
    """
    Removes any unwanted HTML tags and attributes, using html5lib.

    >>> sanitize_html("foobar<p>adf<i></p>abc</i>")
    u'foobar<p>adf<i></i></p><i>abc</i>'
    >>> sanitize_html('foobar<p style="color:red; remove:me; background-image: url(http://example.com/test.php?query_string=bad);">adf<script>alert("Uhoh!")</script><i></p>abc</i>')
    u'foobar<p style="color: red;">adf&lt;script&gt;alert("Uhoh!")&lt;/script&gt;<i></i></p><i>abc</i>'
    """
    p = HTMLParser(tokenizer=HTMLSanitizer,
                   tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = HTMLSerializer(omit_optional_tags=False)
    return "".join(s.serialize(stream))