def test_ordering(self):
    assert alphabetize_attributes({
        (None, "a"): 1,
        (None, "b"): 2
    }) == OrderedDict([
        ((None, "a"), 1),
        ((None, "b"), 2)
    ])
    assert alphabetize_attributes({
        (None, "b"): 1,
        (None, "a"): 2
    }) == OrderedDict([
        ((None, "a"), 2),
        ((None, "b"), 1)
    ])
def allow_token(self, token):
    """Handles the case where we're allowing the tag"""
    if 'data' in token:
        # Loop through all the attributes and drop the ones that are not
        # allowed, are unsafe or break other rules. Additionally, fix
        # attribute values that need fixing.
        #
        # At the end of this loop, we have the final set of attributes
        # we're keeping.
        attrs = {}
        for namespaced_name, val in token['data'].items():
            namespace, name = namespaced_name

            # Drop attributes that are not explicitly allowed
            #
            # NOTE(willkg): We pass in the attribute name--not a namespaced
            # name.
            if not self.attr_filter(token['name'], name, val):
                continue

            # Sanitize attributes with uri values, dropping the ones that
            # use a disallowed protocol
            if namespaced_name in self.attr_val_is_uri:
                new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                if new_value is None:
                    continue
                val = new_value

            # Drop values in svg attrs with non-local IRIs
            if namespaced_name in self.svg_attr_val_allows_ref:
                new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', ' ', unescape(val))
                new_val = new_val.strip()
                if not new_val:
                    continue
                else:
                    # Replace the val with the unescaped version because
                    # it's an IRI
                    val = new_val

            # Drop href and xlink:href attr for svg elements with non-local IRIs
            if (None, token['name']) in self.svg_allow_local_href:
                if namespaced_name in [
                    (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
                ]:
                    if re.search(r'^\s*[^#\s]', val):
                        continue

            # If it's a style attribute, sanitize it
            if namespaced_name == (None, 'style'):
                val = self.sanitize_css(val)

            # At this point, we want to keep the attribute, so add it in
            attrs[namespaced_name] = val

        token['data'] = alphabetize_attributes(attrs)

    return token
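# The attr_filter used above is built from the caller-supplied attributes
# setting. As a usage sketch through the bleach.clean() front end, a per-tag
# callable receives (tag, name, value) and returns True to keep the
# attribute; allow_src and its https-only policy below are hypothetical:
import bleach


def allow_src(tag, name, value):
    # Keep only src attributes that point at https URLs
    return name == "src" and value.startswith("https://")


cleaned = bleach.clean(
    '<img src="https://example.com/cat.png" onerror="evil()">',
    tags=["img"],
    attributes={"img": allow_src},
)
# cleaned == '<img src="https://example.com/cat.png">'  (indicative output)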
def sanitize_token(self, token):
    """Sanitize a token either by HTML-encoding or dropping.

    Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
    ['attribute', 'pairs'], 'tag': callable}.

    Here callable is a function with three arguments of tag name, attribute
    name, and value. It should return True or False.

    Also gives the option to strip tags instead of encoding.

    """
    token_type = token['type']
    if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
        if token['name'] in self.allowed_elements:
            return self.allow_token(token)

        elif self.strip_disallowed_elements:
            pass

        else:
            if 'data' in token:
                # Alphabetize the attributes before calling .disallowed_token()
                # so that the resulting string is stable
                token['data'] = alphabetize_attributes(token['data'])
            return self.disallowed_token(token)

    elif token_type == 'Comment':
        if not self.strip_html_comments:
            return token

    else:
        return token
def test_different_namespaces(self):
    assert (
        alphabetize_attributes({
            ('xlink', 'href'): 'abc',
            (None, 'alt'): '123'
        }) ==
        OrderedDict([
            ((None, 'alt'), '123'),
            (('xlink', 'href'), 'abc')
        ])
    )
def handle_email_addresses(self, src_iter):
    """Handle email addresses in character tokens"""
    for token in src_iter:
        if token['type'] == 'Characters':
            text = token['data']
            new_tokens = []
            end = 0

            # For each email address we find in the text
            for match in self.email_re.finditer(text):
                if match.start() > end:
                    new_tokens.append(
                        {'type': 'Characters', 'data': text[end:match.start()]}
                    )

                # Run attributes through the callbacks to see what we
                # should do with this match
                attrs = {
                    (None, 'href'): 'mailto:%s' % match.group(0),
                    '_text': match.group(0)
                }
                attrs = self.apply_callbacks(attrs, True)

                if attrs is None:
                    # Just add the text--but not as a link
                    new_tokens.append(
                        {'type': 'Characters', 'data': match.group(0)}
                    )

                else:
                    # Add an "a" tag for the new link
                    _text = attrs.pop('_text', '')
                    attrs = alphabetize_attributes(attrs)
                    new_tokens.extend([
                        {'type': 'StartTag', 'name': 'a', 'data': attrs},
                        {'type': 'Characters', 'data': force_unicode(_text)},
                        {'type': 'EndTag', 'name': 'a'}
                    ])

                end = match.end()

            if new_tokens:
                # Yield the adjusted set of tokens and then continue
                # through the loop
                if end < len(text):
                    new_tokens.append({'type': 'Characters', 'data': text[end:]})

                for new_token in new_tokens:
                    yield new_token

                continue

        yield token
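# A usage sketch of the email handling via the bleach.linkify() front end.
# Callbacks are disabled here so the mailto link comes through unmodified;
# the output shown is indicative:
import bleach

html = bleach.linkify("mail me at me@example.com", parse_email=True, callbacks=[])
# html == 'mail me at <a href="mailto:me@example.com">me@example.com</a>'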
def sanitize_token(self, token):
    """Sanitize a token either by HTML-encoding or dropping.

    Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
    ['attribute', 'pairs'], 'tag': callable}.

    Here callable is a function with three arguments of tag name, attribute
    name, and value. It should return True or False.

    Also gives the option to strip tags instead of encoding.

    :arg dict token: token to sanitize

    :returns: token or list of tokens

    """
    token_type = token["type"]
    if token_type in ["StartTag", "EndTag", "EmptyTag"]:
        if token["name"] in self.allowed_elements:
            return self.allow_token(token)

        elif self.strip_disallowed_elements:
            return None

        else:
            if "data" in token:
                # Alphabetize the attributes before calling .disallowed_token()
                # so that the resulting string is stable
                token["data"] = alphabetize_attributes(token["data"])
            return self.disallowed_token(token)

    elif token_type == "Comment":
        if not self.strip_html_comments:
            # Escape &, <, and > in addition to " and '
            token["data"] = html5lib_shim.escape(
                token["data"], entities={'"': "&quot;", "'": "&#x27;"}
            )
            return token
        else:
            return None

    elif token_type == "Characters":
        return self.sanitize_characters(token)

    else:
        return token
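# The encode-vs-strip choice in sanitize_token() is what the strip and
# strip_comments options of the bleach.clean() front end control. A minimal
# sketch; outputs are indicative:
import bleach

bleach.clean("an <script>evil()</script> example")
# -> 'an &lt;script&gt;evil()&lt;/script&gt; example'  (disallowed tag encoded)

bleach.clean("an <script>evil()</script> example", strip=True)
# -> 'an evil() example'                               (disallowed tag stripped)

bleach.clean("text <!-- note -->", strip_comments=False)
# -> 'text <!-- note -->'                              (comment preserved)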
def handle_a_tag(self, token_buffer):
    """Handle the "a" tag

    This could adjust the link or drop it altogether depending on what the
    callbacks return.

    This yields the new set of tokens.

    """
    a_token = token_buffer[0]
    if a_token['data']:
        attrs = a_token['data']
    else:
        attrs = {}
    text = self.extract_character_data(token_buffer)
    attrs['_text'] = text

    attrs = self.apply_callbacks(attrs, False)

    if attrs is None:
        # We're dropping the "a" tag and everything else and replacing
        # it with character data. So emit that token.
        yield {'type': 'Characters', 'data': text}

    else:
        new_text = attrs.pop('_text', '')
        a_token['data'] = alphabetize_attributes(attrs)

        if text == new_text:
            # The callbacks didn't change the text, so we yield the new "a"
            # token, then whatever else was there, then the end "a" token
            yield a_token
            for mem in token_buffer[1:]:
                yield mem

        else:
            # If the callbacks changed the text, then we're going to drop
            # all the tokens between the start and end "a" tags and replace
            # it with the new text
            yield a_token
            yield {'type': 'Characters', 'data': force_unicode(new_text)}
            yield token_buffer[-1]
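# Callbacks receive the attribute dict -- including the special '_text' key
# holding the link text -- and return it (possibly modified), or None to
# drop the link entirely. A sketch of a callback in that shape; the
# force_title name and its policy are hypothetical:
def force_title(attrs, new=False):
    # Give every link a title derived from its text
    attrs[(None, 'title')] = 'link: %s' % attrs.get('_text', '')
    return attrs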
def handle_links(self, src_iter):
    """Handle links in character tokens"""
    in_a = False  # happens if parse_email=True and an email address was found
    for token in src_iter:
        if in_a:
            if token['type'] == 'EndTag' and token['name'] == 'a':
                in_a = False
            yield token
            continue
        elif token['type'] == 'StartTag' and token['name'] == 'a':
            in_a = True
            yield token
            continue
        if token['type'] == 'Characters':
            text = token['data']
            new_tokens = []
            end = 0

            for match in self.url_re.finditer(text):
                if match.start() > end:
                    new_tokens.append(
                        {'type': 'Characters', 'data': text[end:match.start()]}
                    )

                url = match.group(0)
                prefix = suffix = ''

                # Sometimes we pick up too much in the url match, so look for
                # bits we should drop and remove them from the match
                url, prefix, suffix = self.strip_non_url_bits(url)

                # If there's no protocol, add one
                if PROTO_RE.search(url):
                    href = url
                else:
                    href = 'http://%s' % url

                attrs = {
                    (None, 'href'): href,
                    '_text': url
                }
                attrs = self.apply_callbacks(attrs, True)

                if attrs is None:
                    # Just add the text
                    new_tokens.append(
                        {'type': 'Characters', 'data': prefix + url + suffix}
                    )

                else:
                    # Add the "a" tag!
                    if prefix:
                        new_tokens.append(
                            {'type': 'Characters', 'data': prefix}
                        )

                    _text = attrs.pop('_text', '')
                    attrs = alphabetize_attributes(attrs)

                    new_tokens.extend([
                        {'type': 'StartTag', 'name': 'a', 'data': attrs},
                        {'type': 'Characters', 'data': force_unicode(_text)},
                        {'type': 'EndTag', 'name': 'a'},
                    ])

                    if suffix:
                        new_tokens.append(
                            {'type': 'Characters', 'data': suffix}
                        )

                end = match.end()

            if new_tokens:
                # Yield the adjusted set of tokens and then continue
                # through the loop
                if end < len(text):
                    new_tokens.append({'type': 'Characters', 'data': text[end:]})

                for new_token in new_tokens:
                    yield new_token

                continue

        yield token
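# A usage sketch of the URL handling via bleach.linkify(): a bare domain
# gets an http:// href while the link text stays as matched, and the default
# nofollow callback adds a rel attribute (output is indicative):
import bleach

html = bleach.linkify("visit example.com today")
# html == 'visit <a href="http://example.com" rel="nofollow">example.com</a> today'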
def allow_token(self, token):
    """Handles the case where we're allowing the tag"""
    if 'data' in token:
        # Loop through all the attributes and drop the ones that are not
        # allowed, are unsafe or break other rules. Additionally, fix
        # attribute values that need fixing.
        #
        # At the end of this loop, we have the final set of attributes
        # we're keeping.
        attrs = {}
        for namespaced_name, val in token['data'].items():
            namespace, name = namespaced_name

            # Drop attributes that are not explicitly allowed
            #
            # NOTE(willkg): We pass in the attribute name--not a namespaced
            # name.
            if not self.attr_filter(token['name'], name, val):
                continue

            # Look at attributes that have uri values
            if namespaced_name in self.attr_val_is_uri:
                val_unescaped = re.sub(
                    r'[`\x00-\x20\x7f-\xa0\s]+', '', unescape(val)
                ).lower()

                # Remove replacement characters from unescaped characters.
                val_unescaped = val_unescaped.replace('\ufffd', '')

                # Drop attributes with uri values that have protocols that
                # aren't allowed
                if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and
                        (val_unescaped.split(':')[0] not in self.allowed_protocols)):
                    continue

            # Drop values in svg attrs with non-local IRIs
            if namespaced_name in self.svg_attr_val_allows_ref:
                new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', ' ', unescape(val))
                new_val = new_val.strip()
                if not new_val:
                    continue
                else:
                    # Replace the val with the unescaped version because
                    # it's an IRI
                    val = new_val

            # Drop href and xlink:href attr for svg elements with non-local IRIs
            if (None, token['name']) in self.svg_allow_local_href:
                if namespaced_name in [(None, 'href'), (namespaces['xlink'], 'href')]:
                    if re.search(r'^\s*[^#\s]', val):
                        continue

            # If it's a style attribute, sanitize it
            if namespaced_name == (None, 'style'):
                val = self.sanitize_css(val)

            # At this point, we want to keep the attribute, so add it in
            attrs[namespaced_name] = val

        token['data'] = alphabetize_attributes(attrs)

    return token
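# The protocol check above is what rejects schemes like javascript: or
# data:. Through the bleach.clean() front end, the offending attribute is
# dropped while the element and its text survive (output is indicative):
import bleach

bleach.clean('<a href="javascript:alert(1)">click</a>')
# -> '<a>click</a>'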
def test_empty_cases(self):
    assert alphabetize_attributes(None) is None
    assert alphabetize_attributes({}) == {}
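# For reference, a minimal alphabetize_attributes consistent with these
# tests. The namespace part of a key may be None, which can't be compared
# to str in Python 3, so the sort key maps None to an empty string. This is
# a sketch, not necessarily the exact implementation under test:
from collections import OrderedDict


def alphabetize_attributes(attrs):
    """Takes a dict of attributes (or None) and returns them alphabetized"""
    if not attrs:
        return attrs
    return OrderedDict(
        sorted(attrs.items(), key=lambda kv: (kv[0][0] or '', kv[0][1]))
    )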