def linkify(self, text):
    """Linkify specified text

    :arg str text: the text to add links to

    :returns: linkified text as unicode

    :raises TypeError: if ``text`` is not a text type

    """
    if not isinstance(text, six.string_types):
        raise TypeError('argument must be of text type')

    text = force_unicode(text)

    if not text:
        return u''

    dom = self.parser.parseFragment(text)
    filtered = LinkifyFilter(
        source=self.walker(dom),
        callbacks=self.callbacks,
        skip_tags=self.skip_tags,
        parse_email=self.parse_email,
        url_re=self.url_re,
        email_re=self.email_re,
    )
    return self.serializer.render(filtered)
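
# Usage sketch for linkify (hedged: this mirrors bleach's public Linker API,
# but exact output depends on the configured callbacks -- bleach's defaults
# add rel="nofollow" to http(s) links):
#
#     >>> from bleach.linkifier import Linker
#     >>> linker = Linker()
#     >>> linker.linkify('visit example.com')
#     'visit <a href="http://example.com" rel="nofollow">example.com</a>'
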
def clean(self, text, allowed_domains=None):
    """Clean text, threading ``allowed_domains`` through to the
    post-sanitization filters"""
    if allowed_domains is None:
        allowed_domains = []

    if not isinstance(text, six.string_types):
        message = "argument cannot be of '{name}' type, must be of text type".format(
            name=text.__class__.__name__)
        raise TypeError(message)

    if not text:
        return u''

    text = force_unicode(text)

    dom = self.parser.parseFragment(text)
    filtered = BleachSanitizerFilter(
        source=self.walker(dom),

        # Bleach-sanitizer-specific things
        attributes=self.attributes,
        strip_disallowed_elements=self.strip,
        strip_html_comments=self.strip_comments,

        # html5lib-sanitizer things
        allowed_elements=self.tags,
        allowed_css_properties=self.styles,
        allowed_protocols=self.protocols,
        allowed_svg_properties=[],
    )

    # Apply any filters after the BleachSanitizerFilter. Each filter class
    # here is assumed to accept ``allowed_domains`` in its ``__iter__``,
    # which is why we call ``__iter__`` explicitly rather than relying on
    # ``iter()``.
    for filter_class in self.filters:
        fc = filter_class(source=filtered)
        filtered = fc.__iter__(allowed_domains=allowed_domains)

    return self.serializer.render(filtered)
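
# Usage sketch for the allowed_domains variant (hedged: assumes the caller's
# filter classes accept allowed_domains in __iter__ as described above):
#
#     >>> cleaner.clean('<a href="http://evil.example">x</a>',
#     ...               allowed_domains=['example.com'])
#
# How allowed_domains affects the output is entirely up to the custom
# filters; the sanitizer itself ignores it.
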
def handle_email_addresses(self, src_iter):
    """Handle email addresses in character tokens"""
    for token in src_iter:
        if token['type'] == 'Characters':
            text = token['data']
            new_tokens = []
            end = 0

            # For each email address we find in the text
            for match in self.email_re.finditer(text):
                if match.start() > end:
                    new_tokens.append(
                        {'type': 'Characters', 'data': text[end:match.start()]}
                    )

                # Run attributes through the callbacks to see what we
                # should do with this match
                attrs = {
                    (None, 'href'): 'mailto:%s' % match.group(0),
                    '_text': match.group(0)
                }
                attrs = self.apply_callbacks(attrs, True)

                if attrs is None:
                    # Just add the text--but not as a link
                    new_tokens.append(
                        {'type': 'Characters', 'data': match.group(0)}
                    )

                else:
                    # Add an "a" tag for the new link
                    _text = attrs.pop('_text', '')
                    attrs = alphabetize_attributes(attrs)
                    new_tokens.extend([
                        {'type': 'StartTag', 'name': 'a', 'data': attrs},
                        {'type': 'Characters', 'data': force_unicode(_text)},
                        {'type': 'EndTag', 'name': 'a'}
                    ])

                end = match.end()

            if new_tokens:
                # Yield the adjusted set of tokens and then continue
                # through the loop
                if end < len(text):
                    new_tokens.append({'type': 'Characters', 'data': text[end:]})

                for new_token in new_tokens:
                    yield new_token

                continue

        yield token
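
# Token-stream sketch for handle_email_addresses: given a single Characters
# token {'type': 'Characters', 'data': 'mail jim@example.com'}, and callbacks
# that keep the link, this yields roughly:
#
#     {'type': 'Characters', 'data': 'mail '}
#     {'type': 'StartTag', 'name': 'a',
#      'data': {(None, 'href'): 'mailto:jim@example.com'}}
#     {'type': 'Characters', 'data': 'jim@example.com'}
#     {'type': 'EndTag', 'name': 'a'}
#
# If any callback returns None, the address is re-emitted as plain text.
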
def handle_a_tag(self, token_buffer):
    """Handle the "a" tag

    This could adjust the link or drop it altogether depending on what the
    callbacks return.

    This yields the new set of tokens.

    """
    a_token = token_buffer[0]
    if a_token['data']:
        attrs = a_token['data']
    else:
        attrs = {}
    text = self.extract_character_data(token_buffer)
    attrs['_text'] = text

    attrs = self.apply_callbacks(attrs, False)

    if attrs is None:
        # We're dropping the "a" tag and everything else and replacing
        # it with character data. So emit that token.
        yield {'type': 'Characters', 'data': text}

    else:
        new_text = attrs.pop('_text', '')
        a_token['data'] = alphabetize_attributes(attrs)

        if text == new_text:
            # The callbacks didn't change the text, so we yield the new "a"
            # token, then whatever else was there, then the end "a" token
            yield a_token
            for mem in token_buffer[1:]:
                yield mem

        else:
            # If the callbacks changed the text, then we're going to drop
            # all the tokens between the start and end "a" tags and replace
            # it with the new text
            yield a_token
            yield {'type': 'Characters', 'data': force_unicode(new_text)}
            yield token_buffer[-1]
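
# Callback sketch: apply_callbacks invokes each callback as callback(attrs,
# is_new), where attrs maps (namespace, name) tuples to values plus the
# special '_text' key, and is_new is False for links that already existed in
# the input. A hypothetical callback that drops pre-existing external links:
#
#     def drop_external(attrs, new=False):
#         href = attrs.get((None, 'href'), '')
#         if not new and href.startswith(('http:', 'https:')):
#             return None  # None drops the tag; its text is kept (see above)
#         return attrs
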
def clean(self, text):
    """Cleans text and returns sanitized result as unicode

    :arg str text: text to be cleaned

    :returns: sanitized text as unicode

    :raises TypeError: if ``text`` is not a text type

    """
    if not isinstance(text, six.string_types):
        message = "argument cannot be of '{name}' type, must be of text type".format(
            name=text.__class__.__name__)
        raise TypeError(message)

    if not text:
        return u''

    text = force_unicode(text)

    dom = self.parser.parseFragment(text)
    filtered = BleachSanitizerFilter(
        source=self.walker(dom),

        # Bleach-sanitizer-specific things
        attributes=self.attributes,
        strip_disallowed_elements=self.strip,
        strip_html_comments=self.strip_comments,

        # html5lib-sanitizer things
        allowed_elements=self.tags,
        allowed_css_properties=self.styles,
        allowed_protocols=self.protocols,
        allowed_svg_properties=[],
    )

    # Apply any filters after the BleachSanitizerFilter
    for filter_class in self.filters:
        filtered = filter_class(source=filtered)

    return self.serializer.render(filtered)
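
# Usage sketch for clean (hedged: bleach's Cleaner; output shown for
# strip=True, where disallowed tags are removed but their text is kept):
#
#     >>> from bleach.sanitizer import Cleaner
#     >>> cleaner = Cleaner(tags=['b'], strip=True)
#     >>> cleaner.clean('<b>bold</b> <i>italic</i>')
#     '<b>bold</b> italic'
#
# With the default strip=False, disallowed tags are escaped instead:
# '&lt;i&gt;italic&lt;/i&gt;'.
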
def handle_links(self, src_iter):
    """Handle links in character tokens"""
    # True while we're inside an existing "a" tag; this can happen when
    # parse_email=True and an email link was already emitted
    in_a = False
    for token in src_iter:
        if in_a:
            if token['type'] == 'EndTag' and token['name'] == 'a':
                in_a = False
            yield token
            continue
        elif token['type'] == 'StartTag' and token['name'] == 'a':
            in_a = True
            yield token
            continue
        if token['type'] == 'Characters':
            text = token['data']
            new_tokens = []
            end = 0

            for match in self.url_re.finditer(text):
                if match.start() > end:
                    new_tokens.append(
                        {'type': 'Characters', 'data': text[end:match.start()]}
                    )

                url = match.group(0)
                prefix = suffix = ''

                # Sometimes we pick up too much in the url match, so look for
                # bits we should drop and remove them from the match
                url, prefix, suffix = self.strip_non_url_bits(url)

                # If there's no protocol, add one
                if PROTO_RE.search(url):
                    href = url
                else:
                    href = 'http://%s' % url

                attrs = {
                    (None, 'href'): href,
                    '_text': url
                }
                attrs = self.apply_callbacks(attrs, True)

                if attrs is None:
                    # Just add the text
                    new_tokens.append(
                        {'type': 'Characters', 'data': prefix + url + suffix}
                    )

                else:
                    # Add the "a" tag!
                    if prefix:
                        new_tokens.append(
                            {'type': 'Characters', 'data': prefix}
                        )

                    _text = attrs.pop('_text', '')
                    attrs = alphabetize_attributes(attrs)

                    new_tokens.extend([
                        {'type': 'StartTag', 'name': 'a', 'data': attrs},
                        {'type': 'Characters', 'data': force_unicode(_text)},
                        {'type': 'EndTag', 'name': 'a'},
                    ])

                    if suffix:
                        new_tokens.append(
                            {'type': 'Characters', 'data': suffix}
                        )

                end = match.end()

            if new_tokens:
                # Yield the adjusted set of tokens and then continue
                # through the loop
                if end < len(text):
                    new_tokens.append({'type': 'Characters', 'data': text[end:]})

                for new_token in new_tokens:
                    yield new_token

                continue

        yield token
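
# Behavior sketch for handle_links: url_re can over-match surrounding
# punctuation, which strip_non_url_bits peels off into prefix/suffix so only
# the bare URL gets wrapped. For 'see example.com.' the yield is roughly:
#
#     {'type': 'Characters', 'data': 'see '}
#     {'type': 'StartTag', 'name': 'a',
#      'data': {(None, 'href'): 'http://example.com'}}
#     {'type': 'Characters', 'data': 'example.com'}
#     {'type': 'EndTag', 'name': 'a'}
#     {'type': 'Characters', 'data': '.'}
#
# (exact splits depend on url_re, strip_non_url_bits, and the callbacks)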