def clean(self, text):
    """Sanitize ``text`` and return the cleaned result as unicode.

    :arg str text: text to be cleaned

    :returns: sanitized text as unicode
    """
    # Falsy input (None, '') short-circuits to an empty unicode string.
    if not text:
        return u''

    dom = self.parser.parseFragment(force_unicode(text))
    filtered = BleachSanitizerFilter(
        source=self.walker(dom),

        # Bleach-sanitizer-specific things
        attributes=self.attributes,
        strip_disallowed_elements=self.strip,
        strip_html_comments=self.strip_comments,

        # html5lib-sanitizer things
        allowed_elements=self.tags,
        allowed_css_properties=self.styles,
        allowed_protocols=self.protocols,
        allowed_svg_properties=[],
    )

    # Chain any user-supplied filters after the sanitizer so they see
    # already-sanitized tokens.
    for filter_class in self.filters:
        filtered = filter_class(source=filtered)

    return self.serializer.render(filtered)
def linkify(self, text):
    """Linkify specified text

    :arg str text: the text to add links to

    :returns: linkified text as unicode

    :raises TypeError: if ``text`` is not a text type
    """
    # Guard clause: reject non-text input up front. The original buried the
    # raise after the main return at the end of the function, which made the
    # error path easy to miss; the message also had a grammar bug
    # ("must of" -> "must be of").
    if not isinstance(text, six.string_types):
        raise TypeError('argument must be of text type')

    text = force_unicode(text)
    if not text:
        return u''

    dom = self.parser.parseFragment(text)
    filtered = LinkifyFilter(
        source=self.walker(dom),
        callbacks=self.callbacks,
        skip_tags=self.skip_tags,
        parse_email=self.parse_email,
        url_re=self.url_re,
        email_re=self.email_re,
    )
    return self.serializer.render(filtered)
def clean(self, text):
    """Clean ``text`` and return the sanitized result as unicode.

    :arg str text: text to be cleaned

    :returns: sanitized text as unicode
    """
    if not text:
        # Nothing to sanitize.
        return u''

    text = force_unicode(text)
    fragment = self.parser.parseFragment(text)

    # Sanitize first, then run any extra filters over the sanitized stream.
    token_stream = BkBleachSanitizerFilter(
        source=self.walker(fragment),
        # Bleach-sanitizer-specific things
        attributes=self.attributes,
        strip_disallowed_elements=self.strip,
        strip_html_comments=self.strip_comments,
        # html5lib-sanitizer things
        allowed_elements=self.tags,
        allowed_css_properties=self.styles,
        allowed_protocols=self.protocols,
        allowed_svg_properties=[],
    )
    for extra_filter in self.filters:
        token_stream = extra_filter(source=token_stream)

    return self.serializer.render(token_stream)
def html_check_and_safe(value):
    """Bleach-sanitize ``value`` and mark the surviving HTML as safe."""
    extra_tags = ['div', 'br', 'font', 'p', 'table', 'tr', 'td', 'th', 'img',
                  'u', 'span', 'tbody', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                  'hr']
    tags = bleach.ALLOWED_TAGS + extra_tags
    attrs = {
        '*': ['class', 'style', 'color', 'align', 'title', 'data-toggle',
              'data-placement'],
        'a': ['href', 'rel'],
        'img': ['src', 'alt'],
    }
    style = ['line-height', 'background-color', 'font-size', 'margin-top']
    text = force_unicode(value)

    class _Sanitizer(BleachSanitizer):
        allowed_elements = tags
        allowed_attributes = attrs
        allowed_css_properties = style
        strip_disallowed_elements = True
        strip_html_comments = True
        # NOTE(review): whitelisting the 'data' protocol permits data: URIs
        # in href/src attributes; confirm this is intended, since data: URLs
        # can carry executable content in some contexts.
        allowed_protocols = ['http', 'https', 'data']

    parser = html5lib.HTMLParser(tokenizer=_Sanitizer)
    return mark_safe(bleach._render(parser.parseFragment(text)))
def render(self, name, value, attrs=None):
    """Render the CKEditor widget HTML for this form field."""
    final_attrs = self.build_attrs(attrs or {}, name=name)
    context = {
        'final_attrs': flatatt(final_attrs),
        'value': conditional_escape(force_unicode(value)),
        'id': final_attrs['id'],
    }
    return mark_safe(render_to_string('ckeditor/widget.html', context))
def render(self, name, value, attrs=None):
    """Render the CKEditor widget markup for ``name``/``value``."""
    attrs = attrs or {}
    final_attrs = self.build_attrs(attrs, name=name)
    # Escape the value before handing it to the template so raw HTML in the
    # field contents cannot break out of the widget markup.
    escaped_value = conditional_escape(force_unicode(value))
    html = render_to_string('ckeditor/widget.html', {
        'final_attrs': flatatt(final_attrs),
        'value': escaped_value,
        'id': final_attrs['id'],
    })
    return mark_safe(html)
def handle_email_addresses(self, src_iter):
    """Handle email addresses in character tokens.

    Scans each ``Characters`` token for email addresses, runs each match
    through the callbacks, and yields a rewritten token stream in which
    surviving matches are wrapped in ``<a href="mailto:...">`` tags.
    Non-``Characters`` tokens pass through unchanged.

    :arg src_iter: iterable of html5lib token dicts

    :returns: generator of token dicts
    """
    for token in src_iter:
        if token['type'] == 'Characters':
            text = token['data']
            new_tokens = []
            end = 0  # index just past the previous match in ``text``

            # For each email address we find in the text
            for match in self.email_re.finditer(text):
                if match.start() > end:
                    # Emit the plain text between the previous match and
                    # this one.
                    new_tokens.append(
                        {u'type': u'Characters', u'data': text[end:match.start()]}
                    )

                # Run attributes through the callbacks to see what we
                # should do with this match
                attrs = {
                    (None, u'href'): u'mailto:%s' % match.group(0),
                    u'_text': match.group(0)
                }
                attrs = self.apply_callbacks(attrs, True)

                if attrs is None:
                    # Just add the text--but not as a link
                    new_tokens.append(
                        {u'type': u'Characters', u'data': match.group(0)}
                    )
                else:
                    # Add an "a" tag for the new link
                    _text = attrs.pop(u'_text', '')
                    attrs = alphabetize_attributes(attrs)
                    new_tokens.extend([
                        {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                        {u'type': u'Characters', u'data': force_unicode(_text)},
                        {u'type': u'EndTag', u'name': 'a'}
                    ])
                end = match.end()

            if new_tokens:
                # Yield the adjusted set of tokens and then continue
                # through the loop
                if end < len(text):
                    # Trailing text after the last match.
                    new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                for new_token in new_tokens:
                    yield new_token

                continue

        yield token
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, strip=False, strip_comments=True):
    """Sanitize an HTML fragment and return the cleaned markup.

    Falsy input short-circuits to an empty string; otherwise the text is
    coerced to unicode, parsed with a sanitizing tokenizer built from the
    supplied whitelists, and re-rendered.
    """
    if not text:
        return ''

    text = force_unicode(text)

    class _Sanitizer(BleachSanitizer):
        # Bind the caller-supplied whitelists onto the tokenizer class.
        allowed_elements = tags
        allowed_attributes = attributes
        allowed_css_properties = styles
        strip_disallowed_elements = strip
        strip_html_comments = strip_comments

    parser = html5lib.HTMLParser(tokenizer=_Sanitizer)
    return _render(parser.parseFragment(text))
def handle_a_tag(self, token_buffer):
    """Handle the "a" tag

    This could adjust the link or drop it altogether depending on what the
    callbacks return.

    This yields the new set of tokens.
    """
    start_token = token_buffer[0]
    attrs = start_token['data'] if start_token['data'] else {}

    text = self.extract_character_data(token_buffer)
    attrs['_text'] = text

    attrs = self.apply_callbacks(attrs, False)
    if attrs is None:
        # The callbacks killed the link: replace the whole "a" element with
        # its character data.
        yield {'type': 'Characters', 'data': text}
        return

    new_text = attrs.pop('_text', '')
    start_token['data'] = alphabetize_attributes(attrs)

    if text == new_text:
        # Text untouched by the callbacks: emit the (possibly re-attributed)
        # start tag followed by the original contents and end tag.
        yield start_token
        for buffered in token_buffer[1:]:
            yield buffered
    else:
        # The callbacks rewrote the text: drop everything between the start
        # and end tags and emit the new text instead.
        yield start_token
        yield {'type': 'Characters', 'data': force_unicode(new_text)}
        yield token_buffer[-1]
def linkify(self, text):
    """Linkify specified text

    :arg str text: the text to add links to

    :returns: linkified text as unicode
    """
    text = force_unicode(text)
    if not text:
        return u''

    fragment = self.parser.parseFragment(text)

    # Hand the walked token stream to the linkify filter along with all of
    # the instance configuration.
    filter_kwargs = {
        'callbacks': self.callbacks,
        'skip_tags': self.skip_tags,
        'parse_email': self.parse_email,
        'url_re': self.url_re,
        'email_re': self.email_re,
    }
    linkified = LinkifyFilter(source=self.walker(fragment), **filter_kwargs)

    return self.serializer.render(linkified)
def html_check_and_safe(value):
    """Sanitize ``value`` with bleach and mark the result as template-safe.

    Builds a one-off sanitizing tokenizer whitelisting a fixed set of tags,
    attributes, CSS properties and protocols, then renders the cleaned
    fragment wrapped in ``mark_safe``.
    """
    tags = bleach.ALLOWED_TAGS + ['div', 'br', 'font', 'p', 'table', 'tr',
                                  'td', 'th', 'img', 'u', 'span', 'tbody',
                                  'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr']
    attrs = {
        '*': ['class', 'style', 'color', 'align', 'title', 'data-toggle',
              'data-placement'],
        'a': ['href', 'rel'],
        'img': ['src', 'alt'],
    }
    style = ['line-height', 'background-color', 'font-size', 'margin-top']
    text = force_unicode(value)

    class s(BleachSanitizer):
        allowed_elements = tags
        allowed_attributes = attrs
        allowed_css_properties = style
        strip_disallowed_elements = True
        strip_html_comments = True
        # NOTE(review): 'data' is whitelisted, so data: URIs survive
        # sanitization — confirm this is intentional.
        allowed_protocols = ['http', 'https', 'data']

    parser = html5lib.HTMLParser(tokenizer=s)
    # bleach._render is a private bleach helper; see the module it comes from.
    return mark_safe(bleach._render(parser.parseFragment(text)))
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Clean an HTML fragment of malicious content and return it

    This function is a security-focused function whose sole purpose is to
    remove malicious content from a string such that it can be displayed as
    content in a web page.

    This function is not designed to use to transform content to be used in
    non-web-page contexts.

    :arg text: the text to clean

    :arg tags: whitelist of allowed tags; defaults to
        ``bleach.ALLOWED_TAGS``

    :arg attributes: whitelist of allowed attributes; defaults to
        ``bleach.ALLOWED_ATTRIBUTES``

    :arg styles: whitelist of allowed css; defaults to
        ``bleach.ALLOWED_STYLES``

    :arg protocols: whitelist of allowed protocols for links; defaults
        to ``bleach.ALLOWED_PROTOCOLS``

    :arg strip: whether or not to strip disallowed elements

    :arg strip_comments: whether or not to strip HTML comments

    :returns: cleaned text as unicode

    """
    if not text:
        return u''

    text = force_unicode(text)

    # Parse without HTML namespacing so tag names compare as plain strings.
    parser = html5lib.HTMLParser(namespaceHTMLElements=False)
    dom = parser.parseFragment(text)
    walker = html5lib.getTreeWalker('etree')
    filtered = BleachSanitizerFilter(
        source=walker(dom),

        # Bleach-sanitizer-specific things
        # NOTE(review): other clean()/Cleaner variants in this file pass this
        # map as ``attributes=``; confirm ``allowed_attributes_map`` matches
        # the keyword actually declared by BleachSanitizerFilter here.
        allowed_attributes_map=attributes,
        strip_disallowed_elements=strip,
        strip_html_comments=strip_comments,

        # html5lib-sanitizer things
        allowed_elements=tags,
        allowed_css_properties=styles,
        allowed_protocols=protocols,
        allowed_svg_properties=[],
    )

    s = HTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,

        # Bleach has its own sanitizer, so don't use the html5lib one
        sanitize=False,

        # Bleach sanitizer alphabetizes already, so don't use the html5lib one
        alphabetical_attributes=False,
    )
    return s.render(filtered)
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
            parse_email=False):
    """Convert URL-like strings in an HTML fragment to links

    ``linkify()`` converts strings that look like URLs, domain names and
    email addresses in text that may be an HTML fragment to links, while
    preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    ``linkify()`` does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """
    text = force_unicode(text)

    if not text:
        return u''

    parser = html5lib.HTMLParser()

    forest = parser.parseFragment(text)
    # Elements we've already created or processed, so they aren't linkified
    # twice.
    _seen = set()

    def replace_nodes(tree, new_frag, node, index=0):
        """Doesn't really replace nodes, but inserts the nodes contained in
        ``new_frag`` into ``tree`` at position ``index`` and returns the
        number of nodes inserted.

        If ``node`` is passed in, it is removed from the resulting tree.

        :arg tree: tree
        :arg new_frag: fragment of html text to insert
        :arg node: the node to "replace"
        :arg index: the index position to focus on
        :returns: number of nodes inserted so that you can skip ahead
        """
        count = 0
        new_tree = parser.parseFragment(new_frag)
        # capture any non-tag text at the start of the fragment
        if new_tree.text:
            if index == 0:
                tree.text = (tree.text or '') + new_tree.text
            else:
                tree[index - 1].tail = (tree[index - 1].tail or '') + new_tree.text
        # then put in the tagged elements into the old tree
        for n in new_tree:
            if n.tag == ETREE_TAG('a'):
                _seen.add(n)
            tree.insert(index + count, n)
            count += 1
        # if we got a node to remove...
        if node is not None:
            # first, grab the node tail so we don't lose text
            if node.tail:
                if index + count == 0:
                    tree.text = (tree.text or '') + node.tail
                else:
                    tree[index + count - 1].tail = (tree[index + count - 1].tail or '') + node.tail
            tree.remove(node)
        return count

    def strip_wrapping_parentheses(fragment):
        """Strips wrapping parentheses.

        Returns a tuple of the following format::

            (string stripped from wrapping parentheses,
             count of stripped opening parentheses,
             count of stripped closing parentheses)
        """
        opening_parentheses = closing_parentheses = 0
        # Count consecutive opening parentheses
        # at the beginning of the fragment (string).
        for char in fragment:
            if char == '(':
                opening_parentheses += 1
            else:
                break

        if opening_parentheses:
            newer_frag = ''
            # Cut the consecutive opening brackets from the fragment.
            fragment = fragment[opening_parentheses:]
            # Reverse the fragment for easier detection of parentheses
            # inside the URL.
            reverse_fragment = fragment[::-1]
            skip = False
            for char in reverse_fragment:
                # Remove the closing parentheses if it has a matching
                # opening parentheses (they are balanced).
                if (char == ')' and
                        closing_parentheses < opening_parentheses and
                        not skip):
                    closing_parentheses += 1
                    continue
                # Do not remove ')' from the URL itself.
                elif char != ')':
                    skip = True
                newer_frag += char
            fragment = newer_frag[::-1]

        return fragment, opening_parentheses, closing_parentheses

    def apply_callbacks(attrs, new):
        # Thread attrs through every callback; any callback returning None
        # vetoes the link entirely.
        for cb in callbacks:
            attrs = cb(attrs, new)
            if attrs is None:
                return None
        return attrs

    def _render_inner(node):
        # Serialize the children (and their tails) of node, without node's
        # own tag.
        out = ['' if node.text is None else node.text]
        for subnode in node:
            out.append(_render(subnode))
            if subnode.tail:
                out.append(subnode.tail)
        return ''.join(out)

    def linkify_nodes(tree, parse_text=True):
        children = len(tree)
        current_child = -1
        # start at -1 to process the parent first
        while current_child < len(tree):
            if current_child < 0:
                node = tree
                if parse_text and node.text:
                    new_txt = old_txt = node.text
                    if parse_email:
                        new_txt = re.sub(email_re, email_repl, node.text)
                        if new_txt and new_txt != node.text:
                            node.text = ''
                            adj = replace_nodes(tree, new_txt, None, 0)
                            children += adj
                            current_child += adj
                            linkify_nodes(tree, True)
                            continue
                    new_txt = re.sub(url_re, link_repl, new_txt)
                    if new_txt != old_txt:
                        node.text = ''
                        adj = replace_nodes(tree, new_txt, None, 0)
                        children += adj
                        current_child += adj
                        continue
            else:
                node = tree[current_child]

            if parse_text and node.tail:
                new_tail = old_tail = node.tail
                if parse_email:
                    new_tail = re.sub(email_re, email_repl, new_tail)
                    if new_tail != node.tail:
                        node.tail = ''
                        adj = replace_nodes(tree, new_tail, None, current_child + 1)
                        # Insert the new nodes made from my tail into
                        # the tree right after me. current_child+1
                        children += adj
                        continue
                new_tail = re.sub(url_re, link_repl, new_tail)
                if new_tail != old_tail:
                    node.tail = ''
                    adj = replace_nodes(tree, new_tail, None, current_child + 1)
                    children += adj

            if node.tag == ETREE_TAG('a') and not (node in _seen):
                if not node.get('href', None) is None:
                    attrs = dict(node.items())
                    _text = attrs['_text'] = _render_inner(node)
                    attrs = apply_callbacks(attrs, False)
                    if attrs is None:
                        # # <a> tag replaced by the text within it
                        adj = replace_nodes(tree, _text, node, current_child)
                        # pull back current_child by 1 to scan the new nodes
                        # again.
                        current_child -= 1
                    else:
                        text = force_unicode(attrs.pop('_text'))
                        for attr_key, attr_val in attrs.items():
                            node.set(attr_key, attr_val)
                        for n in reversed(list(node)):
                            node.remove(n)
                        text = parser.parseFragment(text)
                        node.text = text.text
                        for n in text:
                            node.append(n)
                        _seen.add(node)
            elif current_child >= 0:
                if node.tag == ETREE_TAG('pre') and skip_pre:
                    linkify_nodes(node, False)
                elif not (node in _seen):
                    linkify_nodes(node, parse_text)
            current_child += 1

    def email_repl(match):
        # NOTE(review): this replace is a no-op — replacing '"' with '"'
        # does nothing. It looks like an HTML-entity-decoding corruption of
        # replace('"', '&quot;'); confirm against upstream bleach.
        addr = match.group(0).replace('"', '"')
        link = {
            '_text': addr,
            'href': 'mailto:{0!s}'.format(addr),
        }
        link = apply_callbacks(link, True)

        if link is None:
            return addr

        _href = link.pop('href')
        _text = link.pop('_text')

        repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
        attr = '{0!s}="{1!s}"'
        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
        return repl.format(_href, attribs, _text)

    def link_repl(match):
        url = match.group(0)
        open_brackets = close_brackets = 0
        if url.startswith('('):
            _wrapping = strip_wrapping_parentheses(url)
            url, open_brackets, close_brackets = _wrapping
        if url.endswith(')') and '(' not in url:
            # This is a clumsy handling for the case where we have something
            # like (foo http://example.com) and the ) gets picked up by the
            # url_re but we don't want it part of the link.
            new_url = url.rstrip(')')
            close_brackets += len(url) - len(new_url)
            url = new_url
        end = ''
        m = re.search(punct_re, url)
        if m:
            end = m.group(0)
            url = url[0:m.start()]

        if re.search(proto_re, url):
            href = url
        else:
            href = ''.join(['http://', url])

        link = {
            '_text': url,
            'href': href,
        }

        link = apply_callbacks(link, True)

        if link is None:
            return '(' * open_brackets + url + ')' * close_brackets

        _text = link.pop('_text')
        _href = link.pop('href')

        repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
        attr = '{0!s}="{1!s}"'
        attribs = ' '.join(attr.format(k, v) for k, v in link.items())

        return repl.format('(' * open_brackets, _href, attribs, _text, end,
                           ')' * close_brackets)

    try:
        linkify_nodes(forest)
    except RuntimeError as e:
        # If we hit the max recursion depth, just return what we've got.
        log.exception('Probable recursion error: {0!r}'.format(e))

    return _render(forest)
def handle_links(self, src_iter):
    """Handle links in character tokens.

    Scans each ``Characters`` token for URL-like strings, runs each match
    through the callbacks, and yields a rewritten token stream where the
    surviving matches become ``<a>`` elements. Other tokens pass through
    unchanged.

    :arg src_iter: iterable of html5lib token dicts

    :returns: generator of token dicts
    """
    for token in src_iter:
        if token['type'] == 'Characters':
            text = token['data']
            new_tokens = []
            end = 0  # index just past the previous match in ``text``

            for match in self.url_re.finditer(text):
                if match.start() > end:
                    # Emit the plain text between matches.
                    new_tokens.append(
                        {u'type': u'Characters', u'data': text[end:match.start()]}
                    )

                url = match.group(0)
                prefix = suffix = ''

                # Sometimes we pick up too much in the url match, so look for
                # bits we should drop and remove them from the match
                url, prefix, suffix = self.strip_non_url_bits(url)

                # If there's no protocol, add one
                if PROTO_RE.search(url):
                    href = url
                else:
                    href = u'http://%s' % url

                attrs = {
                    (None, u'href'): href,
                    u'_text': url
                }
                attrs = self.apply_callbacks(attrs, True)

                if attrs is None:
                    # Just add the text
                    new_tokens.append(
                        {u'type': u'Characters', u'data': prefix + url + suffix}
                    )
                else:
                    # Add the "a" tag!
                    if prefix:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': prefix}
                        )

                    _text = attrs.pop(u'_text', '')
                    attrs = alphabetize_attributes(attrs)

                    new_tokens.extend([
                        {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                        {u'type': u'Characters', u'data': force_unicode(_text)},
                        {u'type': u'EndTag', u'name': 'a'},
                    ])

                    if suffix:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': suffix}
                        )

                end = match.end()

            if new_tokens:
                # Yield the adjusted set of tokens and then continue
                # through the loop
                if end < len(text):
                    # Trailing text after the last match.
                    new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                for new_token in new_tokens:
                    yield new_token

                continue

        yield token
def linkify_nodes(tree, parse_text=True):
    """Walk ``tree`` and linkify text/tail content and existing <a> tags.

    Relies on names from the enclosing scope: ``parse_email``, ``email_re``,
    ``url_re``, ``email_repl``, ``link_repl``, ``replace_nodes``,
    ``apply_callbacks``, ``_render_inner``, ``parser``, ``skip_pre``,
    ``_seen`` and ``ETREE_TAG``.
    """
    children = len(tree)
    current_child = -1
    # start at -1 to process the parent first
    while current_child < len(tree):
        if current_child < 0:
            # First pass: handle the parent node's own leading text.
            node = tree
            if parse_text and node.text:
                new_txt = old_txt = node.text
                if parse_email:
                    new_txt = re.sub(email_re, email_repl, node.text)
                    if new_txt and new_txt != node.text:
                        node.text = ''
                        adj = replace_nodes(tree, new_txt, None, 0)
                        children += adj
                        current_child += adj
                        linkify_nodes(tree, True)
                        continue
                new_txt = re.sub(url_re, link_repl, new_txt)
                if new_txt != old_txt:
                    node.text = ''
                    adj = replace_nodes(tree, new_txt, None, 0)
                    children += adj
                    current_child += adj
                    continue
        else:
            node = tree[current_child]

        if parse_text and node.tail:
            new_tail = old_tail = node.tail
            if parse_email:
                new_tail = re.sub(email_re, email_repl, new_tail)
                if new_tail != node.tail:
                    node.tail = ''
                    adj = replace_nodes(tree, new_tail, None, current_child + 1)
                    # Insert the new nodes made from my tail into
                    # the tree right after me. current_child+1
                    children += adj
                    continue
            new_tail = re.sub(url_re, link_repl, new_tail)
            if new_tail != old_tail:
                node.tail = ''
                adj = replace_nodes(tree, new_tail, None, current_child + 1)
                children += adj

        if node.tag == ETREE_TAG('a') and not (node in _seen):
            if not node.get('href', None) is None:
                attrs = dict(node.items())
                _text = attrs['_text'] = _render_inner(node)
                attrs = apply_callbacks(attrs, False)
                if attrs is None:
                    # # <a> tag replaced by the text within it
                    adj = replace_nodes(tree, _text, node, current_child)
                    # pull back current_child by 1 to scan the new nodes
                    # again.
                    current_child -= 1
                else:
                    # Callbacks kept the link: apply attrs and re-parse the
                    # (possibly changed) text back into the node.
                    text = force_unicode(attrs.pop('_text'))
                    for attr_key, attr_val in attrs.items():
                        node.set(attr_key, attr_val)
                    for n in reversed(list(node)):
                        node.remove(n)
                    text = parser.parseFragment(text)
                    node.text = text.text
                    for n in text:
                        node.append(n)
                    _seen.add(node)
        elif current_child >= 0:
            if node.tag == ETREE_TAG('pre') and skip_pre:
                # Inside <pre> with skip_pre: descend but don't parse text.
                linkify_nodes(node, False)
            elif not (node in _seen):
                linkify_nodes(node, parse_text)
        current_child += 1
def _render(tree):
    """Try rendering as HTML, then XML, then give up."""
    serialized = _serialize(tree)
    return force_unicode(serialized)
def linkify_nodes(tree, parse_text=True):
    """Walk ``tree`` and linkify text/tail content and existing <a> tags.

    Relies on names from the enclosing scope: ``parse_email``, ``email_re``,
    ``url_re``, ``email_repl``, ``link_repl``, ``replace_nodes``,
    ``apply_callbacks``, ``_render_inner``, ``parser``, ``skip_pre``,
    ``_seen`` and ``ETREE_TAG``.
    """
    children = len(tree)
    current_child = -1
    # start at -1 to process the parent first
    while current_child < len(tree):
        if current_child < 0:
            # First pass: handle the parent node's own leading text.
            node = tree
            if parse_text and node.text:
                new_txt = old_txt = node.text
                if parse_email:
                    new_txt = re.sub(email_re, email_repl, node.text)
                    if new_txt and new_txt != node.text:
                        node.text = ''
                        adj = replace_nodes(tree, new_txt, None, 0)
                        children += adj
                        current_child += adj
                        linkify_nodes(tree, True)
                        continue
                new_txt = re.sub(url_re, link_repl, new_txt)
                if new_txt != old_txt:
                    node.text = ''
                    adj = replace_nodes(tree, new_txt, None, 0)
                    children += adj
                    current_child += adj
                    continue
        else:
            node = tree[current_child]

        if parse_text and node.tail:
            new_tail = old_tail = node.tail
            if parse_email:
                new_tail = re.sub(email_re, email_repl, new_tail)
                if new_tail != node.tail:
                    node.tail = ''
                    adj = replace_nodes(tree, new_tail, None, current_child + 1)
                    # Insert the new nodes made from my tail into
                    # the tree right after me. current_child+1
                    children += adj
                    continue
            new_tail = re.sub(url_re, link_repl, new_tail)
            if new_tail != old_tail:
                node.tail = ''
                adj = replace_nodes(tree, new_tail, None, current_child + 1)
                children += adj

        if node.tag == ETREE_TAG('a') and not (node in _seen):
            if not node.get('href', None) is None:
                attrs = dict(node.items())
                _text = attrs['_text'] = _render_inner(node)
                attrs = apply_callbacks(attrs, False)
                if attrs is None:
                    # <a> tag replaced by the text within it
                    adj = replace_nodes(tree, _text, node, current_child)
                    current_child -= 1
                    # pull back current_child by 1 to scan the
                    # new nodes again.
                else:
                    # Callbacks kept the link: apply attrs and re-parse the
                    # (possibly changed) text back into the node.
                    text = force_unicode(attrs.pop('_text'))
                    for attr_key, attr_val in attrs.items():
                        node.set(attr_key, attr_val)
                    for n in reversed(list(node)):
                        node.remove(n)
                    text = parser.parseFragment(text)
                    node.text = text.text
                    for n in text:
                        node.append(n)
                    _seen.add(node)
        elif current_child >= 0:
            if node.tag == ETREE_TAG('pre') and skip_pre:
                # Inside <pre> with skip_pre: descend but don't parse text.
                linkify_nodes(node, False)
            elif not (node in _seen):
                linkify_nodes(node, True)
        current_child += 1
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
            parse_email=False, tokenizer=HTMLSanitizer):
    """Convert URL-like strings in an HTML fragment to links.

    linkify() converts strings that look like URLs or domain names in a blob
    of text that may be an HTML fragment to links, while preserving (a) links
    already in the string, (b) urls found in attributes, and (c) email
    addresses.
    """
    text = force_unicode(text)

    if not text:
        return ''

    parser = html5lib.HTMLParser(tokenizer=tokenizer)

    forest = parser.parseFragment(text)
    # Elements already created or processed, so they aren't linkified twice.
    _seen = set([])

    def replace_nodes(tree, new_frag, node, index=0):
        """
        Doesn't really replace nodes, but inserts the nodes contained in
        new_frag into the tree at position index and returns the number
        of nodes inserted.

        If node is passed in, it is removed from the tree.

        NOTE(review): unlike newer variants of this helper, the removed
        node's tail text is not rescued before tree.remove(node) — text
        following the removed node appears to be dropped; confirm against
        upstream bleach.
        """
        count = 0
        new_tree = parser.parseFragment(new_frag)
        # capture any non-tag text at the start of the fragment
        if new_tree.text:
            if index == 0:
                tree.text = tree.text or ''
                tree.text += new_tree.text
            else:
                tree[index - 1].tail = tree[index - 1].tail or ''
                tree[index - 1].tail += new_tree.text
        # then put in the tagged elements into the old tree
        for n in new_tree:
            if n.tag == ETREE_TAG('a'):
                _seen.add(n)
            tree.insert(index + count, n)
            count += 1
        # if we got a node to remove...
        if node is not None:
            tree.remove(node)
        return count

    def strip_wrapping_parentheses(fragment):
        """Strips wrapping parentheses.

        Returns a tuple of the following format::

            (string stripped from wrapping parentheses,
             count of stripped opening parentheses,
             count of stripped closing parentheses)
        """
        opening_parentheses = closing_parentheses = 0
        # Count consecutive opening parentheses
        # at the beginning of the fragment (string).
        for char in fragment:
            if char == '(':
                opening_parentheses += 1
            else:
                break

        if opening_parentheses:
            newer_frag = ''
            # Cut the consecutive opening brackets from the fragment.
            fragment = fragment[opening_parentheses:]
            # Reverse the fragment for easier detection of parentheses
            # inside the URL.
            reverse_fragment = fragment[::-1]
            skip = False
            for char in reverse_fragment:
                # Remove the closing parentheses if it has a matching
                # opening parentheses (they are balanced).
                if (char == ')' and
                        closing_parentheses < opening_parentheses and
                        not skip):
                    closing_parentheses += 1
                    continue
                # Do not remove ')' from the URL itself.
                elif char != ')':
                    skip = True
                newer_frag += char
            fragment = newer_frag[::-1]

        return fragment, opening_parentheses, closing_parentheses

    def apply_callbacks(attrs, new):
        # Thread attrs through every callback; any callback returning None
        # vetoes the link entirely.
        for cb in callbacks:
            attrs = cb(attrs, new)
            if attrs is None:
                return None
        return attrs

    def _render_inner(node):
        # Serialize the children (and their tails) of node, without node's
        # own tag.
        out = ['' if node.text is None else node.text]
        for subnode in node:
            out.append(_render(subnode))
            if subnode.tail:
                out.append(subnode.tail)
        return ''.join(out)

    def linkify_nodes(tree, parse_text=True):
        children = len(tree)
        current_child = -1
        # start at -1 to process the parent first
        while current_child < len(tree):
            if current_child < 0:
                node = tree
                if parse_text and node.text:
                    new_txt = old_txt = node.text
                    if parse_email:
                        new_txt = re.sub(email_re, email_repl, node.text)
                        if new_txt and new_txt != node.text:
                            node.text = ''
                            adj = replace_nodes(tree, new_txt, None, 0)
                            children += adj
                            current_child += adj
                            linkify_nodes(tree, True)
                            continue
                    new_txt = re.sub(url_re, link_repl, new_txt)
                    if new_txt != old_txt:
                        node.text = ''
                        adj = replace_nodes(tree, new_txt, None, 0)
                        children += adj
                        current_child += adj
                        continue
            else:
                node = tree[current_child]

            if parse_text and node.tail:
                new_tail = old_tail = node.tail
                if parse_email:
                    new_tail = re.sub(email_re, email_repl, new_tail)
                    if new_tail != node.tail:
                        node.tail = ''
                        adj = replace_nodes(tree, new_tail, None, current_child + 1)
                        # Insert the new nodes made from my tail into
                        # the tree right after me. current_child+1
                        children += adj
                        continue
                new_tail = re.sub(url_re, link_repl, new_tail)
                if new_tail != old_tail:
                    node.tail = ''
                    adj = replace_nodes(tree, new_tail, None, current_child + 1)
                    children += adj

            if node.tag == ETREE_TAG('a') and not (node in _seen):
                if not node.get('href', None) is None:
                    attrs = dict(node.items())
                    _text = attrs['_text'] = _render_inner(node)
                    attrs = apply_callbacks(attrs, False)
                    if attrs is None:
                        # <a> tag replaced by the text within it
                        adj = replace_nodes(tree, _text, node, current_child)
                        current_child -= 1
                        # pull back current_child by 1 to scan the
                        # new nodes again.
                    else:
                        text = force_unicode(attrs.pop('_text'))
                        for attr_key, attr_val in attrs.items():
                            node.set(attr_key, attr_val)
                        for n in reversed(list(node)):
                            node.remove(n)
                        text = parser.parseFragment(text)
                        node.text = text.text
                        for n in text:
                            node.append(n)
                        _seen.add(node)
            elif current_child >= 0:
                if node.tag == ETREE_TAG('pre') and skip_pre:
                    linkify_nodes(node, False)
                elif not (node in _seen):
                    linkify_nodes(node, True)
            current_child += 1

    def email_repl(match):
        # NOTE(review): this replace is a no-op — replacing '"' with '"'
        # does nothing. It looks like an HTML-entity-decoding corruption of
        # replace('"', '&quot;'); confirm against upstream bleach.
        addr = match.group(0).replace('"', '"')
        link = {
            '_text': addr,
            'href': 'mailto:{0!s}'.format(addr),
        }
        link = apply_callbacks(link, True)

        if link is None:
            return addr

        _href = link.pop('href')
        _text = link.pop('_text')

        repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
        attr = '{0!s}="{1!s}"'
        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
        return repl.format(_href, attribs, _text)

    def link_repl(match):
        url = match.group(0)
        open_brackets = close_brackets = 0
        if url.startswith('('):
            _wrapping = strip_wrapping_parentheses(url)
            url, open_brackets, close_brackets = _wrapping
        end = ''
        m = re.search(punct_re, url)
        if m:
            end = m.group(0)
            url = url[0:m.start()]

        if re.search(proto_re, url):
            href = url
        else:
            href = ''.join(['http://', url])

        link = {
            '_text': url,
            'href': href,
        }

        link = apply_callbacks(link, True)

        if link is None:
            return '(' * open_brackets + url + ')' * close_brackets

        _text = link.pop('_text')
        _href = link.pop('href')

        repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
        attr = '{0!s}="{1!s}"'
        attribs = ' '.join(attr.format(k, v) for k, v in link.items())

        return repl.format('(' * open_brackets, _href, attribs, _text, end,
                           ')' * close_brackets)

    try:
        linkify_nodes(forest)
    except RuntimeError as e:
        # If we hit the max recursion depth, just return what we've got.
        log.exception('Probable recursion error: {0!r}'.format(e))

    return _render(forest)