def sanitize_payload(payload):
    """Sanitize an HTML payload and collect its style rules.

    The payload is passed through ``clean_payload``, stripped of the
    markup matched by ``HTMLTITLE_RE`` and run through an lxml
    ``Cleaner`` that removes ``UNCLEANTAGS`` and keeps only the safe
    attributes (``defs.safe_attrs`` plus ``style``).  Style information
    obtained from ``get_body_style`` and from ``get_style`` /
    ``sanitize_css`` is gathered separately from the markup.

    :param payload: the HTML to sanitize; any falsy value short-circuits.
    :returns: a ``(html, style)`` tuple, both stripped of surrounding
        whitespace.  ``('', '')`` is returned for a falsy payload.  When
        the cleaner raises ``XMLSyntaxError`` the markup is treated as an
        empty string before the remaining steps run.
    """
    if not payload:
        return '', ''

    styles = []
    payload = clean_payload(payload)
    body_style, body_class = get_body_style(payload)
    if body_style:
        styles.append(body_style)

    # Allow the ``style`` attribute on top of lxml's default safe set.
    safe_attrs = set(defs.safe_attrs)
    safe_attrs.add('style')
    cleaner = Cleaner(remove_tags=UNCLEANTAGS,
                      safe_attrs_only=True,
                      safe_attrs=safe_attrs)

    payload = HTMLTITLE_RE.sub('', payload)
    try:
        html = cleaner.clean_html(payload)
    except ValueError:
        # NOTE(review): presumably lxml rejecting text input that carries
        # an encoding declaration -- confirm.  Retry with UTF-8 bytes.
        payload = bytes(bytearray(payload, encoding='utf-8'))
        try:
            html = cleaner.clean_html(payload)
        except XMLSyntaxError:
            # A sibling ``except`` clause does not cover exceptions raised
            # inside this handler, so the retry needs its own guard to get
            # the same empty-HTML fallback as the first attempt.
            html = ''
    except XMLSyntaxError:
        html = ''

    mainstyle = sanitize_css(get_style(html))
    if mainstyle:
        styles.append(decode(mainstyle))
    style = u'\n'.join(styles)

    # Drop CSS comments before the remaining style clean-up.
    html = clean_styles(CSS_COMMENT_RE.sub('', html))
    html = set_body_class(html, body_class)
    return html.strip(), style.strip()
def sanitize_html(self, msg):
    """Return ``msg`` after cleaning it with ``CustomCleaner``.

    The cleaner is built with ``style=True``, ``remove_tags=UNCLEANTAGS``
    and ``safe_attrs_only=True``.

    :param msg: the HTML text to clean.
    :returns: whatever ``CustomCleaner.clean_html`` produces for the
        title-stripped input.
    """
    html_cleaner = CustomCleaner(
        style=True,
        remove_tags=UNCLEANTAGS,
        safe_attrs_only=True,
    )
    # Work around an lxml bug where the title is not removed: strip it
    # with HTMLTITLE_RE before handing the markup to the cleaner.
    return html_cleaner.clean_html(HTMLTITLE_RE.sub('', msg))