def plaintext2html(text, container_tag=False): """ Convert plaintext into html. Content of the text is escaped to manage html entities, using misc.html_escape(). - all \n,\r are replaced by <br /> - enclose content into <p> - convert url into clickable link - 2 or more consecutive <br /> are considered as paragraph breaks :param string container_tag: container of the html; by default the content is embedded into a <div> """ text = misc.html_escape(ustr(text)) # 1. replace \n and \r text = text.replace('\n', '<br/>') text = text.replace('\r', '<br/>') # 2. clickable links text = html_keep_url(text) # 3-4: form paragraphs idx = 0 final = '<p>' br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})') for item in re.finditer(br_tags, text): final += text[idx:item.start()] + '</p><p>' idx = item.end() final += text[idx:] + '</p>' # 5. container if container_tag: final = '<%s>%s</%s>' % (container_tag, final, container_tag) return ustr(final)
def decode_smtp_header(smtp_header): """Returns unicode() string conversion of the given encoded smtp header text. email.header decode_header method return a decoded string and its charset for each decoded par of the header. This method unicodes the decoded header and join them in a complete string. """ if isinstance(smtp_header, Header): smtp_header = ustr(smtp_header) if smtp_header: text = decode_header(smtp_header.replace('\r', '')) return ''.join([ustr(x[0], x[1]) for x in text]) return u''
def decode_smtp_header(smtp_header): """Returns unicode() string conversion of the given encoded smtp header text. email.header decode_header method return a decoded string and its charset for each decoded par of the header. This method unicodes the decoded header and join them in a complete string. """ if isinstance(smtp_header, Header): smtp_header = ustr(smtp_header) if smtp_header: text = decode_header(smtp_header.replace('\r', '')) # The joining space will not be needed as of Python 3.3 # See https://github.com/python/cpython/commit/07ea53cb218812404cdbde820647ce6e4b2d0f8e sep = ' ' if pycompat.PY2 else '' return sep.join([ustr(x[0], x[1]) for x in text]) return u''
def remove_accents(input_str): """Suboptimal-but-better-than-nothing way to replace accented latin letters by an ASCII equivalent. Will obviously change the meaning of input_str and work only for some cases""" input_str = ustr(input_str) nkfd_form = unicodedata.normalize('NFKD', input_str) return u''.join([c for c in nkfd_form if not unicodedata.combining(c)])
def str2bool(s, default=None): s = ustr(s).lower() y = 'y yes 1 true t on'.split() n = 'n no 0 false f off'.split() if s not in (y + n): if default is None: raise ValueError('Use 0/1/yes/no/true/false/on/off') return bool(default) return s in y
def decode_smtp_header(smtp_header): """Returns unicode() string conversion of the given encoded smtp header text. email.header decode_header method return a decoded string and its charset for each decoded par of the header. This method unicodes the decoded header and join them in a complete string. """ if smtp_header: text = decode_header(smtp_header.replace('\r', '')) # The joining space will not be needed as of Python 3.3 # See https://hg.python.org/cpython/rev/8c03fe231877 return ' '.join([ustr(x[0], x[1]) for x in text]) return u''
def scan_languages(): """ Returns all languages supported by OpenERP for translation :returns: a list of (lang_code, lang_name) pairs :rtype: [(str, unicode)] """ csvpath = odoo.modules.module.get_resource_path('base', 'res', 'res.lang.csv') try: # read (code, name) from languages in base/res/res.lang.csv result = [] with open(csvpath) as csvfile: reader = csv.reader(csvfile, delimiter=',', quotechar='"') fields = next(reader) code_index = fields.index("code") name_index = fields.index("name") for row in reader: result.append((ustr(row[code_index]), ustr(row[name_index]))) except Exception: _logger.error("Could not read %s", csvpath) result = [] return sorted(result or [('en_US', u'English')], key=itemgetter(1))
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False): """ Append extra content at the end of an HTML snippet, trying to locate the end of the HTML document (</body>, </html>, or EOF), and converting the provided content in html unless ``plaintext`` is False. Content conversion can be done in two ways: - wrapping it into a pre (preserve=True) - use plaintext2html (preserve=False, using container_tag to wrap the whole content) A side-effect of this method is to coerce all HTML tags to lowercase in ``html``, and strip enclosing <html> or <body> tags in content if ``plaintext`` is False. :param str html: html tagsoup (doesn't have to be XHTML) :param str content: extra content to append :param bool plaintext: whether content is plaintext and should be wrapped in a <pre/> tag. :param bool preserve: if content is plaintext, wrap it into a <pre> instead of converting it into html """ html = ustr(html) if plaintext and preserve: content = u'\n<pre>%s</pre>\n' % ustr(content) elif plaintext: content = '\n%s\n' % plaintext2html(content, container_tag) else: content = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', content) content = u'\n%s\n' % ustr(content) # Force all tags to lowercase html = re.sub(r'(</?)(\w+)([ >])', lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html) insert_location = html.find('</body>') if insert_location == -1: insert_location = html.find('</html>') if insert_location == -1: return '%s%s' % (html, content) return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
def schema_activity(arch): """ Check the activity view against its schema :type arch: etree._Element """ global _activity_validator if _activity_validator is None: with misc.file_open(os.path.join('mail', 'views', 'activity.rng')) as f: _activity_validator = etree.RelaxNG(etree.parse(f)) if _activity_validator.validate(arch): return True for error in _activity_validator.error_log: _logger.error(ustr(error)) return False
def html2plaintext(html, body_id=None, encoding='utf-8'): """ From an HTML text, convert the HTML to plain text. If @param body_id is provided then this is the tag where the body (not necessarily <body>) starts. """ ## (c) Fry-IT, www.fry-it.com, 2007 ## <*****@*****.**> ## download here: http://www.peterbe.com/plog/html2plaintext html = ustr(html) if not html: return '' tree = etree.fromstring(html, parser=etree.HTMLParser()) if body_id is not None: source = tree.xpath('//*[@id=%s]' % (body_id,)) else: source = tree.xpath('//body') if len(source): tree = source[0] url_index = [] i = 0 for link in tree.findall('.//a'): url = link.get('href') if url: i += 1 link.tag = 'span' link.text = '%s [%s]' % (link.text, i) url_index.append(url) html = ustr(etree.tostring(tree, encoding=encoding)) # \r char is converted into , must remove it html = html.replace(' ', '') html = html.replace('<strong>', '*').replace('</strong>', '*') html = html.replace('<b>', '*').replace('</b>', '*') html = html.replace('<h3>', '*').replace('</h3>', '*') html = html.replace('<h2>', '**').replace('</h2>', '**') html = html.replace('<h1>', '**').replace('</h1>', '**') html = html.replace('<em>', '/').replace('</em>', '/') html = html.replace('<tr>', '\n') html = html.replace('</p>', '\n') html = re.sub('<br\s*/?>', '\n', html) html = re.sub('<.*?>', ' ', html) html = html.replace(' ' * 2, ' ') html = html.replace('>', '>') html = html.replace('<', '<') html = html.replace('&', '&') # strip all lines html = '\n'.join([x.strip() for x in html.splitlines()]) html = html.replace('\n' * 2, '\n') for i, url in enumerate(url_index): if i == 0: html += '\n\n' html += ustr('[%s] %s\n') % (i + 1, url) return html
def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=False, sanitize_style=False, strip_style=False, strip_classes=False): if not src: return src src = ustr(src, errors='replace') # html: remove encoding attribute inside tags doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL) src = doctype.sub(u"", src) logger = logging.getLogger(__name__ + '.html_sanitize') # html encode email tags part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL) # remove results containing cite="mid:email_like@address" (ex: blockquote cite) # cite_except = re.compile(r"^((?!cite[\s]*=['\"]).)*$", re.IGNORECASE) src = part.sub(lambda m: (u'cite=' not in m.group(1) and u'alt=' not in m.group(1)) and misc.html_escape(m.group(1)) or m.group(1), src) # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner src = src.replace(u'<%', misc.html_escape(u'<%')) src = src.replace(u'%>', misc.html_escape(u'%>')) kwargs = { 'page_structure': True, 'style': strip_style, # True = remove style tags/attrs 'sanitize_style': sanitize_style, # True = sanitize styling 'forms': True, # True = remove form tags 'remove_unknown_tags': False, 'comments': False, 'processing_instructions': False } if sanitize_tags: kwargs['allow_tags'] = allowed_tags if etree.LXML_VERSION >= (2, 3, 1): # kill_tags attribute has been added in version 2.3.1 kwargs.update({ 'kill_tags': tags_to_kill, 'remove_tags': tags_to_remove, }) else: kwargs['remove_tags'] = tags_to_kill + tags_to_remove if sanitize_attributes and etree.LXML_VERSION >= (3, 1, 0): # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style" if strip_classes: current_safe_attrs = safe_attrs - frozenset(['class']) else: current_safe_attrs = safe_attrs kwargs.update({ 'safe_attrs_only': True, 'safe_attrs': current_safe_attrs, }) else: kwargs.update({ 'safe_attrs_only': False, # keep oe-data attributes + style 'strip_classes': strip_classes, # remove classes, even when keeping other attributes }) try: # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail) cleaner = _Cleaner(**kwargs) cleaned = cleaner.clean_html(src) assert isinstance(cleaned, pycompat.text_type) # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution cleaned = cleaned.replace(u'%24', u'$') cleaned = cleaned.replace(u'%7B', u'{') cleaned = cleaned.replace(u'%7D', u'}') cleaned = cleaned.replace(u'%20', u' ') cleaned = cleaned.replace(u'%5B', u'[') cleaned = cleaned.replace(u'%5D', u']') cleaned = cleaned.replace(u'%7C', u'|') cleaned = cleaned.replace(u'<%', u'<%') cleaned = cleaned.replace(u'%>', u'%>') # html considerations so real html content match database value cleaned.replace(u'\xa0', u' ') except etree.ParserError as e: if u'empty' in pycompat.text_type(e): return u"" if not silent: raise logger.warning(u'ParserError obtained when sanitizing %r', src, exc_info=True) cleaned = u'<p>ParserError when sanitizing</p>' except Exception: if not silent: raise logger.warning(u'unknown error obtained when sanitizing %r', src, exc_info=True) cleaned = u'<p>Unknown error when sanitizing</p>' # this is ugly, but lxml/etree tostring want to put everything in a 'div' that breaks the editor -> remove that if cleaned.startswith(u'<div>') and cleaned.endswith(u'</div>'): cleaned = cleaned[5:-6] return cleaned
def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=False, sanitize_style=False, sanitize_form=True, strip_style=False, strip_classes=False): if not src: return src src = ustr(src, errors='replace') # html: remove encoding attribute inside tags doctype = re.compile( r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL) src = doctype.sub(u"", src) logger = logging.getLogger(__name__ + '.html_sanitize') # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner src = src.replace(u'<%', misc.html_escape(u'<%')) src = src.replace(u'%>', misc.html_escape(u'%>')) kwargs = { 'page_structure': True, 'style': strip_style, # True = remove style tags/attrs 'sanitize_style': sanitize_style, # True = sanitize styling 'forms': sanitize_form, # True = remove form tags 'remove_unknown_tags': False, 'comments': False, 'processing_instructions': False } if sanitize_tags: kwargs['allow_tags'] = allowed_tags if etree.LXML_VERSION >= (2, 3, 1): # kill_tags attribute has been added in version 2.3.1 kwargs.update({ 'kill_tags': tags_to_kill, 'remove_tags': tags_to_remove, }) else: kwargs['remove_tags'] = tags_to_kill + tags_to_remove if sanitize_attributes and etree.LXML_VERSION >= ( 3, 1, 0 ): # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style" if strip_classes: current_safe_attrs = safe_attrs - frozenset(['class']) else: current_safe_attrs = safe_attrs kwargs.update({ 'safe_attrs_only': True, 'safe_attrs': current_safe_attrs, }) else: kwargs.update({ 'safe_attrs_only': False, # keep oe-data attributes + style 'strip_classes': strip_classes, # remove classes, even when keeping other attributes }) try: # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail) cleaner = _Cleaner(**kwargs) cleaned = cleaner.clean_html(src) assert isinstance(cleaned, str) # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution cleaned = cleaned.replace(u'%24', u'$') cleaned = cleaned.replace(u'%7B', u'{') cleaned = cleaned.replace(u'%7D', u'}') cleaned = cleaned.replace(u'%20', u' ') cleaned = cleaned.replace(u'%5B', u'[') cleaned = cleaned.replace(u'%5D', u']') cleaned = cleaned.replace(u'%7C', u'|') cleaned = cleaned.replace(u'<%', u'<%') cleaned = cleaned.replace(u'%>', u'%>') # html considerations so real html content match database value cleaned.replace(u'\xa0', u' ') except etree.ParserError as e: if 'empty' in str(e): return u"" if not silent: raise logger.warning(u'ParserError obtained when sanitizing %r', src, exc_info=True) cleaned = u'<p>ParserError when sanitizing</p>' except Exception: if not silent: raise logger.warning(u'unknown error obtained when sanitizing %r', src, exc_info=True) cleaned = u'<p>Unknown error when sanitizing</p>' # this is ugly, but lxml/etree tostring want to put everything in a 'div' that breaks the editor -> remove that if cleaned.startswith(u'<div>') and cleaned.endswith(u'</div>'): cleaned = cleaned[5:-6] return cleaned
def build_email( self, email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False, attachments=None, message_id=None, references=None, object_id=False, subtype="plain", headers=None, body_alternative=None, subtype_alternative="plain", ): """ copy-pasted from odoo/addons/base/ir/ir_mail_server.py::build_email """ ftemplate = "__image-%s__" fcounter = 0 attachments = attachments or [] pattern = re.compile(r'"data:image/png;base64,[^"]*"') pos = 0 new_body = "" body = body or "" while True: match = pattern.search(body, pos) if not match: break s = match.start() e = match.end() data = body[s + len('"data:image/png;base64,'):e - 1] new_body += body[pos:s] fname = ftemplate % fcounter fcounter += 1 attachments.append((fname, base64.b64decode(data))) new_body += '"cid:%s"' % fname pos = e new_body += body[pos:] body = new_body email_from = email_from or tools.config.get("email_from") assert email_from, ( "You must either provide a sender address explicitly or configure " "a global sender address in the server configuration or with the " "--email-from startup parameter.") # Note: we must force all strings to to 8-bit utf-8 when crafting message, # or use encode_header() for headers, which does it automatically. headers = headers or {} # need valid dict later if not email_cc: email_cc = [] if not email_bcc: email_bcc = [] if not body: body = u"" email_body_utf8 = ustr(body).encode("utf-8") email_text_part = MIMEText(email_body_utf8, _subtype=subtype, _charset="utf-8") msg = MIMEMultipart() if not message_id: if object_id: message_id = tools.generate_tracking_message_id(object_id) else: message_id = make_msgid() msg["Message-Id"] = encode_header(message_id) if references: msg["references"] = encode_header(references) msg["Subject"] = encode_header(subject) msg["From"] = encode_rfc2822_address_header(email_from) del msg["Reply-To"] if reply_to: msg["Reply-To"] = encode_rfc2822_address_header(reply_to) else: msg["Reply-To"] = msg["From"] msg["To"] = encode_rfc2822_address_header(COMMASPACE.join(email_to)) if email_cc: msg["Cc"] = encode_rfc2822_address_header( COMMASPACE.join(email_cc)) if email_bcc: msg["Bcc"] = encode_rfc2822_address_header( COMMASPACE.join(email_bcc)) msg["Date"] = formatdate() # Custom headers may override normal headers or provide additional ones for key, value in headers.iteritems(): msg[ustr(key).encode("utf-8")] = encode_header(value) if subtype == "html" and not body_alternative and html2text: # Always provide alternative text body ourselves if possible. text_utf8 = tools.html2text( email_body_utf8.decode("utf-8")).encode("utf-8") alternative_part = MIMEMultipart(_subtype="alternative") alternative_part.attach( MIMEText(text_utf8, _charset="utf-8", _subtype="plain")) alternative_part.attach(email_text_part) msg.attach(alternative_part) elif body_alternative: # Include both alternatives, as specified, within a multipart/alternative part alternative_part = MIMEMultipart(_subtype="alternative") body_alternative_utf8 = ustr(body_alternative).encode("utf-8") alternative_body_part = MIMEText(body_alternative_utf8, _subtype=subtype_alternative, _charset="utf-8") alternative_part.attach(alternative_body_part) alternative_part.attach(email_text_part) msg.attach(alternative_part) else: msg.attach(email_text_part) if attachments: for (fname, fcontent) in attachments: filename_rfc2047 = encode_header_param(fname) part = MIMEBase("application", "octet-stream") # The default RFC2231 encoding of Message.add_header() works in Thunderbird but not GMail # so we fix it by using RFC2047 encoding for the filename instead. part.set_param("name", filename_rfc2047) part.add_header("Content-Disposition", "attachment", filename=filename_rfc2047) part.add_header("Content-ID", "<%s>" % filename_rfc2047) # NEW STUFF part.set_payload(fcontent) Encoders.encode_base64(part) msg.attach(part) return msg
def html2plaintext(html, body_id=None, encoding='utf-8'): """ From an HTML text, convert the HTML to plain text. If @param body_id is provided then this is the tag where the body (not necessarily <body>) starts. """ ## (c) Fry-IT, www.fry-it.com, 2007 ## <*****@*****.**> ## download here: http://www.peterbe.com/plog/html2plaintext html = ustr(html) if not html: return '' tree = etree.fromstring(html, parser=etree.HTMLParser()) if body_id is not None: source = tree.xpath('//*[@id=%s]' % (body_id, )) else: source = tree.xpath('//body') if len(source): tree = source[0] url_index = [] i = 0 for link in tree.findall('.//a'): url = link.get('href') if url: i += 1 link.tag = 'span' link.text = '%s [%s]' % (link.text, i) url_index.append(url) html = ustr(etree.tostring(tree, encoding=encoding)) # \r char is converted into , must remove it html = html.replace(' ', '') html = html.replace('<strong>', '*').replace('</strong>', '*') html = html.replace('<b>', '*').replace('</b>', '*') html = html.replace('<h3>', '*').replace('</h3>', '*') html = html.replace('<h2>', '**').replace('</h2>', '**') html = html.replace('<h1>', '**').replace('</h1>', '**') html = html.replace('<em>', '/').replace('</em>', '/') html = html.replace('<tr>', '\n') html = html.replace('</p>', '\n') html = re.sub('<br\s*/?>', '\n', html) html = re.sub('<.*?>', ' ', html) html = html.replace(' ' * 2, ' ') html = html.replace('>', '>') html = html.replace('<', '<') html = html.replace('&', '&') # strip all lines html = '\n'.join([x.strip() for x in html.splitlines()]) html = html.replace('\n' * 2, '\n') for i, url in enumerate(url_index): if i == 0: html += '\n\n' html += ustr('[%s] %s\n') % (i + 1, url) return html
def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=False, sanitize_style=False, strip_style=False, strip_classes=False): if not src: return src src = ustr(src, errors='replace') # html: remove encoding attribute inside tags doctype = re.compile( r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL) src = doctype.sub(r"", src) logger = logging.getLogger(__name__ + '.html_sanitize') # html encode email tags part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL) # remove results containing cite="mid:email_like@address" (ex: blockquote cite) # cite_except = re.compile(r"^((?!cite[\s]*=['\"]).)*$", re.IGNORECASE) src = part.sub( lambda m: ('cite=' not in m.group(1) and 'alt=' not in m.group(1)) and cgi.escape(m.group(1)) or m.group(1), src) # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner src = src.replace('<%', cgi.escape('<%')) src = src.replace('%>', cgi.escape('%>')) kwargs = { 'page_structure': True, 'style': strip_style, # True = remove style tags/attrs 'sanitize_style': sanitize_style, # True = sanitize styling 'forms': True, # True = remove form tags 'remove_unknown_tags': False, 'comments': False, 'processing_instructions': False } if sanitize_tags: kwargs['allow_tags'] = allowed_tags if etree.LXML_VERSION >= (2, 3, 1): # kill_tags attribute has been added in version 2.3.1 kwargs.update({ 'kill_tags': tags_to_kill, 'remove_tags': tags_to_remove, }) else: kwargs['remove_tags'] = tags_to_kill + tags_to_remove if sanitize_attributes and etree.LXML_VERSION >= ( 3, 1, 0 ): # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style" if strip_classes: current_safe_attrs = safe_attrs - frozenset(['class']) else: current_safe_attrs = safe_attrs kwargs.update({ 'safe_attrs_only': True, 'safe_attrs': current_safe_attrs, }) else: kwargs.update({ 'safe_attrs_only': False, # keep oe-data attributes + style 'strip_classes': strip_classes, # remove classes, even when keeping other attributes }) try: # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail) cleaner = _Cleaner(**kwargs) cleaned = cleaner.clean_html(src) # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution cleaned = cleaned.replace('%24', '$') cleaned = cleaned.replace('%7B', '{') cleaned = cleaned.replace('%7D', '}') cleaned = cleaned.replace('%20', ' ') cleaned = cleaned.replace('%5B', '[') cleaned = cleaned.replace('%5D', ']') cleaned = cleaned.replace('%7C', '|') cleaned = cleaned.replace('<%', '<%') cleaned = cleaned.replace('%>', '%>') except etree.ParserError, e: if 'empty' in str(e): return "" if not silent: raise logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True) cleaned = '<p>ParserError when sanitizing</p>'
def html_sanitize( src, silent=True, ): if not src: return src src = ustr(src, errors='replace') # html: remove encoding attribute inside tags doctype = re.compile( r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL) src = doctype.sub(u"", src) logger = logging.getLogger(__name__ + '.html_sanitize') # html encode email tags part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL) # remove results containing cite="mid:email_like@address" (ex: blockquote cite) # cite_except = re.compile(r"^((?!cite[\s]*=['\"]).)*$", re.IGNORECASE) src = part.sub( lambda m: (u'cite=' not in m.group(1) and u'alt=' not in m.group(1)) and misc.html_escape(m.group(1)) or m.group(1), src) # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner src = src.replace(u'<%', misc.html_escape(u'<%')) src = src.replace(u'%>', misc.html_escape(u'%>')) kwargs = { 'page_structure': True, 'style': False, 'forms': False, 'embedded': False, 'remove_unknown_tags': False, 'comments': False, 'processing_instructions': False, 'kill_tags': tags_to_kill, 'remove_tags': tags_to_remove, 'safe_attrs_only': False, } try: # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail) cleaner = _Cleaner(**kwargs) cleaned = cleaner.clean_html(src) assert isinstance(cleaned, pycompat.text_type) # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution cleaned = cleaned.replace(u'%24', u'$') cleaned = cleaned.replace(u'%7B', u'{') cleaned = cleaned.replace(u'%7D', u'}') cleaned = cleaned.replace(u'%20', u' ') cleaned = cleaned.replace(u'%5B', u'[') cleaned = cleaned.replace(u'%5D', u']') cleaned = cleaned.replace(u'%7C', u'|') cleaned = cleaned.replace(u'<%', u'<%') cleaned = cleaned.replace(u'%>', u'%>') # html considerations so real html content match database value cleaned.replace(u'\xa0', u' ') except etree.ParserError as e: if u'empty' in pycompat.text_type(e): return u"" if not silent: raise logger.warning(u'ParserError obtained when sanitizing %r', src, exc_info=True) cleaned = u'<p>ParserError when sanitizing</p>' except Exception: if not silent: raise logger.warning(u'unknown error obtained when sanitizing %r', src, exc_info=True) cleaned = u'<p>Unknown error when sanitizing</p>' # this is ugly, but lxml/etree tostring want to put everything in a 'div' that breaks the editor -> remove that if cleaned.startswith(u'<div>') and cleaned.endswith(u'</div>'): cleaned = cleaned[5:-6] return cleaned