Exemplo n.º 1
0
Arquivo: mail.py Projeto: astirpe/odoo
def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
        html entities, using misc.html_escape().
        - all \n,\r are replaced by <br />
        - enclose content into <p>
        - convert url into clickable link
        - 2 or more consecutive <br /> are considered as paragraph breaks

        :param string container_tag: container of the html; by default the
            content is embedded into a <div>
    """
    text = misc.html_escape(ustr(text))

    # 1. replace \n and \r
    text = text.replace('\n', '<br/>')
    text = text.replace('\r', '<br/>')

    # 2. clickable links
    text = html_keep_url(text)

    # 3-4: form paragraphs
    idx = 0
    final = '<p>'
    br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
    for item in re.finditer(br_tags, text):
        final += text[idx:item.start()] + '</p><p>'
        idx = item.end()
    final += text[idx:] + '</p>'

    # 5. container
    if container_tag:
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return ustr(final)
Exemplo n.º 2
0
Arquivo: mail.py Projeto: Vauxoo/odoo
def decode_smtp_header(smtp_header):
    """Returns unicode() string conversion of the given encoded smtp header
    text. email.header decode_header method return a decoded string and its
    charset for each decoded par of the header. This method unicodes the
    decoded header and join them in a complete string. """
    if isinstance(smtp_header, Header):
        smtp_header = ustr(smtp_header)
    if smtp_header:
        text = decode_header(smtp_header.replace('\r', ''))
        return ''.join([ustr(x[0], x[1]) for x in text])
    return u''
Exemplo n.º 3
0
Arquivo: mail.py Projeto: astirpe/odoo
def decode_smtp_header(smtp_header):
    """Returns unicode() string conversion of the given encoded smtp header
    text. email.header decode_header method return a decoded string and its
    charset for each decoded par of the header. This method unicodes the
    decoded header and join them in a complete string. """
    if isinstance(smtp_header, Header):
        smtp_header = ustr(smtp_header)
    if smtp_header:
        text = decode_header(smtp_header.replace('\r', ''))
        # The joining space will not be needed as of Python 3.3
        # See https://github.com/python/cpython/commit/07ea53cb218812404cdbde820647ce6e4b2d0f8e
        sep = ' ' if pycompat.PY2 else ''
        return sep.join([ustr(x[0], x[1]) for x in text])
    return u''
Exemplo n.º 4
0
Arquivo: misc.py Projeto: bitctrl/odoo
def remove_accents(input_str):
    """Suboptimal-but-better-than-nothing way to replace accented
    latin letters by an ASCII equivalent. Will obviously change the
    meaning of input_str and work only for some cases"""
    input_str = ustr(input_str)
    nkfd_form = unicodedata.normalize('NFKD', input_str)
    return u''.join([c for c in nkfd_form if not unicodedata.combining(c)])
Exemplo n.º 5
0
Arquivo: misc.py Projeto: mscenna/odoo
def str2bool(s, default=None):
    s = ustr(s).lower()
    y = 'y yes 1 true t on'.split()
    n = 'n no 0 false f off'.split()
    if s not in (y + n):
        if default is None:
            raise ValueError('Use 0/1/yes/no/true/false/on/off')
        return bool(default)
    return s in y
Exemplo n.º 6
0
def decode_smtp_header(smtp_header):
    """Returns unicode() string conversion of the given encoded smtp header
    text. email.header decode_header method return a decoded string and its
    charset for each decoded par of the header. This method unicodes the
    decoded header and join them in a complete string. """
    if smtp_header:
        text = decode_header(smtp_header.replace('\r', ''))
        # The joining space will not be needed as of Python 3.3
        # See https://hg.python.org/cpython/rev/8c03fe231877
        return ' '.join([ustr(x[0], x[1]) for x in text])
    return u''
Exemplo n.º 7
0
def scan_languages():
    """ Returns all languages supported by OpenERP for translation

    :returns: a list of (lang_code, lang_name) pairs
    :rtype: [(str, unicode)]
    """
    csvpath = odoo.modules.module.get_resource_path('base', 'res', 'res.lang.csv')
    try:
        # read (code, name) from languages in base/res/res.lang.csv
        result = []
        with open(csvpath) as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            fields = next(reader)
            code_index = fields.index("code")
            name_index = fields.index("name")
            for row in reader:
                result.append((ustr(row[code_index]), ustr(row[name_index])))
    except Exception:
        _logger.error("Could not read %s", csvpath)
        result = []

    return sorted(result or [('en_US', u'English')], key=itemgetter(1))
Exemplo n.º 8
0
Arquivo: mail.py Projeto: astirpe/odoo
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
    """ Append extra content at the end of an HTML snippet, trying
        to locate the end of the HTML document (</body>, </html>, or
        EOF), and converting the provided content in html unless ``plaintext``
        is False.
        Content conversion can be done in two ways:
        - wrapping it into a pre (preserve=True)
        - use plaintext2html (preserve=False, using container_tag to wrap the
            whole content)
        A side-effect of this method is to coerce all HTML tags to
        lowercase in ``html``, and strip enclosing <html> or <body> tags in
        content if ``plaintext`` is False.

        :param str html: html tagsoup (doesn't have to be XHTML)
        :param str content: extra content to append
        :param bool plaintext: whether content is plaintext and should
            be wrapped in a <pre/> tag.
        :param bool preserve: if content is plaintext, wrap it into a <pre>
            instead of converting it into html
    """
    html = ustr(html)
    if plaintext and preserve:
        content = u'\n<pre>%s</pre>\n' % ustr(content)
    elif plaintext:
        content = '\n%s\n' % plaintext2html(content, container_tag)
    else:
        content = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', content)
        content = u'\n%s\n' % ustr(content)
    # Force all tags to lowercase
    html = re.sub(r'(</?)(\w+)([ >])',
        lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return '%s%s' % (html, content)
    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
Exemplo n.º 9
0
def schema_activity(arch):
    """ Check the activity view against its schema

    :type arch: etree._Element
    """
    global _activity_validator

    if _activity_validator is None:
        with misc.file_open(os.path.join('mail', 'views', 'activity.rng')) as f:
            _activity_validator = etree.RelaxNG(etree.parse(f))

    if _activity_validator.validate(arch):
        return True

    for error in _activity_validator.error_log:
        _logger.error(ustr(error))
    return False
Exemplo n.º 10
0
Arquivo: mail.py Projeto: astirpe/odoo
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)

    if not html:
        return ''

    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            link.text = '%s [%s]' % (link.text, i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub('<br\s*/?>', '\n', html)
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)

    return html
Exemplo n.º 11
0
Arquivo: mail.py Projeto: astirpe/odoo
def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=False, sanitize_style=False, strip_style=False, strip_classes=False):
    if not src:
        return src
    src = ustr(src, errors='replace')
    # html: remove encoding attribute inside tags
    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
    src = doctype.sub(u"", src)

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    # remove results containing cite="mid:email_like@address" (ex: blockquote cite)
    # cite_except = re.compile(r"^((?!cite[\s]*=['\"]).)*$", re.IGNORECASE)
    src = part.sub(lambda m: (u'cite=' not in m.group(1) and u'alt=' not in m.group(1)) and misc.html_escape(m.group(1)) or m.group(1), src)
    # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner
    src = src.replace(u'<%', misc.html_escape(u'<%'))
    src = src.replace(u'%>', misc.html_escape(u'%>'))

    kwargs = {
        'page_structure': True,
        'style': strip_style,              # True = remove style tags/attrs
        'sanitize_style': sanitize_style,  # True = sanitize styling
        'forms': True,                     # True = remove form tags
        'remove_unknown_tags': False,
        'comments': False,
        'processing_instructions': False
    }
    if sanitize_tags:
        kwargs['allow_tags'] = allowed_tags
        if etree.LXML_VERSION >= (2, 3, 1):
            # kill_tags attribute has been added in version 2.3.1
            kwargs.update({
                'kill_tags': tags_to_kill,
                'remove_tags': tags_to_remove,
            })
        else:
            kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if sanitize_attributes and etree.LXML_VERSION >= (3, 1, 0):  # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
        if strip_classes:
            current_safe_attrs = safe_attrs - frozenset(['class'])
        else:
            current_safe_attrs = safe_attrs
        kwargs.update({
            'safe_attrs_only': True,
            'safe_attrs': current_safe_attrs,
        })
    else:
        kwargs.update({
            'safe_attrs_only': False,  # keep oe-data attributes + style
            'strip_classes': strip_classes,  # remove classes, even when keeping other attributes
        })

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = _Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        assert isinstance(cleaned, pycompat.text_type)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace(u'%24', u'$')
        cleaned = cleaned.replace(u'%7B', u'{')
        cleaned = cleaned.replace(u'%7D', u'}')
        cleaned = cleaned.replace(u'%20', u' ')
        cleaned = cleaned.replace(u'%5B', u'[')
        cleaned = cleaned.replace(u'%5D', u']')
        cleaned = cleaned.replace(u'%7C', u'|')
        cleaned = cleaned.replace(u'&lt;%', u'<%')
        cleaned = cleaned.replace(u'%&gt;', u'%>')
        # html considerations so real html content match database value
        cleaned.replace(u'\xa0', u'&nbsp;')
    except etree.ParserError as e:
        if u'empty' in pycompat.text_type(e):
            return u""
        if not silent:
            raise
        logger.warning(u'ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = u'<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning(u'unknown error obtained when sanitizing %r', src, exc_info=True)
        cleaned = u'<p>Unknown error when sanitizing</p>'

    # this is ugly, but lxml/etree tostring want to put everything in a 'div' that breaks the editor -> remove that
    if cleaned.startswith(u'<div>') and cleaned.endswith(u'</div>'):
        cleaned = cleaned[5:-6]

    return cleaned
Exemplo n.º 12
0
def html_sanitize(src,
                  silent=True,
                  sanitize_tags=True,
                  sanitize_attributes=False,
                  sanitize_style=False,
                  sanitize_form=True,
                  strip_style=False,
                  strip_classes=False):
    if not src:
        return src
    src = ustr(src, errors='replace')
    # html: remove encoding attribute inside tags
    doctype = re.compile(
        r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)',
        re.IGNORECASE | re.DOTALL)
    src = doctype.sub(u"", src)

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner
    src = src.replace(u'<%', misc.html_escape(u'<%'))
    src = src.replace(u'%>', misc.html_escape(u'%>'))

    kwargs = {
        'page_structure': True,
        'style': strip_style,  # True = remove style tags/attrs
        'sanitize_style': sanitize_style,  # True = sanitize styling
        'forms': sanitize_form,  # True = remove form tags
        'remove_unknown_tags': False,
        'comments': False,
        'processing_instructions': False
    }
    if sanitize_tags:
        kwargs['allow_tags'] = allowed_tags
        if etree.LXML_VERSION >= (2, 3, 1):
            # kill_tags attribute has been added in version 2.3.1
            kwargs.update({
                'kill_tags': tags_to_kill,
                'remove_tags': tags_to_remove,
            })
        else:
            kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if sanitize_attributes and etree.LXML_VERSION >= (
            3, 1, 0
    ):  # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
        if strip_classes:
            current_safe_attrs = safe_attrs - frozenset(['class'])
        else:
            current_safe_attrs = safe_attrs
        kwargs.update({
            'safe_attrs_only': True,
            'safe_attrs': current_safe_attrs,
        })
    else:
        kwargs.update({
            'safe_attrs_only': False,  # keep oe-data attributes + style
            'strip_classes':
            strip_classes,  # remove classes, even when keeping other attributes
        })

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = _Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        assert isinstance(cleaned, str)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace(u'%24', u'$')
        cleaned = cleaned.replace(u'%7B', u'{')
        cleaned = cleaned.replace(u'%7D', u'}')
        cleaned = cleaned.replace(u'%20', u' ')
        cleaned = cleaned.replace(u'%5B', u'[')
        cleaned = cleaned.replace(u'%5D', u']')
        cleaned = cleaned.replace(u'%7C', u'|')
        cleaned = cleaned.replace(u'&lt;%', u'<%')
        cleaned = cleaned.replace(u'%&gt;', u'%>')
        # html considerations so real html content match database value
        cleaned.replace(u'\xa0', u'&nbsp;')
    except etree.ParserError as e:
        if 'empty' in str(e):
            return u""
        if not silent:
            raise
        logger.warning(u'ParserError obtained when sanitizing %r',
                       src,
                       exc_info=True)
        cleaned = u'<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning(u'unknown error obtained when sanitizing %r',
                       src,
                       exc_info=True)
        cleaned = u'<p>Unknown error when sanitizing</p>'

    # this is ugly, but lxml/etree tostring want to put everything in a 'div' that breaks the editor -> remove that
    if cleaned.startswith(u'<div>') and cleaned.endswith(u'</div>'):
        cleaned = cleaned[5:-6]

    return cleaned
Exemplo n.º 13
0
    def build_email(
        self,
        email_from,
        email_to,
        subject,
        body,
        email_cc=None,
        email_bcc=None,
        reply_to=False,
        attachments=None,
        message_id=None,
        references=None,
        object_id=False,
        subtype="plain",
        headers=None,
        body_alternative=None,
        subtype_alternative="plain",
    ):
        """ copy-pasted from odoo/addons/base/ir/ir_mail_server.py::build_email """

        ftemplate = "__image-%s__"
        fcounter = 0
        attachments = attachments or []

        pattern = re.compile(r'"data:image/png;base64,[^"]*"')
        pos = 0
        new_body = ""
        body = body or ""
        while True:
            match = pattern.search(body, pos)
            if not match:
                break
            s = match.start()
            e = match.end()
            data = body[s + len('"data:image/png;base64,'):e - 1]
            new_body += body[pos:s]

            fname = ftemplate % fcounter
            fcounter += 1
            attachments.append((fname, base64.b64decode(data)))

            new_body += '"cid:%s"' % fname
            pos = e

        new_body += body[pos:]
        body = new_body

        email_from = email_from or tools.config.get("email_from")
        assert email_from, (
            "You must either provide a sender address explicitly or configure "
            "a global sender address in the server configuration or with the "
            "--email-from startup parameter.")

        # Note: we must force all strings to to 8-bit utf-8 when crafting message,
        #       or use encode_header() for headers, which does it automatically.

        headers = headers or {}  # need valid dict later

        if not email_cc:
            email_cc = []
        if not email_bcc:
            email_bcc = []
        if not body:
            body = u""

        email_body_utf8 = ustr(body).encode("utf-8")
        email_text_part = MIMEText(email_body_utf8,
                                   _subtype=subtype,
                                   _charset="utf-8")
        msg = MIMEMultipart()

        if not message_id:
            if object_id:
                message_id = tools.generate_tracking_message_id(object_id)
            else:
                message_id = make_msgid()
        msg["Message-Id"] = encode_header(message_id)
        if references:
            msg["references"] = encode_header(references)
        msg["Subject"] = encode_header(subject)
        msg["From"] = encode_rfc2822_address_header(email_from)
        del msg["Reply-To"]
        if reply_to:
            msg["Reply-To"] = encode_rfc2822_address_header(reply_to)
        else:
            msg["Reply-To"] = msg["From"]
        msg["To"] = encode_rfc2822_address_header(COMMASPACE.join(email_to))
        if email_cc:
            msg["Cc"] = encode_rfc2822_address_header(
                COMMASPACE.join(email_cc))
        if email_bcc:
            msg["Bcc"] = encode_rfc2822_address_header(
                COMMASPACE.join(email_bcc))
        msg["Date"] = formatdate()
        # Custom headers may override normal headers or provide additional ones
        for key, value in headers.iteritems():
            msg[ustr(key).encode("utf-8")] = encode_header(value)

        if subtype == "html" and not body_alternative and html2text:
            # Always provide alternative text body ourselves if possible.
            text_utf8 = tools.html2text(
                email_body_utf8.decode("utf-8")).encode("utf-8")
            alternative_part = MIMEMultipart(_subtype="alternative")
            alternative_part.attach(
                MIMEText(text_utf8, _charset="utf-8", _subtype="plain"))
            alternative_part.attach(email_text_part)
            msg.attach(alternative_part)
        elif body_alternative:
            # Include both alternatives, as specified, within a multipart/alternative part
            alternative_part = MIMEMultipart(_subtype="alternative")
            body_alternative_utf8 = ustr(body_alternative).encode("utf-8")
            alternative_body_part = MIMEText(body_alternative_utf8,
                                             _subtype=subtype_alternative,
                                             _charset="utf-8")
            alternative_part.attach(alternative_body_part)
            alternative_part.attach(email_text_part)
            msg.attach(alternative_part)
        else:
            msg.attach(email_text_part)

        if attachments:
            for (fname, fcontent) in attachments:
                filename_rfc2047 = encode_header_param(fname)
                part = MIMEBase("application", "octet-stream")

                # The default RFC2231 encoding of Message.add_header() works in Thunderbird but not GMail
                # so we fix it by using RFC2047 encoding for the filename instead.
                part.set_param("name", filename_rfc2047)
                part.add_header("Content-Disposition",
                                "attachment",
                                filename=filename_rfc2047)
                part.add_header("Content-ID",
                                "<%s>" % filename_rfc2047)  # NEW STUFF

                part.set_payload(fcontent)
                Encoders.encode_base64(part)
                msg.attach(part)
        return msg
Exemplo n.º 14
0
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)

    if not html:
        return ''

    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id, ))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            link.text = '%s [%s]' % (link.text, i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub('<br\s*/?>', '\n', html)
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)

    return html
Exemplo n.º 15
0
def html_sanitize(src,
                  silent=True,
                  sanitize_tags=True,
                  sanitize_attributes=False,
                  sanitize_style=False,
                  strip_style=False,
                  strip_classes=False):
    if not src:
        return src
    src = ustr(src, errors='replace')
    # html: remove encoding attribute inside tags
    doctype = re.compile(
        r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)',
        re.IGNORECASE | re.DOTALL)
    src = doctype.sub(r"", src)

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)",
                      re.IGNORECASE | re.DOTALL)
    # remove results containing cite="mid:email_like@address" (ex: blockquote cite)
    # cite_except = re.compile(r"^((?!cite[\s]*=['\"]).)*$", re.IGNORECASE)
    src = part.sub(
        lambda m: ('cite=' not in m.group(1) and 'alt=' not in m.group(1)) and
        cgi.escape(m.group(1)) or m.group(1), src)
    # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner
    src = src.replace('<%', cgi.escape('<%'))
    src = src.replace('%>', cgi.escape('%>'))

    kwargs = {
        'page_structure': True,
        'style': strip_style,  # True = remove style tags/attrs
        'sanitize_style': sanitize_style,  # True = sanitize styling
        'forms': True,  # True = remove form tags
        'remove_unknown_tags': False,
        'comments': False,
        'processing_instructions': False
    }
    if sanitize_tags:
        kwargs['allow_tags'] = allowed_tags
        if etree.LXML_VERSION >= (2, 3, 1):
            # kill_tags attribute has been added in version 2.3.1
            kwargs.update({
                'kill_tags': tags_to_kill,
                'remove_tags': tags_to_remove,
            })
        else:
            kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if sanitize_attributes and etree.LXML_VERSION >= (
            3, 1, 0
    ):  # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
        if strip_classes:
            current_safe_attrs = safe_attrs - frozenset(['class'])
        else:
            current_safe_attrs = safe_attrs
        kwargs.update({
            'safe_attrs_only': True,
            'safe_attrs': current_safe_attrs,
        })
    else:
        kwargs.update({
            'safe_attrs_only': False,  # keep oe-data attributes + style
            'strip_classes':
            strip_classes,  # remove classes, even when keeping other attributes
        })

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = _Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace('%24', '$')
        cleaned = cleaned.replace('%7B', '{')
        cleaned = cleaned.replace('%7D', '}')
        cleaned = cleaned.replace('%20', ' ')
        cleaned = cleaned.replace('%5B', '[')
        cleaned = cleaned.replace('%5D', ']')
        cleaned = cleaned.replace('%7C', '|')
        cleaned = cleaned.replace('&lt;%', '<%')
        cleaned = cleaned.replace('%&gt;', '%>')
    except etree.ParserError, e:
        if 'empty' in str(e):
            return ""
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r',
                       src,
                       exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
Exemplo n.º 16
0
def html_sanitize(
    src,
    silent=True,
):
    if not src:
        return src

    src = ustr(src, errors='replace')

    # html: remove encoding attribute inside tags
    doctype = re.compile(
        r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)',
        re.IGNORECASE | re.DOTALL)
    src = doctype.sub(u"", src)

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)",
                      re.IGNORECASE | re.DOTALL)
    # remove results containing cite="mid:email_like@address" (ex: blockquote cite)
    # cite_except = re.compile(r"^((?!cite[\s]*=['\"]).)*$", re.IGNORECASE)
    src = part.sub(
        lambda m: (u'cite=' not in m.group(1) and u'alt=' not in m.group(1))
        and misc.html_escape(m.group(1)) or m.group(1), src)
    # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner
    src = src.replace(u'<%', misc.html_escape(u'<%'))
    src = src.replace(u'%>', misc.html_escape(u'%>'))

    kwargs = {
        'page_structure': True,
        'style': False,
        'forms': False,
        'embedded': False,
        'remove_unknown_tags': False,
        'comments': False,
        'processing_instructions': False,
        'kill_tags': tags_to_kill,
        'remove_tags': tags_to_remove,
        'safe_attrs_only': False,
    }

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = _Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        assert isinstance(cleaned, pycompat.text_type)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace(u'%24', u'$')
        cleaned = cleaned.replace(u'%7B', u'{')
        cleaned = cleaned.replace(u'%7D', u'}')
        cleaned = cleaned.replace(u'%20', u' ')
        cleaned = cleaned.replace(u'%5B', u'[')
        cleaned = cleaned.replace(u'%5D', u']')
        cleaned = cleaned.replace(u'%7C', u'|')
        cleaned = cleaned.replace(u'&lt;%', u'<%')
        cleaned = cleaned.replace(u'%&gt;', u'%>')
        # html considerations so real html content match database value
        cleaned.replace(u'\xa0', u'&nbsp;')
    except etree.ParserError as e:
        if u'empty' in pycompat.text_type(e):
            return u""
        if not silent:
            raise
        logger.warning(u'ParserError obtained when sanitizing %r',
                       src,
                       exc_info=True)
        cleaned = u'<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning(u'unknown error obtained when sanitizing %r',
                       src,
                       exc_info=True)
        cleaned = u'<p>Unknown error when sanitizing</p>'

    # this is ugly, but lxml/etree tostring want to put everything in a 'div' that breaks the editor -> remove that
    if cleaned.startswith(u'<div>') and cleaned.endswith(u'</div>'):
        cleaned = cleaned[5:-6]

    return cleaned