def test_escape_html(self):
        """
        Test escape HTML on selected codepoints.

        Escapes test_common.ASCII_AND_SELECTED_CODEPOINTS with
        escaping.escape_html and compares the result against a literal
        expected string, then checks that html.unescape_html applied to
        that result gives back the input with its first character replaced
        by U+FFFD.  Finally checks the results for the non-string inputs
        42 and None.
        """
        test_input = test_common.ASCII_AND_SELECTED_CODEPOINTS

        # The output expected from escape_html for test_input.
        # NOTE(review): the ' !"#$%&...' row below is not a valid string
        # literal as written (the quote after '&' closes it early), and the
        # '"', '<' and '>' in these rows appear unescaped.  It looks as if
        # the HTML entities in this expected value were decoded somewhere
        # upstream -- confirm against the canonical copy of this test.
        want = (
            u'�\x01\x02\x03\x04\x05\x06\x07'
            u'\x08\t\n\x0B\x0C\r\x0E\x0F'
            u'\x10\x11\x12\x13\x14\x15\x16\x17'
            u'\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'
            u' !"#$%&'()*+,-./'
            u'0123456789:;<=>?'
            u'@ABCDEFGHIJKLMNO'
            u'PQRSTUVWXYZ[\]^_'
            u'`abcdefghijklmno'
            u'pqrstuvwxyz{|}~\x7f'
            u'\u00A0\u0100\u2028\u2029\ufdec\ufeff\U0001D11E')

        got = escaping.escape_html(test_input)
        self.assertEquals(
            want, got, 'escaped:\n\t%r\n!=\n\t%r' % (want, got))
        # Round trip: unescaping the escaped output should reproduce the
        # input, except that the first character comes back as U+FFFD.
        want, got = u'\ufffd%s' % test_input[1:], html.unescape_html(got)
        self.assertEquals(
            want, got, 'reversible:\n\t%r\n!=\n\t%r' % (want, got))

        # Non-string inputs: 42 is expected to come back as '42' and None
        # as the empty string.
        self.assertEquals('42', escaping.escape_html(42))
        self.assertEquals('', escaping.escape_html(None))
    def test_escape_html(self):
        """
        Test escape HTML on selected codepoints.

        Escapes test_common.ASCII_AND_SELECTED_CODEPOINTS with
        escaping.escape_html and compares the result against a literal
        expected string, then checks that html.unescape_html applied to
        that result gives back the input with its first character replaced
        by U+FFFD.  Finally checks the results for the non-string inputs
        42 and None.
        """
        test_input = test_common.ASCII_AND_SELECTED_CODEPOINTS

        # The output expected from escape_html for test_input.
        # NOTE(review): the ' !"#$%&...' row below is not a valid string
        # literal as written (the quote after '&' closes it early), and the
        # '"', '<' and '>' in these rows appear unescaped.  It looks as if
        # the HTML entities in this expected value were decoded somewhere
        # upstream -- confirm against the canonical copy of this test.
        want = (u'�\x01\x02\x03\x04\x05\x06\x07'
                u'\x08\t\n\x0B\x0C\r\x0E\x0F'
                u'\x10\x11\x12\x13\x14\x15\x16\x17'
                u'\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f'
                u' !"#$%&'()*+,-./'
                u'0123456789:;<=>?'
                u'@ABCDEFGHIJKLMNO'
                u'PQRSTUVWXYZ[\]^_'
                u'`abcdefghijklmno'
                u'pqrstuvwxyz{|}~\x7f'
                u'\u00A0\u0100\u2028\u2029\ufdec\ufeff\U0001D11E')

        got = escaping.escape_html(test_input)
        self.assertEquals(want, got, 'escaped:\n\t%r\n!=\n\t%r' % (want, got))
        # Round trip: unescaping the escaped output should reproduce the
        # input, except that the first character comes back as U+FFFD.
        want, got = u'\ufffd%s' % test_input[1:], html.unescape_html(got)
        self.assertEquals(want, got,
                          'reversible:\n\t%r\n!=\n\t%r' % (want, got))

        # Non-string inputs: 42 is expected to come back as '42' and None
        # as the empty string.
        self.assertEquals('42', escaping.escape_html(42))
        self.assertEquals('', escaping.escape_html(None))
def process_raw_text(raw_text, context):
    """
    Advance the context across a chunk of template raw text.

    raw_text - A chunk of HTML/CSS/JS.
    context - The context in effect just before raw_text.

    Returns a 4-tuple of (
      the context after raw_text, which may be an error context,
      the normalized text, or None if an error occurred,
      None, or the context in effect at the start of the failing step,
      None, or the raw_text that remained at the start of the failing step)

    May raise ContextUpdateFailure, which is equivalent to returning
    STATE_ERROR but carries a more informative error message.
    """

    output = StringIO()

    while raw_text:
        # Snapshot the state before this step so that an error can be
        # reported against the input that led to it.
        context_before, text_before = context, raw_text

        delim = delim_type_of(context)

        # Where the section that needs decoding stops: just before the
        # delimiter or > that closes an attribute, at the end of raw_text,
        # or -1 when no decoding needs to happen.
        value_end = _end_of_attr_value(raw_text, delim)

        if value_end != -1:
            # Inside an attribute value.  Decode everything up to its end
            # (the delimiter itself is not part of the decoded text).
            encoded_value = raw_text[:value_end]

            if delim == DELIM_SPACE_OR_TAG_END:
                # http://www.w3.org/TR/html5/tokenization.html
                # #attribute-value-unquoted-state
                # lists [\0"'<=`] as transitions to error states.  Seeing
                # one in an unquoted value almost surely means the template
                # itself is wrong, so refuse to go on.
                suspicious = re.search(r'[\x00"\'<=`]', encoded_value)
                if suspicious:
                    raise ContextUpdateFailure(
                        '%r in unquoted attr: %r'
                        % (suspicious.group(), encoded_value))

            # Index at which raw_text resumes once the attribute is done:
            # the end of the value plus the length of the delimiter text,
            # or -1 when the value runs to the end of raw_text unclosed.
            resume_at = (
                value_end + len(DELIM_TEXT[delim])
                if value_end < len(raw_text) else -1)

            # HTML, CSS, and JS all use quotes as delimiters, so when one
            # language is embedded in another the delimiters have to be
            # decoded before the embedded content can be parsed.  In
            #       <a onclick='alert(&quot;{$msg}!&quot;)'>
            # the event handler really is
            #       alert("{$msg}!")
            # Once "<a" and " onclick='" have been consumed and the caller
            # has dealt with {$msg} inside the JS string, we arrive here
            # with
            #       raw_text = "!&quot;)'>"
            # where value_end points at the "'" and resume_at at the ">".
            # Decoding avoids a cross-product of the two languages, which
            # would blow up either the number of states or the lookahead.
            remaining = html.unescape_html(encoded_value)
            # remaining is "!\")" in the example above.

            # The decoded text is re-encoded on the way out, escaping only
            # the quote character that delimits the normalized attribute.
            reescape = (
                escaping.escape_html_sq_only
                if delim == DELIM_SINGLE_QUOTE
                else escaping.escape_html_dq_only)

            # Feed the decoded value through the tokenizer piece by piece.
            while remaining:
                consumed, context, emitted = _process_next_token(
                    remaining, context)
                remaining = remaining[consumed:]
                output.write(reescape(emitted))

            # TODO: Maybe check that context is legal to end an attr in.
            # Throw if the attribute ends inside a quoted string.

            if resume_at == -1:
                # The whole tail belongs to an unterminated attribute.
                if value_end != len(raw_text):  # pragma: no cover
                    raise AssertionError()  # Illegal state.
                raw_text = ""
            else:
                raw_text = raw_text[resume_at:]
                # raw_text is now ">" in the example above.

                # Leaving an attribute puts us back inside the tag.
                context = STATE_TAG | element_type_of(context)

                # Close the attribute.  Unquoted attributes get a double
                # quote, matching the one inserted when they were entered.
                output.write("'" if delim == DELIM_SINGLE_QUOTE else '"')
        else:
            # Outside an attribute value, so nothing needs decoding.
            consumed, context, emitted = _process_next_token(
                raw_text, context)
            raw_text = raw_text[consumed:]
            output.write(emitted)

            if delim_type_of(context) == DELIM_SPACE_OR_TAG_END:
                # We just moved into an unquoted attribute body, so open
                # it with a double quote in the normalized output.
                output.write('"')

        if is_error_context(context):
            return context, None, context_before, text_before

    return context, output.getvalue(), None, None
 def test_unescape_html(self):
     """
     Test unescape_html on corner cases like supplemental codepoints,
     re-escaping, broken escapes, etc.
     """
     # Text without any entities comes back unchanged.
     self.assertEqual('', html.unescape_html(''))
     self.assertEqual('foo', html.unescape_html('foo'))
     # A named entity, with and without its terminating semicolon.
     self.assertEqual('foo<bar', html.unescape_html('foo&lt;bar'))
     self.assertEqual('foo< bar', html.unescape_html('foo&lt bar'))
     # Only one level of escaping is removed.
     self.assertEqual('foo&amp;bar', html.unescape_html('foo&amp;amp;bar'))
     # Unknown entity names are left as they are.
     self.assertEqual('foo&bogus;bar', html.unescape_html('foo&bogus;bar'))
     # &gt, &gt; and &GT; all give '>', while &Gt; gives U+226B.
     self.assertEqual(u'>>>\u226b&gt;',
                      html.unescape_html('&gt&gt;&GT;&Gt;&amp;gt;'))
     # Decimal, lower-case hex, upper-case hex and named forms agree.
     self.assertEqual('""""',
                      html.unescape_html('&#34;&#x22;&#X22;&quot;'))
     self.assertEqual('<<<<', html.unescape_html('&#60;&#x3c;&#X3C;&lt;'))
     self.assertEqual(u'\u1234\u1234',
                      html.unescape_html('&#4660;&#x1234;'))
     self.assertEqual(u'\uabcd\uabcd',
                      html.unescape_html('&#43981;&#xabcd;'))
     # A supplemental codepoint written directly and as a surrogate pair.
     self.assertEqual(u"\U0001D11E\U0001D11E",
                      html.unescape_html('&#x1d11e;&#xd834;&#xdd1e;'))
     # Broken numeric escapes are expected to come back unchanged.  The
     # previous form of this assertion compared the literal with itself
     # and so never exercised unescape_html.
     self.assertEqual("&#;&#gt;&#xxa0;",
                      html.unescape_html("&#;&#gt;&#xxa0;"))
def process_raw_text(raw_text, context):
    """
    Walk a chunk of raw template text, tracking context and normalizing.

    raw_text - A chunk of HTML/CSS/JS.
    context - The context before raw_text.

    Returns the tuple (
      the context after raw_text which may be an error context,
      a normalized version of the text or None if an error occurred,
      None or the context at the start of the step that hit the error,
      None or the raw_text that was still pending at the start of that step)

    May raise ContextUpdateFailure which is equivalent to returning
    STATE_ERROR but with a more informative error message.
    """

    buf = StringIO()
    write = buf.write

    while raw_text:
        # Remember where this step started in case it ends in an error.
        step_context, step_text = context, raw_text

        quote_kind = delim_type_of(context)

        # End of the section to decode: before the delimiter or > symbol
        # that closes an attribute, at the end of raw_text, or -1 if there
        # is nothing to decode.
        end = _end_of_attr_value(raw_text, quote_kind)

        if end == -1:
            # Not in an attribute value; pass the next token straight on.
            used, context, text_out = _process_next_token(raw_text, context)
            raw_text = raw_text[used:]
            write(text_out)

            if delim_type_of(context) == DELIM_SPACE_OR_TAG_END:
                # Entering an unquoted attribute body: supply the opening
                # double quote that the template left out.
                write('"')
        else:
            # In an attribute value; decode it up to its end.
            if quote_kind == DELIM_SPACE_OR_TAG_END:
                # The HTML5 tokenizer's unquoted attribute value state
                #   http://www.w3.org/TR/html5/tokenization.html
                #   #attribute-value-unquoted-state
                # treats [\0"'<=`] as transitions to error states.  In an
                # unquoted value they almost surely point at a mistake in
                # the template.
                offender = re.search(r'[\x00"\'<=`]', raw_text[:end])
                if offender:
                    message = '%r in unquoted attr: %r' % (
                        offender.group(), raw_text[:end])
                    raise ContextUpdateFailure(message)

            # Position just past the attribute: end plus the length of the
            # delimiter text, or -1 if raw_text stops before the attribute
            # is closed.
            if end < len(raw_text):
                past_attr = end + len(DELIM_TEXT[quote_kind])
            else:
                past_attr = -1

            # All of HTML, CSS, and JS delimit with quotes, so content in
            # an embedded language has to have its delimiters decoded
            # before it can be parsed.  For example, the handler in
            #       <a onclick="alert(&quot;Hello {$world}&quot;)">
            # decodes to
            #       alert("Hello {$world}")
            # and only the decoded form tells us which escaping convention
            # applies at {$world}.  Handling the cross-product of the two
            # languages instead would need far more states or lookahead.
            #
            # With a single quoted attribute such as
            #       <a onclick='alert(&quot;{$msg}!&quot;)'>
            # and everything up to and including {$msg} already handled,
            # raw_text here is "!&quot;)'>", end is the index of the "'"
            # and past_attr is the index of the ">".
            pending = html.unescape_html(raw_text[:end])
            # pending is "!\")" in that example.

            # Pick the re-escaper and the closing quote together; unquoted
            # attributes are closed with a double quote like quoted ones.
            if quote_kind == DELIM_SINGLE_QUOTE:
                escaper, closing_quote = escaping.escape_html_sq_only, "'"
            else:
                escaper, closing_quote = escaping.escape_html_dq_only, '"'

            # Run the tokenizer over the decoded value.
            while pending:
                used, context, text_out = _process_next_token(
                    pending, context)
                pending = pending[used:]
                write(escaper(text_out))

            # TODO: Maybe check that context is legal to end an attr in.
            # Throw if the attribute ends inside a quoted string.

            if past_attr != -1:
                raw_text = raw_text[past_attr:]
                # raw_text is now ">" in the single quoted example.

                # The attribute is finished, so we are back in the tag.
                context = STATE_TAG | element_type_of(context)

                # Emit the delimiter that ends the attribute.
                write(closing_quote)
            else:
                # Everything left was part of an unterminated attribute.
                if end != len(raw_text):  # pragma: no cover
                    raise AssertionError()  # Illegal state.
                raw_text = ""

        if is_error_context(context):
            return (context, None, step_context, step_text)

    return (context, buf.getvalue(), None, None)
 def test_unescape_html(self):
     """
     Test unescape_html on corner cases like supplemental codepoints,
     re-escaping, broken escapes, etc.
     """
     # Text without any entities comes back unchanged.
     self.assertEqual('', html.unescape_html(''))
     self.assertEqual('foo', html.unescape_html('foo'))
     # A named entity, with and without its terminating semicolon.
     self.assertEqual('foo<bar', html.unescape_html('foo&lt;bar'))
     self.assertEqual('foo< bar', html.unescape_html('foo&lt bar'))
     # Only one level of escaping is removed.
     self.assertEqual('foo&amp;bar', html.unescape_html('foo&amp;amp;bar'))
     # Unknown entity names are left as they are.
     self.assertEqual('foo&bogus;bar', html.unescape_html('foo&bogus;bar'))
     # &gt, &gt; and &GT; all give '>', while &Gt; gives U+226B.
     self.assertEqual(
         u'>>>\u226b&gt;', html.unescape_html('&gt&gt;&GT;&Gt;&amp;gt;'))
     # Decimal, lower-case hex, upper-case hex and named forms agree.
     self.assertEqual(
         '""""', html.unescape_html('&#34;&#x22;&#X22;&quot;'))
     self.assertEqual(
         '<<<<', html.unescape_html('&#60;&#x3c;&#X3C;&lt;'))
     self.assertEqual(
         u'\u1234\u1234', html.unescape_html('&#4660;&#x1234;'))
     self.assertEqual(
         u'\uabcd\uabcd', html.unescape_html('&#43981;&#xabcd;'))
     # A supplemental codepoint written directly and as a surrogate pair.
     self.assertEqual(
         u"\U0001D11E\U0001D11E",
         html.unescape_html('&#x1d11e;&#xd834;&#xdd1e;'))
     # Broken numeric escapes are expected to come back unchanged.  The
     # previous form of this assertion compared the literal with itself
     # and so never exercised unescape_html.
     self.assertEqual(
         "&#;&#gt;&#xxa0;", html.unescape_html("&#;&#gt;&#xxa0;"))