Exemplo n.º 1
0
 def decode_string_escaped(self, start):
     """Decode the rest of a string literal, handling escape sequences.

     The characters ``self.s[start:self.pos]`` are copied verbatim into
     a builder, then scanning continues from ``self.pos`` until the
     closing double quote.  At that point the collected text is passed
     through ``unicodehelper.check_utf8_or_raise``, ``self.pos`` is set
     to the index just after the quote, and the wrapped utf8 string is
     returned.  A NUL byte or any other character below 0x20 is
     reported through ``self._raise``.
     """
     cursor = self.pos
     out = StringBuilder((cursor - start) * 2)  # rough capacity guess
     assert start >= 0
     assert cursor >= 0
     out.append_slice(self.s, start, cursor)
     while True:
         ch = self.ll_chars[cursor]
         cursor += 1
         if ch == '\\':
             # the helper returns the index at which scanning resumes
             cursor = self.decode_escape_sequence(cursor, out)
             continue
         if ch == '"':
             break
         if ch >= '\x20':
             out.append(ch)
             continue
         # only characters below 0x20 get here
         if ch == '\0':
             self._raise("Unterminated string starting at char %d",
                         start - 1)
         else:
             self._raise("Invalid control character at char %d",
                         cursor - 1)
     utf8 = out.build()
     lgt = unicodehelper.check_utf8_or_raise(self.space, utf8)
     self.pos = cursor
     return self.space.newutf8(utf8, lgt)
Exemplo n.º 2
0
 def _create_string_wrapped(self, start, end, nonascii):
     """Wrap ``self.getslice(start, end)`` as a utf8 string object.

     When ``nonascii`` is true the slice is passed through
     ``unicodehelper.check_utf8_or_raise`` and the value it returns is
     used as the length; otherwise the length is ``end - start``.
     """
     text = self.getslice(start, end)
     if not nonascii:
         # no non-ascii chars: the byte count doubles as the length
         return self.space.newutf8(text, end - start)
     # non-ascii chars are present, so the slice must be checked as utf-8
     codepoints = unicodehelper.check_utf8_or_raise(self.space, text)
     return self.space.newutf8(text, codepoints)
Exemplo n.º 3
0
def decode_utf8_recode(space, s, ps, end, recode_encoding):
    """Re-encode the run of high-bit bytes that starts at ``s[ps]``.

    Scans forward from ``ps`` (never past ``end``) while bytes have
    their 0x80 bit set, checks that run with
    ``unicodehelper.check_utf8_or_raise``, and encodes it with
    ``recode_encoding``.

    Returns a ``(encoded_bytes, stop)`` pair, where ``stop`` is the index
    of the first byte that was not part of the run.
    """
    stop = ps
    while stop < end:
        if not (ord(s[stop]) & 0x80):
            break
        stop += 1
    codepoints = unicodehelper.check_utf8_or_raise(space, s, ps, stop)
    w_text = space.newutf8(s[ps:stop], codepoints)
    w_encoded = unicodehelper.encode(space, w_text, recode_encoding)
    return space.bytes_w(w_encoded), stop
Exemplo n.º 4
0
 def _create_string(self, start, end, bits):
     """Wrap ``self.getslice(start, end)`` as a utf8 string object.

     ``bits`` is tested against 0x80.  If that bit is set, the slice is
     treated as utf-8 and checked with
     ``unicodehelper.check_utf8_or_raise``, whose result becomes the
     length.  Otherwise the slice is ascii only and ``end - start`` is
     used directly.
     """
     content = self.getslice(start, end)
     if bits & 0x80:
         # the 8th bit is set somewhere: check the utf-8 content
         lgt = unicodehelper.check_utf8_or_raise(self.space, content)
     else:
         # ascii-only fast path: all chars are < 128, so the byte count
         # is the length
         lgt = end - start
     return self.space.newutf8(content, lgt)
Exemplo n.º 5
0
 def decode_string_escaped(self, start, nonascii):
     """Decode the rest of a string literal, handling escape sequences.

     ``self.s[start:self.pos]`` is copied verbatim into a builder, then
     characters are read from ``self.pos`` onwards until the closing
     double quote.  Backslashes are delegated to
     ``self.decode_escape_sequence_to_utf8`` and characters below 0x20
     to ``self._raise_control_char_in_string``.  The collected text is
     always passed through ``unicodehelper.check_utf8_or_raise``; the
     ``nonascii`` argument is accepted but not consulted here.

     Sets ``self.pos`` to the index just after the closing quote and
     returns the wrapped utf8 string.
     """
     pos = self.pos
     sb = StringBuilder((pos - start) * 2)  # capacity is only a guess
     assert start >= 0
     assert pos >= 0
     sb.append_slice(self.s, start, pos)
     while True:
         ch = self.ll_chars[pos]
         pos += 1
         if ch == '"':
             break
         if ch == '\\':
             # the helper returns the index at which scanning resumes
             pos = self.decode_escape_sequence_to_utf8(pos, sb)
         elif ch >= '\x20':
             sb.append(ch)
         else:
             self._raise_control_char_in_string(ch, start, pos)
     content_utf8 = sb.build()
     length = unicodehelper.check_utf8_or_raise(self.space, content_utf8)
     self.pos = pos
     return self.space.newutf8(content_utf8, length)
Exemplo n.º 6
0
def raw_encode_basestring_ascii(space, w_string):
    """Escape ``w_string`` so that the result contains only ASCII.

    Characters from space to '~' are copied through, except that '"' and
    '\\' get a backslash in front.  Characters below space are replaced
    by their ``ESCAPE_BEFORE_SPACE`` entry.  Everything above '~' becomes
    a ``\\uXXXX`` escape, using two escapes (a surrogate pair) for code
    points above 0xffff.  No surrounding quotes are added.

    If ``w_string`` is a bytes object made only of characters that need
    no escaping, it is returned unchanged.  Other bytes input is first
    checked with ``unicodehelper.check_utf8_or_raise``.  In every other
    case the result is built with ``space.newtext``.
    """
    if space.isinstance_w(w_string, space.w_bytes):
        s = space.bytes_w(w_string)
        total = len(s)
        # find the first byte that cannot be copied through as-is
        first = 0
        while first < total:
            ch = s[first]
            if ch < ' ' or ch > '~' or ch == '"' or ch == '\\':
                break
            first += 1
        else:
            # only plain printable ascii: nothing to escape
            return w_string

        unicodehelper.check_utf8_or_raise(space, s)
        sb = StringBuilder(total)
        sb.append_slice(s, 0, first)
    else:
        # No pre-scan for "already safe" input here: it would cost an
        # extra pass over the characters, and the expected caller
        # (json.encoder) re-encodes a unicode result to an ascii string
        # anyway.  Producing the escaped string directly keeps it to a
        # single pass.
        s = space.utf8_w(w_string)
        first = 0
        sb = StringBuilder(len(s))

    codepoints = rutf8.Utf8StringIterator(s)
    # skip the prefix that was already copied (one byte per character)
    for _ in range(first):
        codepoints.next()
    for code in codepoints:
        if code > ord('~'):
            if code > 0xffff:
                # outside the BMP: write a surrogate pair
                offset = code - 0x10000
                high = 0xd800 | ((offset >> 10) & 0x3ff)
                low = 0xdc00 | (offset & 0x3ff)
                sb.append('\\ud')
                sb.append(HEX[(high >> 8) & 0x0f])
                sb.append(HEX[(high >> 4) & 0x0f])
                sb.append(HEX[high & 0x0f])
                sb.append('\\ud')
                sb.append(HEX[(low >> 8) & 0x0f])
                sb.append(HEX[(low >> 4) & 0x0f])
                sb.append(HEX[low & 0x0f])
            else:
                sb.append('\\u')
                sb.append(HEX[code >> 12])
                sb.append(HEX[(code >> 8) & 0x0f])
                sb.append(HEX[(code >> 4) & 0x0f])
                sb.append(HEX[code & 0x0f])
            continue
        if code < ord(' '):
            sb.append(ESCAPE_BEFORE_SPACE[code])
            continue
        if code == ord('"') or code == ord('\\'):
            sb.append('\\')
        sb.append(chr(code))

    return space.newtext(sb.build())
Exemplo n.º 7
0
def parsestr(space, encoding, s, unicode_literal=False):
    """Turn the source text of a string literal into a wrapped value.

    ``s`` is the complete literal, including any ``b``/``u``/``r``
    prefix and the surrounding (single or triple) quotes.

    ``encoding`` describes the bytes of ``s``:
      * ``None``: the source is ascii only;
      * ``"iso8859-1"``: the source is in that encoding;
      * anything else: the source is utf-8 encoded.

    A ``u`` prefix forces a unicode result and a ``b`` prefix forces a
    bytes result, overriding ``unicode_literal``.  A bytes result is
    encoded with the original encoding.

    This is very inefficient; CPython has very similar code.
    """
    # 'start' walks forward over prefix and opening quotes; 'stop' ends
    # up as the index just past the last content character.
    start = 0
    delim = s[start]
    is_raw = False

    # optional prefix letters
    if delim == 'b' or delim == 'B':
        start += 1
        delim = s[start]
        unicode_literal = False
    elif delim == 'u' or delim == 'U':
        start += 1
        delim = s[start]
        unicode_literal = True
    if delim == 'r' or delim == 'R':
        start += 1
        delim = s[start]
        is_raw = True
    if not (delim == "'" or delim == '"'):
        raise_app_valueerror(space,
                             'Internal error: parser passed unquoted literal')
    start += 1
    stop = len(s) - 1
    if s[stop] != delim:
        raise_app_valueerror(
            space, 'Internal error: parser passed unmatched '
            'quotes in literal')
    if stop - start >= 4 and s[start] == delim and s[start + 1] == delim:
        # triple-quoted literal: two more quote chars at each end
        if not (s[stop - 1] == delim and s[stop - 2] == delim):
            raise_app_valueerror(
                space, 'Internal error: parser passed '
                'unmatched triple quotes in literal')
        start += 2
        stop -= 2

    if unicode_literal:
        if encoding is None or encoding == "iso-8859-1":
            # 'unicode_escape' expects latin-1 bytes, so the slice can be
            # used as it is.
            assert 0 <= start <= stop
            body = s[start:stop]
        else:
            unicodehelper.check_utf8_or_raise(space, s, start, stop)
            body = decode_unicode_utf8(space, s, start, stop)
        if is_raw:
            utf8, lgt = unicodehelper.decode_raw_unicode_escape(space, body)
        else:
            utf8, lgt = unicodehelper.decode_unicode_escape(space, body)
        return space.newutf8(utf8, lgt)

    # bytes literal from here on
    need_encoding = not (encoding is None or encoding == "utf-8"
                         or encoding == "utf8" or encoding == "iso-8859-1")
    assert 0 <= start <= stop
    body = s[start:stop]
    if is_raw or '\\' not in s[start:]:
        # nothing to unescape
        if not need_encoding:
            return space.newbytes(body)
        lgt = unicodehelper.check_utf8_or_raise(space, body)
        return unicodehelper.encode(space, space.newutf8(body, lgt),
                                    encoding)

    enc = encoding if need_encoding else None
    return space.newbytes(PyString_DecodeEscape(space, body, 'strict', enc))
Exemplo n.º 8
0
def unmarshal_unicode(space, u, tc):
    """Read a string from ``u`` and wrap it as a utf8 string object.

    The string from ``u.get_str()`` is passed through
    ``unicodehelper.check_utf8_or_raise``, whose result is used as the
    length.  ``tc`` is not used.
    """
    utf8 = u.get_str()
    return space.newutf8(utf8, unicodehelper.check_utf8_or_raise(space, utf8))