예제 #1
0
def decode_object(space, w_obj, encoding, errors):
    """Decode ``w_obj`` and return the wrapped result.

    ``encoding`` defaults to ``getdefaultencoding(space)`` when it is None.
    When ``errors`` is None or 'strict', the 'ascii' and 'utf-8' encodings
    are decoded directly from the object's character buffer.  Every other
    combination is forwarded to the app-level ``_codecs.decode`` function.
    """
    if encoding is None:
        encoding = getdefaultencoding(space)

    strict = errors is None or errors == 'strict'

    if strict and encoding == 'ascii':
        # XXX error handling
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_ascii(data, len(data), None, final=True,
                                   errorhandler=handler)
        return space.wrap(decoded[0])

    if strict and encoding == 'utf-8':
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_utf_8(data, len(data), None, final=True,
                                   errorhandler=handler,
                                   allow_surrogates=True)
        return space.wrap(decoded[0])

    # Generic path: let the _codecs module look up and run the decoder.
    w_decode = space.getattr(space.getbuiltinmodule("_codecs"),
                             space.wrap("decode"))
    w_encoding = space.wrap(encoding)
    if errors is not None:
        return space.call_function(w_decode, w_obj, w_encoding,
                                   space.wrap(errors))
    return space.call_function(w_decode, w_obj, w_encoding)
예제 #2
0
def decode_object(space, w_obj, encoding, errors):
    """Decode ``w_obj`` and return the wrapped result.

    ``encoding`` defaults to ``getdefaultencoding(space)`` when it is None.
    When ``errors`` is None or 'strict', 'ascii' input first goes through
    ``fast_str_decode_ascii`` and 'utf-8' input through ``str_decode_utf_8``.
    Every other combination is forwarded to ``_codecs.decode``.
    """
    if encoding is None:
        encoding = getdefaultencoding(space)

    strict = errors is None or errors == 'strict'

    if strict and encoding == 'ascii':
        # XXX error handling
        data = space.charbuf_w(w_obj)
        try:
            decoded = fast_str_decode_ascii(data)
        except ValueError:
            # try again, to get the error right
            handler = unicodehelper.decode_error_handler(space)
            decoded = str_decode_ascii(data, len(data), None, final=True,
                                       errorhandler=handler)[0]
        return space.wrap(decoded)

    if strict and encoding == 'utf-8':
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        result = str_decode_utf_8(data, len(data), None, final=True,
                                  errorhandler=handler)
        return space.wrap(result[0])

    # Generic path: let the _codecs module look up and run the decoder.
    w_decode = space.getattr(space.getbuiltinmodule("_codecs"),
                             space.wrap("decode"))
    w_encoding = space.wrap(encoding)
    if errors is not None:
        return space.call_function(w_decode, w_obj, w_encoding,
                                   space.wrap(errors))
    return space.call_function(w_decode, w_obj, w_encoding)
예제 #3
0
def decode_object(space, w_obj, encoding, errors):
    """Decode ``w_obj`` and return the resulting wrapped object.

    ``encoding`` defaults to ``getdefaultencoding(space)`` when it is None.
    When ``errors`` is None or 'strict', 'ascii' and 'utf-8' input is
    decoded here and wrapped with ``space.newunicode``.  Every other
    combination is forwarded to ``_codecs.decode``.
    """
    if encoding is None:
        encoding = getdefaultencoding(space)

    strict = errors is None or errors == 'strict'

    if strict and encoding == 'ascii':
        # XXX error handling
        data = space.charbuf_w(w_obj)
        try:
            decoded = fast_str_decode_ascii(data)
        except ValueError:
            # try again, to get the error right
            handler = unicodehelper.decode_error_handler(space)
            decoded = str_decode_ascii(data, len(data), None, final=True,
                                       errorhandler=handler)[0]
        return space.newunicode(decoded)

    if strict and encoding == 'utf-8':
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        result = str_decode_utf_8(data, len(data), None, final=True,
                                  errorhandler=handler,
                                  allow_surrogates=True)
        return space.newunicode(result[0])

    # Generic path: let the _codecs module look up and run the decoder.
    w_decode = space.getattr(space.getbuiltinmodule("_codecs"),
                             space.newtext("decode"))
    w_encoding = space.newtext(encoding)
    if errors is not None:
        return space.call_function(w_decode, w_obj, w_encoding,
                                   space.newtext(errors))
    return space.call_function(w_decode, w_obj, w_encoding)
예제 #4
0
def decode_object(space, w_obj, encoding, errors):
    """Decode ``w_obj`` and return a wrapped unicode object.

    ``encoding`` defaults to ``getdefaultencoding(space)`` when it is None.
    When ``errors`` is None or 'strict', 'ascii' and 'utf-8' input is
    decoded here.  Otherwise ``decode_text`` from ``interp_codecs`` does the
    work, and a TypeError is raised if its result is not an instance of
    ``space.w_unicode``.
    """
    if encoding is None:
        encoding = getdefaultencoding(space)

    strict = errors is None or errors == 'strict'

    if strict and encoding == 'ascii':
        data = space.charbuf_w(w_obj)
        try:
            decoded = fast_str_decode_ascii(data)
        except ValueError:
            # try again, to get the error right
            handler = unicodehelper.decode_error_handler(space)
            decoded = str_decode_ascii(data, len(data), None, final=True,
                                       errorhandler=handler)[0]
        return space.newunicode(decoded)

    if strict and encoding == 'utf-8':
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        result = str_decode_utf_8(data, len(data), None, final=True,
                                  errorhandler=handler)
        return space.newunicode(result[0])

    from pypy.module._codecs.interp_codecs import decode_text
    w_result = decode_text(space, w_obj, encoding, errors)
    if space.isinstance_w(w_result, space.w_unicode):
        return w_result
    # The codec produced something that is not text: reject it.
    raise oefmt(
        space.w_TypeError, "'%s' decoder returned '%T' instead of 'str'; "
        "use codecs.decode() to decode to arbitrary types", encoding,
        w_result)
예제 #5
0
def check_utf8(space, s, ps, end):
    """Return the run of high-bit bytes of ``s`` that starts at ``ps``.

    Starting at index ``ps`` (which must be non-negative), the scan moves
    forward while the index is below ``end`` and the byte has its 0x80 bit
    set.  The scanned range is validated with ``rutf8.check_utf8``; on
    ``rutf8.CheckError`` the decode error handler obtained from ``space`` is
    invoked in 'strict' mode with the message 'invalid utf-8'.

    Returns ``s[pt:ps]``, where ``pt`` is the original start index and ``ps``
    is the index at which the scan stopped.
    """
    assert ps >= 0
    # Remember where the scan started; ``ps`` is advanced below.
    pt = ps
    # while (s < end && *s != '\\') s++; */ /* inefficient for u".."
    # Advance over every consecutive byte with the high bit set.
    while ps < end and ord(s[ps]) & 0x80:
        ps += 1
    try:
        rutf8.check_utf8(s, True, pt, ps)
    except rutf8.CheckError as e:
        # Re-check only the part before the failure position to locate the
        # error; ``flag`` is not used afterwards.
        # NOTE(review): ``lgt`` is the first item returned by check_utf8;
        # confirm whether ``pt + lgt`` is meant as a byte or a code point
        # offset.
        lgt, flag = rutf8.check_utf8(s, True, pt, e.pos)
        # NOTE(review): presumably the strict handler raises; if it ever
        # returned, execution would fall through to the slice below.
        unicodehelper.decode_error_handler(space)('strict', 'utf8',
                                                  'invalid utf-8', s, pt + lgt,
                                                  pt + lgt + 1)
    return s[pt:ps]
def decode_object(space, w_obj, encoding, errors):
    """Decode ``w_obj`` and return the wrapped result.

    ``encoding`` defaults to ``getdefaultencoding(space)`` when it is None.
    When ``errors`` is None or "strict", the "ascii" and "utf-8" encodings
    are decoded directly from the object's character buffer.  Every other
    combination is forwarded to the app-level ``_codecs.decode`` function.
    """
    if encoding is None:
        encoding = getdefaultencoding(space)

    strict = errors is None or errors == "strict"

    if strict and encoding == "ascii":
        # XXX error handling
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_ascii(
            data, len(data), None, final=True, errorhandler=handler)
        return space.wrap(decoded[0])

    if strict and encoding == "utf-8":
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_utf_8(
            data, len(data), None, final=True, errorhandler=handler,
            allow_surrogates=True)
        return space.wrap(decoded[0])

    # Generic path: let the _codecs module look up and run the decoder.
    w_decode = space.getattr(
        space.getbuiltinmodule("_codecs"), space.wrap("decode"))
    w_encoding = space.wrap(encoding)
    if errors is not None:
        return space.call_function(
            w_decode, w_obj, w_encoding, space.wrap(errors))
    return space.call_function(w_decode, w_obj, w_encoding)
예제 #7
0
 def w_convert(self, space, s):
     """Wrap the byte string ``s`` as a text object of ``space``.

     ``s`` is validated with ``rutf8.check_utf8``; on success the result of
     ``space.newutf8(s, lgt)`` is returned.  On ``rutf8.CheckError`` the
     string is decoded again through ``unicodehelper.str_decode_utf8`` with
     the decode error handler of ``space`` so that the proper error is
     produced.
     """
     # I suppose this is valid utf8, but there is no one to check
     # and no one to catch an error either
     try:
         lgt = rutf8.check_utf8(s, True)
         return space.newutf8(s, lgt)
     except rutf8.CheckError:
         from pypy.interpreter import unicodehelper
         # get the correct error msg
         unicodehelper.str_decode_utf8(
             s, 'string', True, unicodehelper.decode_error_handler(space))
         assert False, "always raises"
     # Only reached if the assert above does not stop execution.
     return space.newtext(s)
예제 #8
0
def raw_encode_basestring_ascii(space, w_string):
    """Escape ``w_string`` so that the result contains only safe ASCII.

    A bytes object made only of printable ASCII other than '"' and '\\'
    is returned unchanged.  Otherwise the bytes are decoded as UTF-8 (or
    the unicode value is taken directly) and every character is copied,
    backslash-escaped, or written as a ``\\uXXXX`` escape; characters above
    0xffff become a surrogate pair.  The result is wrapped with
    ``space.newtext``.
    """
    if not space.isinstance_w(w_string, space.w_bytes):
        # An earlier version checked whether the text held only safe
        # characters and returned 'w_string' directly.  That costs an extra
        # pass over all characters, and the expected caller (json.encoder)
        # re-encodes a unicode result back to an ascii string anyway, which
        # makes two passes.  Producing the string here takes only one.
        text = space.unicode_w(w_string)
        builder = StringBuilder(len(text))
        start = 0
    else:
        raw = space.bytes_w(w_string)
        limit = len(raw)
        # Find the first byte that cannot be copied verbatim.
        start = 0
        while start < limit:
            ch = raw[start]
            if ch < ' ' or ch > '~' or ch == '"' or ch == '\\':
                break
            start += 1
        if start == limit:
            # the input is a string with only non-special ascii chars
            return w_string

        handler = unicodehelper.decode_error_handler(space)
        text = str_decode_utf_8(raw, len(raw), None, final=True,
                                errorhandler=handler,
                                allow_surrogates=True)[0]
        builder = StringBuilder(len(text))
        # The safe prefix is pure ASCII, so it can be copied as is.
        builder.append_slice(raw, 0, start)

    for pos in range(start, len(text)):
        code = ord(text[pos])
        if code > ord('~'):
            if code > 0xffff:
                # surrogate pair
                offset = code - 0x10000
                high = 0xd800 | ((offset >> 10) & 0x3ff)
                low = 0xdc00 | (offset & 0x3ff)
                builder.append('\\ud')
                builder.append(HEX[(high >> 8) & 0x0f])
                builder.append(HEX[(high >> 4) & 0x0f])
                builder.append(HEX[high & 0x0f])
                builder.append('\\ud')
                builder.append(HEX[(low >> 8) & 0x0f])
                builder.append(HEX[(low >> 4) & 0x0f])
                builder.append(HEX[low & 0x0f])
            else:
                builder.append('\\u')
                builder.append(HEX[code >> 12])
                builder.append(HEX[(code >> 8) & 0x0f])
                builder.append(HEX[(code >> 4) & 0x0f])
                builder.append(HEX[code & 0x0f])
        elif code < ord(' '):
            # Control characters use the precomputed escape table.
            builder.append(ESCAPE_BEFORE_SPACE[code])
        else:
            if code == ord('"') or code == ord('\\'):
                builder.append('\\')
            builder.append(chr(code))

    return space.newtext(builder.build())
예제 #9
0
def raw_encode_basestring_ascii(space, w_string):
    """Escape ``w_string`` so that the result contains only safe ASCII.

    A str object made only of printable ASCII other than '"' and '\\'
    is returned unchanged.  Otherwise the string is decoded as UTF-8 (or
    the unicode value is taken directly) and every character is copied,
    backslash-escaped, or written as a ``\\uXXXX`` escape; characters above
    0xffff become a surrogate pair.  The result is wrapped with
    ``space.wrap``.
    """
    if not space.isinstance_w(w_string, space.w_str):
        # An earlier version checked whether the text held only safe
        # characters and returned 'w_string' directly.  That costs an extra
        # pass over all characters, and the expected caller (json.encoder)
        # re-encodes a unicode result back to an ascii string anyway, which
        # makes two passes.  Producing the string here takes only one.
        text = space.unicode_w(w_string)
        builder = StringBuilder(len(text))
        start = 0
    else:
        raw = space.str_w(w_string)
        limit = len(raw)
        # Find the first char that cannot be copied verbatim.
        start = 0
        while start < limit:
            ch = raw[start]
            if ch < ' ' or ch > '~' or ch == '"' or ch == '\\':
                break
            start += 1
        if start == limit:
            # the input is a string with only non-special ascii chars
            return w_string

        handler = unicodehelper.decode_error_handler(space)
        text = str_decode_utf_8(raw, len(raw), None, final=True,
                                errorhandler=handler,
                                allow_surrogates=True)[0]
        builder = StringBuilder(len(text))
        # The safe prefix is pure ASCII, so it can be copied as is.
        builder.append_slice(raw, 0, start)

    for pos in range(start, len(text)):
        code = ord(text[pos])
        if code > ord('~'):
            if code > 0xffff:
                # surrogate pair
                offset = code - 0x10000
                high = 0xd800 | ((offset >> 10) & 0x3ff)
                low = 0xdc00 | (offset & 0x3ff)
                builder.append('\\ud')
                builder.append(HEX[(high >> 8) & 0x0f])
                builder.append(HEX[(high >> 4) & 0x0f])
                builder.append(HEX[high & 0x0f])
                builder.append('\\ud')
                builder.append(HEX[(low >> 8) & 0x0f])
                builder.append(HEX[(low >> 4) & 0x0f])
                builder.append(HEX[low & 0x0f])
            else:
                builder.append('\\u')
                builder.append(HEX[code >> 12])
                builder.append(HEX[(code >> 8) & 0x0f])
                builder.append(HEX[(code >> 4) & 0x0f])
                builder.append(HEX[code & 0x0f])
        elif code < ord(' '):
            # Control characters use the precomputed escape table.
            builder.append(ESCAPE_BEFORE_SPACE[code])
        else:
            if code == ord('"') or code == ord('\\'):
                builder.append('\\')
            builder.append(chr(code))

    return space.wrap(builder.build())