def decode_object(space, w_obj, encoding, errors):
    """Decode ``w_obj`` and return the wrapped result.

    ``encoding`` defaults to ``getdefaultencoding(space)`` when it is None.
    When ``errors`` is None or 'strict', the 'ascii' and 'utf-8' encodings
    are decoded directly from the char buffer of ``w_obj``.  Every other
    combination is forwarded to the ``decode`` attribute of the ``_codecs``
    builtin module.
    """
    if encoding is None:
        encoding = getdefaultencoding(space)

    strict = errors is None or errors == 'strict'

    if strict and encoding == 'ascii':
        # XXX error handling
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_ascii(
            data, len(data), None, final=True, errorhandler=handler)
        return space.wrap(decoded[0])

    if strict and encoding == 'utf-8':
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_utf_8(
            data, len(data), None, final=True, errorhandler=handler,
            allow_surrogates=True)
        return space.wrap(decoded[0])

    # Generic path: look up _codecs.decode and call it, passing ``errors``
    # only when the caller supplied one.
    w_module = space.getbuiltinmodule("_codecs")
    w_decode = space.getattr(w_module, space.wrap("decode"))
    w_encoding = space.wrap(encoding)
    if errors is None:
        return space.call_function(w_decode, w_obj, w_encoding)
    return space.call_function(w_decode, w_obj, w_encoding,
                               space.wrap(errors))
def decode_object(space, w_obj, encoding, errors):
    """Decode ``w_obj`` and return the wrapped result.

    ``encoding`` defaults to ``getdefaultencoding(space)`` when it is None.
    When ``errors`` is None or 'strict', 'ascii' input first goes through
    ``fast_str_decode_ascii`` and 'utf-8' input through
    ``str_decode_utf_8``.  Anything else is forwarded to the ``decode``
    attribute of the ``_codecs`` builtin module.
    """
    if encoding is None:
        encoding = getdefaultencoding(space)

    strict = errors is None or errors == 'strict'

    if strict and encoding == 'ascii':
        # XXX error handling
        data = space.charbuf_w(w_obj)
        try:
            text = fast_str_decode_ascii(data)
        except ValueError:
            # The fast path rejected the input: decode once more with the
            # error handler so that the right error is reported.
            handler = unicodehelper.decode_error_handler(space)
            text = str_decode_ascii(
                data, len(data), None, final=True, errorhandler=handler)[0]
        return space.wrap(text)

    if strict and encoding == 'utf-8':
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_utf_8(
            data, len(data), None, final=True, errorhandler=handler)
        return space.wrap(decoded[0])

    # Generic path: look up _codecs.decode and call it, passing ``errors``
    # only when the caller supplied one.
    w_module = space.getbuiltinmodule("_codecs")
    w_decode = space.getattr(w_module, space.wrap("decode"))
    w_encoding = space.wrap(encoding)
    if errors is None:
        return space.call_function(w_decode, w_obj, w_encoding)
    return space.call_function(w_decode, w_obj, w_encoding,
                               space.wrap(errors))
def decode_object(space, w_obj, encoding, errors):
    """Decode ``w_obj`` and return the resulting wrapped object.

    A None ``encoding`` is replaced by ``getdefaultencoding(space)``.
    In strict mode (``errors`` None or 'strict') the 'ascii' and 'utf-8'
    encodings are handled here and wrapped with ``space.newunicode``;
    all remaining cases call the ``decode`` attribute of the ``_codecs``
    builtin module.
    """
    if encoding is None:
        encoding = getdefaultencoding(space)

    if errors is None or errors == 'strict':
        if encoding == 'ascii':
            # XXX error handling
            buf = space.charbuf_w(w_obj)
            try:
                result = fast_str_decode_ascii(buf)
            except ValueError:
                # try again, to get the error right
                handler = unicodehelper.decode_error_handler(space)
                result, _ = str_decode_ascii(
                    buf, len(buf), None, final=True, errorhandler=handler)
            return space.newunicode(result)
        if encoding == 'utf-8':
            buf = space.charbuf_w(w_obj)
            handler = unicodehelper.decode_error_handler(space)
            result, _ = str_decode_utf_8(
                buf, len(buf), None, final=True, errorhandler=handler,
                allow_surrogates=True)
            return space.newunicode(result)

    # Not handled above: delegate to _codecs.decode, forwarding ``errors``
    # only if it was given.
    w_decode = space.getattr(space.getbuiltinmodule("_codecs"),
                             space.newtext("decode"))
    w_encoding = space.newtext(encoding)
    if errors is None:
        return space.call_function(w_decode, w_obj, w_encoding)
    return space.call_function(w_decode, w_obj, w_encoding,
                               space.newtext(errors))
def decode_object(space, w_obj, encoding, errors):
    """Decode ``w_obj`` and return a wrapped unicode object.

    A None ``encoding`` is replaced by ``getdefaultencoding(space)``.
    In strict mode (``errors`` None or 'strict') the 'ascii' and 'utf-8'
    encodings are decoded here from the char buffer of ``w_obj``.  All
    other cases go through ``decode_text`` from the ``_codecs`` module.

    Raises a TypeError (built with ``oefmt``) when ``decode_text`` returns
    something that is not an instance of ``space.w_unicode``.
    """
    if encoding is None:
        encoding = getdefaultencoding(space)

    strict = errors is None or errors == 'strict'

    if strict and encoding == 'ascii':
        data = space.charbuf_w(w_obj)
        try:
            text = fast_str_decode_ascii(data)
        except ValueError:
            # Decode a second time with the error handler, so that the
            # error is reported correctly.
            handler = unicodehelper.decode_error_handler(space)
            text = str_decode_ascii(
                data, len(data), None, final=True, errorhandler=handler)[0]
        return space.newunicode(text)

    if strict and encoding == 'utf-8':
        data = space.charbuf_w(w_obj)
        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_utf_8(
            data, len(data), None, final=True, errorhandler=handler)
        return space.newunicode(decoded[0])

    from pypy.module._codecs.interp_codecs import decode_text
    w_result = decode_text(space, w_obj, encoding, errors)
    if space.isinstance_w(w_result, space.w_unicode):
        return w_result
    raise oefmt(
        space.w_TypeError,
        "'%s' decoder returned '%T' instead of 'str'; "
        "use codecs.decode() to decode to arbitrary types",
        encoding, w_result)
def check_utf8(space, s, ps, end):
    """Return the run of high-bit bytes of ``s`` that starts at ``ps``.

    The run covers consecutive bytes whose 0x80 bit is set and never
    extends past ``end``.  It is validated with ``rutf8.check_utf8``; on
    ``rutf8.CheckError`` the strict decode error handler for ``space`` is
    called with the message 'invalid utf-8'.
    """
    assert ps >= 0
    start = ps
    stop = ps
    # while (s < end && *s != '\\') s++; */ /* inefficient for u".."
    while stop < end and ord(s[stop]) & 0x80:
        stop += 1
    try:
        rutf8.check_utf8(s, True, start, stop)
    except rutf8.CheckError as e:
        # Re-check only the part that precedes the failure; the first item
        # of the result is used as an offset from ``start`` to locate the
        # error reported to the handler.
        lgt, _flag = rutf8.check_utf8(s, True, start, e.pos)
        handler = unicodehelper.decode_error_handler(space)
        error_pos = start + lgt
        handler('strict', 'utf8', 'invalid utf-8', s,
                error_pos, error_pos + 1)
    return s[start:stop]
def decode_object(space, w_obj, encoding, errors):
    """Decode ``w_obj`` and return the wrapped result.

    When ``encoding`` is None the value of ``getdefaultencoding(space)``
    is used.  Strict decoding (``errors`` None or "strict") of "ascii"
    and "utf-8" is done here on the char buffer of ``w_obj``; any other
    request is passed to the ``decode`` attribute of the ``_codecs``
    builtin module.
    """
    if encoding is None:
        encoding = getdefaultencoding(space)

    if errors is None or errors == "strict":
        if encoding == "ascii":
            # XXX error handling
            buf = space.charbuf_w(w_obj)
            on_error = unicodehelper.decode_error_handler(space)
            result, _ = str_decode_ascii(
                buf, len(buf), None, final=True, errorhandler=on_error)
            return space.wrap(result)
        if encoding == "utf-8":
            buf = space.charbuf_w(w_obj)
            on_error = unicodehelper.decode_error_handler(space)
            result, _ = str_decode_utf_8(
                buf, len(buf), None, final=True, errorhandler=on_error,
                allow_surrogates=True)
            return space.wrap(result)

    # Fallback: call _codecs.decode, with ``errors`` only when provided.
    w_decode = space.getattr(space.getbuiltinmodule("_codecs"),
                             space.wrap("decode"))
    w_encoding = space.wrap(encoding)
    if errors is None:
        return space.call_function(w_decode, w_obj, w_encoding)
    return space.call_function(w_decode, w_obj, w_encoding,
                               space.wrap(errors))
def w_convert(self, space, s):
    """Wrap the byte string ``s`` with ``space.newutf8``.

    ``s`` is validated with ``rutf8.check_utf8`` first, and the value that
    check returns is passed to ``space.newutf8`` as its second argument.
    If the check raises ``rutf8.CheckError``, ``s`` is decoded again
    through ``unicodehelper.str_decode_utf8`` with the strict error
    handler, which is expected to raise the properly formatted error.
    """
    # s is supposed to be valid utf-8 already, but no one has checked it
    # and no one is going to catch an error either.
    try:
        lgt = rutf8.check_utf8(s, True)
        return space.newutf8(s, lgt)
    except rutf8.CheckError:
        from pypy.interpreter import unicodehelper
        # Decode again only to get the correct error message.
        unicodehelper.str_decode_utf8(
            s, 'string', True, unicodehelper.decode_error_handler(space))
        assert False, "always raises"
    # No statement follows: the try body returns and the except body never
    # completes normally, so a trailing return would be dead code.
def raw_encode_basestring_ascii(space, w_string):
    """Escape ``w_string`` into an ASCII-only string and wrap it.

    A bytes object that contains only printable ASCII characters other
    than the double quote and the backslash is returned unchanged.  Other
    bytes objects are decoded as utf-8 before escaping; objects that are
    not bytes are read with ``space.unicode_w``.

    Double quotes and backslashes get a backslash prefix, characters
    below the space character are replaced using ``ESCAPE_BEFORE_SPACE``,
    and everything above '~' is written as one four-hex-digit ``\\u``
    escape, or as two of them (a surrogate pair) above U+FFFF.  The
    result is wrapped with ``space.newtext``.
    """
    if space.isinstance_w(w_string, space.w_bytes):
        raw = space.bytes_w(w_string)
        for pos in range(len(raw)):
            ch = raw[pos]
            if ch < ' ' or ch > '~' or ch == '"' or ch == '\\':
                # first character that needs escaping
                first = pos
                break
        else:
            # the input is a string with only non-special ascii chars
            return w_string

        handler = unicodehelper.decode_error_handler(space)
        u = str_decode_utf_8(
            raw, len(raw), None, final=True, errorhandler=handler,
            allow_surrogates=True)[0]
        sb = StringBuilder(len(u))
        # everything before ``first`` is safe and can be copied as is
        sb.append_slice(raw, 0, first)
    else:
        # We used to check if 'u' contains only safe characters, and return
        # 'w_string' directly.  But this requires an extra pass over all
        # characters, and the expected use case of this function, from
        # json.encoder, will anyway re-encode a unicode result back to
        # a string (with the ascii encoding).  This requires two passes
        # over the characters.  So we may as well directly turn it into a
        # string here --- only one pass.
        u = space.unicode_w(w_string)
        sb = StringBuilder(len(u))
        first = 0

    for idx in range(first, len(u)):
        code = ord(u[idx])
        if code < ord(' '):
            sb.append(ESCAPE_BEFORE_SPACE[code])
        elif code <= ord('~'):
            if code == ord('"') or code == ord('\\'):
                sb.append('\\')
            sb.append(chr(code))
        elif code <= ord(u'\uffff'):
            sb.append('\\u')
            sb.append(HEX[code >> 12])
            sb.append(HEX[(code >> 8) & 0x0f])
            sb.append(HEX[(code >> 4) & 0x0f])
            sb.append(HEX[code & 0x0f])
        else:
            # surrogate pair
            offset = code - 0x10000
            high = 0xd800 | ((offset >> 10) & 0x3ff)
            low = 0xdc00 | (offset & 0x3ff)
            sb.append('\\ud')
            sb.append(HEX[(high >> 8) & 0x0f])
            sb.append(HEX[(high >> 4) & 0x0f])
            sb.append(HEX[high & 0x0f])
            sb.append('\\ud')
            sb.append(HEX[(low >> 8) & 0x0f])
            sb.append(HEX[(low >> 4) & 0x0f])
            sb.append(HEX[low & 0x0f])

    return space.newtext(sb.build())
def raw_encode_basestring_ascii(space, w_string):
    """Escape ``w_string`` into an ASCII-only string and wrap it.

    A str object that contains only printable ASCII characters other than
    the double quote and the backslash is returned unchanged.  Other str
    objects are decoded as utf-8 before escaping; objects that are not
    str are read with ``space.unicode_w``.

    Double quotes and backslashes get a backslash prefix, characters
    below the space character are replaced using ``ESCAPE_BEFORE_SPACE``,
    and everything above '~' is written as one four-hex-digit ``\\u``
    escape, or as two of them (a surrogate pair) above U+FFFF.  The
    result is wrapped with ``space.wrap``.
    """
    if space.isinstance_w(w_string, space.w_str):
        raw = space.str_w(w_string)
        for pos in range(len(raw)):
            ch = raw[pos]
            if ch < ' ' or ch > '~' or ch == '"' or ch == '\\':
                # first character that needs escaping
                first = pos
                break
        else:
            # the input is a string with only non-special ascii chars
            return w_string

        handler = unicodehelper.decode_error_handler(space)
        decoded = str_decode_utf_8(
            raw, len(raw), None, final=True, errorhandler=handler,
            allow_surrogates=True)
        u = decoded[0]
        sb = StringBuilder(len(u))
        # everything before ``first`` is safe and can be copied as is
        sb.append_slice(raw, 0, first)
    else:
        # We used to check if 'u' contains only safe characters, and return
        # 'w_string' directly.  But this requires an extra pass over all
        # characters, and the expected use case of this function, from
        # json.encoder, will anyway re-encode a unicode result back to
        # a string (with the ascii encoding).  This requires two passes
        # over the characters.  So we may as well directly turn it into a
        # string here --- only one pass.
        u = space.unicode_w(w_string)
        sb = StringBuilder(len(u))
        first = 0

    for idx in range(first, len(u)):
        uch = u[idx]
        code = ord(uch)
        if uch <= u'~':
            if uch == u'"' or uch == u'\\':
                sb.append('\\')
            elif uch < u' ':
                sb.append(ESCAPE_BEFORE_SPACE[code])
                continue
            sb.append(chr(code))
            continue

        if uch <= u'\uffff':
            sb.append('\\u')
            sb.append(HEX[code >> 12])
            sb.append(HEX[(code >> 8) & 0x0f])
            sb.append(HEX[(code >> 4) & 0x0f])
            sb.append(HEX[code & 0x0f])
            continue

        # surrogate pair
        offset = code - 0x10000
        high = 0xd800 | ((offset >> 10) & 0x3ff)
        sb.append('\\ud')
        sb.append(HEX[(high >> 8) & 0x0f])
        sb.append(HEX[(high >> 4) & 0x0f])
        sb.append(HEX[high & 0x0f])
        low = 0xdc00 | (offset & 0x3ff)
        sb.append('\\ud')
        sb.append(HEX[(low >> 8) & 0x0f])
        sb.append(HEX[(low >> 4) & 0x0f])
        sb.append(HEX[low & 0x0f])

    escaped = sb.build()
    return space.wrap(escaped)