示例#1
0
 def w_convert(self, space, s):
     # Wrap the byte string `s` with space.newutf8(), using the length
     # reported by rutf8.check_utf8().
     # I suppose this is valid utf8, but there is no one to check
     # and no one to catch an error either
     try:
         lgt = rutf8.check_utf8(s, True)
         return space.newutf8(s, lgt)
     except rutf8.CheckError:
         from pypy.interpreter import unicodehelper
         # `s` failed validation: run it through the real decoder with the
         # space's error handler, only to get the correct error msg
         unicodehelper.str_decode_utf8(
             s, 'string', True, unicodehelper.decode_error_handler(space))
         # Both paths above leave the function, so no fallback return is
         # needed after this point.
         assert False, "always raises"
示例#2
0
def utf_8_decode(space, string, errors="strict", w_final=None):
    # Decode `string` through unicodehelper.str_decode_utf8() and return a
    # wrapped 2-tuple built from the decoder's three results: a newutf8
    # object (first and second result) and a newint (third result).
    if errors is None:
        # A missing error policy falls back to 'strict'.
        errors = 'strict'
    final = space.is_true(w_final)
    codec_state = space.fromcache(CodecState)
    decoded, length, consumed = unicodehelper.str_decode_utf8(
        string, errors, final, codec_state.decode_error_handler)
    w_decoded = space.newutf8(decoded, length)
    w_consumed = space.newint(consumed)
    return space.newtuple([w_decoded, w_consumed])
示例#3
0
 def descr_decode(self, space, w_encoding=None, w_errors=None):
     # Decode this object's character buffer into a text object.
     # utf8 (the default when no encoding is given) is handled inline;
     # every other encoding is delegated to decode_object().
     from pypy.objspace.std.unicodeobject import (
         get_encoding_and_errors, decode_object)
     encoding, errors = get_encoding_and_errors(space, w_encoding, w_errors)
     if encoding is None:
         encoding = 'utf8'
     if encoding != 'utf8' and encoding != 'utf-8':
         # generic path
         return decode_object(space, self, encoding, errors)
     # fast path - do not call into app-level codecs.py
     from pypy.module._codecs.interp_codecs import CodecState
     codec_state = space.fromcache(CodecState)
     handler = codec_state.decode_error_handler
     raw = space.charbuf_w(self)
     # The third result of the decoder is not needed here.
     text, length, _pos = str_decode_utf8(raw, errors, True, handler)
     return space.newtext(text, length)
示例#4
0
 def _compute_value(self, space):
     # Build the formatted message by interleaving the literal pieces in
     # self.xstrings with the rendered attribute values of `self`.
     #
     # Returns a (retval, lgt) tuple: the joined string and a running
     # length total (presumably a codepoint count, since the 'T' branch
     # uses rutf8.codepoints_in_utf8 -- TODO confirm).
     #
     # `formats` and `entries` are not defined in this method; presumably
     # they come from an enclosing scope -- verify against the definition.

     # One slot per literal piece plus one slot per formatted value.
     lst = [None] * (len(formats) + len(formats) + 1)
     lgt = 0
     for i, fmt, attr in entries:
         # Even slots hold the literal piece that precedes value number i.
         lst[i + i] = self.xstrings[i]
         lgt += len(self.xstrings[i])
         value = getattr(self, attr)
         if fmt == 'd':
             result = str(value)
             lgt += len(result)
         elif fmt == 'R':
             # repr() of the value taken through the object space; the
             # length comes from the wrapped result, not the utf8 bytes.
             s = space.repr(value)
             result = space.utf8_w(s)
             lgt += space.len_w(s)
         elif fmt == 'S':
             # Same as 'R' but using space.str().
             s = space.str(value)
             result = space.utf8_w(s)
             lgt += space.len_w(s)
         elif fmt == 'T':
             # Name of the value's type.
             result = space.type(value).name
             lgt += rutf8.codepoints_in_utf8(result)
         elif fmt == 'N':
             result = value.getname(space)
             # NOTE(review): len() is used here while the 'T' branch uses
             # codepoints_in_utf8 -- presumably names are ASCII; confirm.
             lgt += len(result)
         elif fmt == '8':
             # u'str\uxxxx' -> 'str\xXX\xXX' -> u"'str\xXX\xXX'"
             from pypy.interpreter import unicodehelper
             # Decoded with the 'replace' policy and the decode_never_raise
             # handler; `pos` is not used afterwards.
             result, _lgt, pos = unicodehelper.str_decode_utf8(
                 value, 'replace', True,
                 unicodehelper.decode_never_raise, True)
             lgt += _lgt
         elif isinstance(value, unicode):
             # 's'
             # Unicode values are encoded to utf-8 bytes; the length added
             # is that of the unicode string itself.
             result = str(value.encode('utf-8'))
             lgt += len(value)
         else:
             result = str(value)
             try:
                 lgt += rutf8.check_utf8(result, True)
             except rutf8.CheckError as e:
                 # NOTE(review): this subtracts e.pos from the running
                 # total when `result` fails the utf8 check; subtracting
                 # looks suspicious -- confirm the intended accounting.
                 lgt -= e.pos
         # Odd slots hold the rendered value.
         lst[i + i + 1] = result
     # The final literal piece follows the last formatted value.
     lst[-1] = self.xstrings[-1]
     lgt += len(self.xstrings[-1])
     retval = ''.join(lst)
     return retval, lgt
示例#5
0
def utf_8_decode(space, string, errors="strict", w_final=None):
    # Decode `string` as utf-8 and return a wrapped 2-tuple holding a
    # newutf8 object and a newint.  Input that passes the fast validity
    # check is wrapped unchanged; otherwise the full decoder is run with
    # the codec state's error handler.
    from pypy.interpreter import unicodehelper

    if errors is None:
        errors = 'strict'
    final = space.is_true(w_final)
    codec_state = space.fromcache(CodecState)
    try:
        # call the fast version for checking
        length = rutf8.check_utf8(string, allow_surrogates=True)
    except rutf8.CheckError:
        # slow path: the check failed, let the real decoder handle it
        decoded, consumed, length = unicodehelper.str_decode_utf8(
            string, errors, final, codec_state.decode_error_handler)
    else:
        # fast path: the input is used as-is and counts as fully consumed
        decoded = string
        consumed = len(string)
    w_decoded = space.newutf8(decoded, length)
    w_consumed = space.newint(consumed)
    return space.newtuple2(w_decoded, w_consumed)
示例#6
0
def decode_utf8(u):
    # Thin convenience wrapper: forwards `u` to str_decode_utf8() with the
    # fixed extra arguments True, "strict" and None, and hands back
    # whatever that call returns.
    result = str_decode_utf8(u, True, "strict", None)
    return result