Exemplo n.º 1
0
def _decode_helper(cp, s, flags, encoding, errors, errorhandler, final, start,
                   end, res):
    """Decode the bytes ``s[start:end]`` using code page ``cp``.

    The decoded piece (or the error handler's replacement) is appended to
    ``res``.  Returns a ``(position, length)`` pair: on success the position
    is ``end`` and the length is ``codepoints_in_utf8`` of the converted
    piece; on failure both values come from ``_decode_cp_error`` and
    ``check_utf8`` of its replacement.
    """
    if len(s) < end:
        end = len(s)
    chunk = s[start:end]
    chunk_len = len(chunk)
    with rffi.scoped_nonmovingbuffer(chunk) as inptr:
        # first pass: null output buffer, only ask for the size of the result
        nchars = MultiByteToWideChar(cp, flags, inptr, chunk_len,
                                     lltype.nullptr(rffi.CWCHARP.TO), 0)
        if nchars == 0:
            replacement, newpos = _decode_cp_error(s, errorhandler, encoding,
                                                   errors, final, start, end)
            res.append(replacement)
            return newpos, check_utf8(replacement, True)

        with rffi.scoped_alloc_unicodebuffer(nchars) as widebuf:
            # second pass: do the actual conversion into the wide buffer
            written = MultiByteToWideChar(cp, flags, inptr, chunk_len,
                                          widebuf.raw, nchars)
            if written == 0:
                replacement, newpos = _decode_cp_error(s, errorhandler,
                                                       encoding, errors,
                                                       final, start, end)
                res.append(replacement)
                return newpos, check_utf8(replacement, True)
            wide = widebuf.str(nchars)
            assert wide is not None
            with rffi.scoped_nonmoving_unicodebuffer(wide) as wideptr:
                utf8 = _unibuf_to_utf8(wideptr, nchars)
            res.append(utf8)
            return end, codepoints_in_utf8(utf8)
Exemplo n.º 2
0
def verify_identifier(token):
    """Check ``token`` as an identifier.

    Result codes: 1=ok; 0=not an identifier; -1=bad utf-8.  The first two
    are whatever ``_isidentifier`` returns for a token that passed the
    UTF-8 check.
    """
    try:
        rutf8.check_utf8(token, False)
    except rutf8.CheckError:
        status = -1
    else:
        # imported only once the token is known to be valid UTF-8
        from pypy.objspace.std.unicodeobject import _isidentifier
        status = _isidentifier(token)
    return status
Exemplo n.º 3
0
def check_utf8(space, s, ps, end):
    """Validate the run of non-ASCII bytes of ``s`` that starts at ``ps``.

    Scans forward from ``ps`` (never past ``end``) while the bytes have
    their high bit set, then checks that this run is valid UTF-8
    (surrogates allowed).

    Returns the scanned slice of ``s``.  On invalid UTF-8 the 'strict'
    decode error handler of ``space`` is invoked for the offending byte.
    """
    assert ps >= 0
    pt = ps
    # while (s < end && *s != '\\') s++; */ /* inefficient for u".."
    while ps < end and ord(s[ps]) & 0x80:
        ps += 1
    try:
        rutf8.check_utf8(s, True, pt, ps)
    except rutf8.CheckError as e:
        # rutf8.check_utf8 returns a single length, so it cannot be unpacked
        # into a pair; and a number of code points is not a byte offset in a
        # run of multi-byte sequences.  Report the error at the failing byte.
        # NOTE(review): relies on CheckError.pos being the byte index of the
        # offending byte within `s` -- confirm against rutf8.
        errorpos = e.pos
        unicodehelper.decode_error_handler(space)('strict', 'utf8',
                                                  'invalid utf-8', s,
                                                  errorpos, errorpos + 1)
    return s[pt:ps]
Exemplo n.º 4
0
    def wrap(self, x):
        """Turn the interpreter-level value 'x' into a wrapped object.

        Meant for tests only; real code has to call the explicit new*
        methods.  Raises TypeError when 'x' is an OperationError."""
        if x is None:
            return self.w_None
        if isinstance(x, OperationError):
            raise TypeError("attempt to wrap already wrapped exception: %s"%
                              (x,))
        # bool is a subclass of int, so it has to be recognised first
        if isinstance(x, bool):
            return self.newbool(x)
        if isinstance(x, int):
            return self.newint(x)
        if isinstance(x, str):
            return self.newtext(x)
        if isinstance(x, unicode):
            utf8 = x.encode('utf8')
            return self.newutf8(utf8, rutf8.check_utf8(utf8, True))
        if isinstance(x, float):
            return W_FloatObject(x)
        if isinstance(x, W_Root):
            return x.spacebind(self)
        # base_int instances, and (in non-translated versions) a long
        # that fits the correct range, are wrapped as plain ints.
        if isinstance(x, base_int) or is_valid_int(x):
            return self.newint(x)

        return self._wrap_not_rpython(x)
Exemplo n.º 5
0
def xmlcharrefreplace_errors(space, w_exc):
    """Error callback replacing each character of the failing range with
    its ``&#<decimal code point>;`` reference.

    Returns a wrapped tuple ``(replacement, end)`` where ``end`` is the
    exception's own 'end' attribute.  Raises TypeError for anything that is
    not a UnicodeEncodeError.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_text = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_text)  # result discarded: called for errors only
    w_text = space.convert_arg_to_w_unicode(w_text)
    start_index = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end_index = space.int_w(w_end)
    # the exception carries character indexes; the loop walks utf8 bytes
    cursor = w_text._index_to_byte(start_index)
    limit = w_text._index_to_byte(end_index)
    utf8 = w_text._utf8
    out = StringBuilder()
    while cursor < limit:
        code = rutf8.codepoint_at_pos(utf8, cursor)
        out.append("&#" + str(code) + ";")
        cursor = rutf8.next_codepoint_pos(utf8, cursor)
    replacement = out.build()
    w_replacement = space.newutf8(replacement,
                                  rutf8.check_utf8(replacement, True))
    return space.newtuple([w_replacement, w_end])
Exemplo n.º 6
0
def namereplace_errors(space, w_exc):
    """Error callback replacing each character of the failing range with
    ``\\N{NAME}``, or with a raw-unicode-escape when ``unicodedb`` has no
    name for it.

    Returns a wrapped tuple ``(replacement, end)`` where ``end`` is the
    exception's own 'end' attribute.  Raises TypeError for anything that is
    not a UnicodeEncodeError.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_text = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_text)  # for errors
    w_text = space.convert_arg_to_w_unicode(w_text)
    start_index = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end_index = space.int_w(w_end)
    # the exception carries character indexes; the loop walks utf8 bytes
    cursor = w_text._index_to_byte(start_index)
    limit = w_text._index_to_byte(end_index)
    out = StringBuilder()
    utf8 = w_text._utf8
    while cursor < limit:
        code = rutf8.codepoint_at_pos(utf8, cursor)
        try:
            name = unicodedb.name(code)
        except KeyError:
            # unnamed code point: fall back to a numeric escape
            unicodehelper.raw_unicode_escape_helper(out, code)
        else:
            out.append('\\N{' + name + '}')
        cursor = rutf8.next_codepoint_pos(utf8, cursor)
    replacement = out.build()
    w_replacement = space.newutf8(replacement,
                                  rutf8.check_utf8(replacement, True))
    return space.newtuple([w_replacement, w_end])
Exemplo n.º 7
0
def _maybe_utf8_to_w(space, utf8):
    """Convert the C string ``utf8`` to a wrapped text object.

    The string is checked as UTF-8 with surrogates disallowed; a
    ``rutf8.CheckError`` from that check propagates to the caller.
    """
    # should this be a method of space?
    text = rffi.constcharp2str(utf8)
    # XXX do something about rutf8.CheckError instead of letting it escape
    n_codepoints = rutf8.check_utf8(text, allow_surrogates=False)
    return space.newtext(text, n_codepoints)
Exemplo n.º 8
0
def format_method(space, w_string, args, is_unicode):
    """Run the template formatter for ``w_string`` over ``args``.

    With ``is_unicode`` the template is read as utf8 and the result wrapped
    with ``newutf8``; otherwise it is read and wrapped as bytes.
    """
    if not is_unicode:
        formatter = str_template_formatter(space, space.bytes_w(w_string))
        return space.newbytes(formatter.build(args))
    formatter = unicode_template_formatter(space, space.utf8_w(w_string))
    built = formatter.build(args)
    return space.newutf8(built, rutf8.check_utf8(built, True))
Exemplo n.º 9
0
 def getmappingvalue(self, key):
     """Look up ``key`` in the input dict and return the wrapped value.

     Raises TypeError when no mapping was supplied.  ``do_unicode`` comes
     from the enclosing scope and selects a utf8 or a bytes key.
     """
     space = self.space
     w_mapping = self.w_valuedict
     if w_mapping is None:
         raise oefmt(space.w_TypeError, "format requires a mapping")
     if not do_unicode:
         w_key = space.newbytes(key)
     else:
         w_key = space.newutf8(key, rutf8.check_utf8(key, True))
     return space.getitem(w_mapping, w_key)
Exemplo n.º 10
0
 def w_convert(self, space, s):
     """Wrap the byte string ``s`` as a utf8 object of ``space``.

     ``s`` is checked as UTF-8 (surrogates allowed).  When the check fails,
     ``unicodehelper.str_decode_utf8`` is called with the space's decode
     error handler purely to get the properly formatted error raised.
     """
     # I suppose this is a valid utf8, but there is no one to check
     # and no one to catch an error either
     try:
         lgt = rutf8.check_utf8(s, True)
     except rutf8.CheckError:
         from pypy.interpreter import unicodehelper
         # get the correct error msg
         unicodehelper.str_decode_utf8(
             s, 'string', True, unicodehelper.decode_error_handler(space))
         # explicit raise (not a strippable assert): control must never
         # continue past this point without a valid 'lgt'
         raise AssertionError("always raises")
     return space.newutf8(s, lgt)
Exemplo n.º 11
0
        def call_errorhandler(errors, encoding, reason, input, startpos,
                              endpos):
            """Generic wrapper for calling into error handlers.

            The handler registered under ``errors`` is called with a freshly
            built UnicodeDecodeError (when decoding) or UnicodeEncodeError
            (when encoding).

            Note that the error handler receives and returns positions into
            the unicode characters, not into the utf8 bytes, so the codec
            has to convert them.

            Returns ``(replacement_utf8, newpos)``.  Raises TypeError when
            the handler does not return a ``(unicode, int)`` 2-tuple and
            IndexError when the returned position is out of bounds.
            """
            w_errorhandler = lookup_error(space, errors)
            if not decode:
                w_cls = space.w_UnicodeEncodeError
                length = rutf8.check_utf8(input, allow_surrogates=True)
                w_input = space.newutf8(input, length)
            else:
                w_cls = space.w_UnicodeDecodeError
                w_input = space.newbytes(input)
                length = len(input)
            w_exc = space.call_function(
                w_cls,
                space.newtext(encoding),
                w_input,
                space.newint(startpos),
                space.newint(endpos),
                space.newtext(reason))
            w_res = space.call_function(w_errorhandler, w_exc)
            well_formed = (space.isinstance_w(w_res, space.w_tuple)
                           and space.len_w(w_res) == 2
                           and space.isinstance_w(
                               space.getitem(w_res, space.newint(0)),
                               space.w_unicode))
            if not well_formed:
                raise oefmt(space.w_TypeError,
                            "%s error handler must return (unicode, int) "
                            "tuple, not %R",
                            "decoding" if decode else "encoding", w_res)
            w_replace, w_newpos = space.fixedview(w_res, 2)
            newpos = space.int_w(w_newpos)
            if newpos < 0:
                # negative positions count from the end of the input
                newpos += length
            if newpos < 0 or length < newpos:
                raise oefmt(space.w_IndexError,
                            "position %d from error handler out of bounds",
                            newpos)
            w_replace = space.convert_to_w_unicode(w_replace)
            return w_replace._utf8, newpos
Exemplo n.º 12
0
 def _compute_value(self, space):
     """Assemble the formatted message from its fragments and arguments.

     ``formats`` and ``entries`` come from the enclosing scope; each entry
     is ``(index, format code, attribute name)``.  Returns a pair
     ``(message, lgt)`` where ``message`` is the joined string and ``lgt``
     the length accumulated alongside it.
     """
     # even slots hold the literal fragments (self.xstrings), odd slots the
     # formatted arguments; the extra last slot is the trailing fragment
     lst = [None] * (len(formats) + len(formats) + 1)
     lgt = 0
     for i, fmt, attr in entries:
         lst[i + i] = self.xstrings[i]
         # NOTE(review): byte length; presumably the fragments are ASCII
         lgt += len(self.xstrings[i])
         value = getattr(self, attr)
         if fmt == 'd':
             result = str(value)
             lgt += len(result)
         elif fmt == 'R':
             # wrapped repr(); its length is taken from the wrapped object
             s = space.repr(value)
             result = space.utf8_w(s)
             lgt += space.len_w(s)
         elif fmt == 'S':
             # wrapped str(); its length is taken from the wrapped object
             s = space.str(value)
             result = space.utf8_w(s)
             lgt += space.len_w(s)
         elif fmt == 'T':
             result = space.type(value).name
             lgt += rutf8.codepoints_in_utf8(result)
         elif fmt == 'N':
             result = value.getname(space)
             # NOTE(review): len() here, but codepoints_in_utf8() for 'T';
             # confirm getname() can only return ASCII
             lgt += len(result)
         elif fmt == '8':
             # u'str\uxxxx' -> 'str\xXX\xXX' -> u"'str\xXX\xXX'"
             from pypy.interpreter import unicodehelper
             result, _lgt, pos = unicodehelper.str_decode_utf8(
                 value, 'replace', True,
                 unicodehelper.decode_never_raise, True)
             lgt += _lgt
         elif isinstance(value, unicode):
             # 's'
             result = str(value.encode('utf-8'))
             lgt += len(value)
         else:
             result = str(value)
             try:
                 lgt += rutf8.check_utf8(result, True)
             except rutf8.CheckError as e:
                 # NOTE(review): for invalid UTF-8 the error position is
                 # *subtracted* from the running length -- confirm that
                 # this is intended
                 lgt -= e.pos
         lst[i + i + 1] = result
     lst[-1] = self.xstrings[-1]
     lgt += len(self.xstrings[-1])
     retval = ''.join(lst)
     return retval, lgt
Exemplo n.º 13
0
def utf_8_decode(space, string, errors="strict", w_final=None):
    """Decode ``string`` as UTF-8.

    Returns a wrapped 2-tuple ``(decoded text, consumed)``.  Input that
    passes the fast validity check (surrogates allowed) is wrapped as is
    and fully consumed; otherwise ``unicodehelper.str_decode_utf8`` decodes
    it with the codec state's decode error handler.
    """
    from pypy.interpreter import unicodehelper

    if errors is None:
        errors = 'strict'
    final = space.is_true(w_final)
    state = space.fromcache(CodecState)
    try:
        # call the fast version for checking
        n_chars = rutf8.check_utf8(string, allow_surrogates=True)
    except rutf8.CheckError:
        decoded, n_consumed, n_chars = unicodehelper.str_decode_utf8(
            string, errors, final, state.decode_error_handler)
    else:
        decoded = string
        n_consumed = len(string)
    return space.newtuple2(space.newutf8(decoded, n_chars),
                           space.newint(n_consumed))
Exemplo n.º 14
0
def show_warning(space, w_filename, lineno, w_text, w_category,
                 w_sourceline=None):
    """Format the warning line "filename:lineno: category: text".

    NOTE(review): the visible code only builds the message; ``w_stderr``
    and ``w_message`` are never used afterwards and ``w_message`` is only
    assigned on the fallback path, so this definition looks truncated --
    confirm against the full source.
    """
    w_name = space.getattr(w_category, space.newtext("__name__"))
    w_stderr = space.sys.get("stderr")

    # Print "filename:lineno: category: text\n"
    try:
        message = "%s:%d: %s: %s\n" % (space.text_w(w_filename), lineno,
                                       space.text_w(w_name),
                                       space.text_w(w_text))
    except OperationError as e:
        # asynchronous errors are re-raised untouched
        # (Python 2 only: 'async' is a reserved word in recent Python 3)
        if e.async(space):
            raise
        # fallback: rebuild the message from the utf8 form of the pieces
        message = "%s:%d: %s: %s\n" % (space.utf8_w(w_filename), lineno,
                                        space.utf8_w(w_name),
                                        space.utf8_w(w_text))
        lgt = rutf8.check_utf8(message, True)
        w_message = space.newutf8(message, lgt)
Exemplo n.º 15
0
def multibytecodec_encerror(encodebuf, e, errors, errorcb, namecb,
                            unicodedata):
    """Handle the encoder error code ``e`` according to ``errors``.

    A positive ``e`` is the size of an illegal sequence; MBERR_TOOFEW means
    an incomplete one.  MBERR_NOMEMORY raises MemoryError and any other
    code RuntimeError.  With errors == "strict" an EncodeDecodeError is
    raised; otherwise a replacement is computed ("ignore", "replace" or
    through ``errorcb``) and handed to the encoder buffer.
    """
    if e > 0:
        why = "illegal multibyte sequence"
        width = e
    elif e == MBERR_TOOFEW:
        why = "incomplete multibyte sequence"
        width = pypy_cjk_enc_inbuf_remaining(encodebuf)
    elif e == MBERR_NOMEMORY:
        raise MemoryError
    else:
        raise RuntimeError
    #
    # work out the replacement string 'repl' and the position in the
    # input 'unicodedata' at which encoding resumes, 'stop'
    begin = pypy_cjk_enc_inbuf_consumed(encodebuf)
    stop = begin + width
    if errors == "strict":
        raise EncodeDecodeError(begin, stop, why)
    if errors == "ignore":
        repl = ""
        kind = 'b'  # != 'u'
    elif errors == "replace":
        repl = "?"  # utf-8 unicode
        kind = 'u'
    else:
        assert errorcb
        repl, stop, kind, _obj = errorcb(errors, namecb, why,
                                         unicodedata, begin, stop)
    if kind == 'u':
        # a unicode replacement still has to go through the codec;
        # any other kind means 'repl' is a byte string already
        codec = pypy_cjk_enc_getcodec(encodebuf)
        repl = encode(codec, repl, rutf8.check_utf8(repl, False),
                      copystate=encodebuf)
    with rffi.scoped_nonmovingbuffer(repl) as inbuf:
        status = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(repl),
                                               stop)
    if status == MBERR_NOMEMORY:
        raise MemoryError
Exemplo n.º 16
0
def backslashreplace_errors(space, w_exc):
    """Error callback replacing each character of the failing range with a
    backslash escape: ``\\xNN``, ``\\uNNNN`` or ``\\UNNNNNNNN`` depending on
    the size of the code point.

    Returns a wrapped tuple ``(replacement, end)`` where ``end`` is the
    exception's own 'end' attribute.  Raises TypeError for anything that is
    not a UnicodeEncodeError.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_text = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_text)  # for errors
    w_text = space.convert_arg_to_w_unicode(w_text)
    start_index = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end_index = space.int_w(w_end)
    # the exception carries character indexes; the loop walks utf8 bytes
    cursor = w_text._index_to_byte(start_index)
    limit = w_text._index_to_byte(end_index)
    out = StringBuilder()
    utf8 = w_text._utf8
    while cursor < limit:
        code = rutf8.codepoint_at_pos(utf8, cursor)
        digits = hex(code)  # starts with '0x'
        if code >= 0x10000:
            prefix = "\\U"
            width = 8
        elif code >= 0x100:
            prefix = "\\u"
            width = 4
        else:
            prefix = "\\x"
            width = 2
        out.append(prefix)
        ndigits = len(digits)
        padding = width + 2 - ndigits
        if padding > 0:
            out.append_multiple_char('0', padding)
        out.append_slice(digits, 2, ndigits)
        cursor = rutf8.next_codepoint_pos(utf8, cursor)
    replacement = out.build()
    w_replacement = space.newutf8(replacement,
                                  rutf8.check_utf8(replacement, True))
    return space.newtuple([w_replacement, w_end])
Exemplo n.º 17
0
Arquivo: gateway.py Projeto: Mu-L/pypy
    def __init__(self,
                 func,
                 unwrap_spec=None,
                 self_type=None,
                 descrmismatch=None,
                 doc=None):
        """Build the code object for the interpreter-level function 'func'.

        'unwrap_spec' describes how each argument is unwrapped; when None
        it is computed by build_unwrap_spec().  'self_type' is the class
        of the 'self' argument, if any; 'descrmismatch' may only be given
        together with it.  'doc' overrides func.__doc__ as docstring.
        """
        from rpython.rlib import rutf8
        from rpython.flowspace.bytecode import cpython_code_signature
        # 'implfunc' is the interpreter-level function.
        # Note that this uses a lot of (construction-time) introspection.
        Code.__init__(self, func.__name__)
        self.docstring = doc or func.__doc__
        if self.docstring:
            # check that it's utf-8
            # (surrogates not allowed; CheckError propagates otherwise)
            rutf8.check_utf8(self.docstring, False)

        # "<module>-<function name>-<self_type name, or '*'>"
        self.identifier = "%s-%s-%s" % (func.__module__, func.__name__,
                                        getattr(self_type, '__name__', '*'))

        # unwrap_spec can be passed to interp2app or
        # attached as an attribute to the function.
        # It is a list of types or singleton objects:
        #  baseobjspace.ObjSpace is used to specify the space argument
        #  baseobjspace.W_Root is for wrapped arguments to keep wrapped
        #  argument.Arguments is for a final rest arguments Arguments object
        # 'args_w' for fixedview applied to rest arguments
        # 'w_args' for rest arguments passed as wrapped tuple
        # str,int,float: unwrap argument as such type
        # (function, cls) use function to check/unwrap argument of type cls

        # First extract the signature from the (CPython-level) code object
        sig = cpython_code_signature(func.func_code)
        argnames = sig.argnames
        varargname = sig.varargname
        kwargname = sig.kwargname
        self._argnames = argnames

        if unwrap_spec is None:
            unwrap_spec = build_unwrap_spec(func, argnames, self_type)

        if self_type:
            assert unwrap_spec[
                0] == 'self', "self_type without 'self' spec element"
            # work on a copy: the first element is replaced below
            unwrap_spec = list(unwrap_spec)
            if descrmismatch is not None:
                assert issubclass(self_type, W_Root)
                unwrap_spec[0] = ('INTERNAL:self', self_type)
                self.descrmismatch_op = descrmismatch
                self.descr_reqcls = self_type
            else:
                unwrap_spec[0] = self_type
        else:
            assert descrmismatch is None, (
                "descrmismatch without a self-type specified")

        app_sig = SignatureBuilder(func)

        UnwrapSpec_Check(func, argnames).apply_over(unwrap_spec, app_sig)
        # from here on, the names refer to the signature built by app_sig
        self.sig = app_sig.signature()
        argnames = self.sig.argnames
        varargname = self.sig.varargname

        self.minargs = len(argnames)
        if varargname:
            # a '*args' parameter: no upper bound on the argument count
            self.maxargs = sys.maxint
        else:
            self.maxargs = self.minargs

        self.activation = UnwrapSpec_EmitRun.make_activation(unwrap_spec, func)
        self._bltin = func
        self._unwrap_spec = unwrap_spec

        # speed hack
        # (only attempted for unwrap specs of at most 5 elements)
        if 0 <= len(unwrap_spec) <= 5:
            try:
                arity, fastfunc = UnwrapSpec_FastFunc_Unwrap.make_fastfunc(
                    unwrap_spec, func)
            except FastFuncNotSupported:
                # no fast function: for three specific spec shapes switch
                # to a pass-through class; any other shape keeps the
                # class unchanged
                if unwrap_spec == [ObjSpace, Arguments]:
                    self.__class__ = BuiltinCodePassThroughArguments0
                    self.func__args__ = func
                elif unwrap_spec == [ObjSpace, W_Root, Arguments]:
                    self.__class__ = BuiltinCodePassThroughArguments1
                    self.func__args__ = func
                elif unwrap_spec == [self_type, ObjSpace, Arguments]:
                    self.__class__ = BuiltinCodePassThroughArguments1
                    self.descr_reqcls = self_type
                    miniglobals = {'func': func, 'self_type': self_type}
                    d = {}
                    # generated adapter: takes (space, w_obj, args),
                    # unwraps w_obj to 'self_type' and calls func with
                    # self first
                    source = """if 1:
                        def _call(space, w_obj, args):
                            self = space.descr_self_interp_w(self_type, w_obj)
                            return func(self, space, args)
                        \n"""
                    exec compile2(source) in miniglobals, d
                    self.func__args__ = d['_call']
            else:
                # fast function available: switch to the BuiltinCode<arity>
                # class and store it as 'fastfunc_<arity>'
                self.__class__ = globals()['BuiltinCode%d' % arity]
                setattr(self, 'fastfunc_%d' % arity, fastfunc)
Exemplo n.º 18
0
def test_check_utf8_slice(a, b, c):
    """check_utf8 over just the slice holding the encoded ``b`` (between
    the ``a`` prefix and the ``c`` suffix) must return ``len(b)``."""
    encoded = b.encode('utf-8')
    lo = len(a)
    hi = lo + len(encoded)
    whole = a + encoded + c
    assert rutf8.check_utf8(whole, False, lo, hi) == len(b)
Exemplo n.º 19
0
def verify_utf8(token):
    """Return True if ``token`` passes ``rutf8.check_utf8`` with
    surrogates disallowed, False if the check raises CheckError."""
    try:
        rutf8.check_utf8(token, False)
    except rutf8.CheckError:
        valid = False
    else:
        valid = True
    return valid
Exemplo n.º 20
0
 def wrap(self, u):
     """Wrap the utf8 string ``u`` together with the length reported by
     ``rutf8.check_utf8`` (surrogates allowed)."""
     return self.space.newutf8(u, rutf8.check_utf8(u, True))
Exemplo n.º 21
0
 def _parse_spec(self, default_type, default_align):
     """Parse ``self.spec`` into the individual format fields.

     Stores ``_fill_char``, ``_align``, ``_alternate``, ``_sign``,
     ``_thousands_sep`` and ``_precision`` on self, and for a non-empty
     spec also ``_width`` and ``_type``.

     Returns True when the spec is empty (only the defaults above were
     stored) and False once a non-empty spec has been parsed.

     Raises ValueError for a missing precision after '.', for trailing
     characters, for an invalid presentation type, and for a ',' combined
     with a type that does not accept it.
     """
     space = self.space
     self._fill_char = self._lit(" ")[0]
     self._align = default_align
     self._alternate = False
     self._sign = "\0"
     self._thousands_sep = False
     self._precision = -1
     the_type = default_type
     spec = self.spec
     if not spec:
         return True
     length = len(spec)
     i = 0
     got_align = True
     got_fill_char = False
     # The single character could be utf8-encoded unicode
     if self.is_unicode:
         after_i = rutf8.next_codepoint_pos(spec, i)
     else:
         after_i = i + 1
     # A fill character is only present if an alignment character follows
     # it.  The bound is checked on 'after_i' itself: with a multi-byte
     # first character, after_i can be equal to 'length' even though at
     # least two bytes are left, and spec[after_i] would be out of range.
     if after_i < length and self._is_alignment(spec[after_i]):
         self._align = spec[after_i]
         self._fill_char = spec[i:after_i]
         got_fill_char = True
         i = after_i + 1
     elif length - i >= 1 and self._is_alignment(spec[i]):
         self._align = spec[i]
         i += 1
     else:
         got_align = False
     if length - i >= 1 and self._is_sign(spec[i]):
         self._sign = spec[i]
         i += 1
     if length - i >= 1 and spec[i] == "#":
         self._alternate = True
         i += 1
     # a leading "0" means zero-filling, unless a fill char was given
     if not got_fill_char and length - i >= 1 and spec[i] == "0":
         self._fill_char = self._lit("0")[0]
         if not got_align:
             self._align = "="
         i += 1
     self._width, i = _parse_int(self.space, spec, i, length)
     if length != i and spec[i] == ",":
         self._thousands_sep = True
         i += 1
     if length != i and spec[i] == ".":
         i += 1
         self._precision, i = _parse_int(self.space, spec, i, length)
         if self._precision == -1:
             raise oefmt(space.w_ValueError, "no precision given")
     # at most one character (the presentation type) may be left
     if length - i > 1:
         raise oefmt(space.w_ValueError, "invalid format spec")
     if length - i == 1:
         presentation_type = spec[i]
         if self.is_unicode:
             try:
                 rutf8.check_utf8(spec[i], True)
                 the_type = spec[i][0]
             except rutf8.CheckError:
                 raise oefmt(space.w_ValueError,
                             "invalid presentation type")
         else:
             the_type = presentation_type
         i += 1
     self._type = the_type
     if self._thousands_sep:
         tp = self._type
         if (tp == "d" or tp == "e" or tp == "f" or tp == "g"
                 or tp == "E" or tp == "G" or tp == "%" or tp == "F"
                 or tp == "\0"):
             # ok
             pass
         else:
             raise oefmt(space.w_ValueError, "invalid type with ','")
     return False
Exemplo n.º 22
0
    def decode_w(self, space, w_input, final=False):
        """Decode 'w_input' and track/translate the newlines it contains.

        The input first goes through self.w_decoder (unless that is the
        wrapped None).  A trailing '\\r' is held back in self.pendingcr
        until the next call unless 'final' is true.  The kinds of newline
        seen are or-ed into self.seennl and, when self.translate is set,
        '\\r' and '\\r\\n' are turned into '\\n'.

        Returns the result as a wrapped utf8 string.  Raises ValueError
        if __init__ was not called and TypeError if the decoder does not
        return a unicode object.
        """
        if self.w_decoder is None:
            raise oefmt(space.w_ValueError,
                        "IncrementalNewlineDecoder.__init__ not called")

        # decode input (with the eventual \r from a previous pass)
        if not space.is_w(self.w_decoder, space.w_None):
            w_output = space.call_method(self.w_decoder, "decode", w_input,
                                         space.newbool(bool(final)))
        else:
            w_output = w_input

        if not space.isinstance_w(w_output, space.w_unicode):
            raise oefmt(space.w_TypeError,
                        "decoder should return a string result")

        output, output_len = space.utf8_len_w(w_output)
        # NOTE(review): the length unpacked above is overwritten right
        # away; from here on output_len is the *byte* length of 'output',
        # which is what the byte-indexed loops below need
        output_len = len(output)
        if self.pendingcr and (final or output_len):
            # put back the '\r' held over from the previous call
            output = '\r' + output
            self.pendingcr = False
            output_len += 1

        # retain last \r even when not translating data:
        # then readline() is sure to get \r\n in one pass
        if not final and output_len > 0:
            last = len(output) - 1
            assert last >= 0
            if output[last] == '\r':
                output = output[:last]
                self.pendingcr = True
                output_len -= 1

        if output_len == 0:
            return space.newutf8("", 0)

        # Record which newlines are read and do newline translation if
        # desired, all in one pass.
        seennl = self.seennl

        if output.find('\r') < 0:
            # If no \r, quick scan for a possible "\n" character.
            # (there's nothing else to be done, even when in translation mode)
            if output.find('\n') >= 0:
                seennl |= SEEN_LF
                # Finished: we have scanned for newlines, and none of them
                # need translating.
        elif not self.translate:
            # only record the kinds of newline; 'output' is left unchanged
            i = 0
            while i < len(output):
                if seennl == SEEN_ALL:
                    # every kind was already seen: nothing more to learn
                    break
                c = output[i]
                i += 1
                if c == '\n':
                    seennl |= SEEN_LF
                elif c == '\r':
                    if i < len(output) and output[i] == '\n':
                        seennl |= SEEN_CRLF
                        i += 1
                    else:
                        seennl |= SEEN_CR
        # NOTE(review): this condition is always true when reached, since
        # the first branch already handled "no '\r' in output"
        elif output.find('\r') >= 0:
            # Translate!
            builder = StringBuilder(len(output))
            i = 0
            while i < output_len:
                c = output[i]
                i += 1
                if c == '\n':
                    seennl |= SEEN_LF
                elif c == '\r':
                    if i < len(output) and output[i] == '\n':
                        seennl |= SEEN_CRLF
                        i += 1
                    else:
                        seennl |= SEEN_CR
                    # both '\r' and '\r\n' come out as a single '\n'
                    builder.append('\n')
                    continue
                builder.append(c)
            output = builder.build()

        self.seennl |= seennl
        # the length passed to newutf8 is recomputed from the final string
        lgt = check_utf8(output, True)
        return space.newutf8(output, lgt)