예제 #1
0
파일: formatting.py 프로젝트: zcxowwww/pypy
 def unknown_fmtchar(self):
     """Raise an app-level ValueError for the format character just read.

     The offending character is the one located right before
     ``self.fmtpos`` in ``self.fmt``.  This method never returns.
     """
     space = self.space
     fmt = self.fmt
     bytepos = self.fmtpos - 1
     if do_unicode:
         # Decode the character at that position and count the code
         # points preceding it, so the reported index is not a byte offset.
         code = rutf8.codepoint_at_pos(fmt, bytepos)
         index = rutf8.codepoints_in_utf8(fmt, 0, bytepos)
         encoded = rutf8.unichr_as_utf8(r_uint(code), allow_surrogates=True)
         w_char = space.newutf8(encoded, 1)
     else:
         code = ord(fmt[bytepos])
         index = bytepos
         w_char = space.newbytes(chr(code))
     raise oefmt(space.w_ValueError,
                 "unsupported format character %R (%s) at index %d",
                 w_char, hex(code), index)
예제 #2
0
    def lookup(self, space, name):
        """Return the wrapped string designated by the character `name`.

        The name is upper-cased before being looked up.  An unknown name
        raises an app-level KeyError.
        """
        try:
            found = self._lookup(name.upper(), with_named_sequence=True)
        except KeyError:
            w_template = space.newtext("undefined character name '%s'")
            w_message = space.mod(w_template, space.newtext(name))
            raise OperationError(space.w_KeyError, w_message)

        # `found` may designate a named sequence rather than one code point
        named = self._lookup_named_sequence(found)
        if named is None:
            return space.newutf8(unichr_as_utf8(r_uint(found)), 1)

        # named sequences only contain UCS2 codes, no surrogates &co.
        return space.newutf8(named.encode('utf-8'), len(named))
예제 #3
0
def wrap_value(space, func, add_arg, argdesc, letter):
    """Read a raw value through `func` and wrap it as an app-level object.

    `letter` is matched against the type codes of `ll_typemap_iter`; the
    low-level type paired with the matching code is passed on as
    `func(add_arg, argdesc, ll_type)`.  Pointer letters are read as
    `rffi.VOIDP` and returned as an unsigned integer, number letters as
    an int, 'c' as bytes, 'u' as a one-character unicode string and
    'f'/'d'/'g' as a float.

    Raises an app-level TypeError when `letter` matches no known code,
    and an app-level ValueError when a 'u' value cannot be encoded.
    """
    for c, ll_type in ll_typemap_iter:
        if letter == c:
            if c in TYPEMAP_PTR_LETTERS:
                res = func(add_arg, argdesc, rffi.VOIDP)
                return space.newint(rffi.cast(lltype.Unsigned, res))
            if c in TYPEMAP_NUMBER_LETTERS:
                return space.newint(func(add_arg, argdesc, ll_type))
            elif c == 'c':
                return space.newbytes(func(add_arg, argdesc, ll_type))
            elif c == 'u':
                code = ord(func(add_arg, argdesc, ll_type))
                # The raw character may be a lone surrogate: let it through
                # and turn any encoding failure into an app-level error
                # instead of leaking rutf8.OutOfRange.
                try:
                    return space.newutf8(rutf8.unichr_as_utf8(
                        code, allow_surrogates=True), 1)
                except rutf8.OutOfRange:
                    raise oefmt(space.w_ValueError,
                        "unicode character %d out of range", code)
            elif c == 'f' or c == 'd' or c == 'g':
                return space.newfloat(float(func(add_arg, argdesc, ll_type)))
            else:
                assert 0, "unreachable"
    raise oefmt(space.w_TypeError, "cannot directly read value")
예제 #4
0
 def decode_escape_sequence_unicode(self, i, builder):
     """Decode a 4-hex-digit unicode escape and append it to `builder`.

     `i` indexes the first hex digit, just past the 'u' of the escape.
     On builds where sys.maxunicode > 65535, a surrogate value directly
     followed by another backslash-u escape is handed to
     self.decode_surrogate_pair() and six more characters are consumed.
     The resulting code point is appended to `builder` as utf-8; lone
     surrogates are allowed.

     Returns the index just past the consumed input.  A ValueError raised
     while decoding is reported through self._raise().
     """
     # at this point we are just after the 'u' of the \u1234 sequence.
     start = i
     i += 4
     hexdigits = self.getslice(start, i)
     try:
         val = int(hexdigits, 16)
         if sys.maxunicode > 65535 and 0xd800 <= val <= 0xdfff:
             # surrogate pair
             if self.ll_chars[i] == '\\' and self.ll_chars[i + 1] == 'u':
                 val = self.decode_surrogate_pair(i, val)
                 i += 6
     except ValueError:
         # The backslash is escaped so the literal does not depend on the
         # interpreter leaving an unrecognised "\u" untouched.
         self._raise("Invalid \\uXXXX escape (char %d)", i - 1)
         return  # help the annotator to know that we'll never go beyond
         # this point
     #
     utf8_ch = rutf8.unichr_as_utf8(r_uint(val), allow_surrogates=True)
     builder.append(utf8_ch)
     return i
예제 #5
0
 def decode_escape_sequence_unicode(self, i, builder):
     """Decode a 4-hex-digit unicode escape and append it to `builder`.

     `i` indexes the first hex digit, just past the 'u' of the escape.
     A high surrogate immediately followed by an escaped low surrogate is
     combined into a single code point of at least 0x10000; any other
     surrogate is kept as is.  The code point is appended to `builder`
     as utf-8.

     Returns the index just past the consumed input: i + 4, or i + 10
     when a surrogate pair was combined.  Raises DecoderError when
     self._get_int_val_from_hex4() raises ValueError.
     """
     # at this point we are just after the 'u' of the \u1234 sequence.
     start = i
     i += 4
     try:
         val = self._get_int_val_from_hex4(start)
         if (0xd800 <= val <= 0xdbff and self.ll_chars[i] == '\\'
                 and self.ll_chars[i + 1] == 'u'):
             lowsurr = self._get_int_val_from_hex4(i + 2)
             if 0xdc00 <= lowsurr <= 0xdfff:
                 # decode surrogate pair
                 val = 0x10000 + (((val - 0xd800) << 10) |
                                  (lowsurr - 0xdc00))
                 i += 6
     except ValueError:
         # The backslash is escaped so the literal does not depend on the
         # interpreter leaving an unrecognised "\u" untouched.
         raise DecoderError("Invalid \\uXXXX escape (char %d)", i - 1)
     #
     utf8_ch = rutf8.unichr_as_utf8(r_uint(val), allow_surrogates=True)
     builder.append(utf8_ch)
     return i
예제 #6
0
def wrap_value(space, func, add_arg, argdesc, letter):
    """Read a raw value through `func` and wrap it as an app-level object.

    The type code `letter` selects the low-level type handed to
    `func(add_arg, argdesc, ll_type)` and the kind of wrapper built from
    the result.  Raises an app-level TypeError when `letter` matches no
    entry of `ll_typemap_iter`, and an app-level ValueError when a 'u'
    value is out of range.
    """
    for typecode, ll_type in ll_typemap_iter:
        if letter == typecode:
            # Every case below returns or raises, so plain `if`s suffice.
            if typecode in TYPEMAP_PTR_LETTERS:
                address = func(add_arg, argdesc, rffi.VOIDP)
                return space.newint(rffi.cast(lltype.Unsigned, address))
            if typecode in TYPEMAP_NUMBER_LETTERS:
                return space.newint(func(add_arg, argdesc, ll_type))
            if typecode == 'c':
                return space.newbytes(func(add_arg, argdesc, ll_type))
            if typecode == 'u':
                codepoint = ord(func(add_arg, argdesc, ll_type))
                try:
                    utf8 = rutf8.unichr_as_utf8(r_uint(codepoint),
                                                allow_surrogates=True)
                    return space.newutf8(utf8, 1)
                except rutf8.OutOfRange:
                    raise oefmt(space.w_ValueError,
                        "unicode character %d out of range", codepoint)
            if typecode == 'f' or typecode == 'd' or typecode == 'g':
                return space.newfloat(float(func(add_arg, argdesc, ll_type)))
            assert 0, "unreachable"
    raise oefmt(space.w_TypeError, "cannot directly read value")
예제 #7
0
파일: formatting.py 프로젝트: zcxowwww/pypy
 def fmt_c(self, w_value):
     """Format `w_value` for the %c conversion.

     Accepts a bytes object or a unicode object of length one, or an
     integer character code.  The resulting character is emitted through
     self.std_wp().
     """
     self.prec = -1  # just because
     space = self.space

     # --- a bytes object of length one
     if space.isinstance_w(w_value, space.w_bytes):
         if do_unicode:
             w_value = w_value.descr_decode(space,
                                            space.newtext('ascii'))
         char = space.bytes_w(w_value)
         if len(char) != 1:
             raise oefmt(space.w_TypeError, "%c requires int or char")
         self.std_wp(char, True)
         return

     # --- a unicode object of length one
     if space.isinstance_w(w_value, space.w_unicode):
         if not do_unicode:
             raise NeedUnicodeFormattingError
         utf8 = space.utf8_w(w_value)
         if space.len_w(w_value) != 1:
             raise oefmt(space.w_TypeError,
                         "%c requires int or unichar")
         self.std_wp(utf8, False)
         return

     # --- anything else is taken as an integer character code
     ordinal = space.int_w(w_value)
     if not do_unicode:
         try:
             char = chr(ordinal)
         except ValueError:
             raise oefmt(space.w_OverflowError,
                         "character code not in range(256)")
         self.std_wp(char, True)
         return
     try:
         utf8 = rutf8.unichr_as_utf8(r_uint(ordinal),
                                     allow_surrogates=True)
     except rutf8.OutOfRange:
         raise oefmt(space.w_OverflowError,
                     "unicode character code out of range")
     self.std_wp(utf8, False)
예제 #8
0
def chr(space, code):
    "Return a Unicode string of one character with the given ordinal."
    # Refuse ordinals outside 0..0x10FFFF before trying to encode them.
    if not (0 <= code <= 0x10FFFF):
        raise oefmt(space.w_ValueError, "chr() arg out of range")
    # Surrogate code points are explicitly let through.
    utf8 = rutf8.unichr_as_utf8(code, allow_surrogates=True)
    return space.newutf8(utf8, 1)
예제 #9
0
 def _format_int_or_long(self, w_num, kind):
     """Format the integer `w_num` according to the parsed format spec.

     `kind` selects how the digit string is produced: INT_KIND goes
     through self._int_to_base(), anything else through
     self._long_to_base().  Presentation type 'c' instead produces the
     single character whose ordinal is `w_num`.

     Returns self.wrap() applied to the padded result.  Raises an
     app-level ValueError if a precision was given, or a sign together
     with 'c', and an app-level OverflowError when the 'c' ordinal is
     outside 0..0x10FFFF (unicode) or 0..0xFF (bytes).
     """
     space = self.space
     if self._precision != -1:
         raise oefmt(space.w_ValueError,
                     "precision not allowed in integer type")
     sign_char = "\0"
     tp = self._type
     if tp == "c":
         if self._sign != "\0":
             raise oefmt(space.w_ValueError,
                         "sign not allowed with 'c' presentation type")
         value = space.int_w(w_num)
         max_char = 0x10FFFF if self.is_unicode else 0xFF
         if not (0 <= value <= max_char):
             raise oefmt(space.w_OverflowError,
                         "%%c arg not in range(%s)", hex(max_char))
         if self.is_unicode:
             # Surrogates (0xD800-0xDFFF) pass the range check above, so
             # they must be allowed here as well; otherwise the encoder
             # raises an interpreter-level OutOfRange nobody catches.
             result = rutf8.unichr_as_utf8(value, allow_surrogates=True)
         else:
             result = chr(value)
         n_digits = 1
         n_remainder = 1
         to_remainder = 0
         n_prefix = 0
         to_prefix = 0
         to_numeric = 0
     else:
         # skip_leading is presumably the length of the base prefix that
         # the conversion helpers put in front of the digits -- TODO confirm
         if tp == "b":
             base = 2
             skip_leading = 2
         elif tp == "o":
             base = 8
             skip_leading = 2
         elif tp == "x" or tp == "X":
             base = 16
             skip_leading = 2
         elif tp == "n" or tp == "d":
             base = 10
             skip_leading = 0
         else:
             raise AssertionError("shouldn't reach")
         if kind == INT_KIND:
             result = self._int_to_base(base, space.int_w(w_num))
         else:
             result = self._long_to_base(base, space.bigint_w(w_num))
         n_prefix = skip_leading if self._alternate else 0
         to_prefix = 0
         if result[0] == "-":
             # a leading minus sign is accounted for separately
             sign_char = "-"
             skip_leading += 1
             to_prefix += 1
         n_digits = len(result) - skip_leading
         n_remainder = 0
         to_remainder = 0
         to_numeric = skip_leading
     self._get_locale(tp)
     spec = self._calc_num_width(n_prefix, sign_char, to_numeric,
                                 n_digits, n_remainder, False, result)
     fill = self._fill_char
     upper = self._type == "X"
     return self.wrap(
         self._fill_number(spec, result, to_numeric, to_prefix, fill,
                           to_remainder, upper))
예제 #10
0
 def _lit(self, s):
     assert len(s) == 1
     if self.is_unicode:
         return rutf8.unichr_as_utf8(ord(s[0]))
     else:
         return s
예제 #11
0
def _get_delimiter(space, dialect):
    """Wrap the dialect's delimiter code as a one-character string."""
    return space.newutf8(rutf8.unichr_as_utf8(dialect.delimiter), 1)
예제 #12
0
def _get_quotechar(space, dialect):
    if dialect.quotechar == 0:
        return space.w_None
    s = rutf8.unichr_as_utf8(dialect.quotechar)
    return space.newutf8(s, 1)
예제 #13
0
 def append_utf8(self, value):
     """Wrap the code `value` as a one-character string and record it."""
     encoded = rutf8.unichr_as_utf8(r_uint(value))
     self.result_w.append(self.space.newutf8(encoded, 1))
예제 #14
0
            self.std_wp(value)

        def fmt_c(self, w_value):
            self.prec = -1  # just because
            space = self.space
            try:
                w_value = space.index(w_value)
            except OperationError as e:
                if e. async (space):
                    raise
                # otherwise, eats all exceptions, like CPython
            else:
                n = space.int_w(w_value)
                if do_unicode:
                    try:
                        c = rutf8.unichr_as_utf8(r_uint(n),
                                                 allow_surrogates=True)
                    except rutf8.OutOfRange:
                        raise oefmt(space.w_OverflowError,
                                    "unicode character code out of range")
                    self.std_wp(c, False)
                else:
                    try:
                        s = chr(n)
                    except ValueError:
                        raise oefmt(space.w_OverflowError,
                                    "character code not in range(256)")
                    self.std_wp(s, True)
                return
            if not do_unicode:
                if space.isinstance_w(w_value, space.w_bytes):
                    s = space.bytes_w(w_value)
예제 #15
0
def surrogatepass_errors(space, w_exc):
    """Codec error callback that lets lone surrogates pass through.

    For a UnicodeEncodeError, every code point between the exception's
    `start` and `end` must be a surrogate (0xd800-0xdfff); each one is
    written out directly in the exception's encoding and the callback
    returns the tuple (bytes, end).

    For a UnicodeDecodeError, a single surrogate is rebuilt from the
    bytes found at `start` and the callback returns the tuple
    (one-character string, start + bytelength).

    Whenever the input cannot be handled (unsupported encoding, code
    point that is not a surrogate) the original exception is re-raised.
    Any other kind of `w_exc` gives an app-level TypeError.
    """
    check_exception(space, w_exc)
    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        w_obj = space.getattr(w_exc, space.newtext('object'))
        w_obj = space.convert_arg_to_w_unicode(w_obj)
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        encoding = space.text_w(space.getattr(w_exc,
                                              space.newtext('encoding')))
        bytelength, code = get_standard_encoding(encoding)
        if code == ENC_UNKNOWN:
            # Not supported, fail with original exception
            raise OperationError(space.type(w_exc), w_exc)
        end = space.int_w(w_end)
        builder = StringBuilder()
        # presumably turns the character indices into byte offsets within
        # the utf-8 data, which is what the rutf8 helpers below walk over
        start = w_obj._index_to_byte(start)
        end = w_obj._index_to_byte(end)
        obj = w_obj._utf8
        pos = start
        while pos < end:
            ch = rutf8.codepoint_at_pos(obj, pos)
            pos = rutf8.next_codepoint_pos(obj, pos)
            if ch < 0xd800 or ch > 0xdfff:
                # Not a surrogate, fail with original exception
                raise OperationError(space.type(w_exc), w_exc)
            # ch fits in 16 bits here, so the 32-bit forms only need two
            # zero bytes of padding
            if code == ENC_UTF8:
                builder.append(chr(0xe0 | (ch >> 12)))
                builder.append(chr(0x80 | ((ch >> 6) & 0x3f)))
                builder.append(chr(0x80 | (ch & 0x3f)))
            elif code == ENC_UTF16LE:
                builder.append(chr(ch & 0xff))
                builder.append(chr(ch >> 8))
            elif code == ENC_UTF16BE:
                builder.append(chr(ch >> 8))
                builder.append(chr(ch & 0xff))
            elif code == ENC_UTF32LE:
                builder.append(chr(ch & 0xff))
                builder.append(chr(ch >> 8))
                builder.append(chr(0))
                builder.append(chr(0))
            elif code == ENC_UTF32BE:
                builder.append(chr(0))
                builder.append(chr(0))
                builder.append(chr(ch >> 8))
                builder.append(chr(ch & 0xff))
        return space.newtuple([space.newbytes(builder.build()), w_end])
    elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        obj = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        encoding = space.text_w(space.getattr(w_exc,
                                              space.newtext('encoding')))
        bytelength, code = get_standard_encoding(encoding)
        # ch == 0 means "no surrogate decoded"; it stays 0 when no encoding
        # branch below matches
        ch = 0
        # Try decoding a single surrogate character. If there are more,
        # let the codec call us again
        # -1 stands for a byte lying past the end of the input
        ch0 = ord(obj[start + 0]) if len(obj) > start + 0 else -1
        ch1 = ord(obj[start + 1]) if len(obj) > start + 1 else -1
        ch2 = ord(obj[start + 2]) if len(obj) > start + 2 else -1
        ch3 = ord(obj[start + 3]) if len(obj) > start + 3 else -1
        if code == ENC_UTF8:
            if (ch1 != -1 and ch2 != -1 and ch0 & 0xf0 == 0xe0
                    and ch1 & 0xc0 == 0x80 and ch2 & 0xc0 == 0x80):
                # it's a three-byte code
                ch = ((ch0 & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f)
        elif code == ENC_UTF16LE:
            ch = (ch1 << 8) | ch0
        elif code == ENC_UTF16BE:
            ch = (ch0 << 8) | ch1
        elif code == ENC_UTF32LE:
            ch = (ch3 << 24) | (ch2 << 16) | (ch1 << 8) | ch0
        elif code == ENC_UTF32BE:
            ch = (ch0 << 24) | (ch1 << 16) | (ch2 << 8) | ch3
        # a missing byte (-1) makes ch negative, which is rejected by the
        # range test below
        if ch < 0xd800 or ch > 0xdfff:
            # it's not a surrogate - fail
            ch = 0
        if ch == 0:
            raise OperationError(space.type(w_exc), w_exc)
        ch_utf8 = rutf8.unichr_as_utf8(ch, allow_surrogates=True)
        return space.newtuple(
            [space.newtext(ch_utf8, 1),
             space.newint(start + bytelength)])
    else:
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)