def pack_unichar(fmtiter):
    """Pack one unicode character into the write buffer of `fmtiter`.

    The argument is obtained from fmtiter.accept_unicode_arg(); a
    StructError is raised unless its reported length is exactly 1.  The
    iterator is then advanced by unichar.UNICODE_SIZE.
    """
    utf8_arg, num_chars = fmtiter.accept_unicode_arg()
    if num_chars != 1:
        raise StructError("expected a unicode string of length 1")
    codepoint = rutf8.codepoint_at_pos(utf8_arg, 0)
    unichar.pack_codepoint(codepoint, fmtiter.wbuf, fmtiter.pos)
    fmtiter.advance(unichar.UNICODE_SIZE)
def decode_unicode_utf8(space, s, ps, q):
    """Escape the non-ASCII parts of the utf-8 encoded range s[ps:q].

    Bytes below 0x80 are copied unchanged.  Every multibyte sequence is
    replaced by an eight-digit "U" escape of its code point, and a
    backslash that directly precedes such a sequence is itself emitted in
    escaped form so that it cannot combine with the escape that follows.

    `space` is not used.  Returns the assembled string.
    """
    # ****The Python 2.7 version, producing UTF-32 escapes****
    # String is utf8-encoded, but 'unicode_escape' expects
    # latin-1; So multibyte sequences must be escaped.
    lis = []  # using a list to assemble the value
    end = q
    # Worst case:
    # "<92><195><164>" may become "\u005c\U000000E4" (16 bytes)
    while ps < end:
        if s[ps] == '\\':
            lis.append(s[ps])
            ps += 1
            if ps >= end:
                # The backslash was the last character of the range:
                # there is nothing left to inspect, and reading s[ps]
                # would look outside [ps, q).
                break
            if ord(s[ps]) & 0x80:
                # A multibyte sequence will follow, it will be
                # escaped like \u1234. To avoid confusion with
                # the backslash we just wrote, we emit "\u005c"
                # instead.
                lis.append("u005c")
        if ord(s[ps]) & 0x80:
            cp = rutf8.codepoint_at_pos(s, ps)
            # Adding 0x10000000 forces a fixed-width hex form; together
            # with the '0' appended below this yields eight hex digits.
            hexa = hex(cp + 0x10000000)
            lis.append('\\U0')
            lis.append(hexa[3:])  # Skip 0x and the leading 1
            ps = rutf8.next_codepoint_pos(s, ps)
        else:
            lis.append(s[ps])
            ps += 1
    return ''.join(lis)
def xmlcharrefreplace_errors(space, w_exc):
    """Error callback that substitutes numeric character references.

    For a UnicodeEncodeError, every code point between the exception's
    'start' and 'end' is rendered as "&#" + decimal value + ";".  Returns
    a tuple (replacement text, the exception's 'end' attribute).  Any
    other exception type raises TypeError.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_text = space.getattr(w_exc, space.newtext('object'))
    # Return value is discarded; presumably called only so that an
    # unsuitable object raises here -- TODO confirm.
    space.realutf8_w(w_text)
    w_text = space.convert_arg_to_w_unicode(w_text)
    start_index = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end_index = space.int_w(w_end)

    # Translate the two indices with _index_to_byte before walking _utf8.
    byte_pos = w_text._index_to_byte(start_index)
    byte_end = w_text._index_to_byte(end_index)
    out = StringBuilder()
    utf8 = w_text._utf8
    while byte_pos < byte_end:
        code = rutf8.codepoint_at_pos(utf8, byte_pos)
        out.append("&#")
        out.append(str(code))
        out.append(";")
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)

    replacement = out.build()
    length = rutf8.check_utf8(replacement, True)
    return space.newtuple([space.newutf8(replacement, length), w_end])
def namereplace_errors(space, w_exc):
    """Error callback that substitutes character names.

    For a UnicodeEncodeError, each code point between 'start' and 'end'
    is written as "\\N{" + name + "}" when unicodedb.name() knows it, and
    through unicodehelper.raw_unicode_escape_helper() when that lookup
    raises KeyError.  Returns a tuple (replacement text, the exception's
    'end' attribute).  Any other exception type raises TypeError.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_text = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_text)  # for errors; the result is not used
    w_text = space.convert_arg_to_w_unicode(w_text)
    start_index = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end_index = space.int_w(w_end)

    byte_pos = w_text._index_to_byte(start_index)
    byte_end = w_text._index_to_byte(end_index)
    out = StringBuilder()
    utf8 = w_text._utf8
    while byte_pos < byte_end:
        code = rutf8.codepoint_at_pos(utf8, byte_pos)
        try:
            char_name = unicodedb.name(code)
        except KeyError:
            # No name available: fall back to the raw escape helper.
            unicodehelper.raw_unicode_escape_helper(out, code)
        else:
            out.append('\\N{')
            out.append(char_name)
            out.append('}')
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)

    replacement = out.build()
    length = rutf8.check_utf8(replacement, True)
    return space.newtuple([space.newutf8(replacement, length), w_end])
def cast_unicode(self, w_ob):
    """Return the code point of the one-character unicode object `w_ob`.

    Raises an app-level TypeError naming this ctype when the converted
    object does not have length 1.
    """
    space = self.space
    w_text = space.convert_arg_to_w_unicode(w_ob)
    if w_text._len() == 1:
        return rutf8.codepoint_at_pos(w_text._utf8, 0)
    raise oefmt(
        space.w_TypeError,
        "cannot cast unicode string of length %d to ctype '%s'",
        w_text._len(), self.name)
def test_utf8_iterator_pos(arg):
    """Check Utf8StringPosIterator against the codepoint/position helpers.

    Each (code, position) pair must agree with codepoint_at_pos() and with
    the positions obtained by stepping next_codepoint_pos() from 0; the
    collected characters must equal list(arg).
    """
    encoded = arg.encode('utf8')
    collected = []
    expected_pos = 0
    for code, pos in rutf8.Utf8StringPosIterator(encoded):
        collected.append(unichr(code))
        assert code == rutf8.codepoint_at_pos(encoded, pos)
        assert pos == expected_pos
        expected_pos = rutf8.next_codepoint_pos(encoded, expected_pos)
    assert list(arg) == collected
def charmap_build(space, chars):
    """Build a dict mapping each code point of `chars` to its ordinal rank.

    `chars` is walked with the rutf8 helpers; the n-th code point found
    (counting from 0) is stored as key with value n.  Returns the dict.
    """
    # XXX CPython sometimes uses a three-level trie
    w_mapping = space.newdict()
    total_bytes = len(chars)
    byte_pos = 0
    rank = 0
    while byte_pos < total_bytes:
        w_code = space.newint(rutf8.codepoint_at_pos(chars, byte_pos))
        space.setitem(w_mapping, w_code, space.newint(rank))
        byte_pos = rutf8.next_codepoint_pos(chars, byte_pos)
        rank += 1
    return w_mapping
def unknown_fmtchar(self):
    """Raise ValueError for the format character just before self.fmtpos.

    The message contains the repr of the offending character, its value
    in hex, and the index self.fmtpos - 1.  This function never returns.
    """
    space = self.space
    if do_unicode:
        cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1)
        # allow_surrogates=True: the format string may contain a lone
        # surrogate.  Without the flag rutf8 refuses to encode it, and the
        # ValueError below would never be reached.
        w_s = space.newutf8(
            rutf8.unichr_as_utf8(r_uint(cp), allow_surrogates=True), 1)
    else:
        cp = ord(self.fmt[self.fmtpos - 1])
        w_s = space.newbytes(chr(cp))
    raise oefmt(space.w_ValueError,
                "unsupported format character %R (%s) at index %d",
                w_s, hex(cp), self.fmtpos - 1)
def _get_error_info(self, pos):
    """Describe the format character at `pos` for an error message.

    Returns a tuple (wrapped character, index, code).  In the unicode
    variant the index is recomputed with rutf8.codepoints_in_utf8(fmt, 0,
    pos); in the bytes variant `pos` is returned as given.
    """
    space = self.space
    if not do_unicode:
        cp = ord(self.fmt[pos])
        return space.newbytes(chr(cp)), pos, cp

    cp = rutf8.codepoint_at_pos(self.fmt, pos)
    char_index = rutf8.codepoints_in_utf8(self.fmt, 0, pos)
    encoded = rutf8.unichr_as_utf8(r_uint(cp), allow_surrogates=True)
    return space.newutf8(encoded, 1), char_index, cp
def utf8_to_char32(utf8, target_ptr, target_length, add_final_zero):
    """Write the code points of `utf8` as 32-bit integers to `target_ptr`.

    Exactly `target_length` items are written; an assertion then checks
    that this consumed all of `utf8`.  When `add_final_zero` is true, one
    extra zero item is stored at index `target_length`.
    """
    # 'target_ptr' is a raw pointer to 'target_length' 32-bit integers;
    # we assume (and check) that target_length == number of unichars in utf8.
    out = rffi.cast(rffi.UINTP, target_ptr)
    byte_pos = 0
    index = 0
    while index < target_length:
        codepoint = rutf8.codepoint_at_pos(utf8, byte_pos)
        out[index] = rffi.cast(rffi.UINT, codepoint)
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
        index += 1
    assert byte_pos == len(utf8)
    if add_final_zero:
        out[target_length] = rffi.cast(rffi.UINT, 0)
def _get_codepoint(space, w_src, default, name):
    """Turn an optional one-character string argument into a code point.

    Returns `default` when `w_src` is None (not passed), 0 when it is the
    app-level None or an empty string, and the code point when it is a
    string of length 1.  Any other input raises TypeError mentioning
    `name`.
    """
    if w_src is None:
        return default
    if space.is_w(w_src, space.w_None):
        return 0
    if not space.isinstance_w(w_src, space.w_unicode):
        raise oefmt(space.w_TypeError, '"%s" must be string, not %T',
                    name, w_src)

    utf8, num_chars = space.utf8_len_w(w_src)
    if num_chars != 1:
        if len(utf8) == 0:
            return 0
        raise oefmt(space.w_TypeError,
                    '"%s" must be a 1-character string', name)
    codepoint = rutf8.codepoint_at_pos(utf8, 0)
    assert codepoint >= 0
    return codepoint
def surrogateescape_errors(space, w_exc):
    """Error callback translating between bytes and U+DC80..U+DCFF.

    UnicodeEncodeError: each code point in the failing range must lie in
    0xdc80..0xdcff and is turned back into the byte (code - 0xdc00);
    anything else re-raises the original exception.  Returns
    (bytes, the exception's 'end' attribute).

    UnicodeDecodeError: up to four bytes >= 128 starting at 'start' are
    mapped to 0xdc00 + byte.  If the first byte is below 128 the original
    exception is re-raised.  Returns (text, start + bytes consumed).

    Any other exception type raises TypeError.
    """
    check_exception(space, w_exc)

    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        w_text = space.getattr(w_exc, space.newtext('object'))
        w_text = space.convert_arg_to_w_unicode(w_text)
        start_index = space.int_w(
            space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        end_index = space.int_w(w_end)
        byte_pos = w_text._index_to_byte(start_index)
        byte_end = w_text._index_to_byte(end_index)
        utf8 = w_text._utf8
        pieces = []
        while byte_pos < byte_end:
            code = rutf8.codepoint_at_pos(utf8, byte_pos)
            if not (0xdc80 <= code <= 0xdcff):
                # Not a UTF-8b surrogate, fail with original exception
                raise OperationError(space.type(w_exc), w_exc)
            pieces.append(chr(code - 0xdc00))
            byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
        return space.newtuple([space.newbytes(''.join(pieces)), w_end])

    if space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        end = space.int_w(space.getattr(w_exc, space.newtext('end')))
        data = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        # At most four bytes are replaced per call.
        limit = end - start
        if limit > 4:
            limit = 4
        consumed = 0
        replace = u''
        while consumed < limit:
            byte = ord(data[start + consumed])
            if byte < 128:
                # Refuse to escape ASCII bytes.
                break
            replace += unichr(0xdc00 + byte)
            consumed += 1
        if consumed == 0:
            # codec complained about ASCII byte.
            raise OperationError(space.type(w_exc), w_exc)
        replace_utf8 = runicode.unicode_encode_utf_8(
            replace, len(replace), 'strict', allow_surrogates=True)
        return space.newtuple([
            space.newtext(replace_utf8, len(replace)),
            space.newint(start + consumed)
        ])

    raise oefmt(space.w_TypeError,
                "don't know how to handle %T in error callback", w_exc)
def _convert_to_charN_t(self, w_ob):
    """Convert `w_ob` to the numeric value of a single character.

    Accepts a unicode object of length 1, or a cdata whose ctype is a
    W_CTypePrimitiveUniChar of the same size as this one (its raw value
    is read back).  Everything else raises the error built by
    self._convert_error().
    """
    # returns a r_uint.  If self.size == 2, it is smaller than 0x10000
    space = self.space
    if space.isinstance_w(w_ob, space.w_unicode):
        w_text = space.convert_arg_to_w_unicode(w_ob)
        if w_text._len() != 1:
            raise self._convert_error("single character", w_ob)
        code = rutf8.codepoint_at_pos(w_text._utf8, 0)
        if self.size == 2 and code > 0xFFFF:
            raise self._convert_error("single character <= 0xFFFF", w_ob)
        return r_uint(code)

    if (isinstance(w_ob, cdataobj.W_CData) and
            isinstance(w_ob.ctype, W_CTypePrimitiveUniChar) and
            w_ob.ctype.size == self.size):
        with w_ob as raw:
            return misc.read_raw_ulong_data(raw, self.size)

    raise self._convert_error("unicode string of length 1", w_ob)
def unwrap_value(space, push_func, add_arg, argdesc, letter, w_arg):
    """Unwrap `w_arg` according to type code `letter` and push it.

    The converted value is handed to push_func(add_arg, argdesc, value).
    Handled codes: the pointer letters, "d", "f", "g", "c", 'u', and the
    letters of unroll_letters_for_numbers.  A TypeError is raised for a
    "c" or 'u' argument of the wrong length and for an unknown letter.
    """
    if letter in TYPEMAP_PTR_LETTERS:
        # check for NULL ptr
        if isinstance(w_arg, W_DataInstance):
            address = w_arg.ll_buffer
        else:
            address = unwrap_truncate_int(rffi.VOIDP, space, w_arg)
        push_func(add_arg, argdesc, address)
        return

    if letter == "d":
        push_func(add_arg, argdesc, space.float_w(w_arg))
        return

    if letter == "f":
        push_func(add_arg, argdesc,
                  rffi.cast(rffi.FLOAT, space.float_w(w_arg)))
        return

    if letter == "g":
        push_func(add_arg, argdesc,
                  rffi.cast(rffi.LONGDOUBLE, space.float_w(w_arg)))
        return

    if letter == "c":
        if space.isinstance_w(w_arg, space.w_int):
            char_value = space.byte_w(w_arg)
        else:
            raw = space.bytes_w(w_arg)
            if len(raw) != 1:
                raise oefmt(space.w_TypeError,
                            "Expected bytes of length one as character")
            char_value = raw[0]
        push_func(add_arg, argdesc, char_value)
        return

    if letter == 'u':
        utf8, num_chars = space.utf8_len_w(w_arg)
        if num_chars != 1:
            raise oefmt(
                space.w_TypeError,
                "Expected unicode string of length one as wide "
                "character")
        codepoint = rutf8.codepoint_at_pos(utf8, 0)
        push_func(add_arg, argdesc, rffi.cast(rffi.WCHAR_T, codepoint))
        return

    # Numeric letters: look up the low-level type for the matching letter.
    for c in unroll_letters_for_numbers:
        if letter == c:
            TP = LL_TYPEMAP[c]
            number = unwrap_truncate_int(TP, space, w_arg)
            push_func(add_arg, argdesc, number)
            return
    raise oefmt(space.w_TypeError, "cannot directly write value")
def utf8_to_char16(utf8, target_ptr, target_length, add_final_zero):
    """Write `utf8` as 16-bit units to `target_ptr`.

    Code points up to 0xFFFF take one unit; larger ones are written as a
    0xD800-based / 0xDC00-based pair.  An assertion checks that exactly
    `target_length` units were produced.  When `add_final_zero` is true a
    zero unit is stored right after them.
    """
    # 'target_ptr' is a raw pointer to 'target_length' 16-bit integers;
    # we assume (and check) that target_length == utf8_size_as_char16(utf8).
    out = rffi.cast(rffi.USHORTP, target_ptr)
    total_bytes = len(utf8)
    byte_pos = 0
    while byte_pos < total_bytes:
        code = rutf8.codepoint_at_pos(utf8, byte_pos)
        if code <= 0xFFFF:
            out[0] = rffi.cast(rffi.USHORT, code)
            out = rffi.ptradd(out, 1)
        else:
            # Split into a pair: top ten bits first, low ten bits second.
            code -= 0x10000
            out[0] = rffi.cast(rffi.USHORT, 0xD800 | (code >> 10))
            out[1] = rffi.cast(rffi.USHORT, 0xDC00 | (code & 0x3FF))
            out = rffi.ptradd(out, 2)
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
    expected_end = rffi.ptradd(
        rffi.cast(rffi.USHORTP, target_ptr), target_length)
    assert out == expected_end
    if add_final_zero:
        out[0] = rffi.cast(rffi.USHORT, 0)
def item_w(self, w_item):
    """Unwrap `w_item` into a raw value of this array's item type.

    The item is unwrapped with the space method named by mytype.unwrap.
    If that raises TypeError and mytype.method is set, the conversion
    method is called on the item and unwrapping is retried; a failure of
    the retry raises TypeError "array item must be ...".  Floats are never
    retried.  Out-of-range big integers raise OverflowError; bytes or
    unicode items whose length is not 1 raise TypeError.
    """
    space = self.space
    unwrap = getattr(space, mytype.unwrap)
    try:
        value = unwrap(w_item)
    except OperationError as e:
        # Re-raise unchanged for floats (odd special case from cpython),
        # when there is no conversion method, or for non-TypeErrors.
        if (space.isinstance_w(w_item, space.w_float) or
                mytype.method == '' or
                not e.match(space, space.w_TypeError)):
            raise
        try:
            value = unwrap(space.call_method(w_item, mytype.method))
        except OperationError:
            raise oefmt(space.w_TypeError,
                        "array item must be " + mytype.errorname)

    if mytype.unwrap == 'bigint_w':
        try:
            value = value.touint()
        except (ValueError, OverflowError):
            raise oefmt(space.w_OverflowError,
                        "unsigned %d-byte integer out of range",
                        mytype.bytes)
        return rffi.cast(mytype.itemtype, value)

    if mytype.unwrap == 'bytes_w':
        if len(value) != 1:
            raise oefmt(space.w_TypeError, "array item must be char")
        return rffi.cast(mytype.itemtype, value[0])

    if mytype.unwrap == 'utf8_len_w':
        utf8, num_chars = value
        if num_chars != 1:
            raise oefmt(space.w_TypeError, "array item must be char")
        return rffi.cast(mytype.itemtype, rutf8.codepoint_at_pos(utf8, 0))

    #
    # "regular" case: it fits in an rpython integer (lltype.Signed)
    # or it is a float
    return self.item_from_int_or_float(value)
def backslashreplace_errors(space, w_exc):
    """Error callback that substitutes backslash escapes (encode only).

    For a UnicodeEncodeError, each code point between 'start' and 'end'
    becomes "\\x" + 2 hex digits (below 0x100), "\\u" + 4 hex digits
    (below 0x10000) or "\\U" + 8 hex digits.  Returns a tuple
    (replacement text, the exception's 'end' attribute).  Any other
    exception type raises TypeError.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_text = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_text)  # for errors; the result is not used
    w_text = space.convert_arg_to_w_unicode(w_text)
    start_index = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end_index = space.int_w(w_end)

    byte_pos = w_text._index_to_byte(start_index)
    byte_end = w_text._index_to_byte(end_index)
    out = StringBuilder()
    utf8 = w_text._utf8
    while byte_pos < byte_end:
        code = rutf8.codepoint_at_pos(utf8, byte_pos)
        digits = hex(code)
        if code < 0x100:
            out.append("\\x")
            width = 2
        elif code < 0x10000:
            out.append("\\u")
            width = 4
        else:
            out.append("\\U")
            width = 8
        digits_len = len(digits)
        # 'digits' starts with '0x', hence the + 2 when computing padding.
        padding = width + 2 - digits_len
        if padding > 0:
            out.append_multiple_char('0', padding)
        out.append_slice(digits, 2, digits_len)
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)

    replacement = out.build()
    length = rutf8.check_utf8(replacement, True)
    return space.newtuple([space.newutf8(replacement, length), w_end])
def backslashreplace_errors(space, w_exc):
    """Error callback that substitutes backslash escapes.

    UnicodeEncodeError / UnicodeTranslateError: every code point between
    'start' and 'end' of the text is passed to
    unicodehelper.raw_unicode_escape_helper().
    UnicodeDecodeError: every byte between 'start' and 'end' is passed to
    the same helper.
    Both cases return (replacement text, the exception's 'end' attribute);
    any other exception type raises TypeError.
    """
    check_exception(space, w_exc)

    if (space.isinstance_w(w_exc, space.w_UnicodeEncodeError) or
            space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
        w_text = space.getattr(w_exc, space.newtext('object'))
        space.realutf8_w(w_text)  # for errors; the result is not used
        w_text = space.convert_arg_to_w_unicode(w_text)
        start_index = space.int_w(
            space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        end_index = space.int_w(w_end)
        byte_pos = w_text._index_to_byte(start_index)
        byte_end = w_text._index_to_byte(end_index)
        out = StringBuilder()
        utf8 = w_text._utf8
        while byte_pos < byte_end:
            codepoint = rutf8.codepoint_at_pos(utf8, byte_pos)
            unicodehelper.raw_unicode_escape_helper(out, codepoint)
            byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
        return space.newtuple([space.newtext(out.build()), w_end])

    if space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        data = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        index = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        stop = space.int_w(w_end)
        out = StringBuilder()
        while index < stop:
            unicodehelper.raw_unicode_escape_helper(out, ord(data[index]))
            index += 1
        return space.newtuple([space.newtext(out.build()), w_end])

    raise oefmt(space.w_TypeError,
                "don't know how to handle %T in error callback", w_exc)
def surrogatepass_errors(space, w_exc):
    """Error callback that lets surrogate code points through.

    UnicodeEncodeError: every code point in the failing range must be in
    0xd800..0xdfff and is written out by hand in the exception's encoding
    (UTF-8, UTF-16 LE/BE or UTF-32 LE/BE).  Returns (bytes, the
    exception's 'end' attribute).

    UnicodeDecodeError: one surrogate is decoded from the bytes at
    'start'.  Returns (one-character text, start + bytelength) where
    bytelength comes from get_standard_encoding().

    An unknown encoding, a non-surrogate value or missing bytes re-raise
    the original exception.  Any other exception type raises TypeError.
    """
    check_exception(space, w_exc)

    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        w_text = space.getattr(w_exc, space.newtext('object'))
        w_text = space.convert_arg_to_w_unicode(w_text)
        start_index = space.int_w(
            space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        encoding = space.text_w(
            space.getattr(w_exc, space.newtext('encoding')))
        bytelength, code = get_standard_encoding(encoding)
        if code == ENC_UNKNOWN:
            # Not supported, fail with original exception
            raise OperationError(space.type(w_exc), w_exc)
        end_index = space.int_w(w_end)
        out = StringBuilder()
        byte_pos = w_text._index_to_byte(start_index)
        byte_end = w_text._index_to_byte(end_index)
        utf8 = w_text._utf8
        while byte_pos < byte_end:
            ch = rutf8.codepoint_at_pos(utf8, byte_pos)
            byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
            if not (0xd800 <= ch <= 0xdfff):
                # Not a surrogate, fail with original exception
                raise OperationError(space.type(w_exc), w_exc)
            # ch <= 0xdfff here, so both halves fit into one byte each.
            high = chr(ch >> 8)
            low = chr(ch & 0xff)
            if code == ENC_UTF8:
                out.append(chr(0xe0 | (ch >> 12)))
                out.append(chr(0x80 | ((ch >> 6) & 0x3f)))
                out.append(chr(0x80 | (ch & 0x3f)))
            elif code == ENC_UTF16LE:
                out.append(low)
                out.append(high)
            elif code == ENC_UTF16BE:
                out.append(high)
                out.append(low)
            elif code == ENC_UTF32LE:
                out.append(low)
                out.append(high)
                out.append(chr(0))
                out.append(chr(0))
            elif code == ENC_UTF32BE:
                out.append(chr(0))
                out.append(chr(0))
                out.append(high)
                out.append(low)
        return space.newtuple([space.newbytes(out.build()), w_end])

    if space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        data = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        encoding = space.text_w(
            space.getattr(w_exc, space.newtext('encoding')))
        bytelength, code = get_standard_encoding(encoding)
        # Try decoding a single surrogate character.  If there are more,
        # let the codec call us again.
        # Bytes that lie beyond the end of 'data' are represented by -1.
        size = len(data)
        b0 = ord(data[start + 0]) if size > start + 0 else -1
        b1 = ord(data[start + 1]) if size > start + 1 else -1
        b2 = ord(data[start + 2]) if size > start + 2 else -1
        b3 = ord(data[start + 3]) if size > start + 3 else -1
        ch = 0
        if code == ENC_UTF8:
            if (b1 != -1 and b2 != -1 and
                    b0 & 0xf0 == 0xe0 and
                    b1 & 0xc0 == 0x80 and
                    b2 & 0xc0 == 0x80):
                # it's a three-byte code
                ch = ((b0 & 0x0f) << 12) + ((b1 & 0x3f) << 6) + (b2 & 0x3f)
        elif code == ENC_UTF16LE:
            ch = (b1 << 8) | b0
        elif code == ENC_UTF16BE:
            ch = (b0 << 8) | b1
        elif code == ENC_UTF32LE:
            ch = (b3 << 24) | (b2 << 16) | (b1 << 8) | b0
        elif code == ENC_UTF32BE:
            ch = (b0 << 24) | (b1 << 16) | (b2 << 8) | b3
        if ch < 0xd800 or ch > 0xdfff:
            # it's not a surrogate (this includes ch still being 0) - fail
            raise OperationError(space.type(w_exc), w_exc)
        ch_utf8 = rutf8.unichr_as_utf8(ch, allow_surrogates=True)
        return space.newtuple(
            [space.newtext(ch_utf8, 1), space.newint(start + bytelength)])

    raise oefmt(space.w_TypeError,
                "don't know how to handle %T in error callback", w_exc)
def str(self, index):
    """Return the code point found at position `index` of self._utf8.

    `index` is first passed through check_nonneg().
    """
    check_nonneg(index)
    codepoint = rutf8.codepoint_at_pos(self._utf8, index)
    return codepoint
class W_Array(W_ArrayBase):
    """Array class specialised on the enclosing `mytype` description.

    Item size, type code, buffer pointer type and the unwrapping strategy
    are all read from `mytype`, which is defined outside this class.
    """
    itemsize = mytype.bytes
    typecode = mytype.typecode
    _attrs_ = W_ArrayBase._attrs_

    def get_buffer(self):
        """Return self._buffer cast to mytype.arrayptrtype."""
        return rffi.cast(mytype.arrayptrtype, self._buffer)

    # check_valid_unicode only exists when items are unwrapped with
    # 'utf8_len_w'.
    if mytype.unwrap == 'utf8_len_w':
        def check_valid_unicode(self, space, s):
            """Raise ValueError if a 4-byte group of `s` is out of range.

            A group is accepted when its first byte is NUL and its second
            byte is at most 0x10.  The reported value combines the four
            bytes with the first one as most significant.
            NOTE(review): this reads the groups as big-endian -- confirm
            that callers pass data in that byte order.
            """
            i = 0
            while i < len(s):
                if s[i] != '\x00' or ord(s[i + 1]) > 0x10:
                    v = ((ord(s[i]) << 24) + (ord(s[i + 1]) << 16) +
                         (ord(s[i + 2]) << 8) + ord(s[i + 3]))
                    raise oefmt(
                        space.w_ValueError,
                        "Character U+%s is not in range [U+0000, U+10ffff]",
                        hex(v)[2:])
                i += 4

    def item_w(self, w_item):
        """Unwrap `w_item` into a raw value of type mytype.itemtype.

        The item is unwrapped with the space method named by
        mytype.unwrap.  On TypeError, when mytype.method is set, that
        method is called on the item and unwrapping is retried; a failed
        retry raises TypeError "array item must be ...".  Floats are
        never retried.  A failing mytype.convert step raises
        OverflowError, and a unicode item whose length is not 1 raises
        TypeError.
        """
        space = self.space
        unwrap = getattr(space, mytype.unwrap)
        try:
            item = unwrap(w_item)
        except OperationError as e:
            if space.isinstance_w(w_item, space.w_float):
                # Odd special case from cpython
                raise
            if mytype.method != '' and e.match(space, space.w_TypeError):
                try:
                    item = unwrap(space.call_method(w_item, mytype.method))
                except OperationError as e:
                    # Errors for which e.async(space) is true are re-raised
                    # unchanged (presumably asynchronous errors that must
                    # not be masked -- TODO confirm); all others are
                    # replaced by the TypeError below.
                    if e. async (space):
                        raise
                    msg = "array item must be " + mytype.errorname
                    raise OperationError(space.w_TypeError,
                                         space.newtext(msg))
            else:
                raise
        if mytype.convert:
            # mytype.convert names a method of the unwrapped item that
            # performs a further conversion.
            try:
                item = getattr(item, mytype.convert)()
            except (ValueError, OverflowError):
                raise oefmt(space.w_OverflowError,
                            "unsigned %d-byte integer out of range",
                            mytype.bytes)
            return rffi.cast(mytype.itemtype, item)
        if mytype.unwrap == 'utf8_len_w':
            # Here 'item' is a (utf8 string, length) pair.
            utf8, lgt = item
            if lgt != 1:
                raise oefmt(space.w_TypeError, "array item must be char")
            uchar = rutf8.codepoint_at_pos(utf8, 0)
            return rffi.cast(mytype.itemtype, uchar)
        #
        # "regular" case: it fits in an rpython integer (lltype.Signed)
        # or it is a float
        return self.item_from_int_or_float(item)