def f(x):
    """Round-trip a UTF-8 sample through the runicode codec.

    The sample (two 2-byte, one 3-byte and one 4-byte sequence) is repeated
    ``x`` times, then decoded and re-encoded twice: once with
    ``allow_surrogates=True`` and once with ``allow_surrogates=False``.
    Returns True when both re-encoded strings equal the original bytes.
    """
    sample = "\xd7\x90\xd6\x96\xeb\x96\x95\xf0\x90\x91\x93"
    original = "".join([sample] * x)

    # Pass 1: surrogates allowed in both directions.
    decoded_lax, consumed = runicode.str_decode_utf_8(
        original, len(original), 'strict', allow_surrogates=True)
    encoded_lax = runicode.unicode_encode_utf_8(
        decoded_lax, len(decoded_lax), 'strict', allow_surrogates=True)

    # Pass 2: surrogates forbidden in both directions.
    decoded_strict, consumed3 = runicode.str_decode_utf_8(
        original, len(original), 'strict', allow_surrogates=False)
    encoded_strict = runicode.unicode_encode_utf_8(
        decoded_strict, len(decoded_strict), 'strict',
        allow_surrogates=False)

    return original == encoded_lax and encoded_lax == encoded_strict
def hex_to_utf8(state, token, s):
    """Return the UTF-8 encoding of the code point written in hex as ``s``.

    A ValueError or UnicodeDecodeError raised while parsing or encoding is
    reported through ``errorhandler(state, token, msg=...)``, whose result
    is raised.
    """
    try:
        codepoint = int(s, 16)
        char = UNICHR(codepoint)
        return unicode_encode_utf_8(char, len(char), 'strict')
    except (ValueError, UnicodeDecodeError):
        # XXX better error message
        raise errorhandler(state, token, msg="Error encoding %s" % s)
def fsencode(space, w_uni):
    """Encode the wrapped unicode ``w_uni`` for use as a filesystem name.

    The codec depends on the platform flags and on codec-state readiness:

    * ``_WIN32``: mbcs, 'strict'.
    * ``_MACOSX``: UTF-8 with 'surrogateescape'.
    * ``state.codec_need_encodings``: the locale codec with
      surrogateescape (bootstrap fallback).
    * otherwise: the app-level ``encode`` method with the filesystem
      encoding and 'surrogateescape'.

    Returns a wrapped bytes object.
    """
    state = space.fromcache(interp_codecs.CodecState)

    if _WIN32:
        text = space.unicode_w(w_uni)
        encoded = unicode_encode_mbcs(
            text, len(text), 'strict',
            errorhandler=encode_error_handler(space),
            force_replace=False)
        return space.wrapbytes(encoded)

    if _MACOSX:
        text = space.unicode_w(w_uni)
        encoded = runicode.unicode_encode_utf_8(
            text, len(text), 'surrogateescape',
            errorhandler=state.encode_error_handler)
        return space.wrapbytes(encoded)

    if state.codec_need_encodings:
        # bootstrap check: if the filesystem codec is implemented in
        # Python we cannot use it before the codecs are ready. use the
        # locale codec instead
        from pypy.module._codecs.locale import (
            unicode_encode_locale_surrogateescape)
        text = space.unicode_w(w_uni)
        encoded = unicode_encode_locale_surrogateescape(
            text, errorhandler=encode_error_handler(space))
        return space.wrapbytes(encoded)

    from pypy.module.sys.interp_encoding import getfilesystemencoding
    return space.call_method(w_uni, 'encode',
                             getfilesystemencoding(space),
                             space.wrap('surrogateescape'))
def encode_utf_escape(self, utf_escape):
    """Turn the hex digits of a unicode escape into a list of UTF-8 bytes.

    ``utf_escape`` is a sequence of hex-digit characters.  Code points
    above U+10FFFF (the end of the Unicode codespace) are reported through
    ``self.error``.  The result is a list of the one-character strings
    that make up the UTF-8 encoding.
    """
    utf_codepoint = int("".join(utf_escape), 16)
    # The last valid Unicode code point is U+10FFFF.  The previous bound
    # (0x101111) wrongly rejected U+101112..U+10FFFF.
    if utf_codepoint > 0x10FFFF:
        self.error("invalid Unicode codepoint (too large)")
    uchr = unichr(utf_codepoint)
    # Pass the real length rather than a hard-coded 1.
    return [c for c in unicode_encode_utf_8(uchr, len(uchr), "ignore")]
def test_encode_surrogate_pair_utf8(self):
    """Check how a UTF-16 surrogate pair is encoded to UTF-8.

    With ``allow_surrogates=True`` the pair is always merged into the
    4-byte encoding.  With ``allow_surrogates=False`` it is merged on
    narrow unicode builds and raises UnicodeEncodeError on wide builds.
    """
    # NOTE(review): ``errors`` is passed as True here, whereas other callers
    # pass a string such as 'strict' -- confirm this is intended.
    encode = runicode.unicode_encode_utf_8
    pair = runicode.UNICHR(0xD800) + runicode.UNICHR(0xDC00)
    merged = '\xf0\x90\x80\x80'

    # Identical expectation on narrow and wide builds.
    assert encode(pair, len(pair), True, allow_surrogates=True) == merged

    if runicode.MAXUNICODE < 65536:
        # Narrow unicode build, consider utf16 surrogate pairs
        assert encode(
            pair, len(pair), True, allow_surrogates=False) == merged
    else:
        # Wide unicode build: surrogates not merged, encoding fails.
        py.test.raises(
            UnicodeEncodeError, encode,
            pair, len(pair), True, allow_surrogates=False)
def encode_object(space, w_object, encoding, errors):
    """Encode the wrapped unicode ``w_object`` with the named codec.

    Fast paths: with strict error handling (``errors`` None or 'strict'),
    'ascii' and 'utf-8' are encoded directly and the wrapped result is
    returned.  An RUnicodeEncodeError raised there is converted into an
    app-level UnicodeEncodeError.
    """
    # NOTE(review): the visible definition stops after ``w_encoder`` is
    # resolved; the encoder is never invoked here -- confirm against the
    # full file.
    if encoding is None:
        # Get the encoder functions as a wrapped object.
        # This lookup is cached.
        w_encoder = space.sys.get_w_default_encoder()
    else:
        strict = errors is None or errors == 'strict'
        if strict:
            try:
                if encoding == 'ascii':
                    text = space.unicode_w(w_object)
                    handler = unicodehelper.rpy_encode_error_handler()
                    encoded = unicode_encode_ascii(
                        text, len(text), None, errorhandler=handler)
                    return space.wrap(encoded)
                elif encoding == 'utf-8':
                    text = space.unicode_w(w_object)
                    handler = unicodehelper.rpy_encode_error_handler()
                    encoded = unicode_encode_utf_8(
                        text, len(text), None, errorhandler=handler,
                        allow_surrogates=True)
                    return space.wrap(encoded)
            except unicodehelper.RUnicodeEncodeError as ue:
                w_args = space.newtuple([
                    space.wrap(ue.encoding),
                    space.wrap(ue.object),
                    space.wrap(ue.start),
                    space.wrap(ue.end),
                    space.wrap(ue.reason)])
                raise OperationError(space.w_UnicodeEncodeError, w_args)
        from pypy.module._codecs.interp_codecs import lookup_codec
        w_codec = lookup_codec(space, encoding)
        w_encoder = space.getitem(w_codec, space.wrap(0))
def encode_object(space, w_object, encoding, errors):
    """Encode the wrapped unicode ``w_object`` and return wrapped bytes.

    With strict error handling (``errors`` None or 'strict'), 'ascii' and
    'utf-8' are encoded directly.  Otherwise the encoder is looked up (the
    cached default encoder when ``encoding`` is None), called with the
    object and the error mode, and item 0 of its result is returned.

    Raises an app-level TypeError when the encoder's result is not bytes.
    """
    if encoding is None:
        # Get the encoder functions as a wrapped object.
        # This lookup is cached.
        w_encoder = space.sys.get_w_default_encoder()
    else:
        strict = errors is None or errors == 'strict'
        if strict and encoding == 'ascii':
            text = space.unicode_w(w_object)
            handler = unicodehelper.encode_error_handler(space)
            encoded = unicode_encode_ascii(
                text, len(text), None, errorhandler=handler)
            return space.newbytes(encoded)
        if strict and encoding == 'utf-8':
            text = space.unicode_w(w_object)
            handler = unicodehelper.encode_error_handler(space)
            encoded = unicode_encode_utf_8(
                text, len(text), None, errorhandler=handler,
                allow_surrogates=True)
            return space.newbytes(encoded)
        from pypy.module._codecs.interp_codecs import lookup_codec
        w_codec = lookup_codec(space, encoding)
        w_encoder = space.getitem(w_codec, space.newint(0))

    w_errors = space.newtext('strict' if errors is None else errors)
    w_restuple = space.call_function(w_encoder, w_object, w_errors)
    w_retval = space.getitem(w_restuple, space.newint(0))
    if space.isinstance_w(w_retval, space.w_bytes):
        return w_retval
    raise oefmt(space.w_TypeError,
                "encoder did not return an string object (type '%T')",
                w_retval)
def handle_keypress(self, c_type, event):
    """Translate an SDL keyboard event into ``self.key``.

    Arrow keys map to fixed codes (28-31).  Otherwise the event's unicode
    value is used when it encodes to a single UTF-8 byte >= 32.  If no key
    was chosen and the key symbol is <= 255, the symbol is used.

    Raises KeyboardInterrupt when the key and the current modifier mask
    match ``self.interrupt_key``.
    """
    self.key = 0
    key_event = rffi.cast(RSDL.KeyboardEventPtr, event)
    keysym = rffi.getintfield(key_event.c_keysym, 'c_sym')
    codepoint = rffi.getintfield(key_event.c_keysym, 'c_unicode')

    if keysym == RSDL.K_DOWN:
        self.key = 31
    elif keysym == RSDL.K_LEFT:
        self.key = 28
    elif keysym == RSDL.K_RIGHT:
        self.key = 29
    elif keysym == RSDL.K_UP:
        self.key = 30
    elif codepoint != 0:
        encoded = unicode_encode_utf_8(unichr(codepoint), 1, "ignore")
        # Only single-byte encodings at or above 32 are accepted.
        if len(encoded) == 1:
            byte_value = ord(encoded[0])
            if byte_value >= 32:
                self.key = byte_value

    # Fall back to the raw key symbol when nothing was selected above.
    if self.key == 0 and keysym <= 255:
        self.key = keysym

    # Low byte of interrupt_key is the key, the rest the modifier mask.
    interrupt = self.interrupt_key
    if interrupt & 0xFF == self.key:
        if interrupt >> 8 == self.get_modifier_mask(0):
            raise KeyboardInterrupt
def encode_object(space, w_object, encoding, errors):
    """Encode the wrapped unicode ``w_object`` with the named codec.

    Fast paths: with strict error handling (``errors`` None or 'strict'),
    'ascii' and 'utf-8' are encoded directly and the wrapped result is
    returned.  An RUnicodeEncodeError raised there is converted into an
    app-level UnicodeEncodeError.
    """
    # NOTE(review): the visible definition stops after ``w_encoder`` is
    # resolved; the encoder is never invoked here -- confirm against the
    # full file.
    if encoding is None:
        # Get the encoder functions as a wrapped object.
        # This lookup is cached.
        w_encoder = space.sys.get_w_default_encoder()
    else:
        strict = errors is None or errors == 'strict'
        if strict:
            try:
                if encoding == 'ascii':
                    text = space.unicode_w(w_object)
                    handler = unicodehelper.raise_unicode_exception_encode
                    encoded = unicode_encode_ascii(
                        text, len(text), None, errorhandler=handler)
                    return space.wrap(encoded)
                elif encoding == 'utf-8':
                    text = space.unicode_w(w_object)
                    handler = unicodehelper.raise_unicode_exception_encode
                    encoded = unicode_encode_utf_8(
                        text, len(text), None, errorhandler=handler,
                        allow_surrogates=True)
                    return space.wrap(encoded)
            except unicodehelper.RUnicodeEncodeError as ue:
                w_args = space.newtuple([
                    space.wrap(ue.encoding),
                    space.wrap(ue.object),
                    space.wrap(ue.start),
                    space.wrap(ue.end),
                    space.wrap(ue.reason),
                ])
                raise OperationError(space.w_UnicodeEncodeError, w_args)
        from pypy.module._codecs.interp_codecs import lookup_codec
        w_codec = lookup_codec(space, encoding)
        w_encoder = space.getitem(w_codec, space.wrap(0))
def encode_object(space, w_object, encoding, errors):
    """Encode the wrapped unicode ``w_object`` and return a wrapped str.

    With strict error handling (``errors`` None or 'strict'), 'ascii' and
    'utf-8' are encoded directly.  Otherwise the encoder is looked up (the
    cached default encoder when ``encoding`` is None), called with the
    object and the error mode, and item 0 of its result is returned.

    Raises an app-level TypeError when the encoder's result is not a str.
    """
    if encoding is None:
        # Get the encoder functions as a wrapped object.
        # This lookup is cached.
        w_encoder = space.sys.get_w_default_encoder()
    else:
        strict = errors is None or errors == 'strict'
        if strict and encoding == 'ascii':
            text = space.unicode_w(w_object)
            handler = unicodehelper.encode_error_handler(space)
            encoded = unicode_encode_ascii(
                text, len(text), None, errorhandler=handler)
            return space.wrap(encoded)
        if strict and encoding == 'utf-8':
            text = space.unicode_w(w_object)
            handler = unicodehelper.encode_error_handler(space)
            encoded = unicode_encode_utf_8(
                text, len(text), None, errorhandler=handler,
                allow_surrogates=True)
            return space.wrap(encoded)
        from pypy.module._codecs.interp_codecs import lookup_codec
        w_codec = lookup_codec(space, encoding)
        w_encoder = space.getitem(w_codec, space.wrap(0))

    w_errors = space.wrap('strict' if errors is None else errors)
    w_restuple = space.call_function(w_encoder, w_object, w_errors)
    w_retval = space.getitem(w_restuple, space.wrap(0))
    if space.isinstance_w(w_retval, space.w_str):
        return w_retval
    raise oefmt(space.w_TypeError,
                "encoder did not return an string object (type '%T')",
                w_retval)
def utf_8_encode(space, uni, errors="strict"):
    """Encode ``uni`` to UTF-8 and return a wrapped ``(bytes, length)`` pair.

    ``errors`` of None is treated as 'strict'.  Encoding errors go to the
    codec state's ``encode_error_handler``; surrogates are allowed.  The
    second tuple item is ``len(uni)``.
    """
    mode = 'strict' if errors is None else errors
    codec_state = space.fromcache(CodecState)
    encoded = runicode.unicode_encode_utf_8(
        uni, len(uni), mode, codec_state.encode_error_handler,
        allow_surrogates=True)
    w_encoded = space.wrap(encoded)
    w_length = space.wrap(len(uni))
    return space.newtuple([w_encoded, w_length])
def encode_utf8(space, uni, allow_surrogates=False):
    """Encode ``uni`` to UTF-8 using 'strict' error handling.

    Errors are routed to the handler built by ``encode_error_handler(space)``
    and ``allow_surrogates`` is forwarded unchanged to the codec.
    """
    # NOTE(review): an earlier comment said this never raises
    # UnicodeEncodeError because surrogate pairs are allowed (unlike
    # Python3).  That can only hold when allow_surrogates=True, which is
    # not the default -- confirm.
    handler = encode_error_handler(space)
    return runicode.unicode_encode_utf_8(
        uni, len(uni), "strict",
        errorhandler=handler,
        allow_surrogates=allow_surrogates)
def f(n):
    # Annotator test helper: builds a unicode string from ``n``, picks one
    # of two unicode literals for ``y``, encodes ``y`` with
    # unicode_encode_utf_8 (using ``errorhandler`` from the enclosing
    # scope) and returns it appended to the UTF-8 encoding of ``x``.
    x = u'àèì' + unichr(n)
    # Both branches assign a non-None unicode literal; the statement shape
    # is kept as-is because it drives the annotation noted below.
    if x:
        y = u'ìòé'
    else:
        y = u'òìàà'
    # the annotation of y is SomeUnicodeString(can_be_None=False)
    y = unicode_encode_utf_8(y, len(y), 'strict', errorhandler)
    return x.encode('utf-8') + y
def encode_utf8(space, uni):
    """Encode ``uni`` to UTF-8 following the Python2 surrogate rules.

    Surrogates are allowed, paired or lone, so no UnicodeEncodeError is
    expected.  A surrogate pair is treated as the non-BMP character it
    stands for.  Python3 differs.
    """
    size = len(uni)
    return runicode.unicode_encode_utf_8(
        uni, size, "strict",
        errorhandler=raise_unicode_exception_encode,
        allow_surrogates=True)
def f(n):
    # Annotator test helper: builds a unicode string from ``n``, picks one
    # of two unicode literals for ``y``, encodes ``y`` with
    # unicode_encode_utf_8 (using ``errorhandler`` from the enclosing
    # scope) and returns it appended to the UTF-8 encoding of ``x``.
    x = u"àèì" + unichr(n)
    # Both branches assign a non-None unicode literal; the statement shape
    # is kept as-is because it drives the annotation noted below.
    if x:
        y = u"ìòé"
    else:
        y = u"òìàà"
    # the annotation of y is SomeUnicodeString(can_be_None=False)
    y = unicode_encode_utf_8(y, len(y), "strict", errorhandler)
    return x.encode("utf-8") + y
def utf_8_encode(space, uni, errors="strict"):
    """Encode ``uni`` to UTF-8 and return a wrapped ``(bytes, length)`` pair.

    ``errors`` of None is treated as 'strict'.  Encoding errors go to the
    codec state's ``encode_error_handler``; surrogates are allowed.  The
    second tuple item is ``len(uni)``.
    """
    mode = 'strict' if errors is None else errors
    codec_state = space.fromcache(CodecState)
    encoded = runicode.unicode_encode_utf_8(
        uni, len(uni), mode, codec_state.encode_error_handler,
        allow_surrogates=True)
    w_encoded = space.wrap(encoded)
    w_length = space.wrap(len(uni))
    return space.newtuple([w_encoded, w_length])
def write_raw_cached_string(self, si):
    """Write the unicode string ``si`` as a length-prefixed UTF-8 string.

    Without the cache, the encoded length and bytes are always written.
    With the cache, the first occurrence is written in full and given the
    next index; later occurrences write only ``MAX_STRING_SIZE + index``.
    """
    assert isinstance(si, unicode)

    if not self._with_cache:
        errors = "?"
        encoded = unicode_encode_utf_8(si, len(si), errors)
        assert len(encoded) <= MAX_INT32
        write_int_raw(len(encoded), self)
        self.write(encoded)
        return

    cache_index = self._string_cache.get(si, -1)
    if cache_index != -1:
        # Already seen: emit a back-reference instead of the bytes.
        write_int_raw(r_uint(MAX_STRING_SIZE + cache_index), self)
        return

    # First occurrence: register it, then write it out in full.
    cache_index = len(self._string_cache)
    self._string_cache[si] = cache_index
    encoded = unicode_encode_utf_8(si, len(si), "?")
    write_int_raw(len(encoded), self)
    assert len(encoded) <= MAX_STRING_SIZE
    self.write(encoded)
def encode_utf8(space, uni):
    """Encode ``uni`` to UTF-8 following the Python2 surrogate rules.

    Surrogates are allowed, paired or lone, so no UnicodeEncodeError is
    expected.  A surrogate pair is treated as the non-BMP character it
    stands for.  Python3 differs.
    """
    size = len(uni)
    return runicode.unicode_encode_utf_8(
        uni, size, "strict",
        errorhandler=None,
        allow_surrogates=True)
def identifier_w(self, space):
    """Return this object's value encoded as UTF-8, caching the result.

    A previously computed encoding in ``self._utf8`` is returned as-is.
    Otherwise ``self._value`` is encoded, stored in ``self._utf8`` and
    returned.

    Raises the app-level error built by ``wrap_encode_error`` when the
    encoder signals an RUnicodeEncodeError.
    """
    identifier = self._utf8
    if identifier is not None:
        return identifier
    u = self._value
    eh = unicodehelper.rpy_encode_error_handler()
    try:
        identifier = unicode_encode_utf_8(u, len(u), None,
                                          errorhandler=eh)
    except unicodehelper.RUnicodeEncodeError as ue:
        raise wrap_encode_error(space, ue)
    # Previously the freshly encoded value was dropped and the function
    # fell through returning None; remember it and hand it back.
    self._utf8 = identifier
    return identifier
def encode_utf8(space, uni, allow_surrogates=False):
    """Encode the unicode string ``uni`` to UTF-8 with 'strict' errors.

    Python3 tends to forbid *all* surrogates in utf-8.  Passing
    ``allow_surrogates=True`` reverts to the Python 2 behavior, which
    never raises UnicodeEncodeError: surrogates are accepted, paired or
    lone, and a pair is treated as the non-BMP character it stands for.
    See also *_utf8sp().
    """
    assert isinstance(uni, unicode)
    handler = encode_error_handler(space)
    return runicode.unicode_encode_utf_8(
        uni, len(uni), "strict",
        errorhandler=handler,
        allow_surrogates=allow_surrogates)
def unicode_encode_utf8(rope, allow_surrogates=False):
    """Return a rope holding the UTF-8 encoding of ``rope``.

    * An all-ASCII rope is returned unchanged.
    * A ``BinaryConcatNode`` is encoded child by child.
    * A ``LiteralUnicodeNode`` is encoded with the runicode UTF-8 encoder
      in 'strict' mode.
    * A ``LiteralStringNode`` is passed through ``_str_encode_utf_8``.

    ``allow_surrogates`` is forwarded to the encoder for every unicode
    leaf.
    """
    from rpython.rlib.runicode import unicode_encode_utf_8
    if rope.is_ascii():
        return rope
    elif isinstance(rope, BinaryConcatNode):
        # Forward allow_surrogates to both children.  The recursive calls
        # used to omit it, silently resetting it to False below the root.
        return BinaryConcatNode(
            unicode_encode_utf8(rope.left, allow_surrogates),
            unicode_encode_utf8(rope.right, allow_surrogates))
    elif isinstance(rope, LiteralUnicodeNode):
        return LiteralStringNode(
            unicode_encode_utf_8(rope.u, len(rope.u), "strict",
                                 allow_surrogates=allow_surrogates))
    elif isinstance(rope, LiteralStringNode):
        return LiteralStringNode(_str_encode_utf_8(rope.s))
    # NOTE(review): any other node type falls through and yields None, as
    # before -- confirm no other node kinds can reach this point.
def handle_keypress(self, c_type, event):
    """Translate an SDL keyboard event into ``self.key``.

    Navigation, modifier and lock keys map to ``key_constants`` values.
    Otherwise the event's unicode value is used when it encodes to a
    single UTF-8 byte >= 32.  If no key was chosen and the key symbol is
    <= 255, the symbol is used.

    Raises KeyboardInterrupt when the key and the current modifier mask
    match ``self.interrupt_key``.
    """
    self.key = 0
    key_event = rffi.cast(RSDL.KeyboardEventPtr, event)
    keysym = rffi.getintfield(key_event.c_keysym, 'c_sym')
    codepoint = rffi.getintfield(key_event.c_keysym, 'c_unicode')

    # Special keys first; printable characters are the last resort.
    if keysym == RSDL.K_DOWN:
        self.key = key_constants.DOWN
    elif keysym == RSDL.K_LEFT:
        self.key = key_constants.LEFT
    elif keysym == RSDL.K_RIGHT:
        self.key = key_constants.RIGHT
    elif keysym == RSDL.K_UP:
        self.key = key_constants.UP
    elif keysym == RSDL.K_HOME:
        self.key = key_constants.HOME
    elif keysym == RSDL.K_END:
        self.key = key_constants.END
    elif keysym == RSDL.K_INSERT:
        self.key = key_constants.INSERT
    elif keysym == RSDL.K_PAGEUP:
        self.key = key_constants.PAGEUP
    elif keysym == RSDL.K_PAGEDOWN:
        self.key = key_constants.PAGEDOWN
    elif keysym == RSDL.K_LSHIFT or keysym == RSDL.K_RSHIFT:
        self.key = key_constants.SHIFT
    elif keysym == RSDL.K_LCTRL or keysym == RSDL.K_RCTRL:
        self.key = key_constants.CTRL
    elif keysym == RSDL.K_LALT or keysym == RSDL.K_RALT:
        self.key = key_constants.COMMAND
    elif keysym == RSDL.K_BREAK:
        self.key = key_constants.BREAK
    elif keysym == RSDL.K_CAPSLOCK:
        self.key = key_constants.CAPSLOCK
    elif keysym == RSDL.K_NUMLOCK:
        self.key = key_constants.NUMLOCK
    elif keysym == RSDL.K_SCROLLOCK:
        self.key = key_constants.SCROLLOCK
    elif codepoint != 0:
        encoded = unicode_encode_utf_8(unichr(codepoint), 1, "ignore")
        # Only single-byte encodings at or above 32 are accepted.
        if len(encoded) == 1:
            byte_value = ord(encoded[0])
            if byte_value >= 32:
                self.key = byte_value

    # Fall back to the raw key symbol when nothing was selected above.
    if self.key == 0 and keysym <= 255:
        self.key = keysym

    # Low byte of interrupt_key is the key, the rest the modifier mask.
    interrupt = self.interrupt_key
    if interrupt & 0xFF == self.key:
        if interrupt >> 8 == self.get_modifier_mask(0):
            raise KeyboardInterrupt
def surrogateescape_errors(space, w_exc):
    # Error callback implementing 'surrogateescape'.
    #
    # Encode errors: every code point in the failing range must lie in
    # U+DC80..U+DCFF; each is turned back into the byte (code - 0xDC00).
    # Returns a wrapped (bytes, end) tuple.
    # Decode errors: up to four non-ASCII bytes from the failing range are
    # mapped to U+DC00 + byte.  Returns a wrapped (text, new position)
    # tuple.
    # Anything else raises an app-level TypeError; input that cannot be
    # escaped re-raises the original exception.
    check_exception(space, w_exc)
    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        w_obj = space.getattr(w_exc, space.newtext('object'))
        w_obj = space.convert_arg_to_w_unicode(w_obj)
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        end = space.int_w(w_end)
        res = ''
        # Convert start/end from indexes into byte offsets within the
        # object's utf8 storage, which is what the loop below walks.
        start = w_obj._index_to_byte(start)
        end = w_obj._index_to_byte(end)
        obj = w_obj._utf8
        pos = start
        while pos < end:
            code = rutf8.codepoint_at_pos(obj, pos)
            if code < 0xdc80 or code > 0xdcff:
                # Not a UTF-8b surrogate, fail with original exception
                raise OperationError(space.type(w_exc), w_exc)
            res += chr(code - 0xdc00)
            pos = rutf8.next_codepoint_pos(obj, pos)
        # The original wrapped 'end' is handed back unchanged.
        return space.newtuple([space.newbytes(res), w_end])
    elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        consumed = 0
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        end = space.int_w(space.getattr(w_exc, space.newtext('end')))
        obj = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        replace = u''
        # Escape at most 4 bytes, and never past the reported range.
        while consumed < 4 and consumed < end - start:
            c = ord(obj[start + consumed])
            if c < 128:
                # Refuse to escape ASCII bytes.
                break
            replace += unichr(0xdc00 + c)
            consumed += 1
        if not consumed:
            # codec complained about ASCII byte.
            raise OperationError(space.type(w_exc), w_exc)
        # The replacement consists of lone surrogates, so they must be
        # allowed when producing its utf8 form.
        replace_utf8 = runicode.unicode_encode_utf_8(
            replace, len(replace), 'strict', allow_surrogates=True)
        return space.newtuple([
            space.newtext(replace_utf8, len(replace)),
            space.newint(start + consumed)
        ])
    else:
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)
def encode_object(space, w_object, encoding, errors):
    """Encode the wrapped unicode ``w_object`` with the named codec.

    Fast paths: with strict error handling (``errors`` None or 'strict'),
    'ascii' and 'utf-8' are encoded directly and returned as wrapped
    bytes.  An RUnicodeEncodeError raised there is converted with
    ``wrap_encode_error``.
    """
    # NOTE(review): the visible definition stops after ``w_encoder`` is
    # resolved; the encoder is never invoked here -- confirm against the
    # full file.
    if encoding is None:
        # Get the encoder functions as a wrapped object.
        # This lookup is cached.
        w_encoder = space.sys.get_w_default_encoder()
    else:
        strict = errors is None or errors == 'strict'
        if strict:
            try:
                if encoding == 'ascii':
                    text = space.unicode_w(w_object)
                    handler = unicodehelper.rpy_encode_error_handler()
                    encoded = unicode_encode_ascii(
                        text, len(text), None, errorhandler=handler)
                    return space.wrapbytes(encoded)
                elif encoding == 'utf-8':
                    text = space.unicode_w(w_object)
                    handler = unicodehelper.rpy_encode_error_handler()
                    encoded = unicode_encode_utf_8(
                        text, len(text), None, errorhandler=handler)
                    return space.wrapbytes(encoded)
            except unicodehelper.RUnicodeEncodeError as ue:
                raise wrap_encode_error(space, ue)
        from pypy.module._codecs.interp_codecs import lookup_codec
        w_codec = lookup_codec(space, encoding)
        w_encoder = space.getitem(w_codec, space.wrap(0))
def encode_object(space, w_object, encoding, errors):
    """Encode the wrapped unicode ``w_object`` and return wrapped bytes.

    With strict error handling (``errors`` None or 'strict'), UTF-8 (also
    used when ``encoding`` is None) and ASCII are encoded directly.
    Everything else goes through ``encode_text``, with ``encoding``
    defaulting to ``space.sys.defaultencoding``.

    Raises an app-level TypeError when the codec returns non-bytes.
    """
    strict = errors is None or errors == 'strict'
    if strict:
        if encoding is None or encoding == 'utf-8':
            text = space.unicode_w(w_object)
            handler = unicodehelper.encode_error_handler(space)
            encoded = unicode_encode_utf_8(
                text, len(text), errors, errorhandler=handler)
            return space.newbytes(encoded)
        if encoding == 'ascii':
            text = space.unicode_w(w_object)
            handler = unicodehelper.encode_error_handler(space)
            encoded = unicode_encode_ascii(
                text, len(text), errors, errorhandler=handler)
            return space.newbytes(encoded)

    from pypy.module._codecs.interp_codecs import encode_text
    if encoding is None:
        encoding = space.sys.defaultencoding
    w_retval = encode_text(space, w_object, encoding, errors)
    if space.isinstance_w(w_retval, space.w_bytes):
        return w_retval
    raise oefmt(
        space.w_TypeError,
        "'%s' encoder returned '%T' instead of 'bytes'; "
        "use codecs.encode() to encode to arbitrary types",
        encoding, w_retval)
def _create_dict(self, dct):
    """Build a JsonObject from ``dct``, encoding each unicode key to UTF-8.

    Values are stored unchanged under the encoded keys.
    """
    members = {}
    for name, member in dct.iteritems():
        utf8_name = unicode_encode_utf_8(name, len(name), "strict")
        members[utf8_name] = member
    return JsonObject(members)
def as_bytes(self):
    """Return ``self.unistr`` encoded as UTF-8 ('strict').

    The result is passed through ``rstring.assert_str0`` before being
    returned.
    """
    from rpython.rlib.runicode import unicode_encode_utf_8
    text = self.unistr
    return rstring.assert_str0(
        unicode_encode_utf_8(text, len(text), "strict"))
def tostring(self):
    """Return ``#\\`` followed by the UTF-8 encoding of ``self.value``."""
    value = self.value
    encoded = runicode.unicode_encode_utf_8(value, len(value), "strict")
    return "#\\%s" % encoded
def unicode_to_utf8(s):
    """Return the `unicode` value ``s`` as a UTF-8 encoded `str`.

    Encoding uses the 'strict' error mode.
    """
    size = len(s)
    encoded = unicode_encode_utf_8(s, size, 'strict')
    return encoded
def write_char(w_char, w_port, env, cont):
    """Print the character held by ``w_char`` to ``w_port`` as UTF-8.

    The value is encoded in 'strict' mode and passed to ``do_print``
    together with the port, environment and continuation; the result of
    ``do_print`` is returned.
    """
    from rpython.rlib.runicode import unicode_encode_utf_8
    char = w_char.value
    encoded = unicode_encode_utf_8(char, len(char), "strict")
    return do_print(encoded, w_port, env, cont)
def f(x):
    """Round-trip a UTF-8 sample through the runicode codec.

    The sample (two 2-byte, one 3-byte and one 4-byte sequence) is repeated
    ``x`` times, decoded, and encoded again.  Returns True when the result
    equals the original bytes.
    """
    # NOTE(review): ``errors`` is passed as True here, whereas other callers
    # pass a string such as 'strict' -- confirm this is intended.
    sample = "\xd7\x90\xd6\x96\xeb\x96\x95\xf0\x90\x91\x93"
    original = "".join([sample] * x)
    decoded, consumed = runicode.str_decode_utf_8(
        original, len(original), True)
    reencoded = runicode.unicode_encode_utf_8(decoded, len(decoded), True)
    return original == reencoded
def encode_utf_escape(self, utf_escape):
    """Turn the hex digits of a unicode escape into a list of UTF-8 bytes.

    ``utf_escape`` is a sequence of hex-digit characters.  Code points
    above U+10FFFF (the end of the Unicode codespace) are reported through
    ``self.error``.  The result is a list of the one-character strings
    that make up the UTF-8 encoding.
    """
    utf_codepoint = int("".join(utf_escape), 16)
    # The last valid Unicode code point is U+10FFFF.  The previous bound
    # (0x101111) wrongly rejected U+101112..U+10FFFF.
    if utf_codepoint > 0x10FFFF:
        self.error("invalid Unicode codepoint (too large)")
    uchr = unichr(utf_codepoint)
    # Pass the real length rather than a hard-coded 1.
    return [c for c in unicode_encode_utf_8(uchr, len(uchr), "ignore")]
def hex_to_utf8(s):
    """Return the UTF-8 encoding of the code point written in hex as ``s``.

    Encoding uses the 'strict' error mode.
    """
    codepoint = int(s, 16)
    char = UNICHR(codepoint)
    return unicode_encode_utf_8(char, len(char), 'strict')
def as_bytes(self):
    """Return ``self.unistr`` encoded as UTF-8 ('strict')."""
    from rpython.rlib.runicode import unicode_encode_utf_8
    text = self.unistr
    encoded = unicode_encode_utf_8(text, len(text), "strict")
    return encoded
def encode_utf8(space, uni):
    """Encode ``uni`` to UTF-8 in 'strict' mode with surrogates allowed.

    Errors are routed to the handler built by ``encode_error_handler(space)``.
    """
    handler = encode_error_handler(space)
    return runicode.unicode_encode_utf_8(
        uni, len(uni), "strict",
        errorhandler=handler,
        allow_surrogates=True)
def wrapunicode(self, x):
    """Wrap the unicode string ``x`` as a JsonString of its UTF-8 bytes.

    Encoding uses the 'strict' error mode.
    """
    encoded = unicode_encode_utf_8(x, len(x), "strict")
    return JsonString(encoded)
def encode_unicode_utf8(string):
    """Return ``string`` encoded as UTF-8.

    ``None`` is passed as the codec's ``errors`` argument.
    """
    size = len(string)
    return runicode.unicode_encode_utf_8(string, size, None)