def _decode_helper(cp, s, flags, encoding, errors,
                   errorhandler, final, start, end, res):
    # Decode the byte slice s[start:end] with the Windows code page 'cp'
    # via MultiByteToWideChar, appending the resulting utf8 string to the
    # 'res' accumulator list.  Returns (new position in s, codepoint count
    # of the appended chunk).  On conversion failure, delegates to
    # _decode_cp_error, which consults 'errorhandler'.
    if end > len(s):
        end = len(s)
    piece = s[start:end]
    with rffi.scoped_nonmovingbuffer(piece) as dataptr:
        # first get the size of the result: a NULL output buffer makes
        # MultiByteToWideChar return the required length in wide chars
        outsize = MultiByteToWideChar(cp, flags, dataptr, len(piece),
                                      lltype.nullptr(rffi.CWCHARP.TO), 0)
        if outsize == 0:
            r, pos = _decode_cp_error(s, errorhandler, encoding, errors,
                                      final, start, end)
            res.append(r)
            return pos, check_utf8(r, True)
        with rffi.scoped_alloc_unicodebuffer(outsize) as buf:
            # do the conversion into the freshly allocated buffer
            if MultiByteToWideChar(cp, flags, dataptr, len(piece),
                                   buf.raw, outsize) == 0:
                r, pos = _decode_cp_error(s, errorhandler, encoding, errors,
                                          final, start, end)
                res.append(r)
                return pos, check_utf8(r, True)
            buf_as_str = buf.str(outsize)
            assert buf_as_str is not None
            # re-encode the wide-char result as utf8
            with rffi.scoped_nonmoving_unicodebuffer(buf_as_str) as dataptr:
                conv = _unibuf_to_utf8(dataptr, outsize)
            res.append(conv)
            return end, codepoints_in_utf8(conv)
def verify_identifier(token):
    # 1=ok; 0=not an identifier; -1=bad utf-8
    try:
        rutf8.check_utf8(token, False)
    except rutf8.CheckError:
        result = -1
    else:
        from pypy.objspace.std.unicodeobject import _isidentifier
        result = _isidentifier(token)
    return result
def check_utf8(space, s, ps, end):
    # Scan forward from 'ps' over the run of non-ASCII bytes (high bit
    # set) and validate that run as utf-8; return the scanned slice.
    assert ps >= 0
    pt = ps
    # while (s < end && *s != '\\') s++; */ /* inefficient for u".."
    while ps < end and ord(s[ps]) & 0x80:
        ps += 1
    try:
        rutf8.check_utf8(s, True, pt, ps)
    except rutf8.CheckError as e:
        # re-check only the valid prefix to find the codepoint index of
        # the first bad byte, then raise through the space's decode
        # error handler ('strict' always raises)
        lgt, flag = rutf8.check_utf8(s, True, pt, e.pos)
        unicodehelper.decode_error_handler(space)(
            'strict', 'utf8', 'invalid utf-8', s, pt + lgt, pt + lgt + 1)
    return s[pt:ps]
def wrap(self, x):
    """ Wraps the Python value 'x' into one of the wrapper classes. This
    should only be used for tests, in real code you need to use the
    explicit new* methods."""
    if x is None:
        return self.w_None
    if isinstance(x, OperationError):
        raise TypeError("attempt to wrap already wrapped exception: %s"%
                        (x,))
    if isinstance(x, int):
        # bool is a subclass of int, so it must be tested first
        if isinstance(x, bool):
            return self.newbool(x)
        else:
            return self.newint(x)
    if isinstance(x, str):
        return self.newtext(x)
    if isinstance(x, unicode):
        # host-level (Python 2) unicode: store as utf8 + codepoint count
        x = x.encode('utf8')
        lgt = rutf8.check_utf8(x, True)
        return self.newutf8(x, lgt)
    if isinstance(x, float):
        return W_FloatObject(x)
    if isinstance(x, W_Root):
        # already an app-level object: just bind it to this space
        w_result = x.spacebind(self)
        #print 'wrapping', x, '->', w_result
        return w_result
    if isinstance(x, base_int):
        return self.newint(x)
    # we might get there in non-translated versions if 'x' is
    # a long that fits the correct range.
    if is_valid_int(x):
        return self.newint(x)
    return self._wrap_not_rpython(x)
def xmlcharrefreplace_errors(space, w_exc):
    """'xmlcharrefreplace' error handler: replace each unencodable
    codepoint with an XML numeric character reference "&#NNN;"."""
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)
    w_obj = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_obj)   # for errors
    w_obj = space.convert_arg_to_w_unicode(w_obj)
    start = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end = space.int_w(w_end)
    # translate codepoint indices into byte offsets of the utf8 storage
    start = w_obj._index_to_byte(start)
    end = w_obj._index_to_byte(end)
    utf8 = w_obj._utf8
    out = StringBuilder()
    bytepos = start
    while bytepos < end:
        cp = rutf8.codepoint_at_pos(utf8, bytepos)
        out.append("&#")
        out.append(str(cp))
        out.append(";")
        bytepos = rutf8.next_codepoint_pos(utf8, bytepos)
    replacement = out.build()
    lgt = rutf8.check_utf8(replacement, True)
    return space.newtuple([space.newutf8(replacement, lgt), w_end])
def namereplace_errors(space, w_exc):
    """'namereplace' error handler: replace each unencodable codepoint
    with '\\N{NAME}' (falling back to a raw-unicode escape when the
    character has no name in the unicode database)."""
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)
    w_obj = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_obj)   # for errors
    w_obj = space.convert_arg_to_w_unicode(w_obj)
    start = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end = space.int_w(w_end)
    # translate codepoint indices into byte offsets of the utf8 storage
    start = w_obj._index_to_byte(start)
    end = w_obj._index_to_byte(end)
    utf8 = w_obj._utf8
    out = StringBuilder()
    bytepos = start
    while bytepos < end:
        cp = rutf8.codepoint_at_pos(utf8, bytepos)
        try:
            name = unicodedb.name(cp)
        except KeyError:
            # unnamed codepoint: fall back to \xXX/\uXXXX/\UXXXXXXXX
            unicodehelper.raw_unicode_escape_helper(out, cp)
        else:
            out.append('\\N{')
            out.append(name)
            out.append('}')
        bytepos = rutf8.next_codepoint_pos(utf8, bytepos)
    replacement = out.build()
    lgt = rutf8.check_utf8(replacement, True)
    return space.newtuple([space.newutf8(replacement, lgt), w_end])
def _maybe_utf8_to_w(space, utf8):
    """Wrap the NUL-terminated C char buffer 'utf8' as an app-level text
    object after validating it is well-formed utf-8."""
    # should this be a method of space?
    as_str = rffi.constcharp2str(utf8)
    try:
        codepoints = rutf8.check_utf8(as_str, allow_surrogates=False)
    except rutf8.CheckError:
        raise   # XXX do something
    return space.newtext(as_str, codepoints)
def format_method(space, w_string, args, is_unicode):
    """Implementation of str/unicode .format(): build the result with the
    matching template formatter and wrap it appropriately."""
    if not is_unicode:
        formatter = str_template_formatter(space, space.bytes_w(w_string))
        return space.newbytes(formatter.build(args))
    formatter = unicode_template_formatter(space, space.utf8_w(w_string))
    result = formatter.build(args)
    lgt = rutf8.check_utf8(result, True)
    return space.newutf8(result, lgt)
def getmappingvalue(self, key):
    # Return the value corresponding to 'key' in the input mapping
    # (used by the '%(name)s' style of formatting).
    space = self.space
    if self.w_valuedict is None:
        raise oefmt(space.w_TypeError, "format requires a mapping")
    # 'do_unicode' comes from the enclosing scope and selects whether
    # keys are wrapped as unicode or as bytes
    if do_unicode:
        w_key = space.newutf8(key, rutf8.check_utf8(key, True))
    else:
        w_key = space.newbytes(key)
    return space.getitem(self.w_valuedict, w_key)
def w_convert(self, space, s):
    # Wrap the byte string 's' as an app-level unicode object; 's' is
    # expected to be valid utf-8, but nothing upstream guarantees it,
    # so fall back to raising a proper decode error if it is not.
    try:
        lgt = rutf8.check_utf8(s, True)
    except rutf8.CheckError:
        from pypy.interpreter import unicodehelper
        # get the correct error msg
        unicodehelper.str_decode_utf8(
            s, 'string', True, unicodehelper.decode_error_handler(space))
        assert False, "always raises"
        return space.newtext(s)
    else:
        return space.newutf8(s, lgt)
def call_errorhandler(errors, encoding, reason, input, startpos, endpos):
    """Generic wrapper for calling into error handlers.

    Note that error handler receives and returns position into the unicode
    characters, not into the position of utf8 bytes, so it needs to be
    converted by the codec

    Returns (unicode_or_none, str_or_none, newpos) as error handlers may
    return unicode or on Python 3, bytes.
    """
    # NOTE(review): 'space', 'decode' and 'lookup_error' are not
    # parameters -- presumably this function is defined inside a factory
    # that supplies them; confirm against the surrounding file.
    w_errorhandler = lookup_error(space, errors)
    if decode:
        w_cls = space.w_UnicodeDecodeError
        w_input = space.newbytes(input)
        length = len(input)
    else:
        w_cls = space.w_UnicodeEncodeError
        # surrogates allowed: the exception object must be constructible
        # even for input that failed to encode
        length = rutf8.check_utf8(input, allow_surrogates=True)
        w_input = space.newutf8(input, length)
    # build the app-level Unicode*Error instance to hand to the handler
    w_exc = space.call_function(
        w_cls,
        space.newtext(encoding),
        w_input,
        space.newint(startpos),
        space.newint(endpos),
        space.newtext(reason))
    w_res = space.call_function(w_errorhandler, w_exc)
    # the handler must return a (unicode, int) pair
    if (not space.isinstance_w(w_res, space.w_tuple)
            or space.len_w(w_res) != 2
            or not space.isinstance_w(
                space.getitem(w_res, space.newint(0)),
                space.w_unicode)):
        raise oefmt(space.w_TypeError,
                    "%s error handler must return (unicode, int) "
                    "tuple, not %R",
                    "decoding" if decode else "encoding", w_res)
    w_replace, w_newpos = space.fixedview(w_res, 2)
    newpos = space.int_w(w_newpos)
    if newpos < 0:
        # negative positions count from the end of the input
        newpos = length + newpos
    if newpos < 0 or newpos > length:
        raise oefmt(space.w_IndexError,
                    "position %d from error handler out of bounds", newpos)
    w_replace = space.convert_to_w_unicode(w_replace)
    return w_replace._utf8, newpos
def _compute_value(self, space):
    # Interpolate the precomputed format entries into a single utf8
    # string, tracking the codepoint length in 'lgt' alongside.
    # NOTE(review): 'formats' and 'entries' are not locals -- presumably
    # captured from the enclosing scope where this class is built;
    # confirm against the surrounding file.
    lst = [None] * (len(formats) + len(formats) + 1)
    lgt = 0
    for i, fmt, attr in entries:
        # literal text preceding placeholder i
        lst[i + i] = self.xstrings[i]
        lgt += len(self.xstrings[i])
        value = getattr(self, attr)
        if fmt == 'd':
            result = str(value)
            lgt += len(result)
        elif fmt == 'R':
            s = space.repr(value)
            result = space.utf8_w(s)
            lgt += space.len_w(s)
        elif fmt == 'S':
            s = space.str(value)
            result = space.utf8_w(s)
            lgt += space.len_w(s)
        elif fmt == 'T':
            # type name of an app-level object
            result = space.type(value).name
            lgt += rutf8.codepoints_in_utf8(result)
        elif fmt == 'N':
            result = value.getname(space)
            lgt += len(result)
        elif fmt == '8':
            # u'str\uxxxx' -> 'str\xXX\xXX' -> u"'str\xXX\xXX'"
            from pypy.interpreter import unicodehelper
            result, _lgt, pos = unicodehelper.str_decode_utf8(
                value, 'replace', True,
                unicodehelper.decode_never_raise, True)
            lgt += _lgt
        elif isinstance(value, unicode):
            # 's' -- host-level (Python 2) unicode value
            result = str(value.encode('utf-8'))
            lgt += len(value)
        else:
            result = str(value)
            try:
                lgt += rutf8.check_utf8(result, True)
            except rutf8.CheckError as e:
                # NOTE(review): relies on e.pos being negative here so
                # that subtracting adds the valid-prefix length -- confirm
                # against rutf8.CheckError's convention
                lgt -= e.pos
        lst[i + i + 1] = result
    # trailing literal text after the last placeholder
    lst[-1] = self.xstrings[-1]
    lgt += len(self.xstrings[-1])
    retval = ''.join(lst)
    return retval, lgt
def utf_8_decode(space, string, errors="strict", w_final=None):
    """codecs.utf_8_decode: return (decoded_string, bytes_consumed)."""
    from pypy.interpreter import unicodehelper
    if errors is None:
        errors = 'strict'
    final = space.is_true(w_final)
    state = space.fromcache(CodecState)
    # fast path: if the bytes already form valid utf-8, reuse them as-is
    try:
        lgt = rutf8.check_utf8(string, allow_surrogates=True)
    except rutf8.CheckError:
        pass
    else:
        return space.newtuple2(space.newutf8(string, lgt),
                               space.newint(len(string)))
    # slow path: run the full decoder with error handling
    res, consumed, lgt = unicodehelper.str_decode_utf8(
        string, errors, final, state.decode_error_handler)
    return space.newtuple2(space.newutf8(res, lgt),
                           space.newint(consumed))
def show_warning(space, w_filename, lineno, w_text, w_category,
                 w_sourceline=None):
    # Format a warning line and wrap it as an app-level unicode string.
    w_name = space.getattr(w_category, space.newtext("__name__"))
    w_stderr = space.sys.get("stderr")
    # Print "filename:lineno: category: text\n"
    try:
        message = "%s:%d: %s: %s\n" % (space.text_w(w_filename), lineno,
                                       space.text_w(w_name),
                                       space.text_w(w_text))
    except OperationError as e:
        if e.async(space):
            # never swallow asynchronous exceptions (KeyboardInterrupt...)
            raise
        # text_w failed; fall back to the raw utf8 bytes
        message = "%s:%d: %s: %s\n" % (space.utf8_w(w_filename), lineno,
                                       space.utf8_w(w_name),
                                       space.utf8_w(w_text))
    lgt = rutf8.check_utf8(message, True)
    w_message = space.newutf8(message, lgt)
    # NOTE(review): 'w_stderr' and 'w_message' are unused here -- the
    # write to stderr presumably follows outside this excerpt; confirm.
def multibytecodec_encerror(encodebuf, e, errors, errorcb, namecb,
                            unicodedata):
    # Handle an encode error code 'e' reported by the C-level multibyte
    # codec: compute a replacement string according to 'errors' and push
    # it back into the encode buffer.
    if e > 0:
        # positive e is the size of the offending input span
        reason = "illegal multibyte sequence"
        esize = e
    elif e == MBERR_TOOFEW:
        reason = "incomplete multibyte sequence"
        esize = pypy_cjk_enc_inbuf_remaining(encodebuf)
    elif e == MBERR_NOMEMORY:
        raise MemoryError
    else:
        raise RuntimeError
    #
    # compute the string to use as a replacement -> 'replace', and
    # the current position in the input 'unicodedata' -> 'end'
    start = pypy_cjk_enc_inbuf_consumed(encodebuf)
    end = start + esize
    if errors == "strict":
        raise EncodeDecodeError(start, end, reason)
    elif errors == "ignore":
        replace = ""
        rettype = 'b'  # != 'u'
    elif errors == "replace":
        replace = "?"  # utf-8 unicode
        rettype = 'u'
    else:
        # custom handler: may rewrite the replacement and the resume pos
        assert errorcb
        replace, end, rettype, obj = errorcb(errors, namecb, reason,
                                             unicodedata, start, end)
    if rettype == 'u':
        # the replacement is unicode (as utf8): encode it with the same
        # codec, carrying over the codec state
        codec = pypy_cjk_enc_getcodec(encodebuf)
        lgt = rutf8.check_utf8(replace, False)
        replace = encode(codec, replace, lgt, copystate=encodebuf)
    #else:
    #   replace is meant to be a byte string already
    with rffi.scoped_nonmovingbuffer(replace) as inbuf:
        r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf,
                                          len(replace), end)
    if r == MBERR_NOMEMORY:
        raise MemoryError
def backslashreplace_errors(space, w_exc):
    """'backslashreplace' error handler: replace each unencodable
    codepoint with a \\xXX, \\uXXXX or \\UXXXXXXXX escape."""
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)
    w_obj = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_obj)   # for errors
    w_obj = space.convert_arg_to_w_unicode(w_obj)
    start = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end = space.int_w(w_end)
    # translate codepoint indices into byte offsets of the utf8 storage
    start = w_obj._index_to_byte(start)
    end = w_obj._index_to_byte(end)
    utf8 = w_obj._utf8
    out = StringBuilder()
    bytepos = start
    while bytepos < end:
        cp = rutf8.codepoint_at_pos(utf8, bytepos)
        hexdigits = hex(cp)
        # pick the escape form and the fixed digit count it requires
        if cp >= 0x10000:
            out.append("\\U")
            zeros = 8
        elif cp >= 0x100:
            out.append("\\u")
            zeros = 4
        else:
            out.append("\\x")
            zeros = 2
        hexlen = len(hexdigits)
        pad = zeros + 2 - hexlen  # hexdigits starts with '0x'
        if pad > 0:
            out.append_multiple_char('0', pad)
        out.append_slice(hexdigits, 2, hexlen)
        bytepos = rutf8.next_codepoint_pos(utf8, bytepos)
    replacement = out.build()
    lgt = rutf8.check_utf8(replacement, True)
    return space.newtuple([space.newutf8(replacement, lgt), w_end])
def __init__(self, func, unwrap_spec=None, self_type=None,
             descrmismatch=None, doc=None):
    # Build the interpreter-level code object wrapping the RPython
    # function 'func', deriving the app-level signature from the
    # unwrap_spec and installing fast-path call classes when possible.
    from rpython.rlib import rutf8
    from rpython.flowspace.bytecode import cpython_code_signature
    # 'implfunc' is the interpreter-level function.
    # Note that this uses a lot of (construction-time) introspection.
    Code.__init__(self, func.__name__)
    self.docstring = doc or func.__doc__
    if self.docstring:
        # check that it's utf-8
        rutf8.check_utf8(self.docstring, False)
    self.identifier = "%s-%s-%s" % (func.__module__, func.__name__,
                                    getattr(self_type, '__name__', '*'))
    # unwrap_spec can be passed to interp2app or
    # attached as an attribute to the function.
    # It is a list of types or singleton objects:
    #  baseobjspace.ObjSpace is used to specify the space argument
    #  baseobjspace.W_Root is for wrapped arguments to keep wrapped
    #  argument.Arguments is for a final rest arguments Arguments object
    # 'args_w' for fixedview applied to rest arguments
    # 'w_args' for rest arguments passed as wrapped tuple
    # str,int,float: unwrap argument as such type
    # (function, cls) use function to check/unwrap argument of type cls
    #
    # First extract the signature from the (CPython-level) code object
    sig = cpython_code_signature(func.func_code)
    argnames = sig.argnames
    varargname = sig.varargname
    kwargname = sig.kwargname
    self._argnames = argnames
    if unwrap_spec is None:
        unwrap_spec = build_unwrap_spec(func, argnames, self_type)
    if self_type:
        assert unwrap_spec[
            0] == 'self', "self_type without 'self' spec element"
        unwrap_spec = list(unwrap_spec)
        if descrmismatch is not None:
            # descriptor called on an instance of the wrong class:
            # remember enough to produce a helpful error later
            assert issubclass(self_type, W_Root)
            unwrap_spec[0] = ('INTERNAL:self', self_type)
            self.descrmismatch_op = descrmismatch
            self.descr_reqcls = self_type
        else:
            unwrap_spec[0] = self_type
    else:
        assert descrmismatch is None, (
            "descrmismatch without a self-type specified")
    # derive the app-level signature from the unwrap_spec
    app_sig = SignatureBuilder(func)
    UnwrapSpec_Check(func, argnames).apply_over(unwrap_spec, app_sig)
    self.sig = app_sig.signature()
    argnames = self.sig.argnames
    varargname = self.sig.varargname
    self.minargs = len(argnames)
    if varargname:
        self.maxargs = sys.maxint
    else:
        self.maxargs = self.minargs
    self.activation = UnwrapSpec_EmitRun.make_activation(unwrap_spec, func)
    self._bltin = func
    self._unwrap_spec = unwrap_spec
    # speed hack: for common unwrap_spec shapes, switch to a subclass
    # with a specialized calling convention
    if 0 <= len(unwrap_spec) <= 5:
        try:
            arity, fastfunc = UnwrapSpec_FastFunc_Unwrap.make_fastfunc(
                unwrap_spec, func)
        except FastFuncNotSupported:
            if unwrap_spec == [ObjSpace, Arguments]:
                self.__class__ = BuiltinCodePassThroughArguments0
                self.func__args__ = func
            elif unwrap_spec == [ObjSpace, W_Root, Arguments]:
                self.__class__ = BuiltinCodePassThroughArguments1
                self.func__args__ = func
            elif unwrap_spec == [self_type, ObjSpace, Arguments]:
                self.__class__ = BuiltinCodePassThroughArguments1
                self.descr_reqcls = self_type
                miniglobals = {'func': func, 'self_type': self_type}
                d = {}
                source = """if 1:
                    def _call(space, w_obj, args):
                        self = space.descr_self_interp_w(self_type, w_obj)
                        return func(self, space, args)
                    \n"""
                exec compile2(source) in miniglobals, d
                self.func__args__ = d['_call']
        else:
            self.__class__ = globals()['BuiltinCode%d' % arity]
            setattr(self, 'fastfunc_%d' % arity, fastfunc)
def test_check_utf8_slice(a, b, c):
    # check_utf8 restricted to the byte span holding b must count
    # exactly b's codepoints
    prefix_len = len(a)
    encoded = b.encode('utf-8')
    stop = prefix_len + len(encoded)
    whole = a + encoded + c
    assert rutf8.check_utf8(whole, False, prefix_len, stop) == len(b)
def verify_utf8(token):
    """Return True if 'token' is well-formed utf-8, False otherwise."""
    try:
        rutf8.check_utf8(token, False)
        return True
    except rutf8.CheckError:
        return False
def wrap(self, u):
    # Wrap the utf8 byte string 'u' as an app-level unicode object,
    # computing its codepoint length on the way.
    codepoints = rutf8.check_utf8(u, True)
    return self.space.newutf8(u, codepoints)
def _parse_spec(self, default_type, default_align):
    # Parse the format-spec mini-language
    # ([[fill]align][sign][#][0][width][,][.precision][type]).
    # Returns True when the spec is empty (defaults apply), False
    # otherwise; fills in the self._* fields as a side effect.
    space = self.space
    self._fill_char = self._lit(" ")[0]
    self._align = default_align
    self._alternate = False
    self._sign = "\0"
    self._thousands_sep = False
    self._precision = -1
    the_type = default_type
    spec = self.spec
    if not spec:
        return True
    length = len(spec)
    i = 0
    got_align = True
    got_fill_char = False
    # The single character could be utf8-encoded unicode
    if self.is_unicode:
        after_i = rutf8.next_codepoint_pos(spec, i)
    else:
        after_i = i + 1
    if length - i >= 2 and self._is_alignment(spec[after_i]):
        # fill char followed by alignment char
        self._align = spec[after_i]
        self._fill_char = spec[i:after_i]
        got_fill_char = True
        i = after_i + 1
    elif length - i >= 1 and self._is_alignment(spec[i]):
        # alignment char without explicit fill
        self._align = spec[i]
        i += 1
    else:
        got_align = False
    if length - i >= 1 and self._is_sign(spec[i]):
        self._sign = spec[i]
        i += 1
    if length - i >= 1 and spec[i] == "#":
        self._alternate = True
        i += 1
    if not got_fill_char and length - i >= 1 and spec[i] == "0":
        # leading '0': zero-fill with '=' alignment unless one was given
        self._fill_char = self._lit("0")[0]
        if not got_align:
            self._align = "="
        i += 1
    self._width, i = _parse_int(self.space, spec, i, length)
    if length != i and spec[i] == ",":
        self._thousands_sep = True
        i += 1
    if length != i and spec[i] == ".":
        i += 1
        self._precision, i = _parse_int(self.space, spec, i, length)
        if self._precision == -1:
            raise oefmt(space.w_ValueError, "no precision given")
    if length - i > 1:
        raise oefmt(space.w_ValueError, "invalid format spec")
    if length - i == 1:
        # last character (or byte) is the presentation type
        presentation_type = spec[i]
        if self.is_unicode:
            try:
                rutf8.check_utf8(spec[i], True)
                the_type = spec[i][0]
            except rutf8.CheckError:
                raise oefmt(space.w_ValueError,
                            "invalid presentation type")
        else:
            the_type = presentation_type
        i += 1
    self._type = the_type
    if self._thousands_sep:
        # ',' only makes sense for numeric presentation types
        tp = self._type
        if (tp == "d" or tp == "e" or tp == "f" or tp == "g" or
            tp == "E" or tp == "G" or tp == "%" or tp == "F" or
            tp == "\0"):
            # ok
            pass
        else:
            raise oefmt(space.w_ValueError, "invalid type with ','")
    return False
def decode_w(self, space, w_input, final=False):
    # IncrementalNewlineDecoder.decode: run the wrapped decoder, track
    # which kinds of newlines were seen, and (if self.translate) rewrite
    # \r and \r\n to \n.  A trailing lone \r is held back in
    # self.pendingcr until the next call unless 'final'.
    if self.w_decoder is None:
        raise oefmt(space.w_ValueError,
                    "IncrementalNewlineDecoder.__init__ not called")
    # decode input (with the eventual \r from a previous pass)
    if not space.is_w(self.w_decoder, space.w_None):
        w_output = space.call_method(self.w_decoder, "decode",
                                     w_input, space.newbool(bool(final)))
    else:
        w_output = w_input
    if not space.isinstance_w(w_output, space.w_unicode):
        raise oefmt(space.w_TypeError,
                    "decoder should return a string result")
    output, output_len = space.utf8_len_w(w_output)
    # NOTE(review): this overwrites the codepoint count from utf8_len_w
    # with the byte length -- confirm this is intentional (the scan below
    # indexes bytes, so it may be)
    output_len = len(output)
    if self.pendingcr and (final or output_len):
        # re-attach the \r held back from the previous pass
        output = '\r' + output
        self.pendingcr = False
        output_len += 1
    # retain last \r even when not translating data:
    # then readline() is sure to get \r\n in one pass
    if not final and output_len > 0:
        last = len(output) - 1
        assert last >= 0
        if output[last] == '\r':
            output = output[:last]
            self.pendingcr = True
            output_len -= 1
    if output_len == 0:
        return space.newutf8("", 0)
    # Record which newlines are read and do newline translation if
    # desired, all in one pass.
    seennl = self.seennl
    if output.find('\r') < 0:
        # If no \r, quick scan for a possible "\n" character.
        # (there's nothing else to be done, even when in translation mode)
        if output.find('\n') >= 0:
            seennl |= SEEN_LF
        # Finished: we have scanned for newlines, and none of them
        # need translating.
    elif not self.translate:
        # only record which newline kinds occur; no rewriting
        i = 0
        while i < len(output):
            if seennl == SEEN_ALL:
                break
            c = output[i]
            i += 1
            if c == '\n':
                seennl |= SEEN_LF
            elif c == '\r':
                if i < len(output) and output[i] == '\n':
                    seennl |= SEEN_CRLF
                    i += 1
                else:
                    seennl |= SEEN_CR
    elif output.find('\r') >= 0:
        # Translate!
        builder = StringBuilder(len(output))
        i = 0
        while i < output_len:
            c = output[i]
            i += 1
            if c == '\n':
                seennl |= SEEN_LF
            elif c == '\r':
                # both \r and \r\n become a single \n in the output
                if i < len(output) and output[i] == '\n':
                    seennl |= SEEN_CRLF
                    i += 1
                else:
                    seennl |= SEEN_CR
                builder.append('\n')
                continue
            builder.append(c)
        output = builder.build()
    self.seennl |= seennl
    lgt = check_utf8(output, True)
    return space.newutf8(output, lgt)