def format(space, w_fmt, values_w, w_valuedict, fmt_type):
    """Entry point for %-style formatting.

    Dispatches on 'fmt_type' (FORMAT_BYTES / FORMAT_BYTEARRAY /
    FORMAT_UNICODE): bytes-like formats are tried first with the
    byte-string formatter; if that raises NeedUnicodeFormattingError
    we fall through to the unicode formatter below.
    """
    if fmt_type != FORMAT_UNICODE:
        if fmt_type == FORMAT_BYTEARRAY:
            # bytearray format strings come in through the buffer interface
            fmt = w_fmt.buffer_w(space, 0).as_str()
        else:
            fmt = space.bytes_w(w_fmt)
        formatter = StringFormatter(space, fmt, values_w, w_valuedict)
        try:
            result = formatter.format()
        except NeedUnicodeFormattingError:
            # fall through to the unicode case
            pass
        else:
            if fmt_type == FORMAT_BYTES:
                return space.newbytes(result)
            elif fmt_type == FORMAT_BYTEARRAY:
                return _bytearray_from_bytes(space, result)
            # NOTE(review): this trailing return looks unreachable for the
            # two fmt_types above; presumably a defensive fallback — confirm
            return space.newbytes(result)
    fmt = space.utf8_w(w_fmt)
    formatter = UnicodeFormatter(space, fmt, values_w, w_valuedict)
    result = formatter.format()
    # this can force strings, not sure if it's a problem or not
    lgt = rutf8.codepoints_in_utf8(result)
    return space.newutf8(result, lgt)
def get_chars(self, size):
    """Consume and return up to 'size' codepoints from self.text as a
    utf8 byte string, starting at the current byte position self.pos.

    A negative 'size' (or one larger than what remains) means "all
    remaining characters".  Updates self.pos (byte index) and self.upos
    (codepoint index).  Returns "" when there is no text or size == 0.
    """
    if self.text is None or size == 0:
        return ""
    lgt = codepoints_in_utf8(self.text)
    available = lgt - self.upos
    if size < 0 or size > available:
        size = available
    assert size >= 0
    if self.pos > 0 or size < available:
        # General case: walk forward codepoint by codepoint to find the
        # byte range covering exactly 'size' characters.
        start = self.pos
        pos = start
        for i in range(size):
            pos = next_codepoint_pos(self.text, pos)
            self.upos += 1
        assert start >= 0
        assert pos >= 0
        chars = self.text[start:pos]
        self.pos = pos
    else:
        # Fast path: consuming the whole buffer from the beginning.
        chars = self.text
        self.pos = len(self.text)
        self.upos = lgt
    return chars
def multibytecodec_decerror(decodebuf, e, errors, errorcb, namecb, stringdata):
    """Handle a decoding error reported by the CJK multibyte codec.

    'e' is the raw error code from the C layer: positive means an illegal
    sequence of that many bytes; MBERR_TOOFEW means truncated input.
    Applies the 'errors' policy (strict/ignore/replace/custom callback)
    and pushes the replacement text back into 'decodebuf'.
    """
    if e > 0:
        reason = "illegal multibyte sequence"
        esize = e
    elif e == MBERR_TOOFEW:
        reason = "incomplete multibyte sequence"
        esize = pypy_cjk_dec_inbuf_remaining(decodebuf)
    elif e == MBERR_NOMEMORY:
        raise MemoryError
    else:
        raise RuntimeError
    #
    # compute the unicode to use as a replacement -> 'replace', and
    # the current position in the input 'unicodedata' -> 'end'
    start = pypy_cjk_dec_inbuf_consumed(decodebuf)
    end = start + esize
    if errors == "strict":
        raise EncodeDecodeError(start, end, reason)
    elif errors == "ignore":
        replace = ""
    elif errors == "replace":
        replace = UNICODE_REPLACEMENT_CHARACTER
    else:
        assert errorcb
        # custom error handler may also move 'end' forward
        replace, end, rettype, obj = errorcb(errors, namecb, reason,
                                             stringdata, start, end)
    # 'replace' is UTF8 encoded unicode, rettype is 'u'
    lgt = rutf8.codepoints_in_utf8(replace)
    inbuf = rffi.utf82wcharp(replace, lgt)
    try:
        r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end)
    finally:
        # raw buffer must always be freed, even if the C call raises
        lltype.free(inbuf, flavor='raw')
    if r == MBERR_NOMEMORY:
        raise MemoryError
def std_wp(self, r, is_string=False):
    """Write the formatted value 'r' into self.result, applying the
    precision (truncation) and width (space padding) of the current
    format spec.  'do_unicode' is a closure/module flag selecting the
    unicode formatter variant.
    """
    # r is utf8-encoded unicode
    length = rutf8.codepoints_in_utf8(r)
    if do_unicode and is_string:
        # convert string to unicode using the default encoding
        r = self.space.utf8_w(self.space.newbytes(r))
    prec = self.prec
    if prec == -1 and self.width == 0:
        # fast path
        self.result.append(r)
        return
    if prec >= 0 and prec < length:
        length = prec   # ignore the end of the string if too long
    # padding is computed in codepoints, before 'length' is converted
    # to a byte index below
    padding = self.width - length
    if do_unicode:
        # XXX could use W_UnicodeObject.descr_getslice, but that would
        # require a refactor to use the w_val, not r
        length = rutf8._pos_at_index(r, length)
    result = self.result
    if padding < 0:
        padding = 0
    assert padding >= 0
    if not self.f_ljust and padding > 0:
        result.append_multiple_char(' ', padding)
        # add any padding at the left of 'r'
        padding = 0
    result.append_slice(r, 0, length)       # add 'r' itself
    if padding > 0:
        result.append_multiple_char(' ', padding)
def _decode_helper(cp, s, flags, encoding, errors, errorhandler, final,
                   start, end, res):
    """Decode s[start:end] with the Win32 code page 'cp' via
    MultiByteToWideChar, appending the resulting utf8 text to 'res'.

    Returns (new_position, number_of_codepoints_appended).  On failure,
    delegates to _decode_cp_error, which applies the error handler.
    """
    if end > len(s):
        end = len(s)
    piece = s[start:end]
    with rffi.scoped_nonmovingbuffer(piece) as dataptr:
        # first get the size of the result
        outsize = MultiByteToWideChar(cp, flags, dataptr, len(piece),
                                      lltype.nullptr(rffi.CWCHARP.TO), 0)
        if outsize == 0:
            r, pos = _decode_cp_error(s, errorhandler, encoding, errors,
                                      final, start, end)
            res.append(r)
            return pos, check_utf8(r, True)

        with rffi.scoped_alloc_unicodebuffer(outsize) as buf:
            # do the conversion
            if MultiByteToWideChar(cp, flags, dataptr, len(piece),
                                   buf.raw, outsize) == 0:
                r, pos = _decode_cp_error(s, errorhandler, encoding, errors,
                                          final, start, end)
                res.append(r)
                return pos, check_utf8(r, True)
            buf_as_str = buf.str(outsize)
            assert buf_as_str is not None
            # convert the wide-char buffer to utf8
            with rffi.scoped_nonmoving_unicodebuffer(buf_as_str) as dataptr:
                conv = _unibuf_to_utf8(dataptr, outsize)
            res.append(conv)
            return end, codepoints_in_utf8(conv)
def write_w(self, space, w_text):
    """TextIOWrapper.write(): encode 'w_text' and buffer the bytes.

    Applies newline translation (writenl), decides whether a flush is
    needed for line buffering, invalidates the read snapshot, and
    returns the number of characters written.
    """
    self._check_attached(space)
    self._check_closed(space)

    if not self.w_encoder:
        raise oefmt(space.w_IOError, "not writable")

    if not space.isinstance_w(w_text, space.w_unicode):
        raise oefmt(space.w_TypeError,
                    "unicode argument expected, got '%T'", w_text)

    text, textlen = space.utf8_len_w(w_text)

    haslf = False
    if (self.writetranslate and self.writenl) or self.line_buffering:
        if text.find('\n') >= 0:
            haslf = True
    if haslf and self.writetranslate and self.writenl:
        # translate '\n' to the configured output newline
        w_text = space.call_method(
            w_text, "replace", space.newutf8('\n', 1),
            space.newutf8(self.writenl,
                          codepoints_in_utf8(self.writenl)))
        text = space.utf8_w(w_text)

    needflush = False
    if self.line_buffering and (haslf or text.find('\r') >= 0):
        needflush = True

    # XXX What if we were just reading?
    if self.encodefunc:
        # fast path: direct encoding function cached for this codec
        w_bytes = self.encodefunc(space, w_text, self.errors)
        self.encoding_start_of_stream = False
    else:
        w_bytes = space.call_method(self.w_encoder, "encode", w_text)

    if not space.isinstance_w(w_bytes, space.w_bytes):
        raise oefmt(space.w_TypeError,
                    "encoder should return a bytes object, not '%T'",
                    w_bytes)
    b = space.bytes_w(w_bytes)
    if not self.pending_bytes:
        self.pending_bytes = []
        self.pending_bytes_count = 0
    self.pending_bytes.append(b)
    self.pending_bytes_count += len(b)

    if self.pending_bytes_count > self.chunk_size or needflush:
        self._writeflush(space)

    if needflush:
        space.call_method(self.w_buffer, "flush")

    # writing invalidates any buffered decoded data / tell() snapshot
    self.decoded.reset()
    self.snapshot = None

    if self.w_decoder:
        space.call_method(self.w_decoder, "reset")

    return space.newint(textlen)
def convert_to(self, space, dtype):
    """Convert this unicode box to the given dtype.

    Unicode targets return self unchanged; object targets wrap the
    value in a W_ObjectBox; anything else is unsupported.
    """
    if dtype.is_unicode():
        # already a unicode box: nothing to do
        return self
    if dtype.is_object():
        utf8_val = self._value
        w_text = space.newutf8(utf8_val, codepoints_in_utf8(utf8_val))
        return W_ObjectBox(w_text)
    raise oefmt(space.w_NotImplementedError,
                "Conversion from unicode not implemented yet")
def write(self, string):
    """Write the utf8 'string' into self.data at the current position,
    one codepoint per slot, growing the backing storage if needed.
    """
    n_codepoints = codepoints_in_utf8(string)
    target = self.pos + n_codepoints
    if target > len(self.data):
        self.resize(target)
    # copy codepoint by codepoint: each self.data slot holds one
    # utf8-encoded character
    byte_start = 0
    slot = self.pos
    for _ in range(n_codepoints):
        byte_end = next_codepoint_pos(string, byte_start)
        self.data[slot] = string[byte_start:byte_end]
        slot += 1
        byte_start = byte_end
    self.pos = target
def test_codepoints_in_utf8(u, start, len1):
    """Property test: rutf8.codepoints_in_utf8 over a byte sub-range must
    agree with the length of the corresponding unicode slice, even when
    the end index overshoots the string.
    """
    end = start + len1
    overshoot = (end - len(u)) if end > len(u) else 0
    byte_start = len(u[:start].encode('utf8'))
    byte_end = len(u[:end].encode('utf8')) + overshoot
    count = rutf8.codepoints_in_utf8(u.encode('utf8'), byte_start, byte_end)
    assert count == len(u[start:end])
def _get_error_info(self, pos):
    """Return (w_char, codepoint_index, codepoint) for the format
    character at byte position 'pos' in self.fmt, for error reporting.
    'do_unicode' is a closure/module flag selecting the unicode variant.
    """
    space = self.space
    if do_unicode:
        cp = rutf8.codepoint_at_pos(self.fmt, pos)
        # convert the byte position into a codepoint index for the message
        pos = rutf8.codepoints_in_utf8(self.fmt, 0, pos)
        w_s = space.newutf8(
            rutf8.unichr_as_utf8(r_uint(cp), allow_surrogates=True), 1)
    else:
        cp = ord(self.fmt[pos])
        w_s = space.newbytes(chr(cp))
    return w_s, pos, cp
def fget_string(self, space):
    """Return the original subject string of the match, as a bytes or
    unicode app-level object depending on the context type.
    """
    ctx = self.ctx
    if isinstance(ctx, rsre_core.BufMatchContext):
        return space.newbytes(ctx._buffer.as_str())
    if isinstance(ctx, rsre_core.StrMatchContext):
        return space.newbytes(ctx._string)
    if isinstance(ctx, rsre_utf8.Utf8MatchContext):
        utf8 = ctx._utf8
        return space.newutf8(utf8, rutf8.codepoints_in_utf8(utf8))
    # no other context kinds exist
    raise SystemError
def descr_getstate(self, space):
    """Pickle support: return (initial_value, readnl, position, dict)."""
    w_initialval = self.getvalue_w(space)
    w_dict = space.call_method(self.w_dict, "copy")
    readnl = self.readnl
    if readnl is None:
        w_readnl = space.w_None
    else:
        # YYY
        w_readnl = space.str(
            space.newutf8(readnl, codepoints_in_utf8(readnl)))
    return space.newtuple([w_initialval,
                           w_readnl,
                           space.newint(self.buf.pos),
                           w_dict])
def readline_w(self, space, w_limit=None):
    """Read one line from the buffer, honoring universal-newline mode
    and the optional character limit.
    """
    self._check_closed(space)
    limit = convert_size(space, w_limit)

    if self.readuniversal:
        line = self.buf.readline_universal(limit)
    else:
        if self.readtranslate:
            # Newlines are already translated, only search for \n
            marker = '\n'
        else:
            marker = self.readnl
        line = self.buf.readline(marker, limit)
    return space.newutf8(line, codepoints_in_utf8(line))
def unknown_fmtchar(self):
    """Raise ValueError for an unsupported format character.  The
    offending character is the one just before self.fmtpos; its index
    is reported in codepoints for the unicode case.
    """
    space = self.space
    if do_unicode:
        cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1)
        # byte position -> codepoint index for the error message
        pos = rutf8.codepoints_in_utf8(self.fmt, 0, self.fmtpos - 1)
        w_s = space.newutf8(
            rutf8.unichr_as_utf8(r_uint(cp), allow_surrogates=True), 1)
    else:
        cp = ord(self.fmt[self.fmtpos - 1])
        pos = self.fmtpos - 1
        w_s = space.newbytes(chr(cp))
    raise oefmt(space.w_ValueError,
                "unsupported format character %R (%s) at index %d",
                w_s, hex(cp), pos)
def decode(self, space, input, errors=None):
    """Decode 'input' bytes with this multibyte codec and return the
    app-level (unicode, bytes_consumed) tuple, translating C-level
    codec errors into app-level exceptions.
    """
    if errors is None:
        errors = 'strict'
    state = space.fromcache(CodecState)
    #
    try:
        utf8_output = c_codecs.decode(self.codec, input, errors,
                                      state.decode_error_handler, self.name)
    except c_codecs.EncodeDecodeError as e:
        raise wrap_unicodedecodeerror(space, e, input, self.name)
    except RuntimeError:
        raise wrap_runtimeerror(space)
    lgt = rutf8.codepoints_in_utf8(utf8_output)
    return space.newtuple([space.newutf8(utf8_output, lgt),
                           space.newint(len(input))])
def _compute_value(self, space):
    """Expand the operror format string: interleave the literal pieces
    (self.xstrings) with each formatted value, tracking the codepoint
    length 'lgt' of the resulting utf8 string alongside.

    'formats' / 'entries' are closure variables describing the parsed
    format: each entry is (index, format_char, attribute_name).
    Returns (utf8_string, codepoint_length).
    """
    lst = [None] * (len(formats) + len(formats) + 1)
    lgt = 0
    for i, fmt, attr in entries:
        # even slots hold the literal text between format specs
        lst[i + i] = self.xstrings[i]
        lgt += len(self.xstrings[i])
        value = getattr(self, attr)
        if fmt == 'd':
            result = str(value)
            lgt += len(result)
        elif fmt == 'R':
            s = space.repr(value)
            result = space.utf8_w(s)
            lgt += space.len_w(s)
        elif fmt == 'S':
            s = space.str(value)
            result = space.utf8_w(s)
            lgt += space.len_w(s)
        elif fmt == 'T':
            result = space.type(value).name
            lgt += rutf8.codepoints_in_utf8(result)
        elif fmt == 'N':
            result = value.getname(space)
            lgt += len(result)
        elif fmt == '8':
            # u'str\uxxxx' -> 'str\xXX\xXX' -> u"'str\xXX\xXX'"
            from pypy.interpreter import unicodehelper
            result, _lgt, pos = unicodehelper.str_decode_utf8(
                value, 'replace', True,
                unicodehelper.decode_never_raise, True)
            lgt += _lgt
        elif isinstance(value, unicode):
            # 's'
            result = str(value.encode('utf-8'))
            lgt += len(value)
        else:
            result = str(value)
            try:
                lgt += rutf8.check_utf8(result, True)
            except rutf8.CheckError as e:
                # not valid utf8: count only the valid prefix
                lgt -= e.pos
        # odd slots hold the formatted values
        lst[i + i + 1] = result
    lst[-1] = self.xstrings[-1]
    lgt += len(self.xstrings[-1])
    retval = ''.join(lst)
    return retval, lgt
def format(space, w_fmt, values_w, w_valuedict, do_unicode):
    """Entry point for %-style formatting (bytes/unicode variant).

    Tries the byte-string formatter first unless 'do_unicode' is set;
    a NeedUnicodeFormattingError makes it fall through to the unicode
    formatter below.
    """
    if not do_unicode:
        fmt = space.bytes_w(w_fmt)
        formatter = StringFormatter(space, fmt, values_w, w_valuedict)
        try:
            result = formatter.format()
        except NeedUnicodeFormattingError:
            # fall through to the unicode case
            pass
        else:
            return space.newbytes(result)
    fmt = space.utf8_w(w_fmt)
    formatter = UnicodeFormatter(space, fmt, values_w, w_valuedict)
    result = formatter.format()
    # this can force strings, not sure if it's a problem or not
    lgt = rutf8.codepoints_in_utf8(result)
    return space.newutf8(result, lgt)
def decode_w(self, object, final=False):
    """Incremental decode: prepend any bytes pending from a previous
    call, decode what is possible, and stash the unconsumed tail back
    into self.pending.  Returns the decoded text as an app-level str.
    """
    space = self.space
    state = space.fromcache(CodecState)
    if len(self.pending) > 0:
        object = self.pending + object
    try:
        output = c_codecs.decodeex(self.decodebuf, object, self.errors,
                                   state.decode_error_handler, self.name,
                                   get_ignore_error(final))
    except c_codecs.EncodeDecodeError as e:
        raise wrap_unicodedecodeerror(space, e, object, self.name)
    except RuntimeError:
        raise wrap_runtimeerror(space)
    pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
    assert 0 <= pos <= len(object)
    # keep whatever the decoder did not consume for the next call
    self.pending = object[pos:]
    lgt = rutf8.codepoints_in_utf8(output)
    return space.newutf8(output, lgt)
def slice_w(space, ctx, start, end, w_default):
    """Return the subject slice [start:end] of a match context as an
    app-level bytes or str object, or 'w_default' when the span is
    invalid (e.g. a group that did not match).
    """
    # 'start' and 'end' are byte positions
    if ctx.ZERO <= start <= end:
        if isinstance(ctx, rsre_core.BufMatchContext):
            return space.newbytes(ctx._buffer.getslice(start, end, 1,
                                                       end - start))
        if isinstance(ctx, rsre_core.StrMatchContext):
            # StrMatchContext may remap positions (e.g. for subclasses)
            start = ctx._real_pos(start)
            end = ctx._real_pos(end)
            return space.newbytes(ctx._string[start:end])
        elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
            s = ctx._utf8[start:end]
            lgt = rutf8.codepoints_in_utf8(s)
            return space.newutf8(s, lgt)
        else:
            # unreachable
            raise SystemError
    return w_default
def write_w(self, space, w_obj):
    """StringIO.write(): decode newlines as configured and append the
    text to the internal buffer.  Returns the number of characters of
    the *original* argument, per the io contract.
    """
    if not space.isinstance_w(w_obj, space.w_unicode):
        raise oefmt(space.w_TypeError,
                    "unicode argument expected, got '%T'", w_obj)
    self._check_closed(space)

    orig_size = space.len_w(w_obj)

    if self.w_decoder is not None:
        # incremental newline decoder handles universal-newline input
        w_decoded = space.call_method(self.w_decoder, "decode",
                                      w_obj, space.w_True)
    else:
        w_decoded = w_obj
    if self.writenl:
        # translate '\n' to the configured output newline
        w_decoded = space.call_method(
            w_decoded, "replace",
            space.newtext("\n"),
            space.newutf8(self.writenl,
                          codepoints_in_utf8(self.writenl)))
    string = space.utf8_w(w_decoded)
    if string:
        self.buf.write(string)

    return space.newint(orig_size)
def multibytecodec_encerror(encodebuf, e, errors, errorcb, namecb,
                            unicodedata):
    """Handle an encoding error reported by the CJK multibyte codec.

    'e' is the raw error code from the C layer: positive means an
    unencodable sequence of that many characters; MBERR_TOOFEW means
    truncated input.  Applies the 'errors' policy and pushes the
    encoded replacement back into 'encodebuf'.
    """
    if e > 0:
        reason = "illegal multibyte sequence"
        esize = e
    elif e == MBERR_TOOFEW:
        reason = "incomplete multibyte sequence"
        esize = pypy_cjk_enc_inbuf_remaining(encodebuf)
    elif e == MBERR_NOMEMORY:
        raise MemoryError
    else:
        raise RuntimeError
    #
    # compute the string to use as a replacement -> 'replace', and
    # the current position in the input 'unicodedata' -> 'end'
    start = pypy_cjk_enc_inbuf_consumed(encodebuf)
    end = start + esize
    if errors == "strict":
        raise EncodeDecodeError(start, end, reason)
    elif errors == "ignore":
        replace = ""
    elif errors == "replace":
        codec = pypy_cjk_enc_getcodec(encodebuf)
        try:
            replace = encode(codec, "?", 1)
        except EncodeDecodeError:
            # '?' itself not encodable: emit it raw
            replace = "?"
    else:
        assert errorcb
        rets, end = errorcb(errors, namecb, reason,
                            unicodedata, start, end)
        codec = pypy_cjk_enc_getcodec(encodebuf)
        lgt = rutf8.codepoints_in_utf8(rets)
        # re-encode the handler's replacement text with this codec
        replace = encode(codec, rets, lgt, "strict", errorcb, namecb)
    with rffi.scoped_nonmovingbuffer(replace) as inbuf:
        r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf,
                                          len(replace), end)
    if r == MBERR_NOMEMORY:
        raise MemoryError
def replace_count(input, sub, by, maxcount=-1, isutf8=False):
    """Replace up to 'maxcount' occurrences of 'sub' by 'by' in 'input'
    (a str, unicode, or byte list).  Returns (new_value, n_replacements).

    maxcount < 0 means "replace all".  When 'sub' is empty, 'by' is
    inserted between every character (and at both ends); with isutf8
    this operates on codepoints rather than bytes.  The result size is
    computed up front with overflow checks so the builder can be
    pre-sized.
    """
    if isinstance(input, str):
        Builder = StringBuilder
    elif isinstance(input, unicode):
        Builder = UnicodeBuilder
    else:
        assert isinstance(input, list)
        Builder = ByteListBuilder
    if maxcount == 0:
        return input, 0

    if not sub and not isutf8:
        # empty separator, byte-based: interleave 'by' everywhere
        upper = len(input)
        if maxcount > 0 and maxcount < upper + 2:
            upper = maxcount - 1
            assert upper >= 0

        try:
            result_size = ovfcheck(upper * len(by))
            result_size = ovfcheck(result_size + upper)
            result_size = ovfcheck(result_size + len(by))
            remaining_size = len(input) - upper
            result_size = ovfcheck(result_size + remaining_size)
        except OverflowError:
            raise
        builder = Builder(result_size)
        for i in range(upper):
            builder.append(by)
            builder.append(input[i])
        builder.append(by)
        builder.append_slice(input, upper, len(input))
        replacements = upper + 1

    elif isinstance(input, str) and len(sub) == 1:
        # single-character needle: specialized fast paths
        if len(by) == 1:
            return replace_count_str_chr_chr(input, sub[0], by[0], maxcount)
        return replace_count_str_chr_str(input, sub[0], by, maxcount)

    else:
        # First compute the exact result size
        if sub:
            cnt = count(input, sub, 0, len(input))
            if isinstance(input, str) and cnt == 0:
                return input, 0
            if isinstance(input, str):
                return replace_count_str_str_str(input, sub, by,
                                                 cnt, maxcount)
        else:
            assert isutf8
            from rpython.rlib import rutf8
            # one insertion point per codepoint, plus one at the end
            cnt = rutf8.codepoints_in_utf8(input) + 1

        if cnt > maxcount and maxcount > 0:
            cnt = maxcount
        diff_len = len(by) - len(sub)
        try:
            result_size = ovfcheck(diff_len * cnt)
            result_size = ovfcheck(result_size + len(input))
        except OverflowError:
            raise
        replacements = cnt

        builder = Builder(result_size)
        start = 0
        sublen = len(sub)

        if sublen == 0:
            assert isutf8
            from rpython.rlib import rutf8
            while True:
                builder.append(by)
                maxcount -= 1
                if start == len(input) or maxcount == 0:
                    break
                next = rutf8.next_codepoint_pos(input, start)
                builder.append_slice(input, start, next)
                start = next
        else:
            while maxcount != 0:
                next = find(input, sub, start, len(input))
                if next < 0:
                    break
                builder.append_slice(input, start, next)
                builder.append(by)
                start = next + sublen
                maxcount -= 1   # NB. if it's already < 0, it stays < 0

        builder.append_slice(input, start, len(input))

    return builder.build(), replacements
ctx.reset(start) if last_pos < ctx.end: _sub_append_slice(ctx, space, use_builder, sublist_w, strbuilder, last_pos, ctx.end) if use_builder != '\x00': assert strbuilder is not None result_bytes = strbuilder.build() if use_builder == 'S': assert not isinstance(ctx, rsre_utf8.Utf8MatchContext) return space.newbytes(result_bytes), n elif use_builder == 'U': assert (isinstance(ctx, UnicodeAsciiMatchContext) or isinstance(ctx, rsre_utf8.Utf8MatchContext)) return space.newutf8(result_bytes, rutf8.codepoints_in_utf8(result_bytes)), n else: raise AssertionError(use_builder) else: if space.isinstance_w(w_string, space.w_unicode): w_emptystr = space.newutf8('', 0) else: w_emptystr = space.newbytes('') w_item = space.call_method(w_emptystr, 'join', space.newlist(sublist_w)) return w_item, n sub_jitdriver = jit.JitDriver( reds="""count n last_pos ctx w_filter
def tell_w(self, space):
    """TextIOWrapper.tell(): reconstruct a seekable position cookie.

    Because the decoder may buffer bytes, the character position cannot
    be derived from the byte position alone.  Starting from the last
    snapshot, the decoder is replayed one byte at a time to find the
    nearest "safe start point" (decoder buffer empty) before the
    current location; the cookie records that byte position plus how
    many decoded characters still need to be skipped.
    """
    self._check_closed(space)
    if not self.seekable:
        raise oefmt(space.w_IOError, "underlying stream is not seekable")
    if not self.telling:
        raise oefmt(space.w_IOError,
                    "telling position disabled by next() call")
    self._writeflush(space)
    space.call_method(self, "flush")

    w_pos = space.call_method(self.w_buffer, "tell")

    if self.w_decoder is None or self.snapshot is None:
        # no decoder state to account for: byte position is the answer
        assert not self.decoded.text
        return w_pos

    cookie = PositionCookie(space.bigint_w(w_pos))

    # Skip backward to the snapshot point (see _read_chunk)
    cookie.dec_flags = self.snapshot.flags
    input = self.snapshot.input
    cookie.start_pos -= len(input)

    # How many decoded characters have been used up since the snapshot?
    if not self.decoded.pos:
        # We haven't moved from the snapshot point.
        return space.newlong_from_rbigint(cookie.pack())

    chars_to_skip = codepoints_in_utf8(self.decoded.text,
                                       end=self.decoded.pos)

    # Starting from the snapshot position, we will walk the decoder
    # forward until it gives us enough decoded characters.
    w_saved_state = space.call_method(self.w_decoder, "getstate")

    try:
        # Note our initial start point
        self._decoder_setstate(space, cookie)

        # Feed the decoder one byte at a time.  As we go, note the nearest
        # "safe start point" before the current location (a point where
        # the decoder has nothing buffered, so seek() can safely start
        # from there and advance to this location).
        chars_decoded = 0
        i = 0
        while i < len(input):
            w_decoded = space.call_method(self.w_decoder, "decode",
                                          space.newbytes(input[i]))
            check_decoded(space, w_decoded)
            chars_decoded += space.len_w(w_decoded)

            cookie.bytes_to_feed += 1

            w_state = space.call_method(self.w_decoder, "getstate")
            w_dec_buffer, w_flags = space.unpackiterable(w_state, 2)
            dec_buffer_len = space.len_w(w_dec_buffer)

            if dec_buffer_len == 0 and chars_decoded <= chars_to_skip:
                # Decoder buffer is empty, so this is a safe start point.
                cookie.start_pos += cookie.bytes_to_feed
                chars_to_skip -= chars_decoded
                assert chars_to_skip >= 0
                cookie.dec_flags = space.int_w(w_flags)
                cookie.bytes_to_feed = 0
                chars_decoded = 0
            if chars_decoded >= chars_to_skip:
                break
            i += 1
        else:
            # We didn't get enough decoded data; signal EOF to get more.
            w_decoded = space.call_method(self.w_decoder, "decode",
                                          space.newbytes(""),
                                          space.newint(1))  # final=1
            check_decoded(space, w_decoded)
            chars_decoded += space.len_w(w_decoded)
            cookie.need_eof = 1

            if chars_decoded < chars_to_skip:
                raise oefmt(space.w_IOError,
                            "can't reconstruct logical file position")
    finally:
        # always restore the caller-visible decoder state
        space.call_method(self.w_decoder, "setstate", w_saved_state)

    # The returned cookie corresponds to the last safe start point.
    cookie.chars_to_skip = chars_to_skip
    return space.newlong_from_rbigint(cookie.pack())
def read_w(self, space, w_size=None):
    """Read at most 'size' characters from the buffer; a missing or
    negative size reads everything remaining.
    """
    self._check_closed(space)
    size = convert_size(space, w_size)
    data = self.buf.read(size)
    return space.newutf8(data, codepoints_in_utf8(data))
def getvalue_w(self, space):
    """Return the entire buffer contents as an app-level str."""
    self._check_closed(space)
    contents = self.buf.getvalue()
    return space.newutf8(contents, codepoints_in_utf8(contents))
def normalize(s):
    """Normalize the unicode string 's' with the enclosing scope's
    'NF_code' form via the unicodedata module, round-tripping through
    the app-level utf8 representation.
    """
    utf8 = s.encode('utf8')
    w_input = space.newutf8(utf8, codepoints_in_utf8(utf8))
    w_normalized = ucd.normalize(space, NF_code, w_input)
    return space.utf8_w(w_normalized).decode('utf8')
def fstring_find_literal(astbuilder, fstr, atom_node, rec):
    """Scan the next literal (non-expression) part of an f-string.

    Advances fstr.current_index past the literal and returns it as an
    app-level str, with doubled braces collapsed and (for non-raw
    strings) escape sequences decoded.  'rec' is the nesting level:
    brace doubling and the single-'}' error only apply at level 0.
    """
    space = astbuilder.space
    raw = fstr.raw_mode

    # Return the next literal part.  Updates the current index inside 'fstr'.
    # Differs from CPython: this version handles double-braces on its own.
    s = fstr.unparsed
    literal_start = fstr.current_index
    assert literal_start >= 0

    # Get any literal string.  It ends when we hit an un-doubled left
    # brace (which isn't part of a unicode name escape such as
    # "\N{EULER CONSTANT}"), or the end of the string.
    i = literal_start
    builder = StringBuilder()
    while i < len(s):
        ch = s[i]
        i += 1
        if not raw and ch == '\\' and i < len(s):
            ch = s[i]
            i += 1
            if ch == 'N':
                # skip over a "\N{...}" named-escape so its braces are
                # not mistaken for format braces
                if i < len(s) and s[i] == '{':
                    while i < len(s) and s[i] != '}':
                        i += 1
                    if i < len(s):
                        i += 1
                    continue
                elif i < len(s):
                    i += 1
                break
            if ch == '{':
                # deprecated "\{" escape: warn, or error if warnings
                # are configured as errors
                msg = "invalid escape sequence '%s'"
                try:
                    space.warn(space.newtext(msg % ch),
                               space.w_DeprecationWarning)
                except error.OperationError as e:
                    if e.match(space, space.w_DeprecationWarning):
                        astbuilder.error(msg % ch, atom_node)
                    else:
                        raise
        if ch == '{' or ch == '}':
            # Check for doubled braces, but only at the top level.  If
            # we checked at every level, then f'{0:{3}}' would fail
            # with the two closing braces.
            if rec == 0 and i < len(s) and s[i] == ch:
                assert 0 <= i <= len(s)
                builder.append(s[literal_start:i])
                i += 1   # skip over the second brace
                literal_start = i
            elif rec == 0 and ch == '}':
                i -= 1
                assert i >= 0
                fstr.current_index = i
                # Where a single '{' is the start of a new expression, a
                # single '}' is not allowed.
                astbuilder.error("f-string: single '}' is not allowed",
                                 atom_node)
            else:
                # We're either at a '{', which means we're starting another
                # expression; or a '}', which means we're at the end of this
                # f-string (for a nested format_spec).
                i -= 1
                break
    assert 0 <= i <= len(s)
    assert i == len(s) or s[i] == '{' or s[i] == '}'
    builder.append(s[literal_start:i])

    fstr.current_index = i
    literal = builder.build()
    lgt = codepoints_in_utf8(literal)
    if not raw and '\\' in literal:
        literal = parsestring.decode_unicode_utf8(space, literal,
                                                  0, len(literal))
        literal, lgt, pos = unicodehelper.decode_unicode_escape(space,
                                                                literal)
    return space.newtext(literal, lgt)
def subx(self, w_ptemplate, w_string, count):
    """Implementation of re.sub/subn: replace up to 'count' matches of
    this pattern in 'w_string' with 'w_ptemplate'.

    Returns (result, number_of_replacements).  When the template is a
    literal (no backslash) of the same string kind as the subject, the
    pieces are accumulated in a raw StringBuilder ('use_builder') for
    speed; otherwise a list of wrapped pieces is joined at the end.
    """
    space = self.space
    # use a (much faster) string builder (possibly utf8) if w_ptemplate and
    # w_string are both string or both unicode objects, and if w_ptemplate
    # is a literal
    use_builder = '\x00'   # or 'S'tring or 'U'nicode/UTF8
    is_buffer = False
    filter_as_string = None
    if space.isinstance_w(w_string, space.w_unicode):
        if not self.is_known_unicode():
            raise oefmt(space.w_TypeError,
                        "cannot use a bytes pattern on a string-like "
                        "object")
    else:
        if self.is_known_unicode():
            raise oefmt(space.w_TypeError,
                        "cannot use a string pattern on a bytes-like "
                        "object")
    if space.is_true(space.callable(w_ptemplate)):
        w_filter = w_ptemplate
        filter_is_callable = True
    else:
        if space.isinstance_w(w_ptemplate, space.w_unicode):
            filter_as_string = space.utf8_w(w_ptemplate)
            literal = '\\' not in filter_as_string
            if space.isinstance_w(w_string, space.w_unicode) and literal:
                use_builder = 'U'
        elif space.isinstance_w(w_ptemplate, space.w_bytes):
            filter_as_string = space.bytes_w(w_ptemplate)
            literal = '\\' not in filter_as_string
            if space.isinstance_w(w_string, space.w_bytes) and literal:
                use_builder = 'S'
        else:
            if space.isinstance_w(w_ptemplate, space.w_bytes):
                filter_as_string = space.bytes_w(w_ptemplate)
            else:
                filter_as_string = space.readbuf_w(w_ptemplate).as_str()
                is_buffer = True
            literal = '\\' not in filter_as_string
            if space.isinstance_w(w_string, space.w_bytes) and literal:
                use_builder = 'S'
        if literal:
            w_filter = w_ptemplate
            filter_is_callable = False
        else:
            # not a literal; hand it over to the template compiler
            # FIX for a CPython 3.5 bug: if w_ptemplate is a buffer
            # (e.g. a bytearray), convert it to a byte string here.
            if is_buffer:
                w_ptemplate = space.newbytes(filter_as_string)
            w_re = import_re(space)
            w_filter = space.call_method(w_re, '_subx',
                                         self, w_ptemplate)
            filter_is_callable = space.is_true(space.callable(w_filter))
    #
    # XXX this is a bit of a mess, but it improves performance a lot
    ctx = self.make_ctx(w_string)
    sublist_w = strbuilder = None
    if use_builder != '\x00':
        assert filter_as_string is not None
        strbuilder = StringBuilder(ctx.end)
    else:
        sublist_w = []
    n = 0
    last_pos = ctx.ZERO
    while not count or n < count:
        pattern = self.code
        sub_jitdriver.jit_merge_point(
            self=self,
            use_builder=use_builder,
            filter_is_callable=filter_is_callable,
            filter_type=type(w_filter),
            ctx=ctx, pattern=pattern,
            w_filter=w_filter,
            strbuilder=strbuilder,
            filter_as_string=filter_as_string,
            count=count,
            w_string=w_string,
            n=n, last_pos=last_pos,
            sublist_w=sublist_w)
        space = self.space
        if not searchcontext(space, ctx, pattern):
            break
        if last_pos < ctx.match_start:
            # copy the unmatched text between the previous match and
            # this one
            _sub_append_slice(ctx, space, use_builder, sublist_w,
                              strbuilder, last_pos, ctx.match_start)
        if not (last_pos == ctx.match_start == ctx.match_end and n > 0):
            # the above ignores empty matches on latest position
            last_pos = ctx.match_end
            if filter_is_callable:
                w_match = self.getmatch(ctx, True)
                # make a copy of 'ctx'; see test_sub_matches_stay_valid
                ctx = self.fresh_copy(ctx)
                w_piece = space.call_function(w_filter, w_match)
                if not space.is_w(w_piece, space.w_None):
                    assert strbuilder is None
                    assert use_builder == '\x00'
                    sublist_w.append(w_piece)
            else:
                if use_builder != '\x00':
                    assert filter_as_string is not None
                    assert strbuilder is not None
                    strbuilder.append(filter_as_string)
                else:
                    sublist_w.append(w_filter)
            n += 1
        elif last_pos >= ctx.end:
            break    # empty match at the end: finished
        start = ctx.match_end
        if start == ctx.match_start:
            # empty match: advance by one character to avoid looping
            if start == ctx.end:
                break
            start = ctx.next_indirect(start)
        ctx.reset(start)

    if last_pos < ctx.end:
        # copy the tail after the last match
        _sub_append_slice(ctx, space, use_builder, sublist_w,
                          strbuilder, last_pos, ctx.end)
    if use_builder != '\x00':
        assert strbuilder is not None
        result_bytes = strbuilder.build()
        if use_builder == 'S':
            assert not isinstance(ctx, rsre_utf8.Utf8MatchContext)
            return space.newbytes(result_bytes), n
        elif use_builder == 'U':
            assert (isinstance(ctx, UnicodeAsciiMatchContext) or
                    isinstance(ctx, rsre_utf8.Utf8MatchContext))
            return space.newutf8(result_bytes,
                                 rutf8.codepoints_in_utf8(result_bytes)), n
        else:
            raise AssertionError(use_builder)
    else:
        if space.isinstance_w(w_string, space.w_unicode):
            w_emptystr = space.newutf8('', 0)
        else:
            w_emptystr = space.newbytes('')
        w_item = space.call_method(w_emptystr, 'join',
                                   space.newlist(sublist_w))
        return w_item, n
def call_errorhandler(errors, encoding, reason, input, startpos, endpos):
    """Generic wrapper for calling into error handlers.

    Note that error handler receives and returns position into the unicode
    characters, not into the position of utf8 bytes, so it needs to be
    converted by the codec

    Returns (str_or_none, newpos) as error handlers return utf8 so we add
    whether they used unicode or bytes

    ('space' and the boolean 'decode' are closure variables: 'decode'
    selects UnicodeDecodeError vs UnicodeEncodeError semantics.)
    """
    w_errorhandler = lookup_error(space, errors)
    if decode:
        w_cls = space.w_UnicodeDecodeError
        assert isinstance(input, str)
        w_input = space.newbytes(input)
        length = len(input)
    else:
        w_cls = space.w_UnicodeEncodeError
        assert isinstance(input, str)
        # positions are in codepoints on the encode side
        length = rutf8.codepoints_in_utf8(input)
        w_input = space.newtext(input, length)
    w_exc = space.call_function(
        w_cls,
        space.newtext(encoding),
        w_input,
        space.newint(startpos),
        space.newint(endpos),
        space.newtext(reason))
    w_res = space.call_function(w_errorhandler, w_exc)
    if (not space.isinstance_w(w_res, space.w_tuple)
            or space.len_w(w_res) != 2):
        if decode:
            msg = ("decoding error handler must return "
                   "(str, int) tuple")
        else:
            msg = ("encoding error handler must return "
                   "(str/bytes, int) tuple")
        raise OperationError(space.w_TypeError, space.newtext(msg))
    w_replace, w_newpos = space.fixedview(w_res, 2)
    # record whether the handler produced unicode ('u') or bytes ('b')
    if space.isinstance_w(w_replace, space.w_unicode):
        rettype = 'u'
    elif not decode and space.isinstance_w(w_replace, space.w_bytes):
        rettype = 'b'
    else:
        if decode:
            msg = ("decoding error handler must return "
                   "(str, int) tuple")
        else:
            msg = ("encoding error handler must return "
                   "(str/bytes, int) tuple")
        raise OperationError(space.w_TypeError, space.newtext(msg))
    try:
        newpos = space.int_w(w_newpos)
    except OperationError as e:
        if not e.match(space, space.w_OverflowError):
            raise
        # overflowing position: treated as out of bounds below
        newpos = -1
    else:
        if newpos < 0:
            # negative positions count from the end
            newpos = length + newpos
    if newpos < 0 or newpos > length:
        raise oefmt(space.w_IndexError,
                    "position %d from error handler out of bounds", newpos)
    w_obj = space.getattr(w_exc, space.newtext('object'))
    if decode:
        if not space.isinstance_w(w_obj, space.w_bytes):
            raise oefmt(space.w_ValueError,
                        "error handler modified exc.object must be bytes")
    else:
        if not space.isinstance_w(w_obj, space.w_unicode):
            raise oefmt(space.w_ValueError,
                        "error handler modified exc.object must be str")
    # the handler may have replaced exc.object; return the possibly-new one
    obj = space.utf8_w(w_obj)
    return space.utf8_w(w_replace), newpos, rettype, obj