示例#1
0
def str_decode_code_page(cp, s, errors, errorhandler, final=False):
    """Decode the byte string ``s`` from code page ``cp``.

    Returns a pair ``(utf8 result, codepoints in s)``.  The second item is
    the sum of the sizes reported by ``_decode_helper``.  An empty input
    short-circuits to ``('', 0)`` before the code page is looked at.
    """
    total = len(s)
    if total == 0:
        return '', 0
    flags = _decode_code_page_flags(cp)
    encoding = _code_page_name(cp)
    assert errorhandler is not None
    out = StringBuilder(total)
    if errors != 'strict':
        # Non-strict handlers: feed the helper one chunk at a time, each
        # chunk ending where next_codepoint_pos() places the next start.
        outsize = 0
        chunk_start = 0
        chunk_end = 0
        while chunk_end < total:
            chunk_end = next_codepoint_pos(s, chunk_start)
            chunk_end, produced = _decode_helper(
                cp, s, flags, encoding, errors, errorhandler, final,
                chunk_start, chunk_end, out)
            chunk_start = chunk_end
            outsize += produced
    else:
        # Strict mode: the whole input goes to the helper in one call.
        _end, outsize = _decode_helper(
            cp, s, flags, encoding, errors, errorhandler, final,
            0, total, out)
    return out.build(), outsize
示例#2
0
    def get_chars(self, size):
        """Return up to ``size`` codepoints of ``self.text`` as a utf8 string.

        A negative ``size``, or one larger than what is left, means
        "everything that is still available".  ``self.pos`` (byte offset)
        and ``self.upos`` (codepoint offset) are advanced past the returned
        characters.  Returns ``""`` when there is no text or ``size`` is 0.
        """
        if self.text is None or size == 0:
            return ""

        lgt = codepoints_in_utf8(self.text)
        available = lgt - self.upos
        if size < 0 or size > available:
            size = available
        assert size >= 0

        if self.pos > 0 or size < available:
            # Partial read: walk ``size`` codepoints forward from the
            # current byte offset to find where the slice ends.
            start = self.pos
            pos = start
            for i in range(size):
                pos = next_codepoint_pos(self.text, pos)
                self.upos += 1
            assert start >= 0
            assert pos >= 0
            chars = self.text[start:pos]
            self.pos = pos
        else:
            # Nothing consumed yet and everything requested: hand back the
            # whole text without slicing.
            chars = self.text
            self.pos = len(self.text)
            self.upos = lgt

        return chars
示例#3
0
 def encode_w(self, space, w_object, final=False):
     """Encode ``w_object`` incrementally and return the bytes produced.

     Text left over from a previous call (``self.pending``) is prepended
     to the new input.  After encoding, whatever the encoder did not
     consume is stored back into ``self.pending`` / ``self.pending_len``
     for the next call.
     """
     utf8data, length = space.utf8_len_w(w_object)
     space = self.space
     state = space.fromcache(CodecState)
     if len(self.pending) > 0:
         utf8data = self.pending + utf8data
         length += self.pending_len
     try:
         encoded = c_codecs.encodeex(self.encodebuf, utf8data, length,
                                     self.errors, state.encode_error_handler,
                                     self.name, get_ignore_error(final))
     except c_codecs.EncodeDecodeError as e:
         raise wrap_unicodeencodeerror(space, e, utf8data, length,
                                       self.name)
     except RuntimeError:
         raise wrap_runtimeerror(space)
     consumed = c_codecs.pypy_cjk_enc_inbuf_consumed(self.encodebuf)
     assert 0 <= consumed <= length
     leftover = length - consumed
     self.pending_len = leftover
     if leftover <= 0:
         self.pending = ""
     else:
         # Step over ``consumed`` codepoints to find the byte offset where
         # the unconsumed tail begins.
         byte_idx = 0
         remaining = consumed
         while remaining > 0:
             byte_idx = rutf8.next_codepoint_pos(utf8data, byte_idx)
             remaining -= 1
         self.pending = utf8data[byte_idx:]
     return space.newbytes(encoded)
示例#4
0
def namereplace_errors(space, w_exc):
    """Error callback that substitutes a named escape for each character.

    Only UnicodeEncodeError is handled; anything else raises TypeError.
    For every codepoint in the failing range the Unicode database name is
    emitted as a backslash-N{NAME} escape, falling back to
    ``raw_unicode_escape_helper`` for codepoints without a name.  Returns
    the tuple (replacement text, end position of the exception).
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)
    w_text = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_text)  # for errors
    w_text = space.convert_arg_to_w_unicode(w_text)
    first = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    last = space.int_w(w_end)
    # presumably converts codepoint indices into utf8 byte offsets
    cursor = w_text._index_to_byte(first)
    limit = w_text._index_to_byte(last)
    utf8 = w_text._utf8
    out = StringBuilder()
    while cursor < limit:
        code = rutf8.codepoint_at_pos(utf8, cursor)
        try:
            name = unicodedb.name(code)
        except KeyError:
            # no name in the database: use the numeric escape instead
            unicodehelper.raw_unicode_escape_helper(out, code)
        else:
            out.append('\\N{')
            out.append(name)
            out.append('}')
        cursor = rutf8.next_codepoint_pos(utf8, cursor)
    replacement = out.build()
    lgt = rutf8.check_utf8(replacement, True)
    return space.newtuple([space.newutf8(replacement, lgt), w_end])
示例#5
0
    def get_chars(self, size):
        """Read up to ``size`` codepoints; returns a tuple (utf8, lgt).

        ``lgt`` is the number of codepoints in the returned string.  A
        negative or too-large ``size`` is clamped to what is still
        available.  ``self.pos`` and ``self.upos`` are moved past the
        returned characters.
        """
        text = self.text
        if text is None or size == 0:
            return "", 0

        total = self.ulen
        remaining = total - self.upos
        if not (0 <= size <= remaining):
            size = remaining
        assert size >= 0

        if self.pos <= 0 and size >= remaining:
            # Nothing consumed so far and everything requested: return the
            # whole text as-is.
            self.pos = len(text)
            self.upos = total
            return text, total

        begin = self.pos
        end = begin
        for _ in range(size):
            end = next_codepoint_pos(text, end)
            self.upos += 1
        assert begin >= 0
        assert end >= 0
        piece = text[begin:end]
        self.pos = end
        return piece, size
示例#6
0
def xmlcharrefreplace_errors(space, w_exc):
    """Error callback that emits XML numeric character references.

    Only UnicodeEncodeError is handled; anything else raises TypeError.
    Each codepoint in the failing range becomes ``&#<decimal>;``.  Returns
    the tuple (replacement text, end position of the exception).
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)
    w_text = space.getattr(w_exc, space.newtext('object'))
    # result is discarded; presumably called only so that a bad object
    # raises here -- TODO confirm
    space.realutf8_w(w_text)
    w_text = space.convert_arg_to_w_unicode(w_text)
    first = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    last = space.int_w(w_end)
    cursor = w_text._index_to_byte(first)
    limit = w_text._index_to_byte(last)
    out = StringBuilder()
    utf8 = w_text._utf8
    while cursor < limit:
        codepoint = rutf8.codepoint_at_pos(utf8, cursor)
        out.append("&#")
        out.append(str(codepoint))
        out.append(";")
        cursor = rutf8.next_codepoint_pos(utf8, cursor)
    replacement = out.build()
    lgt = rutf8.check_utf8(replacement, True)
    return space.newtuple([space.newutf8(replacement, lgt), w_end])
示例#7
0
def decode_unicode_utf8(space, s, ps, q):
    """Rewrite the utf8 range ``s[ps:q]`` into pure-ASCII escape form.

    Bytes below 0x80 are copied unchanged.  Every multibyte utf8 sequence
    is replaced by a backslash-U escape with eight hex digits.  A backslash
    that directly precedes a multibyte sequence is itself written as an
    escaped U+005C so it cannot merge with the escape that follows.

    ``space`` is unused.  Returns the assembled string.  Nothing at or
    beyond index ``q`` is ever read.
    """
    # ****The Python 2.7 version, producing UTF-32 escapes****
    # String is utf8-encoded, but 'unicode_escape' expects
    # latin-1; So multibyte sequences must be escaped.
    lis = []  # using a list to assemble the value
    end = q
    # Worst case:
    # "<92><195><164>" may become "\u005c\U000000E4" (16 bytes)
    while ps < end:
        if s[ps] == '\\':
            lis.append(s[ps])
            ps += 1
            if ps >= end:
                # The backslash was the last byte of the range.  Stop here
                # rather than inspecting (and copying) s[end], which is
                # either out of bounds or not part of the requested range.
                break
            if ord(s[ps]) & 0x80:
                # A multibyte sequence will follow, it will be
                # escaped like \u1234. To avoid confusion with
                # the backslash we just wrote, we emit "\u005c"
                # instead.
                lis.append("u005c")
        if ord(s[ps]) & 0x80:
            cp = rutf8.codepoint_at_pos(s, ps)
            # Adding 0x10000000 forces an 8-digit hex number whose leading
            # digit is 1; that digit is then swapped for the literal '0'.
            hexa = hex(cp + 0x10000000)
            lis.append('\\U0')
            lis.append(hexa[3:])  # Skip 0x and the leading 1
            ps = rutf8.next_codepoint_pos(s, ps)
        else:
            lis.append(s[ps])
            ps += 1
    return ''.join(lis)
示例#8
0
 def next_n(self, position, n, end_position):
     """Advance ``position`` by ``n`` codepoints and return the new offset.

     Raises EndOfString if ``end_position`` is reached before all ``n``
     steps have been taken.
     """
     # count down by hand: range(n) is avoided since n can be quite large
     remaining = n
     while remaining > 0:
         if position >= end_position:
             raise EndOfString
         position = rutf8.next_codepoint_pos(self._utf8, position)
         remaining -= 1
     return position
示例#9
0
 def peek_char(self):
     """Return the codepoint at the cursor without moving the cursor.

     Same as next_char except that ``self.pos`` is left untouched.
     Raises StopIteration when the input is exhausted.
     """
     if self.exhausted():
         raise StopIteration
     begin = self.pos
     end = next_codepoint_pos(self.text, begin)
     assert begin >= 0
     assert end >= 0
     return self.text[begin:end]
示例#10
0
def _incr(s, pos, isutf8):
    if isutf8:
        from rpython.rlib.rutf8 import next_codepoint_pos
        assert pos >= 0
        r = next_codepoint_pos(s, pos)
        assert r >= 0
        return r
    else:
        return pos + 1
示例#11
0
 def write(self, string):
     """Store ``string`` into ``self.data`` one codepoint per slot.

     Writing starts at ``self.pos``; the buffer is resized first if it is
     too short, and ``self.pos`` ends up just past the last slot written.
     """
     count = get_utf8_length(string)
     if self.pos + count > len(self.data):
         self.resize(self.pos + count)
     base = self.pos
     cursor = 0
     for offset in range(count):
         following = next_codepoint_pos(string, cursor)
         self.data[base + offset] = string[cursor:following]
         cursor = following
     self.pos += count
示例#12
0
def test_utf8_iterator_pos(arg):
    """Check Utf8StringPosIterator against the position helpers.

    Each (codepoint, position) pair it yields must agree with
    codepoint_at_pos and with positions obtained by repeatedly calling
    next_codepoint_pos from 0.
    """
    encoded = arg.encode('utf8')
    decoded = []
    expected_pos = 0
    for code, pos in rutf8.Utf8StringPosIterator(encoded):
        decoded.append(unichr(code))
        assert code == rutf8.codepoint_at_pos(encoded, pos)
        assert pos == expected_pos
        expected_pos = rutf8.next_codepoint_pos(encoded, expected_pos)
    assert list(arg) == decoded
示例#13
0
def charmap_build(space, chars):
    """Build a dict mapping each codepoint of ``chars`` to its index.

    ``chars`` is walked as utf8; the n-th codepoint (counting from 0) is
    stored under its ordinal with value n.
    """
    # XXX CPython sometimes uses a three-level trie
    w_charmap = space.newdict()
    size = len(chars)
    byte_pos = 0
    index = 0
    while byte_pos < size:
        code = rutf8.codepoint_at_pos(chars, byte_pos)
        space.setitem(w_charmap, space.newint(code), space.newint(index))
        byte_pos = rutf8.next_codepoint_pos(chars, byte_pos)
        index += 1
    return w_charmap
示例#14
0
 def next_char(self):
     """Return the codepoint at the cursor and step past it.

     Both the byte offset ``self.pos`` and the codepoint counter
     ``self.upos`` are advanced.  Raises StopIteration when the input is
     exhausted.
     """
     if self.exhausted():
         raise StopIteration
     begin = self.pos
     end = next_codepoint_pos(self.text, begin)
     assert begin >= 0
     assert end >= 0
     piece = self.text[begin:end]
     self.pos = end
     self.upos += 1
     return piece
示例#15
0
def test_next_pos(uni):
    """Check that next_codepoint_pos steps by each character's utf8 size."""
    widths = [len(ch.encode('utf8')) for ch in uni]
    encoded = uni.encode('utf8')
    cursor = 0
    index = 0
    while cursor < len(encoded):
        following = rutf8.next_codepoint_pos(encoded, cursor)
        assert following - cursor == widths[index]
        index += 1
        cursor = following
示例#16
0
def utf8_to_char32(utf8, target_ptr, target_length, add_final_zero):
    """Copy the codepoints of ``utf8`` into a raw array of 32-bit integers.

    If ``add_final_zero`` is true, a terminating 0 is written at index
    ``target_length``.
    """
    # 'target_ptr' is a raw pointer to 'target_length' 32-bit integers;
    # we assume (and check) that target_length == number of unichars in utf8.
    out = rffi.cast(rffi.UINTP, target_ptr)
    byte_pos = 0
    slot = 0
    while slot < target_length:
        code = rutf8.codepoint_at_pos(utf8, byte_pos)
        out[slot] = rffi.cast(rffi.UINT, code)
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
        slot += 1
    # every byte of the input must have been consumed
    assert byte_pos == len(utf8)
    if add_final_zero:
        out[target_length] = rffi.cast(rffi.UINT, 0)
示例#17
0
 def __init__(self, data=None, pos=0):
     """Fill the buffer from ``data`` and place the cursor at ``pos``.

     ``None`` is treated as the empty string.  The tail of ``data`` found
     by the skipping loop below is written via ``self.write`` and
     ``self.pos`` is then set to ``pos``.
     """
     self.data = []
     self.pos = 0
     text = '' if data is None else data
     # break the data into unicode codepoints
     # NOTE(review): ``offset`` is a byte index produced by
     # next_codepoint_pos but is compared directly against ``pos`` --
     # confirm the intended units with the callers.
     offset = 0
     while offset < pos:
         offset = next_codepoint_pos(text, offset)
         if offset >= len(text):
             break
     self.write(text[offset:])
     self.pos = pos
示例#18
0
def surrogateescape_errors(space, w_exc):
    """Error callback mapping codepoints 0xDC80..0xDCFF to bytes and back.

    UnicodeEncodeError: every codepoint in the failing range must lie in
    0xDC80..0xDCFF; each is turned into the single byte ``code - 0xDC00``.
    Returns the tuple (bytes, end).  Any other codepoint re-raises the
    original exception.

    UnicodeDecodeError: up to four bytes from the failing range, each of
    them >= 128, are turned into the codepoints ``0xDC00 + byte``.  Returns
    the tuple (text, start + number of bytes consumed).  If the first byte
    is ASCII the original exception is re-raised.

    Any other exception type raises TypeError.
    """
    check_exception(space, w_exc)
    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        w_obj = space.getattr(w_exc, space.newtext('object'))
        w_obj = space.convert_arg_to_w_unicode(w_obj)
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        end = space.int_w(w_end)
        res = ''
        # presumably converts codepoint indices to utf8 byte offsets,
        # since they are used with the rutf8 position helpers below
        start = w_obj._index_to_byte(start)
        end = w_obj._index_to_byte(end)
        obj = w_obj._utf8
        pos = start
        while pos < end:
            code = rutf8.codepoint_at_pos(obj, pos)
            if code < 0xdc80 or code > 0xdcff:
                # Not a UTF-8b surrogate, fail with original exception
                raise OperationError(space.type(w_exc), w_exc)
            # 0xdc80..0xdcff minus 0xdc00 gives the byte value 0x80..0xff
            res += chr(code - 0xdc00)
            pos = rutf8.next_codepoint_pos(obj, pos)
        return space.newtuple([space.newbytes(res), w_end])
    elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        consumed = 0
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        end = space.int_w(space.getattr(w_exc, space.newtext('end')))
        obj = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        replace = u''
        # escape at most 4 bytes per call, and never past ``end``
        while consumed < 4 and consumed < end - start:
            c = ord(obj[start + consumed])
            if c < 128:
                # Refuse to escape ASCII bytes.
                break
            replace += unichr(0xdc00 + c)
            consumed += 1
        if not consumed:
            # codec complained about ASCII byte.
            raise OperationError(space.type(w_exc), w_exc)
        # allow_surrogates=True: the replacement consists solely of
        # codepoints in the surrogate range
        replace_utf8 = runicode.unicode_encode_utf_8(replace,
                                                     len(replace),
                                                     'strict',
                                                     allow_surrogates=True)
        return space.newtuple([
            space.newtext(replace_utf8, len(replace)),
            space.newint(start + consumed)
        ])
    else:
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)
示例#19
0
 def descr_next(self, space):
     """Return the next character of the sequence as a 1-codepoint string.

     Raises StopIteration (as an OperationError) once every codepoint has
     been returned; ``self.w_seq`` is cleared when that happens.
     """
     from pypy.objspace.std.unicodeobject import W_UnicodeObject
     from rpython.rlib import rutf8
     w_seq = self.w_seq
     if w_seq is None:
         raise OperationError(space.w_StopIteration, space.w_None)
     assert isinstance(w_seq, W_UnicodeObject)
     if self.index == w_seq._length:
         # exhausted: drop the reference to the sequence
         self.w_seq = None
         raise OperationError(space.w_StopIteration, space.w_None)
     utf8 = w_seq._utf8
     begin = self.byteindex
     stop = rutf8.next_codepoint_pos(utf8, begin)
     w_char = W_UnicodeObject(utf8[begin:stop], 1)
     self.byteindex = stop
     self.index += 1
     return w_char
示例#20
0
def utf8_to_char16(utf8, target_ptr, target_length, add_final_zero):
    """Copy ``utf8`` into a raw array of 16-bit integers.

    Codepoints above 0xFFFF are written as two 16-bit units (a
    0xD800-based unit followed by a 0xDC00-based unit); everything else
    takes one unit.  If ``add_final_zero`` is true, a terminating 0 is
    written after the last unit.
    """
    # 'target_ptr' is a raw pointer to 'target_length' 16-bit integers;
    # we assume (and check) that target_length == utf8_size_as_char16(utf8).
    out = rffi.cast(rffi.USHORTP, target_ptr)
    size = len(utf8)
    byte_pos = 0
    while byte_pos < size:
        code = rutf8.codepoint_at_pos(utf8, byte_pos)
        if code <= 0xFFFF:
            out[0] = rffi.cast(rffi.USHORT, code)
            out = rffi.ptradd(out, 1)
        else:
            # split into a pair: high 10 bits, then low 10 bits
            code -= 0x10000
            out[0] = rffi.cast(rffi.USHORT, 0xD800 | (code >> 10))
            out[1] = rffi.cast(rffi.USHORT, 0xDC00 | (code & 0x3FF))
            out = rffi.ptradd(out, 2)
        byte_pos = rutf8.next_codepoint_pos(utf8, byte_pos)
    # the write pointer must have moved exactly target_length units
    assert out == (
        rffi.ptradd(rffi.cast(rffi.USHORTP, target_ptr), target_length))
    if add_final_zero:
        out[0] = rffi.cast(rffi.USHORT, 0)
示例#21
0
def backslashreplace_errors(space, w_exc):
    """Error callback replacing characters with backslash hex escapes.

    Only UnicodeEncodeError is handled; anything else raises TypeError.
    Codepoints below 0x100 get a 2-digit escape, those below 0x10000 a
    4-digit escape and all others an 8-digit escape, zero-padded on the
    left.  Returns the tuple (replacement text, end position).
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)
    w_text = space.getattr(w_exc, space.newtext('object'))
    space.realutf8_w(w_text)  # for errors
    w_text = space.convert_arg_to_w_unicode(w_text)
    first = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    last = space.int_w(w_end)
    cursor = w_text._index_to_byte(first)
    limit = w_text._index_to_byte(last)
    out = StringBuilder()
    utf8 = w_text._utf8
    while cursor < limit:
        code = rutf8.codepoint_at_pos(utf8, cursor)
        if code < 0x100:
            prefix, width = "\\x", 2
        elif code < 0x10000:
            prefix, width = "\\u", 4
        else:
            prefix, width = "\\U", 8
        out.append(prefix)
        digits = hex(code)
        ndigits = len(digits)
        # ``digits`` starts with '0x', hence the +2 when computing padding
        padding = width + 2 - ndigits
        if padding > 0:
            out.append_multiple_char('0', padding)
        out.append_slice(digits, 2, ndigits)
        cursor = rutf8.next_codepoint_pos(utf8, cursor)
    replacement = out.build()
    lgt = rutf8.check_utf8(replacement, True)
    return space.newtuple([space.newutf8(replacement, lgt), w_end])
示例#22
0
def backslashreplace_errors(space, w_exc):
    """Error callback replacing the failing range with escape sequences.

    UnicodeEncodeError / UnicodeTranslateError: every codepoint in the
    range is passed to ``raw_unicode_escape_helper``.
    UnicodeDecodeError: every byte in the range is passed to the same
    helper.
    Both cases return the tuple (replacement text, end position).  Any
    other exception type raises TypeError.
    """
    check_exception(space, w_exc)
    text_error = (space.isinstance_w(w_exc, space.w_UnicodeEncodeError)
                  or space.isinstance_w(w_exc, space.w_UnicodeTranslateError))
    if text_error:
        w_text = space.getattr(w_exc, space.newtext('object'))
        space.realutf8_w(w_text)  # for errors
        w_text = space.convert_arg_to_w_unicode(w_text)
        first = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        last = space.int_w(w_end)
        cursor = w_text._index_to_byte(first)
        limit = w_text._index_to_byte(last)
        out = StringBuilder()
        utf8 = w_text._utf8
        while cursor < limit:
            codepoint = rutf8.codepoint_at_pos(utf8, cursor)
            unicodehelper.raw_unicode_escape_helper(out, codepoint)
            cursor = rutf8.next_codepoint_pos(utf8, cursor)
        return space.newtuple([space.newtext(out.build()), w_end])

    if space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        raw = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        first = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        last = space.int_w(w_end)
        out = StringBuilder()
        index = first
        while index < last:
            unicodehelper.raw_unicode_escape_helper(out, ord(raw[index]))
            index += 1
        return space.newtuple([space.newtext(out.build()), w_end])

    raise oefmt(space.w_TypeError,
                "don't know how to handle %T in error callback", w_exc)
示例#23
0
 def _parse_spec(self, default_type, default_align):
     """Parse ``self.spec`` into the formatter's attributes.

     Sets ``_fill_char``, ``_align``, ``_alternate``, ``_sign``,
     ``_thousands_sep``, ``_precision``, ``_width`` and ``_type``.  The
     spec is read in this order: optional fill + alignment, sign, '#',
     '0', width, ',', '.' + precision, presentation type.

     ``default_type`` and ``default_align`` are used when the spec does
     not supply a type or an alignment.

     Returns True when the spec is empty (only the defaults were set; note
     that ``_width`` and ``_type`` are not assigned in that case), and
     False after a successful parse.  Raises ValueError (via oefmt) for a
     missing precision, trailing garbage, an invalid presentation type, or
     a type that cannot be combined with ','.
     """
     space = self.space
     self._fill_char = self._lit(" ")[0]
     self._align = default_align
     self._alternate = False
     self._sign = "\0"
     self._thousands_sep = False
     self._precision = -1
     the_type = default_type
     spec = self.spec
     if not spec:
         return True
     length = len(spec)
     i = 0
     got_align = True
     got_fill_char = False
     # The single character could be utf8-encoded unicode
     if self.is_unicode:
         after_i = rutf8.next_codepoint_pos(spec, i)
     else:
         after_i = i + 1
     # ``after_i < length`` (rather than ``length - i >= 2``) so that a
     # spec made of one multi-byte character never indexes past the end;
     # for a one-byte first character the two tests are identical.
     if after_i < length and self._is_alignment(spec[after_i]):
         self._align = spec[after_i]
         self._fill_char = spec[i:after_i]
         got_fill_char = True
         i = after_i + 1
     elif length - i >= 1 and self._is_alignment(spec[i]):
         self._align = spec[i]
         i += 1
     else:
         got_align = False
     if length - i >= 1 and self._is_sign(spec[i]):
         self._sign = spec[i]
         i += 1
     if length - i >= 1 and spec[i] == "#":
         self._alternate = True
         i += 1
     if not got_fill_char and length - i >= 1 and spec[i] == "0":
         # a leading '0' acts as the fill character and, unless an
         # alignment was given explicitly, selects '=' alignment
         self._fill_char = self._lit("0")[0]
         if not got_align:
             self._align = "="
         i += 1
     self._width, i = _parse_int(self.space, spec, i, length)
     if length != i and spec[i] == ",":
         self._thousands_sep = True
         i += 1
     if length != i and spec[i] == ".":
         i += 1
         self._precision, i = _parse_int(self.space, spec, i, length)
         if self._precision == -1:
             raise oefmt(space.w_ValueError, "no precision given")
     # at most one character (the presentation type) may remain
     if length - i > 1:
         raise oefmt(space.w_ValueError, "invalid format spec")
     if length - i == 1:
         presentation_type = spec[i]
         if self.is_unicode:
             try:
                 rutf8.check_utf8(spec[i], True)
                 the_type = spec[i][0]
             except rutf8.CheckError:
                 raise oefmt(space.w_ValueError,
                             "invalid presentation type")
         else:
             the_type = presentation_type
         i += 1
     self._type = the_type
     if self._thousands_sep:
         tp = self._type
         if (tp == "d" or tp == "e" or tp == "f" or tp == "g"
                 or tp == "E" or tp == "G" or tp == "%" or tp == "F"
                 or tp == "\0"):
             # ok
             pass
         else:
             raise oefmt(space.w_ValueError, "invalid type with ','")
     return False
示例#24
0
def surrogatepass_errors(space, w_exc):
    """Error callback that lets surrogate codepoints through a UTF codec.

    UnicodeEncodeError: every codepoint in the failing range must lie in
    0xD800..0xDFFF; each is written out by hand in the layout of the
    exception's encoding (UTF-8, UTF-16 LE/BE or UTF-32 LE/BE).  Returns
    the tuple (bytes, end).

    UnicodeDecodeError: tries to read exactly one surrogate at ``start``
    in the exception's encoding.  Returns the tuple
    (one-codepoint text, start + bytelength).

    In both directions the original exception is re-raised when the data
    is not a surrogate; the encode branch also re-raises it explicitly
    for ENC_UNKNOWN.  Any other exception type raises TypeError.
    """
    check_exception(space, w_exc)
    if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        w_obj = space.getattr(w_exc, space.newtext('object'))
        w_obj = space.convert_arg_to_w_unicode(w_obj)
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        encoding = space.text_w(space.getattr(w_exc,
                                              space.newtext('encoding')))
        bytelength, code = get_standard_encoding(encoding)
        if code == ENC_UNKNOWN:
            # Not supported, fail with original exception
            raise OperationError(space.type(w_exc), w_exc)
        end = space.int_w(w_end)
        builder = StringBuilder()
        # presumably converts codepoint indices to utf8 byte offsets,
        # since they are used with the rutf8 position helpers below
        start = w_obj._index_to_byte(start)
        end = w_obj._index_to_byte(end)
        obj = w_obj._utf8
        pos = start
        while pos < end:
            ch = rutf8.codepoint_at_pos(obj, pos)
            pos = rutf8.next_codepoint_pos(obj, pos)
            if ch < 0xd800 or ch > 0xdfff:
                # Not a surrogate, fail with original exception
                raise OperationError(space.type(w_exc), w_exc)
            if code == ENC_UTF8:
                # three-byte utf8 form: 1110xxxx 10xxxxxx 10xxxxxx
                builder.append(chr(0xe0 | (ch >> 12)))
                builder.append(chr(0x80 | ((ch >> 6) & 0x3f)))
                builder.append(chr(0x80 | (ch & 0x3f)))
            elif code == ENC_UTF16LE:
                # low byte first
                builder.append(chr(ch & 0xff))
                builder.append(chr(ch >> 8))
            elif code == ENC_UTF16BE:
                # high byte first
                builder.append(chr(ch >> 8))
                builder.append(chr(ch & 0xff))
            elif code == ENC_UTF32LE:
                # ch fits in 16 bits, so the two upper bytes are zero
                builder.append(chr(ch & 0xff))
                builder.append(chr(ch >> 8))
                builder.append(chr(0))
                builder.append(chr(0))
            elif code == ENC_UTF32BE:
                builder.append(chr(0))
                builder.append(chr(0))
                builder.append(chr(ch >> 8))
                builder.append(chr(ch & 0xff))
        return space.newtuple([space.newbytes(builder.build()), w_end])
    elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        obj = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        encoding = space.text_w(space.getattr(w_exc,
                                              space.newtext('encoding')))
        bytelength, code = get_standard_encoding(encoding)
        ch = 0
        # Try decoding a single surrogate character. If there are more,
        # let the codec call us again
        # -1 marks a byte that lies beyond the end of ``obj``
        ch0 = ord(obj[start + 0]) if len(obj) > start + 0 else -1
        ch1 = ord(obj[start + 1]) if len(obj) > start + 1 else -1
        ch2 = ord(obj[start + 2]) if len(obj) > start + 2 else -1
        ch3 = ord(obj[start + 3]) if len(obj) > start + 3 else -1
        if code == ENC_UTF8:
            if (ch1 != -1 and ch2 != -1 and ch0 & 0xf0 == 0xe0
                    and ch1 & 0xc0 == 0x80 and ch2 & 0xc0 == 0x80):
                # it's a three-byte code
                ch = ((ch0 & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 0x3f)
        elif code == ENC_UTF16LE:
            ch = (ch1 << 8) | ch0
        elif code == ENC_UTF16BE:
            ch = (ch0 << 8) | ch1
        elif code == ENC_UTF32LE:
            ch = (ch3 << 24) | (ch2 << 16) | (ch1 << 8) | ch0
        elif code == ENC_UTF32BE:
            ch = (ch0 << 24) | (ch1 << 16) | (ch2 << 8) | ch3
        # A missing byte (-1) makes the combined value negative, so it is
        # rejected by the range test below together with non-surrogates.
        if ch < 0xd800 or ch > 0xdfff:
            # it's not a surrogate - fail
            ch = 0
        if ch == 0:
            raise OperationError(space.type(w_exc), w_exc)
        ch_utf8 = rutf8.unichr_as_utf8(ch, allow_surrogates=True)
        return space.newtuple(
            [space.newtext(ch_utf8, 1),
             space.newint(start + bytelength)])
    else:
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)
示例#25
0
def replace_count(input, sub, by, maxcount=-1, isutf8=False):
    """Replace occurrences of ``sub`` in ``input`` with ``by``.

    ``input`` may be a str, a unicode or a list; the matching builder
    class is chosen accordingly.  At most ``maxcount`` replacements are
    made when ``maxcount`` is positive; ``maxcount == 0`` returns the
    input unchanged; a negative value means no limit.

    An empty ``sub`` inserts ``by`` before every item and once at the
    end.  With ``isutf8`` true, "item" means utf8 codepoint instead of
    single element.

    Returns the tuple (result, number of replacements).  OverflowError
    propagates if the size of the result cannot be computed.
    """
    if isinstance(input, str):
        Builder = StringBuilder
    elif isinstance(input, unicode):
        Builder = UnicodeBuilder
    else:
        assert isinstance(input, list)
        Builder = ByteListBuilder
    if maxcount == 0:
        return input, 0

    if not sub and not isutf8:
        # Empty pattern, element-wise: ``by`` goes in front of each of the
        # first ``upper`` elements and once more after them.
        upper = len(input)
        if maxcount > 0 and maxcount < upper + 2:
            upper = maxcount - 1
            assert upper >= 0

        # NOTE(review): the handler only re-raises; presumably kept because
        # ovfcheck() is expected to sit inside a try/except OverflowError
        # -- confirm before simplifying.
        try:
            result_size = ovfcheck(upper * len(by))
            result_size = ovfcheck(result_size + upper)
            result_size = ovfcheck(result_size + len(by))
            remaining_size = len(input) - upper
            result_size = ovfcheck(result_size + remaining_size)
        except OverflowError:
            raise
        builder = Builder(result_size)
        for i in range(upper):
            builder.append(by)
            builder.append(input[i])
        builder.append(by)
        # whatever was not interleaved is copied through untouched
        builder.append_slice(input, upper, len(input))
        replacements = upper + 1

    elif isinstance(input, str) and len(sub) == 1:
        # single-character pattern on a str: dedicated helpers
        if len(by) == 1:
            return replace_count_str_chr_chr(input, sub[0], by[0], maxcount)
        return replace_count_str_chr_str(input, sub[0], by, maxcount)

    else:
        # First compute the exact result size
        if sub:
            cnt = count(input, sub, 0, len(input))
            if isinstance(input, str) and cnt == 0:
                return input, 0
            if isinstance(input, str):
                return replace_count_str_str_str(input, sub, by, cnt, maxcount)
        else:
            assert isutf8
            from rpython.rlib import rutf8
            # one insertion point before each codepoint, plus one at the end
            cnt = rutf8.codepoints_in_utf8(input) + 1

        if cnt > maxcount and maxcount > 0:
            cnt = maxcount
        diff_len = len(by) - len(sub)
        try:
            result_size = ovfcheck(diff_len * cnt)
            result_size = ovfcheck(result_size + len(input))
        except OverflowError:
            raise
        replacements = cnt

        builder = Builder(result_size)
        start = 0
        sublen = len(sub)

        if sublen == 0:
            assert isutf8
            from rpython.rlib import rutf8
            # Empty pattern on utf8 data: emit ``by``, then one whole
            # codepoint, until the input or the budget runs out.
            while True:
                builder.append(by)
                maxcount -= 1
                if start == len(input) or maxcount == 0:
                    break
                next = rutf8.next_codepoint_pos(input, start)
                builder.append_slice(input, start, next)
                start = next
        else:
            while maxcount != 0:
                next = find(input, sub, start, len(input))
                if next < 0:
                    break
                builder.append_slice(input, start, next)
                builder.append(by)
                start = next + sublen
                maxcount -= 1  # NB. if it's already < 0, it stays < 0

        # copy the tail that follows the last replacement
        builder.append_slice(input, start, len(input))

    return builder.build(), replacements
示例#26
0
 def _advance_codepoint(self):
     """Move the cursor forward by exactly one codepoint.

     Updates both the byte offset ``self.pos`` and the codepoint counter
     ``self.upos``.
     """
     # must only be called after checking self.exhausted()!
     following = next_codepoint_pos(self.text, self.pos)
     self.pos = following
     self.upos += 1
示例#27
0
 def next_n(self, position, n, end_position):
     """Advance ``position`` by ``n`` codepoints and return the new offset.

     Raises EndOfString if ``end_position`` is reached before all ``n``
     steps have been taken.  A non-positive ``n`` returns ``position``
     unchanged.
     """
     # Count by hand instead of iterating over range(n): n can be very
     # large and Python 2's range() would build the whole list up front.
     i = 0
     while i < n:
         if position >= end_position:
             raise EndOfString
         position = rutf8.next_codepoint_pos(self._utf8, position)
         i += 1
     return position
示例#28
0
 def next(self, position):
     """Return the offset of the codepoint that follows ``position``."""
     following = rutf8.next_codepoint_pos(self._utf8, position)
     return following