def make_ctx(self, w_string, pos=0, endpos=sys.maxint):
    """Build the match context used to search inside w_string.

    A unicode object yields a Utf8MatchContext, a bytes object is handed
    to self._make_str_match_context(), and anything else is read through
    space.readbuf_w() into a BufMatchContext.

    pos and endpos are clamped first: a negative pos becomes 0 and an
    endpos below pos becomes pos.  Both are then capped at the length of
    the subject.  For unicode subjects the two values are translated to
    byte offsets into the UTF-8 data via
    rutf8.codepoint_position_at_index().
    """
    space = self.space
    if pos < 0:
        pos = 0
    if endpos < pos:
        endpos = pos

    if space.isinstance_w(w_string, space.w_unicode):
        w_uni = space.convert_arg_to_w_unicode(w_string)
        utf8 = w_uni._utf8
        n_items = w_uni._len()

        # Only consult the index storage when the position lies strictly
        # inside the string; both extremes map directly to a byte offset.
        if 0 < pos < n_items:
            storage = w_uni._get_index_storage()
            start_byte = rutf8.codepoint_position_at_index(
                utf8, storage, pos)
        elif pos <= 0:
            start_byte = 0
        else:
            start_byte = len(utf8)

        if endpos < n_items:
            storage = w_uni._get_index_storage()
            end_byte = rutf8.codepoint_position_at_index(
                utf8, storage, endpos)
        else:
            end_byte = len(utf8)

        ctx = rsre_utf8.Utf8MatchContext(
            utf8, start_byte, end_byte, self.flags)
        # xxx the unicode object is kept on the ctx as well, because
        # W_SRE_Match.bytepos_to_charindex() needs it
        ctx.w_unicode_obj = w_uni
        return ctx

    if space.isinstance_w(w_string, space.w_bytes):
        data = space.bytes_w(w_string)
        limit = len(data)
        if pos > limit:
            pos = limit
        if endpos > limit:
            endpos = limit
        return self._make_str_match_context(data, pos, endpos)

    # Fallback: any other object is accessed through the buffer interface.
    buf = space.readbuf_w(w_string)
    size = buf.getlength()
    assert size >= 0
    if pos > size:
        pos = size
    if endpos > size:
        endpos = size
    return rsre_core.BufMatchContext(buf, pos, endpos, self.flags)
def test_codepoint_position_at_index_inverse(u):
    """Check that index -> byte position -> index round-trips.

    For every index i in range(len(u) + 1), the byte position returned by
    rutf8.codepoint_position_at_index() must be mapped back to i by
    rutf8.codepoint_index_at_byte_position().

    u is the unicode string under test; how it is supplied
    (parametrization / generated input) is not visible in this block.
    """
    # Print the repr rather than the raw value: writing arbitrary unicode
    # to a stream without an encoding raises UnicodeEncodeError on
    # Python 2, which would fail the test for an unrelated reason.
    print(repr(u))
    b = u.encode('utf8')
    storage = rutf8.create_utf8_index_storage(b, len(u))
    for i in range(len(u) + 1):
        bytepos = rutf8.codepoint_position_at_index(b, storage, i)
        assert rutf8.codepoint_index_at_byte_position(
            b, storage, bytepos, len(u)) == i
def test_codepoint_position_at_index(u):
    """Check codepoint_position_at_index() against a slice-based oracle.

    For every index i in range(len(u) + 1), the byte position reported by
    rutf8.codepoint_position_at_index() must equal the length of the
    UTF-8 encoding of u[:i].

    u is the unicode string under test; how it is supplied
    (parametrization / generated input) is not visible in this block.
    """
    # Encode once: the encoded form does not change between iterations.
    encoded = u.encode('utf8')
    index = rutf8.create_utf8_index_storage(encoded, len(u))
    for i in range(len(u) + 1):
        # Independent oracle: bytes needed for the first i items of u.
        expected = len(u[:i].encode('utf8'))
        actual = rutf8.codepoint_position_at_index(encoded, index, i)
        assert actual == expected