def test_codepoint_index_at_byte_position(u):
    """Check byte position -> codepoint index lookup for every prefix of u.

    For each index i in 0..len(u), the byte position is taken as the
    length of the UTF-8 encoding of u[:i]; looking that position up must
    give back i.
    """
    num_codepoints = len(u)
    encoded = u.encode('utf8')
    index_storage = rutf8.create_utf8_index_storage(encoded, num_codepoints)
    for expected_index in range(num_codepoints + 1):
        # byte offset at which codepoint number `expected_index` starts
        prefix_bytes = u[:expected_index].encode('utf8')
        found_index = rutf8.codepoint_index_at_byte_position(
            encoded, index_storage, len(prefix_bytes), num_codepoints)
        assert found_index == expected_index
def test_codepoint_position_at_index_inverse(u):
    """Check that the two index/byte-position lookups undo each other.

    For each index i in 0..len(u), the byte position returned by
    codepoint_position_at_index must be mapped back to i by
    codepoint_index_at_byte_position.
    """
    print(u)
    num_codepoints = len(u)
    encoded = u.encode('utf8')
    index_storage = rutf8.create_utf8_index_storage(encoded, num_codepoints)
    for expected_index in range(num_codepoints + 1):
        byte_position = rutf8.codepoint_position_at_index(
            encoded, index_storage, expected_index)
        round_tripped = rutf8.codepoint_index_at_byte_position(
            encoded, index_storage, byte_position, num_codepoints)
        assert round_tripped == expected_index
def bytepos_to_charindex(self, bytepos):
    """Convert a byte position into a character index.

    Transform a 'byte position', as returned by all methods from
    rsre_core, back into a 'character index'.  This is for UTF8
    handling: when the match context is a Utf8MatchContext the position
    is looked up through the index storage of the matched unicode
    object; for any other context `bytepos` is returned unchanged.
    """
    ctx = self.ctx
    if isinstance(ctx, rsre_utf8.Utf8MatchContext):
        w_unicode_obj = ctx.w_unicode_obj
        index_storage = w_unicode_obj._get_index_storage()
        # The lookup is also given the total number of codepoints of the
        # string, like the other callers of this rutf8 function do.
        # NOTE(review): assumes w_unicode_obj._len() returns the
        # codepoint count -- confirm against W_UnicodeObject.
        return rutf8.codepoint_index_at_byte_position(
            w_unicode_obj._utf8, index_storage, bytepos,
            w_unicode_obj._len())
    else:
        return bytepos