def testSplitWithOffsetsWithDifferentEncodings(self, encoding, texts):
    """Split `texts` encoded in `encoding` and verify chars and offsets.

    The expected characters and offsets are derived from the file's
    `_nested_splitchars` and `_nested_offsets` helpers, and the op input is
    built from `_nested_encode`, all called with the same `texts` and
    `encoding`.

    Args:
        encoding: Encoding name handed to the helpers and passed as the
            second argument of `ragged.unicode_split_with_offsets`.
        texts: Text values to encode and split (presumably supplied by a
            parameterized-test decorator that is not visible here).
    """
    # Reference values computed by the helpers.
    want_chars = _nested_splitchars(texts, encoding)
    want_offsets = _nested_offsets(texts, encoding)

    # Run the op on the encoded input wrapped in a constant tensor.
    encoded = _nested_encode(texts, encoding)
    got_chars, got_offsets = ragged.unicode_split_with_offsets(
        constant_op.constant(encoded), encoding)

    self.assertRaggedEqual(want_chars, got_chars)
    self.assertRaggedEqual(want_offsets, got_offsets)
def testErrorModesWithOffsets(self, expected=None, expected_offsets=None,
                              **args):
    """Call `unicode_split_with_offsets` with `args` and check both outputs.

    Args:
        expected: Value the first output (the split characters) is compared
            against.
        expected_offsets: Value the second output (the offsets) is compared
            against.
        **args: Keyword arguments forwarded unchanged to
            `ragged.unicode_split_with_offsets`.
    """
    split_chars, split_offsets = ragged.unicode_split_with_offsets(**args)
    self.assertRaggedEqual(expected, split_chars)
    self.assertRaggedEqual(expected_offsets, split_offsets)
def testVectorSplitWithOffset(self):
    """Split a two-element vector of UTF-8 strings and check the offsets.

    The first string consists of four characters that each take three bytes
    in UTF-8, so its expected start offsets are 0, 3, 6, 9; the second is
    plain ASCII, so its offsets advance by one.
    """
    multibyte_word = u"仅今年前"
    ascii_word = u"hello"

    text = constant_op.constant([multibyte_word.encode("UTF-8"), b"hello"])
    chars, starts = ragged.unicode_split_with_offsets(text, "UTF-8")

    # One row per input string, one UTF-8 encoded entry per character.
    expected_chars = [
        [ch.encode("UTF-8") for ch in word]
        for word in (multibyte_word, ascii_word)
    ]
    expected_starts = [[0, 3, 6, 9], [0, 1, 2, 3, 4]]

    self.assertRaggedEqual(chars, expected_chars)
    self.assertRaggedEqual(starts, expected_starts)
def testBasicSplitWithOffsets(self, texts, ragged_rank=None):
    """Split UTF-8 encoded `texts` and compare against helper-derived values.

    Args:
        texts: Text values to encode as UTF-8 and split (presumably supplied
            by a parameterized-test decorator that is not visible here).
        ragged_rank: Forwarded to `ragged.constant_value` when building the
            input; defaults to None.
    """
    utf8_values = _nested_encode(texts, "UTF-8")
    input_tensor = ragged.constant_value(
        utf8_values, ragged_rank=ragged_rank, dtype=bytes)

    got_chars, got_offsets = ragged.unicode_split_with_offsets(
        input_tensor, "UTF-8")

    # Reference values computed by the file's helpers.
    want_chars = _nested_splitchars(texts, "UTF-8")
    want_offsets = _nested_offsets(texts, "UTF-8")

    self.assertRaggedEqual(want_chars, got_chars)
    self.assertRaggedEqual(want_offsets, got_offsets)
def testDocstringExamples(self):
    """Check two example inputs against both split ops.

    `ragged.unicode_split` and `ragged.unicode_split_with_offsets` must
    yield the same characters for the two inputs, and the latter must also
    report the expected byte offset of each character.
    """
    texts = [
        u"G\xf6\xf6dnight".encode("utf8"),
        u"\U0001f60a".encode("utf8"),
    ]

    codepoints1 = ragged.unicode_split(texts, "UTF-8")
    codepoints2, offsets = ragged.unicode_split_with_offsets(texts, "UTF-8")

    # Both ops are expected to produce the same per-character byte strings.
    expected_chars = [
        [b"G", b"\xc3\xb6", b"\xc3\xb6", b"d", b"n", b"i", b"g", b"h", b"t"],
        [b"\xf0\x9f\x98\x8a"],
    ]
    # The two-byte characters at offsets 1 and 3 push later offsets forward.
    expected_offsets = [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]]

    self.assertRaggedEqual(codepoints1, expected_chars)
    self.assertRaggedEqual(codepoints2, expected_chars)
    self.assertRaggedEqual(offsets, expected_offsets)
def testScalarSplitWithOffset(self):
    """Split a single scalar UTF-8 string and check chars and offsets.

    Each of the four characters takes three bytes in UTF-8, so the expected
    start offsets are 0, 3, 6, 9.
    """
    word = u"仅今年前"
    text = constant_op.constant(word.encode("UTF-8"))

    chars, starts = ragged.unicode_split_with_offsets(text, "UTF-8")

    expected_chars = [ch.encode("UTF-8") for ch in word]
    self.assertAllEqual(chars, expected_chars)
    self.assertAllEqual(starts, [0, 3, 6, 9])