def testErrorModesWithOffsets(self, expected=None, expected_offsets=None,
                              **args):
   """Runs unicode_split_with_offsets and verifies both of its outputs.

   Args:
     expected: Value the first element of the op's result must equal.
     expected_offsets: Value the second element of the op's result must equal.
     **args: Keyword arguments passed through unchanged to
       `ragged_string_ops.unicode_split_with_offsets`.
   """
   # The op yields a pair; name the two halves instead of indexing into it.
   split_values, split_offsets = (
       ragged_string_ops.unicode_split_with_offsets(**args))
   self.assertRaggedEqual(expected, split_values)
   self.assertRaggedEqual(expected_offsets, split_offsets)
示例#2
0
 def testErrorModesWithOffsets(self, expected=None, expected_offsets=None,
                               **args):
   """Calls unicode_split_with_offsets with `args` and checks the result pair.

   Args:
     expected: Expected value for element 0 of the op's result.
     expected_offsets: Expected value for element 1 of the op's result.
     **args: Keyword arguments forwarded as-is to
       `ragged_string_ops.unicode_split_with_offsets`.
   """
   outputs = ragged_string_ops.unicode_split_with_offsets(**args)
   # Pair every expectation with the matching output and compare in order.
   comparisons = ((expected, outputs[0]), (expected_offsets, outputs[1]))
   for wanted, actual in comparisons:
     self.assertAllEqual(wanted, actual)
示例#3
0
 def testVectorSplitWithOffset(self):
   """Splits a two-element string vector and checks characters and offsets."""
   cjk_word = u"仅今年前"
   text = constant_op.constant([cjk_word.encode("UTF-8"), b"hello"])
   chars, starts = ragged_string_ops.unicode_split_with_offsets(text, "UTF-8")
   # One row per input string, one UTF-8 encoded entry per character.
   expected_chars = [[ch.encode("UTF-8") for ch in word]
                     for word in (cjk_word, u"hello")]
   # Each character of the first word is 3 bytes in UTF-8, hence the stride
   # of 3; the ASCII row advances one byte at a time.
   expected_starts = [[0, 3, 6, 9], [0, 1, 2, 3, 4]]
   self.assertAllEqual(chars, expected_chars)
   self.assertAllEqual(starts, expected_starts)
 def testVectorSplitWithOffset(self):
   """Checks unicode_split_with_offsets on a vector of two UTF-8 strings."""
   multibyte = u"仅今年前"
   encoded_inputs = [multibyte.encode("UTF-8"), b"hello"]
   text = constant_op.constant(encoded_inputs)
   chars, starts = ragged_string_ops.unicode_split_with_offsets(text, "UTF-8")

   # Build the expected rows separately: per-character UTF-8 encodings.
   multibyte_row = [c.encode("UTF-8") for c in multibyte]
   ascii_row = [c.encode("UTF-8") for c in u"hello"]
   self.assertRaggedEqual(chars, [multibyte_row, ascii_row])

   # Start positions observed for each character within its own string.
   multibyte_starts = [0, 3, 6, 9]
   ascii_starts = [0, 1, 2, 3, 4]
   self.assertRaggedEqual(starts, [multibyte_starts, ascii_starts])
示例#5
0
 def testSplitWithOffsetsWithDifferentEncodings(self, encoding, texts):
   """Checks split characters and offsets for `texts` under `encoding`.

   Args:
     encoding: Encoding name passed both to the `_nested_*` helpers and to
       `unicode_split_with_offsets`.
     texts: Input texts; encoded with `_nested_encode` before being wrapped
       in a constant tensor.
   """
   want_chars = _nested_splitchars(texts, encoding)
   want_offsets = _nested_offsets(texts, encoding)
   encoded_texts = _nested_encode(texts, encoding)
   got_chars, got_offsets = ragged_string_ops.unicode_split_with_offsets(
       constant_op.constant(encoded_texts), encoding)
   self.assertAllEqual(want_chars, got_chars)
   self.assertAllEqual(want_offsets, got_offsets)
示例#6
0
 def testBasicSplitWithOffsets(self, texts, ragged_rank=None):
   """Splits UTF-8 encoded `texts` and compares against helper expectations.

   Args:
     texts: Input texts; encoded to UTF-8 via `_nested_encode`.
     ragged_rank: Forwarded to `ragged_factory_ops.constant_value` when the
       input value is built.
   """
   utf8_texts = _nested_encode(texts, "UTF-8")
   ragged_input = ragged_factory_ops.constant_value(
       utf8_texts, ragged_rank=ragged_rank, dtype=bytes)
   chars, offsets = ragged_string_ops.unicode_split_with_offsets(
       ragged_input, "UTF-8")
   # Expected values come from the file's own helpers.
   want_chars = _nested_splitchars(texts, "UTF-8")
   want_offsets = _nested_offsets(texts, "UTF-8")
   self.assertAllEqual(want_chars, chars)
   self.assertAllEqual(want_offsets, offsets)
 def testSplitWithOffsetsWithDifferentEncodings(self, encoding, texts):
   """Verifies unicode_split_with_offsets for the given encoding.

   Args:
     encoding: Encoding name used to encode `texts`, to compute the
       expectations, and as the op's encoding argument.
     texts: Input texts handed to the `_nested_*` helpers.
   """
   # Expectations are derived from the helpers before the op is invoked.
   expectations = (_nested_splitchars(texts, encoding),
                   _nested_offsets(texts, encoding))
   encoded_tensor = constant_op.constant(_nested_encode(texts, encoding))
   outputs = ragged_string_ops.unicode_split_with_offsets(
       encoded_tensor, encoding)
   self.assertRaggedEqual(expectations[0], outputs[0])
   self.assertRaggedEqual(expectations[1], outputs[1])
 def testBasicSplitWithOffsets(self, texts, ragged_rank=None):
   """Checks characters and offsets produced for UTF-8 encoded `texts`.

   Args:
     texts: Input texts; passed through `_nested_encode` with "UTF-8".
     ragged_rank: Passed as `ragged_rank` to
       `ragged_factory_ops.constant_value`.
   """
   source = ragged_factory_ops.constant_value(
       _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
   split_chars, split_offsets = ragged_string_ops.unicode_split_with_offsets(
       source, "UTF-8")
   reference_chars = _nested_splitchars(texts, "UTF-8")
   reference_offsets = _nested_offsets(texts, "UTF-8")
   self.assertRaggedEqual(reference_chars, split_chars)
   self.assertRaggedEqual(reference_offsets, split_offsets)
示例#9
0
 def testDocstringExamples(self):
   """Checks unicode_split and unicode_split_with_offsets on two samples."""
   samples = [u"G\xf6\xf6dnight", u"\U0001f60a"]
   texts = [sample.encode("utf8") for sample in samples]
   codepoints1 = ragged_string_ops.unicode_split(texts, "UTF-8")
   codepoints2, offsets = ragged_string_ops.unicode_split_with_offsets(
       texts, "UTF-8")
   # Both ops must produce the same per-character byte strings.
   expected_chars = [
       [b"G", b"\xc3\xb6", b"\xc3\xb6", b"d", b"n", b"i", b"g", b"h", b"t"],
       [b"\xf0\x9f\x98\x8a"],
   ]
   self.assertAllEqual(codepoints1, expected_chars)
   self.assertAllEqual(codepoints2, expected_chars)
   # The two-byte characters in the first sample make its offsets skip by 2.
   self.assertAllEqual(offsets, [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]])
 def testDocstringExamples(self):
   """Exercises both split ops on a Latin-1 range word and an emoji."""
   texts = [u"G\xf6\xf6dnight".encode("utf8"), u"\U0001f60a".encode("utf8")]
   codepoints1 = ragged_string_ops.unicode_split(texts, "UTF-8")
   codepoints2, offsets = ragged_string_ops.unicode_split_with_offsets(
       texts, "UTF-8")

   # Expected per-character UTF-8 byte strings, one row per input text.
   word_row = [
       b"G", b"\xc3\xb6", b"\xc3\xb6", b"d", b"n", b"i", b"g", b"h", b"t"
   ]
   emoji_row = [b"\xf0\x9f\x98\x8a"]
   for actual_chars in (codepoints1, codepoints2):
     self.assertRaggedEqual(actual_chars, [word_row, emoji_row])

   expected_offsets = [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]]
   self.assertRaggedEqual(offsets, expected_offsets)
示例#11
0
 def testScalarSplitWithOffset(self):
   """Splits a single scalar UTF-8 string and checks chars and offsets."""
   sample = u"仅今年前"
   text = constant_op.constant(sample.encode("UTF-8"))
   chars, starts = ragged_string_ops.unicode_split_with_offsets(text, "UTF-8")
   expected_chars = [ch.encode("UTF-8") for ch in sample]
   # Every character in the sample occupies 3 bytes in UTF-8.
   expected_starts = [0, 3, 6, 9]
   self.assertAllEqual(chars, expected_chars)
   self.assertAllEqual(starts, expected_starts)
 def testScalarSplitWithOffset(self):
   """Checks unicode_split_with_offsets on one scalar string input."""
   word = u"仅今年前"
   encoded_word = word.encode("UTF-8")
   chars, starts = ragged_string_ops.unicode_split_with_offsets(
       constant_op.constant(encoded_word), "UTF-8")
   # A scalar input is compared against flat lists: one entry per character.
   self.assertAllEqual(chars, [c.encode("UTF-8") for c in word])
   self.assertAllEqual(starts, [0, 3, 6, 9])