def _whitespace_tokenize_with_offsets_encode_decode_wrapper(
    self, input_tensor):
  """Whitespace-tokenizes a rank-1 tensor of UTF-8 strings, with offsets.

  The strings are decoded to codepoints, tokenized by
  `_whitespace_tokenize_codepoints_with_offsets`, and the resulting tokens
  are re-encoded as UTF-8 while their codepoint offsets are mapped back to
  byte offsets.

  Args:
    input_tensor: The single dimensional Tensor to tokenize.

  Returns:
    A tuple `(tokens, byte_starts, byte_limits)` of RaggedTensors holding the
    tokenized text and its byte offsets, with shapes
    [num_strings, (num_tokens or num_offsets)].
  """
  # Decode to codepoints, remembering the byte at which each one starts.
  chars, char_byte_starts = ragged_string_ops.unicode_decode_with_offsets(
      input_tensor, "UTF-8")

  # A codepoint ends where the next one starts; the last codepoint of a
  # string ends at that string's byte length.
  following_starts = char_byte_starts[:, 1:]
  string_byte_lengths = string_ops.string_length(input_tensor)
  string_byte_lengths = array_ops.expand_dims(string_byte_lengths, 1)
  string_byte_lengths = math_ops.cast(string_byte_lengths, dtypes.int64)
  char_byte_limits = array_ops.concat(
      [following_starts, string_byte_lengths], 1)

  # Tokenize in codepoint space.
  token_chars, token_first_char, token_char_limit = (
      self._whitespace_tokenize_codepoints_with_offsets(chars))

  # Re-encode each token and translate codepoint offsets into byte offsets.
  tokens = ragged_string_ops.unicode_encode(token_chars, "UTF-8")
  byte_starts = array_ops.batch_gather(char_byte_starts, token_first_char)
  # NOTE(review): the codepoint limit offsets appear to be exclusive, hence
  # the -1 to index the last codepoint of each token -- confirm against
  # `_whitespace_tokenize_codepoints_with_offsets`.
  token_last_char = math_ops.subtract(token_char_limit, [1])
  byte_limits = array_ops.batch_gather(char_byte_limits, token_last_char)
  return (tokens, byte_starts, byte_limits)
def testVectorDecodeWithOffset(self):
  """Decodes a vector of two UTF-8 strings and checks chars and offsets.

  The first string consists of four characters whose expected start offsets
  advance by three bytes each; the second is plain ASCII, one byte per
  character.
  """
  samples = [u"仅今年前", u"hello"]
  text = constant_op.constant([s.encode("utf-8") for s in samples])

  codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(
      text, "utf-8")

  want_codepoints = [[ord(ch) for ch in s] for s in samples]
  want_offsets = [[0, 3, 6, 9], [0, 1, 2, 3, 4]]
  self.assertAllEqual(codepoints, want_codepoints)
  self.assertAllEqual(offsets, want_offsets)
def tokenize_with_offsets(self, input):  # pylint: disable=redefined-builtin
  """Splits UTF-8 strings into Unicode codepoints with byte offsets.

  Tokens are returned as integer codepoints rather than as strings.

  Args:
    input: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape.

  Returns:
    A tuple `(tokens, start_offsets, end_offsets)` where:

    * `tokens`: A `RaggedTensor` of codepoints (integer type).
    * `start_offsets`: A `RaggedTensor` of the byte offset at which each
      token starts.
    * `end_offsets`: A `RaggedTensor` of the byte offset at which each token
      ends, i.e. the start of the following token, or the byte length of the
      string for its last token.
  """
  with ops.name_scope(None, "UnicodeCharTokenize", [input]):
    strings = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)
    tokens, start_offsets = ragged_string_ops.unicode_decode_with_offsets(
        strings, "UTF-8")

    # Byte length of every string, with a trailing axis added so it can be
    # concatenated onto the per-token offsets.
    byte_lengths = string_ops.string_length(strings)
    byte_lengths = array_ops.expand_dims(byte_lengths, -1)
    byte_lengths = math_ops.cast(byte_lengths, dtypes.int64)

    # A zero-length string produces no tokens, so its length entry is masked
    # out and contributes an empty row instead of a spurious end offset.
    is_non_empty = byte_lengths > 0
    last_token_ends = ragged_array_ops.boolean_mask(byte_lengths, is_non_empty)

    following_starts = start_offsets[..., 1:]
    end_offsets = array_ops.concat([following_starts, last_token_ends], -1)
    return tokens, start_offsets, end_offsets
def testErrorModesWithOffsets(
    self, expected=None, expected_offsets=None, **args):
  """Checks `unicode_decode_with_offsets` output for the given keyword args.

  Args:
    expected: Expected codepoints (first element of the op's result).
    expected_offsets: Expected offsets (second element of the op's result).
    **args: Keyword arguments forwarded unchanged to
      `ragged_string_ops.unicode_decode_with_offsets`.
  """
  codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(**args)
  self.assertAllEqual(codepoints, expected)
  self.assertAllEqual(offsets, expected_offsets)
def testErrorModesWithOffsets(
    self, expected=None, expected_offsets=None, **args):
  """Checks `unicode_decode_with_offsets` output for the given keyword args.

  Args:
    expected: Expected codepoints (first element of the op's result).
    expected_offsets: Expected offsets (second element of the op's result).
    **args: Keyword arguments forwarded unchanged to
      `ragged_string_ops.unicode_decode_with_offsets`.
  """
  codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(**args)
  self.assertRaggedEqual(codepoints, expected)
  self.assertRaggedEqual(offsets, expected_offsets)
def testVectorDecodeWithOffset(self):
  """Decodes a vector of two UTF-8 strings and checks chars and offsets.

  The first string consists of four characters whose expected start offsets
  advance by three bytes each; the second is plain ASCII, one byte per
  character.
  """
  samples = [u"仅今年前", u"hello"]
  text = constant_op.constant([s.encode("utf-8") for s in samples])

  codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(
      text, "utf-8")

  want_codepoints = [[ord(ch) for ch in s] for s in samples]
  want_offsets = [[0, 3, 6, 9], [0, 1, 2, 3, 4]]
  self.assertRaggedEqual(codepoints, want_codepoints)
  self.assertRaggedEqual(offsets, want_offsets)
def testDecodeWithOffsetsWithDifferentEncodings(self, encoding, texts):
  """Decodes `texts` encoded in `encoding` and checks chars and offsets.

  Args:
    encoding: Name of the encoding used both to encode the inputs and to
      decode them again.
    texts: Nested collection of strings; the expected values are derived
      from it by `_nested_codepoints` and `_nested_offsets`.
  """
  want_codepoints = _nested_codepoints(texts)
  want_offsets = _nested_offsets(texts, encoding)

  encoded_texts = _nested_encode(texts, encoding)
  input_tensor = constant_op.constant(encoded_texts)
  codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(
      input_tensor, encoding)

  self.assertAllEqual(want_codepoints, codepoints)
  self.assertAllEqual(want_offsets, offsets)
def testDecodeWithOffsetsWithDifferentEncodings(self, encoding, texts):
  """Decodes `texts` encoded in `encoding` and checks chars and offsets.

  Args:
    encoding: Name of the encoding used both to encode the inputs and to
      decode them again.
    texts: Nested collection of strings; the expected values are derived
      from it by `_nested_codepoints` and `_nested_offsets`.
  """
  want_codepoints = _nested_codepoints(texts)
  want_offsets = _nested_offsets(texts, encoding)

  encoded_texts = _nested_encode(texts, encoding)
  input_tensor = constant_op.constant(encoded_texts)
  codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(
      input_tensor, encoding)

  self.assertRaggedEqual(want_codepoints, codepoints)
  self.assertRaggedEqual(want_offsets, offsets)
def testBasicDecodeWithOffsets(self, texts, ragged_rank=None):
  """Decodes UTF-8 encoded `texts` and checks codepoints and offsets.

  Args:
    texts: Nested collection of strings to encode as UTF-8 and decode again.
    ragged_rank: Passed through to `ragged_factory_ops.constant_value` when
      building the input.
  """
  utf8_texts = _nested_encode(texts, "UTF-8")
  input_tensor = ragged_factory_ops.constant_value(
      utf8_texts, ragged_rank=ragged_rank, dtype=bytes)
  codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(
      input_tensor, "UTF-8")

  want_codepoints = _nested_codepoints(texts)
  want_offsets = _nested_offsets(texts, "UTF-8")
  self.assertAllEqual(want_codepoints, codepoints)
  self.assertAllEqual(want_offsets, offsets)
def testBasicDecodeWithOffsets(self, texts, ragged_rank=None):
  """Decodes UTF-8 encoded `texts` and checks codepoints and offsets.

  Args:
    texts: Nested collection of strings to encode as UTF-8 and decode again.
    ragged_rank: Passed through to `ragged_factory_ops.constant_value` when
      building the input.
  """
  utf8_texts = _nested_encode(texts, "UTF-8")
  input_tensor = ragged_factory_ops.constant_value(
      utf8_texts, ragged_rank=ragged_rank, dtype=bytes)
  codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(
      input_tensor, "UTF-8")

  want_codepoints = _nested_codepoints(texts)
  want_offsets = _nested_offsets(texts, "UTF-8")
  self.assertRaggedEqual(want_codepoints, codepoints)
  self.assertRaggedEqual(want_offsets, offsets)
def testDocstringExamples(self):
  """Checks two sample strings against `unicode_decode` and its offsets twin.

  Both ops must return the same codepoints; the offsets variant must also
  return the byte position at which each codepoint starts.
  """
  texts = [
      u"G\xf6\xf6dnight".encode("utf8"),
      u"\U0001f60a".encode("utf8"),
  ]
  want_codepoints = [
      [71, 246, 246, 100, 110, 105, 103, 104, 116],
      [128522],
  ]
  want_offsets = [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]]

  plain_codepoints = ragged_string_ops.unicode_decode(texts, "UTF-8")
  codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(
      texts, "UTF-8")

  self.assertAllEqual(plain_codepoints, want_codepoints)
  self.assertAllEqual(codepoints, want_codepoints)
  self.assertAllEqual(offsets, want_offsets)
def testDocstringExamples(self):
  """Checks two sample strings against `unicode_decode` and its offsets twin.

  Both ops must return the same codepoints; the offsets variant must also
  return the byte position at which each codepoint starts.
  """
  texts = [
      u"G\xf6\xf6dnight".encode("utf8"),
      u"\U0001f60a".encode("utf8"),
  ]
  want_codepoints = [
      [71, 246, 246, 100, 110, 105, 103, 104, 116],
      [128522],
  ]
  want_offsets = [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]]

  plain_codepoints = ragged_string_ops.unicode_decode(texts, "UTF-8")
  codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(
      texts, "UTF-8")

  self.assertRaggedEqual(plain_codepoints, want_codepoints)
  self.assertRaggedEqual(codepoints, want_codepoints)
  self.assertRaggedEqual(offsets, want_offsets)
def testScalarDecodeWithOffset(self):
  """Decodes a single (scalar) UTF-8 string and checks chars and offsets.

  The sample has four characters whose expected start offsets advance by
  three bytes each.
  """
  sample = u"仅今年前"
  text = constant_op.constant(sample.encode("utf-8"))

  codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(
      text, "utf-8")

  want_codepoints = [ord(ch) for ch in sample]
  want_offsets = [0, 3, 6, 9]
  self.assertAllEqual(codepoints, want_codepoints)
  self.assertAllEqual(offsets, want_offsets)
def testScalarDecodeWithOffset(self):
  """Decodes a single (scalar) UTF-8 string and checks chars and offsets.

  The sample has four characters whose expected start offsets advance by
  three bytes each.
  """
  sample = u"仅今年前"
  text = constant_op.constant(sample.encode("utf-8"))

  codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(
      text, "utf-8")

  want_codepoints = [ord(ch) for ch in sample]
  want_offsets = [0, 3, 6, 9]
  self.assertAllEqual(codepoints, want_codepoints)
  self.assertAllEqual(offsets, want_offsets)