def _whitespace_tokenize_with_offsets_encode_decode_wrapper(self, input_tensor):
  """Tokenizes a rank-1 tensor of UTF-8 strings on whitespace.

  Args:
    input_tensor: The single dimensional Tensor to tokenize.

  Returns:
    Tuple of RaggedTensors of tokenized text and byte offsets, with shapes
    [num_strings, (num_tokens or num_offsets)].
  """
  # Decode each string into codepoints plus each codepoint's starting byte.
  (codepoints, byte_starts) = (
      ragged_string_ops.unicode_decode_with_offsets(input_tensor, "UTF-8"))
  # The limit (exclusive end) offset of codepoint i is the start offset of
  # codepoint i+1; the last limit is the full byte length of the string.
  string_byte_lens = math_ops.cast(
      array_ops.expand_dims(string_ops.string_length(input_tensor), 1),
      dtypes.int64)
  byte_limits = array_ops.concat([byte_starts[:, 1:], string_byte_lens], 1)
  # Tokenize in codepoint space.
  (codepoint_tokens, codepoint_starts, codepoint_limits) = (
      self._whitespace_tokenize_codepoints_with_offsets(codepoints))
  # Re-encode the tokens as UTF-8 and translate codepoint offsets back to
  # byte offsets. Limit offsets are exclusive, hence the -1 before the gather.
  return (ragged_string_ops.unicode_encode(codepoint_tokens, "UTF-8"),
          array_ops.batch_gather(byte_starts, codepoint_starts),
          array_ops.batch_gather(byte_limits,
                                 math_ops.subtract(codepoint_limits, [1])))
def testLegacyPositionalName(self):
  # Callers that predate the 'unit' parameter may pass 'name' positionally;
  # verify such code keeps working.
  inputs = [[["1", "12"], ["123", "1234"], ["12345", "123456"]]]
  result = string_ops.string_length(inputs, "some_name")
  with self.test_session():
    self.assertAllEqual(result.eval(), [[[1, 2], [3, 4], [5, 6]]])
def tokenize_with_offsets(self, input):  # pylint: disable=redefined-builtin
  """Tokenizes a tensor of UTF-8 strings to Unicode characters.

  Returned token tensors are of integer type.

  Args:
    input: A `RaggedTensor`or `Tensor` of UTF-8 strings with any shape.

  Returns:
    A tuple `(tokens, start_offsets, end_offsets)` where:
      * `tokens`: A `RaggedTensor` of codepoints (integer type).
      * `start_offsets`: A `RaggedTensor` of the tokens' starting byte offset.
      * `end_offsets`: A `RaggedTensor` of the tokens' ending byte offset.
  """
  with ops.name_scope(None, "UnicodeCharTokenize", [input]):
    input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)
    (codepoints, start_offsets) = (
        ragged_string_ops.unicode_decode_with_offsets(input_tensor, "UTF-8"))
    byte_lens = math_ops.cast(
        array_ops.expand_dims(string_ops.string_length(input_tensor), -1),
        dtypes.int64)
    # A zero-length string yields no tokens, so drop its length entry; this
    # keeps the concatenated end-offset row empty for that string.
    last_ends = ragged_array_ops.boolean_mask(byte_lens, byte_lens > 0)
    # End offset of token i is the start offset of token i+1; the final end
    # is the string's byte length.
    end_offsets = array_ops.concat([start_offsets[..., 1:], last_ends], -1)
    return codepoints, start_offsets, end_offsets
def testStringLength(self):
  # Default unit is BYTE; these inputs are ASCII so byte length == char count.
  inputs = [[["1", "12"], ["123", "1234"], ["12345", "123456"]]]
  expected = [[[1, 2], [3, 4], [5, 6]]]
  with self.cached_session() as sess:
    result = sess.run(string_ops.string_length(inputs))
    self.assertAllEqual(result, expected)
def testUnit(self):
  # Verify the 'unit' attr: BYTE counts UTF-8 bytes, UTF8_CHAR counts
  # Unicode codepoints, and any other value is rejected at op-build time.
  # Modernized to self.session()/self.evaluate() for consistency with the
  # other testUnit variant in this file (test_session() is deprecated).
  unicode_strings = [u"H\xc3llo", u"\U0001f604"]
  utf8_strings = [s.encode("utf-8") for s in unicode_strings]
  expected_utf8_byte_lengths = [6, 4]
  expected_utf8_char_lengths = [5, 1]
  with self.session():
    utf8_byte_lengths = string_ops.string_length(utf8_strings, unit="BYTE")
    utf8_char_lengths = string_ops.string_length(
        utf8_strings, unit="UTF8_CHAR")
    self.assertAllEqual(
        self.evaluate(utf8_byte_lengths), expected_utf8_byte_lengths)
    self.assertAllEqual(
        self.evaluate(utf8_char_lengths), expected_utf8_char_lengths)
  with self.assertRaisesRegexp(
      ValueError, "Attr 'unit' of 'StringLength' Op passed string 'XYZ' "
      'not in: "BYTE", "UTF8_CHAR"'):
    string_ops.string_length(utf8_strings, unit="XYZ")
def testUnit(self):
  # BYTE counts UTF-8 bytes; UTF8_CHAR counts Unicode codepoints.
  unicode_strings = [u"H\xc3llo", u"\U0001f604"]
  utf8_strings = [s.encode("utf-8") for s in unicode_strings]
  expected_byte_lengths = [6, 4]
  expected_char_lengths = [5, 1]
  with self.session():
    byte_lengths = string_ops.string_length(utf8_strings, unit="BYTE")
    char_lengths = string_ops.string_length(utf8_strings, unit="UTF8_CHAR")
    self.assertAllEqual(self.evaluate(byte_lengths), expected_byte_lengths)
    self.assertAllEqual(self.evaluate(char_lengths), expected_char_lengths)
  # An unknown unit must be rejected when the op is constructed.
  with self.assertRaisesRegexp(
      ValueError, "Attr 'unit' of 'StringLength' Op passed string 'XYZ' "
      'not in: "BYTE", "UTF8_CHAR"'):
    string_ops.string_length(utf8_strings, unit="XYZ")
def f1(self, x):
  # Format x into a string, then return that string's length.
  formatted = string_ops.string_format('{}', x)
  return string_ops.string_length(formatted)
def fn(x):
  # Format x into a string, then return that string's length.
  formatted = string_ops.string_format('{}', x)
  return string_ops.string_length(formatted)  # COMMENT4