示例#1
0
  def _whitespace_tokenize_with_offsets_encode_decode_wrapper(
      self, input_tensor):
    """Tokenizes a rank-1 tensor of UTF-8 strings on whitespace.

    Args:
      input_tensor: A rank-1 string `Tensor` to tokenize.

    Returns:
      A tuple of `RaggedTensor`s `(tokens, start_offsets, limit_offsets)`,
      each with shape [num_strings, (num_tokens or num_offsets)].
    """
    # Decode each string into codepoints, recording the byte position at
    # which every codepoint begins.
    codepoints, byte_starts = ragged_string_ops.unicode_decode_with_offsets(
        input_tensor, "UTF-8")
    # A codepoint's byte limit is the next codepoint's start; the last
    # codepoint's limit is the string's total byte length.
    total_bytes = math_ops.cast(
        array_ops.expand_dims(string_ops.string_length(input_tensor), 1),
        dtypes.int64)
    byte_limits = array_ops.concat([byte_starts[:, 1:], total_bytes], 1)

    # Split the codepoint sequences into whitespace-delimited tokens.
    tokens, token_starts, token_limits = (
        self._whitespace_tokenize_codepoints_with_offsets(codepoints))

    # Re-encode the tokens as UTF-8 strings and translate the codepoint
    # offsets back into byte offsets.  Limit offsets are exclusive, so the
    # byte limit of a token is looked up at (codepoint limit - 1).
    encoded_tokens = ragged_string_ops.unicode_encode(tokens, "UTF-8")
    byte_start_result = array_ops.batch_gather(byte_starts, token_starts)
    byte_limit_result = array_ops.batch_gather(
        byte_limits, math_ops.subtract(token_limits, [1]))
    return encoded_tokens, byte_start_result, byte_limit_result
示例#2
0
 def testLegacyPositionalName(self):
     # Before the 'unit' parameter existed, 'name' could be given as the
     # second positional argument.  Verify that such callers keep working.
     data = [[["1", "12"], ["123", "1234"], ["12345", "123456"]]]
     result = string_ops.string_length(data, "some_name")
     expected = [[[1, 2], [3, 4], [5, 6]]]
     with self.test_session():
         self.assertAllEqual(result.eval(), expected)
    def tokenize_with_offsets(self, input):  # pylint: disable=redefined-builtin
        """Splits a tensor of UTF-8 strings into individual Unicode characters.

        The returned token tensors contain integer codepoints.

        Args:
          input: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape.

        Returns:
          A tuple `(tokens, start_offsets, end_offsets)` where:

            * `tokens`: A `RaggedTensor` of codepoints (integer type).
            * `start_offsets`: A `RaggedTensor` of each token's starting byte
              offset.
            * `end_offsets`: A `RaggedTensor` of each token's ending byte
              offset.
        """
        name = None
        with ops.name_scope(name, "UnicodeCharTokenize", [input]):
            input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(
                input)
            codepoints, start_offsets = (
                ragged_string_ops.unicode_decode_with_offsets(
                    input_tensor, "UTF-8"))
            byte_lengths = math_ops.cast(
                array_ops.expand_dims(
                    string_ops.string_length(input_tensor), -1),
                dtypes.int64)
            # Zero-length strings produce no tokens, so mask their lengths
            # out: the masked (empty) entry leaves that string's end-offset
            # row empty instead of adding a spurious final offset.
            final_ends = ragged_array_ops.boolean_mask(
                byte_lengths, byte_lengths > 0)
            end_offsets = array_ops.concat(
                [start_offsets[..., 1:], final_ends], -1)
            return codepoints, start_offsets, end_offsets
示例#4
0
    def testStringLength(self):
        data = [[["1", "12"], ["123", "1234"], ["12345", "123456"]]]
        expected = [[[1, 2], [3, 4], [5, 6]]]

        with self.cached_session() as sess:
            # Default unit is bytes; for these ASCII strings the byte length
            # equals the character count.
            computed = sess.run(string_ops.string_length(data))
            self.assertAllEqual(computed, expected)
 def testLegacyPositionalName(self):
   # Legacy callers passed 'name' positionally before the 'unit' parameter
   # was introduced; make sure that calling convention still works.
   data = [[["1", "12"], ["123", "1234"], ["12345", "123456"]]]
   result = string_ops.string_length(data, "some_name")
   with self.test_session():
     self.assertAllEqual(result.eval(), [[[1, 2], [3, 4], [5, 6]]])
  def testStringLength(self):
    data = [[["1", "12"], ["123", "1234"], ["12345", "123456"]]]

    with self.cached_session() as sess:
      # Lengths default to bytes; these ASCII strings are 1-6 bytes each.
      computed = sess.run(string_ops.string_length(data))
      self.assertAllEqual(computed, [[[1, 2], [3, 4], [5, 6]]])
  def testUnit(self):
    # u"H\xc3llo" is 5 codepoints in 6 UTF-8 bytes; the emoji is a single
    # codepoint encoded in 4 bytes.
    unicode_strings = [u"H\xc3llo", u"\U0001f604"]
    utf8_strings = [s.encode("utf-8") for s in unicode_strings]

    with self.test_session() as sess:
      byte_lengths = string_ops.string_length(utf8_strings, unit="BYTE")
      char_lengths = string_ops.string_length(utf8_strings, unit="UTF8_CHAR")
      self.assertAllEqual(sess.run(byte_lengths), [6, 4])
      self.assertAllEqual(sess.run(char_lengths), [5, 1])
      # An unsupported unit is rejected when the op is constructed.
      with self.assertRaisesRegexp(
          ValueError, "Attr 'unit' of 'StringLength' Op passed string 'XYZ' "
          'not in: "BYTE", "UTF8_CHAR"'):
        string_ops.string_length(utf8_strings, unit="XYZ")
示例#8
0
  def testUnit(self):
    # The accented string spans 5 codepoints / 6 UTF-8 bytes; the emoji is
    # one codepoint / 4 bytes.
    unicode_strings = [u"H\xc3llo", u"\U0001f604"]
    utf8_strings = [s.encode("utf-8") for s in unicode_strings]
    expected_byte_lengths = [6, 4]
    expected_char_lengths = [5, 1]

    with self.session() as sess:
      self.assertAllEqual(
          self.evaluate(
              string_ops.string_length(utf8_strings, unit="BYTE")),
          expected_byte_lengths)
      self.assertAllEqual(
          self.evaluate(
              string_ops.string_length(utf8_strings, unit="UTF8_CHAR")),
          expected_char_lengths)
      # An unsupported unit fails at op-construction time.
      with self.assertRaisesRegexp(
          ValueError, "Attr 'unit' of 'StringLength' Op passed string 'XYZ' "
          'not in: "BYTE", "UTF8_CHAR"'):
        string_ops.string_length(utf8_strings, unit="XYZ")
示例#9
0
 def f1(self, x):
     # Render x through string_format, then measure the resulting string.
     formatted = string_ops.string_format('{}', x)
     return string_ops.string_length(formatted)
示例#10
0
 def fn(x):
     # Format x into a string and return its length.
     formatted = string_ops.string_format('{}', x)  # COMMENT4
     return string_ops.string_length(formatted)