def testSplitWithSparseOutput(self, texts, expected):
   input_tensor = np.array(_nested_encode(texts, "UTF-8"), dtype=bytes)
   result = ragged_string_ops.unicode_split(input_tensor, "UTF-8").to_sparse()
   self.assertIsInstance(result, sparse_tensor.SparseTensor)
   self.assertAllEqual(expected.indices, result.indices)
   self.assertAllEqual(expected.values, result.values)
   self.assertAllEqual(expected.dense_shape, result.dense_shape)
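
A minimal standalone sketch of the same pattern through the public API, assuming ragged_string_ops.unicode_split is exported as tf.strings.unicode_split and TF 2.x eager execution; the input batch here is hypothetical, not the parameterized texts above:

import tensorflow as tf

texts = tf.constant([u"仅今年".encode("UTF-8"), b"hi"])
sparse = tf.strings.unicode_split(texts, "UTF-8").to_sparse()

# indices pairs each character with its (row, position), values holds the
# UTF-8 bytes of each character, and dense_shape is [batch, longest row].
print(sparse.indices.numpy())      # [[0 0] [0 1] [0 2] [1 0] [1 1]]
print(sparse.values.numpy())       # [b'\xe4\xbb\x85' b'\xe4\xbb\x8a' b'\xe5\xb9\xb4' b'h' b'i']
print(sparse.dense_shape.numpy())  # [2 3]
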
Example #2
 def testSplitWithSparseOutput(self, texts, expected):
   input_tensor = np.array(_nested_encode(texts, "UTF-8"), dtype=bytes)
   result = ragged_string_ops.unicode_split(input_tensor, "UTF-8").to_sparse()
   self.assertIsInstance(result, sparse_tensor.SparseTensor)
   self.assertAllEqual(expected.indices, result.indices)
   self.assertAllEqual(expected.values, result.values)
   self.assertAllEqual(expected.dense_shape, result.dense_shape)
Example #3
 def testSplitWithPaddedOutput(self, texts, expected, ragged_rank=None):
     input_tensor = ragged_factory_ops.constant_value(
         _nested_encode(texts, "UTF-8"),
         ragged_rank=ragged_rank,
         dtype=bytes)
     result = ragged_string_ops.unicode_split(
         input_tensor, "UTF-8").to_tensor(default_value="")
     self.assertAllEqual(np.array(expected, dtype=bytes), result)
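
A short sketch of what to_tensor(default_value="") does to the ragged split result, again assuming the public tf.strings.unicode_split wrapper; the two-string batch is made up for illustration:

import tensorflow as tf

chars = tf.strings.unicode_split(tf.constant([b"hi", b"hey"]), "UTF-8")

# Converting the ragged result to a dense tensor pads shorter rows with the
# default value up to the length of the longest row.
print(chars.to_tensor(default_value="").numpy())
# [[b'h' b'i' b'']
#  [b'h' b'e' b'y']]
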
Example #4
 def testBasicSplit(self, texts, ragged_rank=None):
     input_tensor = ragged_factory_ops.constant_value(
         _nested_encode(texts, "UTF-8"),
         ragged_rank=ragged_rank,
         dtype=bytes)
     result = ragged_string_ops.unicode_split(input_tensor, "UTF-8")
     expected = _nested_splitchars(texts, "UTF-8")
     self.assertAllEqual(expected, result)
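
The nested case can be sketched with the public API as well, assuming tf.strings.unicode_split accepts RaggedTensor inputs the same way the ragged_factory_ops path above does; the input is hypothetical:

import tensorflow as tf

ragged = tf.ragged.constant([[u"Göö", u"hi"], [u"仅"]])
chars = tf.strings.unicode_split(ragged, "UTF-8")

# The output has one more ragged dimension than the input: each string
# becomes the list of its characters as UTF-8 byte strings.
print(chars.to_list())
# [[[b'G', b'\xc3\xb6', b'\xc3\xb6'], [b'h', b'i']], [[b'\xe4\xbb\x85']]]
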
Example #5
 def testDocstringExamples(self):
     texts = [s.encode("utf8") for s in [u"G\xf6\xf6dnight", u"\U0001f60a"]]
     codepoints1 = ragged_string_ops.unicode_split(texts, "UTF-8")
     codepoints2, offsets = ragged_string_ops.unicode_split_with_offsets(
         texts, "UTF-8")
     self.assertAllEqual(codepoints1, [[
         b"G", b"\xc3\xb6", b"\xc3\xb6", b"d", b"n", b"i", b"g", b"h", b"t"
     ], [b"\xf0\x9f\x98\x8a"]])
     self.assertAllEqual(codepoints2, [[
         b"G", b"\xc3\xb6", b"\xc3\xb6", b"d", b"n", b"i", b"g", b"h", b"t"
     ], [b"\xf0\x9f\x98\x8a"]])
     self.assertAllEqual(offsets, [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]])
Example #6
 def testDocstringExamples(self):
   texts = [s.encode("utf8") for s in [u"G\xf6\xf6dnight", u"\U0001f60a"]]
   codepoints1 = ragged_string_ops.unicode_split(texts, "UTF-8")
   codepoints2, offsets = ragged_string_ops.unicode_split_with_offsets(
       texts, "UTF-8")
   self.assertRaggedEqual(
       codepoints1,
       [[b"G", b"\xc3\xb6", b"\xc3\xb6", b"d", b"n", b"i", b"g", b"h", b"t"],
        [b"\xf0\x9f\x98\x8a"]])
   self.assertRaggedEqual(
       codepoints2,
       [[b"G", b"\xc3\xb6", b"\xc3\xb6", b"d", b"n", b"i", b"g", b"h", b"t"],
        [b"\xf0\x9f\x98\x8a"]])
   self.assertRaggedEqual(offsets, [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]])
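
One point worth noting from the two examples above: the returned offsets are byte offsets into each original string, not character indices. A small sketch, assuming the public tf.strings.unicode_split_with_offsets and tf.strings.substr:

import tensorflow as tf

texts = [u"G\xf6\xf6dnight".encode("utf8")]
chars, starts = tf.strings.unicode_split_with_offsets(texts, "UTF-8")

# The second character "ö" begins at byte 1 and occupies two bytes in UTF-8,
# so the third character starts at byte 3.
print(starts[0].numpy())  # [ 0  1  3  5  6  7  8  9 10]

# A character can be recovered from its byte offset with a BYTE-unit substr.
print(tf.strings.substr(texts[0], pos=1, len=2, unit="BYTE").numpy())  # b'\xc3\xb6'
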
Example #7
  def tokenize(self, text_input):
    """Performs basic word tokenization for BERT.

    Args:
      text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.

    Returns:
      A `RaggedTensor` of tokenized strings from text_input.
    """
    # lowercase and strip accents (if option is set)
    if self._lower_case:
      text_input = case_fold_utf8(text_input)
      text_input = normalize_utf8(text_input, "NFD")
      text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
    else:
      # utf8 normalization
      if self._normalization_form is not None:
        text_input = normalize_utf8(text_input, self._normalization_form)

    # Replace control and format characters with spaces.
    text_input = string_ops.regex_replace(text_input, r"\p{Cc}|\p{Cf}", " ")

    # Tokenize on Unicode script boundaries; Chinese and emoji characters are
    # re-split into individual code points further below.
    unicode_tokenizer = UnicodeScriptTokenizer(
        keep_whitespace=self._keep_whitespace)
    script_tokenized = unicode_tokenizer.tokenize(text_input)

    split_cond = self._should_split(script_tokenized)

    unicode_char_split = ragged_string_ops.unicode_split(
        script_tokenized, "UTF-8")
    unicode_split_tokens = array_ops.where(
        array_ops.squeeze(split_cond),
        y=array_ops.expand_dims(script_tokenized.values, axis=1),
        x=unicode_char_split.values)
    final_tokens = script_tokenized.with_flat_values(unicode_split_tokens)
    return final_tokens.merge_dims(-2, -1)
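
A rough standalone sketch of the same pipeline, assuming the case_fold_utf8, normalize_utf8, and UnicodeScriptTokenizer ops used above are the ones packaged publicly in tensorflow_text; the conditional CJK/emoji re-split via unicode_split is omitted here, and the sample input is made up:

import tensorflow as tf
import tensorflow_text as tf_text  # assumed public packaging of the ops above

def basic_tokenize(strings, lower_case=True):
  if lower_case:
    strings = tf_text.case_fold_utf8(strings)
    strings = tf_text.normalize_utf8(strings, "NFD")
    strings = tf.strings.regex_replace(strings, r"\p{Mn}", "")
  # Replace control/format characters with spaces, then split on Unicode
  # script boundaries (letters vs. punctuation, etc.).
  strings = tf.strings.regex_replace(strings, r"\p{Cc}|\p{Cf}", " ")
  return tf_text.UnicodeScriptTokenizer().tokenize(strings)

print(basic_tokenize(tf.constant([u"Güten Morgen!"])).to_list())
# [[b'guten', b'morgen', b'!']]
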
Example #8
 def testScalarSplit(self):
   text = constant_op.constant(u"仅今年前".encode("UTF-8"))
   chars = ragged_string_ops.unicode_split(text, "UTF-8")
   self.assertAllEqual(chars, [c.encode("UTF-8") for c in u"仅今年前"])
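
For a rank-0 input there is no batch dimension to make ragged, so the result appears to come back as a plain 1-D tensor of the characters' UTF-8 bytes; a tiny sketch with the assumed public wrapper:

import tensorflow as tf

chars = tf.strings.unicode_split(tf.constant(u"仅今".encode("UTF-8")), "UTF-8")
print(chars.numpy())  # [b'\xe4\xbb\x85' b'\xe4\xbb\x8a']
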
Example #9
 def testSplitWithDifferentEncodings(self, encoding, texts):
   expected = _nested_splitchars(texts, encoding)
   input_tensor = constant_op.constant(_nested_encode(texts, encoding))
   result = ragged_string_ops.unicode_split(input_tensor, encoding)
   self.assertAllEqual(expected, result)
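
The encoding argument only describes how the input bytes are read; the returned pieces are the corresponding substrings of the original bytes, so they stay in the input encoding. A small sketch, with the same public-wrapper assumption as above:

import tensorflow as tf

s = u"G\xf6"
print(tf.strings.unicode_split(s.encode("UTF-8"), "UTF-8").numpy())
# [b'G' b'\xc3\xb6']
print(tf.strings.unicode_split(s.encode("UTF-16-BE"), "UTF-16-BE").numpy())
# [b'\x00G' b'\x00\xf6']
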
Example #10
 def testExceptions(self, exception=None, message=None, **args):
   with self.assertRaisesRegex(exception, message):
     self.evaluate(ragged_string_ops.unicode_split(**args))
Example #11
 def testBasicSplit(self, texts, ragged_rank=None):
   input_tensor = ragged_factory_ops.constant_value(
       _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
   result = ragged_string_ops.unicode_split(input_tensor, "UTF-8")
   expected = _nested_splitchars(texts, "UTF-8")
   self.assertRaggedEqual(expected, result)
Example #12
 def testErrorModes(self, expected=None, **args):
   result = ragged_string_ops.unicode_split(**args)
   self.assertAllEqual(expected, result)
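
The error-mode behaviour parameterized above can be sketched directly, assuming the public signature takes errors= (and replacement_char=) keyword arguments:

import tensorflow as tf

bad = b"a\xedz"  # b"\xed" is a truncated UTF-8 sequence

# "replace" substitutes U+FFFD (encoded as b"\xef\xbf\xbd") for the bad byte;
# "ignore" drops it; "strict" would raise an InvalidArgumentError instead.
print(tf.strings.unicode_split(bad, "UTF-8", errors="replace").numpy())
# [b'a' b'\xef\xbf\xbd' b'z']
print(tf.strings.unicode_split(bad, "UTF-8", errors="ignore").numpy())
# [b'a' b'z']
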
Example #13
 def testScalarSplit(self):
   text = constant_op.constant(u"仅今年前".encode("UTF-8"))
   chars = ragged_string_ops.unicode_split(text, "UTF-8")
   self.assertAllEqual(chars, [c.encode("UTF-8") for c in u"仅今年前"])
Example #14
 def testVectorSplit(self):
   text = constant_op.constant([u"仅今年前".encode("UTF-8"), b"hello"])
   chars = ragged_string_ops.unicode_split(text, "UTF-8")
   expected_chars = [[c.encode("UTF-8") for c in u"仅今年前"],
                     [c.encode("UTF-8") for c in u"hello"]]
   self.assertAllEqual(chars, expected_chars)
Example #15
 def testSplitWithPaddedOutput(self, texts, expected, ragged_rank=None):
   input_tensor = ragged_factory_ops.constant_value(
       _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
   result = ragged_string_ops.unicode_split(
       input_tensor, "UTF-8").to_tensor(default_value="")
   self.assertAllEqual(np.array(expected, dtype=bytes), result)
Example #16
 def testErrorModes(self, expected=None, **args):
   result = ragged_string_ops.unicode_split(**args)
   self.assertRaggedEqual(expected, result)
Example #17
 def testVectorSplit(self):
   text = constant_op.constant([u"仅今年前".encode("UTF-8"), b"hello"])
   chars = ragged_string_ops.unicode_split(text, "UTF-8")
   expected_chars = [[c.encode("UTF-8") for c in u"仅今年前"],
                     [c.encode("UTF-8") for c in u"hello"]]
   self.assertRaggedEqual(chars, expected_chars)
Example #18
 def testExceptions(self, exception=None, message=None, **args):
   with self.assertRaisesRegex(exception, message):
     self.evaluate(ragged_string_ops.unicode_split(**args))
Example #19
 def testSplitWithDifferentEncodings(self, encoding, texts):
   expected = _nested_splitchars(texts, encoding)
   input_tensor = constant_op.constant(_nested_encode(texts, encoding))
   result = ragged_string_ops.unicode_split(input_tensor, encoding)
   self.assertRaggedEqual(expected, result)