Example #1
    def test_width_3_ragged_tensor_equivalence(self, test_case):
        input_tensor = tf.ragged.constant(test_case)
        tf_output = tf_text.ngrams(
            input_tensor, 3, reduction_type=tf_text.Reduction.STRING_JOIN)

        rank = input_tensor.shape.rank
        model = self._make_model(rank, 3, ragged_tensor=True, flex=False)
        interpreter = interpreter_wrapper.InterpreterWithCustomOps(
            model_content=model, custom_op_registerers=['AddNgramsCustomOp'])
        interpreter.resize_tensor_input(0, input_tensor.flat_values.shape)
        for r in range(rank - 1):
            interpreter.resize_tensor_input(
                r + 1, input_tensor.nested_row_splits[r].shape)
        interpreter.allocate_tensors()
        interpreter.set_tensor(interpreter.get_input_details()[0]['index'],
                               input_tensor.flat_values.numpy())
        for r in range(rank - 1):
            interpreter.set_tensor(
                interpreter.get_input_details()[r + 1]['index'],
                input_tensor.nested_row_splits[r].numpy())
        interpreter.invoke()
        tflite_output_values = interpreter.get_tensor(
            interpreter.get_output_details()[0]['index'])
        self.assertEqual(tf_output.flat_values.numpy().tolist(),
                         tflite_output_values.tolist())
        for i in range(rank - 1):
            tflite_output_cur_row_splits = interpreter.get_tensor(
                interpreter.get_output_details()[i + 1]['index'])
            self.assertEqual(tf_output.nested_row_splits[i].numpy().tolist(),
                             tflite_output_cur_row_splits.tolist())
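This test feeds the ragged input to the TFLite interpreter as separate tensors: the flat values at input 0, then one row-splits tensor per ragged dimension, and checks both the output values and the output splits against the tf_text reference.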
Example #2
    def __init__(self, all_text):
        # Tokenize the text, keeping byte offsets for each token.
        tokenizer = text.UnicodeScriptTokenizer()
        (self.tokens, self.offset_starts,
         self.offset_limits) = tokenizer.tokenize_with_offsets(all_text)
        # Join adjacent tokens into space-separated bigrams.
        self.bigrams = text.ngrams(self.tokens,
                                   2,
                                   reduction_type=text.Reduction.STRING_JOIN)
Example #3
    def testStringJoinReductionFailsWithImproperAxis(self):
        with self.assertRaisesRegex(
                tf.errors.InvalidArgumentError,
                r".*requires that ngrams' 'axis' parameter be -1."):
            _ = text.ngrams(data=[],
                            width=2,
                            axis=0,
                            reduction_type=text.Reduction.STRING_JOIN)
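STRING_JOIN concatenates the strings inside each window, which the op only supports along the innermost dimension; any other `axis`, as exercised here, is rejected with the error the regex matches.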
Example #4
    def testMeanReduction(self):
        test_data = tf.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]])
        ngrams_op = text.ngrams(test_data,
                                width=2,
                                axis=1,
                                reduction_type=text.Reduction.MEAN)
        expected_values = [[1.5, 2.5], [15.0, 25.0]]

        self.assertRaggedEqual(expected_values, ngrams_op)
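For dense inputs, Reduction.MEAN is just a sliding-window average. A minimal sketch of the same computation in plain TensorFlow (using tf.signal.frame to build the windows is my illustration, not something the test does):

frames = tf.signal.frame(test_data, frame_length=2, frame_step=1, axis=1)
print(tf.reduce_mean(frames, axis=-1))  # [[1.5, 2.5], [15.0, 25.0]]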
Example #5
    def testSumReduction(self):
        test_data = tf.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]])
        ngrams_op = text.ngrams(test_data,
                                width=2,
                                axis=1,
                                reduction_type=text.Reduction.SUM)
        expected_values = [[3.0, 5.0], [30.0, 50.0]]

        self.assertRaggedEqual(expected_values, ngrams_op)
Example #6
    def testReductionOnAxisWithInsufficientValuesReturnsEmptySet(self):
        test_data = tf.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]])
        ngrams_op = text.ngrams(test_data,
                                width=4,
                                axis=-1,
                                reduction_type=text.Reduction.SUM)
        expected_values = [[], []]

        self.assertRaggedEqual(expected_values, ngrams_op)
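Each row holds only three values, so no width-4 window fits and both rows reduce to empty lists.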
Example #7
    def testStringJoinReduction(self):
        test_data = tf.constant([["a", "b", "c"], ["dd", "ee", "ff"]])
        ngrams_op = text.ngrams(test_data,
                                width=2,
                                axis=-1,
                                reduction_type=text.Reduction.STRING_JOIN,
                                string_separator="|")
        expected_values = [["a|b", "b|c"], ["dd|ee", "ee|ff"]]

        self.assertRaggedEqual(expected_values, ngrams_op)
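The same bigrams can be built by hand for a dense 2-D string tensor by joining each column with its right-hand neighbor; a sketch for illustration only:

left, right = test_data[:, :-1], test_data[:, 1:]
print(tf.strings.join([left, right], separator="|"))
# [["a|b", "b|c"], ["dd|ee", "ee|ff"]] (as byte strings)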
Example #8
    def testRaggedSumReductionAxisZero(self):
        test_data = tf.ragged.constant([[1.0, 2.0, 3.0, 4.0],
                                        [10.0, 20.0, 30.0, 40.0]])
        ngrams_op = text.ngrams(test_data,
                                width=2,
                                axis=0,
                                reduction_type=text.Reduction.SUM)
        expected_values = [[11.0, 22.0, 33.0, 44.0]]

        self.assertRaggedEqual(expected_values, ngrams_op)
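With axis=0 the window slides across rows, so the two 4-element rows combine into a single row of elementwise sums.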
Example #9
    def testReductionOnInnerAxis(self):
        test_data = tf.constant([[[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]],
                                 [[4.0, 5.0, 6.0], [40.0, 50.0, 60.0]]])
        ngrams_op = text.ngrams(test_data,
                                width=2,
                                axis=-2,
                                reduction_type=text.Reduction.SUM)
        expected_values = [[[11.0, 22.0, 33.0]], [[44.0, 55.0, 66.0]]]

        self.assertRaggedEqual(expected_values, ngrams_op)
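Along axis -2 the window slides over the two inner rows of each batch element, producing one elementwise-summed row apiece.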
Example #10
    def testStringJoinReductionAxisZero(self):
        test_data = tf.constant(["a", "b", "c"])
        ngrams_op = text.ngrams(
            test_data,
            width=2,
            axis=-1,  # The -1 axis is the zero axis here.
            reduction_type=text.Reduction.STRING_JOIN,
            string_separator="|")
        expected_values = ["a|b", "b|c"]

        self.assertRaggedEqual(expected_values, ngrams_op)
Example #11
def make_data(sentences, window_size):
    # Tokenize on whitespace and build (window_size + 1)-grams, so each
    # n-gram carries `window_size` input words plus one target word.
    tokenizer = text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(sentences)
    ngrams = text.ngrams(tokens,
                         window_size + 1,
                         reduction_type=text.Reduction.STRING_JOIN)
    # Take the first n-gram of each sentence and split it back into words.
    segments = np.array(
        [x[0].decode("UTF-8").split(" ") for x in ngrams.to_list()])
    # All but the last word form the input; the last word is the target.
    # `np`, `word_index`, `n_class`, and Keras's `to_categorical` are
    # assumed to be imported/defined at module level.
    input_batch = [' '.join(x) for x in segments[:, 0:-1]]
    target_batch = to_categorical(np.vectorize(lambda x: word_index[x] - 1)(
        segments[:, -1]),
                                  n_class,
                                  dtype='float32')
    return input_batch, target_batch
Example #12
    def test_width_3_tensor_equivalence(self, test_case):
        input_tensor = tf.ragged.constant(test_case).to_tensor()
        tf_output = tf_text.ngrams(
            input_tensor, 3, reduction_type=tf_text.Reduction.STRING_JOIN)

        rank = input_tensor.shape.rank
        model = self._make_model(rank, 3, ragged_tensor=False, flex=False)
        interpreter = interpreter_wrapper.InterpreterWithCustomOps(
            model_content=model, custom_op_registerers=['AddNgramsCustomOp'])
        interpreter.resize_tensor_input(0, input_tensor.shape)
        interpreter.allocate_tensors()
        interpreter.set_tensor(interpreter.get_input_details()[0]['index'],
                               input_tensor.numpy())
        interpreter.invoke()
        tflite_output = interpreter.get_tensor(
            interpreter.get_output_details()[0]['index'])
        self.assertEqual(tf_output.numpy().tolist(), tflite_output.tolist())
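Because the input here is a padded dense tensor, there are no row-splits tensors to feed; resizing and setting input 0 alone is enough.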
Example #13
    def test_width_2_ragged_tensor_equivalence(self, test_case):
        input_tensor = tf.ragged.constant(test_case)
        tf_output = tf_text.ngrams(
            input_tensor, 2, reduction_type=tf_text.Reduction.STRING_JOIN)

        rank = input_tensor.shape.rank
        model = self._make_model(rank, 2, ragged_tensor=True, flex=False)
        interpreter = interpreter_wrapper.InterpreterWithCustomOps(
            model_content=model, custom_op_registerers=['AddNgramsCustomOp'])
        signature_fn = interpreter.get_signature_runner()
        signature_kwargs = {'values': input_tensor.flat_values.numpy()}
        for r in range(rank - 1):
            signature_kwargs[f'args_{r}'] = input_tensor.nested_row_splits[
                r].numpy()
        output = signature_fn(**signature_kwargs)
        tflite_output_values = output['output_0']
        self.assertEqual(tf_output.flat_values.numpy().tolist(),
                         tflite_output_values.tolist())
        for i in range(rank - 1):
            tflite_output_cur_row_splits = output[f'output_{i + 1}']
            self.assertEqual(tf_output.nested_row_splits[i].numpy().tolist(),
                             tflite_output_cur_row_splits.tolist())
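Unlike Example #1, which resizes and sets each input tensor by index, this test drives the converted model through its signature runner, passing the flat values and the row splits as named arguments and reading the outputs back by name.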
Example #14
# `tokenized_docs` continues from a tf.data pipeline like the one in
# Example #15: docs.map(lambda x: tokenizer.tokenize(x)).
iterator = iter(tokenized_docs)
print(next(iterator).to_list())
print(next(iterator).to_list())

tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(
    ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

# Is capitalized?
f1 = text.wordshape(tokens, text.WordShape.HAS_TITLE_CASE)
# Are all letters uppercased?
f2 = text.wordshape(tokens, text.WordShape.IS_UPPERCASE)
# Does the token contain punctuation?
f3 = text.wordshape(tokens, text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL)
# Is the token a number?
f4 = text.wordshape(tokens, text.WordShape.IS_NUMERIC_VALUE)

print(f1.to_list())
print(f2.to_list())
print(f3.to_list())
print(f4.to_list())

tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(
    ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

# Ngrams, in this case bi-gram (n = 2)
bigrams = text.ngrams(tokens, 2, reduction_type=text.Reduction.STRING_JOIN)

print(bigrams.to_list())
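Note that the single-token second element ('Sad☹') produces an empty bigram list, the same insufficient-values behavior that Example #6 tests.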
Example #15
def main():
    # Unicode
    docs = tf.constant([
        u'Everything not saved will be lost.'.encode('UTF-16-BE'),
        u'Sad☹'.encode('UTF-16-BE')
    ])
    _ = tf.strings.unicode_transcode(docs,
                                     input_encoding='UTF-16-BE',
                                     output_encoding='UTF-8')

    # Tokenization
    # WhitespaceTokenizer
    tokenizer = text.UnicodeScriptTokenizer()
    tokens = tokenizer.tokenize(
        ['everything not saved will be lost', u'Sad☹'.encode('UTF-8')])
    print(f'Tokens: {tokens.to_list()}')

    # Unicode split
    tokens = tf.strings.unicode_split([u"仅今年前".encode('UTF-8')], 'UTF-8')
    print(f'Tokens: {tokens.to_list()}')

    # Offsets
    tokenizer = text.UnicodeScriptTokenizer()
    (tokens, _, end_offsets) = tokenizer.tokenize_with_offsets(
        ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

    print(f'Tokens: {tokens.to_list()}')
    print(f'Offsets: {end_offsets.to_list()}')

    # TF.Data Example
    docs = tf.data.Dataset.from_tensor_slices([['Never tell me the odds.'],
                                               ["It's a trap!"]])
    tokenizer = text.WhitespaceTokenizer()
    tokenized_docs = docs.map(lambda x: tokenizer.tokenize(x))
    iterator = iter(tokenized_docs)
    print(f'First sentence tokens: {next(iterator).to_list()}')
    print(f'Second sentence tokens: {next(iterator).to_list()}')

    # Other Text Ops
    # Wordshape
    tokenizer = text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(
        ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

    # Is capitalized?
    f1 = text.wordshape(tokens, text.WordShape.HAS_TITLE_CASE)
    # Are all letters uppercased?
    f2 = text.wordshape(tokens, text.WordShape.IS_UPPERCASE)
    # Does the token contain punctuation?
    f3 = text.wordshape(tokens, text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL)
    # Is the token a number?
    f4 = text.wordshape(tokens, text.WordShape.IS_NUMERIC_VALUE)

    print(f'Is capitalized? {f1.to_list()}')
    print(f'Are all letters uppercased? {f2.to_list()}')
    print(f'Does the token contain punctuation? {f3.to_list()}')
    print(f'Is the token a number? {f4.to_list()}')

    # N-grams & Sliding Window
    tokenizer = text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(
        ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

    # Ngrams, in this case bi-gram (n = 2)
    bigrams = text.ngrams(tokens, 2, reduction_type=text.Reduction.STRING_JOIN)

    print(f'Bi-grams: {bigrams.to_list()}')
Example #16
    def testUnspecifiedReductionTypeFails(self):
        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
                                    r"reduction_type must be specified."):
            _ = text.ngrams(data=[], width=2, axis=0)
Example #17
    def func(data):
        # `width`, `axis`, `reduction_type`, `string_separator`, and `name`
        # are captured from the enclosing scope.
        return tf_text.ngrams(data, width, axis, reduction_type,
                              string_separator, name)
Example #18
    def ragged_func(values, *args):
        # Rebuild the RaggedTensor from its flat values and nested row
        # splits; the remaining ngrams parameters are captured from the
        # enclosing scope.
        ragged_tensor = tf.RaggedTensor.from_nested_row_splits(
            flat_values=values, nested_row_splits=args)
        return tf_text.ngrams(ragged_tensor, width, axis, reduction_type,
                              string_separator, name)
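A quick sketch of how a flat-values/row-splits pair round-trips back into a ragged value (the literals here are mine, not from the test):

values = tf.constant([b'a', b'b', b'c'])
splits = tf.constant([0, 2, 3], dtype=tf.int64)
print(tf.RaggedTensor.from_nested_row_splits(values, (splits,)))
# <tf.RaggedTensor [[b'a', b'b'], [b'c']]>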
Example #19
    def testBadReductionTypeFails(self):
        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
                                    r"reduction_type must be a Reduction."):
            _ = text.ngrams(data=[], width=2, axis=0, reduction_type="SUM")