예제 #1
0
 def testNGramsBadSizes(self):
     """ngrams must raise ValueError for ngram_range values outside 1 <= low <= high."""
     string_tensor = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
     # tf.string_split was removed from the TF2 namespace; use the compat.v1
     # API like the sibling ngram tests in this file.
     tokenized_tensor = tf.compat.v1.string_split(string_tensor, delimiter='')
     # Lower bound below 1 is rejected.
     with self.assertRaisesRegex(ValueError, 'Invalid ngram_range'):
         mappers.ngrams(tokenized_tensor, (0, 5), separator='')
     # Lower bound greater than the upper bound is rejected.
     with self.assertRaisesRegex(ValueError, 'Invalid ngram_range'):
         mappers.ngrams(tokenized_tensor, (6, 5), separator='')
예제 #2
0
 def testNGramsMinSizeNotOne(self):
     """With ngram_range=(2, 5), unigrams are excluded from the output."""
     string_tensor = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
     tokenized_tensor = tf.compat.v1.string_split(string_tensor,
                                                  delimiter='')
     output_tensor = mappers.ngrams(tokens=tokenized_tensor,
                                    ngram_range=(2, 5),
                                    separator='')
     # Rows 0 and 1 each yield 3 ngrams, row 2 yields 22; rows 3 and 4
     # yield none (a single char has no 2-grams, '' has no tokens at all).
     indices = ([[0, col] for col in range(3)] +
                [[1, col] for col in range(3)] +
                [[2, col] for col in range(22)])
     values = [
         b'ab', b'abc', b'bc', b'de', b'def', b'ef',
         b'fg', b'fgh', b'fghi', b'fghij', b'gh',
         b'ghi', b'ghij', b'ghijk', b'hi', b'hij',
         b'hijk', b'hijkl', b'ij', b'ijk', b'ijkl',
         b'ijklm', b'jk', b'jkl', b'jklm', b'kl',
         b'klm', b'lm'
     ]
     self.assertSparseOutput(expected_indices=indices,
                             expected_values=values,
                             expected_shape=[5, 22],
                             actual_sparse_tensor=output_tensor,
                             close_values=False)
예제 #3
0
 def testNGramsWithRepeatedTokensPerRow(self):
     """Unigram extraction keeps duplicate tokens within a row."""
     documents = tf.constant(
         ['Cats or dogs or bunnies', 'Cats not rats'])
     tokens = tf.strings.split(documents, sep=' ')
     unigrams = mappers.ngrams(tokens=tokens,
                               ngram_range=(1, 1),
                               separator=' ')
     with tf.compat.v1.Session():
         result = unigrams.eval()
         # Row 0 has five tokens, row 1 has three.
         expected_indices = ([[0, col] for col in range(5)] +
                             [[1, col] for col in range(3)])
         self.assertAllEqual(result.indices, expected_indices)
         # Note: the ngram "or" is represented twice for the first document.
         self.assertAllEqual(result.values, [
             b'Cats', b'or', b'dogs', b'or', b'bunnies', b'Cats', b'not',
             b'rats'
         ])
         self.assertAllEqual(result.dense_shape, [2, 5])
예제 #4
0
 def testNGrams(self):
   """ngram_range=(1, 5) emits every n-gram of length 1 through 5 per row."""
   with tf.compat.v1.Graph().as_default():
     input_strings = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
     char_tokens = tf.compat.v1.string_split(input_strings, delimiter='')
     ngrams_output = mappers.ngrams(
         tokens=char_tokens,
         ngram_range=(1, 5),
         separator='')
     # Rows 0 and 1 produce 6 ngrams each, row 2 produces 30, row 3
     # produces 1, and the empty row 4 produces none.
     row_lengths = [6, 6, 30, 1, 0]
     indices = [[row, col]
                for row, length in enumerate(row_lengths)
                for col in range(length)]
     self.assertSparseOutput(
         expected_indices=indices,
         expected_values=[
             b'a', b'ab', b'abc', b'b', b'bc', b'c', b'd', b'de', b'def', b'e',
             b'ef', b'f', b'f', b'fg', b'fgh', b'fghi', b'fghij', b'g', b'gh',
             b'ghi', b'ghij', b'ghijk', b'h', b'hi', b'hij', b'hijk', b'hijkl',
             b'i', b'ij', b'ijk', b'ijkl', b'ijklm', b'j', b'jk', b'jkl',
             b'jklm', b'k', b'kl', b'klm', b'l', b'lm', b'm', b'z'
         ],
         expected_shape=[5, 30],
         actual_sparse_tensor=ngrams_output,
         close_values=False)
예제 #5
0
 def testNGramsEmpty(self):
     """A single empty-string row yields no ngrams but keeps its row."""
     tokens = tf.strings.split(source=tf.constant(['']))
     empty_ngrams = mappers.ngrams(tokens, (1, 5), '')
     with tf.compat.v1.Session():
         result = empty_ngrams.eval()
         # No values at all, but the batch dimension of 1 is preserved.
         self.assertEqual((0, 2), result.indices.shape)
         self.assertAllEqual([1, 0], result.dense_shape)
         self.assertEqual(0, len(result.values))
예제 #6
0
 def testNGramsBagOfWordsEmpty(self):
   """Both ngrams and bag_of_words produce empty output for an empty batch."""
   empty_strings = tf.constant([], dtype=tf.string)
   char_tokens = tf.compat.v1.string_split(empty_strings, delimiter='')
   ngrams = mappers.ngrams(char_tokens, (1, 2), separator='')
   bow = mappers.bag_of_words(char_tokens, (1, 2), separator='')
   with tf.compat.v1.Session():
     # The two mappers should agree: no values and a fully empty shape.
     for sparse_result in (ngrams.eval(), bow.eval()):
       self.assertAllEqual(sparse_result.values, [])
       self.assertAllEqual(sparse_result.dense_shape, [0, 0])
예제 #7
0
 def testNGramsWithSpaceSeparator(self):
     """1- and 2-grams joined with spaces over whitespace-tokenized input."""
     documents = tf.constant(['One was Johnny', 'Two was a rat'])
     word_tokens = tf.strings.split(source=documents, sep=' ')
     ngrams_sp = mappers.ngrams(tokens=word_tokens,
                                ngram_range=(1, 2),
                                separator=' ')
     with tf.compat.v1.Session():
         result = ngrams_sp.eval()
         # Row 0 yields 5 ngrams (3 unigrams + 2 bigrams); row 1 yields 7.
         expected_indices = ([[0, col] for col in range(5)] +
                             [[1, col] for col in range(7)])
         self.assertAllEqual(result.indices, expected_indices)
         self.assertAllEqual(result.values, [
             b'One', b'One was', b'was', b'was Johnny', b'Johnny', b'Two',
             b'Two was', b'was', b'was a', b'a', b'a rat', b'rat'
         ])
         self.assertAllEqual(result.dense_shape, [2, 7])