def testNGramsBadSizes(self):
    """ngrams must raise for an ngram_range whose low end is < 1 or exceeds its high end."""
    string_tensor = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
    # Fix: use the compat.v1 API as the sibling tests do; bare tf.string_split
    # was removed in TF2 and this line would fail there.
    tokenized_tensor = tf.compat.v1.string_split(string_tensor, delimiter='')
    # Fix: assertRaisesRegexp is a deprecated alias (removed in Python 3.12);
    # assertRaisesRegex is the supported spelling.
    with self.assertRaisesRegex(ValueError, 'Invalid ngram_range'):
        mappers.ngrams(tokenized_tensor, (0, 5), separator='')
    with self.assertRaisesRegex(ValueError, 'Invalid ngram_range'):
        mappers.ngrams(tokenized_tensor, (6, 5), separator='')
def testNGramsMinSizeNotOne(self):
    """Character ngrams of sizes 2..5, i.e. with the minimum ngram size above one."""
    inputs = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
    tokens = tf.compat.v1.string_split(inputs, delimiter='')
    result = mappers.ngrams(tokens=tokens, ngram_range=(2, 5), separator='')
    # Rows 0 and 1 each contribute three ngrams; row 2 contributes twenty-two.
    indices = [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2]]
    indices += [[2, col] for col in range(22)]
    self.assertSparseOutput(
        expected_indices=indices,
        expected_values=[
            b'ab', b'abc', b'bc', b'de', b'def', b'ef', b'fg', b'fgh',
            b'fghi', b'fghij', b'gh', b'ghi', b'ghij', b'ghijk', b'hi',
            b'hij', b'hijk', b'hijkl', b'ij', b'ijk', b'ijkl', b'ijklm',
            b'jk', b'jkl', b'jklm', b'kl', b'klm', b'lm'
        ],
        expected_shape=[5, 22],
        actual_sparse_tensor=result,
        close_values=False)
def testNGramsWithRepeatedTokensPerRow(self):
    """A token repeated within one row yields one unigram entry per occurrence."""
    phrases = tf.constant(['Cats or dogs or bunnies', 'Cats not rats'])
    tokens = tf.strings.split(phrases, sep=' ')
    result = mappers.ngrams(tokens=tokens, ngram_range=(1, 1), separator=' ')
    with tf.compat.v1.Session():
        actual = result.eval()
    expected_indices = [[0, col] for col in range(5)]
    expected_indices += [[1, col] for col in range(3)]
    self.assertAllEqual(actual.indices, expected_indices)
    # "or" occurs twice in the first document and both occurrences are kept.
    self.assertAllEqual(actual.values, [
        b'Cats', b'or', b'dogs', b'or', b'bunnies', b'Cats', b'not', b'rats'
    ])
    self.assertAllEqual(actual.dense_shape, [2, 5])
def testNGrams(self):
    """Character ngrams of all sizes 1..5 over a batch of strings."""
    with tf.compat.v1.Graph().as_default():
        inputs = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
        tokens = tf.compat.v1.string_split(inputs, delimiter='')
        result = mappers.ngrams(
            tokens=tokens, ngram_range=(1, 5), separator='')
        # Per-row ngram counts: 6, 6, 30, 1, 0 (the empty string yields none).
        indices = []
        for row, count in enumerate([6, 6, 30, 1]):
            indices += [[row, col] for col in range(count)]
        self.assertSparseOutput(
            expected_indices=indices,
            expected_values=[
                b'a', b'ab', b'abc', b'b', b'bc', b'c', b'd', b'de', b'def',
                b'e', b'ef', b'f', b'f', b'fg', b'fgh', b'fghi', b'fghij',
                b'g', b'gh', b'ghi', b'ghij', b'ghijk', b'h', b'hi', b'hij',
                b'hijk', b'hijkl', b'i', b'ij', b'ijk', b'ijkl', b'ijklm',
                b'j', b'jk', b'jkl', b'jklm', b'k', b'kl', b'klm', b'l',
                b'lm', b'm', b'z'
            ],
            expected_shape=[5, 30],
            actual_sparse_tensor=result,
            close_values=False)
def testNGramsEmpty(self):
    """An empty input string produces no ngrams but keeps its row in the shape."""
    tokens = tf.strings.split(source=tf.constant(['']))
    result = mappers.ngrams(tokens, (1, 5), '')
    with tf.compat.v1.Session():
        actual = result.eval()
    self.assertEqual((0, 2), actual.indices.shape)
    self.assertAllEqual([1, 0], actual.dense_shape)
    self.assertEqual(0, len(actual.values))
def testNGramsBagOfWordsEmpty(self):
    """Both ngrams and bag_of_words return empty sparse tensors for empty input."""
    empty_strings = tf.constant([], dtype=tf.string)
    tokens = tf.compat.v1.string_split(empty_strings, delimiter='')
    ngrams_tensor = mappers.ngrams(tokens, (1, 2), separator='')
    bow_tensor = mappers.bag_of_words(tokens, (1, 2), separator='')
    with tf.compat.v1.Session():
        ngrams_result = ngrams_tensor.eval()
        bow_result = bow_tensor.eval()
    self.assertAllEqual(ngrams_result.values, [])
    self.assertAllEqual(bow_result.values, [])
    self.assertAllEqual(ngrams_result.dense_shape, [0, 0])
    self.assertAllEqual(bow_result.dense_shape, [0, 0])
def testNGramsWithSpaceSeparator(self):
    """Word unigrams and bigrams joined with a space separator."""
    phrases = tf.constant(['One was Johnny', 'Two was a rat'])
    tokens = tf.strings.split(source=phrases, sep=' ')
    result = mappers.ngrams(tokens=tokens, ngram_range=(1, 2), separator=' ')
    with tf.compat.v1.Session():
        actual = result.eval()
    # Row 0 has 5 ngrams (3 unigrams + 2 bigrams); row 1 has 7 (4 + 3).
    expected_indices = [[0, col] for col in range(5)]
    expected_indices += [[1, col] for col in range(7)]
    self.assertAllEqual(actual.indices, expected_indices)
    self.assertAllEqual(actual.values, [
        b'One', b'One was', b'was', b'was Johnny', b'Johnny', b'Two',
        b'Two was', b'was', b'was a', b'a', b'a rat', b'rat'
    ])
    self.assertAllEqual(actual.dense_shape, [2, 7])