# Imports assumed for this excerpt; the module paths follow the usual
# TensorFlow / tensorflow_text layout and may differ between releases.
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import test_util
from tensorflow.python.ops.ragged import ragged_factory_ops
from tensorflow.python.ops.ragged import ragged_test_util
from tensorflow.python.platform import test

from tensorflow_text.python.ops.whitespace_tokenizer import WhitespaceTokenizer


class WhitespaceTokenizerOpTest(ragged_test_util.RaggedTensorTestCase):

  def setUp(self):
    self.whitespace_tokenizer = WhitespaceTokenizer()

  def testScalar(self):
    with self.cached_session():
      with self.assertRaises(ValueError):
        self.whitespace_tokenizer.tokenize('I love Flume!')

  def testVectorSingleValue(self):
    test_value = constant_op.constant(['I love Flume!'])
    expected_tokens = [['I', 'love', 'Flume!']]
    expected_offset_starts = [[0, 2, 7]]
    expected_offset_limits = [[1, 6, 13]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testVector(self):
    test_value = constant_op.constant(['I love Flume!', 'Good day'])
    expected_tokens = [['I', 'love', 'Flume!'], ['Good', 'day']]
    expected_offset_starts = [[0, 2, 7], [0, 5]]
    expected_offset_limits = [[1, 6, 13], [4, 8]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testMatrix(self):
    test_value = constant_op.constant([['I love Flume!', 'Good day'],
                                       ['I don\'t want', 'no scrubs']])
    expected_tokens = [[['I', 'love', 'Flume!'], ['Good', 'day']],
                       [['I', 'don\'t', 'want'], ['no', 'scrubs']]]
    expected_offset_starts = [[[0, 2, 7], [0, 5]],
                              [[0, 2, 8], [0, 3]]]
    expected_offset_limits = [[[1, 6, 13], [4, 8]],
                              [[1, 7, 12], [2, 9]]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testMatrixRagged(self):
    test_value = ragged_factory_ops.constant(
        [['I love Flume!'], ['I don\'t want', 'no scrubs']])
    expected_tokens = [[['I', 'love', 'Flume!']],
                       [['I', 'don\'t', 'want'], ['no', 'scrubs']]]
    expected_offset_starts = [[[0, 2, 7]],
                              [[0, 2, 8], [0, 3]]]
    expected_offset_limits = [[[1, 6, 13]],
                              [[1, 7, 12], [2, 9]]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def test3DimMatrix(self):
    test_value = constant_op.constant([[['I love Flume!', 'Good day'],
                                        ['I don\'t want', 'no scrubs']],
                                       [['I love Zhu!', 'Good night'],
                                        ['A scrub is', 'a guy']]])
    expected_tokens = [[[['I', 'love', 'Flume!'], ['Good', 'day']],
                        [['I', 'don\'t', 'want'], ['no', 'scrubs']]],
                       [[['I', 'love', 'Zhu!'], ['Good', 'night']],
                        [['A', 'scrub', 'is'], ['a', 'guy']]]]
    expected_offset_starts = [[[[0, 2, 7], [0, 5]],
                               [[0, 2, 8], [0, 3]]],
                              [[[0, 2, 7], [0, 5]],
                               [[0, 2, 8], [0, 2]]]]
    expected_offset_limits = [[[[1, 6, 13], [4, 8]],
                               [[1, 7, 12], [2, 9]]],
                              [[[1, 6, 11], [4, 10]],
                               [[1, 7, 10], [1, 5]]]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def test3DimMatrixRagged(self):
    test_value = ragged_factory_ops.constant(
        [[['I love Flume!'], ['I don\'t want', 'no scrubs']],
         [['I love Zhu!', 'Good night']]])
    expected_tokens = [[[['I', 'love', 'Flume!']],
                        [['I', 'don\'t', 'want'], ['no', 'scrubs']]],
                       [[['I', 'love', 'Zhu!'], ['Good', 'night']]]]
    expected_offset_starts = [[[[0, 2, 7]],
                               [[0, 2, 8], [0, 3]]],
                              [[[0, 2, 7], [0, 5]]]]
    expected_offset_limits = [[[[1, 6, 13]],
                               [[1, 7, 12], [2, 9]]],
                              [[[1, 6, 11], [4, 10]]]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testInternationalization(self):
    test_value = constant_op.constant(
        [u"J'adore la灯".encode('utf8'), u'¡Escríbeme!'.encode('utf8')])
    expected_tokens = [['J\'adore', u'la灯'.encode('utf8')],
                       [u'¡Escríbeme!'.encode('utf8')]]
    expected_offset_starts = [[0, 8], [0]]
    expected_offset_limits = [[7, 13], [13]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testSpaceBoundaries(self):
    test_value = constant_op.constant([' Hook em! ', ' .Ok. \n Go '])
    expected_tokens = [['Hook', 'em!'], ['.Ok.', 'Go']]
    expected_offset_starts = [[1, 6], [1, 8]]
    expected_offset_limits = [[5, 9], [5, 10]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testOnlySpaces(self):
    test_value = constant_op.constant([' ', ' ', ' \t\r\n'])
    expected_tokens = [[], [], []]
    expected_offset_starts = [[], [], []]
    expected_offset_limits = [[], [], []]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testWhitespaceCharacters(self):
    test_value = constant_op.constant(
        ['things:\tcarpet\rdesk\nlamp\r\nlove'])
    expected_tokens = [['things:', 'carpet', 'desk', 'lamp', 'love']]
    expected_offset_starts = [[0, 8, 15, 20, 26]]
    expected_offset_limits = [[7, 14, 19, 24, 30]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testEmptyStringSingle(self):
    test_value = constant_op.constant([''])
    expected_tokens = [[]]
    expected_offset_starts = [[]]
    expected_offset_limits = [[]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testEmptyString(self):
    test_value = constant_op.constant(
        ['', 'I love Flume!', '', 'O hai', ''])
    expected_tokens = [[], ['I', 'love', 'Flume!'], [], ['O', 'hai'], []]
    expected_offset_starts = [[], [0, 2, 7], [], [0, 2], []]
    expected_offset_limits = [[], [1, 6, 13], [], [1, 5], []]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testEmptyDimensions(self):
    test_value = ragged_factory_ops.constant(
        [[['I love Flume!', 'Good day. . .'], []],
         [],
         [['I love Zhu!', 'Good night'], ['A scrub is', 'a guy']]])
    expected_tokens = [[[['I', 'love', 'Flume!'],
                         ['Good', 'day.', '.', '.']], []],
                       [],
                       [[['I', 'love', 'Zhu!'], ['Good', 'night']],
                        [['A', 'scrub', 'is'], ['a', 'guy']]]]
    expected_offset_starts = [[[[0, 2, 7], [0, 5, 10, 12]], []],
                              [],
                              [[[0, 2, 7], [0, 5]],
                               [[0, 2, 8], [0, 2]]]]
    expected_offset_limits = [[[[1, 6, 13], [4, 9, 11, 13]], []],
                              [],
                              [[[1, 6, 11], [4, 10]],
                               [[1, 7, 10], [1, 5]]]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)
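

# The class below is a second variant of WhitespaceTokenizerOpTest covering a
# newer API surface: scalar inputs are tokenized instead of raising, byte
# string literals are used throughout, offsets are reported as (starts, ends)
# rather than (starts, limits), and the Splitter interface
# (split / split_with_offsets) is exercised. Note that it reuses the class
# name of the test case above.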
class WhitespaceTokenizerOpTest(test_util.TensorFlowTestCase):

  def setUp(self):
    super(WhitespaceTokenizerOpTest, self).setUp()
    self.whitespace_tokenizer = WhitespaceTokenizer()

  def testScalar(self):
    test_value = constant_op.constant(b'I love Flume!')
    expected_tokens = [b'I', b'love', b'Flume!']
    expected_offset_starts = [0, 2, 7]
    expected_offset_ends = [1, 6, 13]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertAllEqual(tokens, expected_tokens)
    (tokens, starts, ends) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertAllEqual(tokens, expected_tokens)
    self.assertAllEqual(starts, expected_offset_starts)
    self.assertAllEqual(ends, expected_offset_ends)

  def testScalarWithSplit(self):
    # Similar to testScalar, but using split() calls (instead of tokenize()).
    # Should produce the same results as before. This tests that a
    # WhitespaceTokenizer is a valid Splitter.
    test_value = constant_op.constant(b'I love Flume!')
    expected_tokens = [b'I', b'love', b'Flume!']
    expected_offset_starts = [0, 2, 7]
    expected_offset_ends = [1, 6, 13]
    tokens = self.whitespace_tokenizer.split(test_value)
    self.assertAllEqual(tokens, expected_tokens)
    (tokens, starts, ends) = (
        self.whitespace_tokenizer.split_with_offsets(test_value))
    self.assertAllEqual(tokens, expected_tokens)
    self.assertAllEqual(starts, expected_offset_starts)
    self.assertAllEqual(ends, expected_offset_ends)

  def testVectorSingleValue(self):
    test_value = constant_op.constant([b'I love Flume!'])
    expected_tokens = [[b'I', b'love', b'Flume!']]
    expected_offset_starts = [[0, 2, 7]]
    expected_offset_ends = [[1, 6, 13]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertAllEqual(tokens, expected_tokens)
    (tokens, starts, ends) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertAllEqual(tokens, expected_tokens)
    self.assertAllEqual(starts, expected_offset_starts)
    self.assertAllEqual(ends, expected_offset_ends)

  def testVector(self):
    test_value = constant_op.constant([b'I love Flume!', b'Good day'])
    expected_tokens = [[b'I', b'love', b'Flume!'], [b'Good', b'day']]
    expected_offset_starts = [[0, 2, 7], [0, 5]]
    expected_offset_ends = [[1, 6, 13], [4, 8]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertAllEqual(tokens, expected_tokens)
    (tokens, starts, ends) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertAllEqual(tokens, expected_tokens)
    self.assertAllEqual(starts, expected_offset_starts)
    self.assertAllEqual(ends, expected_offset_ends)

  def testMatrix(self):
    test_value = constant_op.constant([[b'I love Flume!', b'Good day'],
                                       [b'I don\'t want', b'no scrubs']])
    expected_tokens = [[[b'I', b'love', b'Flume!'], [b'Good', b'day']],
                       [[b'I', b'don\'t', b'want'], [b'no', b'scrubs']]]
    expected_offset_starts = [[[0, 2, 7], [0, 5]],
                              [[0, 2, 8], [0, 3]]]
    expected_offset_ends = [[[1, 6, 13], [4, 8]],
                            [[1, 7, 12], [2, 9]]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertAllEqual(tokens, expected_tokens)
    (tokens, starts, ends) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertAllEqual(tokens, expected_tokens)
    self.assertAllEqual(starts, expected_offset_starts)
    self.assertAllEqual(ends, expected_offset_ends)

  def testMatrixRagged(self):
    test_value = ragged_factory_ops.constant(
        [[b'I love Flume!'], [b'I don\'t want', b'no scrubs']])
    expected_tokens = [[[b'I', b'love', b'Flume!']],
                       [[b'I', b'don\'t', b'want'], [b'no', b'scrubs']]]
    expected_offset_starts = [[[0, 2, 7]],
                              [[0, 2, 8], [0, 3]]]
    expected_offset_ends = [[[1, 6, 13]],
                            [[1, 7, 12], [2, 9]]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertAllEqual(tokens, expected_tokens)
    (tokens, starts, ends) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertAllEqual(tokens, expected_tokens)
    self.assertAllEqual(starts, expected_offset_starts)
    self.assertAllEqual(ends, expected_offset_ends)

  def test3DimMatrix(self):
    test_value = constant_op.constant([[[b'I love Flume!', b'Good day'],
                                        [b'I don\'t want', b'no scrubs']],
                                       [[b'I love Zhu!', b'Good night'],
                                        [b'A scrub is', b'a guy']]])
    expected_tokens = [[[[b'I', b'love', b'Flume!'], [b'Good', b'day']],
                        [[b'I', b'don\'t', b'want'], [b'no', b'scrubs']]],
                       [[[b'I', b'love', b'Zhu!'], [b'Good', b'night']],
                        [[b'A', b'scrub', b'is'], [b'a', b'guy']]]]
    expected_offset_starts = [[[[0, 2, 7], [0, 5]],
                               [[0, 2, 8], [0, 3]]],
                              [[[0, 2, 7], [0, 5]],
                               [[0, 2, 8], [0, 2]]]]
    expected_offset_ends = [[[[1, 6, 13], [4, 8]],
                             [[1, 7, 12], [2, 9]]],
                            [[[1, 6, 11], [4, 10]],
                             [[1, 7, 10], [1, 5]]]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertAllEqual(tokens, expected_tokens)
    (tokens, starts, ends) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertAllEqual(tokens, expected_tokens)
    self.assertAllEqual(starts, expected_offset_starts)
    self.assertAllEqual(ends, expected_offset_ends)

  def test3DimMatrixRagged(self):
    test_value = ragged_factory_ops.constant(
        [[[b'I love Flume!'], [b'I don\'t want', b'no scrubs']],
         [[b'I love Zhu!', b'Good night']]])
    expected_tokens = [[[[b'I', b'love', b'Flume!']],
                        [[b'I', b'don\'t', b'want'], [b'no', b'scrubs']]],
                       [[[b'I', b'love', b'Zhu!'], [b'Good', b'night']]]]
    expected_offset_starts = [[[[0, 2, 7]],
                               [[0, 2, 8], [0, 3]]],
                              [[[0, 2, 7], [0, 5]]]]
    expected_offset_ends = [[[[1, 6, 13]],
                             [[1, 7, 12], [2, 9]]],
                            [[[1, 6, 11], [4, 10]]]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertAllEqual(tokens, expected_tokens)
    (tokens, starts, ends) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertAllEqual(tokens, expected_tokens)
    self.assertAllEqual(starts, expected_offset_starts)
    self.assertAllEqual(ends, expected_offset_ends)

  def testInternationalization(self):
    test_value = constant_op.constant(
        [u"J'adore la灯".encode('utf8'), u'¡Escríbeme!'.encode('utf8')])
    expected_tokens = [[b'J\'adore', u'la灯'.encode('utf8')],
                       [u'¡Escríbeme!'.encode('utf8')]]
    expected_offset_starts = [[0, 8], [0]]
    expected_offset_ends = [[7, 13], [13]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertAllEqual(tokens, expected_tokens)
    (tokens, starts, ends) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertAllEqual(tokens, expected_tokens)
    self.assertAllEqual(starts, expected_offset_starts)
    self.assertAllEqual(ends, expected_offset_ends)

  def testSpaceBoundaries(self):
    test_value = constant_op.constant([b' Hook em! ', b' .Ok. \n Go '])
    expected_tokens = [[b'Hook', b'em!'], [b'.Ok.', b'Go']]
    expected_offset_starts = [[1, 6], [1, 8]]
    expected_offset_ends = [[5, 9], [5, 10]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertAllEqual(tokens, expected_tokens)
    (tokens, starts, ends) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertAllEqual(tokens, expected_tokens)
    self.assertAllEqual(starts, expected_offset_starts)
    self.assertAllEqual(ends, expected_offset_ends)

  def testOnlySpaces(self):
    test_value = constant_op.constant([b' ', b' ', b' \t\r\n'])
    expected_tokens = [[], [], []]
    expected_offset_starts = [[], [], []]
    expected_offset_ends = [[], [], []]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertAllEqual(tokens, expected_tokens)
    (tokens, starts, ends) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertAllEqual(tokens, expected_tokens)
    self.assertAllEqual(starts, expected_offset_starts)
    self.assertAllEqual(ends, expected_offset_ends)

  def testWhitespaceCharacters(self):
    test_value = constant_op.constant(
        [b'things:\tcarpet\rdesk\nlamp\r\nlove'])
    expected_tokens = [[b'things:', b'carpet', b'desk', b'lamp', b'love']]
    expected_offset_starts = [[0, 8, 15, 20, 26]]
    expected_offset_ends = [[7, 14, 19, 24, 30]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertAllEqual(tokens, expected_tokens)
    (tokens, starts, ends) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertAllEqual(tokens, expected_tokens)
    self.assertAllEqual(starts, expected_offset_starts)
    self.assertAllEqual(ends, expected_offset_ends)

  def testEmptyStringSingle(self):
    test_value = constant_op.constant([b''])
    expected_tokens = [[]]
    expected_offset_starts = [[]]
    expected_offset_ends = [[]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertAllEqual(tokens, expected_tokens)
    (tokens, starts, ends) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertAllEqual(tokens, expected_tokens)
    self.assertAllEqual(starts, expected_offset_starts)
    self.assertAllEqual(ends, expected_offset_ends)

  def testEmptyString(self):
    test_value = constant_op.constant(
        [b'', b'I love Flume!', b'', b'O hai', b''])
    expected_tokens = [[], [b'I', b'love', b'Flume!'], [], [b'O', b'hai'], []]
    expected_offset_starts = [[], [0, 2, 7], [], [0, 2], []]
    expected_offset_ends = [[], [1, 6, 13], [], [1, 5], []]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertAllEqual(tokens, expected_tokens)
    (tokens, starts, ends) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertAllEqual(tokens, expected_tokens)
    self.assertAllEqual(starts, expected_offset_starts)
    self.assertAllEqual(ends, expected_offset_ends)

  def testEmptyDimensions(self):
    test_value = ragged_factory_ops.constant(
        [[[b'I love Flume!', b'Good day. . .'], []],
         [],
         [[b'I love Zhu!', b'Good night'], [b'A scrub is', b'a guy']]])
    expected_tokens = [[[[b'I', b'love', b'Flume!'],
                         [b'Good', b'day.', b'.', b'.']], []],
                       [],
                       [[[b'I', b'love', b'Zhu!'], [b'Good', b'night']],
                        [[b'A', b'scrub', b'is'], [b'a', b'guy']]]]
    expected_offset_starts = [[[[0, 2, 7], [0, 5, 10, 12]], []],
                              [],
                              [[[0, 2, 7], [0, 5]],
                               [[0, 2, 8], [0, 2]]]]
    expected_offset_ends = [[[[1, 6, 13], [4, 9, 11, 13]], []],
                            [],
                            [[[1, 6, 11], [4, 10]],
                             [[1, 7, 10], [1, 5]]]]
    tokens = self.whitespace_tokenizer.tokenize(test_value)
    self.assertAllEqual(tokens, expected_tokens)
    (tokens, starts, ends) = (
        self.whitespace_tokenizer.tokenize_with_offsets(test_value))
    self.assertAllEqual(tokens, expected_tokens)
    self.assertAllEqual(starts, expected_offset_starts)
    self.assertAllEqual(ends, expected_offset_ends)
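

# Conventional test entry point; assumed here, since it is not part of the
# excerpt above.
if __name__ == '__main__':
  test.main()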