def test_numberize_with_token_tensorizer(self):
    """Numberize two rows with a TokenTensorizer and verify the token ids,
    sequence lengths, and character-offset token ranges it produces."""
    tensorizer = TokenTensorizer(text_column="text")
    self._initialize_tensorizer(tensorizer)

    rows = [{"text": "I want some coffee"}, {"text": "Turn it up"}]
    numberized = (tensorizer.numberize(row) for row in rows)

    # First row: "I want some coffee" -> 4 tokens with their char spans.
    tokens, seq_len, token_ranges = next(numberized)
    self.assertEqual([24, 0, 0, 0], tokens)
    self.assertEqual(4, seq_len)
    self.assertEqual([(0, 1), (2, 6), (7, 11), (12, 18)], token_ranges)

    # Second row: "Turn it up" -> 3 tokens with their char spans.
    tokens, seq_len, token_ranges = next(numberized)
    self.assertEqual([13, 47, 9], tokens)
    self.assertEqual(3, seq_len)
    self.assertEqual([(0, 4), (5, 7), (8, 10)], token_ranges)
def test_create_word_tensors(self):
    """Numberize two rows and verify the resulting token ids and lengths.

    NOTE(review): the original inlined the initialize()/send()/close()
    generator dance over self.data.train; replaced with the
    self._initialize_tensorizer helper the sibling numberize test uses,
    for consistency — assumes the helper performs that same init loop
    (confirm against the test base class).
    """
    tensorizer = TokenTensorizer(text_column="text")
    self._initialize_tensorizer(tensorizer)

    rows = [{"text": "I want some coffee"}, {"text": "Turn it up"}]
    tensors = (tensorizer.numberize(row) for row in rows)

    # First row: "I want some coffee" -> 4 tokens.
    tokens, seq_len = next(tensors)
    self.assertEqual([24, 0, 0, 0], tokens)
    self.assertEqual(4, seq_len)

    # Second row: "Turn it up" -> 3 tokens.
    tokens, seq_len = next(tensors)
    self.assertEqual([13, 47, 9], tokens)
    self.assertEqual(3, seq_len)