def test_create_word_character_tensors(self):
    """Batch-tensorize per-word character ids with padding and word lengths."""
    # Initialization is a no-op for this tensorizer, so we skip it.
    tensorizer = WordCharacterTensorizer(column="text")
    sentence_a = "I want some coffee"
    sentence_b = "Turn it up"

    def padded_ords(word, width):
        # Codepoints of `word`, right-padded with zeros to `width`.
        return [ord(ch) for ch in word] + [0] * (width - len(word))

    batch = [{"text": types.Text(sentence_a)}, {"text": types.Text(sentence_b)}]
    # The tokenizer lowercases, hence the lowercase expectations.
    expected = [
        [
            padded_ords("i", 6),
            padded_ords("want", 6),
            padded_ords("some", 6),
            padded_ords("coffee", 6),
        ],
        [
            padded_ords("turn", 6),
            padded_ords("it", 6),
            padded_ords("up", 6),
            padded_ords("", 6),
        ],
    ]
    expected_lens = [[1, 4, 4, 6], [4, 2, 2, 0]]
    chars, seq_lens = tensorizer.tensorize(
        tensorizer.numberize(row) for row in batch
    )
    self.assertIsInstance(chars, torch.LongTensor)
    self.assertIsInstance(seq_lens, torch.LongTensor)
    self.assertEqual((2, 4, 6), chars.size())
    self.assertEqual((2, 4), seq_lens.size())
    self.assertEqual(expected, chars.tolist())
    self.assertEqual(expected_lens, seq_lens.tolist())
def test_create_byte_tensors(self):
    """Numberize rows into raw character codepoints without lowercasing."""
    # ByteTensorizer.initialize() is a no-op, so no initialization here.
    tensorizer = ByteTensorizer(column="text", lower=False)
    first = "I want some coffee"
    second = "Turn it up"
    rows = [{"text": types.Text(first)}, {"text": types.Text(second)}]
    expected = [[ord(ch) for ch in first], [ord(ch) for ch in second]]
    numberized = (tensorizer.numberize(row) for row in rows)
    # Each row should yield (codepoints, length) matching its source string.
    for source, want in zip((first, second), expected):
        chars, seq_len = next(numberized)
        self.assertEqual(len(source), len(chars))
        self.assertEqual(want, chars)
        self.assertEqual(len(source), seq_len)
def test_create_word_tensors(self):
    """Build padded word-id tensors and sequence lengths for a batch."""
    tensorizer = WordTensorizer(column="text")
    # The initializer is a generator: prime it, feed every training row
    # so the vocabulary gets built, then close it.
    vocab_builder = tensorizer.initialize()
    vocab_builder.send(None)  # prime the generator
    for row in self.data.train:
        vocab_builder.send(row)
    vocab_builder.close()
    batch = [
        {"text": types.Text("I want some coffee")},
        {"text": types.Text("Turn it up")},
    ]
    tokens, seq_lens = tensorizer.create_training_tensors(batch)
    self.assertIsInstance(tokens, torch.LongTensor)
    self.assertIsInstance(seq_lens, torch.LongTensor)
    # Two rows, padded to the longer row's four tokens.
    self.assertEqual((2, 4), tokens.size())
    self.assertEqual((2,), seq_lens.size())
    # Token ids presumably come from the vocab built above — the exact
    # values depend on self.data.train's contents.
    self.assertEqual([[24, 0, 0, 0], [13, 47, 9, 1]], tokens.tolist())
    self.assertEqual([4, 3], seq_lens.tolist())
def test_create_character_tensors(self):
    """Batch-tensorize whole sentences as codepoints, zero-padded to the longest."""
    # CharacterTensorizer.initialize() is a no-op; skip it.
    tensorizer = CharacterTensorizer(column="text")
    long_text = "I want some coffee"
    short_text = "Turn it up"
    batch = [{"text": types.Text(long_text)}, {"text": types.Text(short_text)}]
    max_len = max(len(long_text), len(short_text))
    # Each row is its codepoints, right-padded with zeros to the longest row.
    expected = [
        [ord(ch) for ch in text] + [0] * (max_len - len(text))
        for text in (long_text, short_text)
    ]
    chars, seq_lens = tensorizer.create_training_tensors(batch)
    self.assertIsInstance(chars, torch.LongTensor)
    self.assertIsInstance(seq_lens, torch.LongTensor)
    self.assertEqual((2, max_len), chars.size())
    self.assertEqual((2,), seq_lens.size())
    self.assertEqual(expected, chars.tolist())
    self.assertEqual([len(long_text), len(short_text)], seq_lens.tolist())
def test_numberize_word_tensors(self):
    """Numberize rows one at a time using a vocab built from training data.

    NOTE(review): renamed from ``test_create_word_tensors`` — this source
    defines two methods with that name (the other exercises
    ``create_training_tensors``), and if they share a class the later
    definition silently shadows the earlier, so only one of them ever
    ran. The rename keeps the ``test_`` prefix, so discovery still works.
    """
    tensorizer = WordTensorizer(column="text")
    # The initializer is a generator: prime it, feed every training row
    # to build the vocabulary, then close it.
    init = tensorizer.initialize()
    init.send(None)  # prime the generator
    for row in self.data.train:
        init.send(row)
    init.close()
    rows = [
        {"text": types.Text("I want some coffee")},
        {"text": types.Text("Turn it up")},
    ]
    tensors = (tensorizer.numberize(row) for row in rows)
    # First row: four tokens; ids depend on the vocab built above.
    tokens, seq_len = next(tensors)
    self.assertEqual([24, 0, 0, 0], tokens)
    self.assertEqual(4, seq_len)
    # Second row: three tokens — numberize does not pad across rows.
    tokens, seq_len = next(tensors)
    self.assertEqual([13, 47, 9], tokens)
    self.assertEqual(3, seq_len)
def load_text(s):
    """Wrap a raw string in a ``types.Text`` value."""
    return types.Text(s)