def test_as_tensor_produces_integer_targets(self):
    """as_tensor on an indexed SequenceLabelField yields the integer label ids."""
    vocab = Dictionary()
    for tag in ("B", "I", "O"):
        vocab.add_token_to_namespace(tag, namespace='*labels')
    field = SequenceLabelField(["B", "I", "O", "O", "O"], self.text,
                               label_namespace="*labels")
    field.index(vocab)
    lengths = field.get_padding_lengths()
    actual = field.as_tensor(lengths).detach().cpu().numpy()
    # "B" -> 0, "I" -> 1, "O" -> 2 in insertion order.
    numpy.testing.assert_array_almost_equal(actual,
                                            numpy.array([0, 1, 2, 2, 2]))
def test_index_converts_field_correctly(self):
    """Indexing a SequenceLabelField maps each tag to its vocabulary id."""
    vocab = Dictionary()
    tag_to_id = {
        tag: vocab.add_token_to_namespace(tag, namespace='*labels')
        for tag in ("B", "I", "O")
    }
    tags = ["B", "I", "O", "O", "O"]
    field = SequenceLabelField(tags, self.text, label_namespace="*labels")
    field.index(vocab)
    assert field.indexed_labels == [tag_to_id[tag] for tag in tags]
def test_additional_start_and_end_tokens(self):
    """start_tokens / end_tokens are character-indexed around the real token."""
    vocab = Dictionary()
    # Ids are assigned in insertion order starting at 2
    # (0 = padding, 1 = OOV): A=2, s=3, e=4, n=5, t=6, c=7, <=8, >=9, /=10.
    for char in ["A", "s", "e", "n", "t", "c", "<", ">", "/"]:
        vocab.add_token_to_namespace(char, namespace='characters')
    indexer = TokenCharacterIndexer("characters",
                                    start_tokens=["<s>"],
                                    end_tokens=["</s>"],
                                    min_padding_length=1)
    indices = indexer.tokens_to_indices([Token("sentential")], vocab, "char")
    assert indices == {
        "char": [
            [8, 3, 9],                       # "<s>"
            [3, 4, 5, 6, 4, 5, 6, 1, 1, 1],  # "sentential" (i/a/l are OOV = 1)
            [8, 10, 3, 9],                   # "</s>"
        ]
    }
def test_label_field_can_index_with_vocab(self):
    """A LabelField indexed against a vocabulary converts to that label's id."""
    vocab = Dictionary()
    for label in ("entailment", "contradiction", "neutral"):
        vocab.add_token_to_namespace(label, namespace="labels")
    field = LabelField("entailment")
    field.index(vocab)
    tensor = field.as_tensor(field.get_padding_lengths())
    # "entailment" was the first label added, so its id is 0.
    assert tensor.item() == 0
def test_index_converts_field_correctly(self):
    """Indexing a TextField fills ``_indexed_tokens`` with vocabulary ids for
    every configured token indexer: word-level, character-level, or both.
    """
    vocab = Dictionary()
    sentence_index = vocab.add_token_to_namespace("sentence",
                                                  namespace='words')
    capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
    capital_a_char_index = vocab.add_token_to_namespace(
        "A", namespace='characters')
    s_index = vocab.add_token_to_namespace("s", namespace='characters')
    e_index = vocab.add_token_to_namespace("e", namespace='characters')
    n_index = vocab.add_token_to_namespace("n", namespace='characters')
    t_index = vocab.add_token_to_namespace("t", namespace='characters')
    c_index = vocab.add_token_to_namespace("c", namespace='characters')

    # Word-level indexing only: one id per token.
    field = TextField([Token(t) for t in ["A", "sentence"]],
                      {"words": SingleIdTokenIndexer(namespace="words")})
    field.index(vocab)
    # pylint: disable=protected-access
    assert field._indexed_tokens["words"] == [
        capital_a_index, sentence_index
    ]

    # Character-level indexing only: one list of character ids per token.
    field1 = TextField(
        [Token(t) for t in ["A", "sentence"]], {
            "characters":
            TokenCharacterIndexer(namespace="characters",
                                  min_padding_length=1)
        })
    field1.index(vocab)
    assert field1._indexed_tokens["characters"] == [[capital_a_char_index], [
        s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index
    ]]

    # Both indexers at once: each indexer fills its own key independently.
    field2 = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "words": SingleIdTokenIndexer(namespace="words"),
            "characters": TokenCharacterIndexer(namespace="characters",
                                                min_padding_length=1)
        })
    field2.index(vocab)
    assert field2._indexed_tokens["words"] == [
        capital_a_index, sentence_index
    ]
    assert field2._indexed_tokens["characters"] == [[capital_a_char_index], [
        s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index
    ]]
def test_token2indices_correct_characters(self):
    """Character indexing maps known characters to ids and unknowns to OOV (1)."""
    vocab = Dictionary()
    # Insertion order gives A=2, s=3, e=4, n=5, t=6, c=7 (0 = pad, 1 = OOV).
    for char in ["A", "s", "e", "n", "t", "c"]:
        vocab.add_token_to_namespace(char, namespace='characters')
    indexer = TokenCharacterIndexer("characters", min_padding_length=1)
    indices = indexer.tokens_to_indices([Token("sentential")], vocab, "char")
    # "sentential": i/a/l were never registered, so they index to 1.
    assert indices == {"char": [[3, 4, 5, 6, 4, 5, 6, 1, 1, 1]]}
def test_minimal_padding_length(self):
    """With min_padding_length=10 every token pads to at least 10 characters."""
    tokens = [Token(piece) for piece in "This is a test .".split(" ")]
    vocab = Dictionary()
    # Insertion order gives T=2, h=3, i=4, s=5, a=6, t=7, e=8, .=9,
    # y=10, m=11, n=12 (0 = padding, 1 = OOV).
    for char in ["T", "h", "i", "s", "a", "t", "e", ".", "y", "m", "n"]:
        vocab.add_token_to_namespace(char, namespace='characters')
    indexer = TokenCharacterIndexer("characters", min_padding_length=10)
    indices = indexer.tokens_to_indices(tokens, vocab, "char")

    # Find the longest padding length reported across all tokens.
    longest = 0
    for token_indices in indices["char"]:
        lengths = indexer.get_padding_lengths(token_indices)
        longest = max(longest, max(lengths.values()))

    padded = indexer.pad_token_sequence(
        indices,
        {"char": len(indices["char"])},
        {"num_token_characters": longest})
    assert padded == {
        "char": [[2, 3, 4, 5, 0, 0, 0, 0, 0, 0],
                 [4, 5, 0, 0, 0, 0, 0, 0, 0, 0],
                 [6, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                 [7, 8, 5, 7, 0, 0, 0, 0, 0, 0],
                 [9, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    }
class TestTextField(unittest.TestCase):
    """Tests for TextField: vocabulary counting, indexing, padding-length
    computation, and tensor conversion with single-id and character-level
    token indexers (used alone and in combination).
    """

    def setUp(self):
        # Shared vocabulary. Ids follow insertion order starting at 2
        # (the expected arrays below show 0 = padding and 1 = OOV):
        # words: sentence=2, A=3; characters: A=2, s=3, e=4, n=5, t=6, c=7.
        self.vocab = Dictionary()
        self.vocab.add_token_to_namespace("sentence", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='characters')
        self.vocab.add_token_to_namespace("s", namespace='characters')
        self.vocab.add_token_to_namespace("e", namespace='characters')
        self.vocab.add_token_to_namespace("n", namespace='characters')
        self.vocab.add_token_to_namespace("t", namespace='characters')
        self.vocab.add_token_to_namespace("c", namespace='characters')
        super(TestTextField, self).setUp()

    def test_field_counts_vocab_items_correctly(self):
        """count_vocab_items aggregates counts per namespace for each indexer."""
        # Word-level indexer only: counts whole tokens in the "words" namespace.
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        # Character-level indexer only: counts individual characters.
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        # Both indexers: each namespace is counted independently.
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "words": SingleIdTokenIndexer("words"),
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}

    def test_index_converts_field_correctly(self):
        """Indexing fills ``_indexed_tokens`` with vocabulary ids for every
        configured token indexer (word-level, character-level, or both).
        """
        vocab = Dictionary()
        sentence_index = vocab.add_token_to_namespace("sentence",
                                                      namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace(
            "A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        # Word-level indexing only.
        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [
            capital_a_index, sentence_index
        ]

        # Character-level indexing only: one list of character ids per token.
        field1 = TextField(
            [Token(t) for t in ["A", "sentence"]], {
                "characters":
                TokenCharacterIndexer(namespace="characters",
                                      min_padding_length=1)
            })
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [
            [capital_a_char_index],
            [
                s_index, e_index, n_index, t_index, e_index, n_index, c_index,
                e_index
            ]
        ]

        # Both indexers at once: each fills its own key independently.
        field2 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "words": SingleIdTokenIndexer(namespace="words"),
                "characters": TokenCharacterIndexer(namespace="characters",
                                                    min_padding_length=1)
            })
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [
            capital_a_index, sentence_index
        ]
        assert field2._indexed_tokens["characters"] == [
            [capital_a_char_index],
            [
                s_index, e_index, n_index, t_index, e_index, n_index, c_index,
                e_index
            ]
        ]

    def test_padding_lengths_are_computed_correctly(self):
        """get_padding_lengths reports per-indexer lengths plus num_tokens."""
        # Word indexer only.
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"words_length": 5, "num_tokens": 5}

        # Character indexer only: also reports the longest token ("sentence", 8).
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "num_tokens": 5,
            "characters_length": 5,
            "num_token_characters": 8
        }

        # Both indexers: union of the two sets of keys.
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1),
                "words": SingleIdTokenIndexer("words")
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "num_tokens": 5,
            "characters_length": 5,
            "words_length": 5,
            "num_token_characters": 8
        }

    def test_as_tensor_handles_words(self):
        """Word tensor holds one id per token; OOV tokens map to 1."""
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        # Only "sentence" (id 2) is in the vocab; the rest are OOV (1).
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].detach().cpu().numpy(),
            numpy.array([1, 1, 1, 2, 1]))

    def test_as_tensor_handles_longer_lengths(self):
        """Requesting a longer padding length right-pads the tensor with 0."""
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["words_length"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].detach().cpu().numpy(),
            numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]))

    def test_as_tensor_handles_characters(self):
        """Character tensor is (num_tokens, longest_token), zero-padded."""
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        # Rows: "This", "is", "a", "sentence", "."; unknown characters are 1.
        expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                                [1, 3, 0, 0, 0, 0, 0, 0],
                                                [1, 0, 0, 0, 0, 0, 0, 0],
                                                [3, 4, 5, 6, 4, 5, 7, 4],
                                                [1, 0, 0, 0, 0, 0, 0, 0]])
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].detach().cpu().numpy(),
            expected_character_array)

    def test_as_tensor_handles_characters_if_empty_field(self):
        """An empty TextField converts to an empty character tensor."""
        field = TextField(
            [],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array([])
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].detach().cpu().numpy(),
            expected_character_array)

    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        """Overriding every padding length pads both word and char tensors."""
        field = TextField(
            [Token(t) for t in ["a", "sentence", "."]],
            token_indexers={
                "words": SingleIdTokenIndexer("words"),
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["words_length"] = 5
        padding_lengths["characters_length"] = 5
        padding_lengths["num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].detach().cpu().numpy(),
            numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].detach().cpu().numpy(),
            numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    def test_printing_doesnt_crash(self):
        """Smoke test: printing a TextField must not raise."""
        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        print(field)

    def test_token_indexer_returns_dict(self):
        """An indexer that returns several keys contributes one tensor per key,
        each independently padded."""
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict": DictReturningTokenIndexer(),
                "words": SingleIdTokenIndexer("words"),
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            'token_ids_length': 5,
            'additional_key_length': 2,
            'words_length': 2,
            'characters_length': 2,
            'num_token_characters': 8,
            'num_tokens': 5,
        }
        # Bump each length to check every key pads to its own target.
        padding_lengths['token_ids_length'] = 7
        padding_lengths['additional_key_length'] = 3
        padding_lengths['words_length'] = 4
        padding_lengths['characters_length'] = 4
        tensors = field.as_tensor(padding_lengths)
        assert list(tensors['token_ids'].shape) == [7]
        assert list(tensors['additional_key'].shape) == [3]
        assert list(tensors['words'].shape) == [4]
        assert list(tensors['characters'].shape) == [4, 8]

    def test_token_padding_lengths_are_computed_correctly(self):
        """token_min_padding_length=3 raises every per-token length to >= 3
        and the extra positions are filled with padding (0)."""
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict":
                DictReturningTokenIndexer(token_min_padding_length=3),
                "words":
                SingleIdTokenIndexer("words", token_min_padding_length=3),
                "characters":
                TokenCharacterIndexer("characters",
                                      min_padding_length=1,
                                      token_min_padding_length=3)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            'token_ids_length': 5,
            'additional_key_length': 3,
            'words_length': 3,
            'characters_length': 3,
            'num_token_characters': 8,
            'num_tokens': 5,
        }
        tensors = field.as_tensor(padding_lengths)
        # The final (padded) position of each tensor is all zeros.
        assert tensors['additional_key'].tolist()[-1] == 0
        assert tensors['words'].tolist()[-1] == 0
        assert tensors['characters'].tolist()[-1] == [0] * 8

    def test_sequence_methods(self):
        """TextField supports len(), indexing, and iteration over its tokens."""
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]], {})
        assert len(field) == 5
        assert field[1].text == "is"
        assert [token.text
                for token in field] == ["This", "is", "a", "sentence", "."]