def test_count_vocab_items_case_insensitive(self):
        """Count characters of "Hello"/"hello" with a lowercasing tokenizer.

        The tokenizer is built with ``lowercase_characters=True``, so the
        expected counts contain a single ``"h"`` entry with count 2.
        """
        lowercasing_tokenizer = CharacterTokenizer(lowercase_characters=True)
        char_indexer = TokenCharacterIndexer("characters",
                                             lowercasing_tokenizer,
                                             min_padding_length=5)
        counts = defaultdict(lambda: defaultdict(int))
        for word in ("Hello", "hello"):
            char_indexer.count_vocab_items(Token(word), counts)

        expected_counts = {"h": 2, "e": 2, "l": 4, "o": 2}
        assert counts["characters"] == expected_counts
Пример #2
0
    def test_index_converts_field_correctly(self):
        """Check ``TextField.index`` results for word and character indexers.

        Three fields over the tokens "A" and "sentence" are indexed: words
        only, characters only, and both together. Each must store the ids
        that the vocabulary returned when the entries were registered.
        """
        vocab = Dictionary()
        # Keep the registration order of the entries unchanged:
        # words first ("sentence", then "A"), then the characters.
        word_ids = {
            "sentence":
            vocab.add_token_to_namespace("sentence", namespace='words'),
            "A":
            vocab.add_token_to_namespace("A", namespace='words'),
        }
        char_ids = {
            char: vocab.add_token_to_namespace(char, namespace='characters')
            for char in ("A", "s", "e", "n", "t", "c")
        }

        expected_words = [word_ids["A"], word_ids["sentence"]]
        expected_chars = [[char_ids["A"]],
                          [char_ids[char] for char in "sentence"]]

        # Word ids only.
        words_only = TextField(
            list(map(Token, ["A", "sentence"])),
            {"words": SingleIdTokenIndexer(namespace="words")})
        words_only.index(vocab)
        # pylint: disable=protected-access
        assert words_only._indexed_tokens["words"] == expected_words

        # Character ids only.
        chars_only = TextField(
            list(map(Token, ["A", "sentence"])),
            {
                "characters":
                TokenCharacterIndexer(namespace="characters",
                                      min_padding_length=1)
            })
        chars_only.index(vocab)
        assert chars_only._indexed_tokens["characters"] == expected_chars

        # Both indexers attached to the same field.
        combined = TextField(
            list(map(Token, ["A", "sentence"])),
            token_indexers={
                "words":
                SingleIdTokenIndexer(namespace="words"),
                "characters":
                TokenCharacterIndexer(namespace="characters",
                                      min_padding_length=1)
            })
        combined.index(vocab)
        assert combined._indexed_tokens["words"] == expected_words
        assert combined._indexed_tokens["characters"] == expected_chars
    def test_count_vocab_items_respect_casing(self):
        """Count characters of "Hello"/"hello" with the default tokenizer.

        No lowercasing tokenizer is passed here, and the expected counts
        keep ``"H"`` and ``"h"`` as separate entries.
        """
        char_indexer = TokenCharacterIndexer("characters",
                                             min_padding_length=5)
        counts = defaultdict(lambda: defaultdict(int))
        for word in ("Hello", "hello"):
            char_indexer.count_vocab_items(Token(word), counts)

        expected_counts = {"h": 1, "H": 1, "e": 2, "l": 4, "o": 2}
        assert counts["characters"] == expected_counts
    def test_as_array_produce_token_sequence(self):
        """Pad three character rows to 4 tokens of 10 characters each.

        The expected result right-pads every row with zeros and appends one
        all-zero row to reach the requested number of tokens.
        """
        char_indexer = TokenCharacterIndexer("characters",
                                             min_padding_length=1)
        unpadded = {'k': [[1, 2, 3, 4, 5], [1, 2, 3], [1]]}

        result = char_indexer.pad_token_sequence(
            unpadded,
            desired_num_tokens={'k': 4},
            padding_length={"num_token_characters": 10})

        # Expected rows are spelled out independently of ``unpadded``.
        expected_rows = [
            [1, 2, 3, 4, 5] + [0] * 5,
            [1, 2, 3] + [0] * 7,
            [1] + [0] * 9,
            [0] * 10,
        ]
        assert result == {'k': expected_rows}
    def test_token2indices_correct_characters(self):
        """Index the characters of "sentential" against a small vocabulary.

        The characters "i", "a" and "l" are never registered; the expected
        output maps each of them to id 1.
        """
        vocab = Dictionary()
        # Registered in this order; the expectation below relies on the ids
        # being "A" -> 2, "s" -> 3, "e" -> 4, "n" -> 5, "t" -> 6, "c" -> 7.
        for char in ("A", "s", "e", "n", "t", "c"):
            vocab.add_token_to_namespace(char, namespace='characters')

        char_indexer = TokenCharacterIndexer("characters",
                                             min_padding_length=1)
        result = char_indexer.tokens_to_indices([Token("sentential")], vocab,
                                                "char")

        assert result == {"char": [[3, 4, 5, 6, 4, 5, 6, 1, 1, 1]]}
Пример #6
0
 def test_token_padding_lengths_are_computed_correctly(self):
     """Check padding lengths when indexers set ``token_min_padding_length``.

     All three indexers use ``token_min_padding_length=3`` on a two-token
     field; the padded tensors must end with zero entries.
     """
     tokens = [Token(word) for word in ("A", "sentence")]
     indexers = {
         "field_with_dict":
         DictReturningTokenIndexer(token_min_padding_length=3),
         "words":
         SingleIdTokenIndexer("words", token_min_padding_length=3),
         "characters":
         TokenCharacterIndexer("characters",
                               min_padding_length=1,
                               token_min_padding_length=3),
     }
     text_field = TextField(tokens, token_indexers=indexers)
     text_field.index(self.vocab)

     lengths = text_field.get_padding_lengths()
     expected_lengths = {
         'token_ids_length': 5,
         'additional_key_length': 3,
         'words_length': 3,
         'characters_length': 3,
         'num_token_characters': 8,
         'num_tokens': 5,
     }
     assert lengths == expected_lengths

     # The last position of each padded output must be padding (zeros).
     padded = text_field.as_tensor(lengths)
     assert padded['additional_key'].tolist()[-1] == 0
     assert padded['words'].tolist()[-1] == 0
     assert padded['characters'].tolist()[-1] == [0] * 8
Пример #7
0
 def test_token_indexer_returns_dict(self):
     """Check padding lengths and tensor shapes with a dict-returning indexer.

     After verifying the computed padding lengths, four of them are
     enlarged by hand and the resulting tensor shapes are checked.
     """
     tokens = [Token(word) for word in ("A", "sentence")]
     indexers = {
         "field_with_dict": DictReturningTokenIndexer(),
         "words": SingleIdTokenIndexer("words"),
         "characters": TokenCharacterIndexer("characters",
                                             min_padding_length=1),
     }
     text_field = TextField(tokens, token_indexers=indexers)
     text_field.index(self.vocab)

     lengths = text_field.get_padding_lengths()
     assert lengths == {
         'token_ids_length': 5,
         'additional_key_length': 2,
         'words_length': 2,
         'characters_length': 2,
         'num_token_characters': 8,
         'num_tokens': 5,
     }

     # Request more padding than the computed minimum.
     overrides = {
         'token_ids_length': 7,
         'additional_key_length': 3,
         'words_length': 4,
         'characters_length': 4,
     }
     for key, value in overrides.items():
         lengths[key] = value

     padded = text_field.as_tensor(lengths)
     expected_shapes = {
         'token_ids': [7],
         'additional_key': [3],
         'words': [4],
         'characters': [4, 8],
     }
     for name, shape in expected_shapes.items():
         assert list(padded[name].shape) == shape
Пример #8
0
    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        """Tensors are zero-padded when larger padding lengths are requested.

        The three-token field is padded to 5 tokens and 10 characters per
        token; the extra positions must be zeros.
        """
        tokens = [Token(word) for word in ("a", "sentence", ".")]
        text_field = TextField(
            tokens,
            token_indexers={
                "words":
                SingleIdTokenIndexer("words"),
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        text_field.index(self.vocab)

        lengths = text_field.get_padding_lengths()
        # Enlarge the computed lengths before building the tensors.
        lengths["words_length"] = 5
        lengths["characters_length"] = 5
        lengths["num_token_characters"] = 10
        tensors = text_field.as_tensor(lengths)

        word_array = tensors["words"].detach().cpu().numpy()
        expected_words = numpy.array([1, 2, 1, 0, 0])
        numpy.testing.assert_array_almost_equal(word_array, expected_words)

        char_array = tensors["characters"].detach().cpu().numpy()
        expected_chars = numpy.array([
            [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
            [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        ])
        numpy.testing.assert_array_almost_equal(char_array, expected_chars)
    def test_minimal_padding_length(self):
        """With ``min_padding_length=10`` every padded row has 10 entries.

        The longest token in the sentence has only four characters, yet the
        expected padded rows all have length 10.
        """
        tokens = [Token(word) for word in "This is a test .".split(" ")]
        vocab = Dictionary()
        # Registered in this order; the expectation below relies on ids
        # 2 ("T") through 12 ("n"), with 0 used as the padding value.
        for char in ("T", "h", "i", "s", "a", "t", "e", ".", "y", "m", "n"):
            vocab.add_token_to_namespace(char, namespace='characters')

        char_indexer = TokenCharacterIndexer("characters",
                                             min_padding_length=10)
        indexed = char_indexer.tokens_to_indices(tokens, vocab, "char")

        # Largest padding length reported for any token (0 if no tokens).
        longest = max([0] + [
            max(char_indexer.get_padding_lengths(token_chars).values())
            for token_chars in indexed["char"]
        ])
        padded = char_indexer.pad_token_sequence(
            indexed, {"char": len(indexed["char"])},
            {"num_token_characters": longest})

        expected_rows = [
            [2, 3, 4, 5, 0, 0, 0, 0, 0, 0],
            [4, 5, 0, 0, 0, 0, 0, 0, 0, 0],
            [6, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [7, 8, 5, 7, 0, 0, 0, 0, 0, 0],
            [9, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        ]
        assert padded == {"char": expected_rows}
Пример #10
0
    def test_padding_lengths_are_computed_correctly(self):
        """Check ``get_padding_lengths`` for word, character and mixed fields.

        The same five-token sentence is indexed three times with different
        indexer combinations and the reported padding lengths are compared.
        """
        words = ["This", "is", "a", "sentence", "."]

        # Word indexer only.
        word_field = TextField(
            [Token(word) for word in words],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        word_field.index(self.vocab)
        assert word_field.get_padding_lengths() == {
            "words_length": 5,
            "num_tokens": 5
        }

        # Character indexer only.
        char_field = TextField(
            [Token(word) for word in words],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        char_field.index(self.vocab)
        assert char_field.get_padding_lengths() == {
            "num_tokens": 5,
            "characters_length": 5,
            "num_token_characters": 8
        }

        # Character and word indexers together.
        mixed_field = TextField(
            [Token(word) for word in words],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1),
                "words":
                SingleIdTokenIndexer("words")
            })
        mixed_field.index(self.vocab)
        assert mixed_field.get_padding_lengths() == {
            "num_tokens": 5,
            "characters_length": 5,
            "words_length": 5,
            "num_token_characters": 8
        }
Пример #11
0
 def test_as_tensor_handles_characters_if_empty_field(self):
     """A field with no tokens yields a character tensor equal to an empty array."""
     char_indexer = TokenCharacterIndexer("characters", min_padding_length=1)
     empty_field = TextField([],
                             token_indexers={"characters": char_indexer})
     empty_field.index(self.vocab)

     tensors = empty_field.as_tensor(empty_field.get_padding_lengths())
     actual = tensors["characters"].detach().cpu().numpy()
     numpy.testing.assert_array_almost_equal(actual, numpy.array([]))
    def test_additional_start_and_end_tokens(self):
        """Start and end tokens are indexed character by character.

        With ``start_tokens=["<s>"]`` and ``end_tokens=["</s>"]`` the
        expected output has one extra row before and one after the token.
        """
        vocab = Dictionary()
        # Registered in this order; the expectation below relies on ids
        # 2 ("A") through 10 ("/").
        for char in ("A", "s", "e", "n", "t", "c", "<", ">", "/"):
            vocab.add_token_to_namespace(char, namespace='characters')

        char_indexer = TokenCharacterIndexer("characters",
                                             start_tokens=["<s>"],
                                             end_tokens=["</s>"],
                                             min_padding_length=1)
        result = char_indexer.tokens_to_indices([Token("sentential")], vocab,
                                                "char")

        start_row = [8, 3, 9]  # "<s>"
        token_row = [3, 4, 5, 6, 4, 5, 6, 1, 1, 1]  # "sentential"
        end_row = [8, 10, 3, 9]  # "</s>"
        assert result == {"char": [start_row, token_row, end_row]}
Пример #13
0
 def test_as_tensor_handles_characters(self):
     """Character tensor of a five-token sentence matches the expected ids.

     Every row is padded with zeros to the computed character length (8).
     """
     tokens = [Token(word) for word in ("This", "is", "a", "sentence", ".")]
     char_indexer = TokenCharacterIndexer("characters", min_padding_length=1)
     text_field = TextField(tokens,
                            token_indexers={"characters": char_indexer})
     text_field.index(self.vocab)

     tensors = text_field.as_tensor(text_field.get_padding_lengths())
     actual = tensors["characters"].detach().cpu().numpy()
     expected = numpy.array([
         [1, 1, 1, 3, 0, 0, 0, 0],
         [1, 3, 0, 0, 0, 0, 0, 0],
         [1, 0, 0, 0, 0, 0, 0, 0],
         [3, 4, 5, 6, 4, 5, 7, 4],
         [1, 0, 0, 0, 0, 0, 0, 0],
     ])
     numpy.testing.assert_array_almost_equal(actual, expected)
Пример #14
0
    def test_field_counts_vocab_items_correctly(self):
        """Check ``count_vocab_items`` for word, character and mixed fields.

        The same sentence is counted three times with different indexer
        combinations; each run starts from a fresh counter.
        """
        words = ["This", "is", "a", "sentence", "."]
        expected_word_counts = {
            "This": 1,
            "is": 1,
            "a": 1,
            "sentence": 1,
            ".": 1,
        }
        expected_char_counts = {
            "T": 1,
            "h": 1,
            "i": 2,
            "s": 3,
            "a": 1,
            "e": 3,
            "n": 2,
            "t": 1,
            "c": 1,
            ".": 1,
        }

        # Word indexer only: only the "words" namespace may be touched.
        word_field = TextField(
            [Token(word) for word in words],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        counts = defaultdict(lambda: defaultdict(int))
        word_field.count_vocab_items(counts)

        for word, expected in expected_word_counts.items():
            assert counts["words"][word] == expected
        assert list(counts.keys()) == ["words"]

        # Character indexer only: only "characters" may be touched.
        char_field = TextField(
            [Token(word) for word in words],
            token_indexers={
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        counts = defaultdict(lambda: defaultdict(int))
        char_field.count_vocab_items(counts)

        for char, expected in expected_char_counts.items():
            assert counts["characters"][char] == expected
        assert list(counts.keys()) == ["characters"]

        # Both indexers: both namespaces are populated.
        mixed_field = TextField(
            [Token(word) for word in words],
            token_indexers={
                "words":
                SingleIdTokenIndexer("words"),
                "characters":
                TokenCharacterIndexer("characters", min_padding_length=1)
            })
        counts = defaultdict(lambda: defaultdict(int))
        mixed_field.count_vocab_items(counts)

        for char, expected in expected_char_counts.items():
            assert counts["characters"][char] == expected
        for word, expected in expected_word_counts.items():
            assert counts["words"][word] == expected
        assert set(counts.keys()) == {"words", "characters"}