Example #1
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
                "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
                "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
                "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
                "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
                }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
                'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
                'embedding_dim': 5
                })
        with pytest.raises(ValueError, message="No ValueError when pretrained_file is a multi-file archive"):
            Embedding.from_params(vocab, params)

        for ext in ['.zip', '.tar.gz']:
            archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
            file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
            params = Params({
                    'pretrained_file': file_uri,
                    'embedding_dim': 5
                    })
            embeddings = Embedding.from_params(vocab, params).weight.data
            for tok, vec in token2vec.items():
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
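For context, the file URI exercised above combines an archive path with a member path; a minimal sketch of the assumed format (AllenNLP's `format_embeddings_file_uri` from `allennlp.modules.token_embedders.embedding`):

from allennlp.modules.token_embedders.embedding import format_embeddings_file_uri

# Produces "(<archive path>)#<path inside archive>", which Embedding.from_params
# parses in order to read a single embeddings file out of a multi-file archive.
uri = format_embeddings_file_uri('multi-file-archive.zip', 'folder/fake_embeddings.5d.txt')
assert uri == '(multi-file-archive.zip)#folder/fake_embeddings.5d.txt'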
Example #2
    def test_dry_run_without_extension(self):
        existing_serialization_dir = self.TEST_DIR / 'existing'
        extended_serialization_dir = self.TEST_DIR / 'extended'
        existing_vocab_path = existing_serialization_dir / 'vocabulary'
        extended_vocab_path = extended_serialization_dir / 'vocabulary'

        vocab = Vocabulary()
        # If extend is False, it is the user's responsibility to make sure that dataset
        # instances will be indexable by the provided vocabulary. At a minimum, @@UNKNOWN@@
        # should be present in any namespace that could see OOV entries during indexing.
        # The `tokens` namespace will see new words, but it has an @@UNKNOWN@@ token;
        # the `labels` namespace does not, so 'N' and 'V' must be added upfront.
        vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
        vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
        vocab.add_token_to_namespace('N', namespace='labels')
        vocab.add_token_to_namespace('V', namespace='labels')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = existing_vocab_path
        self.params['vocabulary']['extend'] = False
        dry_run_from_params(self.params, extended_serialization_dir)

        with open(extended_vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == '@@UNKNOWN@@'
        assert tokens[1] == 'some_weird_token_1'
        assert tokens[2] == 'some_weird_token_2'
        assert len(tokens) == 3
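The comment in the test is the key behavior: a padded namespace like `tokens` falls back to `@@UNKNOWN@@` for out-of-vocabulary words, while a non-padded namespace like `labels` has no fallback. A minimal sketch, assuming AllenNLP's default `Vocabulary` settings:

vocab = Vocabulary()  # '*labels' is among the default non_padded_namespaces
vocab.add_token_to_namespace('word', namespace='tokens')
vocab.add_token_to_namespace('N', namespace='labels')
# 'tokens' is padded: index 0 is padding, index 1 is @@UNKNOWN@@, so OOV
# lookups fall back to index 1.
assert vocab.get_token_index('unseen', namespace='tokens') == 1
# 'labels' has no @@UNKNOWN@@, so an unseen label raises KeyError,
# which is why 'N' and 'V' are added upfront in the test above.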
Example #3
def get_vocab(word2freq, max_v_sizes):
    '''Build a vocabulary from word frequencies, keeping the max_v_sizes['word'] most frequent words.'''
    vocab = Vocabulary(counter=None, max_vocab_size=max_v_sizes['word'])
    words_by_freq = [(word, freq) for word, freq in word2freq.items()]
    words_by_freq.sort(key=lambda x: x[1], reverse=True)
    for word, _ in words_by_freq[:max_v_sizes['word']]:
        vocab.add_token_to_namespace(word, 'tokens')
    log.info("\tFinished building vocab. Using %d words", vocab.get_vocab_size('tokens'))
    return vocab
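A hypothetical usage sketch of `get_vocab`, with made-up frequencies:

word2freq = {'the': 100, 'cat': 3, 'sat': 2, 'rug': 1}
# Keeps only the 3 most frequent words in the 'tokens' namespace
# (padding and @@UNKNOWN@@ entries are added by Vocabulary itself).
vocab = get_vocab(word2freq, max_v_sizes={'word': 3})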
Example #4
    def test_token_to_indices_uses_ner_tags(self):
        tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
        vocab.add_token_to_namespace('ORG', namespace='ner_tags')
        indexer = NerTagIndexer()
        assert indexer.token_to_indices(tokens[1], vocab) == person_index
        assert indexer.token_to_indices(tokens[-1], vocab) == none_index
Example #5
    def test_as_tensor_produces_integer_targets(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("B", namespace='*labels')
        vocab.add_token_to_namespace("I", namespace='*labels')
        vocab.add_token_to_namespace("O", namespace='*labels')

        tags = ["B", "I", "O", "O", "O"]
        sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels")
        sequence_label_field.index(vocab)
        padding_lengths = sequence_label_field.get_padding_lengths()
        tensor = sequence_label_field.as_tensor(padding_lengths).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 1, 2, 2, 2]))
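Note that "B", "I", and "O" index to 0, 1, and 2 here: `*labels` matches AllenNLP's default non-padded namespace patterns, so the namespace has no padding or @@UNKNOWN@@ entries and indices start at 0, which is what makes the expected tensor `[0, 1, 2, 2, 2]`.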
Example #6
    def test_get_embedding_layer_uses_correct_embedding_dim(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace('word1')
        vocab.add_token_to_namespace('word2')
        embeddings_filename = self.TEST_DIR + "embeddings.gz"
        with gzip.open(embeddings_filename, 'wb') as embeddings_file:
            embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write("word2 0.1 0.4 -4.0\n".encode('utf-8'))
        embedding_weights = _read_pretrained_embedding_file(embeddings_filename, 3, vocab)
        assert tuple(embedding_weights.size()) == (4, 3)  # 4 because of padding and OOV
        with pytest.raises(ConfigurationError):
            _read_pretrained_embedding_file(embeddings_filename, 4, vocab)
Example #7
    def test_token_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
        cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
        indexer = PosTagIndexer(coarse_tags=True)
        assert indexer.token_to_indices(tokens[1], vocab) == verb_index
        assert indexer.token_to_indices(tokens[-1], vocab) == none_index
        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.token_to_indices(tokens[1], vocab) == cop_index
Example #8
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        b_index = vocab.add_token_to_namespace("B", namespace='*labels')
        i_index = vocab.add_token_to_namespace("I", namespace='*labels')
        o_index = vocab.add_token_to_namespace("O", namespace='*labels')

        tags = ["B", "I", "O", "O", "O"]
        sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels")
        sequence_label_field.index(vocab)

        # pylint: disable=protected-access
        assert sequence_label_field._indexed_labels == [b_index, i_index, o_index, o_index, o_index]
Example #9
    def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        vocab.add_token_to_namespace("word2")
        embeddings_filename = self.TEST_DIR + "embeddings.gz"
        with gzip.open(embeddings_filename, 'wb') as embeddings_file:
            embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
        params = Params({
                'pretrained_file': embeddings_filename,
                'embedding_dim': 3,
                })
        embedding_layer = Embedding.from_params(vocab, params)
        word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
        assert not numpy.allclose(word_vector.numpy(), numpy.array([0.0, 0.0, 0.0]))
Example #10
class TestDataset(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this")
        self.vocab.add_token_to_namespace("is")
        self.vocab.add_token_to_namespace("a")
        self.vocab.add_token_to_namespace("sentence")
        self.vocab.add_token_to_namespace(".")
        self.token_indexer = {"tokens": SingleIdTokenIndexer()}
        self.instances = self.get_instances()
        super(TestDataset, self).setUp()

    def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({"tag": (LabelField(1, skip_indexing=True))})
        instance2 = Instance({"words": TextField([Token("hello")], {})})
        with pytest.raises(ConfigurationError):
            _ = Batch([instance1, instance2])

    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {"text1": {"num_tokens": 5, "tokens_length": 5},
                                   "text2": {"num_tokens": 6, "tokens_length": 6}}

    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
        text2 = tensors["text2"]["tokens"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                    [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                    [2, 3, 1, 0, 0, 0]]))

    def get_instances(self):
        field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
                           self.token_indexer)
        field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                           self.token_indexer)
        instances = [Instance({"text1": field1, "text2": field2}),
                     Instance({"text1": field3, "text2": field4})]
        return instances
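For the expected arrays above: index 0 is padding, index 1 is @@UNKNOWN@@, and the five tokens added in setUp get indices 2-6. "here" and "different" were never added, so they map to 1, and the shorter `field4` text is zero-padded out to the batch maximum of 6 tokens.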
Example #11
    def test_read_hdf5_raises_on_invalid_shape(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        embeddings_filename = self.TEST_DIR + "embeddings.hdf5"
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
        with h5py.File(embeddings_filename, 'w') as fout:
            _ = fout.create_dataset(
                    'embedding', embeddings.shape, dtype='float32', data=embeddings
            )

        params = Params({
                'pretrained_file': embeddings_filename,
                'embedding_dim': 5,
                })
        with pytest.raises(ConfigurationError):
            _ = Embedding.from_params(vocab, params)
Example #12
    def test_adjacency_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("a", namespace="labels")
        vocab.add_token_to_namespace("b", namespace="labels")
        vocab.add_token_to_namespace("c", namespace="labels")

        labels = ["a", "b"]
        indices = [(0, 1), (2, 1)]
        adjacency_field = AdjacencyField(indices, self.text, labels)
        adjacency_field.index(vocab)
        tensor = adjacency_field.as_tensor(adjacency_field.get_padding_lengths())
        numpy.testing.assert_equal(tensor.numpy(), numpy.array([[-1, 0, -1, -1, -1],
                                                                [-1, -1, -1, -1, -1],
                                                                [-1, 1, -1, -1, -1],
                                                                [-1, -1, -1, -1, -1],
                                                                [-1, -1, -1, -1, -1]]))
Example #13
    def test_start_and_end_tokens(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace='characters')  # 2
        vocab.add_token_to_namespace("s", namespace='characters')  # 3
        vocab.add_token_to_namespace("e", namespace='characters')  # 4
        vocab.add_token_to_namespace("n", namespace='characters')  # 5
        vocab.add_token_to_namespace("t", namespace='characters')  # 6
        vocab.add_token_to_namespace("c", namespace='characters')  # 7
        vocab.add_token_to_namespace("<", namespace='characters')  # 8
        vocab.add_token_to_namespace(">", namespace='characters')  # 9
        vocab.add_token_to_namespace("/", namespace='characters')  # 10

        indexer = TokenCharactersIndexer("characters", start_tokens=["<s>"], end_tokens=["</s>"])
        indices = indexer.tokens_to_indices([Token("sentential")], vocab, "char")
        assert indices == {"char": [[8, 3, 9],
                                    [3, 4, 5, 6, 4, 5, 6, 1, 1, 1],
                                    [8, 10, 3, 9]]}
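The `1`s in the middle row are @@UNKNOWN@@ indices: 'i', 'a', and 'l' were never added to the 'characters' namespace. The start and end tokens are themselves split into characters, so "<s>" becomes [8, 3, 9] and "</s>" becomes [8, 10, 3, 9], matching the indices noted in the comments above.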
Example #14
    def test_read_hdf5_format_file(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        vocab.add_token_to_namespace("word2")
        embeddings_filename = self.TEST_DIR + "embeddings.hdf5"
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 5)
        with h5py.File(embeddings_filename, 'w') as fout:
            _ = fout.create_dataset(
                    'embedding', embeddings.shape, dtype='float32', data=embeddings
            )

        params = Params({
                'pretrained_file': embeddings_filename,
                'embedding_dim': 5,
                })
        embedding_layer = Embedding.from_params(vocab, params)
        assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)
Example #15
    def test_forward_works_with_projection_layer(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace('the')
        vocab.add_token_to_namespace('a')
        params = Params({
                'pretrained_file': 'tests/fixtures/glove.6B.300d.sample.txt.gz',
                'embedding_dim': 300,
                'projection_dim': 20
                })
        embedding_layer = Embedding.from_params(vocab, params)
        input_tensor = Variable(torch.LongTensor([[3, 2, 1, 0]]))
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 4, 20)

        input_tensor = Variable(torch.LongTensor([[[3, 2, 1, 0]]]))
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 1, 4, 20)
Example #16
    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        root_index = vocab.add_token_to_namespace('ROOT', namespace='dep_labels')
        none_index = vocab.add_token_to_namespace('NONE', namespace='dep_labels')
        indexer = DepLabelIndexer()
        assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [root_index]}
        assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}
Example #17
class TestTokenCharactersEncoder(AllenNlpTestCase):
    def setUp(self):
        super(TestTokenCharactersEncoder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1", "token_characters")
        self.vocab.add_token_to_namespace("2", "token_characters")
        self.vocab.add_token_to_namespace("3", "token_characters")
        self.vocab.add_token_to_namespace("4", "token_characters")
        params = Params({
                "embedding": {
                        "embedding_dim": 2,
                        "vocab_namespace": "token_characters"
                        },
                "encoder": {
                        "type": "cnn",
                        "embedding_dim": 2,
                        "num_filters": 4,
                        "ngram_filter_sizes": [1, 2],
                        "output_dim": 3
                        }
                })
        self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab, params=deepcopy(params))
        self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"])
        self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
        constant_init = Initializer.from_params(Params({"type": "constant", "val": 1.}))
        initializer = InitializerApplicator([(".*", constant_init)])
        initializer(self.encoder)
        initializer(self.embedding)
        initializer(self.inner_encoder)

    def test_get_output_dim_uses_encoder_output_dim(self):
        assert self.encoder.get_output_dim() == 3

    def test_forward_applies_embedding_then_encoder(self):
        numpy_tensor = numpy.random.randint(6, size=(3, 4, 7))
        inputs = torch.from_numpy(numpy_tensor)
        encoder_output = self.encoder(inputs)
        reshaped_input = inputs.view(12, 7)
        embedded = self.embedding(reshaped_input)
        mask = (inputs != 0).long().view(12, 7)
        reshaped_manual_output = self.inner_encoder(embedded, mask)
        manual_output = reshaped_manual_output.view(3, 4, 3)
        assert_almost_equal(encoder_output.data.numpy(), manual_output.data.numpy())
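The manual path in `test_forward_applies_embedding_then_encoder` mirrors what `TokenCharactersEncoder.forward` is expected to do: flatten the (batch, num_tokens) dimensions, embed each character id, mask out padding (id 0), run the wrapped `Seq2VecEncoder`, and reshape back to (batch, num_tokens, output_dim). With the shapes used here that is (3, 4, 7) -> (12, 7) -> (12, 3) -> (3, 4, 3).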
Example #18
class IteratorTest(AllenNlpTestCase):
    def setUp(self):
        super(IteratorTest, self).setUp()
        self.token_indexers = {"tokens": SingleIdTokenIndexer()}
        self.vocab = Vocabulary()
        self.this_index = self.vocab.add_token_to_namespace('this')
        self.is_index = self.vocab.add_token_to_namespace('is')
        self.a_index = self.vocab.add_token_to_namespace('a')
        self.sentence_index = self.vocab.add_token_to_namespace('sentence')
        self.another_index = self.vocab.add_token_to_namespace('another')
        self.yet_index = self.vocab.add_token_to_namespace('yet')
        self.very_index = self.vocab.add_token_to_namespace('very')
        self.long_index = self.vocab.add_token_to_namespace('long')
        instances = [
                self.create_instance(["this", "is", "a", "sentence"]),
                self.create_instance(["this", "is", "another", "sentence"]),
                self.create_instance(["yet", "another", "sentence"]),
                self.create_instance(["this", "is", "a", "very", "very", "very", "very", "long", "sentence"]),
                self.create_instance(["sentence"]),
                ]

        self.instances = instances
        self.lazy_instances = LazyIterable(instances)

    def create_instance(self, str_tokens: List[str]):
        tokens = [Token(t) for t in str_tokens]
        instance = Instance({'text': TextField(tokens, self.token_indexers)})
        return instance

    def create_instances_from_token_counts(self, token_counts: List[int]) -> List[Instance]:
        return [self.create_instance(["word"] * count) for count in token_counts]

    def get_batches_stats(self, batches: Iterable[Batch]) -> Dict[str, Union[int, List[int]]]:
        grouped_instances = [batch.instances for batch in batches]
        group_lengths = [len(group) for group in grouped_instances]

        sample_sizes = []
        for batch in batches:
            batch_sequence_length = max(
                    [instance.get_padding_lengths()['text']['num_tokens']
                     for instance in batch.instances]
            )
            sample_sizes.append(batch_sequence_length * len(batch.instances))

        return {
                "batch_lengths": group_lengths,
                "total_instances": sum(group_lengths),
                "sample_sizes": sample_sizes
        }

    def assert_instances_are_correct(self, candidate_instances):
        # First we need to remove padding tokens from the candidates.
        # pylint: disable=protected-access
        candidate_instances = [tuple(w for w in instance if w != 0) for instance in candidate_instances]
        expected_instances = [tuple(instance.fields["text"]._indexed_tokens["tokens"])
                              for instance in self.instances]
        assert set(candidate_instances) == set(expected_instances)
Example #19
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

        field1 = TextField([Token(t) for t in ["A", "sentence"]],
                           {"characters": TokenCharactersIndexer(namespace="characters")})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        field2 = TextField([Token(t) for t in ["A", "sentence"]],
                           token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                           "characters": TokenCharactersIndexer(namespace="characters")})
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
Example #20
    def test_dry_run_with_extension(self):
        existing_serialization_dir = self.TEST_DIR / 'existing'
        extended_serialization_dir = self.TEST_DIR / 'extended'
        existing_vocab_path = existing_serialization_dir / 'vocabulary'
        extended_vocab_path = extended_serialization_dir / 'vocabulary'

        vocab = Vocabulary()
        vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
        vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = existing_vocab_path
        self.params['vocabulary']['extend'] = True
        self.params['vocabulary']['min_count'] = {"tokens" : 3}
        dry_run_from_params(self.params, extended_serialization_dir)

        vocab_files = os.listdir(extended_vocab_path)
        assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

        with open(extended_vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == '@@UNKNOWN@@'
        assert tokens[1] == 'some_weird_token_1'
        assert tokens[2] == 'some_weird_token_2'

        tokens.sort()
        assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are',
                          'some_weird_token_1', 'some_weird_token_2']

        with open(extended_vocab_path / 'labels.txt') as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == ['N', 'V']
Example #21
    def test_label_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("entailment", namespace="labels")
        vocab.add_token_to_namespace("contradiction", namespace="labels")
        vocab.add_token_to_namespace("neutral", namespace="labels")

        label = LabelField("entailment")
        label.index(vocab)
        tensor = label.as_tensor(label.get_padding_lengths())
        assert tensor.item() == 0
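`tensor.item() == 0` because `labels` is a non-padded namespace: "entailment" was the first token added, and with no padding or @@UNKNOWN@@ entries it receives index 0.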
Example #22
    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.split_words("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
        cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
        none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
        # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
        vocab.add_token_to_namespace('DET', namespace='pos_tags')
        vocab.add_token_to_namespace('NOUN', namespace='pos_tags')
        vocab.add_token_to_namespace('PUNCT', namespace='pos_tags')

        indexer = PosTagIndexer(namespace='pos_tags', coarse_tags=True)

        indices = indexer.tokens_to_indices(tokens, vocab, "tokens")
        assert len(indices) == 1
        assert "tokens" in indices
        assert indices["tokens"][1] == verb_index
        assert indices["tokens"][-1] == none_index

        indexer._coarse_tags = False  # pylint: disable=protected-access
        assert indexer.tokens_to_indices([tokens[1]], vocab, "coarse") == {"coarse": [cop_index]}
Example #23
    def test_token_to_indices_produces_correct_characters(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace='characters')
        vocab.add_token_to_namespace("s", namespace='characters')
        vocab.add_token_to_namespace("e", namespace='characters')
        vocab.add_token_to_namespace("n", namespace='characters')
        vocab.add_token_to_namespace("t", namespace='characters')
        vocab.add_token_to_namespace("c", namespace='characters')

        indexer = TokenCharactersIndexer("characters")
        indices = indexer.token_to_indices(Token("sentential"), vocab)
        assert indices == [3, 4, 5, 6, 4, 5, 6, 1, 1, 1]
Example #24
class IteratorTest(AllenNlpTestCase):
    def setUp(self):
        super(IteratorTest, self).setUp()
        self.token_indexers = {"tokens": SingleIdTokenIndexer()}
        self.vocab = Vocabulary()
        self.this_index = self.vocab.add_token_to_namespace('this')
        self.is_index = self.vocab.add_token_to_namespace('is')
        self.a_index = self.vocab.add_token_to_namespace('a')
        self.sentence_index = self.vocab.add_token_to_namespace('sentence')
        self.another_index = self.vocab.add_token_to_namespace('another')
        self.yet_index = self.vocab.add_token_to_namespace('yet')
        self.very_index = self.vocab.add_token_to_namespace('very')
        self.long_index = self.vocab.add_token_to_namespace('long')
        instances = [
                self.create_instance(["this", "is", "a", "sentence"]),
                self.create_instance(["this", "is", "another", "sentence"]),
                self.create_instance(["yet", "another", "sentence"]),
                self.create_instance(["this", "is", "a", "very", "very", "very", "very", "long", "sentence"]),
                self.create_instance(["sentence"]),
                ]

        class LazyIterable:
            def __iter__(self):
                return (instance for instance in instances)

        self.instances = instances
        self.lazy_instances = LazyIterable()

    def create_instance(self, str_tokens: List[str]):
        tokens = [Token(t) for t in str_tokens]
        instance = Instance({'text': TextField(tokens, self.token_indexers)})
        instance.index_fields(self.vocab)
        return instance

    def assert_instances_are_correct(self, candidate_instances):
        # First we need to remove padding tokens from the candidates.
        # pylint: disable=protected-access
        candidate_instances = [tuple(w for w in instance if w != 0) for instance in candidate_instances]
        expected_instances = [tuple(instance.fields["text"]._indexed_tokens["tokens"])
                              for instance in self.instances]
        assert set(candidate_instances) == set(expected_instances)
Example #25
    def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        vocab.add_token_to_namespace("word2")
        unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
        vocab.add_token_to_namespace(unicode_space)
        embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
        with gzip.open(embeddings_filename, 'wb') as embeddings_file:
            embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
            embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode('utf-8'))
        params = Params({
                'pretrained_file': embeddings_filename,
                'embedding_dim': 3,
                })
        embedding_layer = Embedding.from_params(vocab, params)
        word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
        assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
        word_vector = embedding_layer.weight.data[vocab.get_token_index(unicode_space)]
        assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3, 5.0]))
        word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
        assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
Example #26
class IteratorTest(AllenNlpTestCase):
    def setUp(self):
        super(IteratorTest, self).setUp()
        self.token_indexers = {"tokens": SingleIdTokenIndexer()}
        self.vocab = Vocabulary()
        self.this_index = self.vocab.add_token_to_namespace('this')
        self.is_index = self.vocab.add_token_to_namespace('is')
        self.a_index = self.vocab.add_token_to_namespace('a')
        self.sentence_index = self.vocab.add_token_to_namespace('sentence')
        self.another_index = self.vocab.add_token_to_namespace('another')
        self.yet_index = self.vocab.add_token_to_namespace('yet')
        self.very_index = self.vocab.add_token_to_namespace('very')
        self.long_index = self.vocab.add_token_to_namespace('long')
        instances = [
            self.create_instance(["this", "is", "a", "sentence"]),
            self.create_instance(["this", "is", "another", "sentence"]),
            self.create_instance(["yet", "another", "sentence"]),
            self.create_instance([
                "this", "is", "a", "very", "very", "very", "very", "long",
                "sentence"
            ]),
            self.create_instance(["sentence"]),
        ]

        self.instances = instances
        self.lazy_instances = LazyIterable(instances)

    def create_instance(self, str_tokens: List[str]):
        tokens = [Token(t) for t in str_tokens]
        instance = Instance({'text': TextField(tokens, self.token_indexers)})
        return instance

    def create_instances_from_token_counts(
            self, token_counts: List[int]) -> List[Instance]:
        return [
            self.create_instance(["word"] * count) for count in token_counts
        ]

    def get_batches_stats(
            self,
            batches: Iterable[Batch]) -> Dict[str, Union[int, List[int]]]:
        grouped_instances = [batch.instances for batch in batches]
        group_lengths = [len(group) for group in grouped_instances]

        sample_sizes = []
        for batch in batches:
            batch_sequence_length = max([
                instance.get_padding_lengths()['text']['num_tokens']
                for instance in batch.instances
            ])
            sample_sizes.append(batch_sequence_length * len(batch.instances))

        return {
            "batch_lengths": group_lengths,
            "total_instances": sum(group_lengths),
            "sample_sizes": sample_sizes
        }

    def assert_instances_are_correct(self, candidate_instances):
        # First we need to remove padding tokens from the candidates.
        # pylint: disable=protected-access
        candidate_instances = [
            tuple(w for w in instance if w != 0)
            for instance in candidate_instances
        ]
        expected_instances = [
            tuple(instance.fields["text"]._indexed_tokens["tokens"])
            for instance in self.instances
        ]
        assert set(candidate_instances) == set(expected_instances)
Example #27
class TestTextField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("sentence", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='characters')
        self.vocab.add_token_to_namespace("s", namespace='characters')
        self.vocab.add_token_to_namespace("e", namespace='characters')
        self.vocab.add_token_to_namespace("n", namespace='characters')
        self.vocab.add_token_to_namespace("t", namespace='characters')
        self.vocab.add_token_to_namespace("c", namespace='characters')
        super(TestTextField, self).setUp()

    def test_field_counts_vocab_items_correctly(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters")
            })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "words": SingleIdTokenIndexer("words"),
                "characters": TokenCharactersIndexer("characters")
            })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}

    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence",
                                                      namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace(
            "A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [
            capital_a_index, sentence_index
        ]

        field1 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            {"characters": TokenCharactersIndexer(namespace="characters")})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [
                                                            s_index, e_index,
                                                            n_index, t_index,
                                                            e_index, n_index,
                                                            c_index, e_index
                                                        ]]
        field2 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "words": SingleIdTokenIndexer(namespace="words"),
                "characters": TokenCharactersIndexer(namespace="characters")
            })
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [
            capital_a_index, sentence_index
        ]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [
                                                            s_index, e_index,
                                                            n_index, t_index,
                                                            e_index, n_index,
                                                            c_index, e_index
                                                        ]]
        # pylint: enable=protected-access

    def test_get_padding_lengths_raises_if_no_indexed_tokens(self):

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5}

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters")
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters"),
                "words": SingleIdTokenIndexer("words")
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

    def test_as_tensor_handles_words(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].data.cpu().numpy(),
            numpy.array([1, 1, 1, 2, 1]))

    def test_as_tensor_handles_longer_lengths(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].data.cpu().numpy(),
            numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]))

    def test_as_tensor_handles_characters(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters")
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                                [1, 3, 0, 0, 0, 0, 0, 0],
                                                [1, 0, 0, 0, 0, 0, 0, 0],
                                                [3, 4, 5, 6, 4, 5, 7, 4],
                                                [1, 0, 0, 0, 0, 0, 0, 0]])
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].data.cpu().numpy(),
            expected_character_array)
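    # Note: in the expected array above, 0 is padding and 1 is @@UNKNOWN@@; the
    # characters added in setUp occupy indices 2-7, so "This" maps to [1, 1, 1, 3]
    # (only 's' is in-vocabulary), padded to width 8, the length of "sentence".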

    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField(
            [Token(t) for t in ["a", "sentence", "."]],
            token_indexers={
                "words": SingleIdTokenIndexer("words"),
                "characters": TokenCharactersIndexer("characters")
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 5
        padding_lengths["num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)

        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].data.cpu().numpy(),
            numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].data.cpu().numpy(),
            numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    def test_printing_doesnt_crash(self):
        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        print(field)
Example #28
class TestBasicTextFieldEmbedder(AllenNlpTestCase):
    def setUp(self):
        super(TestBasicTextFieldEmbedder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1")
        self.vocab.add_token_to_namespace("2")
        self.vocab.add_token_to_namespace("3")
        self.vocab.add_token_to_namespace("4")
        params = Params({
                "words1": {
                        "type": "embedding",
                        "embedding_dim": 2
                        },
                "words2": {
                        "type": "embedding",
                        "embedding_dim": 5
                        },
                "words3": {
                        "type": "embedding",
                        "embedding_dim": 3
                        }
                })
        self.token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        self.inputs = {
                "words1": Variable(torch.LongTensor([[0, 2, 3, 5]])),
                "words2": Variable(torch.LongTensor([[1, 4, 3, 2]])),
                "words3": Variable(torch.LongTensor([[1, 5, 1, 2]]))
                }

    def test_get_output_dim_aggregates_dimension_from_each_embedding(self):
        assert self.token_embedder.get_output_dim() == 10

    def test_forward_asserts_input_field_match(self):
        self.inputs['words4'] = self.inputs['words3']
        del self.inputs['words3']
        with pytest.raises(ConfigurationError):
            self.token_embedder(self.inputs)
        self.inputs['words3'] = self.inputs['words4']
        del self.inputs['words4']

    def test_forward_concats_resultant_embeddings(self):
        assert self.token_embedder(self.inputs).size() == (1, 4, 10)

    def test_forward_works_on_higher_order_input(self):
        params = Params({
                "words": {
                        "type": "embedding",
                        "num_embeddings": 20,
                        "embedding_dim": 2,
                        },
                "characters": {
                        "type": "character_encoding",
                        "embedding": {
                                "embedding_dim": 4,
                                "num_embeddings": 15,
                                },
                        "encoder": {
                                "type": "cnn",
                                "embedding_dim": 4,
                                "num_filters": 10,
                                "ngram_filter_sizes": [3],
                                },
                        }
                })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
                'words': Variable(torch.rand(3, 4, 5, 6) * 20).long(),
                'characters': Variable(torch.rand(3, 4, 5, 6, 7) * 15).long(),
                }
        assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)
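A note on the expected shape in the last test: `num_wrapping_dims=2` tells the embedder to treat the two extra leading dimensions as batch dimensions (in AllenNLP, each `TokenEmbedder` is wrapped in `TimeDistributed` once per wrapping dim), and the final size of 12 is the concatenation of the word embedding (2) and the CNN encoder output (10 filters, one ngram size).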
Example #29
class TestBasicTextFieldEmbedder(AllenNlpTestCase):
    def setUp(self):
        super(TestBasicTextFieldEmbedder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1")
        self.vocab.add_token_to_namespace("2")
        self.vocab.add_token_to_namespace("3")
        self.vocab.add_token_to_namespace("4")
        params = Params({
                "token_embedders": {
                        "words1": {
                                "type": "embedding",
                                "embedding_dim": 2
                                },
                        "words2": {
                                "type": "embedding",
                                "embedding_dim": 5
                                },
                        "words3": {
                                "type": "embedding",
                                "embedding_dim": 3
                                }
                        }
                })
        self.token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params)
        self.inputs = {
                "words1": torch.LongTensor([[0, 2, 3, 5]]),
                "words2": torch.LongTensor([[1, 4, 3, 2]]),
                "words3": torch.LongTensor([[1, 5, 1, 2]])
                }

    def test_get_output_dim_aggregates_dimension_from_each_embedding(self):
        assert self.token_embedder.get_output_dim() == 10

    def test_forward_asserts_input_field_match(self):
        # Total mismatch
        self.inputs['words4'] = self.inputs['words3']
        del self.inputs['words3']
        with pytest.raises(ConfigurationError) as exc:
            self.token_embedder(self.inputs)
        assert exc.match("Mismatched token keys")

        self.inputs['words3'] = self.inputs['words4']

        # Text field has too many inputs
        with pytest.raises(ConfigurationError) as exc:
            self.token_embedder(self.inputs)
        assert exc.match("is generating more keys")

        del self.inputs['words4']

    def test_forward_concats_resultant_embeddings(self):
        assert self.token_embedder(self.inputs).size() == (1, 4, 10)

    def test_forward_works_on_higher_order_input(self):
        params = Params({
                "token_embedders": {
                        "words": {
                                "type": "embedding",
                                "num_embeddings": 20,
                                "embedding_dim": 2,
                                },
                        "characters": {
                                "type": "character_encoding",
                                "embedding": {
                                        "embedding_dim": 4,
                                        "num_embeddings": 15,
                                        },
                                "encoder": {
                                        "type": "cnn",
                                        "embedding_dim": 4,
                                        "num_filters": 10,
                                        "ngram_filter_sizes": [3],
                                        },
                                }
                        }
                })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params)
        inputs = {
                'words': (torch.rand(3, 4, 5, 6) * 20).long(),
                'characters': (torch.rand(3, 4, 5, 6, 7) * 15).long(),
                }
        assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)

    def test_forward_runs_with_non_bijective_mapping(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
        options_file = str(elmo_fixtures_path / 'options.json')
        weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
        params = Params({
                "token_embedders": {
                        "words": {
                                "type": "embedding",
                                "num_embeddings": 20,
                                "embedding_dim": 2,
                                },
                        "elmo": {
                                "type": "elmo_token_embedder",
                                "options_file": options_file,
                                "weight_file": weight_file
                                },
                        },
                "embedder_to_indexer_map": {"words": ["words"], "elmo": ["elmo", "words"]}
                })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
                'words': (torch.rand(3, 6) * 20).long(),
                'elmo': (torch.rand(3, 6, 50) * 15).long(),
                }
        token_embedder(inputs)

    def test_forward_runs_with_non_bijective_mapping_with_null(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
        options_file = str(elmo_fixtures_path / 'options.json')
        weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
        params = Params({
                "token_embedders": {
                        "elmo": {
                                "type": "elmo_token_embedder",
                                "options_file": options_file,
                                "weight_file": weight_file
                        },
                },
                "embedder_to_indexer_map": {
                        # ignore `word_inputs` in `ElmoTokenEmbedder.forward`
                        "elmo": ["elmo", None]
                }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
                'elmo': (torch.rand(3, 6, 50) * 15).long(),
        }
        token_embedder(inputs)

    def test_forward_runs_with_non_bijective_mapping_with_dict(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
        options_file = str(elmo_fixtures_path / 'options.json')
        weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
        params = Params({
                "token_embedders": {
                        "words": {
                                "type": "embedding",
                                "num_embeddings": 20,
                                "embedding_dim": 2,
                        },
                        "elmo": {
                                "type": "elmo_token_embedder",
                                "options_file": options_file,
                                "weight_file": weight_file
                        },
                },
                "embedder_to_indexer_map": {
                        # pass arguments to `ElmoTokenEmbedder.forward` by dict
                        "elmo": {
                                "inputs": "elmo",
                                "word_inputs": "words"
                        },
                        "words": ["words"]
                }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
                'words': (torch.rand(3, 6) * 20).long(),
                'elmo': (torch.rand(3, 6, 50) * 15).long(),
        }
        token_embedder(inputs)

    def test_old_from_params_new_from_params(self):
        old_params = Params({
                "words1": {
                        "type": "embedding",
                        "embedding_dim": 2
                        },
                "words2": {
                        "type": "embedding",
                        "embedding_dim": 5
                        },
                "words3": {
                        "type": "embedding",
                        "embedding_dim": 3
                        }
                })

        # Allow loading the parameters in the old format
        with pytest.warns(DeprecationWarning):
            old_embedder = BasicTextFieldEmbedder.from_params(params=old_params, vocab=self.vocab)

        new_params = Params({
                "token_embedders": {
                        "words1": {
                                "type": "embedding",
                                "embedding_dim": 2
                                },
                        "words2": {
                                "type": "embedding",
                                "embedding_dim": 5
                                },
                        "words3": {
                                "type": "embedding",
                                "embedding_dim": 3
                                }
                        }
                })

        # But also allow loading the parameters in the new format
        new_embedder = BasicTextFieldEmbedder.from_params(params=new_params, vocab=self.vocab)
        assert old_embedder._token_embedders.keys() == new_embedder._token_embedders.keys()

        assert new_embedder(self.inputs).size() == (1, 4, 10)
Example #30
class KnowledgeGraphFieldTest(AllenNlpTestCase):
    def setUp(self):
        self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
        self.utterance = self.tokenizer.tokenize("where is mersin?")
        self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

        table_file = self.FIXTURES_ROOT / "data" / "wikitables" / "tables" / "341.tagged"
        self.graph = TableQuestionContext.read_from_file(
            table_file, self.utterance).get_table_knowledge_graph()
        self.vocab = Vocabulary()
        self.name_index = self.vocab.add_token_to_namespace("name",
                                                            namespace='tokens')
        self.in_index = self.vocab.add_token_to_namespace("in",
                                                          namespace='tokens')
        self.english_index = self.vocab.add_token_to_namespace(
            "english", namespace='tokens')
        self.location_index = self.vocab.add_token_to_namespace(
            "location", namespace='tokens')
        self.mersin_index = self.vocab.add_token_to_namespace(
            "mersin", namespace='tokens')

        self.oov_index = self.vocab.get_token_index('random OOV string',
                                                    namespace='tokens')
        self.edirne_index = self.oov_index
        self.field = KnowledgeGraphField(self.graph, self.utterance,
                                         self.token_indexers, self.tokenizer)

        super(KnowledgeGraphFieldTest, self).setUp()

    def test_count_vocab_items(self):
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        self.field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["tokens"] == {
            'name': 1,
            'in': 2,
            'english': 2,
            'location': 1,
            'mersin': 1,
        }

    def test_index_converts_field_correctly(self):
        # pylint: disable=protected-access
        self.field.index(self.vocab)
        assert self.field._indexed_entity_texts.keys() == {'tokens'}
        # Note that these are sorted by their _identifiers_, not their cell text, so the
        # 'string:mersin' entity shows up before the 'string_column:*' entities.
        expected_array = [[self.mersin_index],
                          [
                              self.location_index, self.in_index,
                              self.english_index
                          ],
                          [self.name_index, self.in_index, self.english_index]]
        assert self.field._indexed_entity_texts['tokens'] == expected_array

    def test_get_padding_lengths_raises_if_not_indexed(self):
        with pytest.raises(AssertionError):
            self.field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        # pylint: disable=protected-access
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {
            'num_entities': 3,
            'num_entity_tokens': 3,
            'num_utterance_tokens': 4
        }
        self.field._token_indexers[
            'token_characters'] = TokenCharactersIndexer(min_padding_length=1)
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {
            'num_entities': 3,
            'num_entity_tokens': 3,
            'num_utterance_tokens': 4,
            'num_token_characters': 8
        }

    def test_as_tensor_produces_correct_output(self):
        self.field.index(self.vocab)
        padding_lengths = self.field.get_padding_lengths()
        padding_lengths['num_utterance_tokens'] += 1
        padding_lengths['num_entities'] += 1
        tensor_dict = self.field.as_tensor(padding_lengths)
        assert tensor_dict.keys() == {'text', 'linking'}
        expected_text_tensor = [
            [self.mersin_index, 0, 0],
            [self.location_index, self.in_index, self.english_index],
            [self.name_index, self.in_index, self.english_index], [0, 0, 0]
        ]
        assert_almost_equal(
            tensor_dict['text']['tokens'].detach().cpu().numpy(),
            expected_text_tensor)

        linking_tensor = tensor_dict['linking'].detach().cpu().numpy()
        expected_linking_tensor = [
            [   # entity: string:mersin
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],        # "where"
                [0, 0, 0, 0, 0, -1.5, 0, 0, 0, 0],     # "is"
                [0, 1, 1, 1, 1, 1, 0, 0, 1, 1],        # "mersin"
                [0, 0, 0, 0, 0, -5, 0, 0, 0, 0],       # "?"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],        # padding
            ],
            [   # entity: string_column:name_in_english
                [0, 0, 0, 0, 0, -2.6, 0, 0, 0, 0],     # "where"
                [0, 0, 0, 0, 0, -7.5, 0, 0, 0, 0],     # "is"
                [0, 0, 0, 0, 0, -1.8333, 1, 1, 0, 0],  # "mersin"
                [0, 0, 0, 0, 0, -18, 0, 0, 0, 0],      # "?"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],        # padding
            ],
            [   # entity: string_column:location_in_english
                [0, 0, 0, 0, 0, -1.6, 0, 0, 0, 0],     # "where"
                [0, 0, 0, 0, 0, -5.5, 0, 0, 0, 0],     # "is"
                [0, 0, 0, 0, 0, -1, 0, 0, 0, 0],       # "mersin"
                [0, 0, 0, 0, 0, -14, 0, 0, 0, 0],      # "?"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],        # padding
            ],
            [   # padding entity
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],        # "where"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],        # "is"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],        # "mersin"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],        # "?"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],        # padding
            ],
        ]
        for entity_index, entity_features in enumerate(
                expected_linking_tensor):
            for question_index, feature_vector in enumerate(entity_features):
                assert_almost_equal(linking_tensor[entity_index,
                                                   question_index],
                                    feature_vector,
                                    decimal=4,
                                    err_msg=f"{entity_index} {question_index}")

    def test_lemma_feature_extractor(self):
        # pylint: disable=protected-access
        utterance = self.tokenizer.tokenize("Names in English")
        field = KnowledgeGraphField(self.graph, self.utterance,
                                    self.token_indexers, self.tokenizer)
        entity = 'string_column:name_in_english'
        lemma_feature = field._contains_lemma_match(
            entity, field._entity_text_map[entity], utterance[0], 0, utterance)
        assert lemma_feature == 1

    def test_span_overlap_fraction(self):
        # pylint: disable=protected-access
        utterance = self.tokenizer.tokenize(
            "what is the name in english of mersin?")
        field = KnowledgeGraphField(self.graph, self.utterance,
                                    self.token_indexers, self.tokenizer)
        entity = 'string_column:name_in_english'
        entity_text = field._entity_text_map[entity]
        feature_values = [
            field._span_overlap_fraction(entity, entity_text, token, i,
                                         utterance)
            for i, token in enumerate(utterance)
        ]
        assert feature_values == [0, 0, 0, 1, 1, 1, 0, 0, 0]

    def test_batch_tensors(self):
        self.field.index(self.vocab)
        padding_lengths = self.field.get_padding_lengths()
        tensor_dict1 = self.field.as_tensor(padding_lengths)
        tensor_dict2 = self.field.as_tensor(padding_lengths)
        batched_tensor_dict = self.field.batch_tensors(
            [tensor_dict1, tensor_dict2])
        assert batched_tensor_dict.keys() == {'text', 'linking'}
        expected_single_tensor = [
            [self.mersin_index, 0, 0],
            [self.location_index, self.in_index, self.english_index],
            [self.name_index, self.in_index, self.english_index]
        ]
        expected_batched_tensor = [
            expected_single_tensor, expected_single_tensor
        ]
        assert_almost_equal(
            batched_tensor_dict['text']['tokens'].detach().cpu().numpy(),
            expected_batched_tensor)
        expected_linking_tensor = torch.stack(
            [tensor_dict1['linking'], tensor_dict2['linking']])
        assert_almost_equal(
            batched_tensor_dict['linking'].detach().cpu().numpy(),
            expected_linking_tensor.detach().cpu().numpy())

    def test_field_initialized_with_empty_constructor(self):
        try:
            self.field.empty_field()
        except AssertionError as e:
            pytest.fail(str(e), pytrace=True)
Example #31
class TestTextField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace(u"sentence", namespace=u'words')
        self.vocab.add_token_to_namespace(u"A", namespace=u'words')
        self.vocab.add_token_to_namespace(u"A", namespace=u'characters')
        self.vocab.add_token_to_namespace(u"s", namespace=u'characters')
        self.vocab.add_token_to_namespace(u"e", namespace=u'characters')
        self.vocab.add_token_to_namespace(u"n", namespace=u'characters')
        self.vocab.add_token_to_namespace(u"t", namespace=u'characters')
        self.vocab.add_token_to_namespace(u"c", namespace=u'characters')
        super(TestTextField, self).setUp()

    def test_field_counts_vocab_items_correctly(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts[u"words"][u"This"] == 1
        assert namespace_token_counts[u"words"][u"is"] == 1
        assert namespace_token_counts[u"words"][u"a"] == 1
        assert namespace_token_counts[u"words"][u"sentence"] == 1
        assert namespace_token_counts[u"words"][u"."] == 1
        assert list(namespace_token_counts.keys()) == [u"words"]

        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"characters": TokenCharactersIndexer(u"characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts[u"characters"][u"T"] == 1
        assert namespace_token_counts[u"characters"][u"h"] == 1
        assert namespace_token_counts[u"characters"][u"i"] == 2
        assert namespace_token_counts[u"characters"][u"s"] == 3
        assert namespace_token_counts[u"characters"][u"a"] == 1
        assert namespace_token_counts[u"characters"][u"e"] == 3
        assert namespace_token_counts[u"characters"][u"n"] == 2
        assert namespace_token_counts[u"characters"][u"t"] == 1
        assert namespace_token_counts[u"characters"][u"c"] == 1
        assert namespace_token_counts[u"characters"][u"."] == 1
        assert list(namespace_token_counts.keys()) == [u"characters"]

        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words"),
                                          u"characters": TokenCharactersIndexer(u"characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts[u"characters"][u"T"] == 1
        assert namespace_token_counts[u"characters"][u"h"] == 1
        assert namespace_token_counts[u"characters"][u"i"] == 2
        assert namespace_token_counts[u"characters"][u"s"] == 3
        assert namespace_token_counts[u"characters"][u"a"] == 1
        assert namespace_token_counts[u"characters"][u"e"] == 3
        assert namespace_token_counts[u"characters"][u"n"] == 2
        assert namespace_token_counts[u"characters"][u"t"] == 1
        assert namespace_token_counts[u"characters"][u"c"] == 1
        assert namespace_token_counts[u"characters"][u"."] == 1
        assert namespace_token_counts[u"words"][u"This"] == 1
        assert namespace_token_counts[u"words"][u"is"] == 1
        assert namespace_token_counts[u"words"][u"a"] == 1
        assert namespace_token_counts[u"words"][u"sentence"] == 1
        assert namespace_token_counts[u"words"][u"."] == 1
        assert set(namespace_token_counts.keys()) == set([u"words", u"characters"])

    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace(u"sentence", namespace=u'words')
        capital_a_index = vocab.add_token_to_namespace(u"A", namespace=u'words')
        capital_a_char_index = vocab.add_token_to_namespace(u"A", namespace=u'characters')
        s_index = vocab.add_token_to_namespace(u"s", namespace=u'characters')
        e_index = vocab.add_token_to_namespace(u"e", namespace=u'characters')
        n_index = vocab.add_token_to_namespace(u"n", namespace=u'characters')
        t_index = vocab.add_token_to_namespace(u"t", namespace=u'characters')
        c_index = vocab.add_token_to_namespace(u"c", namespace=u'characters')

        field = TextField([Token(t) for t in [u"A", u"sentence"]],
                          {u"words": SingleIdTokenIndexer(namespace=u"words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens[u"words"] == [capital_a_index, sentence_index]

        field1 = TextField([Token(t) for t in [u"A", u"sentence"]],
                           {u"characters": TokenCharactersIndexer(namespace=u"characters")})
        field1.index(vocab)
        assert field1._indexed_tokens[u"characters"] == [[capital_a_char_index],
                                                         [s_index, e_index, n_index, t_index,
                                                          e_index, n_index, c_index, e_index]]
        field2 = TextField([Token(t) for t in [u"A", u"sentence"]],
                           token_indexers={u"words": SingleIdTokenIndexer(namespace=u"words"),
                                           u"characters": TokenCharactersIndexer(namespace=u"characters")})
        field2.index(vocab)
        assert field2._indexed_tokens[u"words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens[u"words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens[u"characters"] == [[capital_a_char_index],
                                                         [s_index, e_index, n_index, t_index,
                                                          e_index, n_index, c_index, e_index]]
        # pylint: enable=protected-access

    def test_get_padding_lengths_raises_if_no_indexed_tokens(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {u"num_tokens": 5}

        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"characters": TokenCharactersIndexer(u"characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {u"num_tokens": 5, u"num_token_characters": 8}

        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"characters": TokenCharactersIndexer(u"characters"),
                                          u"words": SingleIdTokenIndexer(u"words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {u"num_tokens": 5, u"num_token_characters": 8}

    def test_as_tensor_handles_words(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict[u"words"].detach().cpu().numpy(),
                                                numpy.array([1, 1, 1, 2, 1]))

    def test_as_tensor_handles_longer_lengths(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths[u"num_tokens"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict[u"words"].detach().cpu().numpy(),
                                                numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]))

    def test_as_tensor_handles_characters(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"characters": TokenCharactersIndexer(u"characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                                [1, 3, 0, 0, 0, 0, 0, 0],
                                                [1, 0, 0, 0, 0, 0, 0, 0],
                                                [3, 4, 5, 6, 4, 5, 7, 4],
                                                [1, 0, 0, 0, 0, 0, 0, 0]])
        numpy.testing.assert_array_almost_equal(tensor_dict[u"characters"].detach().cpu().numpy(),
                                                expected_character_array)

    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField([Token(t) for t in [u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words"),
                                          u"characters": TokenCharactersIndexer(u"characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths[u"num_tokens"] = 5
        padding_lengths[u"num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)

        numpy.testing.assert_array_almost_equal(tensor_dict[u"words"].detach().cpu().numpy(),
                                                numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict[u"characters"].detach().cpu().numpy(),
                                                numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                                                             [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    def test_printing_doesnt_crash(self):
        field = TextField([Token(t) for t in [u"A", u"sentence"]],
                          {u"words": SingleIdTokenIndexer(namespace=u"words")})
        print(field)

    def test_token_embedder_returns_dict(self):
        field = TextField([Token(t) for t in [u"A", u"sentence"]],
                          token_indexers={u"field_with_dict": DictReturningTokenIndexer(),
                                          u"words": SingleIdTokenIndexer(u"words"),
                                          u"characters": TokenCharactersIndexer(u"characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
                u'token_ids': 5,
                u'additional_key': 2,
                u'words': 2,
                u'characters': 2,
                u'num_token_characters': 8
        }
        padding_lengths[u'token_ids'] = 7
        padding_lengths[u'additional_key'] = 3
        padding_lengths[u'words'] = 4
        padding_lengths[u'characters'] = 4
        tensors = field.as_tensor(padding_lengths)
        assert list(tensors[u'token_ids'].shape) == [7]
        assert list(tensors[u'additional_key'].shape) == [3]
        assert list(tensors[u'words'].shape) == [4]
        assert list(tensors[u'characters'].shape) == [4, 8]
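
A note on the index values asserted throughout the example above (standard AllenNLP vocabulary behaviour, shown as a small sketch): every padded namespace reserves index 0 for padding and index 1 for @@UNKNOWN@@, so the tokens added in setUp start at index 2 and OOV lookups resolve to index 1.

vocab = Vocabulary()
a_index = vocab.add_token_to_namespace(u"A", namespace=u'characters')
s_index = vocab.add_token_to_namespace(u"s", namespace=u'characters')
assert (a_index, s_index) == (2, 3)  # 0 = padding, 1 = @@UNKNOWN@@
assert vocab.get_token_index(u"z", namespace=u'characters') == 1  # OOV character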
Example #32
class FancyIteratorTest(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.token_indexers = {"tokens": SingleIdTokenIndexer()}
        self.vocab = Vocabulary()
        self.this_index = self.vocab.add_token_to_namespace('this')
        self.is_index = self.vocab.add_token_to_namespace('is')
        self.a_index = self.vocab.add_token_to_namespace('a')
        self.sentence_index = self.vocab.add_token_to_namespace('sentence')
        self.another_index = self.vocab.add_token_to_namespace('another')
        self.yet_index = self.vocab.add_token_to_namespace('yet')
        self.very_index = self.vocab.add_token_to_namespace('very')
        self.long_index = self.vocab.add_token_to_namespace('long')
        instances = [
            self.create_instance(["this", "is", "a", "sentence"]),
            self.create_instance(["this", "is", "another", "sentence"]),
            self.create_instance(["yet", "another", "sentence"]),
            self.create_instance([
                "this", "is", "a", "very", "very", "very", "very", "long",
                "sentence"
            ]),
            self.create_instance(["sentence"]),
        ]

        self.instances = instances

    def create_instance(self, str_tokens: List[str]):
        tokens = [Token(t) for t in str_tokens]
        instance = Instance({'source': TextField(tokens, self.token_indexers)})
        return instance

    def test_truncate(self):
        # Checks that the truncate parameter works as intended.

        # Since split size is less than the length of the "very ... very long" sentence, the
        # iterator should return one batch when the truncation is enabled.
        split_size = 4
        truncated_iterator = FancyIterator(batch_size=5,
                                           split_size=split_size,
                                           splitting_keys=['source'],
                                           truncate=True)
        truncated_iterator.index_with(self.vocab)
        batches = list(truncated_iterator(self.instances, num_epochs=1))
        assert len(batches) == 1

        # When truncation is disabled the iterator should return 3 batches instead.
        non_truncated_iterator = FancyIterator(batch_size=5,
                                               split_size=split_size,
                                               splitting_keys=['source'],
                                               truncate=False)
        non_truncated_iterator.index_with(self.vocab)
        batches = list(non_truncated_iterator(self.instances, num_epochs=1))
        assert len(batches) == 3

        # When the batch size is larger than the number of instances, truncation will cause the
        # iterator to return zero batches of data (since some of the instances in the batch would
        # consist entirely of padding). Check that the iterator raises an error in this case.
        invalid_iterator = FancyIterator(batch_size=6,
                                         split_size=split_size,
                                         splitting_keys=['source'],
                                         truncate=True)
        invalid_iterator.index_with(self.vocab)
        with self.assertRaises(ConfigurationError):
            batches = list(invalid_iterator(self.instances, num_epochs=1))

        # If truncation is disabled then this should not cause an issue
        valid_iterator = FancyIterator(batch_size=6,
                                       split_size=split_size,
                                       splitting_keys=['source'],
                                       truncate=False)
        valid_iterator.index_with(self.vocab)
        batches = list(valid_iterator(self.instances, num_epochs=1))
        assert len(batches) == 3
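
Where the batch counts above come from (a sketch of the split arithmetic only; FancyIterator's internal chunking logic is assumed, not reproduced):

import math

longest = ["this", "is", "a", "very", "very", "very", "very", "long", "sentence"]
split_size = 4
splits = [longest[i:i + split_size] for i in range(0, len(longest), split_size)]
# ceil(9 / 4) = 3 splits, hence three batches without truncation and a single
# batch (the first split only) when truncate=True.
assert len(splits) == math.ceil(len(longest) / split_size) == 3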
Example #33
class RelaxedBeamSearchTest(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.vocab = Vocabulary(non_padded_namespaces=['tokens'])
        for i in range(transition_probabilities.size(0)):
            self.vocab.add_token_to_namespace(str(i))
        self.end_symbol = str(transition_probabilities.size()[0] - 1)
        self.end_index = transition_probabilities.size()[0] - 1
        # Ensure the end symbol has the expected index
        assert self.end_index == self.vocab.get_token_index(self.end_symbol)
        self.beam_search = RelaxedBeamSearch(self.vocab,
                                             beam_size=3,
                                             end_symbol=self.end_symbol,
                                             max_steps=10)

        # This is what the top k should look like for each item in the batch.
        self.expected_top_k = [
            np.array([1, 2, 3, 4, 5]),
            np.array([2, 3, 4, 5]),
            np.array([3, 4, 5])
        ]

        # This is what the log probs should look like for each item in the batch.
        self.expected_log_probs = np.log(np.array([0.4, 0.3, 0.2]))  # pylint: disable=assignment-from-no-return

    def _check_results(self,
                       batch_size: int = 5,
                       expected_top_k: np.ndarray = None,
                       expected_log_probs: np.ndarray = None,
                       beam_search: RelaxedBeamSearch = None,
                       state: Dict[str, torch.Tensor] = None,
                       step: StepFunctionType = None,
                       rtol: float = 1e-7) -> None:
        expected_top_k = expected_top_k if expected_top_k is not None else self.expected_top_k
        expected_log_probs = expected_log_probs if expected_log_probs is not None else self.expected_log_probs
        state = state or {}
        step = step or take_step

        beam_search = beam_search or self.beam_search
        beam_size = beam_search.beam_size

        initial_predictions = torch.tensor([0] * batch_size)  # pylint: disable=not-callable
        top_k, log_probs = beam_search.search(initial_predictions, state,
                                              step)  # type: ignore

        assert len(top_k) == batch_size
        assert len(log_probs) == batch_size
        for i in range(batch_size):
            assert len(top_k[i]) == beam_size
            assert len(log_probs[i]) == beam_size
            for j in range(beam_size):
                np.testing.assert_array_equal(top_k[i][j].numpy(),
                                              expected_top_k[j])
                np.testing.assert_allclose(log_probs[i][j].numpy(),
                                           expected_log_probs[j],
                                           rtol=rtol)

    def test_search(self):
        self._check_results()

    def test_finished_state(self):
        state = {}
        state["foo"] = torch.tensor(  # pylint: disable=not-callable
            [[1, 0, 1], [2, 0, 1], [0, 0, 1], [1, 1, 1], [0, 0, 0]])
        # shape: (batch_size, 3)

        expected_finished_state = {}
        expected_finished_state["foo"] = np.array([[1, 0, 1], [1, 0, 1],
                                                   [1, 0, 1], [2, 0, 1],
                                                   [2, 0, 1], [2, 0, 1],
                                                   [0, 0, 1], [0, 0, 1],
                                                   [0, 0, 1], [1, 1, 1],
                                                   [1, 1, 1], [1, 1, 1],
                                                   [0, 0, 0], [0, 0, 0],
                                                   [0, 0, 0]])
        # shape: (batch_size x beam_size, 3)

        self._check_results(state=state)

        # check finished state.
        for key, array in expected_finished_state.items():
            np.testing.assert_allclose(state[key].numpy(), array)

    def test_batch_size_of_one(self):
        self._check_results(batch_size=1)

    def test_greedy_search(self):
        beam_search = RelaxedBeamSearch(self.vocab,
                                        beam_size=1,
                                        end_symbol=self.end_symbol)
        expected_top_k = np.array([[1, 2, 3, 4, 5]])
        expected_log_probs = np.log(np.array([0.4]))  # pylint: disable=assignment-from-no-return
        self._check_results(expected_top_k=expected_top_k,
                            expected_log_probs=expected_log_probs,
                            beam_search=beam_search)

    def test_catch_bad_config(self):
        """
        If `per_node_beam_size` (which defaults to `beam_size`) is larger than
        the size of the target vocabulary, `BeamSearch.search` should raise
        a ConfigurationError.
        """
        beam_search = RelaxedBeamSearch(self.vocab,
                                        beam_size=20,
                                        end_symbol=self.end_symbol)
        with pytest.raises(ConfigurationError):
            self._check_results(beam_search=beam_search)

    def test_warn_for_bad_log_probs(self):
        # The only valid next step from the initial predictions is the end index.
        # But with a beam size of 3, the call to `topk` to find the 3 most likely
        # next beams will produce 2 new beams that are invalid, in that they have
        # a probability of 0. The beam search should warn us about this.
        initial_predictions = torch.LongTensor(
            [self.end_index - 1, self.end_index - 1])
        with pytest.warns(RuntimeWarning, match="Infinite log probabilities"):
            self.beam_search.search(initial_predictions, {}, take_step)

    def test_empty_sequences(self):
        initial_predictions = torch.LongTensor(
            [self.end_index - 1, self.end_index - 1])
        beam_search = RelaxedBeamSearch(self.vocab,
                                        beam_size=1,
                                        end_symbol=self.end_symbol)
        with pytest.warns(RuntimeWarning, match="Empty sequences predicted"):
            predictions, log_probs = beam_search.search(
                initial_predictions, {}, take_step)
        # predictions should have shape `(batch_size, beam_size, max_predicted_length)`.
        assert list(predictions.size()) == [2, 1, 1]
        # log probs should have shape `(batch_size, beam_size)`.
        assert list(log_probs.size()) == [2, 1]
        assert (predictions == self.end_index).all()
        assert (log_probs == 0).all()

    def test_min_steps_warn_for_bad_log_probs(self):
        initial_predictions = torch.LongTensor([0] * 2)
        beam_search = RelaxedBeamSearch(self.vocab,
                                        beam_size=1,
                                        end_symbol=self.end_symbol,
                                        min_steps=5)
        with pytest.warns(RuntimeWarning, match="Infinite log probabilities"):
            beam_search.search(initial_predictions, {}, take_step)

    def test_length_penalizer(self):
        # This is an extreme value for the Wu penalizer just to force
        # the outputs to switch order
        length_penalizer = WuLengthPenalizer(-10)
        beam_search = RelaxedBeamSearch(self.vocab,
                                        beam_size=3,
                                        end_symbol=self.end_symbol,
                                        max_steps=10,
                                        length_penalizer=length_penalizer)
        # The outputs are in the opposite order than expected
        expected_top_k = [
            np.array([3, 4, 5]),
            np.array([2, 3, 4, 5]),
            np.array([1, 2, 3, 4, 5])
        ]
        expected_log_probs = np.log(np.array([0.2, 0.3, 0.4]))
        self._check_results(expected_top_k=expected_top_k,
                            expected_log_probs=expected_log_probs,
                            beam_search=beam_search,
                            step=take_step)
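
Why the order flips above (a sketch that assumes WuLengthPenalizer implements the GNMT length penalty of Wu et al. 2016, lp(Y) = ((5 + |Y|) / 6)^alpha; the actual implementation is not shown in this example):

import numpy as np

def wu_length_penalty(length, alpha):
    return ((5.0 + length) / 6.0) ** alpha

# Divide each sequence's log probability by its penalty, as length-penalized
# beam search does. With alpha = -10 the penalty shrinks rapidly with length,
# so longer sequences score far worse and the expected ranking reverses.
scores = {length: np.log(prob) / wu_length_penalty(length, alpha=-10)
          for length, prob in [(5, 0.4), (4, 0.3), (3, 0.2)]}
assert scores[3] > scores[4] > scores[5]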
Example #34
    @classmethod
    def _create_vocab(cls) -> Vocabulary:
        vocab = Vocabulary()
        vocab.add_token_to_namespace("O", "labels")
        vocab.add_token_to_namespace("B-Tag", "labels")
        vocab.add_token_to_namespace("I-Tag", "labels")
        return vocab
Example #35
class TestBasicTextFieldEmbedder(AllenNlpTestCase):
    def setUp(self):
        super(TestBasicTextFieldEmbedder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1")
        self.vocab.add_token_to_namespace("2")
        self.vocab.add_token_to_namespace("3")
        self.vocab.add_token_to_namespace("4")
        params = Params({
            "words1": {
                "type": "embedding",
                "embedding_dim": 2
            },
            "words2": {
                "type": "embedding",
                "embedding_dim": 5
            },
            "words3": {
                "type": "embedding",
                "embedding_dim": 3
            }
        })
        self.token_embedder = BasicTextFieldEmbedder.from_params(
            self.vocab, params)
        self.inputs = {
            "words1": Variable(torch.LongTensor([[0, 2, 3, 5]])),
            "words2": Variable(torch.LongTensor([[1, 4, 3, 2]])),
            "words3": Variable(torch.LongTensor([[1, 5, 1, 2]]))
        }

    def test_get_output_dim_aggregates_dimension_from_each_embedding(self):
        assert self.token_embedder.get_output_dim() == 10

    def test_forward_asserts_input_field_match(self):
        self.inputs['words4'] = self.inputs['words3']
        del self.inputs['words3']
        with pytest.raises(ConfigurationError):
            self.token_embedder(self.inputs)
        self.inputs['words3'] = self.inputs['words4']
        del self.inputs['words4']

    def test_forward_concats_resultant_embeddings(self):
        assert self.token_embedder(self.inputs).size() == (1, 4, 10)

    def test_forward_works_on_higher_order_input(self):
        params = Params({
            "words": {
                "type": "embedding",
                "num_embeddings": 20,
                "embedding_dim": 2,
            },
            "characters": {
                "type": "character_encoding",
                "embedding": {
                    "embedding_dim": 4,
                    "num_embeddings": 15,
                },
                "encoder": {
                    "type": "cnn",
                    "embedding_dim": 4,
                    "num_filters": 10,
                    "ngram_filter_sizes": [3],
                },
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
            'words': Variable(torch.rand(3, 4, 5, 6) * 20).long(),
            'characters': Variable(torch.rand(3, 4, 5, 6, 7) * 15).long(),
        }
        assert token_embedder(inputs,
                              num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)
Example #36
    def setUp(self):
        super(SpanBasedF1Test, self).setUp()
        vocab = Vocabulary()
        vocab.add_token_to_namespace("O", "tags")
        vocab.add_token_to_namespace("B-ARG1", "tags")
        vocab.add_token_to_namespace("I-ARG1", "tags")
        vocab.add_token_to_namespace("B-ARG2", "tags")
        vocab.add_token_to_namespace("I-ARG2", "tags")
        vocab.add_token_to_namespace("B-V", "tags")
        vocab.add_token_to_namespace("I-V", "tags")
        vocab.add_token_to_namespace("U-ARG1", "tags")
        vocab.add_token_to_namespace("U-ARG2", "tags")
        vocab.add_token_to_namespace("B-C-ARG1", "tags")
        vocab.add_token_to_namespace("I-C-ARG1", "tags")
        vocab.add_token_to_namespace("B-ARGM-ADJ", "tags")
        vocab.add_token_to_namespace("I-ARGM-ADJ", "tags")

        self.vocab = vocab
Example #37
import json
import argparse

from allennlp.data import Vocabulary

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--ontology-path', type=str, required=True)
    parser.add_argument('--output-path', type=str, required=True)
    args = parser.parse_args()

    with open(args.ontology_path) as f:
        ontology = json.load(f)

    vocab = Vocabulary()
    vocab.add_token_to_namespace(token='None', namespace='span_labels')
    vocab.add_token_to_namespace(token='@@PADDING@@', namespace='span_labels')
    vocab.add_tokens_to_namespace(tokens=list(ontology['args'].keys()),
                                  namespace='span_labels')
    vocab.add_tokens_to_namespace(tokens=list(ontology['events'].keys()),
                                  namespace='event_labels')
    vocab.save_to_files(args.output_path)
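
A hypothetical minimal input for the script above (the ontology layout is inferred from the two key lookups; all names are made up):

import json

ontology = {"args": {"ARG0": {}, "ARG1": {}},
            "events": {"Attack": {}, "Transport": {}}}
with open('ontology.json', 'w') as f:
    json.dump(ontology, f)
# Run with --ontology-path ontology.json --output-path vocab/: ARG0 and ARG1
# join the 'span_labels' namespace after 'None' and '@@PADDING@@', Attack and
# Transport go to 'event_labels', and the vocabulary files are written out.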
Example #38
    def __init__(
            self,
            vocab: Vocabulary,
            text_field_embedder: TextFieldEmbedder,
            title_encoder: Seq2VecEncoder,
            abstract_encoder: Seq2VecEncoder,
            venue_encoder: Seq2VecEncoder,
            body_encoder: Seq2VecEncoder = None,
            predict_mode: bool = False,
            author_text_embedder: TextFieldEmbedder = None,
            venue_field_embedder: TextFieldEmbedder = None,
            author_text_encoder: Seq2VecEncoder = None,
            # author_id_embedder: Optional[Embedding] = None,
            author_id_embedder: TextFieldEmbedder = None,
            # author_position_embedder: Optional[Embedding] = None,
            author_position_embedder: TextFieldEmbedder = None,
            feedforward: FeedForward = None,
            author_feedforward: FeedForward = None,
            initializer: InitializerApplicator = InitializerApplicator(),
            regularizer: Optional[RegularizerApplicator] = None,
            max_num_authors: Optional[int] = 5,
            dropout: Optional[float] = None,
            ignore_authors: Optional[bool] = False,
            layer_norm: Optional[bool] = True,
            embedding_layer_norm: Optional[bool] = False,
            loss_distance: Optional[str] = 'l2-norm',
            loss_margin: Optional[float] = 1,
            bert_finetune: Optional[bool] = False,
            include_venue: Optional[bool] = False) -> None:
        super(Specter, self).__init__(vocab, regularizer)

        for lbl in range(max_num_authors):
            vocab.add_token_to_namespace(token=str(lbl),
                                         namespace='author_positions')

        self.text_field_embedder = text_field_embedder
        self.venue_field_embedder = venue_field_embedder
        self.title_encoder = title_encoder
        self.abstract_encoder = abstract_encoder
        self.body_encoder = body_encoder
        self.venue_encoder = venue_encoder

        self.predict_mode = predict_mode

        self.feedforward = feedforward

        if loss_distance == 'l2-norm':
            self.loss = torch.nn.TripletMarginLoss(margin=loss_margin,
                                                   reduction='none')
        elif loss_distance == 'binary':
            self.loss = BinaryLoss(margin=loss_margin)
        else:
            self.loss = TripletLoss(margin=loss_margin,
                                    distance=loss_distance,
                                    reduction='none')

        if layer_norm:
            self.layer_norm = LayerNorm(self.feedforward.get_output_dim())
        self.do_layer_norm = layer_norm

        # self.layer_norm_author_embedding = LayerNorm(author_feedforward.get_output_dim())

        if embedding_layer_norm:
            self.layer_norm_word_embedding = LayerNorm(
                self.title_encoder.get_input_dim())
            self.layer_norm_word_embedding_venue = LayerNorm(
                self.venue_encoder.get_input_dim())
        self.embedding_layer_norm = embedding_layer_norm

        self.dropout = Dropout(dropout) if dropout is not None else Dropout()

        self.ignore_authors = ignore_authors

        if not ignore_authors:
            self.author_id_embedder = author_id_embedder
            self.author_position_embedder = author_position_embedder
            self.author_text_embedder = author_text_embedder
            self.author_text_encoder = author_text_encoder
            # author representation would be a concatenation of author-id and author-position
            # [batch, num-authors, auth-dim + position-dim]
            # we apply timedistributed mlp on top to make this a:
            # [batch, num-authors, dim]
            self.author_time_dist_ff = TimeDistributed(author_feedforward)

        # Internal flag indicating whether the title/abstract should be encoded with a
        # transformer. Do not change it here: it must stay `False` in this class; the
        # inheriting `PaperRepresentationTransoformer` class sets it to `True` in its
        # constructor to enable transformer encoding of the title/abstract.
        self.tansformer_encoder = False

        self.bert_finetune = bert_finetune
        self.include_venue = include_venue

        initializer(self)
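
A minimal sketch of the shape bookkeeping described in the author-representation comment above (all sizes are made up for illustration): TimeDistributed applies the author feed-forward to each author slot independently, mapping [batch, num-authors, auth-dim + position-dim] to [batch, num-authors, dim].

import torch
from allennlp.modules import FeedForward, TimeDistributed

batch, num_authors, auth_dim, pos_dim = 2, 5, 8, 4
# Concatenate per-author id and position embeddings along the last dimension.
author_repr = torch.cat([torch.randn(batch, num_authors, auth_dim),
                         torch.randn(batch, num_authors, pos_dim)], dim=-1)
author_mlp = TimeDistributed(FeedForward(input_dim=auth_dim + pos_dim,
                                         num_layers=1, hidden_dims=16,
                                         activations=torch.nn.ReLU()))
assert author_mlp(author_repr).size() == (2, 5, 16)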
Example #39
class TestProductionRuleField(AllenNlpTestCase):
    def setUp(self):
        super(TestProductionRuleField, self).setUp()
        self.vocab = Vocabulary()
        self.s_rule_index = self.vocab.add_token_to_namespace("S -> [NP, VP]", namespace='rule_labels')
        self.np_index = self.vocab.add_token_to_namespace("NP -> test", namespace='rule_labels')

    def test_field_counts_vocab_items_correctly(self):
        field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["rule_labels"]["S -> [NP, VP]"] == 1

        field = ProductionRuleField('S -> [NP, VP]', is_global_rule=False)
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["rule_labels"]["S -> [NP, VP]"] == 0

    def test_index_converts_field_correctly(self):
        field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
        field.index(self.vocab)
        assert field._rule_id == self.s_rule_index

    def test_padding_lengths_are_computed_correctly(self):
        field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
        field.index(self.vocab)
        assert field.get_padding_lengths() == {}

    def test_as_tensor_produces_correct_output(self):
        field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
        field.index(self.vocab)
        tensor_tuple = field.as_tensor(field.get_padding_lengths())
        assert isinstance(tensor_tuple, tuple)
        assert len(tensor_tuple) == 4
        assert tensor_tuple[0] == 'S -> [NP, VP]'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.s_rule_index])

        field = ProductionRuleField('S -> [NP, VP]', is_global_rule=False)
        field.index(self.vocab)
        tensor_tuple = field.as_tensor(field.get_padding_lengths())
        assert isinstance(tensor_tuple, tuple)
        assert len(tensor_tuple) == 4
        assert tensor_tuple[0] == 'S -> [NP, VP]'
        assert tensor_tuple[1] is False
        assert tensor_tuple[2] is None

    def test_batch_tensors_does_not_modify_list(self):
        field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict1 = field.as_tensor(padding_lengths)

        field = ProductionRuleField('NP -> test', is_global_rule=True)
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict2 = field.as_tensor(padding_lengths)
        tensor_list = [tensor_dict1, tensor_dict2]
        assert field.batch_tensors(tensor_list) == tensor_list

    def test_doubly_nested_field_works(self):
        field1 = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
        field2 = ProductionRuleField('NP -> test', is_global_rule=True)
        field3 = ProductionRuleField('VP -> eat', is_global_rule=False)
        list_field = ListField([ListField([field1, field2, field3]),
                                ListField([field1, field2])])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensors = list_field.as_tensor(padding_lengths)
        assert isinstance(tensors, list)
        assert len(tensors) == 2
        assert isinstance(tensors[0], list)
        assert len(tensors[0]) == 3
        assert isinstance(tensors[1], list)
        assert len(tensors[1]) == 3

        tensor_tuple = tensors[0][0]
        assert tensor_tuple[0] == 'S -> [NP, VP]'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.s_rule_index])

        tensor_tuple = tensors[0][1]
        assert tensor_tuple[0] == 'NP -> test'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.np_index])

        tensor_tuple = tensors[0][2]
        assert tensor_tuple[0] == 'VP -> eat'
        assert tensor_tuple[1] is False
        assert tensor_tuple[2] is None

        tensor_tuple = tensors[1][0]
        assert tensor_tuple[0] == 'S -> [NP, VP]'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.s_rule_index])

        tensor_tuple = tensors[1][1]
        assert tensor_tuple[0] == 'NP -> test'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.np_index])

        # This item was just padding.
        tensor_tuple = tensors[1][2]
        assert tensor_tuple[0] == ''
        assert tensor_tuple[1] is False
        assert tensor_tuple[2] is None

    def test_production_rule_field_can_print(self):
        field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
        print(field)
Example #40
def _read_embeddings_from_text_file(
        file_uri: str,
        embedding_dim: int,
        vocab: Vocabulary,
        namespace: str = "tokens",
        min_pretrained_embeddings: int = 0) -> torch.FloatTensor:
    """
    Read pre-trained word vectors from an eventually compressed text file, possibly contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...

    Lines that contain more numerical tokens than ``embedding_dim`` raise a warning and are skipped.

    The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``.
    """
    tokens_to_keep = set(
        vocab.get_index_to_token_vocabulary(namespace).values())
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")

    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for index, line in Tqdm.tqdm(enumerate(embeddings_file)):
            token = line.split(' ', 1)[0]
            if token in tokens_to_keep or index < min_pretrained_embeddings:
                fields = line.rstrip().split(' ')
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column).  We skip those lines.  Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped.  It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        embedding_dim,
                        len(fields) - 1, line)
                    continue

                vector = numpy.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector
                if token not in tokens_to_keep:
                    vocab.add_token_to_namespace(token, namespace)

    vocab_size = vocab.get_vocab_size(namespace)

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary")

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std)
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        else:
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.",
                token)

    logger.info("Pretrained embeddings were found for %d out of %d tokens",
                num_tokens_found, vocab_size)

    return embedding_matrix
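
A usage sketch for the reader above (the toy file and token names are hypothetical; the file follows the space-separated [word] [dim 1] [dim 2] ... layout from the docstring):

with open('toy.3d.txt', 'w', encoding='utf-8') as f:
    f.write('think 0.1 0.2 0.3\n')
    f.write('make 0.5 0.4 0.3\n')

vocab = Vocabulary()
vocab.add_token_to_namespace('think')    # present in the file: gets its vector
vocab.add_token_to_namespace('missing')  # absent: keeps its random N(mean, std) row

weight = _read_embeddings_from_text_file('toy.3d.txt', embedding_dim=3, vocab=vocab)
# One row per vocabulary entry (including padding and @@UNKNOWN@@), 3 columns each.
assert weight.shape == (vocab.get_vocab_size('tokens'), 3)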
Example #41
class TestBasicTextFieldEmbedder(AllenNlpTestCase):
    def setUp(self):
        super(TestBasicTextFieldEmbedder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1")
        self.vocab.add_token_to_namespace("2")
        self.vocab.add_token_to_namespace("3")
        self.vocab.add_token_to_namespace("4")
        params = Params({
                "token_embedders": {
                        "words1": {
                                "type": "embedding",
                                "embedding_dim": 2
                                },
                        "words2": {
                                "type": "embedding",
                                "embedding_dim": 5
                                },
                        "words3": {
                                "type": "embedding",
                                "embedding_dim": 3
                                }
                        }
                })
        self.token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params)
        self.inputs = {
                "words1": torch.LongTensor([[0, 2, 3, 5]]),
                "words2": torch.LongTensor([[1, 4, 3, 2]]),
                "words3": torch.LongTensor([[1, 5, 1, 2]])
                }

    def test_get_output_dim_aggregates_dimension_from_each_embedding(self):
        assert self.token_embedder.get_output_dim() == 10

    def test_forward_asserts_input_field_match(self):
        # Total mismatch
        self.inputs['words4'] = self.inputs['words3']
        del self.inputs['words3']
        with pytest.raises(ConfigurationError) as exc:
            self.token_embedder(self.inputs)
        assert exc.match("Mismatched token keys")

        self.inputs['words3'] = self.inputs['words4']

        # Text field has too many inputs
        with pytest.raises(ConfigurationError) as exc:
            self.token_embedder(self.inputs)
        assert exc.match("is generating more keys")

        del self.inputs['words4']


    def test_forward_concats_resultant_embeddings(self):
        assert self.token_embedder(self.inputs).size() == (1, 4, 10)

    def test_forward_works_on_higher_order_input(self):
        params = Params({
                "token_embedders": {
                        "words": {
                                "type": "embedding",
                                "num_embeddings": 20,
                                "embedding_dim": 2,
                                },
                        "characters": {
                                "type": "character_encoding",
                                "embedding": {
                                        "embedding_dim": 4,
                                        "num_embeddings": 15,
                                        },
                                "encoder": {
                                        "type": "cnn",
                                        "embedding_dim": 4,
                                        "num_filters": 10,
                                        "ngram_filter_sizes": [3],
                                        },
                                }
                        }
                })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params)
        inputs = {
                'words': (torch.rand(3, 4, 5, 6) * 20).long(),
                'characters': (torch.rand(3, 4, 5, 6, 7) * 15).long(),
                }
        assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)

    def test_forward_runs_with_non_bijective_mapping(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
        options_file = str(elmo_fixtures_path / 'options.json')
        weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
        params = Params({
                "token_embedders": {
                        "words": {
                                "type": "embedding",
                                "num_embeddings": 20,
                                "embedding_dim": 2,
                                },
                        "elmo": {
                                "type": "elmo_token_embedder",
                                "options_file": options_file,
                                "weight_file": weight_file
                                },
                        },
                "embedder_to_indexer_map": {"words": ["words"], "elmo": ["elmo", "words"]}
                })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
                'words': (torch.rand(3, 6) * 20).long(),
                'elmo': (torch.rand(3, 6, 50) * 15).long(),
                }
        token_embedder(inputs)

    def test_old_from_params_new_from_params(self):
        old_params = Params({
                "words1": {
                        "type": "embedding",
                        "embedding_dim": 2
                        },
                "words2": {
                        "type": "embedding",
                        "embedding_dim": 5
                        },
                "words3": {
                        "type": "embedding",
                        "embedding_dim": 3
                        }
                })

        # Allow loading the parameters in the old format
        with pytest.warns(DeprecationWarning):
            old_embedder = BasicTextFieldEmbedder.from_params(params=old_params, vocab=self.vocab)

        new_params = Params({
                "token_embedders": {
                        "words1": {
                                "type": "embedding",
                                "embedding_dim": 2
                                },
                        "words2": {
                                "type": "embedding",
                                "embedding_dim": 5
                                },
                        "words3": {
                                "type": "embedding",
                                "embedding_dim": 3
                                }
                        }
                })

        # But also allow loading the parameters in the new format
        new_embedder = BasicTextFieldEmbedder.from_params(params=new_params, vocab=self.vocab)
        assert old_embedder._token_embedders.keys() == new_embedder._token_embedders.keys() #pylint: disable=protected-access

        assert new_embedder(self.inputs).size() == (1, 4, 10)
Example #42
    def test_min_padding_length(self):
        sentence = "AllenNLP is awesome ."
        tokens = [Token(token) for token in sentence.split(" ")]
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace="characters")  # 2
        vocab.add_token_to_namespace("l", namespace="characters")  # 3
        vocab.add_token_to_namespace("e", namespace="characters")  # 4
        vocab.add_token_to_namespace("n", namespace="characters")  # 5
        vocab.add_token_to_namespace("N", namespace="characters")  # 6
        vocab.add_token_to_namespace("L", namespace="characters")  # 7
        vocab.add_token_to_namespace("P", namespace="characters")  # 8
        vocab.add_token_to_namespace("i", namespace="characters")  # 9
        vocab.add_token_to_namespace("s", namespace="characters")  # 10
        vocab.add_token_to_namespace("a", namespace="characters")  # 11
        vocab.add_token_to_namespace("w", namespace="characters")  # 12
        vocab.add_token_to_namespace("o", namespace="characters")  # 13
        vocab.add_token_to_namespace("m", namespace="characters")  # 14
        vocab.add_token_to_namespace(".", namespace="characters")  # 15

        indexer = TokenCharactersIndexer("characters", min_padding_length=10)
        indices = indexer.tokens_to_indices(tokens, vocab, "char")
        key_padding_lengths = "num_token_characters"
        value_padding_lengths = 0
        for token in indices["char"]:
            item = indexer.get_padding_lengths(token)
            value = item.values()
            value_padding_lengths = max(value_padding_lengths, max(value))
        padded = indexer.pad_token_sequence(
            indices, {"char": len(indices["char"])},
            {key_padding_lengths: value_padding_lengths})
        assert padded == {
            "char": [[2, 3, 3, 4, 5, 6, 7, 8, 0, 0],
                     [9, 10, 0, 0, 0, 0, 0, 0, 0, 0],
                     [11, 12, 4, 10, 13, 14, 4, 0, 0, 0],
                     [15, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
        }
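
Example #42 checks that TokenCharactersIndexer enforces a character-padding
floor: min_padding_length=10 makes every row above 10 ids wide even though
the longest token, "AllenNLP", has only 8 characters. The floor itself is a
one-liner; a pure-Python sketch (hypothetical helper, values assumed):

def pad_characters(char_ids, longest_token, min_padding_length, pad_id=0):
    # Pad one token's character ids to the larger of the observed maximum
    # token length and the configured floor.
    width = max(longest_token, min_padding_length)
    return char_ids + [pad_id] * (width - len(char_ids))

assert pad_characters([9, 10], longest_token=8, min_padding_length=10) == \
       [9, 10, 0, 0, 0, 0, 0, 0, 0, 0]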
class TestBasicTextFieldEmbedder(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1")
        self.vocab.add_token_to_namespace("2")
        self.vocab.add_token_to_namespace("3")
        self.vocab.add_token_to_namespace("4")
        params = Params({
            "token_embedders": {
                "words1": {
                    "type": "embedding",
                    "embedding_dim": 2
                },
                "words2": {
                    "type": "embedding",
                    "embedding_dim": 5
                },
                "words3": {
                    "type": "embedding",
                    "embedding_dim": 3
                },
            }
        })
        self.token_embedder = BasicTextFieldEmbedder.from_params(
            vocab=self.vocab, params=params)
        self.inputs = {
            "words1": {
                "tokens": torch.LongTensor([[0, 2, 3, 5]])
            },
            "words2": {
                "tokens": torch.LongTensor([[1, 4, 3, 2]])
            },
            "words3": {
                "tokens": torch.LongTensor([[1, 5, 1, 2]])
            },
        }

    def test_get_output_dim_aggregates_dimension_from_each_embedding(self):
        assert self.token_embedder.get_output_dim() == 10

    def test_forward_asserts_input_field_match(self):
        # Total mismatch
        self.inputs["words4"] = self.inputs["words3"]
        del self.inputs["words3"]
        with pytest.raises(ConfigurationError) as exc:
            self.token_embedder(self.inputs)
        assert exc.match("Mismatched token keys")

        self.inputs["words3"] = self.inputs["words4"]

        # Text field has too many inputs
        with pytest.raises(ConfigurationError) as exc:
            self.token_embedder(self.inputs)
        assert exc.match("Mismatched token keys")

        del self.inputs["words4"]

    def test_forward_concats_resultant_embeddings(self):
        assert self.token_embedder(self.inputs).size() == (1, 4, 10)

    def test_forward_works_on_higher_order_input(self):
        params = Params({
            "token_embedders": {
                "words": {
                    "type": "embedding",
                    "num_embeddings": 20,
                    "embedding_dim": 2
                },
                "characters": {
                    "type": "character_encoding",
                    "embedding": {
                        "embedding_dim": 4,
                        "num_embeddings": 15
                    },
                    "encoder": {
                        "type": "cnn",
                        "embedding_dim": 4,
                        "num_filters": 10,
                        "ngram_filter_sizes": [3],
                    },
                },
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab,
                                                            params=params)
        inputs = {
            "words": {
                "tokens": (torch.rand(3, 4, 5, 6) * 20).long()
            },
            "characters": {
                "token_characters": (torch.rand(3, 4, 5, 6, 7) * 15).long()
            },
        }
        assert token_embedder(inputs,
                              num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)

    def test_forward_runs_with_forward_params(self):
        class FakeEmbedder(torch.nn.Module):
            def __init__(self):
                super().__init__()

            def forward(self, tokens: torch.Tensor, extra_arg: int = None):
                assert tokens is not None
                assert extra_arg is not None
                return tokens

        token_embedder = BasicTextFieldEmbedder({"elmo": FakeEmbedder()})
        inputs = {"elmo": {"tokens": (torch.rand(3, 6, 5) * 2).long()}}
        kwargs = {"extra_arg": 1}
        token_embedder(inputs, **kwargs)

    def test_forward_runs_with_non_bijective_mapping(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / "elmo"
        options_file = str(elmo_fixtures_path / "options.json")
        weight_file = str(elmo_fixtures_path / "lm_weights.hdf5")
        params = Params({
            "token_embedders": {
                "words": {
                    "type": "embedding",
                    "num_embeddings": 20,
                    "embedding_dim": 2
                },
                "elmo": {
                    "type": "elmo_token_embedder",
                    "options_file": options_file,
                    "weight_file": weight_file,
                },
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab,
                                                            params=params)
        inputs = {
            "words": {
                "tokens": (torch.rand(3, 6) * 20).long()
            },
            "elmo": {
                "tokens": (torch.rand(3, 6, 50) * 15).long()
            },
        }
        token_embedder(inputs)

    def test_forward_runs_with_non_bijective_mapping_with_null(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / "elmo"
        options_file = str(elmo_fixtures_path / "options.json")
        weight_file = str(elmo_fixtures_path / "lm_weights.hdf5")
        params = Params({
            "token_embedders": {
                "elmo": {
                    "type": "elmo_token_embedder",
                    "options_file": options_file,
                    "weight_file": weight_file,
                }
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab,
                                                            params=params)
        inputs = {"elmo": {"tokens": (torch.rand(3, 6, 50) * 15).long()}}
        token_embedder(inputs)

    def test_forward_runs_with_non_bijective_mapping_with_dict(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / "elmo"
        options_file = str(elmo_fixtures_path / "options.json")
        weight_file = str(elmo_fixtures_path / "lm_weights.hdf5")
        params = Params({
            "token_embedders": {
                "words": {
                    "type": "embedding",
                    "num_embeddings": 20,
                    "embedding_dim": 2
                },
                "elmo": {
                    "type": "elmo_token_embedder",
                    "options_file": options_file,
                    "weight_file": weight_file,
                },
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab,
                                                            params=params)
        inputs = {
            "words": {
                "tokens": (torch.rand(3, 6) * 20).long()
            },
            "elmo": {
                "tokens": (torch.rand(3, 6, 50) * 15).long()
            },
        }
        token_embedder(inputs)

    def test_forward_runs_with_bijective_and_non_bijective_mapping(self):
        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "bert-base-uncased"
                },
                "token_characters": {
                    "type": "character_encoding",
                    "embedding": {
                        "embedding_dim": 5
                    },
                    "encoder": {
                        "type": "cnn",
                        "embedding_dim": 5,
                        "num_filters": 5,
                        "ngram_filter_sizes": [5],
                    },
                },
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab,
                                                            params=params)
        inputs = {
            "bert": {
                "token_ids": (torch.rand(3, 5) * 10).long(),
                "mask": (torch.rand(3, 5) * 1).bool(),
            },
            "token_characters": {
                "token_characters": (torch.rand(3, 5, 5) * 1).long()
            },
        }
        token_embedder(inputs)
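
Note the input structure this class feeds to forward: in newer AllenNLP
releases each token indexer returns a dict of named tensors, so the embedder
receives a two-level mapping {embedder_name: {tensor_name: tensor}} rather
than the flat {embedder_name: tensor} used by the older test class that
follows. A minimal sketch (shapes and names assumed) of assembling such an
input by hand:

import torch

# Outer key selects the token embedder; inner key names the tensor the
# matching indexer produced.
text_field_tensors = {
    "words": {"tokens": torch.randint(0, 6, (2, 7))},
    "characters": {"token_characters": torch.randint(0, 6, (2, 7, 5))},
}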
class TestBasicTextFieldEmbedder(AllenNlpTestCase):
    def setUp(self):
        super(TestBasicTextFieldEmbedder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1")
        self.vocab.add_token_to_namespace("2")
        self.vocab.add_token_to_namespace("3")
        self.vocab.add_token_to_namespace("4")
        params = Params({
            "token_embedders": {
                "words1": {
                    "type": "embedding",
                    "embedding_dim": 2
                },
                "words2": {
                    "type": "embedding",
                    "embedding_dim": 5
                },
                "words3": {
                    "type": "embedding",
                    "embedding_dim": 3
                }
            }
        })
        self.token_embedder = BasicTextFieldEmbedder.from_params(
            vocab=self.vocab, params=params)
        self.inputs = {
            "words1": torch.LongTensor([[0, 2, 3, 5]]),
            "words2": torch.LongTensor([[1, 4, 3, 2]]),
            "words3": torch.LongTensor([[1, 5, 1, 2]])
        }

    def test_get_output_dim_aggregates_dimension_from_each_embedding(self):
        assert self.token_embedder.get_output_dim() == 10

    def test_forward_asserts_input_field_match(self):
        # Total mismatch
        self.inputs['words4'] = self.inputs['words3']
        del self.inputs['words3']
        with pytest.raises(ConfigurationError) as exc:
            self.token_embedder(self.inputs)
        assert exc.match("Mismatched token keys")

        self.inputs['words3'] = self.inputs['words4']

        # Text field has too many inputs
        with pytest.raises(ConfigurationError) as exc:
            self.token_embedder(self.inputs)
        assert exc.match("is generating more keys")

        del self.inputs['words4']

    def test_forward_concats_resultant_embeddings(self):
        assert self.token_embedder(self.inputs).size() == (1, 4, 10)

    def test_forward_works_on_higher_order_input(self):
        params = Params({
            "token_embedders": {
                "words": {
                    "type": "embedding",
                    "num_embeddings": 20,
                    "embedding_dim": 2,
                },
                "characters": {
                    "type": "character_encoding",
                    "embedding": {
                        "embedding_dim": 4,
                        "num_embeddings": 15,
                    },
                    "encoder": {
                        "type": "cnn",
                        "embedding_dim": 4,
                        "num_filters": 10,
                        "ngram_filter_sizes": [3],
                    },
                }
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab,
                                                            params=params)
        inputs = {
            'words': (torch.rand(3, 4, 5, 6) * 20).long(),
            'characters': (torch.rand(3, 4, 5, 6, 7) * 15).long(),
        }
        assert token_embedder(inputs,
                              num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)

    def test_forward_runs_with_non_bijective_mapping(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
        options_file = str(elmo_fixtures_path / 'options.json')
        weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
        params = Params({
            "token_embedders": {
                "words": {
                    "type": "embedding",
                    "num_embeddings": 20,
                    "embedding_dim": 2,
                },
                "elmo": {
                    "type": "elmo_token_embedder",
                    "options_file": options_file,
                    "weight_file": weight_file
                },
            },
            "embedder_to_indexer_map": {
                "words": ["words"],
                "elmo": ["elmo", "words"]
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
            'words': (torch.rand(3, 6) * 20).long(),
            'elmo': (torch.rand(3, 6, 50) * 15).long(),
        }
        token_embedder(inputs)

    def test_old_from_params_new_from_params(self):
        old_params = Params({
            "words1": {
                "type": "embedding",
                "embedding_dim": 2
            },
            "words2": {
                "type": "embedding",
                "embedding_dim": 5
            },
            "words3": {
                "type": "embedding",
                "embedding_dim": 3
            }
        })

        # Allow loading the parameters in the old format
        with pytest.warns(DeprecationWarning):
            old_embedder = BasicTextFieldEmbedder.from_params(
                params=old_params, vocab=self.vocab)

        new_params = Params({
            "token_embedders": {
                "words1": {
                    "type": "embedding",
                    "embedding_dim": 2
                },
                "words2": {
                    "type": "embedding",
                    "embedding_dim": 5
                },
                "words3": {
                    "type": "embedding",
                    "embedding_dim": 3
                }
            }
        })

        # But also allow loading the parameters in the new format
        new_embedder = BasicTextFieldEmbedder.from_params(params=new_params,
                                                          vocab=self.vocab)
        assert old_embedder._token_embedders.keys() == new_embedder._token_embedders.keys()

        assert new_embedder(self.inputs).size() == (1, 4, 10)

    def test_extension_by_vocab(self):
        text_embedder = self.token_embedder
        vocab = self.vocab

        original_token_embedder_weight_words1 = text_embedder.token_embedder_words1.weight
        original_token_embedder_weight_words2 = text_embedder.token_embedder_words2.weight
        original_token_embedder_weight_words3 = text_embedder.token_embedder_words3.weight

        assert tuple(text_embedder.token_embedder_words1.weight.shape) == (6, 2)
        assert tuple(text_embedder.token_embedder_words2.weight.shape) == (6, 5)
        assert tuple(text_embedder.token_embedder_words3.weight.shape) == (6, 3)

        extended_inputs = {
            "words1": torch.LongTensor([[6]]),
            "words2": torch.LongTensor([[7]]),
            "words3": torch.LongTensor([[8]])
        }

        # This should give error for now.
        with pytest.raises(Exception) as _:
            text_embedder(extended_inputs)

        counter = {"tokens": {"5": 1, "6": 1, "7": 1}}
        vocab._extend(counter)

        text_embedder.extend_vocab(vocab)

        assert tuple(text_embedder.token_embedder_words1.weight.shape) == (9, 2)
        assert tuple(text_embedder.token_embedder_words2.weight.shape) == (9, 5)
        assert tuple(text_embedder.token_embedder_words3.weight.shape) == (9, 3)

        # This shouldn't give error now.
        text_embedder(extended_inputs)

        assert torch.all(text_embedder.token_embedder_words1.weight[:6, :] ==
                         original_token_embedder_weight_words1[:6, :])
        assert torch.all(text_embedder.token_embedder_words2.weight[:6, :] ==
                         original_token_embedder_weight_words2[:6, :])
        assert torch.all(text_embedder.token_embedder_words3.weight[:6, :] ==
                         original_token_embedder_weight_words3[:6, :])
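
test_extension_by_vocab checks the key invariant of vocabulary extension: the
rows of each embedding matrix that correspond to the original vocabulary are
carried over unchanged, and only the rows for new entries are freshly
initialized. A minimal torch sketch of that invariant (not the AllenNLP
implementation; the initialization scheme here is assumed):

import torch

def extend_embedding_weight(weight: torch.Tensor, new_vocab_size: int) -> torch.Tensor:
    # Keep the existing rows byte-for-byte and append freshly initialized
    # rows for the newly added vocabulary entries.
    old_size, dim = weight.shape
    extra = torch.empty(new_vocab_size - old_size, dim).normal_()
    return torch.cat([weight, extra], dim=0)

old_weight = torch.randn(6, 2)
new_weight = extend_embedding_weight(old_weight, 9)
assert new_weight.shape == (9, 2)
assert torch.all(new_weight[:6] == old_weight)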
Example #45
class TestDictField(unittest.TestCase):
    def setUp(self):
        super(TestDictField, self).setUp()

        entity_tokenizer = WordTokenizer(
            word_splitter=JustSpacesWordSplitter())

        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("entity1", "entity")
        self.vocab.add_token_to_namespace("entity2", "entity")
        self.vocab.add_token_to_namespace("entity3", "entity")
        self.entity_indexer = {
            "entity":
            TokenCharactersIndexerTokenizer(
                "entity", character_tokenizer=entity_tokenizer)
        }

        tokens1 = "The sentence .".split()
        tokens_field = TextField(
            [Token(t) for t in tokens1],
            token_indexers={'tokens': SingleIdTokenIndexer()})

        self.instance1_fields = {
            "candidate_entities":
            TextField([Token("entity1 entity2"),
                       Token("entity_unk")],
                      token_indexers=self.entity_indexer),
            "candidate_entity_prior":
            ArrayField(np.array([[0.5, 0.5], [1.0, 0.0]])),
            "candidate_spans":
            ListField(
                [SpanField(0, 0, tokens_field),
                 SpanField(1, 2, tokens_field)])
        }

        tokens2 = "The sentence".split()
        tokens2_field = TextField(
            [Token(t) for t in tokens2],
            token_indexers={'tokens': SingleIdTokenIndexer()})

        self.instance2_fields = {
            "candidate_entities":
            TextField([Token("entity1")], token_indexers=self.entity_indexer),
            "candidate_entity_prior":
            ArrayField(np.array([[1.0]])),
            "candidate_spans":
            ListField([SpanField(1, 1, tokens2_field)], )
        }

    def test_get_padding_lengths(self):
        field = DictField(self.instance1_fields)
        field.index(self.vocab)
        lengths = field.get_padding_lengths()
        self.assertDictEqual(
            lengths, {
                'candidate_entities*entity_length': 2,
                'candidate_entities*num_token_characters': 2,
                'candidate_entities*num_tokens': 2,
                'candidate_entity_prior*dimension_0': 2,
                'candidate_entity_prior*dimension_1': 2,
                'candidate_spans*num_fields': 2
            })

    def test_dict_field_can_handle_empty(self):
        field = DictField(self.instance1_fields)
        empty = field.empty_field()
        self.assertIsNotNone(empty)

    def _check_tensors(self, tensor, expected):
        self.assertListEqual(sorted(list(tensor.keys())),
                             sorted(list(expected.keys())))
        for key in tensor.keys():
            if key == 'candidate_entities':
                a = tensor[key]['entity']
                b = expected[key]['entity']
            else:
                a = tensor[key]
                b = expected[key]
            self.assertTrue(np.allclose(a.numpy(), b.numpy()))

    def test_dict_field_as_tensor(self):
        field = DictField(self.instance1_fields)
        field.index(self.vocab)
        tensor = field.as_tensor(field.get_padding_lengths())

        expected = {
            'candidate_entities': {
                'entity': torch.tensor([[2, 3], [1, 0]])
            },
            'candidate_entity_prior':
            torch.tensor([[0.5000, 0.5000], [1.0000, 0.0000]]),
            'candidate_spans':
            torch.tensor([[0, 0], [1, 2]])
        }

        self._check_tensors(tensor, expected)

    def test_dict_field_can_iterator(self):
        from allennlp.data import Instance
        from allennlp.data.iterators import BasicIterator

        iterator = BasicIterator()
        iterator.index_with(self.vocab)

        instances = [
            Instance({"candidates": DictField(self.instance1_fields)}),
            Instance({"candidates": DictField(self.instance2_fields)})
        ]

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            break

        expected_batch = {
            'candidates': {
                'candidate_entities': {
                    'entity': torch.tensor([[[2, 3], [1, 0]],
                                            [[2, 0], [0, 0]]])
                },
                'candidate_entity_prior':
                torch.tensor([[[0.5000, 0.5000], [1.0000, 0.0000]],
                              [[1.0000, 0.0000], [0.0000, 0.0000]]]),
                'candidate_spans':
                torch.tensor([[[0, 0], [1, 2]], [[1, 1], [-1, -1]]])
            }
        }

        self._check_tensors(batch['candidates'], expected_batch['candidates'])

    def test_list_field_of_dict_field(self):
        from allennlp.data import Instance
        from allennlp.data.iterators import BasicIterator

        tokens3 = "The long sentence .".split()
        tokens3_field = TextField(
            [Token(t) for t in tokens3],
            token_indexers={'tokens': SingleIdTokenIndexer()})

        instance3_fields = {
            "candidate_entities":
            TextField([
                Token("entity1 entity2 entity3"),
                Token("entity_unk"),
                Token("entity2 entity3")
            ],
                      token_indexers=self.entity_indexer),
            "candidate_entity_prior":
            ArrayField(
                np.array([[0.1, 0.1, 0.8], [1.0, 0.0, 0.0], [0.33, 0.67, 0.0]])),
            "candidate_spans":
            ListField([
                SpanField(1, 1, tokens3_field),
                SpanField(1, 2, tokens3_field),
                SpanField(1, 3, tokens3_field)
            ], )
        }

        iterator = BasicIterator()
        iterator.index_with(self.vocab)

        instances = [
            Instance({
                "candidates":
                ListField([
                    DictField(self.instance1_fields),
                    DictField(self.instance2_fields)
                ])
            }),
            Instance({
                "candidates":
                ListField([
                    DictField(self.instance1_fields),
                    DictField(instance3_fields)
                ])
            })
        ]

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            pass

        self.assertTrue(
            batch['candidates']['candidate_entities']['entity'].shape ==
            batch['candidates']['candidate_entity_prior'].shape)
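
One detail worth calling out in test_get_padding_lengths above: DictField (a
custom field from the codebase these tests come from) namespaces each child
field's padding keys with the child's name and a '*' separator, so sibling
fields that report identical keys cannot collide. A sketch of that merging
step (merge_padding_lengths is illustrative only):

def merge_padding_lengths(field_lengths: dict) -> dict:
    # field_lengths maps field_name -> {padding_key: length}; prefix every
    # key with the owning field's name to keep the merged keys unique.
    merged = {}
    for name, lengths in field_lengths.items():
        for key, value in lengths.items():
            merged[f"{name}*{key}"] = value
    return merged

assert merge_padding_lengths({"a": {"num_tokens": 2}, "b": {"num_tokens": 3}}) == \
       {"a*num_tokens": 2, "b*num_tokens": 3}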
    def setUp(self):
        super().setUp()
        vocab = Vocabulary()
        vocab.add_token_to_namespace("O", "tags")
        vocab.add_token_to_namespace("B-ARG1", "tags")
        vocab.add_token_to_namespace("I-ARG1", "tags")
        vocab.add_token_to_namespace("B-ARG2", "tags")
        vocab.add_token_to_namespace("I-ARG2", "tags")
        vocab.add_token_to_namespace("B-V", "tags")
        vocab.add_token_to_namespace("I-V", "tags")
        vocab.add_token_to_namespace("U-ARG1", "tags")
        vocab.add_token_to_namespace("U-ARG2", "tags")
        vocab.add_token_to_namespace("B-C-ARG1", "tags")
        vocab.add_token_to_namespace("I-C-ARG1", "tags")
        vocab.add_token_to_namespace("B-ARGM-ADJ", "tags")
        vocab.add_token_to_namespace("I-ARGM-ADJ", "tags")

        # BMES.
        vocab.add_token_to_namespace("B", "bmes_tags")
        vocab.add_token_to_namespace("M", "bmes_tags")
        vocab.add_token_to_namespace("E", "bmes_tags")
        vocab.add_token_to_namespace("S", "bmes_tags")

        self.vocab = vocab
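
The setUp fragment above registers BIO-style tags (plus a separate BMES
namespace) for tag-sequence tests. For orientation, a minimal sketch of
recovering contiguous spans from BIO tags, simplified from what span
utilities typically do (ill-formed I- tags are silently dropped here):

def bio_to_spans(tags):
    # Returns (label, start, end) tuples with inclusive ends.
    spans, start, label = [], None, None
    for i, tag in enumerate(tags):
        if tag.startswith("B-"):
            if start is not None:
                spans.append((label, start, i - 1))
            start, label = i, tag[2:]
        elif not (tag.startswith("I-") and tag[2:] == label):
            if start is not None:
                spans.append((label, start, i - 1))
            start, label = None, None
    if start is not None:
        spans.append((label, start, len(tags) - 1))
    return spans

assert bio_to_spans(["B-ARG1", "I-ARG1", "O", "B-V"]) == [("ARG1", 0, 1), ("V", 3, 3)]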
Example #47
class TestListField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", 'words')
        self.vocab.add_token_to_namespace("s", 'characters')
        self.vocab.add_token_to_namespace("e", 'characters')
        self.vocab.add_token_to_namespace("n", 'characters')
        self.vocab.add_token_to_namespace("t", 'characters')
        self.vocab.add_token_to_namespace("c", 'characters')
        for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']:
            self.vocab.add_token_to_namespace(label, 'labels')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters")
        }
        self.field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence"]],
            self.word_indexer)
        self.field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence"]],
            self.word_indexer)
        self.field3 = TextField(
            [Token(t) for t in ["this", "is", "another", "sentence"]],
            self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1],
                                                       self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        super(TestListField, self).setUp()

    def test_get_padding_lengths(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        lengths = list_field.get_padding_lengths()
        assert lengths == {"num_fields": 3, "list_num_tokens": 5}

    def test_list_field_can_handle_empty_text_fields(self):
        list_field = ListField(
            [self.field1, self.field2, self.empty_text_field])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(
            tensor_dict["words"].data.cpu().numpy(),
            numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [0, 0, 0, 0, 0]]))

    def test_list_field_can_handle_empty_index_fields(self):
        list_field = ListField(
            [self.index_field, self.index_field, self.empty_index_field])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor.data.cpu().numpy(),
                                         numpy.array([[1], [1], [-1]]))

    def test_list_field_can_handle_empty_sequence_label_fields(self):
        list_field = ListField([
            self.sequence_label_field, self.sequence_label_field,
            self.empty_sequence_label_field
        ])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(
            tensor.data.cpu().numpy(),
            numpy.array([[1, 1, 0, 1], [1, 1, 0, 1], [0, 0, 0, 0]]))

    def test_all_fields_padded_to_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][0].data.cpu().numpy(),
            numpy.array([2, 3, 4, 5, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][1].data.cpu().numpy(),
            numpy.array([2, 3, 4, 1, 5]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][2].data.cpu().numpy(),
            numpy.array([2, 3, 1, 5, 0]))

    def test_nested_list_fields_are_padded_correctly(self):
        nested_field1 = ListField(
            [LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']])
        nested_field2 = ListField(
            [LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']])
        list_field = ListField(
            [nested_field1.empty_field(), nested_field1, nested_field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6}
        tensor = list_field.as_tensor(padding_lengths).data.cpu().numpy()
        numpy.testing.assert_almost_equal(
            tensor,
            [[[-1], [-1], [-1], [-1], [-1], [-1]],
             [[0], [1], [2], [3], [4], [-1]],
             [[5], [6], [7], [8], [9], [10]]])

    def test_fields_can_pad_to_greater_than_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        padding_lengths["list_num_tokens"] = 7
        padding_lengths["num_fields"] = 5
        tensor_dict = list_field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][0].data.cpu().numpy(),
            numpy.array([2, 3, 4, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][1].data.cpu().numpy(),
            numpy.array([2, 3, 4, 1, 5, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][2].data.cpu().numpy(),
            numpy.array([2, 3, 1, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][3].data.cpu().numpy(),
            numpy.array([0, 0, 0, 0, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"][4].data.cpu().numpy(),
            numpy.array([0, 0, 0, 0, 0, 0, 0]))

    def test_as_tensor_can_handle_multiple_token_indexers(self):
        # pylint: disable=protected-access
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"].data.cpu().numpy()
        characters = tensor_dict["characters"].data.cpu().numpy()
        numpy.testing.assert_array_almost_equal(
            words,
            numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [2, 3, 1, 5, 0]]))

        numpy.testing.assert_array_almost_equal(
            characters[0],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

        numpy.testing.assert_array_almost_equal(
            characters[1],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0],
                         [1, 1, 1, 1, 3, 1, 3, 4, 5],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0]]))

        numpy.testing.assert_array_almost_equal(
            characters[2],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 4, 1, 5, 1, 3, 1, 0, 0],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields(self):
        # pylint: disable=protected-access
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField(
            [self.field1.empty_field(), self.field1, self.field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"].data.cpu().numpy()
        characters = tensor_dict["characters"].data.cpu().numpy()

        numpy.testing.assert_array_almost_equal(
            words,
            numpy.array([[0, 0, 0, 0, 0], [2, 3, 4, 5, 0], [2, 3, 4, 1, 5]]))

        numpy.testing.assert_array_almost_equal(characters[0],
                                                numpy.zeros([5, 9]))

        numpy.testing.assert_array_almost_equal(
            characters[1],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

        numpy.testing.assert_array_almost_equal(
            characters[2],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0],
                         [1, 1, 1, 1, 3, 1, 3, 4, 5],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0]]))

    def test_printing_doesnt_crash(self):
        list_field = ListField([self.field1, self.field2])
        print(list_field)
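
TestListField exercises two levels of padding at once: the list itself is
padded out to "num_fields" entries, and every contained TextField is padded
to the longest field's token count ("list_num_tokens"). A minimal numpy
sketch of that two-level padding (helper and shapes assumed):

import numpy

def pad_list_of_token_ids(fields, num_fields, num_tokens, pad_id=0):
    # fields is a list of per-field token-id lists; the result is a dense
    # (num_fields, num_tokens) array padded with pad_id along both axes.
    out = numpy.full((num_fields, num_tokens), pad_id, dtype=numpy.int64)
    for row, ids in enumerate(fields):
        out[row, :len(ids)] = ids
    return out

padded = pad_list_of_token_ids([[2, 3, 4, 5], [2, 3, 4, 1, 5]],
                               num_fields=3, num_tokens=5)
assert padded.tolist() == [[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [0, 0, 0, 0, 0]]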
class TestTokenCharactersEncoder(AllenNlpTestCase):
    def setUp(self):
        super(TestTokenCharactersEncoder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1", "token_characters")
        self.vocab.add_token_to_namespace("2", "token_characters")
        self.vocab.add_token_to_namespace("3", "token_characters")
        self.vocab.add_token_to_namespace("4", "token_characters")
        params = Params({
            "embedding": {
                "embedding_dim": 2,
                "vocab_namespace": "token_characters"
            },
            "encoder": {
                "type": "cnn",
                "embedding_dim": 2,
                "num_filters": 4,
                "ngram_filter_sizes": [1, 2],
                "output_dim": 3
            }
        })
        self.encoder = TokenCharactersEncoder.from_params(
            vocab=self.vocab, params=deepcopy(params))
        self.embedding = Embedding.from_params(vocab=self.vocab,
                                               params=params["embedding"])
        self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
        constant_init = Initializer.from_params(
            Params({
                "type": "constant",
                "val": 1.
            }))
        initializer = InitializerApplicator([(".*", constant_init)])
        initializer(self.encoder)
        initializer(self.embedding)
        initializer(self.inner_encoder)

    def test_get_output_dim_uses_encoder_output_dim(self):
        assert self.encoder.get_output_dim() == 3

    def test_forward_applies_embedding_then_encoder(self):
        numpy_tensor = numpy.random.randint(6, size=(3, 4, 7))
        inputs = torch.from_numpy(numpy_tensor)
        encoder_output = self.encoder(inputs)
        reshaped_input = inputs.view(12, 7)
        embedded = self.embedding(reshaped_input)
        mask = (inputs != 0).long().view(12, 7)
        reshaped_manual_output = self.inner_encoder(embedded, mask)
        manual_output = reshaped_manual_output.view(3, 4, 3)
        assert_almost_equal(encoder_output.data.numpy(),
                            manual_output.data.numpy())

    def test_char_embedding_vocab_extension_with_default_namespace(self):
        vocab = self.vocab
        character_encoder = self.encoder

        original_weight = character_encoder._embedding._module.weight
        assert tuple(original_weight.shape) == (6, 2)
        vocab.add_token_to_namespace("5", "token_characters")

        character_encoder.extend_vocab(vocab)
        extended_weight = character_encoder._embedding._module.weight
        assert tuple(extended_weight.shape) == (7, 2)

        assert torch.all(original_weight == extended_weight[:6, :])
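
test_forward_applies_embedding_then_encoder spells out the shape bookkeeping
that TokenCharactersEncoder hides: collapse the (batch, tokens) dimensions,
embed and encode the character sequences, then restore the token dimension.
The same steps as a standalone sketch (mirroring the test's manual
computation; index 0 is assumed to be padding, as in the mask above):

import torch

def encode_token_characters(embedding, encoder, char_ids):
    # char_ids: (batch, num_tokens, num_chars) integer tensor.
    batch, num_tokens, num_chars = char_ids.shape
    flat = char_ids.view(batch * num_tokens, num_chars)
    embedded = embedding(flat)         # (batch*tokens, chars, embedding_dim)
    mask = (flat != 0).long()          # zero ids are padding
    encoded = encoder(embedded, mask)  # (batch*tokens, output_dim)
    return encoded.view(batch, num_tokens, -1)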
class TestTextField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("sentence", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='characters')
        self.vocab.add_token_to_namespace("s", namespace='characters')
        self.vocab.add_token_to_namespace("e", namespace='characters')
        self.vocab.add_token_to_namespace("n", namespace='characters')
        self.vocab.add_token_to_namespace("t", namespace='characters')
        self.vocab.add_token_to_namespace("c", namespace='characters')
        super(TestTextField, self).setUp()

    def test_field_counts_vocab_items_correctly(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters":
                TokenCharactersIndexer("characters", min_padding_length=1)
            })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "words":
                SingleIdTokenIndexer("words"),
                "characters":
                TokenCharactersIndexer("characters", min_padding_length=1)
            })
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}

    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence",
                                                      namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace(
            "A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [
            capital_a_index, sentence_index
        ]

        field1 = TextField(
            [Token(t) for t in ["A", "sentence"]], {
                "characters":
                TokenCharactersIndexer(namespace="characters",
                                       min_padding_length=1)
            })
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [
                                                            s_index, e_index,
                                                            n_index, t_index,
                                                            e_index, n_index,
                                                            c_index, e_index
                                                        ]]
        field2 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "words":
                SingleIdTokenIndexer(namespace="words"),
                "characters":
                TokenCharactersIndexer(namespace="characters",
                                       min_padding_length=1)
            })
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [
            capital_a_index, sentence_index
        ]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [
                                                            s_index, e_index,
                                                            n_index, t_index,
                                                            e_index, n_index,
                                                            c_index, e_index
                                                        ]]
        # pylint: enable=protected-access

    def test_get_padding_lengths_raises_if_no_indexed_tokens(self):

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"words_length": 5, "num_tokens": 5}

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters":
                TokenCharactersIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "num_tokens": 5,
            "characters_length": 5,
            "num_token_characters": 8
        }

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters":
                TokenCharactersIndexer("characters", min_padding_length=1),
                "words":
                SingleIdTokenIndexer("words")
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "num_tokens": 5,
            "characters_length": 5,
            "words_length": 5,
            "num_token_characters": 8
        }

    def test_as_tensor_handles_words(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].detach().cpu().numpy(),
            numpy.array([1, 1, 1, 2, 1]))

    def test_as_tensor_handles_longer_lengths(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["words_length"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].detach().cpu().numpy(),
            numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]))

    def test_as_tensor_handles_characters(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters":
                TokenCharactersIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                                [1, 3, 0, 0, 0, 0, 0, 0],
                                                [1, 0, 0, 0, 0, 0, 0, 0],
                                                [3, 4, 5, 6, 4, 5, 7, 4],
                                                [1, 0, 0, 0, 0, 0, 0, 0]])
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].detach().cpu().numpy(),
            expected_character_array)

    def test_as_tensor_handles_characters_if_empty_field(self):
        field = TextField(
            [],
            token_indexers={
                "characters":
                TokenCharactersIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array([])
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].detach().cpu().numpy(),
            expected_character_array)

    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField(
            [Token(t) for t in ["a", "sentence", "."]],
            token_indexers={
                "words":
                SingleIdTokenIndexer("words"),
                "characters":
                TokenCharactersIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["words_length"] = 5
        padding_lengths["characters_length"] = 5
        padding_lengths["num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)

        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"].detach().cpu().numpy(),
            numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"].detach().cpu().numpy(),
            numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    def test_printing_doesnt_crash(self):
        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        print(field)

    def test_token_indexer_returns_dict(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict":
                DictReturningTokenIndexer(),
                "words":
                SingleIdTokenIndexer("words"),
                "characters":
                TokenCharactersIndexer("characters", min_padding_length=1)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            'token_ids_length': 5,
            'additional_key_length': 2,
            'words_length': 2,
            'characters_length': 2,
            'num_token_characters': 8,
            'num_tokens': 5,
        }
        padding_lengths['token_ids_length'] = 7
        padding_lengths['additional_key_length'] = 3
        padding_lengths['words_length'] = 4
        padding_lengths['characters_length'] = 4
        tensors = field.as_tensor(padding_lengths)
        assert list(tensors['token_ids'].shape) == [7]
        assert list(tensors['additional_key'].shape) == [3]
        assert list(tensors['words'].shape) == [4]
        assert list(tensors['characters'].shape) == [4, 8]

    def test_token_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict":
                DictReturningTokenIndexer(token_min_padding_length=3),
                "words":
                SingleIdTokenIndexer("words", token_min_padding_length=3),
                "characters":
                TokenCharactersIndexer("characters",
                                       min_padding_length=1,
                                       token_min_padding_length=3)
            })
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            'token_ids_length': 5,
            'additional_key_length': 3,
            'words_length': 3,
            'characters_length': 3,
            'num_token_characters': 8,
            'num_tokens': 5,
        }
        tensors = field.as_tensor(padding_lengths)
        assert tensors['additional_key'].tolist()[-1] == 0
        assert tensors['words'].tolist()[-1] == 0
        assert tensors['characters'].tolist()[-1] == [0] * 8

    def test_sequence_methods(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]], {})

        assert len(field) == 5
        assert field[1].text == "is"
        assert [token.text
                for token in field] == ["This", "is", "a", "sentence", "."]
class TestKnowledgeGraphField(SemparseTestCase):
    def setup_method(self):
        self.tokenizer = SpacyTokenizer(pos_tags=True)
        self.utterance = self.tokenizer.tokenize("where is mersin?")
        self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

        table_file = self.FIXTURES_ROOT / "data" / "wikitables" / "tables" / "341.tagged"
        self.graph = TableQuestionContext.read_from_file(
            table_file, self.utterance).get_table_knowledge_graph()
        self.vocab = Vocabulary()
        self.name_index = self.vocab.add_token_to_namespace("name",
                                                            namespace="tokens")
        self.in_index = self.vocab.add_token_to_namespace("in",
                                                          namespace="tokens")
        self.english_index = self.vocab.add_token_to_namespace(
            "english", namespace="tokens")
        self.location_index = self.vocab.add_token_to_namespace(
            "location", namespace="tokens")
        self.mersin_index = self.vocab.add_token_to_namespace(
            "mersin", namespace="tokens")

        self.oov_index = self.vocab.get_token_index("random OOV string",
                                                    namespace="tokens")
        self.edirne_index = self.oov_index
        self.field = KnowledgeGraphField(self.graph, self.utterance,
                                         self.token_indexers, self.tokenizer)

        super().setup_method()

    def test_count_vocab_items(self):
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        self.field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["tokens"] == {
            "name": 1,
            "in": 2,
            "english": 2,
            "location": 1,
            "mersin": 1,
        }

    def test_get_padding_lengths_raises_if_not_indexed(self):
        with pytest.raises(ConfigurationError):
            self.field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {
            "num_entities": 3,
            "num_utterance_tokens": 4,
            "num_fields": 3,
            "list_tokens___tokens": 3,
        }
        self.field._token_indexers[
            "token_characters"] = TokenCharactersIndexer(min_padding_length=1)
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {
            "num_entities": 3,
            "num_utterance_tokens": 4,
            "num_fields": 3,
            "list_tokens___tokens": 3,
            "list_token_characters___token_characters": 3,
            "list_token_characters___num_token_characters": 8,
        }

    def test_as_tensor_produces_correct_output(self):
        self.field.index(self.vocab)
        padding_lengths = self.field.get_padding_lengths()
        padding_lengths["num_utterance_tokens"] += 1
        padding_lengths["num_entities"] += 1
        padding_lengths["num_fields"] += 1
        tensor_dict = self.field.as_tensor(padding_lengths)
        assert tensor_dict.keys() == {"text", "linking"}
        expected_text_tensor = [
            [self.mersin_index, 0, 0],
            [self.location_index, self.in_index, self.english_index],
            [self.name_index, self.in_index, self.english_index],
            [0, 0, 0],
        ]
        assert_almost_equal(
            tensor_dict["text"]["tokens"]["tokens"].detach().cpu().numpy(),
            expected_text_tensor)

        linking_tensor = tensor_dict["linking"].detach().cpu().numpy()
        expected_linking_tensor = [
            [
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # string:mersin, "where"
                [0, 0, 0, 0, 0, -1.5, 0, 0, 0, 0],  # string:mersin, "is"
                [0, 1, 1, 1, 1, 1, 0, 0, 1, 1],  # string:mersin, "mersin"
                [0, 0, 0, 0, 0, -5, 0, 0, 0, 0],  # string:mersin, "?"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            ],  # string:mersin, padding
            [
                [0, 0, 0, 0, 0, -2.6, 0, 0, 0,
                 0],  # string_column:name_in_english, "where"
                [0, 0, 0, 0, 0, -7.5, 0, 0, 0,
                 0],  # string_column:name_in_english, "is"
                [0, 0, 0, 0, 0, -1.8333, 1, 1, 0,
                 0],  # string_column:name_in_english, "mersin"
                [0, 0, 0, 0, 0, -18, 0, 0, 0,
                 0],  # string_column:name_in_english, "?"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            ],  # string_column:name_in_english, padding
            [
                [0, 0, 0, 0, 0, -1.6, 0, 0, 0,
                 0],  # string_column:location_in_english, "where"
                [0, 0, 0, 0, 0, -5.5, 0, 0, 0,
                 0],  # string_column:location_in_english, "is"
                [0, 0, 0, 0, 0, -1, 0, 0, 0,
                 0],  # string_column:location_in_english, "mersin"
                [0, 0, 0, 0, 0, -14, 0, 0, 0,
                 0],  # string_column:location_in_english, "?"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            ],  # string_column:location_in_english, padding
            [
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "where"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "is"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "mersin"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "?"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            ],
        ]  # padding, padding
        for entity_index, entity_features in enumerate(
                expected_linking_tensor):
            for question_index, feature_vector in enumerate(entity_features):
                assert_almost_equal(
                    linking_tensor[entity_index, question_index],
                    feature_vector,
                    decimal=4,
                    err_msg=f"{entity_index} {question_index}",
                )
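        # Each feature vector has ten slots, one per default linking feature
        # extractor (exact and lemma token matches, edit distance,
        # related-column matches, and span-overlap fractions); slot 5 holds
        # the edit-distance feature, the only one that goes negative here.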

    def test_lemma_feature_extractor(self):
        utterance = self.tokenizer.tokenize("Names in English")
        field = KnowledgeGraphField(self.graph, self.utterance,
                                    self.token_indexers, self.tokenizer)
        entity = "string_column:name_in_english"
        lemma_feature = field._contains_lemma_match(
            entity, field._entity_text_map[entity], utterance[0], 0, utterance)
        assert lemma_feature == 1

    def test_span_overlap_fraction(self):
        utterance = self.tokenizer.tokenize(
            "what is the name in english of mersin?")
        field = KnowledgeGraphField(self.graph, self.utterance,
                                    self.token_indexers, self.tokenizer)
        entity = "string_column:name_in_english"
        entity_text = field._entity_text_map[entity]
        feature_values = [
            field._span_overlap_fraction(entity, entity_text, token, i,
                                         utterance)
            for i, token in enumerate(utterance)
        ]
        assert feature_values == [0, 0, 0, 1, 1, 1, 0, 0, 0]
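        # The utterance tokenizes to ["what", "is", "the", "name", "in",
        # "english", "of", "mersin", "?"], so only the contiguous span
        # "name in english" at positions 3-5 gets a non-zero overlap fraction.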

    def test_batch_tensors(self):
        self.field.index(self.vocab)
        padding_lengths = self.field.get_padding_lengths()
        tensor_dict1 = self.field.as_tensor(padding_lengths)
        tensor_dict2 = self.field.as_tensor(padding_lengths)
        batched_tensor_dict = self.field.batch_tensors(
            [tensor_dict1, tensor_dict2])
        assert batched_tensor_dict.keys() == {"text", "linking"}
        expected_single_tensor = [
            [self.mersin_index, 0, 0],
            [self.location_index, self.in_index, self.english_index],
            [self.name_index, self.in_index, self.english_index],
        ]
        expected_batched_tensor = [
            expected_single_tensor, expected_single_tensor
        ]
        assert_almost_equal(
            batched_tensor_dict["text"]["tokens"]
            ["tokens"].detach().cpu().numpy(),
            expected_batched_tensor,
        )
        expected_linking_tensor = torch.stack(
            [tensor_dict1["linking"], tensor_dict2["linking"]])
        assert_almost_equal(
            batched_tensor_dict["linking"].detach().cpu().numpy(),
            expected_linking_tensor.detach().cpu().numpy(),
        )

    def test_field_initialized_with_empty_constructor(self):
        try:
            self.field.empty_field()
        except AssertionError as e:
            pytest.fail(str(e), pytrace=True)

class TestProductionRuleField(AllenNlpTestCase):
    def setUp(self):
        super(TestProductionRuleField, self).setUp()
        self.vocab = Vocabulary()
        self.s_rule_index = self.vocab.add_token_to_namespace(
            "S -> [NP, VP]", namespace='rule_labels')
        self.np_index = self.vocab.add_token_to_namespace(
            "NP -> test", namespace='rule_labels')

    def test_field_counts_vocab_items_correctly(self):
        field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["rule_labels"]["S -> [NP, VP]"] == 1

        field = ProductionRuleField('S -> [NP, VP]', is_global_rule=False)
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["rule_labels"]["S -> [NP, VP]"] == 0

    def test_index_converts_field_correctly(self):
        field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
        field.index(self.vocab)
        assert field._rule_id == self.s_rule_index

    def test_padding_lengths_are_computed_correctly(self):
        field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
        field.index(self.vocab)
        assert field.get_padding_lengths() == {}

    def test_as_tensor_produces_correct_output(self):
        field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
        field.index(self.vocab)
        tensor_tuple = field.as_tensor(field.get_padding_lengths())
        assert isinstance(tensor_tuple, tuple)
        assert len(tensor_tuple) == 3
        assert tensor_tuple[0] == 'S -> [NP, VP]'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(),
                            [self.s_rule_index])

        field = ProductionRuleField('S -> [NP, VP]', is_global_rule=False)
        field.index(self.vocab)
        tensor_tuple = field.as_tensor(field.get_padding_lengths())
        assert isinstance(tensor_tuple, tuple)
        assert len(tensor_tuple) == 3
        assert tensor_tuple[0] == 'S -> [NP, VP]'
        assert tensor_tuple[1] is False
        assert tensor_tuple[2] is None
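        # So as_tensor yields a (rule_string, is_global_rule, rule_id) triple,
        # and rules outside the global grammar carry no rule id tensor.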

    def test_batch_tensors_does_not_modify_list(self):
        field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict1 = field.as_tensor(padding_lengths)

        field = ProductionRuleField('NP -> test', is_global_rule=True)
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict2 = field.as_tensor(padding_lengths)
        tensor_list = [tensor_dict1, tensor_dict2]
        assert field.batch_tensors(tensor_list) == tensor_list

    def test_doubly_nested_field_works(self):
        field1 = ProductionRuleField('S -> [NP, VP]', is_global_rule=True)
        field2 = ProductionRuleField('NP -> test', is_global_rule=True)
        field3 = ProductionRuleField('VP -> eat', is_global_rule=False)
        list_field = ListField(
            [ListField([field1, field2, field3]),
             ListField([field1, field2])])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensors = list_field.as_tensor(padding_lengths)
        assert isinstance(tensors, list)
        assert len(tensors) == 2
        assert isinstance(tensors[0], list)
        assert len(tensors[0]) == 3
        assert isinstance(tensors[1], list)
        assert len(tensors[1]) == 3

        tensor_tuple = tensors[0][0]
        assert tensor_tuple[0] == 'S -> [NP, VP]'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(),
                            [self.s_rule_index])

        tensor_tuple = tensors[0][1]
        assert tensor_tuple[0] == 'NP -> test'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(),
                            [self.np_index])

        tensor_tuple = tensors[0][2]
        assert tensor_tuple[0] == 'VP -> eat'
        assert tensor_tuple[1] is False
        assert tensor_tuple[2] is None

        tensor_tuple = tensors[1][0]
        assert tensor_tuple[0] == 'S -> [NP, VP]'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(),
                            [self.s_rule_index])

        tensor_tuple = tensors[1][1]
        assert tensor_tuple[0] == 'NP -> test'
        assert tensor_tuple[1] is True
        assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(),
                            [self.np_index])

        # This item was just padding.
        tensor_tuple = tensors[1][2]
        assert tensor_tuple[0] == ''
        assert tensor_tuple[1] is False
        assert tensor_tuple[2] is None
Example #52
class TestDataset(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this")
        self.vocab.add_token_to_namespace("is")
        self.vocab.add_token_to_namespace("a")
        self.vocab.add_token_to_namespace("sentence")
        self.vocab.add_token_to_namespace(".")
        self.token_indexer = {"tokens": SingleIdTokenIndexer()}
        self.instances = self.get_instances()
        super().setUp()

    def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({"tag": LabelField(1, skip_indexing=True)})
        instance2 = Instance({"words": TextField([Token("hello")], {})})
        with pytest.raises(ConfigurationError):
            _ = Batch([instance1, instance2])

    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {
            "text1": {
                "num_tokens": 5,
                "tokens_length": 5
            },
            "text2": {
                "num_tokens": 6,
                "tokens_length": 6
            }
        }

    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
        text2 = tensors["text2"]["tokens"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(
            text1, numpy.array([[2, 3, 4, 5, 6], [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(
            text2, numpy.array([[2, 3, 4, 1, 5, 6], [2, 3, 1, 0, 0, 0]]))
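        # Ids 0 and 1 are reserved for padding and @@UNKNOWN@@, so the added
        # words map to 2-6 and the OOV tokens "here", "different", and "short"
        # all index to 1.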

    def get_instances(self):
        field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence", "."]],
            self.token_indexer)
        field2 = TextField([
            Token(t)
            for t in ["this", "is", "a", "different", "sentence", "."]
        ], self.token_indexer)
        field3 = TextField(
            [Token(t) for t in ["here", "is", "a", "sentence", "."]],
            self.token_indexer)
        field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                           self.token_indexer)
        instances = [
            Instance({
                "text1": field1,
                "text2": field2
            }),
            Instance({
                "text1": field3,
                "text2": field4
            })
        ]
        return instances
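
For reference, a minimal sketch of the same Batch flow outside the test
harness; the import paths follow the 0.x-era AllenNLP layout used in this
example and may differ in later releases:

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

vocab = Vocabulary()
for word in ["this", "is", "a", "sentence", "."]:
    vocab.add_token_to_namespace(word)

field = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                  {"tokens": SingleIdTokenIndexer()})
batch = Batch([Instance({"text": field})])
batch.index_instances(vocab)
# Produces {"text": {"tokens": tensor of shape (1, 5)}} holding ids 2..6.
tensors = batch.as_tensor_dict(batch.get_padding_lengths())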

class KnowledgeGraphFieldTest(AllenNlpTestCase):
    def setUp(self):
        self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
        self.utterance = self.tokenizer.tokenize("where is mersin?")
        self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

        json = {
            'question': self.utterance,
            'columns': ['Name in English', 'Location in English'],
            'cells': [['Paradeniz', 'Mersin'], ['Lake Gala', 'Edirne']]
        }
        self.graph = TableQuestionKnowledgeGraph.read_from_json(json)
        self.vocab = Vocabulary()
        self.name_index = self.vocab.add_token_to_namespace("name",
                                                            namespace='tokens')
        self.in_index = self.vocab.add_token_to_namespace("in",
                                                          namespace='tokens')
        self.english_index = self.vocab.add_token_to_namespace(
            "english", namespace='tokens')
        self.location_index = self.vocab.add_token_to_namespace(
            "location", namespace='tokens')
        self.paradeniz_index = self.vocab.add_token_to_namespace(
            "paradeniz", namespace='tokens')
        self.mersin_index = self.vocab.add_token_to_namespace(
            "mersin", namespace='tokens')
        self.lake_index = self.vocab.add_token_to_namespace("lake",
                                                            namespace='tokens')
        self.gala_index = self.vocab.add_token_to_namespace("gala",
                                                            namespace='tokens')
        self.negative_one_index = self.vocab.add_token_to_namespace(
            "-1", namespace='tokens')
        self.zero_index = self.vocab.add_token_to_namespace("0",
                                                            namespace='tokens')
        self.one_index = self.vocab.add_token_to_namespace("1",
                                                           namespace='tokens')

        self.oov_index = self.vocab.get_token_index('random OOV string',
                                                    namespace='tokens')
        self.edirne_index = self.oov_index
        self.field = KnowledgeGraphField(self.graph, self.utterance,
                                         self.token_indexers, self.tokenizer)

        super(KnowledgeGraphFieldTest, self).setUp()

    def test_count_vocab_items(self):
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        self.field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["tokens"] == {
            '-1': 1,
            '0': 1,
            '1': 1,
            'name': 1,
            'in': 2,
            'english': 2,
            'location': 1,
            'paradeniz': 1,
            'mersin': 1,
            'lake': 1,
            'gala': 1,
            'edirne': 1,
        }

    def test_index_converts_field_correctly(self):
        # pylint: disable=protected-access
        self.field.index(self.vocab)
        assert self.field._indexed_entity_texts.keys() == {'tokens'}
        # Note that these are sorted by their _identifiers_, not their cell text, so the
        # `fb:row.row` entities show up after the `fb:cell` entities.
        expected_array = [[self.negative_one_index], [self.zero_index],
                          [self.one_index], [self.edirne_index],
                          [self.lake_index, self.gala_index],
                          [self.mersin_index], [self.paradeniz_index],
                          [
                              self.location_index, self.in_index,
                              self.english_index
                          ],
                          [self.name_index, self.in_index, self.english_index]]
        assert self.field._indexed_entity_texts['tokens'] == expected_array

    def test_get_padding_lengths_raises_if_not_indexed(self):
        with pytest.raises(AssertionError):
            self.field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        # pylint: disable=protected-access
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {
            'num_entities': 9,
            'num_entity_tokens': 3,
            'num_utterance_tokens': 4
        }
        self.field._token_indexers[
            'token_characters'] = TokenCharactersIndexer()
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {
            'num_entities': 9,
            'num_entity_tokens': 3,
            'num_utterance_tokens': 4,
            'num_token_characters': 9
        }

    def test_as_tensor_produces_correct_output(self):
        self.field.index(self.vocab)
        padding_lengths = self.field.get_padding_lengths()
        padding_lengths['num_utterance_tokens'] += 1
        padding_lengths['num_entities'] += 1
        tensor_dict = self.field.as_tensor(padding_lengths)
        assert tensor_dict.keys() == {'text', 'linking'}
        expected_text_tensor = [
            [self.negative_one_index, 0, 0], [self.zero_index, 0, 0],
            [self.one_index, 0, 0], [self.edirne_index, 0, 0],
            [self.lake_index, self.gala_index, 0], [self.mersin_index, 0, 0],
            [self.paradeniz_index, 0, 0],
            [self.location_index, self.in_index, self.english_index],
            [self.name_index, self.in_index, self.english_index], [0, 0, 0]
        ]
        assert_almost_equal(
            tensor_dict['text']['tokens'].detach().cpu().numpy(),
            expected_text_tensor)

        linking_tensor = tensor_dict['linking'].detach().cpu().numpy()
        expected_linking_tensor = [
            [
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # -1, "where"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # -1, "is"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # -1, "mersin"
                [0, 0, 0, 0, 0, -1, 0, 0, 0, 0]
            ],  # -1, "?"
            [
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0, "where"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0, "is"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 0, "mersin"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            ],  # 0, "?"
            [
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 1, "where"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 1, "is"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # 1, "mersin"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            ],  # 1, "?"
            [
                [0, 0, 0, 0, 0, .2, 0, 0, 0, 0],  # fb:cell.edirne, "where"
                [0, 0, 0, 0, 0, -1.5, 0, 0, 0, 0],  # fb:cell.edirne, "is"
                [0, 0, 0, 0, 0, .1666, 0, 0, 0, 0],  # fb:cell.edirne, "mersin"
                [0, 0, 0, 0, 0, -5, 0, 0, 0, 0],  # fb:cell.edirne, "?"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            ],  # fb:cell.edirne, padding
            [
                [0, 0, 0, 0, 0, -.6, 0, 0, 0, 0],  # fb:cell.lake_gala, "where"
                [0, 0, 0, 0, 0, -3.5, 0, 0, 0, 0],  # fb:cell.lake_gala, "is"
                [0, 0, 0, 0, 0, -.3333, 0, 0, 0,
                 0],  # fb:cell.lake_gala, "mersin"
                [0, 0, 0, 0, 0, -8, 0, 0, 0, 0],  # fb:cell.lake_gala, "?"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            ],  # fb:cell.lake_gala, padding
            [
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # fb:cell.mersin, "where"
                [0, 0, 0, 0, 0, -1.5, 0, 0, 0, 0],  # fb:cell.mersin, "is"
                [0, 1, 1, 1, 1, 1, 0, 0, 1, 1],  # fb:cell.mersin, "mersin"
                [0, 0, 0, 0, 0, -5, 0, 0, 0, 0],  # fb:cell.mersin, "?"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            ],  # fb:cell.mersin, padding
            [
                [0, 0, 0, 0, 0, -.6, 0, 0, 0, 0],  # fb:cell.paradeniz, "where"
                [0, 0, 0, 0, 0, -3, 0, 0, 0, 0],  # fb:cell.paradeniz, "is"
                [0, 0, 0, 0, 0, -.1666, 0, 0, 0,
                 0],  # fb:cell.paradeniz, "mersin"
                [0, 0, 0, 0, 0, -8, 0, 0, 0, 0],  # fb:cell.paradeniz, "?"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            ],  # fb:cell.paradeniz, padding
            [
                [0, 0, 0, 0, 0, -2.6, 0, 0, 0,
                 0],  # fb:row.row.name_in_english, "where"
                [0, 0, 0, 0, 0, -7.5, 0, 0, 0,
                 0],  # fb:row.row.name_in_english, "is"
                [0, 0, 0, 0, 0, -1.8333, 1, 1, 0,
                 0],  # fb:row.row.name_in_english, "mersin"
                [0, 0, 0, 0, 0, -18, 0, 0, 0,
                 0],  # fb:row.row.name_in_english, "?"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            ],  # fb:row.row.name_in_english, padding
            [
                [0, 0, 0, 0, 0, -1.6, 0, 0, 0,
                 0],  # fb:row.row.location_in_english, "where"
                [0, 0, 0, 0, 0, -5.5, 0, 0, 0,
                 0],  # fb:row.row.location_in_english, "is"
                [0, 0, 0, 0, 0, -1, 0, 0, 0,
                 0],  # fb:row.row.location_in_english, "mersin"
                [0, 0, 0, 0, 0, -14, 0, 0, 0,
                 0],  # fb:row.row.location_in_english, "?"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            ],  # fb:row.row.location_in_english, padding
            [
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "where"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "is"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "mersin"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # padding, "?"
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            ]
        ]  # padding, padding
        for entity_index, entity_features in enumerate(
                expected_linking_tensor):
            for question_index, feature_vector in enumerate(entity_features):
                assert_almost_equal(linking_tensor[entity_index,
                                                   question_index],
                                    feature_vector,
                                    decimal=4,
                                    err_msg=f"{entity_index} {question_index}")

    def test_lemma_feature_extractor(self):
        # pylint: disable=protected-access
        utterance = self.tokenizer.tokenize("Names in English")
        field = KnowledgeGraphField(self.graph, self.utterance,
                                    self.token_indexers, self.tokenizer)
        entity = 'fb:row.row.name_in_english'
        lemma_feature = field._contains_lemma_match(
            entity, field._entity_text_map[entity], utterance[0], 0, utterance)
        assert lemma_feature == 1

    def test_span_overlap_fraction(self):
        # pylint: disable=protected-access
        utterance = self.tokenizer.tokenize(
            "what is the name in english of mersin?")
        field = KnowledgeGraphField(self.graph, self.utterance,
                                    self.token_indexers, self.tokenizer)
        entity = 'fb:row.row.name_in_english'
        entity_text = field._entity_text_map[entity]
        feature_values = [
            field._span_overlap_fraction(entity, entity_text, token, i,
                                         utterance)
            for i, token in enumerate(utterance)
        ]
        assert feature_values == [0, 0, 0, 1, 1, 1, 0, 0, 0]

    def test_batch_tensors(self):
        self.field.index(self.vocab)
        padding_lengths = self.field.get_padding_lengths()
        tensor_dict1 = self.field.as_tensor(padding_lengths)
        tensor_dict2 = self.field.as_tensor(padding_lengths)
        batched_tensor_dict = self.field.batch_tensors(
            [tensor_dict1, tensor_dict2])
        assert batched_tensor_dict.keys() == {'text', 'linking'}
        expected_single_tensor = [
            [self.negative_one_index, 0, 0], [self.zero_index, 0, 0],
            [self.one_index, 0, 0], [self.edirne_index, 0, 0],
            [self.lake_index, self.gala_index, 0], [self.mersin_index, 0, 0],
            [self.paradeniz_index, 0, 0],
            [self.location_index, self.in_index, self.english_index],
            [self.name_index, self.in_index, self.english_index]
        ]
        expected_batched_tensor = [
            expected_single_tensor, expected_single_tensor
        ]
        assert_almost_equal(
            batched_tensor_dict['text']['tokens'].detach().cpu().numpy(),
            expected_batched_tensor)
        expected_linking_tensor = torch.stack(
            [tensor_dict1['linking'], tensor_dict2['linking']])
        assert_almost_equal(
            batched_tensor_dict['linking'].detach().cpu().numpy(),
            expected_linking_tensor.detach().cpu().numpy())

class TestBasicTextFieldEmbedder(AllenNlpTestCase):
    def setUp(self):
        super(TestBasicTextFieldEmbedder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1")
        self.vocab.add_token_to_namespace("2")
        self.vocab.add_token_to_namespace("3")
        self.vocab.add_token_to_namespace("4")
        params = Params({
            "words1": {
                "type": "embedding",
                "embedding_dim": 2
            },
            "words2": {
                "type": "embedding",
                "embedding_dim": 5
            },
            "words3": {
                "type": "embedding",
                "embedding_dim": 3
            }
        })
        self.token_embedder = BasicTextFieldEmbedder.from_params(
            self.vocab, params)
        self.inputs = {
            "words1": torch.LongTensor([[0, 2, 3, 5]]),
            "words2": torch.LongTensor([[1, 4, 3, 2]]),
            "words3": torch.LongTensor([[1, 5, 1, 2]])
        }

    def test_get_output_dim_aggregates_dimension_from_each_embedding(self):
        assert self.token_embedder.get_output_dim() == 10
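        # The output dim is just the sum of the three embedding dims: 2 + 5 + 3.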

    def test_forward_asserts_input_field_match(self):
        self.inputs['words4'] = self.inputs['words3']
        del self.inputs['words3']
        with pytest.raises(ConfigurationError):
            self.token_embedder(self.inputs)
        self.inputs['words3'] = self.inputs['words4']
        del self.inputs['words4']

    def test_forward_concats_resultant_embeddings(self):
        assert self.token_embedder(self.inputs).size() == (1, 4, 10)

    def test_forward_works_on_higher_order_input(self):
        params = Params({
            "words": {
                "type": "embedding",
                "num_embeddings": 20,
                "embedding_dim": 2,
            },
            "characters": {
                "type": "character_encoding",
                "embedding": {
                    "embedding_dim": 4,
                    "num_embeddings": 15,
                },
                "encoder": {
                    "type": "cnn",
                    "embedding_dim": 4,
                    "num_filters": 10,
                    "ngram_filter_sizes": [3],
                },
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
            'words': (torch.rand(3, 4, 5, 6) * 20).long(),
            'characters': (torch.rand(3, 4, 5, 6, 7) * 15).long(),
        }
        assert token_embedder(inputs,
                              num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)
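        # num_wrapping_dims=2 tells the embedder that two extra list
        # dimensions precede the token dimension; the final size of 12 is the
        # word embedding dim (2) plus the CNN encoder's num_filters (10).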

    def test_forward_runs_with_non_bijective_mapping(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
        options_file = str(elmo_fixtures_path / 'options.json')
        weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
        params = Params({
            "words": {
                "type": "embedding",
                "num_embeddings": 20,
                "embedding_dim": 2,
            },
            "elmo": {
                "type": "elmo_token_embedder",
                "options_file": options_file,
                "weight_file": weight_file
            },
            "embedder_to_indexer_map": {
                "words": ["words"],
                "elmo": ["elmo", "words"]
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
            'words': (torch.rand(3, 6) * 20).long(),
            'elmo': (torch.rand(3, 6, 50) * 15).long(),
        }
        token_embedder(inputs)
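        # "embedder_to_indexer_map" makes the non-bijective wiring explicit:
        # the "elmo" embedder consumes both the "elmo" character ids and the
        # "words" ids, while the "words" embedder sees only "words".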
Example #55
class TestListField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", 'words')
        self.vocab.add_token_to_namespace("s", 'characters')
        self.vocab.add_token_to_namespace("e", 'characters')
        self.vocab.add_token_to_namespace("n", 'characters')
        self.vocab.add_token_to_namespace("t", 'characters')
        self.vocab.add_token_to_namespace("c", 'characters')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexer = {
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters")
        }
        self.field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence"]],
            self.word_indexer)
        self.field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence"]],
            self.word_indexer)
        self.field3 = TextField(
            [Token(t) for t in ["this", "is", "another", "sentence"]],
            self.word_indexer)
        super(TestListField, self).setUp()

    def test_get_padding_lengths(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        lengths = list_field.get_padding_lengths()
        assert lengths == {"num_fields": 3, "num_tokens": 5}

    def test_all_fields_padded_to_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        array_dict = list_field.as_array(list_field.get_padding_lengths())
        numpy.testing.assert_array_almost_equal(array_dict["words"][0],
                                                numpy.array([2, 3, 4, 5, 0]))
        numpy.testing.assert_array_almost_equal(array_dict["words"][1],
                                                numpy.array([2, 3, 4, 1, 5]))
        numpy.testing.assert_array_almost_equal(array_dict["words"][2],
                                                numpy.array([2, 3, 1, 5, 0]))

    def test_fields_can_pad_to_greater_than_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        padding_lengths["num_tokens"] = 7
        padding_lengths["num_fields"] = 5
        array_dict = list_field.as_array(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            array_dict["words"][0], numpy.array([2, 3, 4, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            array_dict["words"][1], numpy.array([2, 3, 4, 1, 5, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            array_dict["words"][2], numpy.array([2, 3, 1, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            array_dict["words"][3], numpy.array([0, 0, 0, 0, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(
            array_dict["words"][4], numpy.array([0, 0, 0, 0, 0, 0, 0]))

    def test_as_array_can_handle_multiple_token_indexers(self):
        # pylint: disable=protected-access
        self.field1._token_indexers = self.words_and_characters_indexer
        self.field2._token_indexers = self.words_and_characters_indexer
        self.field3._token_indexers = self.words_and_characters_indexer

        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        array_dict = list_field.as_array(padding_lengths)
        words = array_dict["words"]
        characters = array_dict["characters"]
        numpy.testing.assert_array_almost_equal(
            words,
            numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [2, 3, 1, 5, 0]]))

        numpy.testing.assert_array_almost_equal(
            characters[0],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

        numpy.testing.assert_array_almost_equal(
            characters[1],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 0, 0, 0, 0, 0, 0, 0, 0],
                         [1, 1, 1, 1, 3, 1, 3, 4, 5],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0]]))

        numpy.testing.assert_array_almost_equal(
            characters[2],
            numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                         [1, 2, 0, 0, 0, 0, 0, 0, 0],
                         [1, 4, 1, 5, 1, 3, 1, 0, 0],
                         [2, 3, 4, 5, 3, 4, 6, 3, 0],
                         [0, 0, 0, 0, 0, 0, 0, 0, 0]]))
Example #56
    def test_min_padding_length(self):
        sentence = "AllenNLP is awesome ."
        tokens = [Token(token) for token in sentence.split(" ")]
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace="characters")  # 2
        vocab.add_token_to_namespace("l", namespace="characters")  # 3
        vocab.add_token_to_namespace("e", namespace="characters")  # 4
        vocab.add_token_to_namespace("n", namespace="characters")  # 5
        vocab.add_token_to_namespace("N", namespace="characters")  # 6
        vocab.add_token_to_namespace("L", namespace="characters")  # 7
        vocab.add_token_to_namespace("P", namespace="characters")  # 8
        vocab.add_token_to_namespace("i", namespace="characters")  # 9
        vocab.add_token_to_namespace("s", namespace="characters")  # 10
        vocab.add_token_to_namespace("a", namespace="characters")  # 11
        vocab.add_token_to_namespace("w", namespace="characters")  # 12
        vocab.add_token_to_namespace("o", namespace="characters")  # 13
        vocab.add_token_to_namespace("m", namespace="characters")  # 14
        vocab.add_token_to_namespace(".", namespace="characters")  # 15

        indexer = TokenCharactersIndexer("characters", min_padding_length=10)
        indices = indexer.tokens_to_indices(tokens, vocab)
        padded = indexer.as_padded_tensor_dict(
            indices, indexer.get_padding_lengths(indices))
        assert padded["token_characters"].tolist() == [
            [2, 3, 3, 4, 5, 6, 7, 8, 0, 0],
            [9, 10, 0, 0, 0, 0, 0, 0, 0, 0],
            [11, 12, 4, 10, 13, 14, 4, 0, 0, 0],
            [15, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        ]
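        # The longest token, "AllenNLP", has only 8 characters, so without
        # min_padding_length the rows would be padded to width 8;
        # min_padding_length=10 raises that to max(8, 10) = 10.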
Example #57
class TestListField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", 'words')
        self.vocab.add_token_to_namespace("s", 'characters')
        self.vocab.add_token_to_namespace("e", 'characters')
        self.vocab.add_token_to_namespace("n", 'characters')
        self.vocab.add_token_to_namespace("t", 'characters')
        self.vocab.add_token_to_namespace("c", 'characters')
        for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']:
            self.vocab.add_token_to_namespace(label, 'labels')

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {"words": SingleIdTokenIndexer("words"),
                                              "characters": TokenCharactersIndexer("characters")}
        self.field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence"]],
                                self.word_indexer)
        self.field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence"]],
                                self.word_indexer)
        self.field3 = TextField([Token(t) for t in ["this", "is", "another", "sentence"]],
                                self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        super(TestListField, self).setUp()

    def test_get_padding_lengths(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        lengths = list_field.get_padding_lengths()
        assert lengths == {"num_fields": 3, "list_words_length": 5, "list_num_tokens": 5}

    def test_list_field_can_handle_empty_text_fields(self):
        list_field = ListField([self.field1, self.field2, self.empty_text_field])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor_dict["words"].detach().cpu().numpy(),
                                         numpy.array([[2, 3, 4, 5, 0],
                                                      [2, 3, 4, 1, 5],
                                                      [0, 0, 0, 0, 0]]))

    def test_list_field_can_handle_empty_index_fields(self):
        list_field = ListField([self.index_field, self.index_field, self.empty_index_field])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(), numpy.array([[1], [1], [-1]]))
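        # The empty IndexField serializes to -1, so padded entries stay
        # distinguishable from a real index of 0.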

    def test_list_field_can_handle_empty_sequence_label_fields(self):
        list_field = ListField([self.sequence_label_field,
                                self.sequence_label_field,
                                self.empty_sequence_label_field])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(),
                                         numpy.array([[1, 1, 0, 1],
                                                      [1, 1, 0, 1],
                                                      [0, 0, 0, 0]]))

    def test_all_fields_padded_to_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][0].detach().cpu().numpy(),
                                                numpy.array([2, 3, 4, 5, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][1].detach().cpu().numpy(),
                                                numpy.array([2, 3, 4, 1, 5]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][2].detach().cpu().numpy(),
                                                numpy.array([2, 3, 1, 5, 0]))

    def test_nested_list_fields_are_padded_correctly(self):
        nested_field1 = ListField([LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']])
        nested_field2 = ListField([LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']])
        list_field = ListField([nested_field1.empty_field(), nested_field1, nested_field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6}
        tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy()
        numpy.testing.assert_almost_equal(tensor, [[-1, -1, -1, -1, -1, -1],
                                                   [0, 1, 2, 3, 4, -1],
                                                   [5, 6, 7, 8, 9, 10]])
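        # Padded label slots likewise use -1, keeping them distinct from the
        # real label ids 0-10.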

    def test_fields_can_pad_to_greater_than_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        padding_lengths["list_words_length"] = 7
        padding_lengths["num_fields"] = 5
        tensor_dict = list_field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][0].detach().cpu().numpy(),
                                                numpy.array([2, 3, 4, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][1].detach().cpu().numpy(),
                                                numpy.array([2, 3, 4, 1, 5, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][2].detach().cpu().numpy(),
                                                numpy.array([2, 3, 1, 5, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][3].detach().cpu().numpy(),
                                                numpy.array([0, 0, 0, 0, 0, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["words"][4].detach().cpu().numpy(),
                                                numpy.array([0, 0, 0, 0, 0, 0, 0]))

    def test_as_tensor_can_handle_multiple_token_indexers(self):
        # pylint: disable=protected-access
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"].detach().cpu().numpy()
        characters = tensor_dict["characters"].detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(words, numpy.array([[2, 3, 4, 5, 0],
                                                                    [2, 3, 4, 1, 5],
                                                                    [2, 3, 1, 5, 0]]))

        numpy.testing.assert_array_almost_equal(characters[0], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0],
                                                                            [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

        numpy.testing.assert_array_almost_equal(characters[1], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 1, 1, 1, 3, 1, 3, 4, 5],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0]]))

        numpy.testing.assert_array_almost_equal(characters[2], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 4, 1, 5, 1, 3, 1, 0, 0],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0],
                                                                            [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields(self):
        # pylint: disable=protected-access
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField([self.field1.empty_field(), self.field1, self.field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"].detach().cpu().numpy()
        characters = tensor_dict["characters"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(words, numpy.array([[0, 0, 0, 0, 0],
                                                                    [2, 3, 4, 5, 0],
                                                                    [2, 3, 4, 1, 5]]))

        numpy.testing.assert_array_almost_equal(characters[0], numpy.zeros([5, 9]))

        numpy.testing.assert_array_almost_equal(characters[1], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0],
                                                                            [0, 0, 0, 0, 0, 0, 0, 0, 0]]))

        numpy.testing.assert_array_almost_equal(characters[2], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                                                                            [1, 2, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 0, 0, 0, 0, 0, 0, 0, 0],
                                                                            [1, 1, 1, 1, 3, 1, 3, 4, 5],
                                                                            [2, 3, 4, 5, 3, 4, 6, 3, 0]]))

    def test_printing_doesnt_crash(self):
        list_field = ListField([self.field1, self.field2])
        print(list_field)

    def test_sequence_methods(self):
        list_field = ListField([self.field1, self.field2, self.field3])

        assert len(list_field) == 3
        assert list_field[1] == self.field2
        assert [f for f in list_field] == [self.field1, self.field2, self.field3]
Example #58
class SamplerTest(AllenNlpTestCase):
    def setup_method(self):
        super().setup_method()
        self.token_indexers = {"tokens": SingleIdTokenIndexer()}
        self.vocab = Vocabulary()
        self.this_index = self.vocab.add_token_to_namespace("this")
        self.is_index = self.vocab.add_token_to_namespace("is")
        self.a_index = self.vocab.add_token_to_namespace("a")
        self.sentence_index = self.vocab.add_token_to_namespace("sentence")
        self.another_index = self.vocab.add_token_to_namespace("another")
        self.yet_index = self.vocab.add_token_to_namespace("yet")
        self.very_index = self.vocab.add_token_to_namespace("very")
        self.long_index = self.vocab.add_token_to_namespace("long")
        instances = [
            self.create_instance(["this", "is", "a", "sentence"]),
            self.create_instance(["this", "is", "another", "sentence"]),
            self.create_instance(["yet", "another", "sentence"]),
            self.create_instance([
                "this", "is", "a", "very", "very", "very", "very", "long",
                "sentence"
            ]),
            self.create_instance(["sentence"]),
        ]

        self.instances = instances
        self.lazy_instances = LazyIterable(instances)

    def get_mock_reader(self) -> DatasetReader:
        class MockReader(DatasetReader):
            def __init__(self, instances, **kwargs):
                super().__init__(**kwargs)
                self.instances = instances

            def _read(self, file_path: str):
                for instance in self.instances:
                    yield instance

        return MockReader(self.instances)

    def create_instance(self, str_tokens: List[str]):
        tokens = [Token(t) for t in str_tokens]
        instance = Instance({"text": TextField(tokens, self.token_indexers)})
        instance.index_fields(self.vocab)
        return instance

    def create_instances_from_token_counts(
            self, token_counts: List[int]) -> List[Instance]:
        return [
            self.create_instance(["word"] * count) for count in token_counts
        ]

    def get_batches_stats(
            self,
            batches: Iterable[Batch]) -> Dict[str, Union[int, List[int]]]:
        grouped_instances = [batch.instances for batch in batches]
        group_lengths = [len(group) for group in grouped_instances]

        sample_sizes = []
        for batch in batches:
            batch_sequence_length = max(
                instance.get_padding_lengths()["text"]["tokens___tokens"]
                for instance in batch.instances)
            sample_sizes.append(batch_sequence_length * len(batch.instances))

        return {
            "batch_lengths": group_lengths,
            "total_instances": sum(group_lengths),
            "sample_sizes": sample_sizes,
        }

    def assert_instances_are_correct(self, candidate_instances):
        # First we need to remove padding tokens from the candidates.

        candidate_instances = [
            tuple(w for w in instance if w != 0)
            for instance in candidate_instances
        ]
        expected_instances = [
            tuple(instance.fields["text"]._indexed_tokens["tokens"]["tokens"])
            for instance in self.instances
        ]
        assert set(candidate_instances) == set(expected_instances)
Example #59
class TestTextField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("sentence", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='characters')
        self.vocab.add_token_to_namespace("s", namespace='characters')
        self.vocab.add_token_to_namespace("e", namespace='characters')
        self.vocab.add_token_to_namespace("n", namespace='characters')
        self.vocab.add_token_to_namespace("t", namespace='characters')
        self.vocab.add_token_to_namespace("c", namespace='characters')
        super(TestTextField, self).setUp()

    def test_field_counts_vocab_items_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}

    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

        field1 = TextField([Token(t) for t in ["A", "sentence"]],
                           {"characters": TokenCharactersIndexer(namespace="characters")})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        field2 = TextField([Token(t) for t in ["A", "sentence"]],
                           token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                           "characters": TokenCharactersIndexer(namespace="characters")})
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        # pylint: enable=protected-access

    def test_get_padding_lengths_raises_if_no_indexed_tokens(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"words_length": 5, "num_tokens": 5}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "characters_length": 5, "num_token_characters": 8}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters"),
                                          "words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5,
                                   "characters_length": 5,
                                   "words_length": 5,
                                   "num_token_characters": 8}

    def test_as_tensor_handles_words(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(),
                                                numpy.array([1, 1, 1, 2, 1]))
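
    # The expected array follows from the vocabulary assumed in this class's
    # setUp (not shown here): index 0 is padding, 1 is @@UNKNOWN@@, and
    # "sentence" is the only one of these tokens in the "words" namespace.
    def _sketch_word_ids(self):
        tokens = ["This", "is", "a", "sentence", "."]
        assert [2 if t == "sentence" else 1 for t in tokens] == [1, 1, 1, 2, 1]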

    def test_as_tensor_handles_longer_lengths(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["words_length"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(),
                                                numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]))

    def test_as_tensor_handles_characters(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                                [1, 3, 0, 0, 0, 0, 0, 0],
                                                [1, 0, 0, 0, 0, 0, 0, 0],
                                                [3, 4, 5, 6, 4, 5, 7, 4],
                                                [1, 0, 0, 0, 0, 0, 0, 0]])
        numpy.testing.assert_array_almost_equal(tensor_dict["characters"].detach().cpu().numpy(),
                                                expected_character_array)
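
    # Assumed character ids behind expected_character_array (inferred from
    # the rows themselves, since this class's setUp is not shown here): s=3,
    # e=4, n=5, t=6, c=7, unknown characters map to 1, and 0 is padding.
    def _sketch_character_ids(self):
        char_index = {"s": 3, "e": 4, "n": 5, "t": 6, "c": 7}
        assert [char_index.get(c, 1) for c in "sentence"] == [3, 4, 5, 6, 4, 5, 7, 4]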

    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField([Token(t) for t in ["a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["words_length"] = 5
        padding_lengths["characters_length"] = 5
        padding_lengths["num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)

        numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(),
                                                numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["characters"].detach().cpu().numpy(),
                                                numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                                                             [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    def test_printing_doesnt_crash(self):
        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        print(field)

    def test_token_indexer_returns_dict(self):
        field = TextField([Token(t) for t in ["A", "sentence"]],
                          token_indexers={"field_with_dict": DictReturningTokenIndexer(),
                                          "words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
                'token_ids_length': 5,
                'additional_key_length': 2,
                'words_length': 2,
                'characters_length': 2,
                'num_token_characters': 8,
                'num_tokens': 5,
        }
        padding_lengths['token_ids_length'] = 7
        padding_lengths['additional_key_length'] = 3
        padding_lengths['words_length'] = 4
        padding_lengths['characters_length'] = 4
        tensors = field.as_tensor(padding_lengths)
        assert list(tensors['token_ids'].shape) == [7]
        assert list(tensors['additional_key'].shape) == [3]
        assert list(tensors['words'].shape) == [4]
        assert list(tensors['characters'].shape) == [4, 8]
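
    # DictReturningTokenIndexer is a test helper defined elsewhere in this
    # file; a plausible minimal reconstruction (assumed, with TokenIndexer in
    # scope and its other required methods omitted): it returns several
    # arrays per token list, the way wordpiece-style indexers do. Two input
    # tokens then give five token_ids (two prefix ids, one per token, one
    # suffix id) and a fixed two-element additional_key, which matches the
    # padding lengths asserted above.
    class _SketchDictReturningTokenIndexer(TokenIndexer):
        def tokens_to_indices(self, tokens, vocabulary, index_name):
            word_ids = [vocabulary.get_token_index(t.text, "words") for t in tokens]
            return {"token_ids": [10, 15] + word_ids + [25],
                    "additional_key": [22, 29]}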

    def test_sequence_methods(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]], {})

        assert len(field) == 5
        assert field[1].text == "is"
        assert [token.text for token in field] == ["This", "is", "a", "sentence", "."]
Example #60
class TestListField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", "words")
        self.vocab.add_token_to_namespace("s", "characters")
        self.vocab.add_token_to_namespace("e", "characters")
        self.vocab.add_token_to_namespace("n", "characters")
        self.vocab.add_token_to_namespace("t", "characters")
        self.vocab.add_token_to_namespace("c", "characters")
        for label in ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]:
            self.vocab.add_token_to_namespace(label, "labels")

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters", min_padding_length=1),
        }
        self.field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence"]], self.word_indexer
        )
        self.field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence"]], self.word_indexer
        )
        self.field3 = TextField(
            [Token(t) for t in ["this", "is", "another", "sentence"]], self.word_indexer
        )

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        tokenizer = SpacyTokenizer()
        tokens = tokenizer.tokenize("Foo")
        text_field = TextField(tokens, self.word_indexer)
        empty_list_field = ListField([text_field.empty_field()])
        empty_fields = {"list_tensor": empty_list_field}
        self.empty_instance = Instance(empty_fields)

        non_empty_list_field = ListField([text_field])
        non_empty_fields = {"list_tensor": non_empty_list_field}
        self.non_empty_instance = Instance(non_empty_fields)

        super().setUp()

    def test_get_padding_lengths(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        lengths = list_field.get_padding_lengths()
        assert lengths == {"num_fields": 3, "list_words___tokens": 5}
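
    # Illustrative sketch (assumed convention, inferred from the assertion
    # above): the TextField joins indexer name and padding key with a triple
    # underscore, and the enclosing ListField prefixes "list_".
    def _sketch_padding_key_name(self):
        indexer_name, padding_key = "words", "tokens"
        child_key = f"{indexer_name}___{padding_key}"   # "words___tokens"
        assert "list_" + child_key == "list_words___tokens"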

    def test_list_field_can_handle_empty_text_fields(self):
        list_field = ListField([self.field1, self.field2, self.empty_text_field])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(
            tensor_dict["words"]["tokens"].detach().cpu().numpy(),
            numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [0, 0, 0, 0, 0]]),
        )

    def test_list_field_can_handle_empty_index_fields(self):
        list_field = ListField([self.index_field, self.index_field, self.empty_index_field])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(
            tensor.detach().cpu().numpy(), numpy.array([[1], [1], [-1]])
        )

    def test_list_field_can_handle_empty_sequence_label_fields(self):
        list_field = ListField(
            [self.sequence_label_field, self.sequence_label_field, self.empty_sequence_label_field]
        )
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(
            tensor.detach().cpu().numpy(), numpy.array([[1, 1, 0, 1], [1, 1, 0, 1], [0, 0, 0, 0]])
        )

    def test_all_fields_padded_to_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][0].detach().cpu().numpy(), numpy.array([2, 3, 4, 5, 0])
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][1].detach().cpu().numpy(), numpy.array([2, 3, 4, 1, 5])
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][2].detach().cpu().numpy(), numpy.array([2, 3, 1, 5, 0])
        )

    def test_nested_list_fields_are_padded_correctly(self):
        nested_field1 = ListField([LabelField(c) for c in ["a", "b", "c", "d", "e"]])
        nested_field2 = ListField([LabelField(c) for c in ["f", "g", "h", "i", "j", "k"]])
        list_field = ListField([nested_field1.empty_field(), nested_field1, nested_field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        assert padding_lengths == {"num_fields": 3, "list_num_fields": 6}
        tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy()
        numpy.testing.assert_almost_equal(
            tensor, [[-1, -1, -1, -1, -1, -1], [0, 1, 2, 3, 4, -1], [5, 6, 7, 8, 9, 10]]
        )
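
    # Sketch of the padding applied above (assumed semantics): an empty
    # LabelField carries id -1, so shorter inner lists are right-padded with
    # -1 up to the longest inner list (6 entries), and the empty_field() row
    # comes out as all -1s.
    def _sketch_nested_padding(self):
        rows = [[], [0, 1, 2, 3, 4], [5, 6, 7, 8, 9, 10]]
        padded = [row + [-1] * (6 - len(row)) for row in rows]
        assert padded == [[-1] * 6, [0, 1, 2, 3, 4, -1], [5, 6, 7, 8, 9, 10]]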

    def test_fields_can_pad_to_greater_than_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        padding_lengths["list_words___tokens"] = 7
        padding_lengths["num_fields"] = 5
        tensor_dict = list_field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][0].detach().cpu().numpy(),
            numpy.array([2, 3, 4, 5, 0, 0, 0]),
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][1].detach().cpu().numpy(),
            numpy.array([2, 3, 4, 1, 5, 0, 0]),
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][2].detach().cpu().numpy(),
            numpy.array([2, 3, 1, 5, 0, 0, 0]),
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][3].detach().cpu().numpy(),
            numpy.array([0, 0, 0, 0, 0, 0, 0]),
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][4].detach().cpu().numpy(),
            numpy.array([0, 0, 0, 0, 0, 0, 0]),
        )

    def test_as_tensor_can_handle_multiple_token_indexers(self):
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"]["tokens"].detach().cpu().numpy()
        characters = tensor_dict["characters"]["token_characters"].detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(
            words, numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [2, 3, 1, 5, 0]])
        )

        numpy.testing.assert_array_almost_equal(
            characters[0],
            numpy.array(
                [
                    [5, 1, 1, 2, 0, 0, 0, 0, 0],
                    [1, 2, 0, 0, 0, 0, 0, 0, 0],
                    [1, 0, 0, 0, 0, 0, 0, 0, 0],
                    [2, 3, 4, 5, 3, 4, 6, 3, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0],
                ]
            ),
        )

        numpy.testing.assert_array_almost_equal(
            characters[1],
            numpy.array(
                [
                    [5, 1, 1, 2, 0, 0, 0, 0, 0],
                    [1, 2, 0, 0, 0, 0, 0, 0, 0],
                    [1, 0, 0, 0, 0, 0, 0, 0, 0],
                    [1, 1, 1, 1, 3, 1, 3, 4, 5],
                    [2, 3, 4, 5, 3, 4, 6, 3, 0],
                ]
            ),
        )

        numpy.testing.assert_array_almost_equal(
            characters[2],
            numpy.array(
                [
                    [5, 1, 1, 2, 0, 0, 0, 0, 0],
                    [1, 2, 0, 0, 0, 0, 0, 0, 0],
                    [1, 4, 1, 5, 1, 3, 1, 0, 0],
                    [2, 3, 4, 5, 3, 4, 6, 3, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0],
                ]
            ),
        )
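
    # The character rows above decode with the ids assigned in setUp:
    # padding is 0, @@UNKNOWN@@ is 1, then s=2, e=3, n=4, t=5, c=6 in
    # insertion order, so "this" becomes [5, 1, 1, 2] (h and i are unknown).
    def _sketch_listfield_character_ids(self):
        char_index = {"s": 2, "e": 3, "n": 4, "t": 5, "c": 6}
        assert [char_index.get(c, 1) for c in "this"] == [5, 1, 1, 2]
        assert [char_index.get(c, 1) for c in "sentence"] == [2, 3, 4, 5, 3, 4, 6, 3]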

    def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields(self):
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers

        list_field = ListField([self.field1.empty_field(), self.field1, self.field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"]["tokens"].detach().cpu().numpy()
        characters = tensor_dict["characters"]["token_characters"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(
            words, numpy.array([[0, 0, 0, 0, 0], [2, 3, 4, 5, 0], [2, 3, 4, 1, 5]])
        )

        numpy.testing.assert_array_almost_equal(characters[0], numpy.zeros([5, 9]))

        numpy.testing.assert_array_almost_equal(
            characters[1],
            numpy.array(
                [
                    [5, 1, 1, 2, 0, 0, 0, 0, 0],
                    [1, 2, 0, 0, 0, 0, 0, 0, 0],
                    [1, 0, 0, 0, 0, 0, 0, 0, 0],
                    [2, 3, 4, 5, 3, 4, 6, 3, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0],
                ]
            ),
        )

        numpy.testing.assert_array_almost_equal(
            characters[2],
            numpy.array(
                [
                    [5, 1, 1, 2, 0, 0, 0, 0, 0],
                    [1, 2, 0, 0, 0, 0, 0, 0, 0],
                    [1, 0, 0, 0, 0, 0, 0, 0, 0],
                    [1, 1, 1, 1, 3, 1, 3, 4, 5],
                    [2, 3, 4, 5, 3, 4, 6, 3, 0],
                ]
            ),
        )

    def test_printing_doesnt_crash(self):
        list_field = ListField([self.field1, self.field2])
        print(list_field)

    def test_sequence_methods(self):
        list_field = ListField([self.field1, self.field2, self.field3])

        assert len(list_field) == 3
        assert list_field[1] == self.field2
        assert [f for f in list_field] == [self.field1, self.field2, self.field3]

    def test_empty_list_can_be_tensorized(self):
        tokenizer = SpacyTokenizer()
        tokens = tokenizer.tokenize("Foo")
        text_field = TextField(tokens, self.word_indexer)
        list_field = ListField([text_field.empty_field()])
        fields = {
            "list": list_field,
            "bar": TextField(tokenizer.tokenize("BAR"), self.word_indexer),
        }
        instance = Instance(fields)
        instance.index_fields(self.vocab)
        instance.as_tensor_dict()

    def test_batch_with_some_empty_lists_works(self):
        dataset = [self.empty_instance, self.non_empty_instance]

        model = DummyModel(self.vocab)
        model.eval()
        iterator = BasicIterator(batch_size=2)
        iterator.index_with(self.vocab)
        batch = next(iterator(dataset, shuffle=False))
        model.forward(**batch)

    # This use case may seem a bit peculiar. It's intended for situations where
    # you have sparse inputs that are used as additional features for some
    # prediction, and they are sparse enough that they can be empty for some
    # cases. It would be silly to try to handle these as None in your model; it
    # makes a whole lot more sense to just have a minimally-sized tensor that
    # gets entirely masked and has no effect on the rest of the model.
    def test_batch_of_entirely_empty_lists_works(self):
        dataset = [self.empty_instance, self.empty_instance]

        model = DummyModel(self.vocab)
        model.eval()
        iterator = BasicIterator(batch_size=2)
        iterator.index_with(self.vocab)
        batch = next(iterator(dataset, shuffle=False))
        model.forward(**batch)
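
# DummyModel is defined elsewhere in this file; a minimal sketch of the kind
# of model the two tests above require (an assumption, not the original):
# it just consumes the batched "list_tensor" input, using the text field mask
# so list entries that are entirely padding contribute nothing.
from allennlp.models import Model
from allennlp.nn.util import get_text_field_mask

class _SketchDummyModel(Model):
    def forward(self, list_tensor):
        # An empty (fully padded) list yields an all-zero mask row here.
        mask = get_text_field_mask(list_tensor, num_wrapping_dims=1)
        return {"mask_sum": mask.sum()}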