def test_read_embedding_file_inside_archive(self):
    token2vec = {
            "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
            "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
            "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
            "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
    }
    vocab = Vocabulary()
    for token in token2vec:
        vocab.add_token_to_namespace(token)

    params = Params({
            'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
            'embedding_dim': 5
    })
    with pytest.raises(ValueError, message="No ValueError when pretrained_file is a multi-file archive"):
        Embedding.from_params(vocab, params)

    for ext in ['.zip', '.tar.gz']:
        archive_path = str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
        file_uri = format_embeddings_file_uri(archive_path, 'folder/fake_embeddings.5d.txt')
        params = Params({
                'pretrained_file': file_uri,
                'embedding_dim': 5
        })
        embeddings = Embedding.from_params(vocab, params).weight.data
        for tok, vec in token2vec.items():
            i = vocab.get_token_index(tok)
            assert torch.equal(embeddings[i], vec), 'Problem with format ' + archive_path
def test_dry_run_without_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    # If extend is False, it is the user's responsibility to make sure that the dataset
    # instances will be indexable by the provided vocabulary. At least @@UNKNOWN@@ should be
    # present in every namespace for which there could be OOV entries seen in the dataset
    # during indexing. The `tokens` namespace will see new words, but it has an @@UNKNOWN@@
    # token; the `labels` namespace has no @@UNKNOWN@@, so 'N' and 'V' must be added upfront.
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    vocab.add_token_to_namespace('N', namespace='labels')
    vocab.add_token_to_namespace('V', namespace='labels')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = False
    dry_run_from_params(self.params, extended_serialization_dir)

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'
    assert len(tokens) == 3
def get_vocab(word2freq, max_v_sizes):
    '''Build a vocabulary from a word -> frequency mapping, keeping the most frequent words.'''
    vocab = Vocabulary(counter=None, max_vocab_size=max_v_sizes['word'])
    words_by_freq = [(word, freq) for word, freq in word2freq.items()]
    words_by_freq.sort(key=lambda x: x[1], reverse=True)
    for word, _ in words_by_freq[:max_v_sizes['word']]:
        vocab.add_token_to_namespace(word, 'tokens')
    log.info("\tFinished building vocab. Using %d words", vocab.get_vocab_size('tokens'))
    return vocab
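A minimal usage sketch for `get_vocab` above; the `word2freq` counts and the `'word'` cap are invented for illustration and are not from the source.

# Hypothetical usage sketch for get_vocab (inputs invented for illustration):
word2freq = {"the": 120, "cat": 7, "sat": 3, "mat": 1}
max_v_sizes = {"word": 2}
vocab = get_vocab(word2freq, max_v_sizes)
# Only the two most frequent words ("the" and "cat") are added to the 'tokens'
# namespace; rarer words like "mat" fall back to the vocabulary's OOV index.
assert vocab.get_token_index("the", "tokens") != vocab.get_token_index("mat", "tokens")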
def test_token_to_indices_uses_ner_tags(self):
    tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    person_index = vocab.add_token_to_namespace('PERSON', namespace='ner_tags')
    none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
    vocab.add_token_to_namespace('ORG', namespace='ner_tags')
    indexer = NerTagIndexer()
    assert indexer.token_to_indices(tokens[1], vocab) == person_index
    assert indexer.token_to_indices(tokens[-1], vocab) == none_index
def test_as_tensor_produces_integer_targets(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("B", namespace='*labels')
    vocab.add_token_to_namespace("I", namespace='*labels')
    vocab.add_token_to_namespace("O", namespace='*labels')

    tags = ["B", "I", "O", "O", "O"]
    sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels")
    sequence_label_field.index(vocab)
    padding_lengths = sequence_label_field.get_padding_lengths()
    tensor = sequence_label_field.as_tensor(padding_lengths).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 1, 2, 2, 2]))
def test_get_embedding_layer_uses_correct_embedding_dim(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace('word1')
    vocab.add_token_to_namespace('word2')
    embeddings_filename = self.TEST_DIR + "embeddings.gz"
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write("word2 0.1 0.4 -4.0\n".encode('utf-8'))
    embedding_weights = _read_pretrained_embedding_file(embeddings_filename, 3, vocab)
    assert tuple(embedding_weights.size()) == (4, 3)  # 4 because of padding and OOV
    with pytest.raises(ConfigurationError):
        _read_pretrained_embedding_file(embeddings_filename, 4, vocab)
def test_token_to_indices_uses_pos_tags(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
    cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
    none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
    indexer = PosTagIndexer(coarse_tags=True)
    assert indexer.token_to_indices(tokens[1], vocab) == verb_index
    assert indexer.token_to_indices(tokens[-1], vocab) == none_index
    indexer._coarse_tags = False  # pylint: disable=protected-access
    assert indexer.token_to_indices(tokens[1], vocab) == cop_index
def test_index_converts_field_correctly(self):
    vocab = Vocabulary()
    b_index = vocab.add_token_to_namespace("B", namespace='*labels')
    i_index = vocab.add_token_to_namespace("I", namespace='*labels')
    o_index = vocab.add_token_to_namespace("O", namespace='*labels')

    tags = ["B", "I", "O", "O", "O"]
    sequence_label_field = SequenceLabelField(tags, self.text, label_namespace="*labels")
    sequence_label_field.index(vocab)
    # pylint: disable=protected-access
    assert sequence_label_field._indexed_labels == [b_index, i_index, o_index, o_index, o_index]
def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    embeddings_filename = self.TEST_DIR + "embeddings.gz"
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
    params = Params({
            'pretrained_file': embeddings_filename,
            'embedding_dim': 3,
    })
    embedding_layer = Embedding.from_params(vocab, params)
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
    assert not numpy.allclose(word_vector.numpy(), numpy.array([0.0, 0.0, 0.0]))
class TestDataset(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this")
        self.vocab.add_token_to_namespace("is")
        self.vocab.add_token_to_namespace("a")
        self.vocab.add_token_to_namespace("sentence")
        self.vocab.add_token_to_namespace(".")
        self.token_indexer = {"tokens": SingleIdTokenIndexer()}
        self.instances = self.get_instances()
        super(TestDataset, self).setUp()

    def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({"tag": (LabelField(1, skip_indexing=True))})
        instance2 = Instance({"words": TextField([Token("hello")], {})})
        with pytest.raises(ConfigurationError):
            _ = Batch([instance1, instance2])

    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {"text1": {"num_tokens": 5, "tokens_length": 5},
                                   "text2": {"num_tokens": 6, "tokens_length": 6}}

    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
        text2 = tensors["text2"]["tokens"].detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                    [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                    [2, 3, 1, 0, 0, 0]]))

    def get_instances(self):
        field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
                           self.token_indexer)
        field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]],
                           self.token_indexer)
        field4 = TextField([Token(t) for t in ["this", "is", "short"]],
                           self.token_indexer)
        instances = [Instance({"text1": field1, "text2": field2}),
                     Instance({"text1": field3, "text2": field4})]
        return instances
def test_read_hdf5_raises_on_invalid_shape(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    embeddings_filename = self.TEST_DIR + "embeddings.hdf5"
    embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
    with h5py.File(embeddings_filename, 'w') as fout:
        _ = fout.create_dataset('embedding', embeddings.shape, dtype='float32', data=embeddings)

    params = Params({
            'pretrained_file': embeddings_filename,
            'embedding_dim': 5,
    })
    with pytest.raises(ConfigurationError):
        _ = Embedding.from_params(vocab, params)
def test_adjacency_field_can_index_with_vocab(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("a", namespace="labels")
    vocab.add_token_to_namespace("b", namespace="labels")
    vocab.add_token_to_namespace("c", namespace="labels")

    labels = ["a", "b"]
    indices = [(0, 1), (2, 1)]
    adjacency_field = AdjacencyField(indices, self.text, labels)
    adjacency_field.index(vocab)
    tensor = adjacency_field.as_tensor(adjacency_field.get_padding_lengths())
    numpy.testing.assert_equal(tensor.numpy(), numpy.array([[-1, 0, -1, -1, -1],
                                                            [-1, -1, -1, -1, -1],
                                                            [-1, 1, -1, -1, -1],
                                                            [-1, -1, -1, -1, -1],
                                                            [-1, -1, -1, -1, -1]]))
def test_start_and_end_tokens(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("A", namespace='characters')  # 2
    vocab.add_token_to_namespace("s", namespace='characters')  # 3
    vocab.add_token_to_namespace("e", namespace='characters')  # 4
    vocab.add_token_to_namespace("n", namespace='characters')  # 5
    vocab.add_token_to_namespace("t", namespace='characters')  # 6
    vocab.add_token_to_namespace("c", namespace='characters')  # 7
    vocab.add_token_to_namespace("<", namespace='characters')  # 8
    vocab.add_token_to_namespace(">", namespace='characters')  # 9
    vocab.add_token_to_namespace("/", namespace='characters')  # 10
    indexer = TokenCharactersIndexer("characters", start_tokens=["<s>"], end_tokens=["</s>"])
    indices = indexer.tokens_to_indices([Token("sentential")], vocab, "char")
    assert indices == {"char": [[8, 3, 9],
                                [3, 4, 5, 6, 4, 5, 6, 1, 1, 1],
                                [8, 10, 3, 9]]}
def test_read_hdf5_format_file(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    embeddings_filename = self.TEST_DIR + "embeddings.hdf5"
    embeddings = numpy.random.rand(vocab.get_vocab_size(), 5)
    with h5py.File(embeddings_filename, 'w') as fout:
        _ = fout.create_dataset('embedding', embeddings.shape, dtype='float32', data=embeddings)

    params = Params({
            'pretrained_file': embeddings_filename,
            'embedding_dim': 5,
    })
    embedding_layer = Embedding.from_params(vocab, params)
    assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)
def test_forward_works_with_projection_layer(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace('the')
    vocab.add_token_to_namespace('a')
    params = Params({
            'pretrained_file': 'tests/fixtures/glove.6B.300d.sample.txt.gz',
            'embedding_dim': 300,
            'projection_dim': 20
    })
    embedding_layer = Embedding.from_params(vocab, params)
    input_tensor = Variable(torch.LongTensor([[3, 2, 1, 0]]))
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 4, 20)

    input_tensor = Variable(torch.LongTensor([[[3, 2, 1, 0]]]))
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 1, 4, 20)
def test_tokens_to_indices_uses_dep_labels(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    root_index = vocab.add_token_to_namespace('ROOT', namespace='dep_labels')
    none_index = vocab.add_token_to_namespace('NONE', namespace='dep_labels')
    indexer = DepLabelIndexer()
    assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {"tokens1": [root_index]}
    assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {"tokens-1": [none_index]}
class TestTokenCharactersEncoder(AllenNlpTestCase):
    def setUp(self):
        super(TestTokenCharactersEncoder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1", "token_characters")
        self.vocab.add_token_to_namespace("2", "token_characters")
        self.vocab.add_token_to_namespace("3", "token_characters")
        self.vocab.add_token_to_namespace("4", "token_characters")
        params = Params({
                "embedding": {
                        "embedding_dim": 2,
                        "vocab_namespace": "token_characters"
                },
                "encoder": {
                        "type": "cnn",
                        "embedding_dim": 2,
                        "num_filters": 4,
                        "ngram_filter_sizes": [1, 2],
                        "output_dim": 3
                }
        })
        self.encoder = TokenCharactersEncoder.from_params(vocab=self.vocab, params=deepcopy(params))
        self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"])
        self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
        constant_init = Initializer.from_params(Params({"type": "constant", "val": 1.}))
        initializer = InitializerApplicator([(".*", constant_init)])
        initializer(self.encoder)
        initializer(self.embedding)
        initializer(self.inner_encoder)

    def test_get_output_dim_uses_encoder_output_dim(self):
        assert self.encoder.get_output_dim() == 3

    def test_forward_applies_embedding_then_encoder(self):
        numpy_tensor = numpy.random.randint(6, size=(3, 4, 7))
        inputs = torch.from_numpy(numpy_tensor)
        encoder_output = self.encoder(inputs)
        reshaped_input = inputs.view(12, 7)
        embedded = self.embedding(reshaped_input)
        mask = (inputs != 0).long().view(12, 7)
        reshaped_manual_output = self.inner_encoder(embedded, mask)
        manual_output = reshaped_manual_output.view(3, 4, 3)
        assert_almost_equal(encoder_output.data.numpy(), manual_output.data.numpy())
class IteratorTest(AllenNlpTestCase):
    def setUp(self):
        super(IteratorTest, self).setUp()
        self.token_indexers = {"tokens": SingleIdTokenIndexer()}
        self.vocab = Vocabulary()
        self.this_index = self.vocab.add_token_to_namespace('this')
        self.is_index = self.vocab.add_token_to_namespace('is')
        self.a_index = self.vocab.add_token_to_namespace('a')
        self.sentence_index = self.vocab.add_token_to_namespace('sentence')
        self.another_index = self.vocab.add_token_to_namespace('another')
        self.yet_index = self.vocab.add_token_to_namespace('yet')
        self.very_index = self.vocab.add_token_to_namespace('very')
        self.long_index = self.vocab.add_token_to_namespace('long')
        instances = [
                self.create_instance(["this", "is", "a", "sentence"]),
                self.create_instance(["this", "is", "another", "sentence"]),
                self.create_instance(["yet", "another", "sentence"]),
                self.create_instance(["this", "is", "a", "very", "very", "very", "very", "long", "sentence"]),
                self.create_instance(["sentence"]),
        ]
        self.instances = instances
        self.lazy_instances = LazyIterable(instances)

    def create_instance(self, str_tokens: List[str]):
        tokens = [Token(t) for t in str_tokens]
        instance = Instance({'text': TextField(tokens, self.token_indexers)})
        return instance

    def create_instances_from_token_counts(self, token_counts: List[int]) -> List[Instance]:
        return [self.create_instance(["word"] * count) for count in token_counts]

    def get_batches_stats(self, batches: Iterable[Batch]) -> Dict[str, Union[int, List[int]]]:
        grouped_instances = [batch.instances for batch in batches]
        group_lengths = [len(group) for group in grouped_instances]
        sample_sizes = []
        for batch in batches:
            batch_sequence_length = max(
                    [instance.get_padding_lengths()['text']['num_tokens']
                     for instance in batch.instances]
            )
            sample_sizes.append(batch_sequence_length * len(batch.instances))
        return {
                "batch_lengths": group_lengths,
                "total_instances": sum(group_lengths),
                "sample_sizes": sample_sizes
        }

    def assert_instances_are_correct(self, candidate_instances):
        # First we need to remove padding tokens from the candidates.
        # pylint: disable=protected-access
        candidate_instances = [tuple(w for w in instance if w != 0)
                               for instance in candidate_instances]
        expected_instances = [tuple(instance.fields["text"]._indexed_tokens["tokens"])
                              for instance in self.instances]
        assert set(candidate_instances) == set(expected_instances)
def test_index_converts_field_correctly(self):
    vocab = Vocabulary()
    sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
    capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
    capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
    s_index = vocab.add_token_to_namespace("s", namespace='characters')
    e_index = vocab.add_token_to_namespace("e", namespace='characters')
    n_index = vocab.add_token_to_namespace("n", namespace='characters')
    t_index = vocab.add_token_to_namespace("t", namespace='characters')
    c_index = vocab.add_token_to_namespace("c", namespace='characters')

    field = TextField([Token(t) for t in ["A", "sentence"]],
                      {"words": SingleIdTokenIndexer(namespace="words")})
    field.index(vocab)
    # pylint: disable=protected-access
    assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

    field1 = TextField([Token(t) for t in ["A", "sentence"]],
                       {"characters": TokenCharactersIndexer(namespace="characters")})
    field1.index(vocab)
    assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                    [s_index, e_index, n_index, t_index,
                                                     e_index, n_index, c_index, e_index]]

    field2 = TextField([Token(t) for t in ["A", "sentence"]],
                       token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                       "characters": TokenCharactersIndexer(namespace="characters")})
    field2.index(vocab)
    assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
    assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                    [s_index, e_index, n_index, t_index,
                                                     e_index, n_index, c_index, e_index]]
def test_dry_run_with_extension(self):
    existing_serialization_dir = self.TEST_DIR / 'existing'
    extended_serialization_dir = self.TEST_DIR / 'extended'
    existing_vocab_path = existing_serialization_dir / 'vocabulary'
    extended_vocab_path = extended_serialization_dir / 'vocabulary'

    vocab = Vocabulary()
    vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
    vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
    os.makedirs(existing_serialization_dir, exist_ok=True)
    vocab.save_to_files(existing_vocab_path)

    self.params['vocabulary'] = {}
    self.params['vocabulary']['directory_path'] = existing_vocab_path
    self.params['vocabulary']['extend'] = True
    self.params['vocabulary']['min_count'] = {"tokens": 3}
    dry_run_from_params(self.params, extended_serialization_dir)

    vocab_files = os.listdir(extended_vocab_path)
    assert set(vocab_files) == {'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'}

    with open(extended_vocab_path / 'tokens.txt') as f:
        tokens = [line.strip() for line in f]
    assert tokens[0] == '@@UNKNOWN@@'
    assert tokens[1] == 'some_weird_token_1'
    assert tokens[2] == 'some_weird_token_2'

    tokens.sort()
    assert tokens == ['.', '@@UNKNOWN@@', 'animals', 'are',
                      'some_weird_token_1', 'some_weird_token_2']

    with open(extended_vocab_path / 'labels.txt') as f:
        labels = [line.strip() for line in f]
    labels.sort()
    assert labels == ['N', 'V']
def test_label_field_can_index_with_vocab(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("entailment", namespace="labels")
    vocab.add_token_to_namespace("contradiction", namespace="labels")
    vocab.add_token_to_namespace("neutral", namespace="labels")
    label = LabelField("entailment")
    label.index(vocab)
    tensor = label.as_tensor(label.get_padding_lengths())
    assert tensor.item() == 0
def test_tokens_to_indices_uses_pos_tags(self):
    tokens = self.tokenizer.split_words("This is a sentence.")
    tokens = [t for t in tokens] + [Token("</S>")]
    vocab = Vocabulary()
    verb_index = vocab.add_token_to_namespace('VERB', namespace='pos_tags')
    cop_index = vocab.add_token_to_namespace('VBZ', namespace='pos_tags')
    none_index = vocab.add_token_to_namespace('NONE', namespace='pos_tags')
    # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
    vocab.add_token_to_namespace('DET', namespace='pos_tags')
    vocab.add_token_to_namespace('NOUN', namespace='pos_tags')
    vocab.add_token_to_namespace('PUNCT', namespace='pos_tags')
    indexer = PosTagIndexer(namespace='pos_tags', coarse_tags=True)

    indices = indexer.tokens_to_indices(tokens, vocab, "tokens")
    assert len(indices) == 1
    assert "tokens" in indices
    assert indices["tokens"][1] == verb_index
    assert indices["tokens"][-1] == none_index

    indexer._coarse_tags = False  # pylint: disable=protected-access
    assert indexer.tokens_to_indices([tokens[1]], vocab, "coarse") == {"coarse": [cop_index]}
def test_token_to_indices_produces_correct_characters(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("A", namespace='characters')
    vocab.add_token_to_namespace("s", namespace='characters')
    vocab.add_token_to_namespace("e", namespace='characters')
    vocab.add_token_to_namespace("n", namespace='characters')
    vocab.add_token_to_namespace("t", namespace='characters')
    vocab.add_token_to_namespace("c", namespace='characters')
    indexer = TokenCharactersIndexer("characters")
    indices = indexer.token_to_indices(Token("sentential"), vocab)
    assert indices == [3, 4, 5, 6, 4, 5, 6, 1, 1, 1]
class IteratorTest(AllenNlpTestCase):
    def setUp(self):
        super(IteratorTest, self).setUp()
        self.token_indexers = {"tokens": SingleIdTokenIndexer()}
        self.vocab = Vocabulary()
        self.this_index = self.vocab.add_token_to_namespace('this')
        self.is_index = self.vocab.add_token_to_namespace('is')
        self.a_index = self.vocab.add_token_to_namespace('a')
        self.sentence_index = self.vocab.add_token_to_namespace('sentence')
        self.another_index = self.vocab.add_token_to_namespace('another')
        self.yet_index = self.vocab.add_token_to_namespace('yet')
        self.very_index = self.vocab.add_token_to_namespace('very')
        self.long_index = self.vocab.add_token_to_namespace('long')
        instances = [
                self.create_instance(["this", "is", "a", "sentence"]),
                self.create_instance(["this", "is", "another", "sentence"]),
                self.create_instance(["yet", "another", "sentence"]),
                self.create_instance(["this", "is", "a", "very", "very", "very", "very", "long", "sentence"]),
                self.create_instance(["sentence"]),
        ]

        class LazyIterable:
            def __iter__(self):
                return (instance for instance in instances)

        self.instances = instances
        self.lazy_instances = LazyIterable()

    def create_instance(self, str_tokens: List[str]):
        tokens = [Token(t) for t in str_tokens]
        instance = Instance({'text': TextField(tokens, self.token_indexers)})
        instance.index_fields(self.vocab)
        return instance

    def assert_instances_are_correct(self, candidate_instances):
        # First we need to remove padding tokens from the candidates.
        # pylint: disable=protected-access
        candidate_instances = [tuple(w for w in instance if w != 0)
                               for instance in candidate_instances]
        expected_instances = [tuple(instance.fields["text"]._indexed_tokens["tokens"])
                              for instance in self.instances]
        assert set(candidate_instances) == set(expected_instances)
def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
    vocab.add_token_to_namespace(unicode_space)
    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, 'wb') as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
        embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode('utf-8'))
    params = Params({
            'pretrained_file': embeddings_filename,
            'embedding_dim': 3,
    })
    embedding_layer = Embedding.from_params(vocab, params)
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
    assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index(unicode_space)]
    assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3, 5.0]))
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
    assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
class TestTextField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("sentence", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='words')
        self.vocab.add_token_to_namespace("A", namespace='characters')
        self.vocab.add_token_to_namespace("s", namespace='characters')
        self.vocab.add_token_to_namespace("e", namespace='characters')
        self.vocab.add_token_to_namespace("n", namespace='characters')
        self.vocab.add_token_to_namespace("t", namespace='characters')
        self.vocab.add_token_to_namespace("c", namespace='characters')
        super(TestTextField, self).setUp()

    def test_field_counts_vocab_items_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["words"]

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert list(namespace_token_counts.keys()) == ["characters"]

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts["characters"]["T"] == 1
        assert namespace_token_counts["characters"]["h"] == 1
        assert namespace_token_counts["characters"]["i"] == 2
        assert namespace_token_counts["characters"]["s"] == 3
        assert namespace_token_counts["characters"]["a"] == 1
        assert namespace_token_counts["characters"]["e"] == 3
        assert namespace_token_counts["characters"]["n"] == 2
        assert namespace_token_counts["characters"]["t"] == 1
        assert namespace_token_counts["characters"]["c"] == 1
        assert namespace_token_counts["characters"]["."] == 1
        assert namespace_token_counts["words"]["This"] == 1
        assert namespace_token_counts["words"]["is"] == 1
        assert namespace_token_counts["words"]["a"] == 1
        assert namespace_token_counts["words"]["sentence"] == 1
        assert namespace_token_counts["words"]["."] == 1
        assert set(namespace_token_counts.keys()) == {"words", "characters"}

    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

        field1 = TextField([Token(t) for t in ["A", "sentence"]],
                           {"characters": TokenCharactersIndexer(namespace="characters")})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]

        field2 = TextField([Token(t) for t in ["A", "sentence"]],
                           token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                           "characters": TokenCharactersIndexer(namespace="characters")})
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        # pylint: enable=protected-access

    def test_get_padding_lengths_raises_if_no_indexed_tokens(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters"),
                                          "words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"num_tokens": 5, "num_token_characters": 8}

    def test_as_tensor_handles_words(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict["words"].data.cpu().numpy(),
                                                numpy.array([1, 1, 1, 2, 1]))

    def test_as_tensor_handles_longer_lengths(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict["words"].data.cpu().numpy(),
                                                numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]))

    def test_as_tensor_handles_characters(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]],
                          token_indexers={"characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                                [1, 3, 0, 0, 0, 0, 0, 0],
                                                [1, 0, 0, 0, 0, 0, 0, 0],
                                                [3, 4, 5, 6, 4, 5, 7, 4],
                                                [1, 0, 0, 0, 0, 0, 0, 0]])
        numpy.testing.assert_array_almost_equal(tensor_dict["characters"].data.cpu().numpy(),
                                                expected_character_array)

    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField([Token(t) for t in ["a", "sentence", "."]],
                          token_indexers={"words": SingleIdTokenIndexer("words"),
                                          "characters": TokenCharactersIndexer("characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["num_tokens"] = 5
        padding_lengths["num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict["words"].data.cpu().numpy(),
                                                numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict["characters"].data.cpu().numpy(),
                                                numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                                                             [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    def test_printing_doesnt_crash(self):
        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        print(field)
class TestBasicTextFieldEmbedder(AllenNlpTestCase):
    def setUp(self):
        super(TestBasicTextFieldEmbedder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1")
        self.vocab.add_token_to_namespace("2")
        self.vocab.add_token_to_namespace("3")
        self.vocab.add_token_to_namespace("4")
        params = Params({
                "words1": {
                        "type": "embedding",
                        "embedding_dim": 2
                },
                "words2": {
                        "type": "embedding",
                        "embedding_dim": 5
                },
                "words3": {
                        "type": "embedding",
                        "embedding_dim": 3
                }
        })
        self.token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        self.inputs = {
                "words1": Variable(torch.LongTensor([[0, 2, 3, 5]])),
                "words2": Variable(torch.LongTensor([[1, 4, 3, 2]])),
                "words3": Variable(torch.LongTensor([[1, 5, 1, 2]]))
        }

    def test_get_output_dim_aggregates_dimension_from_each_embedding(self):
        assert self.token_embedder.get_output_dim() == 10

    def test_forward_asserts_input_field_match(self):
        self.inputs['words4'] = self.inputs['words3']
        del self.inputs['words3']
        with pytest.raises(ConfigurationError):
            self.token_embedder(self.inputs)
        self.inputs['words3'] = self.inputs['words4']
        del self.inputs['words4']

    def test_forward_concats_resultant_embeddings(self):
        assert self.token_embedder(self.inputs).size() == (1, 4, 10)

    def test_forward_works_on_higher_order_input(self):
        params = Params({
                "words": {
                        "type": "embedding",
                        "num_embeddings": 20,
                        "embedding_dim": 2,
                },
                "characters": {
                        "type": "character_encoding",
                        "embedding": {
                                "embedding_dim": 4,
                                "num_embeddings": 15,
                        },
                        "encoder": {
                                "type": "cnn",
                                "embedding_dim": 4,
                                "num_filters": 10,
                                "ngram_filter_sizes": [3],
                        },
                }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
                'words': Variable(torch.rand(3, 4, 5, 6) * 20).long(),
                'characters': Variable(torch.rand(3, 4, 5, 6, 7) * 15).long(),
        }
        assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)
class TestBasicTextFieldEmbedder(AllenNlpTestCase):
    def setUp(self):
        super(TestBasicTextFieldEmbedder, self).setUp()
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("1")
        self.vocab.add_token_to_namespace("2")
        self.vocab.add_token_to_namespace("3")
        self.vocab.add_token_to_namespace("4")
        params = Params({
                "token_embedders": {
                        "words1": {
                                "type": "embedding",
                                "embedding_dim": 2
                        },
                        "words2": {
                                "type": "embedding",
                                "embedding_dim": 5
                        },
                        "words3": {
                                "type": "embedding",
                                "embedding_dim": 3
                        }
                }
        })
        self.token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params)
        self.inputs = {
                "words1": torch.LongTensor([[0, 2, 3, 5]]),
                "words2": torch.LongTensor([[1, 4, 3, 2]]),
                "words3": torch.LongTensor([[1, 5, 1, 2]])
        }

    def test_get_output_dim_aggregates_dimension_from_each_embedding(self):
        assert self.token_embedder.get_output_dim() == 10

    def test_forward_asserts_input_field_match(self):
        # Total mismatch
        self.inputs['words4'] = self.inputs['words3']
        del self.inputs['words3']
        with pytest.raises(ConfigurationError) as exc:
            self.token_embedder(self.inputs)
        assert exc.match("Mismatched token keys")

        self.inputs['words3'] = self.inputs['words4']

        # Text field has too many inputs
        with pytest.raises(ConfigurationError) as exc:
            self.token_embedder(self.inputs)
        assert exc.match("is generating more keys")

        del self.inputs['words4']

    def test_forward_concats_resultant_embeddings(self):
        assert self.token_embedder(self.inputs).size() == (1, 4, 10)

    def test_forward_works_on_higher_order_input(self):
        params = Params({
                "token_embedders": {
                        "words": {
                                "type": "embedding",
                                "num_embeddings": 20,
                                "embedding_dim": 2,
                        },
                        "characters": {
                                "type": "character_encoding",
                                "embedding": {
                                        "embedding_dim": 4,
                                        "num_embeddings": 15,
                                },
                                "encoder": {
                                        "type": "cnn",
                                        "embedding_dim": 4,
                                        "num_filters": 10,
                                        "ngram_filter_sizes": [3],
                                },
                        }
                }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params)
        inputs = {
                'words': (torch.rand(3, 4, 5, 6) * 20).long(),
                'characters': (torch.rand(3, 4, 5, 6, 7) * 15).long(),
        }
        assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)

    def test_forward_runs_with_non_bijective_mapping(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
        options_file = str(elmo_fixtures_path / 'options.json')
        weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
        params = Params({
                "token_embedders": {
                        "words": {
                                "type": "embedding",
                                "num_embeddings": 20,
                                "embedding_dim": 2,
                        },
                        "elmo": {
                                "type": "elmo_token_embedder",
                                "options_file": options_file,
                                "weight_file": weight_file
                        },
                },
                "embedder_to_indexer_map": {"words": ["words"], "elmo": ["elmo", "words"]}
        })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
                'words': (torch.rand(3, 6) * 20).long(),
                'elmo': (torch.rand(3, 6, 50) * 15).long(),
        }
        token_embedder(inputs)

    def test_forward_runs_with_non_bijective_mapping_with_null(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
        options_file = str(elmo_fixtures_path / 'options.json')
        weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
        params = Params({
                "token_embedders": {
                        "elmo": {
                                "type": "elmo_token_embedder",
                                "options_file": options_file,
                                "weight_file": weight_file
                        },
                },
                "embedder_to_indexer_map": {
                        # ignore `word_inputs` in `ElmoTokenEmbedder.forward`
                        "elmo": ["elmo", None]
                }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
                'elmo': (torch.rand(3, 6, 50) * 15).long(),
        }
        token_embedder(inputs)

    def test_forward_runs_with_non_bijective_mapping_with_dict(self):
        elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
        options_file = str(elmo_fixtures_path / 'options.json')
        weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
        params = Params({
                "token_embedders": {
                        "words": {
                                "type": "embedding",
                                "num_embeddings": 20,
                                "embedding_dim": 2,
                        },
                        "elmo": {
                                "type": "elmo_token_embedder",
                                "options_file": options_file,
                                "weight_file": weight_file
                        },
                },
                "embedder_to_indexer_map": {
                        # pass arguments to `ElmoTokenEmbedder.forward` by dict
                        "elmo": {
                                "inputs": "elmo",
                                "word_inputs": "words"
                        },
                        "words": ["words"]
                }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
        inputs = {
                'words': (torch.rand(3, 6) * 20).long(),
                'elmo': (torch.rand(3, 6, 50) * 15).long(),
        }
        token_embedder(inputs)

    def test_old_from_params_new_from_params(self):
        old_params = Params({
                "words1": {
                        "type": "embedding",
                        "embedding_dim": 2
                },
                "words2": {
                        "type": "embedding",
                        "embedding_dim": 5
                },
                "words3": {
                        "type": "embedding",
                        "embedding_dim": 3
                }
        })
        # Allow loading the parameters in the old format
        with pytest.warns(DeprecationWarning):
            old_embedder = BasicTextFieldEmbedder.from_params(params=old_params, vocab=self.vocab)

        new_params = Params({
                "token_embedders": {
                        "words1": {
                                "type": "embedding",
                                "embedding_dim": 2
                        },
                        "words2": {
                                "type": "embedding",
                                "embedding_dim": 5
                        },
                        "words3": {
                                "type": "embedding",
                                "embedding_dim": 3
                        }
                }
        })
        # But also allow loading the parameters in the new format
        new_embedder = BasicTextFieldEmbedder.from_params(params=new_params, vocab=self.vocab)
        assert old_embedder._token_embedders.keys() == new_embedder._token_embedders.keys()
        assert new_embedder(self.inputs).size() == (1, 4, 10)
class KnowledgeGraphFieldTest(AllenNlpTestCase):
    def setUp(self):
        self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
        self.utterance = self.tokenizer.tokenize("where is mersin?")
        self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

        table_file = self.FIXTURES_ROOT / "data" / "wikitables" / "tables" / "341.tagged"
        self.graph = TableQuestionContext.read_from_file(table_file, self.utterance).get_table_knowledge_graph()

        self.vocab = Vocabulary()
        self.name_index = self.vocab.add_token_to_namespace("name", namespace='tokens')
        self.in_index = self.vocab.add_token_to_namespace("in", namespace='tokens')
        self.english_index = self.vocab.add_token_to_namespace("english", namespace='tokens')
        self.location_index = self.vocab.add_token_to_namespace("location", namespace='tokens')
        self.mersin_index = self.vocab.add_token_to_namespace("mersin", namespace='tokens')

        self.oov_index = self.vocab.get_token_index('random OOV string', namespace='tokens')
        self.edirne_index = self.oov_index
        self.field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer)

        super(KnowledgeGraphFieldTest, self).setUp()

    def test_count_vocab_items(self):
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        self.field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts["tokens"] == {
                'name': 1,
                'in': 2,
                'english': 2,
                'location': 1,
                'mersin': 1,
        }

    def test_index_converts_field_correctly(self):
        # pylint: disable=protected-access
        self.field.index(self.vocab)
        assert self.field._indexed_entity_texts.keys() == {'tokens'}
        # Note that these are sorted by their _identifiers_, not their cell text, so the
        # `fb:row.rows` show up after the `fb:cells`.
        expected_array = [[self.mersin_index],
                          [self.location_index, self.in_index, self.english_index],
                          [self.name_index, self.in_index, self.english_index]]
        assert self.field._indexed_entity_texts['tokens'] == expected_array

    def test_get_padding_lengths_raises_if_not_indexed(self):
        with pytest.raises(AssertionError):
            self.field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        # pylint: disable=protected-access
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {'num_entities': 3,
                                                    'num_entity_tokens': 3,
                                                    'num_utterance_tokens': 4}
        self.field._token_indexers['token_characters'] = TokenCharactersIndexer(min_padding_length=1)
        self.field.index(self.vocab)
        assert self.field.get_padding_lengths() == {'num_entities': 3,
                                                    'num_entity_tokens': 3,
                                                    'num_utterance_tokens': 4,
                                                    'num_token_characters': 8}

    def test_as_tensor_produces_correct_output(self):
        self.field.index(self.vocab)
        padding_lengths = self.field.get_padding_lengths()
        padding_lengths['num_utterance_tokens'] += 1
        padding_lengths['num_entities'] += 1
        tensor_dict = self.field.as_tensor(padding_lengths)
        assert tensor_dict.keys() == {'text', 'linking'}
        expected_text_tensor = [[self.mersin_index, 0, 0],
                                [self.location_index, self.in_index, self.english_index],
                                [self.name_index, self.in_index, self.english_index],
                                [0, 0, 0]]
        assert_almost_equal(tensor_dict['text']['tokens'].detach().cpu().numpy(),
                            expected_text_tensor)

        linking_tensor = tensor_dict['linking'].detach().cpu().numpy()
        expected_linking_tensor = [
                [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],        # string:mersin, "where"
                 [0, 0, 0, 0, 0, -1.5, 0, 0, 0, 0],     # string:mersin, "is"
                 [0, 1, 1, 1, 1, 1, 0, 0, 1, 1],        # string:mersin, "mersin"
                 [0, 0, 0, 0, 0, -5, 0, 0, 0, 0],       # string:mersin, "?"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],       # string:mersin, padding
                [[0, 0, 0, 0, 0, -2.6, 0, 0, 0, 0],     # string_column:name_in_english, "where"
                 [0, 0, 0, 0, 0, -7.5, 0, 0, 0, 0],     # string_column:name_in_english, "is"
                 [0, 0, 0, 0, 0, -1.8333, 1, 1, 0, 0],  # string_column:..in_english, "mersin"
                 [0, 0, 0, 0, 0, -18, 0, 0, 0, 0],      # string_column:name_in_english, "?"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],       # string_column:name_in_english, padding
                [[0, 0, 0, 0, 0, -1.6, 0, 0, 0, 0],     # string_..:location_in_english, "where"
                 [0, 0, 0, 0, 0, -5.5, 0, 0, 0, 0],     # string_column:location_in_english, "is"
                 [0, 0, 0, 0, 0, -1, 0, 0, 0, 0],       # string_column:location_in_english, "mersin"
                 [0, 0, 0, 0, 0, -14, 0, 0, 0, 0],      # string_column:location_in_english, "?"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],       # string_column:location_in_english, padding
                [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],        # padding, "where"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],        # padding, "is"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],        # padding, "mersin"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],        # padding, "?"
                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]       # padding, padding
        for entity_index, entity_features in enumerate(expected_linking_tensor):
            for question_index, feature_vector in enumerate(entity_features):
                assert_almost_equal(linking_tensor[entity_index, question_index],
                                    feature_vector,
                                    decimal=4,
                                    err_msg=f"{entity_index} {question_index}")

    def test_lemma_feature_extractor(self):
        # pylint: disable=protected-access
        utterance = self.tokenizer.tokenize("Names in English")
        field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer)
        entity = 'string_column:name_in_english'
        lemma_feature = field._contains_lemma_match(entity,
                                                    field._entity_text_map[entity],
                                                    utterance[0],
                                                    0,
                                                    utterance)
        assert lemma_feature == 1

    def test_span_overlap_fraction(self):
        # pylint: disable=protected-access
        utterance = self.tokenizer.tokenize("what is the name in english of mersin?")
        field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer)
        entity = 'string_column:name_in_english'
        entity_text = field._entity_text_map[entity]
        feature_values = [field._span_overlap_fraction(entity, entity_text, token, i, utterance)
                          for i, token in enumerate(utterance)]
        assert feature_values == [0, 0, 0, 1, 1, 1, 0, 0, 0]

    def test_batch_tensors(self):
        self.field.index(self.vocab)
        padding_lengths = self.field.get_padding_lengths()
        tensor_dict1 = self.field.as_tensor(padding_lengths)
        tensor_dict2 = self.field.as_tensor(padding_lengths)
        batched_tensor_dict = self.field.batch_tensors([tensor_dict1, tensor_dict2])
        assert batched_tensor_dict.keys() == {'text', 'linking'}
        expected_single_tensor = [[self.mersin_index, 0, 0],
                                  [self.location_index, self.in_index, self.english_index],
                                  [self.name_index, self.in_index, self.english_index]]
        expected_batched_tensor = [expected_single_tensor, expected_single_tensor]
        assert_almost_equal(batched_tensor_dict['text']['tokens'].detach().cpu().numpy(),
                            expected_batched_tensor)
        expected_linking_tensor = torch.stack([tensor_dict1['linking'], tensor_dict2['linking']])
        assert_almost_equal(batched_tensor_dict['linking'].detach().cpu().numpy(),
                            expected_linking_tensor.detach().cpu().numpy())

    def test_field_initialized_with_empty_constructor(self):
        try:
            self.field.empty_field()
        except AssertionError as e:
            pytest.fail(str(e), pytrace=True)
class TestTextField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace(u"sentence", namespace=u'words')
        self.vocab.add_token_to_namespace(u"A", namespace=u'words')
        self.vocab.add_token_to_namespace(u"A", namespace=u'characters')
        self.vocab.add_token_to_namespace(u"s", namespace=u'characters')
        self.vocab.add_token_to_namespace(u"e", namespace=u'characters')
        self.vocab.add_token_to_namespace(u"n", namespace=u'characters')
        self.vocab.add_token_to_namespace(u"t", namespace=u'characters')
        self.vocab.add_token_to_namespace(u"c", namespace=u'characters')
        super(TestTextField, self).setUp()

    def test_field_counts_vocab_items_correctly(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts[u"words"][u"This"] == 1
        assert namespace_token_counts[u"words"][u"is"] == 1
        assert namespace_token_counts[u"words"][u"a"] == 1
        assert namespace_token_counts[u"words"][u"sentence"] == 1
        assert namespace_token_counts[u"words"][u"."] == 1
        assert list(namespace_token_counts.keys()) == [u"words"]

        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"characters": TokenCharactersIndexer(u"characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)

        assert namespace_token_counts[u"characters"][u"T"] == 1
        assert namespace_token_counts[u"characters"][u"h"] == 1
        assert namespace_token_counts[u"characters"][u"i"] == 2
        assert namespace_token_counts[u"characters"][u"s"] == 3
        assert namespace_token_counts[u"characters"][u"a"] == 1
        assert namespace_token_counts[u"characters"][u"e"] == 3
        assert namespace_token_counts[u"characters"][u"n"] == 2
        assert namespace_token_counts[u"characters"][u"t"] == 1
        assert namespace_token_counts[u"characters"][u"c"] == 1
        assert namespace_token_counts[u"characters"][u"."] == 1
        assert list(namespace_token_counts.keys()) == [u"characters"]

        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words"),
                                          u"characters": TokenCharactersIndexer(u"characters")})
        namespace_token_counts = defaultdict(lambda: defaultdict(int))
        field.count_vocab_items(namespace_token_counts)
        assert namespace_token_counts[u"characters"][u"T"] == 1
        assert namespace_token_counts[u"characters"][u"h"] == 1
        assert namespace_token_counts[u"characters"][u"i"] == 2
        assert namespace_token_counts[u"characters"][u"s"] == 3
        assert namespace_token_counts[u"characters"][u"a"] == 1
        assert namespace_token_counts[u"characters"][u"e"] == 3
        assert namespace_token_counts[u"characters"][u"n"] == 2
        assert namespace_token_counts[u"characters"][u"t"] == 1
        assert namespace_token_counts[u"characters"][u"c"] == 1
        assert namespace_token_counts[u"characters"][u"."] == 1
        assert namespace_token_counts[u"words"][u"This"] == 1
        assert namespace_token_counts[u"words"][u"is"] == 1
        assert namespace_token_counts[u"words"][u"a"] == 1
        assert namespace_token_counts[u"words"][u"sentence"] == 1
        assert namespace_token_counts[u"words"][u"."] == 1
        assert set(namespace_token_counts.keys()) == set([u"words", u"characters"])

    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace(u"sentence", namespace=u'words')
        capital_a_index = vocab.add_token_to_namespace(u"A", namespace=u'words')
        capital_a_char_index = vocab.add_token_to_namespace(u"A", namespace=u'characters')
        s_index = vocab.add_token_to_namespace(u"s", namespace=u'characters')
        e_index = vocab.add_token_to_namespace(u"e", namespace=u'characters')
        n_index = vocab.add_token_to_namespace(u"n", namespace=u'characters')
        t_index = vocab.add_token_to_namespace(u"t", namespace=u'characters')
        c_index = vocab.add_token_to_namespace(u"c", namespace=u'characters')

        field = TextField([Token(t) for t in [u"A", u"sentence"]],
                          {u"words": SingleIdTokenIndexer(namespace=u"words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens[u"words"] == [capital_a_index, sentence_index]

        field1 = TextField([Token(t) for t in [u"A", u"sentence"]],
                           {u"characters": TokenCharactersIndexer(namespace=u"characters")})
        field1.index(vocab)
        assert field1._indexed_tokens[u"characters"] == [[capital_a_char_index],
                                                         [s_index, e_index, n_index, t_index,
                                                          e_index, n_index, c_index, e_index]]

        field2 = TextField([Token(t) for t in [u"A", u"sentence"]],
                           token_indexers={u"words": SingleIdTokenIndexer(namespace=u"words"),
                                           u"characters": TokenCharactersIndexer(namespace=u"characters")})
        field2.index(vocab)
        assert field2._indexed_tokens[u"words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens[u"characters"] == [[capital_a_char_index],
                                                         [s_index, e_index, n_index, t_index,
                                                          e_index, n_index, c_index, e_index]]
        # pylint: enable=protected-access

    def test_get_padding_lengths_raises_if_no_indexed_tokens(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths()

    def test_padding_lengths_are_computed_correctly(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {u"num_tokens": 5}

        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"characters": TokenCharactersIndexer(u"characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {u"num_tokens": 5, u"num_token_characters": 8}

        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"characters": TokenCharactersIndexer(u"characters"),
                                          u"words": SingleIdTokenIndexer(u"words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {u"num_tokens": 5, u"num_token_characters": 8}

    def test_as_tensor_handles_words(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict[u"words"].detach().cpu().numpy(),
                                                numpy.array([1, 1, 1, 2, 1]))

    def test_as_tensor_handles_longer_lengths(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths[u"num_tokens"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict[u"words"].detach().cpu().numpy(),
                                                numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]))

    def test_as_tensor_handles_characters(self):
        field = TextField([Token(t) for t in [u"This", u"is", u"a", u"sentence", u"."]],
                          token_indexers={u"characters": TokenCharactersIndexer(u"characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0],
                                                [1, 3, 0, 0, 0, 0, 0, 0],
                                                [1, 0, 0, 0, 0, 0, 0, 0],
                                                [3, 4, 5, 6, 4, 5, 7, 4],
                                                [1, 0, 0, 0, 0, 0, 0, 0]])
        numpy.testing.assert_array_almost_equal(tensor_dict[u"characters"].detach().cpu().numpy(),
                                                expected_character_array)

    def test_as_tensor_handles_words_and_characters_with_longer_lengths(self):
        field = TextField([Token(t) for t in [u"a", u"sentence", u"."]],
                          token_indexers={u"words": SingleIdTokenIndexer(u"words"),
                                          u"characters": TokenCharactersIndexer(u"characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths[u"num_tokens"] = 5
        padding_lengths[u"num_token_characters"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(tensor_dict[u"words"].detach().cpu().numpy(),
                                                numpy.array([1, 2, 1, 0, 0]))
        numpy.testing.assert_array_almost_equal(tensor_dict[u"characters"].detach().cpu().numpy(),
                                                numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [3, 4, 5, 6, 4, 5, 7, 4, 0, 0],
                                                             [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                                                             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

    def test_printing_doesnt_crash(self):
        field = TextField([Token(t) for t in [u"A", u"sentence"]],
                          {u"words": SingleIdTokenIndexer(namespace=u"words")})
        print(field)

    def test_token_embedder_returns_dict(self):
        field = TextField([Token(t) for t in [u"A", u"sentence"]],
                          token_indexers={u"field_with_dict": DictReturningTokenIndexer(),
                                          u"words": SingleIdTokenIndexer(u"words"),
                                          u"characters": TokenCharactersIndexer(u"characters")})
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
                u'token_ids': 5,
                u'additional_key': 2,
                u'words': 2,
                u'characters': 2,
                u'num_token_characters': 8
        }
        padding_lengths[u'token_ids'] = 7
        padding_lengths[u'additional_key'] = 3
        padding_lengths[u'words'] = 4
        padding_lengths[u'characters'] = 4
        tensors = field.as_tensor(padding_lengths)
        assert list(tensors[u'token_ids'].shape) == [7]
        assert list(tensors[u'additional_key'].shape) == [3]
        assert list(tensors[u'words'].shape) == [4]
        assert list(tensors[u'characters'].shape) == [4, 8]
class FancyIteratorTest(AllenNlpTestCase): def setUp(self): super().setUp() self.token_indexers = {"tokens": SingleIdTokenIndexer()} self.vocab = Vocabulary() self.this_index = self.vocab.add_token_to_namespace('this') self.is_index = self.vocab.add_token_to_namespace('is') self.a_index = self.vocab.add_token_to_namespace('a') self.sentence_index = self.vocab.add_token_to_namespace('sentence') self.another_index = self.vocab.add_token_to_namespace('another') self.yet_index = self.vocab.add_token_to_namespace('yet') self.very_index = self.vocab.add_token_to_namespace('very') self.long_index = self.vocab.add_token_to_namespace('long') instances = [ self.create_instance(["this", "is", "a", "sentence"]), self.create_instance(["this", "is", "another", "sentence"]), self.create_instance(["yet", "another", "sentence"]), self.create_instance([ "this", "is", "a", "very", "very", "very", "very", "long", "sentence" ]), self.create_instance(["sentence"]), ] self.instances = instances def create_instance(self, str_tokens: List[str]): tokens = [Token(t) for t in str_tokens] instance = Instance({'source': TextField(tokens, self.token_indexers)}) return instance def test_truncate(self): # Checks that the truncate parameter works as intended. # Since the split size is less than the length of the "very ... very long" sentence, the # iterator should return one batch when truncation is enabled. split_size = 4 truncated_iterator = FancyIterator(batch_size=5, split_size=split_size, splitting_keys=['source'], truncate=True) truncated_iterator.index_with(self.vocab) batches = list(truncated_iterator(self.instances, num_epochs=1)) assert len(batches) == 1 # When truncation is disabled the iterator should return 3 batches instead. non_truncated_iterator = FancyIterator(batch_size=5, split_size=split_size, splitting_keys=['source'], truncate=False) non_truncated_iterator.index_with(self.vocab) batches = list(non_truncated_iterator(self.instances, num_epochs=1)) assert len(batches) == 3 # When the batch size is larger than the number of instances, truncation will cause the iterator # to return zero batches of data (since some of the instances in the batch would consist # entirely of padding). Check that the iterator raises an error in this case. invalid_iterator = FancyIterator(batch_size=6, split_size=split_size, splitting_keys=['source'], truncate=True) invalid_iterator.index_with(self.vocab) with self.assertRaises(ConfigurationError): batches = list(invalid_iterator(self.instances, num_epochs=1)) # If truncation is disabled then this should not cause an issue. valid_iterator = FancyIterator(batch_size=6, split_size=split_size, splitting_keys=['source'], truncate=False) valid_iterator.index_with(self.vocab) batches = list(valid_iterator(self.instances, num_epochs=1)) assert len(batches) == 3
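A quick sanity check on the batch counts asserted above: with split_size = 4, the longest instance (9 tokens) is cut into ceil(9 / 4) = 3 splits, so the non-truncating iterator has to yield 3 batches, while, assuming truncation cuts every instance down to its first split (which is what the first assertion implies), the truncating iterator yields just 1. A minimal sketch of that arithmetic, independent of FancyIterator itself:

import math

split_size = 4
instance_lengths = [4, 4, 3, 9, 1]  # token counts of the five instances in setUp

# Without truncation the iterator must step through enough splits to cover
# the longest instance; shorter instances are exhausted (or padded) earlier.
assert max(math.ceil(n / split_size) for n in instance_lengths) == 3

# With truncation, only the first split of each instance survives,
# so one batch covers all five instances (batch_size=5).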
class RelaxedBeamSearchTest(AllenNlpTestCase): def setUp(self): super().setUp() self.vocab = Vocabulary(non_padded_namespaces=['tokens']) for i in range(transition_probabilities.size(0)): self.vocab.add_token_to_namespace(str(i)) self.end_symbol = str(transition_probabilities.size()[0] - 1) self.end_index = transition_probabilities.size()[0] - 1 # Ensure the end symbol has the expected index assert self.end_index == self.vocab.get_token_index(self.end_symbol) self.beam_search = RelaxedBeamSearch(self.vocab, beam_size=3, end_symbol=self.end_symbol, max_steps=10) # This is what the top k should look like for each item in the batch. self.expected_top_k = [ np.array([1, 2, 3, 4, 5]), np.array([2, 3, 4, 5]), np.array([3, 4, 5]) ] # This is what the log probs should look like for each item in the batch. self.expected_log_probs = np.log(np.array([0.4, 0.3, 0.2])) # pylint: disable=assignment-from-no-return def _check_results(self, batch_size: int = 5, expected_top_k: np.array = None, expected_log_probs: np.array = None, beam_search: RelaxedBeamSearch = None, state: Dict[str, torch.Tensor] = None, step: StepFunctionType = None, rtol: float = 1e-7) -> None: expected_top_k = expected_top_k if expected_top_k is not None else self.expected_top_k expected_log_probs = expected_log_probs if expected_log_probs is not None else self.expected_log_probs state = state or {} step = step or take_step beam_search = beam_search or self.beam_search beam_size = beam_search.beam_size initial_predictions = torch.tensor([0] * batch_size) # pylint: disable=not-callable top_k, log_probs = beam_search.search(initial_predictions, state, step) # type: ignore assert len(top_k) == batch_size assert len(log_probs) == batch_size for i in range(batch_size): assert len(top_k[i]) == beam_size assert len(log_probs[i]) == beam_size for j in range(beam_size): np.testing.assert_array_equal(top_k[i][j].numpy(), expected_top_k[j]) np.testing.assert_allclose(log_probs[i][j].numpy(), expected_log_probs[j]) def test_search(self): self._check_results() def test_finished_state(self): state = {} state["foo"] = torch.tensor( # pylint: disable=not-callable [[1, 0, 1], [2, 0, 1], [0, 0, 1], [1, 1, 1], [0, 0, 0]]) # shape: (batch_size, 3) expected_finished_state = {} expected_finished_state["foo"] = np.array([[1, 0, 1], [1, 0, 1], [1, 0, 1], [2, 0, 1], [2, 0, 1], [2, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0]]) # shape: (batch_size x beam_size, 3) self._check_results(state=state) # check finished state. for key, array in expected_finished_state.items(): np.testing.assert_allclose(state[key].numpy(), array) def test_batch_size_of_one(self): self._check_results(batch_size=1) def test_greedy_search(self): beam_search = RelaxedBeamSearch(self.vocab, beam_size=1, end_symbol=self.end_symbol) expected_top_k = np.array([[1, 2, 3, 4, 5]]) expected_log_probs = np.log(np.array([0.4])) # pylint: disable=assignment-from-no-return self._check_results(expected_top_k=expected_top_k, expected_log_probs=expected_log_probs, beam_search=beam_search) def test_catch_bad_config(self): """ If `per_node_beam_size` (which defaults to `beam_size`) is larger than the size of the target vocabulary, `BeamSearch.search` should raise a ConfigurationError. 
""" beam_search = RelaxedBeamSearch(self.vocab, beam_size=20, end_symbol=self.end_symbol) with pytest.raises(ConfigurationError): self._check_results(beam_search=beam_search) def test_warn_for_bad_log_probs(self): # The only valid next step from the initial predictions is the end index. # But with a beam size of 3, the call to `topk` to find the 3 most likely # next beams will result in 2 new beams that are invalid, in that have probability of 0. # The beam search should warn us of this. initial_predictions = torch.LongTensor( [self.end_index - 1, self.end_index - 1]) with pytest.warns(RuntimeWarning, match="Infinite log probabilities"): self.beam_search.search(initial_predictions, {}, take_step) def test_empty_sequences(self): initial_predictions = torch.LongTensor( [self.end_index - 1, self.end_index - 1]) beam_search = RelaxedBeamSearch(self.vocab, beam_size=1, end_symbol=self.end_symbol) with pytest.warns(RuntimeWarning, match="Empty sequences predicted"): predictions, log_probs = beam_search.search( initial_predictions, {}, take_step) # predictions hould have shape `(batch_size, beam_size, max_predicted_length)`. assert list(predictions.size()) == [2, 1, 1] # log probs hould have shape `(batch_size, beam_size)`. assert list(log_probs.size()) == [2, 1] assert (predictions == self.end_index).all() assert (log_probs == 0).all() def test_min_steps_warn_for_bad_log_probs(self): initial_predictions = torch.LongTensor([0] * 2) beam_search = RelaxedBeamSearch(self.vocab, beam_size=1, end_symbol=self.end_symbol, min_steps=5) with pytest.warns(RuntimeWarning, match="Infinite log probabilities"): beam_search.search(initial_predictions, {}, take_step) def test_length_penalizer(self): # This is an extreme value for the Wu penalizer just to force # the outputs to switch order length_penalizer = WuLengthPenalizer(-10) beam_search = RelaxedBeamSearch(self.vocab, beam_size=3, end_symbol=self.end_symbol, max_steps=10, length_penalizer=length_penalizer) # The outputs are in the opposite order than expected expected_top_k = [ np.array([3, 4, 5]), np.array([2, 3, 4, 5]), np.array([1, 2, 3, 4, 5]) ] expected_log_probs = np.log(np.array([0.2, 0.3, 0.4])) self._check_results(expected_top_k=expected_top_k, expected_log_probs=expected_log_probs, beam_search=beam_search, step=take_step)
@classmethod def _create_vocab(cls) -> Vocabulary: vocab = Vocabulary() vocab.add_token_to_namespace("O", "labels") vocab.add_token_to_namespace("B-Tag", "labels") vocab.add_token_to_namespace("I-Tag", "labels") return vocab
class TestBasicTextFieldEmbedder(AllenNlpTestCase): def setUp(self): super(TestBasicTextFieldEmbedder, self).setUp() self.vocab = Vocabulary() self.vocab.add_token_to_namespace("1") self.vocab.add_token_to_namespace("2") self.vocab.add_token_to_namespace("3") self.vocab.add_token_to_namespace("4") params = Params({ "words1": { "type": "embedding", "embedding_dim": 2 }, "words2": { "type": "embedding", "embedding_dim": 5 }, "words3": { "type": "embedding", "embedding_dim": 3 } }) self.token_embedder = BasicTextFieldEmbedder.from_params( self.vocab, params) self.inputs = { "words1": Variable(torch.LongTensor([[0, 2, 3, 5]])), "words2": Variable(torch.LongTensor([[1, 4, 3, 2]])), "words3": Variable(torch.LongTensor([[1, 5, 1, 2]])) } def test_get_output_dim_aggregates_dimension_from_each_embedding(self): assert self.token_embedder.get_output_dim() == 10 def test_forward_asserts_input_field_match(self): self.inputs['words4'] = self.inputs['words3'] del self.inputs['words3'] with pytest.raises(ConfigurationError): self.token_embedder(self.inputs) self.inputs['words3'] = self.inputs['words4'] del self.inputs['words4'] def test_forward_concats_resultant_embeddings(self): assert self.token_embedder(self.inputs).size() == (1, 4, 10) def test_forward_works_on_higher_order_input(self): params = Params({ "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2, }, "characters": { "type": "character_encoding", "embedding": { "embedding_dim": 4, "num_embeddings": 15, }, "encoder": { "type": "cnn", "embedding_dim": 4, "num_filters": 10, "ngram_filter_sizes": [3], }, } }) token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params) inputs = { 'words': Variable(torch.rand(3, 4, 5, 6) * 20).long(), 'characters': Variable(torch.rand(3, 4, 5, 6, 7) * 15).long(), } assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)
def setUp(self): super(SpanBasedF1Test, self).setUp() vocab = Vocabulary() vocab.add_token_to_namespace("O", "tags") vocab.add_token_to_namespace("B-ARG1", "tags") vocab.add_token_to_namespace("I-ARG1", "tags") vocab.add_token_to_namespace("B-ARG2", "tags") vocab.add_token_to_namespace("I-ARG2", "tags") vocab.add_token_to_namespace("B-V", "tags") vocab.add_token_to_namespace("I-V", "tags") vocab.add_token_to_namespace("U-ARG1", "tags") vocab.add_token_to_namespace("U-ARG2", "tags") vocab.add_token_to_namespace("B-C-ARG1", "tags") vocab.add_token_to_namespace("I-C-ARG1", "tags") vocab.add_token_to_namespace("B-ARGM-ADJ", "tags") vocab.add_token_to_namespace("I-ARGM-ADJ", "tags") self.vocab = vocab
import json import argparse from allennlp.data import Vocabulary if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--ontology-path', type=str, required=True) parser.add_argument('--output-path', type=str, required=True) args = parser.parse_args() with open(args.ontology_path) as f: ontology = json.load(f) vocab = Vocabulary() vocab.add_token_to_namespace(token='None', namespace='span_labels') vocab.add_token_to_namespace(token='@@PADDING@@', namespace='span_labels') vocab.add_tokens_to_namespace(tokens=list(ontology['args'].keys()), namespace='span_labels') vocab.add_tokens_to_namespace(tokens=list(ontology['events'].keys()), namespace='event_labels') vocab.save_to_files(args.output_path)
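Once the script has run, the directory written by `save_to_files` can be loaded back with the standard AllenNLP API; a brief usage sketch (the path is a placeholder):

from allennlp.data import Vocabulary

vocab = Vocabulary.from_files('/path/to/output')  # the --output-path given above
print(vocab.get_vocab_size('span_labels'))
print(vocab.get_token_index('None', namespace='span_labels'))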
def __init__( self, vocab: Vocabulary, text_field_embedder: TextFieldEmbedder, title_encoder: Seq2VecEncoder, abstract_encoder: Seq2VecEncoder, venue_encoder: Seq2VecEncoder, body_encoder: Seq2VecEncoder = None, predict_mode: bool = False, author_text_embedder: TextFieldEmbedder = None, venue_field_embedder: TextFieldEmbedder = None, author_text_encoder: Seq2VecEncoder = None, # author_id_embedder: Optional[Embedding] = None, author_id_embedder: TextFieldEmbedder = None, # author_position_embedder: Optional[Embedding] = None, author_position_embedder: TextFieldEmbedder = None, feedforward: FeedForward = None, author_feedforward: FeedForward = None, initializer: InitializerApplicator = InitializerApplicator(), regularizer: Optional[RegularizerApplicator] = None, max_num_authors: Optional[int] = 5, dropout: Optional[float] = None, ignore_authors: Optional[bool] = False, layer_norm: Optional[bool] = True, embedding_layer_norm: Optional[bool] = False, loss_distance: Optional[str] = 'l2-norm', loss_margin: Optional[float] = 1, bert_finetune: Optional[bool] = False, include_venue: Optional[bool] = False) -> None: super(Specter, self).__init__(vocab, regularizer) for lbl in range(max_num_authors): vocab.add_token_to_namespace(token=str(lbl), namespace='author_positions') self.text_field_embedder = text_field_embedder self.venue_field_embedder = venue_field_embedder self.title_encoder = title_encoder self.abstract_encoder = abstract_encoder self.body_encoder = body_encoder self.venue_encoder = venue_encoder self.predict_mode = predict_mode self.feedforward = feedforward if loss_distance == 'l2-norm': self.loss = torch.nn.TripletMarginLoss(margin=loss_margin, reduction='none') elif loss_distance == 'binary': self.loss = BinaryLoss(margin=loss_margin) else: self.loss = TripletLoss(margin=loss_margin, distance=loss_distance, reduction='none') if layer_norm: self.layer_norm = LayerNorm(self.feedforward.get_output_dim()) self.do_layer_norm = layer_norm # self.layer_norm_author_embedding = LayerNorm(author_feedforward.get_output_dim()) if embedding_layer_norm: self.layer_norm_word_embedding = LayerNorm( self.title_encoder.get_input_dim()) self.layer_norm_word_embedding_venue = LayerNorm( self.venue_encoder.get_input_dim()) self.embedding_layer_norm = embedding_layer_norm # Apply the configured dropout probability when one is given. self.dropout = Dropout(dropout) if dropout is not None else Dropout() self.ignore_authors = ignore_authors if not ignore_authors: self.author_id_embedder = author_id_embedder self.author_position_embedder = author_position_embedder self.author_text_embedder = author_text_embedder self.author_text_encoder = author_text_encoder # The author representation is a concatenation of the author-id and author-position # embeddings: [batch, num-authors, auth-dim + position-dim]. # We apply a time-distributed MLP on top to map this to [batch, num-authors, dim]. self.author_time_dist_ff = TimeDistributed(author_feedforward) # Internal flag indicating whether the title/abstract should be encoded with a transformer. # Do not change it: it must stay `False` in this class; the inheriting # `PaperRepresentationTransoformer` class sets it to `True` in its constructor. self.tansformer_encoder = False self.bert_finetune = bert_finetune self.include_venue = include_venue initializer(self)
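The shape comment on the author representation is easiest to verify in isolation. A minimal sketch of the concatenate-then-time-distributed step (all dimensions below are illustrative assumptions, and a plain Linear stands in for the configured `author_feedforward`):

import torch

batch, num_authors = 2, 5
auth_dim, position_dim, out_dim = 8, 4, 6

author_id_emb = torch.randn(batch, num_authors, auth_dim)
author_pos_emb = torch.randn(batch, num_authors, position_dim)

# [batch, num-authors, auth-dim + position-dim]
author_repr = torch.cat([author_id_emb, author_pos_emb], dim=-1)

# TimeDistributed applies the same module to every author slot by folding
# the author dimension into the batch dimension and unfolding it afterwards.
feedforward = torch.nn.Linear(auth_dim + position_dim, out_dim)
flat = author_repr.view(batch * num_authors, auth_dim + position_dim)
projected = feedforward(flat).view(batch, num_authors, out_dim)
assert projected.shape == (batch, num_authors, out_dim)  # [batch, num-authors, dim]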
class TestProductionRuleField(AllenNlpTestCase): def setUp(self): super(TestProductionRuleField, self).setUp() self.vocab = Vocabulary() self.s_rule_index = self.vocab.add_token_to_namespace("S -> [NP, VP]", namespace='rule_labels') self.np_index = self.vocab.add_token_to_namespace("NP -> test", namespace='rule_labels') def test_field_counts_vocab_items_correctly(self): field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True) namespace_token_counts = defaultdict(lambda: defaultdict(int)) field.count_vocab_items(namespace_token_counts) assert namespace_token_counts["rule_labels"]["S -> [NP, VP]"] == 1 field = ProductionRuleField('S -> [NP, VP]', is_global_rule=False) namespace_token_counts = defaultdict(lambda: defaultdict(int)) field.count_vocab_items(namespace_token_counts) assert namespace_token_counts["rule_labels"]["S -> [NP, VP]"] == 0 def test_index_converts_field_correctly(self): field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True) field.index(self.vocab) assert field._rule_id == self.s_rule_index def test_padding_lengths_are_computed_correctly(self): field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True) field.index(self.vocab) assert field.get_padding_lengths() == {} def test_as_tensor_produces_correct_output(self): field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True) field.index(self.vocab) tensor_tuple = field.as_tensor(field.get_padding_lengths()) assert isinstance(tensor_tuple, tuple) assert len(tensor_tuple) == 4 assert tensor_tuple[0] == 'S -> [NP, VP]' assert tensor_tuple[1] is True assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.s_rule_index]) field = ProductionRuleField('S -> [NP, VP]', is_global_rule=False) field.index(self.vocab) tensor_tuple = field.as_tensor(field.get_padding_lengths()) assert isinstance(tensor_tuple, tuple) assert len(tensor_tuple) == 4 assert tensor_tuple[0] == 'S -> [NP, VP]' assert tensor_tuple[1] is False assert tensor_tuple[2] is None def test_batch_tensors_does_not_modify_list(self): field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True) field.index(self.vocab) padding_lengths = field.get_padding_lengths() tensor_dict1 = field.as_tensor(padding_lengths) field = ProductionRuleField('NP -> test', is_global_rule=True) field.index(self.vocab) padding_lengths = field.get_padding_lengths() tensor_dict2 = field.as_tensor(padding_lengths) tensor_list = [tensor_dict1, tensor_dict2] assert field.batch_tensors(tensor_list) == tensor_list def test_doubly_nested_field_works(self): field1 = ProductionRuleField('S -> [NP, VP]', is_global_rule=True) field2 = ProductionRuleField('NP -> test', is_global_rule=True) field3 = ProductionRuleField('VP -> eat', is_global_rule=False) list_field = ListField([ListField([field1, field2, field3]), ListField([field1, field2])]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() tensors = list_field.as_tensor(padding_lengths) assert isinstance(tensors, list) assert len(tensors) == 2 assert isinstance(tensors[0], list) assert len(tensors[0]) == 3 assert isinstance(tensors[1], list) assert len(tensors[1]) == 3 tensor_tuple = tensors[0][0] assert tensor_tuple[0] == 'S -> [NP, VP]' assert tensor_tuple[1] is True assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.s_rule_index]) tensor_tuple = tensors[0][1] assert tensor_tuple[0] == 'NP -> test' assert tensor_tuple[1] is True assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.np_index]) tensor_tuple = tensors[0][2] assert tensor_tuple[0] == 'VP -> 
eat' assert tensor_tuple[1] is False assert tensor_tuple[2] is None tensor_tuple = tensors[1][0] assert tensor_tuple[0] == 'S -> [NP, VP]' assert tensor_tuple[1] is True assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.s_rule_index]) tensor_tuple = tensors[1][1] assert tensor_tuple[0] == 'NP -> test' assert tensor_tuple[1] is True assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.np_index]) # This item was just padding. tensor_tuple = tensors[1][2] assert tensor_tuple[0] == '' assert tensor_tuple[1] is False assert tensor_tuple[2] is None def test_production_rule_field_can_print(self): field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True) print(field)
def _read_embeddings_from_text_file( file_uri: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens", min_pretrained_embeddings: int = 0) -> torch.FloatTensor: """ Read pre-trained word vectors from an optionally compressed text file, possibly contained inside an archive with multiple files. The text file is assumed to be utf-8 encoded with space-separated fields: [word] [dim 1] [dim 2] ... Lines whose number of vector components does not match ``embedding_dim`` trigger a warning and are skipped. The remainder of the docstring is identical to ``_read_pretrained_embeddings_file``. """ tokens_to_keep = set( vocab.get_index_to_token_vocabulary(namespace).values()) embeddings = {} # First we read the embeddings from the file, only keeping vectors for the words we need. logger.info("Reading pretrained embeddings from file") with EmbeddingsTextFile(file_uri) as embeddings_file: for index, line in Tqdm.tqdm(enumerate(embeddings_file)): token = line.split(' ', 1)[0] if token in tokens_to_keep or index < min_pretrained_embeddings: fields = line.rstrip().split(' ') if len(fields) - 1 != embedding_dim: # Sometimes there are funny unicode parsing problems that lead to different # field lengths (e.g., a word with a unicode space character that splits # into more than one column). We skip those lines. Note that if you have # some kind of long header, this could result in all of your lines getting # skipped. It's hard to check for that here; you just have to look in the # embedding_misses_file and at the model summary to make sure things look # like they are supposed to. logger.warning( "Found line with wrong number of dimensions (expected: %d; actual: %d): %s", embedding_dim, len(fields) - 1, line) continue vector = numpy.asarray(fields[1:], dtype='float32') embeddings[token] = vector if token not in tokens_to_keep: vocab.add_token_to_namespace(token, namespace) vocab_size = vocab.get_vocab_size(namespace) if not embeddings: raise ConfigurationError( "No embeddings of correct dimension found; you probably " "misspecified your embedding_dim parameter, or didn't " "pre-populate your Vocabulary") all_embeddings = numpy.asarray(list(embeddings.values())) embeddings_mean = float(numpy.mean(all_embeddings)) embeddings_std = float(numpy.std(all_embeddings)) # Now we initialize the weight matrix for an embedding layer, starting with random vectors, # then filling in the word vectors we just read. logger.info("Initializing pre-trained embedding layer") embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_( embeddings_mean, embeddings_std) num_tokens_found = 0 index_to_token = vocab.get_index_to_token_vocabulary(namespace) for i in range(vocab_size): token = index_to_token[i] # If we don't have a pre-trained vector for this word, we'll just leave this row alone, # so the word has a random initialization. if token in embeddings: embedding_matrix[i] = torch.FloatTensor(embeddings[token]) num_tokens_found += 1 else: logger.debug( "Token %s was not found in the embedding file. Initializing randomly.", token) logger.info("Pretrained embeddings were found for %d out of %d tokens", num_tokens_found, vocab_size) return embedding_matrix
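A minimal end-to-end sketch of calling this reader, assuming the function above and its module-level imports (Vocabulary, EmbeddingsTextFile, etc.) are in scope; the file contents and temporary path are made up for illustration:

import os
import tempfile

vocab = Vocabulary()
vocab.add_token_to_namespace("think")
vocab.add_token_to_namespace("make")

with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
    f.write("think 0.1 0.2 0.3\n")
    f.write("make 0.4 0.5 0.6\n")
    f.write("unused 0.7 0.8 0.9\n")  # not in the vocab, so it is skipped
    path = f.name

weight = _read_embeddings_from_text_file(path, embedding_dim=3, vocab=vocab)
# A row exists for every vocab entry; tokens without a pre-trained vector
# (padding, @@UNKNOWN@@) keep their random initialization.
assert weight.shape == (vocab.get_vocab_size("tokens"), 3)
os.remove(path)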
class TestBasicTextFieldEmbedder(AllenNlpTestCase): def setUp(self): super(TestBasicTextFieldEmbedder, self).setUp() self.vocab = Vocabulary() self.vocab.add_token_to_namespace("1") self.vocab.add_token_to_namespace("2") self.vocab.add_token_to_namespace("3") self.vocab.add_token_to_namespace("4") params = Params({ "token_embedders": { "words1": { "type": "embedding", "embedding_dim": 2 }, "words2": { "type": "embedding", "embedding_dim": 5 }, "words3": { "type": "embedding", "embedding_dim": 3 } } }) self.token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params) self.inputs = { "words1": torch.LongTensor([[0, 2, 3, 5]]), "words2": torch.LongTensor([[1, 4, 3, 2]]), "words3": torch.LongTensor([[1, 5, 1, 2]]) } def test_get_output_dim_aggregates_dimension_from_each_embedding(self): assert self.token_embedder.get_output_dim() == 10 def test_forward_asserts_input_field_match(self): # Total mismatch self.inputs['words4'] = self.inputs['words3'] del self.inputs['words3'] with pytest.raises(ConfigurationError) as exc: self.token_embedder(self.inputs) assert exc.match("Mismatched token keys") self.inputs['words3'] = self.inputs['words4'] # Text field has too many inputs with pytest.raises(ConfigurationError) as exc: self.token_embedder(self.inputs) assert exc.match("is generating more keys") del self.inputs['words4'] def test_forward_concats_resultant_embeddings(self): assert self.token_embedder(self.inputs).size() == (1, 4, 10) def test_forward_works_on_higher_order_input(self): params = Params({ "token_embedders": { "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2, }, "characters": { "type": "character_encoding", "embedding": { "embedding_dim": 4, "num_embeddings": 15, }, "encoder": { "type": "cnn", "embedding_dim": 4, "num_filters": 10, "ngram_filter_sizes": [3], }, } } }) token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params) inputs = { 'words': (torch.rand(3, 4, 5, 6) * 20).long(), 'characters': (torch.rand(3, 4, 5, 6, 7) * 15).long(), } assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12) def test_forward_runs_with_non_bijective_mapping(self): elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo' options_file = str(elmo_fixtures_path / 'options.json') weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5') params = Params({ "token_embedders": { "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2, }, "elmo": { "type": "elmo_token_embedder", "options_file": options_file, "weight_file": weight_file }, }, "embedder_to_indexer_map": {"words": ["words"], "elmo": ["elmo", "words"]} }) token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params) inputs = { 'words': (torch.rand(3, 6) * 20).long(), 'elmo': (torch.rand(3, 6, 50) * 15).long(), } token_embedder(inputs) def test_old_from_params_new_from_params(self): old_params = Params({ "words1": { "type": "embedding", "embedding_dim": 2 }, "words2": { "type": "embedding", "embedding_dim": 5 }, "words3": { "type": "embedding", "embedding_dim": 3 } }) # Allow loading the parameters in the old format with pytest.warns(DeprecationWarning): old_embedder = BasicTextFieldEmbedder.from_params(params=old_params, vocab=self.vocab) new_params = Params({ "token_embedders": { "words1": { "type": "embedding", "embedding_dim": 2 }, "words2": { "type": "embedding", "embedding_dim": 5 }, "words3": { "type": "embedding", "embedding_dim": 3 } } }) # But also allow loading the parameters in the new format new_embedder = 
BasicTextFieldEmbedder.from_params(params=new_params, vocab=self.vocab) assert old_embedder._token_embedders.keys() == new_embedder._token_embedders.keys() # pylint: disable=protected-access assert new_embedder(self.inputs).size() == (1, 4, 10)
def test_min_padding_length(self): sentence = "AllenNLP is awesome ." tokens = [Token(token) for token in sentence.split(" ")] vocab = Vocabulary() vocab.add_token_to_namespace("A", namespace="characters") # 2 vocab.add_token_to_namespace("l", namespace="characters") # 3 vocab.add_token_to_namespace("e", namespace="characters") # 4 vocab.add_token_to_namespace("n", namespace="characters") # 5 vocab.add_token_to_namespace("N", namespace="characters") # 6 vocab.add_token_to_namespace("L", namespace="characters") # 7 vocab.add_token_to_namespace("P", namespace="characters") # 8 vocab.add_token_to_namespace("i", namespace="characters") # 9 vocab.add_token_to_namespace("s", namespace="characters") # 10 vocab.add_token_to_namespace("a", namespace="characters") # 11 vocab.add_token_to_namespace("w", namespace="characters") # 12 vocab.add_token_to_namespace("o", namespace="characters") # 13 vocab.add_token_to_namespace("m", namespace="characters") # 14 vocab.add_token_to_namespace(".", namespace="characters") # 15 indexer = TokenCharactersIndexer("characters", min_padding_length=10) indices = indexer.tokens_to_indices(tokens, vocab, "char") key_padding_lengths = "num_token_characters" value_padding_lengths = 0 for token in indices["char"]: item = indexer.get_padding_lengths(token) value = item.values() value_padding_lengths = max(value_padding_lengths, max(value)) padded = indexer.pad_token_sequence( indices, {"char": len(indices["char"])}, {key_padding_lengths: value_padding_lengths}) assert padded == { "char": [[2, 3, 3, 4, 5, 6, 7, 8, 0, 0], [9, 10, 0, 0, 0, 0, 0, 0, 0, 0], [11, 12, 4, 10, 13, 14, 4, 0, 0, 0], [15, 0, 0, 0, 0, 0, 0, 0, 0, 0]] }
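The padded width asserted above follows directly from min_padding_length: the longest token, "AllenNLP", has 8 characters, but the indexer reports at least min_padding_length for every token, so every row is padded to width 10. A back-of-the-envelope check (this is just the arithmetic, not the indexer's actual code path):

tokens = ["AllenNLP", "is", "awesome", "."]
min_padding_length = 10

padded_width = max(max(len(t) for t in tokens), min_padding_length)
assert padded_width == 10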
class TestBasicTextFieldEmbedder(AllenNlpTestCase): def setUp(self): super().setUp() self.vocab = Vocabulary() self.vocab.add_token_to_namespace("1") self.vocab.add_token_to_namespace("2") self.vocab.add_token_to_namespace("3") self.vocab.add_token_to_namespace("4") params = Params({ "token_embedders": { "words1": { "type": "embedding", "embedding_dim": 2 }, "words2": { "type": "embedding", "embedding_dim": 5 }, "words3": { "type": "embedding", "embedding_dim": 3 }, } }) self.token_embedder = BasicTextFieldEmbedder.from_params( vocab=self.vocab, params=params) self.inputs = { "words1": { "tokens": torch.LongTensor([[0, 2, 3, 5]]) }, "words2": { "tokens": torch.LongTensor([[1, 4, 3, 2]]) }, "words3": { "tokens": torch.LongTensor([[1, 5, 1, 2]]) }, } def test_get_output_dim_aggregates_dimension_from_each_embedding(self): assert self.token_embedder.get_output_dim() == 10 def test_forward_asserts_input_field_match(self): # Total mismatch self.inputs["words4"] = self.inputs["words3"] del self.inputs["words3"] with pytest.raises(ConfigurationError) as exc: self.token_embedder(self.inputs) assert exc.match("Mismatched token keys") self.inputs["words3"] = self.inputs["words4"] # Text field has too many inputs with pytest.raises(ConfigurationError) as exc: self.token_embedder(self.inputs) assert exc.match("Mismatched token keys") del self.inputs["words4"] def test_forward_concats_resultant_embeddings(self): assert self.token_embedder(self.inputs).size() == (1, 4, 10) def test_forward_works_on_higher_order_input(self): params = Params({ "token_embedders": { "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2 }, "characters": { "type": "character_encoding", "embedding": { "embedding_dim": 4, "num_embeddings": 15 }, "encoder": { "type": "cnn", "embedding_dim": 4, "num_filters": 10, "ngram_filter_sizes": [3], }, }, } }) token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params) inputs = { "words": { "tokens": (torch.rand(3, 4, 5, 6) * 20).long() }, "characters": { "token_characters": (torch.rand(3, 4, 5, 6, 7) * 15).long() }, } assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12) def test_forward_runs_with_forward_params(self): class FakeEmbedder(torch.nn.Module): def __init__(self): super().__init__() def forward(self, tokens: torch.Tensor, extra_arg: int = None): assert tokens is not None assert extra_arg is not None return tokens token_embedder = BasicTextFieldEmbedder({"elmo": FakeEmbedder()}) inputs = {"elmo": {"tokens": (torch.rand(3, 6, 5) * 2).long()}} kwargs = {"extra_arg": 1} token_embedder(inputs, **kwargs) def test_forward_runs_with_non_bijective_mapping(self): elmo_fixtures_path = self.FIXTURES_ROOT / "elmo" options_file = str(elmo_fixtures_path / "options.json") weight_file = str(elmo_fixtures_path / "lm_weights.hdf5") params = Params({ "token_embedders": { "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2 }, "elmo": { "type": "elmo_token_embedder", "options_file": options_file, "weight_file": weight_file, }, } }) token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params) inputs = { "words": { "tokens": (torch.rand(3, 6) * 20).long() }, "elmo": { "tokens": (torch.rand(3, 6, 50) * 15).long() }, } token_embedder(inputs) def test_forward_runs_with_non_bijective_mapping_with_null(self): elmo_fixtures_path = self.FIXTURES_ROOT / "elmo" options_file = str(elmo_fixtures_path / "options.json") weight_file = str(elmo_fixtures_path / "lm_weights.hdf5") params = Params({ 
"token_embedders": { "elmo": { "type": "elmo_token_embedder", "options_file": options_file, "weight_file": weight_file, } } }) token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params) inputs = {"elmo": {"tokens": (torch.rand(3, 6, 50) * 15).long()}} token_embedder(inputs) def test_forward_runs_with_non_bijective_mapping_with_dict(self): elmo_fixtures_path = self.FIXTURES_ROOT / "elmo" options_file = str(elmo_fixtures_path / "options.json") weight_file = str(elmo_fixtures_path / "lm_weights.hdf5") params = Params({ "token_embedders": { "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2 }, "elmo": { "type": "elmo_token_embedder", "options_file": options_file, "weight_file": weight_file, }, } }) token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params) inputs = { "words": { "tokens": (torch.rand(3, 6) * 20).long() }, "elmo": { "tokens": (torch.rand(3, 6, 50) * 15).long() }, } token_embedder(inputs) def test_forward_runs_with_bijective_and_non_bijective_mapping(self): params = Params({ "token_embedders": { "bert": { "type": "pretrained_transformer", "model_name": "bert-base-uncased" }, "token_characters": { "type": "character_encoding", "embedding": { "embedding_dim": 5 }, "encoder": { "type": "cnn", "embedding_dim": 5, "num_filters": 5, "ngram_filter_sizes": [5], }, }, } }) token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params) inputs = { "bert": { "token_ids": (torch.rand(3, 5) * 10).long(), "mask": (torch.rand(3, 5) * 1).bool(), }, "token_characters": { "token_characters": (torch.rand(3, 5, 5) * 1).long() }, } token_embedder(inputs)
class TestBasicTextFieldEmbedder(AllenNlpTestCase): def setUp(self): super(TestBasicTextFieldEmbedder, self).setUp() self.vocab = Vocabulary() self.vocab.add_token_to_namespace("1") self.vocab.add_token_to_namespace("2") self.vocab.add_token_to_namespace("3") self.vocab.add_token_to_namespace("4") params = Params({ "token_embedders": { "words1": { "type": "embedding", "embedding_dim": 2 }, "words2": { "type": "embedding", "embedding_dim": 5 }, "words3": { "type": "embedding", "embedding_dim": 3 } } }) self.token_embedder = BasicTextFieldEmbedder.from_params( vocab=self.vocab, params=params) self.inputs = { "words1": torch.LongTensor([[0, 2, 3, 5]]), "words2": torch.LongTensor([[1, 4, 3, 2]]), "words3": torch.LongTensor([[1, 5, 1, 2]]) } def test_get_output_dim_aggregates_dimension_from_each_embedding(self): assert self.token_embedder.get_output_dim() == 10 def test_forward_asserts_input_field_match(self): # Total mismatch self.inputs['words4'] = self.inputs['words3'] del self.inputs['words3'] with pytest.raises(ConfigurationError) as exc: self.token_embedder(self.inputs) assert exc.match("Mismatched token keys") self.inputs['words3'] = self.inputs['words4'] # Text field has too many inputs with pytest.raises(ConfigurationError) as exc: self.token_embedder(self.inputs) assert exc.match("is generating more keys") del self.inputs['words4'] def test_forward_concats_resultant_embeddings(self): assert self.token_embedder(self.inputs).size() == (1, 4, 10) def test_forward_works_on_higher_order_input(self): params = Params({ "token_embedders": { "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2, }, "characters": { "type": "character_encoding", "embedding": { "embedding_dim": 4, "num_embeddings": 15, }, "encoder": { "type": "cnn", "embedding_dim": 4, "num_filters": 10, "ngram_filter_sizes": [3], }, } } }) token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab, params=params) inputs = { 'words': (torch.rand(3, 4, 5, 6) * 20).long(), 'characters': (torch.rand(3, 4, 5, 6, 7) * 15).long(), } assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12) def test_forward_runs_with_non_bijective_mapping(self): elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo' options_file = str(elmo_fixtures_path / 'options.json') weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5') params = Params({ "token_embedders": { "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2, }, "elmo": { "type": "elmo_token_embedder", "options_file": options_file, "weight_file": weight_file }, }, "embedder_to_indexer_map": { "words": ["words"], "elmo": ["elmo", "words"] } }) token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params) inputs = { 'words': (torch.rand(3, 6) * 20).long(), 'elmo': (torch.rand(3, 6, 50) * 15).long(), } token_embedder(inputs) def test_old_from_params_new_from_params(self): old_params = Params({ "words1": { "type": "embedding", "embedding_dim": 2 }, "words2": { "type": "embedding", "embedding_dim": 5 }, "words3": { "type": "embedding", "embedding_dim": 3 } }) # Allow loading the parameters in the old format with pytest.warns(DeprecationWarning): old_embedder = BasicTextFieldEmbedder.from_params( params=old_params, vocab=self.vocab) new_params = Params({ "token_embedders": { "words1": { "type": "embedding", "embedding_dim": 2 }, "words2": { "type": "embedding", "embedding_dim": 5 }, "words3": { "type": "embedding", "embedding_dim": 3 } } }) # But also allow loading the parameters in the new format new_embedder = 
BasicTextFieldEmbedder.from_params(params=new_params, vocab=self.vocab) assert old_embedder._token_embedders.keys() == new_embedder._token_embedders.keys() assert new_embedder(self.inputs).size() == (1, 4, 10) def test_extension_by_vocab(self): text_embedder = self.token_embedder vocab = self.vocab original_token_embedder_weight_words1 = text_embedder.token_embedder_words1.weight original_token_embedder_weight_words2 = text_embedder.token_embedder_words2.weight original_token_embedder_weight_words3 = text_embedder.token_embedder_words3.weight assert tuple(text_embedder.token_embedder_words1.weight.shape) == (6, 2) assert tuple(text_embedder.token_embedder_words2.weight.shape) == (6, 5) assert tuple(text_embedder.token_embedder_words3.weight.shape) == (6, 3) extended_inputs = { "words1": torch.LongTensor([[6]]), "words2": torch.LongTensor([[7]]), "words3": torch.LongTensor([[8]]) } # This should raise an error for now. with pytest.raises(Exception) as _: text_embedder(extended_inputs) counter = {"tokens": {"5": 1, "6": 1, "7": 1}} vocab._extend(counter) text_embedder.extend_vocab(vocab) assert tuple(text_embedder.token_embedder_words1.weight.shape) == (9, 2) assert tuple(text_embedder.token_embedder_words2.weight.shape) == (9, 5) assert tuple(text_embedder.token_embedder_words3.weight.shape) == (9, 3) # This should no longer raise an error. text_embedder(extended_inputs) assert torch.all(text_embedder.token_embedder_words1.weight[:6, :] == original_token_embedder_weight_words1[:6, :]) assert torch.all(text_embedder.token_embedder_words2.weight[:6, :] == original_token_embedder_weight_words2[:6, :]) assert torch.all(text_embedder.token_embedder_words3.weight[:6, :] == original_token_embedder_weight_words3[:6, :])
class TestDictField(unittest.TestCase): def setUp(self): super(TestDictField, self).setUp() entity_tokenizer = WordTokenizer( word_splitter=JustSpacesWordSplitter()) self.vocab = Vocabulary() self.vocab.add_token_to_namespace("entity1", "entity") self.vocab.add_token_to_namespace("entity2", "entity") self.vocab.add_token_to_namespace("entity3", "entity") self.entity_indexer = { "entity": TokenCharactersIndexerTokenizer( "entity", character_tokenizer=entity_tokenizer) } tokens1 = "The sentence .".split() tokens_field = TextField( [Token(t) for t in tokens1], token_indexers={'tokens': SingleIdTokenIndexer()}) self.instance1_fields = { "candidate_entities": TextField([Token("entity1 entity2"), Token("entity_unk")], token_indexers=self.entity_indexer), "candidate_entity_prior": ArrayField(np.array([[0.5, 0.5], [1.0, 0.0]])), "candidate_spans": ListField( [SpanField(0, 0, tokens_field), SpanField(1, 2, tokens_field)]) } tokens2 = "The sentence".split() tokens2_field = TextField( [Token(t) for t in tokens2], token_indexers={'tokens': SingleIdTokenIndexer()}) self.instance2_fields = { "candidate_entities": TextField([Token("entity1")], token_indexers=self.entity_indexer), "candidate_entity_prior": ArrayField(np.array([[1.0]])), "candidate_spans": ListField([SpanField(1, 1, tokens2_field)], ) } def test_get_padding_lengths(self): field = DictField(self.instance1_fields) field.index(self.vocab) lengths = field.get_padding_lengths() self.assertDictEqual( lengths, { 'candidate_entities*entity_length': 2, 'candidate_entities*num_token_characters': 2, 'candidate_entities*num_tokens': 2, 'candidate_entity_prior*dimension_0': 2, 'candidate_entity_prior*dimension_1': 2, 'candidate_spans*num_fields': 2 }) def test_dict_field_can_handle_empty(self): field = DictField(self.instance1_fields) empty = field.empty_field() self.assertTrue(True) def _check_tensors(self, tensor, expected): self.assertListEqual(sorted(list(tensor.keys())), sorted(list(expected.keys()))) for key in tensor.keys(): if key == 'candidate_entities': a = tensor[key]['entity'] b = expected[key]['entity'] else: a = tensor[key] b = expected[key] self.assertTrue(np.allclose(a.numpy(), b.numpy())) def test_dict_field_as_tensor(self): field = DictField(self.instance1_fields) field.index(self.vocab) tensor = field.as_tensor(field.get_padding_lengths()) expected = { 'candidate_entities': { 'entity': torch.tensor([[2, 3], [1, 0]]) }, 'candidate_entity_prior': torch.tensor([[0.5000, 0.5000], [1.0000, 0.0000]]), 'candidate_spans': torch.tensor([[0, 0], [1, 2]]) } self._check_tensors(tensor, expected) def test_dict_field_can_iterator(self): from allennlp.data import Instance from allennlp.data.iterators import BasicIterator iterator = BasicIterator() iterator.index_with(self.vocab) instances = [ Instance({"candidates": DictField(self.instance1_fields)}), Instance({"candidates": DictField(self.instance2_fields)}) ] for batch in iterator(instances, num_epochs=1, shuffle=False): break expected_batch = { 'candidates': { 'candidate_entities': { 'entity': torch.tensor([[[2, 3], [1, 0]], [[2, 0], [0, 0]]]) }, 'candidate_entity_prior': torch.tensor([[[0.5000, 0.5000], [1.0000, 0.0000]], [[1.0000, 0.0000], [0.0000, 0.0000]]]), 'candidate_spans': torch.tensor([[[0, 0], [1, 2]], [[1, 1], [-1, -1]]]) } } self._check_tensors(batch['candidates'], expected_batch['candidates']) def test_list_field_of_dict_field(self): from allennlp.data import Instance from allennlp.data.iterators import BasicIterator tokens3 = "The long sentence .".split() tokens3_field = TextField( 
[Token(t) for t in tokens3], token_indexers={'tokens': SingleIdTokenIndexer()}) instance3_fields = { "candidate_entities": TextField([ Token("entity1 entity2 entity3"), Token("entity_unk"), Token("entity2 entity3") ], token_indexers=self.entity_indexer), "candidate_entity_prior": ArrayField( np.array([[0.1, 0.1, 0.8], [1.0, 0.0, 0.0], [0.33, 0.67, 0.0]])), "candidate_spans": ListField([ SpanField(1, 1, tokens3_field), SpanField(1, 2, tokens3_field), SpanField(1, 3, tokens3_field) ], ) } iterator = BasicIterator() iterator.index_with(self.vocab) instances = [ Instance({ "candidates": ListField([ DictField(self.instance1_fields), DictField(self.instance2_fields) ]) }), Instance({ "candidates": ListField([ DictField(self.instance1_fields), DictField(instance3_fields) ]) }) ] for batch in iterator(instances, num_epochs=1, shuffle=False): pass self.assertTrue( batch['candidates']['candidate_entities']['entity'].shape == batch['candidates']['candidate_entity_prior'].shape)
def setUp(self): super().setUp() vocab = Vocabulary() vocab.add_token_to_namespace("O", "tags") vocab.add_token_to_namespace("B-ARG1", "tags") vocab.add_token_to_namespace("I-ARG1", "tags") vocab.add_token_to_namespace("B-ARG2", "tags") vocab.add_token_to_namespace("I-ARG2", "tags") vocab.add_token_to_namespace("B-V", "tags") vocab.add_token_to_namespace("I-V", "tags") vocab.add_token_to_namespace("U-ARG1", "tags") vocab.add_token_to_namespace("U-ARG2", "tags") vocab.add_token_to_namespace("B-C-ARG1", "tags") vocab.add_token_to_namespace("I-C-ARG1", "tags") vocab.add_token_to_namespace("B-ARGM-ADJ", "tags") vocab.add_token_to_namespace("I-ARGM-ADJ", "tags") # BMES. vocab.add_token_to_namespace("B", "bmes_tags") vocab.add_token_to_namespace("M", "bmes_tags") vocab.add_token_to_namespace("E", "bmes_tags") vocab.add_token_to_namespace("S", "bmes_tags") self.vocab = vocab
class TestListField(AllenNlpTestCase): def setUp(self): self.vocab = Vocabulary() self.vocab.add_token_to_namespace("this", "words") self.vocab.add_token_to_namespace("is", "words") self.vocab.add_token_to_namespace("a", "words") self.vocab.add_token_to_namespace("sentence", 'words') self.vocab.add_token_to_namespace("s", 'characters') self.vocab.add_token_to_namespace("e", 'characters') self.vocab.add_token_to_namespace("n", 'characters') self.vocab.add_token_to_namespace("t", 'characters') self.vocab.add_token_to_namespace("c", 'characters') for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']: self.vocab.add_token_to_namespace(label, 'labels') self.word_indexer = {"words": SingleIdTokenIndexer("words")} self.words_and_characters_indexers = { "words": SingleIdTokenIndexer("words"), "characters": TokenCharactersIndexer("characters") } self.field1 = TextField( [Token(t) for t in ["this", "is", "a", "sentence"]], self.word_indexer) self.field2 = TextField( [Token(t) for t in ["this", "is", "a", "different", "sentence"]], self.word_indexer) self.field3 = TextField( [Token(t) for t in ["this", "is", "another", "sentence"]], self.word_indexer) self.empty_text_field = self.field1.empty_field() self.index_field = IndexField(1, self.field1) self.empty_index_field = self.index_field.empty_field() self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1) self.empty_sequence_label_field = self.sequence_label_field.empty_field( ) super(TestListField, self).setUp() def test_get_padding_lengths(self): list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) lengths = list_field.get_padding_lengths() assert lengths == {"num_fields": 3, "list_num_tokens": 5} def test_list_field_can_handle_empty_text_fields(self): list_field = ListField( [self.field1, self.field2, self.empty_text_field]) list_field.index(self.vocab) tensor_dict = list_field.as_tensor(list_field.get_padding_lengths()) numpy.testing.assert_array_equal( tensor_dict["words"].data.cpu().numpy(), numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [0, 0, 0, 0, 0]])) def test_list_field_can_handle_empty_index_fields(self): list_field = ListField( [self.index_field, self.index_field, self.empty_index_field]) list_field.index(self.vocab) tensor = list_field.as_tensor(list_field.get_padding_lengths()) numpy.testing.assert_array_equal(tensor.data.cpu().numpy(), numpy.array([[1], [1], [-1]])) def test_list_field_can_handle_empty_sequence_label_fields(self): list_field = ListField([ self.sequence_label_field, self.sequence_label_field, self.empty_sequence_label_field ]) list_field.index(self.vocab) tensor = list_field.as_tensor(list_field.get_padding_lengths()) numpy.testing.assert_array_equal( tensor.data.cpu().numpy(), numpy.array([[1, 1, 0, 1], [1, 1, 0, 1], [0, 0, 0, 0]])) def test_all_fields_padded_to_max_length(self): list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) tensor_dict = list_field.as_tensor(list_field.get_padding_lengths()) numpy.testing.assert_array_almost_equal( tensor_dict["words"][0].data.cpu().numpy(), numpy.array([2, 3, 4, 5, 0])) numpy.testing.assert_array_almost_equal( tensor_dict["words"][1].data.cpu().numpy(), numpy.array([2, 3, 4, 1, 5])) numpy.testing.assert_array_almost_equal( tensor_dict["words"][2].data.cpu().numpy(), numpy.array([2, 3, 1, 5, 0])) def test_nested_list_fields_are_padded_correctly(self): nested_field1 = ListField( [LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']]) nested_field2 = ListField( 
[LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']]) list_field = ListField( [nested_field1.empty_field(), nested_field1, nested_field2]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6} tensor = list_field.as_tensor(padding_lengths).data.cpu().numpy() numpy.testing.assert_almost_equal( tensor, [[[-1], [-1], [-1], [-1], [-1], [-1]], [[0], [1], [2], [3], [4], [-1]], [[5], [6], [7], [8], [9], [10]]]) def test_fields_can_pad_to_greater_than_max_length(self): list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() padding_lengths["list_num_tokens"] = 7 padding_lengths["num_fields"] = 5 tensor_dict = list_field.as_tensor(padding_lengths) numpy.testing.assert_array_almost_equal( tensor_dict["words"][0].data.cpu().numpy(), numpy.array([2, 3, 4, 5, 0, 0, 0])) numpy.testing.assert_array_almost_equal( tensor_dict["words"][1].data.cpu().numpy(), numpy.array([2, 3, 4, 1, 5, 0, 0])) numpy.testing.assert_array_almost_equal( tensor_dict["words"][2].data.cpu().numpy(), numpy.array([2, 3, 1, 5, 0, 0, 0])) numpy.testing.assert_array_almost_equal( tensor_dict["words"][3].data.cpu().numpy(), numpy.array([0, 0, 0, 0, 0, 0, 0])) numpy.testing.assert_array_almost_equal( tensor_dict["words"][4].data.cpu().numpy(), numpy.array([0, 0, 0, 0, 0, 0, 0])) def test_as_tensor_can_handle_multiple_token_indexers(self): # pylint: disable=protected-access self.field1._token_indexers = self.words_and_characters_indexers self.field2._token_indexers = self.words_and_characters_indexers self.field3._token_indexers = self.words_and_characters_indexers list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() tensor_dict = list_field.as_tensor(padding_lengths) words = tensor_dict["words"].data.cpu().numpy() characters = tensor_dict["characters"].data.cpu().numpy() numpy.testing.assert_array_almost_equal( words, numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [2, 3, 1, 5, 0]])) numpy.testing.assert_array_almost_equal( characters[0], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [2, 3, 4, 5, 3, 4, 6, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]])) numpy.testing.assert_array_almost_equal( characters[1], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 3, 1, 3, 4, 5], [2, 3, 4, 5, 3, 4, 6, 3, 0]])) numpy.testing.assert_array_almost_equal( characters[2], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 4, 1, 5, 1, 3, 1, 0, 0], [2, 3, 4, 5, 3, 4, 6, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]])) def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields( self): # pylint: disable=protected-access self.field1._token_indexers = self.words_and_characters_indexers self.field2._token_indexers = self.words_and_characters_indexers self.field3._token_indexers = self.words_and_characters_indexers list_field = ListField( [self.field1.empty_field(), self.field1, self.field2]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() tensor_dict = list_field.as_tensor(padding_lengths) words = tensor_dict["words"].data.cpu().numpy() characters = tensor_dict["characters"].data.cpu().numpy() numpy.testing.assert_array_almost_equal( words, numpy.array([[0, 0, 0, 0, 0], [2, 3, 4, 5, 0], [2, 3, 4, 1, 5]])) 
numpy.testing.assert_array_almost_equal(characters[0], numpy.zeros([5, 9])) numpy.testing.assert_array_almost_equal( characters[1], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [2, 3, 4, 5, 3, 4, 6, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]])) numpy.testing.assert_array_almost_equal( characters[2], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 3, 1, 3, 4, 5], [2, 3, 4, 5, 3, 4, 6, 3, 0]])) def test_printing_doesnt_crash(self): list_field = ListField([self.field1, self.field2]) print(list_field)
class TestTokenCharactersEncoder(AllenNlpTestCase): def setUp(self): super(TestTokenCharactersEncoder, self).setUp() self.vocab = Vocabulary() self.vocab.add_token_to_namespace("1", "token_characters") self.vocab.add_token_to_namespace("2", "token_characters") self.vocab.add_token_to_namespace("3", "token_characters") self.vocab.add_token_to_namespace("4", "token_characters") params = Params({ "embedding": { "embedding_dim": 2, "vocab_namespace": "token_characters" }, "encoder": { "type": "cnn", "embedding_dim": 2, "num_filters": 4, "ngram_filter_sizes": [1, 2], "output_dim": 3 } }) self.encoder = TokenCharactersEncoder.from_params( vocab=self.vocab, params=deepcopy(params)) self.embedding = Embedding.from_params(vocab=self.vocab, params=params["embedding"]) self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"]) constant_init = Initializer.from_params( Params({ "type": "constant", "val": 1. })) initializer = InitializerApplicator([(".*", constant_init)]) initializer(self.encoder) initializer(self.embedding) initializer(self.inner_encoder) def test_get_output_dim_uses_encoder_output_dim(self): assert self.encoder.get_output_dim() == 3 def test_forward_applies_embedding_then_encoder(self): numpy_tensor = numpy.random.randint(6, size=(3, 4, 7)) inputs = torch.from_numpy(numpy_tensor) encoder_output = self.encoder(inputs) reshaped_input = inputs.view(12, 7) embedded = self.embedding(reshaped_input) mask = (inputs != 0).long().view(12, 7) reshaped_manual_output = self.inner_encoder(embedded, mask) manual_output = reshaped_manual_output.view(3, 4, 3) assert_almost_equal(encoder_output.data.numpy(), manual_output.data.numpy()) def test_char_embedding_vocab_extension_with_default_namespace(self): vocab = self.vocab character_encoder = self.encoder original_weight = character_encoder._embedding._module.weight assert tuple(original_weight.shape) == (6, 2) vocab.add_token_to_namespace("5", "token_characters") character_encoder.extend_vocab(vocab) extended_weight = character_encoder._embedding._module.weight assert tuple(extended_weight.shape) == (7, 2) assert torch.all(original_weight == extended_weight[:6, :])
class TestTextField(AllenNlpTestCase): def setUp(self): self.vocab = Vocabulary() self.vocab.add_token_to_namespace("sentence", namespace='words') self.vocab.add_token_to_namespace("A", namespace='words') self.vocab.add_token_to_namespace("A", namespace='characters') self.vocab.add_token_to_namespace("s", namespace='characters') self.vocab.add_token_to_namespace("e", namespace='characters') self.vocab.add_token_to_namespace("n", namespace='characters') self.vocab.add_token_to_namespace("t", namespace='characters') self.vocab.add_token_to_namespace("c", namespace='characters') super(TestTextField, self).setUp() def test_field_counts_vocab_items_correctly(self): field = TextField( [Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={"words": SingleIdTokenIndexer("words")}) namespace_token_counts = defaultdict(lambda: defaultdict(int)) field.count_vocab_items(namespace_token_counts) assert namespace_token_counts["words"]["This"] == 1 assert namespace_token_counts["words"]["is"] == 1 assert namespace_token_counts["words"]["a"] == 1 assert namespace_token_counts["words"]["sentence"] == 1 assert namespace_token_counts["words"]["."] == 1 assert list(namespace_token_counts.keys()) == ["words"] field = TextField( [Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={ "characters": TokenCharactersIndexer("characters", min_padding_length=1) }) namespace_token_counts = defaultdict(lambda: defaultdict(int)) field.count_vocab_items(namespace_token_counts) assert namespace_token_counts["characters"]["T"] == 1 assert namespace_token_counts["characters"]["h"] == 1 assert namespace_token_counts["characters"]["i"] == 2 assert namespace_token_counts["characters"]["s"] == 3 assert namespace_token_counts["characters"]["a"] == 1 assert namespace_token_counts["characters"]["e"] == 3 assert namespace_token_counts["characters"]["n"] == 2 assert namespace_token_counts["characters"]["t"] == 1 assert namespace_token_counts["characters"]["c"] == 1 assert namespace_token_counts["characters"]["."] == 1 assert list(namespace_token_counts.keys()) == ["characters"] field = TextField( [Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={ "words": SingleIdTokenIndexer("words"), "characters": TokenCharactersIndexer("characters", min_padding_length=1) }) namespace_token_counts = defaultdict(lambda: defaultdict(int)) field.count_vocab_items(namespace_token_counts) assert namespace_token_counts["characters"]["T"] == 1 assert namespace_token_counts["characters"]["h"] == 1 assert namespace_token_counts["characters"]["i"] == 2 assert namespace_token_counts["characters"]["s"] == 3 assert namespace_token_counts["characters"]["a"] == 1 assert namespace_token_counts["characters"]["e"] == 3 assert namespace_token_counts["characters"]["n"] == 2 assert namespace_token_counts["characters"]["t"] == 1 assert namespace_token_counts["characters"]["c"] == 1 assert namespace_token_counts["characters"]["."] == 1 assert namespace_token_counts["words"]["This"] == 1 assert namespace_token_counts["words"]["is"] == 1 assert namespace_token_counts["words"]["a"] == 1 assert namespace_token_counts["words"]["sentence"] == 1 assert namespace_token_counts["words"]["."] == 1 assert set(namespace_token_counts.keys()) == {"words", "characters"} def test_index_converts_field_correctly(self): vocab = Vocabulary() sentence_index = vocab.add_token_to_namespace("sentence", namespace='words') capital_a_index = vocab.add_token_to_namespace("A", namespace='words') capital_a_char_index = 
vocab.add_token_to_namespace( "A", namespace='characters') s_index = vocab.add_token_to_namespace("s", namespace='characters') e_index = vocab.add_token_to_namespace("e", namespace='characters') n_index = vocab.add_token_to_namespace("n", namespace='characters') t_index = vocab.add_token_to_namespace("t", namespace='characters') c_index = vocab.add_token_to_namespace("c", namespace='characters') field = TextField([Token(t) for t in ["A", "sentence"]], {"words": SingleIdTokenIndexer(namespace="words")}) field.index(vocab) # pylint: disable=protected-access assert field._indexed_tokens["words"] == [ capital_a_index, sentence_index ] field1 = TextField( [Token(t) for t in ["A", "sentence"]], { "characters": TokenCharactersIndexer(namespace="characters", min_padding_length=1) }) field1.index(vocab) assert field1._indexed_tokens["characters"] == [[capital_a_char_index], [ s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index ]] field2 = TextField( [Token(t) for t in ["A", "sentence"]], token_indexers={ "words": SingleIdTokenIndexer(namespace="words"), "characters": TokenCharactersIndexer(namespace="characters", min_padding_length=1) }) field2.index(vocab) assert field2._indexed_tokens["words"] == [ capital_a_index, sentence_index ] assert field2._indexed_tokens["characters"] == [[capital_a_char_index], [ s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index ]] # pylint: enable=protected-access def test_get_padding_lengths_raises_if_no_indexed_tokens(self): field = TextField( [Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={"words": SingleIdTokenIndexer("words")}) with pytest.raises(ConfigurationError): field.get_padding_lengths() def test_padding_lengths_are_computed_correctly(self): field = TextField( [Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={"words": SingleIdTokenIndexer("words")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() assert padding_lengths == {"words_length": 5, "num_tokens": 5} field = TextField( [Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={ "characters": TokenCharactersIndexer("characters", min_padding_length=1) }) field.index(self.vocab) padding_lengths = field.get_padding_lengths() assert padding_lengths == { "num_tokens": 5, "characters_length": 5, "num_token_characters": 8 } field = TextField( [Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={ "characters": TokenCharactersIndexer("characters", min_padding_length=1), "words": SingleIdTokenIndexer("words") }) field.index(self.vocab) padding_lengths = field.get_padding_lengths() assert padding_lengths == { "num_tokens": 5, "characters_length": 5, "words_length": 5, "num_token_characters": 8 } def test_as_tensor_handles_words(self): field = TextField( [Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={"words": SingleIdTokenIndexer("words")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() tensor_dict = field.as_tensor(padding_lengths) numpy.testing.assert_array_almost_equal( tensor_dict["words"].detach().cpu().numpy(), numpy.array([1, 1, 1, 2, 1])) def test_as_tensor_handles_longer_lengths(self): field = TextField( [Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={"words": SingleIdTokenIndexer("words")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() padding_lengths["words_length"] = 10 tensor_dict = field.as_tensor(padding_lengths) numpy.testing.assert_array_almost_equal( 
tensor_dict["words"].detach().cpu().numpy(), numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0])) def test_as_tensor_handles_characters(self): field = TextField( [Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={ "characters": TokenCharactersIndexer("characters", min_padding_length=1) }) field.index(self.vocab) padding_lengths = field.get_padding_lengths() tensor_dict = field.as_tensor(padding_lengths) expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0], [1, 3, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [3, 4, 5, 6, 4, 5, 7, 4], [1, 0, 0, 0, 0, 0, 0, 0]]) numpy.testing.assert_array_almost_equal( tensor_dict["characters"].detach().cpu().numpy(), expected_character_array) def test_as_tensor_handles_characters_if_empty_field(self): field = TextField( [], token_indexers={ "characters": TokenCharactersIndexer("characters", min_padding_length=1) }) field.index(self.vocab) padding_lengths = field.get_padding_lengths() tensor_dict = field.as_tensor(padding_lengths) expected_character_array = numpy.array([]) numpy.testing.assert_array_almost_equal( tensor_dict["characters"].detach().cpu().numpy(), expected_character_array) def test_as_tensor_handles_words_and_characters_with_longer_lengths(self): field = TextField( [Token(t) for t in ["a", "sentence", "."]], token_indexers={ "words": SingleIdTokenIndexer("words"), "characters": TokenCharactersIndexer("characters", min_padding_length=1) }) field.index(self.vocab) padding_lengths = field.get_padding_lengths() padding_lengths["words_length"] = 5 padding_lengths["characters_length"] = 5 padding_lengths["num_token_characters"] = 10 tensor_dict = field.as_tensor(padding_lengths) numpy.testing.assert_array_almost_equal( tensor_dict["words"].detach().cpu().numpy(), numpy.array([1, 2, 1, 0, 0])) numpy.testing.assert_array_almost_equal( tensor_dict["characters"].detach().cpu().numpy(), numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [3, 4, 5, 6, 4, 5, 7, 4, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])) def test_printing_doesnt_crash(self): field = TextField([Token(t) for t in ["A", "sentence"]], {"words": SingleIdTokenIndexer(namespace="words")}) print(field) def test_token_indexer_returns_dict(self): field = TextField( [Token(t) for t in ["A", "sentence"]], token_indexers={ "field_with_dict": DictReturningTokenIndexer(), "words": SingleIdTokenIndexer("words"), "characters": TokenCharactersIndexer("characters", min_padding_length=1) }) field.index(self.vocab) padding_lengths = field.get_padding_lengths() assert padding_lengths == { 'token_ids_length': 5, 'additional_key_length': 2, 'words_length': 2, 'characters_length': 2, 'num_token_characters': 8, 'num_tokens': 5, } padding_lengths['token_ids_length'] = 7 padding_lengths['additional_key_length'] = 3 padding_lengths['words_length'] = 4 padding_lengths['characters_length'] = 4 tensors = field.as_tensor(padding_lengths) assert list(tensors['token_ids'].shape) == [7] assert list(tensors['additional_key'].shape) == [3] assert list(tensors['words'].shape) == [4] assert list(tensors['characters'].shape) == [4, 8] def test_token_padding_lengths_are_computed_correctly(self): field = TextField( [Token(t) for t in ["A", "sentence"]], token_indexers={ "field_with_dict": DictReturningTokenIndexer(token_min_padding_length=3), "words": SingleIdTokenIndexer("words", token_min_padding_length=3), "characters": TokenCharactersIndexer("characters", min_padding_length=1, token_min_padding_length=3) }) field.index(self.vocab) padding_lengths 
= field.get_padding_lengths() assert padding_lengths == { 'token_ids_length': 5, 'additional_key_length': 3, 'words_length': 3, 'characters_length': 3, 'num_token_characters': 8, 'num_tokens': 5, } tensors = field.as_tensor(padding_lengths) assert tensors['additional_key'].tolist()[-1] == 0 assert tensors['words'].tolist()[-1] == 0 assert tensors['characters'].tolist()[-1] == [0] * 8 def test_sequence_methods(self): field = TextField( [Token(t) for t in ["This", "is", "a", "sentence", "."]], {}) assert len(field) == 5 assert field[1].text == "is" assert [token.text for token in field] == ["This", "is", "a", "sentence", "."]
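# Sketch of the TextField lifecycle the tests above walk through:
# count_vocab_items -> index -> get_padding_lengths -> as_tensor. The padding-length
# key names ("words_length", "num_tokens", ...) are version-dependent assumptions.
from collections import defaultdict
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary()
for word in ["a", "sentence"]:
    vocab.add_token_to_namespace(word, namespace="words")
field = TextField([Token(t) for t in ["a", "sentence"]],
                  {"words": SingleIdTokenIndexer(namespace="words")})
counts = defaultdict(lambda: defaultdict(int))
field.count_vocab_items(counts)         # 1) collect per-namespace token counts
field.index(vocab)                      # 2) convert tokens to integer ids
lengths = field.get_padding_lengths()   # 3) ask how much padding is needed
tensors = field.as_tensor(lengths)      # 4) build the padded tensors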
class TestKnowledgeGraphField(SemparseTestCase): def setup_method(self): self.tokenizer = SpacyTokenizer(pos_tags=True) self.utterance = self.tokenizer.tokenize("where is mersin?") self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")} table_file = self.FIXTURES_ROOT / "data" / "wikitables" / "tables" / "341.tagged" self.graph = TableQuestionContext.read_from_file( table_file, self.utterance).get_table_knowledge_graph() self.vocab = Vocabulary() self.name_index = self.vocab.add_token_to_namespace("name", namespace="tokens") self.in_index = self.vocab.add_token_to_namespace("in", namespace="tokens") self.english_index = self.vocab.add_token_to_namespace( "english", namespace="tokens") self.location_index = self.vocab.add_token_to_namespace( "location", namespace="tokens") self.mersin_index = self.vocab.add_token_to_namespace( "mersin", namespace="tokens") self.oov_index = self.vocab.get_token_index("random OOV string", namespace="tokens") self.edirne_index = self.oov_index self.field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer) super().setup_method() def test_count_vocab_items(self): namespace_token_counts = defaultdict(lambda: defaultdict(int)) self.field.count_vocab_items(namespace_token_counts) assert namespace_token_counts["tokens"] == { "name": 1, "in": 2, "english": 2, "location": 1, "mersin": 1, } def test_get_padding_lengths_raises_if_not_indexed(self): with pytest.raises(ConfigurationError): self.field.get_padding_lengths() def test_padding_lengths_are_computed_correctly(self): self.field.index(self.vocab) assert self.field.get_padding_lengths() == { "num_entities": 3, "num_utterance_tokens": 4, "num_fields": 3, "list_tokens___tokens": 3, } self.field._token_indexers[ "token_characters"] = TokenCharactersIndexer(min_padding_length=1) self.field.index(self.vocab) assert self.field.get_padding_lengths() == { "num_entities": 3, "num_utterance_tokens": 4, "num_fields": 3, "list_tokens___tokens": 3, "list_token_characters___token_characters": 3, "list_token_characters___num_token_characters": 8, } def test_as_tensor_produces_correct_output(self): self.field.index(self.vocab) padding_lengths = self.field.get_padding_lengths() padding_lengths["num_utterance_tokens"] += 1 padding_lengths["num_entities"] += 1 padding_lengths["num_fields"] += 1 tensor_dict = self.field.as_tensor(padding_lengths) assert tensor_dict.keys() == {"text", "linking"} expected_text_tensor = [ [self.mersin_index, 0, 0], [self.location_index, self.in_index, self.english_index], [self.name_index, self.in_index, self.english_index], [0, 0, 0], ] assert_almost_equal( tensor_dict["text"]["tokens"]["tokens"].detach().cpu().numpy(), expected_text_tensor) linking_tensor = tensor_dict["linking"].detach().cpu().numpy() expected_linking_tensor = [ [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # string:mersin, "where" [0, 0, 0, 0, 0, -1.5, 0, 0, 0, 0], # string:mersin, "is" [0, 1, 1, 1, 1, 1, 0, 0, 1, 1], # string:mersin, "mersin" [0, 0, 0, 0, 0, -5, 0, 0, 0, 0], # string:mersin, "?" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ], # string:mersin, padding [ [0, 0, 0, 0, 0, -2.6, 0, 0, 0, 0], # string_column:name_in_english, "where" [0, 0, 0, 0, 0, -7.5, 0, 0, 0, 0], # string_column:name_in_english, "is" [0, 0, 0, 0, 0, -1.8333, 1, 1, 0, 0], # string_column:..in_english, "mersin" [0, 0, 0, 0, 0, -18, 0, 0, 0, 0], # string_column:name_in_english, "?" 
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ], # string_column:name_in_english, padding [ [0, 0, 0, 0, 0, -1.6, 0, 0, 0, 0], # string_..:location_in_english, "where" [0, 0, 0, 0, 0, -5.5, 0, 0, 0, 0], # string_column:location_in_english, "is" [0, 0, 0, 0, 0, -1, 0, 0, 0, 0], # string_column:location_in_english, "mersin" [0, 0, 0, 0, 0, -14, 0, 0, 0, 0], # string_column:location_in_english, "?" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ], # string_column:location_in_english, padding [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # padding, "where" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # padding, "is" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # padding, "mersin" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # padding, "?" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ], ] # padding, padding for entity_index, entity_features in enumerate( expected_linking_tensor): for question_index, feature_vector in enumerate(entity_features): assert_almost_equal( linking_tensor[entity_index, question_index], feature_vector, decimal=4, err_msg=f"{entity_index} {question_index}", ) def test_lemma_feature_extractor(self): utterance = self.tokenizer.tokenize("Names in English") field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer) entity = "string_column:name_in_english" lemma_feature = field._contains_lemma_match( entity, field._entity_text_map[entity], utterance[0], 0, utterance) assert lemma_feature == 1 def test_span_overlap_fraction(self): utterance = self.tokenizer.tokenize( "what is the name in english of mersin?") field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer) entity = "string_column:name_in_english" entity_text = field._entity_text_map[entity] feature_values = [ field._span_overlap_fraction(entity, entity_text, token, i, utterance) for i, token in enumerate(utterance) ] assert feature_values == [0, 0, 0, 1, 1, 1, 0, 0, 0] def test_batch_tensors(self): self.field.index(self.vocab) padding_lengths = self.field.get_padding_lengths() tensor_dict1 = self.field.as_tensor(padding_lengths) tensor_dict2 = self.field.as_tensor(padding_lengths) batched_tensor_dict = self.field.batch_tensors( [tensor_dict1, tensor_dict2]) assert batched_tensor_dict.keys() == {"text", "linking"} expected_single_tensor = [ [self.mersin_index, 0, 0], [self.location_index, self.in_index, self.english_index], [self.name_index, self.in_index, self.english_index], ] expected_batched_tensor = [ expected_single_tensor, expected_single_tensor ] assert_almost_equal( batched_tensor_dict["text"]["tokens"] ["tokens"].detach().cpu().numpy(), expected_batched_tensor, ) expected_linking_tensor = torch.stack( [tensor_dict1["linking"], tensor_dict2["linking"]]) assert_almost_equal( batched_tensor_dict["linking"].detach().cpu().numpy(), expected_linking_tensor.detach().cpu().numpy(), ) def test_field_initialized_with_empty_constructor(self): try: self.field.empty_field() except AssertionError as e: pytest.fail(str(e), pytrace=True)
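# The linking features asserted above mix binary indicators (exact, prefix, and
# span matches) with real-valued scores. A hypothetical, self-contained
# illustration of the simplest one -- exact token match between entity text and an
# utterance token -- follows; `exact_token_match` is an illustrative stand-in, not
# the library's implementation.
from typing import List

def exact_token_match(entity_tokens: List[str], utterance_token: str) -> float:
    """1.0 if the utterance token appears verbatim in the entity text."""
    return float(utterance_token.lower() in [t.lower() for t in entity_tokens])

assert exact_token_match(["name", "in", "english"], "in") == 1.0
assert exact_token_match(["name", "in", "english"], "mersin") == 0.0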
class TestProductionRuleField(AllenNlpTestCase): def setUp(self): super(TestProductionRuleField, self).setUp() self.vocab = Vocabulary() self.s_rule_index = self.vocab.add_token_to_namespace( "S -> [NP, VP]", namespace='rule_labels') self.np_index = self.vocab.add_token_to_namespace( "NP -> test", namespace='rule_labels') def test_field_counts_vocab_items_correctly(self): field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True) namespace_token_counts = defaultdict(lambda: defaultdict(int)) field.count_vocab_items(namespace_token_counts) assert namespace_token_counts["rule_labels"]["S -> [NP, VP]"] == 1 field = ProductionRuleField('S -> [NP, VP]', is_global_rule=False) namespace_token_counts = defaultdict(lambda: defaultdict(int)) field.count_vocab_items(namespace_token_counts) assert namespace_token_counts["rule_labels"]["S -> [NP, VP]"] == 0 def test_index_converts_field_correctly(self): field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True) field.index(self.vocab) assert field._rule_id == self.s_rule_index def test_padding_lengths_are_computed_correctly(self): field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True) field.index(self.vocab) assert field.get_padding_lengths() == {} def test_as_tensor_produces_correct_output(self): field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True) field.index(self.vocab) tensor_tuple = field.as_tensor(field.get_padding_lengths()) assert isinstance(tensor_tuple, tuple) assert len(tensor_tuple) == 3 assert tensor_tuple[0] == 'S -> [NP, VP]' assert tensor_tuple[1] is True assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.s_rule_index]) field = ProductionRuleField('S -> [NP, VP]', is_global_rule=False) field.index(self.vocab) tensor_tuple = field.as_tensor(field.get_padding_lengths()) assert isinstance(tensor_tuple, tuple) assert len(tensor_tuple) == 3 assert tensor_tuple[0] == 'S -> [NP, VP]' assert tensor_tuple[1] is False assert tensor_tuple[2] is None def test_batch_tensors_does_not_modify_list(self): field = ProductionRuleField('S -> [NP, VP]', is_global_rule=True) field.index(self.vocab) padding_lengths = field.get_padding_lengths() tensor_dict1 = field.as_tensor(padding_lengths) field = ProductionRuleField('NP -> test', is_global_rule=True) field.index(self.vocab) padding_lengths = field.get_padding_lengths() tensor_dict2 = field.as_tensor(padding_lengths) tensor_list = [tensor_dict1, tensor_dict2] assert field.batch_tensors(tensor_list) == tensor_list def test_doubly_nested_field_works(self): field1 = ProductionRuleField('S -> [NP, VP]', is_global_rule=True) field2 = ProductionRuleField('NP -> test', is_global_rule=True) field3 = ProductionRuleField('VP -> eat', is_global_rule=False) list_field = ListField( [ListField([field1, field2, field3]), ListField([field1, field2])]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() tensors = list_field.as_tensor(padding_lengths) assert isinstance(tensors, list) assert len(tensors) == 2 assert isinstance(tensors[0], list) assert len(tensors[0]) == 3 assert isinstance(tensors[1], list) assert len(tensors[1]) == 3 tensor_tuple = tensors[0][0] assert tensor_tuple[0] == 'S -> [NP, VP]' assert tensor_tuple[1] is True assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.s_rule_index]) tensor_tuple = tensors[0][1] assert tensor_tuple[0] == 'NP -> test' assert tensor_tuple[1] is True assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.np_index]) tensor_tuple = tensors[0][2] assert tensor_tuple[0] == 'VP 
-> eat' assert tensor_tuple[1] is False assert tensor_tuple[2] is None tensor_tuple = tensors[1][0] assert tensor_tuple[0] == 'S -> [NP, VP]' assert tensor_tuple[1] is True assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.s_rule_index]) tensor_tuple = tensors[1][1] assert tensor_tuple[0] == 'NP -> test' assert tensor_tuple[1] is True assert_almost_equal(tensor_tuple[2].detach().cpu().numpy(), [self.np_index]) # This item was just padding. tensor_tuple = tensors[1][2] assert tensor_tuple[0] == '' assert tensor_tuple[1] is False assert tensor_tuple[2] is None
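# Minimal sketch of what the ProductionRuleField tests above verify: indexing a
# global rule and converting it to a tensor yields the three-element
# (rule_string, is_global, rule_id_tensor) tuple asserted above. The import path is
# an assumption; the field moved between allennlp and allennlp-semparse releases,
# and later versions return a four-element ProductionRule.
from allennlp.data.fields import ProductionRuleField
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary()
rule_id = vocab.add_token_to_namespace("S -> [NP, VP]", namespace="rule_labels")
field = ProductionRuleField("S -> [NP, VP]", is_global_rule=True)
field.index(vocab)
rule, is_global, tensor = field.as_tensor(field.get_padding_lengths())
assert (rule, is_global, tensor.item()) == ("S -> [NP, VP]", True, rule_id)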
class TestDataset(AllenNlpTestCase): def setUp(self): self.vocab = Vocabulary() self.vocab.add_token_to_namespace("this") self.vocab.add_token_to_namespace("is") self.vocab.add_token_to_namespace("a") self.vocab.add_token_to_namespace("sentence") self.vocab.add_token_to_namespace(".") self.token_indexer = {"tokens": SingleIdTokenIndexer()} self.instances = self.get_instances() super().setUp() def test_instances_must_have_homogeneous_fields(self): instance1 = Instance({"tag": (LabelField(1, skip_indexing=True))}) instance2 = Instance({"words": TextField([Token("hello")], {})}) with pytest.raises(ConfigurationError): _ = Batch([instance1, instance2]) def test_padding_lengths_uses_max_instance_lengths(self): dataset = Batch(self.instances) dataset.index_instances(self.vocab) padding_lengths = dataset.get_padding_lengths() assert padding_lengths == { "text1": { "num_tokens": 5, "tokens_length": 5 }, "text2": { "num_tokens": 6, "tokens_length": 6 } } def test_as_tensor_dict(self): dataset = Batch(self.instances) dataset.index_instances(self.vocab) padding_lengths = dataset.get_padding_lengths() tensors = dataset.as_tensor_dict(padding_lengths) text1 = tensors["text1"]["tokens"].detach().cpu().numpy() text2 = tensors["text2"]["tokens"].detach().cpu().numpy() numpy.testing.assert_array_almost_equal( text1, numpy.array([[2, 3, 4, 5, 6], [1, 3, 4, 5, 6]])) numpy.testing.assert_array_almost_equal( text2, numpy.array([[2, 3, 4, 1, 5, 6], [2, 3, 1, 0, 0, 0]])) def get_instances(self): field1 = TextField( [Token(t) for t in ["this", "is", "a", "sentence", "."]], self.token_indexer) field2 = TextField([ Token(t) for t in ["this", "is", "a", "different", "sentence", "."] ], self.token_indexer) field3 = TextField( [Token(t) for t in ["here", "is", "a", "sentence", "."]], self.token_indexer) field4 = TextField([Token(t) for t in ["this", "is", "short"]], self.token_indexer) instances = [ Instance({ "text1": field1, "text2": field2 }), Instance({ "text1": field3, "text2": field4 }) ] return instances
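# Companion sketch for the Batch tests above: index a couple of instances and
# collect them into padded tensors. `Batch` lived at allennlp.data.dataset in the
# 0.x series and at allennlp.data.batch later -- the import here assumes the former.
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary()
for word in ["this", "is", "a", "sentence"]:
    vocab.add_token_to_namespace(word)
indexers = {"tokens": SingleIdTokenIndexer()}
instances = [
    Instance({"text": TextField([Token(t) for t in ["this", "is"]], indexers)}),
    Instance({"text": TextField([Token(t) for t in ["a", "sentence", "."]], indexers)}),
]
batch = Batch(instances)
batch.index_instances(vocab)  # "." is out-of-vocabulary and maps to @@UNKNOWN@@
# Shorter instances are zero-padded up to the longest one in the batch.
tensors = batch.as_tensor_dict(batch.get_padding_lengths())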
class KnowledgeGraphFieldTest(AllenNlpTestCase): def setUp(self): self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True)) self.utterance = self.tokenizer.tokenize("where is mersin?") self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")} json = { 'question': self.utterance, 'columns': ['Name in English', 'Location in English'], 'cells': [['Paradeniz', 'Mersin'], ['Lake Gala', 'Edirne']] } self.graph = TableQuestionKnowledgeGraph.read_from_json(json) self.vocab = Vocabulary() self.name_index = self.vocab.add_token_to_namespace("name", namespace='tokens') self.in_index = self.vocab.add_token_to_namespace("in", namespace='tokens') self.english_index = self.vocab.add_token_to_namespace( "english", namespace='tokens') self.location_index = self.vocab.add_token_to_namespace( "location", namespace='tokens') self.paradeniz_index = self.vocab.add_token_to_namespace( "paradeniz", namespace='tokens') self.mersin_index = self.vocab.add_token_to_namespace( "mersin", namespace='tokens') self.lake_index = self.vocab.add_token_to_namespace("lake", namespace='tokens') self.gala_index = self.vocab.add_token_to_namespace("gala", namespace='tokens') self.negative_one_index = self.vocab.add_token_to_namespace( "-1", namespace='tokens') self.zero_index = self.vocab.add_token_to_namespace("0", namespace='tokens') self.one_index = self.vocab.add_token_to_namespace("1", namespace='tokens') self.oov_index = self.vocab.get_token_index('random OOV string', namespace='tokens') self.edirne_index = self.oov_index self.field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer) super(KnowledgeGraphFieldTest, self).setUp() def test_count_vocab_items(self): namespace_token_counts = defaultdict(lambda: defaultdict(int)) self.field.count_vocab_items(namespace_token_counts) assert namespace_token_counts["tokens"] == { '-1': 1, '0': 1, '1': 1, 'name': 1, 'in': 2, 'english': 2, 'location': 1, 'paradeniz': 1, 'mersin': 1, 'lake': 1, 'gala': 1, 'edirne': 1, } def test_index_converts_field_correctly(self): # pylint: disable=protected-access self.field.index(self.vocab) assert self.field._indexed_entity_texts.keys() == {'tokens'} # Note that these are sorted by their _identifiers_, not their cell text, so the # `fb:row.rows` show up after the `fb:cells`. 
expected_array = [[self.negative_one_index], [self.zero_index], [self.one_index], [self.edirne_index], [self.lake_index, self.gala_index], [self.mersin_index], [self.paradeniz_index], [ self.location_index, self.in_index, self.english_index ], [self.name_index, self.in_index, self.english_index]] assert self.field._indexed_entity_texts['tokens'] == expected_array def test_get_padding_lengths_raises_if_not_indexed(self): with pytest.raises(AssertionError): self.field.get_padding_lengths() def test_padding_lengths_are_computed_correctly(self): # pylint: disable=protected-access self.field.index(self.vocab) assert self.field.get_padding_lengths() == { 'num_entities': 9, 'num_entity_tokens': 3, 'num_utterance_tokens': 4 } self.field._token_indexers[ 'token_characters'] = TokenCharactersIndexer() self.field.index(self.vocab) assert self.field.get_padding_lengths() == { 'num_entities': 9, 'num_entity_tokens': 3, 'num_utterance_tokens': 4, 'num_token_characters': 9 } def test_as_tensor_produces_correct_output(self): self.field.index(self.vocab) padding_lengths = self.field.get_padding_lengths() padding_lengths['num_utterance_tokens'] += 1 padding_lengths['num_entities'] += 1 tensor_dict = self.field.as_tensor(padding_lengths) assert tensor_dict.keys() == {'text', 'linking'} expected_text_tensor = [ [self.negative_one_index, 0, 0], [self.zero_index, 0, 0], [self.one_index, 0, 0], [self.edirne_index, 0, 0], [self.lake_index, self.gala_index, 0], [self.mersin_index, 0, 0], [self.paradeniz_index, 0, 0], [self.location_index, self.in_index, self.english_index], [self.name_index, self.in_index, self.english_index], [0, 0, 0] ] assert_almost_equal( tensor_dict['text']['tokens'].detach().cpu().numpy(), expected_text_tensor) linking_tensor = tensor_dict['linking'].detach().cpu().numpy() expected_linking_tensor = [ [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # -1, "where" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # -1, "is" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # -1, "mersin" [0, 0, 0, 0, 0, -1, 0, 0, 0, 0] ], # -1, "?" [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # 0, "where" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # 0, "is" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # 0, "mersin" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ], # 0, "?" [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # 1, "where" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # 1, "is" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # 1, "mersin" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ], # 1, "?" [ [0, 0, 0, 0, 0, .2, 0, 0, 0, 0], # fb:cell.edirne, "where" [0, 0, 0, 0, 0, -1.5, 0, 0, 0, 0], # fb:cell.edirne, "is" [0, 0, 0, 0, 0, .1666, 0, 0, 0, 0], # fb:cell.edirne, "mersin" [0, 0, 0, 0, 0, -5, 0, 0, 0, 0], # fb:cell.edirne, "?" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ], # fb:cell.edirne, padding [ [0, 0, 0, 0, 0, -.6, 0, 0, 0, 0], # fb:cell.lake_gala, "where" [0, 0, 0, 0, 0, -3.5, 0, 0, 0, 0], # fb:cell.lake_gala, "is" [0, 0, 0, 0, 0, -.3333, 0, 0, 0, 0], # fb:cell.lake_gala, "mersin" [0, 0, 0, 0, 0, -8, 0, 0, 0, 0], # fb:cell.lake_gala, "?" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ], # fb:cell.lake_gala, padding [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # fb:cell.mersin, "where" [0, 0, 0, 0, 0, -1.5, 0, 0, 0, 0], # fb:cell.mersin, "is" [0, 1, 1, 1, 1, 1, 0, 0, 1, 1], # fb:cell.mersin, "mersin" [0, 0, 0, 0, 0, -5, 0, 0, 0, 0], # fb:cell.mersin, "?" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ], # fb:cell.mersin, padding [ [0, 0, 0, 0, 0, -.6, 0, 0, 0, 0], # fb:cell.paradeniz, "where" [0, 0, 0, 0, 0, -3, 0, 0, 0, 0], # fb:cell.paradeniz, "is" [0, 0, 0, 0, 0, -.1666, 0, 0, 0, 0], # fb:cell.paradeniz, "mersin" [0, 0, 0, 0, 0, -8, 0, 0, 0, 0], # fb:cell.paradeniz, "?" 
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ], # fb:cell.paradeniz, padding [ [0, 0, 0, 0, 0, -2.6, 0, 0, 0, 0], # fb:row.row.name_in_english, "where" [0, 0, 0, 0, 0, -7.5, 0, 0, 0, 0], # fb:row.row.name_in_english, "is" [0, 0, 0, 0, 0, -1.8333, 1, 1, 0, 0], # fb:row.row.name_in_english, "mersin" [0, 0, 0, 0, 0, -18, 0, 0, 0, 0], # fb:row.row.name_in_english, "?" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ], # fb:row.row.name_in_english, padding [ [0, 0, 0, 0, 0, -1.6, 0, 0, 0, 0], # fb:row.row.location_in_english, "where" [0, 0, 0, 0, 0, -5.5, 0, 0, 0, 0], # fb:row.row.location_in_english, "is" [0, 0, 0, 0, 0, -1, 0, 0, 0, 0], # fb:row.row.location_in_english, "mersin" [0, 0, 0, 0, 0, -14, 0, 0, 0, 0], # fb:row.row.location_in_english, "?" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ], # fb:row.row.location_in_english, padding [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # padding, "where" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # padding, "is" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # padding, "mersin" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # padding, "?" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ] ] # padding, padding for entity_index, entity_features in enumerate( expected_linking_tensor): for question_index, feature_vector in enumerate(entity_features): assert_almost_equal(linking_tensor[entity_index, question_index], feature_vector, decimal=4, err_msg=f"{entity_index} {question_index}") def test_lemma_feature_extractor(self): # pylint: disable=protected-access utterance = self.tokenizer.tokenize("Names in English") field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer) entity = 'fb:row.row.name_in_english' lemma_feature = field._contains_lemma_match( entity, field._entity_text_map[entity], utterance[0], 0, utterance) assert lemma_feature == 1 def test_span_overlap_fraction(self): # pylint: disable=protected-access utterance = self.tokenizer.tokenize( "what is the name in english of mersin?") field = KnowledgeGraphField(self.graph, self.utterance, self.token_indexers, self.tokenizer) entity = 'fb:row.row.name_in_english' entity_text = field._entity_text_map[entity] feature_values = [ field._span_overlap_fraction(entity, entity_text, token, i, utterance) for i, token in enumerate(utterance) ] assert feature_values == [0, 0, 0, 1, 1, 1, 0, 0, 0] def test_batch_tensors(self): self.field.index(self.vocab) padding_lengths = self.field.get_padding_lengths() tensor_dict1 = self.field.as_tensor(padding_lengths) tensor_dict2 = self.field.as_tensor(padding_lengths) batched_tensor_dict = self.field.batch_tensors( [tensor_dict1, tensor_dict2]) assert batched_tensor_dict.keys() == {'text', 'linking'} expected_single_tensor = [ [self.negative_one_index, 0, 0], [self.zero_index, 0, 0], [self.one_index, 0, 0], [self.edirne_index, 0, 0], [self.lake_index, self.gala_index, 0], [self.mersin_index, 0, 0], [self.paradeniz_index, 0, 0], [self.location_index, self.in_index, self.english_index], [self.name_index, self.in_index, self.english_index] ] expected_batched_tensor = [ expected_single_tensor, expected_single_tensor ] assert_almost_equal( batched_tensor_dict['text']['tokens'].detach().cpu().numpy(), expected_batched_tensor) expected_linking_tensor = torch.stack( [tensor_dict1['linking'], tensor_dict2['linking']]) assert_almost_equal( batched_tensor_dict['linking'].detach().cpu().numpy(), expected_linking_tensor.detach().cpu().numpy())
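# Both KnowledgeGraphField test variants above check _span_overlap_fraction. A
# hypothetical re-implementation of the underlying idea -- the longest contiguous
# run of utterance tokens drawn from the entity text that covers the current
# position, as a fraction of the entity length -- is sketched here for intuition
# only; it is not the library's code.
from typing import List

def span_overlap_fraction(entity_tokens: List[str], utterance: List[str],
                          index: int) -> float:
    if utterance[index] not in entity_tokens:
        return 0.0
    end = index
    while end + 1 < len(utterance) and utterance[end + 1] in entity_tokens:
        end += 1
    start = index
    while start - 1 >= 0 and utterance[start - 1] in entity_tokens:
        start -= 1
    return (end - start + 1) / len(entity_tokens)

utterance = "what is the name in english of mersin ?".split()
assert span_overlap_fraction(["name", "in", "english"], utterance, 3) == 1.0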
class TestBasicTextFieldEmbedder(AllenNlpTestCase): def setUp(self): super(TestBasicTextFieldEmbedder, self).setUp() self.vocab = Vocabulary() self.vocab.add_token_to_namespace("1") self.vocab.add_token_to_namespace("2") self.vocab.add_token_to_namespace("3") self.vocab.add_token_to_namespace("4") params = Params({ "words1": { "type": "embedding", "embedding_dim": 2 }, "words2": { "type": "embedding", "embedding_dim": 5 }, "words3": { "type": "embedding", "embedding_dim": 3 } }) self.token_embedder = BasicTextFieldEmbedder.from_params( self.vocab, params) self.inputs = { "words1": torch.LongTensor([[0, 2, 3, 5]]), "words2": torch.LongTensor([[1, 4, 3, 2]]), "words3": torch.LongTensor([[1, 5, 1, 2]]) } def test_get_output_dim_aggregates_dimension_from_each_embedding(self): assert self.token_embedder.get_output_dim() == 10 def test_forward_asserts_input_field_match(self): self.inputs['words4'] = self.inputs['words3'] del self.inputs['words3'] with pytest.raises(ConfigurationError): self.token_embedder(self.inputs) self.inputs['words3'] = self.inputs['words4'] del self.inputs['words4'] def test_forward_concats_resultant_embeddings(self): assert self.token_embedder(self.inputs).size() == (1, 4, 10) def test_forward_works_on_higher_order_input(self): params = Params({ "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2, }, "characters": { "type": "character_encoding", "embedding": { "embedding_dim": 4, "num_embeddings": 15, }, "encoder": { "type": "cnn", "embedding_dim": 4, "num_filters": 10, "ngram_filter_sizes": [3], }, } }) token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params) inputs = { 'words': (torch.rand(3, 4, 5, 6) * 20).long(), 'characters': (torch.rand(3, 4, 5, 6, 7) * 15).long(), } assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12) def test_forward_runs_with_non_bijective_mapping(self): elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo' options_file = str(elmo_fixtures_path / 'options.json') weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5') params = Params({ "words": { "type": "embedding", "num_embeddings": 20, "embedding_dim": 2, }, "elmo": { "type": "elmo_token_embedder", "options_file": options_file, "weight_file": weight_file }, "embedder_to_indexer_map": { "words": ["words"], "elmo": ["elmo", "words"] } }) token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params) inputs = { 'words': (torch.rand(3, 6) * 20).long(), 'elmo': (torch.rand(3, 6, 50) * 15).long(), } token_embedder(inputs)
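# Sketch of what TestBasicTextFieldEmbedder checks: one TokenEmbedder per input
# key, with outputs concatenated along the last dimension. The flat dict-of-tensors
# input and dict-of-embedders constructor are the 0.x-era forms assumed by the
# tests above; later releases nest the input dicts.
import torch
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding

embedder = BasicTextFieldEmbedder({
    "words1": Embedding(num_embeddings=6, embedding_dim=2),
    "words2": Embedding(num_embeddings=6, embedding_dim=5),
})
inputs = {"words1": torch.randint(0, 6, (1, 4)),
          "words2": torch.randint(0, 6, (1, 4))}
assert embedder(inputs).shape == (1, 4, 2 + 5)  # output dims are concatenated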
class TestListField(AllenNlpTestCase): def setUp(self): self.vocab = Vocabulary() self.vocab.add_token_to_namespace("this", "words") self.vocab.add_token_to_namespace("is", "words") self.vocab.add_token_to_namespace("a", "words") self.vocab.add_token_to_namespace("sentence", 'words') self.vocab.add_token_to_namespace("s", 'characters') self.vocab.add_token_to_namespace("e", 'characters') self.vocab.add_token_to_namespace("n", 'characters') self.vocab.add_token_to_namespace("t", 'characters') self.vocab.add_token_to_namespace("c", 'characters') self.word_indexer = {"words": SingleIdTokenIndexer("words")} self.words_and_characters_indexer = { "words": SingleIdTokenIndexer("words"), "characters": TokenCharactersIndexer("characters") } self.field1 = TextField( [Token(t) for t in ["this", "is", "a", "sentence"]], self.word_indexer) self.field2 = TextField( [Token(t) for t in ["this", "is", "a", "different", "sentence"]], self.word_indexer) self.field3 = TextField( [Token(t) for t in ["this", "is", "another", "sentence"]], self.word_indexer) super(TestListField, self).setUp() def test_get_padding_lengths(self): list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) lengths = list_field.get_padding_lengths() assert lengths == {"num_fields": 3, "num_tokens": 5} def test_all_fields_padded_to_max_length(self): list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) array_dict = list_field.as_array(list_field.get_padding_lengths()) numpy.testing.assert_array_almost_equal(array_dict["words"][0], numpy.array([2, 3, 4, 5, 0])) numpy.testing.assert_array_almost_equal(array_dict["words"][1], numpy.array([2, 3, 4, 1, 5])) numpy.testing.assert_array_almost_equal(array_dict["words"][2], numpy.array([2, 3, 1, 5, 0])) def test_fields_can_pad_to_greater_than_max_length(self): list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() padding_lengths["num_tokens"] = 7 padding_lengths["num_fields"] = 5 array_dict = list_field.as_array(padding_lengths) numpy.testing.assert_array_almost_equal( array_dict["words"][0], numpy.array([2, 3, 4, 5, 0, 0, 0])) numpy.testing.assert_array_almost_equal( array_dict["words"][1], numpy.array([2, 3, 4, 1, 5, 0, 0])) numpy.testing.assert_array_almost_equal( array_dict["words"][2], numpy.array([2, 3, 1, 5, 0, 0, 0])) numpy.testing.assert_array_almost_equal( array_dict["words"][3], numpy.array([0, 0, 0, 0, 0, 0, 0])) numpy.testing.assert_array_almost_equal( array_dict["words"][4], numpy.array([0, 0, 0, 0, 0, 0, 0])) def test_as_array_can_handle_multiple_token_indexers(self): # pylint: disable=protected-access self.field1._token_indexers = self.words_and_characters_indexer self.field2._token_indexers = self.words_and_characters_indexer self.field3._token_indexers = self.words_and_characters_indexer list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() array_dict = list_field.as_array(padding_lengths) words = array_dict["words"] characters = array_dict["characters"] numpy.testing.assert_array_almost_equal( words, numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [2, 3, 1, 5, 0]])) numpy.testing.assert_array_almost_equal( characters[0], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [2, 3, 4, 5, 3, 4, 6, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]])) numpy.testing.assert_array_almost_equal( 
characters[1], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 3, 1, 3, 4, 5], [2, 3, 4, 5, 3, 4, 6, 3, 0]])) numpy.testing.assert_array_almost_equal( characters[2], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 4, 1, 5, 1, 3, 1, 0, 0], [2, 3, 4, 5, 3, 4, 6, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]]))
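# Sketch of the ListField behaviour exercised above: member fields are padded to
# the longest member, and the list itself can be padded with empty fields. The
# key names in get_padding_lengths() differ across the API versions shown in this
# file, so treat them as assumptions.
from allennlp.data.fields import ListField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary()
for word in ["this", "is", "a", "sentence"]:
    vocab.add_token_to_namespace(word, "words")
indexer = {"words": SingleIdTokenIndexer("words")}
shorter = TextField([Token(t) for t in ["a", "sentence"]], indexer)
longer = TextField([Token(t) for t in ["this", "is", "a", "sentence"]], indexer)
list_field = ListField([shorter, longer])
list_field.index(vocab)
lengths = list_field.get_padding_lengths()  # e.g. num_fields plus a token length
padded = list_field.as_tensor(lengths)      # `shorter` is zero-padded to 4 tokens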
class TestCharacterTokenIndexer(AllenNlpTestCase): def test_min_padding_length(self): sentence = "AllenNLP is awesome ." tokens = [Token(token) for token in sentence.split(" ")] vocab = Vocabulary() vocab.add_token_to_namespace("A", namespace="characters") # 2 vocab.add_token_to_namespace("l", namespace="characters") # 3 vocab.add_token_to_namespace("e", namespace="characters") # 4 vocab.add_token_to_namespace("n", namespace="characters") # 5 vocab.add_token_to_namespace("N", namespace="characters") # 6 vocab.add_token_to_namespace("L", namespace="characters") # 7 vocab.add_token_to_namespace("P", namespace="characters") # 8 vocab.add_token_to_namespace("i", namespace="characters") # 9 vocab.add_token_to_namespace("s", namespace="characters") # 10 vocab.add_token_to_namespace("a", namespace="characters") # 11 vocab.add_token_to_namespace("w", namespace="characters") # 12 vocab.add_token_to_namespace("o", namespace="characters") # 13 vocab.add_token_to_namespace("m", namespace="characters") # 14 vocab.add_token_to_namespace(".", namespace="characters") # 15 indexer = TokenCharactersIndexer("characters", min_padding_length=10) indices = indexer.tokens_to_indices(tokens, vocab) padded = indexer.as_padded_tensor_dict( indices, indexer.get_padding_lengths(indices)) assert padded["token_characters"].tolist() == [ [2, 3, 3, 4, 5, 6, 7, 8, 0, 0], [9, 10, 0, 0, 0, 0, 0, 0, 0, 0], [11, 12, 4, 10, 13, 14, 4, 0, 0, 0], [15, 0, 0, 0, 0, 0, 0, 0, 0, 0], ]
class TestListField(AllenNlpTestCase): def setUp(self): self.vocab = Vocabulary() self.vocab.add_token_to_namespace("this", "words") self.vocab.add_token_to_namespace("is", "words") self.vocab.add_token_to_namespace("a", "words") self.vocab.add_token_to_namespace("sentence", 'words') self.vocab.add_token_to_namespace("s", 'characters') self.vocab.add_token_to_namespace("e", 'characters') self.vocab.add_token_to_namespace("n", 'characters') self.vocab.add_token_to_namespace("t", 'characters') self.vocab.add_token_to_namespace("c", 'characters') for label in ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']: self.vocab.add_token_to_namespace(label, 'labels') self.word_indexer = {"words": SingleIdTokenIndexer("words")} self.words_and_characters_indexers = {"words": SingleIdTokenIndexer("words"), "characters": TokenCharactersIndexer("characters")} self.field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence"]], self.word_indexer) self.field2 = TextField([Token(t) for t in ["this", "is", "a", "different", "sentence"]], self.word_indexer) self.field3 = TextField([Token(t) for t in ["this", "is", "another", "sentence"]], self.word_indexer) self.empty_text_field = self.field1.empty_field() self.index_field = IndexField(1, self.field1) self.empty_index_field = self.index_field.empty_field() self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1) self.empty_sequence_label_field = self.sequence_label_field.empty_field() super(TestListField, self).setUp() def test_get_padding_lengths(self): list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) lengths = list_field.get_padding_lengths() assert lengths == {"num_fields": 3, "list_words_length": 5, "list_num_tokens": 5} def test_list_field_can_handle_empty_text_fields(self): list_field = ListField([self.field1, self.field2, self.empty_text_field]) list_field.index(self.vocab) tensor_dict = list_field.as_tensor(list_field.get_padding_lengths()) numpy.testing.assert_array_equal(tensor_dict["words"].detach().cpu().numpy(), numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [0, 0, 0, 0, 0]])) def test_list_field_can_handle_empty_index_fields(self): list_field = ListField([self.index_field, self.index_field, self.empty_index_field]) list_field.index(self.vocab) tensor = list_field.as_tensor(list_field.get_padding_lengths()) numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(), numpy.array([[1], [1], [-1]])) def test_list_field_can_handle_empty_sequence_label_fields(self): list_field = ListField([self.sequence_label_field, self.sequence_label_field, self.empty_sequence_label_field]) list_field.index(self.vocab) tensor = list_field.as_tensor(list_field.get_padding_lengths()) numpy.testing.assert_array_equal(tensor.detach().cpu().numpy(), numpy.array([[1, 1, 0, 1], [1, 1, 0, 1], [0, 0, 0, 0]])) def test_all_fields_padded_to_max_length(self): list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) tensor_dict = list_field.as_tensor(list_field.get_padding_lengths()) numpy.testing.assert_array_almost_equal(tensor_dict["words"][0].detach().cpu().numpy(), numpy.array([2, 3, 4, 5, 0])) numpy.testing.assert_array_almost_equal(tensor_dict["words"][1].detach().cpu().numpy(), numpy.array([2, 3, 4, 1, 5])) numpy.testing.assert_array_almost_equal(tensor_dict["words"][2].detach().cpu().numpy(), numpy.array([2, 3, 1, 5, 0])) def test_nested_list_fields_are_padded_correctly(self): nested_field1 = ListField([LabelField(c) for c in ['a', 'b', 'c', 'd', 'e']]) 
nested_field2 = ListField([LabelField(c) for c in ['f', 'g', 'h', 'i', 'j', 'k']]) list_field = ListField([nested_field1.empty_field(), nested_field1, nested_field2]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() assert padding_lengths == {'num_fields': 3, 'list_num_fields': 6} tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy() numpy.testing.assert_almost_equal(tensor, [[-1, -1, -1, -1, -1, -1], [0, 1, 2, 3, 4, -1], [5, 6, 7, 8, 9, 10]]) def test_fields_can_pad_to_greater_than_max_length(self): list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() padding_lengths["list_words_length"] = 7 padding_lengths["num_fields"] = 5 tensor_dict = list_field.as_tensor(padding_lengths) numpy.testing.assert_array_almost_equal(tensor_dict["words"][0].detach().cpu().numpy(), numpy.array([2, 3, 4, 5, 0, 0, 0])) numpy.testing.assert_array_almost_equal(tensor_dict["words"][1].detach().cpu().numpy(), numpy.array([2, 3, 4, 1, 5, 0, 0])) numpy.testing.assert_array_almost_equal(tensor_dict["words"][2].detach().cpu().numpy(), numpy.array([2, 3, 1, 5, 0, 0, 0])) numpy.testing.assert_array_almost_equal(tensor_dict["words"][3].detach().cpu().numpy(), numpy.array([0, 0, 0, 0, 0, 0, 0])) numpy.testing.assert_array_almost_equal(tensor_dict["words"][4].detach().cpu().numpy(), numpy.array([0, 0, 0, 0, 0, 0, 0])) def test_as_tensor_can_handle_multiple_token_indexers(self): # pylint: disable=protected-access self.field1._token_indexers = self.words_and_characters_indexers self.field2._token_indexers = self.words_and_characters_indexers self.field3._token_indexers = self.words_and_characters_indexers list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() tensor_dict = list_field.as_tensor(padding_lengths) words = tensor_dict["words"].detach().cpu().numpy() characters = tensor_dict["characters"].detach().cpu().numpy() numpy.testing.assert_array_almost_equal(words, numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [2, 3, 1, 5, 0]])) numpy.testing.assert_array_almost_equal(characters[0], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [2, 3, 4, 5, 3, 4, 6, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]])) numpy.testing.assert_array_almost_equal(characters[1], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 3, 1, 3, 4, 5], [2, 3, 4, 5, 3, 4, 6, 3, 0]])) numpy.testing.assert_array_almost_equal(characters[2], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 4, 1, 5, 1, 3, 1, 0, 0], [2, 3, 4, 5, 3, 4, 6, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]])) def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields(self): # pylint: disable=protected-access self.field1._token_indexers = self.words_and_characters_indexers self.field2._token_indexers = self.words_and_characters_indexers self.field3._token_indexers = self.words_and_characters_indexers list_field = ListField([self.field1.empty_field(), self.field1, self.field2]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() tensor_dict = list_field.as_tensor(padding_lengths) words = tensor_dict["words"].detach().cpu().numpy() characters = tensor_dict["characters"].detach().cpu().numpy() numpy.testing.assert_array_almost_equal(words, numpy.array([[0, 0, 0, 0, 0], [2, 3, 4, 5, 0], [2, 3, 4, 1, 5]])) 
numpy.testing.assert_array_almost_equal(characters[0], numpy.zeros([5, 9])) numpy.testing.assert_array_almost_equal(characters[1], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [2, 3, 4, 5, 3, 4, 6, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]])) numpy.testing.assert_array_almost_equal(characters[2], numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 3, 1, 3, 4, 5], [2, 3, 4, 5, 3, 4, 6, 3, 0]])) def test_printing_doesnt_crash(self): list_field = ListField([self.field1, self.field2]) print(list_field) def test_sequence_methods(self): list_field = ListField([self.field1, self.field2, self.field3]) assert len(list_field) == 3 assert list_field[1] == self.field2 assert [f for f in list_field] == [self.field1, self.field2, self.field3]
class SamplerTest(AllenNlpTestCase): def setup_method(self): super().setup_method() self.token_indexers = {"tokens": SingleIdTokenIndexer()} self.vocab = Vocabulary() self.this_index = self.vocab.add_token_to_namespace("this") self.is_index = self.vocab.add_token_to_namespace("is") self.a_index = self.vocab.add_token_to_namespace("a") self.sentence_index = self.vocab.add_token_to_namespace("sentence") self.another_index = self.vocab.add_token_to_namespace("another") self.yet_index = self.vocab.add_token_to_namespace("yet") self.very_index = self.vocab.add_token_to_namespace("very") self.long_index = self.vocab.add_token_to_namespace("long") instances = [ self.create_instance(["this", "is", "a", "sentence"]), self.create_instance(["this", "is", "another", "sentence"]), self.create_instance(["yet", "another", "sentence"]), self.create_instance([ "this", "is", "a", "very", "very", "very", "very", "long", "sentence" ]), self.create_instance(["sentence"]), ] self.instances = instances self.lazy_instances = LazyIterable(instances) def get_mock_reader(self) -> DatasetReader: class MockReader(DatasetReader): def __init__(self, instances, **kwargs): super().__init__(**kwargs) self.instances = instances def _read(self, file_path: str): for instance in self.instances: yield instance return MockReader(self.instances) def create_instance(self, str_tokens: List[str]): tokens = [Token(t) for t in str_tokens] instance = Instance({"text": TextField(tokens, self.token_indexers)}) instance.index_fields(self.vocab) return instance def create_instances_from_token_counts( self, token_counts: List[int]) -> List[Instance]: return [ self.create_instance(["word"] * count) for count in token_counts ] def get_batches_stats( self, batches: Iterable[Batch]) -> Dict[str, Union[int, List[int]]]: batches = list(batches) # materialize: the argument may be a one-shot iterator and is traversed twice below grouped_instances = [batch.instances for batch in batches] group_lengths = [len(group) for group in grouped_instances] sample_sizes = [] for batch in batches: batch_sequence_length = max( instance.get_padding_lengths()["text"]["tokens___tokens"] for instance in batch.instances) sample_sizes.append(batch_sequence_length * len(batch.instances)) return { "batch_lengths": group_lengths, "total_instances": sum(group_lengths), "sample_sizes": sample_sizes, } def assert_instances_are_correct(self, candidate_instances): # First we need to remove padding tokens from the candidates. candidate_instances = [ tuple(w for w in instance if w != 0) for instance in candidate_instances ] expected_instances = [ tuple(instance.fields["text"]._indexed_tokens["tokens"]["tokens"]) for instance in self.instances ] assert set(candidate_instances) == set(expected_instances)
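# The helpers above feed sampler tests (e.g. bucketing by length). A hypothetical,
# dependency-free illustration of the core idea -- sort instances by length so each
# batch wastes little padding -- is given here; `bucket` is not the library's
# sampler, just a sketch of the grouping it performs.
from typing import List

def bucket(lengths: List[int], batch_size: int) -> List[List[int]]:
    """Group instance indices into batches of similar length."""
    order = sorted(range(len(lengths)), key=lambda i: lengths[i])
    return [order[i:i + batch_size] for i in range(0, len(order), batch_size)]

# Instances of lengths 4, 4, 3, 9, 1 (as in the fixtures above), two per batch:
assert bucket([4, 4, 3, 9, 1], 2) == [[4, 2], [0, 1], [3]]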
class TestTextField(AllenNlpTestCase): def setUp(self): self.vocab = Vocabulary() self.vocab.add_token_to_namespace("sentence", namespace='words') self.vocab.add_token_to_namespace("A", namespace='words') self.vocab.add_token_to_namespace("A", namespace='characters') self.vocab.add_token_to_namespace("s", namespace='characters') self.vocab.add_token_to_namespace("e", namespace='characters') self.vocab.add_token_to_namespace("n", namespace='characters') self.vocab.add_token_to_namespace("t", namespace='characters') self.vocab.add_token_to_namespace("c", namespace='characters') super(TestTextField, self).setUp() def test_field_counts_vocab_items_correctly(self): field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={"words": SingleIdTokenIndexer("words")}) namespace_token_counts = defaultdict(lambda: defaultdict(int)) field.count_vocab_items(namespace_token_counts) assert namespace_token_counts["words"]["This"] == 1 assert namespace_token_counts["words"]["is"] == 1 assert namespace_token_counts["words"]["a"] == 1 assert namespace_token_counts["words"]["sentence"] == 1 assert namespace_token_counts["words"]["."] == 1 assert list(namespace_token_counts.keys()) == ["words"] field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={"characters": TokenCharactersIndexer("characters")}) namespace_token_counts = defaultdict(lambda: defaultdict(int)) field.count_vocab_items(namespace_token_counts) assert namespace_token_counts["characters"]["T"] == 1 assert namespace_token_counts["characters"]["h"] == 1 assert namespace_token_counts["characters"]["i"] == 2 assert namespace_token_counts["characters"]["s"] == 3 assert namespace_token_counts["characters"]["a"] == 1 assert namespace_token_counts["characters"]["e"] == 3 assert namespace_token_counts["characters"]["n"] == 2 assert namespace_token_counts["characters"]["t"] == 1 assert namespace_token_counts["characters"]["c"] == 1 assert namespace_token_counts["characters"]["."] == 1 assert list(namespace_token_counts.keys()) == ["characters"] field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={"words": SingleIdTokenIndexer("words"), "characters": TokenCharactersIndexer("characters")}) namespace_token_counts = defaultdict(lambda: defaultdict(int)) field.count_vocab_items(namespace_token_counts) assert namespace_token_counts["characters"]["T"] == 1 assert namespace_token_counts["characters"]["h"] == 1 assert namespace_token_counts["characters"]["i"] == 2 assert namespace_token_counts["characters"]["s"] == 3 assert namespace_token_counts["characters"]["a"] == 1 assert namespace_token_counts["characters"]["e"] == 3 assert namespace_token_counts["characters"]["n"] == 2 assert namespace_token_counts["characters"]["t"] == 1 assert namespace_token_counts["characters"]["c"] == 1 assert namespace_token_counts["characters"]["."] == 1 assert namespace_token_counts["words"]["This"] == 1 assert namespace_token_counts["words"]["is"] == 1 assert namespace_token_counts["words"]["a"] == 1 assert namespace_token_counts["words"]["sentence"] == 1 assert namespace_token_counts["words"]["."] == 1 assert set(namespace_token_counts.keys()) == {"words", "characters"} def test_index_converts_field_correctly(self): vocab = Vocabulary() sentence_index = vocab.add_token_to_namespace("sentence", namespace='words') capital_a_index = vocab.add_token_to_namespace("A", namespace='words') capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters') s_index = 
vocab.add_token_to_namespace("s", namespace='characters') e_index = vocab.add_token_to_namespace("e", namespace='characters') n_index = vocab.add_token_to_namespace("n", namespace='characters') t_index = vocab.add_token_to_namespace("t", namespace='characters') c_index = vocab.add_token_to_namespace("c", namespace='characters') field = TextField([Token(t) for t in ["A", "sentence"]], {"words": SingleIdTokenIndexer(namespace="words")}) field.index(vocab) # pylint: disable=protected-access assert field._indexed_tokens["words"] == [capital_a_index, sentence_index] field1 = TextField([Token(t) for t in ["A", "sentence"]], {"characters": TokenCharactersIndexer(namespace="characters")}) field1.index(vocab) assert field1._indexed_tokens["characters"] == [[capital_a_char_index], [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index]] field2 = TextField([Token(t) for t in ["A", "sentence"]], token_indexers={"words": SingleIdTokenIndexer(namespace="words"), "characters": TokenCharactersIndexer(namespace="characters")}) field2.index(vocab) assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index] assert field2._indexed_tokens["characters"] == [[capital_a_char_index], [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index]] # pylint: enable=protected-access def test_get_padding_lengths_raises_if_no_indexed_tokens(self): field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={"words": SingleIdTokenIndexer("words")}) with pytest.raises(ConfigurationError): field.get_padding_lengths() def test_padding_lengths_are_computed_correctly(self): field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={"words": SingleIdTokenIndexer("words")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() assert padding_lengths == {"words_length": 5, "num_tokens": 5} field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={"characters": TokenCharactersIndexer("characters")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() assert padding_lengths == {"num_tokens": 5, "characters_length": 5, "num_token_characters": 8} field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={"characters": TokenCharactersIndexer("characters"), "words": SingleIdTokenIndexer("words")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() assert padding_lengths == {"num_tokens": 5, "characters_length": 5, "words_length": 5, "num_token_characters": 8} def test_as_tensor_handles_words(self): field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={"words": SingleIdTokenIndexer("words")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() tensor_dict = field.as_tensor(padding_lengths) numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(), numpy.array([1, 1, 1, 2, 1])) def test_as_tensor_handles_longer_lengths(self): field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]], token_indexers={"words": SingleIdTokenIndexer("words")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() padding_lengths["words_length"] = 10 tensor_dict = field.as_tensor(padding_lengths) numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(), numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0])) def test_as_tensor_handles_characters(self): field = TextField([Token(t) for t in ["This", "is", 
"a", "sentence", "."]], token_indexers={"characters": TokenCharactersIndexer("characters")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() tensor_dict = field.as_tensor(padding_lengths) expected_character_array = numpy.array([[1, 1, 1, 3, 0, 0, 0, 0], [1, 3, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [3, 4, 5, 6, 4, 5, 7, 4], [1, 0, 0, 0, 0, 0, 0, 0]]) numpy.testing.assert_array_almost_equal(tensor_dict["characters"].detach().cpu().numpy(), expected_character_array) def test_as_tensor_handles_words_and_characters_with_longer_lengths(self): field = TextField([Token(t) for t in ["a", "sentence", "."]], token_indexers={"words": SingleIdTokenIndexer("words"), "characters": TokenCharactersIndexer("characters")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() padding_lengths["words_length"] = 5 padding_lengths["characters_length"] = 5 padding_lengths["num_token_characters"] = 10 tensor_dict = field.as_tensor(padding_lengths) numpy.testing.assert_array_almost_equal(tensor_dict["words"].detach().cpu().numpy(), numpy.array([1, 2, 1, 0, 0])) numpy.testing.assert_array_almost_equal(tensor_dict["characters"].detach().cpu().numpy(), numpy.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [3, 4, 5, 6, 4, 5, 7, 4, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])) def test_printing_doesnt_crash(self): field = TextField([Token(t) for t in ["A", "sentence"]], {"words": SingleIdTokenIndexer(namespace="words")}) print(field) def test_token_indexer_returns_dict(self): field = TextField([Token(t) for t in ["A", "sentence"]], token_indexers={"field_with_dict": DictReturningTokenIndexer(), "words": SingleIdTokenIndexer("words"), "characters": TokenCharactersIndexer("characters")}) field.index(self.vocab) padding_lengths = field.get_padding_lengths() assert padding_lengths == { 'token_ids_length': 5, 'additional_key_length': 2, 'words_length': 2, 'characters_length': 2, 'num_token_characters': 8, 'num_tokens': 5, } padding_lengths['token_ids_length'] = 7 padding_lengths['additional_key_length'] = 3 padding_lengths['words_length'] = 4 padding_lengths['characters_length'] = 4 tensors = field.as_tensor(padding_lengths) assert list(tensors['token_ids'].shape) == [7] assert list(tensors['additional_key'].shape) == [3] assert list(tensors['words'].shape) == [4] assert list(tensors['characters'].shape) == [4, 8] def test_sequence_methods(self): field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]], {}) assert len(field) == 5 assert field[1].text == "is" assert [token.text for token in field] == ["This", "is", "a", "sentence", "."]

class TestListField(AllenNlpTestCase):
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", "words")
        self.vocab.add_token_to_namespace("s", "characters")
        self.vocab.add_token_to_namespace("e", "characters")
        self.vocab.add_token_to_namespace("n", "characters")
        self.vocab.add_token_to_namespace("t", "characters")
        self.vocab.add_token_to_namespace("c", "characters")
        for label in ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]:
            self.vocab.add_token_to_namespace(label, "labels")

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters", min_padding_length=1),
        }
        self.field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence"]], self.word_indexer
        )
        self.field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence"]], self.word_indexer
        )
        self.field3 = TextField(
            [Token(t) for t in ["this", "is", "another", "sentence"]], self.word_indexer
        )

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        tokenizer = SpacyTokenizer()
        tokens = tokenizer.tokenize("Foo")
        text_field = TextField(tokens, self.word_indexer)
        empty_list_field = ListField([text_field.empty_field()])
        empty_fields = {"list_tensor": empty_list_field}
        self.empty_instance = Instance(empty_fields)

        non_empty_list_field = ListField([text_field])
        non_empty_fields = {"list_tensor": non_empty_list_field}
        self.non_empty_instance = Instance(non_empty_fields)

        super().setUp()

    def test_get_padding_lengths(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        lengths = list_field.get_padding_lengths()
        assert lengths == {"num_fields": 3, "list_words___tokens": 5}

    def test_list_field_can_handle_empty_text_fields(self):
        list_field = ListField([self.field1, self.field2, self.empty_text_field])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(
            tensor_dict["words"]["tokens"].detach().cpu().numpy(),
            numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [0, 0, 0, 0, 0]]),
        )

    def test_list_field_can_handle_empty_index_fields(self):
        list_field = ListField([self.index_field, self.index_field, self.empty_index_field])
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(
            tensor.detach().cpu().numpy(), numpy.array([[1], [1], [-1]])
        )

    def test_list_field_can_handle_empty_sequence_label_fields(self):
        list_field = ListField(
            [self.sequence_label_field, self.sequence_label_field, self.empty_sequence_label_field]
        )
        list_field.index(self.vocab)
        tensor = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_equal(
            tensor.detach().cpu().numpy(), numpy.array([[1, 1, 0, 1], [1, 1, 0, 1], [0, 0, 0, 0]])
        )

    def test_all_fields_padded_to_max_length(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        list_field.index(self.vocab)
        tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][0].detach().cpu().numpy(), numpy.array([2, 3, 4, 5, 0])
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][1].detach().cpu().numpy(), numpy.array([2, 3, 4, 1, 5])
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"][2].detach().cpu().numpy(), numpy.array([2, 3, 1, 5, 0])
        )
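
    # In the nested case below, the outer list is padded to "num_fields" and
    # every inner list of labels to the longest inner list; missing entries are
    # filled with -1, the empty/padding value a LabelField's empty_field()
    # produces, which is why the empty nested field becomes a row of -1s.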
tensor_dict["words"]["tokens"][0].detach().cpu().numpy(), numpy.array([2, 3, 4, 5, 0]) ) numpy.testing.assert_array_almost_equal( tensor_dict["words"]["tokens"][1].detach().cpu().numpy(), numpy.array([2, 3, 4, 1, 5]) ) numpy.testing.assert_array_almost_equal( tensor_dict["words"]["tokens"][2].detach().cpu().numpy(), numpy.array([2, 3, 1, 5, 0]) ) def test_nested_list_fields_are_padded_correctly(self): nested_field1 = ListField([LabelField(c) for c in ["a", "b", "c", "d", "e"]]) nested_field2 = ListField([LabelField(c) for c in ["f", "g", "h", "i", "j", "k"]]) list_field = ListField([nested_field1.empty_field(), nested_field1, nested_field2]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() assert padding_lengths == {"num_fields": 3, "list_num_fields": 6} tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy() numpy.testing.assert_almost_equal( tensor, [[-1, -1, -1, -1, -1, -1], [0, 1, 2, 3, 4, -1], [5, 6, 7, 8, 9, 10]] ) def test_fields_can_pad_to_greater_than_max_length(self): list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() padding_lengths["list_words___tokens"] = 7 padding_lengths["num_fields"] = 5 tensor_dict = list_field.as_tensor(padding_lengths) numpy.testing.assert_array_almost_equal( tensor_dict["words"]["tokens"][0].detach().cpu().numpy(), numpy.array([2, 3, 4, 5, 0, 0, 0]), ) numpy.testing.assert_array_almost_equal( tensor_dict["words"]["tokens"][1].detach().cpu().numpy(), numpy.array([2, 3, 4, 1, 5, 0, 0]), ) numpy.testing.assert_array_almost_equal( tensor_dict["words"]["tokens"][2].detach().cpu().numpy(), numpy.array([2, 3, 1, 5, 0, 0, 0]), ) numpy.testing.assert_array_almost_equal( tensor_dict["words"]["tokens"][3].detach().cpu().numpy(), numpy.array([0, 0, 0, 0, 0, 0, 0]), ) numpy.testing.assert_array_almost_equal( tensor_dict["words"]["tokens"][4].detach().cpu().numpy(), numpy.array([0, 0, 0, 0, 0, 0, 0]), ) def test_as_tensor_can_handle_multiple_token_indexers(self): self.field1._token_indexers = self.words_and_characters_indexers self.field2._token_indexers = self.words_and_characters_indexers self.field3._token_indexers = self.words_and_characters_indexers list_field = ListField([self.field1, self.field2, self.field3]) list_field.index(self.vocab) padding_lengths = list_field.get_padding_lengths() tensor_dict = list_field.as_tensor(padding_lengths) words = tensor_dict["words"]["tokens"].detach().cpu().numpy() characters = tensor_dict["characters"]["token_characters"].detach().cpu().numpy() numpy.testing.assert_array_almost_equal( words, numpy.array([[2, 3, 4, 5, 0], [2, 3, 4, 1, 5], [2, 3, 1, 5, 0]]) ) numpy.testing.assert_array_almost_equal( characters[0], numpy.array( [ [5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [2, 3, 4, 5, 3, 4, 6, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], ] ), ) numpy.testing.assert_array_almost_equal( characters[1], numpy.array( [ [5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 3, 1, 3, 4, 5], [2, 3, 4, 5, 3, 4, 6, 3, 0], ] ), ) numpy.testing.assert_array_almost_equal( characters[2], numpy.array( [ [5, 1, 1, 2, 0, 0, 0, 0, 0], [1, 2, 0, 0, 0, 0, 0, 0, 0], [1, 4, 1, 5, 1, 3, 1, 0, 0], [2, 3, 4, 5, 3, 4, 6, 3, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], ] ), ) def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields(self): self.field1._token_indexers = self.words_and_characters_indexers self.field2._token_indexers = 
    def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields(self):
        self.field1._token_indexers = self.words_and_characters_indexers
        self.field2._token_indexers = self.words_and_characters_indexers
        self.field3._token_indexers = self.words_and_characters_indexers
        list_field = ListField([self.field1.empty_field(), self.field1, self.field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        tensor_dict = list_field.as_tensor(padding_lengths)
        words = tensor_dict["words"]["tokens"].detach().cpu().numpy()
        characters = tensor_dict["characters"]["token_characters"].detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(
            words, numpy.array([[0, 0, 0, 0, 0], [2, 3, 4, 5, 0], [2, 3, 4, 1, 5]])
        )
        numpy.testing.assert_array_almost_equal(characters[0], numpy.zeros([5, 9]))
        numpy.testing.assert_array_almost_equal(
            characters[1],
            numpy.array(
                [
                    [5, 1, 1, 2, 0, 0, 0, 0, 0],
                    [1, 2, 0, 0, 0, 0, 0, 0, 0],
                    [1, 0, 0, 0, 0, 0, 0, 0, 0],
                    [2, 3, 4, 5, 3, 4, 6, 3, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0, 0],
                ]
            ),
        )
        numpy.testing.assert_array_almost_equal(
            characters[2],
            numpy.array(
                [
                    [5, 1, 1, 2, 0, 0, 0, 0, 0],
                    [1, 2, 0, 0, 0, 0, 0, 0, 0],
                    [1, 0, 0, 0, 0, 0, 0, 0, 0],
                    [1, 1, 1, 1, 3, 1, 3, 4, 5],
                    [2, 3, 4, 5, 3, 4, 6, 3, 0],
                ]
            ),
        )

    def test_printing_doesnt_crash(self):
        list_field = ListField([self.field1, self.field2])
        print(list_field)

    def test_sequence_methods(self):
        list_field = ListField([self.field1, self.field2, self.field3])
        assert len(list_field) == 3
        assert list_field[1] == self.field2
        assert [f for f in list_field] == [self.field1, self.field2, self.field3]

    def test_empty_list_can_be_tensorized(self):
        tokenizer = SpacyTokenizer()
        tokens = tokenizer.tokenize("Foo")
        text_field = TextField(tokens, self.word_indexer)
        list_field = ListField([text_field.empty_field()])
        fields = {
            "list": list_field,
            "bar": TextField(tokenizer.tokenize("BAR"), self.word_indexer),
        }
        instance = Instance(fields)
        instance.index_fields(self.vocab)
        instance.as_tensor_dict()

    def test_batch_with_some_empty_lists_works(self):
        dataset = [self.empty_instance, self.non_empty_instance]

        model = DummyModel(self.vocab)
        model.eval()
        iterator = BasicIterator(batch_size=2)
        iterator.index_with(self.vocab)
        batch = next(iterator(dataset, shuffle=False))
        model.forward(**batch)

    # This use case may seem a bit peculiar. It's intended for situations where
    # you have sparse inputs that are used as additional features for some
    # prediction, and they are sparse enough that they can be empty for some
    # cases. It would be silly to try to handle these as None in your model; it
    # makes a whole lot more sense to just have a minimally-sized tensor that
    # gets entirely masked and has no effect on the rest of the model.
    def test_batch_of_entirely_empty_lists_works(self):
        dataset = [self.empty_instance, self.empty_instance]

        model = DummyModel(self.vocab)
        model.eval()
        iterator = BasicIterator(batch_size=2)
        iterator.index_with(self.vocab)
        batch = next(iterator(dataset, shuffle=False))
        model.forward(**batch)
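
# DummyModel above comes from the shared test fixtures and is not defined in
# this file. Purely as an illustrative sketch of the kind of model the two
# batching tests exercise -- the class name, embedding size, and the
# num_wrapping_dims choice below are assumptions, not the real DummyModel --
# such a model could embed the "list_tensor" input and reduce it like this:

from typing import Dict

import torch

from allennlp.data import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding


class SketchListModel(Model):
    """Hypothetical stand-in: embeds "list_tensor" and sums it into a loss."""

    def __init__(self, vocab: Vocabulary) -> None:
        super().__init__(vocab)
        embedding = Embedding(embedding_dim=4,
                              num_embeddings=vocab.get_vocab_size("words"))
        self._embedder = BasicTextFieldEmbedder({"words": embedding})

    def forward(self, list_tensor) -> Dict[str, torch.Tensor]:
        # The ListField wraps each TextField tensor in one extra dimension, so
        # the embedder is told to step over it. An entirely empty list arrives
        # fully padded (all zeros) and embeds to padding vectors that a mask
        # would zero out downstream, matching the comment above.
        embedded = self._embedder(list_tensor, num_wrapping_dims=1)
        return {"loss": embedded.sum()}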