Example #1
 def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace("word")
     vocab.add_token_to_namespace("word2")
     unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
     vocab.add_token_to_namespace(unicode_space)
     embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
     with gzip.open(embeddings_filename, 'wb') as embeddings_file:
         embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
         embeddings_file.write(
             f"{unicode_space} 3.4 3.3 5.0\n".encode('utf-8'))
     params = Params({
         'pretrained_file': embeddings_filename,
         'embedding_dim': 3,
     })
     embedding_layer = Embedding.from_params(vocab, params)
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         "word")]
     assert numpy.allclose(word_vector.numpy(),
                           numpy.array([1.0, 2.3, -1.0]))
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         unicode_space)]
     assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3,
                                                             5.0]))
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         "word2")]
     assert not numpy.allclose(word_vector.numpy(),
                               numpy.array([1.0, 2.3, -1.0]))
Example #2
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
    (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
        A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
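As a usage note, here is a minimal sketch of calling batch_to_ids on a small batch. The example sentences are hypothetical; the printed shape assumes ELMo's standard width of 50 character ids per word and padding to the longest sentence in the batch.

# Minimal usage sketch for batch_to_ids (the sentences are illustrative).
batch = [["I", "like", "cats"], ["dogs", "bark"]]
character_ids = batch_to_ids(batch)
# One row per sentence, padded to the longest sentence, with 50 character ids per word.
print(character_ids.shape)  # expected: torch.Size([2, 3, 50])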
Example #3
    def test_make_vocab_without_extension(self):
        existing_serialization_dir = self.TEST_DIR / 'existing'
        extended_serialization_dir = self.TEST_DIR / 'extended'
        existing_vocab_path = existing_serialization_dir / 'vocabulary'
        extended_vocab_path = extended_serialization_dir / 'vocabulary'

        vocab = Vocabulary()
        vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
        vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
        # If extend is False, it is the user's responsibility to make sure that dataset
        # instances can be indexed by the provided vocabulary. At minimum, @@UNKNOWN@@ should
        # be present in any namespace that may contain OOV entries seen during indexing.
        # The `tokens` namespace will see new words, but it has an @@UNKNOWN@@ token;
        # the `labels` namespace has no @@UNKNOWN@@, so 'N' and 'V' must be added upfront.
        vocab.add_token_to_namespace('N', namespace='labels')
        vocab.add_token_to_namespace('V', namespace='labels')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = existing_vocab_path
        self.params['vocabulary']['extend'] = False
        make_vocab_from_params(self.params, extended_serialization_dir)

        with open(extended_vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == '@@UNKNOWN@@'
        assert tokens[1] == 'some_weird_token_1'
        assert tokens[2] == 'some_weird_token_2'
        assert len(tokens) == 3
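For reference, the test above exercises the non-extension path. Below is a minimal sketch of the corresponding 'vocabulary' configuration block; the keys mirror those set in the test, and the directory path is hypothetical.

# Sketch of the "vocabulary" params when reusing a saved vocabulary without extension.
vocabulary_params = {
    'directory_path': '/path/to/existing/vocabulary',  # hypothetical path
    'extend': False,  # no new tokens are added, so OOV entries must fall back to @@UNKNOWN@@
}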
Example #4
    def setUp(self):
        super(IteratorTest, self).setUp()
        self.token_indexers = {"tokens": SingleIdTokenIndexer()}
        self.vocab = Vocabulary()
        self.this_index = self.vocab.add_token_to_namespace('this')
        self.is_index = self.vocab.add_token_to_namespace('is')
        self.a_index = self.vocab.add_token_to_namespace('a')
        self.sentence_index = self.vocab.add_token_to_namespace('sentence')
        self.another_index = self.vocab.add_token_to_namespace('another')
        self.yet_index = self.vocab.add_token_to_namespace('yet')
        self.very_index = self.vocab.add_token_to_namespace('very')
        self.long_index = self.vocab.add_token_to_namespace('long')
        instances = [
            self.create_instance(["this", "is", "a", "sentence"]),
            self.create_instance(["this", "is", "another", "sentence"]),
            self.create_instance(["yet", "another", "sentence"]),
            self.create_instance([
                "this", "is", "a", "very", "very", "very", "very", "long",
                "sentence"
            ]),
            self.create_instance(["sentence"]),
        ]

        class LazyIterable:
            def __iter__(self):
                return (instance for instance in instances)

        self.instances = instances
        self.lazy_instances = LazyIterable()
Example #5
    def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace='words')
        capital_a_index = vocab.add_token_to_namespace("A", namespace='words')
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace='characters')
        s_index = vocab.add_token_to_namespace("s", namespace='characters')
        e_index = vocab.add_token_to_namespace("e", namespace='characters')
        n_index = vocab.add_token_to_namespace("n", namespace='characters')
        t_index = vocab.add_token_to_namespace("t", namespace='characters')
        c_index = vocab.add_token_to_namespace("c", namespace='characters')

        field = TextField([Token(t) for t in ["A", "sentence"]],
                          {"words": SingleIdTokenIndexer(namespace="words")})
        field.index(vocab)
        # pylint: disable=protected-access
        assert field._indexed_tokens["words"] == [capital_a_index, sentence_index]

        field1 = TextField([Token(t) for t in ["A", "sentence"]],
                           {"characters": TokenCharactersIndexer(namespace="characters")})
        field1.index(vocab)
        assert field1._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
        field2 = TextField([Token(t) for t in ["A", "sentence"]],
                           token_indexers={"words": SingleIdTokenIndexer(namespace="words"),
                                           "characters": TokenCharactersIndexer(namespace="characters")})
        field2.index(vocab)
        assert field2._indexed_tokens["words"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"] == [[capital_a_char_index],
                                                        [s_index, e_index, n_index, t_index,
                                                         e_index, n_index, c_index, e_index]]
Example #6
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
            "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
            "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
            "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
            "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
        }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
            'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
            'embedding_dim': 5
        })
        with pytest.raises(
                ValueError,
                message="No ValueError when pretrained_file is a multi-file archive"):
            Embedding.from_params(vocab, params)

        for ext in ['.zip', '.tar.gz']:
            archive_path = str(
                self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
            file_uri = format_embeddings_file_uri(
                archive_path, 'folder/fake_embeddings.5d.txt')
            params = Params({'pretrained_file': file_uri, 'embedding_dim': 5})
            embeddings = Embedding.from_params(vocab, params).weight.data
            for tok, vec in token2vec.items():
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i],
                                   vec), 'Problem with format ' + archive_path
Example #7
 def setUp(self):
     super(TestTokenCharactersEncoder, self).setUp()
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("1", "token_characters")
     self.vocab.add_token_to_namespace("2", "token_characters")
     self.vocab.add_token_to_namespace("3", "token_characters")
     self.vocab.add_token_to_namespace("4", "token_characters")
     params = Params({
         "embedding": {
             "embedding_dim": 2,
             "vocab_namespace": "token_characters"
         },
         "encoder": {
             "type": "cnn",
             "embedding_dim": 2,
             "num_filters": 4,
             "ngram_filter_sizes": [1, 2],
             "output_dim": 3
         }
     })
     self.encoder = TokenCharactersEncoder.from_params(
         vocab=self.vocab, params=deepcopy(params))
     self.embedding = Embedding.from_params(vocab=self.vocab,
                                            params=params["embedding"])
     self.inner_encoder = Seq2VecEncoder.from_params(params["encoder"])
     constant_init = lambda tensor: torch.nn.init.constant_(tensor, 1.)
     initializer = InitializerApplicator([(".*", constant_init)])
     initializer(self.encoder)
     initializer(self.embedding)
     initializer(self.inner_encoder)
Example #8
    def test_elmo_as_array_produces_token_sequence(self):  # pylint: disable=invalid-name
        indexer = ELMoTokenCharactersIndexer()
        tokens = [Token('Second'), Token('.')]
        indices = indexer.tokens_to_indices(tokens, Vocabulary(),
                                            "test-elmo")["test-elmo"]
        padded_tokens = indexer.pad_token_sequence(
            {'test-elmo': indices},
            desired_num_tokens={'test-elmo': 3},
            padding_lengths={})
        expected_padded_tokens = [
            [
                259, 84, 102, 100, 112, 111, 101, 260, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261
            ],
            [
                259, 47, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
                261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261
            ],
            [
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0
            ]
        ]

        assert padded_tokens['test-elmo'] == expected_padded_tokens
Example #9
 def setUp(self):
     super(TestBasicTextFieldEmbedder, self).setUp()
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("1")
     self.vocab.add_token_to_namespace("2")
     self.vocab.add_token_to_namespace("3")
     self.vocab.add_token_to_namespace("4")
     params = Params({
         "words1": {
             "type": "embedding",
             "embedding_dim": 2
         },
         "words2": {
             "type": "embedding",
             "embedding_dim": 5
         },
         "words3": {
             "type": "embedding",
             "embedding_dim": 3
         }
     })
     self.token_embedder = BasicTextFieldEmbedder.from_params(
         vocab=self.vocab, params=params)
     self.inputs = {
         "words1": torch.LongTensor([[0, 2, 3, 5]]),
         "words2": torch.LongTensor([[1, 4, 3, 2]]),
         "words3": torch.LongTensor([[1, 5, 1, 2]])
     }
Example #10
 def setUp(self):
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("this")
     self.vocab.add_token_to_namespace("is")
     self.vocab.add_token_to_namespace("a")
     self.vocab.add_token_to_namespace("sentence")
     self.vocab.add_token_to_namespace(".")
     self.token_indexer = {"tokens": SingleIdTokenIndexer()}
     self.instances = self.get_instances()
     super(TestDataset, self).setUp()
Example #11
    def test_label_field_can_index_with_vocab(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("entailment", namespace="labels")
        vocab.add_token_to_namespace("contradiction", namespace="labels")
        vocab.add_token_to_namespace("neutral", namespace="labels")

        label = LabelField("entailment")
        label.index(vocab)
        tensor = label.as_tensor(label.get_padding_lengths())
        assert tensor.item() == 0
Example #12
 def setUp(self):
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("sentence", namespace='words')
     self.vocab.add_token_to_namespace("A", namespace='words')
     self.vocab.add_token_to_namespace("A", namespace='characters')
     self.vocab.add_token_to_namespace("s", namespace='characters')
     self.vocab.add_token_to_namespace("e", namespace='characters')
     self.vocab.add_token_to_namespace("n", namespace='characters')
     self.vocab.add_token_to_namespace("t", namespace='characters')
     self.vocab.add_token_to_namespace("c", namespace='characters')
     super(TestTextField, self).setUp()
Example #13
 def test_bos_to_char_ids(self):
     indexer = ELMoTokenCharactersIndexer()
     indices = indexer.tokens_to_indices([Token('<S>')], Vocabulary(),
                                         "test-elmo")
     expected_indices = [
         259, 257, 260, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261
     ]
     assert indices == {"test-elmo": [expected_indices]}
Example #14
 def test_unicode_to_char_ids(self):
     indexer = ELMoTokenCharactersIndexer()
     indices = indexer.tokens_to_indices([Token(chr(256) + 't')],
                                         Vocabulary(), "test-unicode")
     expected_indices = [
         259, 197, 129, 117, 260, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,
         261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261
     ]
     assert indices == {"test-unicode": [expected_indices]}
Example #15
    def test_tokens_to_indices_produces_correct_characters(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace='characters')
        vocab.add_token_to_namespace("s", namespace='characters')
        vocab.add_token_to_namespace("e", namespace='characters')
        vocab.add_token_to_namespace("n", namespace='characters')
        vocab.add_token_to_namespace("t", namespace='characters')
        vocab.add_token_to_namespace("c", namespace='characters')

        indexer = TokenCharactersIndexer("characters")
        indices = indexer.tokens_to_indices([Token("sentential")], vocab, "char")
        assert indices == {"char": [[3, 4, 5, 6, 4, 5, 6, 1, 1, 1]]}
Example #16
 def test_get_embedding_layer_uses_correct_embedding_dim(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace('word1')
     vocab.add_token_to_namespace('word2')
     embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
     with gzip.open(embeddings_filename, 'wb') as embeddings_file:
         embeddings_file.write("word1 1.0 2.3 -1.0\n".encode('utf-8'))
         embeddings_file.write("word2 0.1 0.4 -4.0\n".encode('utf-8'))
     embedding_weights = _read_pretrained_embeddings_file(
         embeddings_filename, 3, vocab)
     assert tuple(embedding_weights.size()) == (
         4, 3)  # 4 because of padding and OOV
     with pytest.raises(ConfigurationError):
         _read_pretrained_embeddings_file(embeddings_filename, 4, vocab)
Example #17
 def test_tokens_to_indices_uses_ner_tags(self):
     tokens = self.tokenizer.split_words("Larry Page is CEO of Google.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     person_index = vocab.add_token_to_namespace('PERSON',
                                                 namespace='ner_tags')
     none_index = vocab.add_token_to_namespace('NONE', namespace='ner_tags')
     vocab.add_token_to_namespace('ORG', namespace='ner_tags')
     indexer = NerTagIndexer()
     assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {
         "tokens1": [person_index]
     }
     assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {
         "tokens-1": [none_index]
     }
Example #18
 def test_tokens_to_indices_uses_pos_tags(self):
     tokens = self.tokenizer.split_words("This is a sentence.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     root_index = vocab.add_token_to_namespace('ROOT',
                                               namespace='dep_labels')
     none_index = vocab.add_token_to_namespace('NONE',
                                               namespace='dep_labels')
     indexer = DepLabelIndexer()
     assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {
         "tokens1": [root_index]
     }
     assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {
         "tokens-1": [none_index]
     }
Example #19
    def test_elmo_token_representation_bos_eos(self):
        # The additional <S> and </S> embeddings added by the embedder should be as expected.
        indexer = ELMoTokenCharactersIndexer()

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file,
                                                    self.weight_file)

        for correct_index, token in [[0, '<S>'], [2, '</S>']]:
            indices = indexer.tokens_to_indices([Token(token)], Vocabulary(),
                                                "correct")
            indices = torch.from_numpy(numpy.array(indices["correct"])).view(
                1, 1, -1)
            embeddings = elmo_token_embedder(indices)['token_embedding']
            assert numpy.allclose(embeddings[0, correct_index, :].data.numpy(),
                                  embeddings[0, 1, :].data.numpy())
Example #20
    def _sentences_to_ids(self, sentences):
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for sentence in sentences:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({'elmo': field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)
        return dataset.as_tensor_dict()['elmo']['character_ids']
Example #21
    def test_elmo_bilm(self):
        # get the raw data
        sentences, expected_lm_embeddings = self._load_sentences_embeddings()

        # load the test model
        elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

        # Deal with the data.
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for batch in zip(*sentences):
            for sentence in batch:
                tokens = [Token(token) for token in sentence.split()]
                field = TextField(tokens, {'character_ids': indexer})
                instance = Instance({"elmo": field})
                instances.append(instance)

        vocab = Vocabulary()

        # Now finally we can iterate through batches.
        iterator = BasicIterator(3)
        iterator.index_with(vocab)
        for i, batch in enumerate(
                iterator(instances, num_epochs=1, shuffle=False)):
            lm_embeddings = elmo_bilm(batch['elmo']['character_ids'])
            top_layer_embeddings, mask = remove_sentence_boundaries(
                lm_embeddings['activations'][2], lm_embeddings['mask'])

            # check the mask lengths
            lengths = mask.data.numpy().sum(axis=1)
            batch_sentences = [sentences[k][i] for k in range(3)]
            expected_lengths = [
                len(sentence.split()) for sentence in batch_sentences
            ]
            self.assertEqual(lengths.tolist(), expected_lengths)

            # get the expected embeddings and compare!
            expected_top_layer = [
                expected_lm_embeddings[k][i] for k in range(3)
            ]
            for k in range(3):
                self.assertTrue(
                    numpy.allclose(
                        top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                        expected_top_layer[k],
                        atol=1.0e-6))
Example #22
    def test_read_hdf5_raises_on_invalid_shape(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5")
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
        with h5py.File(embeddings_filename, 'w') as fout:
            _ = fout.create_dataset('embedding',
                                    embeddings.shape,
                                    dtype='float32',
                                    data=embeddings)

        params = Params({
            'pretrained_file': embeddings_filename,
            'embedding_dim': 5,
        })
        with pytest.raises(ConfigurationError):
            _ = Embedding.from_params(vocab, params)
Example #23
 def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(
         self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace("word")
     vocab.add_token_to_namespace("word2")
     embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
     with gzip.open(embeddings_filename, 'wb') as embeddings_file:
         embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
     params = Params({
         'pretrained_file': embeddings_filename,
         'embedding_dim': 3,
     })
     embedding_layer = Embedding.from_params(vocab, params)
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         "word2")]
     assert not numpy.allclose(word_vector.numpy(),
                               numpy.array([0.0, 0.0, 0.0]))
Example #24
    def setUp(self):
        super(SpanBasedF1Test, self).setUp()
        vocab = Vocabulary()
        vocab.add_token_to_namespace("O", "tags")
        vocab.add_token_to_namespace("B-ARG1", "tags")
        vocab.add_token_to_namespace("I-ARG1", "tags")
        vocab.add_token_to_namespace("B-ARG2", "tags")
        vocab.add_token_to_namespace("I-ARG2", "tags")
        vocab.add_token_to_namespace("B-V", "tags")
        vocab.add_token_to_namespace("I-V", "tags")
        vocab.add_token_to_namespace("U-ARG1", "tags")
        vocab.add_token_to_namespace("U-ARG2", "tags")
        vocab.add_token_to_namespace("B-C-ARG1", "tags")
        vocab.add_token_to_namespace("I-C-ARG1", "tags")
        vocab.add_token_to_namespace("B-ARGM-ADJ", "tags")
        vocab.add_token_to_namespace("I-ARGM-ADJ", "tags")

        self.vocab = vocab
Example #25
    def test_read_hdf5_format_file(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        vocab.add_token_to_namespace("word2")
        embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5")
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 5)
        with h5py.File(embeddings_filename, 'w') as fout:
            _ = fout.create_dataset('embedding',
                                    embeddings.shape,
                                    dtype='float32',
                                    data=embeddings)

        params = Params({
            'pretrained_file': embeddings_filename,
            'embedding_dim': 5,
        })
        embedding_layer = Embedding.from_params(vocab, params)
        assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)
Example #26
    def setUp(self):
        self.tokenizer = WordTokenizer(SpacyWordSplitter(pos_tags=True))
        self.utterance = self.tokenizer.tokenize("where is mersin?")
        self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

        json = {
            'question': self.utterance,
            'columns': ['Name in English', 'Location in English'],
            'cells': [['Paradeniz', 'Mersin'], ['Lake Gala', 'Edirne']]
        }
        self.graph = TableQuestionKnowledgeGraph.read_from_json(json)
        self.vocab = Vocabulary()
        self.name_index = self.vocab.add_token_to_namespace("name",
                                                            namespace='tokens')
        self.in_index = self.vocab.add_token_to_namespace("in",
                                                          namespace='tokens')
        self.english_index = self.vocab.add_token_to_namespace(
            "english", namespace='tokens')
        self.location_index = self.vocab.add_token_to_namespace(
            "location", namespace='tokens')
        self.paradeniz_index = self.vocab.add_token_to_namespace(
            "paradeniz", namespace='tokens')
        self.mersin_index = self.vocab.add_token_to_namespace(
            "mersin", namespace='tokens')
        self.lake_index = self.vocab.add_token_to_namespace("lake",
                                                            namespace='tokens')
        self.gala_index = self.vocab.add_token_to_namespace("gala",
                                                            namespace='tokens')
        self.negative_one_index = self.vocab.add_token_to_namespace(
            "-1", namespace='tokens')
        self.zero_index = self.vocab.add_token_to_namespace("0",
                                                            namespace='tokens')
        self.one_index = self.vocab.add_token_to_namespace("1",
                                                           namespace='tokens')

        self.oov_index = self.vocab.get_token_index('random OOV string',
                                                    namespace='tokens')
        self.edirne_index = self.oov_index
        self.field = KnowledgeGraphField(self.graph, self.utterance,
                                         self.token_indexers, self.tokenizer)

        super(KnowledgeGraphFieldTest, self).setUp()
Example #27
    def test_forward_works_with_projection_layer(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace('the')
        vocab.add_token_to_namespace('a')
        params = Params({
            'pretrained_file': str(self.FIXTURES_ROOT / 'embeddings/glove.6B.300d.sample.txt.gz'),
            'embedding_dim': 300,
            'projection_dim': 20
        })
        embedding_layer = Embedding.from_params(vocab, params)
        input_tensor = torch.LongTensor([[3, 2, 1, 0]])
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 4, 20)

        input_tensor = torch.LongTensor([[[3, 2, 1, 0]]])
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 1, 4, 20)
Example #28
    def test_get_vocab_index_mapping(self):
        # pylint: disable=line-too-long
        mml_model_archive_file = (self.FIXTURES_ROOT / "semantic_parsing" / "nlvr_direct_semantic_parser" /
                                  "serialization" / "model.tar.gz")
        archive = load_archive(mml_model_archive_file)
        mapping = self.model._get_vocab_index_mapping(archive.model.vocab)
        expected_mapping = [(i, i) for i in range(16)]
        assert mapping == expected_mapping

        new_vocab = Vocabulary()
        def copy_token_at_index(i):
            token = self.vocab.get_token_from_index(i, "tokens")
            new_vocab.add_token_to_namespace(token, "tokens")
        copy_token_at_index(5)
        copy_token_at_index(7)
        copy_token_at_index(10)
        mapping = self.model._get_vocab_index_mapping(new_vocab)
        # Mapping of indices from model vocabulary to new vocabulary. 0 and 1 are padding and unk
        # tokens.
        assert mapping == [(0, 0), (1, 1), (5, 2), (7, 3), (10, 4)]
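The mapping asserted above pairs indices in the model's vocabulary with indices in the new vocabulary (0 and 1 being padding and unknown). As an illustration only, and not AllenNLP API, the sketch below shows how such (old_index, new_index) pairs could be used to copy rows of an embedding weight matrix into a new one.

import torch

def remap_embedding_rows(old_weight: torch.Tensor, mapping, new_vocab_size: int) -> torch.Tensor:
    # Illustrative helper: copy the row for every (old_index, new_index) pair; other rows stay zero.
    new_weight = torch.zeros(new_vocab_size, old_weight.size(1))
    for old_index, new_index in mapping:
        new_weight[new_index] = old_weight[old_index]
    return new_weight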
Example #29
    def test_make_vocab_with_extension(self):
        existing_serialization_dir = self.TEST_DIR / 'existing'
        extended_serialization_dir = self.TEST_DIR / 'extended'
        existing_vocab_path = existing_serialization_dir / 'vocabulary'
        extended_vocab_path = extended_serialization_dir / 'vocabulary'

        vocab = Vocabulary()
        vocab.add_token_to_namespace('some_weird_token_1', namespace='tokens')
        vocab.add_token_to_namespace('some_weird_token_2', namespace='tokens')
        os.makedirs(existing_serialization_dir, exist_ok=True)
        vocab.save_to_files(existing_vocab_path)

        self.params['vocabulary'] = {}
        self.params['vocabulary']['directory_path'] = existing_vocab_path
        self.params['vocabulary']['extend'] = True
        self.params['vocabulary']['min_count'] = {"tokens": 3}
        make_vocab_from_params(self.params, extended_serialization_dir)

        vocab_files = os.listdir(extended_vocab_path)
        assert set(vocab_files) == {
            'labels.txt', 'non_padded_namespaces.txt', 'tokens.txt'
        }

        with open(extended_vocab_path / 'tokens.txt') as f:
            tokens = [line.strip() for line in f]

        assert tokens[0] == '@@UNKNOWN@@'
        assert tokens[1] == 'some_weird_token_1'
        assert tokens[2] == 'some_weird_token_2'

        tokens.sort()
        assert tokens == [
            '.', '@@UNKNOWN@@', 'animals', 'are', 'some_weird_token_1',
            'some_weird_token_2'
        ]

        with open(extended_vocab_path / 'labels.txt') as f:
            labels = [line.strip() for line in f]

        labels.sort()
        assert labels == ['N', 'V']
Example #30
    def test_elmo_token_representation(self):
        # Load the test words and convert to char ids
        with open(os.path.join(self.elmo_fixtures_path, 'vocab_test.txt'),
                  'r') as fin:
            words = fin.read().strip().split('\n')

        vocab = Vocabulary()
        indexer = ELMoTokenCharactersIndexer()
        tokens = [Token(word) for word in words]

        indices = indexer.tokens_to_indices(tokens, vocab, "elmo")
        # There are 457 tokens. Reshape into 10 batches of 50 tokens.
        sentences = []
        for k in range(10):
            char_indices = indices["elmo"][(k * 50):((k + 1) * 50)]
            sentences.append(
                indexer.pad_token_sequence({'key': char_indices},
                                           desired_num_tokens={'key': 50},
                                           padding_lengths={})['key'])
        batch = torch.from_numpy(numpy.array(sentences))

        elmo_token_embedder = _ElmoCharacterEncoder(self.options_file,
                                                    self.weight_file)
        elmo_token_embedder_output = elmo_token_embedder(batch)

        # Reshape back to a list of words and compare with ground truth.  Need to also
        # remove <S>, </S>
        actual_embeddings = remove_sentence_boundaries(
            elmo_token_embedder_output['token_embedding'],
            elmo_token_embedder_output['mask'])[0].data.numpy()
        actual_embeddings = actual_embeddings.reshape(
            -1, actual_embeddings.shape[-1])

        embedding_file = os.path.join(self.elmo_fixtures_path,
                                      'elmo_token_embeddings.hdf5')
        with h5py.File(embedding_file, 'r') as fin:
            expected_embeddings = fin['embedding'][...]

        assert numpy.allclose(actual_embeddings[:len(tokens)],
                              expected_embeddings,
                              atol=1e-6)