Exemplo n.º 1
0
    def test_read_embedding_file_inside_archive(self):
        token2vec = {
            "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
            "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
            "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
            "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0])
        }
        vocab = Vocabulary()
        for token in token2vec:
            vocab.add_token_to_namespace(token)

        params = Params({
            'pretrained_file':
            str(self.FIXTURES_ROOT / 'embeddings/multi-file-archive.zip'),
            'embedding_dim':
            5
        })
        with pytest.raises(
                ValueError,
                message=
                "No ValueError when pretrained_file is a multi-file archive"):
            Embedding.from_params(vocab, params)

        for ext in ['.zip', '.tar.gz']:
            archive_path = str(
                self.FIXTURES_ROOT / 'embeddings/multi-file-archive') + ext
            file_uri = format_embeddings_file_uri(
                archive_path, 'folder/fake_embeddings.5d.txt')
            params = Params({'pretrained_file': file_uri, 'embedding_dim': 5})
            embeddings = Embedding.from_params(vocab, params).weight.data
            for tok, vec in token2vec.items():
                i = vocab.get_token_index(tok)
                assert torch.equal(embeddings[i],
                                   vec), 'Problem with format ' + archive_path
Exemplo n.º 2
0
 def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace("word")
     vocab.add_token_to_namespace("word2")
     unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
     vocab.add_token_to_namespace(unicode_space)
     embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
     with gzip.open(embeddings_filename, 'wb') as embeddings_file:
         embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
         embeddings_file.write(
             f"{unicode_space} 3.4 3.3 5.0\n".encode('utf-8'))
     params = Params({
         'pretrained_file': embeddings_filename,
         'embedding_dim': 3,
     })
     embedding_layer = Embedding.from_params(vocab, params)
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         "word")]
     assert numpy.allclose(word_vector.numpy(),
                           numpy.array([1.0, 2.3, -1.0]))
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         unicode_space)]
     assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3,
                                                             5.0]))
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         "word2")]
     assert not numpy.allclose(word_vector.numpy(),
                               numpy.array([1.0, 2.3, -1.0]))
 def from_params(cls, vocab: Vocabulary, params: Params) -> 'TokenCharactersEncoder':  # type: ignore
     # pylint: disable=arguments-differ
     embedding_params: Params = params.pop("embedding")
     # Embedding.from_params() uses "tokens" as the default namespace, but we need to change
     # that to be "token_characters" by default.
     embedding_params.setdefault("vocab_namespace", "token_characters")
     embedding = Embedding.from_params(vocab, embedding_params)
     encoder_params: Params = params.pop("encoder")
     encoder = Seq2VecEncoder.from_params(encoder_params)
     dropout = params.pop_float("dropout", 0.0)
     params.assert_empty(cls.__name__)
     return cls(embedding, encoder, dropout)
Exemplo n.º 4
0
 def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(
         self):
     vocab = Vocabulary()
     vocab.add_token_to_namespace("word")
     vocab.add_token_to_namespace("word2")
     embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
     with gzip.open(embeddings_filename, 'wb') as embeddings_file:
         embeddings_file.write("word 1.0 2.3 -1.0\n".encode('utf-8'))
     params = Params({
         'pretrained_file': embeddings_filename,
         'embedding_dim': 3,
     })
     embedding_layer = Embedding.from_params(vocab, params)
     word_vector = embedding_layer.weight.data[vocab.get_token_index(
         "word2")]
     assert not numpy.allclose(word_vector.numpy(),
                               numpy.array([0.0, 0.0, 0.0]))
Exemplo n.º 5
0
    def test_read_hdf5_raises_on_invalid_shape(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5")
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
        with h5py.File(embeddings_filename, 'w') as fout:
            _ = fout.create_dataset('embedding',
                                    embeddings.shape,
                                    dtype='float32',
                                    data=embeddings)

        params = Params({
            'pretrained_file': embeddings_filename,
            'embedding_dim': 5,
        })
        with pytest.raises(ConfigurationError):
            _ = Embedding.from_params(vocab, params)
Exemplo n.º 6
0
    def test_read_hdf5_format_file(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("word")
        vocab.add_token_to_namespace("word2")
        embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5")
        embeddings = numpy.random.rand(vocab.get_vocab_size(), 5)
        with h5py.File(embeddings_filename, 'w') as fout:
            _ = fout.create_dataset('embedding',
                                    embeddings.shape,
                                    dtype='float32',
                                    data=embeddings)

        params = Params({
            'pretrained_file': embeddings_filename,
            'embedding_dim': 5,
        })
        embedding_layer = Embedding.from_params(vocab, params)
        assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)
Exemplo n.º 7
0
    def test_forward_works_with_projection_layer(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace('the')
        vocab.add_token_to_namespace('a')
        params = Params({
            'pretrained_file':
            str(self.FIXTURES_ROOT / 'embeddings/glove.6B.300d.sample.txt.gz'),
            'embedding_dim':
            300,
            'projection_dim':
            20
        })
        embedding_layer = Embedding.from_params(vocab, params)
        input_tensor = torch.LongTensor([[3, 2, 1, 0]])
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 4, 20)

        input_tensor = torch.LongTensor([[[3, 2, 1, 0]]])
        embedded = embedding_layer(input_tensor).data.numpy()
        assert embedded.shape == (1, 1, 4, 20)