def test_embedding_vocab_extension_is_no_op_when_extension_should_not_happen(self):
    # Case 1: When vocab is already in sync with embeddings it should be a no-op.
    vocab = Vocabulary({"tokens": {"word1": 1, "word2": 1}})
    embedding_params = Params({"vocab_namespace": "tokens", "embedding_dim": 10})
    embedder = Embedding.from_vocab_or_file(vocab, **embedding_params.as_dict(quiet=True))
    original_weight = embedder.weight

    embedder.extend_vocab(vocab, "tokens")
    assert torch.all(embedder.weight == original_weight)

    # Case 2: Shouldn't wrongly assume the "tokens" namespace for extension if no
    # information on vocab_namespace is available. Rather, log a warning and be a no-op.
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word1", "tokens")
    vocab.add_token_to_namespace("word2", "tokens")
    embedding_params = Params({"vocab_namespace": "tokens", "embedding_dim": 10})
    embedder = Embedding.from_vocab_or_file(vocab, **embedding_params.as_dict(quiet=True))

    # Previous models won't have the _vocab_namespace attribute. Force it to be None.
    embedder._vocab_namespace = None
    embedder.weight = torch.nn.Parameter(embedder.weight[:1, :])
    assert embedder.weight.shape[0] == 1

    embedder.extend_vocab(vocab)  # Don't specify the namespace.
    assert embedder.weight.shape[0] == 1
def test_embedding_layer_actually_initializes_word_vectors_correctly(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    unicode_space = "\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
    vocab.add_token_to_namespace(unicode_space)

    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, "wb") as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))
        embeddings_file.write(f"{unicode_space} 3.4 3.3 5.0\n".encode("utf-8"))

    params = Params({"pretrained_file": embeddings_filename, "embedding_dim": 3})
    embedding_layer = Embedding.from_vocab_or_file(vocab, **params.as_dict(quiet=True))

    word_vector = embedding_layer.weight.data[vocab.get_token_index("word")]
    assert numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))

    word_vector = embedding_layer.weight.data[vocab.get_token_index(unicode_space)]
    assert numpy.allclose(word_vector.numpy(), numpy.array([3.4, 3.3, 5.0]))

    word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
    assert not numpy.allclose(word_vector.numpy(), numpy.array([1.0, 2.3, -1.0]))
def test_embedding_vocab_extension_without_stored_namespace(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word1", "tokens_a")
    vocab.add_token_to_namespace("word2", "tokens_a")
    embedding_params = Params({"vocab_namespace": "tokens_a", "embedding_dim": 10})
    embedder = Embedding.from_vocab_or_file(vocab, **embedding_params.as_dict(quiet=True))

    # Previous models won't have the _vocab_namespace attribute. Force it to be None.
    embedder._vocab_namespace = None
    original_weight = embedder.weight
    assert original_weight.shape[0] == 4

    extension_counter = {"tokens_a": {"word3": 1}}
    vocab._extend(extension_counter)

    embedder.extend_vocab(vocab, "tokens_a")  # Specified namespace.

    extended_weight = embedder.weight
    assert extended_weight.shape[0] == 5
    assert torch.all(extended_weight[:4, :] == original_weight[:4, :])
def test_embedding_vocab_extension_raises_error_for_incorrect_vocab(self):
    # When the vocab namespace of the extension vocab is smaller than the embeddings,
    # it should raise a ConfigurationError.
    vocab = Vocabulary({"tokens": {"word1": 1, "word2": 1}})
    embedding_params = Params({"vocab_namespace": "tokens", "embedding_dim": 10})
    embedder = Embedding.from_vocab_or_file(vocab, **embedding_params.as_dict(quiet=True))
    with pytest.raises(ConfigurationError):
        embedder.extend_vocab(Vocabulary(), "tokens")
def test_read_embedding_file_inside_archive(self):
    token2vec = {
        "think": torch.Tensor([0.143, 0.189, 0.555, 0.361, 0.472]),
        "make": torch.Tensor([0.878, 0.651, 0.044, 0.264, 0.872]),
        "difference": torch.Tensor([0.053, 0.162, 0.671, 0.110, 0.259]),
        "àèìòù": torch.Tensor([1.0, 2.0, 3.0, 4.0, 5.0]),
    }
    vocab = Vocabulary()
    for token in token2vec:
        vocab.add_token_to_namespace(token)

    params = Params({
        "pretrained_file": str(self.FIXTURES_ROOT / "embeddings/multi-file-archive.zip"),
        "embedding_dim": 5,
    })
    with pytest.raises(
            ValueError,
            match="The archive .*/embeddings/multi-file-archive.zip contains multiple files, "
            "so you must select one of the files inside "
            "providing a uri of the type: "
            "\\(path_or_url_to_archive\\)#path_inside_archive\\.",
    ):
        Embedding.from_vocab_or_file(vocab, **params.as_dict(quiet=True))

    for ext in [".zip", ".tar.gz"]:
        archive_path = str(self.FIXTURES_ROOT / "embeddings/multi-file-archive") + ext
        file_uri = format_embeddings_file_uri(archive_path, "folder/fake_embeddings.5d.txt")
        params = Params({"pretrained_file": file_uri, "embedding_dim": 5})
        embeddings = Embedding.from_vocab_or_file(vocab, **params.as_dict(quiet=True)).weight.data
        for tok, vec in token2vec.items():
            i = vocab.get_token_index(tok)
            assert torch.equal(embeddings[i], vec), "Problem with format " + archive_path
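# For reference: the archive handling exercised above lets "pretrained_file" point at a
# single member of an archive via a URI of the form
# "(path_or_url_to_archive)#path_inside_archive". A minimal sketch, assuming a local
# GloVe archive (both paths below are hypothetical):
#
#     file_uri = format_embeddings_file_uri("/path/to/glove.zip", "glove.6B.300d.txt")
#     params = Params({"pretrained_file": file_uri, "embedding_dim": 300})
#     embedder = Embedding.from_vocab_or_file(vocab, **params.as_dict(quiet=True))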
def test_embedding_vocab_extension_works_with_pretrained_embedding_file(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word1")
    vocab.add_token_to_namespace("word2")

    embeddings_filename = str(self.TEST_DIR / "embeddings2.gz")
    with gzip.open(embeddings_filename, "wb") as embeddings_file:
        embeddings_file.write("word3 0.5 0.3 -6.0\n".encode("utf-8"))
        embeddings_file.write("word4 1.0 2.3 -1.0\n".encode("utf-8"))
        embeddings_file.write("word2 0.1 0.4 -4.0\n".encode("utf-8"))
        embeddings_file.write("word1 1.0 2.3 -1.0\n".encode("utf-8"))

    embedding_params = Params({
        "vocab_namespace": "tokens",
        "embedding_dim": 3,
        "pretrained_file": embeddings_filename,
    })
    embedder = Embedding.from_vocab_or_file(vocab, **embedding_params.as_dict(quiet=True))

    # Change the weight to simulate embedding training.
    embedder.weight.data += 1
    assert torch.all(embedder.weight[2:, :] == torch.Tensor([[2.0, 3.3, 0.0], [1.1, 1.4, -3.0]]))
    original_weight = embedder.weight
    assert tuple(original_weight.size()) == (4, 3)  # 4 because of padding and OOV.

    vocab.add_token_to_namespace("word3")
    embedder.extend_vocab(vocab, extension_pretrained_file=embeddings_filename)  # Default namespace.
    extended_weight = embedder.weight

    # Make sure extension happened for the extra token in the extended vocab.
    assert tuple(extended_weight.size()) == (5, 3)

    # Make sure extension doesn't change the original trained weights.
    assert torch.all(original_weight[:4, :] == extended_weight[:4, :])

    # Make sure the extended weight is taken from the embedding file.
    assert torch.all(extended_weight[4, :] == torch.Tensor([0.5, 0.3, -6.0]))
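# The test above mirrors the intended fine-tune-then-extend workflow: build the embedder
# from a vocab and a pretrained file, train (simulated here by "weight.data += 1"), grow
# the vocab, then call extend_vocab so only the new rows are initialized from the
# pretrained file while the trained rows are left untouched. A minimal sketch, assuming a
# hypothetical "embeddings.gz" with 3-dimensional vectors:
#
#     embedder = Embedding.from_vocab_or_file(
#         vocab, vocab_namespace="tokens", embedding_dim=3, pretrained_file="embeddings.gz")
#     ...  # train the model
#     vocab.add_token_to_namespace("new_word")
#     embedder.extend_vocab(vocab, extension_pretrained_file="embeddings.gz")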
def test_get_embedding_layer_initializes_unseen_words_randomly_not_zero(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")

    embeddings_filename = str(self.TEST_DIR / "embeddings.gz")
    with gzip.open(embeddings_filename, "wb") as embeddings_file:
        embeddings_file.write("word 1.0 2.3 -1.0\n".encode("utf-8"))

    params = Params({"pretrained_file": embeddings_filename, "embedding_dim": 3})
    embedding_layer = Embedding.from_vocab_or_file(vocab, **params.as_dict(quiet=True))
    word_vector = embedding_layer.weight.data[vocab.get_token_index("word2")]
    assert not numpy.allclose(word_vector.numpy(), numpy.array([0.0, 0.0, 0.0]))
def test_read_hdf5_raises_on_invalid_shape(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")

    embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5")
    embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
    with h5py.File(embeddings_filename, "w") as fout:
        _ = fout.create_dataset("embedding", embeddings.shape, dtype="float32", data=embeddings)

    params = Params({"pretrained_file": embeddings_filename, "embedding_dim": 5})
    with pytest.raises(ConfigurationError):
        _ = Embedding.from_vocab_or_file(vocab, **params.as_dict(quiet=True))
def test_read_hdf5_format_file(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")

    embeddings_filename = str(self.TEST_DIR / "embeddings.hdf5")
    embeddings = numpy.random.rand(vocab.get_vocab_size(), 5)
    with h5py.File(embeddings_filename, "w") as fout:
        _ = fout.create_dataset("embedding", embeddings.shape, dtype="float32", data=embeddings)

    params = Params({"pretrained_file": embeddings_filename, "embedding_dim": 5})
    embedding_layer = Embedding.from_vocab_or_file(vocab, **params.as_dict(quiet=True))
    assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)
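# Both hdf5 tests write a dataset named "embedding" with shape (vocab_size, embedding_dim),
# which is the layout the hdf5 reader expects; a mismatched second dimension triggers the
# ConfigurationError covered in the invalid-shape test above. A minimal sketch for
# producing such a file (the filename is hypothetical):
#
#     with h5py.File("my_embeddings.hdf5", "w") as fout:
#         fout.create_dataset(
#             "embedding", dtype="float32",
#             data=numpy.random.rand(vocab.get_vocab_size(), 5))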
def test_forward_works_with_projection_layer(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("the")
    vocab.add_token_to_namespace("a")

    params = Params({
        "pretrained_file": str(self.FIXTURES_ROOT / "embeddings/glove.6B.300d.sample.txt.gz"),
        "embedding_dim": 300,
        "projection_dim": 20,
    })
    embedding_layer = Embedding.from_vocab_or_file(vocab, **params.as_dict(quiet=True))

    input_tensor = torch.LongTensor([[3, 2, 1, 0]])
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 4, 20)

    input_tensor = torch.LongTensor([[[3, 2, 1, 0]]])
    embedded = embedding_layer(input_tensor).data.numpy()
    assert embedded.shape == (1, 1, 4, 20)
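# "projection_dim" adds a linear projection after the embedding lookup, so the 300-d
# pretrained vectors above come out as 20-d activations, and the output shape is the
# input shape with the projection size appended. A minimal sketch reusing the shapes
# from the test (glove_path is a placeholder for the fixture path used above):
#
#     embedding_layer = Embedding.from_vocab_or_file(
#         vocab, pretrained_file=glove_path, embedding_dim=300, projection_dim=20)
#     embedding_layer(torch.LongTensor([[3, 2, 1, 0]])).shape  # -> (1, 4, 20)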
def test_embedding_vocab_extension_with_default_namespace(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word1")
    vocab.add_token_to_namespace("word2")
    embedding_params = Params({"vocab_namespace": "tokens", "embedding_dim": 10})
    embedder = Embedding.from_vocab_or_file(vocab, **embedding_params.as_dict(quiet=True))
    original_weight = embedder.weight
    assert original_weight.shape[0] == 4

    extension_counter = {"tokens": {"word3": 1}}
    vocab._extend(extension_counter)

    embedder.extend_vocab(vocab)  # Default namespace.

    extended_weight = embedder.weight
    assert extended_weight.shape[0] == 5
    assert torch.all(extended_weight[:4, :] == original_weight[:4, :])