import unittest

import numpy as np
import torch

from pytorch_ner.nn_modules.dropout import SpatialDropout1d
from pytorch_ner.nn_modules.embedding import (
    Embedding,
    EmbeddingPreTrained,
    EmbeddingWithDropout,
    load_glove,
    load_word2vec,
)
from pytorch_ner.prepare_data import prepare_conll_data_format

# module-level fixtures shared by the tests below
token_seq, _ = prepare_conll_data_format(
    path="tests/data/conll.txt", sep=" ", verbose=False
)
tokens = list(set(token for sentence in token_seq for token in sentence))

_, word2vec_embeddings = load_word2vec(path="tests/data/word2vec.wv")  # type: ignore
_, glove_embeddings = load_glove(path="tests/data/glove.txt")  # type: ignore

embedding_w2v_freeze = EmbeddingPreTrained(word2vec_embeddings)
embedding_w2v_fine_tune = EmbeddingPreTrained(word2vec_embeddings, freeze=False)
embedding_glove_freeze = EmbeddingPreTrained(glove_embeddings)
embedding_glove_fine_tune = EmbeddingPreTrained(glove_embeddings, freeze=False)

random_embedding_with_spatial_dropout = EmbeddingWithDropout(
    embedding_layer=Embedding(num_embeddings=2000, embedding_dim=128),
    dropout=SpatialDropout1d(p=0.5),
)

# NOTE: this call was truncated in the source; a batch of random token indices
# with shape (batch_size=10, seq_len=20) and values < num_embeddings is an
# assumed input.
emb = random_embedding_with_spatial_dropout(
    torch.randint(low=0, high=2000, size=(10, 20))
)


# The enclosing class was missing in the source; the name is assumed, but the
# methods clearly belong to a unittest.TestCase.
class TestLoadEmbeddings(unittest.TestCase):
    def test_load_word2vec(self):
        token2idx, embedding_matrix = load_word2vec(path="tests/data/word2vec.wv")
        self.assertEqual(len(token2idx), 10)
        self.assertEqual(len(token2idx), embedding_matrix.shape[0])
        self.assertEqual(embedding_matrix.shape[-1], 100)
        self.assertEqual(token2idx["<PAD>"], 0)
        self.assertEqual(token2idx["<UNK>"], 1)
        # the <PAD> row is all zeros; the <UNK> row is the mean of the real
        # token vectors
        self.assertTrue(
            np.allclose(embedding_matrix[0], np.zeros_like(embedding_matrix[0])),
        )
        self.assertTrue(
            np.allclose(embedding_matrix[1], embedding_matrix[2:].mean(axis=0)),
        )

    def test_load_word2vec_without_unk(self):
        token2idx, embedding_matrix = load_word2vec(
            path="tests/data/word2vec.wv",
            add_unk=False,
        )
        self.assertEqual(len(token2idx), 9)
        self.assertEqual(len(token2idx), embedding_matrix.shape[0])
        self.assertEqual(embedding_matrix.shape[-1], 100)
        self.assertEqual(token2idx["<PAD>"], 0)
        self.assertTrue("<UNK>" not in token2idx)
        self.assertTrue(
            np.allclose(embedding_matrix[0], np.zeros_like(embedding_matrix[0])),
        )

    def test_load_word2vec_without_pad_unk(self):
        token2idx, embedding_matrix = load_word2vec(
            path="tests/data/word2vec.wv",
            add_pad=False,
            add_unk=False,
        )
        self.assertEqual(len(token2idx), 8)
        self.assertEqual(len(token2idx), embedding_matrix.shape[0])
        self.assertEqual(embedding_matrix.shape[-1], 100)
        self.assertTrue("<PAD>" not in token2idx)
        self.assertTrue("<UNK>" not in token2idx)

    def test_compare_word2vec_glove(self):
        token2idx_word2vec, embedding_matrix_word2vec = load_word2vec(
            path="tests/data/word2vec.wv"
        )
        token2idx_glove, embedding_matrix_glove = load_glove(
            path="tests/data/glove.txt"
        )
        # both loaders should produce identical vocabularies and matrix shapes
        self.assertDictEqual(token2idx_word2vec, token2idx_glove)
        self.assertTrue(
            embedding_matrix_word2vec.shape == embedding_matrix_glove.shape
        )
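# A minimal additional sanity check, sketched rather than taken from the
# original suite. It assumes EmbeddingWithDropout maps a (batch, seq_len)
# index tensor to a (batch, seq_len, embedding_dim) output, and that the
# freeze flag of EmbeddingPreTrained toggles requires_grad on the wrapped
# embedding weights; the class and method names below are hypothetical.
class TestEmbeddingLayersSanity(unittest.TestCase):
    def test_spatial_dropout_embedding_shape(self):
        # the embedding lookup should append embedding_dim to the index shape;
        # (10, 20) matches the assumed input batch built above
        self.assertEqual(emb.shape, torch.Size([10, 20, 128]))

    def test_freeze_flag(self):
        # a frozen pre-trained embedding should expose no trainable
        # parameters, while the fine-tuned variant keeps trainable weights
        self.assertTrue(
            all(not p.requires_grad for p in embedding_w2v_freeze.parameters())
        )
        self.assertTrue(
            any(p.requires_grad for p in embedding_w2v_fine_tune.parameters())
        )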