def truncate_w2v(w2v, new_dims):
    """Limit w2v to the specified number of dimensions, selected at random."""
    old_dims = w2v.vectors.shape[1]
    new_w2v = KeyedVectors(new_dims)
    vocab = list(w2v.vocab.keys())
    # Keep a random subset of the original columns (dimensions).
    cols_idx = np.random.choice(old_dims, size=new_dims, replace=False)
    weights = w2v.vectors[:, cols_idx]
    new_w2v.add(vocab, weights)
    return new_w2v
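
# A minimal usage sketch for truncate_w2v, assuming gensim 3.x-style
# KeyedVectors; the vectors path below is a hypothetical example, and the
# retained dimensions are chosen at random, so results differ between runs.
w2v_full = KeyedVectors.load_word2vec_format("vectors.bin", binary=True)  # hypothetical path
w2v_small = truncate_w2v(w2v_full, new_dims=50)
assert w2v_small.vectors.shape[1] == 50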
def test_add_type(self):
    kv = KeyedVectors(2)
    assert kv.vectors.dtype == REAL
    words, vectors = ["a"], np.array([1., 1.], dtype=np.float64).reshape(1, -1)
    kv.add(words, vectors)
    # Added float64 vectors must be coerced back to gensim's REAL (float32).
    assert kv.vectors.dtype == REAL
def test_embeddings_input():
    corpus = [
        ["the", "man", "ran", "to", "boy"],
        ["the", "boy", "pig"],
        ["the", "pig", "man", "boy"],
    ]
    # "pig" and "to" are not in the vocabulary.
    model = KeyedVectors(vector_size=2)
    model.add(["the"], [np.array([1, 3])])    # idx 0
    model.add(["man"], [np.array([2, 5])])    # idx 1
    model.add(["ran"], [np.array([6, 9])])    # idx 2
    model.add(["boy"], [np.array([4, 20])])   # idx 3
    model.add([PAD_TOK], [np.array([0, 0])])  # idx 4
    embedding_input = lib.embeddings.get_embedding_input(corpus, model)
    # Out-of-vocabulary tokens are dropped; short rows are padded with idx 4 (PAD_TOK).
    assert (embedding_input == np.array([
        [0, 1, 2, 3],
        [0, 3, 4, 4],
        [0, 1, 3, 4],
    ])).all()
    embedding_layer = lib.embeddings.get_keras_embedding(model)
    embedded = embedding_layer(embedding_input)
    assert (embedded.numpy() == np.array([
        [[1., 3.], [2., 5.], [6., 9.], [4., 20.]],
        [[1., 3.], [4., 20.], [0., 0.], [0., 0.]],
        [[1., 3.], [2., 5.], [4., 20.], [0., 0.]],
    ])).all()
def create_small_w2v_model(num_most_common_words=500000, cache_dir=W2VDIR):
    orig_model = load_word_vector_model(small=False, cache_dir=cache_dir)
    # index2entity is ordered by frequency, so slicing keeps the most common words.
    words = orig_model.index2entity[:num_most_common_words]
    kv = KeyedVectors(vector_size=orig_model.wv.vector_size)
    vectors = [orig_model.get_vector(word) for word in words]
    # Adds keys (words) & vectors as a batch.
    kv.add(words, vectors)
    w2v_small_filename = fewshot_filename(cache_dir, W2V_SMALL)
    kv.save_word2vec_format(w2v_small_filename, binary=True)
def test_no_header(self):
    randkv = KeyedVectors(vector_size=100)
    count = 20
    keys = [str(i) for i in range(count)]
    weights = [pseudorandom_weak_vector(randkv.vector_size) for _ in range(count)]
    randkv.add(keys, weights)
    tmpfiletxt = gensim.test.utils.get_tmpfile("tmp_kv.txt")
    randkv.save_word2vec_format(tmpfiletxt, binary=False, write_header=False)
    reloadtxtkv = KeyedVectors.load_word2vec_format(tmpfiletxt, binary=False, no_header=True)
    self.assertEqual(randkv.index_to_key, reloadtxtkv.index_to_key)
    self.assertTrue((randkv.vectors == reloadtxtkv.vectors).all())
def test_add_single(self):
    """Test that adding entities one at a time works correctly."""
    entities = ['___some_entity{}_not_present_in_keyed_vectors___'.format(i) for i in range(5)]
    vectors = [np.random.randn(self.vectors.vector_size) for _ in range(5)]

    # Test `add` on an already filled kv.
    for ent, vector in zip(entities, vectors):
        self.vectors.add(ent, vector)
    for ent, vector in zip(entities, vectors):
        self.assertTrue(np.allclose(self.vectors[ent], vector))

    # Test `add` on an empty kv.
    kv = EuclideanKeyedVectors(self.vectors.vector_size)
    for ent, vector in zip(entities, vectors):
        kv.add(ent, vector)
    for ent, vector in zip(entities, vectors):
        self.assertTrue(np.allclose(kv[ent], vector))
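
# Hedged companion sketch: in the same gensim 3.x API exercised above, `add`
# also accepts parallel lists of keys and vectors, so the per-entity loop can
# be collapsed into one batch call; the keys and size below are illustrative.
kv_batch = EuclideanKeyedVectors(100)  # assumed vector size
keys = ["alpha", "beta"]
vecs = [np.random.randn(100), np.random.randn(100)]
kv_batch.add(keys, vecs)  # one batch call instead of a per-entity loop
assert np.allclose(kv_batch["alpha"], vecs[0])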
def _load_small_word_vector_model(cache_dir, num_most_common_words=500000):
    filename = fewshot_filename(cache_dir, W2V_SMALL)
    if not os.path.exists(filename):
        orig_model = _load_large_word_vector_model(cache_dir)
        words = orig_model.index2entity[:num_most_common_words]
        kv = KeyedVectors(vector_size=orig_model.wv.vector_size)
        vectors = [orig_model.get_vector(word) for word in words]
        # Adds keys (words) & vectors as a batch, then caches the small model to disk.
        kv.add(words, vectors)
        kv.save_word2vec_format(filename, binary=True)
    return KeyedVectors.load_word2vec_format(filename, binary=True)
def get_kv(model_path, embed_size=None):
    # If no size is given, infer it from the digits in the model path
    # (every digit in the path is concatenated, including any in the extension).
    if embed_size is None:
        embed_size = int(''.join(c for c in model_path if c.isdigit()))
    w2v = KeyedVectors(embed_size)
    model = tf.keras.models.load_model(model_path)
    all_weights = model.get_weights()
    # Weight layout: [input_weights, embed_bias, output_weights, output_bias];
    # only the (transposed) output weights are used as embeddings.
    output_weights = all_weights[2].T
    index = list(id2lib)
    w2v.add(index, output_weights)
    return w2v
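
# Hypothetical call sketch for get_kv. Because the automatic size inference
# concatenates every digit in the path (e.g. the "5" in ".h5"), passing
# embed_size explicitly is the safer option; the path and query key below
# are illustrative assumptions.
kv = get_kv("models/lib2vec.h5", embed_size=128)
print(kv.most_similar("numpy", topn=5))  # nearest libraries in the output-embedding space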
def test_average_embeddings_custom():
    corpus = [
        ["the", "man", "ran"],
        ["the", "boy"],
        ["the", "man", "boy"],
    ]
    max_sent_len = 3
    model = KeyedVectors(vector_size=1)
    model.add(["the"], [np.array([1])])
    model.add(["man"], [np.array([2])])
    model.add(["ran"], [np.array([3])])
    model.add(["boy"], [np.array([6])])
    model.add([PAD_TOK], [np.array([0])])
    embedding_input = lib.embeddings.get_embedding_input(corpus, model, max_sent_len)
    avg_embedding_model = lib.mlp.get_average_embedding_model(
        input_shape=embedding_input.shape[1:],
        w2v_model=model,
    )
    avg = avg_embedding_model.predict(embedding_input)
    # Padding tokens must not contribute to the per-sentence average.
    assert avg[0][0] == (1 + 2 + 3) / 3
    assert avg[1][0] == (1 + 6) / 2
    assert avg[2][0] == (1 + 2 + 6) / 3
indices = np.array(list(freq_vocab.values()))
M_ = (M[indices].T)[indices].T  # restrict M to the frequent-vocabulary rows and columns

# Square root of the matrix M before Arnoldi iteration
sqrt_M = M_.sqrt()

# Construct word embeddings by Arnoldi iteration
Q_file = f"./tmp/{tag}_Q_{win_size}.npy"
if os.path.exists(Q_file):
    Q = np.load(Q_file)
    logging.info("Successfully loaded embeddings matrix from %s.", Q_file)
else:
    logging.info("Starting Arnoldi iterations.")
    b = np.random.random(size=max_vocab)  # initial vector
    Q, h = arnoldi_iteration(sqrt_M, b, embed_dim)
    logging.info("Successfully extracted word embeddings from Arnoldi iteration.")
    np.save(Q_file, Q)  # save word embeddings
    logging.info("Successfully saved word embedding matrix Q to %s.", Q_file)

dim = embed_dim
word2vec_file = f"./tmp/arnoldi_{dim}_{win_size}.kv"
Q_ = Q[:, :dim]
we_ = normalize(Q_, axis=1, norm="l2")  # L2-normalize each embedding row
kv = KeyedVectors(vector_size=dim)
kv.add(list(vocab_.keys()), we_)
kv.save_word2vec_format(word2vec_file)
logging.info("Successfully saved word embeddings of dimension %d to %s.", dim, word2vec_file)
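
# Follow-up sketch: the saved .kv file is in plain word2vec text format, so it
# can be reloaded with gensim and queried directly; the query word is an
# illustrative assumption.
kv_loaded = KeyedVectors.load_word2vec_format(word2vec_file)
print(kv_loaded.most_similar("king", topn=5))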
def load_keyedvectors(**kwargs):
    embeddings = load_embeddings(**kwargs)
    vectors = KeyedVectors(int(kwargs["size"]))
    # The embeddings frame is indexed by token; add all tokens and rows at once.
    vectors.add(embeddings.index, embeddings.values)
    return vectors
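
# Hypothetical call sketch, assuming load_embeddings returns a pandas
# DataFrame of shape (vocab, size) indexed by token and accepts these
# keyword arguments (both inferred from the function body above).
kv = load_keyedvectors(path="embeddings.csv", size=300)
print(kv.vectors.shape)  # (vocab_size, 300)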
def get_random_kv(embed_size):
    # Random vectors for every key in id2lib, e.g. as a baseline.
    w2v = KeyedVectors(embed_size)
    w2v.add(list(id2lib), np.random.random((len(id2lib), embed_size)))
    return w2v
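
# Usage sketch: random vectors like these serve as a sanity-check baseline
# that any trained embedding should beat; assumes the module-level id2lib
# mapping referenced above is populated.
baseline_kv = get_random_kv(embed_size=64)
assert baseline_kv.vectors.shape == (len(id2lib), 64)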