예제 #1
0
def truncate_w2v(w2v, new_dims):
    """Return a new KeyedVectors holding a random subset of ``w2v``'s dimensions.

    ``new_dims`` distinct column indices are drawn (without replacement) from
    the columns of ``w2v.vectors``; every key of ``w2v.vocab`` is re-added to
    the new container with its vector restricted to those columns.

    NOTE(review): assumes iterating ``w2v.vocab`` yields keys in the same
    order as the rows of ``w2v.vectors`` -- confirm for the model in use.
    """
    total_dims = w2v.vectors.shape[1]
    truncated = KeyedVectors(new_dims)
    words = list(w2v.vocab)
    # Sample which columns survive; replace=False guarantees no duplicates.
    kept_columns = np.random.choice(total_dims, size=new_dims, replace=False)
    truncated.add(words, w2v.vectors[:, kept_columns])
    return truncated
예제 #2
0
    def test_add_type(self):
        """Check that ``vectors`` keeps dtype ``REAL`` after adding float64 data."""
        keyed = KeyedVectors(2)
        assert keyed.vectors.dtype == REAL

        # A single 2-d row deliberately created as float64.
        labels = ["a"]
        rows = np.array([[1., 1.]], dtype=np.float64)
        keyed.add(labels, rows)

        assert keyed.vectors.dtype == REAL
예제 #3
0
def test_embeddings_input():
    """Check index encoding of a corpus and the lookup done by the Keras embedding.

    The expected index matrix below has out-of-vocabulary tokens removed and
    shorter rows filled with the index of ``PAD_TOK``.
    """
    sentences = [
        ["the", "man", "ran", "to", "boy"],
        ["the", "boy", "pig"],
        ["the", "pig", "man", "boy"],
    ]

    # Neither "pig" nor "to" is in the vocabulary.
    model = KeyedVectors(vector_size=2)
    vocabulary = (
        ("the", [1, 3]),    # idx 0
        ("man", [2, 5]),    # idx 1
        ("ran", [6, 9]),    # idx 2
        ("boy", [4, 20]),   # idx 3
        (PAD_TOK, [0, 0]),  # idx 4
    )
    # Add one key at a time so the indices follow the order listed above.
    for token, values in vocabulary:
        model.add([token], [np.array(values)])

    embedding_input = lib.embeddings.get_embedding_input(sentences, model)

    expected_indices = np.array([
        [0, 1, 2, 3],
        [0, 3, 4, 4],
        [0, 1, 3, 4],
    ])
    assert (embedding_input == expected_indices).all()

    embedding_layer = lib.embeddings.get_keras_embedding(model)
    embedded = embedding_layer(embedding_input)

    expected_vectors = np.array([
        [[1., 3.], [2., 5.], [6., 9.], [4., 20.]],
        [[1., 3.], [4., 20.], [0., 0.], [0., 0.]],
        [[1., 3.], [2., 5.], [4., 20.], [0., 0.]],
    ])
    assert (embedded.numpy() == expected_vectors).all()
예제 #4
0
def create_small_w2v_model(num_most_common_words=500000, cache_dir=W2VDIR):
    """Write a reduced copy of the large word-vector model to the cache.

    Takes the first ``num_most_common_words`` entries of the large model's
    ``index2entity`` list (presumably ordered by frequency -- confirm), copies
    their vectors into a fresh ``KeyedVectors`` and saves it in binary
    word2vec format at ``fewshot_filename(cache_dir, W2V_SMALL)``.
    Returns nothing.
    """
    full_model = load_word_vector_model(small=False, cache_dir=cache_dir)
    top_words = full_model.index2entity[:num_most_common_words]

    small_kv = KeyedVectors(vector_size=full_model.wv.vector_size)

    # Collect the vectors first so keys and vectors are added as one batch.
    top_vectors = [full_model.get_vector(word) for word in top_words]
    small_kv.add(top_words, top_vectors)

    small_kv.save_word2vec_format(
        fewshot_filename(cache_dir, W2V_SMALL),
        binary=True,
    )
예제 #5
0
 def test_no_header(self):
     """Round-trip vectors through a text word2vec file written without a header."""
     original = KeyedVectors(vector_size=100)
     n_keys = 20
     keys = list(map(str, range(n_keys)))
     weights = [
         pseudorandom_weak_vector(original.vector_size) for _ in range(n_keys)
     ]
     original.add(keys, weights)

     # Save as plain text without the header line, then load it back with
     # no_header=True.
     path = gensim.test.utils.get_tmpfile("tmp_kv.txt")
     original.save_word2vec_format(path, binary=False, write_header=False)
     reloaded = KeyedVectors.load_word2vec_format(
         path, binary=False, no_header=True)

     self.assertEqual(original.index_to_key, reloaded.index_to_key)
     self.assertTrue((original.vectors == reloaded.vectors).all())
예제 #6
0
    def test_add_single(self):
        """Entities added one at a time must be retrievable afterwards."""
        n_new = 5
        entities = [
            '___some_entity{}_not_present_in_keyed_vectors___'.format(idx)
            for idx in range(n_new)
        ]
        vectors = [np.random.randn(self.vectors.vector_size) for _ in range(n_new)]
        pairs = list(zip(entities, vectors))

        # Case 1: the fixture's already populated keyed vectors.
        for name, vec in pairs:
            self.vectors.add(name, vec)
        for name, vec in pairs:
            self.assertTrue(np.allclose(self.vectors[name], vec))

        # Case 2: a freshly created, empty container of the same vector size.
        fresh = EuclideanKeyedVectors(self.vectors.vector_size)
        for name, vec in pairs:
            fresh.add(name, vec)
        for name, vec in pairs:
            self.assertTrue(np.allclose(fresh[name], vec))
예제 #7
0
def _load_small_word_vector_model(cache_dir, num_most_common_words=500000):
    """Load the reduced word-vector model, creating its cache file if missing.

    When the file at ``fewshot_filename(cache_dir, W2V_SMALL)`` does not
    exist, the first ``num_most_common_words`` entries of the large model's
    ``index2entity`` list are copied into a new ``KeyedVectors`` and saved
    there in binary word2vec format. In either case the result is read back
    from that file and returned.
    """
    filename = fewshot_filename(cache_dir, W2V_SMALL)
    if os.path.exists(filename):
        # Cache hit: nothing to build.
        return KeyedVectors.load_word2vec_format(filename, binary=True)

    large_model = _load_large_word_vector_model(cache_dir)
    top_words = large_model.index2entity[:num_most_common_words]

    small_kv = KeyedVectors(vector_size=large_model.wv.vector_size)

    # Gather vectors up front so keys and vectors are added as one batch.
    top_vectors = [large_model.get_vector(word) for word in top_words]
    small_kv.add(top_words, top_vectors)

    target_path = fewshot_filename(cache_dir, W2V_SMALL)
    small_kv.save_word2vec_format(target_path, binary=True)

    return KeyedVectors.load_word2vec_format(filename, binary=True)
예제 #8
0
    def test_add_single(self):
        """Adding entities individually works on both a filled and an empty store."""
        count = 5
        entities = [
            '___some_entity{}_not_present_in_keyed_vectors___'.format(number)
            for number in range(count)
        ]
        vectors = [np.random.randn(self.vectors.vector_size) for _ in range(count)]
        additions = tuple(zip(entities, vectors))

        # First against the keyed vectors that already contain data.
        for key, value in additions:
            self.vectors.add(key, value)
        for key, value in additions:
            self.assertTrue(np.allclose(self.vectors[key], value))

        # Then against a brand-new container with the same dimensionality.
        empty_kv = EuclideanKeyedVectors(self.vectors.vector_size)
        for key, value in additions:
            empty_kv.add(key, value)
        for key, value in additions:
            self.assertTrue(np.allclose(empty_kv[key], value))
예제 #9
0
def get_kv(model_path, embed_size=None):
    """Build a KeyedVectors from the weights of a saved Keras model.

    Args:
        model_path: Path of the model passed to ``tf.keras.models.load_model``.
        embed_size: Dimensionality of the vectors. When ``None`` it is parsed
            from ``model_path`` by concatenating every digit character found
            in the path (so digits in directory names count as well).

    Returns:
        A ``KeyedVectors`` whose keys are ``list(id2lib)`` (a module-level
        name) and whose vectors are the transpose of the model's third
        weight array.

    Raises:
        ValueError: If ``embed_size`` is ``None`` and ``model_path`` contains
            no digits.
    """
    if embed_size is None:
        digits = ''.join(c for c in model_path if c.isdigit())
        if not digits:
            # Fail with an actionable message instead of the opaque
            # "invalid literal for int()" raised by int('').
            raise ValueError(
                "cannot infer embed_size: no digits in model_path "
                f"{model_path!r}; pass embed_size explicitly"
            )
        embed_size = int(digits)

    w2v = KeyedVectors(embed_size)
    model = tf.keras.models.load_model(model_path)

    # Weight layout per the original author's notes (confirm against the
    # model definition): [0] input weights, [1] embedding bias,
    # [2] output weights, [3] output bias.
    all_weights = model.get_weights()
    output_weights = all_weights[2].T

    # NOTE(review): assumes the rows of output_weights follow the iteration
    # order of id2lib -- confirm.
    index = list(id2lib)

    w2v.add(index, output_weights)
    return w2v
예제 #10
0
def test_average_embeddings_custom():
    """Check the averaging model against hand-computed sentence means.

    Uses 1-dimensional vectors so each expected average can be written out
    as plain arithmetic over the sentence's tokens.
    """
    sentences = [
        ["the", "man", "ran"],
        ["the", "boy"],
        ["the", "man", "boy"],
    ]
    max_sent_len = 3

    model = KeyedVectors(vector_size=1)
    scalar_vocab = (
        ("the", 1),
        ("man", 2),
        ("ran", 3),
        ("boy", 6),
        (PAD_TOK, 0),
    )
    for token, value in scalar_vocab:
        model.add([token], [np.array([value])])

    embedding_input = lib.embeddings.get_embedding_input(
        sentences, model, max_sent_len)

    avg_embedding_model = lib.mlp.get_average_embedding_model(
        input_shape=embedding_input.shape[1:],
        w2v_model=model,
    )
    avg = avg_embedding_model.predict(embedding_input)

    print(avg)

    # The second sentence has only two tokens; its expected mean divides by 2,
    # i.e. the padding position is not counted.
    expected_means = [(1 + 2 + 3) / 3, (1 + 6) / 2, (1 + 2 + 6) / 3]
    assert avg[0][0] == expected_means[0]
    assert avg[1][0] == expected_means[1]
    assert avg[2][0] == expected_means[2]
예제 #11
0
# Restrict M to the rows and columns selected by the values of freq_vocab
# (presumably word -> row/column index of M; confirm against where
# freq_vocab is built).
indices = np.array(list(freq_vocab.values()))
M_ = (M[indices].T)[indices].T
M_.shape  # bare expression: has no effect outside an interactive session

# Square root of the matrix M before Arnoldi iteration
# NOTE(review): if M_ is a scipy sparse matrix, .sqrt() is the element-wise
# square root, not a matrix square root -- confirm this is what is intended.
sqrt_M = M_.sqrt()

# Construct word embeddings by Arnoldi iteration
# The resulting matrix Q is cached on disk, keyed by `tag` and `win_size`,
# and reused when the cache file already exists.
Q_file = f"./tmp/{tag}_Q_{win_size}.npy"
if os.path.exists(Q_file):
    Q = np.load(Q_file)
    logging.info("Successfully loaded embeddings matrix from %s.", Q_file)
else:
    logging.info("Start arnoldi iterations.")
    b = np.random.random(size=max_vocab)  # initial vector
    # `h` is not used again in the code visible here.
    Q, h = arnoldi_iteration(sqrt_M, b, embed_dim)
    logging.info(
        "Successfully extracted word embeddings from arnoldi iteration.")
    np.save(Q_file, Q)  # save Word embeddings
    logging.info("Successfully saved word embedding matrix Q to %s.", Q_file)

# Keep the first `dim` columns of Q, L2-normalise along axis 1, and export the
# result as KeyedVectors in word2vec format.
# NOTE(review): "arnodi" in the output file name looks like a typo for
# "arnoldi"; it is a runtime path, so it is left unchanged here.
dim = embed_dim
word2vec_file = f"./tmp/arnodi_{dim}_{win_size}.kv"
Q_ = Q[:, :dim]
we_ = normalize(Q_, axis=1, norm="l2")
kv = KeyedVectors(vector_size=dim)
# Assumes the key order of vocab_ matches the row order of we_ -- TODO confirm.
kv.add(list(vocab_.keys()), we_)
kv.save_word2vec_format(word2vec_file)
logging.info("Successfully saved word embeddings of dimension %d to %s.", dim,
             word2vec_file)
예제 #12
0
def load_keyedvectors(**kwargs):
    """Load embeddings via ``load_embeddings`` and wrap them in a KeyedVectors.

    All keyword arguments are forwarded to ``load_embeddings``. ``kwargs["size"]``
    is required and, converted to ``int``, gives the vector dimensionality.
    The loaded object's ``index`` supplies the keys and its ``values`` the
    vectors (presumably a pandas DataFrame -- confirm against
    ``load_embeddings``).
    """
    table = load_embeddings(**kwargs)
    dimension = int(kwargs["size"])
    keyed = KeyedVectors(dimension)
    keyed.add(table.index, table.values)
    return keyed
예제 #13
0
def get_random_kv(embed_size):
    """Return a KeyedVectors with one random vector per entry of ``id2lib``.

    Keys are ``list(id2lib)`` (a module-level name); the vectors come from
    ``np.random.random`` with shape ``(len(id2lib), embed_size)``.
    """
    random_kv = KeyedVectors(embed_size)
    keys = list(id2lib)
    random_weights = np.random.random((len(id2lib), embed_size))
    random_kv.add(keys, random_weights)
    return random_kv