예제 #1
0
    def create_keyedvector_from_matrix(self, embedding_matrix, word2id):
        """
        Build a KeyedVectors object from an embedding matrix and a word2id
        vocabulary, and store it on ``self.embedding``. Can be used for
        custom pre-trained embeddings.

        Parameters
        ----------
        embedding_matrix: numpy.ndarray
            Embedding matrix; its second dimension is taken as the vector size.
        word2id: dict
            Word vocabulary (key: word, value: word_index)
        """
        # Words ordered by ascending index; this list doubles as index2word.
        ordered_words = sorted(word2id, key=lambda token: word2id[token])
        dim = embedding_matrix.shape[1]

        keyed = KeyedVectors(dim)
        keyed.vector_size = dim
        keyed.vectors = embedding_matrix
        keyed.index2word = ordered_words

        # Counts are not known for an imported matrix, so every entry gets 0.
        entries = {}
        for token in ordered_words:
            entries[token] = Vocab(index=word2id[token], count=0)
        keyed.vocab = entries

        self.embedding = keyed
예제 #2
0
 def update(self):
     """Rebuild the KeyedVectors file at ``self.path`` from the current vocabulary.

     Words are ordered by the first element of their vocabulary entry (used
     as the index). The vectors of every word except the first are looked up
     in the word-vector source; row 0 of the saved matrix is the mean of
     those vectors (presumably the unknown-word slot -- confirm against the
     vocabulary builder).
     """
     vectors = self.word_vectors_file.get_word_vectors()
     word_vocab = self.vocabs_file.get_vocabs()['word']

     # Each entry is (word, info); info[0] is the index, info[1] the count.
     by_index = sorted(word_vocab.items(), key=lambda entry: entry[1][0])
     ordered_words = [word for word, _ in by_index]

     # Skip the first word: its row is synthesized as the mean of the rest.
     known_embs = vectors[ordered_words[1:]]
     mean_emb = np.mean(known_embs, axis=0, keepdims=True)
     matrix = np.concatenate((mean_emb, known_embs), axis=0)

     keyed = KeyedVectors(matrix.shape[1])
     keyed.syn0 = matrix
     keyed.vocab = {
         word: Vocab(index=info[0], count=info[1])
         for word, info in word_vocab.items()
     }
     keyed.index2word = ordered_words
     keyed.save(self.path)
예제 #3
0
def delete_keys(w2v_model: KeyedVectors, del_keys: list):
    """Remove the given keys from a word-vector model in place.

    For every key in ``del_keys`` the vocabulary entry is dropped, the
    matching row is removed from ``w2v_model.syn0`` and the word is removed
    from ``w2v_model.index2word``. The remaining vocabulary entries are then
    renumbered so that ``vocab[word].index`` matches the word's new position.
    Finally the new vocabulary size and matrix shape are printed.

    Parameters
    ----------
    w2v_model: KeyedVectors
        Model to modify; its ``vocab``, ``index2word`` and ``syn0``
        attributes are updated.
    del_keys: list
        Words to delete. Each must be present in ``w2v_model.vocab``.

    Raises
    ------
    KeyError
        If a key is missing from the vocabulary (including a key that is
        listed twice in ``del_keys``).
    """
    del_indexes = []
    with click.progressbar(del_keys,
                           length=len(del_keys),
                           label='Deleted keys') as bar:
        for key in bar:
            # pop() both removes the entry and hands back its row index.
            del_indexes.append(w2v_model.vocab.pop(key).index)

    w2v_model.syn0 = np.delete(w2v_model.syn0, del_indexes, axis=0)

    # Filter by position rather than by an empty-string sentinel, so that a
    # genuine '' token that was not deleted stays aligned with its vector.
    removed = set(del_indexes)
    w2v_model.index2word = [
        word
        for idx, word in enumerate(w2v_model.index2word)
        if idx not in removed
    ]

    # Renumber the surviving entries to match their new row positions.
    for i, word in enumerate(w2v_model.index2word):
        w2v_model.vocab[word].index = i

    # Report on the model that was passed in (the original referenced an
    # undefined name `model` here).
    print(len(w2v_model.vocab), w2v_model.syn0.shape)