Example #1
def build_glove(word2vec, target_files, output_path):
    word2vec1 = KeyedVectors(vector_size=300)
    print(word2vec1.vectors.shape, (len(word2vec1.vocab), word2vec1.vector_size))
    buf1 = []
    buf2 = []
    contains = set()

    def add_buffer(w, f):
        nonlocal buf1, buf2
        if w not in contains:
            buf1.append(w)
            buf2.append(f)
            contains.add(w)

    def clear_buffer():
        nonlocal buf1, buf2
        buf1 = []
        buf2 = []

    for f in target_files:
        for i, s in enumerate(load_json(f), 1):
            sentence = s['description']

            for w in tokenize(sentence):
                w = w.lower()
                if w in word2vec:
                    add_buffer(w, word2vec[w])
            if i % 10 == 0 and len(buf1) > 0:
                word2vec1.add(buf1, buf2, replace=False)
                clear_buffer()
    if len(buf1) > 0:
        word2vec1.add(buf1, buf2, replace=False)

    print(word2vec1.vectors.shape, (len(word2vec1.vocab), word2vec1.vector_size))
    KeyedVectors.save_word2vec_format(word2vec1, output_path, binary=True)
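
A minimal usage sketch; the pretrained-vector and JSON file names below are placeholders, and load_json/tokenize are the helpers the function above relies on:

if __name__ == '__main__':
    from gensim.models import KeyedVectors

    # Load the full pretrained space once, then keep only the words that
    # actually occur in the target corpus.
    pretrained = KeyedVectors.load_word2vec_format(
        'GoogleNews-vectors-negative300.bin', binary=True)
    build_glove(pretrained, ['captions_train.json', 'captions_val.json'],
                'vectors_subset.bin')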
Example #2
    def as_keyed_vectors(self):
        """
        Generate a KeyedVectors instance with all possible edge embeddings.
        :return: KeyedVectors keyed by stringified sorted edge tuples
        """

        edge_generator = combinations_with_replacement(self.kv.index2word, r=2)

        if not self.quiet:
            vocab_size = len(self.kv.vocab)
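            # The factorial ratio below works out to vocab_size * (vocab_size + 1) / 2,
            # i.e. the number of unordered pairs with replacement that
            # edge_generator will yield.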
            total_size = reduce(lambda x, y: x * y, range(1, vocab_size + 2)) / \
                         (2 * reduce(lambda x, y: x * y, range(1, vocab_size)))

            edge_generator = tqdm(edge_generator, desc='Generating edge features', total=total_size)

        # Generate features
        tokens = []
        features = []
        for edge in edge_generator:
            token = str(tuple(sorted(edge)))
            embedding = self._embed(edge)

            tokens.append(token)
            features.append(embedding)

        # Build KV instance
        edge_kv = KeyedVectors(vector_size=self.kv.vector_size)
        edge_kv.add(
            entities=tokens,
            weights=features)

        return edge_kv
Example #3
def get_most_similar(path, path2, postfix='_80_20_128', n=250):
    """
    Probably deprecated.

    Read word vectors from `path`+`path2`+`postfix`.model and, for each node in the
    `.known` file, find its `n` most similar nodes (by cosine similarity), dropping
    nodes that are already neighbours in the edge list. Save the result to
    `path`_top`n`.pick.
    """
    wv = KeyedVectors.load(path + path2 + postfix + '.model')
    known = read_instagram_known(path + '.known')
    nxgraph = read_nxgraph(path + path2 + '.edgelist')
    wv2 = KeyedVectors(vector_size = 128)
    # filter by known
    for ent, vec in zip(wv.index2entity, wv.vectors):
        if int(ent) in known:
            wv2.add([ent], [vec])
    samples = dict([
            (int(u), [
                    (int(i[0]), i[1])
                    for i in wv.most_similar([u], topn=n)
                    if int(i[0]) not in nxgraph.neighbors(int(u))
                    ])
            for u in wv2.vocab.keys()
            ])

    write_pickle(samples, '{}_top{}.pick'.format(path, n))
Example #4
class MultiKeyedVectorRecSys(KeyedVectorRecSys):
    def train(self, items, **kwargs):
        # Load sub-models
        models = [
            KeyedVectors.load_word2vec_format(fp, **kwargs) for fp in items
        ]

        self.vector_size = np.sum([m.vector_size for m in models])

        # Build new keyed vector model
        self.model = KeyedVectors(vector_size=self.vector_size)

        missing_docs = 0

        # Iterate over all words (in first model)
        for doc_id in models[0].index2word:
            # Stack vectors from all models
            models_vec = []
            for m in models:
                if doc_id in m.index2word:
                    models_vec.append(m.get_vector(doc_id))
                else:
                    # Use zero-vector if doc id does not exist
                    # print(f'WARNING: {doc_id} does not exist in {m}')
                    models_vec.append(np.zeros((m.vector_size)))
                    missing_docs += 1

            vec = np.hstack(models_vec)

            self.model.add(doc_id, vec)

        if missing_docs > 0:
            logger.warning(f'Missing documents: {missing_docs}')

        return self.model
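
A hypothetical call; the file names are placeholders, KeyedVectorRecSys is assumed to need no constructor arguments, and extra keyword arguments are forwarded to KeyedVectors.load_word2vec_format:

rec = MultiKeyedVectorRecSys()
combined = rec.train(['doc_vectors_title.txt', 'doc_vectors_body.txt'], binary=False)
print(combined.vector_size)  # sum of the two sub-models' vector sizes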
Example #5
File: navec.py Project: EruditePanda/navec
    def as_gensim(self):
        from gensim.models import KeyedVectors

        model = KeyedVectors(self.pq.dim)
        weights = self.pq.unpack()  # warning! memory heavy
        model.add(self.vocab.words, weights)
        return model
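
A usage sketch, assuming a navec archive has been downloaded locally (the file name is only an example release):

from navec import Navec

navec = Navec.load('navec_hudlit_v1_12B_500K_300d_100q.tar')
kv = navec.as_gensim()
print(kv['дом'][:5])  # same vector as navec['дом'], now queryable through gensim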
Example #6
def show_similar(embeds, labels, n_examples=10, n_nearby=6):
    # Gather a random set of queries (sentences embeddings that we'll compare against)
    query_label_idx = random.randint(0, len(labels) - n_examples)
    query_vectors = embeds[query_label_idx:query_label_idx + n_examples, :]
    # Find indices for the embeddings that are nearest to the queries
    t = time.time()
    indices, dists = _find_nearest(query_vectors,
                                   embeds,
                                   n_nearby,
                                   batch_size=1000000)
    print(indices.shape, dists.shape)
    t = time.time() - t
    query_sentences = labels[query_label_idx:query_label_idx + dists.shape[0]]
    for query_num, query_sentence in enumerate(query_sentences):
        print "*******************************************************************"
        print query_sentence
        dist_row = dists[query_num, :]
        index_row = indices[query_num, :]
        for dist, idx in zip(dist_row, index_row):
            print(dist, labels[idx])
    print('Took {} seconds ({} s/per query)'.format(t, t / n_examples))
    return

    # TODO: gensim is much faster but doesn't handle duplicate entries very well, and goes OOM
    kv = KeyedVectors(embeds.shape[-1])
    kv.add(labels, embeds)
    random_labels = random.sample(labels, 10)
    for label in random_labels:
        print(label)
        for tup in kv.most_similar(label):
            print(tup)
        print('--------------------------------------------------')
Example #7
    def load(cls, token2id, test=False, limit=None):
        embed_shape = (len(token2id), 300)
        freqs = np.zeros((len(token2id)), dtype='f')

        if test:
            np.random.seed(0)
            vectors = np.random.normal(0, 1, embed_shape)
            vectors[0] = 0
            vectors[len(token2id) // 2:] = 0
        else:
            vectors = np.zeros(embed_shape, dtype='f')
            path = f'{os.environ["DATADIR"]}/{cls.path}'
            for i, o in enumerate(open(path, encoding="utf8",
                                       errors='ignore')):
                token, *vector = o.split(' ')
                token = str.lower(token)
                if token not in token2id or len(o) <= 100:
                    continue
                if limit is not None and i > limit:
                    break
                freqs[token2id[token]] += 1
                vectors[token2id[token]] += np.array(vector, 'f')

        vectors[freqs != 0] /= freqs[freqs != 0][:, None]
        vec = KeyedVectors(300)
        vec.add(list(token2id.keys()), vectors, replace=True)

        return vec
Example #8
File: transformer_based.py Project: f4g2/i
class SentenceTransformerRecSys(KeyedVectorRecSys):
    model_name_or_path = None
    batch_size = 12
    language_model = None

    def train(self, texts: List):
        from sentence_transformers import SentenceTransformer

        # load sentence transformer model
        if not self.language_model:
            logger.info(
                f'Loading Sentence Transformer: {self.model_name_or_path}')
            self.language_model = SentenceTransformer(self.model_name_or_path)

        # reset doc vector model
        self.model = KeyedVectors(
            vector_size=self.language_model.get_sentence_embedding_dimension())

        # encode
        sentence_embeddings = self.language_model.encode(
            texts,
            batch_size=self.batch_size,
            show_progress_bar=self.print_progress)

        # save into keyed vector
        for idx, vec in enumerate(sentence_embeddings):
            self.model.add([str(self.idx2doc_id[idx])], [vec])

        return self.model
Example #9
File: clusters.py Project: gpucce/spikex
def cluster_chunks(
    chunks: List[Span],
    stopwords: bool = False,
    filter_pos: List[str] = None,
    min_score: float = None,
):
    """
    Cluster chunks by using a revisited **Radial Ball Mapper** algorithm

    Parameters
    ----------
    chunks : List[Span]
        Chunks to cluster.
    stopwords : bool, optional
        Flag to exclude stopwords from chunks, by default False.
    filter_pos : List[str], optional
        POS tags to filter chunk words, by default None
    min_score : float, optional
        Threshold for clustering chunks, by default None

    Returns
    -------
    List[List[Span]]
        Clusters of chunks
    """
    key2index, key2vector = _map_key_to_vector(chunks, stopwords, filter_pos)
    if not key2index or not key2vector:
        return
    model = KeyedVectors(chunks[0].vector.size)
    keys = list(key2vector.keys())
    weights = list(key2vector.values())
    model.add(keys, weights)
    clusters = cluster_balls_multi(model, keys, min_score=min_score)
    return [[chunks[key2index[i]] for i in cluster] for cluster in clusters]
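
A possible invocation, assuming a spaCy pipeline with word vectors (the model name is only an example) and that _map_key_to_vector and cluster_balls_multi come from the same module:

import spacy

nlp = spacy.load('en_core_web_md')
doc = nlp('Apples and pears are fruit, while carrots and onions are vegetables.')
clusters = cluster_chunks(list(doc.noun_chunks), stopwords=True)
print(clusters)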
Example #10
def new_w2v():
    pkl_file = open("%s/data.para" % TPS, 'rb')
    vocab_u, vocab_i = load_vocabulary(pkl_file)
    # print(vocab_u)
    print(len(vocab_u))
    print(len(vocab_i))
    print(vocab_u['love'])
    all_words = set()
    all_words = all_words.union(set(vocab_u.keys()))
    print(len(all_words))
    all_words = all_words.union(set(vocab_i.keys()))
    print(len(all_words))
    length = len(all_words)
    w2v_model = KeyedVectors.load_word2vec_format('E:/embedding/GoogleNews-vectors-negative300.bin', binary=True)
    word_list = list(all_words)
    embeds_list = []
    miss = set()
    for w in word_list:
        if w in w2v_model:
            # in_set.add(w)
            embeds = w2v_model[w]
        else:
            miss.add(w)
            embeds = np.random.uniform(-0.25, 0.25, 300)
        embeds_list.append(embeds)
    print("miss:", len(miss)/len(all_words))
    new_w2v = KeyedVectors(300)
    new_w2v.add(word_list, embeds_list)
    new_w2v.save_word2vec_format("%s/google.w2v.bin" % TPS, binary=True)
Example #11
def main(args, metrics):
    graph_dataset = MAGDataset(name="", path=args.data_path, raw=False)
    metrics = [getattr(module_metric, met) for met in metrics]
    pre_metric = partial(module_metric.obtain_ranks, mode=0)

    full_graph = graph_dataset.g_full.to_networkx()
    core_subgraph = get_holdout_subgraph(graph_dataset.train_node_ids,
                                         full_graph)
    pseudo_leaf_node = -1
    for node in list(core_subgraph.nodes()):
        core_subgraph.add_edge(node, pseudo_leaf_node)
    node2descendants = {
        n: set(descendants(core_subgraph, n))
        for n in core_subgraph.nodes
    }
    candidate_positions = list(
        set(
            chain.from_iterable([[(n, d) for d in ds]
                                 for n, ds in node2descendants.items()])))

    edge2nbs = {}
    for u, v in candidate_positions:
        pu = set(core_subgraph.predecessors(u))
        cu = set(core_subgraph.successors(u))
        if v == pseudo_leaf_node:
            pv = set()
            cv = set()
        else:
            pv = set(core_subgraph.predecessors(v))
            cv = set(core_subgraph.successors(v))
        nbs = pu.union(pv).union(cu).union(cv)
        if pseudo_leaf_node in nbs:
            nbs.remove(pseudo_leaf_node)
        edge2nbs[(u, v)] = list(map(str, nbs))

    holdout_subgraph = get_holdout_subgraph(
        graph_dataset.train_node_ids + graph_dataset.test_node_ids, full_graph)
    node2pos = find_insert_posistion(graph_dataset.test_node_ids,
                                     core_subgraph, holdout_subgraph,
                                     pseudo_leaf_node)

    node_features = graph_dataset.g_full.ndata['x']
    node_features = F.normalize(node_features, p=2, dim=1)
    kv = KeyedVectors(vector_size=node_features.shape[1])
    kv.add([str(i) for i in range(len(node_features))], node_features.numpy())

    all_ranks = []
    for node in tqdm(graph_dataset.test_node_ids):
        dists = distances(str(node), candidate_positions, edge2nbs, kv,
                          pseudo_leaf_node)
        scores, labels = rearrange(torch.Tensor(dists), candidate_positions,
                                   node2pos[node])
        all_ranks.extend(pre_metric(scores, labels))
    total_metrics = [metric(all_ranks) for metric in metrics]

    for i, mtr in enumerate(metrics):
        print('    {:15s}: {}'.format(mtr.__name__, total_metrics[i]))

    return
Example #12
def make_bert_sentence_file(filename, bert_sent_model, labels, vec_size=300):
    # Get sentence embeddings for all labels.
    embeddings = get_sentence_bert(bert_sent_model, labels)
    kv = KeyedVectors(vector_size=vec_size)
    vec_id_list = range(0, len(labels))
    kv.add(vec_id_list, embeddings)
    kv.save_word2vec_format(filename, binary=False)
    return
Example #13
def dict_to_gensim(embeddings):
    if len(embeddings) == 0:
        raise ValueError("Empty embedding dictionary")
    words = list(embeddings.keys())
    vectors = np.row_stack(list(embeddings.values()))
    result = KeyedVectors(vectors.shape[1])
    result.add(words, vectors)
    return result
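
For example, with a toy dictionary (values made up):

import numpy as np

embeddings = {
    'cat': np.array([1.0, 0.0, 0.0]),
    'dog': np.array([0.9, 0.1, 0.0]),
    'car': np.array([0.0, 0.0, 1.0]),
}
kv = dict_to_gensim(embeddings)
print(kv.most_similar('cat', topn=1))  # 'dog' is the nearest neighbour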
Example #14
File: conftest.py Project: textvec/textvec
def keyed_vectors():
    model = KeyedVectors(5)
    words = ["cat", "dog", "foo", "bar", "one", "two"]
    vectors = np.array([[0, 0, 0, 0, 1],
                        [0, 0, 0, 0.28, 0.96],
                        [1, 0, 0, 0, 0],
                        [0, 0, 1, 0, 0],
                        [0.28, 0.96, 0, 0, 0],
                        [0.6, 0.8, 0, 0, 0]])
    model.add(words, vectors)
    return model
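
Assuming the function above is registered as a pytest fixture (its conftest.py home suggests so), one way to exercise it: the vectors are unit-length, so cosine similarity reduces to a dot product and the nearest neighbour of 'one' is 'two' with score 0.28 * 0.6 + 0.96 * 0.8 = 0.936.

def test_nearest_neighbour(keyed_vectors):
    nearest, score = keyed_vectors.most_similar('one', topn=1)[0]
    assert nearest == 'two'
    assert abs(score - 0.936) < 1e-5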
Example #15
 def _init_gensim_model(self, filters):
     filters_uid = hash_data(filters)
     gensim_model_dir = (
         join(self.gen_dir, filters_uid) if filters_uid else self.gen_dir
     )
     gensim_model_path = join(gensim_model_dir, "_gensim_model.bin")
     if exists(gensim_model_path):
         self._gensim_model = KeyedVectors.load(gensim_model_path)
         if filters:
             self._import_filter_info(filters_uid)
         else:
             self._case_insensitive = vocab_case_insensitive(
                 self._gensim_model.index2word
             )
     elif filters:
         makedirs(gensim_model_dir, exist_ok=True)
         source_model_path = join(self.gen_dir, "_gensim_model.bin")
         if exists(source_model_path):
             source_model = KeyedVectors.load(source_model_path)
         else:
             source_model = gensim_data.load(self.name)
             source_model.save(source_model_path)
         source_vocab = source_model.index2word
         self._case_insensitive = vocab_case_insensitive(source_vocab)
         filtered_vocab, filter_report, filter_details = filter_vocab_list(
             source_vocab,
             filters,
             case_insensitive=self._case_insensitive,
             incl_report=True,
         )
         self._export_filter_info(
             uid=filters_uid,
             details=filter_details,
             case_insensitive=self._case_insensitive,
             report=filter_report,
         )
         weights = [
             source_model.get_vector(word) for word in filtered_vocab
         ]
         filtered_model = KeyedVectors(source_model.vector_size)
         filtered_model.add(filtered_vocab, weights)
         self._gensim_model = filtered_model
     else:
         makedirs(gensim_model_dir, exist_ok=True)
         self._gensim_model = gensim_data.load(self.name)
         self._case_insensitive = vocab_case_insensitive(
             self._gensim_model.index2word
         )
     self._gensim_model.save(gensim_model_path)
     self._gen_dir = gensim_model_dir
     self._dim_size = self._gensim_model.vector_size
     self._vocab = [PAD_TOKEN] + self._gensim_model.index2word
     self._vocab_size = len(self._vocab)
     pad_value = [np.zeros(shape=self._dim_size).astype(np.float32)]
     self._vectors = np.concatenate([pad_value, self._gensim_model.vectors])
Example #16
    def __build_sentence_vectors(self, sentences):
        word_vec = self.word_vec
        sent_vec = KeyedVectors(word_vec.vector_size)
        idf_index = self.idf_index

        for sent in sentences:
            sent_vec.add(
                ' '.join(sent),
                np.average([word_vec.get_vector(word) * idf_index[word]
                            for word in sent], 0))
        return sent_vec
Example #17
def embedding_ayir(dizin, kelimeler, model_embedding, hParams):
    model_embedding_kv = KeyedVectors(hParams.embedding_matris_boyut)

    for kelime in kelimeler:
        try:
            model_embedding_kv.add([kelime], [model_embedding.wv[kelime]])
        except:
            pass

    model_embedding_kv.save(dizin)
    return model_embedding_kv
Example #18
def test_cluster_balls(nlp):
    ents, wgts = zip(*[(c.text.lower(), c.vector) for c in (
        nlp("apple"),
        nlp("pear"),
        nlp("orange"),
        nlp("lemon"),
    )])
    model = KeyedVectors(wgts[0].size)
    model.add(ents, list(wgts))
    print(cluster_balls(model))  # is not None  # no root
    print(cluster_balls(model, root="orange"))  # with root
Example #19
def embedding_seperate(path, words, model_embedding, hParams):
	model_embedding_kv = KeyedVectors(hParams.embedding_size)

	for word in words:
		try:
			model_embedding_kv.add([word], [model_embedding.wv[word]])
		except:
			pass

	model_embedding_kv.save(path)
	return model_embedding_kv
Example #20
def concatenate_embeddings(models, padding='random'):
    aligned_models = align_models(models, padding=padding)
    words = aligned_models[0].index2word
    vectors = np.column_stack([emb.vectors for emb in aligned_models])

    ncols = sum([emb.vector_size for emb in models])
    assert vectors.shape == (len(words), ncols)

    vector_size = vectors.shape[1]
    result = KeyedVectors(vector_size)
    result.add(words, vectors)
    return result
Example #21
def load_embedding_dict(pickle_path):
    """
    returns gensim KeyedVectors
    """
    with open(pickle_path, 'rb') as f:
        embedding_dict = pickle.load(f)

    words = [w for w in embedding_dict]
    vectors = [embedding_dict[w] for w in words]
    embedding_dict = KeyedVectors(len(vectors[0]))
    embedding_dict.add(words, vectors)

    return embedding_dict
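
A round-trip sketch showing the expected pickle layout, a plain dict mapping word to vector (file name and values are arbitrary):

import pickle
import numpy as np

toy = {'cat': np.ones(50, dtype='float32'),
       'dog': np.full(50, 0.5, dtype='float32')}
with open('toy_embeddings.pkl', 'wb') as f:
    pickle.dump(toy, f)

kv = load_embedding_dict('toy_embeddings.pkl')
print(kv['cat'][:3])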
Example #22
    def to_keyed_vectors(self):
        '''Export model content to KeyedVectors object'''
        try:
            from gensim.models import KeyedVectors
        except ImportError:
            raise ImportError(
                'You must install gensim for KeyedVectors export')

        keyed_vectors = KeyedVectors(self.dim)
        words = self.keys()
        keyed_vectors.add(words, self.batch_embedding(words))

        return keyed_vectors
Example #23
def make_word2vec_file(filename, model, labels):
    # Get mean word2vec vector for all labels and write them to a file.

    kv = KeyedVectors(vector_size=model.wv.vector_size)
    vec_id_list = range(0, len(labels))

    vectors = []
    for label in labels:
        vec = get_mean_vector(model, label)
        vectors.append(vec)
    kv.add(vec_id_list, vectors)
    kv.save_word2vec_format(filename, binary=False)
    return
Example #24
def save_fasttext(vocab):
    model = FastText.load_word2vec_format('../../corpora/wiki.en.vec')
    # create a new KeyedVectors
    kmodel = KeyedVectors(300)
    loss = 0
    for word in vocab:
        try:
            vec = model[word]
        except:
            loss += 1
            continue
        kmodel.add(word, vec, replace=True)
    print('loss word: ', loss)
    kmodel.save('../../corpora/fasttext.wv')
Example #25
def merge_mapped_embeddings2(embs, modifiers):
    """
    Merge the embeddings into one KeyedVectors instance. The words of each provided embedding space are modified with
    'modifiers' to distinguish them from each other, i.e. modifiers must be a list of the same length as embs.
    :param embs: List of KeyedVectors instances to merge
    :param modifiers: modifiers for the words of each provided embedding space
    :return: merged KeyedVectors instance
    """
    merged_emb = KeyedVectors(100)
    for i, emb in enumerate(embs):
        word_list = [word + modifiers[i] for word in emb.vocab]
        vec_list = [emb[word] for word in emb.vocab]
        merged_emb.add(word_list, vec_list)
    return merged_emb
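
For instance, merging an English and a German space tagged with '_en' and '_de' suffixes; note the hard-coded KeyedVectors(100) above, so both inputs must be 100-dimensional (the vectors here are random placeholders):

import numpy as np
from gensim.models import KeyedVectors

rng = np.random.default_rng(0)
emb_en = KeyedVectors(100)
emb_en.add(['house', 'tree'], rng.normal(size=(2, 100)))
emb_de = KeyedVectors(100)
emb_de.add(['haus', 'baum'], rng.normal(size=(2, 100)))

merged = merge_mapped_embeddings2([emb_en, emb_de], ['_en', '_de'])
print(sorted(merged.vocab))  # ['baum_de', 'haus_de', 'house_en', 'tree_en']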
Example #26
def filter_by_mincount(embeddings: KeyedVectors, min_count):
    """
    Eliminate all words that occur min_count times or fewer. Keep in mind that the word counts are currently not
    copied to the returned KeyedVectors instance.
    """
    words = []
    vectors = []
    for word in embeddings.vocab:
        if embeddings.vocab[word].count > min_count:
            words.append(word)
            vectors.append(embeddings[word])
    filtered = KeyedVectors(embeddings.vector_size)
    filtered.add(words, vectors)
    return filtered
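
Since the function reads embeddings.vocab[word].count, it only makes sense for vectors that carry frequency information, e.g. the .wv of a freshly trained gensim 3.x Word2Vec model; a toy sketch:

from gensim.models import Word2Vec

sentences = [['cat', 'sat'], ['cat', 'ran'], ['dog', 'ran']]
w2v = Word2Vec(sentences, size=10, min_count=1, seed=1)
kept = filter_by_mincount(w2v.wv, min_count=1)
print(list(kept.vocab))  # only the words seen more than once: 'cat' and 'ran'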
Example #27
def main(kv_filepath, vocab_filepath, output_filepath):

    model = KeyedVectors.load_word2vec_format(kv_filepath, binary=True)
    vocab = Vocab(vocab_filepath)
    short_kv = KeyedVectors(vector_size=len(model['hello']))

    for word in vocab.word2int.keys():
        try:
            short_kv.add(word, model[word])
        except KeyError:
            continue

    short_kv.save_word2vec_format(
        os.path.join(output_filepath, 'short-vectors.bin'))
Example #28
def reduce_word2vec_vocab(input_path, output_path, vocab):
    """
    Downsample the vocabulary of word2vec embeddings to reduce storage overhead.
    Given the input path of the embeddings and the vocabulary needed, create
    a new word2vec model that drops words not in the vocabulary, and save the
    resulting model to output_path.
    """
    input_model = KeyedVectors.load_word2vec_format(input_path, binary=True)
    output_model = KeyedVectors(100)
    for word in vocab:
        if word in input_model.vocab:
            output_model.add([word], [input_model[word]])

    output_model.save_word2vec_format(output_path, binary=True)
Example #29
def save_gnews(vocab):
    model = KeyedVectors.load_word2vec_format('../../corpora/GoogleNews-vectors-negative300.bin', binary=True)
    # create a new KeyedVectors
    kmodel = KeyedVectors(300)
    loss = 0
    for word in vocab:
        try:
            vec = model[word]
        except:
            loss += 1
            continue
        kmodel.add(word, vec, replace=True)
    print('loss word: ', loss)
    kmodel.save('../../corpora/gnews.wv')
Example #30
def combine_embeddings(models):
    emb_dict = {}
    for model in models:
        temp_dict = {
            k: model.vectors[v.index]
            for (k, v) in model.vocab.items()
        }
        emb_dict.update(temp_dict)
        # emb_dict = {**emb_dict,**temp_dict}
    emb_sorted = sorted(emb_dict.items(), key=lambda x: x[0])
    words = [item[0] for item in emb_sorted]
    vectors = np.row_stack([item[1] for item in emb_sorted])
    result = KeyedVectors(model.vector_size)
    result.add(words, vectors)
    return result
Example #31
    def test_add_single(self):
        """Test that adding entity in a manual way works correctly."""
        entities = ['___some_entity{}_not_present_in_keyed_vectors___'.format(i) for i in range(5)]
        vectors = [np.random.randn(self.vectors.vector_size) for _ in range(5)]

        # Test `add` on already filled kv.
        for ent, vector in zip(entities, vectors):
            self.vectors.add(ent, vector)

        for ent, vector in zip(entities, vectors):
            self.assertTrue(np.allclose(self.vectors[ent], vector))

        # Test `add` on empty kv.
        kv = EuclideanKeyedVectors(self.vectors.vector_size)
        for ent, vector in zip(entities, vectors):
            kv.add(ent, vector)

        for ent, vector in zip(entities, vectors):
            self.assertTrue(np.allclose(kv[ent], vector))