def build_glove(word2vec, target_files, output_path):
    word2vec1 = KeyedVectors(vector_size=300)
    print(word2vec1.vectors.shape, (len(word2vec1.vocab), word2vec1.vector_size))
    buf1 = []
    buf2 = []
    contains = set()

    def add_buffer(w, f):
        nonlocal buf1, buf2
        if w not in contains:
            buf1.append(w)
            buf2.append(f)
            contains.add(w)

    def clear_buffer():
        nonlocal buf1, buf2
        buf1 = []
        buf2 = []

    for f in target_files:
        for i, s in enumerate(load_json(f), 1):
            sentence = s['description']
            for w in tokenize(sentence):
                w = w.lower()
                if w in word2vec:
                    add_buffer(w, word2vec[w])
            if i % 10 == 0 and len(buf1) > 0:
                word2vec1.add(buf1, buf2, replace=False)
                clear_buffer()

    if len(buf1) > 0:
        word2vec1.add(buf1, buf2, replace=False)

    print(word2vec1.vectors.shape, (len(word2vec1.vocab), word2vec1.vector_size))
    KeyedVectors.save_word2vec_format(word2vec1, output_path, binary=True)

def as_keyed_vectors(self):
    """
    Generate a KeyedVectors instance with all the possible edge embeddings.
    :return: Edge embeddings as a KeyedVectors instance
    """
    edge_generator = combinations_with_replacement(self.kv.index2word, r=2)

    if not self.quiet:
        vocab_size = len(self.kv.vocab)
        total_size = reduce(lambda x, y: x * y, range(1, vocab_size + 2)) / \
                     (2 * reduce(lambda x, y: x * y, range(1, vocab_size)))

        edge_generator = tqdm(edge_generator, desc='Generating edge features', total=total_size)

    # Generate features
    tokens = []
    features = []
    for edge in edge_generator:
        token = str(tuple(sorted(edge)))
        embedding = self._embed(edge)

        tokens.append(token)
        features.append(embedding)

    # Build KV instance
    edge_kv = KeyedVectors(vector_size=self.kv.vector_size)
    edge_kv.add(
        entities=tokens,
        weights=features)

    return edge_kv

def get_most_similar(path, path2, postfix='_80_20_128', n=250):
    """
    Probably deprecated.
    From `path`.model read WordVectors and, for each, find its `n` most similar
    (by cosine similarity) nodes. Save them in `path`_top`n`.pick.
    Filter by the known file.
    """
    wv = KeyedVectors.load(path + path2 + postfix + '.model')
    known = read_instagram_known(path + '.known')
    nxgraph = read_nxgraph(path + path2 + '.edgelist')
    wv2 = KeyedVectors(vector_size=128)

    # filter by known
    for ent, vec in zip(wv.index2entity, wv.vectors):
        if int(ent) in known:
            wv2.add([ent], [vec])

    samples = dict([
        (int(u), [
            (int(i[0]), i[1])
            for i in wv.most_similar([u], topn=n)
            if int(i[0]) not in nxgraph.neighbors(int(u))
        ])
        for u in wv2.vocab.iterkeys()
    ])
    write_pickle(samples, '{}_top{}.pick'.format(path, n))

class MultiKeyedVectorRecSys(KeyedVectorRecSys):
    def train(self, items, **kwargs):
        # Load sub-models
        models = [
            KeyedVectors.load_word2vec_format(fp, **kwargs) for fp in items
        ]
        self.vector_size = np.sum([m.vector_size for m in models])

        # Build new keyed vector model
        self.model = KeyedVectors(vector_size=self.vector_size)

        missing_docs = 0

        # Iterate over all words (in first model)
        for doc_id in models[0].index2word:
            # Stack vectors from all models
            models_vec = []
            for m in models:
                if doc_id in m.index2word:
                    models_vec.append(m.get_vector(doc_id))
                else:
                    # Use zero-vector if doc id does not exist
                    # print(f'WARNING: {doc_id} does not exist in {m}')
                    models_vec.append(np.zeros((m.vector_size)))
                    missing_docs += 1

            vec = np.hstack(models_vec)
            self.model.add(doc_id, vec)

        if missing_docs > 0:
            logger.warning(f'Missing documents: {missing_docs}')

        return self.model

def as_gensim(self):
    from gensim.models import KeyedVectors

    model = KeyedVectors(self.pq.dim)
    weights = self.pq.unpack()  # warning! memory heavy
    model.add(self.vocab.words, weights)
    return model

def show_similar(embeds, labels, n_examples=10, n_nearby=6):
    # Gather a random set of queries (sentence embeddings that we'll compare against)
    query_label_idx = random.randint(0, len(labels) - n_examples)
    query_vectors = embeds[query_label_idx:query_label_idx + n_examples, :]

    # Find indices for the embeddings that are nearest to the queries
    t = time.time()
    indices, dists = _find_nearest(query_vectors, embeds, n_nearby, batch_size=1000000)
    print indices.shape, dists.shape
    t = time.time() - t

    query_sentences = labels[query_label_idx:query_label_idx + dists.shape[0]]
    for query_num, query_sentence in enumerate(query_sentences):
        print "*******************************************************************"
        print query_sentence
        dist_row = dists[query_num, :]
        index_row = indices[query_num, :]
        for dist, idx in zip(dist_row, index_row):
            print dist, labels[idx]

    print 'Took {} seconds ({} s/per query)'.format(t, t / n_examples)
    return

    # TODO: gensim is much faster but doesn't handle duplicate entries very well, and goes OOM
    kv = KeyedVectors(embeds.shape[-1])
    kv.add(labels, embeds)
    random_labels = random.sample(labels, 10)
    for label in random_labels:
        print label
        for tup in kv.most_similar(label):
            print tup
        print '--------------------------------------------------'

def load(cls, token2id, test=False, limit=None):
    embed_shape = (len(token2id), 300)
    freqs = np.zeros((len(token2id)), dtype='f')

    if test:
        np.random.seed(0)
        vectors = np.random.normal(0, 1, embed_shape)
        vectors[0] = 0
        vectors[len(token2id) // 2:] = 0
    else:
        vectors = np.zeros(embed_shape, dtype='f')
        path = f'{os.environ["DATADIR"]}/{cls.path}'
        for i, o in enumerate(open(path, encoding="utf8", errors='ignore')):
            token, *vector = o.split(' ')
            token = str.lower(token)
            if token not in token2id or len(o) <= 100:
                continue
            if limit is not None and i > limit:
                break
            freqs[token2id[token]] += 1
            vectors[token2id[token]] += np.array(vector, 'f')
        vectors[freqs != 0] /= freqs[freqs != 0][:, None]

    vec = KeyedVectors(300)
    vec.add(list(token2id.keys()), vectors, replace=True)
    return vec

class SentenceTransformerRecSys(KeyedVectorRecSys):
    model_name_or_path = None
    batch_size = 12
    language_model = None

    def train(self, texts: List):
        from sentence_transformers import SentenceTransformer

        # load sentence transformer model
        if not self.language_model:
            logger.info(
                f'Loading Sentence Transformer: {self.model_name_or_path}')
            self.language_model = SentenceTransformer(self.model_name_or_path)

        # reset doc vector model
        self.model = KeyedVectors(
            vector_size=self.language_model.get_sentence_embedding_dimension())

        # encode
        sentence_embeddings = self.language_model.encode(
            texts,
            batch_size=self.batch_size,
            show_progress_bar=self.print_progress)

        # save into keyed vector
        for idx, vec in enumerate(sentence_embeddings):
            self.model.add([str(self.idx2doc_id[idx])], [vec])

        return self.model

def cluster_chunks(
    chunks: List[Span],
    stopwords: bool = False,
    filter_pos: List[str] = None,
    min_score: float = None,
):
    """
    Cluster chunks by using a revisited **Radial Ball Mapper** algorithm.

    Parameters
    ----------
    chunks : List[Span]
        Chunks to cluster.
    stopwords : bool, optional
        Flag to exclude stopwords from chunks, by default False.
    filter_pos : List[str], optional
        POS tags to filter chunk words, by default None.
    min_score : float, optional
        Threshold for clustering chunks, by default None.

    Returns
    -------
    List[List[Span]]
        Clusters of chunks.
    """
    key2index, key2vector = _map_key_to_vector(chunks, stopwords, filter_pos)
    if not key2index or not key2vector:
        return

    model = KeyedVectors(chunks[0].vector.size)
    keys = list(key2vector.keys())
    weights = list(key2vector.values())
    model.add(keys, weights)

    clusters = cluster_balls_multi(model, keys, min_score=min_score)
    return [[chunks[key2index[i]] for i in cluster] for cluster in clusters]

def new_w2v():
    pkl_file = open("%s/data.para" % TPS, 'rb')
    vocab_u, vocab_i = load_vocabulary(pkl_file)
    # print(vocab_u)
    print(len(vocab_u))
    print(len(vocab_i))
    print(vocab_u['love'])

    all_words = set()
    all_words = all_words.union(set(vocab_u.keys()))
    print(len(all_words))
    all_words = all_words.union(set(vocab_i.keys()))
    print(len(all_words))
    length = len(all_words)

    w2v_model = KeyedVectors.load_word2vec_format(
        'E:/embedding/GoogleNews-vectors-negative300.bin', binary=True)
    word_list = list(all_words)
    embeds_list = []
    miss = set()
    for w in word_list:
        if w in w2v_model:
            # in_set.add(w)
            embeds = w2v_model[w]
        else:
            miss.add(w)
            embeds = np.random.uniform(-0.25, 0.25, 300)
        embeds_list.append(embeds)
    print("miss:", len(miss) / len(all_words))

    new_w2v = KeyedVectors(300)
    new_w2v.add(word_list, embeds_list)
    new_w2v.save_word2vec_format("%s/google.w2v.bin" % TPS, binary=True)

def main(args, metrics):
    graph_dataset = MAGDataset(name="", path=args.data_path, raw=False)
    metrics = [getattr(module_metric, met) for met in metrics]
    pre_metric = partial(module_metric.obtain_ranks, mode=0)

    full_graph = graph_dataset.g_full.to_networkx()
    core_subgraph = get_holdout_subgraph(graph_dataset.train_node_ids, full_graph)
    pseudo_leaf_node = -1
    for node in list(core_subgraph.nodes()):
        core_subgraph.add_edge(node, pseudo_leaf_node)

    node2descendants = {
        n: set(descendants(core_subgraph, n)) for n in core_subgraph.nodes
    }
    candidate_positions = list(
        set(
            chain.from_iterable([[(n, d) for d in ds]
                                 for n, ds in node2descendants.items()])))

    edge2nbs = {}
    for u, v in candidate_positions:
        pu = set(core_subgraph.predecessors(u))
        cu = set(core_subgraph.successors(u))
        if v == pseudo_leaf_node:
            pv = set()
            cv = set()
        else:
            pv = set(core_subgraph.predecessors(v))
            cv = set(core_subgraph.successors(v))
        nbs = pu.union(pv).union(cu).union(cv)
        if pseudo_leaf_node in nbs:
            nbs.remove(pseudo_leaf_node)
        edge2nbs[(u, v)] = list(map(str, nbs))

    holdout_subgraph = get_holdout_subgraph(
        graph_dataset.train_node_ids + graph_dataset.test_node_ids, full_graph)
    node2pos = find_insert_posistion(graph_dataset.test_node_ids, core_subgraph,
                                     holdout_subgraph, pseudo_leaf_node)

    node_features = graph_dataset.g_full.ndata['x']
    node_features = F.normalize(node_features, p=2, dim=1)
    kv = KeyedVectors(vector_size=node_features.shape[1])
    kv.add([str(i) for i in range(len(node_features))], node_features.numpy())

    all_ranks = []
    for node in tqdm(graph_dataset.test_node_ids):
        dists = distances(str(node), candidate_positions, edge2nbs, kv,
                          pseudo_leaf_node)
        scores, labels = rearrange(torch.Tensor(dists), candidate_positions,
                                   node2pos[node])
        all_ranks.extend(pre_metric(scores, labels))

    total_metrics = [metric(all_ranks) for metric in metrics]
    for i, mtr in enumerate(metrics):
        print(' {:15s}: {}'.format(mtr.__name__, total_metrics[i]))
    return

def make_bert_sentence_file(filename, bert_sent_model, labels, vec_size=300):
    # Get all the sentence embeddings for the labels and write them to a file.
    embeddings = get_sentence_bert(bert_sent_model, labels)
    kv = KeyedVectors(vector_size=vec_size)
    vec_id_list = range(0, len(labels))
    kv.add(vec_id_list, embeddings)
    kv.save_word2vec_format(filename, binary=False)
    return

def dict_to_gensim(embeddings):
    if len(embeddings) == 0:
        raise ValueError("Empty embedding dictionary")
    words = list(embeddings.keys())
    vectors = np.row_stack(list(embeddings.values()))
    result = KeyedVectors(vectors.shape[1])
    result.add(words, vectors)
    return result

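# Usage sketch for dict_to_gensim above (added here, not from the original source):
# the toy words and 3-dimensional vectors are invented for illustration, and the
# gensim 3.x KeyedVectors.add API is assumed, as in the rest of these snippets.
import numpy as np

toy_embeddings = {
    "cat": np.array([1.0, 0.0, 0.0], dtype=np.float32),
    "dog": np.array([0.9, 0.1, 0.0], dtype=np.float32),
    "car": np.array([0.0, 0.0, 1.0], dtype=np.float32),
}
kv = dict_to_gensim(toy_embeddings)
print(kv.most_similar("cat", topn=1))  # "dog" should rank closest by cosine similarity
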
def keyed_vectors():
    model = KeyedVectors(5)
    words = ["cat", "dog", "foo", "bar", "one", "two"]
    vectors = np.array([[0, 0, 0, 0, 1],
                        [0, 0, 0, 0.28, 0.96],
                        [1, 0, 0, 0, 0],
                        [0, 0, 1, 0, 0],
                        [0.28, 0.96, 0, 0, 0],
                        [0.6, 0.8, 0, 0, 0]])
    model.add(words, vectors)
    return model

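# Illustration of the fixture above (added here, not part of the original test code):
# with these hand-built unit vectors, "dog" is the nearest neighbour of "cat"
# (cosine similarity 0.96) and "two" is the nearest neighbour of "one".
kv = keyed_vectors()
print(kv.most_similar("cat", topn=1))  # [('dog', 0.96...)]
print(kv.most_similar("one", topn=1))  # [('two', ...)]
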
def _init_gensim_model(self, filters):
    filters_uid = hash_data(filters)
    gensim_model_dir = (
        join(self.gen_dir, filters_uid) if filters_uid else self.gen_dir
    )
    gensim_model_path = join(gensim_model_dir, "_gensim_model.bin")
    if exists(gensim_model_path):
        self._gensim_model = KeyedVectors.load(gensim_model_path)
        if filters:
            self._import_filter_info(filters_uid)
        else:
            self._case_insensitive = vocab_case_insensitive(
                self._gensim_model.index2word
            )
    elif filters:
        makedirs(gensim_model_dir, exist_ok=True)
        source_model_path = join(self.gen_dir, "_gensim_model.bin")
        if exists(source_model_path):
            source_model = KeyedVectors.load(source_model_path)
        else:
            source_model = gensim_data.load(self.name)
            source_model.save(source_model_path)
        source_vocab = source_model.index2word
        self._case_insensitive = vocab_case_insensitive(source_vocab)
        filtered_vocab, filter_report, filter_details = filter_vocab_list(
            source_vocab,
            filters,
            case_insensitive=self._case_insensitive,
            incl_report=True,
        )
        self._export_filter_info(
            uid=filters_uid,
            details=filter_details,
            case_insensitive=self._case_insensitive,
            report=filter_report,
        )
        weights = [
            source_model.get_vector(word) for word in filtered_vocab
        ]
        filtered_model = KeyedVectors(source_model.vector_size)
        filtered_model.add(filtered_vocab, weights)
        self._gensim_model = filtered_model
    else:
        makedirs(gensim_model_dir, exist_ok=True)
        self._gensim_model = gensim_data.load(self.name)
        self._case_insensitive = vocab_case_insensitive(
            self._gensim_model.index2word
        )
        self._gensim_model.save(gensim_model_path)

    self._gen_dir = gensim_model_dir
    self._dim_size = self._gensim_model.vector_size
    self._vocab = [PAD_TOKEN] + self._gensim_model.index2word
    self._vocab_size = len(self._vocab)
    pad_value = [np.zeros(shape=self._dim_size).astype(np.float32)]
    self._vectors = np.concatenate([pad_value, self._gensim_model.vectors])

def __build_sentence_vectors(self, sentences):
    word_vec = self.word_vec
    sent_vec = KeyedVectors(word_vec.vector_size)
    idf_index = self.idf_index
    for sent in sentences:
        sent_vec.add(
            ' '.join(sent),
            np.average([word_vec.get_vector(word) * idf_index[word]
                        for word in sent], 0))
    return sent_vec

def embedding_ayir(dizin, kelimeler, model_embedding, hParams):
    # Keep only the embeddings for the given words and save them to `dizin`
    # (Turkish identifiers: dizin = path, kelimeler = words).
    model_embedding_kv = KeyedVectors(hParams.embedding_matris_boyut)
    for kelime in kelimeler:
        try:
            model_embedding_kv.add([kelime], [model_embedding.wv[kelime]])
        except:
            pass
    model_embedding_kv.save(dizin)
    return model_embedding_kv

def test_cluster_balls(nlp):
    ents, wgts = zip(*[(c.text.lower(), c.vector) for c in (
        nlp("apple"),
        nlp("pear"),
        nlp("orange"),
        nlp("lemon"),
    )])
    model = KeyedVectors(wgts[0].size)
    model.add(ents, list(wgts))
    print(cluster_balls(model))                 # is not None; no root
    print(cluster_balls(model, root="orange"))  # with root

def embedding_seperate(path, words, model_embedding, hParams):
    model_embedding_kv = KeyedVectors(hParams.embedding_size)
    for word in words:
        try:
            model_embedding_kv.add([word], [model_embedding.wv[word]])
        except:
            pass
    model_embedding_kv.save(path)
    return model_embedding_kv

def concatenate_embeddings(models, padding='random'):
    aligned_models = align_models(models, padding=padding)
    words = aligned_models[0].index2word
    vectors = np.column_stack([emb.vectors for emb in aligned_models])
    ncols = sum([emb.vector_size for emb in models])
    assert vectors.shape == (len(words), ncols)
    vector_size = vectors.shape[1]
    result = KeyedVectors(vector_size)
    result.add(words, vectors)
    return result

def load_embedding_dict(pickle_path):
    """
    Returns gensim KeyedVectors
    """
    with open(pickle_path, 'rb') as f:
        embedding_dict = pickle.load(f)
    words = [w for w in embedding_dict]
    vectors = [embedding_dict[w] for w in words]
    embedding_dict = KeyedVectors(len(vectors[0]))
    embedding_dict.add(words, vectors)
    return embedding_dict

def to_keyed_vectors(self):
    '''Export model content to a KeyedVectors object'''
    try:
        from gensim.models import KeyedVectors
    except ImportError:
        raise ImportError(
            'You must install gensim for KeyedVectors export')
    keyed_vectors = KeyedVectors(self.dim)
    words = self.keys()
    keyed_vectors.add(words, self.batch_embedding(words))
    return keyed_vectors

def make_word2vec_file(filename, model, labels):
    # Get mean word2vec vector for all labels and write them to a file.
    kv = KeyedVectors(vector_size=model.wv.vector_size)
    vec_id_list = range(0, len(labels))
    vectors = []
    for label in labels:
        vec = get_mean_vector(model, label)
        vectors.append(vec)
    kv.add(vec_id_list, vectors)
    kv.save_word2vec_format(filename, binary=False)
    return

def save_fasttext(vocab):
    model = FastText.load_word2vec_format('../../corpora/wiki.en.vec')
    # Create a new KeyedVectors instance
    kmodel = KeyedVectors(300)
    loss = 0
    for word in vocab:
        try:
            vec = model[word]
        except:
            loss += 1
            continue
        kmodel.add(word, vec, replace=True)
    print('loss word: ', loss)
    kmodel.save('../../corpora/fasttext.wv')

def merge_mapped_embeddings2(embs, modifiers):
    """
    Merge the embeddings into one KeyedVectors instance. Modify the words of each
    provided embedding space with 'modifiers' to distinguish them from each other,
    i.e. modifiers must be a list with the same size as embs.

    :param embs: list of KeyedVectors instances to merge
    :param modifiers: modifiers for the words of each provided embedding space
    :return: merged KeyedVectors instance
    """
    merged_emb = KeyedVectors(100)  # assumes all input embeddings are 100-dimensional
    for i, emb in enumerate(embs):
        word_list = [word + modifiers[i] for word in emb.vocab]
        vec_list = [emb[word] for word in emb.vocab]
        merged_emb.add(word_list, vec_list)
    return merged_emb

def filter_by_mincount(embeddings: KeyedVectors, min_count):
    """
    Eliminate all words that do not occur more than min_count times.
    Keep in mind that the counts of words are currently not copied to the
    returned KeyedVectors instance.
    """
    words = []
    vectors = []
    for word in embeddings.vocab:
        if embeddings.vocab[word].count > min_count:
            words.append(word)
            vectors.append(embeddings[word])
    filtered = KeyedVectors(embeddings.vector_size)
    filtered.add(words, vectors)
    return filtered

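# Usage sketch for filter_by_mincount above (added here, not from the original source):
# the toy corpus is invented, and the gensim 3.x Word2Vec/KeyedVectors API
# (`size` keyword, `.vocab[word].count`) is assumed.
from gensim.models import Word2Vec

sentences = [["hello", "world"], ["hello", "gensim"], ["hello", "world"]]
w2v = Word2Vec(sentences, size=8, min_count=1)
kept = filter_by_mincount(w2v.wv, min_count=1)  # drops words seen only once
print(kept.index2word)  # expected: ['hello', 'world'] (order may vary)
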
def main(kv_filepath, vocab_filepath, output_filepath):
    model = KeyedVectors.load_word2vec_format(kv_filepath, binary=True)
    vocab = Vocab(vocab_filepath)
    short_kv = KeyedVectors(vector_size=len(model['hello']))
    for word in vocab.word2int.keys():
        try:
            short_kv.add(word, model[word])
        except KeyError:
            continue
    short_kv.save_word2vec_format(
        os.path.join(output_filepath, 'short-vectors.bin'))

def reduce_word2vec_vocab(input_path, output_path, vocab):
    """
    Downsample the vocabulary of word2vec embeddings to reduce storage overhead.
    Given the input path of the embeddings and the vocabulary needed, create a new
    word2vec model removing words not in the vocabulary. Save the resulting model
    in output_path.
    """
    input_model = KeyedVectors.load_word2vec_format(input_path, binary=True)
    output_model = KeyedVectors(input_model.vector_size)
    for word in vocab:
        if word in input_model.vocab:
            output_model.add([word], [input_model[word]])
    output_model.save_word2vec_format(output_path, binary=True)

def save_gnews(vocab):
    model = KeyedVectors.load_word2vec_format(
        '../../corpora/GoogleNews-vectors-negative300.bin', binary=True)
    # Create a new KeyedVectors instance
    kmodel = KeyedVectors(300)
    loss = 0
    for word in vocab:
        try:
            vec = model[word]
        except:
            loss += 1
            continue
        kmodel.add(word, vec, replace=True)
    print('loss word: ', loss)
    kmodel.save('../../corpora/gnews.wv')

def combine_embeddings(models):
    emb_dict = {}
    for model in models:
        temp_dict = {
            k: model.vectors[v.index] for (k, v) in model.vocab.items()
        }
        emb_dict.update(temp_dict)
        # emb_dict = {**emb_dict, **temp_dict}
    emb_sorted = sorted(emb_dict.items(), key=lambda x: x[0])
    words = [item[0] for item in emb_sorted]
    vectors = np.row_stack([item[1] for item in emb_sorted])
    result = KeyedVectors(model.vector_size)
    result.add(words, vectors)
    return result

def test_add_single(self):
    """Test that adding entity in a manual way works correctly."""
    entities = ['___some_entity{}_not_present_in_keyed_vectors___'.format(i) for i in range(5)]
    vectors = [np.random.randn(self.vectors.vector_size) for _ in range(5)]

    # Test `add` on already filled kv.
    for ent, vector in zip(entities, vectors):
        self.vectors.add(ent, vector)

    for ent, vector in zip(entities, vectors):
        self.assertTrue(np.allclose(self.vectors[ent], vector))

    # Test `add` on empty kv.
    kv = EuclideanKeyedVectors(self.vectors.vector_size)
    for ent, vector in zip(entities, vectors):
        kv.add(ent, vector)

    for ent, vector in zip(entities, vectors):
        self.assertTrue(np.allclose(kv[ent], vector))