#!/usr/bin/python3
# coding: utf-8

from gensim.models.keyedvectors import KeyedVectors

model = KeyedVectors.load_word2vec_format('~/datasets/WordVec/GoogleNews/GoogleNews-vectors-negative300.bin', binary=True)
model.save_word2vec_format('~/datasets/WordVec/GoogleNews/GoogleNews-vectors-negative300', binary=False)
def ensemble_embedding(self, word_embedding, context_embedding):
    """Replace current syn0 with the sum of context and word embeddings.

    Parameters
    ----------
    word_embedding : str
        Path to word embeddings in GloVe format.
    context_embedding : str
        Path to context embeddings in word2vec_format.

    Returns
    -------
    numpy.ndarray
        Matrix with new embeddings.

    """
    glove2word2vec(context_embedding, context_embedding + '.w2vformat')
    w_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % word_embedding)
    c_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % context_embedding)
    # compare vocab words using keys of dict vocab
    assert set(w_emb.vocab) == set(c_emb.vocab), 'Vocabs are not same for both embeddings'

    # sort context embedding to have words in same order as word embedding
    prev_c_emb = copy.deepcopy(c_emb.syn0)
    for word_id, word in enumerate(w_emb.index2word):
        c_emb.syn0[word_id] = prev_c_emb[c_emb.vocab[word].index]
    # add vectors of the two embeddings
    new_emb = w_emb.syn0 + c_emb.syn0
    self.syn0 = new_emb
    return new_emb
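# Usage sketch for ensemble_embedding() above (the method appears to come from a
# WordRank-style wrapper). Note that it loads '<word_embedding>.w2vformat' directly,
# so that file is assumed to have been converted by the caller already; `wr_model`
# and both file paths below are placeholder names, not part of the original source.
combined = wr_model.ensemble_embedding('wordrank.words', 'wordrank.contexts')
print(combined.shape)  # one row per vocabulary word, one column per vector dimension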
def test_type_conversion(self):
    path = datapath('high_precision.kv.txt')
    binary_path = datapath('high_precision.kv.bin')
    model1 = KeyedVectors.load_word2vec_format(path, datatype=np.float16)
    model1.save_word2vec_format(binary_path, binary=True)
    model2 = KeyedVectors.load_word2vec_format(binary_path, datatype=np.float64, binary=True)
    self.assertAlmostEqual(model1["horse.n.01"][0], np.float16(model2["horse.n.01"][0]))
    self.assertEqual(model1["horse.n.01"][0].dtype, np.float16)
    self.assertEqual(model2["horse.n.01"][0].dtype, np.float64)
def load_w2v_model(self):
    # Load pre-trained word2vec model
    model_loc = os.path.join(os.getcwd(), get_str('devise', 'w2v_model_name'))
    word_vectors = KeyedVectors.load_word2vec_format(model_loc, binary=True)
    # Get dimensions of word vector
    word_dim = word_vectors['the'].shape[0]
    return word_vectors, word_dim
def load_GNews_model():
    """
    Convenience function for loading the pre-trained Google News word2vec
    model vectors published with the original work.

    For more information see: https://code.google.com/archive/p/word2vec/
    """
    model = KeyedVectors.load_word2vec_format('rdata/GoogleNews-vectors-negative300.bin', binary=True)
    return model
def create_and_load_dic(self):
    model = KeyedVectors.load_word2vec_format(self.files_path + '.bin', binary=True)
    kmeans = cluster.KMeans(n_clusters=self.num_clusters)
    kmeans.fit(model.wv.vectors)
    self.w2v_dic = dict(zip(model.wv.index2word, zip(model.wv.vectors, kmeans.labels_)))
    output = open(self.files_path + '.pkl', 'wb')
    pickle.dump(self.w2v_dic, output)
    output.close()
def __init__(self, config=None):
    super().__init__()
    self.embedding_path = config.get("embedding_path")
    self.embedding_type = config.get("embedding_type")
    if self.embedding_path is None or self.embedding_path == "":
        raise ValueError("Embedding_path is expected.")
    is_binary = True if self.embedding_type == "bin" else False
    from gensim.models.keyedvectors import KeyedVectors
    self.embedding = KeyedVectors.load_word2vec_format(self.embedding_path, binary=is_binary)
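# Usage sketch for the config-driven loader above; the class name EmbeddingLoader
# and both config values are illustrative assumptions, not from the original module.
loader = EmbeddingLoader(config={
    "embedding_path": "GoogleNews-vectors-negative300.bin",  # example path
    "embedding_type": "bin",  # anything other than "bin" is loaded as text format
})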
def load_kv(filename=None, path=None, limit=None):
    if path is not None:
        return KeyedVectors.load_word2vec_format(
            path, binary=True, limit=limit)
    elif filename is not None:
        for dir_path in ASSET_SEARCH_DIRS:
            try:
                path = os.path.join(dir_path, filename)
                return KeyedVectors.load_word2vec_format(
                    path, binary=True, limit=limit)
            except FileNotFoundError:
                continue
        raise FileNotFoundError(
            "Please make sure that 'filename' specifies the word vector "
            "binary name in the default search paths, or that 'path' "
            "specifies the file path of the binary.")
    else:
        raise TypeError(
            "load_kv() requires either 'filename' or 'path' to be set.")
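# Usage sketch for load_kv(), assuming ASSET_SEARCH_DIRS already lists the
# directories to probe; both file names below are illustrative, not part of the
# original module.
vectors = load_kv(filename="GoogleNews-vectors-negative300.bin", limit=200000)
# or bypass the search path entirely with an absolute location:
vectors = load_kv(path="/data/embeddings/custom-vectors.bin")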
def load_word2vec_model_from_path(self):
    """
    Load Word2Vec model

    Returns: the Word2Vec model
    """
    word_embeddings_model = KeyedVectors.load_word2vec_format(
        self.word2vec_model_path, binary=True)
    if not word_embeddings_model:
        return None
    return word_embeddings_model
def _load_word2vec(path, limit=500000):
    """
    Init word2vec model
    :param path: path to the model
    :param limit: optional
    :return: word2vec model
    """
    print('Loading the semantic word model...')
    w2v = KeyedVectors.load_word2vec_format(path, binary=True, unicode_errors='ignore', limit=limit)
    w2v.init_sims(replace=True)
    print('Loading finished')
    return w2v
from gensim.models.keyedvectors import KeyedVectors
import logging
from scipy import stats
import numpy as np
from sklearn import metrics

file = input('The vector file:')
model = KeyedVectors.load_word2vec_format(file, binary=False)

# verbs
similar = model.most_similar('击败')
print('击败:')
print(similar)
print('\n')

similar = model.most_similar('引用')
print('引用:')
print(similar)
print('\n')

similar = model.most_similar('研究')
print('研究:')
print(similar)
print('\n')

similar = model.most_similar('形成')
print('形成:')
print(similar)
print('\n')

similar = model.most_similar('增加')
print('增加:')
print(similar)
print('\n')
import numpy as np
import pickle
import gensim
import time
from gensim.models.keyedvectors import KeyedVectors

t0 = time.time()
path = "glove.6B.50d.txt.w2v"
glove = KeyedVectors.load_word2vec_format(path, binary=False)
t1 = time.time()
print("loaded word vectors in ", t1 - t0)


class Word_Association:
    """
    Capable of playing a simple word association game using word embeddings.

    The computer starts by giving a random word, then the user gives a related
    word. This continues until the user gives a word that isn't related or has
    been used already, or the computer can't come up with a related word that
    hasn't been used.
    """

    def __init__(self, seed=None, level=1):
        """
        Creates a game session.

        Parameters
        ----------
        seed: str
            the word on which the game will start (random if None)
        level: int
            the difficulty level (1 is recommended, higher is harder)
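# A minimal sketch of one computer turn for the game described in the class
# docstring above, using the `glove` vectors loaded earlier. The helper name
# pick_reply and the 0.5 relatedness threshold are illustrative assumptions,
# not taken from the original class.
def pick_reply(previous_word, used_words, threshold=0.5):
    """Return a related, unused word, or None if nothing clears the threshold."""
    for candidate, score in glove.most_similar(previous_word, topn=20):
        if score >= threshold and candidate.lower() not in used_words:
            return candidate
    return None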
def get_w2v():
    model = KeyedVectors.load_word2vec_format('word2vec.bin', binary=True)
    return model
        y_true = sample['labels'].numpy()
        if task not in recalls:
            recalls[task] = 0.
        recalls[task] += get_recall(y_true, y)
        if task not in counts:
            counts[task] = 0
        counts[task] += 1
    recalls = {task: recall / counts[task] for task, recall in recalls.items()}
    if save_argmax:
        return argmaxes
    else:
        return recalls


print('Loading word vectors...')
we = KeyedVectors.load_word2vec_format(args.we_path, binary=1)
testset = CrossTask(
    data_path=args.data_path,
    features_path=args.features_path,
    features_path_3D=args.features_path_3D,
    we=we,
    feature_framerate=args.feature_framerate,
    feature_framerate_3D=args.feature_framerate_3D,
    we_dim=args.we_dim,
    max_words=args.max_words,
)
testloader = DataLoader(
    testset,
    batch_size=1,
    num_workers=args.num_thread_reader,
    shuffle=False,
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from flask import Flask
import urllib
import numpy as np
import json
from elasticsearch import Elasticsearch

app = Flask(__name__)

# b = Word2Vec.load('tmp/brown.bin')
# g = Word2Vec.load('tmp/GoogleNews-vectors-negative300.bin')
g = KeyedVectors.load_word2vec_format('tmp/GoogleNews-vectors-negative300.bin', binary=True)
log = []


@app.route("/")
def hello():
    return "OK"


def ConvertVectorSetToVecAverageBased(vectorSet, ignore=[]):
    if len(ignore) == 0:
        return np.mean(vectorSet, axis=0)
    else:
        return np.dot(np.transpose(vectorSet), ignore) / sum(ignore)


def phrase_similarity(_phrase_1, _phrase_2):
    phrase_1 = _phrase_1.split(" ")
    phrase_2 = _phrase_2.split(" ")
def main(schema_path, word_embeddings_path, output_path, header=True, zero_vector=False):
    """The logic of the script."""
    if header:
        print "Skipping header row."
    else:
        print "No header row."

    # Initialize the gensim model.
    print "Loading word vectors. This may take a moment."
    start = time.time()
    gensim_model = KeyedVectors.load_word2vec_format(word_embeddings_path, binary=False)
    print "Model loaded in %0.3f seconds." % (time.time() - start)
    print "Word vectors loaded."
    try:
        _ = gensim_model["unk"]
    except:
        print "unk not in model"

    # We'll probably have to look up the same table and field names many times;
    # may as well cache them.
    cached_vectors = {}

    # Read in the schema CSV.
    all_vectors = []
    with open(schema_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            if header:
                header = False
                continue
            if not len(row) == NUM_FIELDS:
                print "Expected %d fields, but found %d; skipping row." % (NUM_FIELDS, len(row))
                print "\t" + ", ".join(row)
                continue
            # Get word vectors for table names and field names
            from_table_name = row[0].strip()
            from_field_name = row[1].strip()
            to_table_name = row[2].strip()
            to_field_name = row[3].strip()
            vectors = []
            for name in [from_table_name, to_table_name, from_field_name, to_field_name]:
                if name not in cached_vectors:
                    cached_vectors[name] = get_word_vector(name, gensim_model)
                current_vector = cached_vectors[name]
                vectors.append(current_vector)
            # Concatenate the four vectors
            entire_vector = np.concatenate(vectors)
            all_vectors.append(entire_vector)

    if zero_vector:
        print "Appending zero vector"
        zeros = np.zeros_like(all_vectors[0])
        all_vectors.append(zeros)

    matrix = np.stack(all_vectors)
    print "schema_map shape: %s" % str(matrix.shape)
    # Save matrix
    np.save(output_path, matrix)
from gensim.models.keyedvectors import KeyedVectors
from gensim.test.utils import datapath
from pathlib import Path

model = KeyedVectors.load_word2vec_format(Path.cwd() / 'model_test.txt', binary=False)
print('女生')
print(model.most_similar(positive='女生'))
print('----------------')
print('中国')
print(model.most_similar(positive='中国'))
# project = "db4o"
# cs_packages = ["Db4objects.", "Db4oUnit"]
# java_packages = ["db4o."]

usage_type = "method"

with open(URL, "r") as f:
    data = f.readlines()

keys = list()
for line in data:
    line = line.strip()
    splits = line.split("-")
    keys.append(splits[0])

cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_global_local.txt", binary=False)
java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_global_local.txt", binary=False)

for key in keys:
    try:
        vector = java_vectors[key]
        k_nearest = cs_vectors.similar_by_vector(vector, topn=50)
        relevant_k = list()
        for k in k_nearest:
            if check_if_token_is_method_signature(k[0]) == True:
                # if check_package_include(java_packages, k[0]) == True:
                relevant_k.append(k[0])
        if len(relevant_k) != 0:
    labels, test_size=0.3, random_state=42)
classes = sorted(list(set(y_train)))
print(classes)
print('train', len(x_train))
print('test', len(x_test))

del labels
del features

neigh = KNeighborsClassifier(n_neighbors=50)
neigh.fit(x_train, y_train)

print('loading model')
# load google pretrained word2vec model
w2v_model = KeyedVectors.load_word2vec_format(
    './word2vec_model/GoogleNews-vectors-negative300.bin', binary=True)

from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk.data
import operator, functools

# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# English stop words
stops = set(stopwords.words("english"))


def get_deep_features(text):
    # Remove HTML
def exampleTest():
    # set up corpus from classifier data file
    known = set()
    corpus = pd.read_csv(os.path.join("data", "classifier_data.csv"))
    corpus = corpus.fillna('')  # eliminates all NaN boxes from spreadsheet
    for i in range(len(corpus)):
        questions = corpus.iloc[i]['question'].split('\r\n')
        answer = corpus.iloc[i]['text']
        for question in questions:
            tokens = tokenize(question)
            known.update(tokens)
        answer_tokens = tokenize(answer)
        known.update(answer_tokens)

    # corpus_model = Word2Vec([known], min_count=1)
    # print(str(corpus_model.most_similar(positive=['navy'], topn=5)))
    # word_vectors = corpus_model.wv
    # print(str(word_vectors.most_similar(positive=['navy'], topn=5)))

    google_model = KeyedVectors.load_word2vec_format(os.path.join('..', 'GoogleNews-vectors-negative300.bin'), binary=True)
    allWords = google_model.index2word
    modelData = {w: google_model[w] for w in allWords}
    origModel = DictVectorModel(modelData)
    # google_model = KeyedVectors.load_word2vec_format(os.path.join('..', 'GoogleNews-vectors-negative300.bin'), binary=True)

    vocab_obj = google_model.vocab["word"]
    print("count of word in keyed: " + str(vocab_obj.count))
    print("index of word in keyed: " + str(vocab_obj.index))
    allWords = google_model.index2word
    print("First word: " + str(allWords[1]))
    # totalCount = len(allWords)*(len(allWords)-1)/2.0
    totalCount = len(allWords)
    print("total words from totalCount: " + str(totalCount))
    # print(str(allWords))

    # create a pandas DataFrame with columns labeled
    # list_ = []
    # limit_array_length = 0
    # for w in google_model.index2word:
    #     if limit_array_length <= 1000:
    #         list_.append(w)
    #         limit_array_length += 1
    #     # print("w in google model " + str(w))
    # frame = pd.DataFrame(np.reshape(np.array(list_), -1), columns=["Word"])
    # frame = frame.drop_duplicates(['Word'], keep="first")
    # frame.to_csv("../first_words.csv", index=None)

    limit_array_length = 0
    freqs = {w: zipfWordFrequency(google_model.vocab[w], totalCount) for w in allWords}
    modelData = {w: google_model[w] for w in allWords}
    origModel = DictVectorModel(modelData)
    model = origModel
    thesaurus = {}
    # print(model._modelDict['computer'])
    # v1 = model._modelDict['the']  # for later comparison with reduced dict

    # Config Values
    maxDim = 4
    '''
    FREQUENCY for word counts with Google news vector (3 million words)
    calculated using FrequencyMetrics.py
    25000 words:   2.6820216255e-06
    50000 words:   1.34101081275e-06
    100000 words:  6.70505406374e-07
    150000 words:  4.4700360425e-07
    200000 words:  3.35252703187e-07
    250000 words:  2.6820216255e-07
    500000 words:  1.34101081275e-07
    750000 words:  8.94007208499e-08
    1000000 words: 6.70505406374e-08
    1250000 words: 5.364043251e-08
    1500000 words: 4.4700360425e-08
    1750000 words: 3.831459465e-08
    2000000 words: 3.35252703187e-08
    2250000 words: 2.98002402833e-08
    2500000 words: 2.6820216255e-08
    2750000 words: 2.43820147773e-08
    3000000 words: 2.23501802125e-08
    '''
    # 250000 words by freq thresh
    freqThresh = 2.6820216255e-07
    # just below 300000 words by rel thresh
    knownThresh = 1  # currently using cosine, avg relevance with dot is ??? 2.27510558315
    # clumpThresh = 0.99  # not yet implemented

    print("Initialized")
    start = time.time()
    filterReducer = VectorModelReducer(model, freqDict=freqs, knownWords=known)
    print("Filtering...")
    filteredModel = filterReducer.filterModel(freqThresh, knownThresh)
    # reduceDimModel = filterReducer.reduceDimModel(filterModel)
    end = time.time()
    elapsed = end - start
    print("Time to filter is " + str(elapsed))
    # print(filteredModel._modelDict['computer'])
    # v2 = filteredModel._modelDict['the']
    # norm = float([w**2 for w in v1])**0.5 + float([w**2 for w in v2])**0.5
    # similarity = 1 - scipy.spatial.distance.cosine(v1, v2)
    # print("Similarity is " + str(similarity))

    if not os.path.exists('vector_models'):
        os.mkdir('vector_models')
    model_file = 'vector_models' + os.sep + 'model_' + str(freqThresh) + '_' + str(knownThresh) + '.pkl'
    start = time.time()
    print("Dumping...")
    pickle_dump(filteredModel._modelDict, model_file)
    end = time.time()
    elapsed = end - start
    print("Time to dump is " + str(elapsed))
    print("Finished filtering, length of new model: " + str(filteredModel.getLength()))
    # with open(model_file, 'wb') as pickle_file:
    #     pickle.dump(filteredModel._modelDict, pickle_file)

    # Code for all combinations of freqThresh and knownThresh
    # for freq_thresh in freqThresh:
    #     for known_thresh in knownThresh:
    #         filteredModel = filterReducer.filterModel(freqThresh, knownThresh)
    #         model = filteredModel
    #         model_file = 'vector_models' + os.sep + 'model_' + str(freq_thresh) + '_' + str(known_thresh) + '.json'
    #         with open(model_file, 'w') as json_file:
    #             json.dump(model, json_file)

    # clumpReducer = VectorModelReducer(model)
    # clumpedModel, thesaurus = clumpReducer.clumpModel(clumpThresh)
    # model = clumpedModel
    return model
# Train the Doc2Vec model
model.train(train_data, total_examples=model.corpus_count, epochs=model.epochs)

#%%
# Google word2vec
import gensim
import gensim.downloader as api

path = api.load("word2vec-google-news-300", return_path=True)
print(path)

from gensim.models.keyedvectors import KeyedVectors
gensim_model = KeyedVectors.load_word2vec_format(path, binary=True, limit=300000)

gensim_model['regression']
gensim_model.most_similar(positive=['statistics', 'diagnostics', 'outlier'])
gensim_model.most_similar(positive=['Tea', 'United_States'], negative=['England'])
gensim_model.most_similar(positive=['statistics', 'diagnostics', 'outlier'])

#%%
# Wikipedia2vec - Studio Ousia
testlabel = []
labelfile = open('testlabel.txt')
for l in labelfile:
    line = l.strip('\n')
    if line in emo:
        testlabel.append(emo.index(line))
    else:
        testlabel.append(7)
labelfile.close()
'''

traintexts = [[word for word in document.split() if word not in stop_words]
              for document in traindata]
testtexts = [[word for word in document.split() if word not in stop_words]
             for document in testdata]

word_vectors = KeyedVectors.load_word2vec_format(
    'zhwiki_2017_03.sg_50d.word2vec', binary=False)
gensim_dict = Dictionary()
gensim_dict.doc2bow(word_vectors.vocab.keys(), allow_update=True)
w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # word index, numbered starting from 1
w2vec = {word: word_vectors[word] for word in w2indx.keys()}

trainseq = text_to_index_array(w2indx, traintexts)
testseq = text_to_index_array(w2indx, testtexts)
traindata = pad_sequences(trainseq, maxlen=MAX_SEQUENCE_LENGTH)
testdata = pad_sequences(testseq, maxlen=MAX_SEQUENCE_LENGTH)
word_index = w2indx
print('Found %s unique tokens.' % len(word_index))

labels = to_categorical(np.asarray(label))
# testlabels = to_categorical(np.asarray(testlabel))
indices = np.arange(traindata.shape[0])
def test5():
    from gensim.models import KeyedVectors
    word_vectors = KeyedVectors.load_word2vec_format(file_to_path, binary=False)
        mean2 = mean2 + s(y, A)  # distance of all y from A in set 1 vs. set 2
    mean2 = mean2 / float(len(Y))
    sd = []
    for w in X + Y:
        sd.append(s(w, A))
    return (mean1 - mean2) / np.std(sd)


global wv
global wv2

base = 'vecs/pk_vectors.bin'
countries = glob.glob("vecs/*")
wv = KeyedVectors.load_word2vec_format(base, binary=True)

print "PK VS THE WORLD"
for country in countries:
    print "~~~~~~~~~~~~~~ " + country.split("/")[1] + " ~~~~~~~~~~~~~~"
    wv2 = KeyedVectors.load_word2vec_format(country, binary=True)

    woman = [
        "she", "woman", "female", "her", "hers", "girl", "daughter", "mother",
        "sister", "aunt"
    ]
    man = [
        "he", "man", "male", "his", "him", "boy", "son", "brother", "father",
with open('Final_Labeled_DataCollection-NTI_Tweets.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for tweet in Tweets:
        if len(analyze(tweet[2])) >= 4 and tweet[0] != '-1':
            writer.writerow(tweet)
'''''

with open('Final_Labeled_DataCollection-NTI_Tweets.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    NTI_Tweets_1 = []
    NTI_Whole_Tweets = []
    for row in reader:
        NTI_Whole_Tweets.append(row)
        NTI_Tweets_1.append(row[2])

Google_Word2Vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, encoding='latin-1')

vectorizer = CountVectorizer(min_df=1, stop_words='english', ngram_range=(1, 1), analyzer=u'word')
analyze = vectorizer.build_analyzer()

miss_words = []
Total_Tweet_Vec_List = []
for t in NTI_Tweets_1:
    Name = analyze(t)
    Tweet_Vec_List = []
    for word in Name:
        try:
            wordVec = Google_Word2Vec[word]
            Tweet_Vec_List.append(word)
        except KeyError:
            miss_words.append(word)
    Total_Tweet_Vec_List.append(Tweet_Vec_List)
def downloadGlove(file="C:/glove/glove.6B.50d.txt.w2v"):
    gant = KeyedVectors.load_word2vec_format(file, binary=False)
    return gant
Copyright (C) - All Rights Reserved
"""
import os

from gensim.models.keyedvectors import KeyedVectors

from examples.example_corpus import corpus
from nlpkit.nlp_feature_extraction import WordEmbedsDocVectorizer

# Replace with the path of a pre-trained w2v model
WORD_EMBEDDINGS_FILEPATH = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '..', '..', 'fake-news', 'data', 'resources',
                 'word-embeddings', 'glove.6B', 'glove.6B.300d.word2vec.txt'))

if __name__ == "__main__":
    word2vec = KeyedVectors.load_word2vec_format(WORD_EMBEDDINGS_FILEPATH, binary=False)
    w2v_vectorizer = WordEmbedsDocVectorizer(word2vec, tfidf_weights=True)
    X = w2v_vectorizer.fit_transform(corpus)
    print(X)

    # Example usage in a pipeline
    # pipeline = Pipeline(
    #     ('vec', WordEmbedsDocVectorizer(word2vec, tfidf_weights=True)),
    #     ('clf', SVC(kernel='linear', C=1, probability=True))
    # ])
import codecs

from sklearn.manifold import TSNE
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import csv
from sklearn.metrics.pairwise import cosine_similarity
from util import vector_averaging
from util import vector_averaging_with_tfidf
from util import process_source_code
from util import word2weight
import sys

DIMENSION = 25

csv.field_size_limit(sys.maxsize)

PROJECT = "cordova"

cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_10_25_include_functions.txt", binary=False)
java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_10_25_include_functions.txt", binary=False)

with open("./sentences/sentences_cs_10.txt", "r") as cs_f:
    cs_data = cs_f.readlines()

with open("./sentences/sentences_java_10.txt", "r") as java_f:
    java_data = java_f.readlines()

cs_sentences = [x for x in cs_data]
java_sentences = [x for x in java_data]

cs_word2weight = word2weight(cs_sentences)
java_word2weight = word2weight(java_sentences)
# print cs_word2weight

# Predicting part ----------------------------------------------
# print(cosine_similarity(cs_vectors["while"].reshape(1,-1), java_vectors["class"].reshape(1,-1))
from argparse import ArgumentParser

from gensim.models.keyedvectors import KeyedVectors
import torch
from tqdm import tqdm

if __name__ == '__main__':
    parser = ArgumentParser(description='Convert binary word2vec to txt')
    parser.add_argument('input')
    parser.add_argument('output')
    args = parser.parse_args()

    model = KeyedVectors.load_word2vec_format(args.input, binary=True)
    model.save_word2vec_format(args.output, binary=False)
p1 = int(len(data) * (1 - VALIDATION_SPLIT - TEST_SPLIT))
p2 = int(len(data) * (1 - TEST_SPLIT))
x_train = data[:p1]
y_train = labels[:p1]
x_val = data[p1:p2]
y_val = labels[p1:p2]
x_test = data[p2:]
y_test = labels[p2:]
print('train docs: ' + str(len(x_train)))
print('val docs: ' + str(len(x_val)))
print('test docs: ' + str(len(x_test)))

###################################################################

print('(4) load word2vec as embedding...')
from gensim.models.keyedvectors import KeyedVectors
w2v_model0 = KeyedVectors.load_word2vec_format('uniqueWords.vector', encoding='utf-8')
# w2v_model1 = KeyedVectors.load_word2vec_format('medCorpus_meanVec.zh.vector', encoding='utf-8')
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
not_in_model = 0
in_model = 0
# the origin word vector
for word, i in word_index.items():
    if word in w2v_model0:
        in_model += 1
        vec0 = w2v_model0[word].tolist()
        # vec1 = w2v_model1[word].tolist()
        embedding_matrix[i] = np.asarray(vec0, dtype='float32')
    else:
        not_in_model += 1
print(str(not_in_model) + ' words not in w2v model')
1. Uses the most recent publications of researchers as input to generate user profiles.
2. The pretrained word2vec model window_5.model.bin and candidate_paper.csv are available
   via a Google Drive link; download the files and change the paths in this script so it
   runs successfully.
3. Results are saved in rank_result_rm/rank_result_mr_own_corpus.csv.
"""
import sys

from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import pandas as pd
from datetime import datetime

# load the pretrained model built on my own corpus
model = '/Users/sherry/Downloads/window_5/window_5.model.bin'
w2v_model = KeyedVectors.load_word2vec_format(model, binary=True)

# read all candidate papers info, contains two columns: paper ID and paper content
candidate_paper_df = pd.read_csv(
    '/Users/sherry/Downloads/candidate_papers.csv')


# define DocSim class to calculate document similarities
class DocSim(object):
    def __init__(self, w2v_model, stopwords=[]):
        self.w2v_model = w2v_model
        self.stopwords = stopwords

    def vectorize(self, doc):
        """Identify the vector values for each word in the given document"""
        doc = str(doc)
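# The DocSim class above is cut off after vectorize() starts. Below is a minimal
# sketch of the usual approach (average the in-vocabulary word vectors, then
# compare documents by cosine similarity), under the assumption that it mirrors
# the original implementation; the helper names are illustrative.
import numpy as np


def average_vector(doc, w2v_model, stopwords=()):
    """Average the vectors of in-vocabulary, non-stopword tokens in `doc`."""
    words = [w for w in str(doc).lower().split() if w not in stopwords]
    vectors = [w2v_model[w] for w in words if w in w2v_model]
    if not vectors:
        return np.zeros(w2v_model.vector_size)
    return np.mean(vectors, axis=0)


def cosine(a, b):
    """Cosine similarity between two document vectors (0.0 if either is zero)."""
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0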
import math

from gensim.models.keyedvectors import KeyedVectors
import pkuseg

model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True, limit=300000)

seg = pkuseg.pkuseg()
sentence_1 = ''
sentence_2 = ''
text1 = seg.cut(sentence_1)  # segment the sentence into words
text2 = seg.cut(sentence_2)

stopwords = [line.strip() for line in open('Englishstopwords.txt', encoding='UTF-8').readlines()]
clean1 = list()
clean2 = list()
for word in sentence_1.split(" "):
    if word not in stopwords:
        clean1.append(word)
for word in sentence_2.split(" "):
    if word not in stopwords:
        clean2.append(word)

vec1 = list()
vec2 = list()
sum = 0
sq1 = 0
def load_gensim_model(path_to_model):
    model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)
    return model
from gensim.models.keyedvectors import KeyedVectors
from konlpy.tag import Twitter
import numpy as np

pos_vectors = KeyedVectors.load_word2vec_format('pos.vec', binary=False)
pos_vectors.most_similar("('남자','Noun')")

twitter = Twitter()
word = "대통령이"
pos_list = twitter.pos(word, norm=True)
word_vector = np.sum([pos_vectors.word_vec(str(pos).replace(" ", "")) for pos in pos_list], axis=0)
import csv
from sklearn.metrics.pairwise import cosine_similarity
from util import vector_averaging
from util import vector_averaging_with_tfidf
from util import process_source_code
from util import process_diff_srcml
from util import process_diff_srcml2
from util import word2weight
from util import process_expression
from util import mean_average_precision
from util import average_precision
from util import precision_at_k
import sys

DIMENSION = 20

cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_11_20.txt", binary=False)
java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_11_20.txt", binary=False)

with open("./sentences/sentences_cs_11.txt", "r") as cs_f:
    cs_data = cs_f.readlines()

with open("./sentences/sentences_java_11.txt", "r") as java_f:
    java_data = java_f.readlines()

cs_sentences = [x for x in cs_data]
java_sentences = [x for x in java_data]

cs_word2weight = word2weight(cs_sentences)
java_word2weight = word2weight(java_sentences)

with codecs.open("./evaluation_data/keywords.csv", "r") as f_csv:
)
# list9+list10+list11+list12 , list99+list1010+list1111+list1212
# test_data, test_label = preprocessing(list1+list2+list3+list4), enumerate_list(list111+list222+list333+list444)
# list1+list2+list3+list4 , list11+list22+list33+list44
# note: full test dataset is too huge -> memory error, so use part of it
test_data_list = pd.read_csv(
    os.getcwd() + '\\data\\EI-reg-En-part-test.csv')['Tweet'].tolist()
test_label_list = pd.read_csv(
    os.getcwd() + '\\data\\EI-reg-En-part-test.csv')['Affect Dimension'].tolist()
test_data, test_label = preprocessing(test_data_list), test_label_list

print("Train shape:", len(train_data), len(train_label))
print("Validation shape:", len(dev_data), len(dev_label))
print("Test shape:", len(test_data), len(test_label))

# Loading all models
glove_model = KeyedVectors.load_word2vec_format(
    'word2vec.twitter.27B.100d.txt', binary=False)  # load Glove model
w2v_model = Word2Vec.load('w2v_model.bin')  # load word2vec model
e2v_model = gsm.KeyedVectors.load_word2vec_format(
    'emoji2vec.bin', binary=True)  # load emoji2vec model
print("All Models Loaded!")

# word embedding data with glove pretrained model and real word2vec/w2v
input_data = np.concatenate((train_data, dev_data, test_data))
max_sequence_length = max([len(x) for x in input_data])  # find the length of the longest tweet
print("Max twitter length:", max_sequence_length)
print("input_data shape:", len(input_data))


# Find embedding for corpus
def embedding(data, max_len):
# if "System." in split[0] or "antlr" in split[0].lower(): cs_signature_tokens.append(split[0]) print "cs tokens : " + str(len(cs_signature_tokens)) for java_emb in java_embeddings: split = java_emb.split(" ") if func(split[0]) == True: if check_package_include(java_packages,split[0]) == True: # if "java." in split[0] or "antlr" in split[0].lower(): java_signature_tokens.append(split[0]) print "java tokens : " + str(len(java_signature_tokens)) print "Loading word embedding..........." cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_new_window3.txt",binary=False) java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_new_window3.txt",binary=False) print "Finish loading.............." # print cs_vectors.similar_by_vector(java_vectors["java.util.concurrent.locks.Lock.lock()"], topn=30) # print cs_vectors.similar_by_vector(java_vectors["package"], topn=30) def check_if_relevant_k_contains_exact_name(method_source, relevant_k): check = False for k in relevant_k: split = k.split(".")
print indexi
start = time.time()

num_features = 200    # Word vector dimensionality
min_word_count = 20   # Minimum word count
num_workers = 40      # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

model_name = str(num_features) + "features_" + str(min_word_count) + "minwords_" + str(context) + "context_len2alldata"
# Load the trained Word2Vec model.
model_name = 'wordvectors_reuters_nonpoly.txt'
# model = Word2Vec.load(model_name)  # .syn0
# Get wordvectors for all words in vocabulary.
model = KeyedVectors.load_word2vec_format(model_name, binary=False)
word_vectors = model.syn0

all = pd.read_pickle('all.pkl')

start1 = time.time()
start = time.time()
# Set number of clusters.
num_clusters = 60
idx, idx_proba = cluster_GMM(num_clusters, word_vectors)
idx_proba[idx_proba < 0.2] = 0
n_clusteri = num_clusters
f = open(filename, 'a')
print "number of k clusters ", str(n_clusteri)
f.write("number of k clusters " + str(n_clusteri) + "\n")
start2 = time.time()
f.write("time taken in clustering " + str(start2 - start1) + "\n")

# Uncomment below lines for loading saved cluster assignments and probability of cluster assignments.
        h1 = F.dropout(h1, 0.3)
        h2 = F.dropout(h2, 0.3)
        h1 = F.relu(h1)
        h2 = F.relu(h2)
        h = F.concat([h1, h2])
        out = self.hy(h)
        # out = self.bn3(out)
        # out = F.dropout(out, 0.3)
        out = F.tanh(out)
        out = F.normalize(out)
        return F.mean_squared_error(out, t)


if __name__ == '__main__':
    dic = KeyedVectors.load_word2vec_format("trainer/glove.6B.100d.bin")
    original_word = 'does'
    glove_vec = dic.get_vector(original_word).reshape(1, 100)

    first_net = First_Network(100, 30, len(original_word))
    first_net.cleargrads()
    optimizer1 = optimizers.Adam()
    optimizer1.setup(first_net)

    second_net = Second_Network(27, 30, 50)
    second_net.cleargrads()
    second_net.reset()
    optimizer2 = optimizers.Adam()
    optimizer2.setup(second_net)

    second_net2 = Second_Network(27, 30, 50)
    second_net2.cleargrads()
odd = '/home/dpappas/'
###########################################################
w2v_bin_path = '/home/dpappas/bioasq_all/pubmed2018_w2v_30D.bin'
idf_pickle_path = '/home/dpappas/bioasq_all/idf.pkl'
###########################################################
avgdl, mean, deviation = 21.688767020746013, 0.7375801616020308, 1.3411418040865049
print(avgdl, mean, deviation)
###########################################################
k_for_maxpool = 5
k_sent_maxpool = 5
embedding_dim = 30  # 200
###########################################################
print('loading idfs')
idf, max_idf = load_idfs(idf_pickle_path)
print('loading w2v')
wv = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
wv = dict([(word, wv[word]) for word in wv.vocab.keys()])
###########################################################
my_seed = 1
random.seed(my_seed)
torch.manual_seed(my_seed)
###########################################################
print('Compiling model...')
model = Sent_Posit_Drmm_Modeler(embedding_dim=embedding_dim, k_for_maxpool=k_for_maxpool)
if (use_cuda):
    model = model.cuda()
###########################################################
resume_from = '/home/dpappas/bioasq_w2vjpdrmm_demo_run_0/best_dev_checkpoint.pth.tar'
load_model_from_checkpoint(resume_from)
if __name__ == '__main__':
    """
    @description: Train word vectors, save and load the models, and compute word similarity.
    Two ways to save and load a word2vec model (a model saved with save() can be trained
    further; save_word2vec_format() is faster and uses less memory):
        1. model.save('w2v.model') -> model = Word2Vec.load('w2v.model')
        2. model.wv.save_word2vec_format('w2v.bin') -> KeyedVectors.load_word2vec_format('w2v.bin')
    """
    train_segx_path = '../datasets/train_set.seg_x.txt'
    train_segy_path = '../datasets/train_set.seg_y.txt'
    test_segx_path = '../datasets/test_set.seg_x.txt'
    sentences_path = '../datasets/sentences.txt'
    w2v_bin_path = 'w2v.bin'
    ft_bin_path = 'ft.bin'
    voacb_path = '../datasets/voacb.txt'

    lines = read_data(train_segx_path)
    lines += read_data(train_segy_path)
    lines += read_data(test_segx_path)
    save_data(lines, sentences_path)

    word2vec_build(sentences_path, w2v_bin_path)
    fasttext_build(sentences_path, ft_bin_path)

    w2v_model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    model_test(w2v_model, '汽车', '车')
    ft_model = KeyedVectors.load_word2vec_format(ft_bin_path, binary=True)
    model_test(ft_model, '汽车', '车')

    mt = embedding_matrix(w2v_model, voacb_path)
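# A small illustration of the two save/load routes named in the docstring above,
# using a toy corpus; the file names 'toy.model' and 'toy.bin' are placeholders.
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

toy_model = Word2Vec([['汽车', '车', '司机'], ['道路', '汽车']], min_count=1)
# Route 1: full model, can continue training after loading.
toy_model.save('toy.model')
reloaded = Word2Vec.load('toy.model')
# Route 2: vectors only, smaller and faster to load, but no further training.
toy_model.wv.save_word2vec_format('toy.bin', binary=True)
vectors = KeyedVectors.load_word2vec_format('toy.bin', binary=True)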
def load_w2v_data(self, binary_file_name):
    self.w2v_model = KeyedVectors.load_word2vec_format(
        os.path.join(DATA_PATH, binary_file_name), binary=True)
validation = pd.read_csv("preprocess/validation_char.csv")
validation["content"] = validation.apply(lambda x: eval(x[1]), axis=1)

model_dir = "model_capsule_char/"
maxlen = 1000
max_features = 20000
batch_size = 128
epochs = 1

tokenizer = text.Tokenizer(num_words=None)
tokenizer.fit_on_texts(data["content"].values)
with open('tokenizer_char.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

word_index = tokenizer.word_index

w2_model = KeyedVectors.load_word2vec_format("word2vec/chars.vector", binary=True, encoding='utf8', unicode_errors='ignore')
embeddings_index = {}
embeddings_matrix = np.zeros((len(word_index) + 1, w2_model.vector_size))
word2idx = {"_PAD": 0}
vocab_list = [(k, w2_model.wv[k]) for k, v in w2_model.wv.vocab.items()]
for word, i in word_index.items():
    if word in w2_model:
        embedding_vector = w2_model[word]
    else:
        embedding_vector = None
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

column_list = [
def add_embedding(self, embedding_file):
    self.embedding_files.append(KeyedVectors.load_word2vec_format(embedding_file, binary=self.binary))
# import gensim
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.7
set_session(tf.Session(config=config))

# basic model for tag recommendation
# Embedding layer -> BiLSTM -> Dense with softmax

# word model
# embeddings_file_bin = '../glove/vectors.bin'
# word_model = KeyedVectors.load_word2vec_format('../glove/vectors.txt', binary=False, unicode_errors='ignore')
word_model = KeyedVectors.load_word2vec_format('word2vec/vec_Body_Title.bin', binary=True, unicode_errors='ignore')
# meta_model = KeyedVectors.load_word2vec_format('metapath2vec/code_metapath2vec/stack_new_1000', binary=True, unicode_errors='ignore')

user_id = pickle.load(open("user.p", 'rb'))
user_tag = pickle.load(open("user_tags.p", 'rb'))
user_num = pickle.load(open("user_num.p", 'rb'))
count = len(user_tag)

meta_model = {}
openfile = open("graph_train.emd", 'r')
for line in openfile:
    arr = line.split()
    meta_model[arr[0]] = arr[1:]
# print meta_model['0']
def setUp(self):
    self.vectors = EuclideanKeyedVectors.load_word2vec_format(
        datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64)
def load_embeddings(self, embeddingsfilename):
    self.emb = KeyedVectors.load_word2vec_format(embeddingsfilename, binary=False)
def load_model(self, datatype):
    path = datapath('high_precision.kv.txt')
    kv = KeyedVectors.load_word2vec_format(path, binary=False, datatype=datatype)
    return kv