def load_word2vec(w2v):
    if isinstance(w2v, str):
        print("Loading word vectors from '%s'..." % w2v, flush=True)
        try:
            w2v = Word2Vec.load_word2vec_format(w2v)
        except ValueError:
            w2v = Word2Vec.load_word2vec_format(w2v, binary=True)
    return w2v

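# A minimal usage sketch for load_word2vec above; 'vectors/wiki.w2v' is a
# hypothetical path. The helper accepts either a path (text format is tried
# first, then binary on ValueError) or an already-loaded model, which it
# returns unchanged.
model = load_word2vec('vectors/wiki.w2v')  # load from disk
model = load_word2vec(model)               # no-op on a loaded model
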
class GunicornApplication(BaseApplication):
    parser = SafeConfigParser()
    with codecs.open('config.ini', 'r', encoding='utf-8') as f:
        parser.readfp(f)

    # Mandatory loading for standard disambiguation
    wiki_w2v_embeddings_file = parser.get('Word2VecRest', 'embeddings_w2v_wikipedia')
    w2vmodel_dbpedia = Word2Vec.load_word2vec_format(wiki_w2v_embeddings_file, binary=True)

    # If no doc2vec embeddings are loaded (due to memory constraints),
    # we always return 0 as cosine similarity
    wiki_d2v_embeddings_file = parser.get('Word2VecRest', 'embeddings_d2v_wikipedia')
    if os.path.isfile(wiki_d2v_embeddings_file):
        d2vmodel = Doc2Vec.load(wiki_d2v_embeddings_file)
    else:
        d2vmodel = None

    # Optional embeddings
    biomed_w2v_embedings_file = parser.get('Word2VecRest', 'embeddings_w2v_calbc')
    if os.path.isfile(biomed_w2v_embedings_file):
        w2vmodel_biomed = Word2Vec.load_word2vec_format(biomed_w2v_embedings_file,
                                                        binary=True)

    wiki_d2v_german_embeddings = parser.get('Word2VecRest', 'embeddings_d2v_wikipedia_german')
    if os.path.isfile(wiki_d2v_german_embeddings):
        d2vmodel_german = Doc2Vec.load(wiki_d2v_german_embeddings)

    def __init__(self, wsgi_app, port=5000):
        self.options = {
            'bind': "127.0.0.1:{port}".format(port=port),
            'workers': 3,
            'preload_app': True,
            'timeout': 200,
        }
        self.application = wsgi_app
        super(GunicornApplication, self).__init__()

    def load_config(self):
        config = dict([(key, value) for key, value in self.options.iteritems()
                       if key in self.cfg.settings and value is not None])
        for key, value in config.iteritems():
            self.cfg.set(key.lower(), value)

    def load(self):
        return self.application

def calculate_sim_without_tag(self, load_model, ofname, write_flag=True):
    # Load the specified w2v model
    w2v_model = Word2Vec.load_word2vec_format(r'%s/%s' % (macro.MODELS_DIR, load_model),
                                              binary=True)  # C format
    # Read in the evaluation word-pair corpus
    id_list, word1_list, word2_list, headline = utils.read2wordlist(self.f_tuple_list,
                                                                    mode='no_tag')
    # New header line
    new_headline = headline.strip() + '\tPrediction\n'
    # Compute similarities
    auto_sim_list = []
    for w1, w2 in zip(word1_list, word2_list):
        try:
            auto_sim = w2v_model.similarity(w1, w2)  # vector cosine similarity in [-1, 1]
            auto_sim = utils.convert_sim(auto_sim)   # map cosine similarity to a 1-10 score
            print '%-10s\t%-10s\t%-10s' % (w1, w2, auto_sim)
        except KeyError:
            auto_sim = 1  # out-of-vocabulary word; use 1 (not 1.0) to tell it apart
            print '%-10s\t%-10s\t%-10s' % (w1, w2, '______Not Found______')
        auto_sim_list.append(auto_sim)
    # Optionally write the similarity results to a file
    if write_flag:
        print 'write result to file...'
        with open('%s/%s' % (macro.RESULTS_DIR, ofname), 'w') as fw:
            fw.write(new_headline)
            for w1, w2, auto_sim in zip(word1_list, word2_list, auto_sim_list):
                fw.write('%s\t%s\t%s\n' % (w1, w2, auto_sim))
    return word1_list, word2_list, auto_sim_list, new_headline

def main():
    global _wv
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("vector_file", action="store",
                    help="Path to the word2vec file to serve")
    ap.add_argument("-t", "--testclient", dest="test", action="store_true",
                    help="If set, runs the test client against an existing server instance")
    ap.add_argument("--host", dest="host", action="store", default="localhost",
                    help="The host to bind to, defaults to localhost")
    ap.add_argument("-p", "--port", dest="port", action="store", default=5000,
                    help="The port to serve on, defaults to 5000")
    ap.add_argument("-d", "--debug", dest="debug", action="store_true",
                    help="If true, provides debug output")
    args = ap.parse_args()

    if args.test:
        print("Running test client against http://{}:{}".format(args.host, args.port))
        tc = WordservClient(args.host, args.port)
        for i in range(0, 500):
            vecs = tc.vector(["hello", "world"])
        sys.exit(1)

    print("Loading word vector - this might take a while...")
    _wv = Word2Vec.load_word2vec_format(args.vector_file, binary=True)
    app.run(port=args.port, debug=args.debug)

def get_init_data(model_file, ark_file):
    from gensim.models.word2vec import Word2Vec
    model = Word2Vec.load_word2vec_format(model_file, binary=False)
    ark_clusters = get_ark_clusters(ark_file)
    files = [resource_filename('twitter_dm', 'data/identity_dictionaries/identity/' + x)
             for x in resource_listdir('twitter_dm', 'data/identity_dictionaries/identity/')]
    files += [resource_filename('twitter_dm', 'data/identity_dictionaries/non_identity_words/' + x)
              for x in resource_listdir('twitter_dm', 'data/identity_dictionaries/non_identity_words/')]
    all_dictionaries = Dictionaries(list_of_files=files)
    twit_sets = []
    stopwords = get_stopwords()
    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(None)
    for v in [10, 100, 1000, 10000, 50000]:
        twit_id = set(tw_distant_supervision_identity_dat[
            tw_distant_supervision_identity_dat.tot > v].term.values)
        twit_id = {t for t in twit_id
                   if t not in stopwords and t.replace(" person", "") not in stopwords}
        twit_sets.append([twit_id, "twit_identities_" + str(v)])
    twit_sets.append([EXPERT_NON_IDENTITIES, "expert_non"])
    twit_sets.append([stopwords, "stopword"])
    return (model, all_dictionaries, ark_clusters,
            [t[0] for t in twit_sets], [t[1] for t in twit_sets])

def return_data(data_type, embed_dim=50):
    """Return the data specified by the inputted `data_type`.

    This function is built to allow for easier calls for the data from
    scripts external to this one.

    Args:
    ----
        data_type: str
        embed_dim (optional): int

    Return:
    ------
        varied
    """
    if data_type == "word_embedding":
        embedding_fp = 'data/word_embeddings/glove.6B.{}d.txt'.format(embed_dim)
        wrd_embedding = Word2Vec.load_word2vec_format(embedding_fp, binary=False)
        return wrd_embedding
    elif data_type == "articles":
        body_fp = 'data/articles/twenty_newsgroups/bodies.pkl'
        headline_fp = 'data/articles/twenty_newsgroups/headlines.pkl'
        with open(body_fp, 'rb') as f:
            bodies = pickle.load(f)
        with open(headline_fp, 'rb') as f:
            headlines = pickle.load(f)
        return bodies, headlines
    else:
        raise Exception('Invalid data type requested!')

def __init__(self, args):
    self.sentences = []
    self.sentence = []
    self.vocab = set()
    if args.restore is None:
        with codecs.open(args.file, 'r', encoding='utf-8') as fr:
            for count, line in enumerate(fr):
                if count > 1 and line.startswith('-DOC'):
                    self.sentences.append(self.sentence)
                    self.sentence = []
                else:
                    try:
                        word, tag = line.rstrip('\r\n').split()
                    except Exception as e:
                        print("not enough elements to unpack; line {} is: {} with error: {}"
                              .format(count + 1, line, e))
                    else:
                        for char in word:
                            self.sentence.append(char)
                            self.vocab.add(char)
        self.wordvec = Word2Vec(sentences=self.sentences, size=args.vector_size,
                                window=args.window, min_count=args.min_count,
                                max_vocab_size=len(self.vocab), workers=args.workers,
                                sg=args.sg, batch_words=args.batch_size)
    else:
        self.wordvec = Word2Vec.load_word2vec_format(args.restore)
    self.randvec = RandomVec(args.vector_size)

def loadW2VFloat(self, emb_path, type="text"):
    # print("Loading W2V data...")
    num_keys = 0
    if type == "textgz":
        # this seems faster than gensim non-binary load
        for line in gzip.open(emb_path):
            l = line.strip().split()
            st = l[0].lower()
            self.pre_emb[st] = np.asarray(l[1:], dtype=np.float32)
        num_keys = len(self.pre_emb)
    elif type == "text":
        # this seems faster than gensim non-binary load
        for line in open(emb_path):
            l = line.strip().split()
            st, emb = l[0].lower(), []
            for val in l[1:]:
                try:
                    emb.append(float(val))
                except ValueError:
                    emb.append(0)
            self.pre_emb[st] = np.asarray(emb)
        num_keys = len(self.pre_emb)
    else:
        self.pre_emb = Word2Vec.load_word2vec_format(emb_path, binary=True)
        self.pre_emb.init_sims(replace=True)
        num_keys = len(self.pre_emb.vocab)
    # print("loaded word2vec len ", num_keys)
    gc.collect()

def mergeW2V(self, emb_path, type="bin"):
    print("Loading W2V data...")
    num_keys = 0
    if type == "textgz":
        # this seems faster than gensim non-binary load
        for line in gzip.open(emb_path):
            l = line.strip().split()
            st = l[0].lower()
            self.pre_emb[st] = np.asarray(l[1:])
        num_keys = len(self.pre_emb)
    elif type == "text":
        # this seems faster than gensim non-binary load
        i = 0
        for line in open(emb_path):
            l = line.strip().split()
            st = l[0].lower()
            if st in self.pre_emb:
                continue
            i += 1
            # if i % 10000 == 0:
            #     print st
            self.pre_emb[st] = np.asarray(l[1:])
        num_keys = len(self.pre_emb)
        print(num_keys)
    else:
        self.pre_emb = Word2Vec.load_word2vec_format(emb_path, binary=True)
        self.pre_emb.init_sims(replace=True)
        num_keys = len(self.pre_emb.vocab)
    print("loaded word2vec len ", num_keys)
    gc.collect()

def from_word2vec_model(cls, word2vec_model):
    """
    WARNING: `gensim` is required to use this function!

    Load a word2vec vector model.

    :param word2vec_model: path to word2vec model or a fitted word2vec model
    :return: a `Vectors` object
    """
    try:
        import gensim
        # gensim version hack: the loader class moved in gensim 1.0
        if int(gensim.__version__.split('.')[0]) < 1:
            from gensim.models.word2vec import Word2Vec as Word2VecLoader
        else:
            from gensim.models import KeyedVectors as Word2VecLoader
    except ImportError as ex:
        logging.error('Gensim is required to use this method!')
        raise ex

    if isinstance(word2vec_model, str):
        model = Word2VecLoader.load_word2vec_format(word2vec_model,
                                                    binary=word2vec_model.endswith('bin'))
    else:
        model = word2vec_model

    vocab = model.vocab.keys()
    vectors = {}
    dims = len(model[next(iter(vocab))])  # vector dimensionality
    dimension_names = ['f%02d' % i for i in range(dims)]
    for word in vocab:
        vectors[word] = zip(dimension_names, model[word])
    return Vectors(vectors)

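# A minimal usage sketch for from_word2vec_model above, assuming it is bound
# as a classmethod on the `Vectors` class; the path is hypothetical, and the
# binary flag is inferred from the '.bin' extension as in the method body.
vectors = Vectors.from_word2vec_model('embeddings/vectors.bin')
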
def load_word2vec(filename):
    global sym
    np.random.seed(1337)
    # fixed random vector in [-1, 1) used for unknown symbols
    sym = 2 * (np.random.rand(300) - 0.5)
    embedding = w2v.load_word2vec_format(filename, binary=True)
    print('Loaded word embedding')
    return embedding

def __init__(self, args):
    print('processing corpus')
    if args.restore is None:
        corpus = open(args.corpus, 'r').read().lower().split()
        sentences = []
        sentence = []
        length = 0
        for word in corpus:
            sentence.append(word)
            length += 1
            if length == args.sentence_length:
                sentences.append(sentence)
                sentence = []
                length = 0
        if length != 0:
            sentences.append(sentence)
        print('training')
        self.wvec_model = Word2Vec(sentences=sentences, size=args.dimension,
                                   window=args.window, workers=args.workers,
                                   sg=args.sg, batch_words=args.batch_size,
                                   min_count=1, max_vocab_size=args.vocab_size)
    else:
        self.wvec_model = Word2Vec.load_word2vec_format(args.restore, binary=True)
    self.rand_model = RandomVec(args.dimension)

def __init__(self, args):
    print('processing corpus')
    if args.restore is None:
        sentences = []
        with codecs.open(args.corpus, 'r', 'utf-8') as file:
            for line in tqdm(file):
                line = line.strip().lower()
                sentence = line.split(u' ')
                sentences.append(sentence)
        # bigram_transformer = Phrases(sentences)
        # print(bigram_transformer[sentences])
        print('start to train word2vec embeddings')
        self.wvec_model = Word2Vec(sentences=sentences, size=args.dimension,
                                   window=args.window, workers=args.workers,
                                   sg=args.sg, batch_words=args.batch_size,
                                   min_count=1)  # max_vocab_size=args.vocab_size
    else:
        self.wvec_model = Word2Vec.load_word2vec_format(args.restore, binary=True)
    self.rand_model = RandomVec(args.dimension)

def pretrained_embedding(vocab_processor):
    """Creates a word embedding matrix from the GoogleNews w2v model.

    Requires the Google News w2v model, downloaded from
    https://code.google.com/archive/p/word2vec/ and placed in ./data.
    """
    if not os.path.exists('data/GoogleNews-vectors-negative300.bin'):
        print('You need to have the Google News w2v model downloaded '
              '(from https://code.google.com/archive/p/word2vec/) and placed in '
              './data/GoogleNews-vectors-negative300.bin')
        sys.exit()
    w2v = Word2Vec.load_word2vec_format('data/GoogleNews-vectors-negative300.bin',
                                        binary=True)
    w2v.init_sims(replace=True)
    gc.collect()
    words = [vocab_processor.vocabulary_.reverse(i)
             for i in range(len(vocab_processor.vocabulary_))]
    W_embeddings = []
    for w in words:
        try:
            W_embeddings.append(w2v[w])
        except KeyError:
            # Boundaries make the variance equal to that of the Google vectors
            W_embeddings.append(np.random.uniform(-0.1, 0.1, 300))
    del w2v
    gc.collect()
    W_embeddings = np.array(W_embeddings)
    return W_embeddings

def return_data(data_type, embed_dim=50):
    """Return the data specified by the inputted `data_type`.

    This function is built to allow for easier calls for the data from
    scripts external to this one.

    Args:
    ----
        data_type: str
        embed_dim (optional): int

    Return:
    ------
        varied
    """
    if data_type == "word_embedding":
        embedding_fp = 'data/word_embeddings/glove.6B.{}d.txt'.format(embed_dim)
        wrd_embedding = Word2Vec.load_word2vec_format(embedding_fp, binary=False)
        return wrd_embedding
    elif data_type == "reviews":
        reviews_fp = 'work/reviews/amazon/filtered_tokenized_reviews.pkl'
        ratios_fp = 'work/reviews/amazon/filtered_ratios.npy'
        with open(reviews_fp, 'rb') as f:
            reviews = pickle.load(f)
        ratios = np.load(ratios_fp)
        return reviews, ratios
    else:
        raise Exception('Invalid data type requested!')

def loadEmbeddings(self, filepath, data_path, vocab_size, binary_val):
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    embed_short = os.path.normpath("%s/embed.dat" % data_path)
    if not os.path.exists(embed_short):
        print("Caching word embeddings in memmapped format...")
        print(binary_val, filepath)
        wv = Word2Vec.load_word2vec_format("%s" % (filepath), binary=binary_val)
        fp = np.memmap(embed_short, dtype=np.double, mode='w+', shape=wv.syn0.shape)
        fp[:] = wv.syn0[:]
        with open(os.path.normpath("%s/embed.vocab" % data_path), "w",
                  encoding='utf-8') as fp:
            for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                fp.write('%s\n' % w)
        del fp, wv
    self.W = np.memmap(os.path.normpath("%s/embed.dat" % data_path),
                       dtype=np.double, mode="r",
                       shape=(vocab_size, self.embedding_size))
    with codecs.open(os.path.normpath("%s/embed.vocab" % data_path), 'r', 'utf-8') as f:
        vocab_list = [x.strip() for x in f.readlines()]
    self.vocab_dict = {w: k for k, w in enumerate(vocab_list)}

def _update_word_vec_dict(self):
    '''Updates the word vector dictionary'''
    glove_file = 'data/glove_data/glove.6B.' + str(self.W2V_DIM) + 'd.txt'
    glove = Word2Vec.load_word2vec_format(glove_file)
    self.glove_dict = glove

def get_init_data(model_file, ark_file, dict_filepath, twit_dict_file):
    model = Word2Vec.load_word2vec_format(model_file, binary=False)
    ark_clusters = get_ark_clusters(ark_file)
    all_dictionaries = Dictionaries(dict_filepath)
    twit_sets = []
    stopwords = get_stopwords()
    tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat(twit_dict_file)
    for v in [10, 100, 1000, 10000, 50000]:
        twit_id = set(tw_distant_supervision_identity_dat[
            tw_distant_supervision_identity_dat.tot > v].term.values)
        twit_id = {t for t in twit_id
                   if t not in stopwords and t.replace(" person", "") not in stopwords}
        twit_sets.append([twit_id, "twit_identities_" + str(v)])
    twit_sets.append([EXPERT_NON_IDENTITIES, "expert_non"])
    twit_sets.append([stopwords, "stopword"])
    return (model, all_dictionaries, ark_clusters,
            [t[0] for t in twit_sets], [t[1] for t in twit_sets])

def load_model(model_type, json_dir="jsons"):
    config = load_config(json_dir)
    model_dir = config["model_dir"][model_type]
    if model_type == 'english1000':
        print('\n Loaded english1000! \n')
        return SemanticModel.load(os.path.join(model_dir, "english1000sm.hf5"))
    elif model_type == 'word2vec':
        modelfile = os.path.join(model_dir, "GoogleNews-vectors-negative300.bin")
        norm = False
        from gensim.models.word2vec import Word2Vec
        model = Word2Vec.load_word2vec_format(modelfile, binary=True,
                                              max_vocab_size=10000)
        usevocab = set(cPickle.load(open(
            "/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat-vocab")))
        vocab, vocinds = zip(*[(w, model.vocab[w].index) for w in model.vocab])
        if norm:
            data = model.syn0norm[list(vocinds)]
        else:
            data = model.syn0[list(vocinds)]
        return SemanticModel(data.T, vocab)
    else:
        raise ValueError('Unknown model type: %s' % model_type)

def load(self):
    self.word_vect_modle = Word2Vec.load_word2vec_format(self.vec_model_file,
                                                         binary=False)
    print('load word2vector model done!')
    self.max_word_len = len(self.word_vect_modle.wv.vocab)
    print('all word num is:', self.max_word_len)
    self.init_data()

def load_word_vec(path, vocab):
    model = Word2Vec.load_word2vec_format(path, binary=True)
    word_vecs = {}
    for word in vocab:
        if word in model:
            word_vecs[word] = model[word]
    return word_vecs, model.vector_size

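# Minimal usage sketch for load_word_vec above; the path and vocab are
# hypothetical. Only in-vocabulary words receive a vector.
word_vecs, dim = load_word_vec('embeddings/vectors.bin', ['hello', 'world', 'qwertyuiop'])
# word_vecs holds entries for 'hello' and 'world'; 'qwertyuiop' is skipped
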
def wordEmbedding():
    """
    This code is from http://vene.ro/blog/word-movers-distance-in-python.html
    """
    if not os.path.exists("data/embed.dat"):
        print ("Caching word embeddings in memmapped format...")
        from gensim.models.word2vec import Word2Vec
        wv = Word2Vec.load_word2vec_format(
            "/home/medialab/NLP_data/GoogleNews-vectors-negative300.bin.gz",
            binary=True)
        fp = numpy.memmap("data/embed.dat", dtype=numpy.double, mode='w+',
                          shape=wv.syn0.shape)
        fp[:] = wv.syn0[:]
        with open("data/embed.vocab", "w") as f:
            for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                print >> f, unidecode(w)
        del fp, wv
    W = numpy.memmap("data/embed.dat", dtype=numpy.double, mode="r",
                     shape=(3000000, 300))
    with open("data/embed.vocab") as f:
        vocab_list = map(str.strip, f.readlines())
    vocab_dict = {w: k for k, w in enumerate(vocab_list)}
    return W, vocab_dict

def Analogy_test(embedding='vectors.txt', analogy_test='analogy-test.txt'):
    model_vec = Word2Vec.load_word2vec_format(embedding, binary=False)
    vec_sim = 0
    count = 0
    sum_line = 0
    with open(analogy_test, 'r') as f:
        for line in f:
            wordsArray = line.split()
            if len(wordsArray) < 4:
                count += 1
                continue
            word1 = wordsArray[0].lower()
            word2 = wordsArray[1].lower()
            word3 = wordsArray[2].lower()
            word4 = wordsArray[3].lower()
            try:
                tuple1 = model_vec.most_similar(positive=[word3, word2],
                                                negative=[word1], topn=1)
                if tuple1[0][0] == word4:
                    vec_sim += 1
            except KeyError:
                count += 1
                continue
            sum_line += 1
    print "ignored lines is " + str(count)
    print "precision is " + str(float(vec_sim) / sum_line)

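# The analogy check above asks for the word closest to word2 - word1 + word3
# and compares it against word4. A standalone sketch with the classic
# man:king :: woman:queen line ('vectors.txt' is the same default text-format
# file assumed by Analogy_test):
model = Word2Vec.load_word2vec_format('vectors.txt', binary=False)
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print result[0][0]  # 'queen' for well-trained English vectors
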
def __init__(self, tokenWeights=True, extraFeatures=True,
             EXTRA_WEIGHTS_LABELS=['bleuScore', 'similarityScore',
                                   'wordMoversDistance', 'crossUnigramsRatio']):
    self.words = {}
    self.words2 = {}  # hypothesis words
    self.wordId = 0
    self.wordId2 = 0  # hypothesis
    self.extraFeatures = {}  # for our new features
    self.docId = 0
    self.documents = {}
    self.tokenWeights = tokenWeights
    # note: overwrites the extraFeatures dict assigned above
    self.extraFeatures = extraFeatures
    self.EXTRA_WEIGHTS_LABELS = EXTRA_WEIGHTS_LABELS

    if not os.path.exists("data/embed.dat"):
        print("Caching word embeddings in memmapped format...")
        from gensim.models.word2vec import Word2Vec
        wv = Word2Vec.load_word2vec_format(
            "data/GoogleNews-vectors-negative300.bin.gz", binary=True)
        wv.init_sims(replace=True)  # recommended new step
        fp = np.memmap("data/embed.dat", dtype=np.double, mode='w+',
                       shape=wv.syn0.shape)
        fp[:] = wv.syn0[:]
        with open("data/embed.vocab", "w") as f:
            for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                f.write(w.encode('utf-8'))
                f.write('\n'.encode('utf-8'))
        del wv
    self.W = np.memmap("data/embed.dat", dtype=np.double, mode="r",
                       shape=(3000000, 300))
    with open("data/embed.vocab") as f:
        self.vocab_list = map(str.strip, f.readlines())
    self.vocab_dict = {w: k for k, w in enumerate(self.vocab_list)}

def read_extend_file(self):
    print('now read extend file')
    self.extend_word_vect_modle = Word2Vec.load_word2vec_format(
        self.extend_vect_file, binary=False)
    max_extend_word_len = len(self.extend_word_vect_modle.wv.vocab)
    for index in range(max_extend_word_len):
        word = self.extend_word_vect_modle.index2word[index]
        self.words_vect_matrix.append(self.extend_word_vect_modle[word])
        self.extend_words_vect_matrix.append(self.extend_word_vect_modle[word])
        self.extend_words_vect[word] = self.extend_word_vect_modle[word].tolist()
        self.extend_words_index[word] = self.max_word_len + index
        self.extend_index_words[self.max_word_len + index] = word
    print('now read extend file done!')

class MyView(View):
    model = Word2Vec.load_word2vec_format(MODEL_FILE, binary=True)

    @classmethod
    def n_similarity(cls, s1, s2):
        # TODO: preprocessing of s1 and s2 goes here
        s1, s2 = cls.__removeStopwords(s1), cls.__removeStopwords(s2)
        if not s1 or not s2:
            return 0.0
        return cls.model.n_similarity(s1, s2)

    @classmethod
    def __removeStopwords(cls, tokens):
        withoutstops = []
        for word in tokens:
            if not word or (word not in cls.model):
                continue
            withoutstops.append(word)
        return withoutstops

    @classmethod
    def get(cls, request, *args, **kwargs):
        s1, s2 = request.GET.get('s1', '[]'), request.GET.get('s2', '[]')
        s1, s2 = json.loads(s1), json.loads(s2)
        return JsonResponse({'n_similarity': cls.n_similarity(s1, s2)})

def word2vec():
    print('Loading word2vec model...')
    w2v = Word2Vec.load_word2vec_format(
        'data/GoogleNews-vectors-negative300.bin', binary=True)
    print('Creating listener...')
    address = ('localhost', 6000)
    with Listener(address, authkey=b'password') as listener:
        while True:
            with listener.accept() as conn:
                print('connection accepted from {0}'.format(listener.last_accepted))
                while True:
                    try:
                        msg = conn.recv()
                        try:
                            if msg[0] == 'vocab':
                                conn.send(msg[1] in w2v.vocab)
                            elif isinstance(msg[0], list):
                                conn.send(w2v.n_similarity(*msg))
                            else:
                                conn.send(w2v.similarity(*msg))
                        except KeyError:
                            conn.send(0.)
                    except (EOFError, ConnectionResetError):
                        break

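# A minimal client sketch for the listener loop above (same address and
# authkey). The protocol accepts a ('vocab', word) probe, a pair of token
# lists for n_similarity, or a pair of words for similarity.
from multiprocessing.connection import Client

conn = Client(('localhost', 6000), authkey=b'password')
conn.send(['vocab', 'hello'])                      # -> bool: in vocabulary?
print(conn.recv())
conn.send([['nice', 'day'], ['good', 'weather']])  # -> n_similarity of lists
print(conn.recv())
conn.send(['hello', 'world'])                      # -> similarity of words
print(conn.recv())
conn.close()
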
def load_vectors():
    print("loading word2vec vectors...")
    t0 = time()
    model = Word2Vec.load_word2vec_format(
        '/Volumes/Seagate Backup Plus Drive/MacFilesThatICantFit/GoogleNews-vectors-negative300.bin',
        binary=True)
    loadTime = time() - t0
    print("word2vec vectors loaded in %0.3f seconds" % loadTime)
    print()

    # done "training" the model; trim unneeded memory
    t0 = time()
    print("trimming model memory...")
    model.init_sims(replace=True)
    trimTime = time() - t0
    print("trimmed memory in %0.3f seconds" % trimTime)
    print()
    return model

def load_word_embedding_dict(embedding, embedding_path):
    """
    Read word vectors from a file.

    :param embedding: embedding type
    :param embedding_path: path to the embedding file
    :return: word vectors (dict or model), embedding dimensionality, and
             whether the result is a plain dict
    """
    if embedding == 'word2vec':
        word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=True)
        embed_dim = word2vec.vector_size
        return word2vec, embed_dim, False
    elif embedding == 'glove':
        embed_dim = -1
        embed_dict = dict()
        with open(embedding_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                line = line.strip()
                if len(line) == 0:
                    continue
                tokens = line.split()
                if embed_dim < 0:
                    embed_dim = len(tokens) - 1
                else:
                    assert embed_dim + 1 == len(tokens)
                embed = np.empty([1, embed_dim], dtype=np.float64)
                embed[:] = tokens[1:]
                embed_dict[tokens[0]] = embed
        return embed_dict, embed_dim, True
    else:
        raise ValueError("embedding must be one of [word2vec, glove]")

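# Minimal usage sketch for load_word_embedding_dict above; both paths are
# hypothetical. The third return value tells the caller whether it got a
# plain dict (GloVe) or a gensim model (word2vec).
embed_dict, dim, is_dict = load_word_embedding_dict('glove', 'data/glove.6B.50d.txt')
w2v_model, dim, is_dict = load_word_embedding_dict('word2vec', 'data/vectors.bin')
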
def __init__(self, wmodel, sentiment_tagger, opinion_extractor, clusterer):
    if isinstance(wmodel, Word2Vec):
        self.wmodel = wmodel
    else:
        self.wmodel = Word2Vec.load_word2vec_format(wmodel, binary=True)
    self.tagger = sentiment_tagger
    self.extractor = opinion_extractor
    self.clusterer = clusterer

def loadGoogleVector():
    t1 = time.clock()
    vector_bin = "/home/paul/Data/GoogleNews-vectors-negative300.bin"
    model = Word2Vec.load_word2vec_format(vector_bin, binary=True)
    t2 = time.clock()
    print ("loading GoogleVector time : %.2f" % (t2 - t1))
    return model

def from_file(cls, filepath, binary, stemmer=None, pos_tagger=None):
    assert isinstance(binary, bool)
    assert isinstance(stemmer, Stemmer) or stemmer is None
    assert isinstance(pos_tagger, POSTagger) or pos_tagger is None
    w2v_model = Word2Vec.load_word2vec_format(filepath, binary=binary)
    return cls(w2v_model, stemmer, pos_tagger)

def check(self, model):
    assert model.contains(['topics_term', 'sentences_term'])
    with ElapsedTimeIndicator('load ' + self._word2vec_model + ' [{elapsed}]') as indicator:
        self._word2vec = Word2Vec.load_word2vec_format(self._word2vec_model,
                                                       binary=True)
        self._word2vec.init_sims(replace=True)

def __init__(self, word2vec_model, use_binary=True):
    """
    :param word2vec_model: The word2vec model path.
    :param use_binary: Whether the word2vec model is binary.
    """
    logger.info("Loading word2vec ...")
    self.model = Word2Vec.load_word2vec_format(word2vec_model, binary=use_binary)
    logger.info("Loading done...")

def get_model(model_num, model_names):
    if model_num < 10:
        model = Word2Vec.load(model_path + model_names)
    elif model_num < 99:
        model = Doc2Vec.load(model_path + model_names)
    else:
        model = Word2Vec.load_word2vec_format(model_path + model_names,
                                              binary=True)  # C binary format
    return model

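# Minimal usage sketch for get_model above; model_path comes from the
# surrounding module and the file names are hypothetical.
w2v = get_model(5, 'tokens.w2v.model')    # gensim-native Word2Vec save
d2v = get_model(50, 'docs.d2v.model')     # gensim-native Doc2Vec save
goog = get_model(100, 'GoogleNews-vectors-negative300.bin')  # C binary format
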
def __train__(self):
    if self.restore is None:
        print('start to train word2vec models ... ')
        self.wvec_model = Word2Vec(sentences=self.corpuss, size=args.dimension,
                                   window=args.window, workers=args.workers,
                                   sg=args.sg, batch_words=args.batch_size,
                                   min_count=3)  # max_vocab_size=args.vocab_size
    else:
        self.wvec_model = Word2Vec.load_word2vec_format(args.restore, binary=True)
    # self.rand_model = RandomVec(args.dimension)

def __init__(self):
    # ATTENTION ------------------------------------
    # if changing filepath of vectors.bin do so here
    home = expanduser("~")
    filename = home + "/trunk/vectors.bin"
    # ----------------------------------------------
    try:
        self.model = Word2Vec.load_word2vec_format(filename, binary=True)
    except IOError:
        self.foundFile = False

def word2vec_features(data_matrix, stemming=False, stop_words=None, TFIDF=False,
                      ngram_range=(1, 1), max_features=None, length=False,
                      number_in_tweet=False, words_present=[], policy='sum'):
    print '\n------------------'
    print 'Creating feature vector matrix...\n'
    if stemming:
        print '\n------------------'
        print 'Stemming...'
        stemmer = SnowballStemmer("english")
        tweets = [" ".join([stemmer.stem(word)
                            for word in word_tokenize(data_point[2].lower().decode("utf8"))])
                  for data_point in data_matrix]
    else:
        tweets = [data_point[2].lower() for data_point in data_matrix]

    print '\n------------------'
    print 'Loading word2vec model...'
    model = Word2Vec.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin',
                                          binary=True)  # C binary format

    # determine the policy on how to build vectors
    if policy == 'sum':
        policy = _build_sent_vec_as_sum
    else:
        policy = _build_sent_vec_as_average

    print 'Applying word2vec model...'
    # create a len(tweets) x 300 dimensional matrix
    dataset = np.squeeze(np.array([policy(sent, model) for sent in tweets]))
    print "Done"

    if length:
        lengths = np.array([[len(word_tokenize(data_point[2].decode("utf8")))]
                            for data_point in data_matrix])
        dataset = np.concatenate((dataset, lengths), axis=1)

    if number_in_tweet:
        numbers = []
        for data_point in data_matrix:
            number_list = list_of_ints_from_string(data_point[2])
            filtered_number_list = [number for number in number_list if abs(number) < 10]
            if len(filtered_number_list) == 0:
                numbers.append([0])
            else:
                numbers.append([np.mean(filtered_number_list)])
        dataset = np.concatenate((dataset, numbers), axis=1)

    for word in words_present:
        word_present = np.array(
            [[int(word.lower() in word_tokenize(data_point[2].lower().decode("utf8")))]
             for data_point in data_matrix])
        dataset = np.concatenate((dataset, word_present), axis=1)

    print '\n------------------'
    print 'Feature vector constructed.'
    return dataset

def teword():
    # model = Word2Vec.load_word2vec_format('vectorseg.bin', binary=False)
    # sim = model.most_similar(positive=[u'好', u'开心'], negative=[u'下雨'], topn=2)
    # print sim
    # gensim expects tokenized sentences (lists of tokens), so split on spaces
    documents = [u"今天 天气 真是 好 啊".split(),
                 u"明天 就要 下雨 了,伐 开心".split()]
    model = Word2Vec(documents, size=20, window=5, min_count=1)
    sim = model.most_similar(positive=[u"好"], topn=2)
    # model.save('./tmp/tevec')
    print sim
    model = Word2Vec.load_word2vec_format('vectorseg.bin', binary=False)
    # merge vectors for the shared vocabulary from another file into the model
    Word2Vec.intersect_word2vec_format(model, 'fieldvec.bin', binary=False)
    # Word2Vec.train_batch_sg(model, sentences, alpha, work=None)
    # (requires `sentences` and `alpha` to be defined before calling)

def getEmbeddingsAndVocab(w2vModelFilename, rebuild=False):
    if path.exists(w2vModelFilename):
        p, f = path.split(w2vModelFilename)
        fName = f.split('.')[0]
        matFile = path.join(p, fName + "-mat.npy")
        vocFile = path.join(p, fName + "-voc.pkl")
        if not path.exists(matFile) or not path.exists(vocFile):
            model = Word2Vec.load_word2vec_format(w2vModelFilename, binary=False)
            np.save(matFile, model.syn0)
            cPickle.dump(model.vocab, open(vocFile, "w"))
        m = np.load(matFile)
        v = cPickle.load(open(vocFile, "r"))
        return m, v

def make_model(type='gensim'):
    if type == 'google':
        model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin',
                                              binary=True)
    else:
        nlu = spacy.en.English()
        model = Word2Vec(size=300)
        for i, lex in enumerate(nlu.vocab):
            model.vocab[lex.orth_] = Vocab(index=i, count=None)
            model.index2word.append(lex.orth_)
        model.syn0norm = np.asarray([lex.repvec for lex in nlu.vocab])
        model.syn0 = np.asarray([lex.repvec for lex in nlu.vocab])
    return model

def main(self):
    self.in_filen = sys.argv[1]
    self.file_pref, ext = os.path.splitext(self.in_filen)
    # TODO skip words with control characters and decrease vocab size in header
    if ext == '.pkl':
        # this branch is for embeddings from
        # https://sites.google.com/site/rmyeid/projects/polyglot
        logging.warning(
            'There is a version of this function in the multiwsi repo '
            'that writes the embedding with fewer digits (using str.format)')
        with open(self.file_pref + '.w2v', mode='w') as out_file:
            with open(self.in_filen, mode='rb') as in_file:
                words, vecs = pickle.load(in_file)
                out_file.write('{} {}\n'.format(*vecs.shape))
                for word, vec in zip(words, vecs):
                    out_file.write('{} {}\n'.format(
                        word.encode('utf8'),
                        ' '.join(str(coord) for coord in vec.tolist())))
    elif ext == '.w2v':
        m = Word2Vec.load_word2vec_format(self.in_filen)
        m.save(self.file_pref + '.gensim')
    elif ext == '.txt':
        self.read_txt()
    elif ext == '.bin':
        if 'glove' in self.file_pref:
            raise NotImplementedError('glove binaries are not supported')
        else:
            m = Word2Vec.load_word2vec_format(self.in_filen, binary=True)
            logging.info("Saving {}".format(self.file_pref + '.gensim'))
            m.save(self.file_pref + '.gensim')
            logging.info("Saving {}".format(self.file_pref + '.w2v'))
            m.save_word2vec_format(self.file_pref + '.w2v')
    else:
        raise NotImplementedError('unknown extension')

def __init__(self, feature_extractor, **kwargs):
    self.feature_extractor = feature_extractor
    self.sizes = {}
    self.embedding = {}
    for suffix, dims in kwargs.items():
        dim = dims[0]
        if isinstance(dim, int):
            self.sizes[suffix] = dim
            self.embedding[suffix] = defaultdict(
                lambda s=dim: Config().random.normal(size=s))
        else:
            print("Loading word vectors from '%s'..." % dim)
            w2v = Word2Vec.load_word2vec_format(dim)
            unk = Config().random.normal(size=w2v.vector_size)
            self.sizes[suffix] = w2v.vector_size
            self.embedding[suffix] = Word2VecWrapper(w2v, unk)

def get(self, config, from_file=False):
    print("Getting sentence vectors")
    lines_ref = codecs.open(os.path.expanduser(config.get('Data', 'ref')) + '.token',
                            'r', 'utf-8').readlines()
    lines_tgt = codecs.open(os.path.expanduser(config.get('Data', 'tgt')) + '.token',
                            'r', 'utf-8').readlines()
    fvectors = os.path.expanduser(config.get('Vectors', 'path'))
    wv = Word2Vec.load_word2vec_format(fvectors, binary=False)
    AbstractProcessor.set_result_tgt(self, self.sents2vec(lines_tgt, wv))
    AbstractProcessor.set_result_ref(self, self.sents2vec(lines_ref, wv))
    wv = None  # release the model
    print("Finished getting sentence vectors")

def __init__(self, feature_extractor, **kwargs):
    self.feature_extractor = feature_extractor
    self.feature_types = {
        "numeric": FeatureInformation(feature_extractor.num_features_numeric())}
    for suffix, (dim, size) in kwargs.items():
        if isinstance(dim, int):
            init = None
            indices = self.auto_increment_dict(size)
        else:
            print("Loading word vectors from '%s'..." % dim)
            w2v = Word2Vec.load_word2vec_format(dim)
            size = len(w2v.vocab) + 1
            dim = w2v.vector_size
            init = (w2v,)
            indices = self.auto_increment_dict(size, w2v.vocab)
        self.feature_types[suffix] = FeatureInformation(
            feature_extractor.num_features_non_numeric(suffix),
            dim, size, init, indices)

def train_google_model(self, google_file):
    """
    Use the Google word vector dataset to extract word features.

    Parameters:
    -----------
    google_file: string
        the location of google.bin/G.bin

    Return:
    -------
    None
    """
    self.google_modopel = Word2Vec.load_word2vec_format(google_file, binary=True)

def __init__(self, remote):
    global _w2v
    global _w2v_conn
    self.remote = remote
    if not remote and _w2v is None:
        print('Loading word2vec model...')
        _w2v = Word2Vec.load_word2vec_format('data/GoogleNews-vectors-negative300.bin',
                                             binary=True)
        self.vocab = Vocab(remote, None)
        print('Done loading word2vec')
    elif _w2v_conn is None:
        print('Connecting to word2vec process...')
        address = ('localhost', 6000)
        _w2v_conn = Client(address, authkey=b'password')
        self.vocab = Vocab(remote, _w2v_conn)
        print('Done connecting to word2vec')
    self.conn = _w2v_conn

def create_cache(filepath="data"):
    if (not os.path.exists(filepath + "/" + "embed.dat")
            or not os.path.exists(filepath + "/" + "embed.vocab")):
        print("Caching word embeddings...", file=sys.stderr)
        from gensim.models.word2vec import Word2Vec
        wv = Word2Vec.load_word2vec_format(
            filepath + "/" + "GoogleNews-vectors-negative300.bin.gz", binary=True)
        fp = np.memmap(filepath + "/" + "embed.dat", dtype=np.double, mode='w+',
                       shape=wv.syn0.shape)
        fp[:] = wv.syn0[:]
        with open(filepath + "/" + "embed.vocab", "w", encoding="utf8") as f:
            for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                print(w, file=f)
        del fp, wv
        print('done', file=sys.stderr)

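# Read-back sketch for the cache written by create_cache above, mirroring the
# memmap reads elsewhere in this section (GoogleNews layout: 3,000,000 x 300).
W = np.memmap("data/embed.dat", dtype=np.double, mode="r", shape=(3000000, 300))
with open("data/embed.vocab", encoding="utf8") as f:
    vocab_dict = {w.strip(): i for i, w in enumerate(f)}
vec = W[vocab_dict["hello"]]  # 300-dimensional vector for 'hello'
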
def __init__(self, fn='models/GoogleNews-vectors-negative300.bin', threshold=0.4):
    """creates a Transformer

    :param fn: location of the model to load
    :type fn: str
    """
    url = 'https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM'
    download_msg = 'Download the Google News model here: ' + url
    if not os.path.isfile(fn):
        raise ValueError('File {} not found!\n'.format(fn) + download_msg)
    self.model = Word2Vec.load_word2vec_format(fn, binary=True)
    self.threshold = threshold
    # dumb caching
    self._last_less = None
    self._last_more = None

def create_embeddings_weights(self):
    config = self.config
    tk = self.tokenizer
    word2index = tk.word_index
    # reverse index
    index2word = {i: w for (w, i) in tk.word_index.items()}
    max_size = len(index2word) + 1
    # load w2v model
    w2v_vectors_file = config["w2v_data"]
    w2v = Word2Vec.load_word2vec_format(w2v_vectors_file, binary=True)
    word_vector_dims = w2v.vector_size
    embedding_weights = np.zeros((max_size, word_vector_dims))
    for i, w in index2word.items():
        try:
            embedding_weights[i, :] = w2v[w]
        except KeyError:
            print("{} not found".format(w))
    return (w2v, embedding_weights)

def get(self, config, from_file=False):
    lines_ref = codecs.open(os.path.expanduser(config.get('Data', 'ref')) + '.token',
                            'r', 'utf-8').readlines()
    lines_tgt = codecs.open(os.path.expanduser(config.get('Data', 'tgt')) + '.token',
                            'r', 'utf-8').readlines()
    fvectors = os.path.expanduser(config.get('Vectors', 'path'))
    print("Loading word vectors from " + fvectors)
    wv = Word2Vec.load_word2vec_format(fvectors, binary=False)
    print("Finished loading word vectors from " + fvectors)
    print("Building sentence vectors for target...")
    AbstractProcessor.set_result_tgt(self, self.words2vec(lines_tgt, wv))
    print("Finished building sentence vectors for target")
    print("Building sentence vectors for reference...")
    AbstractProcessor.set_result_ref(self, self.words2vec(lines_ref, wv))
    print("Finished building sentence vectors for reference")
    wv = None  # release the model
    print("Finished getting word vectors")