def compare(dataset, model_name, pre_model_name):
    # build model
    if os.path.isfile(model_name):
        model = Word2Vec.load(model_name)
        logger.debug("model %s already exists, skipping word vector training", model_name)
    else:
        logger.info("start training word vectors")
        start_time = timeit.default_timer()
        model = wordvector.build_word_vector(dataset, save=True, save_file=model_name)
        logger.info("model %s trained in %.4fs", model_name,
                    timeit.default_timer() - start_time)

    # find most similar words
    for word in keywords:
        print(word)
        print(model.most_similar(word, topn=10))

    # load pre-trained Google News model
    logger.info("start loading pre-trained dataset")
    start_time = timeit.default_timer()
    pre_model = Word2Vec.load_word2vec_format(pre_model_name, binary=True)
    logger.info("pre-trained dataset loaded in %.4fs",
                timeit.default_timer() - start_time)

    # find most similar words
    for word in keywords:
        print(word)
        print(pre_model.most_similar(word, topn=10))
def load(filename):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # check the filename *suffix*; the original sliced filename[:-6],
    # which drops the last six characters instead of selecting them
    if filename[-6:] == "bin.gz":
        model = Word2Vec.load_word2vec_format(filename, binary=True)
    else:
        model = Word2Vec.load(filename)
    return model
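# A minimal usage sketch for load() above. The model path and the query word
# are hypothetical; any .bin.gz file is routed to the word2vec binary loader,
# everything else to gensim's native loader.
model = load('my_corpus.model')
print(model.most_similar('word', topn=5))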
def load_word_embedding(data_name='google_news', data_type='bin'):
    logger.info('Start loading word2vec word embedding')
    os_name = get_os_name()
    if os_name == 'windows':
        file1 = 'D:/Word_Embeddings/GoogleNews-vectors-negative300.bin.gz'
        file2 = 'D:/Word_Embeddings/freebase-vectors-skipgram1000.bin.gz'
        file3 = 'D:/Word_Embeddings/GoogleNews-vectors-negative300.bin'
        file4 = 'D:/Word_Embeddings/freebase-vectors-skipgram1000.bin'
    elif os_name == 'ubuntu':
        file1 = '/home/hs/Data/Word_Embeddings/GoogleNews-vectors-negative300.bin.gz'
        file2 = '/home/hs/Data/Word_Embeddings/freebase-vectors-skipgram1000.bin.gz'
        file3 = '/home/hs/Data/Word_Embeddings/google_news.bin'
        file4 = '/home/hs/Data/Word_Embeddings/freebase.bin'
    if data_name == 'google_news':
        if data_type == 'bin':
            model = Word2Vec.load_word2vec_format(file3, binary=True)
        else:
            # load .bin.gz data; gzipped/bz2 input works too, no need to unzip
            model = Word2Vec.load_word2vec_format(file1, binary=True)
    else:  # load freebase
        if data_type == 'bin':
            model = Word2Vec.load_word2vec_format(file4, binary=True)
        else:
            model = Word2Vec.load_word2vec_format(file2, binary=True)
    logging.info('Loading word embedding complete')
    return model
def initialize(fword, tword, modelfn, start, debug):
    juman = Juman()
    # parse and check from_word
    ms_f = juman.analysis(fword).mrph_list()
    if len(ms_f) > 1:
        print(u'{} is parsed into multiple words'.format(fword))
        exit(1)
    wm_f = ms_f[0]
    if not wm_f.repname:
        print(u'no repname for {}'.format(fword))
        exit(1)
    fword = wm_f.repname
    # parse and check to_word
    ms_t = juman.analysis(tword).mrph_list()
    if len(ms_t) > 1:
        print(u'{} is parsed into multiple words'.format(tword))
        exit(1)
    wm_t = ms_t[0]
    if not wm_t.repname:
        print(u'no repname for {}'.format(tword))
        exit(1)
    tword = wm_t.repname
    # load and check model
    print(u'loading model...')
    if modelfn.split('.')[-1] == 'model':
        model = Word2Vec.load(modelfn)
    elif modelfn.split('.')[-1] == 'bin':
        model = Word2Vec.load_word2vec_format(modelfn, binary=True,
                                              unicode_errors='ignore')
    # raising already aborts; the original's exit(1) after each raise was unreachable
    if fword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(fword))
    elif tword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(tword))
    model.save('hs0.100m.500.5.18mgt100.model')
    t1 = time.clock() - start
    if debug:
        printtime(t1)
    print(u'constructing id2vocab map...')
    id2vocab = {}
    for i, v in enumerate(model.vocab):
        id2vocab[i] = v
    t2 = time.clock() - t1
    if debug:
        printtime(t2)
    print(u'constructing V...')
    V = []
    for v in model.vocab:
        V.append(model[v])
    V = np.vstack(V)
    t3 = time.clock() - t2
    if debug:
        printtime(t3)
    return fword, tword, model, V, id2vocab, t3
def _load_vector_space_mapper(model_1_path, model_2_path, bilingual_path):
    """Build a vector space mapper from model 1,2 and bilingual dict."""
    model_1 = Word2Vec.load(model_1_path)
    model_2 = Word2Vec.load(model_2_path)
    bilingual_dict = bg.load_bilingual_dictionary(bilingual_path)
    tvecs_vm = VectorSpaceMapper(model_1, model_2, bilingual_dict)
    tvecs_vm.map_vector_spaces()
    return tvecs_vm
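# A usage sketch for the mapper above; the two model paths and the bilingual
# dictionary path are hypothetical placeholders:
vm = _load_vector_space_mapper('en.model', 'hi.model', 'bilingual_dict.txt')
# vm can now map vectors from model_1's space into model_2's space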
def runWord2Vec(condition_one, condition_two):
    num_features = 300
    model_path = "../models/W2V/" + str(num_features) + "features_20minwords_10context"
    try:
        model = Word2Vec.load(model_path)
    except Exception as e:  # Python 3 syntax; the original used "except Exception, e"
        # model not trained yet: train it, then load it
        Create_W2V_model.trainWorld2Vec(num_features)
        model = Word2Vec.load(model_path)
def __init__(self, vector_fn='vectors_example.bin', word2vec=True):
    if word2vec:
        self.lsa_model = Word2Vec.load_word2vec_format(
            os.path.join(os.environ['LSA_DIR'], vector_fn), binary=True)
    else:
        self.lsa_model = Word2Vec.load(
            os.path.join(os.environ['LSA_DIR'], vector_fn))
    self.alpha = 0.25
    self.cache = {}
    self.wn_cache = WordnetCache()
def get_vec_sim(self):
    model_fn = self.config.get("vectors", "model")
    model_type = self.config.get("vectors", "model_type")
    logging.warning("Loading model: {0}".format(model_fn))
    if model_type == "word2vec":
        self.vec_model = Word2Vec.load_word2vec_format(model_fn, binary=True)
    elif model_type == "gensim":
        self.vec_model = Word2Vec.load(model_fn)
    else:
        raise Exception("Unknown LSA model format")
    logging.warning("Model loaded: {0}".format(model_fn))
def load_w2v(model_path, vocab):
    from gensim.models import Word2Vec
    try:
        model = Word2Vec.load_word2vec_format(model_path, binary=True)
    except UnicodeDecodeError:
        model = Word2Vec.load(model_path)
    dim = model.layer1_size
    word_vecs = {}
    for word in vocab:
        if word in model:
            word_vecs[word] = model[word]
    return word_vecs, dim
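# A minimal sketch of calling load_w2v with a small vocabulary, assuming the
# GoogleNews binary vectors are available locally (hypothetical path):
word_vecs, dim = load_w2v('GoogleNews-vectors-negative300.bin', ['king', 'queen'])
print(dim, len(word_vecs))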
def read_glove_model(dim=50, huge=False):
    print("reading GloVe word embedding vectors...")
    # check huge first; in the original it was the last elif, so it could
    # never fire with the default dim=50
    if huge:
        return read_glove_to_dict(glove_vector_huge)
    if dim == 50:
        return Word2Vec.load_word2vec_format(glove_vector_50, binary=False)
    elif dim == 100:
        return Word2Vec.load_word2vec_format(glove_vector_100, binary=False)
    elif dim == 200:
        return Word2Vec.load_word2vec_format(glove_vector_200, binary=False)
    elif dim == 300:
        return Word2Vec.load_word2vec_format(glove_vector_300, binary=False)
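# Note: raw GloVe text files lack the word2vec header line, so
# load_word2vec_format only accepts them after a one-off conversion. A sketch,
# assuming a raw file 'glove.6B.50d.txt' on disk (the glove_vector_* paths
# above may already point at converted files):
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec('glove.6B.50d.txt', 'glove.6B.50d.w2v.txt')
model = Word2Vec.load_word2vec_format('glove.6B.50d.w2v.txt', binary=False)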
def set_embedding_weights(self, embedding_init):
    # load embedding with gensim
    from gensim.models import Word2Vec
    try:
        m = Word2Vec.load_word2vec_format(embedding_init, binary=False)
        edim = m.layer1_size
    except UnicodeDecodeError:
        try:
            m = Word2Vec.load_word2vec_format(embedding_init, binary=True)
            edim = m.layer1_size
        except UnicodeDecodeError:
            # not in word2vec format
            m = Word2Vec.load(embedding_init)
            edim = m.layer1_size
    except ValueError:
        # glove model: plain text, one word and vector per line
        m = {}
        if embedding_init.endswith('gz'):
            fp = gzip.open(embedding_init)
        else:
            fp = open(embedding_init)
        for l in fp:
            le = l.split()
            m[le[0].decode('utf-8')] = numpy.array(
                [float(e) for e in le[1:]], dtype=theano.config.floatX)
            edim = len(le) - 1
    if edim != self.edim:
        raise Exception("Embedding dim and edim don't match")
    m_lower = {}
    vocab = (m.vocab if hasattr(m, 'vocab') else m)
    for k in vocab:
        if k in ['UNKNOWN', 'PADDING']:
            continue
        if self.num:
            m_lower[replace_numerals(k.lower())] = m[k]
        else:
            m_lower[k.lower()] = m[k]
    # transform weight matrix using self.w2i
    params = numpy.zeros(
        self.tagger.layers[0].layers[0].get_param_vector().shape,
        dtype=theano.config.floatX)
    e = self.edim
    for w in self.w2i:
        if w in m_lower:
            v = m_lower[w]
            i = self.w2i[w]
            params[i*e:(i+1)*e] = v
    # index the embedding container m, not the vocab map: for gensim models,
    # vocab['UNKNOWN'] is a Vocab bookkeeping object, not a vector
    if 'UNKNOWN' in vocab:
        params[-1*e:] = m['UNKNOWN']
    if 'PADDING' in vocab:
        params[-2*e:-1*e] = m['PADDING']
    self.tagger.layers[0].layers[0].set_param_vector(params)
def to_space(
        word2vec=('', 'GoogleNews-vectors-negative300.bin.gz', 'Path to word2vec vectors.'),
        output=('o', 'space.h5', 'The output space file.'),
        word2vec_format=('', False, 'Word2vec_format.'),
        pos_separator=('', '', 'POS separator.'),
):
    """Read a word2vec file and save it as a space file."""
    from gensim.models import Word2Vec
    if word2vec_format:
        model = Word2Vec.load_word2vec_format(word2vec, binary=True)
    else:
        model = Word2Vec.load(word2vec)
    if not pos_separator:
        targets = pd.DataFrame(
            {
                'id': range(len(model.index2word)),
                'ngram': model.index2word,
                'tag': '_',
            },
        )
    else:
        tokens = [s.rsplit(pos_separator, maxsplit=1) for s in model.index2word]
        targets = pd.DataFrame(
            {
                'id': range(len(model.index2word)),
                'ngram': [n for n, _ in tokens],
                'tag': [t for _, t in tokens],
            },
        )
    targets.set_index(['ngram', 'tag'], inplace=True)
    context = pd.DataFrame(
        {
            'id': range(model.syn0.shape[1]),
            'ngram': range(model.syn0.shape[1]),
            'tag': '_',
        },
    )
    context.set_index(['ngram', 'tag'], inplace=True)
    space = Space(
        data_ij=model.syn0,
        row_labels=targets,
        column_labels=context,
    )
    space.write(output)
def load_matrix_and_dictionary(fn, typ, dict_fn=None, filt_dict=None):
    if typ == 'numpy':
        return np.load(fn), load_dictionary_as_dict(dict_fn)
    elif typ == 'glove':
        from glove import Glove
        m = Glove().load_stanford(fn)
        return m.word_vectors, m.dictionary
    elif typ == 'word2vec':
        from gensim.models import Word2Vec
        if 'txt' in fn or 'w2v' in fn:
            m = Word2Vec.load_word2vec_format(fn, binary=False)
        else:
            m = Word2Vec.load_word2vec_format(fn, binary=True)
        return extract_wordvec_matrix_and_dict(m, filt_dict)
    raise Exception('Unknown matrix format: {}'.format(typ))
def __init__(self, section, config):
    self.model_fn = config.get(section, 'model')
    self.model_type = config.get(section, 'model_type')
    self.wordnet_boost = config.getboolean(section, 'wordnet_boost')
    self.use_twitter_norm = config.getboolean('alignment', 'twitter_norm')
    logging.info('Loading model: {0}'.format(self.model_fn))
    if self.model_type == 'word2vec':
        self.model = Word2Vec.load_word2vec_format(self.model_fn, binary=False)
    elif self.model_type == 'gensim':
        self.model = Word2Vec.load(self.model_fn)
    else:
        raise Exception('Unknown LSA model format')
    logging.info('Model loaded: {0}'.format(self.model_fn))
    self.sim_cache = {}
def train(train_dir, test_dir=None, nn='berger_cnn', nb_epochs=NB_EPOCHS,
          batch_size=BATCH_SIZE, verbose=1):
    # Figure out whether we're predicting categories or keywords
    if NO_OF_LABELS == 14:
        scaler_path = CATEGORY_SCALER
        w2v_path = CATEGORY_WORD2VEC
    else:
        scaler_path = KEYWORD_SCALER
        w2v_path = KEYWORD_WORD2VEC

    model = MagpieModel(
        word2vec_model=Word2Vec.load(w2v_path),
        scaler=load_from_disk(scaler_path),
    )
    logger = CustomLogger(nn)
    model_checkpoint = ModelCheckpoint(
        os.path.join(logger.log_dir, 'keras_model'),
        save_best_only=True,
    )
    history = model.train(
        train_dir,
        get_labels(NO_OF_LABELS),
        test_dir=test_dir,
        nn_model=nn,
        callbacks=[logger, model_checkpoint],
        batch_size=batch_size,
        nb_epochs=nb_epochs,
        verbose=verbose,
    )
    finish_logging(logger, history)
    return history, model
def vectors_to_pickled_dict(desired_words, output_file, norm=True, filename=__BIN_FILE_):
    print("Loading Model")
    model = Word2Vec.load_word2vec_format(filename, binary=True)
    print("Loaded")
    wd2vec = dict()
    if desired_words:
        desired_words = set(desired_words)
    else:
        desired_words = model.vocab.keys()
    for i, wd in enumerate(desired_words):
        if i % 1000 == 0:
            print(i)
        # collapse double spaces left by remove_non_ascii
        wd = remove_non_ascii(wd).replace("  ", " ").strip()
        # for phrases: the model stores multi-word entries with underscores
        wd_key = wd.replace(" ", "_")
        if wd_key in model.vocab:
            ix = model.vocab[wd_key].index
            vector = model.syn0norm[ix] if norm else model.syn0[ix]
            wd2vec[wd.replace("_", " ").strip()] = vector
    # pickle needs a binary file handle; the original opened with "w+"
    with open(output_file, "wb") as f:
        Pickle.dump(wd2vec, f)
def load_CUI_vectors():
    '''
    From De Vine et al., CIKM 2014
    https://github.com/clinicalml/embeddings
    '''
    m = Word2Vec.load_word2vec_format("DeVine_etal_200.txt.gz")
    return m
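# Newer gensim releases moved this loader to KeyedVectors (it was deprecated in
# gensim 1.0 and removed in 4.0); on a modern install the equivalent call is
# the sketch below, with the same gzipped text-format file:
from gensim.models import KeyedVectors
m = KeyedVectors.load_word2vec_format("DeVine_etal_200.txt.gz")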
def __init__(self,
             model_file_path=''.join(config['model_file_path']),
             label_file_path=''.join(config['label_file_path']),
             word2vec_to_solve_oov=config['word2vec_to_solve_oov']):
    '''
    :param word2vec_to_solve_oov: whether to use word2vec to look up OOV words
    '''
    self._word2vec_to_solve_oov = word2vec_to_solve_oov
    self._model_file_path = model_file_path
    self._full_mode = config['full_mode']
    logging.debug('Use full segmentation mode? %s...' % (self._full_mode))
    logging.debug('=' * 20)
    logging.debug('Loading the classifier model and encoders...')
    model_in_file = open(model_file_path, 'r')
    self._model = pickle.load(model_in_file)
    self._bow_encoder = pickle.load(model_in_file)
    self._cnn_encoder = pickle.load(model_in_file)
    self._index_to_label = np.load(open(label_file_path, 'r'))
    self._keywords = self._bow_encoder.get_feature_names()
    # testing
    logging.debug('=' * 20)
    logging.debug('Testing...')
    logging.debug('-' * 20)
    logging.debug('Loading the word2vec model...')
    logging.debug('-' * 20)
    logging.debug('=' * 20)
    if config['word2vec_to_solve_oov']:
        self._word2vec_model = Word2Vec.load(config['word2vec_model_file_path'])
def word_2_vec():
    csv_paths = ['set1.csv', 'set2.csv', 'combined.csv']
    model = Word2Vec.load_word2vec_format(
        '/root/libanghuai/homework/GoogleNews-vectors-negative300.bin', binary=True)
    for csv_path in csv_paths:
        print("deal with %s \n" % csv_path)
        out_file_name = "word2vec_result_" + csv_path
        wordpairs = list(csv_parser(csv_path))
        wordpairs = cal_rank(wordpairs, 3)
        ans_list = []
        for wordpair in wordpairs:
            fst_word = wordpair[1]
            sec_word = wordpair[2]
            max_sim = model.similarity(fst_word, sec_word)
            wordpair.append(max_sim)
            ans_list.append(wordpair)
        ans_list = cal_rank(ans_list, 5)
        num = 0
        sum_gap = 0
        for line in ans_list:
            num += 1
            sum_gap += (line[4] - line[6]) * (line[4] - line[6])
        print(num)
        output_file(out_file_name, ans_list)
        # Spearman rank correlation: 1 - 6*sum(d^2) / (n*(n^2 - 1))
        print(1 - sum_gap * 6.0 / (num * (num * num - 1)))
def __main__():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-f', action='store', dest='filename', help='Data filename')
    parser.add_argument('-d', action='store', nargs="+", dest='dataset',
                        help='Dataset name')
    parser.add_argument('-c', action='store', nargs="+", dest='categories',
                        help='Dataset categories')
    parser.add_argument('--topn', action='store', nargs="+", dest='topn', default='0',
                        help='How many similar words to take')
    parser.add_argument('--model', action='store', nargs="+", dest='modelname',
                        help='Similarity dictionary name')
    parser.add_argument('--lda', action='store_true', dest='test_lda',
                        help='If on, test lda features')
    parser.add_argument('--sd', action='store_true', dest='test_simdict',
                        help='knn similarity')
    parser.add_argument('--w2v', action='store_true', dest='test_w2v',
                        help='If on, test w2v features')
    parser.add_argument('--w2v-topn', action='store_true', dest='test_w2v_topn',
                        help='If on, test w2v top-n features')
    parser.add_argument('--pword', action='store_true', dest='perword',
                        help='whether similar words are taken per word')
    parser.add_argument('--kt', action='store_true', dest='kt', help='kenyan twits')
    arguments = parser.parse_args()
    print(arguments)
    datasets, filenames = prep_arguments(arguments)
    topns = map(int, arguments.topn)
    perword = arguments.perword
    if arguments.modelname is not None and not arguments.test_simdict:
        w2v_model_name = arguments.modelname[0]
        print(w2v_model_name)
        w2v_model = Word2Vec.load(w2v_model_name)
        w2v_model.init_sims(replace=True)
    else:
        w2v_model = None
    for dataset, filename in zip(datasets, filenames):
        for topn in topns:
            print(dataset, filename, topn)
            test_one_file(filename, dataset, topn, perword, w2v_model, arguments)
def __init__(self, vec_file='models/GoogleNews-vectors-negative300.bin', binary=True):
    """
    :param vec_file: the file storing the vectors
    :param binary: whether the vectors are stored in binary format. Google News
        uses binary while Yelp does not
    """
    self._wordvec = Word2Vec.load_word2vec_format(FileIO.filename(vec_file),
                                                  binary=binary)
def get_model():
    '''lazy initialization for the w2v model so it works in a pool'''
    global model
    if model is None:
        print('loading the w2v model...')
        model = Word2Vec.load('w2v/lemma_stopwords')
    return model
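# A sketch of why the lazy global matters: each pool worker loads the model
# once on first use instead of having it pickled across process boundaries.
# score_pair is a hypothetical worker function.
from multiprocessing import Pool

def score_pair(pair):
    m = get_model()
    return m.similarity(pair[0], pair[1])

# with Pool(4) as p:
#     sims = p.map(score_pair, [('cat', 'dog'), ('car', 'truck')])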
def train():
    t0 = time.time()
    filename = './data/seg20180327.txt'
    if not os.path.exists(filename):
        for tsv in ['labeledTrainData.tsv', 'unlabeledTrainData.tsv', 'testData.tsv']:
            logger.info("loading %s ...." % tsv)
            load_tsv('./data/' + tsv, filename)
    sents = word2vec.Text8Corpus(filename)
    t1 = time.time()
    logger.info("loading text takes %s" % (time.time() - t0))
    model_path = './data/model.w2v'
    if not os.path.exists(model_path):
        num_features, num_workers = 300, 4
        min_word_count, context = 20, 10
        downsampling = 1e-3
        model = word2vec.Word2Vec(
            sents, workers=num_workers,
            size=num_features, min_count=min_word_count,
            window=context, sample=downsampling
        )
        model.init_sims(replace=True)
    else:
        model = Word2Vec.load(model_path)
        # model.save_word2vec_format(output_vec, binary=False)
        model.build_vocab(sents, update=True)
        model.train(sents, total_examples=model.corpus_count, epochs=model.iter)
    # the generated vocabulary is available as model.vocab
    logger.info('w2v training takes %s' % (time.time() - t1))
    model.save('./data/model.w2v')
def test_ofm_word2vec_cosine_selection(self):
    model = Word2Vec.load(self.brownFilePath)
    ofmPredictor = OFMPredictions()
    testData = self.getOFMTestData()
    pred = ofmPredictor.word2VecSimilaritySelectionCosine(testData, model)
    optionSentences = [option['sent'] for option in testData['word1']['options']]
    self.assertTrue(pred['word1']['solution'] in optionSentences)
def __filter_w2v_model(filename, words_to_remove, num_to_keep):
    """Filters the words in the Spanish model, removing all the words in the
    given list and returning the top x words

    :param filename: The name of the file to read the words in from
    :param words_to_remove: A list of all the words to get rid of
    :param num_to_keep: The number of words to keep
    """
    good_words = list()
    with open(filename, 'r') as f:
        for line in f:
            # keep the line only if it matches none of the removal words; the
            # original appended once per non-matching word, duplicating lines
            if not any(line.startswith(word) for word in words_to_remove):
                good_words.append(line)
    random.shuffle(good_words)
    kept_words = good_words[:num_to_keep]
    with open('tempmodel', 'w') as f:
        for word in kept_words:
            f.write(word)
            f.write('\n')
    return Word2Vec.load_word2vec_format('tempmodel')
def train(train_dir, test_dir=None, nn='cnn', nb_epochs=NB_EPOCHS,
          batch_size=BATCH_SIZE, persist=False, no_of_labels=NO_OF_LABELS, verbose=1):
    model = MagpieModel(
        word2vec_model=Word2Vec.load(WORD2VEC_PATH),
        scaler=load_from_disk(SCALER_PATH),
    )
    logger = CustomLogger(nn)
    model_checkpoint = ModelCheckpoint(
        os.path.join(logger.log_dir, 'keras_model'),
        save_best_only=True,
    )
    history = model.train(
        train_dir,
        get_labels(no_of_labels),
        test_dir=test_dir,
        nn_model=nn,
        callbacks=[logger, model_checkpoint],
        batch_size=batch_size,
        nb_epochs=nb_epochs,
        verbose=verbose,
    )
    finish_logging(logger, history, model.keras_model, persist=persist)
    return history, model
def load_word2vec_model(model):
    """Cache the word2vec vectors on disk in a memory-mapped format."""
    embed_data_path = "../data/embed_dat"
    embed_vocab_path = "../data/embed_vocab"
    vector_model_path = "../data/user_vector"
    if os.path.exists(embed_data_path):
        os.remove(embed_data_path)
    if os.path.exists(embed_vocab_path):
        os.remove(embed_vocab_path)
    if not os.path.exists(embed_data_path):
        print("Caching word embeddings in memmapped format...")
        wv = Word2Vec.load_word2vec_format(vector_model_path, binary=True)
        print("wv syn0norm shape : " + str(wv.syn0norm.shape))
        fp = np.memmap(embed_data_path, dtype=np.double, mode='w+',
                       shape=wv.syn0norm.shape)
        fp[:] = wv.syn0norm[:]
        with open(embed_vocab_path, "w") as f:
            for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                f.write(w + "\n")
        del fp, wv
def build_model_for_corpus(corpus):
    """Build an appropriate Keras NN model depending on the corpus."""
    if corpus == 'keywords':
        keras_model = cnn(embedding_size=100, output_length=10000)
    elif corpus == 'categories':
        keras_model = cnn(embedding_size=100, output_length=14)
    elif corpus == 'experiments':
        keras_model = cnn(embedding_size=100, output_length=500)
    else:
        raise ValueError('The corpus is not valid')

    model_path = os.path.join(DATA_DIR, corpus, 'model.pickle')
    keras_model.load_weights(model_path)

    w2v_model = Word2Vec.load(WORD2VEC_PATH)
    scaler = load_from_disk(SCALER_PATH)
    labels = get_labels(keras_model.output_shape[1])

    model = MagpieModel(
        keras_model=keras_model,
        word2vec_model=w2v_model,
        scaler=scaler,
        labels=labels,
    )
    return model
def mineAbbreviation():
    print('mining abbreviation')
    jieba.load_userdict("../../../data/jieba_userdict.txt")
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    word2vec_model = Word2Vec.load('../../../data/word2vec.model')
    word_set = getWords()
    word_syn_dict = {}
    for word in word_set:
        word_syn_dict.setdefault(word, set([word]))
        if len(word) != 2:
            continue
        try:
            for simi_word_tuple in word2vec_model.most_similar(positive=[word], topn=20):
                simi_word = simi_word_tuple[0]
                simi_value = simi_word_tuple[1]
                reverse_word = word[1] + word[0]
                if reverse_word == simi_word:
                    pass
                else:
                    if (len(set(word) & set(simi_word)) != len(word)
                            or simi_value < 0.5
                            or word in simi_word
                            or reverse_word in simi_word):
                        continue
                    word_syn_dict[word].add(simi_word)
        except KeyError:
            # word is not in the word2vec vocabulary
            pass
    outfile = open('abbreviation.txt', 'w')
    for word in word_syn_dict.keys():
        if len(word_syn_dict[word]) >= 2:
            outfile.write(word + '@' + ','.join(word_syn_dict[word]) + '\r\n')
# Description: train a Word2Vec model on the AG News corpus, then continue
#              training it on the titles
# Author: orange
# Date: 2021/7/10
# -------------------------------------------------------------------------------
import csv
import tools
import numpy as np
from gensim.models import Word2Vec

agnews_label = []
agnews_title = []
agnews_text = []
agnews_train = csv.reader(open("./dataset/train.csv", "r"))
for line in agnews_train:
    agnews_label.append(np.float32(line[0]))
    agnews_title.append(tools.text_clear(line[1]))
    agnews_text.append(tools.text_clear(line[2]))

print("Starting model training")
model = Word2Vec(agnews_text, size=64, min_count=0, window=5, iter=128)
model_name = "corpusWord2Vec.bin"
model.save(model_name)

model = Word2Vec.load('./corpusWord2Vec.bin')
model.train(agnews_title, epochs=model.epochs, total_examples=model.corpus_count)
train_file = 'data/train_{}.tsv'.format(type)
test_file = 'data/test_{}.tsv'.format(type)
files = [train_file, test_file]

sentences = []
for file in files:
    corpus = open(file, 'r').readlines()[1:]
    for line in corpus:
        # sentences should be a list of lists of tokens
        sentences.append(line.split('\t')[2].split())

# Train Model
model = Word2Vec(sentences, size=20, window=5, sg=0, hs=1, negative=5, workers=4)
model.save('data/word2vec_20_{}.bin'.format(type))
model.wv.save_word2vec_format('data/word2vec_20_{}_vec.bin'.format(type), binary=True)

# Load Model
# model = Word2Vec.load('data/word2vec_20_char_num.bin')
# word_vectors = KeyedVectors.load_word2vec_format('data/word2vec_20_char_num_vec.bin', binary=True)

# Test
# results = model.most_similar([u'good'], topn=10)
# for result in results:
#     print result
from __future__ import print_function
import logging
import sys
import multiprocessing

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print(
            "Please provide two arguments, first one is path to the revised corpus, "
            "second one is path to the output file for model."
        )
        print("Example command: python3 word2vec.py wiki.tr.txt trmodel")
        sys.exit()
    inputFile = sys.argv[1]
    outputFile = sys.argv[2]

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    model = Word2Vec(LineSentence(inputFile), size=400, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    model.wv.save_word2vec_format(outputFile, binary=True)
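# A quick sanity check of the saved vectors, assuming the script above was run
# as 'python3 word2vec.py wiki.tr.txt trmodel'; 'ankara' is a hypothetical
# query word from the Turkish corpus:
from gensim.models import KeyedVectors
wv = KeyedVectors.load_word2vec_format('trmodel', binary=True)
print(wv.most_similar('ankara', topn=5))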
    if temp:
        training_word_data.append(temp)
        # print(training_word_data)
    else:
        break

##### morphological analysis ######

##### create Word2Vec model ######
# model = Word2Vec.load(model_path + '.model')
# create the model
model = Word2Vec(training_data_path, size=300, window=3, min_count=1, workers=1)
model.build_vocab(training_word_data, update=True)
model.train(training_word_data, epochs=model.epochs, total_examples=model.corpus_count)
model.save(model_path + '.model')
##### create Word2Vec model ######

###### create pickle data ########
f = open(raw_data_path, 'rt', encoding='utf-8')
raw_data = f.readlines()
f.close()
from gensim.models import Word2Vec, Phrases

# update the encoding system (Python 2)
reload(sys)
sys.setdefaultencoding('utf-8')

client = MongoClient("localhost", 27017)
db = client["undocs"]

for year in range(1995, 2017):
    search_json = {'year': str(year)}
    docs = db.docs.find(search_json)
    count = docs.count()
    print(count)
    i = 0
    all_sentences = []
    for doc in docs:
        i += 1
        print(i)
        sentences = doc[u'sentences_keep_swords']
        all_sentences += sentences
    model = Word2Vec(all_sentences, size=100, window=10, min_count=1, workers=16)
    model.save("model/phrases/phrases_year_%s_1203.w2v" % year)
    print("Year %s completed!" % year)

print("completed!")
def print_list(a):
    for i, s in enumerate(a):
        if i != 0:
            print('+', end=' ')
        print(s, end=' ')


if __name__ == '__main__':
    corpora_path = './sogou_corpora'
    corpora_model_path = './sogou_corpora/corpora_segment'
    model_name = './sogou_corpora/200806.model'
    if not os.path.exists(model_name):
        sentences = LoadCorpora(corpora_model_path)
        t_start = time()
        # 200-dimensional vectors; discard words occurring fewer than 5 times
        model = Word2Vec(sentences, size=200, min_count=5, workers=8)
        model.save(model_name)
        print('OK:', time() - t_start)

    model = Word2Vec.load(model_name)
    print('model.wv.vocab = ', type(model.wv.vocab), len(model.wv.vocab))
    for i, word in enumerate(model.wv.vocab):
        print(word, end=' ')
        if i % 50 == 49:
            print()
    print()
    intrested_words = ('中国', '手机', '学习', '人民', '名义')
    print('Feature vectors:')
    for word in intrested_words:
        print(word, len(model[word]), model[word])
def load_from(self, file):
    self.model = Word2Vec.load(file)
    self.model_initialized = True
from gensim.models import Word2Vec

# test model
print('loading model...')
model = Word2Vec.load("assets/gay_seattle.w2v")
print("seattle", model.wv.most_similar('seattle', topn=50))
print(model.wv.distances('seattle', ('news', 'june', 'times', 'march')))
# seattle [('news', 0.9989323616027832), ('june', 0.998815655708313), ('times', 0.9987982511520386), ('march', 0.9987823963165283), ('apr', 0.9987049102783203), ('july', 0.9985809326171875), ('nov', 0.9984444379806519), ...

# print("model details: ", model)
# print('similar words to seattle:')
# print("capitol", model.wv.most_similar('capitol'))
# print("gay", model.wv.most_similar('gay', topn=50))
# print(model.wv.most_similar('lesbian'))
# print(model.wv.most_similar('considered'))
# print(model.wv.most_similar('number'))
# print("=================")
# print(model.wv.distances('seattle', ('gay', 'renton', 'lesbian', 'rain')))
# print(model.wv.distance('seattle', 'civil'))
# print(model.wv.distance('seattle', 'lesbian'))
# print(model.wv.rank('seattle', 'gay'))
# print(model.wv.rank('seattle', 'lesbian'))
# print(model.wv.distances('seattle'))
import numpy as np
import scipy.signal
import sklearn
import dill
import pickle
import csv
import sys
import copy
from gensim.models import Word2Vec

model = Word2Vec.load('w2vmodel')


def clip(v):
    x = v[:10]
    # integer division so the pad widths are ints (plain / yields a float on Python 3)
    if len(x) % 2 == 0:
        b = a = (10 - len(x)) // 2
    else:
        b = (10 - len(x)) // 2
        a = b + 1
    return np.lib.pad(np.array(x), ((b, a), (0, 0)), 'constant')


def save_model(brnn):
    with open('cnn_model_%s.pkl' % TYPE, 'wb') as f:
        dill.dump(brnn, f)


def load_model():
import os
import time
import gc
import random
import pickle
import json

import numpy as np  # used by np.load below; missing from the original imports
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors, word2vec, Word2Vec

wv_from_bin = pickle.load(open("GloVe_50.pkl", 'rb'))  # GloVe
wv_from_scratch = Word2Vec.load('word2vec.model')  # word2vec from scratch
wordVectors = np.load("/home/wzh/wzh/glove/wordVectors.npy")  # word2vec delta training
tokens = json.load(open("/home/wzh/wzh/glove/tokens.json"))  # word2vec delta training
cate2id = json.load(open("label.json", "rb"))

NUM_LABLES = len(cate2id)
NUM_MODELS = 1
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
MAX_LEN = 40
BATCH_SIZE = 32
EPOCH = 20
MODE = "delta_word2vec"  # GLOVE, word2vec, delta_word2vec
DATASET = "embed_eval_data_sample_general.csv"
import logging  # used below but missing from the original imports
import os.path
import sys
import multiprocessing

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    if len(sys.argv) != 4:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp, outp2 = sys.argv[1:4]
    model = Word2Vec(LineSentence(inp), size=300, window=5, iter=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    model.save(outp)
    model.wv.save_word2vec_format(outp2, binary=False)
    predicted = model.predict(data)[0]
    predictedY = predicted.argmax(axis=-1)
    return predictedY


if __name__ == "__main__":
    filterSizes = [3, 4, 5]
    numOfFilters = 100  # tested with 10, 20
    dropout = 0.5
    batchSize = 1000
    epochs = 20
    sequenceLength = 20  # Twitter max length is 140 chars
    embeddingDim = 50
    numOfLabels = 5
    drop = 0.5

    wvModel = Word2Vec.load('vectors.bin')
    # sentencesTrain, emojisTrain = obtainData()
    # dataTrain, labelsTrain, wordIdTrain = obtainData()
    # dataTest, labelsTest, wordIdTest = obtainData("test")
    dataTrain, dataTest, labelsTrain, labelsTest, wordIdMap, maxLength, idEmojiMap = buildDataFull()
    packedData = {"len": maxLength, "dic": wordIdMap, "emo": idEmojiMap}
    js = json.dumps(packedData)
    fp = open("datacnn.json", "w")
    fp.write(js)
    fp.close()

    embeddingMatrix = np.zeros((len(wordIdMap) + 1, embeddingDim))
    for word, i in wordIdMap.items():
        try:
            vector = wvModel.wv[word]
            embeddingMatrix[i] = vector
print('Total training sentences: %d' % len(sentences))

'''
The word2vec algorithm processes documents sentence by sentence. This means we
will preserve the sentence-based structure during cleaning.

The model is fit when constructing the class. We pass in the list of clean
sentences from the training data, then specify the size of the embedding vector
space (we use 100 again), the number of neighboring words to look at when
learning how to embed each word in the training sentences (we use 5 neighbors),
the number of threads to use when fitting the model (we use 8, but change this
if you have more or fewer CPU cores), and the minimum occurrence count for
words to consider in the vocabulary (we set this to 1 as we have already
prepared the vocabulary).
'''
# train word2vec model
model = Word2Vec(sentences, size=100, window=5, workers=8, min_count=1)
# summarize vocabulary size in model
words = list(model.wv.vocab)
'''
Vocabulary size should be the same as that of the vocab created in Vectorizer.
'''
print('Vocabulary size: %d' % len(words))
'''
Finally, we save the learned embedding vectors to file using
save_word2vec_format() on the model's 'wv' (word vector) attribute. The
embedding is saved in ASCII format with one word and vector per line.
'''
# save model in ASCII (word2vec) format
filename = 'embedding_word2vec.txt'
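# Completing the save described in the commentary above (a sketch; the original
# snippet ends just before the call):
model.wv.save_word2vec_format(filename, binary=False)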
def word2vector(self, embedding_size):
    model = Word2Vec(self.vec_sentence, size=embedding_size, min_count=6)
    self.vec_model = model
def load_word2vec(self):
    fname = 'word2vec.' + str(self.w2v_dim) + '_' + str(self.w2v_window) + '.model'
    path = os.path.join(DATA_PATH, fname)
    self.word2vec = Word2Vec.load(path)
DL = StarCraftDataLoader(replay_path, ignore_null=True, with_print=True)
corpus = DL.actions
path_w2vec = get_tmpfile("act2vec.model")

for window in windows:
    for embedding_dim in embedding_dims:
        print('Building Action2Vec model with window size {} and embedding dim {}'
              .format(window, embedding_dim))
        model = Word2Vec(corpus, size=embedding_dim, window=window,
                         min_count=1, workers=4)
        vocab = list(model.wv.vocab)
        print('Training model...')
        model.train(corpus, total_examples=len(corpus), epochs=50)
        desc = "test_act2vec_dataset:starcraft" + \
               "_dim:" + str(embedding_dim) + \
               "_win:" + str(window)
        if ignore_null:
            desc += '_ignorenull'
        self.stop_words = stop_words

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        preprocessed_sentences = []
        for doc in nlp.pipe(X, n_threads=8):
            if self.stop_words is not None:
                preprocessed_sentences.append([
                    t.lower_ for t in doc
                    if not t.is_punct and t.lemma_ not in self.stop_words
                ])
            else:
                preprocessed_sentences.append(
                    [t.lower_ for t in doc if not t.is_punct])
        return preprocessed_sentences


gsl = getsingleword(stop_words=common_terms)
reviews_token = gsl.fit_transform(cleaned_tweets.text)
model = Word2Vec(reviews_token, seed=1, min_count=5, size=100, workers=8, window=3)
model.save("singleword2v.w2v")
pd.options.mode.chained_assignment = None
splitted_movies = rating_splitter(df_ratings_train)

# train the item2vec model with gensim's word2vec
for movie_list in splitted_movies:
    random.shuffle(movie_list)
print("splitted_movies:")
print(splitted_movies[0])

model = Word2Vec(sentences=splitted_movies,  # the iterable of "sentences"
                 iter=5,        # number of iterations
                 min_count=4,   # ignore items that occur fewer than 4 times
                 size=32,       # dimensionality of the trained vectors
                 workers=2,     # number of worker threads
                 sg=1,          # model choice: 1 = skip-gram, 0 = CBOW
                 hs=0,          # cost function choice
                 negative=5,    # negative sampling
                 window=5)      # max distance between current and predicted item

# save in word2vec text/binary format (all model-related information: hidden
# weights, vocabulary frequencies and the model's binary tree); this format
# cannot be trained further
model.wv.save_word2vec_format(path + "model/item2vec_model_0315.bin", binary=True)
model.wv.save_word2vec_format(path + "model/item2vec_model_0315.txt", binary=False)
# save the full model, which can be trained further
model.save(path + "model/item2vec_model_0315.model")

# loading the saved vectors
# gensim.models.KeyedVectors.load_word2vec_format('XX.txt', binary=False)
# gensim.models.KeyedVectors.load_word2vec_format('XX.bin', binary=True)
def read_w2v_model(model_dir, persist=True):
    if persist:
        w2v_model = Word2Vec.load(model_dir).wv
    else:
        w2v_model = KeyedVectors.load_word2vec_format(model_dir)
    return w2v_model
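# A usage sketch for read_w2v_model; both paths and the query word are
# hypothetical. persist=True expects a full gensim model on disk, persist=False
# a plain word2vec text file:
wv = read_w2v_model('models/my_corpus.model', persist=True)
# wv = read_w2v_model('models/vectors.txt', persist=False)
print(wv.most_similar('example', topn=3))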
n_classes = len(classes)
doc_word = []
for i in range(1, df['text'].count()):
    doc_word.append(
        str(df['text'][i]).translate(None, string.punctuation).split())

num_features = 100    # Word vector dimensionality
min_word_count = 50   # Minimum word count
num_workers = 20      # Number of threads to run in parallel
context = 300         # Context window size
downsampling = 1e-2   # Downsample setting for frequent words

word2vec_model = Word2Vec(doc_word, workers=num_workers, size=num_features,
                          min_count=min_word_count, window=context)
# If you don't plan to train the model any further, calling init_sims
# will make the model much more memory-efficient.
# (The original called it twice; once is enough.)
word2vec_model.init_sims(replace=True)
index2word_set = word2vec_model.index2word

X_w2v = np.zeros(shape=(len(y), num_features))
i = 0
for sen in doc_word:
    X_w2v[i] = avg_feature_vector(sen, model=word2vec_model,
                                  num_features=num_features,
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot

# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
             ['this', 'is', 'the', 'second', 'sentence'],
             ['yet', 'another', 'sentence'],
             ['one', 'more', 'sentence'],
             ['and', 'the', 'final', 'sentence']]

# train model
# sg=0: CBOW, sg=1: skip-gram
embedding = Word2Vec(sentences, min_count=1, window=5, size=32, sg=1)
embedding['more']
# embedding.most_similar('the', topn=1)
embedding.most_similar('sentence', topn=5)
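# The PCA/pyplot imports above suggest a 2-D visualization of the embeddings
# was intended; a minimal sketch of that step:
X = embedding[embedding.wv.vocab]  # one row per vocabulary word
coords = PCA(n_components=2).fit_transform(X)
pyplot.scatter(coords[:, 0], coords[:, 1])
for i, word in enumerate(embedding.wv.vocab):
    pyplot.annotate(word, xy=(coords[i, 0], coords[i, 1]))
pyplot.show()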
from gensim.models import Word2Vec
from nltk.corpus import movie_reviews

# maximum allowed length of a sequence
max_length = 100
# the size of the embeddings
embedding_size = 100
# the word2vec object that stores the vocabulary and embeddings
w2v = Word2Vec(movie_reviews.sents())
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 10 16:09:44 2017

@author: 颜
"""
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing

# note: LineSentence's limit parameter expects an int (max number of lines to
# read); the original passed limit=True, which reads only a single line
model = Word2Vec(LineSentence("wiki_corpus.txt"), size=300,
                 workers=multiprocessing.cpu_count())
model.save("wordvector")
word_vec = model.wv

# from gensim.models import Word2Vec
# import wordtovector as w
'''
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))
wiki = WikiCorpus("enwiki-latest-pages-articles.xml.bz2", lemmatize=False)
print("inited!!!")
'''
from gensim.corpora import WikiCorpus
import sys
import logging
logger.info("WD is set to " + filePath) dataPath = filePath + "/02_Data/" logger.info("Writing to " + dataPath) # %% Import Data logger.info("Reading Parquet File") data = pq.read_table(dataPath + "/data.clean.parquet").to_pandas() # %% load w2v model logger.info("Loading Model") from gensim.models import Word2Vec model = Word2Vec.load(dataPath + "word2vec.model") # %% Conversion Function def workDataFrame(currentData): # create empty data frame for working converted = pd.DataFrame(columns=range(0, len(currentData.columns) + 300)) df = pd.DataFrame(columns=range(0, 300)) # vectors df = df.add_prefix("DESC_") # prefix for vectors columns = currentData.columns # naming columns = columns.append(df.columns) # combine converted.columns = columns # set names for index, firm in currentData.iterrows(): # for every single company
'''
add track uri
'''
start_time = time.time()
with open('../data/song2artist.pkl', 'rb') as f:
    song2artist = pickle.load(f)
artist_uri_test = [song2artist[track] for track in tracks_test]
result_test['artist_uri'] = artist_uri_test
print("add artist uri --- %s seconds ---" % (time.time() - start_time))
del artist_uri_test, song2artist
gc.collect()
pids_test = result_test['pid']

'''
add similarity
'''
from gensim.models import Word2Vec
model1 = Word2Vec.load('../data/w2v_model1.bin')
model2 = Word2Vec.load('../data/w2v_model2.bin')
model3 = Word2Vec.load('../data/w2v_model3.bin')

with open('../data/song2album.pkl', 'rb') as f:
    song2album = pickle.load(f)
with open('../data/song2artist.pkl', 'rb') as f:
    song2artist = pickle.load(f)

def remove_iteral(sentence):
    return ast.literal_eval(sentence)

df = pd.read_csv(readfile, usecols=['pid', 'pos_songs'], nrows=None)
def get_similarnode(model_path, node_name):
    # fetch nodes similar to node_name from a trained node2vec model
    node2vec_model = Word2Vec.load(model_path)
    similar_node = node2vec_model.most_similar(node_name)  # look up similar nodes
    return similar_node
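# A usage sketch, assuming a node2vec model saved at './node2vec.model' and a
# node id '1024' present in its vocabulary (both hypothetical):
for node, score in get_similarnode('./node2vec.model', '1024'):
    print(node, score)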
word_dict = {'<pad>': 0}
for word in words:
    word_dict[word] = len(word_dict)

# %% Train w2v
TRAIN_corpus = DATA['TOKEN'].values

# settings
vector_dim = 64
window_size = 5
min_count = 1
training_iter = 20

# model
word2vec_model = Word2Vec(sentences=TRAIN_corpus, size=vector_dim,
                          window=window_size, min_count=min_count,
                          iter=training_iter)

# %% Embedding (gensim)
model_path = "./GoogleNews-vectors-negative300.bin.gz"
w2v_google_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

F = open('gensim_w2v.pkl', 'ab')
for row in DATA.loc[0:2, 'TOKEN']:
    # numpy array to store the result
    result = np.zeros((len(row), 300))
    for i, token in enumerate(row):
        # use try in case we encounter an unseen word
        try:
            result[i] = w2v_google_model[token]
context = 4          # Context window size
downsampling = 1e-3  # Downsample setting for frequent words

# list of unique days
days = df_en.Day.unique().tolist()
# dictionary with day as key and a list of words as values
daily_similar_words = dict.fromkeys(days)

# loop through all unique days
for i in days:
    # subset by day (the original hardcoded "30" here, ignoring the loop variable)
    tmpdf = df_en[df_en['Day'] == i]
    # create a new vector of just the text
    sentences = tmpdf['cleaned_text']
    # this is where the magic happens----word2vec model
    model = Word2Vec(sentences, workers=num_workers, size=num_features,
                     min_count=min_word_count, window=context,
                     sample=downsampling)
    # makes the model more memory efficient
    model.init_sims(replace=True)
    # save the resulting words to the dictionary with the key being the day
    # (the original overwrote the whole dict instead of the day's entry)
    daily_similar_words[i] = model.most_similar_cosmul("trump", topn=2000)
session = driver.session()
res = session.run("match (a:Page) return a.title")
arr = []
for x in res:
    arr.append(x['a.title'])
session.close()
nodes = arr

# write it to the pickle file
with open('all_nodes.pickle', 'wb') as f:
    pickle.dump(nodes, f)

walks_file_path = sys.argv[1]
with open(walks_file_path, "r", encoding="UTF-8") as walks_file:
    workers = multiprocessing.cpu_count()
    d = 256
    window = 80
    # model = makeNodeModel(0.5, 100000, 80, 1, 256, 80, True, workers, nodes, log_file)
    model = Word2Vec(size=d, window=window, min_count=0, sg=1, workers=workers,
                     iter=1, sample=0.0)
    model.build_vocab([nodes])
    # gensim >= 1.0 requires total_examples and epochs for an explicit train()
    model.train(LineSentence(walks_file), total_examples=model.corpus_count,
                epochs=model.iter)
    # makeNodeModel(0.5, 100000, 80, 1, 256, 80, True, workers, nodes, log_file)
    # the word2vec-format saver lives on model.wv in current gensim
    model.wv.save_word2vec_format("training_model.bin", binary=True)
def _embedding_training(args, G_=None):
    seed = args.seed
    if args.method == 'struc2vec':
        logging.basicConfig(filename='./src/bionev/struc2vec/struc2vec.log',
                            filemode='w', level=logging.DEBUG,
                            format='%(asctime)s %(message)s')
        if args.OPT3:
            until_layer = args.until_layer
        else:
            until_layer = None
        G = struc2vec.Graph(G_, args.workers, untilLayer=until_layer)
        if args.OPT1:
            G.preprocess_neighbors_with_bfs_compact()
        else:
            G.preprocess_neighbors_with_bfs()
        if args.OPT2:
            G.create_vectors()
            G.calc_distances(compactDegree=args.OPT1)
        else:
            G.calc_distances_all_vertices(compactDegree=args.OPT1)
        print('create distances network..')
        G.create_distances_network()
        print('begin random walk...')
        G.preprocess_parameters_random_walk()
        G.simulate_walks(args.number_walks, args.walk_length)
        print('walk finished..\nLearning embeddings...')
        walks = LineSentence('random_walks.txt')
        model = Word2Vec(walks, size=args.dimensions, window=args.window_size,
                         min_count=0, hs=1, sg=1, workers=args.workers, seed=seed)
        os.remove("random_walks.txt")
        model.wv.save_word2vec_format(args.output)
    elif args.method == 'GAE':
        model = gae_model(args)
        G = G_[0]
        node_list = G_[1]
        model.train(G)
        # save embeddings
        model.save_embeddings(args.output, node_list)
    elif args.method == 'SVD':
        SVD_embedding(G_, args.output, size=args.dimensions)
    else:
        if args.method == 'Laplacian':
            model = lap.LaplacianEigenmaps(G_, rep_size=args.dimensions)
        elif args.method == 'GF':
            model = gf.GraphFactorization(G_, rep_size=args.dimensions,
                                          epoch=args.epochs, learning_rate=args.lr,
                                          weight_decay=args.weight_decay)
        elif args.method == 'HOPE':
            model = hope.HOPE(graph=G_, d=args.dimensions)
        elif args.method == 'GraRep':
            model = grarep.GraRep(graph=G_, Kstep=args.kstep, dim=args.dimensions)
        elif args.method == 'DeepWalk':
            model = node2vec.Node2vec(graph=G_, path_length=args.walk_length,
                                      num_paths=args.number_walks,
                                      dim=args.dimensions, workers=args.workers,
                                      window=args.window_size, dw=True)
        elif args.method == 'node2vec':
            model = node2vec.Node2vec(graph=G_, path_length=args.walk_length,
                                      num_paths=args.number_walks,
                                      dim=args.dimensions, workers=args.workers,
                                      p=args.p, q=args.q, window=args.window_size)
        elif args.method == 'LINE':
            model = line.LINE(G_, epoch=args.epochs, rep_size=args.dimensions,
                              order=args.order)
        elif args.method == 'SDNE':
            encoder_layer_list = ast.literal_eval(args.encoder_list)
            model = sdne.SDNE(G_, encoder_layer_list=encoder_layer_list,
                              alpha=args.alpha, beta=args.beta, nu1=args.nu1,
                              nu2=args.nu2, batch_size=args.bs, epoch=args.epochs,
                              learning_rate=args.lr)
        else:
            raise ValueError(f'Invalid method: {args.method}')
        print("Saving embeddings...")
        model.save_embeddings(args.output)
    return