def build_doc2vec_model(dataset, vec_lenght, save_folder, name="Movie"):
    """ build doc2vec model """
    # use the helper function make_doc2vec_inputfile, which creates a txt file
    # in the format required by the doc2vec model
    if name == "Movie":
        txt_file = make_doc2vec_inputfile(
            dataset=dataset, save_file="data/doc_2_vec/movie_d2v_input.txt")
    elif name == "Financial":
        txt_file = make_doc2vec_inputfile(
            dataset=dataset, save_file="data/doc_2_vec/financial_d2v_input.txt")
    doc = open(txt_file, "r", encoding="utf-8")
    documents = TaggedLineDocument(doc)
    model = gensim.models.Doc2Vec(documents, dm=0, dbow_words=0, size=vec_lenght,
                                  window=10, hs=0, negative=5, sample=1e-4,
                                  iter=20, min_count=10, workers=4, alpha=0.1)
    doc.close()
    model.save(fname_or_handle=save_folder)
    return model
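# Illustrative sketch (not part of the original example): loading the model saved by
# build_doc2vec_model above and inferring a vector for an unseen document. The path
# "data/doc_2_vec/movie_d2v_model" is a hypothetical value for save_folder; the same
# legacy gensim API as the snippet above is assumed.
from gensim.models import Doc2Vec

loaded_model = Doc2Vec.load("data/doc_2_vec/movie_d2v_model")
# infer_vector expects a list of tokens, not a raw string
new_vector = loaded_model.infer_vector("a quiet film about memory and loss".split())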
def stem(CONFIGURATION, sents):
    with open(CONFIGURATION.rundir + "w2v_training_material.csv", mode="w+",
              encoding="UTF-8") as f:
        for sent in sents:
            tmp = list()
            for expression in sent:
                if 'http://' not in expression:
                    expression_new = list()
                    for word in expression.split(' '):
                        expression_new.append(
                            ps.stem(re.sub('[^A-z0-9<>]', '', word.lower())))
                    expression = expression_new
                else:
                    expression = [re.sub('[\r\n]', '', expression)]
                if not " ".join(expression) == '' and len(" ".join(expression)) > 1:
                    tmp = tmp + expression
            if len(tmp) > 1:
                for x in tmp:
                    f.write(str(x) + " ")
                f.write("\n")
            #f.write("<> <>\n")
    from gensim.test.utils import datapath
    from gensim.models.doc2vec import TaggedLineDocument
    for document in TaggedLineDocument(
            datapath(CONFIGURATION.rundir + "w2v_training_material.csv")):
        yield document
def crawl_page(thread_name, page_url):
    if page_url not in Spider.crawled:
        print(thread_name + ' now crawling ' + page_url)
        print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
        Spider.add_links_to_queue(Spider.gather_links(page_url))
        Spider.queue.remove(page_url)

        # Building the Doc2vec model
        f = urlopen(page_url)
        html = f.read()
        open('temp.txt', 'w').close()
        f = open('temp.txt', 'w')
        f.write(html)
        f.close()
        html = TaggedLineDocument('temp.txt')
        model = Doc2Vec(html, size=100, window=8, min_count=5, workers=4)
        model.train(html, total_examples=100, epochs=5)
        #print model.docvecs[0]

        # saving data for building and testing the svm
        #if len(Spider.data_train) < 50:
        #    Spider.data_train.add(model.docvecs[0])
        #else:
        #    Spider.data_test.add(model.docvecs[0])
        #set_to_file(Spider.data_train, 'data_train.txt')
        #set_to_file(Spider.data_test, 'data_test.txt')

        Spider.crawled.add(page_url)
        Spider.update_files()
def train(args):
    documents = TaggedLineDocument(LYRIC)
    return Doc2Vec(documents,
                   size=args.size,
                   window=args.window,
                   min_count=args.min_count,
                   workers=args.workers,
                   dm=args.dm)
def create_doc2vec_model_v2(train_d2v_data_file_path, vector_size, lexicon,
                            files_prefix, model_path):
    print('Training d2v model')
    docs = TaggedLineDocument(train_d2v_data_file_path)
    model = Doc2Vec(docs, size=vector_size, window=10, min_count=1, workers=8)
    model.save(model_path)
    return model
def pre_train():
    if os.path.exists('data/dataset/data.txt'):
        # documents = TaggedLineDocument('data/dataset/data.txt')
        print("data.txt already exists")
        pass
    else:
        data_txt()
    documents = TaggedLineDocument('data/dataset/data.txt')
    return documents
def check_for_doc2vecmodel(doc2vec_fname, docs_fname, corpus, dictionary):
    try:
        doc2vec = models.Doc2Vec.load(doc2vec_fname)
    except IOError:
        print('Training Doc2Vec model, this may take a long time')
        documents = TaggedLineDocument(docs_fname)
        doc2vec = models.doc2vec.Doc2Vec(documents=documents, workers=4)
        doc2vec.save(doc2vec_fname)
    return doc2vec
def dataset():
    df = open('data/blog_notopic.txt', 'r', encoding='utf-8')
    blogs = TaggedLineDocument(df)
    # count the number of x_train samples
    '''count = -1
    for count, line in enumerate(open('data/blog_notopic.txt', 'r', encoding='utf-8')):
        pass
    count += 1'''
    return blogs
def train(self, source_corpus_path, update=False):
    """
    Train an uninitialized model using a corpus. Each line in the corpus should be
    the words of a sentence separated by spaces.
    :param source_corpus_path: Path to corpus.
    :param update: Update vocab.
    :return: Nothing.
    """
    documents = TaggedLineDocument(source_corpus_path)
    self.model.build_vocab(documents, update=update)
    self.model.train(documents,
                     total_examples=self.model.corpus_count,
                     epochs=self.model.iter)
def prepare_training_data(sentences, CONFIGURATION):
    #sentences = stem(CONFIGURATION, sentences)
    ctr = 0
    with open(CONFIGURATION.rundir + "w2v_training_material.csv", mode="w+",
              encoding="UTF-8") as f:
        for sent in sentences:
            tmp = list()
            for expression in sent:
                if 'http://' not in expression:
                    expression_new = list()
                    for word in expression.split(' '):
                        word = ps.stem(re.sub('[^A-z0-9<>]', '', word.lower()))
                        #if len(word) > 2:
                        #    words = [word[i:i+3] for i in range(len(word)-3+1)]
                        #else:
                        words = [word]
                        expression_new = expression_new + words
                    expression = expression_new
                else:
                    expression = [re.sub('[\r\n]', '', expression)]
                #if not " ".join(expression) == '' and len(" ".join(expression)) > 1:
                tmp = tmp + expression
            if len(tmp) > 0:
                for x in tmp:
                    f.write(str(x) + " ")
                f.write("\n")
                ctr += 1
            #f.write("<> <>\n")
    from gensim.test.utils import datapath
    from gensim.models.doc2vec import TaggedLineDocument
    sentences = [
        document for document in TaggedLineDocument(
            datapath(CONFIGURATION.rundir + "w2v_training_material.csv"))
    ]
    #x = tuplize(sentences, CONFIGURATION)
    #x = eliminate_rare_and_frequent_terms(x)
    #documents = list()
    #for index, row in x.iterrows():
    #    documents.append([str(row[0])] + [str(row[1])])
    #documents = literalize(documents)
    documents = sentences
    return documents
def trainDoc2Vector(sentence_count, vector_dimension):
    # train and save the model
    sentences = TaggedLineDocument('sources/splited_words.txt')
    model = Doc2Vec(sentences, size=vector_dimension, window=8, min_count=2,
                    workers=multiprocessing.cpu_count())
    model.train(sentences, total_examples=sentence_count, epochs=model.iter)
    model.save('result/doc2vec.model')

    # save vectors
    out = open('result/doc2vec.vector', mode='w+', encoding='utf-8')
    for index in range(0, sentence_count, 1):
        docvec = model.docvecs[index]
        out.write(' '.join(str(f) for f in docvec) + "\n")
    out.close()
def build_doc2vec_model():
    # Creating labeled sentences from training data
    sentences = TaggedLineDocument('bulk-total.txt')
    model = Doc2Vec(alpha=0.1, size=30, window=10, min_count=5, dm=0,
                    dbow_words=1, iter=10)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=81863, epochs=10)
    model.save('../models/clpsych-30dim-large.d2v')
def process(self):
    log.info("Commencing execution")

    tagged_docs = TaggedLineDocument(self.labeled_articles_file_path)
    log.info("Training Doc2Vec model")
    doc2vec_model = doc2vec_helper.init_model(tagged_docs)
    doc2vec_model.save(self.doc2vec_model_file_path)
    log.info("Learnt vocab from training set and saved doc2vec model")

    x_train = list()
    with open(self.labeled_articles_file_path) as training_set:
        for line in training_set:
            # infer_vector expects a list of tokens, not a raw string
            x_train.append(doc2vec_model.infer_vector(line.split()))
    y_train = [0] * self.samples_per_class_train
    y_train.extend([1] * self.samples_per_class_train)

    x_test = list()
    with open(self.articles_source_file_path) as test_set:
        for line in test_set:
            x_test.append(doc2vec_model.infer_vector(line.split()))
    y_true = [1] * self.samples_per_class_test
    y_true.extend([0] * self.samples_per_class_test)

    ml_model_logreg = scikit_ml_helper.train_logistic_reg_classifier(x_train, y_train)
    scikit_ml_helper.persist_model_to_disk(ml_model_logreg, self.ml_model_file_path)
    y_pred = ml_model_logreg.predict(x_test)
    log.info("Logistic Regression")
    log.info("Precision: " + str(metrics.precision_score(y_pred=y_pred, y_true=y_true)))
    log.info("Recall: " + str(metrics.recall_score(y_pred=y_pred, y_true=y_true)))
    log.info("Accuracy: " + str(metrics.accuracy_score(y_pred=y_pred, y_true=y_true)))

    ml_model_svm = scikit_ml_helper.train_svm_classifier(x_train, y_train)
    y_pred = ml_model_svm.predict(x_test)
    log.info("SVM")
    log.info("Precision: " + str(metrics.precision_score(y_pred=y_pred, y_true=y_true)))
    log.info("Recall: " + str(metrics.recall_score(y_pred=y_pred, y_true=y_true)))
    log.info("Accuracy: " + str(metrics.accuracy_score(y_pred=y_pred, y_true=y_true)))

    ml_model_nb = scikit_ml_helper.train_gnb_classifier(x_train, y_train)
    y_pred = ml_model_nb.predict(x_test)
    log.info("Naive Bayes")
    log.info("Precision: " + str(metrics.precision_score(y_pred=y_pred, y_true=y_true)))
    log.info("Recall: " + str(metrics.recall_score(y_pred=y_pred, y_true=y_true)))
    log.info("Accuracy: " + str(metrics.accuracy_score(y_pred=y_pred, y_true=y_true)))

    log.info("Completed execution")
def calculate_and_save_word2vec_dict(words_list, files):
    dataset_file = os.path.join(word2vec_taget_dir, 'dataset.txt')
    FileProcessor(dataset_file).file_write(
        'utf8', u''.join([words + u'\n' for words in words_list][:-1]))
    doces = TaggedLineDocument(dataset_file)
    doc2Vec_model = doc2vec.Doc2Vec(doces, size=200, window=10, workers=4)
    doc2Vec_model.train(doces, total_examples=doc2Vec_model.corpus_count, epochs=200)
    doc2Vec_model.save(os.path.join(word2vec_taget_dir, 'doc2vec_model.txt'))
    FileProcessor(os.path.join(word2vec_taget_dir, 'tagged_map.txt'))\
        .file_write('utf8', u''.join([u'{0} {1} \n'.format(index, value.decode('utf8'))
                                      for index, value in enumerate(files)]))
    return doc2Vec_model
def train():
    tagged = TaggedLineDocument(filetgge)
    model = Word2Vec(alpha=0.025, min_alpha=0.025, size=50, window=5,
                     min_count=5, workers=8)
    model.build_vocab(tagged)
    for i in range(10):
        model.train(tagged)
        model.alpha -= 0.0002  # decrease the learning rate
        model.min_alpha = model.alpha  # fix the learning rate, no decay
    # model.save_word2vec_format(filetgge + '.model')
    model.save(filetgge + '.model')
def main():
    model_file = datahub.get_full_path("doc2vec.model")
    if not Path(model_file).exists():
        tokenized_file = datahub.get_full_path("articles1_token.txt")
        if not Path(tokenized_file).exists():
            print("data is not yet tokenized and saved. Doing now.")
            print("loading data")
            data_list = datahub.load_data(datahub.get_full_path("articles1.csv"))
            print("tokenizing data")
            tokenized_content_list = datahub.tokenize_content(data_list)
            print("saving tokenized data in txt file")
            datahub.save_tagged_data(tokenized_content_list, tokenized_file)

        vector_size = 50
        epochs = 100
        print("training model")
        trained_model = v_model.train_model(TaggedLineDocument(tokenized_file),
                                            vector_size, epochs)
        print("saving model")
        v_model.save_model(trained_model, model_file)
        print("finish")
    else:
        print("{} already exists. Exiting.".format(model_file))
def get_tagged_sentences(filepath):
    sentences = TaggedLineDocument(filepath)
    return sentences
chunksize = int(math.ceil(n / float(procs)))
with gzip.open(base_output_path + 'docs_songs.txt.gz', 'w') as fout, \
     gzip.open(base_output_path + 'indices.txt.gz', 'w') as indices:
    for userid, doc in tq(pool.imap_unordered(get_songs, files, chunksize=100), total=n):
        fout.write(doc + '\n')
        indices.write(userid + '\n')

with timed('Loading docs'):
    #documents = TaggedLineDocument(base_output_path+'docs_artist_blocks.txt.gz')
    #documents = [doc for doc in tq(TaggedLineDocument(base_output_path+'docs_songs.txt.gz'))]
    documents = TaggedLineDocument(base_output_path + 'docs_songs.txt.gz')

with timed('Running model'):
    model = Doc2Vec(documents, size=dim, window=win, min_count=min_count, workers=procs)

with timed('Saving results'):
    # from sklearn.preprocessing import Normalizer
    # nrm = Normalizer('l2')
    # normed = nrm.fit_transform(model.docvecs.doctag_syn0)
    # words_normed = nrm.fit_transform(model.syn0)
    # np.save(output_path+'/doc_features_normed-{}-{}-{}.npy'.format(dim,win,min_count),normed)
user_dict[line.split()[0]] = ''
u_f.close()

for line in f_b_t.readlines():
    if utils.key_in_dic(line.replace('\n', '').split('\t')[0], user_dict):
        p_f.write(line)
    else:
        pass
f_b_t.close()
p_f.close()

# from gensim.models.doc2vec import TaggedLineDocument, Doc2Vec
user_tranj_vec = '../data/user_tranj_vec.txt'
documents = TaggedLineDocument(doc_file)
model = Doc2Vec(documents, size=128, negative=10, window=8, hs=0, min_count=0,
                workers=15, iter=30)

user_id_list = []
u_f = open(user_file)
for line in u_f:
    user_id_list.append(line.split('\n')[0])
u_f.close()
import os.path
import sys
import logging
import multiprocessing

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedLineDocument

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 4:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]

    model = Doc2Vec(TaggedLineDocument(inp), size=200, window=5, min_count=5,
                    workers=multiprocessing.cpu_count())

    # trim unneeded model memory = use (much) less RAM
    #model.init_sims(replace=True)

    model.save(outp1)  # save doc2vec model
    model.save_word2vec_format(outp2, binary=False)  # save word2vec format
# docLabels = ["input.txt"]
# data = []
# for doc in docLabels:
#     data.append(open(doc, 'r'))
# print(data)
#
# it = LabeledLineSentence(data, [1])
# print(it)

questions_path = "train_questionsq.txt"
answers_path = "train_answersq.txt"
questions = open(questions_path, 'r')
answers = open(answers_path, 'r')

doc = TaggedLineDocument("input.txt")
model = gensim.models.Doc2Vec(size=100, window=10, min_count=1, workers=11,
                              alpha=0.025, min_alpha=0.025)  # use fixed learning rate
model.build_vocab(doc)
model.iter = 300
model.train(doc, total_examples=model.corpus_count, epochs=model.iter)

# for epoch in range(10):
if not os.path.exists(base_output_path + 'docs_artist_blocks.txt.gz'):
    with gzip.open(base_output_path + 'docs_artist_blocks.txt.gz', 'w') as fout, \
         gzip.open(base_output_path + 'indices.txt.gz', 'w') as indices:
        files = sorted(glob.glob(scrobble_path + '*.txt'))
        for fi in tq(files):
            artists = [line.split('\t')[1] for line in open(fi)]
            last = None
            blocks = []
            for a in tq(artists):
                if a != last:
                    blocks.append(a)
                last = a
            doc = ' '.join(blocks)
            fout.write(doc + '\n')
            userid = fi[fi.rfind('\\') + 1:-4]
            indices.write(userid + '\n')

documents = [doc for doc in tq(TaggedLineDocument(base_output_path + 'docs_artist_blocks.txt.gz'))]

%time model = Doc2Vec(documents, size=dim, window=win, min_count=min_count, workers=workers)

dpath = 'P:/Projects/BigMusic/jared.data/d2v/artist_dict.pkl'
if not os.path.exists(dpath):
    artist_dict = {}
    for line in tq(open('P:/Projects/BigMusic/jared.rawdata/lastfm_itemlist.txt')):
        line = line.split('\t')
        if line[1] == '0':
            artist_dict[line[2]] = line[0]
    cPickle.dump(artist_dict, open(dpath, 'wb'))
else:
    artist_dict = cPickle.load(open(dpath))
def train(cls):
    model = Doc2Vec(documents=TaggedLineDocument(cls.corpus_path),
                    vector_size=300, window=5, min_count=1, workers=4)
    model.save(config.model_path.format('d2v.model'))
import sys

from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument

file = sys.argv[1]
epochs = int(sys.argv[2])
words = sys.argv[3].split(' ')
steps = 50

docs = TaggedLineDocument(file)
model = Doc2Vec(docs, min_count=1, epochs=epochs)

docs_list = list(docs)
to_docstr = lambda x: ' '.join(docs_list[x].words)

print(f'--- similar : {to_docstr(0)} ---')
for i, p in model.docvecs.most_similar(0):
    print(f'{p}, {to_docstr(i)}')

print('')

print(f'--- similar : {words} ---')
x = model.infer_vector(words, steps=steps)
for tag, p in model.docvecs.most_similar([x]):
    print(f'{p}, {to_docstr(tag)}')
def d2ctest():
    documents = TaggedLineDocument("new_text2.txt")
    model = Doc2Vec(documents, size=10, window=2, min_count=1, workers=1)
    print(model)
    return model
import sys

import numpy as np
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedLineDocument
from keras.layers import GRU, Dropout
from keras.models import Sequential, Model
from keras.optimizers import Adam

data_file = sys.argv[1]
dest_file_prefix = sys.argv[2]
epoch = int(sys.argv[3])
batch = int(sys.argv[4])

wv_size = 200
wv_epoch = 2000
num_unit = 512
input_size = (10, )

docs = TaggedLineDocument(data_file)
words = [d.words for d in docs]

wv_model = Word2Vec(words, wv_size, min_count=1, iter=wv_epoch)
input_size += (wv_model.vector_size, )
word_maxlen = np.max([len(w) for w in words])

def discriminator(input_shape):
    model = Sequential()
    model.add(GRU(num_unit, input_shape=input_shape))
    model.add(Dropout(0.3))
model_file = sys.argv[1]
data_file = sys.argv[2]
gen_size = int(sys.argv[3])
questions = [q.split(' ') for q in sys.argv[4].split(';')]

steps = 50
base_word_prob = 0.7
prob_weight = 1.5
keyword_rate = 2
replace_targets = ['名詞', '形容詞']  # Japanese POS tags: noun (名詞), adjective (形容詞)

model = Doc2Vec.load(model_file)
docs_list = list(TaggedLineDocument(data_file))
docs_list_str = [''.join(d.words) for d in docs_list]

is_replace_target = lambda t: np.any(
    [t.part_of_speech.startswith(trg) for trg in replace_targets])

def random_choice(cd):
    probs = np.exp(np.array([p for _, p in cd]) * prob_weight)
    probs /= probs.sum()
    return np.random.choice([d for d, _ in cd], p=probs)

adjust_prob = lambda c, q: (c[0], c[1] * keyword_rate) if c[0] in q else c
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedLineDocument
from time import localtime, strftime

# document for training
doc_path = 'dota_picks'
corpus = TaggedLineDocument(doc_path)

# Doc2Vec parameters; self explanatory
vector_size = 50
window_size = 5
min_count = 0
sampling_threshold = 1e-4
negative_size = 5
train_epoch = 100
dm = 0  # 0 = dbow; 1 = dmpv
worker_count = 8  # number of parallel processes

model = Doc2Vec(size=vector_size,
                window=window_size,
                min_count=min_count,
                sample=sampling_threshold,
                workers=worker_count,
                hs=0,
                dm=dm,
                negative=negative_size,
                dbow_words=1,
                dm_concat=1)

print("Building Vocab:", strftime("%a, %d %b %Y, %H:%M:%S", localtime()))
model.build_vocab(corpus)
print("Built Vocab:", strftime("%a, %d %b %Y, %H:%M:%S", localtime()))
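# Illustrative continuation (not part of the original example): training and saving the
# model after build_vocab, reusing corpus and train_epoch from the snippet above under
# the same legacy gensim API. The output path 'dota_picks.d2v' is hypothetical.
model.train(corpus, total_examples=model.corpus_count, epochs=train_epoch)
model.save('dota_picks.d2v')
print("Trained model:", strftime("%a, %d %b %Y, %H:%M:%S", localtime()))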
# -*- coding: utf-8 -*-
from __future__ import division
from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument
# import numpy.linalg
import numpy as np
import math
import scipy
from PIL import Image, ImageDraw

input_file = r"H:\network_diagnosis_data\test\GTPC_TUNNEL_PATH_BROKEN.3054.txt"
sentences = TaggedLineDocument(input_file)
dim = 1000
model = Doc2Vec(alpha=0.025, min_alpha=0.025, size=dim)  # default is 300 dimensions

model.build_vocab(sentences)
for epoch in range(10):
    model.train(sentences)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
model.save(r'.\data\test_d2v')
# print model.infer_vector([u'people', u'like', u'words'])

total_num = model.docvecs.count
# print total_num
# print len(model.docvecs[0])

para_vec = []
for i in xrange(total_num):
    if i == 0:
        para_vec = model.docvecs[i]
        continue
    para_vec = np.vstack((para_vec, model.docvecs[i]))
print para_vec
import pandas as pd
import numpy as np
from time import time
import pickle

import gensim
from gensim import corpora, models, similarities
from gensim.models.doc2vec import TaggedDocument, TaggedLineDocument
from gensim.models import Doc2Vec
import gensim.models.doc2vec

print('loading docs...')
start_time = time()
documents = [
    doc for doc in TaggedLineDocument('volume2/processed_body_docs.txt')
]
print("--- %s seconds ---" % (time() - start_time))

#documents = []
#with open('/volume/processed_body_docs.txt') as f:
#    for line in f:
#        documents.append(TaggedLineDocument(line))

print('training doc2vec model...')
start_time = time()
model = Doc2Vec(documents,
                vector_size=200,
                window=5,
                min_count=5,
                workers=14,
                epochs=20)