def doc2vec_embedding(self, min_count=2, epochs=200):
    """
    Embed the data using doc2vec and save the fitted model to disk.
    :param min_count: ignore words with total frequency below this
    :param epochs: number of training epochs
    :return: None
    """
    # fit embedding by doc2vec
    print('{}: starting fitting doc2vec model'.format(
        time.asctime(time.localtime(time.time()))))
    self.doc2vec_model = Doc2Vec(fname='', linux=False, use_file=False,
                                 data=self.data,
                                 vector_size=self.embedding_size,
                                 min_count=min_count, epochs=epochs)
    print('{}: finished fitting doc2vec model'.format(
        time.asctime(time.localtime(time.time()))))

    # save trained model
    self.doc2vec_fitted_model_file_path = os.path.join(
        self.data_directory, 'doc2vec_submission_titles.pkl')
    print('{}: starting saving doc2vec model'.format(
        time.asctime(time.localtime(time.time()))))
    joblib.dump(self.doc2vec_model, self.doc2vec_fitted_model_file_path)
    print('{}: finished saving doc2vec model'.format(
        time.asctime(time.localtime(time.time()))))
    return
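# A minimal reload sketch for the model dumped above: joblib.load restores
# the fitted Doc2Vec object from the same .pkl path (data_directory below
# stands in for the instance's self.data_directory; the standalone joblib
# package is assumed).
import os
import joblib

model_path = os.path.join(data_directory, 'doc2vec_submission_titles.pkl')
doc2vec_model = joblib.load(model_path)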
def main():
    numProcs = 3
    taskID = process.fork_processes(numProcs, max_restarts=0)
    port = BASE_PORT + taskID
    if taskID == 0:
        app = httpserver.HTTPServer(web.Application([
            (r"/submit", Web)], **SETTINGS))
        logging.info("webapp listening on %d" % port)
    else:
        # load trained model from either dm or dbow
        if os.path.isfile(dmLabeled) and os.path.isfile(dbowLabeled):
            fname = dmLabeled if taskID == 1 else dbowLabeled
            model = Doc2Vec.load(fname)
        else:
            raise RuntimeError("Must first train doc2vec model")
        app = httpserver.HTTPServer(web.Application(
            [(r"/doc2vec", Doc2vecServer, dict(model=model))]))
        logging.info("Doc2vec server %d listening on %d" % (taskID, port))
    app.add_sockets(netutil.bind_sockets(port))
    IOLoop.current().start()
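# A minimal, self-contained sketch of the fork/port layout used in main():
# fork_processes gives each child process a distinct task ID, which is
# added to a base port. Hello, fork_demo, and DEMO_BASE_PORT are
# illustrative stand-ins, not part of this project.
from tornado import httpserver, ioloop, netutil, process, web

DEMO_BASE_PORT = 8000

class Hello(web.RequestHandler):
    def initialize(self, task_id):
        self.task_id = task_id

    def get(self):
        self.write("hello from task %d" % self.task_id)

def fork_demo(num_procs=3):
    task_id = process.fork_processes(num_procs, max_restarts=0)
    server = httpserver.HTTPServer(web.Application(
        [(r"/", Hello, dict(task_id=task_id))]))
    server.add_sockets(netutil.bind_sockets(DEMO_BASE_PORT + task_id))
    ioloop.IOLoop.current().start()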
def get_res(iter, baseline):
    train_vec = pd.read_csv('../content_vec_withoutD.csv', header=None)
    test_file = pd.read_csv('../data/test_public.csv')
    # train_vec_sentiment = pd.read_csv('../content_vec_sentiment.csv', header=None)
    train_vec = np.array(train_vec)
    # train_vec_sentiment = np.array(train_vec_sentiment)
    data = pd.read_csv('../data/train.csv')
    # the ten aspect labels: price, configuration, handling, comfort,
    # fuel consumption, power, interior, safety, space, exterior
    subject_vocab = ['价格', '配置', '操控', '舒适性', '油耗', '动力',
                     '内饰', '安全性', '空间', '外观']
    subject_list = list()
    for i in data['subject']:
        for k in range(10):
            if subject_vocab[k] == i:
                subject_list.append(k)
                break
    value_list = list()
    for i in data['sentiment_value']:
        value_list.append(i)
    # re-weight every non-zero term count with its BDC weight
    bdc = Bdc.cal_bdc(train_vec, subject_list, 10)
    for i in range(train_vec.shape[0]):
        for j in range(train_vec.shape[1]):
            if train_vec[i][j] > 0:
                train_vec[i][j] = bdc[j]
    print(train_vec)
    test_vec = Doc2Vec.test2vec()
    for i in range(test_vec.shape[0]):
        for j in range(test_vec.shape[1]):
            if test_vec[i][j] > 0:
                test_vec[i][j] = bdc[j]
    print(test_vec)
    test_id = list(test_file['content_id'])
    res_id, res_subject, value_list = Lgb.cal_subject_mul(
        train_vec, subject_list, test_id, test_vec, iter, baseline)
    GetResult.res2doc_mul(res_id, res_subject, value_list)
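# The nested re-weighting loops in get_res can be expressed as a single
# vectorized numpy operation; a sketch (train_vec, test_vec and bdc as in
# the function above -- bdc holds one weight per column and is broadcast
# across the rows):
import numpy as np

train_vec = np.where(train_vec > 0, bdc, train_vec)
test_vec = np.where(test_vec > 0, bdc, test_vec)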
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# aba: just some tests to see that it is working

import logging
import sys
import os

from word2vec import Word2Vec
from doc2vec import Doc2Vec, LineSentence

logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO)
logging.info("running %s" % " ".join(sys.argv))

input_file = 'test.txt'
model = Word2Vec(LineSentence(input_file), size=100, window=5, sg=0,
                 min_count=5, workers=8)
model.save(input_file + '.model')
model.save_word2vec_format(input_file + '.vec')

# aba: initialize it with already learned word vectors through model_file
sent_file = 'sent.txt'
model = Doc2Vec(LineSentence(sent_file), model_file=input_file + '.model')
model.save_doc2vec_format(sent_file + '.vec')

program = os.path.basename(sys.argv[0])
logging.info("finished running %s" % program)
words = words[:10000]
print('Data size', len(words))

# fake some docs: split the word stream into fixed-length chunks
doc_length = 100
docs = [words[i:i + doc_length] for i in range(0, len(words), doc_length)]

vocabulary_size = 500
d2v = Doc2Vec(vocabulary_size=vocabulary_size, document_size=len(docs),
              n_steps=2001)
# print(d2v.get_params())
d2v.fit(docs)
print(d2v.word_embeddings.shape)
print(d2v.doc_embeddings.shape)
save_path = d2v.save('models/test_d2v_model')
print(d2v.word_embeddings[0, 0])
print(d2v.doc_embeddings[0, 0])
print(save_path)

# restore a saved model
d2v_restored = Doc2Vec.restore(save_path)
print(d2v_restored.word_embeddings[0, 0])
print(d2v_restored.doc_embeddings[0, 0])
    return data


filename = 'text8.zip'
words = read_data(filename)
words = words[:10000]
print('Data size', len(words))

# fake some docs: split the word stream into fixed-length chunks
doc_length = 100
docs = [words[i:i + doc_length] for i in range(0, len(words), doc_length)]

vocabulary_size = 500
d2v = Doc2Vec(vocabulary_size=vocabulary_size, document_size=len(docs),
              n_steps=2001)
# print(d2v.get_params())
d2v.fit(docs)
print(d2v.word_embeddings.shape)
print(d2v.doc_embeddings.shape)
save_path = d2v.save('models/test_d2v_model')
print(d2v.word_embeddings[0, 0])
print(d2v.doc_embeddings[0, 0])
print(save_path)

# restore a saved model
d2v_restored = Doc2Vec.restore(save_path)
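# A tiny worked example of the chunking comprehension used above
# (illustrative values only): ten items with a chunk length of 4 yield
# chunks of sizes 4, 4 and 2.
sample = list('abcdefghij')
chunks = [sample[i:i + 4] for i in range(0, len(sample), 4)]
assert chunks == [list('abcd'), list('efgh'), list('ij')]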
# -*- coding: utf-8 -*-
import os, sys
from settings import size, dmUnlabeled, dbowUnlabeled,\
    dmLabeled, dbowLabeled, useModifiedModule, testRes, trainedClassifer
import logging
import numpy as np
import pickle
from sklearn.linear_model import SGDClassifier
from doc2vec import Doc2Vec
from doc2vec import LabeledSentence
from docVecTrain import flushLoggerInfo, cleanUpText

if __name__ == "__main__":
    flushLoggerInfo()
    if os.path.isfile(dmLabeled) and os.path.isfile(dbowLabeled)\
            and os.path.isfile(trainedClassifer):
        model_dm = Doc2Vec.load(dmLabeled)
        model_dbow = Doc2Vec.load(dbowLabeled)

        neg = "This movie is an absolute disaster within a disaster film. It is full of great action scenes, which are only meaningful if you throw away all sense of reality. Let's see, word to the wise, lava burns you; steam burns you. You can't stand next to lava. Diverting a minor lava flow is difficult, let alone a significant one. Scares me to think that some might actually believe what they saw in this movie.<br /><br />Even worse is the significant amount of talent that went into making this film. I mean the acting is actually very good. The effects are above average. Hard to believe somebody read the scripts for this and allowed all this talent to be wasted. I guess my suggestion would be that if this movie is about to start on TV ... look away! It is like a train wreck: it is so awful that once you know what is coming, you just have to watch. Look away and spend your time on more meaningful content."
        # infer vectors for the unseen reviews from both models and
        # concatenate them, matching the training-time representation
        neg_test_vecs = np.hstack((model_dm.train_online(cleanUpText(neg)),
                                   model_dbow.train_online(cleanUpText(neg))))

        pos = "Naturally in a film who's main themes are of mortality, nostalgia, and loss of innocence it is perhaps not surprising that it is rated more highly by older viewers than younger ones. However there is a craftsmanship and completeness to the film which anyone can enjoy. The pace is steady and constant, the characters full and engaging, the relationships and interactions natural showing that you do not need floods of tears to show emotion, screams to show fear, shouting to show dispute or violence to show anger. Naturally Joyce's short story lends the film a ready made structure as perfect as a polished diamond, but the small changes Huston makes such as the inclusion of the poem fit in neatly. It is truly a masterpiece of tact, subtlety and overwhelming beauty."
        pos_test_vecs = np.hstack((model_dm.train_online(cleanUpText(pos)),
                                   model_dbow.train_online(cleanUpText(pos))))

        with open(trainedClassifer, 'rb') as f:
            lr = pickle.load(f)
        # we expect the predictions to be [0, 1] (negative, positive),
        # but there is no guarantee
        print(lr.predict([neg_test_vecs, pos_test_vecs]))
    else:
        print('runtime error: train the doc2vec models and the classifier first')
        sys.exit(1)
def cv_test_mul():
    train_vec = pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/content_vec_withoutD.csv',
        header=None)
    test_file = pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/data/test_public.csv')
    # train_vec_sentiment = pd.read_csv('../content_vec_sentiment.csv', header=None)
    train_vec = np.array(train_vec)
    # train_vec_sentiment = np.array(train_vec_sentiment)
    data = pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/data/train.csv')
    subject_vocab = ['价格', '配置', '操控', '舒适性', '油耗', '动力',
                     '内饰', '安全性', '空间', '外观']
    subject_list = list()
    for i in data['subject']:
        for k in range(10):
            if subject_vocab[k] == i:
                subject_list.append(k)
                break
    subject_list = np.array(subject_list)
    value_list = list()
    for i in data['sentiment_value']:
        value_list.append(i)
    value_list = np.array(value_list)

    # re-weight every non-zero term count with its BDC weight
    bdc = Bdc.cal_bdc(train_vec, subject_list, 10)
    for i in range(train_vec.shape[0]):
        for j in range(train_vec.shape[1]):
            if train_vec[i][j] > 0:
                train_vec[i][j] = bdc[j]
    print(train_vec)
    test_vec = Doc2Vec.test2vec()
    for i in range(test_vec.shape[0]):
        for j in range(test_vec.shape[1]):
            if test_vec[i][j] > 0:
                test_vec[i][j] = bdc[j]
    test_id = list(test_file['content_id'])

    X, test, y, test_id, y1 = train_vec, test_vec, value_list, test_id, subject_list
    N = 10
    res = open('res2.txt', 'w')
    # kf = StratifiedKFold(n_splits=N, random_state=2018).split(X, y)
    # one-vs-rest: cross-validate a binary model for each of the 10 subjects
    for i in range(10):
        subject_oh = y1.copy()
        for l in range(len(subject_oh)):
            if subject_oh[l] != i:
                subject_oh[l] = 0
            else:
                subject_oh[l] = 1
        params = {
            'boosting_type': 'gbdt',
            'num_leaves': 55,
            'reg_alpha': 0.1,
            'reg_lambda': 1,
            'max_depth': 15,
            'objective': 'binary',
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'subsample_freq': 1,
            'learning_rate': 0.06,
            'min_child_weight': 1,
            'random_state': 20,
            'n_jobs': 4
        }
        data_train = lgb.Dataset(X, subject_oh)
        clf = lgb.cv(params, data_train, num_boost_round=10000, nfold=5,
                     stratified=False, shuffle=True, metrics='rmse',
                     early_stopping_rounds=50, verbose_eval=50,
                     show_stdv=True, seed=0)
        res.write(str(len(clf['rmse-mean'])))
        res.write(' ')
        res.write(str(clf['rmse-mean'][-1]))
        res.write('\n')
    res.close()
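# A small sketch for reading back the per-subject CV summaries that
# cv_test_mul writes to res2.txt (each line: "<best_rounds> <best_rmse>"):
with open('res2.txt') as f:
    for line in f:
        best_rounds, best_rmse = line.split()
        print(int(best_rounds), float(best_rmse))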
def run_base_bdc():
    train_vec = pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/content_vec_withoutD.csv',
        header=None)
    test_file = pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/data/test_public.csv')
    # train_vec_sentiment = pd.read_csv('../content_vec_sentiment.csv', header=None)
    train_vec = np.array(train_vec)
    # train_vec_sentiment = np.array(train_vec_sentiment)
    data = pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/data/train.csv')
    subject_vocab = ['价格', '配置', '操控', '舒适性', '油耗', '动力',
                     '内饰', '安全性', '空间', '外观']
    subject_list = list()
    for i in data['subject']:
        for k in range(10):
            if subject_vocab[k] == i:
                subject_list.append(k)
                break
    subject_list = np.array(subject_list)
    value_list = list()
    for i in data['sentiment_value']:
        value_list.append(i)
    value_list = np.array(value_list)

    # re-weight every non-zero term count with its BDC weight
    bdc = Bdc.cal_bdc(train_vec, subject_list, 10)
    for i in range(train_vec.shape[0]):
        for j in range(train_vec.shape[1]):
            if train_vec[i][j] > 0:
                train_vec[i][j] = bdc[j]
    print(train_vec)
    test_vec = Doc2Vec.test2vec()
    for i in range(test_vec.shape[0]):
        for j in range(test_vec.shape[1]):
            if test_vec[i][j] > 0:
                test_vec[i][j] = bdc[j]
    print(test_vec)
    test_id = list(test_file['content_id'])

    N = 10
    kf = StratifiedKFold(n_splits=N, random_state=2018).split(train_vec,
                                                              subject_list)
    clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=80,
                             reg_alpha=0.1, reg_lambda=1, max_depth=8,
                             n_estimators=500, objective='binary',
                             subsample=0.8, colsample_bytree=0.8,
                             subsample_freq=1, learning_rate=0.06,
                             min_child_weight=1, random_state=20, n_jobs=4)
    clf_1 = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=80,
                               reg_alpha=0.1, reg_lambda=1, max_depth=8,
                               n_estimators=10, objective='binary',
                               subsample=0.8, colsample_bytree=0.8,
                               subsample_freq=1, learning_rate=0.06,
                               min_child_weight=1, random_state=20, n_jobs=4)
    # out-of-fold predictions on the training set: y_train_oofp holds the
    # sentiment value, y_train_oofp1 the subject
    y_train_oofp = np.zeros_like(subject_list, dtype='float64')
    y_train_oofp1 = np.zeros_like(subject_list, dtype='float64')
    # per-fold predictions on the test set
    y_test_oofp = np.zeros((test_vec.shape[0], N))
    y_test_oofp_1 = np.zeros((test_vec.shape[0], N))
    acc = 0
    vcc = 0
    l = 0
    ll = 0
    for i, (train_fold, test_fold) in enumerate(kf):
        X_train, X_validate = train_vec[train_fold, :], train_vec[test_fold, :]
        label_train, label_validate = value_list[train_fold], value_list[test_fold]
        label_1_train, label_1_validate = subject_list[train_fold], subject_list[test_fold]

        clf.fit(X_train, label_train)
        val_ = clf.predict(X_validate)
        y_train_oofp[test_fold] = val_
        if micro_avg_f1(label_validate, val_) > 0.7:
            l += 1
            print('sentiment_value_f1:%f' % micro_avg_f1(label_validate, val_))
            acc += micro_avg_f1(label_validate, val_)
        result = clf.predict(test_vec)
        y_test_oofp[:, i] = result

        # clf = svm.LinearSVC(loss='hinge', tol=1e-4, C=0.6)
        clf_1.fit(X_train, label_1_train)
        val_1 = clf_1.predict(X_validate)
        y_train_oofp1[test_fold] = val_1
        if micro_avg_f1(label_1_validate, val_1) > 0.6:
            ll += 1
            vcc += micro_avg_f1(label_1_validate, val_1)
        result = clf_1.predict(test_vec)
        y_test_oofp_1[:, i] = result
    print(acc / l)
    print(vcc / ll)

    lbl = pk.load(open('../tmp/label_encoder.sav', 'rb'))
    # majority vote over the N folds for the subject predictions
    res_2 = []
    for i in range(y_test_oofp_1.shape[0]):
        tmp = []
        for j in range(N):
            tmp.append(int(y_test_oofp_1[i][j]))
        word_counts = Counter(tmp)
        yes = word_counts.most_common(1)
        res_2.append(lbl.inverse_transform([yes[0][0]])[0])
    # majority vote over the N folds for the sentiment predictions
    res = []
    for i in range(y_test_oofp.shape[0]):
        tmp = []
        for j in range(N):
            tmp.append(y_test_oofp[i][j])
        res.append(max(set(tmp), key=tmp.count))
    result = pd.DataFrame()
    result['content_id'] = list(test_id)
    result['subject'] = list(res_2)
    result['sentiment_value'] = list(res)
    result['sentiment_value'] = result['sentiment_value'].astype(int)
    result['sentiment_word'] = ''
    result.to_csv('../submit_bdc.csv', index=False)
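# Both vote loops above take the per-row mode across the N folds; a
# compact equivalent sketch (fold_mode is a hypothetical helper, not part
# of the original code):
from collections import Counter

def fold_mode(row):
    """Most common prediction across the folds for one test row."""
    return Counter(row).most_common(1)[0][0]

# e.g. res = [fold_mode(y_test_oofp[i]) for i in range(y_test_oofp.shape[0])]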
print('done reading data')

# In[4]:

# prepare models
# for Doc2Vec
wind_size = 15
embedding_dim = 300
min_count = 5

models = {}
models["TF-IDF"] = {"model": TfIdfRetrieval(docs), "results": {}, "metrics": {}}
# models["word2vec"] = {"model": ..., "results": {}, "metrics": {}}
models["doc2vec"] = {"model": Doc2Vec(docs, wind_size, embedding_dim,
                                      min_count=min_count),
                     "results": {}, "metrics": {}}
# models["LSI-BoW"] = {"model": ..., "results": {}, "metrics": {}}
# models["LSI-TF-IDF"] = {"model": ..., "results": {}, "metrics": {}}
# models["LDA"] = {"model": ..., "results": {}, "metrics": {}}

# In[5]:

# run each model for each query
for qid in qrels:
    query_text = queries[qid]
    # this might be slightly different for each model
    models["TF-IDF"]["results"][qid] = dict(models["TF-IDF"]["model"].search(query_text))
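    # a sketch of the same lookup generalized to every registered model,
    # assuming each wrapper exposes the search(query) interface used above:
    for name, entry in models.items():
        entry["results"][qid] = dict(entry["model"].search(query_text))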
tfidf_vectorizer = generate_tfidf_vectorizer(corpus)

logger.info("Encoding y...")
X = np.array(corpus.data.acordao)
l_enc = preprocessing.LabelEncoder()
y = l_enc.fit_transform(np.array(corpus.data.relator.tolist()))

logger.info("Splitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

embedders = [
    tfidf_vectorizer,
    Doc2Vec(model_pretrained),
    Doc2Vec(model_trained)
]
embedders_map = {0: 'TFIDF', 1: 'PRETRAINED', 2: 'TRAINED'}

logger.info("Pre-encoding data with each vectorization model...")
try:
    train_encoded_data = pickle.load(open("train_enc.pickle", "rb"))
    test_encoded_data = pickle.load(open("test_enc.pickle", "rb"))
except (FileNotFoundError, pickle.UnpicklingError):
    # no cached encodings yet (or cache unreadable): encode from scratch
    train_encoded_data = []
    test_encoded_data = []
    for embedder in tqdm(embedders):
        X_train_enc = embedder.transform(X_train)
        train_encoded_data.append(X_train_enc)
    d2v.get_doc_vecs(docs_by_id)
    for qid in tqdm(qrels):
        query_text = queries[qid]
        results = d2v.search(query_text)
        overall_ser[qid] = dict(results)
    with open("d2v_windsize_" + str(wind_size) + ".json", "w") as writer:
        json.dump(overall_ser, writer, indent=1)

for vocab_size in vocab_sizes:
    overall_ser = {}
    d2v = Doc2Vec(docs_by_id, wind_size_def, vec_dim_def, vocab_size)
    d2v.get_doc_vecs(docs_by_id)
    for qid in tqdm(qrels):
        query_text = queries[qid]
        results = d2v.search(query_text)
        overall_ser[qid] = dict(results)
    with open("d2v_vocabsize_" + str(vocab_size) + ".json", "w") as writer:
        json.dump(overall_ser, writer, indent=1)

"""
wind_size = 15
vec_dim = 200
vocab = 50
overall_ser = {}
d2v = Doc2Vec(docs_by_id, wind_size, vec_dim, vocab)
d2v.get_doc_vecs(docs_by_id)
for qid in tqdm(qrels):
    query_text = queries[qid]
    results = d2v.search(query_text)
    overall_ser[qid] = dict(results)
with open("d2v_vecdim_" + str(vec_dim) + ".json", "w") as writer:
    json.dump(overall_ser, writer, indent=1)
from sklearn import svm
import lightgbm as lgb
import numpy as np
import pandas as pd

from bdc import Bdc
from doc2vec import Doc2Vec
from getResult import GetResult

if __name__ == '__main__':
    res = pd.read_csv('../tmp/baseline.csv')
    train = pd.read_csv('../data/train.csv')
    train_vec = pd.read_csv('../content_vec_withoutD.csv', header=None)
    train_vec = np.array(train_vec)
    test_vec = Doc2Vec.test2vec()
    value_list = list(train['sentiment_value'])
    subject_vocab = ['价格', '配置', '操控', '舒适性', '油耗', '动力',
                     '内饰', '安全性', '空间', '外观']
    subject_list = list()
    for i in train['subject']:
        for k in range(10):
            if subject_vocab[k] == i:
                subject_list.append(k)
                break
    predict_subject = list()
    for i in res['subject']:
        for k in range(10):
save_path = 'models/docs_model'
data_files = 'trabajos/*.txt'

saved_model = glob.glob(save_path + '/checkpoint')
restore_model = continue_training = False
if len(saved_model) > 0:
    answer = input('Do you want to (t)rain, (r)estore, or (c)ontinue training a model?')
    c = answer[0].lower()
    restore_model = (c == 'r')
    continue_training = (c == 'c')

docs = docs_from_path(data_files)

if restore_model or continue_training:
    print('Restoring a saved model...')
    d2v = Doc2Vec.restore(save_path + '/model.ckpt')
else:
    # restart training
    d2v = Doc2Vec(vocabulary_size=vocabulary_size, document_size=len(docs),
                  embedding_size_d=64, embedding_size_w=64,
                  learning_rate=0.1, n_steps=100001)

if not restore_model:
    if continue_training:
        steps = input('How many steps? (%d)' % d2v.n_steps)
        if len(steps.strip()) != 0:
            d2v.n_steps = int(steps)
    d2v.fit(docs, continue_training=continue_training)
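# A non-interactive variant of the prompt above, sketched with argparse
# (the --mode flag is hypothetical; the restore/continue semantics match
# the code above):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--mode', choices=['train', 'restore', 'continue'],
                    default='train')
args = parser.parse_args()
restore_model = (args.mode == 'restore')
continue_training = (args.mode == 'continue')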