import datetime
import time
from collections import namedtuple

import gensim

from CIKM.datautils import datahelper


def process():
    _, _, _, _, w2v_list, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)
    # Build the Spanish stop-word list: strip punctuation and lower-case each entry.
    stop_word = list(open(file_stop_word, "r", encoding='UTF-8').readlines())
    stop_word_list = [
        line.replace("\n", "").replace(",", "").replace(".", "").replace("?", "")
            .replace("¿", "").replace("!", "").replace("¡", "").lower()
        for line in stop_word
    ]
    d2c_list = []
    for line in w2v_list:
        # Optional stop-word filtering, currently disabled:
        # line = [x for x in line if x not in stop_word_list]
        d2c_list.append(line)
    # Wrap each sentence as a tagged document ("SENT_<i>") for Doc2Vec.
    alldocuments = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    for idx, record in enumerate(d2c_list):
        qid = 'SENT_%s' % idx
        words = gensim.utils.simple_preprocess(" ".join(record))
        alldocuments.append(analyzedDocument(words, [qid]))
    print("Start Training Doc2Vec Time : %s" % str(datetime.datetime.now()))
    saved_model_name = "doc_2_vec_" + str(int(time.time()))
    model_4 = gensim.models.Doc2Vec(alldocuments, dm=1, dm_concat=1, vector_size=300,
                                    window=5, min_count=2, epochs=100)
    model_4.save(saved_model_name)
    print("model training completed : %s" % saved_model_name)
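# Usage sketch for the model trained above. The saved file name carries a
# timestamp, so "doc_2_vec_1528000000" is a placeholder, not the real name.
# infer_vector() is the standard gensim call for embedding an unseen sentence.
import gensim

model = gensim.models.Doc2Vec.load("doc_2_vec_1528000000")  # placeholder file name
tokens = gensim.utils.simple_preprocess("como estas hoy")
vector = model.infer_vector(tokens)  # 300-dim, matching vector_size above
print(vector[:5])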
import numpy as np
from keras.preprocessing.sequence import pad_sequences  # assumed source of pad_sequences


def process():
    x_text1, x_text2, _, y_train, _, x_train_reshape = datahelper.load_data(
        FLAGS.en_train, FLAGS.sp_train)
    word2index, index2word = datahelper.create_vocabulary(x_train_reshape)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding(
        index2word, vocab_size, FLAGS.word2vec_model_path)
    max_len = max(len(x.split(" ")) for x in x_train_reshape)
    test1, test2 = datahelper.load_testdata(filepath_test)

    def to_index(lines):
        # Map each space-separated token to its vocabulary id (UNK_ID if unseen).
        return [[word2index.get(x, UNK_ID) for x in line.split(" ")] for line in lines]

    x_text1_int = to_index(x_text1)
    x_text2_int = to_index(x_text2)
    test1_int = to_index(test1)
    test2_int = to_index(test2)

    x_train1 = pad_sequences(x_text1_int, max_len)
    x_train2 = pad_sequences(x_text2_int, max_len)
    x_test1 = pad_sequences(test1_int, max_len)
    x_test2 = pad_sequences(test2_int, max_len)

    # Shuffle, then hold out the last dev_sample_percentage of the data as a dev set.
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y_train)))
    x_train1, x_dev1 = x_shuffled1[:dev_sample_index], x_shuffled1[dev_sample_index:]
    x_train2, x_dev2 = x_shuffled2[:dev_sample_index], x_shuffled2[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    del x_text1, x_text2, x_text1_int, x_text2_int
    # Note: this variant returns the full shuffled question sets for training;
    # of the dev slices computed above, only x_dev2 is passed back.
    return (x_shuffled1, x_shuffled2, y_shuffled, x_dev2, y_train, y_dev,
            word_embedding, max_len, vocab_size, x_test1, x_test2)
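# Usage sketch (assumption: FLAGS, filepath_test and UNK_ID are configured at
# module level, as elsewhere in this repo). Shows how a caller unpacks process().
(x_shuffled1, x_shuffled2, y_shuffled, x_dev2, y_train, y_dev,
 word_embedding, max_len, vocab_size, x_test1, x_test2) = process()
print("train pairs: %d  max_len: %d  vocab: %d" % (len(y_shuffled), max_len, vocab_size))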
def process():
    x_text1, x_text2, _, y_train, _, x_train_reshape = datahelper.load_data(
        FLAGS.en_train, FLAGS.sp_train)
    word2index, index2word = datahelper.create_vocabulary(x_train_reshape)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding(
        index2word, vocab_size, FLAGS.word2vec_model_path)
    max_len = max(len(x.split(" ")) for x in x_train_reshape)

    # Build the Spanish stop-word list: strip punctuation and lower-case each entry.
    stop_word = list(open(FLAGS.stop_word, "r", encoding='UTF-8').readlines())
    stop_word_list = [
        line.replace("\n", "").replace(",", "").replace(".", "").replace("?", "")
            .replace("¿", "").replace("!", "").replace("¡", "").lower()
        for line in stop_word
    ]

    def to_index(lines):
        # Drop stop words, then map tokens to vocabulary ids (UNK_ID if unseen).
        out = []
        for line in lines:
            line_list = [x for x in line.split(" ") if x not in stop_word_list]
            out.append([word2index.get(x, UNK_ID) for x in line_list])
        return out

    x_text1_int = to_index(x_text1)
    x_text2_int = to_index(x_text2)

    x_train1 = pad_sequences(x_text1_int, max_len)
    x_train2 = pad_sequences(x_text2_int, max_len)

    # Shuffle, then hold out the last dev_sample_percentage of the data as a dev set.
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y_train)))
    x_train1, x_dev1 = x_shuffled1[:dev_sample_index], x_shuffled1[dev_sample_index:]
    x_train2, x_dev2 = x_shuffled2[:dev_sample_index], x_shuffled2[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    del x_text1, x_text2, x_text1_int, x_text2_int
    return (x_train1, x_dev1, x_train2, x_dev2, y_train, y_dev,
            word_embedding, max_len, vocab_size)
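# Minimal sketch of what datahelper.create_vocabulary is assumed to do (the real
# implementation lives in CIKM.datautils.datahelper and may differ): build
# word<->id maps over the reshaped training text, reserving id 0 for unknowns.
def create_vocabulary_sketch(texts, unk_token="_UNK"):
    index2word = [unk_token]  # id 0 is reserved for unknown words (UNK_ID = 0)
    seen = set(index2word)
    for line in texts:
        for word in line.split(" "):
            if word not in seen:
                seen.add(word)
                index2word.append(word)
    word2index = {w: i for i, w in enumerate(index2word)}
    return word2index, index2word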
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Assumption: train_doc2vec is loaded the same way as test_doc2vec below;
# makeFeature() writes train_doc2vec1.csv alongside test_doc2vec1.csv.
train_doc2vec = pd.read_csv(
    "I:\\temp\\CNNimpl_static\\CIKM\\features\\doc2vec\\train_doc2vec1.csv")
test_doc2vec = pd.read_csv(
    "I:\\temp\\CNNimpl_static\\CIKM\\features\\doc2vec\\test_doc2vec1.csv")

# Drop the raw vector columns; only the derived similarity features are kept.
train_doc2vec.drop(['doc2vec_train1', 'doc2vec_train2'], axis=1, inplace=True)
test_doc2vec.drop(['doc2vec_test1', 'doc2vec_test2'], axis=1, inplace=True)

# Concatenate every feature family column-wise into the final design matrices.
train = pd.concat([
    train_bag, train_magic1, train_magic2, train_freq, train_ngram,
    train_simple, train_weight, train_page, train_w2v, train_doc2vec
], axis=1)
test = pd.concat([
    test_bag, test_magic1, test_magic2, test_freq, test_ngram,
    test_simple, test_weight, test_page, test_w2v, test_doc2vec
], axis=1)

_, _, _, y_train, _, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)
# Class-balance check: fraction of positive labels.
sums = np.sum(y_train, axis=0)
print(sums / len(y_train))

x_train, x_dev, y_train, y_dev = train_test_split(
    train.values, y_train, test_size=0.1, random_state=0)
train_input = lgb.Dataset(x_train, y_train)
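# Training sketch for the LightGBM dataset built above. The hyperparameters are
# illustrative assumptions, not the competition settings; the binary objective
# matches the 0/1 duplicate label. early_stopping_rounds is the pre-4.0
# LightGBM API (newer versions use callbacks=[lgb.early_stopping(50)]).
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    'num_leaves': 63,
}
dev_input = lgb.Dataset(x_dev, y_dev, reference=train_input)
gbm = lgb.train(params, train_input, num_boost_round=2000,
                valid_sets=[dev_input], early_stopping_rounds=50)
pred = gbm.predict(test.values, num_iteration=gbm.best_iteration)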
filepath_test = "I:\\CIKM\\cikm_test_a_20180516.txt"
filepath_unlabel = "I:\\CIKM\\cikm_unlabel_spanish_train_20180516\\cikm_unlabel_spanish_train_20180516.txt"
w2v_pah = "I:\\CIKM\\w2v.model.bin"
fast_path = "I:\\CIKM\\fast_text_vectors_wiki.es.vec\\wiki.es.vec"
file_stop_word = "I:\\CIKM\\spanish_stop_word.txt"

import datetime

import numpy as np
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.similarities import MatrixSimilarity
from scipy import spatial
from scipy.stats import skew, kurtosis

from CIKM.datautils import datahelper

# filepath_en_train / filepath_sp_train are defined alongside the paths above
# (elided in this snippet).
x_train1, x_train2, _, _, _, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)
x_test1, x_test2 = datahelper.load_testdata(filepath_test)

train = pd.DataFrame()
test = pd.DataFrame()
train['question1'] = x_train1
train['question2'] = x_train2
test['question1'] = x_test1
test['question2'] = x_test2

# Build the TF-IDF corpus over all train and test questions.
tfidf_txt = (train['question1'].tolist() + train['question2'].tolist()
             + test['question1'].tolist() + test['question2'].tolist())
train_qs = pd.Series(tfidf_txt).astype(str)
dictionary = Dictionary(x.split(" ") for x in tfidf_txt)
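# Continuation sketch (assumption: the feature code that follows would use
# these objects). Builds a TfidfModel over the bag-of-words corpus derived from
# `dictionary`, the standard gensim pipeline implied by the imports above.
corpus = [dictionary.doc2bow(x.split(" ")) for x in tfidf_txt]
tfidf = TfidfModel(corpus)


def to_tfidf(text):
    # TF-IDF weighted bag-of-words for one question.
    return tfidf[dictionary.doc2bow(text.split(" "))]


def cos_sim(text1, text2):
    # Cosine similarity between two questions via a one-document similarity index.
    index = MatrixSimilarity([to_tfidf(text1)], num_features=len(dictionary))
    return float(index[to_tfidf(text2)][0])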
import gensim


def train_w2v():
    # Train a 300-dim Word2Vec model over the tokenized corpus and save it in
    # binary word2vec format. Note: `size` is the pre-4.0 gensim keyword;
    # gensim >= 4.0 renamed it to `vector_size`.
    _, _, _, _, data, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)
    model = gensim.models.Word2Vec(data, size=300, min_count=1)
    model.wv.save_word2vec_format('w2v.model.bin', binary=True)
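# Loading sketch: the saved binary file can be read back with KeyedVectors
# (standard gensim API; the path matches w2v_pah defined earlier).
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format('w2v.model.bin', binary=True)
print(w2v.most_similar('hola', topn=3))  # assumes 'hola' is in the vocabulary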
import datetime

import pandas as pd


def makeFeature():
    x_train1, x_train2, _, _, _, _ = datahelper.load_data(
        filepath_en_train, filepath_sp_train)
    x_test1, x_test2 = datahelper.load_testdata(filepath_test)
    x_train1 = process_data(x_train1)
    x_train2 = process_data(x_train2)
    x_test1 = process_data(x_test1)
    x_test2 = process_data(x_test2)
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))

    print('get sentence vector')
    # `model` and `doc2vec_model` are expected at module level: a trained
    # Doc2Vec model and the helper module that embeds a question with it.
    train = pd.DataFrame()
    test = pd.DataFrame()
    train['doc2vec_train1'] = [doc2vec_model.get_question_vector(x, model) for x in x_train1]
    train['doc2vec_train2'] = [doc2vec_model.get_question_vector(x, model) for x in x_train2]
    test['doc2vec_test1'] = [doc2vec_model.get_question_vector(x, model) for x in x_test1]
    test['doc2vec_test2'] = [doc2vec_model.get_question_vector(x, model) for x in x_test2]

    print('get six kinds of coefficient about vector')
    # Six pairwise similarity/distance features between the two question vectors.
    train['cosine1'] = train.apply(lambda x: Cosine(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['manhatton1'] = train.apply(lambda x: Manhatton(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['euclidean1'] = train.apply(lambda x: Euclidean(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['pearson1'] = train.apply(lambda x: PearsonSimilar(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['spearman1'] = train.apply(lambda x: SpearmanSimilar(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train['kendall1'] = train.apply(lambda x: KendallSimilar(x['doc2vec_train1'], x['doc2vec_train2']), axis=1)
    train.to_csv('train_doc2vec1.csv', index=False)

    test['cosine1'] = test.apply(lambda x: Cosine(x['doc2vec_test1'], x['doc2vec_test2']), axis=1)
    test['manhatton1'] = test.apply(lambda x: Manhatton(x['doc2vec_test1'], x['doc2vec_test2']), axis=1)
    test['euclidean1'] = test.apply(lambda x: Euclidean(x['doc2vec_test1'], x['doc2vec_test2']), axis=1)
    test['pearson1'] = test.apply(lambda x: PearsonSimilar(x['doc2vec_test1'], x['doc2vec_test2']), axis=1)
    test['spearman1'] = test.apply(lambda x: SpearmanSimilar(x['doc2vec_test1'], x['doc2vec_test2']), axis=1)
    test['kendall1'] = test.apply(lambda x: KendallSimilar(x['doc2vec_test1'], x['doc2vec_test2']), axis=1)
    test.to_csv('test_doc2vec1.csv', index=False)
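# Minimal sketches of the six helpers used above (assumption: the real
# implementations live elsewhere in the repo and may differ). Built on
# scipy.spatial.distance and scipy.stats; "Manhatton" keeps the repo's spelling.
from scipy import spatial
from scipy.stats import kendalltau, pearsonr, spearmanr


def Cosine(v1, v2):
    return 1.0 - spatial.distance.cosine(v1, v2)


def Manhatton(v1, v2):
    return spatial.distance.cityblock(v1, v2)


def Euclidean(v1, v2):
    return spatial.distance.euclidean(v1, v2)


def PearsonSimilar(v1, v2):
    return pearsonr(v1, v2)[0]


def SpearmanSimilar(v1, v2):
    return spearmanr(v1, v2)[0]


def KendallSimilar(v1, v2):
    return kendalltau(v1, v2)[0]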