# -*- coding: utf-8 -*-
# Shared imports for the pipeline below. Helpers such as load_csv,
# tokenize_data, tfidf_matrix_dump, fill_train_data, load_model_data, myAcc,
# xgb_acc_score, ensAcc and the constants data_path / test_set_path are
# project-local and assumed to be importable from the project's utility module.
import codecs
import pickle
from collections import namedtuple
from datetime import datetime

import jieba
import numpy as np
import pandas as pd
import xgboost as xgb
from gensim.models import Doc2Vec
from keras.layers import Activation, Dense, Dropout
from keras.models import Sequential
from keras.utils import np_utils
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score


def concat_train_test(train: str, test: str):
    """Concatenate the train and test CSVs into one file.

    :param train: path of the train data
    :param test: path of the test data
    :return: len(train_data), train_test_data_path, train_test_data
    """
    train_data, _ = load_csv(train)
    test_data, _ = load_csv(test)
    train_test_data = pd.concat([train_data, test_data])
    train_test_data_path = data_path + 'train_test_data.csv'
    train_test_data.to_csv(train_test_data_path, index=None, encoding='utf8')
    return len(train_data), train_test_data_path, train_test_data
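# concat_train_test stacks the two frames row-wise, so both CSVs are assumed
# to share the columns used downstream ('Id', 'Query' plus the three labels).
# A minimal sketch with synthetic rows (column names inferred from the rest of
# this file, not from the data; the 0 test labels are placeholders):
def _concat_sketch():
    train_demo = pd.DataFrame({'Id': [1], 'Query': ['query_a\tquery_b'],
                               'Education': [3], 'Age': [2], 'Gender': [1]})
    test_demo = pd.DataFrame({'Id': [2], 'Query': ['query_c'],
                              'Education': [0], 'Age': [0], 'Gender': [0]})
    return pd.concat([train_demo, test_demo])  # same row-wise stacking as above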
def train(train_data_path, test_data_path):
    """Run the preprocessing and doc2vec training pipeline.

    :return: len(train_data), train_test_data_path
    """
    start = datetime.now()
    print(start, "Start training dm and dbow model!")
    add_number_to_query(train_data_path)
    train_data, _ = load_csv(train_data_path)
    X_train = tokenize_data(train_data)
    tfidf_matrix_dump(X_train, 'tfidf_train.pkl')
    new_train_data_path = fill_train_data(train_data, X_train)
    length, train_test_data_path, train_test_data = concat_train_test(
        new_train_data_path, test_data_path)
    add_number_to_query(train_test_data_path)
    X_train_test = tokenize_data(train_test_data)
    tfidf_matrix_dump(X_train_test, 'tfidf_train_test.pkl')
    train_doc2vec(train_test_data_path, length, 'dbow')
    train_doc2vec(train_test_data_path, length, 'dm')
    end = datetime.now()
    print(end, "Training dm and dbow models done! Duration: {}s".format(
        (end - start).seconds))
    return length, train_test_data_path
def train_tfidf_stack(csv_path, length):
    """Build out-of-fold tf-idf meta-features via logistic-regression stacking.

    :param csv_path: train_test_data_path
    :param length: the length of train_data
    """
    print(datetime.now(), 'training tfidf stack start!')
    train_test_data, dic = load_csv(csv_path)
    # tfv = TfidfVectorizer(tokenizer=Tokenizer(len(train_test_data)),
    #                       min_df=3, max_df=0.95, sublinear_tf=True)
    # X = tfv.fit_transform(train_test_data['Query'])
    with codecs.open(data_path + 'tfidf_train_test.pkl', 'rb') as f:
        X = pickle.load(f)  # load the precomputed tf-idf matrix
    df_stack = pd.DataFrame(index=range(len(train_test_data)))
    # ------------------ stacking for Education/Age/Gender ------------------
    for i in ['Education', 'Age', 'Gender']:
        print(i)
        TR = length
        # print(train_test_data.iloc[:TR][i].value_counts())
        # print(train_test_data.iloc[TR:][i].value_counts())
        num_class = len(pd.value_counts(dic[i]))
        n = 5
        X_tr = X[:TR]
        y_tr = dic[i][:TR]
        X_te = X[TR:]
        y_te = dic[i][TR:]
        stack = np.zeros((X_tr.shape[0], num_class))
        stack_te = np.zeros((X_te.shape[0], num_class))
        kf = KFold(n_splits=n)
        for j, (tr, va) in enumerate(kf.split(X_tr, y_tr)):
            print('%s stack:%d/%d' % (str(datetime.now()), j + 1, n))
            # print(train_test_data.iloc[tr][i].value_counts())
            # print(train_test_data.iloc[va][i].value_counts())
            clf = LogisticRegression(C=3, solver='liblinear', dual=True,
                                     max_iter=10000)
            clf.fit(X_tr[tr], y_tr[tr])
            y_pred_va = clf.predict_proba(X_tr[va])
            y_pred_te = clf.predict_proba(X_te)
            print('va acc:', myAcc(y_tr[va], y_pred_va))
            print('te acc:', myAcc(y_te, y_pred_te))
            stack[va] += y_pred_va  # out-of-fold predictions for train rows
            stack_te += y_pred_te   # accumulated predictions for test rows
        stack_te /= n               # average the test predictions over folds
        stack_all = np.vstack([stack, stack_te])
        for k in range(stack_all.shape[1]):
            df_stack['tfidf_{}_{}'.format(i, k)] = stack_all[:, k]
    df_stack.to_csv(data_path + 'tfidf_stack.csv', index=None, encoding='utf8')
    print(datetime.now(), 'training tfidf stack done!')
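# The stacking scheme above in miniature: each fold's classifier writes its
# out-of-fold probabilities into the train-row features, while the test rows
# receive the fold-averaged probabilities. A self-contained sketch with
# synthetic data (every name below is local to this example):
def _stacking_sketch():
    from sklearn.datasets import make_classification
    X_all, y_all = make_classification(n_samples=300, n_classes=3,
                                       n_informative=6, random_state=0)
    X_tr, y_tr, X_te = X_all[:200], y_all[:200], X_all[200:]
    n, num_class = 5, 3
    stack = np.zeros((len(X_tr), num_class))
    stack_te = np.zeros((len(X_te), num_class))
    for tr, va in KFold(n_splits=n).split(X_tr):
        clf = LogisticRegression(max_iter=1000).fit(X_tr[tr], y_tr[tr])
        stack[va] = clf.predict_proba(X_tr[va])  # out-of-fold train features
        stack_te += clf.predict_proba(X_te) / n  # fold-averaged test features
    return np.vstack([stack, stack_te])          # same layout as stack_all above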
def train(train_test_data_path, length):
    """Train the xgboost ensemble on the stacked meta-features.

    :param length: length of train data
    """
    df_lr, df_dm, df_dbow = load_model_data()
    data, dic = load_csv(train_test_data_path)
    # seed = 10
    TR = length
    df_sub = pd.DataFrame()
    df_sub['Id'] = data.iloc[TR:]['Id']
    df = pd.concat([df_lr, df_dbow, df_dm], axis=1)
    print("----" * 5 + "Training xgb-ens start" + "----" * 5)
    print(df.columns)
    for lb in ['Education', 'Age', 'Gender']:
        # for lb in ['Gender']:
        print("-----" * 5 + lb + "-----" * 5)
        num_class = len(pd.value_counts(dic[lb][:length]))
        X = df.iloc[:TR]
        y = dic[lb][:TR]
        X_te = df.iloc[TR:]
        y_te = dic[lb][TR:]
        print('{} train value_counts'.format(lb))
        print(pd.value_counts(dic[lb][:length]))
        print('{} test value_counts'.format(lb))
        print(pd.value_counts(dic[lb][length:]))
        esr = 100    # early-stopping rounds
        evals = 1    # verbose_eval interval
        n_trees = 10
        # Resolve the label name to its per-label config object (Education,
        # Age or Gender), each exposing a `params` dict for xgboost.
        lb_2_model = eval(lb)
        params = lb_2_model.params
        params['num_class'] = num_class
        dtrain = xgb.DMatrix(X, y)
        dvalid = xgb.DMatrix(X_te, y_te)
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        bst = xgb.train(params, dtrain, n_trees, evals=watchlist,
                        feval=xgb_acc_score, maximize=True,
                        early_stopping_rounds=esr, verbose_eval=evals)
        df_sub[lb] = np.argmax(bst.predict(dvalid), axis=1)
    df_sub = df_sub[['Age', 'Education', 'Gender', 'Id']]
    results_path = data_path + 'tfidf_dm_dbow_.csv'
    df_sub.to_csv(results_path, index=None, encoding='utf8')
    ensAcc(results_path, test_set_path)
    print("----" * 5 + "Training xgb-ens finished" + "----" * 5)
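# `eval(lb)` above resolves 'Education'/'Age'/'Gender' to per-label config
# objects defined elsewhere in the project. A minimal sketch of what such an
# object could look like, assuming it only needs to expose a `params` dict for
# xgb.train (the parameter values here are hypothetical, not the project's):
from types import SimpleNamespace

_example_label_config = SimpleNamespace(params={
    'objective': 'multi:softprob',  # multi-class training, probability output
    'eta': 0.1,                     # hypothetical learning rate
    'max_depth': 6,                 # hypothetical tree depth
    'eval_metric': 'mlogloss',
})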
def train_doc2vec(csv_path, length, model_type='dbow'):
    """Train a doc2vec model on the combined train/test corpus.

    :param csv_path: path of train and test data in .csv format
    :param length: the length of train data
    :param model_type: 'dbow' or 'dm'
    :return: saves <model_type>_d2v.model under ./data/
    """
    if model_type == 'dm':
        epoch = 3
        d2v = Doc2Vec(dm=1, vector_size=300, negative=5, hs=0, min_count=3,
                      window=10, sample=1e-5, workers=8, alpha=0.025,
                      min_alpha=0.025)
    else:
        epoch = 2
        d2v = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=3,
                      window=30, sample=1e-5, workers=8, alpha=0.025,
                      min_alpha=0.025)
    doc_list = DocList(csv_path[:-4] + '_num.txt')
    d2v.build_vocab(doc_list)
    _, dic = load_csv(csv_path)
    print(datetime.now(), model_type + ' model training!')
    for i in range(epoch):
        print(datetime.now(), 'pass: {}/{}'.format(i + 1, epoch))
        doc_list = DocList(csv_path[:-4] + '_num.txt')
        d2v.train(doc_list, total_examples=d2v.corpus_count, epochs=d2v.epochs)
        # Probe the current document vectors with a quick cross-validated
        # logistic regression on each label.
        X_d2v = np.array([d2v.docvecs[idx] for idx in range(length)])
        for j in ['Education', 'Age', 'Gender']:
            clf = LogisticRegression(C=3, solver='saga', dual=False,
                                     max_iter=10000)
            scores = cross_val_score(clf, X_d2v, dic[j][:length], cv=5)
            print(model_type, j, scores, np.mean(scores))
    d2v.save(data_path + model_type + '_d2v.model')
    print(datetime.now(), model_type + ' model save done!')
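# Usage sketch: load a saved model and embed an unseen, pre-tokenized query.
# infer_vector is gensim's standard inference API; the model path mirrors the
# save above (this assumes train_doc2vec has already been run):
def embed_query(words, model_type='dbow'):
    d2v = Doc2Vec.load(data_path + model_type + '_d2v.model')
    return d2v.infer_vector(words)  # 300-dim vector, matching vector_size above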
def add_number_to_query(csv_path):
    """Segment each query with jieba and write a numbered corpus file.

    :param csv_path: train_data_path
    :return: the total number of rows in train_data.
    """
    print("----" * 5 + "Add number to query: Start" + "----" * 5)
    train_data, _ = load_csv(csv_path)
    f = codecs.open(csv_path[:-4] + '_num.txt', 'w', encoding='utf8')
    for i, queries in enumerate(train_data['Query']):
        words = []
        for query in queries.split('\t'):  # queries are tab-separated
            words.extend(list(jieba.cut(query)))
        # One document per line; the trailing newline is required so that
        # DocList.__iter__ (below) sees one document per line.
        f.write('_*{} {}\n'.format(i, ' '.join(words)))
    f.close()
    print("----" * 5 + "Add number to query: Done" + "----" * 5)
    return len(train_data)
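# The numbered corpus format in miniature: the '_*<i>' prefix carries the
# document index, which DocList.__iter__ (below) recovers via words[0][2:].
# A tiny round-trip check on a synthetic line (no file I/O involved):
def _corpus_format_sketch():
    line = '_*7 brown fox jumps'
    words = line.split()
    assert int(words[0][2:]) == 7                  # recovered document tag
    assert words[1:] == ['brown', 'fox', 'jumps']  # recovered tokens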
def train_dbow_dm_nn(feat: str, length: int, csv_path: str):
    """Stack a small feed-forward net on top of the doc2vec vectors.

    :param feat: one of ['dbow_d2v', 'dm_d2v']
    :param length: length of train data
    :param csv_path: path of the combined train/test csv (the original body
        referenced an undefined ``csv_path``; it is an explicit parameter here)
    :return: None
    """
    print(datetime.now(), 'training ' + feat + ' stack start!')
    train_data, dic = load_csv(csv_path)
    d2v = Doc2Vec.load(data_path + feat + '.model')
    doc_vec = np.array([d2v.docvecs[i] for i in range(len(train_data))])
    df_stack = pd.DataFrame(index=range(len(train_data)))
    TR = length
    n = 5
    X_tr = doc_vec[:TR]
    X_te = doc_vec[TR:]
    for lb in ['Education', 'Age', 'Gender']:
        num_class = len(pd.value_counts(dic[lb]))
        y_tr = dic[lb][:TR]
        y_te = dic[lb][TR:]
        stack = np.zeros((X_tr.shape[0], num_class))
        stack_te = np.zeros((X_te.shape[0], num_class))
        kf = KFold(n_splits=n)
        for k, (tr, va) in enumerate(kf.split(X_tr, y_tr)):
            print('{} stack:{}/{} {}'.format(datetime.now(), k + 1, n, lb))
            nb_classes = num_class
            X_train = X_tr[tr].astype('float32')
            y_train = y_tr[tr].astype(int)  # np.int is deprecated; plain int works
            X_test = X_te.astype('float32')
            y_test = y_te.astype(int)
            Y_train = np_utils.to_categorical(y_train, nb_classes)
            Y_test = np_utils.to_categorical(y_test, nb_classes)
            model = Sequential()
            model.add(Dense(300, input_shape=(X_train.shape[1], )))
            model.add(Dropout(0.1))
            model.add(Activation('tanh'))
            model.add(Dense(nb_classes))
            model.add(Activation('softmax'))
            model.compile(loss='categorical_crossentropy',
                          optimizer='adadelta', metrics=['accuracy'])
            model.fit(X_train, Y_train, shuffle=True, batch_size=128,
                      epochs=35, verbose=2, validation_data=(X_test, Y_test))
            y_pred_va = model.predict(X_tr[va])
            y_pred_te = model.predict(X_te)
            print('va acc:', myAcc(y_tr[va], y_pred_va))
            print('te acc:', myAcc(y_te, y_pred_te))
            stack[va] += y_pred_va
            stack_te += y_pred_te
        stack_te /= n
        stack_all = np.vstack([stack, stack_te])
        for l in range(stack_all.shape[1]):
            df_stack['{}_{}_{}'.format(feat, lb, l)] = stack_all[:, l]
    df_stack.to_csv(data_path + feat + '_stack.csv', encoding='utf8', index=None)
    print(datetime.now(), 'training ' + feat + ' stack done!')
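# myAcc is a project-local helper used throughout; judging from its call sites
# (true labels vs. a probability matrix), it is assumed to be argmax accuracy.
# A minimal sketch consistent with those call sites:
def _my_acc_sketch(y_true, y_pred_proba):
    return np.mean(np.asarray(y_true) == np.argmax(y_pred_proba, axis=1))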
class DocList(object):
    """Iterate over a numbered corpus file produced by add_number_to_query,
    yielding (words, tags) documents for doc2vec training. The class header,
    constructor and namedtuple are reconstructed from the call sites above;
    only __iter__ appeared in the original excerpt."""

    SentimentDocument = namedtuple('SentimentDocument', ['words', 'tags'])

    def __init__(self, f):
        self.f = f  # path of the '*_num.txt' corpus file

    def __iter__(self):
        for line in codecs.open(self.f, encoding='utf8'):
            words = line.split()
            tags = [int(words[0][2:])]  # strip '_*' to recover the doc index
            words = words[1:]
            yield self.SentimentDocument(words, tags)


if __name__ == "__main__":
    train_data_path = './data/train_data.csv'
    test_data_path = './data/test_data.csv'
    start = datetime.now()
    print("Start training dm and dbow model!")
    add_number_to_query(train_data_path)
    train_data, _ = load_csv(train_data_path)
    X_train = tokenize_data(train_data)
    tfidf_matrix_dump(X_train, 'tfidf_train.pkl')
    new_train_data_path = fill_train_data(train_data, X_train)
    length, train_test_data_path, train_test_data = concat_train_test(
        new_train_data_path, test_data_path)
    add_number_to_query(train_test_data_path)
    X_train_test = tokenize_data(train_test_data)
    tfidf_matrix_dump(X_train_test, 'tfidf_train_test.pkl')
    train_doc2vec(train_test_data_path, length, 'dbow')
    train_doc2vec(train_test_data_path, length, 'dm')
    end = datetime.now()
    print("Training dm and dbow models done! Duration: {}s".format(
        (end - start).seconds))