def gen_model():
    """Train an Encase ensemble of five MyXGB models on the entire v2.5
    feature set (no held-out split — intended for online submission) and
    persist the fitted ensemble with dill."""
    with open('../data/features_all_v2.5.pkl', 'rb') as my_input:
        all_pid = dill.load(my_input)
        all_feature = dill.load(my_input)
        all_label = dill.load(my_input)

    all_pid = np.array(all_pid)
    all_feature = np.array(all_feature)
    all_label = np.array(all_label)
    print('all feature shape: {0}'.format(all_feature.shape))

    clf_final = Encase([MyXGB() for _ in range(5)])
    print('start training')
    clf_final.fit(all_feature, all_label)
    print('done training')

    # Score on the training data itself — a sanity check, not validation.
    pred = clf_final.predict(all_feature)
    print(MyEval.F1Score3(pred, all_label))

    with open('../model/v2.5_xgb5_all_v2.pkl', 'wb') as fout:
        dill.dump(clf_final, fout)
    print('save model done')
def XGBcv(all_pid, all_feature, all_label, subsample, max_depth,
          colsample_bytree, min_child_weight):
    """5-fold stratified cross-validation of MyXGB with the given
    hyper-parameters.

    Parameters
    ----------
    all_pid : sequence of record ids (unused except for conversion;
        kept for interface compatibility).
    all_feature : 2-D array-like of features.
    all_label : 1-D array-like of class labels.
    subsample, max_depth, colsample_bytree, min_child_weight :
        hyper-parameters forwarded to MyXGB.

    Returns
    -------
    float : mean F1 across the 5 folds (suitable as a tuning objective).
    """
    all_pid = np.array(all_pid)
    all_feature = np.array(all_feature)
    all_label = np.array(all_label)

    f1_scores = []
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    for train_index, test_index in kf.split(all_feature, all_label):
        clf = MyXGB(subsample=subsample,
                    max_depth=max_depth,
                    colsample_bytree=colsample_bytree,
                    min_child_weight=min_child_weight)
        clf.fit(all_feature[train_index], all_label[train_index])
        pred = clf.predict(all_feature[test_index])
        f1_scores.append(MyEval.F1Score3(pred, all_label[test_index], False))

    # Compute the mean once instead of twice.
    avg_f1 = np.mean(f1_scores)
    print('\n\nAvg F1: ', avg_f1)
    return avg_f1
def gen_model():
    """Train a single MyXGB on the full v1.6 feature set and save it.

    NOTE(review): this redefines ``gen_model`` declared earlier in the
    file; at import time only the later definition survives — confirm
    which version is intended to be callable.
    """
    with open('../data/features_all_v1.6.pkl', 'rb') as my_input:
        all_pid = dill.load(my_input)
        all_feature = dill.load(my_input)
        all_label = dill.load(my_input)

    all_pid = np.array(all_pid)
    all_feature = np.array(all_feature)
    all_label = np.array(all_label)

    clf = MyXGB()
    clf.fit(all_feature, all_label)

    # Training-set F1 — sanity check only, not a validation score.
    print(MyEval.F1Score3(clf.predict(all_feature), all_label))

    with open('../model/v1.6_xgb.pkl', 'wb') as fout:
        dill.dump(clf, fout)
def TestEncase(all_pid, all_feature, all_label):
    """Evaluate an Encase ensemble (5 x MyXGB) with 5-fold stratified CV,
    then bag the five per-fold ensembles into one model, report its
    full-data F1, and persist it.

    Parameters
    ----------
    all_pid : sequence of record ids aligned with the features/labels.
    all_feature : 2-D array-like of features.
    all_label : 1-D array-like of class labels.

    Side effects: prints scores and dumps the bagged model to
    ../../tmp_model/v2.5_v0.1/ with the average F1 in the filename.
    """
    all_feature = np.array(all_feature)
    all_label = np.array(all_label)
    all_pid = np.array(all_pid)

    clf_final_list = []
    F1_list = []
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    print('all feature shape: {0}'.format(all_feature.shape))
    for train_index, test_index in kf.split(all_feature, all_label):
        train_data = all_feature[train_index]
        train_label = all_label[train_index]
        test_data = all_feature[test_index]
        test_label = all_label[test_index]

        clf_final = Encase([MyXGB() for _ in range(5)])
        clf_final.fit(train_data, train_label)
        pred = clf_final.predict(test_data)
        F1_list.append(MyEval.F1Score3(pred, test_label, False))
        clf_final_list.append(clf_final)

    avg_f1 = np.mean(F1_list)
    print('\n\nAvg F1: ', avg_f1)

    # Bag the per-fold ensembles. The full-data score below is optimistic:
    # every sample was in the training split of 4 of the 5 fold models.
    clf_final_final = Encase(clf_final_list)
    pred = clf_final_final.predict(all_feature)
    print(MyEval.F1Score3(pred, all_label))

    with open('../../tmp_model/v2.5_v0.1/v2.5_v0.1_' + str(avg_f1) + '.pkl',
              'wb') as fout:
        dill.dump(clf_final_final, fout)
def TestXGB(fout, original_pid, original_label, all_pid, all_feature, all_label):
    """Recording-grouped 5-fold stratified CV for a single MyXGB.

    Folds are built over the *original* recordings, then expanded to the
    (possibly segmented/augmented) rows in ``all_*`` by matching the pid
    prefix before '_', so no recording leaks across the train/test split.

    Parameters
    ----------
    fout : open writable file; receives one 'i_fold, f1' line per fold
        and a final 'avg, ...' summary line.
    original_pid, original_label : per-recording ids and labels.
    all_pid, all_feature, all_label : per-segment ids ('pid_seg'),
        features, and labels.
    """
    original_pid = np.array(original_pid)
    original_label = np.array(original_label)
    all_feature = np.array(all_feature)
    all_label = np.array(all_label)
    all_pid = np.array(all_pid)

    F1_list_set = []
    F1_list_seq = []
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    i_fold = 1
    # StratifiedKFold only uses X for its length; passing the labels for
    # both arguments is harmless (stratification is driven by y).
    for original_train_index, original_test_index in kf.split(
            original_label, original_label):
        original_train_pid = set(original_pid[original_train_index])
        original_test_pid = set(original_pid[original_test_index])

        # Map recording-level folds onto segment-level rows by pid prefix.
        train_index = []
        test_index = []
        for ii in range(len(all_pid)):
            ii_pid = all_pid[ii].split('_')[0]
            if ii_pid in original_train_pid:
                train_index.append(ii)
            elif ii_pid in original_test_pid:
                test_index.append(ii)
            else:
                print('wrong')
        train_index = np.array(train_index)
        test_index = np.array(test_index)

        train_data = all_feature[train_index]
        train_label = all_label[train_index]
        train_pid = all_pid[train_index]
        test_data = all_feature[test_index]
        test_label = all_label[test_index]
        test_pid = all_pid[test_index]

        clf = MyXGB()
        clf.fit(train_data, train_label)
        pred = clf.predict(test_data)
        pred_train = clf.predict(train_data)

        # Collapse segment-level predictions back to sequence level.
        _, pred_train_seq = shrink_set_to_seq(train_pid, pred_train)
        _, train_label_seq = shrink_set_to_seq(train_pid, train_label)
        print('pred_train')
        MyEval.F1Score3(pred_train, train_label)
        print('pred_train_seq')
        MyEval.F1Score3(pred_train_seq, train_label_seq)

        _, pred_seq = shrink_set_to_seq(test_pid, pred)
        _, test_label_seq = shrink_set_to_seq(test_pid, test_label)
        print('\n pred')
        F1_list_set.append(MyEval.F1Score3(pred, test_label))
        print('pred_seq')
        f1_pred = MyEval.F1Score3(pred_seq, test_label_seq)
        F1_list_seq.append(f1_pred)
        print('=====================================')

        fout.write('{0}, {1} \n'.format(i_fold, f1_pred))
        i_fold += 1

    avg_f1 = np.mean(F1_list_seq)
    print('\n\nAvg F1: ', avg_f1)
    # BUG FIX: the summary line previously wrote f1_pred (the LAST fold's
    # score) instead of the computed average.
    fout.write('avg, {0} \n'.format(avg_f1))
feat_deep_centerwave = np.array(dill.load(my_input)) print('feat_deep_centerwave shape: ', feat_deep_centerwave.shape) with open('../data/feat_resnet.pkl', 'rb') as my_input: feat_resnet = np.array(dill.load(my_input)) print('feat_resnet shape: ', feat_resnet.shape) # k-fold cross validation all_feature = np.c_[all_feature, feat_deep_centerwave, feat_resnet] all_label = np.array(all_label) train_data = all_feature train_label = all_label clf = MyXGB() clf.fit(train_data, train_label) print('train done') imp_scores = clf.get_importance() feat_num = all_feature.shape[1] imp_scores_key_num = set([int(k[1:]) for k in imp_scores.keys()]) print(feat_num) print(len(imp_scores)) pred_train = clf.predict(train_data) MyEval.F1Score3(pred_train, train_label) with open('../../stat/feat_imp_v2.5_v0.1_v0.1.csv', 'w') as fout: for i in range(1,feat_num+1): if i in imp_scores_key_num:
def TestExp(all_pid, all_feature, all_label, method, i_iter):
    """Run one 5-fold stratified-CV experiment for the paper's method
    comparison and append per-fold results to
    ../../stat/res_exp_for_paper.csv.

    Parameters
    ----------
    all_pid : array-like of record ids (aligned; not used in scoring).
    all_feature : 2-D numpy array of features (columns 0-557 expected).
    all_label : 1-D array-like of class labels.
    method : one of the keys in the dispatch table below; selects the
        feature-column subset and the classifier.
    i_iter : repetition index, written into the CSV.

    Raises
    ------
    ValueError : if ``method`` is not a known experiment name.
    """
    def _make_encase():
        return Encase([MyXGB() for _ in range(5)])

    def _make_xgb():
        return MyXGB(n_estimators=100, num_round=50)

    def _make_lr():
        return MyLR()

    # method -> (column subset or None for all columns, classifier factory).
    # Column ranges mirror the original branches: E = expert features
    # 258-557, EC = 0-557, ECD = everything — TODO confirm naming.
    configs = {
        'ENCASE_E': (list(range(258, 558)), _make_encase),
        'ENCASE_EC': (list(range(0, 558)), _make_encase),
        'ENCASE_ECD': (None, _make_encase),
        'XGBoost_E': (list(range(258, 558)), _make_xgb),
        'XGBoost_EC': (list(range(0, 558)), _make_xgb),
        'XGBoost_ECD': (None, _make_xgb),
        'LR_E': (list(range(258, 558)), _make_lr),
        'LR_EC': (list(range(0, 558)), _make_lr),
        'LR_ECD': (None, _make_lr),
        'SampleEn': ([300, 301, 302, 303], _make_lr),
        'CDF': ([304, 305, 306], _make_lr),
        'MAD': ([307], _make_lr),
        'Variability': ([346, 347, 348, 349, 350], _make_lr),
    }
    if method not in configs:
        # BUG FIX: an unrecognized method previously fell through every
        # branch and raised a bare NameError on the undefined `pred`.
        raise ValueError('unknown method: {0}'.format(method))
    selected_cols, make_clf = configs[method]

    kf = StratifiedKFold(n_splits=5, shuffle=True)
    i_fold = 1
    print('all feature shape: {0}'.format(all_feature.shape))
    for train_index, test_index in kf.split(all_feature, all_label):
        train_data = all_feature[train_index]
        train_label = all_label[train_index]
        test_data = all_feature[test_index]
        test_label = all_label[test_index]

        if selected_cols is not None:
            train_data = train_data[:, selected_cols]
            test_data = test_data[:, selected_cols]

        clf_final = make_clf()
        clf_final.fit(train_data, train_label)
        pred = clf_final.predict(test_data)

        res = MyEval.F14Exp(pred, test_label)
        print(res)
        with open('../../stat/res_exp_for_paper.csv', 'a') as fout:
            fout.write('{0},{1},{2},{3},{4},{5},{6},{7}\n'.format(
                method, i_iter, i_fold,
                res[0], res[1], res[2], res[3], res[4]))
        i_fold += 1