def train_xgb():
    import xgboost as xgb

    # Set the xgboost parameters.
    params = {}
    params['objective'] = 'binary:logistic'
    params['eval_metric'] = 'logloss'
    params['eta'] = 0.02
    params['max_depth'] = 4

    # Handcrafted (scaled) features plus the deep-model feature file.
    feature1 = DataUtil.load_matrix('../feature_train/feature_min_max.txt')
    feature2 = pd.read_csv('../feature_train/feature_deepnet.csv').values
    print(feature1.shape)
    feature = np.concatenate([feature1, feature2], axis=1)
    print(feature.shape)
    label = np.load('../data/train_label.npy')
    x_train, x_valid, y_train, y_valid = train_test_split(feature, label)

    feature1_test = DataUtil.load_matrix('../feature_test/feature_min_max.txt')
    feature2_test = pd.read_csv('../feature_test/feature_deepnet.csv').values
    feature_test = np.concatenate([feature1_test, feature2_test], axis=1)

    d_train = xgb.DMatrix(x_train, label=y_train)
    d_valid = xgb.DMatrix(x_valid, label=y_valid)
    d_test = xgb.DMatrix(feature_test)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    bst = xgb.train(params, d_train, 5000, watchlist,
                    early_stopping_rounds=50, verbose_eval=10)
    pd.DataFrame(bst.predict(d_test)).to_csv('../result/result.csv')
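# Minimal sketch of the DataUtil helper assumed throughout this file (the
# real implementation lives elsewhere in the repo). Assumption, not the
# repo's actual code: features are stored as plain whitespace-separated
# text, so numpy's loadtxt/savetxt round-trip them.
class DataUtilSketch(object):

    @staticmethod
    def load_matrix(path):
        # Return a 2-D array even for single-column files, so callers can
        # np.concatenate(..., axis=1) without reshaping.
        m = np.loadtxt(path)
        return m.reshape(-1, 1) if m.ndim == 1 else m

    @staticmethod
    def save_matrix(path, m, mode):
        with open(path, mode) as f:
            np.savetxt(f, np.atleast_2d(np.asarray(m, dtype=float)))

    @staticmethod
    def save_vector(path, v, mode):
        with open(path, mode) as f:
            np.savetxt(f, np.asarray(v, dtype=float).reshape(-1, 1))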
def train():
    """Train the model."""
    Train_left = np.load('./data/X_train_question1.npy')
    Train_right = np.load('./data/X_train_question2.npy')
    Train_label = np.load('./data/train_label.npy')
    Train_label = Train_label.astype(np.int64)
    statistics_feature = DataUtil.load_matrix('./feature_train/feature_min_max.txt')
    sta2 = pd.read_csv('./feature_train/feature_deepnet.csv').values

    # Stratified folds over the training labels.
    skf = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=27)
    for k, (tr, va) in enumerate(skf.split(Train_left, Train_label)):
        model = esim()
        print(' stack:{}/{}'.format(k + 1, N_FOLD))
        X_train_left = Train_left[tr]
        X_train_right = Train_right[tr]
        Y_train = Train_label[tr]
        train_statistics = statistics_feature[tr]
        train_sta2 = sta2[tr]
        val_sta2 = sta2[va]
        val_statistics = statistics_feature[va]
        X_val_left = Train_left[va]
        X_val_right = Train_right[va]
        Y_val = Train_label[va]

        print("Train...")
        # Keep only the best fold weights (lowest validation loss).
        checkpoint = ModelCheckpoint(
            './model_file/CIKM_dec_Attention_classify_{}.hdf5'.format(k),
            monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        early = EarlyStopping(monitor='val_loss', mode='min', patience=10)
        callbacks_list = [checkpoint, early]
        model.fit([X_train_left, X_train_right, train_statistics, train_sta2],
                  Y_train,
                  batch_size=128,
                  epochs=N_EPOCH,
                  verbose=1,
                  validation_data=([X_val_left, X_val_right, val_statistics,
                                    val_sta2], Y_val),
                  callbacks=callbacks_list)
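# Hypothetical stand-in for the esim() / decomposable_attention() builders
# used in this file; the real models are defined elsewhere in the repo.
# Everything here (MAX_LEN, VOCAB_SIZE, EMBED_DIM, N_STAT, N_STA2, the
# BiLSTM encoder) is an assumption that only mirrors the four-input,
# sigmoid-output interface that train()/test() expect.
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, concatenate

MAX_LEN, VOCAB_SIZE, EMBED_DIM = 30, 20000, 300  # assumed sizes
N_STAT, N_STA2 = 64, 32                          # assumed feature widths

def esim_stub():
    left = Input(shape=(MAX_LEN,))
    right = Input(shape=(MAX_LEN,))
    stats = Input(shape=(N_STAT,))
    sta2 = Input(shape=(N_STA2,))
    embed = Embedding(VOCAB_SIZE, EMBED_DIM)  # shared embedding
    encode = Bidirectional(LSTM(64))          # shared sentence encoder
    merged = concatenate([encode(embed(left)), encode(embed(right)),
                          stats, sta2])
    out = Dense(1, activation='sigmoid')(merged)
    model = Model(inputs=[left, right, stats, sta2], outputs=out)
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model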
def train():
    """Train the model."""
    Train_left = np.load('../data/X_train_question1.npy')
    Train_right = np.load('../data/X_train_question2.npy')
    Train_label = np.load('../data/train_label.npy')
    Train_label = Train_label.astype(np.int64)
    # Out-of-fold predictions on the training set, saved for stacking.
    stack_tr = np.zeros((Train_label.shape[0],))
    statistics_feature = DataUtil.load_matrix('../feature_train/feature_min_max.txt')
    sta2 = pd.read_csv('../feature_train/feature_deepnet.csv').values

    skf = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=27)
    for k, (tr, va) in enumerate(skf.split(Train_left, Train_label)):
        model = decomposable_attention()
        print(' stack:{}/{}'.format(k + 1, N_FOLD))
        X_train_left = Train_left[tr]
        X_train_right = Train_right[tr]
        Y_train = Train_label[tr]
        train_statistics = statistics_feature[tr]
        train_sta = sta2[tr]
        val_sta2 = sta2[va]
        val_statistics = statistics_feature[va]
        X_val_left = Train_left[va]
        X_val_right = Train_right[va]
        Y_val = Train_label[va]

        # Split the fold's training part again so early stopping monitors a
        # held-out set that is disjoint from the fold's validation part.
        (X_train_left1, X_train_left2, X_train_right1, X_train_right2,
         train_statistics1, train_statistics2, train_sta1, train_sta2,
         Y_train1, Y_train2) = train_test_split(
            X_train_left, X_train_right, train_statistics, train_sta, Y_train,
            test_size=0.2, stratify=Y_train)

        print("Train...")
        checkpoint = ModelCheckpoint(
            '../model_file/attenion1_{}.hdf5'.format(k),
            monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        early = EarlyStopping(monitor='val_loss', mode='min', patience=5)
        callbacks_list = [checkpoint, early]
        model.fit([X_train_left1, X_train_right1, train_statistics1, train_sta1],
                  Y_train1,
                  batch_size=128,
                  epochs=N_EPOCH,
                  verbose=1,
                  validation_data=([X_train_left2, X_train_right2,
                                    train_statistics2, train_sta2], Y_train2),
                  callbacks=callbacks_list)

        # Reload the best checkpoint and score the fold's validation split.
        model.load_weights('../model_file/attenion1_{}.hdf5'.format(k))
        val_pre = model.predict(
            [X_val_left, X_val_right, val_statistics, val_sta2]).flatten()
        print(val_pre.shape, val_pre)
        stack_tr[va] += val_pre
        print('log_loss', log_loss(Y_val, val_pre))

    df_train_result = pd.DataFrame({'Score': stack_tr})
    df_train_result.to_csv('../result/attention1_train.txt',
                           header=False, index=False)
def test():
    """Predict scores."""
    test_left = np.load('../data/test_left.npy')
    test_right = np.load('../data/test_right.npy')
    statistics_feature = DataUtil.load_matrix('../feature_test/feature_min_max.txt')
    sta2 = pd.read_csv('../feature_test/feature_deepnet.csv').values
    result_np = np.zeros(len(test_right))
    # merged_model is assumed to be the deepnet model built at module level.
    # Average the predictions of the five fold checkpoints.
    for i in range(5):
        merged_model.load_weights('../model_file/deepnet1_{}.hdf5'.format(i))
        score = merged_model.predict([test_left, test_right,
                                      statistics_feature, sta2])
        score = np.reshape(score, (len(score),))
        result_np += score
    result_df = pd.DataFrame({"score": result_np / 5})
    result_df.to_csv('../result/deepnet1_test.txt', index=False, header=False)
def test():
    """Predict scores."""
    version = 'clean_stops_number_punciton'
    model = esim()
    test_left = np.load('./data/test_left.npy')
    test_right = np.load('./data/test_right.npy')
    statistics_feature = DataUtil.load_matrix('./feature_test/feature_min_max.txt')
    sta2 = pd.read_csv('./feature_test/feature_deepnet.csv').values
    result_np = np.zeros(len(test_right))
    # Average the predictions of the N_FOLD fold checkpoints.
    for i in range(N_FOLD):
        model.load_weights('./model_file/CIKM_dec_Attention_classify_{}.hdf5'.format(i))
        score = model.predict([test_left, test_right, statistics_feature, sta2])
        score = np.reshape(score, (len(score),))
        result_np += score
    result_df = pd.DataFrame({"score": result_np / N_FOLD})
    # Timestamp the submission file so runs do not overwrite each other.
    import datetime
    unique_flag = datetime.datetime.now().strftime('%m_%d_%H_%M')
    result_df.to_csv('./result/submit_' + version + '_{}.txt'.format(unique_flag),
                     index=False, header=False)
def test():
    """Predict scores."""
    version = 'esim'
    # model = decomposable_attention()
    model = esim()
    test_left = np.load('../data/test_left.npy')
    test_right = np.load('../data/test_right.npy')
    statistics_feature = DataUtil.load_matrix('../feature_test/feature_min_max.txt')
    sta2 = pd.read_csv('../feature_test/feature_deepnet.csv').values
    result_np = np.zeros(len(test_right))
    # Average the predictions of the N_FOLD fold checkpoints.
    for i in range(N_FOLD):
        model.load_weights('../model_file/esim_{}.hdf5'.format(i))
        score = model.predict([test_left, test_right, statistics_feature, sta2])
        score = np.reshape(score, (len(score),))
        result_np += score
    result_df = pd.DataFrame({"score": result_np / N_FOLD})
    result_df.to_csv('../result/esim1_test.txt', index=False, header=False)
def save_feature(df, step):
    # powerful_word_oside_feature = df.apply(extract_powerful_word_oside, axis=1)
    # DataUtil.save_matrix('../feature_{}/powerful_word_oside_feature.txt'.format(step), powerful_word_oside_feature, 'w')
    PowerfulWordDoubleSideRate_feature = df.apply(extract_PowerfulWordDoubleSideRate, axis=1)
    DataUtil.save_matrix('../feature_{}/PowerfulWordDoubleSideRate_feature.txt'.format(step),
                         PowerfulWordDoubleSideRate_feature, 'w')
    # PowerfulWordOneSideRate_feautre = df.apply(extract_PowerfulWordOneSideRate, axis=1)
    # DataUtil.save_matrix('../feature_{}/PowerfulWordOneSideRate_feautre.txt'.format(step), PowerfulWordOneSideRate_feautre, 'w')
    powerful_word_dside_feature = df.apply(extract_powerful_word_dside, axis=1).values
    DataUtil.save_matrix('../feature_{}/powerful_word_dside_feature.txt'.format(step),
                         powerful_word_dside_feature, 'w')
    ngramDistance_feature = df.apply(extract_ngramDistance, axis=1).values
    DataUtil.save_matrix('../feature_{}/ngramDistance_feature_feature.txt'.format(step),
                         ngramDistance_feature, 'w')
    NgramDiceDistance_feature = df.apply(extract_NgramDiceDistance, axis=1).values
    DataUtil.save_matrix('../feature_{}/NgramDiceDistance_feature.txt'.format(step),
                         NgramDiceDistance_feature, 'w')
    NgramJaccardCoef_feature = df.apply(extract_NgramJaccardCoef, axis=1).values
    DataUtil.save_matrix('../feature_{}/NgramJaccardCoef_feature.txt'.format(step),
                         NgramJaccardCoef_feature, 'w')
    Distance_feature = df.apply(extract_edit_Distance, axis=1)
    DataUtil.save_matrix('../feature_{}/Distance_feature.txt'.format(step),
                         Distance_feature, 'w')
    no_feature = df.apply(extract_no, axis=1)
    DataUtil.save_matrix('../feature_{}/feature_no.txt'.format(step), no_feature, 'w')

    # word-match feature
    word_match = df.apply(word_match_share, axis=1)
    DataUtil.save_vector('../feature_{}/word_match.txt'.format(step), word_match, 'w')

    # tf-idf word share
    tf_idf_word_share_feature = df.apply(tf_idf_word_share, axis=1)
    DataUtil.save_vector('../feature_{}/tf_idf_word_share_feature.txt'.format(step),
                         tf_idf_word_share_feature, 'w')

    # length features
    print("start")
    len_feature = df.apply(len_word_sentence_feature, axis=1).values
    lendiff_feature = df.apply(lengthdiff, axis=1)
    lendiffrate_feature = df.apply(LengthDiffRate, axis=1)
    DataUtil.save_matrix('../feature_{}/len_feature.txt'.format(step), len_feature, 'w')
    DataUtil.save_matrix('../feature_{}/lendiff_feature.txt'.format(step), lendiff_feature, 'w')
    DataUtil.save_matrix('../feature_{}/lendiffrate_feature.txt'.format(step),
                         lendiffrate_feature, 'w')

    tfidf_feature = df.apply(extract_tfidf_feature, axis=1)
    DataUtil.save_matrix('../feature_{}/tfidf_feature.txt'.format(step), tfidf_feature, 'w')

    # dul_num feature
    dul_num = df.apply(extract_dul_num, axis=1)
    DataUtil.save_matrix('../feature_{}/dul_num.txt'.format(step), dul_num, 'w')
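# A minimal sketch (an assumption, not the repo's exact implementation) of
# the word_match_share feature applied above: the fraction of tokens the two
# questions share. The column names 'question1'/'question2' are assumed.
def word_match_share_sketch(row):
    q1 = set(str(row['question1']).lower().split())
    q2 = set(str(row['question2']).lower().split())
    if not q1 or not q2:
        return 0.0
    return 2.0 * len(q1 & q2) / (len(q1) + len(q2))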
def save_feature(step):
    # paths
    NgramDiceDistance_feature_path = '../feature_{}/NgramDiceDistance_feature.txt'.format(step)
    NgramJaccardCoef_feature_path = '../feature_{}/NgramJaccardCoef_feature.txt'.format(step)
    Distance_feature_path = '../feature_{}/Distance_feature.txt'.format(step)
    no_feature_path = '../feature_{}/feature_no.txt'.format(step)
    word_match_path = '../feature_{}/word_match.txt'.format(step)
    tf_idf_word_share_feature_path = '../feature_{}/tf_idf_word_share_feature.txt'.format(step)
    len_feature_path = '../feature_{}/len_feature.txt'.format(step)
    lendiff_feature_path = '../feature_{}/lendiff_feature.txt'.format(step)
    lendiffrate_feature_path = '../feature_{}/lendiffrate_feature.txt'.format(step)
    tfidf_feature_path = '../feature_{}/tfidf_feature.txt'.format(step)
    ngramDistance_feature_path = '../feature_{}/ngramDistance_feature_feature.txt'.format(step)
    powerful_word_dside_feature_path = '../feature_{}/powerful_word_dside_feature.txt'.format(step)
    powerful_word_oside_feature_path = '../feature_{}/powerful_word_oside_feature.txt'.format(step)
    PowerfulWordDoubleSideRate_feature_path = '../feature_{}/PowerfulWordDoubleSideRate_feature.txt'.format(step)
    PowerfulWordOneSideRate_feautre_path = '../feature_{}/PowerfulWordOneSideRate_feautre.txt'.format(step)
    dul_num_path = '../feature_{}/dul_num.txt'.format(step)

    # load features
    powerful_word_dside_feature = DataUtil.load_matrix(powerful_word_dside_feature_path)
    PowerfulWordDoubleSideRate_feature = DataUtil.load_matrix(PowerfulWordDoubleSideRate_feature_path)
    no_feature = DataUtil.load_matrix(no_feature_path)
    word_match_feature = DataUtil.load_matrix(word_match_path)
    ngramDistance_feature = DataUtil.load_matrix(ngramDistance_feature_path)
    tf_idf_word_share_feature = DataUtil.load_matrix(tf_idf_word_share_feature_path)
    dul_num = DataUtil.load_matrix(dul_num_path)
    dul_num = preprocessing.scale(dul_num)
    NgramDiceDistance_feature = DataUtil.load_matrix(NgramDiceDistance_feature_path)
    NgramJaccardCoef_feature = DataUtil.load_matrix(NgramJaccardCoef_feature_path)
    # Distance_feature = DataUtil.load_matrix(Distance_feature_path)
    len_feature = DataUtil.load_matrix(len_feature_path)
    lendiff_feature = DataUtil.load_matrix(lendiff_feature_path)
    lendiffrate_feature = DataUtil.load_matrix(lendiffrate_feature_path)
    tfidf_feature = DataUtil.load_matrix(tfidf_feature_path)
    tfidf_feature = np.nan_to_num(tfidf_feature)

    # The train w2vec file carries one more leading non-feature column than
    # the test file, hence the different slice index.
    if step == 'train':
        cut_index = 3
    else:
        cut_index = 2
    train_distance_feature = pd.read_csv(
        '../feature_{}/w2vec_features_scale.csv'.format(step), encoding='gbk')
    train_distance_feature = train_distance_feature.fillna(value=0)
    train_distance_feature = train_distance_feature.iloc[:, cut_index:]
    train_distance_feature.to_csv('../feature_{}/feature_deepnet.csv'.format(step),
                                  index=False)

    comb_feature = pd.read_csv('../feature_{}/comb.csv'.format(step), encoding='gbk')
    print('comb', comb_feature.shape)
    print('distance_feature', train_distance_feature.shape)

    feature = np.concatenate(
        (comb_feature, ngramDistance_feature, lendiffrate_feature,
         lendiff_feature, len_feature, tfidf_feature,
         powerful_word_dside_feature, NgramJaccardCoef_feature,
         NgramDiceDistance_feature, dul_num, no_feature, word_match_feature,
         tf_idf_word_share_feature),
        axis=1)
    DataUtil.save_matrix('../feature_{}/feature.txt'.format(step), feature, 'w')
if __name__ == '__main__':
    # save_feature('train')
    save_feature('test')
    train_feature = DataUtil.load_matrix('../feature_train/feature.txt')
    test_feature = DataUtil.load_matrix('../feature_test/feature.txt')
    print(train_feature.shape)
    # Fit one scaler on train and test together so both sides share the same
    # transform (despite the file names, this is a StandardScaler).
    feature = np.concatenate([train_feature, test_feature], axis=0)
    scale_transfer = preprocessing.StandardScaler()
    scale_transfer_fit = scale_transfer.fit(feature)
    train_feature_min_max = scale_transfer_fit.transform(train_feature)
    test_feature_min_max = scale_transfer_fit.transform(test_feature)
    DataUtil.save_matrix('../feature_train/feature_min_max.txt',
                         train_feature_min_max, 'w')
    DataUtil.save_matrix('../feature_test/feature_min_max.txt',
                         test_feature_min_max, 'w')
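# Assumed end-to-end run order, inferred from the functions above rather
# than stated in the source:
#   1. save_feature(...) for 'train' and 'test'  -> raw feature files
#   2. the __main__ block above                  -> scaled feature_min_max.txt
#   3. train() / test() for each deep model      -> fold weights + score files
#   4. train_xgb()                               -> final stacked submission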