def stg2_XTR(fobj, train_x, hold_x, sub_test, ntrees=100):
    """Fit the stage-2 extra-trees model and write the submission CSV files.

    Rows of ``train_x`` whose last column exceeds the threshold ``e`` are
    dropped, the model is fitted on a fixed subset of feature columns, the
    mean absolute error is reported for the training rows and for
    ``hold_x``, and the predictions for ``sub_test`` are written to two CSV
    files under ``param.CURRENT_FOLDER``.

    Args:
        fobj: Open, writable log file object.
        train_x: 2-D array with the label in the last column.  Column 27 is
            read as the label after filtering, so the array is presumably
            28 columns wide -- TODO confirm against the caller.
        hold_x: 2-D hold-out array with the label in the last column.
        sub_test: 2-D array holding the submission features.
        ntrees: Number of trees handed to the estimator.

    Returns:
        None.  Output goes to ``fobj``, stdout and the two CSV files.
    """
    e = 40      # rows with a label above this value are excluded from training
    njobs = 2
    r = 0.3     # only appears in the log text and in the output file names
    arr = [0, 1, 2, 21, 22, 23, 24, 25, 26]

    hold_test_x = hold_x[:, arr]
    hold_test_y = hold_x[:, -1]
    sub_test = sub_test[:, arr]

    # Every log record ends with a newline so consecutive records do not
    # run together in the log file.
    fobj.write('Trees: %r\n' % ntrees)
    print('Stage 2: Exp Threshold - %r' % e)
    fobj.write('Stage 2: Exp Threshold - %r\n' % e)

    c_train = train_x[train_x[:, -1] <= e]
    # Same feature columns as ``arr`` plus column 27, which is used as label.
    c_arr = arr + [27]
    c_train = c_train[:, c_arr]
    c_train_y = c_train[:, -1]
    c_train_x = c_train[:, :-1]

    suffix = '_e%s_r%s_t_%s%s.csv' % (e, r, ntrees, len(arr))
    kaggle_file = 'Kaggle_XTR_Ankit' + suffix
    df_kaggle_file = 'Kaggle_df_XTR_Ankit' + suffix

    print('Stage 2: Rate - %r' % r)
    fobj.write('XTR exp: %r rate: %r\n' % (e, r))

    # ``etr`` is defined elsewhere in the project (presumably scikit-learn's
    # ExtraTreesRegressor -- confirm against the file's imports).
    rf_model = etr(n_estimators=ntrees, n_jobs=njobs)
    est = rf_model.fit(c_train_x, c_train_y)

    train_y_pred = est.predict(c_train_x)
    error = mt.mean_absolute_error(c_train_y, train_y_pred)
    print('XTR Train Error: %r\n' % (error,))
    fobj.write('XTR Train-2 Error: %r\n' % (error,))

    hold_y_pred = est.predict(hold_test_x)
    error = mt.mean_absolute_error(hold_test_y, hold_y_pred)
    print('XTR 20 percent Hold Error: %r\n' % (error,))
    fobj.write('XTR 20 percent Hold Error: %r\n' % (error,))

    print('Test Size:%r' % len(sub_test))
    test_y_pred = est.predict(sub_test)

    # ``np.int`` was only an alias of the builtin and no longer exists in
    # current NumPy releases; the builtin works on every version.
    id_col = np.arange(1, len(test_y_pred) + 1, dtype=int)
    all_data = np.column_stack((id_col, test_y_pred))

    raw_path = os.path.join(param.CURRENT_FOLDER, kaggle_file)
    np.savetxt(raw_path, all_data, delimiter=',', header='Id,Expected')
    # np.savetxt prefixes the header line with '# ', so reading it back
    # verbatim would name the first column '# Id'.  Replace the header row
    # with the intended column names before writing the cleaned-up copy.
    df = pd.read_csv(raw_path, header=0, names=['Id', 'Expected'])
    df.to_csv(os.path.join(param.CURRENT_FOLDER, df_kaggle_file),
              header=True, index=False)
def stg2_XTR(fobj, train_x, hold_x, sub_test, ntrees=100):
    """Fit the stage-2 extra-trees model and write the submission CSV files.

    Training rows whose label (last column of ``train_x``) is above the
    threshold ``e`` are discarded.  The model is fitted on a fixed list of
    feature columns, its mean absolute error is logged for the training
    rows and for ``hold_x``, and predictions for ``sub_test`` are saved as
    two CSV files inside ``param.CURRENT_FOLDER``.

    Args:
        fobj: Open, writable log file object.
        train_x: 2-D array with the label in the last column.  Column 27 is
            read as the label after filtering, so the array is presumably
            28 columns wide -- TODO confirm against the caller.
        hold_x: 2-D hold-out array with the label in the last column.
        sub_test: 2-D array holding the submission features.
        ntrees: Number of trees handed to the estimator.

    Returns:
        None.  Output goes to ``fobj``, stdout and the two CSV files.
    """
    e = 40      # rows with a label above this value are excluded from training
    njobs = 2
    r = 0.3     # only appears in the log text and in the output file names
    arr = [0, 1, 2, 21, 22, 23, 24, 25, 26]

    hold_test_x = hold_x[:, arr]
    hold_test_y = hold_x[:, -1]
    sub_test = sub_test[:, arr]

    # Each log record is newline-terminated so that records stay on
    # separate lines in the log file.
    fobj.write('Trees: %r\n' % ntrees)
    print('Stage 2: Exp Threshold - %r' % e)
    fobj.write('Stage 2: Exp Threshold - %r\n' % e)

    c_train = train_x[train_x[:, -1] <= e]
    # Same feature columns as ``arr`` plus column 27, which is used as label.
    c_arr = arr + [27]
    c_train = c_train[:, c_arr]
    c_train_y = c_train[:, -1]
    c_train_x = c_train[:, :-1]

    suffix = '_e%s_r%s_t_%s%s.csv' % (e, r, ntrees, len(arr))
    kaggle_file = 'Kaggle_XTR_Ankit' + suffix
    df_kaggle_file = 'Kaggle_df_XTR_Ankit' + suffix

    print('Stage 2: Rate - %r' % r)
    fobj.write('XTR exp: %r rate: %r\n' % (e, r))

    # ``etr`` is defined elsewhere in the project (presumably scikit-learn's
    # ExtraTreesRegressor -- confirm against the file's imports).
    rf_model = etr(n_estimators=ntrees, n_jobs=njobs)
    est = rf_model.fit(c_train_x, c_train_y)

    train_y_pred = est.predict(c_train_x)
    error = mt.mean_absolute_error(c_train_y, train_y_pred)
    print('XTR Train Error: %r\n' % (error,))
    fobj.write('XTR Train-2 Error: %r\n' % (error,))

    hold_y_pred = est.predict(hold_test_x)
    error = mt.mean_absolute_error(hold_test_y, hold_y_pred)
    print('XTR 20 percent Hold Error: %r\n' % (error,))
    fobj.write('XTR 20 percent Hold Error: %r\n' % (error,))

    print('Test Size:%r' % len(sub_test))
    test_y_pred = est.predict(sub_test)

    # ``np.int`` was only an alias of the builtin and no longer exists in
    # current NumPy releases; the builtin works on every version.
    id_col = np.arange(1, len(test_y_pred) + 1, dtype=int)
    all_data = np.column_stack((id_col, test_y_pred))

    raw_path = os.path.join(param.CURRENT_FOLDER, kaggle_file)
    np.savetxt(raw_path, all_data, delimiter=',', header='Id,Expected')
    # np.savetxt prefixes the header line with '# ', so reading it back
    # verbatim would name the first column '# Id'.  Replace the header row
    # with the intended column names before writing the cleaned-up copy.
    df = pd.read_csv(raw_path, header=0, names=['Id', 'Expected'])
    df.to_csv(os.path.join(param.CURRENT_FOLDER, df_kaggle_file),
              header=True, index=False)
def runXTR(train_1_x, test_x, hold_out, sub_test, fobj):
    """Fit the stage-1 extra-trees model and predict on three data sets.

    Args:
        train_1_x: 2-D training array with the label in the last column.
            Rows whose label exceeds ``exp`` are dropped before fitting.
        test_x: Feature matrix of the validation set.
        hold_out: Feature matrix of the hold-out set.
        sub_test: Feature matrix of the submission set.
        fobj: Open, writable log file object.

    Returns:
        Tuple ``(est, valid_y_pred, hold_y, sub_y)``: the fitted estimator
        and its predictions for ``test_x``, ``hold_out`` and ``sub_test``.
    """
    ntrees = 100
    exp = 40    # rows with a label above this value are excluded from training

    c_train_1_x = train_1_x[train_1_x[:, -1] <= exp]
    c_train_y = c_train_1_x[:, -1]
    c_train_x = c_train_1_x[:, :-1]

    # The estimator is always built with n_jobs=-1; the former local
    # ``njobs = 2`` was never read and contradicted this value, so it has
    # been removed.  ``etr`` is defined elsewhere in the project.
    rf_model = etr(n_estimators=ntrees, n_jobs=-1)
    est = rf_model.fit(c_train_x, c_train_y)

    train_y_pred = est.predict(c_train_x)
    error = mt.mean_absolute_error(c_train_y, train_y_pred)
    print('XTR Train-1 Error: %r\n' % (error,))
    fobj.write('XTR Train-1 Error: %r\n' % (error,))

    valid_y_pred = est.predict(test_x)
    hold_y = est.predict(hold_out)
    sub_y = est.predict(sub_test)
    return est, valid_y_pred, hold_y, sub_y
def runXTR(train_1_x, test_x, hold_out, sub_test, fobj):
    """Train the stage-1 extra-trees regressor and return its predictions.

    Args:
        train_1_x: 2-D training array with the label in the last column.
            Rows whose label exceeds ``exp`` are dropped before fitting.
        test_x: Feature matrix of the validation set.
        hold_out: Feature matrix of the hold-out set.
        sub_test: Feature matrix of the submission set.
        fobj: Open, writable log file object.

    Returns:
        Tuple ``(est, valid_y_pred, hold_y, sub_y)``: the fitted estimator
        and its predictions for ``test_x``, ``hold_out`` and ``sub_test``.
    """
    ntrees = 100
    exp = 40    # rows with a label above this value are excluded from training

    c_train_1_x = train_1_x[train_1_x[:, -1] <= exp]
    c_train_y = c_train_1_x[:, -1]
    c_train_x = c_train_1_x[:, :-1]

    # The estimator is always built with n_jobs=-1; the former local
    # ``njobs = 2`` was never read and contradicted this value, so it has
    # been removed.  ``etr`` is defined elsewhere in the project.
    rf_model = etr(n_estimators=ntrees, n_jobs=-1)
    est = rf_model.fit(c_train_x, c_train_y)

    train_y_pred = est.predict(c_train_x)
    error = mt.mean_absolute_error(c_train_y, train_y_pred)
    print('XTR Train-1 Error: %r\n' % (error,))
    fobj.write('XTR Train-1 Error: %r\n' % (error,))

    valid_y_pred = est.predict(test_x)
    hold_y = est.predict(hold_out)
    sub_y = est.predict(sub_test)
    return est, valid_y_pred, hold_y, sub_y
def driver(comp_mat, pps_mth='ORIGINAL', exp_thresh=(40,), regression_mth=(),
           test_size=0.2, f_selection=False, n_features=3, reg_methods=()):
    """Evaluate an extra-trees model per label threshold and write a solution.

    The data is split into training and "unconstrained" test records.  For
    every threshold in ``exp_thresh`` the training rows whose label exceeds
    the threshold are dropped, the remainder is split again, and the model
    is scored over the configured hyper-parameter grid.  Afterwards the
    last fitted estimator predicts the normalised test matrix and the
    result is written as CSV.

    Args:
        comp_mat: 2-D array with the label in the last column.
        pps_mth: Not used by this function.
        exp_thresh: Iterable of label thresholds to evaluate.
        regression_mth: Not used by this function.
        test_size: Fraction passed to both train/test splits.
        f_selection: Forwarded to ``rf_regressor``.
        n_features: Forwarded to ``rf_regressor``.
        reg_methods: Not used by this function.

    Returns:
        None.  Results go to the module-level log ``fobj``, stdout and the
        solution CSV files.

    NOTE(review): the source this was recovered from had lost its
    indentation; the nesting below (in particular the position of ``break``
    and of the solution-writing section) is the most plausible reading and
    should be confirmed against the original file.
    """
    global fobj
    fobj.write('Total Number of records: %r\n' % (len(comp_mat)))

    # Step 1: split train and (unconstrained) test records.
    train_x, u_test_x, train_y, u_test_y = cv.train_test_split(
        comp_mat, comp_mat[:, -1], random_state=52, test_size=test_size)
    # Remove the label column from the test feature matrix.
    u_test_x = u_test_x[:, :-1]

    for exp in exp_thresh:
        fobj.write('\n\nExpected Threshold Limit: %r\n' % (exp))

        # Step 2: prune the training rows to the required label threshold.
        c_train_x = train_x[train_x[:, -1] <= exp]
        c_train_y = c_train_x[:, -1]
        c_train_x = c_train_x[:, :-1]

        # Split the constrained rows into train and constrained test parts.
        ttrain_x, ttest_x, ttrain_y, ttest_y = cv.train_test_split(
            c_train_x, c_train_y, random_state=32, test_size=test_size)
        fobj.write('Total Number of Constrained test records: %r\n'
                   % len(ttest_y))
        fobj.write('Total Number of Unconstrained test records: %r\n'
                   % len(u_test_y))
        fobj.write('Total Constrained Training Records: %r\n' % len(ttrain_y))

        print('Fitting Model Measurements')
        trees_array = [100]
        kf = cv.KFold(len(ttrain_x), n_folds=5)
        st_time = time.time()
        max_depth = [None]
        min_samples_leaf = [1]

        for dpth in max_depth:
            fobj.write('\n\nmax _depth: %r\n' % (dpth))
            print('max _depth: %r' % (dpth,))
            for msl in min_samples_leaf:
                fobj.write('\nmin_samples_leaf: %r\n' % (msl))
                print('min_samples_leaf: %r' % (msl,))
                for ntrees in trees_array:
                    valid_acc = []
                    test_acc = []
                    train_acc = []
                    est_arr = []
                    unconst_acc = []
                    fold = 0
                    for train_idx, test_idx in kf:
                        fobj.write('\nfold: %r\n' % (fold))
                        # ``msl`` comes from the min_samples_leaf grid and
                        # is logged under that name, so it is passed as
                        # min_samples_leaf (it was previously handed to
                        # min_samples_split).
                        rf_model = etr(n_estimators=ntrees, max_depth=dpth,
                                       min_samples_leaf=msl)
                        vacc, tacc, est = rf_regressor(
                            rf_model,
                            ttrain_x[train_idx], ttrain_y[train_idx],
                            ttrain_x[test_idx], ttrain_y[test_idx],
                            f_selection=f_selection,
                            n_features=n_features)
                        valid_acc.append(vacc)
                        train_acc.append(tacc)
                        est_arr.append(est)
                        fobj.write('Train Size:%r\n' % (len(train_idx)))
                        fobj.write('Validation Size:%r\n' % (len(test_idx)))
                        fobj.write('Validation Error: %r\n' % (vacc))
                        fobj.write('Train Error: %r\n' % (tacc))
                        fobj.write('Constrained Test data size:%r\n'
                                   % (len(ttest_x)))
                        test_acc.append(mt.mean_absolute_error(
                            ttest_y, est.predict(ttest_x)))
                        fobj.write('Constrained Test Error for fold: %r\n'
                                   % (test_acc[-1]))
                        unconst_acc.append(mt.mean_absolute_error(
                            u_test_y, est.predict(u_test_x)))
                        fobj.write('Complete test accuracy:%r\n'
                                   % (unconst_acc[-1]))
                        fold += 1
                        # Only the first fold is evaluated.
                        break

                    et_time = time.time()
                    fobj.write('..Statistics..\n')
                    fobj.write('Expected Threshold Limit: %r\n' % (exp))
                    fobj.write('Trees:%r\n' % ntrees)
                    fobj.write('Train Average: %r\n' % (np.mean(train_acc)))
                    fobj.write('Validation Average: %r\n'
                               % (np.mean(valid_acc)))
                    fobj.write('Constrained Test Average: %r\n'
                               % (np.mean(test_acc)))
                    fobj.write('Unconstrained Test Avg: %r\n'
                               % (np.mean(unconst_acc)))
                    fobj.write('Total Time Taken: %r mins\n'
                               % ((et_time - st_time) / 60))
                    # Print to console
                    print('..Statistics..\n')
                    print('Expected Threshold Limit: %r\n' % (exp))
                    print('Trees:%r\n' % ntrees)
                    print('Train Average: %r\n' % (np.mean(train_acc)))
                    print('Validation Average: %r\n' % (np.mean(valid_acc)))
                    print('Constrained Test Average: %r\n'
                          % (np.mean(test_acc)))
                    print('Unconstrained Test Avg: %r\n'
                          % (np.mean(unconst_acc)))
                    print('Total Time Taken: %r mins\n'
                          % ((et_time - st_time) / 60))

    print('Generating Solution Result:')
    # Raw strings keep the backslashes literal; the resulting path values
    # are the same as before, but '\U' no longer breaks Python 3 parsing.
    test_path = r'C:\Users\saura\Desktop\ML_Project\data\norm_test_fmat.csv'
    raw_solution_path = (
        r'C:\Users\saura\Desktop\ML_Project\ensemble_data'
        r'\mytest_solution.csv')
    final_solution_path = (
        r'C:\Users\saura\Desktop\ML_Project\ensemble_data'
        r'\final_solution.csv')

    test_x = np.loadtxt(test_path, delimiter=',')
    print('Test Size:%r' % len(test_x))
    # ``est`` is the estimator from the last fitted fold.
    test_y_pred = est.predict(test_x)
    # ``np.int`` no longer exists in current NumPy; use the builtin.
    id_col = np.arange(1, len(test_y_pred) + 1, dtype=int)
    all_data = np.column_stack((id_col, test_y_pred))
    np.savetxt(raw_solution_path, all_data, delimiter=',',
               header='Id,Expected')
    # np.savetxt prefixes the header with '# ', which would otherwise end
    # up as a '# Id' column name in the final file.
    df = pd.read_csv(raw_solution_path, header=0, names=['Id', 'Expected'])
    df.to_csv(final_solution_path, header=True, index=False)
def driver(comp_mat, pps_mth='ORIGINAL', exp_thresh=(25,), regression_mth=(),
           test_size=0.2, f_selection=False, n_features=3, reg_methods=(),
           remove_mp=False):
    """Cross-validate an extra-trees model and write a solution CSV.

    The data is split into training and "unconstrained" test records, the
    training rows whose label exceeds ``exp`` are dropped, and the model is
    scored with 5-fold cross-validation for every combination in the
    hyper-parameter grid.  Afterwards the last fitted estimator predicts
    the normalised test matrix and the result is written as CSV.

    Args:
        comp_mat: 2-D array with the label in the last column.
        pps_mth: Not used by this function.
        exp_thresh: Not used by this function; the threshold is fixed to
            40 in the body.
        regression_mth: Not used by this function.
        test_size: Fraction passed to both train/test splits.
        f_selection: Forwarded to ``rf_regressor``.
        n_features: Forwarded to ``rf_regressor``.
        reg_methods: Not used by this function.
        remove_mp: When true, the second-to-last column of ``comp_mat`` is
            removed while the label column is kept.

    Returns:
        None.  Results go to the module-level log ``fobj``, stdout and the
        solution CSV files.

    NOTE(review): the source this was recovered from had lost its
    indentation; the nesting below is the most plausible reading and should
    be confirmed against the original file.
    """
    global fobj
    fobj.write('Total Number of records: %r\n' % (len(comp_mat)))

    # Remove the Marshall-Palmer result column (second to last), keeping
    # the label as the last column.
    if remove_mp:
        print('Before Removing label size:  %s' % (np.shape(comp_mat),))
        y = comp_mat[:, -1]
        print('Before Removing label size:  %s' % (np.shape(comp_mat),))
        comp_mat = comp_mat[:, :-2]
        print('Before Appending label size:  %s' % (np.shape(comp_mat),))
        comp_mat = np.column_stack((comp_mat, y))
        print('After appending label %s' % (np.shape(comp_mat),))

    # Step 1: split train and (unconstrained) test records.
    train_x, u_test_x, train_y, u_test_y = cv.train_test_split(
        comp_mat, comp_mat[:, -1], random_state=52, test_size=test_size)
    # Remove the label column from the test feature matrix.
    u_test_x = u_test_x[:, :-1]

    exp = 40
    fobj.write('\n\nExpected Threshold Limit: %r\n' % (exp))

    # Step 2: prune the training rows to the required label threshold.
    c_train_x = train_x[train_x[:, -1] <= exp]
    c_train_y = c_train_x[:, -1]
    c_train_x = c_train_x[:, :-1]

    # Split the constrained rows into train and constrained test parts.
    ttrain_x, ttest_x, ttrain_y, ttest_y = cv.train_test_split(
        c_train_x, c_train_y, random_state=32, test_size=test_size)
    fobj.write('Total Number of Constrained test records: %r\n'
               % len(ttest_y))
    fobj.write('Total Number of Unconstrained test records: %r\n'
               % len(u_test_y))
    fobj.write('Total Constrained Training Records: %r\n' % len(ttrain_y))

    print('Fitting Model Measurements')
    trees_array = [100]
    kf = cv.KFold(len(ttrain_x), n_folds=5)
    st_time = time.time()
    l_rate = [0.3]
    max_depth = [4, 6]
    min_samples_leaf = [3, 5, 9, 17]

    for rate in l_rate:
        # ``rate`` is only logged; it is not passed to the estimator.
        fobj.write('\n\nLearning Rate: %r\n' % (rate))
        print('\n\nLearning Rate: %r\n' % (rate,))
        # The estimator below needs a depth and a leaf size, but the loops
        # binding ``dpth`` and ``msl`` were missing, so the first fold
        # failed with a NameError.  Iterate over the grids defined above.
        for dpth in max_depth:
            fobj.write('\n\nmax_depth: %r\n' % (dpth))
            print('max_depth: %r' % (dpth,))
            for msl in min_samples_leaf:
                fobj.write('\nmin_samples_leaf: %r\n' % (msl))
                print('min_samples_leaf: %r' % (msl,))
                for ntrees in trees_array:
                    valid_acc = []
                    test_acc = []
                    train_acc = []
                    est_arr = []
                    unconst_acc = []
                    fold = 0
                    for train_idx, test_idx in kf:
                        fobj.write('\nfold: %r\n' % (fold))
                        print('\nfold: %r\n' % (fold,))
                        # ``msl`` comes from the min_samples_leaf grid, so
                        # it is passed under that parameter name.
                        rf_model = etr(n_estimators=ntrees, max_depth=dpth,
                                       min_samples_leaf=msl)
                        vacc, tacc, est = rf_regressor(
                            rf_model,
                            ttrain_x[train_idx], ttrain_y[train_idx],
                            ttrain_x[test_idx], ttrain_y[test_idx],
                            f_selection=f_selection,
                            n_features=n_features)
                        valid_acc.append(vacc)
                        train_acc.append(tacc)
                        est_arr.append(est)
                        fobj.write('Train Size:%r\n' % (len(train_idx)))
                        print('Train Size:%r\n' % (len(train_idx)))
                        fobj.write('Validation Size:%r\n' % (len(test_idx)))
                        fobj.write('Validation Error: %r\n' % (vacc))
                        print('Validation Error: %r\n' % (vacc,))
                        fobj.write('Train Error: %r\n' % (tacc))
                        fobj.write('Constrained Test data size:%r\n'
                                   % (len(ttest_x)))
                        test_acc.append(mt.mean_absolute_error(
                            ttest_y, est.predict(ttest_x)))
                        fobj.write('Constrained Test Error for fold: %r\n'
                                   % (test_acc[-1]))
                        print('Constrained Test Error for fold: %r\n'
                              % (test_acc[-1],))
                        y_res = est.predict(u_test_x)
                        unconst_acc.append(
                            mt.mean_absolute_error(u_test_y, y_res))
                        fobj.write('Complete test accuracy:%r\n'
                                   % (unconst_acc[-1]))
                        print('Complete test accuracy:%r\n'
                              % (unconst_acc[-1],))
                        fold += 1

                    et_time = time.time()
                    fobj.write('..Statistics..\n')
                    fobj.write('Expected Threshold Limit: %r\n' % (exp))
                    fobj.write('Trees:%r\n' % ntrees)
                    fobj.write('Train Average: %r\n' % (np.mean(train_acc)))
                    fobj.write('Validation Average: %r\n'
                               % (np.mean(valid_acc)))
                    fobj.write('Constrained Test Average: %r\n'
                               % (np.mean(test_acc)))
                    fobj.write('Unconstrained Test Avg: %r\n'
                               % (np.mean(unconst_acc)))
                    fobj.write('Total Time Taken: %r mins\n'
                               % ((et_time - st_time) / 60))
                    # Print to console
                    print('..Statistics..\n')
                    print('Expected Threshold Limit: %r\n' % (exp))
                    print('Trees:%r\n' % ntrees)
                    print('Train Average: %r\n' % (np.mean(train_acc)))
                    print('Validation Average: %r\n' % (np.mean(valid_acc)))
                    print('Constrained Test Average: %r\n'
                          % (np.mean(test_acc)))
                    print('Unconstrained Test Avg: %r\n'
                          % (np.mean(unconst_acc)))
                    print('Total Time Taken: %r mins\n'
                          % ((et_time - st_time) / 60))

    print('Generating Solution Result:')
    # Raw strings keep the backslashes literal; the resulting path values
    # are the same as before, but '\U' no longer breaks Python 3 parsing.
    raw_solution_path = (
        r'C:\Users\saura\Desktop\ML_Project\ensemble_data'
        r'\gbr_mytest_solution.csv')
    final_solution_path = (
        r'C:\Users\saura\Desktop\ML_Project\ensemble_data'
        r'\gbr_final_solution.csv')

    test_x = np.loadtxt('ensemble_data\\norm_test_fmat.csv', delimiter=',')
    print('Test Size:%r' % len(test_x))
    # ``est`` is the estimator from the last fitted fold.
    test_y_pred = est.predict(test_x)
    # ``np.int`` no longer exists in current NumPy; use the builtin.
    id_col = np.arange(1, len(test_y_pred) + 1, dtype=int)
    all_data = np.column_stack((id_col, test_y_pred))
    np.savetxt(raw_solution_path, all_data, delimiter=',',
               header='Id,Expected')
    # np.savetxt prefixes the header with '# ', which would otherwise end
    # up as a '# Id' column name in the final file.
    df = pd.read_csv(raw_solution_path, header=0, names=['Id', 'Expected'])
    df.to_csv(final_solution_path, header=True, index=False)
print("CV score: {:<8.8f}".format(mean_squared_error(oof_gbr_263, target))) # ExtraTreesRegressor: extremely randomized trees regression folds = KFold(n_splits=5, shuffle=True, random_state=13) oof_etr_263 = np.zeros(train_shape) predictions_etr_263 = np.zeros(len(X_test_263)) for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_263, y_train)): print("fold n°{}".format(fold_ + 1)) tr_x = X_train_263[trn_idx] tr_y = y_train[trn_idx] etr_263 = etr(n_estimators=1000, max_depth=8, min_samples_leaf=12, min_weight_fraction_leaf=0.0, max_features=0.4, verbose=1, n_jobs=-1) etr_263.fit(tr_x, tr_y) oof_etr_263[val_idx] = etr_263.predict(X_train_263[val_idx]) predictions_etr_263 += etr_263.predict(X_test_263) / folds.n_splits print("CV score: {:<8.8f}".format(mean_squared_error(oof_etr_263, target))) train_stack2 = np.vstack( [oof_lgb_263, oof_xgb_263, oof_gbr_263, oof_rfr_263, oof_etr_263]).transpose() # transpose() swaps the positions of the x, y, z axes, i.e. the order of the array's indices test_stack2 = np.vstack([
if (Env_var.get('Pca') == 1): pca = PCA(n_components = 300).fit(train_data_X) train_data_X = pca.transform(train_data_X) test_data_X = pca.transform(test_data_X) ###############################--------Model Setup--------############################### ann_regressor = KerasRegressor(build_fn=ann_model, epochs=30, batch_size=10, verbose=1) xgb_regressor = xgb(learning_rate = 0.0825, min_child_weight = 1, max_depth = 7, subsample = 0.8, verbose = 10, random_state = 2017, n_jobs = -1, eval_metric = "rmse") rfr_regressor = rfr(max_features = 0.9, min_samples_leaf = 50) gbr_regressor = gbr(n_estimators = 200, verbose = 5, learning_rate = 0.08, max_depth = 7, max_features = 0.5, min_samples_leaf = 50, subsample = 0.8, random_state = 2017) etr_regressor = etr(n_estimators = 200, verbose = 10, max_depth = 7, min_samples_leaf = 100, max_features = 0.9, min_impurity_split = 100, random_state = 2017) lr_regressor = lr() svr_regressor = svr(verbose = 10) ensemble = Ensemble(n_folds = 5,stacker = lr_regressor,base_models = [ann_regressor, xgb_regressor, rfr_regressor, gbr_regressor, etr_regressor]) ###############################--------Grid Search--------############################### if (Env_var.get('GridSearch') == 1): if (Env_var.get('Model') == 'ann'): dropout_rate = [0.0, 0.001, 0.01]