def stg2_GBR(fobj, train_x, hold_x, sub_test, ntrees=100):
    e = 30    # expected-value threshold used to prune training rows
    r = 0.3   # learning rate
    arr = [0, 1, 2, 21, 22, 23, 24, 25, 26]
    hold_test_x = hold_x[:, arr]
    hold_test_y = hold_x[:, -1]
    sub_test = sub_test[:, arr]
    fobj.write('Trees: %r' % ntrees)
    print('Stage 2: Exp Threshold - %r' % e)
    fobj.write('Stage 2: Exp Threshold - %r\n' % e)
    c_train = train_x[train_x[:, -1] <= e]
    c_arr = [0, 1, 2, 21, 22, 23, 24, 25, 26, 27]
    c_train = c_train[:, c_arr]
    c_train_y = c_train[:, -1]
    c_train_x = c_train[:, :-1]
    kaggle_file = ('Kaggle_GBR_Ankit_e' + str(e) + '_r' + str(r) + '_t_' +
                   str(ntrees) + str(len(arr)) + '.csv')
    df_kaggle_file = ('Kaggle_df_GBR_Ankit_e' + str(e) + '_r' + str(r) +
                      '_t_' + str(ntrees) + str(len(arr)) + '.csv')
    print('Stage 2: Rate - %r' % r)
    fobj.write('GBR exp: %r rate: %r' % (e, r))
    rf_model = gbr(n_estimators=ntrees, loss='lad', learning_rate=r, max_depth=6)
    est = rf_model.fit(c_train_x, c_train_y)
    train_y_pred = est.predict(c_train_x)
    error = mt.mean_absolute_error(c_train_y, train_y_pred)
    print('GBR Train Error: %r\n' % error)
    fobj.write('GBR Train-2 Error: %r\n' % error)
    train_y_pred = est.predict(hold_test_x)
    error = mt.mean_absolute_error(hold_test_y, train_y_pred)
    print('GBR 20 percent Hold Error: %r\n' % error)
    fobj.write('GBR 20 percent Hold Error: %r\n' % error)
    print('Test Size:%r' % len(sub_test))
    test_y_pred = est.predict(sub_test)
    id_col = np.arange(1, len(test_y_pred) + 1, dtype=int)
    all_data = np.column_stack((id_col, test_y_pred))
    # np.savetxt prefixes the header with '#' by default; the re-save through
    # pandas below produces a clean 'Id,Expected' header for submission.
    np.savetxt(os.path.join(param.CURRENT_FOLDER, kaggle_file), all_data,
               delimiter=',', header='Id,Expected')
    df = pd.read_csv(os.path.join(param.CURRENT_FOLDER, kaggle_file))
    df.to_csv(os.path.join(param.CURRENT_FOLDER, df_kaggle_file),
              header=True, index=False)

def model_gradientboosting_regressor(X_train, X_test, y_train, y_test):
    # `count` and `independentcols` are assumed to be defined at module level
    model_name = f'model_{count}_gradientboosting_regressor'
    model = gbr()
    model.fit(X_train, y_train)
    model.independentcols = independentcols  # record the feature columns on the model
    score = model.score(X_test, y_test)  # note: score() returns R^2, not accuracy
    print(f'{model_name} accuracy: {score}')
    joblib.dump(model, f'model/{model_name}.joblib')

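# A minimal sketch of reloading a model persisted as above; the filename and
# the feature frame X_new are illustrative, not part of the original code.
import joblib

loaded = joblib.load('model/model_1_gradientboosting_regressor.joblib')
# X_new is assumed to be a DataFrame holding the same feature columns that
# were stored on the model at training time.
preds = loaded.predict(X_new[loaded.independentcols])
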
def regression(self, metric="root_mean_squared_error", folds=10, alphas=[], graph=False): size = 1.3 * self.report_width // 10 models = {} models["Linear regressor"] = lr() models["Lasso regressor"] = lassor() models["Lasso CV regressor"] = lassocvr() models["Ridge regressor"] = rr(alpha=0, normalize=True) models["Ridge CV regressor"] = rcvr(alphas = alphas) models["K nearest neighbors regressor K2u"] = knnr(n_neighbors=2, weights='uniform') models["K nearest neighbors regressor K2d"] = knnr(n_neighbors=2, weights='distance') models["K nearest neighbors regressor K5"] = knnr(n_neighbors=5) models["K nearest neighbors regressor K10"] = knnr(n_neighbors=10) models["SGD regressor"] = sgdr(max_iter=10000, warm_start=True) models["Decision tree regressor"] = dtr() models["Decision tree regressor D3"] = dtr(max_depth=3) models["Random forest regressor"] = rfr() models["Ada boost regressor"] = abr() models["Gradient boost regressor"] = gbr() models["Support vector regressor"] = svr() self.models = models print('\n') print(self.report_width * '*', '\n*') print('* REGRESSION RESULTS - BEFORE PARAMETERS BOOSTING \n*') #kf = StratifiedKFold(n_splits=folds, shuffle=True) kf = KFold(n_splits=folds) results = [] names = [] for model_name in models: cv_scores = -1 * cross_val_score(models[model_name], self.Xt_train, self.yt_train.values.ravel(), cv=kf, scoring=metric) results.append(cv_scores) names.append(model_name) print(self.report_width * '*', '') report = pd.DataFrame({'Regressor': names, 'Score': results}) report['Score (avg)'] = report.Score.apply(lambda x: x.mean()) report['Score (std)'] = report.Score.apply(lambda x: x.std()) report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)'] report.sort_values(by='Score (avg)', inplace=True) report.drop('Score', axis=1, inplace=True) display(report) print('\n') if graph: fig, ax = plt.subplots(figsize=(size, 0.5 * size)) plt.title('Regressor Comparison') #ax = fig.add_subplot(111) plt.boxplot(results) ax.set_xticklabels(names) plt.xticks(rotation=45) plt.subplots_adjust(hspace=0.0) plt.show() return None
def fit(self, previous_data: pd.DataFrame):
    # last column holds the target, the rest are features
    labels = previous_data[previous_data.columns[-1]]
    table = previous_data[previous_data.columns[:-1]]
    X_train, X_test, y_train, y_test = train_test_split(
        table, labels, test_size=0.33, random_state=42)
    self.predictor = gbr()
    self.predictor.fit(X_train, y_train)  # was fit(X_train, y_test): train labels must match train rows
    # note: f1_score expects class labels, so this evaluation only works if
    # the regressor's predictions are already discrete binary values
    self.f1_score = f1_score(y_test, self.predictor.predict(X_test))
    self.is_fitted = True
    return self

def runGBR(train_1_x, test_x, hold_out, sub_test, fobj):
    ntrees = 100
    rate = 0.3
    max_depth = 6
    exp = 30
    # keep only rows whose target (last column) is within the threshold
    c_train_1_x = train_1_x[train_1_x[:, -1] <= exp]
    c_train_y = c_train_1_x[:, -1]
    c_train_x = c_train_1_x[:, :-1]
    rf_model = gbr(n_estimators=ntrees, loss='lad', learning_rate=rate,
                   max_depth=max_depth)
    est = rf_model.fit(c_train_x, c_train_y)
    train_y_pred = est.predict(c_train_x)
    error = mt.mean_absolute_error(c_train_y, train_y_pred)
    print('GBR Train Error: %r\n' % error)
    fobj.write('GBR Train-1 Error: %r\n' % error)
    valid_y_pred = est.predict(test_x)
    hold_y = est.predict(hold_out)
    sub_y = est.predict(sub_test)
    return est, valid_y_pred, hold_y, sub_y

def __init__(self, n_estimators=100, max_depth=10, learning_rate=0.01,
             loss='deviance', max_features=None, min_samples_split=2,
             min_samples_leaf=1, verbose=0, fit_intercept=True,
             normalize=False, name='GBC'):
    """Initialize object with the informed hyper-parameter values."""
    # set method's name and paradigm
    super().__init__(name, fit_intercept, normalize)
    # 'deviance' and 'exponential' are classification losses, so `gbr` is
    # presumably bound to GradientBoostingClassifier in this module
    assert loss in ('deviance', 'exponential')
    assert n_estimators >= 1
    assert max_depth >= 2
    assert learning_rate > 0
    assert min_samples_split >= 1
    assert min_samples_leaf >= 1
    self.n_estimators = n_estimators
    self.max_depth = max_depth
    self.learning_rate = learning_rate
    self.min_samples_split = min_samples_split
    self.loss = loss
    self.min_samples_leaf = min_samples_leaf
    self.output_directory = ''
    self.model = gbr(n_estimators=n_estimators, max_depth=max_depth,
                     learning_rate=learning_rate,
                     min_samples_split=min_samples_split,
                     min_samples_leaf=min_samples_leaf,
                     max_features=max_features, verbose=verbose, loss=loss)

def __init__(self, n_estimators=100, max_depth=2, learning_rate=0.01,
             loss='ls', max_features=None, min_samples_split=2,
             min_samples_leaf=3, alpha=.09, verbose=0, name='GBR-Reg',
             fit_intercept=True, normalize=False):
    """Initialize object with the informed hyper-parameter values."""
    # set method's name and paradigm
    super().__init__(name, fit_intercept, normalize)
    self.n_estimators = n_estimators
    self.max_depth = max_depth
    self.learning_rate = learning_rate
    self.min_samples_split = min_samples_split
    self.loss = loss
    self.min_samples_leaf = min_samples_leaf
    self.output_directory = ''
    # note: the `alpha` argument is accepted but never forwarded to the model
    self.model = gbr(n_estimators=n_estimators, max_depth=max_depth,
                     learning_rate=learning_rate,
                     min_samples_split=min_samples_split,
                     min_samples_leaf=min_samples_leaf,
                     max_features=max_features, verbose=verbose, loss=loss)

def driver(comp_mat, pps_mth='ORIGINAL', exp_thresh=[25], regression_mth=[],
           test_size=0.2, f_selection=False, n_features=3, reg_methods=[]):
    global fobj
    fobj.write('Total Number of records: %r\n' % len(comp_mat))
    # step 1: split train and test records
    train_x, u_test_x, train_y, u_test_y = cv.train_test_split(
        comp_mat, comp_mat[:, -1], random_state=52, test_size=test_size)
    # remove the predictor column from the matrix
    u_test_x = u_test_x[:, :-1]
    # for exp in exp_thresh:
    exp = 25
    fobj.write('\n\nExpected Threshold Limit: %r\n' % exp)
    # step 2: prune to the required expected threshold
    c_train_x = train_x[train_x[:, -1] <= exp]
    c_train_y = c_train_x[:, -1]
    c_train_x = c_train_x[:, :-1]
    # split the constrained training set again
    ttrain_x, ttest_x, ttrain_y, ttest_y = cv.train_test_split(
        c_train_x, c_train_y, random_state=32, test_size=test_size)
    fobj.write('Total Number of Constrained test records: %r\n' % len(ttest_y))
    fobj.write('Total Number of Unconstrained test records: %r\n' % len(u_test_y))
    fobj.write('Total Constrained Training Records: %r\n' % len(ttrain_y))
    print('Fitting Model Measurements')
    trees_array = [100]
    kf = cv.KFold(len(ttrain_x), n_folds=5)
    st_time = time.time()
    l_rate = [0.3]
    max_depth = [4, 6]
    min_samples_leaf = [3, 5, 9, 17]
    for rate in l_rate:
        fobj.write('\n\nLearning Rate: %r\n' % rate)
        print('\n\nLearning Rate: %r\n' % rate)
        for ntrees in trees_array:
            valid_acc = []
            test_acc = []
            train_acc = []
            est_arr = []
            unconst_acc = []
            fold = 0
            for train_idx, test_idx in kf:
                fobj.write('\nfold: %r\n' % fold)
                print('\nfold: %r\n' % fold)
                #rf_model = rfr(n_estimators=ntrees, n_jobs=-1)  # random forest regressor
                #rf_model = etr(n_estimators=ntrees, n_jobs=3, bootstrap=False)
                rf_model = gbr(n_estimators=ntrees, loss='lad', learning_rate=rate)
                vacc, tacc, est = rf_regressor(rf_model,
                                               ttrain_x[train_idx], ttrain_y[train_idx],
                                               ttrain_x[test_idx], ttrain_y[test_idx],
                                               f_selection=f_selection,
                                               n_features=n_features)
                valid_acc.append(vacc)
                train_acc.append(tacc)
                est_arr.append(est)
                fobj.write('Train Size:%r\n' % len(train_idx))
                print('Train Size:%r\n' % len(train_idx))
                fobj.write('Validation Size:%r\n' % len(test_idx))
                fobj.write('Validation Error: %r\n' % vacc)
                print('Validation Error: %r\n' % vacc)
                fobj.write('Train Error: %r\n' % tacc)
                fobj.write('Constrained Test data size:%r\n' % len(ttest_x))
                test_acc.append(mt.mean_absolute_error(ttest_y, est.predict(ttest_x)))
                fobj.write('Constrained Test Error for fold: %r\n' % test_acc[-1])
                print('Constrained Test Error for fold: %r\n' % test_acc[-1])
                y_res = est.predict(u_test_x)
                unconst_acc.append(mt.mean_absolute_error(u_test_y, y_res))
                fobj.write('Complete test accuracy:%r\n' % unconst_acc[-1])
                print('Complete test accuracy:%r\n' % unconst_acc[-1])
                fold += 1
                break  # only the first fold is evaluated
            et_time = time.time()
            fobj.write('..Statistics..\n')
            fobj.write('Expected Threshold Limit: %r\n' % exp)
            fobj.write('Trees:%r\n' % ntrees)
            fobj.write('Train Average: %r\n' % np.mean(train_acc))
            fobj.write('Validation Average: %r\n' % np.mean(valid_acc))
            fobj.write('Constrained Test Average: %r\n' % np.mean(test_acc))
            fobj.write('Unconstrained Test Avg: %r\n' % np.mean(unconst_acc))
            fobj.write('Total Time Taken: %r mins\n' % ((et_time - st_time) / 60))
            # print to console
            print('..Statistics..\n')
            print('Expected Threshold Limit: %r\n' % exp)
            print('Trees:%r\n' % ntrees)
            print('Train Average: %r\n' % np.mean(train_acc))
            print('Validation Average: %r\n' % np.mean(valid_acc))
            print('Constrained Test Average: %r\n' % np.mean(test_acc))
            print('Unconstrained Test Avg: %r\n' % np.mean(unconst_acc))
            print('Total Time Taken: %r mins\n' % ((et_time - st_time) / 60))

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
regressors = [
    lr(), bay(), rr(alpha=.5, random_state=0), l(alpha=0.1, random_state=0),
    ll(), knn(), ard(), rfr(random_state=0, n_estimators=100),
    SVR(gamma='scale', kernel='rbf'), rcv(fit_intercept=False),
    en(random_state=0), dtr(random_state=0), ada(random_state=0),
    gbr(random_state=0)
]
print('unscaled:', br)  # `br` is assumed to be a separator string defined elsewhere
for reg in regressors:
    reg.fit(X_train, y_train)
    rmse, name = get_error(reg, X_test, y_test)
    name = reg.__class__.__name__
    print(name + '(rmse):', end=' ')
    print(rmse)
print()
print('scaled:', br)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
# transform (not fit_transform) the test set so it reuses the training
# statistics; refitting the scaler on test data leaks information
X_test_std = scaler.transform(X_test)
for reg in regressors:
    reg.fit(X_train_std, y_train)

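# The manual scale-then-fit steps above can be chained so the test set can
# never be fitted by mistake; a minimal sketch with sklearn's Pipeline,
# reusing the X_train/y_train split from above.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), GradientBoostingRegressor(random_state=0))
pipe.fit(X_train, y_train)         # scaler statistics come from the training split only
print(pipe.score(X_test, y_test))  # R^2 on the held-out split
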
X_s = x_scaler.fit_transform(X)
y_s = y_scaler.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size=0.2)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

# Gradient Boosting Regressor start
# training the model: one single-output model, refit once per target column
from sklearn.ensemble import GradientBoostingRegressor as gbr

# renamed from `gbr` so the instance does not shadow the imported class
model = gbr(loss='ls', learning_rate=0.1, n_estimators=800)
model.fit(X_train, y_train[0])
y_pred1 = np.reshape(model.predict(X_test), (X_test.shape[0], 1))
model.fit(X_train, y_train[1])
y_pred2 = np.reshape(model.predict(X_test), (X_test.shape[0], 1))
model.fit(X_train, y_train[2])
y_pred3 = np.reshape(model.predict(X_test), (X_test.shape[0], 1))
model.fit(X_train, y_train[3])
y_pred4 = np.reshape(model.predict(X_test), (X_test.shape[0], 1))
model.fit(X_train, y_train[4])
y_pred5 = np.reshape(model.predict(X_test), (X_test.shape[0], 1))

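# The five per-column refits above can also be expressed with scikit-learn's
# MultiOutputRegressor, which clones one estimator per target column; a
# sketch reusing the same X_train/y_train (y_train has 5 columns).
# loss='ls' matches the older scikit-learn used above; newer versions
# spell it 'squared_error'.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor

multi = MultiOutputRegressor(
    GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=800))
multi.fit(X_train, y_train)         # fits one independent model per column
y_pred_all = multi.predict(X_test)  # shape (n_samples, 5)
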
def driver(comp_mat, pps_mth='ORIGINAL', exp_thresh=[25], regression_mth=[],
           test_size=0.2, f_selection=False, n_features=3, reg_methods=[],
           remove_mp=False):
    global fobj
    fobj.write('Total Number of records: %r\n' % len(comp_mat))
    # step 1: split train and test records
    train_x, u_test_x, train_y, u_test_y = cv.train_test_split(
        comp_mat, comp_mat[:, -1], random_state=52, test_size=test_size)
    # remove the predictor column from the matrix
    u_test_x = u_test_x[:, :-1]
    # for exp in exp_thresh:
    exp = 25
    fobj.write('\n\nExpected Threshold Limit: %r\n' % exp)
    # step 2: prune to the required expected threshold
    c_train_x = train_x[train_x[:, -1] <= exp]
    c_train_y = c_train_x[:, -1]
    c_train_x = c_train_x[:, :-1]
    # split the constrained training set again
    ttrain_x, ttest_x, ttrain_y, ttest_y = cv.train_test_split(
        c_train_x, c_train_y, random_state=32, test_size=test_size)
    fobj.write('Total Number of Constrained test records: %r\n' % len(ttest_y))
    fobj.write('Total Number of Unconstrained test records: %r\n' % len(u_test_y))
    fobj.write('Total Constrained Training Records: %r\n' % len(ttrain_y))
    print('Fitting Model Measurements')
    trees_array = [1000]
    kf = cv.KFold(len(ttrain_x), n_folds=5)
    st_time = time.time()
    l_rate = [0.3]
    max_depth = [4, 6]
    min_samples_leaf = [3, 5, 9, 17]
    for rate in l_rate:
        fobj.write('\n\nLearning Rate: %r\n' % rate)
        print('\n\nLearning Rate: %r\n' % rate)
        for ntrees in trees_array:
            valid_acc = []
            test_acc = []
            train_acc = []
            est_arr = []
            unconst_acc = []
            fold = 0
            for train_idx, test_idx in kf:
                fobj.write('\nfold: %r\n' % fold)
                print('\nfold: %r\n' % fold)
                #rf_model = rfr(n_estimators=ntrees, n_jobs=-1)  # random forest regressor
                #rf_model = etr(n_estimators=ntrees, n_jobs=3, bootstrap=False)
                rf_model = gbr(n_estimators=ntrees, loss='lad', learning_rate=rate)
                vacc, tacc, est = rf_regressor(rf_model,
                                               ttrain_x[train_idx], ttrain_y[train_idx],
                                               ttrain_x[test_idx], ttrain_y[test_idx],
                                               f_selection=f_selection,
                                               n_features=n_features)
                valid_acc.append(vacc)
                train_acc.append(tacc)
                est_arr.append(est)
                fobj.write('Train Size:%r\n' % len(train_idx))
                print('Train Size:%r\n' % len(train_idx))
                fobj.write('Validation Size:%r\n' % len(test_idx))
                fobj.write('Validation Error: %r\n' % vacc)
                print('Validation Error: %r\n' % vacc)
                fobj.write('Train Error: %r\n' % tacc)
                fobj.write('Constrained Test data size:%r\n' % len(ttest_x))
                test_acc.append(mt.mean_absolute_error(ttest_y, est.predict(ttest_x)))
                fobj.write('Constrained Test Error for fold: %r\n' % test_acc[-1])
                print('Constrained Test Error for fold: %r\n' % test_acc[-1])
                y_res = est.predict(u_test_x)
                unconst_acc.append(mt.mean_absolute_error(u_test_y, y_res))
                fobj.write('Complete test accuracy:%r\n' % unconst_acc[-1])
                print('Complete test accuracy:%r\n' % unconst_acc[-1])
                fold += 1
            et_time = time.time()
            fobj.write('..Statistics..\n')
            fobj.write('Expected Threshold Limit: %r\n' % exp)
            fobj.write('Trees:%r\n' % ntrees)
            fobj.write('Train Average: %r\n' % np.mean(train_acc))
            fobj.write('Validation Average: %r\n' % np.mean(valid_acc))
            fobj.write('Constrained Test Average: %r\n' % np.mean(test_acc))
            fobj.write('Unconstrained Test Avg: %r\n' % np.mean(unconst_acc))
            fobj.write('Total Time Taken: %r mins\n' % ((et_time - st_time) / 60))
            # print to console
            print('..Statistics..\n')
            print('Expected Threshold Limit: %r\n' % exp)
            print('Trees:%r\n' % ntrees)
            print('Train Average: %r\n' % np.mean(train_acc))
            print('Validation Average: %r\n' % np.mean(valid_acc))
            print('Constrained Test Average: %r\n' % np.mean(test_acc))
            print('Unconstrained Test Avg: %r\n' % np.mean(unconst_acc))
            print('Total Time Taken: %r mins\n' % ((et_time - st_time) / 60))
    print('Generating Solution Result:')
    test_x = np.loadtxt('ensemble_data\\norm_test_fmat.csv', delimiter=',')
    print('Test Size:%r' % len(test_x))
    test_y_pred = est.predict(test_x)  # `est` is the estimator from the last fold
    id_col = np.arange(1, len(test_y_pred) + 1, dtype=int)
    all_data = np.column_stack((id_col, test_y_pred))
    # raw strings keep the Windows backslashes from being read as escapes
    np.savetxt(r'C:\Users\saura\Desktop\ML_Project\ensemble_data\gbr_mytest_solution.csv',
               all_data, delimiter=',', header='Id,Expected')
    df = pd.read_csv(r'C:\Users\saura\Desktop\ML_Project\ensemble_data\gbr_mytest_solution.csv')
    df.to_csv(r'C:\Users\saura\Desktop\ML_Project\ensemble_data\gbr_final_solution.csv',
              header=True, index=False)

print("CV score: {:<8.8f}".format(mean_squared_error(oof_rfr_263, target))) # GradientBoostingRegressor梯度提升决策树 folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018) oof_gbr_263 = np.zeros(train_shape) predictions_gbr_263 = np.zeros(len(X_test_263)) for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_263, y_train)): print("fold n°{}".format(fold_ + 1)) tr_x = X_train_263[trn_idx] tr_y = y_train[trn_idx] gbr_263 = gbr(n_estimators=400, learning_rate=0.01, subsample=0.65, max_depth=7, min_samples_leaf=20, max_features=0.22, verbose=1) gbr_263.fit(tr_x, tr_y) oof_gbr_263[val_idx] = gbr_263.predict(X_train_263[val_idx]) predictions_gbr_263 += gbr_263.predict(X_test_263) / folds.n_splits print("CV score: {:<8.8f}".format(mean_squared_error(oof_gbr_263, target))) # ExtraTreesRegressor 极端随机森林回归 folds = KFold(n_splits=5, shuffle=True, random_state=13) oof_etr_263 = np.zeros(train_shape) predictions_etr_263 = np.zeros(len(X_test_263))
#Load Test X matrix CSV (raw strings keep the Windows backslashes literal)
test_x = np.loadtxt(r'C:\Users\saura\Desktop\ML_Project\data\norm_test_fmat.csv',
                    delimiter=',')
print('Test Size:%r' % len(test_x))
test_y_pred = est.predict(test_x)
id_col = np.arange(1, len(test_y_pred) + 1, dtype=int)
all_data = np.column_stack((id_col, test_y_pred))
np.savetxt(r'C:\Users\saura\Desktop\ML_Project\data\mytest_solution.csv',
           all_data, delimiter=',', header='Id,Expected')
df = pd.read_csv(r'C:\Users\saura\Desktop\ML_Project\data\mytest_solution.csv')
df.to_csv(r'C:\Users\saura\Desktop\ML_Project\final_solution.csv',
          header=True, index=False)


if __name__ == "__main__":
    narray = np.loadtxt(r'C:\Users\saura\Desktop\ML_Project\data\rho_gr_0_85_fmat.csv',
                        delimiter=',')
    print('read narray.. size:%r' % len(narray))
    nlabel = np.loadtxt(r'C:\Users\saura\Desktop\ML_Project\data\rho_gr_0_85_label.csv',
                        delimiter=',')
    print('read label')
    train_x, test_x, train_y, test_y = cv.train_test_split(
        narray, nlabel, random_state=42, test_size=0.2)
    rf_model = gbr(n_estimators=100, loss='lad')
    rf_regressor(rf_model, train_x, train_y, test_x, test_y)

#200 0.99157639 19.635566 19.24871943 FALSE
#GBR results
#n_estimators = 100
# read narray.. size:731556
# read label
# Train Size:585244
# Train Error 23.2042661418
# Train Size:146312
# valid Error:23.190855007768523
# Test Size:717625

train_df = df[:len(df1)]
train_df = train_df.reset_index(drop=True)
test_df = df[len(df1) + 1:]  # note: this slice skips the row at index len(df1)
test_df = test_df.reset_index(drop=True)

########### Weather baseline
get_l2 = lambda y, y_hat: np.sum((y - y_hat)**2) / np.sum((y - np.mean(y))**2)


def create_sklearn_compatible_x_y(df, columns):
    X = df[columns].values
    y = df.y.values
    return X, y


max_estimators = 500
tree_num = np.round(np.linspace(20, max_estimators, 50))
model = gbr(n_estimators=max_estimators, max_depth=2)
columns = ['temp', 'atemp', 'hum', 'windspeed']
X_train, y_train = create_sklearn_compatible_x_y(train_df, columns)
X_test, y_test = create_sklearn_compatible_x_y(test_df, columns)
model.fit(X_train, y_train)


def plot_l2_vs_estimator_num(X_train, y_train, X_test, y_test):
    # staged_predict yields predictions after each boosting stage, so a
    # single fit is enough to evaluate every ensemble size
    train_staged_y_hat = model.staged_predict(X_train)
    test_staged_y_hat = model.staged_predict(X_test)
    y_hat_iter = zip(train_staged_y_hat, test_staged_y_hat)  # itertools.izip is Python 2 only
    get_l2 = lambda y, y_hat: np.sum((y - y_hat)**2) / np.sum((y - np.mean(y))**2)
    res = {}

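# A compact, self-contained version of the staged-error idea (the dataset
# and parameters are illustrative): compute held-out error after every
# boosting stage and pick the stage where it bottoms out, giving a
# data-driven n_estimators from one fit.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=1000, n_features=8, noise=10.0, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

booster = GradientBoostingRegressor(n_estimators=500, max_depth=2, random_state=0)
booster.fit(X_tr, y_tr)

# one prediction array per boosting stage
test_err = [np.mean((y_te - y_hat) ** 2) for y_hat in booster.staged_predict(X_te)]
print("best number of trees:", int(np.argmin(test_err)) + 1)
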
def regression(self, metric, folds=10, alphas=[], printt=True, graph=False):
    size = self.graph_width
    # significant model setup differences should be listed as different models
    models = {}
    models["Linear regressor"] = lr()
    models["Lasso regressor"] = lassor()
    models["Lasso CV regressor"] = lassocvr()
    models["Ridge regressor"] = rr(alpha=0, normalize=True)
    models["Ridge CV regressor"] = rcvr(alphas=alphas)
    models["Elastic net regressor"] = enr()
    models["K nearest neighbors regressor K2u"] = knnr(n_neighbors=2, weights='uniform')
    models["K nearest neighbors regressor K2d"] = knnr(n_neighbors=2, weights='distance')
    models["K nearest neighbors regressor K5"] = knnr(n_neighbors=5)
    models["K nearest neighbors regressor K10"] = knnr(n_neighbors=10)
    models["SGD regressor"] = sgdr(max_iter=10000, warm_start=True)
    models["Decision tree regressor"] = dtr()
    models["Decision tree regressor D3"] = dtr(max_depth=3)
    models["Random forest regressor"] = rfr()
    models["Ada boost regressor"] = abr()
    models["Gradient boost regressor"] = gbr()
    models["Support vector regressor RBF"] = svr()
    models["Support vector regressor Linear"] = svr(kernel='linear')
    models["Support vector regressor Poly"] = svr(kernel='poly')
    self.models = models
    kf = KFold(n_splits=folds, shuffle=True)
    results = []
    names = []
    et = []
    for model_name in models:
        start = time.time()
        # `metric` is expected to be a negated squared-error scorer such as
        # 'neg_mean_squared_error'; the sqrt below then yields RMSE
        cv_scores = -1 * cross_val_score(models[model_name], self.Xt_train,
                                         self.yt_train, cv=kf, scoring=metric)
        results.append(cv_scores)
        names.append(model_name)
        et.append(time.time() - start)
    report = pd.DataFrame({'Model': names, 'Score': results, 'Elapsed Time': et})
    report['Score (avg)'] = report.Score.apply(lambda x: np.sqrt(x).mean())
    report['Score (std)'] = report.Score.apply(lambda x: np.sqrt(x).std())
    report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
    report.sort_values(by='Score (avg)', inplace=True)
    report.drop('Score', axis=1, inplace=True)
    report.reset_index(inplace=True, drop=True)
    self.report_performance = report
    if printt:
        print('\n')
        print(self.report_width * '*', '\n*')
        print('* REGRESSION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
        print(self.report_width * '*', '')
        print(report)
        print('\n')
    if graph:
        fig, ax = plt.subplots(figsize=(size, 0.5 * size))
        plt.title('Regressor Comparison')
        plt.boxplot(results)
        ax.set_xticklabels(names)
        plt.xticks(rotation=45)
        plt.subplots_adjust(hspace=0.0, bottom=0.25)
        self.graphs_model.append(fig)
        plt.show()
    return None

traindata = traindata.fillna(0)
# drop the first feature row and the last target row so the two line up one
# step apart; note reset_index() without drop=True also adds the old index
# as an extra feature column
train_xcol = traindata.drop(['W'], axis=1).drop([0], axis=0).reset_index()
target = traindata['W']
target = target.drop([len(target) - 1], axis=0)

loss = 'lad'
learning_rate = 0.05
n_estimators = 500
min_samples_split = 3
max_depth = 10
gbm = gbr(loss=loss, learning_rate=learning_rate, n_estimators=n_estimators,
          min_samples_split=min_samples_split, max_depth=max_depth)
model = gbm.fit(train_xcol, target)

testdata = jaysdata.drop(['Lg', 'L', 'Year'], axis=1)
testdata = testdata.fillna(0)
test_xcol = testdata.drop(['W'], axis=1).drop([0], axis=0).reset_index()
test_ycol = testdata['W'].drop([len(testdata) - 1])
new_r_square = model.score(test_xcol, test_ycol)  # R^2 on the shifted test pairing

hidden_layer_sizes = (100, 10)
mlpnn = mlp(hidden_layer_sizes=hidden_layer_sizes)

test_data_X = sc_X.transform(test_data_X)
train_data_y = np.log(1 + train_data_y)  # log-transform the target

if Env_var.get('Pca') == 1:
    pca = PCA(n_components=300).fit(train_data_X)
    train_data_X = pca.transform(train_data_X)
    test_data_X = pca.transform(test_data_X)

###############################--------Model Setup--------###############################
ann_regressor = KerasRegressor(build_fn=ann_model, epochs=30, batch_size=10, verbose=1)
xgb_regressor = xgb(learning_rate=0.0825, min_child_weight=1, max_depth=7,
                    subsample=0.8, verbose=10, random_state=2017, n_jobs=-1,
                    eval_metric="rmse")
rfr_regressor = rfr(max_features=0.9, min_samples_leaf=50)
gbr_regressor = gbr(n_estimators=200, verbose=5, learning_rate=0.08, max_depth=7,
                    max_features=0.5, min_samples_leaf=50, subsample=0.8,
                    random_state=2017)
etr_regressor = etr(n_estimators=200, verbose=10, max_depth=7,
                    min_samples_leaf=100, max_features=0.9,
                    min_impurity_split=100, random_state=2017)
lr_regressor = lr()
svr_regressor = svr(verbose=10)
ensemble = Ensemble(n_folds=5, stacker=lr_regressor,
                    base_models=[ann_regressor, xgb_regressor, rfr_regressor,
                                 gbr_regressor, etr_regressor])

###############################--------Grid Search--------###############################
if Env_var.get('GridSearch') == 1: