def xgb_regressor(self, assign=True, **kwargs):
    """
    Build a supervised-learning regressor; defaults to GBR(n_estimators=100).

    Keyword arguments are forwarded unchanged to the constructor, i.e.
    GBR(**kwargs). GBR is expected to be imported at module level as:
        try:
            from xgboost.sklearn import XGBRegressor as GBR
        except ImportError:
            from sklearn.ensemble import GradientBoostingRegressor as GBR

    :param assign: whether to keep the new regressor on the instance
                   (self.reg = reg); defaults to True
    :param kwargs: with keyword arguments the regressor is GBR(**kwargs);
                   without any it is GBR(n_estimators=100)
    :return: the instantiated GBR object
    """
    # **kwargs is always a dict, so truthiness alone detects "no arguments".
    reg = GBR(**kwargs) if kwargs else GBR(n_estimators=100)
    if assign:
        self.reg = reg
    return reg
def main():
    """Retrain the excitedness and happiness regressors on user responses.

    Rows are built from ``link_features_mood(get_responses=True)`` using the
    column order in ``features``. Two GBR models are fitted and dumped to
    'Retrained-Energy.joblib' and 'Retrained-Happiness.joblib'.
    """
    features = [
        "response_excitedness", "response_happiness", "mode",
        "time_signature", "acousticness", "danceability", "energy",
        "instrumentalness", "liveness", "loudness", "speechiness",
        "valence", "tempo",
    ]
    data = link_features_mood(get_responses=True)
    train_set = np.array(
        [[song[feature] for feature in features] for song in data]
    ).astype(float)

    # Look the label columns up by name. The previous hard-coded indices
    # (1 and 2) were off by one and trained "energy" on response_happiness
    # and "happiness" on mode.
    excited_col = features.index("response_excitedness")
    happy_col = features.index("response_happiness")
    energy = [row[excited_col] for row in train_set]
    happiness = [row[happy_col] for row in train_set]

    # NOTE(review): the predictor slice is unchanged (danceability onward).
    # It may carry the same off-by-one and have been meant to start at
    # acousticness - confirm against the feature vector the consumer of the
    # .joblib files builds.
    train_data = [row[5:] for row in train_set]

    excited_est = GBR(n_estimators=50, max_depth=3)
    excited_est.fit(train_data, energy)
    happy_est = GBR(n_estimators=50, max_depth=3)
    happy_est.fit(train_data, happiness)

    dump(excited_est, 'Retrained-Energy.joblib')
    dump(happy_est, 'Retrained-Happiness.joblib')
def _make_cate_predictions(self, trial: optuna.Trial, i: int) -> np.ndarray:
    """Make predictions of CATE by a sampled set of hyperparameters."""

    def sample_gbr_params(suffix):
        # Dict values are evaluated top to bottom, so the trial is asked for
        # eta, min leaf, max depth and subsample in that order for each model.
        return {
            'n_estimators': 100,
            'learning_rate': trial.suggest_loguniform('eta_' + suffix, 1e-5, 1e-1),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf_' + suffix, 1, 20),
            'max_depth': trial.suggest_int('max_depth_' + suffix, 1, 20),
            'subsample': trial.suggest_uniform('sub_sample_' + suffix, 0.1, 1.0),
            'random_state': 12345,
        }

    # Hyperparameters are sampled for the control, treated and overall
    # models, in that order, before any estimator is constructed.
    control_params = sample_gbr_params('control')
    treated_params = sample_gbr_params('treat')
    overall_params = sample_gbr_params('overall')

    # Define the DAL model and fit it on the i-th training split.
    meta_learner = DAL(
        controls_model=GBR(**control_params),
        treated_model=GBR(**treated_params),
        overall_model=GBR(**overall_params),
    )
    meta_learner.fit(X=self.Xtr[i], T=self.Ttr[i], Y=self.Ytr[i])
    return meta_learner.effect(X=self.Xval[i])
def main(TRAIN_RATIO):
    '''
    This script is used to train the Gradient Boosting Regressor.
    The .joblib files (trained models) are used on the server for mood
    analysis. This file is only used offline.

    Reads columns 1-15 of "analyzed_tracks_1.csv" (header row skipped),
    randomly assigns each row to the train or test set, fits one model on
    column 0 (energy) and one on column 1 (happiness) using columns 4
    onward as predictors, dumps both models, and prints the summed absolute
    test error of each.

    :param TRAIN_RATIO: Ratio used to train/test the data
    '''
    # Passing the path lets numpy open and close the file itself; the old
    # open(...) handle was never closed.
    data = np.loadtxt("analyzed_tracks_1.csv", delimiter=",", skiprows=1,
                      usecols=tuple(range(1, 16)))

    trainset = []
    testset = []
    for item in data:
        if np.random.uniform(0, 1) <= TRAIN_RATIO:
            trainset.append(item)
        else:
            testset.append(item)

    energy = [elem[0] for elem in trainset]
    happiness = [elem[1] for elem in trainset]
    traindata = [elem[4:] for elem in trainset]
    testdata = [elem[4:] for elem in testset]
    testE = [elem[0] for elem in testset]
    testH = [elem[1] for elem in testset]

    E_est = GBR(n_estimators=50, max_depth=3)
    E_est.fit(traindata, energy)
    H_est = GBR(n_estimators=50, max_depth=3)
    H_est.fit(traindata, happiness)

    # Create .joblib files to reuse the trained algorithm later.
    dump(E_est, 'Trained-Energy.joblib')
    dump(H_est, 'Trained-Happiness.joblib')

    E_pred = E_est.predict(testdata)
    H_pred = H_est.predict(testdata)

    # Summed absolute difference between predictions and actual values.
    E_sum = sum(abs(actual - pred) for actual, pred in zip(testE, E_pred))
    H_sum = sum(abs(actual - pred) for actual, pred in zip(testH, H_pred))
    print(E_sum, H_sum)
def fit(self, X, y):
    """Fit the ALS model (or load its backup), then fit a GBR on derived features.

    :param X: training inputs, passed to the ALS model and to
        ``self.prepare_features``
    :param y: training targets
    """
    if self.T is None:
        self.load_tags()
    self.als = MangakiALS(self.nb_components)
    try:
        self.als.load(self.als.get_backup_filename())
    except Exception:
        # No usable backup: train the ALS model from scratch. Catching
        # Exception rather than using a bare except keeps KeyboardInterrupt
        # and SystemExit from being swallowed.
        self.als.set_parameters(self.nb_users, self.nb_works)
        self.als.fit(X, y)
        self.als.compute_all_errors(X, y, X, y)
    self.chrono.save('fit ALS model')

    X_full = self.prepare_features(X, self.als.U, self.als.VT.T)
    self.chrono.save('build features')

    self.gbr = GBR(n_estimators=self.nb_estimators)
    self.gbr.fit(X_full, y)
    logging.debug('feature_importances=%s', str(self.gbr.feature_importances_))
    logging.debug('train_score=%s', str(self.gbr.train_score_))
    self.chrono.save('fit GBR model')
def find_best_feature(feature_name, cv_fold, train_data, train_label):
    """Score a feature subset as the mean cross-validated R^2 of three regressors.

    The score is the average of the mean ``r2`` obtained by LGBMRegressor,
    XGBRegressor and GBR on ``train_data[feature_name]``.

    :param feature_name: column selection applied to ``train_data``
    :param cv_fold: value passed as ``cv=`` to the cross-validation helper
    :param train_data: frame holding the candidate features
    :param train_label: regression target
    :return: average of the three mean cross-validation scores
    """
    selected = train_data[feature_name]

    # Only the cross-validation scores are used. The extra full-data fits
    # done previously produced models that were thrown away.
    lgb_scores = cv(lgb.LGBMRegressor(n_estimators=300, random_state=1),
                    selected, train_label, cv=cv_fold, scoring='r2')
    m1 = lgb_scores.mean()

    xgb_scores = cv(xgb.XGBRegressor(n_estimators=300, random_state=1),
                    selected.values, train_label, cv=cv_fold, scoring='r2')
    m2 = xgb_scores.mean()

    gbr_scores = cv(GBR(n_estimators=310),
                    selected.values, train_label, cv=cv_fold, scoring='r2')
    m3 = gbr_scores.mean()

    return (m1 + m2 + m3) / 3
def train_model():
    """Grid-search a StandardScaler + GBR pipeline and report test metrics.

    :return: the fitted ``GridSearchCV`` object
    """
    data = get_data()
    X_train, X_test, y_train, y_test = split_data(data)
    X_train, y_train = remove_county_state(X_train, y_train)
    X_test, y_test = remove_county_state(X_test, y_test)

    # Data preprocessing (removing mean and scaling to unit variance with
    # StandardScaler) followed by the regressor.
    pipeline = make_pipeline(StandardScaler(), GBR())

    # Hyperparameter grid
    hyperparameters = {
        'gradientboostingregressor__n_estimators': [100, 600, 700, 800],
        'gradientboostingregressor__max_depth': [3, 4, 5, 10, 20],
        'gradientboostingregressor__min_samples_split': [3, 4, 5, 10, 20],
        'gradientboostingregressor__learning_rate': [0.01, 0.05, 0.1],
        # NOTE(review): 'ls' is the pre-1.0 scikit-learn spelling of the
        # squared-error loss - confirm the pinned version before upgrading.
        'gradientboostingregressor__loss': ['ls'],
    }

    # Tune the model via the pipeline
    clf = GridSearchCV(pipeline, hyperparameters, cv=3)
    clf.fit(X_train, y_train)

    pred = clf.predict(X_test)
    print('r2 score:', r2_score(y_test, pred))
    print('mse:', mean_squared_error(y_test, pred))
    print('*' * 20)
    print('best params:', clf.best_params_)
    print('best grid:', clf.best_estimator_)
    print('^' * 20)
    eval_model(clf.best_estimator_, X_train, y_train, X_test, y_test)
    print('#' * 20)
    # Call score(); printing clf.score alone showed the bound method's repr.
    print('score', clf.score(X_test, y_test))
    return clf
def Gradient(self,Results='',TestSet=False):
    """Fit a gradient boosting regressor on self.X / self.y and plot its predictions.

    :param Results: when it compares equal to True, print the fitted model's
        score on the full self.X / self.y followed by its parameters.
    :param TestSet: when False, fit and plot on all rows; otherwise fit on
        the first half of the rows and plot predictions for the second half
        against the true values with a fitted line.
    """
    # NOTE(review): loss='ls', min_impurity_split and presort presumably
    # target an older scikit-learn release - confirm the pinned version.
    G = GBR(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
            criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
            min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_split=None,
            init=None, random_state=None, max_features=None, alpha=0.9,
            verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
    if TestSet==False:
        # NOTE(review): the second argument of np.ravel is `order`; 1 is
        # presumably meant as a layout flag - confirm it is accepted by the
        # numpy version in use.
        GResult = G.fit(self.X,np.ravel(self.y,1))
        if Results==True:
            print(str(GResult.score(self.X,np.ravel(self.y,1))) + '\n' + str(GResult.get_params()))
        # The model is fitted a second time here before predicting.
        plt.plot(G.fit(self.X,np.ravel(self.y,1)).predict(self.X))
        y = np.array(self.y[self.DVCols])
        plt.plot(y,'ro')
        plt.show()
    else:
        # First half of the rows trains the model, second half evaluates it.
        x_train = self.X[:len(self.X)//2]
        y_train = np.ravel(self.y,1)[:len(self.y)//2]
        x_test = self.X[len(self.X)//2:]
        y_test = np.ravel(self.y,1)[len(self.y)//2:]
        GResult = G.fit(x_train,y_train)
        if Results==True:
            # The score is reported on the full data set, not only the held-out half.
            print(str(GResult.score(self.X,np.ravel(self.y,1))) + '\n' + str(GResult.get_params()))
        GRPredict = GResult.predict(x_test)
        # Red line: degree-1 fit of the true values against the predictions.
        plt.plot(GRPredict,polyval(polyfit(GRPredict,y_test.reshape(-1),1),GRPredict),'r-',label='predicted')
        plt.plot(GRPredict,y_test.reshape(-1),'bo')
        plt.legend()
        plt.show()
def gradient_boosting_regressor(trainX, y_train):
    """Fit a GBR on every column of ``trainX`` whose name does not match "y".

    :param trainX: DataFrame of candidate predictors
    :param y_train: regression target
    :return: the fitted model
    """
    # str.match anchors at the start of the name, so this keeps the columns
    # whose names do not begin with "y".
    keep_mask = ~trainX.columns.str.match("y")
    predictors = trainX.iloc[:, keep_mask]

    model = GBR(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=8,
        random_state=777,
        loss='ls',
    )
    model.fit(predictors, y_train)
    return model
def model(X_train, y_train, X_test=np.array([]), y_test=np.array([]), method="LR"):
    """Fit the regressor selected by ``method`` and score it with CCC.

    :param X_train: model inputs used for training
    :param y_train: outputs for X_train; only column 0 is used for fitting
    :param X_test: model inputs used for testing
    :param y_test: outputs for X_test; when empty no test score is computed
    :param method: model design, one of "LR" (default, linear regression),
        "Ridge", "Lasso", "MLPRegressor", "SVR", "KNR", "RFR", "GBR"
    :return: ``(fitted_model, c_train, c_test)`` with ``c_test == -1`` when
        no test data was given, or ``False`` for an unknown method
    """
    # Factories are wrapped in lambdas so that only the selected estimator
    # name is looked up, exactly as in an if/elif chain.
    factories = {
        "LR": lambda: LR(),
        "Ridge": lambda: Ridge(),
        "Lasso": lambda: Lasso(),
        "MLPRegressor": lambda: MLPRegressor(),
        "SVR": lambda: SVR(),
        "KNR": lambda: KNR(),
        "RFR": lambda: RFR(),
        "GBR": lambda: GBR(),
    }
    if method not in factories:
        print("unknown method")
        return False

    lr = factories[method]()
    lr = lr.fit(X_train, y_train[:, 0])

    y_mod_train = lr.predict(X_train)
    c_train = CCC(y_train, y_mod_train[:, np.newaxis])

    if len(y_test) > 0:
        y_mod_test = lr.predict(X_test)
        c_test = CCC(y_test, y_mod_test[:, np.newaxis])
    else:
        c_test = -1
    return (lr, c_train, c_test)
def gbdtcv(n_estimators, min_samples_split, max_depth):
    """Return the mean 2-fold cross-validation score of a GBR on X_tr / y_tr.

    The three hyperparameters are truncated to ``int`` before use.
    X_tr and y_tr are read from module scope.
    """
    estimator = GBR(
        n_estimators=int(n_estimators),
        min_samples_split=int(min_samples_split),
        max_depth=int(max_depth),
        random_state=2,
    )
    fold_scores = cross_val_score(estimator, X_tr, y_tr, cv=2)
    return fold_scores.mean()
def bestParas(X_train, y_train):
    """Grid-search a StandardScaler + GBR pipeline with 10-fold CV.

    :return: the fitted ``GridSearchCV`` object
    """
    prefix = 'gradientboostingregressor__'
    hyperparameters = {
        prefix + 'learning_rate': [0.1, 0.2],
        prefix + 'max_depth': [3, 6, 9],
        prefix + 'n_estimators': [50, 80],
        prefix + 'subsample': [0.8, 0.9, 1.0],
    }
    pipeline = make_pipeline(preprocessing.StandardScaler(), GBR())
    search = GridSearchCV(pipeline, hyperparameters, cv=10)
    gbr = search.fit(X_train, y_train)
    return gbr
def __ensemble_test(type, X_train, X_test, y_train, y_test):
    """Fit one ensemble regressor and report its test score and importances.

    :param type: case-insensitive estimator code: 'gbr', 'rfr', 'abr' or 'etr'
    :return: ``(fitted_model, score_on_test, feature_importances_)``
    :raises ValueError: if ``type`` is not one of the supported codes
    """
    kind = type.lower()
    if kind == 'gbr':
        reg = GBR(n_estimators=100, random_state=1)
    elif kind == 'rfr':
        reg = RFR(n_estimators=100, random_state=1)
    elif kind == 'abr':
        reg = ABR(n_estimators=100, random_state=1)
    elif kind == 'etr':
        reg = ETR(n_estimators=100, random_state=1)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `reg`.
        raise ValueError(
            "unknown ensemble type %r; expected 'gbr', 'rfr', 'abr' or 'etr'"
            % (type,))
    reg.fit(X_train, y_train)
    return reg, reg.score(X_test, y_test), reg.feature_importances_
def GBDT(self, n, step):
    """Fit separate default GBR models on the "low" and "high" subsets.

    Stores the concatenated (low then high) predictions on the instance:
    ``y_pre_train_log`` / ``y_pre_valid_log`` hold the raw model outputs,
    ``y_pre_train`` / ``y_pre_valid`` the back-transformed values.

    :param n: unused
    :param step: unused
    """
    # NOTE(review): both models use library defaults. The tuned parameter
    # dicts that used to be declared here were never passed to GBR.
    model_low = GBR()
    print(self.txl.shape, self.tyllog.shape, self.vxl.shape)
    model_low.fit(self.txl, self.tyllog)
    low_train_log = model_low.predict(self.txl).reshape(-1, 1)
    low_valid_log = model_low.predict(self.vxl).reshape(-1, 1)

    model_high = GBR()
    model_high.fit(self.txh, self.tyhlog)
    high_train_log = model_high.predict(self.txh).reshape(-1, 1)
    # The validation log predictions come from the validation inputs (vxh);
    # the training inputs (txh) were used here before.
    high_valid_log = model_high.predict(self.vxh).reshape(-1, 1)

    self.y_pre_train_log = np.r_[low_train_log, high_train_log]
    self.y_pre_valid_log = np.r_[low_valid_log, high_valid_log]

    # NOTE(review): the low model is inverted with base 10 and the high model
    # with base e, as in the original - confirm tyllog / tyhlog really use
    # different log bases.
    self.y_pre_train = np.r_[10 ** low_train_log, np.exp(high_train_log)]
    self.y_pre_valid = np.r_[10 ** low_valid_log, np.exp(high_valid_log)]
def GradientBoosting_regression(self, n, step):
    """Fit a GBR on the training split and store train/test predictions.

    :param n: number of boosting stages (``n_estimators``)
    :param step: maximum tree depth (``max_depth``)
    """
    regressor = GBR(
        n_estimators=n,
        max_depth=step,
        min_samples_split=2,
        learning_rate=0.01,
        loss='lad',
    )
    regressor.fit(self.train_X, self.train_y)
    self.y_pre_train = regressor.predict(self.train_X)
    self.y_pre_test = regressor.predict(self.test_X)
def train(self, zone, num, hidden_layer_size=4, n_jobs=1, kernel='rbf',
          n_components=15, n_estimators=50, loss='linear', learning_rate=1.0,
          host='127.0.0.1'):
    """Train one model, selected by ``num``, on the data of ``zone``.

    :param zone: zone identifier passed to ``getTrainData``
    :param num: model selector: 1 linear regression, 2 SVR, 3 neural net,
        4 gradient boosting regressor, 5 AdaBoost regressor
    :param hidden_layer_size: a single int or an iterable of ints (model 3)
    :param n_jobs: passed to LinearRegression (model 1)
    :param kernel: SVR kernel (model 2)
    :param n_components: passed to ``self.read_dataset``
    :param n_estimators: passed to GBR (model 4)
    :param loss: passed to GBR (model 4)
    :param learning_rate: passed to GBR (model 4)
    :param host: passed to ``fd`` to reach the data source
    :return: ``(model, score_on_test, description, scaler, pca)``, or
        ``None`` when ``num`` is not one of 1-5
    """
    f = fd(host)
    input_set = f.getTrainData(zone)
    x_train, x_test, y_train, y_test, scaler, pca = self.read_dataset(
        input_set, n_components)

    if num == 1:
        # Linear Regression
        clf = LinearRegression(n_jobs=n_jobs)
        clf.fit(x_train, y_train)
        return clf, clf.score(x_test, y_test), 'Linear Regression', scaler, pca
    elif num == 2:
        # SVR
        clf = svm.SVR(kernel=kernel)
        clf.fit(x_train, y_train)
        return clf, clf.score(x_test, y_test), 'SVR' + kernel, scaler, pca
    elif num == 3:
        # Neural Net
        # NOTE(review): if `mlpr` is scikit-learn's MLPRegressor, the keyword
        # is spelled `hidden_layer_sizes` - confirm against the import.
        clf = mlpr(hidden_layer_size=hidden_layer_size)
        clf.fit(x_train, y_train)
        # The default is a bare int, which cannot be iterated. Accept both a
        # single size and a sequence of sizes when building the label.
        try:
            sizes = list(hidden_layer_size)
        except TypeError:
            sizes = [hidden_layer_size]
        layers = ''.join('-> {}'.format(size) for size in sizes)
        return clf, clf.score(
            x_test, y_test), 'NeuralNet hidden_size' + layers, scaler, pca
    elif num == 4:
        # Gradient Boosting Regressor
        # NOTE(review): the default loss='linear' looks like an AdaBoost
        # option - confirm it is valid for GBR.
        clf = GBR(loss=loss, n_estimators=n_estimators,
                  learning_rate=learning_rate)
        clf.fit(x_train, y_train)
        return clf, clf.score(
            x_test, y_test), 'Gradient Boosted Regressor', scaler, pca
    elif num == 5:
        # AdaBoost Regressor
        clf = ABR()
        clf.fit(x_train, y_train)
        return clf, clf.score(x_test, y_test), 'AdaBoost Regressor', scaler, pca
def ratio_test():
    """Measure the GBR test score for split ratios 0.01 .. 0.99.

    Plots score against ``1 - ratio`` to 'ratio_score.png' and writes the
    table to 'ratio.csv'.
    """
    # Collect rows in a list and build the frame once: DataFrame.append is
    # quadratic in a loop and was removed in pandas 2.0.
    rows = []
    for i in range(1, 100):
        X_train, X_test, y_train, y_test, predict_X, features = pre.raw_preprocessing(
            i / 100)
        reg = GBR(random_state=1)
        reg.fit(X_train, y_train)
        rows.append([1 - i / 100, reg.score(X_test, y_test)])
    df = pd.DataFrame(rows, columns=['ratio', 'score'])

    plt.plot(df['ratio'], df['score'], 'k.-')
    plt.xlabel('train_set_ratio')
    plt.ylabel('score')
    plt.savefig('ratio_score.png')
    # The table is CSV text, so it no longer goes to a '.png' file name.
    df.to_csv('ratio.csv', index=False)
def goal(hyper):
    """Objective function: test RMSE of a GBR built from ``hyper``.

    :param hyper: mapping with keys 'n_estimators', 'learning_rate',
        'subsample', 'alpha' and 'validation_fraction'
    :return: ``{'loss': rmse, 'status': STATUS_OK}``
    """
    modelo = GBR(n_estimators=int(hyper['n_estimators']),
                 learning_rate=hyper['learning_rate'],
                 subsample=hyper['subsample'],
                 alpha=hyper['alpha'],
                 validation_fraction=hyper['validation_fraction'])
    # The unused `eval_set` local was dropped; it was never passed to fit().
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    rmse = mse(y_test, y_pred) ** 0.5
    return {'loss': rmse, 'status': STATUS_OK}
def trainMSEtest(data, index_train, index_test):
    """Fit a GBR on the indexed training rows and print train/test MSE.

    :param data: 2-D array whose last column is the target
    :param index_train: row selector for the training subset
    :param index_test: row selector for the test subset
    """
    inputs, target = data[:, :-1], data[:, -1]
    X_train, y_train = inputs[index_train], target[index_train]
    X_test, y_test = inputs[index_test], target[index_test]

    regressor = GBR(
        loss='ls',
        max_depth=6,
        n_estimators=80,
        subsample=0.6,
        learning_rate=0.08,
    )
    gbr = regressor.fit(X_train, y_train)

    pred_test = gbr.predict(X_test)
    pred_train = gbr.predict(X_train)

    # Rounded first 30 test predictions next to the true values.
    print(around(pred_test[:30]), '\n', y_test[:30])
    print('Test MSE:', mse(y_test, pred_test))
    print('Train MSE:', mse(y_train, pred_train))
def train_cali_metrics(pred_probs, multi_labels, thres = 0.5):
    """Train a GBR calibrator on features derived from predicted probabilities.

    For each (probabilities, labels) pair the feature vector is
    ``[prob_feat, card_feat] + label_feat`` and the target is the result of
    ``check_same``.

    :param pred_probs: iterable of per-sample probability vectors
    :param multi_labels: iterable of per-sample label sets, aligned with
        ``pred_probs``
    :param thres: threshold passed to the feature helpers
    :return: the fitted GBR
    """
    cal_features, cal_labels = [], []
    for sample_probs, sample_labels in zip(pred_probs, multi_labels):
        leading = [get_prob_feat(sample_probs, thres),
                   get_card_feat(sample_probs, thres)]
        cal_features.append(leading + get_label_feat(sample_probs, thres))
        # NOTE(review): the target is computed at a fixed 0.5 rather than
        # `thres` - confirm this is intentional.
        cal_labels.append(check_same(sample_probs, sample_labels, 0.5))

    gb = GBR(loss='ls', learning_rate=0.1, min_samples_leaf=5, n_estimators=100)
    gb.fit(cal_features, cal_labels)
    return gb
def TrainModel_GBR(x, y):
    """Fit a GBR on a 70/30 split of (x, y) and print the test MSE.

    :return: the fitted regressor
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=0)

    # Every constructor argument is spelled out, as in the original call.
    settings = dict(
        alpha=0.9,
        criterion='friedman_mse',
        init=None,
        learning_rate=0.03,
        loss='huber',
        max_depth=15,
        max_features='sqrt',
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_impurity_split=None,
        min_samples_leaf=10,
        min_samples_split=40,
        min_weight_fraction_leaf=0.0,
        n_estimators=300,
        presort='auto',
        random_state=10,
        subsample=0.8,
        verbose=0,
        warm_start=False,
    )
    clf = GBR(**settings)
    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)
    print("MSE:", metrics.mean_squared_error(y_test, y_pred))
    return clf
def test_gbdt():
    """Compare scikit-learn's tree / GBR with the project's own on Boston housing.

    Prints the split shapes and the validation score of each of the four
    models.
    """
    import pandas as pd
    # NOTE(review): load_boston is not available in recent scikit-learn
    # releases - confirm the pinned version or switch data sets.
    from sklearn.datasets import load_boston
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor as DTR
    from sklearn.ensemble import GradientBoostingRegressor as GBR
    from ml.tree import DecisionTreeRegressor
    from ml.ensemble import GradientBoostingRegressor

    dataset = load_boston()
    X = pd.DataFrame(dataset['data'], columns=dataset['feature_names'])
    y = pd.DataFrame(dataset['target'], columns=['target'])

    X_train, X_vali, y_train, y_vali = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=25)
    print('X_train shape: ', X_train.shape)
    print('X_vali shape: ', X_vali.shape)
    print('y_train shape: ', y_train.shape)
    print('y_vali shape: ', y_vali.shape)

    # scikit-learn regressors expect a 1-D target.
    y_train_1d = y_train.values.reshape(-1)

    dtr = DTR(max_depth=5)
    dtr.fit(X_train, y_train_1d)
    print('sklearn dtr score: ', dtr.score(X_vali, y_vali))

    # The unused `import xgboost` that sat here was dropped so the test no
    # longer requires that package.
    gbr = GBR(max_depth=5)
    gbr.fit(X_train, y_train_1d)
    print('sklearn gbr score: ', gbr.score(X_vali, y_vali))

    # The project estimators are given the DataFrames, as before.
    mydtr = DecisionTreeRegressor(max_depth=5)
    mydtr.fit(X_train, y_train)
    print('my dtr score: ', mydtr.score(X_vali, y_vali))

    mygbr = GradientBoostingRegressor()
    mygbr.fit(X_train, y_train)
    print('my gbr score: ', mygbr.score(X_vali, y_vali))
def __init__(self, n_estimators = 2000, period = 12, savepath = 'Results', modeltype = None):
    """Store the configuration and build the (unfitted) GBR model.

    :param n_estimators: number of boosting stages
    :param period: multiplied by 5 to obtain ``self.timegap``
    :param savepath: directory under which the model file path is built
    :param modeltype: label embedded in the model file name
    """
    # Parameters
    self.n_estimators = n_estimators
    self.timegap = period * 5
    self.savepath = savepath
    self.modeltype = modeltype

    # Model save path
    model_file = '/{}_GBR_model_{}_estimators.joblib'.format(
        self.modeltype, self.n_estimators)
    self.weightspath = self.savepath + model_file

    # Containers for predictions
    self.train_pred = None
    self.test_pred = None

    # Design the model
    self.model = GBR(
        loss='ls',
        learning_rate=0.1,
        validation_fraction=0.1,
        n_iter_no_change=300,
        n_estimators=self.n_estimators,
    )
def gbr(data_dir, model_dir, features):
    """Grid-search a GBR, save the best estimator in ``model_dir`` and return it.

    The working directory is changed to ``model_dir``. The saved file name
    embeds the number of features and the test score.

    :return: the best estimator found by the grid search
    """
    X_train, X_test, y_train, y_test, predict_X, features = pre.drop_preprocessing(
        data_dir, features)
    os.chdir(model_dir)

    search_space = {
        'loss': ['ls', 'lad', 'huber', 'quantile'],
        'n_estimators': range(50, 311, 20),
    }
    grid = GridSearchCV(estimator=GBR(subsample=1, random_state=1),
                        param_grid=search_space,
                        cv=5)
    grid.fit(X_train, y_train)

    winner = grid.best_estimator_
    test_score = winner.score(X_test, y_test)
    print(grid.best_params_)
    print(test_score)
    joblib.dump(winner, 'gbr_%d_%.4f.m' % (len(features), test_score))

    # True values next to predicted values for the test split.
    df = pd.DataFrame(columns=['pbe_bandgap', 'ml_bandgap'])
    df['pbe_bandgap'] = y_test
    df['ml_bandgap'] = winner.predict(X_test)
    print(df)
    return winner
def analyzeMetricNumericalShell(metric, training, excludeFeatures):
    """Cross-validate four regressors on ``metric`` and refit the best one.

    :param metric: name of the target column in ``training``
    :param training: DataFrame holding predictors and target
    :param excludeFeatures: column names left out of the predictors
    :return: dict with keys 'features' (predictor columns), 'predictions'
        (cross-validated predictions of the final model) and 'model' (the
        final model fitted on all rows)
    """
    print('\nModeling', metric)
    # Index.difference is the explicit set difference; `columns - list` is
    # element-wise subtraction in current pandas.
    X = training[training.columns.difference(list(excludeFeatures))]
    Y = training[metric]
    # To reproduce results, fix the random seed
    seed(1)
    features = X.columns
    # as arrays
    x = np.asanyarray(X)
    y = np.asanyarray(Y)

    candidates = [
        ("Linear Regression", LinR),
        ("Gradient Boosting Regressor", GBR),
        ("Random Forest Regressor", RFR),
        ("Decision Tree Regressor", DTR),
    ]
    algos = dict(candidates)
    # Baseline cross-validated predictions, evaluated in candidate order.
    predictions = {name: runCVNumerical(x, y, algo) for name, algo in candidates}
    errors = sorted(
        ({'name': name, 'accuracy': rmse(y, predictions[name])}
         for name, _ in candidates),
        key=lambda k: k['accuracy'])
    for error in errors:
        print(error['name'], ' scores an RMSE value of ', error['accuracy'])

    winner = min(errors, key=lambda e: e['accuracy'])
    theBest = winner['name']
    bestErr = winner['accuracy']
    print('\nBest performer:')
    print(theBest, ' with an RMSE of ', bestErr)

    if theBest == 'Gradient Boosting Regressor':
        tuned = dict(n_estimators=1000, learning_rate=0.09, loss='ls',
                     random_state=652100, max_depth=2, subsample=0.8)
        best = GBR(**tuned)
        best.fit(X, Y)
        gbrPreds = runCVNumerical(x, y, GBR, **tuned)
        print('test rmse by GBR = ', rmse(y, gbrPreds))
    elif theBest == 'Random Forest Regressor':
        tuned = dict(n_estimators=800, random_state=652100, max_features=0.2)
        best = RFR(**tuned)
        best.fit(X, Y)
        gbrPreds = runCVNumerical(x, y, RFR, **tuned)
        print('test rmse by RFR = ', rmse(y, gbrPreds))
    else:
        # Linear regression or decision tree won. These used to leave `best`
        # and `gbrPreds` unbound; use the default-configured estimator and
        # its baseline predictions.
        best = algos[theBest]()
        best.fit(X, Y)
        gbrPreds = predictions[theBest]
        print('test rmse by', theBest, '= ', bestErr)

    return ({
        'features': features,
        'predictions': gbrPreds,
        'model': best
    })
def analyzeMetricNumericalRMSE(metric, training, excludeFeatures):
    """Report baseline RMSEs of four regressors, then fit a tuned GBR.

    The tuned GBR is always the final model, whichever baseline scored best.

    :param metric: name of the target column in ``training``
    :param training: DataFrame holding predictors and target
    :param excludeFeatures: column names left out of the predictors
    :return: dict with keys 'features', 'predictions' (cross-validated GBR
        predictions), 'model' (GBR fitted on all rows) and 'importance'
        (``[feature, importance]`` pairs sorted by decreasing importance)
    """
    print('\nModeling', metric)
    # Index.difference is the explicit set difference; `columns - list` is
    # element-wise subtraction in current pandas.
    X = training[training.columns.difference(list(excludeFeatures))]
    Y = training[metric]
    # To reproduce results, fix the random seed
    seed(1)
    features = X.columns
    # info() prints by itself and returns None, so it is not wrapped in print().
    X.info()
    # as arrays
    x = np.asanyarray(X)
    y = np.asanyarray(Y)

    candidates = [
        ("Linear Regression", LinR),
        ("Gradient Boosting Regressor", GBR),
        ("Random Forest Regressor", RFR),
        ("Decision Tree Regressor", DTR),
    ]
    # Baseline cross-validated predictions, evaluated in candidate order.
    predictions = {name: runCVNumerical(x, y, algo) for name, algo in candidates}
    errors = sorted(
        ({'name': name, 'accuracy': rmse(y, predictions[name])}
         for name, _ in candidates),
        key=lambda k: k['accuracy'])
    for error in errors:
        print(error['name'], ' scores an RMSE value of ', error['accuracy'])

    winner = min(errors, key=lambda e: e['accuracy'])
    print('\nBest performer:')
    print(winner['name'], ' with an RMSE of ', winner['accuracy'])

    tuned = dict(n_estimators=1000, learning_rate=0.09, loss='ls',
                 random_state=652100, max_depth=2, subsample=0.8)
    best = GBR(**tuned)
    best.fit(X, Y)
    gbrPreds = runCVNumerical(x, y, GBR, **tuned)
    gbrFinalErr = rmse(y, gbrPreds)
    print('test rmse = ', gbrFinalErr)

    # Sum importances per feature name, in case a name occurs more than once.
    sum_fi = {}
    for feature, importance in zip(features, best.feature_importances_):
        sum_fi[feature] = sum_fi.get(feature, 0) + importance
    sum_fi_list = sorted(([key, value] for key, value in sum_fi.items()),
                         key=lambda pair: pair[1], reverse=True)

    return ({
        'features': features,
        'predictions': gbrPreds,
        'model': best,
        'importance': sum_fi_list
    })
def analyzeMetricNumerical(metric, training, excludeFeatures, tuning):
    """Pick the regressor with the lowest cross-validated RMSLE and fit it.

    :param metric: name of the target column in ``training``
    :param training: DataFrame holding predictors and target
    :param excludeFeatures: column names left out of the predictors
    :param tuning: when truthy and the winner is not the decision tree, a
        300-stage GBR is fitted regardless of which model won
    :return: dict with keys 'fit' (result of ``best.fit``), 'features'
        (predictor columns) and 'model' (the fitted estimator)
    """
    # Index.difference is the explicit set difference; `columns - list` is
    # element-wise subtraction in current pandas.
    X = training[training.columns.difference(list(excludeFeatures))]
    Y = training[metric]
    # To reproduce results, fix the random seed
    seed(1)
    features = X.columns
    # as arrays
    x = np.asanyarray(X)
    y = np.asanyarray(Y)

    mappings = [
        {'name': "Linear Regression", 'algo': LinR},
        {'name': "Gradient Boosting Regressor", 'algo': GBR},
        {'name': "Random Forest Regressor", 'algo': RFR},
        {'name': "Decision Tree Regressor", 'algo': DTR},
    ]
    # Cross-validated predictions and RMSLE per candidate, in mapping order.
    errors = [
        {'name': entry['name'],
         'accuracy': rmsle(y, runCVNumerical(x, y, entry['algo']))}
        for entry in mappings
    ]
    print(errors, '\n\n')

    winner = min(errors, key=lambda e: e['accuracy'])
    theBest = winner['name']
    bestErr = winner['accuracy']
    bestAlgo = next(d for d in mappings if d["name"] == theBest)['algo']
    print('Best performer = ', theBest, bestErr, bestAlgo)

    # Strings are compared by value. The old `is` test only worked when the
    # interpreter happened to share the two literals.
    if theBest == "Decision Tree Regressor":
        best = bestAlgo()
    elif tuning:
        print('plot_partial_dependence...')
        best = GBR(n_estimators=300)
    else:
        print('no tuning...')
        # Only the ensemble estimators take n_estimators. Linear regression
        # would raise a TypeError on that keyword.
        if bestAlgo in (GBR, RFR):
            best = bestAlgo(n_estimators=300)
        else:
            best = bestAlgo()
    fit = best.fit(X, Y)

    return ({'fit': fit, 'features': features, 'model': best})
# Drop the column named by `variable` from df1 in place.
# NOTE(review): `df1` and `variable` are defined outside the visible span -
# confirm whether this statement belongs to an enclosing loop.
df1.drop([variable], axis=1, inplace=True)

Tips = df1['tip_amount']  # target only (y)
df2 = df1.drop(['tip_amount'], axis=1)  # all the predictors (x)
X_train, X_test, Y_train, Y_test = train_test_split(df2, Tips, test_size=0.20)

# Check shapes
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

# Train the model
gbr = GBR(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1, max_depth=6, verbose=1)
est = gbr.fit(X_train, Y_train)

# Prediction experiment; the MSE value is computed but not stored or printed.
y_test_pred = est.predict(X_test)
mean_squared_error(y_test_pred, Y_test)

# Plot the training loss vs. iteration (one value per boosting stage)
n = np.arange(100) + 1
plt.plot(n, est.train_score_, 'r-')
plt.ylabel('Training Loss')
plt.xlabel('Iteration')

# Lowest mean squared error is 0.1866
# Cross-validation and grid search could be used to tune the parameters in the GradientBoostingRegressor model.
ngb_nll += [-forecast.logpdf(y_test.flatten()).mean()] #print(np.sqrt(mean_squared_error(forecast.loc, y_test))) #for idx, y_p, y_t in zip(test_index, list(forecast.loc), y_test): # print(idx, y_t, y_p, np.abs(y_p - y_t)) if args.verbose or True: print("[%d/%d] BestIter=%d RMSE: Val=%.4f Test=%.4f NLL: Test=%.4f" % (itr+1, args.n_splits, best_itr, np.sqrt(val_rmse[best_itr-1]), np.sqrt(mean_squared_error(forecast.loc, y_test)), ngb_nll[-1])) #logger.tick(forecast, y_test) gbr = GBR(n_estimators=args.n_est, learning_rate=args.lr, subsample=args.minibatch_frac, verbose=args.verbose) gbr.fit(X_train, y_train.flatten()) y_pred = gbr.predict(X_test) forecast = NormalFixedVar(y_pred.reshape((1, -1))) y_gbm += list(y_pred.flatten()) gbm_rmse += [np.sqrt(mean_squared_error(y_pred.flatten(), y_test.flatten()))] if args.verbose or True: print("[%d/%d] GBM RMSE=%.4f" % (itr+1, args.n_splits, np.sqrt(mean_squared_error(y_pred.flatten(), y_test.flatten())))) #gbrlog.tick(forecast, y_test) print('== RMSE GBM=%.4f +/- %.4f, NGB=%.4f +/- %.4f, NLL NGB=%.4f +/ %.4f' % (np.mean(gbm_rmse), np.std(gbm_rmse), np.mean(ngb_rmse), np.std(ngb_rmse),
# -*- coding: UTF-8 -*-
from sklearn.ensemble import GradientBoostingRegressor as GBR
import pandas as pd

# Fit a default GBR on the NSW training file, predict the average load for
# the test file, and count the predictions whose relative error is below 5%.

# Raw strings keep the Windows backslashes literal; without them "\N" is a
# syntax error on Python 3.
trainData = pd.read_csv(r"K:\python\lesson6_experiment2\NSW_TRAIN.csv")
testData = pd.read_csv(r"K:\python\lesson6_experiment2\NSW_TEST.csv")

# Alternative with HOLIDAY (6 feature columns):
# X_train = trainData.loc[:, ['WEEK','HOLIDAY','Min_T','Max_T','AVG_T','RAIN']]
X_train = trainData.loc[:, ['WEEK', 'Min_T', 'Max_T', 'AVG_T', 'RAIN']]  # 5 feature columns
y_train = trainData.iloc[:, -2]  # the average load is the target
X_test = testData.loc[:, ['WEEK', 'Min_T', 'Max_T', 'AVG_T', 'RAIN']]
y_test = testData.iloc[:, -2]

gbr = GBR()
gbr.fit(X_train, y_train)
pre = gbr.predict(X_test)
score = gbr.score(X_test, y_test)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(gbr.feature_importances_)

pre_ele = pd.DataFrame(pre, columns=['pre_avg'])
real_ele = pd.DataFrame(y_test)
after = pd.concat([pre_ele, real_ele], axis=1)
# NOTE(review): the error is relative to the prediction, not the actual
# value - confirm this is the intended definition.
after['error'] = abs(after['AVG_ELE'] - after['pre_avg']) / after['pre_avg']
error_count = after[after['error'] < 0.05].shape[0]
print(error_count)