import pickle

from sklearn.ensemble import GradientBoostingRegressor


def train(targets, features, model_file, params):
    model = GradientBoostingRegressor(**params)
    print("Training hard...")
    model.fit(features, targets)
    print("Saving model...")
    pickle.dump(model, open(model_file, 'wb'))
    return model
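# Usage sketch (illustrative, not from the original source): how the train()
# helper above might be called, assuming plain NumPy arrays for features and
# targets; the file name and hyperparameters here are hypothetical.
import numpy as np
from sklearn.datasets import make_regression

features, targets = make_regression(n_samples=500, n_features=10, noise=0.5, random_state=0)
params = {"n_estimators": 200, "learning_rate": 0.05, "max_depth": 3}
model = train(targets, features, "gbr_model.pkl", params)
print(model.predict(features[:5]))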
def train(self, x, y, param_names, **kwargs):
    start = time.time()
    scaled_x = self._set_and_preprocess(x=x, param_names=param_names)

    # Check that each input is between 0 and 1
    self._check_scaling(scaled_x=scaled_x)

    if self._debug:
        print("Shape of training data: ", scaled_x.shape)
        print("Param names: ", self._used_param_names)
        print("First training sample\n", scaled_x[0])
        print("Encode: ", self._encode)

    # Do a random search
    max_features, learning_rate, max_depth, min_samples_leaf, n_estimators = \
        self._random_search(random_iter=100, x=scaled_x, y=y)

    # Now train model
    gb = GradientBoostingRegressor(loss='ls',
                                   learning_rate=learning_rate,
                                   n_estimators=n_estimators,
                                   subsample=1.0,
                                   min_samples_split=2,
                                   min_samples_leaf=min_samples_leaf,
                                   max_depth=max_depth,
                                   init=None,
                                   random_state=self._rng,
                                   max_features=max_features,
                                   alpha=0.9,
                                   verbose=0)
    gb.fit(scaled_x, y)
    self._model = gb

    duration = time.time() - start
    self._training_finished = True
    return duration
def cross_val_cols(self, n_folds=3):
    """
    Takes in: number of folds

    Prints out RMSE score and stores the results in self.results
    """
    cv = KFold(n=self.X_train.shape[0], n_folds=n_folds)
    gbr = GradientBoostingRegressor(**self.params)
    self.med_error = []
    self.rmse_cv = []
    self.pct_error = []
    self.results = {'pred': [], 'real': []}
    dfFeatures = []
    for train, test in cv:
        gbr.fit(self.X_train[train], self.y_train[train])
        dfFeatures += [unencode(pd.DataFrame(columns=final_cols[:-1], data=self.X_train[test]))]
        pred = gbr.predict(self.X_train[test])
        # Back-transform predictions and targets (same convention as the RMSE line below)
        predExp = np.power(pred, 10)
        testExp = np.power(self.y_train[test], 10)
        medError = median_absolute_error(predExp, testExp)
        percentError = np.median([np.fabs(p - t) / t for p, t in zip(predExp, testExp)])
        error = mean_squared_error(np.power(pred, 10), np.power(self.y_train[test], 10)) ** 0.5
        self.inFeatures = (self.X_train[test])
        self.results['pred'] += list(predExp)
        self.results['real'] += list(testExp)
        self.rmse_cv += [error]
        self.med_error += [medError]
        self.pct_error += [percentError]
    print('Abs Median Error:', np.mean(self.med_error))
    print('Abs Percent Error:', np.mean(self.pct_error))
    print('Mean RMSE:', np.mean(self.rmse_cv))
    self.valDf = pd.concat(dfFeatures)
    self.valDf = self.valDf.reset_index().drop('index', axis=1)
    self.valDf['pred'] = self.results['pred']
    self.valDf['real'] = self.results['real']
    return self.valDf
def train_model(features, label, params):
    # Preprocessing
    # scaled_features = preprocessing.scale(features)
    scaled_features = features
    total_rmse = 0.0
    count = 0
    kf = KFold(len(scaled_features), n_folds=10)
    for train_index, validation_index in kf:
        X_train, X_validation = scaled_features[train_index], scaled_features[validation_index]
        Y_train, Y_validation = label[train_index], label[validation_index]
        # estimator = SVR(**params)
        # estimator = RandomForestRegressor(**params)
        estimator = GradientBoostingRegressor(**params)
        estimator.fit(X_train, Y_train)
        current_rmse = calculate_RMSE(estimator, X_validation, Y_validation)
        total_rmse += current_rmse
        count += 1
    # Average across all folds
    avg_current_rmse = total_rmse / float(count)
    print("Avg Current RMSE " + str(avg_current_rmse))
    return (params, avg_current_rmse)
def check_boston(presort, loss, subsample):
    # Check consistency on dataset boston house prices with least squares
    # and least absolute deviation.
    ones = np.ones(len(boston.target))
    last_y_pred = None
    for sample_weight in None, ones, 2 * ones:
        clf = GradientBoostingRegressor(n_estimators=100,
                                        loss=loss,
                                        max_depth=4,
                                        subsample=subsample,
                                        min_samples_split=2,
                                        random_state=1,
                                        presort=presort)

        assert_raises(ValueError, clf.predict, boston.data)
        clf.fit(boston.data, boston.target,
                sample_weight=sample_weight)
        leaves = clf.apply(boston.data)
        assert_equal(leaves.shape, (506, 100))

        y_pred = clf.predict(boston.data)
        mse = mean_squared_error(boston.target, y_pred)
        assert_less(mse, 6.0)

        if last_y_pred is not None:
            assert_array_almost_equal(last_y_pred, y_pred)

        last_y_pred = y_pred
def test_feature_importance_regression():
    """Test that Gini importance is calculated correctly.

    This test follows the example from [1]_ (pg. 373).

    .. [1] Friedman, J., Hastie, T., & Tibshirani, R. (2001). The elements
       of statistical learning. New York: Springer series in statistics.
    """
    california = fetch_california_housing()
    X, y = california.data, california.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0)

    reg = GradientBoostingRegressor(loss='huber', learning_rate=0.1,
                                    max_leaf_nodes=6, n_estimators=100,
                                    random_state=0)
    reg.fit(X_train, y_train)
    sorted_idx = np.argsort(reg.feature_importances_)[::-1]
    sorted_features = [california.feature_names[s] for s in sorted_idx]

    # The most important feature is the median income by far.
    assert sorted_features[0] == 'MedInc'

    # The three subsequent features are the following. Their relative ordering
    # might change a bit depending on the randomness of the trees and the
    # train / test split.
    assert set(sorted_features[1:4]) == {'Longitude', 'AveOccup', 'Latitude'}
def test_gradient_boosting_early_stopping():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=1000,
                                     n_iter_no_change=10,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbr = GradientBoostingRegressor(n_estimators=1000,
                                    n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Check if early_stopping works as expected
    for est, tol, early_stop_n_estimators in ((gbc, 1e-1, 24), (gbr, 1e-1, 13),
                                              (gbc, 1e-3, 36), (gbr, 1e-3, 28)):
        est.set_params(tol=tol)
        est.fit(X_train, y_train)
        assert_equal(est.n_estimators_, early_stop_n_estimators)
        assert est.score(X_test, y_test) > 0.7

    # Without early stopping
    gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                     max_depth=3, random_state=42)
    gbc.fit(X, y)
    gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1,
                                    max_depth=3, random_state=42)
    gbr.fit(X, y)

    assert gbc.n_estimators_ == 100
    assert gbr.n_estimators_ == 200
def test_gradient_boosting_validation_fraction():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=100,
                                     n_iter_no_change=10,
                                     validation_fraction=0.1,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(n_estimators=100, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    validation_fraction=0.1,
                                    random_state=42)
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Check if validation_fraction has an effect
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # Check if n_estimators_ increase monotonically with n_iter_no_change
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_
def test_plot_partial_dependence():
    # Test partial dependence plot function.
    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(boston.data, boston.target)

    grid_resolution = 25
    fig, axs = plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)],
                                       grid_resolution=grid_resolution,
                                       feature_names=boston.feature_names)
    assert len(axs) == 3
    assert all(ax.has_data for ax in axs)

    # check with str features and array feature names
    fig, axs = plot_partial_dependence(clf, boston.data,
                                       ['CRIM', 'ZN', ('CRIM', 'ZN')],
                                       grid_resolution=grid_resolution,
                                       feature_names=boston.feature_names)
    assert len(axs) == 3
    assert all(ax.has_data for ax in axs)

    # check with list feature_names
    feature_names = boston.feature_names.tolist()
    fig, axs = plot_partial_dependence(clf, boston.data,
                                       ['CRIM', 'ZN', ('CRIM', 'ZN')],
                                       grid_resolution=grid_resolution,
                                       feature_names=feature_names)
    assert len(axs) == 3
    assert all(ax.has_data for ax in axs)
def gbm_fit(params, cv_folds):
    gbm = GradientBoostingRegressor(**params)
    gbm.fit(x_train, y_train)

    # Check accuracy of model
    # No need for validation data because of cross validation
    # Training data is split up into cv_folds folds:
    # Model trained on (cv_folds - 1) of the folds; last fold is saved as validation set
    cv_scores_mse = cross_validation.cross_val_score(gbm, x_train, y_train, cv=cv_folds,
                                                     scoring='mean_squared_error')
    print('\nModel Report')
    print('MSE Score: Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g' %
          (np.mean(cv_scores_mse), np.std(cv_scores_mse),
           np.min(cv_scores_mse), np.max(cv_scores_mse)))
    feat_imp = pd.Series(gbm.feature_importances_, features).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()

    # Check actual performance on test data
    final_predictions = gbm.predict(x_test)
    test['health_score_in_week'] = final_predictions
    test.to_csv(output_file, columns=['user_id', 'date', 'steps', 'total_sleep', 'resting_hr',
                                      'step_week_slope', 'sleep_week_slope', 'hr_week_slope',
                                      'curr_health_score', 'health_score_in_week'])

    # Save the model to file 'health_prediction.pkl'
    joblib.dump(gbm, 'health_prediction.pkl', compress=1)
def test_boston():
    # Check consistency on dataset boston house prices with least squares
    # and least absolute deviation.
    for loss in ("ls", "lad", "huber"):
        for subsample in (1.0, 0.5):
            last_y_pred = None
            for i, sample_weight in enumerate(
                    (None, np.ones(len(boston.target)),
                     2 * np.ones(len(boston.target)))):
                clf = GradientBoostingRegressor(n_estimators=100, loss=loss,
                                                max_depth=4, subsample=subsample,
                                                min_samples_split=1,
                                                random_state=1)

                assert_raises(ValueError, clf.predict, boston.data)
                clf.fit(boston.data, boston.target,
                        sample_weight=sample_weight)
                y_pred = clf.predict(boston.data)
                mse = mean_squared_error(boston.target, y_pred)
                assert mse < 6.0, "Failed with loss %s and mse = %.4f" % (loss, mse)

                if last_y_pred is not None:
                    np.testing.assert_array_almost_equal(
                        last_y_pred, y_pred,
                        err_msg='pred_%d doesnt match last pred_%d for loss %r and subsample %r. '
                                % (i, i - 1, loss, subsample))

                last_y_pred = y_pred
def gbdt_model(trains):
    trains = np.array(trains)
    gbdt = GradientBoostingRegressor(
        loss='ls',
        learning_rate=0.1,
        n_estimators=100,
        subsample=1,
        min_samples_split=2,
        min_samples_leaf=1,
        max_depth=3,
        init=None,
        random_state=None,
        max_features=None,
        alpha=0.9,
        verbose=0,
        max_leaf_nodes=None,
        warm_start=False
    )
    # pdb.set_trace()
    train_set = trains[:, :-1]
    label_set = trains[:, -1]
    gbdt.fit(train_set, label_set)
    return gbdt
def pipeline():
    val = data[data.watch == 0]
    val_a_b = val[['item_id', 'store_code', 'a', 'b']]
    val_x = val.drop(['label', 'watch', 'item_id', 'store_code', 'a', 'b'], axis=1)

    train = data[data.watch != 0]
    train_y = train.label
    a = list(train.a)
    b = list(train.b)
    train_weight = []
    for i in range(len(a)):
        train_weight.append(min(a[i], b[i]))
    train_weight = np.array(train_weight)
    train_x = train.drop(['label', 'watch', 'item_id', 'store_code', 'a', 'b'], axis=1)

    train_x.fillna(train_x.median(), inplace=True)
    val_x.fillna(val_x.median(), inplace=True)

    model = GradientBoostingRegressor(loss='lad', learning_rate=0.01, n_estimators=400,
                                      subsample=0.75, max_depth=6, random_state=1024,
                                      max_features=0.75)
    # train
    model.fit(train_x, train_y, sample_weight=train_weight)
    # predict val set
    val_a_b['pred'] = model.predict(val_x)
    val_a_b.to_csv('gbrt_3.csv', index=None)
def gradient_boosting(features_values_temp, rows_temp, columns_temp, prediction_values_temp, kernel, threshold):
    # kernel: linear, poly, rbf, sigmoid, precomputed

    rows = 0
    while rows_temp > 0:
        rows = rows + 1
        rows_temp = rows_temp - 1

    columns = 0
    while columns_temp > 0:
        columns = columns + 1
        columns_temp = columns_temp - 1

    features_values = [x for x in features_values_temp]
    prediction_values = [y for y in prediction_values_temp]
    rotated = convert_list_to_matrix(features_values, rows, columns)
    scores = np.array(prediction_values)

    threshold = float(threshold)

    estimator = SVR(kernel=kernel)

    # try to change to the model for which the test is gonna run (lasso, ridge, etc.)
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]
    est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                    max_depth=1, random_state=0,
                                    loss='ls').fit(X_train, y_train)
    mean_squared_error(y_test, est.predict(X_test))
def grid_search():
    results_list_of_tuples = list()
    num_folds = 3
    best_result = tuple()
    for item1 in gd_grid['learning_rate']:
        for item2 in gd_grid['max_depth']:
            for item3 in gd_grid['min_samples_leaf']:
                for item4 in gd_grid['n_estimators']:
                    for item5 in gd_grid['random_state']:
                        instance = 'LR {}, max_depth {}, min_samp_leaf {}, n_est {}, rs {}'.format(
                            item1, item2, item3, item4, item5)
                        print(instance)
                        gbrt = GradientBoostingRegressor(random_state=item5,
                                                         n_estimators=item4,
                                                         min_samples_leaf=item3,
                                                         max_depth=item2,
                                                         learning_rate=item1)
                        kf = KFold(X.shape[0], n_folds=num_folds)
                        mse_list = []
                        for train_index, test_index in kf:
                            X_train, X_test = X[train_index], X[test_index]
                            y_train, y_test = y[train_index], y[test_index]
                            w_train, w_test = weights[train_index], weights[test_index]
                            gbrt.fit(X_train, y_train, w_train)
                            y_pred = gbrt.predict(X_test)
                            mse = mean_squared_error(y_test, y_pred, sample_weight=w_test)
                            mse_list.append(mse)
                        kf_mse = np.mean(np.array(mse_list))
                        results_list_of_tuples.append((instance, kf_mse))
    return results_list_of_tuples
def GBRModel(X_train, X_cv, y_train, y_cv):
    targets = get_target_array()
    # print len(train_features)
    # print train_features[0]
    # print len(test_features)

    n_estimators = [50, 100]  # , 1500, 5000]
    max_depth = [3, 8]
    best_GBR = None
    best_mse = float('inf')
    best_score = -float('inf')
    print("################# Performing Gradient Boosting Regression ####################### \n\n\n\n")
    for estm in n_estimators:
        for cur_depth in max_depth:
            # random_forest = RandomForestRegressor(n_estimators=estm)
            regr_GBR = GradientBoostingRegressor(n_estimators=estm, max_depth=cur_depth)
            predictor = regr_GBR.fit(X_train, y_train)
            score = regr_GBR.score(X_cv, y_cv)
            mse = np.mean((regr_GBR.predict(X_cv) - y_cv) ** 2)
            print("Number of estimators used: ", estm)
            print("Tree depth used: ", cur_depth)
            print("Residual sum of squares: %.2f " % mse)
            print("Variance score: %.2f \n" % score)
            if best_score <= score:
                if best_mse > mse:
                    best_mse = mse
                    best_score = score
                    best_GBR = predictor
    print("\nBest score: ", best_score)
    print("Best mse: ", best_mse)
    return best_GBR
def gradient_boosting_regressor(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
    :param train_x: training features
    :param train_y: training targets
    :param pred_x: test set to predict
    :param review_id: takes in a review id
    :param v_curve: run the model for validation curve
    :param l_curve: run the model for learning curve
    :param get_model: run the model
    :return: the predicted values, learning curve, validation curve
    """
    gbr = GradientBoostingRegressor(n_estimators=200, max_depth=7, random_state=7)
    if get_model:
        print("Fitting GBR...")
        gbr.fit(train_x, np.log(train_y + 1))
        gbr_pred = np.exp(gbr.predict(pred_x)) - 1
        # dealing with negative predictions: clip them to zero
        for i in range(len(gbr_pred)):
            if gbr_pred[i] < 0:
                gbr_pred[i] = 0
        Votes = gbr_pred[:, np.newaxis]
        Id = np.array(review_id)[:, np.newaxis]
        submission_gbr = np.concatenate((Id, Votes), axis=1)
        np.savetxt("submission_gbr.csv", submission_gbr, header="Id,Votes",
                   delimiter=',', fmt="%s, %0.2f", comments='')

    # plot validation and learning curves
    if v_curve:
        print("Working on Validation Curves")
        plot_validation_curve(GradientBoostingRegressor(), "Validation Curve: GBR",
                              train_x, np.log(train_y + 1.0),
                              param_name="n_estimators",
                              param_range=[5, 20, 60, 100, 150, 200])
    if l_curve:
        print("Working on Learning Curves")
        plot_learning_curve(GradientBoostingRegressor(), "Learning Curve: GBR",
                            train_x, np.log(train_y + 1.0))
def add_new_weak_learner(self):
    '''
    Summary:
        Adds a new function, h, to self.weak_learners by solving for Eq. 1
        using multiple additive regression trees:

        [Eq. 1] h = argmin_h (sum_i Q_A(s_i,a_i) + h(s_i, a_i) - (r_i + max_b Q_A(s'_i, b)))
    '''
    if len(self.most_recent_episode) == 0:
        # If this episode contains no data, don't do anything.
        return

    # Build up data sets of features and loss terms
    data = np.zeros((len(self.most_recent_episode), self.max_state_features + 1))
    total_loss = np.zeros(len(self.most_recent_episode))

    for i, experience in enumerate(self.most_recent_episode):
        # Grab the experience.
        s, a, r, s_prime = experience

        # Pad in case the state features are too short (as in Atari sometimes).
        features = self._pad_features_with_zeros(s, a)
        loss = (r + self.gamma * self.get_max_q_value(s_prime) - self.get_q_value(s, a))

        # Add to relevant lists.
        data[i] = features
        total_loss[i] = loss

    # Compute new regressor and add it to the weak learners.
    estimator = GradientBoostingRegressor(loss='ls', n_estimators=1, max_depth=self.max_depth)
    estimator.fit(data, total_loss)
    self.weak_learners.append(estimator)
def get_boosting_regressor(x, y, verbose=False):
    """Calculate a GradientBoostingRegressor on predictor and target variables

    Parameters
    ----------
    x : numpy.array
        Predictor variable
    y : numpy.array
        Target variable
    verbose : bool, optional
        If True, output status messages

    Returns
    -------
    regressor : sklearn.ensemble.GradientBoostingRegressor
        A fitted regressor of the predictor and target variable
    """
    if verbose:
        sys.stderr.write('Getting boosting regressor\n')

    clf = GradientBoostingRegressor(n_estimators=50, subsample=0.6,
                                    max_features=100,
                                    verbose=0,
                                    learning_rate=0.1,
                                    random_state=0).fit(x, y)

    clf.feature_importances = pd.Series(clf.feature_importances_,
                                        index=x.columns)
    if verbose:
        sys.stderr.write('Finished boosting regressor\n')
    return clf
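# Usage sketch (illustrative): get_boosting_regressor() indexes x.columns and
# hard-codes max_features=100, so it expects a pandas DataFrame with at least
# 100 predictor columns; the data below is synthetic and purely hypothetical.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
x = pd.DataFrame(rng.rand(300, 120), columns=['feat_%d' % i for i in range(120)])
y = 2.0 * x['feat_0'].values + 0.1 * rng.randn(300)
clf = get_boosting_regressor(x, y, verbose=True)
print(clf.feature_importances.sort_values(ascending=False).head())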
def anm_fit(xy):
    # The (x, y) pair arrives packed in a single tuple argument.
    x, y = xy
    newX = np.array(x).reshape(len(x), 1)
    clf = GradientBoostingRegressor()
    clf.fit(newX, y)
    err = y - clf.predict(newX)
    ret = [clf.score(newX, y)] + list(pearsonr(x, err))
    return ret
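# Note: the original signature was "def anm_fit((x, y))", which relied on
# Python 2 tuple-parameter unpacking; packing the pair into one argument keeps
# the function usable with map(). A minimal sketch with synthetic, purely
# illustrative pairs:
import numpy as np

rng = np.random.RandomState(0)
pairs = [(rng.rand(100), rng.rand(100)) for _ in range(4)]
results = list(map(anm_fit, pairs))  # each element of pairs is the (x, y) tuple
for score, corr, pval in results:
    print(score, corr, pval)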
def train_and_score(i):
    global X_train
    global X_test
    global Y_train
    global dist_train
    global dist_test

    # GBR performed best but we experimented with other models as well (see the paper)
    cl = GradientBoostingRegressor(n_estimators=100, loss='ls', learning_rate=0.1)

    # we add user distance from i-th branch (for which we do prediction) to train set
    dist_from_target_branch_train = dist_train[:, i].reshape((len(dist_train[:, i]), 1))  # dist from i-th branch
    X_train = np.hstack((X_train, dist_from_target_branch_train))

    # we add mean user activity distance from i-th branch (for which we do prediction) to train set
    ab_dist_train = act_branch_dist_train[:, i].reshape((len(act_branch_dist_train[:, i]), 1))  # dist from i-th branch
    X_train = np.hstack((X_train, ab_dist_train))

    # we also experimented with Standard Scaler, without much success
    # mmscaler_train = StandardScaler()
    # X_train = mmscaler_train.fit_transform(X_train)

    cl.fit(X_train, Y_train[:, i])

    # same as above for test set
    dist_from_target_branch_test = dist_test[:, i].reshape((len(dist_test[:, i]), 1))  # dist from i-th branch
    X_test = np.hstack((X_test, dist_from_target_branch_test))
    ab_dist_test = act_branch_dist_test[:, i].reshape((len(act_branch_dist_test[:, i]), 1))  # dist from i-th branch
    X_test = np.hstack((X_test, ab_dist_test))
    # mmscaler_test = StandardScaler()
    # X_test = mmscaler_test.fit_transform(X_test)

    return cl.predict(X_test)
class SupraAxis():
    def __init__(self, axisXIn=1., axisYIn=0.):
        self.reg = None
        self.x = axisXIn
        self.y = axisYIn

    def PrepareModel(self, features, offsets):
        if self.reg is not None:
            return 0

        self.reg = GradientBoostingRegressor()
        offsets = np.array(offsets)
        labels = offsets[:, 0] * self.x + offsets[:, 1] * self.y

        if not np.all(np.isfinite(labels)):
            raise Exception("Training labels contains non-finite value(s), either NaN or infinite")

        self.reg.fit(features, labels)

    def IsModelReady(self):
        return self.reg is not None

    def ClearModel(self):
        self.reg = None

    def GetFeatureImportance(self):
        return self.reg.feature_importances_
def CaGBMModel(X_train, Y_train, X_test, Y_test, cv_iterator):
    #===========================================================================
    # modelCV = GradientBoostingRegressor(subsample=1, random_state=42)
    # param_grid = {'loss': ['ls'],
    #               'learning_rate': [0.1],
    #               'n_estimators': [100],
    #               'max_depth': [5, 50, 150],
    #               'min_samples_split': [2],
    #               'min_samples_leaf': [5, 15, 30],
    #               'max_features': ["auto"]
    #               }
    #
    # search = GridSearchCV(modelCV, param_grid, scoring="mean_squared_error", cv=cv_iterator, n_jobs=-1)
    # search.fit(X_train, Y_train["P"])
    # search.grid_scores_
    # model = search.best_estimator_
    # mse = search.best_score_
    # print(time.strftime("%H:%M:%S"))
    #===========================================================================

    gbm = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100,
                                    max_depth=50, min_samples_leaf=20,
                                    max_features=None, random_state=76)
    gbm.fit(X_train, Y_train["Ca"])
    yhat_gbm = gbm.predict(X_test)

    test_error = math.sqrt(mean_squared_error(Y_test["Ca"], yhat_gbm))
    return gbm, test_error
def build_models(self):
    self.remove_columns(
        [
            "institute_latitude",
            "institute_longitude",
            "institute_state",
            "institute_country",
            "var10",
            "var11",
            "var12",
            "var13",
            "var14",
            "var15",
            "instructor_past_performance",
            "instructor_association_industry_expert",
            "secondary_area",
            "var24",
        ]
    )

    model1 = GradientBoostingRegressor(learning_rate=0.1, n_estimators=200, subsample=0.8)
    model2 = RandomForestRegressor(n_estimators=50)
    model3 = ExtraTreesRegressor(n_estimators=50)

    model1.fit(self.X, self.y)
    model2.fit(self.X, self.y)
    model3.fit(self.X, self.y)

    return [model1, model2, model3]
def fit(filename, treename, inputsname, targetname, workingpoint=0.9, test=False):
    # Reading inputs and targets
    ninputs = len(inputsname)
    branches = copy.deepcopy(inputsname)
    branches.append(targetname)
    data = root2array(filename, treename=treename, branches=branches)
    data = data.view((np.float64, len(data.dtype.names)))

    # Extract and format inputs and targets from numpy array
    inputs = data[:, range(ninputs)].astype(np.float32)
    targets = data[:, [ninputs]].astype(np.float32).ravel()

    # if test requested, use 60% of events for training and 40% for testing
    inputs_train = inputs
    targets_train = targets
    if test:
        inputs_train, inputs_test, targets_train, targets_test = cross_validation.train_test_split(
            inputs, targets, test_size=0.4, random_state=0)

    # Define and fit quantile regression (quantile = workingpoint)
    # Default training parameters are used
    regressor = GradientBoostingRegressor(loss='quantile', alpha=workingpoint)
    regressor.fit(inputs_train, targets_train)

    if test:
        # Compare regression prediction with the true value and count the fraction of time it falls below
        # This should give the working point value
        predict_test = regressor.predict(inputs_test)
        compare = np.less(targets_test, predict_test)
        print('Testing regression with inputs', inputsname, 'and working point', workingpoint)
        print('  Test efficiency =', float(list(compare).count(True)) / float(len(compare)))
        # TODO: add 1D efficiency graphs vs input variables

    return regressor
def fit(self, data_train, target):
    self.target_train = target
    self.catcol = data_train.filter(like='var').columns.tolist()

    #start_gbr_tr = time.clock()
    self.gbr = GradientBoostingRegressor(n_estimators=self.nest, max_depth=7)
    self.gbr.fit(data_train, self.target_train)
    self.transformed_train_gbr = self.gbr.transform(data_train, threshold="0.35*mean")
    self.gbr_tr_fit = GradientBoostingRegressor(n_estimators=self.nest, max_depth=7)
    self.gbr_tr_fit.fit(self.transformed_train_gbr, self.target_train)
    #end_gbr_tr = time.clock()
    #print >> log, "time_gbr_tr = ", end_gbr_tr-start_gbr_tr

    #start_xfr_tr = time.clock()
    self.xfr = ExtraTreesRegressor(n_estimators=self.nest, max_depth=7)
    self.xfr.fit(data_train, self.target_train)
    self.transformed_train_xfr = self.xfr.transform(data_train, threshold="0.35*mean")
    self.xfr_tr_fit = ExtraTreesRegressor(n_estimators=self.nest, max_depth=7)
    self.xfr_tr_fit.fit(self.transformed_train_xfr, self.target_train)
    #end_xfr_tr = time.clock()
    #print >> log, "time_xfr_tr = ", end_xfr_tr-start_xfr_tr

    #start_gbr_cat = time.clock()
    self.gbr_cat_fit = GradientBoostingRegressor(n_estimators=self.nest, max_depth=7)
    self.gbr_cat_fit.fit(data_train[self.catcol], self.target_train)
    #end_gbr_cat = time.clock()
    #print >> log, "time_gbr_cat = ", end_gbr_cat-start_gbr_cat

    #start_xfr_cat = time.clock()
    self.xfr_cat_fit = ExtraTreesRegressor(n_estimators=self.nest, max_depth=7)
    self.xfr_cat_fit.fit(data_train[self.catcol], self.target_train)
    #end_xfr_cat = time.clock()
    #print >> log, "time_xfr_cat = ", end_xfr_cat-start_xfr_cat

    return self
def modelTheData(data, target):
    # params = {'n_estimators': 400, 'max_depth': 4, 'min_samples_split': 2,
    #           'subsample': 0.5, 'min_samples_leaf': 2,
    #           'learning_rate': 0.01, 'loss': 'ls'}

    # beijing
    myMachine = GradientBoostingRegressor(alpha=0.9, init=None, learn_rate=None,
                                          learning_rate=0.05, loss='ls', max_depth=1,
                                          max_features=None, min_samples_leaf=2,
                                          min_samples_split=2, n_estimators=300,
                                          random_state=None, subsample=0.5, verbose=0)

    # shanghai
    # myMachine = GradientBoostingRegressor(alpha=0.9, init=None, learn_rate=None,
    #                                       learning_rate=0.05, loss='ls', max_depth=3,
    #                                       max_features=None, min_samples_leaf=2,
    #                                       min_samples_split=2, n_estimators=500,
    #                                       random_state=None, subsample=0.5, verbose=0)

    # myMachine = GradientBoostingRegressor(**params)

    myMachine.fit(data, target)
    return myMachine
def testingGBM(X_train, Y_train, X_test, Y_test):
    params = {'verbose': 2, 'n_estimators': 100, 'max_depth': 50, 'min_samples_leaf': 20,
              'learning_rate': 0.1, 'loss': 'ls', 'max_features': None}
    test_init = Ridge(alpha=0.1, normalize=True, fit_intercept=True)
    gbm2 = GradientBoostingRegressor(**params)
    gbm2.fit(X_train, Y_train["Ca"])
    yhat_gbm = gbm2.predict(X_test)
    mean_squared_error(Y_test["Ca"], yhat_gbm)
    math.sqrt(mean_squared_error(Y_test["Ca"], yhat_gbm))

    test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
    for i, y_pred in enumerate(gbm2.staged_decision_function(X_test)):
        test_score[i] = mean_squared_error(Y_test["Ca"], y_pred)

    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.title('Deviance')
    plt.plot(np.arange(params['n_estimators']) + 1, gbm2.train_score_, 'b-',
             label='Training Set Deviance')
    plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
             label='Test Set Deviance')
    plt.legend(loc='upper right')
    plt.xlabel('Boosting Iterations')
    plt.ylabel('Deviance')
    plt.show()
def gbdrtrain(x, y, pre_x):
    x, pre_x = datscater(x, pre_x)
    clf = GradientBoostingRegressor(n_estimators=740, min_samples_leaf=0.8, min_samples_split=40,
                                    learning_rate=0.1, max_depth=7, random_state=400,
                                    loss='huber').fit(x, y)
    # clf = GradientBoostingRegressor(n_estimators=200, max_leaf_nodes=20, learning_rate=0.1,
    #                                 max_depth=6, random_state=400, loss='ls').fit(x, y)
    pred = clf.predict(pre_x)
    return pred
def impute(df, imp_val, headers):
    if np.isnan(imp_val):
        imp_val = -500
    log("imputing...", 1)
    model = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100,
                                      subsample=1.0, min_samples_split=2, min_samples_leaf=1,
                                      min_weight_fraction_leaf=0.0, max_depth=3, init=None,
                                      random_state=None, max_features=None, alpha=0.9,
                                      verbose=0, max_leaf_nodes=None, warm_start=False,
                                      presort='auto')
    data = np.array(df[headers].get_values())
    data[np.isnan(data)] = -500
    for col in range(0, len(headers)):
        # print "Working on column: " + str(col)
        ## for the current column, remove rows where the current (row,column) value equals the imputation marker
        ## this way we are only training on data with observed target values
        reduced_data = data[np.logical_not(data[:, col] == imp_val)]
        target_set = reduced_data[:, col]
        training_set = np.delete(reduced_data, col, 1)
        model.fit(training_set, target_set)
        row_num = 0
        for row in data:
            remaining = np.delete(row, col, 0)
            if data[row_num, col] == imp_val:
                # predict expects a 2-D array, so reshape the single sample
                data[row_num, col] = model.predict(remaining.reshape(1, -1))[0]
            row_num += 1
    cntr = 0
    for h in headers:
        df[h] = data[:, cntr]
        cntr += 1
    return df
    df = df.drop('day', axis=1)
    df['is_Holiday'] = df['month'].apply(
        lambda x: 1 if x in ['Apr', 'May', 'Jun', 'Nov'] else 0)
    df = df.drop('month', axis=1)
    df = df.drop(['title', 'cast'], axis=1)
    df = pd.get_dummies(df, prefix='is')  # Quantify all is_ columns!!
    df['vote_average'] = df['vote_average'].fillna(df['vote_average'].mean())
    return df


X, Y = rgf.drop('revenue', axis=1), rgf['revenue']
X = regression_engineering(X)
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, train_size=0.75, test_size=0.25)  # randomly separating training and test set

reg = GradientBoostingRegressor()
reg.fit(train_X, train_Y)  # Train regressor model
print('Regressor Score: ', reg.score(test_X, test_Y))

# Compare with dummy regressor!!
dummy = DummyRegressor()
dummy.fit(train_X, train_Y)
print('Dummy Regressor Score: ', dummy.score(test_X, test_Y))

sns.set_style('whitegrid')
plt.figure(figsize=(12, 14))
sns.barplot(x=reg.feature_importances_, y=X.columns)
plt.savefig('regressor.png')

# Classification: Predicting Movie Success
cls = movies_df[movies_df['return'].notnull()]
def test_zero_estimator_reg():
    # Test if ZeroEstimator works for regression.
    est = GradientBoostingRegressor(n_estimators=20, max_depth=1,
                                    random_state=1, init=ZeroEstimator())
    est.fit(boston.data, boston.target)
    y_pred = est.predict(boston.data)
    mse = mean_squared_error(boston.target, y_pred)
    assert_almost_equal(mse, 33.0, decimal=0)

    est = GradientBoostingRegressor(n_estimators=20, max_depth=1,
                                    random_state=1, init='zero')
    est.fit(boston.data, boston.target)
    y_pred = est.predict(boston.data)
    mse = mean_squared_error(boston.target, y_pred)
    assert_almost_equal(mse, 33.0, decimal=0)

    est = GradientBoostingRegressor(n_estimators=20, max_depth=1,
                                    random_state=1, init='foobar')
    assert_raises(ValueError, est.fit, boston.data, boston.target)
bg.score(X_test, y_test)

# AdaBoost
regr = AdaBoostRegressor()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

# Decision tree
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

# Gradient boosting
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
gb.score(X_train, y_train)
gb.score(X_test, y_test)

# KNN
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn = KNeighborsRegressor(algorithm='brute')
knn.fit(X_train, y_train)
knn.score(X_train, y_train)
knn.score(X_test, y_test)

# Voting regressor
from sklearn.ensemble import VotingRegressor
reg1 = GradientBoostingRegressor()
from encode import create_df_cate_to_numeric
from init import split_train_test
import pandas as pd
import numpy as np
import datetime
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor

train_df = pd.read_csv('./data/train.csv')
n_train_df = create_df_cate_to_numeric(train_df)
n_train_df.fillna(0, inplace=True)

X_train, X_test, y_train, y_test = split_train_test(n_train_df, test_size=0.01)

y_label = 'SalePrice'
df_y = n_train_df[y_label]
df_X = n_train_df.drop(y_label, axis=1)

grbt = GradientBoostingRegressor(n_estimators=1500, learning_rate=0.5)
grbt.fit(X_train, y_train)

test_df = pd.read_csv('./data/test.csv')
n_test_df = create_df_cate_to_numeric(test_df)
n_test_df.fillna(0, inplace=True)

y_pred = grbt.predict(n_test_df)

result = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': y_pred})
now = datetime.datetime.today().strftime('%Y-%m-%d_%H:%M')
file_name = './submissions/result' + now + '.csv'
result.to_csv(file_name, index=False)
                          lambda_2=1e-06, n_iter=30, normalize=False,
                          tol=0.0000001, verbose=True)

myGBR = GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                                  learning_rate=0.01, loss='huber', max_depth=14,
                                  max_features='sqrt', max_leaf_nodes=None,
                                  min_impurity_decrease=0.0, min_impurity_split=None,
                                  min_samples_leaf=10, min_samples_split=40,
                                  min_weight_fraction_leaf=0.0, n_estimators=300,
                                  presort='auto', random_state=10, subsample=0.8,
                                  verbose=0, warm_start=False)

RF_model = RandomForestRegressor(n_estimators=50, max_depth=25,
                                 min_samples_split=20, min_samples_leaf=10,
                                 max_features='sqrt',
    terms_to_sum = [(np.log(y_pred[i] + 1) - np.log(y[i] + 1)) ** 2.0
                    for i, pred in enumerate(y_pred)]
    return (sum(terms_to_sum) * (1.0 / len(y))) ** 0.5


if not os.path.exists("data.dat"):
    data = pd.read_csv("D:\\userdata\\bellas\\Downloads\\train.csv", header=0)
    data.set_index(["ID"], inplace=True)
    data.to_pickle("data.dat")
else:
    data = pd.read_pickle("data.dat")

if not os.path.exists("model.dat"):
    X = data.drop(["target"], axis=1)
    targets = data.target
    gb = GradientBoostingRegressor()
    gb.fit(X, targets)
    with open("model.dat", "wb") as f:
        pickle.dump(gb, f)
else:
    with open("model.dat", "rb") as f:
        gb = pickle.load(f)

if not os.path.exists("test_data.dat"):
    test_data = pd.read_csv("D:\\userdata\\bellas\\Downloads\\test.csv", header=0)
    test_data.set_index(["ID"], inplace=True)
    test_data.to_pickle("test_data.dat")
else:
    test_data = pd.read_pickle("test_data.dat")
y = np.log1p(df['OBO']).values
X = df.iloc[:, 4:].values

# transformation
#rb = RobustScaler()
#X = rb.fit_transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# gb ==========================================================================
mod_gb = GradientBoostingRegressor(random_state=1337)
mod_gb.fit(X_train, y_train)

# Predicting the Test set results
y_pred = mod_gb.predict(X_test)
mape_gb = np.mean(
    np.abs((np.expm1(y_test) - np.expm1(y_pred)) / np.expm1(y_test))) * 100

# Print model report:
print("\nGB Model Report")
print("MAPE : %.2f" % mape_gb)

# hyperparameters tuning
def my_scorer(y_true, y_pred):
    mape = np.mean(
        np.abs((np.expm1(y_true) - np.expm1(y_pred)) / np.expm1(y_true))) * 100
def Regression(train_data, train_solution, test_data, test_solution, method):
    ## Fix Data Structure ##
    train_data = train_data.values
    train_solution = train_solution.values
    test_data = test_data.values
    test_solution = test_solution.values

    ## List of Method Options with Initialization ##
    if method == 'lin_reg':  # linear regression
        from sklearn.linear_model import LinearRegression
        reg = LinearRegression()
    elif method == 'ply_reg':  # polynomial regression
        from sklearn.linear_model import LinearRegression
        reg = LinearRegression()
        poly_features = PolynomialFeatures(degree=2)
    elif method == 'rdg_reg':  # ridge regression
        from sklearn.linear_model import Ridge
        reg = Ridge()
    elif method == 'lso_reg':  # lasso regression
        from sklearn.linear_model import Lasso
        reg = Lasso(alpha=0.00001)
    elif method == 'ela_net':  # elastic net regression
        from sklearn.linear_model import ElasticNet
        reg = ElasticNet()
    elif method == 'svr_lin':  # SVM regression
        from sklearn.svm import LinearSVR
        reg = LinearSVR(epsilon=0.01, max_iter=10000)
    elif method == 'svr_2nd':  # SVR regression
        from sklearn.svm import SVR
        reg = SVR(kernel='poly', degree=2, epsilon=0.01)  # C=100
    elif method == 'svr_3rd':  # SVR regression
        from sklearn.svm import SVR
        reg = SVR(kernel='poly', degree=3, epsilon=0.01)  # C=100
    elif method == 'dcn_tre':  # decision tree
        from sklearn.tree import DecisionTreeRegressor
        reg = DecisionTreeRegressor()
    elif method == 'rdm_for':  # random forests
        from sklearn.ensemble import RandomForestRegressor
        reg = RandomForestRegressor(n_estimators=100, random_state=3)
    elif method == 'ada_bst':  # AdaBoost Regressor
        from sklearn.ensemble import AdaBoostRegressor
        reg = AdaBoostRegressor(n_estimators=100, random_state=3)
    elif method == 'grd_bst':  # Gradient Boosting Regressor
        from sklearn.ensemble import GradientBoostingRegressor
        reg = GradientBoostingRegressor(random_state=3)
    elif method == 'gss_prc':  # Gaussian Process Regressor
        from sklearn.gaussian_process import GaussianProcessRegressor
        reg = GaussianProcessRegressor(random_state=3)
    elif method == 'knl_rdg':  # Kernel Ridge Regression
        from sklearn.kernel_ridge import KernelRidge
        reg = KernelRidge()
    elif method == 'nst_nbr_uni':  # K Nearest Neighbors Regressor
        from sklearn.neighbors import KNeighborsRegressor
        reg = KNeighborsRegressor(weights='uniform')
    elif method == 'nst_nbr_dst':  # K Nearest Neighbors Regressor
        from sklearn.neighbors import KNeighborsRegressor
        reg = KNeighborsRegressor(weights='distance')
    elif method == 'rad_nbr_uni':  # Radius Neighbor Regressor
        from sklearn.neighbors import RadiusNeighborsRegressor
        reg = RadiusNeighborsRegressor(weights='uniform')
    elif method == 'rad_nbr_dst':  # Radius Neighbor Regressor
        from sklearn.neighbors import RadiusNeighborsRegressor
        reg = RadiusNeighborsRegressor(weights='distance')
    elif method == 'mlp_reg':
        from sklearn.neural_network import MLPRegressor
        reg = MLPRegressor(random_state=3)
    else:
        print('Error: Regression method not recognized.\nPlease pick a valid method key (example: xxx_xxx).')

    ## Preprocessing and Setup ##
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    data = scaler.fit_transform(train_data)
    scaler = StandardScaler()
    test_data = scaler.fit_transform(test_data)
    solution = train_solution.reshape(-1, )
    if method == 'ply_reg':
        data = poly_features.fit_transform(data)
    reg.fit(data, solution)

    if len(test_data) < 5:
        predictions = reg.predict(data)
    elif len(test_data) > 5:
        if method == 'ply_reg':
            test_data = poly_features.transform(test_data)
        test_solution = test_solution.reshape(-1, )
        predictions_test = reg.predict(test_data)
        solution = test_solution
        predictions = predictions_test
    else:
        print('Error: test_set undetermined.')

    Matrix_to_save = pd.DataFrame()
    Matrix_to_save['Solution'] = solution
    Matrix_to_save['Predictions'] = predictions

    return Matrix_to_save
df = pd.read_csv('ml_house_data_set.csv')

del df['house_number']
del df['unit_number']
del df['street_name']
del df['zip_code']

features_df = pd.get_dummies(df, columns=['garage_type', 'city'])
del features_df['sale_price']

X = features_df.as_matrix()
y = df['sale_price'].as_matrix()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

model = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, max_depth=6,
                                  min_samples_leaf=9, max_features=0.1, loss='huber')
model.fit(X_train, y_train)

# joblib.dump(model, 'trained_house_classifier_model.pkl')

mae = mean_absolute_error(y_train, model.predict(X_train))
print('Training set mean absolute error: %.4f' % mae)

mae = mean_absolute_error(y_test, model.predict(X_test))
print('Test set mean absolute error: %.4f' % mae)
def test_regression_synthetic():
    # Test on synthetic regression datasets used in Leo Breiman,
    # "Bagging Predictors". Machine Learning 24(2): 123-140 (1996).
    random_state = check_random_state(1)
    regression_params = {'n_estimators': 100, 'max_depth': 4,
                         'min_samples_split': 2, 'learning_rate': 0.1,
                         'loss': 'ls'}

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=random_state,
                                   noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    for presort in True, False:
        clf = GradientBoostingRegressor(presort=presort)
        clf.fit(X_train, y_train)
        mse = mean_squared_error(y_test, clf.predict(X_test))
        assert_less(mse, 5.0)

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    for presort in True, False:
        regression_params['presort'] = presort
        clf = GradientBoostingRegressor(**regression_params)
        clf.fit(X_train, y_train)
        mse = mean_squared_error(y_test, clf.predict(X_test))
        assert_less(mse, 1700.0)

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    for presort in True, False:
        regression_params['presort'] = presort
        clf = GradientBoostingRegressor(**regression_params)
        clf.fit(X_train, y_train)
        mse = mean_squared_error(y_test, clf.predict(X_test))
        assert_less(mse, 0.015)
# 4
# ==============================================================================
train.index = range(train.shape[0])
test.index = range(test.shape[0])
pred.index = range(pred.shape[0])

train1 = test[(test.record_date >= '20160401') & (test.record_date < '20160501')]
train = pd.concat([train, train1])
test = test[(test.record_date >= '20160501') & (test.record_date < '20160701')]

featurelist = [i for i in train.columns if i not in df.columns]

gbdt = GradientBoostingRegressor(random_state=seed)
# gbdt = RandomForestRegressor(random_state=seed)
# gbdt = ExtraTreesRegressor(n_estimators=10, random_state=seed)
# gbdt = lgb.LGBMRegressor(max_depth=2, learning_rate=0.05, n_estimators=3000, reg_alpha=10)
# gbdt = xgb.XGBRegressor(max_depth=2, learning_rate=0.1, n_estimators=2000, reg_alpha=5, gamma=10)
# gbdt = xgb.XGBRegressor(max_depth=7, learning_rate=0.1, n_estimators=200, reg_alpha=5, gamma=10)
gbdt = gbdt.fit(train[featurelist], train.power_consumption)

test['power_consumptionbk'] = test['power_consumption']
test.power_consumption = gbdt.predict(test[featurelist])
test.power_consumption = test.power_consumption.astype(int)
pred.power_consumption = gbdt.predict(pred[featurelist])

from matplotlib import pyplot as plt
plt.figure()
#############################################################################################################################################
# parameters : regression models ############################################################################################################
#############################################################################################################################################
randomforest = RandomForestRegressor(n_estimators=10, max_depth=10, n_jobs=20,
                                     random_state=2017, max_features="auto", verbose=1)
adaboost = AdaBoostRegressor(n_estimators=500, random_state=2017, learning_rate=0.01)
gbdt = GradientBoostingRegressor(n_estimators=500, learning_rate=0.04, subsample=0.8,
                                 random_state=2017, max_depth=5, verbose=1)
extratree = ExtraTreesRegressor(n_estimators=600, max_depth=8, max_features="auto",
                                n_jobs=20, random_state=2017, verbose=1)
lr_reg = LinearRegression(n_jobs=20)
# KNeighborsRegressor accepts neither random_state nor verbose, so they are dropped here
kNN = KNeighborsRegressor(n_neighbors=10, n_jobs=20)

#############################################################################################################################################
# parameters : regression ###################################################################################################################
#############################################################################################################################################
def run(fold):
    df = pd.read_csv(config.AIRBNB_TRAIN_FOLDS_FILE).drop("id", axis=1)

    attribs = [f for f in df.columns if f not in ["price", "kfold"]]
    cat_attribs = ["zipcode", "neighbourhood_cleansed", "room_type",
                   "bed_type", "cancellation_policy", "security_deposit"]
    num_attribs = [f for f in attribs if f not in cat_attribs]

    df_train, df_valid = df[df.kfold != fold], df[df.kfold == fold]
    X_train, y_train = df_train[attribs].copy(), df_train["price"].copy()
    X_valid, y_valid = df_valid[attribs].copy(), df_valid["price"].copy()

    num_transformer = Pipeline([
        ("num_cleaner", NumAttributesCleaner(num_attribs))])
    cat_transformer = Pipeline([
        ("cat_clener", CatAttributesCleaner(cat_attribs)),
        ("ohe", OneHotEncoder())])
    transformer = ColumnTransformer([
        ("num", num_transformer, num_attribs),
        ("cat", cat_transformer, cat_attribs)])

    X_train = transformer.fit_transform(X_train)
    X_valid = transformer.transform(X_valid)

    #selector = SelectKBest(f_regression, 20)
    #rf = RandomForestRegressor(max_depth=2, random_state=42)
    #selector = SelectFromModel(estimator=rf, max_features=40)
    #X_train = selector.fit_transform(X_train, y_train)
    #X_valid = selector.transform(X_valid)

    model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1,
                                      max_depth=7, loss='ls', random_state=42)
    model.fit(X_train, y_train)

    train_preds = model.predict(X_train)
    valid_preds = model.predict(X_valid)

    train_scores = {"r2_score": metrics.r2_score(y_train, train_preds),
                    "rmse_score": np.sqrt(metrics.mean_squared_error(y_train, train_preds))}
    valid_scores = {"r2_score": metrics.r2_score(y_valid, valid_preds),
                    "rmse_score": np.sqrt(metrics.mean_squared_error(y_valid, valid_preds))}

    print(f"Fold={fold} (train): ",
          f"r2_score = {train_scores['r2_score'].round(2)}", "--- ",
          f"rmse_score = {train_scores['rmse_score'].round(2)}")
    print(f"Fold={fold} (valid): ",
          f"r2_score = {valid_scores['r2_score'].round(2)}", "--- ",
          f"rmse_score = {valid_scores['rmse_score'].round(2)}")
    print("")

    joblib.dump(model, os.path.join(config.MODEL_OUTPUT, f"gb_{fold}.bin"))
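# Driver sketch (assumption: the kfold column in the prepared CSV holds values
# 0..4; the argparse wiring below is illustrative, not from the original script).
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_folds", type=int, default=5)
    args = parser.parse_args()
    for fold in range(args.n_folds):
        run(fold)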
print('10-fold Cross Validation: ')
print("Tuned Ridge Parameter: {}".format(gs.best_params_))
print("Tuned Ridge R2: {}".format(r2))
print("Tuned Ridge MSE: {}".format(mse))

plt.plot(y_pred)
plt.plot(y_test_3)
plt.title('Ridge Regression Result on Test Set: PCT 9MO FWD')
plt.legend(['Predict', 'Real'])
plt.show()

# Ensembling by using GradientBoostingRegressor
import time
N = {'n_estimators': [50, 100, 200, 300, 400, 500, 600]}
from sklearn.ensemble import GradientBoostingRegressor
model_4 = GradientBoostingRegressor(max_depth=4, random_state=16)
gb = GridSearchCV(model_4, N, cv=10)
gb.fit(X_train_3, y_train_3)
y_pred = gb.predict(X_test_3)
r2 = gb.score(X_test_3, y_test_3)
mse = mean_squared_error(y_pred, y_test_3)

print('Ensemble by GradientBoostingRegressor: ')
print("GradientBoostingRegressor Parameter: {}".format(gb.best_params_))
print("GradientBoostingRegressor R2: {}".format(r2))
print("GradientBoostingRegressor MSE: {}".format(mse))

plt.plot(y_pred)
plt.plot(y_test_3)
plt.title('GradientBoostingRegressor Result on Test Set: PCT 9MO FWD')
plt.legend(['Predict', 'Real'])
from sklearn.ensemble import GradientBoostingRegressor

with open('the_value_from_spark.txt', 'r') as f:
    the_value = int(f.read())

dt = {
    'predictor': {key + 1: key + 1 for key in range(the_value * 2)},
    'res': {key + 1: 0 for key in range(the_value * 2)}
}
dt['res'][the_value] = the_value

df = pd.DataFrame(dt)
X_train = df.loc[:, ['predictor']]
y_train = df.loc[:, ['res']]

model = GradientBoostingRegressor(n_estimators=2000, learning_rate=0.1, loss='ls').fit(X_train, y_train)

X_calc = pd.DataFrame({'predictor': {0: the_value, 1: the_value - 10}})
Y_res = model.predict(X_calc)

res_df = pd.DataFrame(Y_res, columns=['res']).round()
res = int(res_df.iloc[0] - res_df.iloc[1])

with open('the_value_from_ml.txt', 'w') as f:
    f.write(str(res))
from math import sqrt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

from Python.basic.data_preparation import get_data_regression

if __name__ == '__main__':
    x_train, x_test, y_train, y_test = get_data_regression()

    params = {
        'n_estimators': 500,
        'max_depth': 4,
        'min_samples_split': 2,
        'learning_rate': 0.01,
        'loss': 'ls'
    }

    regressor = GradientBoostingRegressor(**params)
    regressor.fit(x_train, y_train)
    y_pred = regressor.predict(x_test)

    error_standard_deviation = sqrt(mean_squared_error(y_test, y_pred))
    print(error_standard_deviation)
regTreeModel = tree.DecisionTreeRegressor(
    max_features=par[0], max_depth=par[1], min_samples_split=par[2],
    min_samples_leaf=par[3], min_weight_fraction_leaf=par[4], max_leaf_nodes=18)
fitModel = linear_model.LinearRegression()
Yp, Yptrain, regTreeModel, fitModelList, predind = SSRS.RegressionTree(
    X_train, X_test, Y_train, Y_test, regTreeModel, fitModel, Field, doFitSelection=0)
rmse, rmse_band = SSRS.RMSEcal(Yp, Y_test)
rmse_train, rmse_band_train = SSRS.RMSEcal(Yptrain, Y_train)
print(rmse)
print(rmse_train)

# predict correlation
regModel = tree.DecisionTreeRegressor(
    max_features=0.3, max_depth=20, min_samples_split=3, min_samples_leaf=5,
    min_weight_fraction_leaf=0.5)
regModel = GradientBoostingRegressor()
Yp, rmse, rmse_train, rmse_band, rmse_band_train = SSRS.Regression(
    X_train, X_test, Y_train, Y_test, multiband=0, regModel=regModel, doplot=0)
print(rmse)
print(rmse_train)

regModel.fit(X_train, Y_train)
savedir = r"/Volumes/wrgroup/Kuai/USGSCorr/figure_tree/"
savedir = r"Y:\Kuai\USGSCorr\figure_tree\\"
with open(savedir + "tree.dot", 'w') as f:
    f = tree.export_graphviz(regModel, out_file=f, feature_names=Field,
                             label='none', node_ids=True)
os.system("dot -Tpng tree.dot -o tree.png")
X = dataset[:, 0:50]   # ni_n[47], na_n[1], V, T
y = dataset[:, 50:51]  # RD_mol[47], RD_at[1]

print(dataset.shape)
print(X.shape)
print(y.shape)

# Instantiate the machine learning models
SupportVectorMachine = SVR()
KernelRidge = KernelRidge()
MultiLayerPerceptron = MLPRegressor()
KNeighbors = KNeighborsRegressor()
ExtraTree = ExtraTreesRegressor()
DecisionTree = DecisionTreeRegressor()
RandomForest = RandomForestRegressor()
GradientBoosting = GradientBoostingRegressor()
HistGradientBoosting = HistGradientBoostingRegressor()

# Instantiate the machine learning models with pipelines
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

SupportVectorMachine_pipe = Pipeline([('standardize', StandardScaler()), ('regressor', SupportVectorMachine)])
KernelRidge_pipe = Pipeline([('standardize', StandardScaler()), ('regressor', KernelRidge)])
MultiLayerPerceptron_pipe = Pipeline([('standardize', StandardScaler()), ('regressor', MultiLayerPerceptron)])
KNeighbors_pipe = Pipeline([('standardize', StandardScaler()), ('regressor', KNeighbors)])
ExtraTree_pipe = Pipeline([('standardize', StandardScaler()), ('regressor', ExtraTree)])
DecisionTree_pipe = Pipeline([('standardize', StandardScaler()), ('regressor', DecisionTree)])
RandomForest_pipe = Pipeline([('standardize', StandardScaler()), ('regressor', RandomForest)])
GradientBoosting_pipe = Pipeline([('standardize', StandardScaler()), ('regressor', GradientBoosting)])
HistGradientBoosting_pipe = Pipeline([('standardize', StandardScaler()), ('regressor', HistGradientBoosting)])
def fit_gbr(self):
    self.model = GradientBoostingRegressor(n_estimators=self.n_estimators,
                                           max_depth=self.max_depth)
    self.model.fit(self.x, self.y)
def test_multi_target_regression_one_target():
    # Test that multi target regression raises on a single-target y
    X, y = datasets.make_regression(n_targets=1)
    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    assert_raises(ValueError, rgr.fit, X, y)
dataset['relayNo'] = dataset['relayNo'].astype('category')

x = dataset.iloc[:, 4:6].values
y = dataset.iloc[:, 7:].values
print(dataset.dtypes)

# Split the dataset into the training set and test set
# We're splitting the data in 1/3, so out of 30 rows, 20 rows will go into the training set,
# and 10 rows will go into the testing set.
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=1 / 3, random_state=7)

linearRegressor = MultiOutputRegressor(GradientBoostingRegressor(), n_jobs=-1)
linearRegressor.fit(xTrain, yTrain)
yPrediction = linearRegressor.predict(xTest)
# print(mean_absolute_error(yTrain, yPrediction))
print(linearRegressor.predict([[2, 87], [18, 87], [5, 90], [4, 80]]))

plot.scatter(xTrain, yTrain, color='red')
plot.plot(xTrain, linearRegressor.predict(xTrain), color='blue')
plot.title('Tracking beacons')
plot.xlabel('RSSI and Relays')
plot.ylabel('Predicted Location')
plot.show()
    print('Calculating In-Bag RMSE')
    print(MSE(label, model.predict(train[col])) ** 0.5)
    print('Calculating Out-Bag RMSE')
    print(np.mean(RMSE))
    return Final, Final_pred


## Prepare output of level 1.
## Prepare data
train_, test_ = get_additional_features(train, test, magic=True)
train_ = train_.sample(frac=1, random_state=420)
col = list(test.columns)

## Input 1: GBDT
gb1 = GradientBoostingRegressor(n_estimators=1000, max_features=0.95,
                                learning_rate=0.005, max_depth=4)
gb1_train, gb1_test = get_sklearn_stack_data(gb1, train_, col, train_['y'], test_)

## Input 2: Lasso
las1 = Lasso(alpha=5, random_state=42)
las1_train, las1_test = get_sklearn_stack_data(las1, train_, col, train_['y'], test_)

## Input 3: LGB
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    'learning_rate': 0.0045,  # small learn rate, large number of iterations
    'verbose': 0,
    'num_iterations': 500,
    'bagging_fraction': 0.95,
#********************** 2. Run the rolling-window estimation on the full sample for 3/12/24/36-month windows **********************#
path = r'..\DataBase\factor'  # directory containing the 96 factors
factorname = [x[1:-4] for x in os.listdir(path)]
riskfree, timeseries, factor, timeseries2, index = (datatransfrom(path)[0], datatransfrom(path)[1],
                                                    datatransfrom(path)[2], datatransfrom2(path)[0],
                                                    datatransfrom2(path)[1])
for i in range(4):
    i = 0  # overrides the loop variable, so only the first window setting is run
    output(window[i], LinearRegression(), 'OLS' + str(window[i]), riskfree[i], timeseries)
    FC(window[i], riskfree[i], timeseries, 96, 'FC')
    output(window[i], PLSRegression(PLS_params[i]), 'PLS' + str(window[i]), riskfree[i], timeseries)
    output(window[i], Lasso(alpha=lasso_params[i]), 'Lasso' + str(window[i]), riskfree[i], timeseries)
    output(window[i], Ridge(alpha=ridge_params[i]), 'Ridge' + str(window[i]), riskfree[i], timeseries)
    output(window[i], ElasticNet(alpha=elasticnet_params['alpha'][i],
                                 l1_ratio=elasticnet_params['l1_ratio'][i]),
           'ElasticNet' + str(window[i]), riskfree[i], timeseries)
    output(window[i], SVR(kernel=SVR_params['kernel'][i], gamma=SVR_params['gamma'][i], C=SVR_params['C'][i]),
           'SVR' + str(window[i]), riskfree[i], timeseries)
    output(window[i], GradientBoostingRegressor(n_estimators=GBDT_params['n_estimators'][i],
                                                max_depth=GBDT_params['maxdepth'][i],
                                                learning_rate=GBDT_params['learning_rate'][i]),
           'GBDT' + str(window[i]), riskfree[i], timeseries)
    output(window[i], XGBRegressor(n_estimators=GBDT_params['n_estimators'][i],
                                   max_depth=GBDT_params['maxdepth'][i],
                                   learning_rate=GBDT_params['learning_rate'][i]),
           'XGBOOST' + str(window[i]), riskfree[i], timeseries)
    output(window[i], ensemblenn(5, modeluse=MLPRegressor(solver='lbfgs', max_iter=ENANN_params['max_iter'][i]),
                                 pickpercent=ENANN_params['p'][i]),
           'ENANN' + str(window[i]), riskfree[i], timeseries)
    output(window[i], DFN.DFN(outputdim=1, neuralset=[96, 50, 25, 10, 5, 2], ctx=gpu(0), epoch=10,
                              batch_size=DFN_params['batch'][i], lr=DFN_params['learning_rate'][i]),
           'DFN' + str(window[i]), riskfree[i], timeseries)
    output2(window[i], rm.lstmmodule(96, LSTM_params['hidden_number'][i], LSTM_params['depth'][i], 100, 3571,
                                     lr=LSTM_params['learning_rate'][i]),
            'LSTM' + str(window[i]), riskfree[i], timeseries2)
    output2(window[i], rm.lstmmodule(96, RNN_params['hidden_number'][i], RNN_params['depth'][i], 100, 3571,
                                     lr=RNN_params['learning_rate'][i], ntype='RNN'),
            'RNN' + str(window[i]), riskfree[i], timeseries2)
    modellist = [DFN.DFN(outputdim=1, neuralset=[96, 50, 25, 10, 5, 2], ctx=gpu(0), epoch=10,
                         batch_size=DFN_params['batch'][i], lr=DFN_params['learning_rate'][i]),
                 ensemblenn(5, modeluse=MLPRegressor(solver='lbfgs', max_iter=ENANN_params['max_iter'][i]),
                            pickpercent=ENANN_params['p'][i]),
                 XGBRegressor(n_estimators=GBDT_params['n_estimators'][i], max_depth=GBDT_params['maxdepth'][i],
                              learning_rate=GBDT_params['learning_rate'][i]),
                 GradientBoostingRegressor(n_estimators=GBDT_params['n_estimators'][i],
                                           max_depth=GBDT_params['maxdepth'][i],
                                           learning_rate=GBDT_params['learning_rate'][i]),
                 PLSRegression(PLS_params[i]),
                 Ridge(alpha=ridge_params[i]),
                 SVR(kernel=SVR_params['kernel'][i], gamma=SVR_params['gamma'][i],
                     C=SVR_params['C'][i])]  # PLS must stay third from the end
    nmolist = [rm.lstmmodule(96, LSTM_params['hidden_number'][i], LSTM_params['depth'][i], 100, 3571,
                             lr=LSTM_params['learning_rate'][i]),
               rm.lstmmodule(96, RNN_params['hidden_number'][i], RNN_params['depth'][i], 100, 3571,
                             lr=RNN_params['learning_rate'][i], ntype='RNN')]  # recurrent network models
    modelname = ['DFN', 'En-ann', 'xgboost', 'GBDT', 'lasso', 'Elasticnet', 'pls', 'Ridge', 'svm', 'LSTM', 'RNN']
# Models

# LASSO Regression:
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))

# Elastic Net Regression
ENet = make_pipeline(
    RobustScaler(),
    ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

# Kernel Ridge Regression
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

# Gradient Boosting Regression
GBoost = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5)

# XGBoost
model_xgb = xgb.XGBRegressor(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
                         num_training_samples], weights_train[:args.num_training_samples], y_train[:args.num_training_samples]

print('Training data shape: {}\nTesting data shape:{}'.format(
    str(X_train.shape), str(X_test.shape)))

print('Training model ...')
param_grid = json.loads(args.param_config.read())
trees_grid = GridSearchCV(GradientBoostingRegressor(), param_grid=param_grid,
                          n_jobs=args.cores, scoring='neg_mean_squared_error', cv=10)\
    .fit(X_train, y_train, sample_weight=weights_train)

model = trees_grid.best_estimator_
print('Best MSE achieved: {}'.format(str(-1 * trees_grid.best_score_)))

prediction = model.predict(X_test)
score = mean_squared_error(y_test, prediction, sample_weight=weights_test)
print('Validation set MSE: {}'.format(str(score)))

print('Saving model ...')
dump(model, args.out_model)
def _select_estimator(estimator, n_jobs, n_estimators, random_state=None):
    '''Select estimator and parameters from argument name.'''
    # Regressors
    if estimator == 'RandomForestRegressor':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap']}
        estimator = RandomForestRegressor(
            n_jobs=n_jobs, n_estimators=n_estimators, random_state=random_state)
    elif estimator == 'ExtraTreesRegressor':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap']}
        estimator = ExtraTreesRegressor(
            n_jobs=n_jobs, n_estimators=n_estimators, random_state=random_state)
    elif estimator == 'GradientBoostingRegressor':
        param_dist = parameters['ensemble']
        estimator = GradientBoostingRegressor(
            n_estimators=n_estimators, random_state=random_state)
    elif estimator == 'SVR':
        param_dist = {**parameters['svm'], 'epsilon': [0.0, 0.1]}
        estimator = SVR(kernel='rbf', gamma='scale')
    elif estimator == 'LinearSVR':
        param_dist = {**parameters['svm'], 'epsilon': [0.0, 0.1]}
        estimator = SVR(kernel='linear')
    elif estimator == 'Ridge':
        param_dist = parameters['linear']
        estimator = Ridge(solver='auto', random_state=random_state)
    elif estimator == 'Lasso':
        param_dist = parameters['linear']
        estimator = Lasso(random_state=random_state)
    elif estimator == 'ElasticNet':
        param_dist = parameters['linear']
        estimator = ElasticNet(random_state=random_state)
    elif estimator == 'KNeighborsRegressor':
        param_dist = parameters['kneighbors']
        estimator = KNeighborsRegressor(algorithm='auto')

    # Classifiers
    elif estimator == 'RandomForestClassifier':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap'],
                      **parameters['criterion']}
        estimator = RandomForestClassifier(
            n_jobs=n_jobs, n_estimators=n_estimators, random_state=random_state)
    elif estimator == 'ExtraTreesClassifier':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap'],
                      **parameters['criterion']}
        estimator = ExtraTreesClassifier(
            n_jobs=n_jobs, n_estimators=n_estimators, random_state=random_state)
    elif estimator == 'GradientBoostingClassifier':
        param_dist = parameters['ensemble']
        estimator = GradientBoostingClassifier(
            n_estimators=n_estimators, random_state=random_state)
    elif estimator == 'LinearSVC':
        param_dist = parameters['linear_svm']
        estimator = LinearSVC(random_state=random_state)
    elif estimator == 'SVC':
        param_dist = parameters['svm']
        estimator = SVC(kernel='rbf', random_state=random_state, gamma='scale')
    elif estimator == 'KNeighborsClassifier':
        param_dist = parameters['kneighbors']
        estimator = KNeighborsClassifier(algorithm='auto')

    return param_dist, estimator
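# Sketch of how _select_estimator() might feed a randomized search. The
# module-level `parameters` dict it indexes is assumed to look roughly like
# the one below; the grid values and data are illustrative only.
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

parameters = {'ensemble': {'max_depth': [2, 3, 4],
                           'learning_rate': [0.01, 0.05, 0.1]}}

param_dist, est = _select_estimator('GradientBoostingRegressor',
                                    n_jobs=1, n_estimators=100, random_state=0)
rng = np.random.RandomState(0)
X, y = rng.rand(200, 5), rng.rand(200)
search = RandomizedSearchCV(est, param_dist, n_iter=5, cv=3, random_state=0)
search.fit(X, y)
print(search.best_params_)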
def gradient_booster(param_grid, n_jobs):
    estimator = GradientBoostingRegressor()
    classifier = GridSearchCV(estimator=estimator, cv=5, param_grid=param_grid, n_jobs=n_jobs)
    classifier.fit(X_train, y_train)
    print(classifier.best_estimator_)
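# Example call (hypothetical values): gradient_booster() reads X_train/y_train
# from the enclosing scope, so only a search grid and a job count are passed in.
p1 = {'n_estimators': [100, 300],
      'max_depth': [3, 5],
      'learning_rate': [0.05, 0.1]}
job1 = 4
gradient_booster(p1, job1)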
def gradient_boosting_regressor(self, train_x, train_y):
    from sklearn.ensemble import GradientBoostingRegressor

    model = GradientBoostingRegressor(n_estimators=100)
    model.fit(train_x, train_y)
    return model
    estimator = GradientBoostingRegressor()
    classifier = GridSearchCV(estimator=estimator, cv=5, param_grid=param_grid, n_jobs=n_jobs)
    classifier.fit(X_train, y_train)
    print(classifier.best_estimator_)


gradient_booster(p1, job1)

# Train GBR with optimized parameters
clf = GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                                learning_rate=0.05, loss='ls', max_depth=4,
                                max_features=1.0, max_leaf_nodes=None,
                                min_impurity_decrease=0.0, min_impurity_split=None,
                                min_samples_leaf=3, min_samples_split=2,
                                min_weight_fraction_leaf=0.0, n_estimators=100,
                                n_iter_no_change=None, presort='auto',
                                random_state=None, subsample=1.0, tol=0.0001,
                                validation_fraction=0.1, verbose=0, warm_start=False)
clf.fit(X_train, y_train)

# Predicting the results for our test data set
predicted_values = clf.predict(X_test)

print(f"Printing MAE error(avg abs residual): {metrics.mean_absolute_error(y_test, predicted_values)}")
print(f"Printing MSE error: {metrics.mean_squared_error(y_test, predicted_values)}")
print(f"Printing RMSE error: {np.sqrt(metrics.mean_squared_error(y_test, predicted_values))}")
print(f"R2 Score: {metrics.r2_score(y_test, predicted_values)}")
# RMSE
math.sqrt(mean_squared_error(target_test, predicted_tree_boost))

""" Gradient Boosting Regression ---------------------------------------------------- """

# Tune hyperparameters of the GradientBoostingRegressor
parameters = {
    'max_depth': [2, 3, 4, 5, 10],
    'learning_rate': [0.001, 0.05, 0.1, 0.3, 0.8, 1, 1.5, 2, 4, 5],
    'n_estimators': range(2, 50, 5),
    'min_samples_leaf': [1, 2, 3, 5],
    'max_leaf_nodes': [5, 7, 10, 15]
}
grid_search_gradientboost = GridSearchCV(GradientBoostingRegressor(), parameters, n_jobs=4)
grid_search_gradientboost.fit(regressors_train_pca, target_train)
print(grid_search_gradientboost.best_score_, grid_search_gradientboost.best_params_)

# Train Best Model
regr_gradientboost = GradientBoostingRegressor(n_estimators=85, max_depth=5,
                                               min_samples_split=2, max_leaf_nodes=14,
                                               min_samples_leaf=4, learning_rate=0.15,
                                               loss='ls')