def main(param=""):
    """Local (non-federated) GBDT regression baseline.

    Loads guest/host/test CSVs named in *param*, trains a sklearn
    GradientBoostingRegressor on the concatenated guest+host data and
    evaluates on the guest partition.

    :param param: path to a JobConfig YAML file, or an already-loaded mapping
    :return: ({}, metrics dict with MSE and MAE on the guest data)
    """
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    data_test = param["data_test"]
    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)
    # NOTE(review): df_test is loaded but never used below; the read is kept
    # so a missing/invalid data_test path still fails the same way.
    df_test = pd.read_csv(data_test, index_col=idx)

    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    # FIX: the original executed this drop twice in a row; the duplicate
    # statement has been removed (no behavioral change).
    X = df.drop(label_name, axis=1)
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]

    clf = GradientBoostingRegressor(n_estimators=50)
    clf.fit(X, y)
    y_predict = clf.predict(X_guest)

    result = {
        "mean_squared_error": mean_squared_error(y_guest, y_predict),
        "mean_absolute_error": mean_absolute_error(y_guest, y_predict)
    }
    print(result)
    return {}, result
def main(config="../../config.yaml", param="./gbdt_config_reg.yaml"):
    """Local GBDT regression baseline resolving data paths via a base dir.

    :param config: path to (or loaded object of) the global config providing
        ``data_base_dir``
    :param param: path to (or loaded mapping of) the job parameters
    :return: ({}, metrics dict with the in-sample MAE)
    """
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    guest_file = param["data_guest"]
    host_file = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    print('config is {}'.format(config))
    if isinstance(config, str):
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)
    else:
        data_base_dir = config.data_base_dir

    # Join the guest (labels + features) and host (features) partitions.
    df_guest = pd.read_csv(os.path.join(data_base_dir, guest_file), index_col=idx)
    df_host = pd.read_csv(os.path.join(data_base_dir, host_file), index_col=idx)
    joined = df_guest.join(df_host, rsuffix='host')

    y = joined[label_name]
    X = joined.drop(label_name, axis=1)

    regressor = GradientBoostingRegressor(random_state=0, n_estimators=50)
    regressor.fit(X, y)
    fitted = regressor.predict(X)

    result = {"mean_absolute_error": mean_absolute_error(y, fitted)}
    print(result)
    return {}, result
def GDBT_ST(trainFileName, testFilename):
    """Per-store GBDT (LAD loss) forecasting.

    Trains one GradientBoostingRegressor per store id '1'..'5' and predicts
    every test row of that store; predictions are clamped at 0 and formatted
    to 4 decimals.

    :param trainFileName: training file consumed by ld.LoadData_DATA_ST
    :param testFilename: test file consumed by ld.LoadData_DATA_ST
    :return: list of [item_id_0, item_id_1, formatted_prediction] rows
    """
    trainData = ld.LoadData_DATA_ST(trainFileName)
    testData = ld.LoadData_DATA_ST(testFilename)
    store = ['1', '2', '3', '4', '5']
    res = []
    for i in store:
        train_X = []
        train_y = []
        for array in trainData[i]:
            # First two fields are identifiers; the rest are numeric.
            array = [float(x) for x in array[2:]]
            train_X.append(array[2:-1])
            train_y.append(array[-1])
        test_X = []
        items = []
        for array in testData[i]:
            items.append((array[0], array[1]))
            array = [float(x) for x in array[2:]]
            test_X.append(array[2:])
        clf = GradientBoostingRegressor(loss='lad', n_estimators=50,
                                        learning_rate=0.1, max_depth=3).\
            fit(train_X, train_y)
        pred_y = clf.predict(test_X)
        # FIX: the original reused the outer loop variable ``i`` for this
        # index loop, shadowing the store id; renamed to ``k``.
        for k in range(len(pred_y)):
            res.append([items[k][0], items[k][1], '%.4f' % max(pred_y[k], 0)])
    return res
def test_gradient_boosting_estimator_with_smooth_quantile_loss():
    """Booster with a smoothed quantile loss must beat sklearn's quantile
    GBR on quantile loss and R^2, and its predictions should cover roughly
    the target quantile of the test labels."""
    np.random.seed(0)
    n_samples = 15000
    n_features = 10
    quantile = .8
    X = np.random.normal(size=(n_samples, n_features))
    beta = np.random.normal(size=n_features)
    mu = np.dot(X, beta)
    y = np.random.lognormal(mu)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33333333333333)

    loss_function = SmoothQuantileLossFunction(1, quantile, .0001)
    q_loss = QuantileLossFunction(1, quantile)
    model = Booster(BaggingRegressor(Earth(max_degree=2, verbose=False,
                                           use_fast=True, max_terms=10)),
                    loss_function, n_estimators=150,
                    stopper=stop_after_n_iterations_without_percent_improvement_over_threshold(3, .01),
                    verbose=True)

    # Calling predict before fit must raise NotFittedError.
    assert_raises(NotFittedError, lambda: model.predict(X_train))

    model.fit(X_train, y_train)
    prediction = model.predict(X_test)

    # Reference model: sklearn's built-in quantile GBR.
    baseline = GradientBoostingRegressor(loss='quantile', alpha=quantile)
    baseline.fit(X_train, y_train)
    baseline_prediction = baseline.predict(X_test)

    assert_less(q_loss(y_test, prediction), q_loss(y_test, baseline_prediction))
    assert_greater(r2_score(y_test, prediction), r2_score(y_test, baseline_prediction))

    coverage = np.mean(y_test <= prediction)
    assert_less(np.abs(coverage - quantile), .05)
    assert_greater(model.score_, 0.)
    assert_approx_equal(model.score(X_train, y_train), model.score_)
def main(param=""):
    """Local GBDT regression baseline over joined guest/host partitions.

    :param param: path to a JobConfig YAML file, or an already-loaded mapping
    :return: ({}, metrics dict with the in-sample MAE)
    """
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    guest_path = param["data_guest"]
    host_path = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    # Join guest (features + label) with host (extra features) on the index.
    guest_df = pd.read_csv(guest_path, index_col=idx)
    host_df = pd.read_csv(host_path, index_col=idx)
    combined = guest_df.join(host_df, rsuffix='host')

    targets = combined[label_name]
    features = combined.drop(label_name, axis=1)

    booster = GradientBoostingRegressor(random_state=0, n_estimators=50,
                                        learning_rate=0.1)
    booster.fit(features, targets)
    fitted = booster.predict(features)

    result = {
        "mean_absolute_error": mean_absolute_error(targets, fitted),
    }
    print(result)
    return {}, result
class GradientBoostingRegressorImpl():
    """Thin wrapper around sklearn's GradientBoostingRegressor (``SKLModel``).

    The constructor only records hyperparameters; a fresh underlying model is
    built on every ``fit`` call.
    """

    def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
                 subsample=1.0, criterion='friedman_mse', min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3,
                 min_impurity_decrease=0.0, min_impurity_split=None, init=None,
                 random_state=None, max_features=None, alpha=0.9, verbose=0,
                 max_leaf_nodes=None, warm_start=False, presort='auto',
                 validation_fraction=0.1, n_iter_no_change=None, tol=0.0001):
        # Capture every constructor argument for later forwarding to SKLModel.
        self._hyperparams = dict(
            loss=loss,
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            subsample=subsample,
            criterion=criterion,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_depth=max_depth,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            init=init,
            random_state=random_state,
            max_features=max_features,
            alpha=alpha,
            verbose=verbose,
            max_leaf_nodes=max_leaf_nodes,
            warm_start=warm_start,
            presort=presort,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            tol=tol)

    def fit(self, X, y=None):
        """Build a new wrapped model and fit it; returns self for chaining."""
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is None:
            self._sklearn_model.fit(X)
        else:
            self._sklearn_model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate prediction to the fitted wrapped model."""
        return self._sklearn_model.predict(X)
def GradientBoosted(X_train, X_test, y_train, y_test):
    """Fit a default GradientBoostingRegressor and score it on the test set.

    :return: (R^2 score on the test set, predicted labels for X_test)
    """
    mod = GradientBoostingRegressor()
    mod.fit(X_train, y_train)
    # FIX: converted Python-2-only ``print`` statements to call form, which is
    # valid in both Python 2 and 3 for a single argument.
    print("Done training")
    gb_labels = mod.predict(X_test)
    print("Done testing")
    gb_score = mod.score(X_test, y_test)
    return gb_score, gb_labels
def rfr_the_loss(hub, molid):
    """Fit a GBR on co-occurrence features and print the 20 most influential
    feature names (by feature_importances_), one per tab-indented line."""
    X, y = regress_the_loss_from_coocurrences_Xy(hub, molid)
    # rfr = RandomForestRegressor(n_estimators=800, n_jobs=8, oob_score=True, random_state=0)
    rfr = GradientBoostingRegressor(n_estimators=100)
    rfr.fit(X, y)
    # print rfr.oob_score_
    # print rfr.oob_improvement_
    influential = np.argsort(-rfr.feature_importances_)[:20]
    # FIX: converted the Python-2-only ``print`` statement to call form,
    # valid in both Python 2 and 3 for a single argument.
    print('\t%s' % '\n\t'.join(X.columns[influential]))
def GDBT_ALL(trainFileName, testFileName):
    """Train a LAD-loss GBDT on the whole training file and score every
    evaluation item; negative predictions are clamped to 0 and formatted to
    4 decimals.

    :return: list of [item, 'all', formatted_prediction] rows
    """
    train_X, train_y, _ = ld.LoadData_DATA_LABEL_ITEM(trainFileName)
    Eval_X, items = ld.LoadData_DATA_ITEM(testFileName)

    model = GradientBoostingRegressor(loss='lad', n_estimators=40,
                                      learning_rate=0.1, max_depth=3)
    model.fit(train_X, train_y)
    predictions = model.predict(Eval_X)

    res = []
    for pos in range(len(Eval_X)):
        res.append([items[pos], 'all', '%.4f' % max(predictions[pos], 0)])
    return res
def GDBT_ALL(trainFileName, testFileName):
    """All-items GBDT scorer (LAD loss, 40 trees).

    Predictions are floored at zero and rendered with 4 decimal places.

    :return: list of [item, 'all', formatted_prediction] rows
    """
    train_X, train_y, _ = ld.LoadData_DATA_LABEL_ITEM(trainFileName)
    Eval_X, items = ld.LoadData_DATA_ITEM(testFileName)

    booster = GradientBoostingRegressor(loss='lad',
                                        n_estimators=40,
                                        learning_rate=0.1,
                                        max_depth=3).fit(train_X, train_y)
    scores = booster.predict(Eval_X)

    rows = []
    for index in range(len(Eval_X)):
        clamped = max(scores[index], 0)
        rows.append([items[index], 'all', '%.4f' % clamped])
    return rows
def GDBT_ALL_train(trainFileName, testFileName):
    """Train/evaluate split version of the all-items GBDT scorer.

    Emits one row per test sample containing the item id, the literal 'all',
    the clamped prediction and the clamped ground truth (2 decimals each).
    """
    train_X, train_y, _ = ld.loadData_all(trainFileName)
    test_X, test_y, items = ld.loadData_all(testFileName)

    booster = GradientBoostingRegressor(loss='lad', n_estimators=40,
                                        learning_rate=0.1, max_depth=3)
    booster.fit(train_X, train_y)
    predicted = booster.predict(test_X)

    rows = []
    for k in range(len(test_X)):
        rows.append([items[k], 'all',
                     '%.2f' % max(predicted[k], 0),
                     '%.2f' % max(test_y[k], 0)])
    return rows
def model_build(train_set):
    """Fit a default GradientBoostingRegressor on feature columns 6..10
    against the 'label' column, printing the learned feature importances.

    :param train_set: training DataFrame
    :return: the fitted model
    """
    features = train_set.iloc[:, 6:11]
    target = train_set['label']
    model = GradientBoostingRegressor()
    model.fit(features, target)
    print(model.feature_importances_)
    return model
def test_argument_names():
    """Code exported by sklearn2code with named arguments must reproduce the
    original model's predictions when called with columns as keywords."""
    dataset = load_boston()
    X = DataFrame(dataset['data'], columns=dataset['feature_names'])
    y = dataset['target']
    model = GradientBoostingRegressor(verbose=True).fit(X, y)
    generated = sklearn2code(model, ['predict'], numpy_flat,
                             argument_names=X.columns)
    boston_housing_module = exec_module('boston_housing_module', generated)
    assert_array_almost_equal(model.predict(X),
                              boston_housing_module.predict(**X))
def NonlinReg(coeff, regressor='GBR', features=4, interval=0, length=1):
    '''
    NonlinReg: Non-linear Regression Model
    coeff: Input sequence disposed by WT (Wavelet Transformation Function);
           only coeff[0] is used
    regressor: Non-linear regressor, 'GBR' (default) or 'SVR'
    features: Days used to predict, 4 default
    interval: Prediction lagging, 0 default
    length: number of future steps to roll out, 1 default
    Returns the training targets extended by `length` rolled-out predictions.
    '''
    series = coeff[0]
    X, Y = [], []
    # Build sliding windows: `features` inputs predict the value
    # `interval` steps later.
    for i in range(len(series)):
        if i + features + interval < len(series):
            X.append(series[i:i + features])
            Y.append(series[i + features + interval])
    X = np.array(X)
    Y = np.array(Y)

    if regressor == 'GBR':
        model = GBR(learning_rate=0.1, n_estimators=80, max_depth=2).fit(X, Y)
    elif regressor == 'SVR':
        model = svm.SVR(kernel='rbf', C=100, gamma=3).fit(X, Y)
    else:
        # FIX: the original fell through and raised NameError on Y_; fail
        # loudly with a meaningful message instead.
        raise ValueError("regressor must be 'GBR' or 'SVR', got %r" % (regressor,))

    return _nonlin_reg_rollout(model, X, Y, features, interval, length)


def _nonlin_reg_rollout(model, X, Y, features, interval, length):
    """Iteratively append one-step-ahead predictions (shared by both
    regressor branches, which were duplicated in the original)."""
    X_ = copy.deepcopy(X)
    Y_ = copy.deepcopy(Y)
    for _ in range(length):
        next_row = np.concatenate((X_[-1][-features + 1:], Y_[[-interval - 1]]))
        X_ = np.concatenate((X_, np.array([next_row])))
        # FIX: predict expects a 2-D array; the original passed the 1-D row
        # X_[-1], which modern sklearn rejects with a ValueError.
        Y_ = np.concatenate((Y_, model.predict(X_[-1].reshape(1, -1))))
    return Y_
def __init__(self, data, label, task, model_name='lgb', eval_metric=None,
             importance_threshold=0.0):
    '''
    :param data: DataFrame
    :param label: label column name
    :param task: task type, one of [regression, classification]
    :param model_name: one of ['gbdt', 'xgb', 'lgb']
    :param eval_metric: optional explicit metric; overrides the per-task
        default chosen below
    :param importance_threshold: features with importance below this
        threshold are removed
    '''
    self.data = data
    self.label = label
    self.task = task
    self.model_name = model_name
    self._importance_threshold = importance_threshold
    self.model = None
    # Pick model + default evaluation metric from task and label cardinality.
    self.eval_metric = None
    if model_name == 'lgb':
        if self.task == 'classification':
            self.model = lgb.LGBMClassifier(**lgb_params)
            # NOTE(review): both branches assign 'logloss'; the multiclass
            # branch probably intends 'multi_logloss' (cf. the xgb branch
            # below) — preserved as-is, confirm upstream.
            if self.data[self.label].unique().shape[0] == 2:
                self.eval_metric = 'logloss'
            else:
                self.eval_metric = 'logloss'
        elif self.task == 'regression':
            self.model = lgb.LGBMRegressor(**lgb_params)
            self.eval_metric = 'l2'
        else:
            raise ValueError('Task must be either "classification" or "regression"')
    elif model_name == 'xgb':
        if self.task == 'classification':
            self.model = xgb.XGBClassifier(**xgb_params)
            if self.data[self.label].unique().shape[0] == 2:
                self.eval_metric = 'logloss'
            else:
                self.eval_metric = 'mlogloss'
        elif self.task == 'regression':
            self.model = xgb.XGBRegressor(**xgb_params)
            self.eval_metric = 'rmse'
        else:
            raise ValueError('Task must be either "classification" or "regression"')
    else:  # gbdt
        if self.task == 'classification':
            self.model = GradientBoostingClassifier(**gbdt_params)
        elif self.task == 'regression':
            self.model = GradientBoostingRegressor(**gbdt_params)
        else:
            raise ValueError('Task must be either "classification" or "regression"')
    # FIX: the original read ``if not eval_metric: self.eval_metric =
    # eval_metric``, which overwrote the task-based default with None
    # whenever no explicit metric was passed.  Only override when a metric
    # was actually supplied.
    if eval_metric:
        self.eval_metric = eval_metric
def GDBT_ALL_train(trainFileName, testFileName):
    """Train a LAD GBDT on one file and evaluate on another.

    :return: rows of [item, 'all', prediction, truth] with both numeric
        values floored at 0 and formatted to 2 decimals
    """
    train_X, train_y, _ = ld.loadData_all(trainFileName)
    test_X, test_y, items = ld.loadData_all(testFileName)

    estimator = GradientBoostingRegressor(loss='lad',
                                          n_estimators=40,
                                          learning_rate=0.1,
                                          max_depth=3).fit(train_X, train_y)
    scores = estimator.predict(test_X)

    return [
        [items[i], 'all', '%.2f' % max(scores[i], 0), '%.2f' % max(test_y[i], 0)]
        for i in range(len(test_X))
    ]
def main(train, test, filepath):
    """Walk-forward one-step-ahead GBR forecast over a time series, plotted
    against the actual values.

    train/test are sizes in days, scaled by 96 (quarter-hour samples per day
    — assumption from the literal 96; TODO confirm). filepath feeds get_data.
    """
    if not filepath:
        click.echo("need filepath")
        return
    X, Y = get_data(filepath)
    if not train or not test:
        click.echo("need train or test size")
        return
    TRAIN_SIZE = 96 * int(train)
    TEST_SIZE = 96 * int(test)
    # NOTE(review): X_train/Y_train are computed but never used below; the
    # rolling loop re-derives its own windows instead.
    X_train = X[:TRAIN_SIZE]
    Y_train = Y[:TRAIN_SIZE]
    X_test = X[TRAIN_SIZE:]
    Y_test = Y[TRAIN_SIZE:]
    # Alternative models tried previously (kept for reference):
    #clf = SVR(kernel='rbf', C=1e3, gamma=0.00001)
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=1)
    #clf = DecisionTreeRegressor(max_depth=25)
    #clf = ExtraTreesRegressor(n_estimators=2000,max_depth=14)
    #clf = xgb.XGBRegressor(n_estimators=2000,max_depth=25)
    #clf = RandomForestRegressor(n_estimators=1000,max_depth=26,n_jobs=7)
    #clf.fit(X_train,Y_train)
    #y_pred = clf.predict(X_test)
    #plt.plot(X_test, y_pred, linestyle='-', color='red')
    predict_list = []
    # Walk-forward loop: refit on a sliding window of TRAIN_SIZE indices and
    # predict the single next index.  Note this rebinds ``X`` to the window's
    # index features, shadowing the loaded data.
    for i in range(TEST_SIZE):
        X = [[x] for x in range(i, TRAIN_SIZE + i)]
        clf.fit(X, Y[i:TRAIN_SIZE + i])
        y_pred = clf.predict(np.array([TRAIN_SIZE + 1 + i]).reshape(1, -1))
        predict_list.append(y_pred)
    #print("mean_squared_error:%s"%mean_squared_error(Y_test, predict_list))
    #print("sqrt of mean_squared_error:%s"%np.sqrt(mean_squared_error(Y_test, predict_list)))
    origin_data = Y_test
    #print("origin data:%s"%origin_data)
    # Plot predictions (red) against actuals (blue) over the test horizon.
    plt.plot([x for x in range(TRAIN_SIZE + 1, TRAIN_SIZE + TEST_SIZE + 1)],
             predict_list,
             linestyle='-',
             color='red',
             label='prediction model')
    plt.plot(X_test, Y_test, linestyle='-', color='blue', label='actual model')
    plt.legend(loc=1, prop={'size': 12})
    plt.show()
def fit(self, X, y=None):
    """Construct a fresh wrapped sklearn model from the stored
    hyperparameters and fit it; returns self for chaining."""
    self._sklearn_model = SKLModel(**self._hyperparams)
    if y is None:
        self._sklearn_model.fit(X)
    else:
        self._sklearn_model.fit(X, y)
    return self
def run(self):
    """Collect GradientBoostingRegressor hyperparameters from the dialog's
    widgets.

    Returns the parameter dict plus only the values that differ from a
    default-constructed estimator (via getChangedValues).
    """
    # Map the human-readable combo-box labels to sklearn loss identifiers;
    # unknown labels pass through unchanged, as in the original if-chain.
    loss_labels = {
        'Least Squares': 'ls',
        'Least Absolute Deviation': 'lad',
        'Huber': 'huber',
        'Quantile': 'quantile',
    }
    loss = self.lossComboBox.currentText()
    loss = loss_labels.get(loss, loss)

    params = {
        'loss': loss,
        'learning_rate': self.learningDoubleSpinBox.value(),
        'n_estimators': self.numEstSpinBox.value(),
        'subsample': self.subsampleDoubleSpinBox.value(),
        'criterion': 'friedman_mse',
        'min_samples_split': self.min_n_splitSpinBox.value(),
        'min_samples_leaf': self.min_n_leafSpinBox.value(),
        'min_weight_fraction_leaf': self.min_fractionDoubleSpinBox.value(),
        'max_depth': self.max_depthSpinBox.value(),
        'min_impurity_decrease': self.min_imp_decDoubleSpinBox.value(),
        'random_state': 1,
        'alpha': self.alphaDoubleSpinBox.value()
    }
    return params, self.getChangedValues(params, GradientBoostingRegressor())
def train_model(data):
    """Fit a linear-kernel SVR mapping (c, cb) -> delta over all rows of
    *data*.

    :param data: iterable of [c, cb, delta] triples
    :return: the fitted svm.SVR(kernel='linear') model
    """
    # NOTE(review): the original sliced ``data[:half_len]`` with
    # half_len = len(data) — i.e. the *entire* data set despite the name;
    # behavior preserved, misleading variable removed.
    X = []
    y = []
    for c, cb, delta in data:
        X.append([c, cb])
        y.append(delta)
    # FIX: removed nine unused candidate models (rbf/poly SVRs, BayesianRidge,
    # LinearRegression, ElasticNet, GBR, ...) that were instantiated and
    # discarded, plus a duplicated ``clf = svr_linear_general`` assignment.
    clf = svm.SVR(kernel='linear')
    clf.fit(X, y)
    return clf
def tune_gbr(self):
    """Grid-search GradientBoostingRegressor hyperparameters on the training
    data and print the best parameters and CV score.

    FIX: the original grid held SVR parameters ({'kernel', 'C', 'gamma'}),
    which GradientBoostingRegressor does not accept — GridSearchCV.fit raised
    ValueError('Invalid parameter kernel ...') before any search ran.  The
    grid below uses genuine GBR parameters; adjust the candidate values to
    taste (original intent unknown — TODO confirm with the author).
    """
    parameters = {'n_estimators': [50, 100, 200],
                  'learning_rate': [0.05, 0.1, 0.2],
                  'max_depth': [2, 3, 4]}
    clf = GridSearchCV(GradientBoostingRegressor(), parameters, verbose=2)
    clf.fit(self.X_train, self.y_train)
    print(clf.best_params_)
    print(clf.best_score_)
def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False): from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor as GBR # Special fix for gradient boosting! if isinstance(X, np.ndarray): X = np.ascontiguousarray(X, dtype=X.dtype) if refit: self.estimator = None if self.estimator is None: self.learning_rate = float(self.learning_rate) self.n_estimators = int(self.n_estimators) self.subsample = float(self.subsample) self.min_samples_split = int(self.min_samples_split) self.min_samples_leaf = int(self.min_samples_leaf) self.min_weight_fraction_leaf = float( self.min_weight_fraction_leaf) if check_none(self.max_depth): self.max_depth = None else: self.max_depth = int(self.max_depth) self.max_features = float(self.max_features) if check_none(self.max_leaf_nodes): self.max_leaf_nodes = None else: self.max_leaf_nodes = int(self.max_leaf_nodes) self.min_impurity_decrease = float(self.min_impurity_decrease) self.verbose = int(self.verbose) self.estimator = GBR( loss=self.loss, learning_rate=self.learning_rate, n_estimators=n_iter, subsample=self.subsample, min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, min_weight_fraction_leaf=self.min_weight_fraction_leaf, max_depth=self.max_depth, criterion=self.criterion, max_features=self.max_features, max_leaf_nodes=self.max_leaf_nodes, random_state=self.random_state, verbose=self.verbose, warm_start=True, ) else: self.estimator.n_estimators += n_iter self.estimator.n_estimators = min(self.estimator.n_estimators, self.n_estimators) self.estimator.fit(X, y, sample_weight=sample_weight) # Apparently this if is necessary if self.estimator.n_estimators >= self.n_estimators: self.fully_fit_ = True return self
def model_build(train_set, weight=None):
    """
    Build a GBDT regression model from the training set.

    :param train_set: training DataFrame; feature columns 6..10, target in
        'label'
    :param weight: optional per-sample label weights for the training set
    :return: the fitted model
    """
    X = train_set.iloc[:, 6:11]
    Y = train_set['label']
    model = GradientBoostingRegressor()
    # FIX: the original only called fit() when ``weight`` was falsy, so
    # passing a weight list returned an *unfitted* model.  Always fit, and
    # forward the weights (presumably meant as sample_weight — TODO confirm)
    # when supplied.
    if weight:
        model.fit(X, Y, sample_weight=weight)
    else:
        model.fit(X, Y)
    print(model.feature_importances_)
    return model
def getModels():
    """Return the candidate regressors keyed by short name."""
    return {
        'dt': DecisionTreeRegressor(max_depth=50),
        'rf1': RandomForestRegressor(),
        'rf2': RandomForestRegressor(n_estimators=128, max_depth=15),
        'gbr': GradientBoostingRegressor(n_estimators=128, max_depth=5,
                                         learning_rate=1.0),
        # 'abr': AdaBoostRegressor(n_estimators=128),
    }
def model_build(train_set, weight=None):
    """
    Build a GBDT regression model from the training set.

    :param train_set: training DataFrame; every column after the first is a
        feature, the target lives in 'label'
    :param weight: accepted for interface compatibility; not used here
    :return: the fitted model
    """
    features = train_set.iloc[:, 1:]
    print(len(features))
    target = train_set['label']
    print(len(target))
    model = GradientBoostingRegressor()
    model.fit(features, target)
    print(model.feature_importances_)
    return model
def create_models():
    """Build the name -> unfitted estimator mapping used for comparison."""
    gbr = GradientBoostingRegressor(n_estimators=300, max_depth=3)
    models = {
        'BayesianRidge': BayesianRidge(),
        # 'LinearRegression': LinearRegression(),
        'ElasticNet': ElasticNet(),
        'SVR(rbf)': SVR(kernel='rbf'),
        'SVR(linear)': SVR(kernel='linear'),
        'Lasso': Lasso(),
        'GBR': gbr,
    }
    return models
def train_model():
    """Cross-validate three candidate regressors on the module-level training
    arrays and print each mean CV score (GBR, then MLP, then random forest)."""
    global train_x, train_y, test_x
    candidates = (GradientBoostingRegressor(), MLPRegressor(),
                  RandomForestRegressor())
    for estimator in candidates:
        mean_score = cross_val_score(estimator, train_x, train_y).mean()
        print(mean_score)
def train_model(self):
    """Cross-validate GBR over candidate max_leaf_nodes values and print the
    per-candidate negative-MSE scores.

    (Tried other models such as an MLP neural network regressor and random
    forest trees, but GBR performed best.)
    """
    global train_x, train_y, test_x
    cvscore = []
    # FIX: the original bound this candidate list to the name ``range``,
    # shadowing the builtin — renamed.
    leaf_node_candidates = [4, 5, 6, 7, 8]
    for i in leaf_node_candidates:
        print(i)
        gbr = GradientBoostingRegressor(max_leaf_nodes=i)
        cv_score = cross_val_score(
            gbr, train_x, train_y, scoring='neg_mean_squared_error').mean()
        cvscore.append(cv_score)
    print(cvscore)
def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
             subsample=1.0, criterion='friedman_mse', min_samples_split=2,
             min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3,
             min_impurity_decrease=0.0, min_impurity_split=None, init=None,
             random_state=None, max_features=None, alpha=0.9, verbose=0,
             max_leaf_nodes=None, warm_start=False, presort='auto',
             validation_fraction=0.1, n_iter_no_change=None, tol=0.0001):
    """Record every constructor argument as a hyperparameter and eagerly
    build the wrapped sklearn estimator from them."""
    self._hyperparams = dict(
        loss=loss,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        criterion=criterion,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_depth=max_depth,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        init=init,
        random_state=random_state,
        max_features=max_features,
        alpha=alpha,
        verbose=verbose,
        max_leaf_nodes=max_leaf_nodes,
        warm_start=warm_start,
        presort=presort,
        validation_fraction=validation_fraction,
        n_iter_no_change=n_iter_no_change,
        tol=tol)
    self._wrapped_model = SKLModel(**self._hyperparams)
def multi_output_regression(train, test, grid, outputs):
    """Train and evaluate three multi-output regressors (MLP, GBR, RF),
    printing sample counts and R^2/MSE/RMSE for each.

    NOTE(review): ``pd`` here is the project module providing
    ``training_testing_data`` (it shadows the usual pandas alias) — confirm.

    :return: (actual targets from the last split, GBR predictions,
        MLP predictions, RF predictions)
    """
    # The original repeated the fetch/fit/report sequence three times; it is
    # factored into _fit_and_report_multi_output with identical behavior,
    # including re-fetching the split before each model.
    _, prediction_mlp = _fit_and_report_multi_output(
        MLPRegressor(solver='adam', learning_rate='adaptive', max_iter=500,
                     early_stopping=True),
        'Multi-Layer Perceptron', train, test, grid, outputs)
    _, prediction_gbr = _fit_and_report_multi_output(
        GradientBoostingRegressor(loss='huber'),
        'Gradient Boosting Regressor', train, test, grid, outputs)
    actual, prediction_rfr = _fit_and_report_multi_output(
        RandomForestRegressor(),
        'Random Forest Regressor', train, test, grid, outputs)
    return actual, prediction_gbr, prediction_mlp, prediction_rfr


def _fit_and_report_multi_output(base_estimator, title, train, test, grid, outputs):
    """Fetch a train/test split, fit a MultiOutputRegressor wrapping
    *base_estimator*, and print sample counts plus R^2/MSE/RMSE under *title*.

    :return: (actual targets, predictions) for this split
    """
    input_train, input_test, output_train, actual = pd.training_testing_data(
        train, test, grid, outputs)
    print('You are training on %d samples' % (len(input_train)))
    print('You are testing on %d samples' % (len(input_test)))
    model = MultiOutputRegressor(base_estimator)
    model.fit(input_train, output_train)
    prediction = model.predict(input_test)
    print(title)
    print(r'$R^{2}$: %.5f' % (r2_score(actual, prediction)))
    print('MSE: %.5f' % (mean_squared_error(actual, prediction)))
    print('RMSE: %.5f' % (np.sqrt(mean_squared_error(actual, prediction))))
    return actual, prediction
def predict_using_local_model(self):
    """Fit a GBR on this instance's training data, report training accuracy
    and prediction latency, and return uint8 predictions for the test set."""
    gbr = GradientBoostingRegressor()
    gbr.fit(self.train_x, self.train_y)
    # FIX: the original scored against module globals ``train_x``/``train_y``
    # rather than the instance attributes it just trained on — a NameError
    # unless matching globals happened to exist.
    print('Accuracy of gbr, on the training set: ' +
          str(gbr.score(self.train_x, self.train_y)))
    start_time = time.time()
    predictions = gbr.predict(self.test_x)
    predict_time = time.time() - start_time
    print('Prediction time for gbr is ' + str(predict_time) + '\n')
    predictions = predictions.astype('uint8')
    return predictions
def parameter_choose(train_set):
    """
    Choose the best model parameters for the given training set via 5-fold
    grid search over n_estimators, printing the full CV results, the best
    parameters and the best score.

    :param train_set: training DataFrame; feature columns 6..10, target in
        'label'
    """
    features = train_set.iloc[:, 6:11]
    target = train_set['label']
    candidate_grid = {'n_estimators': range(10, 81, 10)}
    searcher = GridSearchCV(
        estimator=GradientBoostingRegressor(learning_rate=1),
        param_grid=candidate_grid,
        iid=True,
        cv=5)
    searcher.fit(features, target)
    print(searcher.cv_results_)
    print(searcher.best_params_, searcher.best_score_)
def prediction():
    """Fit a GBR on the global training arrays, report its training accuracy
    and prediction latency, then return (and print) uint8 predictions."""
    global train_x, train_y, test_x
    model = GradientBoostingRegressor()
    model.fit(train_x, train_y)
    print('Accuracy of gbr, on the training set: ' +
          str(model.score(train_x, train_y)))
    started = time.time()
    raw_predictions = model.predict(test_x)
    elapsed = time.time() - started
    print('Prediction time for gbr is ' + str(elapsed) + '\n')
    labels = raw_predictions.astype('uint8')
    print(labels)
    return labels
def compare_algorithms(datasetName, data, target):
    """Grid-search both sklearn's GBR and the local MyGradientBoostingRegressor
    on the same split and print best params, mean fit times and test-set
    R^2/MSE/MAE for each."""
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=1)
    params = {
        'n_estimators': [10, 20, 30, 40],
        'loss': ['ls', 'huber'],
        'min_samples_leaf': [6],
        'max_depth': [3, 4, 5, 6]
    }
    print("\n\nTraining GBRT on %s..." % datasetName)

    # Reference implementation.
    search_original = GridSearchCV(GradientBoostingRegressor(), params,
                                   cv=5, n_jobs=-1)
    search_original.fit(X_train, y_train)
    print("Best params original: %s" % search_original.best_params_)
    print("Avg train time original: %s seconds" %
          search_original.cv_results_["mean_fit_time"][search_original.best_index_])
    best_original = search_original.best_estimator_

    # Local implementation under test.
    search_mine = GridSearchCV(MyGradientBoostingRegressor(), params,
                               cv=5, n_jobs=-1)
    search_mine.fit(X_train, y_train)
    print("Best params mine: %s" % search_mine.best_params_)
    print("Avg train time mine: %s seconds" %
          search_mine.cv_results_["mean_fit_time"][search_mine.best_index_])
    best_mine = search_mine.best_estimator_

    original_predictions = best_original.predict(X_test)
    my_predictions = best_mine.predict(X_test)
    print("The dataset: %s with %s train instances" % (datasetName, data.shape[0]))
    print("Original GradientBoostingRegressor R2: %s\tMSE: %s\tMAE: %s" %
          (r2_score(y_test, original_predictions),
           mean_squared_error(y_test, original_predictions),
           mean_absolute_error(y_test, original_predictions)))
    print("My GradientBoostingRegressor R2: %s\tMSE: %s\tMAE: %s" %
          (r2_score(y_test, my_predictions),
           mean_squared_error(y_test, my_predictions),
           mean_absolute_error(y_test, my_predictions)))
def trainmodels():
    """Cross-validate five regressors on the module-level X/y, storing the
    fold scores and the in-sample predictions in module-level lists."""
    global n_folds, model_br, model_dic, model_etc, model_gbr, model_lr, \
        model_names, model_svr, cv_score_list, pre_y_list
    n_folds = 6  # number of cross-validation folds
    model_br = BayesianRidge()               # Bayesian ridge regression
    model_lr = LinearRegression()            # ordinary linear regression
    model_etc = ElasticNet()                 # elastic-net regression
    model_svr = SVR()                        # support vector regression
    model_gbr = GradientBoostingRegressor()  # gradient boosting regression
    model_names = [
        'BayesianRidge', 'LinearRegression', 'ElasticNet', 'SVR', 'GBR'
    ]
    model_dic = [model_br, model_lr, model_etc, model_svr, model_gbr]
    cv_score_list = []  # per-model cross-validation fold scores
    pre_y_list = []     # per-model in-sample predictions
    for candidate in model_dic:
        # K-fold CV first, then refit on all data for in-sample predictions.
        fold_scores = cross_val_score(candidate, X, y, cv=n_folds)
        cv_score_list.append(fold_scores)
        pre_y_list.append(candidate.fit(X, y).predict(X))
# NOTE(review): this chunk starts mid-class — the line below is the tail of a
# predict-style method whose signature (and enclosing class) lies outside
# this view; it sums the per-tree predictions of the boosted ensemble.
return sum((tree.predict(X) for tree in self.trees))

def fit(self, X, y):
    """Boosting loop: each step fits a new tree to the current residuals of
    the ensemble's prediction and appends it to self.trees."""
    for m in range(self.n_boosting_steps):
        residuals = y - self.predict(X)
        new_tree = Node(X, residuals)
        new_tree.fit(max_tree_size=self.max_tree_size)
        self.trees.append(new_tree)

if __name__ == '__main__':
    # Python 2 smoke test: compare this MART implementation against sklearn's
    # GBRT on the Boston housing data (uses long-removed sklearn module paths
    # — sklearn.cross_validation etc. — so this only runs on old versions).
    from sklearn.cross_validation import train_test_split
    from sklearn.metrics.metrics import mean_squared_error
    from sklearn.datasets import load_boston
    boston = load_boston()
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        test_size=0.33)
    from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor
    sk_gbrt = GradientBoostingRegressor(n_estimators=20)
    sk_gbrt.fit(X_train, y_train)
    print "sklearn test MSE", mean_squared_error(y_test, sk_gbrt.predict(X_test))
    mart = MART(10, 15)
    mart.fit(X_train, y_train)
    print "mart test MSE", mean_squared_error(y_test, mart.predict(X_test))
# Earlier per-store data-prep experiment, kept for reference:
# train_X = [];train_y = []
# context = trainData[i]
# for array in context:
#     array = [float(x) for x in array[2:] ]
#     train_X.append((array[2:-1]))
#     train_y.append(array[-1])
# test_X = [];test_y = [];items = []
# context = testData[i]
# for array in context:
#     items.append((array[0],array[1]))
#     array = [float(x) for x in array[2:] ]
#     test_X.append((array[2:-1]))
#     test_y.append(array[-1])

# FIX: renamed the misspelled local ``n_etemators`` -> ``n_estimators``
# (also in the commented-out variants below).
n_estimators = 1000
# NOTE(review): train_X/train_y/test_X/test_y are defined earlier in the
# script, outside this chunk — confirm they are in scope here.
clf1 = GradientBoostingRegressor(loss='lad', n_estimators=n_estimators,
                                 learning_rate=0.01, max_depth=3, verbose=0).\
    fit(train_X, train_y)
test_score1 = np.zeros((n_estimators,), dtype=np.float64)
# Record the LAD loss of every staged prediction to inspect convergence.
for i, pred_y in enumerate(clf1.staged_predict(test_X)):
    print(i, clf1.feature_importances_)
    test_score1[i] = clf1.loss_(test_y, pred_y)

# Other configurations tried:
# clf2 = GradientBoostingRegressor(loss='lad', n_estimators=n_estimators, learning_rate=0.1, max_depth=2,verbose=0).\
#     fit(train_X, train_y)
# test_score2 = np.zeros((n_estimators,), dtype=np.float64)
# for i, pred_y in enumerate(clf2.staged_predict(test_X)):
#     test_score2[i] = clf2.loss_(test_y, pred_y)
#
# clf3 = GradientBoostingRegressor(loss='lad', n_estimators=n_estimators, learning_rate=0.1, max_depth=2,verbose=0,subsample=0.5).\
#     fit(train_X, train_y)
# test_score3 = np.zeros((n_estimators,), dtype=np.float64)
# for i, pred_y in enumerate(clf3.staged_predict(test_X)):
# Leave-one-year-out evaluation: each listed year is held out as the test set.
for year in [2007, 2009, 2011, 2013]:
    X_train, X_test, y_train, y_test, y_train_numMosquitos, y_test_numMosquitos = year_train_test_split(
        train_for_loo, 'WnvPresent', year)
    # Persist each per-year split for inspection / reproducibility.
    X_train.to_csv("data_per_year/" + str(year) + "X_train.csv", index=False)
    X_test.to_csv("data_per_year/" + str(year) + "X_test.csv", index=False)
    y_train.to_csv("data_per_year/" + str(year) + "y_train.csv", index=False)
    y_test.to_csv("data_per_year/" + str(year) + "y_test.csv", index=False)
    # Optional two-stage model: first regress NumMosquitos on the test year,
    # then feed the predicted counts to the classifier below.
    # NOTE(review): ``predict_num_mosquitos`` and ``clf`` come from outside
    # this chunk — presumably script-level globals; confirm.
    if predict_num_mosquitos:
        reg = GradientBoostingRegressor(n_estimators=40)
        reg.fit(X_train.drop(['NumMosquitos'], axis=1),
                y_train_numMosquitos.astype(float))
        predicted_mosquitos = reg.predict(X_test)
        X_test['NumMosquitos'] = predicted_mosquitos
        print("Accuracy is",
              metrics.r2_score(y_test_numMosquitos, predicted_mosquitos))
    # NOTE(review): indentation reconstructed — clf.fit is placed at loop
    # level (outside the if), matching the "classifier always runs" reading;
    # confirm against the original file.
    clf.fit(X_train.drop(['NumMosquitos'], axis=1), y_train)
    y_pred = clf.predict_proba(X_test)[:, 1]
    # print(y_pred)
    # y_pred = clf.predict_proba(X_test)
    # For xgbwrapper best score: 57.2
    # y_pred = clf.predict_proba(X_test)
    # y_pred = clf.predict(X_test)
from sklearn.ensemble import GradientBoostingClassifier from BinReader import BinReader import numpy as np from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor (data,label,items) = BinReader.readData(ur'F:\AliRecommendHomeworkData\1212新版\train1217.expand.norm.bin') X_train = np.array(data) label = [item[0] for item in label] y_train = np.array(label) est = GradientBoostingRegressor(n_estimators=150, learning_rate=0.1,max_depth=3, random_state=0, loss='ls',verbose=1).fit(X_train, y_train) print 'testing...' reader = BinReader(ur'F:\AliRecommendHomeworkData\1212新版\test18.expand.norm.bin') reader.open() result = [0] * reader.LineCount for i in xrange(reader.LineCount): (x,userid,itemid,label) = reader.readline() x[0] = 1 y = est.predict([x])[0] result[i] = (userid,itemid,y) if i % 10000 == 0: print '%d/%d' % (i,reader.LineCount) result.sort(key=lambda x:x[2],reverse=True) result = result[:7000] print ur'正在输出...' with open('result.csv','w') as f: for item in result: