def main(param=""):
    """Fit a local GBDT regressor on guest+host rows and score it on guest rows.

    Args:
        param: Either a path to a job-config file (loaded through
            ``JobConfig.load_from_file``) or an already loaded mapping. The
            keys read are ``data_guest``, ``data_host``, ``idx`` and
            ``label_name``.

    Returns:
        A ``({}, result)`` tuple; ``result`` maps ``"mean_squared_error"``
        and ``"mean_absolute_error"`` to the metrics of the predictions made
        for the guest rows.
    """
    # obtain config
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest = param["data_guest"]
    data_host = param["data_host"]
    idx = param["idx"]
    label_name = param["label_name"]

    # prepare data
    # NOTE: the test CSV used to be loaded here but was never used, so the
    # dead read (and the ``data_test`` key lookup) has been dropped.
    df_guest = pd.read_csv(data_guest, index_col=idx)
    df_host = pd.read_csv(data_host, index_col=idx)

    # Rows of both parties are stacked (axis=0) to form the training set.
    df = pd.concat([df_guest, df_host], axis=0)
    y = df[label_name]
    X = df.drop(label_name, axis=1)

    # Evaluation is done on the guest rows only.
    X_guest = df_guest.drop(label_name, axis=1)
    y_guest = df_guest[label_name]

    clf = GradientBoostingRegressor(n_estimators=50)
    clf.fit(X, y)
    y_predict = clf.predict(X_guest)

    result = {
        "mean_squared_error": mean_squared_error(y_guest, y_predict),
        "mean_absolute_error": mean_absolute_error(y_guest, y_predict),
    }
    print(result)
    return {}, result
def test_gradient_boosting_estimator_with_smooth_quantile_loss():
    """Compare ``Booster`` with a smoothed quantile loss against sklearn.

    Synthetic log-normal data is generated with a fixed seed. The boosted
    model must raise ``NotFittedError`` before fitting, reach a lower
    quantile loss and a higher R^2 than sklearn's quantile GBR on the
    held-out split, have an empirical coverage within 0.05 of the target
    quantile, and report a positive ``score_`` that matches ``score`` on the
    training data.
    """
    np.random.seed(0)
    n_samples, n_features, quantile = 15000, 10, .8

    # The order of the random draws is kept so the generated data is the same.
    X = np.random.normal(size=(n_samples, n_features))
    beta = np.random.normal(size=n_features)
    y = np.random.lognormal(np.dot(X, beta))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.33333333333333)

    smooth_loss = SmoothQuantileLossFunction(1, quantile, .0001)
    exact_loss = QuantileLossFunction(1, quantile)
    base_learner = BaggingRegressor(
        Earth(max_degree=2, verbose=False, use_fast=True, max_terms=10))
    early_stop = stop_after_n_iterations_without_percent_improvement_over_threshold(3, .01)
    booster = Booster(base_learner, smooth_loss, n_estimators=150,
                      stopper=early_stop, verbose=True)

    # Predicting before fitting must fail.
    assert_raises(NotFittedError, booster.predict, X_train)

    booster.fit(X_train, y_train)
    booster_pred = booster.predict(X_test)

    reference = GradientBoostingRegressor(loss='quantile', alpha=quantile)
    reference.fit(X_train, y_train)
    reference_pred = reference.predict(X_test)

    assert_less(exact_loss(y_test, booster_pred),
                exact_loss(y_test, reference_pred))
    assert_greater(r2_score(y_test, booster_pred),
                   r2_score(y_test, reference_pred))

    # Fraction of test targets at or below the predicted quantile.
    coverage = np.mean(y_test <= booster_pred)
    assert_less(np.abs(coverage - quantile), .05)

    assert_greater(booster.score_, 0.)
    assert_approx_equal(booster.score(X_train, y_train), booster.score_)
def main(param=""):
    """Fit a seeded GBDT regressor on the joined guest/host table.

    Args:
        param: Path to a job-config file (loaded through
            ``JobConfig.load_from_file``) or an already loaded mapping with
            the keys ``data_guest``, ``data_host``, ``idx`` and
            ``label_name``.

    Returns:
        ``({}, result)`` where ``result`` holds the in-sample
        ``"mean_absolute_error"``.
    """
    # Resolve the configuration.
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest, data_host, idx, label_name = (
        param[key] for key in ("data_guest", "data_host", "idx", "label_name"))

    # Load both tables and join them on the index; clashing host columns
    # receive the 'host' suffix.
    guest_frame = pd.read_csv(data_guest, index_col=idx)
    host_frame = pd.read_csv(data_host, index_col=idx)
    joined = guest_frame.join(host_frame, rsuffix='host')

    target = joined[label_name]
    features = joined.drop(label_name, axis=1)

    model = GradientBoostingRegressor(
        random_state=0, n_estimators=50, learning_rate=0.1)
    model.fit(features, target)
    fitted = model.predict(features)

    result = {
        "mean_absolute_error": mean_absolute_error(target, fitted),
    }
    print(result)
    return {}, result
def main(config="../../config.yaml", param="./gbdt_config_reg.yaml"):
    """Fit a seeded GBDT regressor on guest/host data found under a base dir.

    Args:
        config: Path to a config file (loaded through
            ``JobConfig.load_from_file`` and indexed with
            ``"data_base_dir"``) or an object exposing a ``data_base_dir``
            attribute.
        param: Path to a job-parameter file or an already loaded mapping with
            the keys ``data_guest``, ``data_host``, ``idx`` and
            ``label_name``.

    Returns:
        ``({}, result)`` where ``result`` holds the in-sample
        ``"mean_absolute_error"``.
    """
    # Resolve the job parameters.
    if isinstance(param, str):
        param = JobConfig.load_from_file(param)
    data_guest, data_host, idx, label_name = (
        param[key] for key in ("data_guest", "data_host", "idx", "label_name"))

    # Resolve the directory the data files are relative to.
    print('config is {}'.format(config))
    if not isinstance(config, str):
        data_base_dir = config.data_base_dir
    else:
        config = JobConfig.load_from_file(config)
        data_base_dir = config["data_base_dir"]
        print('data base dir is', data_base_dir)

    # Load both tables and join them on the index; clashing host columns
    # receive the 'host' suffix.
    guest_frame = pd.read_csv(
        os.path.join(data_base_dir, data_guest), index_col=idx)
    host_frame = pd.read_csv(
        os.path.join(data_base_dir, data_host), index_col=idx)
    joined = guest_frame.join(host_frame, rsuffix='host')

    target = joined[label_name]
    features = joined.drop(label_name, axis=1)

    model = GradientBoostingRegressor(random_state=0, n_estimators=50)
    model.fit(features, target)
    fitted = model.predict(features)

    result = {"mean_absolute_error": mean_absolute_error(target, fitted)}
    print(result)
    return {}, result
def GDBT_ST(trainFileName, testFilename):
    """Train one GBDT regressor per store key and predict the test rows.

    Both files are loaded with ``ld.LoadData_DATA_ST`` and indexed by the
    keys '1'..'5'. For every row the first two fields are kept as
    identifiers; the remaining fields are converted to float and the first
    two of those are dropped. In training rows the last value is the target.

    Returns:
        A list of ``[row[0], row[1], prediction]`` entries, where the
        prediction is clipped at 0 and formatted with four decimals.
    """
    train_by_key = ld.LoadData_DATA_ST(trainFileName)
    test_by_key = ld.LoadData_DATA_ST(testFilename)

    res = []
    for store_key in ('1', '2', '3', '4', '5'):
        train_X, train_y = [], []
        for row in train_by_key[store_key]:
            values = [float(x) for x in row[2:]]
            train_X.append(values[2:-1])
            train_y.append(values[-1])

        test_X, items = [], []
        for row in test_by_key[store_key]:
            items.append((row[0], row[1]))
            values = [float(x) for x in row[2:]]
            test_X.append(values[2:])

        model = GradientBoostingRegressor(
            loss='lad', n_estimators=50, learning_rate=0.1, max_depth=3)
        model.fit(train_X, train_y)
        pred_y = model.predict(test_X)

        # One prediction per collected identifier pair.
        for (first_id, second_id), pred in zip(items, pred_y):
            res.append([first_id, second_id, '%.4f' % max(pred, 0)])
    return res
class GradientBoostingRegressorImpl():
    """Adapter that stores hyperparameters and builds ``SKLModel`` on ``fit``."""

    def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
                 subsample=1.0, criterion='friedman_mse', min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3,
                 min_impurity_decrease=0.0, min_impurity_split=None, init=None,
                 random_state=None, max_features=None, alpha=0.9, verbose=0,
                 max_leaf_nodes=None, warm_start=False, presort='auto',
                 validation_fraction=0.1, n_iter_no_change=None, tol=0.0001):
        """Record every hyperparameter; no model is created yet."""
        self._hyperparams = dict(
            loss=loss,
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            subsample=subsample,
            criterion=criterion,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_depth=max_depth,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            init=init,
            random_state=random_state,
            max_features=max_features,
            alpha=alpha,
            verbose=verbose,
            max_leaf_nodes=max_leaf_nodes,
            warm_start=warm_start,
            presort=presort,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            tol=tol,
        )

    def fit(self, X, y=None):
        """Build a fresh ``SKLModel`` from the stored settings and fit it.

        ``y`` is forwarded only when it is not ``None``. Returns ``self``.
        """
        model = SKLModel(**self._hyperparams)
        self._sklearn_model = model
        fit_args = (X,) if y is None else (X, y)
        model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the fitted model's ``predict``."""
        return self._sklearn_model.predict(X)
def GradientBoosted(X_train, X_test, y_train, y_test):
    """Fit a default gradient boosting regressor and evaluate it.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets, used only for scoring.

    Returns:
        ``(gb_score, gb_labels)``: the value of the model's ``score`` on the
        test split and the predictions for ``X_test``.
    """
    mod = GradientBoostingRegressor()
    mod.fit(X_train, y_train)
    # Parenthesised single-argument print behaves the same on Python 2 and 3;
    # the former print statement was a syntax error on Python 3.
    print("Done training")
    gb_labels = mod.predict(X_test)
    print("Done testing")
    gb_score = mod.score(X_test, y_test)
    return gb_score, gb_labels
def GDBT_ALL_train(trainFileName, testFileName):
    """Train a LAD gradient boosting model and compare predictions to labels.

    Both files are read with ``ld.loadData_all``; the third value returned
    for the training file is ignored.

    Returns:
        One ``[item, 'all', prediction, label]`` entry per test row, with the
        prediction and the label clipped at 0 and formatted with two
        decimals.
    """
    train_X, train_y, _ = ld.loadData_all(trainFileName)
    test_X, test_y, items = ld.loadData_all(testFileName)

    model = GradientBoostingRegressor(
        loss='lad', n_estimators=40, learning_rate=0.1, max_depth=3)
    model.fit(train_X, train_y)
    pred_y = model.predict(test_X)

    return [
        [items[k], 'all',
         '%.2f' % max(pred_y[k], 0),
         '%.2f' % max(test_y[k], 0)]
        for k in range(len(test_X))
    ]
def GDBT_ALL(trainFileName, testFileName):
    """Train a LAD gradient boosting model and predict the evaluation rows.

    Training data comes from ``ld.LoadData_DATA_LABEL_ITEM`` (its third
    return value is ignored); evaluation features and item identifiers come
    from ``ld.LoadData_DATA_ITEM``.

    Returns:
        One ``[item, 'all', prediction]`` entry per evaluation row, with the
        prediction clipped at 0 and formatted with four decimals.
    """
    train_X, train_y, _ = ld.LoadData_DATA_LABEL_ITEM(trainFileName)
    Eval_X, items = ld.LoadData_DATA_ITEM(testFileName)

    model = GradientBoostingRegressor(
        loss='lad', n_estimators=40, learning_rate=0.1, max_depth=3)
    model.fit(train_X, train_y)
    pred_y = model.predict(Eval_X)

    return [
        [items[k], 'all', '%.4f' % max(pred_y[k], 0)]
        for k in range(len(Eval_X))
    ]
def gradient_boosting(train, test, label):
    """Fit a Huber-loss gradient boosting regressor and predict ``test``.

    Args:
        train: Training features.
        test: Features to predict.
        label: pandas object holding the training target; its values are
            flattened to one dimension before fitting.

    Returns:
        ``np.exp`` of the model's predictions for ``test``.
        NOTE(review): the exponentiation presumably undoes a log transform
        applied to ``label`` by the caller - confirm.
    """
    gb = GradientBoostingRegressor(n_estimators=300, learning_rate=0.05,
                                   max_depth=3, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber')
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer
    # provides; ``.values`` is available on old and new versions alike.
    gb.fit(train, label.values.ravel())

    # prediction on training data
    train_prediction = gb.predict(train)
    print("Gradient Boosting score on training set: ",
          rmse(label, train_prediction))

    y_prediction = gb.predict(test)
    return np.exp(y_prediction)
def GDBT_ALL(trainFileName, testFileName):
    """Fit a LAD gradient boosting regressor and score the evaluation set.

    ``ld.LoadData_DATA_LABEL_ITEM`` supplies the training features and
    targets (a third return value is discarded) and
    ``ld.LoadData_DATA_ITEM`` supplies the evaluation features together with
    their item identifiers.

    Returns:
        A list with one ``[item, 'all', prediction]`` row per evaluation
        sample; predictions are floored at 0 and rendered with four decimals.
    """
    train_X, train_y, _ = ld.LoadData_DATA_LABEL_ITEM(trainFileName)
    Eval_X, items = ld.LoadData_DATA_ITEM(testFileName)

    regressor = GradientBoostingRegressor(
        loss='lad', n_estimators=40, learning_rate=0.1, max_depth=3)
    regressor.fit(train_X, train_y)
    pred_y = regressor.predict(Eval_X)

    res = []
    for row in range(len(Eval_X)):
        floored = max(pred_y[row], 0)
        res.append([items[row], 'all', '%.4f' % floored])
    return res
def test_argument_names():
    """Generated ``predict`` code must accept the DataFrame columns as kwargs.

    A gradient boosting model is fitted on the Boston data held in a
    DataFrame, ``sklearn2code`` generates a ``predict`` function whose
    argument names are the column names, and the generated module's output
    (called with ``**X``) is compared with the model's own predictions.
    """
    dataset = load_boston()
    features = DataFrame(dataset['data'], columns=dataset['feature_names'])
    target = dataset['target']

    model = GradientBoostingRegressor(verbose=True)
    model.fit(features, target)

    code = sklearn2code(model, ['predict'], numpy_flat,
                        argument_names=features.columns)
    generated = exec_module('boston_housing_module', code)

    expected = model.predict(features)
    # Unpacking the frame passes every column as a keyword argument.
    actual = generated.predict(**features)
    assert_array_almost_equal(expected, actual)
def predict_using_local_model(self):
    """Fit a default gradient boosting regressor and predict ``self.test_x``.

    Reads ``self.train_x``, ``self.train_y`` and ``self.test_x``. Prints the
    value of the model's ``score`` on the training data and the wall-clock
    time spent in ``predict``.

    Returns:
        The predictions cast to ``uint8``.
        NOTE(review): the cast truncates fractions and wraps values outside
        0..255 - confirm the targets fit that range.
    """
    gbr = GradientBoostingRegressor()
    gbr.fit(self.train_x, self.train_y)
    # Score on the instance's own training data; the bare ``train_x`` /
    # ``train_y`` names used before are not defined in this method.
    train_score = gbr.score(self.train_x, self.train_y)
    print('Accuracy of gbr, on the training set: ' + str(train_score))

    start_time = time.time()
    predictions = gbr.predict(self.test_x)
    predict_time = time.time() - start_time
    print('Prediction time for gbr is ' + str(predict_time) + '\n')

    predictions = predictions.astype('uint8')
    return predictions
def prediction():
    """Fit a default gradient boosting regressor on the module-level data.

    Uses the globals ``train_x``, ``train_y`` and ``test_x``. Prints the
    training-set score, the time spent predicting and the predictions.

    Returns:
        The predictions for ``test_x`` cast to ``uint8``.
    """
    global train_x, train_y, test_x

    model = GradientBoostingRegressor()
    model.fit(train_x, train_y)
    train_score = model.score(train_x, train_y)
    print('Accuracy of gbr, on the training set: ' + str(train_score))

    # Time only the predict call.
    started = time.time()
    raw_predictions = model.predict(test_x)
    elapsed = time.time() - started
    print('Prediction time for gbr is ' + str(elapsed) + '\n')

    predictions = raw_predictions.astype('uint8')
    print(predictions)
    return predictions
def NonlinReg(coeff, regressor='GBR', features=4, interval=0, length=1):
    '''
    NonlinReg: Non-linear Regression Model

    coeff: Input sequence disposed by WT (Wavelet Transformation Function);
        only ``coeff[0]`` is read
    regressor: Non-linear regressor, 'GBR' (default) or 'SVR'
    features: Days used to predict, 4 default
    interval: Prediction lagging, 0 default
    length: Number of values forecast recursively after the fit, 1 default

    Returns the training targets followed by ``length`` forecast values.

    Raises ValueError when ``regressor`` is not 'GBR' or 'SVR', or when
    ``coeff[0]`` is too short to build a single training window.
    '''
    # Fail early: an unknown name used to surface as an unbound ``Y_``.
    if regressor not in ('GBR', 'SVR'):
        raise ValueError(
            "regressor must be 'GBR' or 'SVR', got %r" % (regressor,))

    series = coeff[0]
    n = len(series)
    # Window i uses series[i:i + features] to predict the value that lies
    # ``interval`` steps after the end of the window.
    starts = [i for i in range(n) if i + features + interval < n]
    if not starts:
        raise ValueError(
            "coeff[0] has %d values; more than features + interval = %d are "
            "required" % (n, features + interval))

    X = np.array([series[i:i + features] for i in starts])
    Y = np.array([series[i + features + interval] for i in starts])

    # Both regressors share the same recursive forecasting loop below.
    if regressor == 'GBR':
        model = GBR(learning_rate=0.1, n_estimators=80, max_depth=2)
    else:
        model = svm.SVR(kernel='rbf', C=100, gamma=3)
    model.fit(X, Y)

    X_ = X.copy()
    Y_ = Y.copy()
    for _ in range(length):
        # Slide the last window by one: drop its oldest value and append the
        # target at lag ``interval``. ``[1:]`` (instead of
        # ``[-features + 1:]``) also yields the right width for features == 1.
        next_row = np.concatenate((X_[-1][1:], Y_[[-interval - 1]]))
        X_ = np.concatenate((X_, next_row[np.newaxis, :]))
        # predict expects a 2-D array, so pass the new row as a single sample.
        Y_ = np.concatenate((Y_, model.predict(X_[-1].reshape(1, -1))))
    return Y_
def GDBT_ALL_train(trainFileName, testFileName):
    """Fit a LAD gradient boosting regressor and list predictions with labels.

    Training and test data are both loaded through ``ld.loadData_all``; the
    item identifiers of the training file are discarded.

    Returns:
        A list with one ``[item, 'all', prediction, label]`` row per test
        sample; prediction and label are floored at 0 and rendered with two
        decimals.
    """
    train_X, train_y, _ = ld.loadData_all(trainFileName)
    test_X, test_y, items = ld.loadData_all(testFileName)

    regressor = GradientBoostingRegressor(
        loss='lad', n_estimators=40, learning_rate=0.1, max_depth=3)
    regressor.fit(train_X, train_y)
    pred_y = regressor.predict(test_X)

    res = []
    for row in range(len(test_X)):
        predicted = '%.2f' % max(pred_y[row], 0)
        actual = '%.2f' % max(test_y[row], 0)
        res.append([items[row], 'all', predicted, actual])
    return res
def main(train, test, filepath):
    """Walk-forward forecast with a gradient boosting stump model, then plot.

    Args:
        train: Training size in units of 96 samples (converted with ``int``).
        test: Number of forecasts in units of 96 samples.
        filepath: Passed to ``get_data`` to obtain ``X`` and ``Y``.

    For every forecast step the model is refitted on a window of
    ``96 * train`` consecutive targets, using the integer position as the
    only feature, and then queried once. Predictions are drawn in red and
    the data after the first training window in blue. Nothing is returned.
    """
    if not filepath:
        click.echo("need filepath")
        return
    X, Y = get_data(filepath)
    if not (train and test):
        click.echo("need train or test size")
        return

    train_size = 96 * int(train)
    test_size = 96 * int(test)
    X_test = X[train_size:]
    Y_test = Y[train_size:]

    clf = GradientBoostingRegressor(n_estimators=100, max_depth=1)

    predict_list = []
    for offset in range(test_size):
        window_end = train_size + offset
        positions = [[t] for t in range(offset, window_end)]
        clf.fit(positions, Y[offset:window_end])
        # NOTE(review): the query position is window_end + 1, which skips the
        # position directly after the training window - confirm intent.
        query = np.array([window_end + 1]).reshape(1, -1)
        predict_list.append(clf.predict(query))

    forecast_axis = list(range(train_size + 1, train_size + test_size + 1))
    plt.plot(forecast_axis, predict_list,
             linestyle='-', color='red', label='prediction model')
    plt.plot(X_test, Y_test,
             linestyle='-', color='blue', label='actual model')
    plt.legend(loc=1, prop={'size': 12})
    plt.show()
# Python 2 script fragment (print statements, ur'' literals, xrange).
# ``X_train`` and the initial ``label`` come from code that is not visible
# here.

# Keep only the first field of every label record as the regression target.
label = [item[0] for item in label]
y_train = np.array(label)
est = GradientBoostingRegressor(n_estimators=150,
                                learning_rate=0.1,
                                max_depth=3,
                                random_state=0,
                                loss='ls',
                                verbose=1).fit(X_train, y_train)
print 'testing...'
reader = BinReader(
    ur'F:\AliRecommendHomeworkData\1212新版\test18.expand.norm.bin')
reader.open()
# Pre-size the result list; one entry per record reported by the reader.
result = [0] * reader.LineCount
for i in xrange(reader.LineCount):
    # NOTE: this rebinds ``label``, which held the training labels above.
    (x, userid, itemid, label) = reader.readline()
    # The first feature is overwritten with the constant 1 before predicting.
    # NOTE(review): reason not visible here (bias term?) - confirm.
    x[0] = 1
    y = est.predict([x])[0]
    result[i] = (userid, itemid, y)
    if i % 10000 == 0:
        # Progress report every 10000 records.
        print '%d/%d' % (i, reader.LineCount)
# Rank by predicted value, highest first, and keep the top 7000 pairs.
result.sort(key=lambda x: x[2], reverse=True)
result = result[:7000]
# The message below reads "writing output...".
print ur'正在输出...'
with open('result.csv', 'w') as f:
    for item in result:
        f.write('%d,%d\n' % (item[0], item[1]))
# "threshold": the lowest predicted value that made the cut.
print ur'阈值:', result[-1][2]
# "total number of samples".
print ur'样本总数:', reader.LineCount
X_train,X_test, y_train, y_test, y_train_numMosquitos, y_test_numMosquitos = year_train_test_split( train_for_loo, 'WnvPresent', year) X_train.to_csv("data_per_year/" + str(year) + "X_train.csv", index=False) X_test.to_csv("data_per_year/" + str(year) + "X_test.csv", index=False) y_train.to_csv("data_per_year/" + str(year) + "y_train.csv", index=False) y_test.to_csv("data_per_year/" + str(year) + "y_test.csv", index=False) if predict_num_mosquitos: reg = GradientBoostingRegressor(n_estimators=40) reg.fit(X_train.drop(['NumMosquitos'], axis=1), y_train_numMosquitos.astype(float)) predicted_mosquitos = reg.predict(X_test) X_test['NumMosquitos'] = predicted_mosquitos print("Accuracy is", metrics.r2_score(y_test_numMosquitos, predicted_mosquitos)) clf.fit(X_train.drop(['NumMosquitos'], axis=1), y_train) y_pred = clf.predict_proba(X_test)[:, 1] # print(y_pred) # y_pred = clf.predict_proba(X_test) # For xgbwrapper best score: 57.2 # y_pred = clf.predict_proba(X_test) # y_pred = clf.predict(X_test) non_carriers_mask = (X_test.Species == species_encoder.transform('CULEX SALINARIUS')) |\
'n_estimators': [10, 50, 100, 200, 300], 'max_depth': [2, 3, 4, 5], 'max_features': ['auto', 'sqrt', 'log2'] } gbr_cv = GridSearchCV(gbr, parameters) gbr_cv.fit(svd_train_df.iloc[:, :-1], svd_train_df.iloc[:, -1]) #best parameters gbr_cv.best_params_ gbr_opt = GradientBoostingRegressor(max_depth=2, max_features='log2', n_estimators=10) gbr_opt.fit(svd_train_df.iloc[:, :-1], svd_train_df.iloc[:, -1]) gbr_opt_tfidf_2gram_pred = gbr_opt.predict(svd_test_df.iloc[:, :-1]) gbr_opt_tfidf_2gram_mse = mean_squared_error(svd_test_df.iloc[:, -1], gbr_opt_tfidf_2gram_pred) gbr_opt_tfidf_2gram_mae = mean_absolute_error(svd_test_df.iloc[:, -1], gbr_opt_tfidf_2gram_pred) print('mean squared error is {}'.format(gbr_opt_tfidf_2gram_mse)) print('mean absolute error is {}'.format(gbr_opt_tfidf_2gram_mae)) ### Look at R^2: inference_data = pd.read_csv('bio_txt.csv') true_df = inference_data[[ 'index', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
# Python 2 script (print statements, ur'' literals, xrange): trains a gradient
# boosting regressor on one binary file, scores every record of a second file
# and writes the 7000 highest-scoring (userid, itemid) pairs to result.csv.
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor

(data,label,items) = BinReader.readData(ur'F:\AliRecommendHomeworkData\1212新版\train1217.expand.norm.bin')
X_train = np.array(data)
# Keep only the first field of every label record as the regression target.
label = [item[0] for item in label]
y_train = np.array(label)
est = GradientBoostingRegressor(n_estimators=150, learning_rate=0.1,max_depth=3, random_state=0,
                                loss='ls',verbose=1).fit(X_train, y_train)
print 'testing...'
reader = BinReader(ur'F:\AliRecommendHomeworkData\1212新版\test18.expand.norm.bin')
reader.open()
# Pre-size the result list; one entry per record reported by the reader.
result = [0] * reader.LineCount
for i in xrange(reader.LineCount):
    # NOTE: this rebinds ``label``, which held the training labels above.
    (x,userid,itemid,label) = reader.readline()
    # The first feature is overwritten with the constant 1 before predicting.
    # NOTE(review): reason not visible here (bias term?) - confirm.
    x[0] = 1
    y = est.predict([x])[0]
    result[i] = (userid,itemid,y)
    if i % 10000 == 0:
        # Progress report every 10000 records.
        print '%d/%d' % (i,reader.LineCount)
# Rank by predicted value, highest first, and keep the top 7000 pairs.
result.sort(key=lambda x:x[2],reverse=True)
result = result[:7000]
# The message below reads "writing output...".
print ur'正在输出...'
with open('result.csv','w') as f:
    for item in result:
        f.write('%d,%d\n' % (item[0],item[1]))
# "threshold": the lowest predicted value that made the cut.
print ur'阈值:',result[-1][2]
# "total number of samples".
print ur'样本总数:',reader.LineCount
# Script fragment: ``year``, the train/test splits, ``predict_num_mosquitos``
# and ``clf`` are defined by code that is not visible here.
# NOTE(review): the original indentation was lost; the extent of the ``if``
# body below is reconstructed from where ``predicted_mosquitos`` is used.

# Dump the splits for this year so they can be inspected offline.
X_train.to_csv("data_per_year/" + str(year) + "X_train.csv", index=False)
X_test.to_csv("data_per_year/" + str(year) + "X_test.csv", index=False)
y_train.to_csv("data_per_year/" + str(year) + "y_train.csv", index=False)
y_test.to_csv("data_per_year/" + str(year) + "y_test.csv", index=False)
print(X_test.columns)
if predict_num_mosquitos:
    # Regress the mosquito count from the remaining features and use the
    # prediction as the 'NumMosquitos' column of the test split.
    reg = GradientBoostingRegressor(n_estimators=40)
    reg.fit(X_train.drop(['NumMosquitos'], axis=1),
            y_train_numMosquitos.astype(float))
    # NOTE(review): the regressor is fitted without 'NumMosquitos'; this only
    # works if X_test does not contain that column yet - confirm.
    predicted_mosquitos = reg.predict(X_test)
    X_test['NumMosquitos'] = predicted_mosquitos
    # The printed value is the R^2 score, despite the "Accuracy" label.
    print("Accuracy is",
          metrics.r2_score(y_test_numMosquitos, predicted_mosquitos))
print(len(X_train))
print(len(y_train))
clf.fit(X_train, y_train)
# Probability of the second class (column index 1).
y_pred = clf.predict_proba(X_test)[:, 1]
# print(y_pred)
# y_pred = clf.predict_proba(X_test)
# For xgbwrapper best score: 57.2
# y_pred = clf.predict_proba(X_test)
# y_pred = clf.predict(X_test)
# preprocess x_pred X_pred_ordinal_transf = preprocess_X(X_pred_ordinal) X_pred = np.concatenate((X_pred_numeric, X_pred_ordinal_transf.T), axis=1) return [X_data, y_data, X_pred, test_Ids] X_data, y_data, X_pred, test_Ids = get_data() # Model parameters lr = 0.15 # learning rate n_est = 200 # number of boosting stages # log scaling the target y_data = np.log(y_data) # training a Gradient boosting regressor model = GBR(learning_rate=lr, n_estimators=n_est, random_state=0) model.fit(X_data, y_data) # evaluate model scored = cross_val_score(model, X_data, y=y_data, cv=5, scoring='neg_mean_squared_error', n_jobs=2) prices = np.round(np.exp(model.predict(X_pred)), 2) Gen_output_file(test_Ids, prices) print('scored {0}'.format(np.mean(scored)))
# Script fragment: the variables used below (training/test data, the list of
# predictions, model names, ``model_gbr``) come from code not visible here.
# NOTE(review): the original indentation was lost, so the nesting of these
# statements is unknown.

# Legend for the metric abbreviations.
print('mae \t mean_absolute_error')
print('mse \t mean_squared_error')
print('r2 \t coefficient of determination')
print(70 * '-')
# NOTE(review): if this call sits at the same level as the code below, the
# rest of the fragment never runs - confirm.
exit()
# Visualise the model results
plt.figure()
x = np.arange(training_data_input.shape[0])
plt.plot(x, training_data_output, color='r', label='origin y')
color_list = ['k.', 'b.', 'go', 'yv', 'c*', 'm^']  # format strings, one per model
for i, pre_y in enumerate(pre_y_list):  # iterate over each model's predictions with their index
    plt.plot(x, pre_y_list[i], color_list[i],
             label=model_names[i])  # draw the predictions of model i
plt.title('regression result comparison')  # title
plt.legend(loc='upper right')
plt.xlabel('test data number')
plt.ylabel('real and predicted values')
# plt.savefig("regression compare.jpg", dpi=500)
plt.show()
# Apply the model
print('regression prediction:')
print('predict data \t real data')
new_pre_y = model_gbr.predict(test_data_input)  # predict with the GBR model
model_gbr_score = model_gbr.score(test_data_input, test_data_output)
print("The score of model_gbr is : %f" % model_gbr_score)
for i in range(len(test_data_input)):
    print(' %.2f \t %0.2f' %
          (new_pre_y[i], test_data_output[i]))  # print prediction and real value for each data point
# if __name__ == "__main__":
#     svm_baseline()
class GradientBoostingRegressor:
    """Wrapper that builds and fits scikit-learn's gradient boosting regressor.

    Hyperparameters are stored as given and cast to their numeric types in
    :meth:`fit`; ``max_depth`` and ``max_leaf_nodes`` are mapped to ``None``
    when ``check_none`` says so (``get_cs`` supplies the string ``"None"``
    for ``max_leaf_nodes``).
    """

    def __init__(self, loss, learning_rate, n_estimators, subsample,
                 min_samples_split, min_samples_leaf,
                 min_weight_fraction_leaf, max_depth, criterion, max_features,
                 max_leaf_nodes, min_impurity_decrease, random_state=None,
                 verbose=0, **kwargs):
        """Store the hyperparameters; extra keyword arguments are ignored."""
        self.loss = loss
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.subsample = subsample
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_depth = max_depth
        self.criterion = criterion
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.random_state = random_state
        self.verbose = verbose
        # Set by fit(); predict() refuses to run while this is None.
        self.estimator = None
        self.fully_fit_ = False
        self.time_limit = None

    def fit(self, X, y, sample_weight=None):
        """Cast the stored hyperparameters, build the estimator and fit it.

        Args:
            X: Training features; ndarrays are made C-contiguous first.
            y: Training targets.
            sample_weight: Optional per-sample weights forwarded to sklearn.

        Returns:
            ``self``.
        """
        # The public ``sklearn.ensemble`` path is used because the private
        # ``sklearn.ensemble.gradient_boosting`` module no longer exists in
        # newer scikit-learn releases; the public path works on old ones too.
        from sklearn.ensemble import GradientBoostingRegressor as GBR

        # Special fix for gradient boosting!
        if isinstance(X, np.ndarray):
            X = np.ascontiguousarray(X, dtype=X.dtype)

        self.learning_rate = float(self.learning_rate)
        self.n_estimators = int(self.n_estimators)
        self.subsample = float(self.subsample)
        self.min_samples_split = int(self.min_samples_split)
        self.min_samples_leaf = int(self.min_samples_leaf)
        self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)
        if check_none(self.max_depth):
            self.max_depth = None
        else:
            self.max_depth = int(self.max_depth)
        self.max_features = float(self.max_features)
        if check_none(self.max_leaf_nodes):
            self.max_leaf_nodes = None
        else:
            self.max_leaf_nodes = int(self.max_leaf_nodes)
        self.min_impurity_decrease = float(self.min_impurity_decrease)
        self.verbose = int(self.verbose)

        self.estimator = GBR(
            loss=self.loss,
            learning_rate=self.learning_rate,
            n_estimators=self.n_estimators,
            subsample=self.subsample,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_depth=self.max_depth,
            criterion=self.criterion,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            # Previously cast above but never forwarded, so the setting was
            # silently ignored.
            min_impurity_decrease=self.min_impurity_decrease,
            random_state=self.random_state,
            verbose=self.verbose,
            warm_start=True,
        )
        self.estimator.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        """Return the fitted estimator's predictions for ``X``.

        Raises:
            NotImplementedError: if :meth:`fit` has not been called.
        """
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    @staticmethod
    def get_cs():
        """Build the hyperparameter configuration space for this model.

        NOTE(review): the 'ls'/'lad' losses and 'mse'/'mae' criteria are
        names used by older scikit-learn releases - confirm they match the
        installed version.
        """
        cs = ConfigurationSpace()
        loss = CategoricalHyperparameter("loss", ['ls', 'lad'],
                                         default_value='ls')
        learning_rate = UniformFloatHyperparameter(name="learning_rate",
                                                   lower=0.01,
                                                   upper=1,
                                                   default_value=0.1,
                                                   log=True)
        n_estimators = UniformIntegerHyperparameter("n_estimators",
                                                    50,
                                                    500,
                                                    default_value=200)
        max_depth = UniformIntegerHyperparameter(name="max_depth",
                                                 lower=1,
                                                 upper=10,
                                                 default_value=3)
        criterion = CategoricalHyperparameter(
            'criterion', ['friedman_mse', 'mse', 'mae'],
            default_value='friedman_mse')
        min_samples_split = UniformIntegerHyperparameter(
            name="min_samples_split", lower=2, upper=20, default_value=2)
        min_samples_leaf = UniformIntegerHyperparameter(
            name="min_samples_leaf", lower=1, upper=20, default_value=1)
        min_weight_fraction_leaf = UnParametrizedHyperparameter(
            "min_weight_fraction_leaf", 0.)
        subsample = UniformFloatHyperparameter(name="subsample",
                                               lower=0.1,
                                               upper=1.0,
                                               default_value=1.0)
        max_features = UniformFloatHyperparameter("max_features",
                                                  0.1,
                                                  1.0,
                                                  default_value=1)
        # The string "None" is turned into a real None in fit().
        max_leaf_nodes = UnParametrizedHyperparameter(name="max_leaf_nodes",
                                                      value="None")
        min_impurity_decrease = UnParametrizedHyperparameter(
            name='min_impurity_decrease', value=0.0)
        cs.add_hyperparameters([
            loss, learning_rate, n_estimators, max_depth, criterion,
            min_samples_split, min_samples_leaf, min_weight_fraction_leaf,
            subsample, max_features, max_leaf_nodes, min_impurity_decrease
        ])
        return cs
# Script fragment: ``svrr``, the TF-IDF 2-gram matrices and the targets are
# defined by code that is not visible here.

# Baseline: fit ``svrr`` and report MSE / MAE on the test split.
svrr.fit(tf_train_2gram, Y_train)
svrr_base_tfidf_2gram_pred = svrr.predict(tf_test_2gram)
svrr_tfidf_2gram_base_mse = mean_squared_error(Y_test,
                                               svrr_base_tfidf_2gram_pred)
svrr_tfidf_2gram_base_mae = mean_absolute_error(Y_test,
                                                svrr_base_tfidf_2gram_pred)
print('svrr mean squared error is {}'.format(svrr_tfidf_2gram_base_mse))
print('svrr mean absolute error is {}'.format(svrr_tfidf_2gram_base_mae))

# Gradient boosting accepts a sparse matrix in "fit", but (according to the
# original author) not in "predict", hence the .todense() call below.
# Gradient boosting regressor:
gbr = GradientBoostingRegressor()
gbr.fit(tf_train_2gram, Y_train)
gbr_base_tfidf_2gram_pred = gbr.predict(tf_test_2gram.todense())
gbr_tfidf_2gram_base_mse = mean_squared_error(Y_test,
                                              gbr_base_tfidf_2gram_pred)
gbr_tfidf_2gram_base_mae = mean_absolute_error(Y_test,
                                               gbr_base_tfidf_2gram_pred)
print('gbr mean squared error is {}'.format(gbr_tfidf_2gram_base_mse))
print('gbr mean absolute error is {}'.format(gbr_tfidf_2gram_base_mae))
# The string statement below is the author's conclusion, kept verbatim.
'''from the experiment, we see that gradient boosting tree has the best performance, however, overall, the text feature doesn't predicte sentencing length very well according to mean absolute error and mean squared error.'''
#plt.scatter(Y_test,gbr_base_tfidf_2gram_pred)
#plt.show()
#savefig('GBR_predicted_VS_true.png')
def gbr(x_train, x_test, y_train):
    """Fit a seeded gradient boosting regressor and plot its test predictions.

    The model is trained on ``x_train`` / ``y_train`` with ``random_state=1``
    and its predictions for ``x_test`` are handed to ``plt.plot``. The value
    returned by ``plt.plot`` is printed; nothing is returned.
    """
    # Train the regressor.
    model = GradientBoostingRegressor(random_state=1)
    model.fit(x_train, y_train)

    # Predict and plot.
    predictions = model.predict(x_test)
    plotted = plt.plot(predictions)
    print(plotted)
line_train_y.append(inv_y[i + windows_size - 1]) line_train_x = numpy.array(line_train_x).reshape( line_test_size - windows_size + 1, 2 * windows_size) # line_train_x = scaler.fit_transform(line_train_x) model.fit(line_train_x, line_train_y) line_test_x = [] for i in range(arima.test_size - line_test_size - windows_size + 1): for j in range(windows_size): line_test_x.append(arima.predictions[line_test_size + i + j][0]) for j in range(windows_size): line_test_x.append(inv_yhat[line_test_size + i + j]) line_test_x = numpy.array(line_test_x).reshape( arima.test_size - line_test_size - windows_size + 1, 2 * windows_size) # line_test_x = scaler.fit_transform(line_test_x) dyn_combined = model.predict(line_test_x) mse = mean_squared_error(inv_y[test_start:], dyn_combined) rmse = math.sqrt(mean_squared_error(inv_y[test_start:], dyn_combined)) mae = mean_absolute_error(inv_y[test_start:], dyn_combined) mape = arima.mean_a_p_e(inv_y[test_start:], dyn_combined) print('dyn combined Test MAE:%.3f MSE: %.3f RMSE:%.3f MAPE:%.3f' % (mae, mse, rmse, mape)) plt.figure() plt.plot(inv_y[test_start:], '-', label="real flow") # plt.plot(arima.predictions, 'x--', color='y', label="ARIMA") # plt.plot(inv_yhat, 'x--', color='red', label="LSTM") plt.plot(dyn_combined, '--', color='red', label="combined") plt.legend(loc='upper right') plt.xlabel("period(15-minute intervals)")
# gbr from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor from sklearn.model_selection import train_test_split import matplotlib.pyplot as plt model = GradientBoostingRegressor(n_estimators=10000) X_train, X_test, y_train, y_test = train_test_split(state_data,action_data,test_size=0.33) print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) # GBR只能预测一个target,所以设置target的index target_index = 0 model.fit(X_train,y_train[:,target_index]) test_pre = model.predict(X_test) plt.plot(y_test[:,target_index],test_pre,"ro") plt.show() # 打印出各个feature的importance print(model.feature_importances_) ''' importance分析结果: 对于油门和刹车这个target,importance为 速度标量 0.45935925 速度x 0.01006275 速度y 0.00771682 加速度标量 0.3719029 加速度x 0.05408772 加速度y 0.00396671 车道线c0 0.01144348 车道线c1 0.00659045
6.58 ], [ 0.7842, 0., 8.14, 0., 0.538, 5.99, 81.7, 4.2579, 4., 307., 21., 386.75, 14.67 ], [ 0.80271, 0., 8.14, 0., 0.538, 5.456, 36.6, 3.7965, 4., 307., 21., 288.99, 11.69 ], [ 0.7258, 0., 8.14, 0., 0.538, 5.727, 69.5, 3.7965, 4., 307., 21., 390.95, 11.28 ]] # 要预测的新数据集 for i, new_point in enumerate(new_point_set): # 循环读出每个要预测的数据点 new_pre_y = model_gbr.predict(new_point) # 使用GBR进行预测 print('predict for new point %d is: %.2f' % (i + 1, new_pre_y) ) # 打印输出每个数据点的预测信息 #################################################################### # 4.3 分类分析 # 导入库 import numpy as np # 导入numpy库 from sklearn.model_selection import train_test_split # 数据分区库 from sklearn import tree # 导入决策树库 from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, recall_score, \ roc_curve # 导入指标库 import prettytable # 导入表格库 import pydotplus # 导入dot插件库 import matplotlib.pyplot as plt # 导入图形展示库
return sum((tree.predict(X) for tree in self.trees)) def fit(self, X, y): for m in range(self.n_boosting_steps): residuals = y - self.predict(X) new_tree = Node(X, residuals) new_tree.fit(max_tree_size=self.max_tree_size) self.trees.append(new_tree) if __name__ == '__main__': from sklearn.cross_validation import train_test_split from sklearn.metrics.metrics import mean_squared_error from sklearn.datasets import load_boston boston = load_boston() X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size=0.33) from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor sk_gbrt = GradientBoostingRegressor(n_estimators=20) sk_gbrt.fit(X_train, y_train) print "sklearn test MSE", mean_squared_error(y_test, sk_gbrt.predict(X_test)) mart = MART(10, 15) mart.fit(X_train, y_train) print "mart test MSE", mean_squared_error(y_test, mart.predict(X_test))
class GradientBoostingRegressor(IterativeComponentWithSampleWeight, BaseRegressionModel):
    """Iteratively trainable wrapper around scikit-learn's gradient boosting regressor.

    The hyperparameters are stored as given by the optimizer and coerced to
    their numeric types the first time a model is built in
    :meth:`iterative_fit`.
    """

    def __init__(self, loss, learning_rate, n_estimators, subsample,
                 min_samples_split, min_samples_leaf,
                 min_weight_fraction_leaf, max_depth, criterion, max_features,
                 max_leaf_nodes, min_impurity_decrease, random_state=None,
                 verbose=0):
        """Store the hyperparameters; no model is built until fitting."""
        self.loss = loss
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.subsample = subsample
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_depth = max_depth
        self.criterion = criterion
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.random_state = random_state
        self.verbose = verbose
        self.estimator = None
        self.fully_fit_ = False
        self.start_time = time.time()
        self.time_limit = None

    def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
        """Fit the model, or extend an existing one by ``n_iter`` stages.

        The first call (or any call with ``refit=True``) builds a new
        warm-start estimator with ``n_iter`` boosting stages.  Later calls add
        ``n_iter`` stages, capped at the configured ``n_estimators``.

        Args:
            X: Training features.
            y: Training targets.
            sample_weight: Optional per-sample weights forwarded to ``fit``.
            n_iter: Number of boosting stages to train in this call.
            refit: If true, discard the current model and start over.

        Returns:
            ``self``, to allow call chaining.
        """
        # Public import path; the private ``sklearn.ensemble.gradient_boosting``
        # module is deprecated and absent from newer scikit-learn releases.
        from sklearn.ensemble import GradientBoostingRegressor as GBR

        # Special fix for gradient boosting!
        if isinstance(X, np.ndarray):
            X = np.ascontiguousarray(X, dtype=X.dtype)
        if refit:
            self.estimator = None
            # The discarded model no longer counts as fully fitted.
            self.fully_fit_ = False

        if self.estimator is None:
            self.learning_rate = float(self.learning_rate)
            self.n_estimators = int(self.n_estimators)
            self.subsample = float(self.subsample)
            self.min_samples_split = int(self.min_samples_split)
            self.min_samples_leaf = int(self.min_samples_leaf)
            self.min_weight_fraction_leaf = float(
                self.min_weight_fraction_leaf)
            # NOTE(review): check_none presumably also accepts the string
            # "None" used by the search space below - confirm in its definition.
            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)
            self.max_features = float(self.max_features)
            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)
            self.min_impurity_decrease = float(self.min_impurity_decrease)
            self.verbose = int(self.verbose)

            self.estimator = GBR(
                loss=self.loss,
                learning_rate=self.learning_rate,
                n_estimators=n_iter,
                subsample=self.subsample,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                max_depth=self.max_depth,
                criterion=self.criterion,
                max_features=self.max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                # Forward the configured value so it is not silently ignored.
                min_impurity_decrease=self.min_impurity_decrease,
                random_state=self.random_state,
                verbose=self.verbose,
                warm_start=True,
            )
        else:
            # With warm_start=True, raising n_estimators makes the next fit()
            # train only the additional stages.
            self.estimator.n_estimators = min(
                self.estimator.n_estimators + n_iter, self.n_estimators)

        self.estimator.fit(X, y, sample_weight=sample_weight)

        # Apparently this if is necessary
        if self.estimator.n_estimators >= self.n_estimators:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        """Return True once the model holds at least ``n_estimators`` stages."""
        if self.estimator is None:
            return False
        return not len(self.estimator.estimators_) < self.n_estimators

    def predict(self, X):
        """Predict targets for ``X``.

        Raises:
            NotImplementedError: If no model has been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static capability description of this component."""
        return {
            'shortname': 'GB',
            'name': 'Gradient Boosting Regressor',
            'handles_regression': True,
            'handles_classification': False,
            'handles_multiclass': False,
            'handles_multilabel': False,
            'is_deterministic': True,
            'input': (DENSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None, optimizer='smac'):
        """Build the hyperparameter search space for the given optimizer.

        Args:
            dataset_properties: Unused here.
            optimizer: ``'smac'`` returns a ``ConfigurationSpace``; ``'tpe'``
                returns a dict of hyperopt expressions.

        Returns:
            The search space, or ``None`` for any other optimizer name.
        """
        if optimizer == 'smac':
            cs = ConfigurationSpace()
            loss = CategoricalHyperparameter("loss", ['ls', 'lad'], default_value='ls')
            learning_rate = UniformFloatHyperparameter(
                name="learning_rate", lower=0.01, upper=1, default_value=0.1, log=True)
            n_estimators = UniformIntegerHyperparameter(
                "n_estimators", 50, 500, default_value=200)
            max_depth = UniformIntegerHyperparameter(
                name="max_depth", lower=1, upper=10, default_value=3)
            criterion = CategoricalHyperparameter(
                'criterion', ['friedman_mse', 'mse', 'mae'],
                default_value='friedman_mse')
            min_samples_split = UniformIntegerHyperparameter(
                name="min_samples_split", lower=2, upper=20, default_value=2)
            min_samples_leaf = UniformIntegerHyperparameter(
                name="min_samples_leaf", lower=1, upper=20, default_value=1)
            min_weight_fraction_leaf = UnParametrizedHyperparameter(
                "min_weight_fraction_leaf", 0.)
            subsample = UniformFloatHyperparameter(
                name="subsample", lower=0.1, upper=1.0, default_value=1.0)
            max_features = UniformFloatHyperparameter(
                "max_features", 0.1, 1.0, default_value=1)
            max_leaf_nodes = UnParametrizedHyperparameter(
                name="max_leaf_nodes", value="None")
            min_impurity_decrease = UnParametrizedHyperparameter(
                name='min_impurity_decrease', value=0.0)
            cs.add_hyperparameters([
                loss, learning_rate, n_estimators, max_depth, criterion,
                min_samples_split, min_samples_leaf, min_weight_fraction_leaf,
                subsample, max_features, max_leaf_nodes, min_impurity_decrease
            ])
            return cs
        elif optimizer == 'tpe':
            from hyperopt import hp
            space = {
                'loss': hp.choice('gb_loss', ["ls", "lad"]),
                'learning_rate': hp.loguniform('gb_learning_rate', np.log(0.01), np.log(1)),
                # 'n_estimators': hp.randint('gb_n_estimators', 451) + 50,
                'n_estimators': hp.choice('gb_n_estimators', [100]),
                'max_depth': hp.randint('gb_max_depth', 8) + 1,
                'criterion': hp.choice('gb_criterion', ['friedman_mse', 'mse', 'mae']),
                'min_samples_split': hp.randint('gb_min_samples_split', 19) + 2,
                'min_samples_leaf': hp.randint('gb_min_samples_leaf', 20) + 1,
                'min_weight_fraction_leaf': hp.choice('gb_min_weight_fraction_leaf', [0]),
                'subsample': hp.uniform('gb_subsample', 0.1, 1),
                'max_features': hp.uniform('gb_max_features', 0.1, 1),
                'max_leaf_nodes': hp.choice('gb_max_leaf_nodes', [None]),
                'min_impurity_decrease': hp.choice('gb_min_impurity_decrease', [0])
            }
            return space
ax.set_title('Heatmap of Methylation Values') ax.set_xlabel("Samples sorted ascending by age") ax.set_ylabel("19 selected cgp positions") plt.savefig('heatmap19.jpg') print(data19_ages) # Trying GradientBoostingRegressor Learning from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor #X_train, X_test, y_train, y_test = train_test_split(data_f, annotations["Age_Group"], test_size=0.10) X_train, X_test, y_train, y_test = train_test_split(data19, data19_ages, test_size=0.10) gb = GradientBoostingRegressor(loss='lad', learning_rate=0.03, n_estimators=300, max_features='log2', subsample = 0.6, min_samples_split=2, max_depth=4, verbose = 1, warm_start = True) gb.fit(X_train, y_train) y = gb.predict(X_test) print(y.shape) print(y_test.shape) df = pd.DataFrame({"Real":y_test, "Pred":y.reshape(73)}) df = df.sort_values(by=['Real']) plt.scatter(x = range(73), y = df.Pred, c = 'b') plt.scatter(x = range(73), y = df.Real, c = 'r') plt.show() plt.scatter(x = df.Real, y = df.Pred, c = 'b') X_train, X_test, y_train, y_test = train_test_split(data19, data19_ages, test_size=0.10) model = tf.keras.models.Sequential([ tf.keras.layers.Dense(512, activation='relu', input_shape=(19,)),
6.58 ], [ 0.7842, 0., 8.14, 0., 0.538, 5.99, 81.7, 4.2579, 4., 307., 21., 386.75, 14.67 ], [ 0.80271, 0., 8.14, 0., 0.538, 5.456, 36.6, 3.7965, 4., 307., 21., 288.99, 11.69 ], [ 0.7258, 0., 8.14, 0., 0.538, 5.727, 69.5, 3.7965, 4., 307., 21., 390.95, 11.28 ]] # 要预测的新数据集 for i, new_point in enumerate(new_point_set): # 循环读出每个要预测的数据点 new_pre_y = model_gbr.predict(np.array(new_point).reshape(1, -1)) # 使用GBR进行预测 print('predict for new point %d is: %.2f' % (i + 1, new_pre_y) ) # 打印输出每个数据点的预测信息 #################################################################### # 4.3 分类分析 # 导入库 import numpy as np # 导入numpy库 from sklearn.model_selection import train_test_split # 数据分区库 from sklearn import tree # 导入决策树库 from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, recall_score, \ roc_curve # 导入指标库 import prettytable # 导入表格库 import pydotplus # 导入dot插件库 import matplotlib.pyplot as plt # 导入图形展示库
ss_y.inverse_transform(training_data_output), color='r', label='origin y') color_list = ['k+', 'b.', 'go', 'cv', 'y*', 'm^'] # 颜色列表 for i, pre_y in enumerate(pre_y_list): # 读出通过回归模型预测得到的索引及结果 plt.plot(x, ss_y.inverse_transform(pre_y_list[i].reshape(-1, 1)), color_list[i], label=model_names[i]) # 画出每条预测结果线 plt.title('Comparison of results by six regression model') # 标题 plt.legend(loc='upper right') plt.xlabel('Test data number') plt.ylabel('Filter weight gain(g/h)') plt.savefig("picture/all regression compare.jpg", dpi=500) plt.show() # 模型应用 print('regression prediction:') new_pre_y = ss_y.inverse_transform( model_gbr.predict(test_data_input).reshape(-1, 1)) # 使用GBR进行预测 print('predict data \t real data') test_y_output = ss_y.inverse_transform(test_data_output) for i in range(len(test_data_input)): print(' %.2f \t %0.2f' % (new_pre_y[i], test_y_output[i])) # 打印输出每个数据点的预测信息 mse = mean_squared_error(test_y_output, new_pre_y) print("The mse of model_gbr is : %f" % mse) # if __name__ == "__main__": # svm_baseline()
# Mean absolute error of the SVM predictions computed earlier in the script.
mae_svm = mean_absolute_error(Y_test, y_pred_svm)

# Random forest: fit, predict, then RMSE / R^2 / MAE on the test split.
from sklearn.ensemble import RandomForestRegressor
regressor_rf = RandomForestRegressor(n_estimators=20, random_state=0)
regressor_rf.fit(X_train, Y_train)
y_pred_rf = regressor_rf.predict(X_test)
rms_rf = sqrt(mean_squared_error(Y_test, y_pred_rf))
rsqrd_rf = r2_score(Y_test, y_pred_rf)
mae_rf = mean_absolute_error(Y_test, y_pred_rf)

# Gradient boosting: same evaluation.
# Import from the public package; the private
# ``sklearn.ensemble.gradient_boosting`` module is deprecated and absent from
# newer scikit-learn releases.
from sklearn.ensemble import GradientBoostingRegressor
# Least squares is the estimator's default loss, so it is left implicit: the
# old 'ls' spelling is rejected by newer scikit-learn releases.
regressor_gb = GradientBoostingRegressor(learning_rate=0.5, n_estimators=400)
regressor_gb.fit(X_train, Y_train)
y_pred_gb = regressor_gb.predict(X_test)
rms_gb = sqrt(mean_squared_error(Y_test, y_pred_gb))
rsqrd_gb = r2_score(Y_test, y_pred_gb)
mae_gb = mean_absolute_error(Y_test, y_pred_gb)

# Multiple linear regression: same evaluation.
from sklearn.linear_model import LinearRegression
regressor_lr = LinearRegression()
regressor_lr.fit(X_train, Y_train)
y_pred_lr = regressor_lr.predict(X_test)
rms_lr = sqrt(mean_squared_error(Y_test, y_pred_lr))
rsqrd_lr = r2_score(Y_test, y_pred_lr)
mae_lr = mean_absolute_error(Y_test, y_pred_lr)
'killsNorm', 'damageDealtNorm', 'boostsPerWalkDistance', 'healsPerWalkDistance', 'healsAndBoostsPerWalkDistance', 'killsPerWalkDistance' ]] return data train = pd.read_csv("train_V2.csv") train = train.sample(frac=0.3) train = preprocess(train) X = train[[ 'totalDistance', 'killsCategories', 'playersJoined', 'killsNorm', 'damageDealtNorm', 'boostsPerWalkDistance', 'healsPerWalkDistance', 'healsAndBoostsPerWalkDistance', 'killsPerWalkDistance' ]] y = train["winPlacePerc"] X_train, X_test, y_train, y_test = train_test_split(X, y) regressor = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=1, loss='ls') regressor.fit(X_train, y_train) test_y_predicted = regressor.predict(X_test) valiation(y_test, test_y_predicted)