def GBM_grid_search(X_train, y_train, X_val, y_val):
    """Return the ``loss`` setting that scores best on the validation data.

    A single ``HistGradientBoostingRegressor`` with fixed hyper-parameters is
    refit on the training data once per candidate loss, and every fit is
    evaluated with the estimator's ``score`` method on the validation data.

    Parameters
    ----------
    X_train, y_train
        Data passed to ``fit``.
    X_val, y_val
        Data passed to ``score``.

    Returns
    -------
    The ``'loss'`` value of the best-scoring grid point.

    Raises
    ------
    ValueError
        If no candidate produced a score greater than ``-inf`` (for example
        when every validation score is NaN).
    """
    parameters = {
        'max_depth': 40,
        'min_samples_leaf': 1,
        'learning_rate': 0.01,
    }
    # NOTE(review): scikit-learn 1.0 renamed these losses to 'squared_error'
    # and 'absolute_error'. The old spellings are kept because the selected
    # name is handed back to the caller -- confirm the targeted version.
    param_grid = {
        'loss': [
            # 'poisson',
            'least_squares',
            'least_absolute_deviation',
        ]
    }

    model = HistGradientBoostingRegressor(random_state=1, **parameters)

    best_score = float('-inf')
    best_grid = None
    for g in ParameterGrid(param_grid):
        model.set_params(**g)
        model.fit(X_train, y_train)
        # Score once per candidate instead of once for the comparison and a
        # second time for the bookkeeping.
        score = model.score(X_val, y_val)
        if score > best_score:
            best_score = score
            best_grid = g

    if best_grid is None:
        # A NaN score never compares greater than -inf, which previously
        # surfaced as an UnboundLocalError on the return statement.
        raise ValueError(
            "no candidate loss produced a comparable validation score")
    return best_grid['loss']
def test_missing_values_resilience(problem, missing_proportion,
                                   expected_min_score_classification,
                                   expected_min_score_regression):
    # Make sure the estimators can deal with missing values and still yield
    # decent predictions.
    #
    # ``problem`` selects the regressor ('regression') or, for any other
    # value, the classifier; ``missing_proportion`` is the probability that
    # an individual entry of X is replaced by NaN; the two
    # ``expected_min_score_*`` arguments are the training-set score the
    # fitted estimator has to exceed.
    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 2
    if problem == 'regression':
        X, y = make_regression(n_samples=n_samples, n_features=n_features,
                               n_informative=n_features, random_state=rng)
        gb = HistGradientBoostingRegressor()
        expected_min_score = expected_min_score_regression
    else:
        X, y = make_classification(n_samples=n_samples,
                                   n_features=n_features,
                                   n_informative=n_features, n_redundant=0,
                                   n_repeated=0, random_state=rng)
        gb = HistGradientBoostingClassifier()
        expected_min_score = expected_min_score_classification

    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24.
    mask = rng.binomial(1, missing_proportion, size=X.shape).astype(bool)
    X[mask] = np.nan

    gb.fit(X, y)

    assert gb.score(X, y) > expected_min_score
def test_least_absolute_deviation():
    # Coverage-only smoke test: fit with the 'least_absolute_deviation' loss
    # and require a training score above 0.9.
    features, target = make_regression(n_samples=500, random_state=0)
    model = HistGradientBoostingRegressor(
        loss='least_absolute_deviation',
        random_state=0,
    )
    model.fit(features, target)
    train_score = model.score(features, target)
    assert train_score > .9
sys.path.append(project_path)

### Feature names: one per line, lines starting with '#' are skipped.
with open('car_price_feat.txt') as f:
    # Blank entries (e.g. the empty string left by a trailing newline) are
    # dropped as well; indexing ``x[0]`` on them used to raise IndexError.
    feat_list = [
        name for name in f.read().split('\n')
        if name and not name.startswith('#')
    ]

### Train / test tables.
data_train = pd.read_csv(f'{project_path}/data/car_price_train.201908.csv')
data_test = pd.read_csv(f'{project_path}/data/car_price_test.201908.csv')

### Keep only the rows of one model series in each table.
series_name = '宝马5系'
d_train = data_train[data_train.model_series == series_name]
d_test = data_test[data_test.model_series == series_name]

### gencode receives the full (unfiltered) train and test tables together;
### the resulting map is then applied to the filtered rows of each split.
label_encode_map, f_map = DataProcess.gencode(
    pd.concat([data_train, data_test]), feat_list)
en_train = DataProcess.encode_process(
    d_train[feat_list], feat_list, label_encode_map)
en_test = DataProcess.encode_process(
    d_test[feat_list], feat_list, label_encode_map)

#### Fit on the encoded training rows and evaluate on the test rows.
est = HistGradientBoostingRegressor(max_iter=200,
                                    learning_rate=0.3,
                                    max_depth=6,
                                    min_samples_leaf=20,
                                    max_leaf_nodes=40)
est.fit(en_train, d_train.price)
pred = est.predict(en_test)
evaluate(d_test, pred)

### R2
print(est.score(en_test, d_test.price))
def test_missing_values_minmax_imputation():
    # Check that the built-in missing value handling of the histogram-based
    # gradient boosting gives the same decision function as an a-priori
    # imputation strategy.
    #
    # Every feature (containing NaNs) is turned into 2 features:
    # - one in which the NaNs become min(feature) - 1
    # - one in which the NaNs become max(feature) + 1
    # A split sending NaNs to the left has an equivalent split on the first
    # (min) feature; a split sending NaNs to the right has an equivalent
    # split on the second (max) feature.
    #
    # Provided the data never produces a tie when selecting the best feature
    # to split on during training, the learned decision trees should be
    # strictly equivalent (i.e. learn a sequence of splits encoding the same
    # decision function).
    #
    # The MinMaxImputer transformer is a toy implementation of the
    # "Missing In Attributes" (MIA) missing value handling for decision trees
    # https://www.sciencedirect.com/science/article/abs/pii/S0167865508000305
    # Implementing MIA as an imputation transformer was suggested by
    # "Remark 3" in :arxiv:`1902.06931`

    class MinMaxImputer(TransformerMixin, BaseEstimator):

        def fit(self, X, y=None):
            # Only the per-feature extrema are needed; MinMaxScaler already
            # computes them.
            scaler = MinMaxScaler().fit(X)
            self.data_min_ = scaler.data_min_
            self.data_max_ = scaler.data_max_
            return self

        def transform(self, X):
            below = X.copy()
            above = X.copy()
            for col in range(X.shape[1]):
                missing = np.isnan(X[:, col])
                below[missing, col] = self.data_min_[col] - 1
                above[missing, col] = self.data_max_[col] + 1
            return np.concatenate([below, above], axis=1)

    def make_missing_value_data(n_samples=int(1e4), seed=0):
        rng = np.random.RandomState(seed)
        X, y = make_regression(n_samples=n_samples, n_features=4,
                               random_state=rng)

        # Pre-binning makes both strategies behave deterministically and
        # makes it simple to insert np.nan in a structured way.
        binner = KBinsDiscretizer(n_bins=42, encode="ordinal")
        X = binner.fit_transform(X)

        # Feature 0: values missing completely at random.
        X[rng.rand(X.shape[0]) > 0.9, 0] = np.nan

        # Features 1 and 2: extreme values are missing (censoring
        # missingness), lowest bin and highest bin respectively.
        X[X[:, 1] == 0, 1] = np.nan
        X[X[:, 2] == X[:, 2].max(), 2] = np.nan

        # Feature 3: the NaN pattern is very informative about the target.
        y_cap = np.percentile(y, 70)
        capped = y >= y_cap
        y[capped] = y_cap
        X[capped, 3] = np.nan

        # Every feature must contain at least one missing value.
        for col in range(X.shape[1]):
            assert np.isnan(X[:, col]).any()

        # A held-out test set lets us verify that the learned decision
        # function is the same on unseen data; otherwise the two models
        # might just overfit the training set in two independent ways.
        return train_test_split(X, y, random_state=rng)

    # n_samples must be large enough to make several candidate splits with
    # an identical gain in a given tree unlikely.
    X_train, X_test, y_train, y_test = make_missing_value_data(
        n_samples=int(1e4), seed=0)

    # Few leaf nodes and few iterations keep the models under-fitted, which
    # further reduces the likelihood of ties during training.
    gbm1 = HistGradientBoostingRegressor(max_iter=100,
                                         max_leaf_nodes=5,
                                         random_state=0)
    gbm1.fit(X_train, y_train)

    gbm2 = make_pipeline(MinMaxImputer(), clone(gbm1))
    gbm2.fit(X_train, y_train)

    splits = ((X_train, y_train), (X_test, y_test))

    # Both models must reach the same score ...
    for X_part, y_part in splits:
        assert gbm1.score(X_part, y_part) == \
            pytest.approx(gbm2.score(X_part, y_part))

    # ... and, as a finer-grained check of the decision function, the same
    # individual predictions.
    for X_part, _ in splits:
        assert_allclose(gbm1.predict(X_part), gbm2.predict(X_part))
def test_absolute_error():
    # Coverage-only smoke test: fit with the "absolute_error" loss and
    # require a training score above 0.9.
    features, target = make_regression(n_samples=500, random_state=0)
    model = HistGradientBoostingRegressor(
        loss="absolute_error",
        random_state=0,
    )
    model.fit(features, target)
    train_score = model.score(features, target)
    assert train_score > 0.9
def EvaluateInput(index):
    """Predict the action for ``index`` and update the global tallies.

    The rows preceding ``index`` in ``Forecaster.dataset`` are flattened into
    a single sample and passed to ``est.predict``. The prediction and the
    actual next value (position 2 of the dataset row at ``index``) are each
    turned into an action by ``Forecaster.ActualOutput``; the global counter
    ``Correct`` or ``Wrong`` is incremented depending on whether the two
    actions are equal.

    Parameters
    ----------
    index
        Position in ``Forecaster.dataset`` to evaluate.

    Returns
    -------
    An explanatory string when ``index`` is smaller than
    ``Forecaster.window_size`` (nothing is counted in that case),
    otherwise ``None``.
    """
    global Correct, Wrong
    window = Forecaster.window_size
    if index < window:
        return "Please Choose an index with enough Windows Values"

    CorrectNextCP = Forecaster.dataset[index][2]
    CurrentCP = Forecaster.dataset[index - 1][2]
    CorrectAction = Forecaster.ActualOutput(CurrentCP, CorrectNextCP)

    # Slice with the same window length the guard above checks, rather than a
    # hard-coded 10 that only agrees with the guard when window_size == 10.
    # NOTE(review): assumes `est` was trained on window_size-row inputs --
    # confirm against the code that builds Forecaster.FlattenedXInput.
    InputToRegressor = Forecaster.dataset[index - window:index]
    FlattenedInput = np.reshape(InputToRegressor, (1, -1))

    Result = est.predict(FlattenedInput)
    PredictedAction = Forecaster.ActualOutput(CurrentCP, Result)

    if PredictedAction == CorrectAction:
        print("Correct")
        Correct += 1
    else:
        print("Wrong")
        Wrong += 1


print(est.score(Forecaster.FlattenedXInput, Forecaster.YInput))

# Surface the "not enough history" message instead of discarding it.
message = EvaluateInput(10)
if message is not None:
    print(message)

# Nothing is tallied when the evaluation was rejected, so guard the ratio
# against a zero denominator.
evaluated = Correct + Wrong
if evaluated:
    print(Correct / evaluated)
#!/usr/bin/env python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

n_samples = 1000
n_features = 20
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)

# Non-negative integer target with many zeros, driven by column 5 of X.
y = rng.poisson(lam=np.exp(X[:, 5]) / 2)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

glm = PoissonRegressor()
gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01)

# Fit both models first, then report their test scores in the same order
# (linear model, then boosted trees).
for model in (glm, gbdt):
    model.fit(X_train, y_train)
for model in (glm, gbdt):
    print(model.score(X_test, y_test))
'for the California housing dataset, with MLPRegressor')
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

##############################################################################
# Partial Dependence computation for Gradient Boosting
# ----------------------------------------------------
#
# Let's now fit a HistGradientBoostingRegressor and compute the partial
# dependence plots for either one or two variables at a time.

print("Training GradientBoostingRegressor...")
tic = time()
est = HistGradientBoostingRegressor()
est.fit(X_train, y_train)
print("done in {:.3f}s".format(time() - tic))
print("Test R2 score: {:.2f}".format(est.score(X_test, y_test)))

##############################################################################
# Here, we used the default hyperparameters for the gradient boosting model
# without any preprocessing, as tree-based models are naturally robust to
# monotonic transformations of numerical features.
#
# Note that on this tabular dataset, Gradient Boosting Machines are both
# significantly faster to train and more accurate than neural networks. It is
# also significantly cheaper to tune their hyperparameters (the defaults tend
# to work well, while this is not often the case for neural networks).
#
# Finally, as we will see next, computing partial dependence plots for
# tree-based models is also orders of magnitude faster, making it cheap to
# compute partial dependence plots for pairs of interacting features:
#
# Below we fit the model on a large dataset so that its computation time can
# be compared with the earlier experiment from the previous section.

# %%
from time import time
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingRegressor

histogram_gradient_boosting = HistGradientBoostingRegressor(
    max_iter=200,
    random_state=0,
)

# Time the fit.
start_time = time()
histogram_gradient_boosting.fit(X_train, y_train)
fit_time_histogram_gradient_boosting = time() - start_time

# Time the scoring on the test set.
start_time = time()
score_histogram_gradient_boosting = histogram_gradient_boosting.score(
    X_test, y_test)
score_time_histogram_gradient_boosting = time() - start_time

report_lines = (
    "Historgram gradient boosting decision tree",
    f"R2 score: {score_histogram_gradient_boosting:.3f}",
    f"Fit time: {fit_time_histogram_gradient_boosting:.2f} s",
    f"Score time: {score_time_histogram_gradient_boosting:.5f} s\n",
)
for report_line in report_lines:
    print(report_line)

# %% [markdown]
# The histogram gradient-boosting is the best algorithm in terms of score.
# It will also scale when the number of samples increases, while the normal
# gradient-boosting will not.
# Algorithm #5: gradient boosting
# max_features=None considers every feature at each split. That is what the
# former 'auto' value meant for this regressor; 'auto' is no longer accepted
# by recent scikit-learn releases.
reg5 = GradientBoostingRegressor(n_estimators=500,
                                 max_features=None,
                                 random_state=42)
reg5.fit(X_train, y_train)

train_score_5 = reg5.score(X_train, y_train)
test_score_5 = reg5.score(X_test, y_test)
print('Score on train: ' + str(train_score_5))  # Score on train: 0.843
print('Score on test: ' + str(test_score_5))  # Score on test: 0.606

# Algorithm #6: fast (histogram-based) gradient boosting
reg6 = HistGradientBoostingRegressor(max_iter=500, random_state=42)
reg6.fit(X_train, y_train)

train_score_6 = reg6.score(X_train, y_train)
test_score_6 = reg6.score(X_test, y_test)
print('Score on train: ' + str(train_score_6))  # Score on train: 0.993
print('Score on test: ' + str(test_score_6))  # Score on test: 0.674

# Sweep over different tree depths for the histogram-based gradient boosting.
# NOTE(review): `all_time`, `timer` and `hgb_scores` are assigned but not
# read in the part of the loop visible here.
all_time = datetime.now()
hgb_scores = {}
for i in range(2, 31):
    timer = datetime.now()
    print(i)
    reg6 = HistGradientBoostingRegressor(max_depth=i,
                                         max_iter=500,
                                         random_state=42)
    reg6.fit(X_train, y_train)
def modeling_compare(X, y):
    """Fit a fixed collection of models on ``(X, y)`` and tabulate scores.

    Every model is fitted on the full data and evaluated on that same data
    with its own ``score`` method, so the reported values are in-sample.

    Parameters
    ----------
    X, y
        Features and target passed unchanged to every ``fit`` and ``score``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``models_lab`` (display name),
        ``models_obj`` (fitted estimator) and ``score``.
    """
    import pandas as pd
    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import Ridge
    from sklearn.linear_model import RidgeCV
    from sklearn.model_selection import RepeatedKFold
    from sklearn.linear_model import ElasticNet
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import PoissonRegressor
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.linear_model import Lasso
    from sklearn.linear_model import SGDRegressor
    from sklearn.neural_network import MLPClassifier
    from sklearn.ensemble import VotingRegressor

    models_lab = [
        'Linear Regression',
        'Ridge',
        'Ridge with tuning hyperparameters',
        'Elastic Net',
        'Random Forest',
        'Poisson Regression',
        'Gradient Boosting regression',
        'Lasso',
        'Stochastic Gradient Descent',
        'Neural Network',
        'Voting Regression',
    ]

    reg1 = LinearRegression().fit(X, y)
    reg2 = Ridge().fit(X, y)

    # Ridge with alpha selected by repeated k-fold cross-validation.
    # ``np.arange`` is used explicitly: this function only imports ``np``,
    # so a bare ``arange`` relied on a name from outside the function.
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    reg3 = RidgeCV(alphas=np.arange(0, 1, 0.01),
                   cv=cv,
                   scoring='neg_mean_absolute_error').fit(X, y)

    reg4 = ElasticNet().fit(X, y)
    reg5 = RandomForestRegressor().fit(X, y)
    reg6 = PoissonRegressor().fit(X, y)
    reg7 = HistGradientBoostingRegressor(loss='poisson',
                                         learning_rate=.01).fit(X, y)
    reg8 = Lasso().fit(X, y)
    # The squared loss is SGDRegressor's default. Leaving ``loss`` unset
    # avoids the 'squared_loss' spelling, which newer scikit-learn releases
    # no longer accept, without changing the fitted model.
    reg9 = SGDRegressor(penalty='l2').fit(X, y)
    # NOTE(review): this is a classifier, so its ``score`` is an accuracy
    # while every other entry reports R^2 -- confirm whether MLPRegressor
    # was intended.
    reg10 = MLPClassifier(solver='lbfgs',
                          alpha=1e-5,
                          hidden_layer_sizes=(17, 10),
                          random_state=1).fit(X, y)

    # VotingRegressor without NN
    ereg = VotingRegressor(estimators=[
        ('lr', reg1),
        ('rd', reg2),
        ('rs', reg3),
        ('en', reg4),
        ('rf', reg5),
        ('pr', reg6),
        ('gb', reg7),
        ('ls', reg8),
        ('gd', reg9),
    ]).fit(X, y)

    models_obj = [
        reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10, ereg
    ]
    score = [model.score(X, y) for model in models_obj]

    score_df = pd.DataFrame()
    score_df['models_lab'] = models_lab
    score_df['models_obj'] = models_obj
    score_df['score'] = score
    return score_df
from sklearn.inspection import plot_partial_dependence from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.datasets._california_housing import fetch_california_housing df = fetch_california_housing(as_frame=True) print(df) x = df['data'] y = df['target'] est = HistGradientBoostingRegressor().fit(x, y) est.score(x, y) features = ['HouseAge', 'HouseAge', ['MedInc', 'HouseAge']] plot_partial_dependence(est, x, features=features) """We can clearly see an interaction between the two features: for an Median income > 4.5, the House price is Dependent on HouseAge, for MedIncome < 4.5 NO STRONG depedence HousePricing and HouseAge. Makes sense ! Many Rich people Create more brand new Houses so they affect the House prices. if we lived in Sao Paolo the very low income will not affect the Relationship(Prices, Population) because everyone is poor. """ from matplotlib import pyplot as plt plt.gca() plt.show() """Disadvantages of PDP: - The realistic maximum number of features in a partial dependence function is two. - The assumption of independence is the biggest issue with PD plots. It is assumed that the feature(s) for which the partial dependence is computed are not correlated with other features. One solution to this problem is Accumulated Local Effect plots or short ALE plots that work with the conditional instead of the marginal distribution. -By plotting the individual conditional expectation curves instead of the aggregated line,