def test_scm1d():
    """Multi-output regression benchmark on the scm1d dataset.

    Trains on raw (unscaled) features/targets and prints the uniformly
    averaged R2 score over all outputs.
    """
    df = pd.read_csv('scm1d.csv')
    # Target columns are all columns whose name contains 'L'.
    target = df.loc[:, df.columns.str.contains('L')]
    df.drop(target.columns, axis=1, inplace=True)
    df, target = df.to_numpy(), target.to_numpy()
    # Hold out 1658 of the 8145 samples, matching the dataset's canonical
    # train/test split sizes.
    X_train, X_test, y_train, y_test = train_test_split(
        df, target, test_size=1658.0 / 8145.0, random_state=42, shuffle=True)
    gb = GradientBoostingRegressor(
        l2_regularization=0.07054193143238725,
        min_samples_leaf=23,
        learning_rate=0.12336530854190006,
        max_iter=1999,
        n_iter_no_change=None,  # no early stopping
    )
    gb.fit(X_train, y_train)
    y_preds = gb.predict_multi(X_test)
    r2 = r2_score(y_test, y_preds, multioutput='uniform_average')
    print(r2)
def test_atp1d():
    """Multi-output regression benchmark on the atp1d dataset.

    Features and targets are both standardized before fitting; prints the
    uniformly averaged R2 score over all outputs.
    """
    data = pd.read_csv('atp1d.csv')
    # Every column prefixed with 'LBL' is a regression target.
    labels = data.loc[:, data.columns.str.startswith('LBL')]
    data.drop(labels.columns, axis=1, inplace=True)
    features, labels = data.to_numpy(), labels.to_numpy()
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, labels, test_size=0.5, random_state=42, shuffle=True)
    model = GradientBoostingRegressor(
        l2_regularization=0.003391634274257872,
        min_samples_leaf=10,
        learning_rate=0.1088115324113492,
        max_iter=199,
        n_iter_no_change=20)
    # Standardize inputs and outputs with statistics from the training split.
    x_scaler = StandardScaler().fit(X_tr)
    y_scaler = StandardScaler().fit(y_tr)
    X_tr_std = x_scaler.transform(X_tr)
    X_te_std = x_scaler.transform(X_te)
    y_tr_std = y_scaler.transform(y_tr)
    y_te_std = y_scaler.transform(y_te)
    model.fit(X_tr_std, y_tr_std)
    preds = model.predict_multi(X_te_std)
    score = r2_score(y_te_std, preds, multioutput='uniform_average')
    print(score)
def test_edm():
    """Multi-output regression benchmark on the edm dataset.

    Targets are the 'DFlow' and 'DGap' columns; features and targets are
    standardized and the uniformly averaged R2 score is printed.
    """
    data = pd.read_csv('edm.csv')
    targets = data.loc[:, ['DFlow', 'DGap']]
    data.drop(targets.columns, axis=1, inplace=True)
    features, targets = data.to_numpy(), targets.to_numpy()
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, targets, test_size=0.5, random_state=42, shuffle=True)
    model = GradientBoostingRegressor(
        l2_regularization=0.880826520747869,
        min_samples_leaf=12,
        learning_rate=0.22445307581959334,
        max_iter=279,
        n_iter_no_change=23,
    )
    # Fit scalers on the training split only, then apply to both splits.
    x_scaler = StandardScaler().fit(X_tr)
    y_scaler = StandardScaler().fit(y_tr)
    X_tr_std = x_scaler.transform(X_tr)
    X_te_std = x_scaler.transform(X_te)
    y_tr_std = y_scaler.transform(y_tr)
    y_te_std = y_scaler.transform(y_te)
    model.fit(X_tr_std, y_tr_std)
    preds = model.predict_multi(X_te_std)
    score = r2_score(y_te_std, preds, multioutput='uniform_average')
    print(score)
def test_wq():
    """Multi-output regression benchmark on the water-quality dataset.

    Targets are all columns prefixed with 'x'; features and targets are
    standardized and the uniformly averaged R2 score is printed.
    """
    data = pd.read_csv('water-quality.csv')
    targets = data.loc[:, data.columns.str.startswith('x')]
    data.drop(targets.columns, axis=1, inplace=True)
    features, targets = data.to_numpy(), targets.to_numpy()
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, targets, test_size=0.5, random_state=42, shuffle=True)
    model = GradientBoostingRegressor(
        l2_regularization=0.07509314619453317,
        min_samples_leaf=15,
        learning_rate=0.01948991297099692,
        max_iter=300,
        n_iter_no_change=17)
    # Standardize with training-split statistics only.
    x_scaler = StandardScaler().fit(X_tr)
    y_scaler = StandardScaler().fit(y_tr)
    X_tr_std = x_scaler.transform(X_tr)
    X_te_std = x_scaler.transform(X_te)
    y_tr_std = y_scaler.transform(y_tr)
    y_te_std = y_scaler.transform(y_te)
    model.fit(X_tr_std, y_tr_std)
    preds = model.predict_multi(X_te_std)
    score = r2_score(y_te_std, preds, multioutput='uniform_average')
    print(score)
def test_scm20d():
    """Multi-output regression benchmark on the scm20d dataset.

    Features and targets are standardized; prints the uniformly averaged
    R2 score over all outputs.
    """
    data = pd.read_csv('scm20d.csv')
    # Target columns are all columns whose name contains 'L'.
    targets = data.loc[:, data.columns.str.contains('L')]
    data.drop(targets.columns, axis=1, inplace=True)
    features, targets = data.to_numpy(), targets.to_numpy()
    # Hold out 1503 of the 7463 samples, matching the dataset's canonical
    # train/test split sizes.
    holdout_fraction = 1503.0 / 7463.0
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, targets, test_size=holdout_fraction, random_state=42,
        shuffle=True)
    model = GradientBoostingRegressor(
        l2_regularization=0.8640187696889217,
        min_samples_leaf=19,
        learning_rate=0.1164232801613771,
        max_iter=1998,
        n_iter_no_change=None,  # no early stopping
    )
    # Standardize with training-split statistics only.
    x_scaler = StandardScaler().fit(X_tr)
    y_scaler = StandardScaler().fit(y_tr)
    X_tr_std = x_scaler.transform(X_tr)
    X_te_std = x_scaler.transform(X_te)
    y_tr_std = y_scaler.transform(y_tr)
    y_te_std = y_scaler.transform(y_te)
    model.fit(X_tr_std, y_tr_std)
    preds = model.predict_multi(X_te_std)
    score = r2_score(y_te_std, preds, multioutput='uniform_average')
    print(score)
def test_early_stopping_regression(scoring, validation_split, tol):
    """Early stopping halts before max_iter iff a scoring method is given.

    With ``scoring=None`` the estimator must run the full ``max_iter``
    iterations; otherwise it must stop somewhere in
    ``[n_iter_no_change, max_iter)``.
    """
    max_iter = 500
    patience = 5
    X, y = make_regression(random_state=0)
    model = GradientBoostingRegressor(
        verbose=1,  # just for coverage
        scoring=scoring,
        tol=tol,
        validation_split=validation_split,
        max_iter=max_iter,
        n_iter_no_change=patience,
        random_state=0)
    model.fit(X, y)
    if scoring is None:
        assert model.n_iter_ == max_iter
    else:
        assert patience <= model.n_iter_ < max_iter
def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
                                     max_leaf_nodes):
    # Make sure pygbm has the same predictions as LGBM for easy targets.
    #
    # In particular when the size of the trees are bound and the number of
    # samples is large enough, the structure of the prediction trees found by
    # LightGBM and PyGBM should be exactly identical.
    #
    # Notes:
    # - Several candidate splits may have equal gains when the number of
    #   samples in a node is low (and because of float errors). Therefore the
    #   predictions on the test set might differ if the structure of the tree
    #   is not exactly the same. To avoid this issue we only compare the
    #   predictions on the test set when the number of samples is large enough
    #   and max_leaf_nodes is low enough.
    # - To ignore discrepancies caused by small differences the binning
    #   strategy, data is pre-binned if n_samples > 255.
    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 256
    X, y = make_regression(n_samples=n_samples, n_features=5,
                           n_informative=5, random_state=0)
    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=rng)
    est_pygbm = GradientBoostingRegressor(max_iter=max_iter,
                                          max_bins=max_bins,
                                          learning_rate=1,
                                          n_iter_no_change=None,
                                          min_samples_leaf=min_samples_leaf,
                                          max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_lightgbm_estimator(est_pygbm)
    est_lightgbm.fit(X_train, y_train)
    est_pygbm.fit(X_train, y_train)
    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)
    pred_lgbm = est_lightgbm.predict(X_train)
    pred_pygbm = est_pygbm.predict(X_train)
    # less than 1% of the predictions are different up to the 3rd decimal
    assert np.mean(abs(pred_lgbm - pred_pygbm) > 1e-3) < .011
    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lgbm = est_lightgbm.predict(X_test)
        pred_pygbm = est_pygbm.predict(X_test)
        # less than 1% of the predictions are different up to the 4th decimal
        assert np.mean(abs(pred_lgbm - pred_pygbm) > 1e-4) < .01
def test_pre_binned_data():
    """Check consistency between numerical and pre-binned inputs.

    Verifies that:
    - fitting and predicting on numerical data matches fitting and
      predicting on binned data;
    - fitting on numerical data then predicting on binned data also
      matches;
    - fitting on binned data and predicting on numerical data raises.
    """
    X, y = make_regression(random_state=0)
    gbdt = GradientBoostingRegressor(scoring=None, random_state=0)
    mapper = BinMapper(random_state=0)
    X_binned = mapper.fit_transform(X)
    # The estimator is refit from scratch before each prediction, so the
    # three predictions are independent of each other.
    num_num = gbdt.fit(X, y).predict(X)
    bin_bin = gbdt.fit(X_binned, y).predict(X_binned)
    num_bin = gbdt.fit(X, y).predict(X_binned)
    assert_allclose(num_num, bin_bin)
    assert_allclose(num_num, num_bin)
    assert_raises_regex(ValueError,
                        'This estimator was fitted with pre-binned data ',
                        gbdt.fit(X_binned, y).predict, X)
"""This example illustrates the use of scikit-learn's GridSearchCV. The grid search is used to determine the best learning rate.""" import numpy as np from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV from sklearn.model_selection import KFold from pygbm import GradientBoostingRegressor rng = np.random.RandomState(0) n_samples = int(1e6) X, y = make_regression(n_samples, random_state=rng) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) clf = GradientBoostingRegressor(max_iter=10, scoring=None, verbose=1, random_state=rng) param_grid = {'learning_rate': [1, .1, .01, .001]} cv = KFold(n_splits=3, random_state=rng) gs = GridSearchCV(clf, param_grid=param_grid, cv=cv) gs.fit(X_train, y_train) print(f'Best param: {gs.best_params_}') print(f'R2 coefficient: {gs.score(X_test, y_test)}')
"""This example illustrates the use of scikit-learn's GridSearchCV. The grid search is used to determine the best learning rate.""" import numpy as np from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV from sklearn.model_selection import KFold from pygbm import GradientBoostingRegressor rng = np.random.RandomState(0) n_samples = int(1e6) X, y = make_regression(n_samples, random_state=rng) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng) clf = GradientBoostingRegressor(max_iter=10, n_iter_no_change=None, verbose=1, random_state=rng) param_grid = {'learning_rate': [1, .1, .01, .001]} cv = KFold(n_splits=3, random_state=rng) gs = GridSearchCV(clf, param_grid=param_grid, cv=cv) gs.fit(X_train, y_train) print(f'Best param: {gs.best_params_}') print(f'R2 coefficient: {gs.score(X_test, y_test)}')