def retrieve_training_dataset(X, predicted_indicator):
    """
    Transforms the raw dataset into a dataset that is suitable for training
    the model.

    Parameters
    ----------
    X:
        Covariable-cleaned database
    predicted_indicator:
        Name of the column in ``X`` that holds the response variable
        to be predicted

    Returns
    -------
    X_train
        Train data
    y_train
        Response train data
    data_train
        Train data readable for the package gpbooster, contains the
        information about X_train and y_train
    groups_train
        Group indices
    """
    logging.info('Retrieving training dataset')
    # Double brackets keep the response as a DataFrame rather than a Series
    y = X[[predicted_indicator]]
    X_train, y_train, groups_train = prepare_training_dataset(X, y)
    data_train = gpb.Dataset(X_train, y_train)
    return X_train, y_train, data_train, groups_train
def test_gpboost():
    """Check that SHAP values of a GPBoost model sum to the model output.

    Returns early (skipping the test) when the optional ``gpboost``
    dependency is not installed.
    """
    try:
        import gpboost
    except ImportError:
        # was a bare `except:`, which would also hide unrelated errors
        # (e.g. KeyboardInterrupt, broken installs) behind a silent skip
        print("Skipping test_gpboost!")
        return
    import shap

    # train gpboost model
    X, y = shap.datasets.boston()
    data_train = gpboost.Dataset(X, y, categorical_feature=[8])
    model = gpboost.train(params={'objective': 'regression_l2',
                                  'learning_rate': 0.1, 'verbose': 0},
                          train_set=data_train, num_boost_round=10)

    # explain the model's predictions using SHAP values
    ex = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")
    shap_values = ex.shap_values(X)
    predicted = model.predict(X, raw_score=True)
    # local accuracy: per-sample SHAP contributions plus the expected value
    # must reproduce the raw model prediction
    assert np.abs(shap_values.sum(1) + ex.expected_value - predicted).max() < 1e-4, \
        "SHAP values don't sum to model output!"
# NOTE(review): this excerpt starts mid-script — the next statement appears to
# be the body of a `for i in range(m)` loop whose header lies before this
# chunk; `n`, `m`, `group` and `f1d` are defined earlier. Confirm against the
# full file.
group[int(i * n / m):int((i + 1) * n / m)] = i
b1 = np.sqrt(0.5) * np.random.normal(size=m)  # simulate random effects
eps = b1[group]
eps = eps - np.mean(eps)  # center the random effects
# simulate fixed effects
X = np.random.rand(n, 2)
f = f1d(X[:, 0])
# simulate binary response variable via a probit link
probs = stats.norm.cdf(f + eps)
y = np.random.uniform(size=n) < probs
y = y.astype(np.float64)
# --------------------Parameter tuning using cross-validation: deterministic and random grid search----------------
# Create random effects model and Dataset
gp_model = gpb.GPModel(group_data=group, likelihood="bernoulli_probit")
data_train = gpb.Dataset(X, y)
# Other parameters not contained in the grid of tuning parameters
params = {'objective': 'binary', 'verbose': 0, 'num_leaves': 2**10}
# Small grid and deterministic grid search
param_grid_small = {
    'learning_rate': [0.1, 0.01],
    'min_data_in_leaf': [20, 100],
    'max_depth': [5, 10],
    'max_bin': [255, 1000]
}
# NOTE(review): the excerpt is cut off mid-call below — the remaining
# arguments of grid_search_tune_parameters continue beyond this chunk.
opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid_small, params=params, num_try_random=None, nfold=4, gp_model=gp_model, use_gp_model_for_validation=True,
b = np.sqrt(sigma2_1) * np.random.normal(size=m_test) # simulate random effects Zb = b[group] # Put everything together xi = np.sqrt(sigma2) * np.random.normal(size=n) # simulate error term y = F + Zb + xi # observed data # split train and test data y_train = y[0:ntrain] y_test = y[ntrain:n] X_train = X.iloc[0:ntrain,] X_test = X.iloc[ntrain:n,] # --------------------Learning and prediction---------------- # Define and train GPModel gp_model = gpb.GPModel(group_data=group_train) # create dataset for gpb.train function data_train = gpb.Dataset(X_train, y_train) # specify tree-boosting parameters as a dict params = { 'objective': 'regression_l2', 'learning_rate': 0.1, 'max_depth': 6, 'min_data_in_leaf': 5, 'verbose': 0 } # train model bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model, num_boost_round=32) gp_model.summary() # estimated covariance parameters # Covariance parameters in the following order: # ['Error_term', 'Group_1'] # [0.9183072 1.013057 ] # Make predictions pred = bst.predict(data=X_test, group_data_pred=group_test) y_pred = pred['fixed_effect'] + pred['random_effect_mean'] # sum predictions of fixed effect and random effect np.sqrt(np.mean((y_test - y_pred) ** 2)) # root mean square error (RMSE) on test data. Approx. = 1.25
# incidence matrix relating grouped random effects to samples Z1 = np.zeros((n, m)) for i in range(m): Z1[np.where(group == i), i] = 1 sigma2_1 = 1 ** 2 # random effect variance sigma2 = 0.1 ** 2 # error variance b1 = np.sqrt(sigma2_1) * np.random.normal(size=m) # simulate random effects eps = Z1.dot(b1) xi = np.sqrt(sigma2) * np.random.normal(size=n) # simulate error term y = F + eps + xi # observed data # define GPModel gp_model = gpb.GPModel(group_data=group) gp_model.set_optim_params(params={"optimizer_cov": "fisher_scoring"}) # create dataset for gpb.train data_train = gpb.Dataset(X, y) # specify your configurations as a dict params = { 'objective': 'regression_l2', 'learning_rate': 0.05, 'max_depth': 6, 'min_data_in_leaf': 5, 'verbose': 0 } print('Starting training...') # train bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model, num_boost_round=16)
X = np.random.rand(n, 2) # mean function plus noise y = f1d(X[:, 0]) + np.random.normal(scale=0.1, size=n) return ([X, y]) # Simulate data n = 1000 data = sim_data(2 * n) Xtrain = data[0][0:n, :] ytrain = data[1][0:n] Xtest = data[0][n:(2 * n), :] ytest = data[1][n:(2 * n)] # create dataset for gpb.train data_train = gpb.Dataset(Xtrain, ytrain) data_eval = gpb.Dataset(Xtest, ytest, reference=data_train) # specify your configurations as a dict params = { 'objective': 'regression_l2', 'metric': {'l2', 'l1'}, 'learning_rate': 0.1, 'max_depth': 6, 'min_data_in_leaf': 5, 'verbose': 0 } print('Starting training...') # train evals_result = {} # record eval results for plotting
plt.show()


def sim_data(n):
    """Function that simulates data. Two covariates of which only one has an effect"""
    X = np.random.rand(n, 2)
    # mean function plus noise; only the first covariate drives the response
    y = f1d(X[:, 0]) + np.random.normal(scale=0.1, size=n)
    return ([X, y])


# Simulate data (2n samples: n for training, n held out)
n = 1000
data = sim_data(2 * n)
# create dataset for gpb.train (first n rows only)
data_train = gpb.Dataset(data[0][0:n, :], data[1][0:n])
# specify your configurations as a dict
params = {
    'objective': 'regression_l2',
    'metric': {'l2', 'l1'},
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_data_in_leaf': 5,
    'verbose': 0
}
print('Starting cross-validation...')
# do cross-validation
# NOTE(review): the excerpt is cut off mid-call below — the remaining
# arguments of gpb.cv continue beyond this chunk.
cvbst = gpb.cv(params=params, train_set=data_train,
import numpy as np
import matplotlib.pyplot as plt

plt.style.use('ggplot')

# Load data — `grunfeld` is imported earlier in the file
# (presumably statsmodels' Grunfeld investment panel dataset; verify)
data = grunfeld.load_pandas().data
# Visualize response variable
plt.hist(data['invest'], bins=50)
plt.title("Histogram of response variable")

""" Boosting with two crossed firm and year grouped random effects """
# Define random effects model (assuming firm and year random effects)
gp_model = gpb.GPModel(group_data=data[['firm', 'year']])
# Create dataset for gpb.train
data_train = gpb.Dataset(data=data[['value', 'capital']], label=data['invest'])
# Specify boosting parameters as dict
# Note: no attempt has been made to optimally choose tuning parameters
params = {
    'objective': 'regression_l2',
    'learning_rate': 1,
    'max_depth': 6,
    'min_data_in_leaf': 1,
    'verbose': 0
}
# Train GPBoost model
bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model, num_boost_round=1800)
# Estimated random effects model (variances of random effects)
elif likelihood == "bernoulli_logit": probs = 1 / (1 + np.exp(-(f + eps))) y = np.random.uniform(size=n) < probs y = y.astype(np.float64) elif likelihood == "poisson": mu = np.exp(f + eps) y = stats.poisson.ppf(np.random.uniform(size=n), mu=mu) elif likelihood == "gamma": mu = np.exp(f + eps) y = mu * stats.gamma.ppf(np.random.uniform(size=n), a=1) fig1, ax1 = plt.subplots() ax1.hist(y, bins=50) # visualize response variable #--------------------Training---------------- # create dataset for gpb.train data_train = gpb.Dataset(X, y) # Train model gp_model = gpb.GPModel(group_data=group, likelihood=likelihood) # Use the option "trace": true to monitor convergence of hyperparameter estimation of the gp_model. E.g.: # gp_model.set_optim_params(params={"trace": True}) bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model, num_boost_round=num_boost_round) gp_model.summary() # Trained random effects model (true variance = 0.5) # Showing training loss gp_model = gpb.GPModel(group_data=group, likelihood=likelihood) bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model,
# %% test data(generate like train) x_test = np.random.rand(n_test * n_test, 2) F_x_test = f1d(x_test[:, 0]) xi_test = np.sqrt(sigma2) * np.random.normal(size=n_test * n_test) y_test = F_x_test + b_test + xi_test # %% chk plt.scatter(x_train[:, 1], y) plt.scatter(x_train[:, 0], y) # %% plt.scatter(x_test[:, 1], y_test) plt.scatter(x_test[:, 0], y_test) # %% training gp_model = gpb.GPModel(gp_coords=coords_train, cov_function="exponential") data_train = gpb.Dataset(x_train, y) params = { 'objective': 'rmse', 'learning_rate': 0.01, 'max_depth': 3, 'min_data_in_leaf': 10, 'num_leaves': 2**10, 'verbose': -1 } bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model, num_boost_round=247) print('estimated covariance parameters') gp_model.summary()
y = y.astype(np.float64) elif likelihood == "bernoulli_logit": probs = 1 / (1 + np.exp(-(f + eps))) y = np.random.uniform(size=n) < probs y = y.astype(np.float64) elif likelihood == "poisson": mu = np.exp(f + eps) y = stats.poisson.ppf(np.random.uniform(size=n), mu=mu) elif likelihood == "gamma": mu = np.exp(f + eps) y = stats.gamma.ppf(np.random.uniform(size=n), loc=mu, a=1) fig1, ax1 = plt.subplots() ax1.hist(y, bins=50) # visualize response variable # create dataset for gpb.train data_train = gpb.Dataset(X, y) # Train model gp_model = gpb.GPModel(group_data=group, likelihood=likelihood) # Use the option "trace": true to monitor convergence of hyperparameter estimation of the gp_model. E.g.: # gp_model.set_optim_params(params={"trace": True}) bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model, num_boost_round=num_boost_round) gp_model.summary() # Trained random effects model # Make predictions nplot = 200 # number of predictions X_test_plot = np.column_stack((np.linspace(0, 1, nplot), np.zeros(nplot))) group_data_pred = -np.ones(nplot) # Predict response variable
# NOTE(review): `lp`, `lp_test`, `n`, `X` and `X_test` are defined earlier in
# the file — confirm against the full script.
lp = lp * 5 + 0.2
lp_test = lp_test * 5 + 0.2
y = np.random.normal(loc=lp, scale=1)
y_test = np.random.normal(loc=lp_test, scale=1)
# apply censoring: clamp the response to the interval [yl, yu]
yu = 8
yl = 5
y[y >= yu] = yu
y[y <= yl] = yl
# censoring fractions (share of samples hitting each bound)
print(np.sum(y == yu) / n)
print(np.sum(y == yl) / n)
# train Grabit (tobit-objective) model and make predictions
params = {'objective': 'tobit', 'verbose': 0, 'yl': yl, 'yu': yu}
dtrain = gpb.Dataset(X, y)
bst = gpb.train(params=params, train_set=dtrain, num_boost_round=100)
y_pred = bst.predict(X_test)
# mean square error (approx. 1.1 for n=10'000)
print("Test error of Grabit: " + str(((y_pred - y_test)**2).mean()))
# compare to standard least squares gradient boosting (approx. 1.8 for n=10'000)
params = {'objective': 'regression_l2', 'verbose': 0}
bst = gpb.train(params=params, train_set=dtrain, num_boost_round=100)
y_pred_ls = bst.predict(X_test)
print("Test error of standard least squares gradient boosting: " + str(((y_pred_ls - y_test)**2).mean()))
# measure time
import time
params = {'objective': 'tobit', 'verbose': 0, 'yl': yl, 'yu': yu}
dtrain = gpb.Dataset(X, y)