import logging

import numpy as np
import gpboost as gpb


def get_booster_model(data_train, groups_train):
    """Gets the model and defines its parameters.

    For finding the optimal number of iterations, cross-validation is applied.

    Parameters
    ----------
    data_train: Train data readable for the package gpboost, should contain
        the information about X_train and y_train
    groups_train: Group indices

    Returns
    -------
    gp_model
        Instance of the gradient tree boosting model with random effects
    params
        Parameters with which the model should be trained
    opt_num_boost_rounds
        Optimal number of boosting rounds for the training, found with
        cross-validation
    """
    logging.info('Getting booster model')
    gp_model = gpb.GPModel(group_data=groups_train)
    gp_model.set_optim_params(params={"optimizer_cov": "fisher_scoring"})
    params = {
        'objective': 'regression_l2',
        'learning_rate': 0.05,
        'max_depth': 6,
        'min_data_in_leaf': 5,
        'verbose': 0
    }
    logging.info('Calculating optimal number of boost rounds '
                 'via cross-validation')
    cvbst = gpb.cv(params=params, train_set=data_train,
                   gp_model=gp_model, use_gp_model_for_validation=True,
                   num_boost_round=300, early_stopping_rounds=5,
                   nfold=3, verbose_eval=False, show_stdv=False, seed=1)
    opt_num_boost_rounds = np.argmin(cvbst['l2-mean'])
    return gp_model, params, opt_num_boost_rounds
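# Illustrative usage sketch (added, not part of the original module): shows how
# the returned model, parameters, and round count feed into gpb.train. The
# synthetic data below is an assumption purely for demonstration.
if __name__ == '__main__':
    rng = np.random.default_rng(1)
    n, m = 500, 25
    groups_demo = np.repeat(np.arange(m), n // m)  # balanced group labels
    X_demo = rng.random((n, 2))  # two random features
    y_demo = (X_demo[:, 0] + rng.standard_normal(m)[groups_demo]
              + 0.1 * rng.standard_normal(n))  # fixed + random effects + noise
    dtrain_demo = gpb.Dataset(X_demo, y_demo)
    gp_model, params, opt_rounds = get_booster_model(dtrain_demo, groups_demo)
    bst = gpb.train(params=params, train_set=dtrain_demo, gp_model=gp_model,
                    num_boost_round=int(opt_rounds))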
group = np.arange(n)  # grouping variable
for i in range(m):
    group[int(i * n / m):int((i + 1) * n / m)] = i
# incidence matrix relating grouped random effects to samples
Z1 = np.zeros((n, m))
for i in range(m):
    Z1[np.where(group == i), i] = 1
sigma2_1 = 1 ** 2  # random effect variance
sigma2 = 0.5 ** 2  # error variance
np.random.seed(1)
b1 = np.sqrt(sigma2_1) * np.random.normal(size=m)  # simulate random effects
eps = Z1.dot(b1)
xi = np.sqrt(sigma2) * np.random.normal(size=n)  # simulate error term
y = eps + xi  # observed data

# Define and fit model
gp_model = gpb.GPModel(group_data=group)
gp_model.fit(y=y, std_dev=True)
gp_model.summary()

# Make predictions
group_test = np.arange(m)
pred = gp_model.predict(group_data_pred=group_test)
# Compare true and predicted random effects
plt.scatter(b1, pred['mu'])
plt.title("Comparison of true and predicted random effects")
plt.xlabel("truth")
plt.ylabel("predicted")
plt.show()
# Also predict covariance matrix
pred = gp_model.predict(group_data_pred=np.array([1, 1, 2, 2, -1, -1]),
                        predict_cov_mat=True)
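# The returned dict contains the predicted means under 'mu' and, since
# predict_cov_mat=True, the full predictive covariance matrix under 'cov';
# the group label -1 above marks a new, previously unobserved group
print(pred['mu'])
print(pred['cov'])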
for i in range(m):
    group[int(i * n / m):int((i + 1) * n / m)] = i
b1 = np.sqrt(0.5) * np.random.normal(size=m)  # simulate random effects
eps = b1[group]
eps = eps - np.mean(eps)
# simulate fixed effects
X = np.random.rand(n, 2)
f = f1d(X[:, 0])
# simulate response variable
probs = stats.norm.cdf(f + eps)
y = np.random.uniform(size=n) < probs
y = y.astype(np.float64)

# --------------------Parameter tuning using cross-validation: deterministic and random grid search----------------
# Create random effects model and Dataset
gp_model = gpb.GPModel(group_data=group, likelihood="bernoulli_probit")
data_train = gpb.Dataset(X, y)
# Other parameters not contained in the grid of tuning parameters
params = {'objective': 'binary', 'verbose': 0, 'num_leaves': 2 ** 10}
# Small grid and deterministic grid search
param_grid_small = {
    'learning_rate': [0.1, 0.01],
    'min_data_in_leaf': [20, 100],
    'max_depth': [5, 10],
    'max_bin': [255, 1000]
}
opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid_small,
                                             params=params,
                                             num_try_random=None,
                                             nfold=4,
                                             gp_model=gp_model,
                                             train_set=data_train)  # minimal completion; train_set is required, any further tuning arguments are omitted
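# grid_search_tune_parameters returns a dict describing the best combination
# found; a brief inspection sketch (keys as used in the gpboost tuning demos):
print('Best parameters: ' + str(opt_params['best_params']))
print('Best number of iterations: ' + str(opt_params['best_iter']))
print('Best score: ' + str(opt_params['best_score']))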
for i in range(m_test):
    group_test[int(i * ntrain / m_test):int((i + 1) * ntrain / m_test)] = i
group = np.concatenate((group_train, group_test))
b = np.sqrt(sigma2_1) * np.random.normal(size=m_test)  # simulate random effects
Zb = b[group]
# Put everything together
xi = np.sqrt(sigma2) * np.random.normal(size=n)  # simulate error term
y = F + Zb + xi  # observed data
# split train and test data
y_train = y[0:ntrain]
y_test = y[ntrain:n]
X_train = X.iloc[0:ntrain, :]
X_test = X.iloc[ntrain:n, :]

# --------------------Learning and prediction----------------
# Define and train GPModel
gp_model = gpb.GPModel(group_data=group_train)
# create dataset for gpb.train function
data_train = gpb.Dataset(X_train, y_train)
# specify tree-boosting parameters as a dict
params = {
    'objective': 'regression_l2',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_data_in_leaf': 5,
    'verbose': 0
}
# train model
bst = gpb.train(params=params, train_set=data_train,
                gp_model=gp_model, num_boost_round=32)
gp_model.summary()  # estimated covariance parameters
# Covariance parameters in the following order:
# ['Error_term', 'Group_1']
# [0.9183072 1.013057 ]

# Make predictions
pred = bst.predict(data=X_test, group_data_pred=group_test)
# sum the predictions of the fixed effect and the random effect
y_pred = pred['fixed_effect'] + pred['random_effect_mean']
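# Hedged follow-up sketch (not in the original snippet): evaluate the combined
# prediction against the held-out responses
rmse_test = np.sqrt(np.mean((y_test - y_pred) ** 2))
print('Test RMSE: ' + str(rmse_test))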
b_random_slope = 0.75 * np.random.normal(size=m)
y_crossed_random_slope = (b[group] + b_crossed[group_crossed]
                          + x * b_random_slope[group] + xi)
# Simulate data for two nested random effects
m_nested = 200  # number of categories / levels for the second nested grouping variable
group_nested = np.arange(n)  # grouping variable for nested lower level random effects
for i in range(m_nested):
    group_nested[int(i * n / m_nested):int((i + 1) * n / m_nested)] = i
b_nested = 1. * np.random.normal(size=m_nested)  # nested lower level random effects
y_nested = b[group] + b_nested[group_nested] + xi  # observed data

# --------------------Grouped random effects model: single-level random effect----------------
# --------------------Training----------------
gp_model = gpb.GPModel(group_data=group, likelihood="gaussian")
gp_model.fit(y=y, X=X, params={"std_dev": True})
gp_model.summary()

# Use other optimization specifications (gradient descent with Nesterov acceleration)
# and monitor convergence of the optimization ("trace": True)
gp_model = gpb.GPModel(group_data=group, likelihood="gaussian")
gp_model.fit(y=y, X=X,
             params={"optimizer_cov": "gradient_descent", "lr_cov": 0.1,
                     "std_dev": True, "use_nesterov_acc": True,
                     "maxit": 100, "trace": True})
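# The estimated variance components can also be retrieved programmatically via
# the GPModel API (get_cov_pars); a brief illustrative check:
cov_pars = gp_model.get_cov_pars()
print(cov_pars)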
group = np.arange(n)  # grouping variable
for i in range(m):
    group[int(i * n / m):int((i + 1) * n / m)] = i
# incidence matrix relating grouped random effects to samples
Z1 = np.zeros((n, m))
for i in range(m):
    Z1[np.where(group == i), i] = 1
sigma2_1 = 1 ** 2  # random effect variance
sigma2 = 0.1 ** 2  # error variance
b1 = np.sqrt(sigma2_1) * np.random.normal(size=m)  # simulate random effects
eps = Z1.dot(b1)
xi = np.sqrt(sigma2) * np.random.normal(size=n)  # simulate error term
y = F + eps + xi  # observed data

# define GPModel
gp_model = gpb.GPModel(group_data=group)
gp_model.set_optim_params(params={"optimizer_cov": "fisher_scoring"})
# create dataset for gpb.train
data_train = gpb.Dataset(X, y)
# specify your configurations as a dict
params = {
    'objective': 'regression_l2',
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_data_in_leaf': 5,
    'verbose': 0
}

print('Starting training...')
# train; num_boost_round below is an assumed placeholder - tune it e.g. via gpb.cv
bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
                num_boost_round=50)
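# Continuation mirroring the other examples in this collection: inspect the
# estimated covariance parameters after training
gp_model.summary()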
if likelihood == "bernoulli_probit":
    probs = stats.norm.cdf(f + eps)
    y = np.random.uniform(size=n) < probs
    y = y.astype(np.float64)
elif likelihood == "bernoulli_logit":
    probs = 1 / (1 + np.exp(-(f + eps)))
    y = np.random.uniform(size=n) < probs
    y = y.astype(np.float64)
elif likelihood == "poisson":
    mu = np.exp(f + eps)
    y = stats.poisson.ppf(np.random.uniform(size=n), mu=mu)
elif likelihood == "gamma":
    mu = np.exp(f + eps)
    y = mu * stats.gamma.ppf(np.random.uniform(size=n), a=1)  # scale by mu; using loc=mu would shift, not scale, the distribution
plt.hist(y, bins=50)  # visualize response variable

# Train model
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
gp_model.fit(y=y, X=X)  # use option params={"trace": True} for monitoring convergence
gp_model.summary()

# Make predictions
group_test = np.arange(m)
X_test = np.column_stack((np.ones(m), np.zeros(m)))
# Predict latent variable
pred_lin = gp_model.predict(X_pred=X_test, group_data_pred=group_test,
                            predict_var=True, predict_response=False)
pred_lin['mu'][0:5]  # Predicted latent mean
pred_lin['var'][0:5]  # Predicted latent variance
# Predict response variable
pred_resp = gp_model.predict(X_pred=X_test, group_data_pred=group_test,
                             predict_var=True, predict_response=True)
pred_resp['mu'][0:5]  # Predicted response variable (label)
for i in range(m):
    group[int(i * n / m):int((i + 1) * n / m)] = i
b1 = np.random.normal(size=m)  # simulate random effects
eps = b1[group]


# simulate fixed effects
def f1d(x):
    """Non-linear function for simulation"""
    return 1.7 * (1 / (1 + np.exp(-(x - 0.5) * 20)) + 0.75 * x)


X = np.random.rand(n, 2)
f = f1d(X[:, 0])
xi = np.sqrt(0.01) * np.random.normal(size=n)  # simulate error term
y = f + eps + xi  # observed data

print('Starting training...')
# define GPModel
gp_model = gpb.GPModel(group_data=group, likelihood="gaussian")
# train using the scikit-learn style interface
bst = gpb.GPBoostRegressor(max_depth=6,
                           learning_rate=0.05,
                           min_data_in_leaf=5,
                           n_estimators=15)
bst.fit(X, y, gp_model=gp_model)
print("Estimated random effects model")
gp_model.summary()

print('Starting predicting...')
# predict
group_test = np.arange(m)
Xtest = np.zeros((m, 2))
Xtest[:, 0] = np.linspace(0, 1, m)
pred = bst.predict(X=Xtest, group_data_pred=group_test)
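# Hedged follow-up sketch: assuming the scikit-learn style predict returns the
# same dict keys as the gpb.train-based examples above, the overall prediction
# is the sum of the fixed-effect and random-effect parts
y_pred = pred['fixed_effect'] + pred['random_effect_mean']
print(y_pred[0:5])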
group = np.arange(n)  # grouping variable
for i in range(m):
    group[int(i * n / m):int((i + 1) * n / m)] = i
# incidence matrix relating grouped random effects to samples
Z1 = np.zeros((n, m))
for i in range(m):
    Z1[np.where(group == i), i] = 1
sigma2_1 = 1 ** 2  # random effect variance
sigma2 = 0.1 ** 2  # error variance
b1 = np.sqrt(sigma2_1) * np.random.normal(size=m)  # simulate random effects
eps = Z1.dot(b1)
xi = np.sqrt(sigma2) * np.random.normal(size=n)  # simulate error term
y = F + eps + xi  # observed data

# define GPModel
gp_model = gpb.GPModel(group_data=group)
gp_model.set_optim_params(params={"optimizer_cov": "fisher_scoring"})
# create dataset for gpb.train
data_train = gpb.Dataset(X, y)
# specify your configurations as a dict
params = {
    'objective': 'regression_l2',
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_data_in_leaf': 5,
    'verbose': 0
}

print('Starting cross-validation...')
# do cross-validation; the arguments after params mirror the gpb.cv call in
# get_booster_model above, and the exact values here are assumptions
cvbst = gpb.cv(params=params, train_set=data_train,
               gp_model=gp_model, use_gp_model_for_validation=True,
               num_boost_round=300, early_stopping_rounds=5,
               nfold=3, verbose_eval=False, show_stdv=False, seed=1)
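# As in get_booster_model above, the optimal number of boosting rounds is the
# argmin of the cross-validated l2 metric
opt_num_boost_rounds = np.argmin(cvbst['l2-mean'])
print('Optimal number of iterations: ' + str(opt_num_boost_rounds))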
import gpboost as gpb
from statsmodels.datasets import grunfeld
import numpy as np
import matplotlib.pyplot as plt

plt.style.use('ggplot')

# Load data
data = grunfeld.load_pandas().data
# Visualize response variable
plt.hist(data['invest'], bins=50)
plt.title("Histogram of response variable")

"""
Boosting with two crossed firm and year grouped random effects
"""
# Define random effects model (assuming firm and year random effects)
gp_model = gpb.GPModel(group_data=data[['firm', 'year']])
# Create dataset for gpb.train
data_train = gpb.Dataset(data=data[['value', 'capital']], label=data['invest'])
# Specify boosting parameters as dict
# Note: no attempt has been made to optimally choose the tuning parameters
params = {
    'objective': 'regression_l2',
    'learning_rate': 1,
    'max_depth': 6,
    'min_data_in_leaf': 1,
    'verbose': 0
}
# Train GPBoost model; num_boost_round below is an assumed placeholder
bst = gpb.train(params=params, train_set=data_train,
                gp_model=gp_model, num_boost_round=50)
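# Hedged prediction sketch (mirroring the other examples in this collection):
# predictions combine the tree-ensemble fixed effect and the predicted
# firm/year random effects
pred = bst.predict(data=data[['value', 'capital']],
                   group_data_pred=data[['firm', 'year']])
y_pred = pred['fixed_effect'] + pred['random_effect_mean']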
# %% test data (generated like the training data)
x_test = np.random.rand(n_test * n_test, 2)
F_x_test = f1d(x_test[:, 0])
xi_test = np.sqrt(sigma2) * np.random.normal(size=n_test * n_test)
y_test = F_x_test + b_test + xi_test

# %% check: visualize features vs. response
plt.scatter(x_train[:, 1], y)
plt.scatter(x_train[:, 0], y)

# %%
plt.scatter(x_test[:, 1], y_test)
plt.scatter(x_test[:, 0], y_test)

# %% training
gp_model = gpb.GPModel(gp_coords=coords_train, cov_function="exponential")
data_train = gpb.Dataset(x_train, y)
params = {
    'objective': 'rmse',
    'learning_rate': 0.01,
    'max_depth': 3,
    'min_data_in_leaf': 10,
    'num_leaves': 2 ** 10,
    'verbose': -1
}
bst = gpb.train(params=params, train_set=data_train,
                gp_model=gp_model, num_boost_round=247)
print('estimated covariance parameters')
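# Presumably the next step, as in the other examples: print the estimated
# covariance parameters of the Gaussian process model
gp_model.summary()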
if likelihood == "bernoulli_probit":
    probs = stats.norm.cdf(f + eps)
    y = np.random.uniform(size=n) < probs
    y = y.astype(np.float64)
elif likelihood == "bernoulli_logit":
    probs = 1 / (1 + np.exp(-(f + eps)))
    y = np.random.uniform(size=n) < probs
    y = y.astype(np.float64)
elif likelihood == "poisson":
    mu = np.exp(f + eps)
    y = stats.poisson.ppf(np.random.uniform(size=n), mu=mu)
elif likelihood == "gamma":
    mu = np.exp(f + eps)
    y = mu * stats.gamma.ppf(np.random.uniform(size=n), a=1)
plt.hist(y, bins=50)  # visualize response variable

# --------------------Train model----------------
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
gp_model.fit(y=y, X=X)  # use params={"trace": True} for monitoring convergence
gp_model.summary()

# --------------------Make predictions----------------
group_test = np.arange(m)
X_test = np.column_stack((np.ones(m), np.zeros(m)))
# Predict latent variable
pred = gp_model.predict(X_pred=X_test, group_data_pred=group_test,
                        predict_var=True, predict_response=False)
print(pred['mu'][0:5])  # Predicted latent mean
print(pred['var'][0:5])  # Predicted latent variance
# Predict response variable
pred_resp = gp_model.predict(X_pred=X_test, group_data_pred=group_test,
                             predict_var=True, predict_response=True)
print(pred_resp['mu'][0:5])  # Predicted response variable (label)
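# Since predict_var=True, the predictive variance of the response is also
# returned under the key 'var'
print(pred_resp['var'][0:5])  # Predicted variance of the response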
np.random.seed(1)
# Simulate grouped random effects
group = np.arange(n)  # grouping variable
for i in range(m):
    group[int(i * n / m):int((i + 1) * n / m)] = i
b1 = np.random.normal(size=m)  # simulate random effects
eps = b1[group]
# Simulate fixed effects
X = np.random.rand(n, 2)
f = f1d(X[:, 0])
xi = np.sqrt(0.01) * np.random.normal(size=n)  # simulate error term
y = f + eps + xi  # observed data

# --------------------Training----------------
# Define GPModel
gp_model = gpb.GPModel(group_data=group)
# The default optimizer for covariance parameters (hyperparameters) is Fisher scoring.
# This can be changed as follows:
# gp_model.set_optim_params(params={"optimizer_cov": "gradient_descent", "lr_cov": 0.05,
#                                   "use_nesterov_acc": True, "acc_rate_cov": 0.5})
# Use the option "trace": True to monitor convergence of the hyperparameter
# estimation of the gp_model, e.g.:
# gp_model.set_optim_params(params={"trace": True})
# Create dataset for gpb.train
data_train = gpb.Dataset(X, y)
# Specify boosting parameters as dict
params = {
    'objective': 'regression_l2',
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_data_in_leaf': 5,
    'verbose': 0
}
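# Hedged continuation sketch (mirroring the other training examples in this
# collection; the number of boosting rounds here is an assumption):
bst = gpb.train(params=params, train_set=data_train,
                gp_model=gp_model, num_boost_round=32)
gp_model.summary()  # estimated covariance parameters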