Example #1
def retrieve_training_dataset(X, predicted_indicator):
    """
    Transforms the raw dataset into a dataset that is suitable
    for training the model.
    Parameters
    ----------
    X: Covariable-cleaned database

    Returns
    -------
    X_train
        Train data
    y_train
        Response train data
    data_train
        Train data readable for the package gpbooster, contains the information
        about X_train and y_train
    groups_train
        Group indices
        """
    logging.info('Retrieving training dataset')
    y = X[[predicted_indicator]]
    X_train, y_train, groups_train = prepare_training_dataset(X, y)
    data_train = gpb.Dataset(X_train, y_train)
    return X_train, y_train, data_train, groups_train
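
# Hedged usage sketch (not part of the original snippet): `df_clean` is a placeholder for the
# cleaned covariate DataFrame and "outcome" for the response column; logging, gpboost (gpb),
# and the helper prepare_training_dataset are assumed to be available in this module.
X_train, y_train, data_train, groups_train = retrieve_training_dataset(
    df_clean, predicted_indicator="outcome")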
Example #2
def test_gpboost():
    try:
        import gpboost
    except ImportError:
        print("Skipping test_gpboost!")
        return
    import shap
    import numpy as np

    # train gpboost model
    X, y = shap.datasets.boston()
    data_train = gpboost.Dataset(X, y, categorical_feature=[8])
    model = gpboost.train(params={'objective': 'regression_l2', 'learning_rate': 0.1, 'verbose': 0},
                          train_set=data_train, num_boost_round=10)

    # explain the model's predictions using SHAP values
    ex = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")
    shap_values = ex.shap_values(X)

    predicted = model.predict(X, raw_score=True)

    assert np.abs(shap_values.sum(1) + ex.expected_value - predicted).max() < 1e-4, \
        "SHAP values don't sum to model output!"
Example #3
    group[int(i * n / m):int((i + 1) * n / m)] = i
b1 = np.sqrt(0.5) * np.random.normal(size=m)  # simulate random effects
eps = b1[group]
eps = eps - np.mean(eps)
# simulate fixed effects
X = np.random.rand(n, 2)
f = f1d(X[:, 0])
# simulate response variable
probs = stats.norm.cdf(f + eps)
y = np.random.uniform(size=n) < probs
y = y.astype(np.float64)

# --------------------Parameter tuning using cross-validation: deterministic and random grid search----------------
# Create random effects model and Dataset
gp_model = gpb.GPModel(group_data=group, likelihood="bernoulli_probit")
data_train = gpb.Dataset(X, y)
# Other parameters not contained in the grid of tuning parameters
params = {'objective': 'binary', 'verbose': 0, 'num_leaves': 2**10}
# Small grid and deterministic grid search
param_grid_small = {
    'learning_rate': [0.1, 0.01],
    'min_data_in_leaf': [20, 100],
    'max_depth': [5, 10],
    'max_bin': [255, 1000]
}
opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid_small,
                                             params=params,
                                             num_try_random=None,
                                             nfold=4,
                                             gp_model=gp_model,
                                             use_gp_model_for_validation=True,
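# Hedged sketch (not part of the original snippet): the call above is truncated here, so it is
# restated in full below. The remaining arguments (train_set, num_boost_round,
# early_stopping_rounds, seed, metric) and their values are assumptions in the style of the
# GPBoost tuning examples, not taken from the original.
opt_params = gpb.grid_search_tune_parameters(param_grid=param_grid_small,
                                             params=params,
                                             num_try_random=None,
                                             nfold=4,
                                             gp_model=gp_model,
                                             use_gp_model_for_validation=True,
                                             train_set=data_train,
                                             num_boost_round=1000,
                                             early_stopping_rounds=10,
                                             seed=1,
                                             metric='binary_logloss')
print('Best parameters: ' + str(opt_params['best_params']))
print('Best number of iterations: ' + str(opt_params['best_iter']))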
Example #4
b = np.sqrt(sigma2_1) * np.random.normal(size=m_test)  # simulate random effects
Zb = b[group]
# Put everything together
xi = np.sqrt(sigma2) * np.random.normal(size=n)  # simulate error term
y = F + Zb + xi  # observed data
# split train and test data
y_train = y[0:ntrain]
y_test = y[ntrain:n]
X_train = X.iloc[0:ntrain]
X_test = X.iloc[ntrain:n]

# --------------------Learning and prediction----------------
# Define and train GPModel
gp_model = gpb.GPModel(group_data=group_train)
# create dataset for gpb.train function
data_train = gpb.Dataset(X_train, y_train)
# specify tree-boosting parameters as a dict
params = { 'objective': 'regression_l2', 'learning_rate': 0.1,
    'max_depth': 6, 'min_data_in_leaf': 5, 'verbose': 0 }
# train model
bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model, num_boost_round=32)
gp_model.summary() # estimated covariance parameters
# Covariance parameters in the following order:
# ['Error_term', 'Group_1']
# [0.9183072 1.013057 ]

# Make predictions
pred = bst.predict(data=X_test, group_data_pred=group_test)
y_pred = pred['fixed_effect'] + pred['random_effect_mean'] # sum predictions of fixed effect and random effect
np.sqrt(np.mean((y_test - y_pred) ** 2)) # root mean square error (RMSE) on test data. Approx. = 1.25
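
# Optional comparison sketch (not part of the original snippet): training the same boosting model
# without the random effects model gives a reference test RMSE for the grouped data.
bst_plain = gpb.train(params=params, train_set=data_train, num_boost_round=32)
y_pred_plain = bst_plain.predict(X_test)
np.sqrt(np.mean((y_test - y_pred_plain) ** 2))  # test RMSE of plain gradient boosting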
Example #5
# incidence matrix relating grouped random effects to samples
Z1 = np.zeros((n, m))
for i in range(m):
    Z1[np.where(group == i), i] = 1
sigma2_1 = 1 ** 2  # random effect variance
sigma2 = 0.1 ** 2  # error variance
b1 = np.sqrt(sigma2_1) * np.random.normal(size=m)  # simulate random effects
eps = Z1.dot(b1)
xi = np.sqrt(sigma2) * np.random.normal(size=n)  # simulate error term
y = F + eps + xi  # observed data

# define GPModel
gp_model = gpb.GPModel(group_data=group)
gp_model.set_optim_params(params={"optimizer_cov": "fisher_scoring"})
# create dataset for gpb.train
data_train = gpb.Dataset(X, y)
# specify your configurations as a dict
params = {
    'objective': 'regression_l2',
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_data_in_leaf': 5,
    'verbose': 0
}

print('Starting training...')
# train
bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
                num_boost_round=16)
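
# Hedged follow-up sketch (not part of the original snippet): in-sample predictions combine the
# tree ensemble ('fixed_effect') and the predicted random effects ('random_effect_mean'), as in
# the other examples in this collection.
pred = bst.predict(data=X, group_data_pred=group)
y_fitted = pred['fixed_effect'] + pred['random_effect_mean']
print('In-sample RMSE: ' + str(np.sqrt(np.mean((y - y_fitted) ** 2))))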
Example #6
    X = np.random.rand(n, 2)
    # mean function plus noise
    y = f1d(X[:, 0]) + np.random.normal(scale=0.1, size=n)
    return [X, y]


# Simulate data
n = 1000
data = sim_data(2 * n)
Xtrain = data[0][0:n, :]
ytrain = data[1][0:n]
Xtest = data[0][n:(2 * n), :]
ytest = data[1][n:(2 * n)]

# create dataset for gpb.train
data_train = gpb.Dataset(Xtrain, ytrain)
data_eval = gpb.Dataset(Xtest, ytest, reference=data_train)

# specify your configurations as a dict
params = {
    'objective': 'regression_l2',
    'metric': {'l2', 'l1'},
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_data_in_leaf': 5,
    'verbose': 0
}

print('Starting training...')
# train
evals_result = {}  # record eval results for plotting
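# Hedged sketch of the truncated training step (not part of the original snippet): training with
# the evaluation Dataset records the l1/l2 metrics in evals_result; num_boost_round and the
# valid_names chosen below are assumptions.
bst = gpb.train(params=params,
                train_set=data_train,
                num_boost_round=100,
                valid_sets=[data_train, data_eval],
                valid_names=['train', 'eval'],
                evals_result=evals_result)
# The recorded metrics can then be plotted, e.g. with gpb.plot_metric(evals_result, metric='l1'),
# assuming the LightGBM-style plotting helpers are available in this GPBoost version.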
Example #7
plt.show()


def sim_data(n):
    """Function that simulates data. Two covariates of which only one has an effect"""
    X = np.random.rand(n, 2)
    # mean function plus noise
    y = f1d(X[:, 0]) + np.random.normal(scale=0.1, size=n)
    return [X, y]


# Simulate data
n = 1000
data = sim_data(2 * n)
# create dataset for gpb.train
data_train = gpb.Dataset(data[0][0:n, :], data[1][0:n])

# specify your configurations as a dict
params = {
    'objective': 'regression_l2',
    'metric': {'l2', 'l1'},
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_data_in_leaf': 5,
    'verbose': 0
}

print('Starting cross-validation...')
# do cross-validation
cvbst = gpb.cv(params=params,
               train_set=data_train,
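# Hedged sketch (not part of the original snippet): the cv call above is truncated here, so it is
# restated in full below; num_boost_round, early_stopping_rounds, nfold, and seed are assumptions.
cvbst = gpb.cv(params=params,
               train_set=data_train,
               num_boost_round=100,
               early_stopping_rounds=5,
               nfold=4,
               seed=1)
# gpb.cv returns a dict of per-iteration cross-validation metrics, e.g. cvbst['l2-mean'];
# np.argmin(cvbst['l2-mean']) gives a CV-based choice for the number of boosting rounds.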
Example #8
import gpboost as gpb
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.datasets import grunfeld
plt.style.use('ggplot')

# Load data
data = grunfeld.load_pandas().data
# Visualize response variable
plt.hist(data['invest'], bins=50)
plt.title("Histogram of response variable")
"""
Boosting with two crossed firm and year grouped random effects
"""
# Define random effects model (assuming firm and year random effects)
gp_model = gpb.GPModel(group_data=data[['firm', 'year']])
# Create dataset for gpb.train
data_train = gpb.Dataset(data=data[['value', 'capital']], label=data['invest'])
# Specify boosting parameters as dict
# Note: no attempt has been made to optimally choose the tuning parameters
params = {
    'objective': 'regression_l2',
    'learning_rate': 1,
    'max_depth': 6,
    'min_data_in_leaf': 1,
    'verbose': 0
}
# Train GPBoost model
bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
                num_boost_round=1800)
# Estimated random effects model (variances of random effects)
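# Hedged continuation (not part of the original snippet): the comment above is typically followed
# by printing the estimated variance parameters, as in the other examples.
gp_model.summary()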
Example #9
elif likelihood == "bernoulli_logit":
    probs = 1 / (1 + np.exp(-(f + eps)))
    y = np.random.uniform(size=n) < probs
    y = y.astype(np.float64)
elif likelihood == "poisson":
    mu = np.exp(f + eps)
    y = stats.poisson.ppf(np.random.uniform(size=n), mu=mu)
elif likelihood == "gamma":
    mu = np.exp(f + eps)
    y = mu * stats.gamma.ppf(np.random.uniform(size=n), a=1)
fig1, ax1 = plt.subplots()
ax1.hist(y, bins=50)  # visualize response variable

#--------------------Training----------------
# create dataset for gpb.train
data_train = gpb.Dataset(X, y)
# Train model
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
# Use the option "trace": True to monitor convergence of the hyperparameter estimation of the gp_model, e.g.:
# gp_model.set_optim_params(params={"trace": True})
bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
                num_boost_round=num_boost_round)
gp_model.summary()  # Trained random effects model (true variance = 0.5)

# Showing training loss
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
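# Hedged sketch (not part of the original snippet): the train call above is truncated here, so it
# is restated in full below. Passing the training Dataset via valid_sets so that the training
# loss is reported during boosting is an assumption in the style of the other examples.
bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
                num_boost_round=num_boost_round,
                valid_sets=[data_train])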
Example #10
# %% test data (generated in the same way as the training data)
x_test = np.random.rand(n_test * n_test, 2)
F_x_test = f1d(x_test[:, 0])
xi_test = np.sqrt(sigma2) * np.random.normal(size=n_test * n_test)
y_test = F_x_test + b_test + xi_test

# %% check the simulated data
plt.scatter(x_train[:, 1], y)
plt.scatter(x_train[:, 0], y)

# %%
plt.scatter(x_test[:, 1], y_test)
plt.scatter(x_test[:, 0], y_test)
# %% training
gp_model = gpb.GPModel(gp_coords=coords_train, cov_function="exponential")
data_train = gpb.Dataset(x_train, y)
params = {
    'objective': 'rmse',
    'learning_rate': 0.01,
    'max_depth': 3,
    'min_data_in_leaf': 10,
    'num_leaves': 2**10,
    'verbose': -1
}

bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
                num_boost_round=247)
print('estimated covariance parameters')
gp_model.summary()
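
# Hedged prediction sketch (not part of the original snippet): for a Gaussian process model the
# prediction locations are passed via gp_coords_pred; `coords_test` is an assumed variable holding
# the coordinates of the test points, defined analogously to coords_train.
pred = bst.predict(data=x_test, gp_coords_pred=coords_test, predict_var=True)
y_pred = pred['fixed_effect'] + pred['random_effect_mean']
print('Test RMSE: ' + str(np.sqrt(np.mean((y_test - y_pred) ** 2))))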
Example #11
    y = y.astype(np.float64)
elif likelihood == "bernoulli_logit":
    probs = 1 / (1 + np.exp(-(f + eps)))
    y = np.random.uniform(size=n) < probs
    y = y.astype(np.float64)
elif likelihood == "poisson":
    mu = np.exp(f + eps)
    y = stats.poisson.ppf(np.random.uniform(size=n), mu=mu)
elif likelihood == "gamma":
    mu = np.exp(f + eps)
    y = mu * stats.gamma.ppf(np.random.uniform(size=n), a=1)
fig1, ax1 = plt.subplots()
ax1.hist(y, bins=50)  # visualize response variable

# create dataset for gpb.train
data_train = gpb.Dataset(X, y)
# Train model
gp_model = gpb.GPModel(group_data=group, likelihood=likelihood)
# Use the option "trace": True to monitor convergence of the hyperparameter estimation of the gp_model, e.g.:
# gp_model.set_optim_params(params={"trace": True})
bst = gpb.train(params=params,
                train_set=data_train,
                gp_model=gp_model,
                num_boost_round=num_boost_round)
gp_model.summary()  # Trained random effects model

# Make predictions
nplot = 200  # number of predictions
X_test_plot = np.column_stack((np.linspace(0, 1, nplot), np.zeros(nplot)))
group_data_pred = -np.ones(nplot)
# Predict response variable
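# Hedged sketch of the truncated prediction step (not part of the original snippet): the group
# label -1 requests predictions for a group not observed during training. The exact structure of
# the returned dict depends on the likelihood and GPBoost version, so it is not relied on here.
pred = bst.predict(data=X_test_plot, group_data_pred=group_data_pred)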
Example #12
lp = lp * 5 + 0.2
lp_test = lp_test * 5 + 0.2
y = np.random.normal(loc=lp, scale=1)
y_test = np.random.normal(loc=lp_test, scale=1)
# apply censoring
yu = 8
yl = 5
y[y >= yu] = yu
y[y <= yl] = yl
# censoring fractions
print(np.sum(y == yu) / n)
print(np.sum(y == yl) / n)

# train model and make predictions
params = {'objective': 'tobit', 'verbose': 0, 'yl': yl, 'yu': yu}
dtrain = gpb.Dataset(X, y)
bst = gpb.train(params=params, train_set=dtrain, num_boost_round=100)
y_pred = bst.predict(X_test)
# mean square error (approx. 1.1 for n=10'000)
print("Test error of Grabit: " + str(((y_pred - y_test)**2).mean()))
# compare to standard least squares gradient boosting (approx. 1.8 for n=10'000)
params = {'objective': 'regression_l2', 'verbose': 0}
bst = gpb.train(params=params, train_set=dtrain, num_boost_round=100)
y_pred_ls = bst.predict(X_test)
print("Test error of standard least squares gradient boosting: " +
      str(((y_pred_ls - y_test)**2).mean()))

# measure time
import time
params = {'objective': 'tobit', 'verbose': 0, 'yl': yl, 'yu': yu}
dtrain = gpb.Dataset(X, y)
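# Hedged timing sketch (not part of the original snippet): a simple wall-clock measurement of the
# Grabit (tobit objective) training run set up above.
start_time = time.time()
bst = gpb.train(params=params, train_set=dtrain, num_boost_round=100)
print('Training time: ' + str(time.time() - start_time) + ' seconds')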