import logging

import gpboost as gpb
import numpy as np


def get_booster_model(data_train, groups_train):
    """Get the model and define its parameters.

    For finding the optimal number of boosting iterations,
    cross-validation is applied.

    Parameters
    ----------
    data_train : gpb.Dataset
        Train data readable for the package gpboost; should contain the
        information about X_train and y_train
    groups_train : array-like
        Group indices

    Returns
    -------
    gp_model
        Instance of the gradient tree boosting model with random effects
    params
        Parameters with which the model should be trained
    opt_num_boost_rounds
        Optimal number of boosting rounds for the training, found with
        cross-validation
    """
    logging.info('Getting booster model')
    gp_model = gpb.GPModel(group_data=groups_train)
    gp_model.set_optim_params(params={"optimizer_cov": "fisher_scoring"})
    params = {
        'objective': 'regression_l2',
        'learning_rate': 0.05,
        'max_depth': 6,
        'min_data_in_leaf': 5,
        'verbose': 0
    }
    logging.info('Calculating optimal number of boost rounds '
                 'via cross-validation')
    cvbst = gpb.cv(params=params, train_set=data_train, gp_model=gp_model,
                   use_gp_model_for_validation=True, num_boost_round=300,
                   early_stopping_rounds=5, nfold=3, verbose_eval=False,
                   show_stdv=False, seed=1)
    opt_num_boost_rounds = np.argmin(cvbst['l2-mean'])
    return gp_model, params, opt_num_boost_rounds
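# A minimal usage sketch of get_booster_model (an illustration, not part of
# the original module). 'X_train', 'y_train', and 'groups_train' are
# hypothetical inputs here; gpb.Dataset is gpboost's standard container
# for training data.
data_train = gpb.Dataset(X_train, label=y_train)
gp_model, params, opt_num_boost_rounds = get_booster_model(data_train,
                                                           groups_train)
bst = gpb.train(params=params, train_set=data_train, gp_model=gp_model,
                num_boost_round=opt_num_boost_rounds)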
# Train model
bst = gpb.train(params=params, train_set=data_train,
                gp_model=gp_model, num_boost_round=32)
gp_model.summary()  # Estimated covariance parameters
# Covariance parameters in the following order:
# ['Error_term', 'Group_1']
# [0.9183072 1.013057 ]

# Make predictions
pred = bst.predict(data=X_test, group_data_pred=group_test)
# Sum the predictions of the fixed effect and the random effect
y_pred = pred['fixed_effect'] + pred['random_effect_mean']
# Root mean squared error (RMSE) on the test data, approx. 1.25
np.sqrt(np.mean((y_test - y_pred) ** 2))

# Parameter tuning using cross-validation (only the number of boosting iterations)
gp_model = gpb.GPModel(group_data=group_train)
cvbst = gpb.cv(params=params, train_set=data_train,
               gp_model=gp_model, use_gp_model_for_validation=False,
               num_boost_round=100, early_stopping_rounds=5,
               nfold=4, verbose_eval=True, show_stdv=False, seed=1)
best_iter = np.argmin(cvbst['l2-mean'])
print("Best number of iterations: " + str(best_iter))
# Best number of iterations: 32

# -------------------- Model interpretation --------------------
# Plot feature importances
gpb.plot_importance(bst)

# Partial dependence plots
from pdpbox import pdp

# Single-variable plot (takes a few seconds to compute)
pdp_dist = pdp.pdp_isolate(model=bst, dataset=X_train,
                           model_features=X_train.columns,
                           feature='variable_2', num_grid_points=50)
pdp.pdp_plot(pdp_dist, 'variable_2', plot_lines=True)
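# The snippet above references names that are never defined (X_train, y_train,
# group_train, X_test, y_test, group_test, data_train). Below is a hedged
# sketch of how such grouped data could be simulated for a self-contained run;
# the sample sizes, group structure, and coefficients are assumptions chosen
# for illustration only and will not reproduce the exact numbers quoted above.
import pandas as pd

rng = np.random.default_rng(1)
n, m = 1000, 25                              # number of samples and groups
group = rng.integers(0, m, size=n)           # group index for each sample
b = rng.normal(size=m)                       # simulated random group effects
X = pd.DataFrame(rng.uniform(size=(n, 2)),
                 columns=['variable_1', 'variable_2'])
# Response: fixed effect of variable_2 + random group effect + noise
y = 2 * X['variable_2'] + b[group] + rng.normal(size=n)

split = int(0.8 * n)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]
group_train, group_test = group[:split], group[split:]
data_train = gpb.Dataset(X_train, label=y_train)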