def train_gradient_boosted_trees(
        training_predictor_table, training_target_table,
        validation_predictor_table, validation_target_table):
    """Trains gradient-boosted trees and evaluates them.

    The model is fit on the training data, then evaluated on both the training
    and validation data with `utils.eval_binary_classifn`.  Nothing is
    returned.

    :param training_predictor_table: See doc for `utils.read_feature_file`.
    :param training_target_table: Same.
    :param validation_predictor_table: Same.
    :param validation_target_table: Same.
    """

    # Iterating over the table is assumed to yield one name per predictor (as
    # for a pandas DataFrame) -- TODO confirm against `utils.read_feature_file`.
    num_predictors = len(list(training_predictor_table))

    # Every predictor is eligible at each split; the sqrt(num_predictors)
    # heuristic is left disabled.
    # NOTE(review): the model is created with `setup_classification_forest`
    # but trained with `train_classification_gbt` -- confirm that a
    # GBT-specific setup function was not intended here.
    gbt_model_object = utils.setup_classification_forest(
        max_predictors_per_split=num_predictors, num_trees=100,
        min_examples_at_split=500, min_examples_at_leaf=200)

    _ = utils.train_classification_gbt(
        model_object=gbt_model_object,
        training_predictor_table=training_predictor_table,
        training_target_table=training_target_table)

    training_labels = training_target_table[
        utils.BINARIZED_TARGET_NAME].values
    validation_labels = validation_target_table[
        utils.BINARIZED_TARGET_NAME].values

    # The event frequency in the training data is passed to both evaluations.
    training_event_frequency = numpy.mean(training_labels)

    # `.values` is used instead of `DataFrame.as_matrix()`, which was removed
    # in pandas 1.0.  Column 1 of `predict_proba` is taken as the forecast
    # probability.
    training_probabilities = gbt_model_object.predict_proba(
        training_predictor_table.values)[:, 1]

    utils.eval_binary_classifn(
        observed_labels=training_labels,
        forecast_probabilities=training_probabilities,
        training_event_frequency=training_event_frequency,
        dataset_name='training')

    validation_probabilities = gbt_model_object.predict_proba(
        validation_predictor_table.values)[:, 1]

    utils.eval_binary_classifn(
        observed_labels=validation_labels,
        forecast_probabilities=validation_probabilities,
        training_event_frequency=training_event_frequency,
        dataset_name='validation')
def tree_experiment_testing(
        min_per_split_values, min_per_leaf_values, validation_bss_matrix,
        training_predictor_table, training_target_table,
        testing_predictor_table, testing_target_table):
    """Selects and tests tree for experiment with min examples per split/leaf.

    The hyperparameter pair with the highest non-NaN validation Brier skill
    score (BSS) is selected, a tree with those settings is trained on the
    training data, and the tree is evaluated on the testing data.  Nothing is
    returned.

    :param min_per_split_values: See doc for `tree_experiment_validation`.
    :param min_per_leaf_values: Same.
    :param validation_bss_matrix: Same.
    :param training_predictor_table: See doc for `utils.read_feature_file`.
    :param training_target_table: Same.
    :param testing_predictor_table: Same.
    :param testing_target_table: Same.
    """

    # NaN entries are ignored when locating the best score.  Rows are indexed
    # by min-per-split value and columns by min-per-leaf value.
    best_linear_index = numpy.nanargmax(numpy.ravel(validation_bss_matrix))
    best_split_index, best_leaf_index = numpy.unravel_index(
        best_linear_index, validation_bss_matrix.shape)

    best_min_examples_per_split = min_per_split_values[best_split_index]
    best_min_examples_per_leaf = min_per_leaf_values[best_leaf_index]

    # Read the score at the selected position rather than scanning the matrix
    # a second time; the value is the same as the NaN-ignoring maximum.
    best_validation_bss = validation_bss_matrix[
        best_split_index, best_leaf_index]

    message_string = (
        'Best validation BSS = {0:.3f} ... corresponding min examples per split'
        ' node = {1:d} ... min examples per leaf node = {2:d}'
    ).format(
        best_validation_bss, best_min_examples_per_split,
        best_min_examples_per_leaf)

    print(message_string)

    final_model_object = utils.setup_classification_tree(
        min_examples_at_split=best_min_examples_per_split,
        min_examples_at_leaf=best_min_examples_per_leaf)

    _ = utils.train_classification_tree(
        model_object=final_model_object,
        training_predictor_table=training_predictor_table,
        training_target_table=training_target_table)

    # `.values` is used instead of `DataFrame.as_matrix()`, which was removed
    # in pandas 1.0.  Column 1 of `predict_proba` is taken as the forecast
    # probability.
    testing_predictions = final_model_object.predict_proba(
        testing_predictor_table.values)[:, 1]

    # The event frequency comes from the training data, not the testing data.
    training_event_frequency = numpy.mean(
        training_target_table[utils.BINARIZED_TARGET_NAME].values)

    _ = utils.eval_binary_classifn(
        observed_labels=testing_target_table[
            utils.BINARIZED_TARGET_NAME].values,
        forecast_probabilities=testing_predictions,
        training_event_frequency=training_event_frequency,
        create_plots=True, verbose=True, dataset_name='testing')
def train_tree_default(
        training_predictor_table, training_target_table,
        validation_predictor_table, validation_target_table):
    """Trains decision tree with default params and evaluates it.

    The tree is fit on the training data, then evaluated on both the training
    and validation data with `utils.eval_binary_classifn`.  Nothing is
    returned.

    :param training_predictor_table: See doc for `utils.read_feature_file`.
    :param training_target_table: Same.
    :param validation_predictor_table: Same.
    :param validation_target_table: Same.
    """

    default_tree_model_object = utils.setup_classification_tree(
        min_examples_at_split=30, min_examples_at_leaf=30)

    _ = utils.train_classification_tree(
        model_object=default_tree_model_object,
        training_predictor_table=training_predictor_table,
        training_target_table=training_target_table)

    training_labels = training_target_table[
        utils.BINARIZED_TARGET_NAME].values
    validation_labels = validation_target_table[
        utils.BINARIZED_TARGET_NAME].values

    # The event frequency in the training data is passed to both evaluations.
    training_event_frequency = numpy.mean(training_labels)

    # `.values` is used instead of `DataFrame.as_matrix()`, which was removed
    # in pandas 1.0.  Column 1 of `predict_proba` is taken as the forecast
    # probability.
    training_probabilities = default_tree_model_object.predict_proba(
        training_predictor_table.values)[:, 1]

    utils.eval_binary_classifn(
        observed_labels=training_labels,
        forecast_probabilities=training_probabilities,
        training_event_frequency=training_event_frequency,
        dataset_name='training')

    validation_probabilities = default_tree_model_object.predict_proba(
        validation_predictor_table.values)[:, 1]

    utils.eval_binary_classifn(
        observed_labels=validation_labels,
        forecast_probabilities=validation_probabilities,
        training_event_frequency=training_event_frequency,
        dataset_name='validation')
def train_logistic_model(
        training_predictor_table, training_target_table,
        validation_predictor_table, validation_target_table):
    """Trains plain logistic-regression model and evaluates it.

    "Plain" means that both `lambda1` and `lambda2` are set to zero when the
    model is created.  The model is fit on the training data, then evaluated
    on both the training and validation data with
    `utils.eval_binary_classifn`.  Nothing is returned.

    :param training_predictor_table: See doc for `utils.read_feature_file`.
    :param training_target_table: Same.
    :param validation_predictor_table: Same.
    :param validation_target_table: Same.
    """

    plain_log_model_object = utils.setup_logistic_regression(
        lambda1=0., lambda2=0.)

    _ = utils.train_logistic_regression(
        model_object=plain_log_model_object,
        training_predictor_table=training_predictor_table,
        training_target_table=training_target_table)

    training_labels = training_target_table[
        utils.BINARIZED_TARGET_NAME].values
    validation_labels = validation_target_table[
        utils.BINARIZED_TARGET_NAME].values

    # The event frequency in the training data is passed to both evaluations.
    training_event_frequency = numpy.mean(training_labels)

    # `.values` is used instead of `DataFrame.as_matrix()`, which was removed
    # in pandas 1.0.  Column 1 of `predict_proba` is taken as the forecast
    # probability.
    training_probabilities = plain_log_model_object.predict_proba(
        training_predictor_table.values)[:, 1]

    utils.eval_binary_classifn(
        observed_labels=training_labels,
        forecast_probabilities=training_probabilities,
        training_event_frequency=training_event_frequency,
        dataset_name='training')

    validation_probabilities = plain_log_model_object.predict_proba(
        validation_predictor_table.values)[:, 1]

    utils.eval_binary_classifn(
        observed_labels=validation_labels,
        forecast_probabilities=validation_probabilities,
        training_event_frequency=training_event_frequency,
        dataset_name='validation')
def tree_experiment_training(training_predictor_table, training_target_table, validation_predictor_table, validation_target_table): """Trains decision trees for experiment with min examples per split/leaf. :param training_predictor_table: See doc for `utils.read_feature_file`. :param training_target_table: Same. :param validation_predictor_table: Same. :param validation_target_table: Same. """ min_per_split_values = numpy.array( [2, 5, 10, 20, 30, 40, 50, 100, 200, 500], dtype=int) min_per_leaf_values = numpy.array( [1, 5, 10, 20, 30, 40, 50, 100, 200, 500], dtype=int) num_min_per_split_values = len(min_per_split_values) num_min_per_leaf_values = len(min_per_leaf_values) validation_auc_matrix = numpy.full( (num_min_per_split_values, num_min_per_leaf_values), numpy.nan) validation_max_csi_matrix = validation_auc_matrix + 0. validation_bs_matrix = validation_auc_matrix + 0. validation_bss_matrix = validation_auc_matrix + 0. training_event_frequency = numpy.mean( training_target_table[utils.BINARIZED_TARGET_NAME].values) for i in range(num_min_per_split_values): for j in range(num_min_per_leaf_values): if min_per_leaf_values[j] >= min_per_split_values[i]: continue this_message_string = ( 'Training model with minima of {0:d} examples per split node, ' '{1:d} per leaf node...').format(min_per_split_values[i], min_per_leaf_values[j]) print(this_message_string) this_model_object = utils.setup_classification_tree( min_examples_at_split=min_per_split_values[i], min_examples_at_leaf=min_per_leaf_values[j]) _ = utils.train_classification_tree( model_object=this_model_object, training_predictor_table=training_predictor_table, training_target_table=training_target_table) these_validation_predictions = this_model_object.predict_proba( validation_predictor_table.as_matrix())[:, 1] this_evaluation_dict = utils.eval_binary_classifn( observed_labels=validation_target_table[ utils.BINARIZED_TARGET_NAME].values, forecast_probabilities=these_validation_predictions, 
training_event_frequency=training_event_frequency, create_plots=False, verbose=False) validation_auc_matrix[i, j] = this_evaluation_dict[utils.AUC_KEY] validation_max_csi_matrix[i, j] = this_evaluation_dict[ utils.MAX_CSI_KEY] validation_bs_matrix[i, j] = this_evaluation_dict[ utils.BRIER_SCORE_KEY] validation_bss_matrix[i, j] = this_evaluation_dict[ utils.BRIER_SKILL_SCORE_KEY]