import numpy as np
import pytest

# `confusion_matrix` is the function under test; its import path is not shown in this excerpt.


def test_confusion_matrix_with_multioutput_samples():
    # Given
    input_solution = [["cat", "ant", "cat"]]
    input_prediction = [["ant", "ant", "cat"]]
    # When-Then
    with pytest.raises(ValueError):
        observed_output = confusion_matrix(input_solution, input_prediction)

def test_confusion_matrix_with_invalid_weights():
    # Given
    input_solution = ["cat", "ant", "cat", "cat", "ant", "bird"]
    input_prediction = ["ant", "ant", "cat", "cat", "ant", "cat"]
    weights = [[1, 2], 0.1, [0.1], 3, 1]
    # When-Then
    with pytest.raises(ValueError):
        observed_output = confusion_matrix(input_solution, input_prediction, weights=weights)

def test_confusion_matrix_with_multiDimensional_labels():
    # Given
    input_solution = ["cat", "ant", "cat", "cat", "ant", "bird"]
    input_prediction = ["ant", "ant", "cat", "cat", "ant", "cat"]
    labels = [["ant", "bird"], "cat"]
    # When-Then
    with pytest.raises(ValueError):
        observed_output = confusion_matrix(input_solution, input_prediction, labels=labels)

def test_confusion_matrix_with_valid_inputs_without_labels_and_weights():
    # Given
    input_solution = [2, 0, 2, 2, 0, 1]
    input_prediction = [0, 0, 2, 2, 0, 2]
    expected_output = np.array([[2, 0, 0], [0, 0, 1], [1, 0, 2]])
    # When
    observed_output = confusion_matrix(input_solution, input_prediction)
    # Then
    assert np.array_equal(expected_output, observed_output)

def test_confusion_matrix_with_valid_inputs_with_lesser_number_of_labels_and_without_weights():
    # Given
    input_solution = ["cat", "ant", "cat", "cat", "ant", "bird"]
    input_prediction = ["ant", "ant", "cat", "cat", "ant", "cat"]
    labels = ["bird", "cat"]
    expected_output = np.array([[0, 1], [0, 2]])
    # When
    observed_output = confusion_matrix(input_solution, input_prediction, labels=labels)
    # Then
    assert np.array_equal(expected_output, observed_output)

def test_confusion_matrix_with_empty_inputs():
    # Given
    input_solution = []
    input_prediction = []
    labels = ["bird", "cat"]
    expected_output = np.array([[0, 0], [0, 0]])
    # When
    observed_output = confusion_matrix(input_solution, input_prediction, labels=labels)
    # Then
    assert np.array_equal(expected_output, observed_output)

def test_confusion_matrix_with_valid_inputs_with_labels_and_with_weights():
    # Given
    input_solution = ["cat", "ant", "cat", "cat", "ant", "bird"]
    input_prediction = ["ant", "ant", "cat", "cat", "ant", "cat"]
    labels = ["ant", "bird", "cat"]
    weights = [0.1, 0.3, 1.0, 0.8, 0.2, 2.0]
    expected_output = np.array([[0.5, 0.0, 0.0], [0.0, 0.0, 2.0], [0.1, 0.0, 1.8]])
    # When
    observed_output = confusion_matrix(input_solution, input_prediction, labels=labels, weights=weights)
    # Then
    assert np.array_equal(expected_output, observed_output)

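# For reference, the expected matrix in the weighted test above can be reproduced by
# accumulating each sample's weight into the cell indexed by its (true, predicted) label
# pair. The sketch below is a minimal illustration of that accumulation, not the
# implementation under test; the helper name `weighted_confusion_matrix` is invented here.
def weighted_confusion_matrix(solution, prediction, labels, weights):
    """Accumulate each sample's weight into cell [true_label, predicted_label]."""
    index = {label: i for i, label in enumerate(labels)}
    matrix = np.zeros((len(labels), len(labels)))
    for true, pred, weight in zip(solution, prediction, weights):
        matrix[index[true], index[pred]] += weight
    return matrix


# Reproduces expected_output from the weighted test above
# (rows = true labels ["ant", "bird", "cat"], columns = predicted labels):
# weighted_confusion_matrix(
#     ["cat", "ant", "cat", "cat", "ant", "bird"],
#     ["ant", "ant", "cat", "cat", "ant", "cat"],
#     ["ant", "bird", "cat"],
#     [0.1, 0.3, 1.0, 0.8, 0.2, 2.0],
# )
# -> [[0.5, 0. , 0. ],
#     [0. , 0. , 2. ],
#     [0.1, 0. , 1.8]]
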
def evaluate_predictions(self, y_true, y_pred, sample_weight=None, silent=False, auxiliary_metrics=True, detailed_report=False):
    """ Evaluate predictions against ground truth labels.
        Args:
            sample_weight (array-like, optional): Per-sample weights applied when scoring each metric.
            silent (bool): Should we print which metric is being used as well as performance.
            auxiliary_metrics (bool): Should we compute other (problem_type specific) metrics in addition to the default metric?
            detailed_report (bool): Should we compute more-detailed versions of the auxiliary_metrics? (requires auxiliary_metrics=True).

        Returns dict where keys = metric names, values = performance along each metric.
        When auxiliary_metrics=False, only the eval_metric is reported.
    """
    is_proba = False
    assert isinstance(y_true, (np.ndarray, pd.Series))
    assert isinstance(y_pred, (np.ndarray, pd.Series, pd.DataFrame))
    self._validate_class_labels(y_true)
    if isinstance(y_pred, np.ndarray):
        if self.problem_type == QUANTILE:
            y_pred = pd.DataFrame(data=y_pred, columns=self.quantile_levels)
        elif len(y_pred.shape) > 1:
            y_pred = pd.DataFrame(data=y_pred, columns=self.class_labels)

    if isinstance(y_pred, pd.DataFrame):
        is_proba = True
    elif not self.eval_metric.needs_pred:
        raise AssertionError(f'`evaluate_predictions` requires y_pred_proba input '
                             f'when evaluating "{self.eval_metric.name}"... Please generate valid input via `predictor.predict_proba(data)`.\n'
                             f'This may have occurred if you passed in predict input instead of predict_proba input, '
                             f'or if you specified `as_multiclass=False` to `predictor.predict_proba(data, as_multiclass=False)`, '
                             f'which is not supported by `evaluate_predictions`.')

    if is_proba:
        y_pred_proba = y_pred
        y_pred = get_pred_from_proba_df(y_pred_proba, problem_type=self.problem_type)
        if self.problem_type == BINARY:
            # roc_auc crashes if this isn't done
            y_pred_proba = y_pred_proba[self.positive_class]
    else:
        y_pred_proba = None
        y_pred = pd.Series(y_pred)

    if y_pred_proba is not None:
        y_pred_proba_internal = self.label_cleaner.transform_proba(y_pred_proba, as_pandas=True)
    else:
        y_pred_proba_internal = None
    y_true_internal = self.label_cleaner.transform(y_true)  # Get labels in numeric order
    y_true_internal = y_true_internal.fillna(-1)
    y_pred_internal = self.label_cleaner.transform(y_pred)  # Get labels in numeric order

    # Compute auxiliary metrics:
    auxiliary_metrics_lst = [self.eval_metric]
    performance_dict = {}

    if auxiliary_metrics:
        if self.problem_type == REGRESSION:  # Adding regression metrics
            auxiliary_metrics_lst += [
                'root_mean_squared_error',
                'mean_squared_error',
                'mean_absolute_error',
                'r2',
                'pearsonr',
                'median_absolute_error',
            ]
        if self.problem_type in [BINARY, MULTICLASS]:  # Adding classification metrics
            auxiliary_metrics_lst += [
                'accuracy',
                'balanced_accuracy',
                # 'log_loss',  # Don't include as it probably adds more confusion to novice users (can be infinite)
                'mcc',
            ]
        if self.problem_type == BINARY:  # binary-specific metrics
            auxiliary_metrics_lst += [
                'roc_auc',
                'f1',
                'precision',
                'recall',
            ]

    scoring_args = dict(
        y=y_true,
        y_internal=y_true_internal,
        weight_evaluation=False,
    )

    if sample_weight is not None:
        scoring_args['sample_weight'] = sample_weight
        scoring_args['weight_evaluation'] = True

    for aux_metric in auxiliary_metrics_lst:
        if isinstance(aux_metric, str):
            aux_metric = get_metric(metric=aux_metric, problem_type=self.problem_type, metric_type='aux_metric')
        if not aux_metric.needs_pred and y_pred_proba_internal is None:
            logger.log(15, f'Skipping {aux_metric.name} because no prediction probabilities are available to score.')
            continue
        if aux_metric.name not in performance_dict:
            if y_pred_proba_internal is not None:
                score = self._score_with_pred_proba(
                    y_pred_proba_internal=y_pred_proba_internal,
                    metric=aux_metric,
                    **scoring_args
                )
            else:
                score = self._score_with_pred(
                    y_pred_internal=y_pred_internal,
                    metric=aux_metric,
                    **scoring_args
                )
            performance_dict[aux_metric.name] = score

    if self.eval_metric.name in performance_dict:
        score_eval = performance_dict[self.eval_metric.name]
        score_eval_flipped = self.eval_metric.convert_score_to_sklearn_val(score_eval)  # flip negative once again back to positive (so higher is no longer necessarily better)
        if score_eval_flipped != score_eval:
            flipped = True
        else:
            flipped = False
        if not silent:
            logger.log(20, f"Evaluation: {self.eval_metric.name} on test data: {score_eval}")
            if flipped:
                logger.log(20, "\tNote: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.")

    if not silent:
        logger.log(20, "Evaluations on test data:")
        logger.log(20, json.dumps(performance_dict, indent=4))

    if detailed_report and (self.problem_type != REGRESSION):
        # Construct confusion matrix
        try:
            performance_dict['confusion_matrix'] = confusion_matrix(y_true, y_pred,
                                                                     labels=self.label_cleaner.ordered_class_labels,
                                                                     output_format='pandas_dataframe')
        except ValueError:
            pass
        # One final set of metrics to report
        cl_metric = lambda y_true, y_pred: classification_report(y_true, y_pred, output_dict=True)
        metric_name = 'classification_report'
        if metric_name not in performance_dict:
            try:  # only compute auxiliary metrics which do not error (y_pred = class-probabilities may cause some metrics to error)
                performance_dict[metric_name] = cl_metric(y_true, y_pred)
            except ValueError:
                pass
            if not silent and metric_name in performance_dict:
                logger.log(20, "Detailed (per-class) classification report:")
                logger.log(20, json.dumps(performance_dict[metric_name], indent=4))
    return performance_dict

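# Hypothetical usage sketch for the method above. `predictor`, `test_data`, and the
# 'label' column are assumptions made for illustration; the grounded details are that
# probability input should come from `predictor.predict_proba(data)` (per the error
# message in the method) and that `auxiliary_metrics` / `detailed_report` control the
# breadth of the returned dict.
y_true = test_data['label']
y_pred_proba = predictor.predict_proba(test_data)  # DataFrame of class probabilities

perf = predictor.evaluate_predictions(
    y_true=y_true,
    y_pred=y_pred_proba,      # probability input also serves metrics that need probabilities (e.g. roc_auc)
    auxiliary_metrics=True,   # report problem_type-specific metrics in addition to eval_metric
    detailed_report=True,     # adds confusion_matrix and classification_report for classification problems
)
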
def evaluate_predictions(self, y_true, y_pred, silent=False, auxiliary_metrics=False, detailed_report=True, high_always_good=False):
    """ Evaluate predictions. Does not support sample weights since this method reports a variety of metrics.
        Args:
            silent (bool): Should we print which metric is being used as well as performance.
            auxiliary_metrics (bool): Should we compute other (problem_type specific) metrics in addition to the default metric?
            detailed_report (bool): Should we compute more-detailed versions of the auxiliary_metrics? (requires auxiliary_metrics=True).
            high_always_good (bool): If True, this means higher values of returned metric are ALWAYS superior (so metrics like MSE should be returned negated)

        Returns single performance value if auxiliary_metrics=False.
        Otherwise returns dict where keys = metrics, values = performance along each metric.
    """
    is_proba = False
    assert isinstance(y_true, (np.ndarray, pd.Series))
    assert isinstance(y_pred, (np.ndarray, pd.Series, pd.DataFrame))
    self._validate_class_labels(y_true)
    if isinstance(y_pred, np.ndarray):
        if self.problem_type == QUANTILE:
            y_pred = pd.DataFrame(data=y_pred, columns=self.quantile_levels)
        elif len(y_pred.shape) > 1:
            y_pred = pd.DataFrame(data=y_pred, columns=self.class_labels)

    if self.problem_type == BINARY:
        if isinstance(y_pred, pd.DataFrame):
            # roc_auc crashes if this isn't done
            y_pred = y_pred[self.positive_class]
            is_proba = True
        elif not self.eval_metric.needs_pred:
            raise AssertionError(f'`evaluate_predictions` requires y_pred_proba input for binary classification '
                                 f'when evaluating "{self.eval_metric.name}"... Please generate valid input via `predictor.predict_proba(data)`.\n'
                                 f'This may have occurred if you passed in predict input instead of predict_proba input, '
                                 f'or if you specified `as_multiclass=False` to `predictor.predict_proba(data, as_multiclass=False)`, '
                                 f'which is not supported by `evaluate_predictions`.')
    elif self.problem_type == MULTICLASS:
        if isinstance(y_pred, pd.DataFrame):
            is_proba = True

    if is_proba and self.eval_metric.needs_pred:
        if self.problem_type == BINARY:
            y_pred = get_pred_from_proba(y_pred_proba=y_pred, problem_type=self.problem_type)
            y_pred = self.label_cleaner.inverse_transform(y_pred)
        else:
            y_pred = get_pred_from_proba_df(y_pred_proba=y_pred, problem_type=self.problem_type)

    if not self.eval_metric.needs_pred:
        y_true = self.label_cleaner.transform(y_true)  # Get labels in numeric order
        performance = self.eval_metric(y_true, y_pred)
    elif self.problem_type == BINARY:
        # Use 1 and 0, otherwise f1 can crash due to unknown pos_label.
        y_true_internal = self.label_cleaner.transform(y_true)
        y_pred_internal = self.label_cleaner.transform(y_pred)
        performance = self.eval_metric(y_true_internal, y_pred_internal)
    else:
        performance = self.eval_metric(y_true, y_pred)

    metric = self.eval_metric.name

    if not high_always_good:
        performance = self.eval_metric.convert_score_to_sklearn_val(performance)  # flip negative once again back to positive (so higher is no longer necessarily better)

    if not silent:
        logger.log(20, f"Evaluation: {metric} on test data: {performance}")

    if not auxiliary_metrics:
        return performance

    # Otherwise compute auxiliary metrics:
    auxiliary_metrics = []
    if self.problem_type == REGRESSION:  # Adding regression metrics
        pearson_corr = lambda x, y: corrcoef(x, y)[0][1]
        pearson_corr.__name__ = 'pearson_correlation'
        auxiliary_metrics += [
            mean_absolute_error,
            explained_variance_score,
            r2_score,
            pearson_corr,
            mean_squared_error,
            median_absolute_error,
            # max_error
        ]
    else:  # Adding classification metrics
        auxiliary_metrics += [accuracy_score, balanced_accuracy_score, matthews_corrcoef]
        if self.problem_type == BINARY:  # binary-specific metrics
            # def auc_score(y_true, y_pred):  # TODO: this requires y_pred to be probability-scores
            #     fpr, tpr, _ = roc_curve(y_true, y_pred, pos_label)
            #     return auc(fpr, tpr)
            f1micro_score = lambda y_true, y_pred: f1_score(y_true, y_pred, average='micro')
            f1micro_score.__name__ = f1_score.__name__
            auxiliary_metrics += [f1micro_score]  # TODO: add auc?
        # elif self.problem_type == MULTICLASS:  # multiclass metrics
        #     auxiliary_metrics += []  # TODO: No multi-class specific metrics for now. Include top-5, top-10 accuracy here.

    performance_dict = OrderedDict({metric: performance})
    for metric_function in auxiliary_metrics:
        if isinstance(metric_function, tuple):
            metric_function, metric_kwargs = metric_function
        else:
            metric_kwargs = None
        metric_name = metric_function.__name__
        if metric_name not in performance_dict:
            try:  # only compute auxiliary metrics which do not error (y_pred = class-probabilities may cause some metrics to error)
                if metric_kwargs:
                    performance_dict[metric_name] = metric_function(y_true, y_pred, **metric_kwargs)
                else:
                    performance_dict[metric_name] = metric_function(y_true, y_pred)
            except ValueError:
                pass

    if not silent:
        logger.log(20, "Evaluations on test data:")
        logger.log(20, json.dumps(performance_dict, indent=4))

    if detailed_report and (self.problem_type != REGRESSION):
        # Construct confusion matrix
        try:
            performance_dict['confusion_matrix'] = confusion_matrix(y_true, y_pred,
                                                                     labels=self.label_cleaner.ordered_class_labels,
                                                                     output_format='pandas_dataframe')
        except ValueError:
            pass
        # One final set of metrics to report
        cl_metric = lambda y_true, y_pred: classification_report(y_true, y_pred, output_dict=True)
        metric_name = 'classification_report'
        if metric_name not in performance_dict:
            try:  # only compute auxiliary metrics which do not error (y_pred = class-probabilities may cause some metrics to error)
                performance_dict[metric_name] = cl_metric(y_true, y_pred)
            except ValueError:
                pass
            if not silent and metric_name in performance_dict:
                logger.log(20, "Detailed (per-class) classification report:")
                logger.log(20, json.dumps(performance_dict[metric_name], indent=4))
    return performance_dict

def evaluate(self, y_true, y_pred, silent=False, auxiliary_metrics=False, detailed_report=True, high_always_good=False):
    """ Evaluate predictions.
        Args:
            silent (bool): Should we print which metric is being used as well as performance.
            auxiliary_metrics (bool): Should we compute other (problem_type specific) metrics in addition to the default metric?
            detailed_report (bool): Should we compute more-detailed versions of the auxiliary_metrics? (requires auxiliary_metrics=True).
            high_always_good (bool): If True, this means higher values of returned metric are ALWAYS superior (so metrics like MSE should be returned negated)

        Returns single performance value if auxiliary_metrics=False.
        Otherwise returns dict where keys = metrics, values = performance along each metric.
    """
    assert isinstance(y_true, (np.ndarray, pd.Series))
    assert isinstance(y_pred, (np.ndarray, pd.Series))  # TODO: Enable DataFrame for y_pred_proba
    self._validate_class_labels(y_true)
    if not self.eval_metric.needs_pred:
        y_true = self.label_cleaner.transform(y_true)  # Get labels in numeric order

    performance = self.eval_metric(y_true, y_pred)

    metric = self.eval_metric.name

    if not high_always_good:
        performance = performance * self.eval_metric._sign  # flip negative once again back to positive (so higher is no longer necessarily better)

    if not silent:
        logger.log(20, f"Evaluation: {metric} on test data: {performance}")

    if not auxiliary_metrics:
        return performance

    # Otherwise compute auxiliary metrics:
    auxiliary_metrics = []
    if self.problem_type == REGRESSION:  # Adding regression metrics
        pearson_corr = lambda x, y: corrcoef(x, y)[0][1]
        pearson_corr.__name__ = 'pearson_correlation'
        auxiliary_metrics += [
            mean_absolute_error,
            explained_variance_score,
            r2_score,
            pearson_corr,
            mean_squared_error,
            median_absolute_error,
            # max_error
        ]
    else:  # Adding classification metrics
        auxiliary_metrics += [accuracy_score, balanced_accuracy_score, matthews_corrcoef]
        if self.problem_type == BINARY:  # binary-specific metrics
            # def auc_score(y_true, y_pred):  # TODO: this requires y_pred to be probability-scores
            #     fpr, tpr, _ = roc_curve(y_true, y_pred, pos_label)
            #     return auc(fpr, tpr)
            f1micro_score = lambda y_true, y_pred: f1_score(y_true, y_pred, average='micro')
            f1micro_score.__name__ = f1_score.__name__
            auxiliary_metrics += [f1micro_score]  # TODO: add auc?
        # elif self.problem_type == MULTICLASS:  # multiclass metrics
        #     auxiliary_metrics += []  # TODO: No multi-class specific metrics for now. Include top-5, top-10 accuracy here.

    performance_dict = OrderedDict({metric: performance})
    for metric_function in auxiliary_metrics:
        if isinstance(metric_function, tuple):
            metric_function, metric_kwargs = metric_function
        else:
            metric_kwargs = None
        metric_name = metric_function.__name__
        if metric_name not in performance_dict:
            try:  # only compute auxiliary metrics which do not error (y_pred = class-probabilities may cause some metrics to error)
                if metric_kwargs:
                    performance_dict[metric_name] = metric_function(y_true, y_pred, **metric_kwargs)
                else:
                    performance_dict[metric_name] = metric_function(y_true, y_pred)
            except ValueError:
                pass

    if not silent:
        logger.log(20, "Evaluations on test data:")
        logger.log(20, json.dumps(performance_dict, indent=4))

    if detailed_report and (self.problem_type != REGRESSION):
        # Construct confusion matrix
        try:
            performance_dict['confusion_matrix'] = confusion_matrix(y_true, y_pred,
                                                                     labels=self.label_cleaner.ordered_class_labels,
                                                                     output_format='pandas_dataframe')
        except ValueError:
            pass
        # One final set of metrics to report
        cl_metric = lambda y_true, y_pred: classification_report(y_true, y_pred, output_dict=True)
        metric_name = 'classification_report'
        if metric_name not in performance_dict:
            try:  # only compute auxiliary metrics which do not error (y_pred = class-probabilities may cause some metrics to error)
                performance_dict[metric_name] = cl_metric(y_true, y_pred)
            except ValueError:
                pass
            if not silent and metric_name in performance_dict:
                logger.log(20, "Detailed (per-class) classification report:")
                logger.log(20, json.dumps(performance_dict[metric_name], indent=4))
    return performance_dict

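# The `high_always_good` handling above relies on a scorer convention: error metrics are
# held internally in greater-is-better form (negated) and carry a sign of -1, so
# multiplying by `_sign` (or calling `convert_score_to_sklearn_val`) recovers the
# conventional value. The toy class below is a self-contained illustration of that
# convention, not the project's Scorer implementation.
class ToyScorer:
    """Toy stand-in for a greater-is-better scorer wrapping an error metric."""

    def __init__(self, name, metric_fn, greater_is_better):
        self.name = name
        self._metric_fn = metric_fn
        self._sign = 1 if greater_is_better else -1

    def __call__(self, y_true, y_pred):
        # Internal scores are always greater-is-better, so error metrics come out negated.
        return self._sign * self._metric_fn(y_true, y_pred)


def toy_mse(y_true, y_pred):
    return sum((t - p) ** 2 for t, p in zip(y_true, y_pred)) / len(y_true)


toy_scorer = ToyScorer('mean_squared_error', toy_mse, greater_is_better=False)
internal_score = toy_scorer([1.0, 2.0], [1.5, 2.5])  # -0.25, higher is better
reported_score = internal_score * toy_scorer._sign   # 0.25, conventional MSE (the high_always_good=False path)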