def _score_with_pred_proba(self, y, y_internal, y_pred_proba_internal, metric, sample_weight=None, weight_evaluation=None):
    metric = get_metric(metric, self.problem_type, 'leaderboard_metric')
    if weight_evaluation is None:
        weight_evaluation = self.weight_evaluation
    if metric.needs_pred:
        if self.problem_type == BINARY:
            # Use 1 and 0, otherwise f1 can crash due to unknown pos_label.
            y_pred = get_pred_from_proba(y_pred_proba_internal, problem_type=self.problem_type)
            y_tmp = y_internal
        else:
            y_pred = self.label_cleaner.inverse_transform_proba(y_pred_proba_internal, as_pred=True)
            y_tmp = y
    elif metric.needs_quantile:
        y_pred = self.label_cleaner.inverse_transform_proba(y_pred_proba_internal, as_pred=True)
        y_tmp = y
    else:
        y_pred = self.label_cleaner.inverse_transform_proba(y_pred_proba_internal, as_pred=False)
        y_tmp = y_internal
    return compute_weighted_metric(y_tmp, y_pred, metric, weights=sample_weight, weight_evaluation=weight_evaluation, quantile_levels=self.quantile_levels)
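# Hedged sketch (standalone, not part of this codebase): the per-row weighting
# that `compute_weighted_metric` applies above is conceptually the same as
# passing `sample_weight` to a scikit-learn metric. All data below is made up.
import numpy as np
from sklearn.metrics import f1_score

y_true = np.array([1, 0, 1, 1])
y_pred = np.array([1, 0, 0, 1])
weights = np.array([2.0, 1.0, 1.0, 0.5])  # per-row evaluation weights
weighted_f1 = f1_score(y_true, y_pred, sample_weight=weights)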
def inverse_transform_proba(self, y, as_pandas=False, as_pred=False):
    y_index = None
    if isinstance(y, DataFrame):
        y_index = y.index
        y = y.to_numpy()
    if self.invalid_class_count > 0:
        # Expand proba back to the full (uncleaned) class set: classes dropped
        # during cleaning receive probability 0.
        y_transformed = np.zeros([len(y), len(self.ordered_class_labels)], dtype=np.float32)
        y_transformed[:, self.label_index_to_keep] = y
    else:
        y_transformed = y
    if as_pred:
        y_transformed = get_pred_from_proba(y_pred_proba=y_transformed, problem_type=self.problem_type_transform)
        y_transformed = self._convert_to_valid_series(y_transformed)
        y_transformed = y_transformed.map(self.cat_mappings_dependent_var_uncleaned)
        if y_index is not None:
            y_transformed.index = y_index
    if as_pandas and not as_pred:
        y_transformed = DataFrame(data=y_transformed, index=y_index, columns=self.ordered_class_labels, dtype=np.float32)
    return y_transformed
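# Hedged sketch of the `invalid_class_count > 0` branch above: probabilities
# predicted over the kept classes are expanded back to the full (uncleaned)
# class set, with dropped classes assigned probability 0. Shapes are made up.
import numpy as np

proba_kept = np.array([[0.7, 0.3], [0.2, 0.8]], dtype=np.float32)  # 2 kept classes
label_index_to_keep = [0, 2]  # kept classes occupy columns 0 and 2 of 3 total
proba_full = np.zeros((len(proba_kept), 3), dtype=np.float32)
proba_full[:, label_index_to_keep] = proba_kept
# proba_full -> [[0.7, 0.0, 0.3], [0.2, 0.0, 0.8]]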
def predict(self, X: DataFrame, model=None, as_pandas=True):
    if as_pandas:
        X_index = copy.deepcopy(X.index)
    else:
        X_index = None
    y_pred_proba = self.predict_proba(X=X, model=model, as_pandas=False, as_multiclass=False, inverse_transform=False)
    problem_type = self.label_cleaner.problem_type_transform or self.problem_type
    y_pred = get_pred_from_proba(y_pred_proba=y_pred_proba, problem_type=problem_type)
    if problem_type != QUANTILE:
        y_pred = self.label_cleaner.inverse_transform(pd.Series(y_pred))
        if as_pandas:
            y_pred.index = X_index
            y_pred.name = self.label
        else:
            y_pred = y_pred.values
    elif as_pandas:
        y_pred = pd.DataFrame(data=y_pred, columns=self.quantile_levels, index=X_index)
    return y_pred
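# Hedged sketch of the QUANTILE branch above: for quantile regression the
# "proba" output is already the matrix of per-quantile predictions, so it is
# wrapped in a DataFrame keyed by quantile levels instead of being
# inverse-transformed. Values below are illustrative only.
import numpy as np
import pandas as pd

quantile_levels = [0.1, 0.5, 0.9]
y_quantile = np.array([[1.2, 2.0, 3.1], [0.4, 0.9, 1.7]])
y_pred_df = pd.DataFrame(data=y_quantile, columns=quantile_levels)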
def predict(self, X: DataFrame, model=None, as_pandas=False):
    if as_pandas:
        X_index = copy.deepcopy(X.index)
    else:
        X_index = None
    y_pred_proba = self.predict_proba(X=X, model=model, inverse_transform=False)
    problem_type = self.label_cleaner.problem_type_transform or self.problem_type
    y_pred = get_pred_from_proba(y_pred_proba=y_pred_proba, problem_type=problem_type)
    y_pred = self.label_cleaner.inverse_transform(pd.Series(y_pred))
    if as_pandas:
        y_pred.index = X_index
        y_pred.name = self.label
    else:
        y_pred = y_pred.values
    return y_pred
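# Hedged sketch of the `inverse_transform` step above: internal integer labels
# are mapped back to the original label values, conceptually a pandas `map`.
# The mapping below is made up for illustration.
import pandas as pd

internal_to_original = {0: 'cat', 1: 'dog'}
pd.Series([1, 0, 1]).map(internal_to_original)  # -> ['dog', 'cat', 'dog']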
def inverse_transform_proba(self, y, as_pandas=False, as_pred=False):
    if not as_pred:
        return y
    # The early return above guarantees as_pred is True from here on.
    y_index = None
    if isinstance(y, Series):
        y_index = y.index
        y = y.to_numpy()
    y = get_pred_from_proba(y_pred_proba=y, problem_type=self.problem_type_transform)
    y = self._convert_to_valid_series(y)
    y = y.map(self.cat_mappings_dependent_var)
    y = y.to_numpy()
    if as_pandas:
        y = Series(data=y, index=y_index)
    return y
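# Hedged sketch of the binary proba -> pred path above: threshold the
# positive-class probability, then map the 0/1 internal labels back to the
# original labels. Names and data below are illustrative only.
import numpy as np
import pandas as pd

proba_positive = np.array([0.9, 0.2, 0.6])
pred_internal = (proba_positive >= 0.5).astype(int)
cat_mappings = {0: 'no', 1: 'yes'}  # stand-in for cat_mappings_dependent_var
pd.Series(pred_internal).map(cat_mappings).to_numpy()  # -> ['yes', 'no', 'yes']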
def inverse_transform_proba(self, y, as_pandas=False, as_pred=False):
    if isinstance(y, DataFrame):
        y = copy.deepcopy(y)
        y.columns = copy.deepcopy(self.ordered_class_labels)
        if as_pred:
            y = get_pred_from_proba_df(y, problem_type=self.problem_type_transform)
        if not as_pandas:
            y = y.to_numpy()
    elif as_pred:
        y_index = None
        if isinstance(y, Series):
            y_index = y.index
            y = y.to_numpy()
        y = get_pred_from_proba(y_pred_proba=y, problem_type=self.problem_type_transform)
        y = self._convert_to_valid_series(y)
        y = y.map(self.cat_mappings_dependent_var)
        y = y.to_numpy()
        if as_pandas:
            y = Series(data=y, index=y_index)
    return y
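# Hedged sketch of the DataFrame branch above: once the proba columns carry
# the original class labels, reducing each row to its most probable class
# (what `get_pred_from_proba_df` is used for here) is conceptually a row-wise
# idxmax. Data below is made up.
import pandas as pd

proba_df = pd.DataFrame([[0.1, 0.9], [0.8, 0.2]], columns=['cat', 'dog'])
proba_df.idxmax(axis=1)  # -> ['dog', 'cat']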
def evaluate_predictions(self, y_true, y_pred, silent=False, auxiliary_metrics=False, detailed_report=True, high_always_good=False):
    """ Evaluate predictions. Does not support sample weights since this method reports a variety of metrics.
        Args:
            silent (bool): Should we print which metric is being used as well as performance.
            auxiliary_metrics (bool): Should we compute other (problem_type specific) metrics in addition to the default metric?
            detailed_report (bool): Should we compute more-detailed versions of the auxiliary_metrics? (requires auxiliary_metrics=True).
            high_always_good (bool): If True, higher values of the returned metric are ALWAYS superior (so metrics like MSE are returned negated).

        Returns a single performance value if auxiliary_metrics=False.
        Otherwise returns a dict where keys = metrics, values = performance along each metric.
    """
    is_proba = False
    assert isinstance(y_true, (np.ndarray, pd.Series))
    assert isinstance(y_pred, (np.ndarray, pd.Series, pd.DataFrame))
    self._validate_class_labels(y_true)
    if isinstance(y_pred, np.ndarray):
        if self.problem_type == QUANTILE:
            y_pred = pd.DataFrame(data=y_pred, columns=self.quantile_levels)
        elif len(y_pred.shape) > 1:
            y_pred = pd.DataFrame(data=y_pred, columns=self.class_labels)
    if self.problem_type == BINARY:
        if isinstance(y_pred, pd.DataFrame):
            # roc_auc crashes if this isn't done
            y_pred = y_pred[self.positive_class]
            is_proba = True
        elif not self.eval_metric.needs_pred:
            raise AssertionError(
                f'`evaluate_predictions` requires y_pred_proba input for binary classification '
                f'when evaluating "{self.eval_metric.name}"... Please generate valid input via `predictor.predict_proba(data)`.\n'
                f'This may have occurred if you passed in predict input instead of predict_proba input, '
                f'or if you specified `as_multiclass=False` to `predictor.predict_proba(data, as_multiclass=False)`, '
                f'which is not supported by `evaluate_predictions`.'
            )
    elif self.problem_type == MULTICLASS:
        if isinstance(y_pred, pd.DataFrame):
            is_proba = True
    if is_proba and self.eval_metric.needs_pred:
        if self.problem_type == BINARY:
            y_pred = get_pred_from_proba(y_pred_proba=y_pred, problem_type=self.problem_type)
            y_pred = self.label_cleaner.inverse_transform(y_pred)
        else:
            y_pred = get_pred_from_proba_df(y_pred_proba=y_pred, problem_type=self.problem_type)
    if not self.eval_metric.needs_pred:
        y_true = self.label_cleaner.transform(y_true)  # Get labels in numeric order
        performance = self.eval_metric(y_true, y_pred)
    elif self.problem_type == BINARY:
        # Use 1 and 0, otherwise f1 can crash due to unknown pos_label.
        y_true_internal = self.label_cleaner.transform(y_true)
        y_pred_internal = self.label_cleaner.transform(y_pred)
        performance = self.eval_metric(y_true_internal, y_pred_internal)
    else:
        performance = self.eval_metric(y_true, y_pred)

    metric = self.eval_metric.name

    if not high_always_good:
        # Flip negative once again back to positive (so higher is no longer necessarily better).
        performance = self.eval_metric.convert_score_to_sklearn_val(performance)

    if not silent:
        logger.log(20, f"Evaluation: {metric} on test data: {performance}")

    if not auxiliary_metrics:
        return performance

    # Otherwise compute auxiliary metrics:
    auxiliary_metrics = []
    if self.problem_type == REGRESSION:  # Adding regression metrics
        pearson_corr = lambda x, y: corrcoef(x, y)[0][1]
        pearson_corr.__name__ = 'pearson_correlation'
        auxiliary_metrics += [
            mean_absolute_error, explained_variance_score, r2_score, pearson_corr,
            mean_squared_error, median_absolute_error,
            # max_error
        ]
    else:  # Adding classification metrics
        auxiliary_metrics += [accuracy_score, balanced_accuracy_score, matthews_corrcoef]
        if self.problem_type == BINARY:  # Binary-specific metrics
            # def auc_score(y_true, y_pred):  # TODO: this requires y_pred to be probability-scores
            #     fpr, tpr, _ = roc_curve(y_true, y_pred, pos_label)
            #     return auc(fpr, tpr)
            f1micro_score = lambda y_true, y_pred: f1_score(y_true, y_pred, average='micro')
            f1micro_score.__name__ = f1_score.__name__
            auxiliary_metrics += [f1micro_score]  # TODO: add auc?
        # elif self.problem_type == MULTICLASS:  # Multiclass metrics
        #     auxiliary_metrics += []  # TODO: No multiclass-specific metrics for now. Include top-5, top-10 accuracy here.

    performance_dict = OrderedDict({metric: performance})
    for metric_function in auxiliary_metrics:
        if isinstance(metric_function, tuple):
            metric_function, metric_kwargs = metric_function
        else:
            metric_kwargs = None
        metric_name = metric_function.__name__
        if metric_name not in performance_dict:
            # Only compute auxiliary metrics which do not error (y_pred = class-probabilities may cause some metrics to error).
            try:
                if metric_kwargs:
                    performance_dict[metric_name] = metric_function(y_true, y_pred, **metric_kwargs)
                else:
                    performance_dict[metric_name] = metric_function(y_true, y_pred)
            except ValueError:
                pass

    if not silent:
        logger.log(20, "Evaluations on test data:")
        logger.log(20, json.dumps(performance_dict, indent=4))

    if detailed_report and (self.problem_type != REGRESSION):
        # Construct confusion matrix.
        try:
            performance_dict['confusion_matrix'] = confusion_matrix(
                y_true, y_pred, labels=self.label_cleaner.ordered_class_labels, output_format='pandas_dataframe')
        except ValueError:
            pass
        # One final set of metrics to report.
        cl_metric = lambda y_true, y_pred: classification_report(y_true, y_pred, output_dict=True)
        metric_name = 'classification_report'
        if metric_name not in performance_dict:
            try:
                performance_dict[metric_name] = cl_metric(y_true, y_pred)
            except ValueError:
                pass
        if not silent and metric_name in performance_dict:
            logger.log(20, "Detailed (per-class) classification report:")
            logger.log(20, json.dumps(performance_dict[metric_name], indent=4))
    return performance_dict
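# Hedged sketch of the auxiliary-metric loop above: metrics that raise
# ValueError on the given inputs are skipped rather than aborting the whole
# evaluation. Standalone demo using scikit-learn metrics; data is made up.
from collections import OrderedDict
from sklearn.metrics import accuracy_score, balanced_accuracy_score

y_true_demo, y_pred_demo = [0, 1, 1], [0, 1, 0]
performance_dict_demo = OrderedDict()
for fn in [accuracy_score, balanced_accuracy_score]:
    try:
        performance_dict_demo[fn.__name__] = fn(y_true_demo, y_pred_demo)
    except ValueError:
        pass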
def score_debug(self, X: DataFrame, y=None, extra_info=False, compute_oracle=False, silent=False):
    leaderboard_df = self.leaderboard(extra_info=extra_info, silent=silent)
    if y is None:
        X, y = self.extract_label(X)
    self._validate_class_labels(y)
    w = None
    if self.weight_evaluation:
        X, w = extract_column(X, self.sample_weight)
    X = self.transform_features(X)
    y_internal = self.label_cleaner.transform(y)
    y_internal = y_internal.fillna(-1)
    trainer = self.load_trainer()
    scores = {}
    all_trained_models = trainer.get_model_names()
    all_trained_models_can_infer = trainer.get_model_names(can_infer=True)
    all_trained_models_original = all_trained_models.copy()
    model_pred_proba_dict, pred_time_test_marginal = trainer.get_model_pred_proba_dict(
        X=X, models=all_trained_models_can_infer, fit=False, record_pred_time=True)
    if compute_oracle:
        pred_probas = list(model_pred_proba_dict.values())
        ensemble_selection = EnsembleSelection(
            ensemble_size=100, problem_type=trainer.problem_type, metric=self.eval_metric, quantile_levels=self.quantile_levels)
        ensemble_selection.fit(predictions=pred_probas, labels=y_internal, identifiers=None, sample_weight=w)  # TODO: Only fit non-nan
        oracle_weights = ensemble_selection.weights_
        oracle_pred_time_start = time.time()
        oracle_pred_proba_norm = [pred * weight for pred, weight in zip(pred_probas, oracle_weights)]
        oracle_pred_proba_ensemble = np.sum(oracle_pred_proba_norm, axis=0)
        oracle_pred_time = time.time() - oracle_pred_time_start
        model_pred_proba_dict['OracleEnsemble'] = oracle_pred_proba_ensemble
        pred_time_test_marginal['OracleEnsemble'] = oracle_pred_time
        all_trained_models.append('OracleEnsemble')
    for model_name, y_pred_proba_internal in model_pred_proba_dict.items():
        if self.eval_metric.needs_pred:
            if self.problem_type == BINARY:
                # Use 1 and 0, otherwise f1 can crash due to unknown pos_label.
                y_pred = get_pred_from_proba(y_pred_proba_internal, problem_type=self.problem_type)
                y_tmp = y_internal
            else:
                y_pred = self.label_cleaner.inverse_transform_proba(y_pred_proba_internal, as_pred=True)
                y_tmp = y
        elif self.eval_metric.needs_quantile:
            y_pred = self.label_cleaner.inverse_transform_proba(y_pred_proba_internal, as_pred=True)
            y_tmp = y
        else:
            y_pred = self.label_cleaner.inverse_transform_proba(y_pred_proba_internal, as_pred=False)
            y_tmp = y_internal
        scores[model_name] = compute_weighted_metric(
            y_tmp, y_pred, self.eval_metric, w, weight_evaluation=self.weight_evaluation, quantile_levels=self.quantile_levels)

    pred_time_test = {}
    # TODO: Add support for calculating pred_time_test_full for oracle_ensemble, need to copy graph from trainer and add oracle_ensemble to it with proper edges.
    for model in model_pred_proba_dict.keys():
        if model in all_trained_models_original:
            base_model_set = trainer.get_minimum_model_set(model)
            if len(base_model_set) == 1:
                pred_time_test[model] = pred_time_test_marginal[base_model_set[0]]
            else:
                pred_time_test_full_num = 0
                for base_model in base_model_set:
                    pred_time_test_full_num += pred_time_test_marginal[base_model]
                pred_time_test[model] = pred_time_test_full_num
        else:
            pred_time_test[model] = None

    scored_models = list(scores.keys())
    for model in all_trained_models:
        if model not in scored_models:
            scores[model] = None
            pred_time_test[model] = None
            pred_time_test_marginal[model] = None

    logger.debug('Model scores:')
    logger.debug(str(scores))
    model_names_final = list(scores.keys())
    df = pd.DataFrame(
        data={
            'model': model_names_final,
            'score_test': list(scores.values()),
            'pred_time_test': [pred_time_test[model] for model in model_names_final],
            'pred_time_test_marginal': [pred_time_test_marginal[model] for model in model_names_final],
        }
    )
    df_merged = pd.merge(df, leaderboard_df, on='model', how='left')
    df_merged = df_merged.sort_values(
        by=['score_test', 'pred_time_test', 'score_val', 'pred_time_val', 'model'],
        ascending=[False, True, False, True, False]).reset_index(drop=True)
    df_columns_lst = df_merged.columns.tolist()
    explicit_order = [
        'model',
        'score_test',
        'score_val',
        'pred_time_test',
        'pred_time_val',
        'fit_time',
        'pred_time_test_marginal',
        'pred_time_val_marginal',
        'fit_time_marginal',
        'stack_level',
        'can_infer',
    ]
    df_columns_other = [column for column in df_columns_lst if column not in explicit_order]
    df_columns_new = explicit_order + df_columns_other
    df_merged = df_merged[df_columns_new]
    return df_merged
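# Hedged sketch of the oracle-ensemble combination in score_debug above: each
# model's predictions are scaled by the weights learned by ensemble selection
# and summed. Weights and probabilities below are made up.
import numpy as np

pred_probas = [np.array([[0.6, 0.4]]), np.array([[0.2, 0.8]])]
weights = [0.75, 0.25]  # e.g. EnsembleSelection.weights_
oracle_pred_proba = np.sum([p * w for p, w in zip(pred_probas, weights)], axis=0)
# oracle_pred_proba -> [[0.5, 0.5]]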