def _make_single_prediction_shap_table(pipeline, input_features, top_k=3, training_data=None,
                                       include_shap_values=False, output_format="text"):
    """Build a table of the features contributing most to the prediction of a single datapoint.

    The table summarizes the top_k positive and top_k negative contributing features.

    Arguments:
        pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP.
        input_features (pd.DataFrame): Dataframe of features - needs to correspond to data the
            pipeline was fit on.
        top_k (int): How many of the highest/lowest features to include in the table.
        training_data (pd.DataFrame): Training data the pipeline was fit on. It is forwarded to the
            SHAP computation; it is required for non-tree estimators because a sample of training
            data is needed for the KernelSHAP algorithm.
        include_shap_values (bool): Whether the SHAP values should be included in an extra column
            in the output. Default is False.
        output_format (str): "text" selects the table maker's text output; any other value selects
            its dict output. Default is "text".

    Returns:
        The table produced by the selected table maker method (text or dict form).
    """
    estimator_features = pipeline.compute_estimator_features(input_features)
    raw_shap = _compute_shap_values(pipeline, estimator_features, training_data)
    scaled_shap = _normalize_shap_values(raw_shap)

    # Regression pipelines have no classes_ attribute, so class names stay None for them.
    class_names = pipeline.classes_ if hasattr(pipeline, "classes_") else None

    makers_by_problem_type = {
        ProblemTypes.REGRESSION: _RegressionSHAPTable(),
        ProblemTypes.BINARY: _BinarySHAPTable(class_names),
        ProblemTypes.MULTICLASS: _MultiClassSHAPTable(class_names),
    }
    maker = makers_by_problem_type[pipeline.problem_type]

    if output_format == "text":
        render = maker.make_text
    else:
        render = maker.make_dict

    return render(raw_shap, scaled_shap, estimator_features, top_k, include_shap_values)
def _make_single_prediction_shap_table(pipeline, input_features, y, index_to_explain, top_k=3,
                                       include_shap_values=False, output_format="text"):
    """Build a table of the features contributing most to the prediction of a single datapoint.

    The table summarizes the top_k positive and top_k negative contributing features for the row
    selected by index_to_explain.

    Arguments:
        pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP.
        input_features (pd.DataFrame): Dataframe of features - needs to correspond to data the
            pipeline was fit on.
        y: Passed together with input_features to pipeline.compute_estimator_features.
        index_to_explain (int): Positional index (used with .iloc) of the row of the computed
            estimator features to explain.
        top_k (int): How many of the highest/lowest features to include in the table.
        include_shap_values (bool): Whether the SHAP values should be included in an extra column
            in the output. Default is False.
        output_format (str): The desired format of the output. Can be "text", "dict", or
            "dataframe".

    Returns:
        The table produced by the table maker method selected through output_format.

    Raises:
        ValueError: if requested index results in a NaN in the computed features.
        KeyError: if pipeline.problem_type or output_format is not one of the supported keys.
    """
    all_features = pipeline.compute_estimator_features(input_features, y).to_dataframe()

    # Double brackets keep the selection as a one-row DataFrame rather than a Series.
    row_to_explain = all_features.iloc[[index_to_explain]]
    row_has_nan = row_to_explain.isna().any(axis=None)
    if row_has_nan:
        raise ValueError(f"Requested index ({index_to_explain}) produces NaN in features.")

    # The NaN-free rows of the computed features serve as the training data for the SHAP computation.
    background_rows = all_features.dropna(axis=0)
    raw_shap = _compute_shap_values(pipeline, row_to_explain, training_data=background_rows)
    scaled_shap = _normalize_shap_values(raw_shap)

    class_names = pipeline.classes_ if hasattr(pipeline, "classes_") else None

    makers_by_problem_type = {
        ProblemTypes.REGRESSION: _RegressionSHAPTable(),
        ProblemTypes.BINARY: _BinarySHAPTable(class_names),
        ProblemTypes.MULTICLASS: _MultiClassSHAPTable(class_names),
        ProblemTypes.TIME_SERIES_REGRESSION: _RegressionSHAPTable(),
        ProblemTypes.TIME_SERIES_BINARY: _BinarySHAPTable(class_names),
        ProblemTypes.TIME_SERIES_MULTICLASS: _MultiClassSHAPTable(class_names),
    }
    maker = makers_by_problem_type[pipeline.problem_type]

    renderers_by_format = {
        "text": maker.make_text,
        "dict": maker.make_dict,
        "dataframe": maker.make_dataframe,
    }
    render = renderers_by_format[output_format]

    return render(raw_shap, scaled_shap, row_to_explain, top_k, include_shap_values)
def test_normalize_values(values, answer):
    """Check that _normalize_shap_values(values) agrees with answer.

    A dict result is compared against answer directly. Any other result must have the same
    length as answer and is compared against it element by element.
    """
    result = _normalize_shap_values(values)

    if not isinstance(result, dict):
        assert len(result) == len(answer)
        for actual_item, expected_item in zip(result, answer):
            check_equal_dicts(actual_item, expected_item)
        return

    check_equal_dicts(result, answer)
def test_normalize_values(values, answer):
    """Check that _normalize_shap_values(values) agrees with answer to 4 decimal places.

    A dict result is compared against answer directly. Any other result must have the same
    length as answer and is compared against it element by element.
    """

    def check_equal_dicts(actual, expected):
        # Both mappings must have exactly the same keys before values are compared.
        actual_keys = set(actual.keys())
        expected_keys = set(expected)
        assert actual_keys == expected_keys
        for name in actual:
            np.testing.assert_almost_equal(actual[name], expected[name], decimal=4)

    result = _normalize_shap_values(values)

    if not isinstance(result, dict):
        assert len(result) == len(answer)
        for actual_item, expected_item in zip(result, answer):
            check_equal_dicts(actual_item, expected_item)
        return

    check_equal_dicts(result, answer)
def test_normalize_values_exceptions():
    """Passing an int to _normalize_shap_values must raise ValueError with the expected message."""
    # The leading ^ anchors the regex so the message has to start with this text.
    expected_message = "^Unsupported data type for _normalize_shap_values"
    with pytest.raises(ValueError, match=expected_message):
        _normalize_shap_values(1)