def validate(self, X, y=None):
    """Flag columns whose uniqueness score is unsuitable for the problem type.

    For regression problems, every column whose uniqueness score is below
    ``self.threshold`` is reported as not unique enough. For multiclass
    problems, every column whose score is above ``self.threshold`` is reported
    as too unique. Any other problem type produces no warnings.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
        y (ww.DataColumn, pd.Series, np.ndarray): Ignored. Defaults to None.

    Returns:
        dict: Dictionary with "warnings", "errors" and "actions" lists. Only
            "warnings" is ever populated here, with one DataCheckWarning
            (as a dict) per offending column.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...    'regression_unique_enough': [float(x) for x in range(100)],
        ...    'regression_not_unique_enough': [float(1) for x in range(100)]
        ... })
        >>> uniqueness_check = UniquenessDataCheck(problem_type="regression", threshold=0.8)
        >>> assert uniqueness_check.validate(df) == {
        ...     "errors": [],
        ...     "warnings": [{"message": "Input columns (regression_not_unique_enough) for regression problem type are not unique enough.",
        ...                   "data_check_name": "UniquenessDataCheck",
        ...                   "level": "warning",
        ...                   "code": "NOT_UNIQUE_ENOUGH",
        ...                   "details": {"column": "regression_not_unique_enough", 'uniqueness_score': 0.0}}],
        ...     "actions": []}
    """
    X = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    # One uniqueness score per column, indexed by column name.
    scores = X.apply(UniquenessDataCheck.uniqueness_score)

    column_warnings = []
    if is_regression(self.problem_type):
        # Regression features are expected to vary: report low scores.
        for col_name in list(scores.index[scores < self.threshold]):
            warning = DataCheckWarning(
                message=warning_not_unique_enough.format(col_name, self.problem_type),
                data_check_name=self.name,
                message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH,
                details={
                    "column": col_name,
                    "uniqueness_score": scores.loc[col_name],
                },
            )
            column_warnings.append(warning.to_dict())
    elif is_multiclass(self.problem_type):
        # Multiclass features should not be (nearly) unique per row: report high scores.
        for col_name in list(scores.index[scores > self.threshold]):
            warning = DataCheckWarning(
                message=warning_too_unique.format(col_name, self.problem_type),
                data_check_name=self.name,
                message_code=DataCheckMessageCode.TOO_UNIQUE,
                details={
                    "column": col_name,
                    "uniqueness_score": scores.loc[col_name],
                },
            )
            column_warnings.append(warning.to_dict())

    return {"warnings": column_warnings, "errors": [], "actions": []}
def test_invalid_target_data_action_for_data_with_null(problem_type):
    """A target with 3 nulls out of 10 rows yields a null error and an impute action.

    Binary and multiclass problem types additionally get the errors/warnings
    caused by the single remaining class value (0).
    """
    y = pd.Series([None, None, None, 0, 0, 0, 0, 0, 0, 0])
    X = pd.DataFrame({"col": range(len(y))})
    objective = get_default_primary_search_objective(problem_type)
    invalid_targets_check = InvalidTargetDataCheck(problem_type, objective)

    if is_regression(problem_type):
        impute_strategy = "mean"
    else:
        impute_strategy = "most_frequent"

    null_error = DataCheckError(
        message="3 row(s) (30.0%) of target values are null",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_HAS_NULL,
        details={"num_null_rows": 3, "pct_null_rows": 30.0},
    ).to_dict()
    impute_action = DataCheckAction(
        DataCheckActionCode.IMPUTE_COL,
        metadata={"column": None, "is_target": True, "impute_strategy": impute_strategy},
    ).to_dict()
    expected = {
        "warnings": [],
        "errors": [null_error],
        "actions": [impute_action],
    }

    if is_binary(problem_type):
        not_two_values = DataCheckError(
            message="Binary class targets require exactly two unique values.",
            data_check_name=invalid_targets_data_check_name,
            message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
            details={"target_values": [0]},
        )
        expected["errors"].append(not_two_values.to_dict())
    elif is_multiclass(problem_type):
        too_few_classes = DataCheckError(
            message="Target has two or less classes, which is too few for multiclass problems. Consider changing to binary.",
            data_check_name=invalid_targets_data_check_name,
            message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
            details={"num_classes": 1},
        )
        expected["errors"].append(too_few_classes.to_dict())
        # 1 unique class over 10 rows gives the 0.1 ratio reported below.
        high_unique = DataCheckWarning(
            message="Target has a large number of unique values, could be regression type problem.",
            data_check_name=invalid_targets_data_check_name,
            message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
            details={"class_to_value_ratio": 0.1},
        )
        expected["warnings"].append(high_unique.to_dict())

    assert invalid_targets_check.validate(X, y) == expected
def test_split_data(problem_type, data_type, X_y_binary, X_y_multi, X_y_regression, make_data_type):
    """split_data returns a 75/25 split of Woodwork objects for every problem type."""
    # Each predicate is evaluated in turn; the last matching fixture wins,
    # exactly as with a sequence of independent `if` statements.
    candidates = (
        (is_binary, X_y_binary),
        (is_multiclass, X_y_multi),
        (is_regression, X_y_regression),
    )
    for matches, dataset in candidates:
        if matches(problem_type):
            X, y = dataset

    if is_time_series(problem_type):
        problem_configuration = {'gap': 1, 'max_delay': 7}
    else:
        problem_configuration = None

    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)
    test_pct = 0.25

    X_train, X_test, y_train, y_test = split_data(
        X,
        y,
        test_size=test_pct,
        problem_type=problem_type,
        problem_configuration=problem_configuration,
    )

    expected_test_len = len(X) * test_pct
    expected_train_len = len(X) - expected_test_len
    for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
        assert len(train_part) == expected_train_len
        assert len(test_part) == expected_test_len

    for table in (X_train, X_test):
        assert isinstance(table, ww.DataTable)
    for column in (y_train, y_test):
        assert isinstance(column, ww.DataColumn)
def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi, X_y_regression, ts_data, helper_functions):
    """Every search estimator fits and predicts with column names built from all printable characters."""
    # One column per printable character, e.g. 'column_a', 'column_%', 'column_\n'.
    col_names = ['column_{}'.format(ascii_char) for ascii_char in string.printable]

    for estimator_class in _all_estimators_used_in_search():
        if estimator_class.__name__ == 'ARIMARegressor':
            continue

        supported_problem_types = [
            handle_problem_types(pt) for pt in estimator_class.supported_problem_types
        ]
        for problem_type in supported_problem_types:
            clf = helper_functions.safe_init_component_with_njobs_1(estimator_class)

            if is_binary(problem_type):
                X, y = X_y_binary
            elif is_multiclass(problem_type):
                X, y = X_y_multi
            elif is_regression(problem_type):
                X, y = X_y_regression

            # Replace the fixture features with random values, keeping the row count.
            rng = get_random_state(clf.random_seed)
            random_values = rng.random((X.shape[0], len(string.printable)))
            X = pd.DataFrame(random_values, columns=col_names)

            assert clf.input_feature_names is None
            clf.fit(X, y)

            importance = clf.feature_importance
            assert len(importance) == len(X.columns)
            assert not np.isnan(importance).all().all()

            predictions = clf.predict(X).to_series()
            assert len(predictions) == len(y)
            assert not np.isnan(predictions).all()

            assert (clf.input_feature_names == col_names)
def split_data(X, y, problem_type, problem_configuration=None, test_size=.2, random_seed=0):
    """Split features and target into a single train/test partition.

    The splitter depends on the problem type: time series problems use an
    unshuffled TrainingValidationSplit, other regression problems use
    ShuffleSplit, and classification problems use StratifiedShuffleSplit.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, or np.ndarray): target data of length [n_samples]
        problem_type (str or ProblemTypes): type of supervised learning problem. see
            evalml.problem_types.problemtype.all_problem_types for a full list.
        problem_configuration (dict): Additional parameters needed to configure the search.
            For example, in time series problems, values should be passed in for the
            date_index, gap, and max_delay variables. Not read by this function's body.
        test_size (float): What percentage of data points should be included in the test set.
            Defaults to 0.2 (20%).
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        ww.DataTable, ww.DataTable, ww.DataColumn, ww.DataColumn: Feature and target data
            each split into train and test sets, in the order
            (X_train, X_test, y_train, y_test).
    """
    X = infer_feature_types(X)
    y = infer_feature_types(y)

    splitter = None
    # Time series is checked first so that time series regression/classification
    # never falls through to a shuffling splitter.
    if is_time_series(problem_type):
        splitter = TrainingValidationSplit(test_size=test_size,
                                           shuffle=False,
                                           stratify=None,
                                           random_seed=random_seed)
    elif is_regression(problem_type):
        splitter = ShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)
    elif is_classification(problem_type):
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)

    # Only the first (and only) split produced by the splitter is used.
    splits = splitter.split(X.to_dataframe(), y.to_series())
    train_idx, test_idx = next(splits)

    return X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx]
def test_type_checks(problem_type):
    """Each problem-type predicate is true exactly for its listed ProblemTypes members."""
    expectations = (
        (is_regression, [
            ProblemTypes.REGRESSION,
            ProblemTypes.TIME_SERIES_REGRESSION,
        ]),
        (is_binary, [
            ProblemTypes.BINARY,
            ProblemTypes.TIME_SERIES_BINARY,
        ]),
        (is_multiclass, [
            ProblemTypes.MULTICLASS,
            ProblemTypes.TIME_SERIES_MULTICLASS,
        ]),
        (is_classification, [
            ProblemTypes.BINARY,
            ProblemTypes.MULTICLASS,
            ProblemTypes.TIME_SERIES_BINARY,
            ProblemTypes.TIME_SERIES_MULTICLASS,
        ]),
        (is_time_series, [
            ProblemTypes.TIME_SERIES_BINARY,
            ProblemTypes.TIME_SERIES_MULTICLASS,
            ProblemTypes.TIME_SERIES_REGRESSION,
        ]),
    )
    for predicate, members in expectations:
        assert predicate(problem_type) == (problem_type in members)
def explain_predictions_best_worst(pipeline, input_features, y_true, num_to_explain=5, top_k_features=3,
                                   include_shap_values=False, metric=None, output_format="text"):
    """Creates a report summarizing the top contributing features for the best and worst points in the dataset as measured by error to true labels.

    XGBoost models and CatBoost multiclass classifiers are not currently supported.

    Arguments:
        pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP.
        input_features (ww.DataTable, pd.DataFrame): Input data to evaluate the pipeline on.
        y_true (ww.DataColumn, pd.Series): True labels for the input data.
        num_to_explain (int): How many of the best and how many of the worst data points to explain.
        top_k_features (int): How many of the highest/lowest contributing feature to include in the table for each
            data point.
        include_shap_values (bool): Whether SHAP values should be included in the table. Default is False.
        metric (callable): The metric used to identify the best and worst points in the dataset. Function must accept
            the true labels and predicted value or probabilities as the only arguments and lower values
            must be better. By default, this will be the absolute error for regression problems and cross entropy loss
            for classification problems (looked up in DEFAULT_METRICS by the pipeline's problem type).
        output_format (str): One of "text", "dict", or "dataframe". Default is "text".

    Returns:
        str, dict, or pd.DataFrame - A report explaining the top contributing features for the best/worst predictions in the input_features.
            For each of the best/worst rows of input_features, the predicted values, true labels, metric value,
            feature names, prediction contribution, and SHAP Value (optional) will be listed.

    Raises:
        ValueError: if input_features has fewer than twice num_to_explain rows.
        ValueError: if y_true and input_features have mismatched lengths.
        ValueError: if an output_format outside of "text", "dict" or "dataframe" is provided.
        PipelineScoreError: if computing the predictions or the metric raises. The original exception is
            attached as the cause and also stored, with its formatted traceback, under the metric's name.
    """
    input_features = infer_feature_types(input_features)
    input_features = _convert_woodwork_types_wrapper(input_features.to_dataframe())
    y_true = infer_feature_types(y_true)
    y_true = _convert_woodwork_types_wrapper(y_true.to_series())

    # Need enough rows to pick num_to_explain best and num_to_explain worst points.
    if input_features.shape[0] < num_to_explain * 2:
        raise ValueError(
            f"Input features must be a dataframe with more than {num_to_explain * 2} rows! "
            "Convert to a dataframe and select a smaller value for num_to_explain if you do not have "
            "enough data.")
    if y_true.shape[0] != input_features.shape[0]:
        raise ValueError(
            "Parameters y_true and input_features must have the same number of data points. Received: "
            f"true labels: {y_true.shape[0]} and {input_features.shape[0]}")
    if output_format not in {"text", "dict", "dataframe"}:
        raise ValueError(
            f"Parameter output_format must be either text, dict, or dataframe. Received {output_format}"
        )
    if not metric:
        metric = DEFAULT_METRICS[pipeline.problem_type]

    try:
        if is_regression(pipeline.problem_type):
            # Time series pipelines also need the target to compute predictions.
            if is_time_series(pipeline.problem_type):
                y_pred = pipeline.predict(input_features, y=y_true).to_series()
            else:
                y_pred = pipeline.predict(input_features).to_series()
            y_pred_values = None
            y_true_no_nan, y_pred_no_nan = drop_rows_with_nans(y_true, y_pred)
            errors = metric(y_true_no_nan, y_pred_no_nan)
        else:
            # Classification: the metric is scored on probabilities, while the
            # predicted labels are kept separately for the report.
            if is_time_series(pipeline.problem_type):
                y_pred = pipeline.predict_proba(input_features, y=y_true).to_dataframe()
                y_pred_values = pipeline.predict(input_features, y=y_true).to_series()
            else:
                y_pred = pipeline.predict_proba(input_features).to_dataframe()
                y_pred_values = pipeline.predict(input_features).to_series()
            # The NaN-filtered predicted labels are not needed below; only the
            # filtered targets and probabilities feed the metric.
            y_true_no_nan, y_pred_no_nan, _ = drop_rows_with_nans(y_true, y_pred, y_pred_values)
            errors = metric(pipeline._encode_targets(y_true_no_nan), y_pred_no_nan)
    except Exception as e:
        tb = traceback.format_tb(sys.exc_info()[2])
        # Chain explicitly so the traceback shows the original failure as the direct cause.
        raise PipelineScoreError(exceptions={metric.__name__: (e, tb)}, scored_successfully={}) from e

    errors = pd.Series(errors, index=y_pred_no_nan.index)
    # Lower metric values are better, so ascending order puts the best points first.
    sorted_scores = errors.sort_values()
    best_indices = sorted_scores.index[:num_to_explain]
    worst_indices = sorted_scores.index[-num_to_explain:]
    index_list = best_indices.tolist() + worst_indices.tolist()

    pipeline_features = pipeline.compute_estimator_features(input_features, y_true).to_dataframe()

    data = _ReportData(pipeline, pipeline_features, input_features, y_true, y_pred, y_pred_values, errors,
                       index_list, metric)

    report_creator = _report_creator_factory(data,
                                             report_type="explain_predictions_best_worst",
                                             output_format=output_format,
                                             top_k_features=top_k_features,
                                             include_shap_values=include_shap_values,
                                             num_to_explain=num_to_explain)
    return report_creator(data)
def validate(self, X, y):
    """Checks if the target data contains missing or invalid values.

    The checks run in a fixed order and append to the result lists as they go:
    missing target, unsupported logical type, null values, binary class count,
    non-numeric regression target, multiclass class counts, non-positive values
    for positive-only objectives, and finally length/index agreement with X.
    A ``None`` target or a fully-null/empty target returns early.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Only its index is compared
            against the target's index; skipped when None.
        y (ww.DataColumn, pd.Series, np.ndarray): Target data to check for invalid values.

    Returns:
        dict: Dictionary with "warnings", "errors" and "actions" lists, holding
            DataCheckWarning, DataCheckError and DataCheckAction dicts respectively.

    Example:
        >>> import pandas as pd
        >>> X = pd.DataFrame({"col": [1, 2, 3, 1]})
        >>> y = pd.Series([0, 1, None, None])
        >>> target_check = InvalidTargetDataCheck('binary', 'Log Loss Binary')
        >>> assert target_check.validate(X, y) == {
        ...     "errors": [{"message": "2 row(s) (50.0%) of target values are null",
        ...                 "data_check_name": "InvalidTargetDataCheck",
        ...                 "level": "error",
        ...                 "code": "TARGET_HAS_NULL",
        ...                 "details": {"num_null_rows": 2, "pct_null_rows": 50}}],
        ...     "warnings": [],
        ...     "actions": [{'code': 'IMPUTE_COL', 'metadata': {'column': None, 'impute_strategy': 'most_frequent', 'is_target': True}}]}
    """
    results = {"warnings": [], "errors": [], "actions": []}

    # Nothing else can be checked without a target.
    if y is None:
        results["errors"].append(
            DataCheckError(
                message="Target is None",
                data_check_name=self.name,
                message_code=DataCheckMessageCode.TARGET_IS_NONE,
                details={}).to_dict())
        return results

    y = infer_feature_types(y)
    # Supported types are the numeric/boolean logical types plus Categorical.
    is_supported_type = y.logical_type in numeric_and_boolean_ww + [
        ww.logical_types.Categorical
    ]
    # An unsupported type is reported but does not stop the remaining checks.
    if not is_supported_type:
        results["errors"].append(
            DataCheckError(
                message=
                "Target is unsupported {} type. Valid Woodwork logical types include: {}"
                .format(
                    y.logical_type, ", ".join([
                        ltype.type_string
                        for ltype in numeric_and_boolean_ww
                    ])),
                data_check_name=self.name,
                message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                details={
                    "unsupported_type": y.logical_type.type_string
                }).to_dict())
    y_df = _convert_woodwork_types_wrapper(y.to_series())
    null_rows = y_df.isnull()
    # `.all()` is also True for an empty series, hence "empty or fully null".
    if null_rows.all():
        results["errors"].append(
            DataCheckError(message="Target is either empty or fully null.",
                           data_check_name=self.name,
                           message_code=DataCheckMessageCode.
                           TARGET_IS_EMPTY_OR_FULLY_NULL,
                           details={}).to_dict())
        return results
    elif null_rows.any():
        # Partially-null target: report the count/percentage and suggest imputing.
        num_null_rows = null_rows.sum()
        pct_null_rows = null_rows.mean() * 100
        results["errors"].append(
            DataCheckError(
                message="{} row(s) ({}%) of target values are null".format(
                    num_null_rows, pct_null_rows),
                data_check_name=self.name,
                message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                details={
                    "num_null_rows": num_null_rows,
                    "pct_null_rows": pct_null_rows
                }).to_dict())
        impute_strategy = "mean" if is_regression(
            self.problem_type) else "most_frequent"
        results["actions"].append(
            DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                            metadata={
                                "column": None,
                                "is_target": True,
                                "impute_strategy": impute_strategy
                            }).to_dict())

    # Distinct target values and how often each occurs.
    value_counts = y_df.value_counts()
    unique_values = value_counts.index.tolist()

    if is_binary(self.problem_type) and len(value_counts) != 2:
        # Optionally cap how many offending values are listed in the details.
        if self.n_unique is None:
            details = {"target_values": unique_values}
        else:
            details = {
                "target_values":
                unique_values[:min(self.n_unique, len(unique_values))]
            }
        results["errors"].append(
            DataCheckError(
                message=
                "Binary class targets require exactly two unique values.",
                data_check_name=self.name,
                message_code=DataCheckMessageCode.
                TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                details=details).to_dict())

    # Only plain REGRESSION is compared here (not time series regression).
    if self.problem_type == ProblemTypes.REGRESSION and "numeric" not in y.semantic_tags:
        results["errors"].append(
            DataCheckError(
                message=
                "Target data type should be numeric for regression type problems.",
                data_check_name=self.name,
                message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                details={}).to_dict())

    if is_multiclass(self.problem_type):
        # Classes that occur at most once.
        if value_counts.min() <= 1:
            least_populated = value_counts[value_counts <= 1]
            details = {
                "least_populated_class_labels":
                least_populated.index.tolist()
            }
            results["errors"].append(
                DataCheckError(
                    message=
                    "Target does not have at least two instances per class which is required for multiclass classification",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.
                    TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
                    details=details).to_dict())
        if len(unique_values) <= 2:
            details = {"num_classes": len(unique_values)}
            results["errors"].append(
                DataCheckError(
                    message=
                    "Target has two or less classes, which is too few for multiclass problems.  Consider changing to binary.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.
                    TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                    details=details).to_dict())

        # Ratio of distinct classes to total rows; a high ratio suggests a continuous target.
        num_class_to_num_value_ratio = len(unique_values) / len(y)
        if num_class_to_num_value_ratio >= self.multiclass_continuous_threshold:
            details = {
                "class_to_value_ratio": num_class_to_num_value_ratio
            }
            results["warnings"].append(
                DataCheckWarning(
                    message=
                    "Target has a large number of unique values, could be regression type problem.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.
                    TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                    details=details).to_dict())

    # Parsed as `(not (y_df > 0).all()) if <Integer or Double> else None`.
    # Despite the name, zero counts too: the test is "not strictly positive".
    any_neg = not (y_df > 0).all() if y.logical_type in [
        ww.logical_types.Integer, ww.logical_types.Double
    ] else None
    if any_neg and self.objective.positive_only:
        details = {
            "Count of offending values":
            sum(val <= 0 for val in y_df.values.flatten())
        }
        results["errors"].append(
            DataCheckError(
                message=
                f"Target has non-positive values which is not supported for {self.objective.name}",
                data_check_name=self.name,
                message_code=DataCheckMessageCode.
                TARGET_INCOMPATIBLE_OBJECTIVE,
                details=details).to_dict())

    if X is not None:
        X = infer_feature_types(X)
        X_index = list(X.to_dataframe().index)
        y_index = list(y_df.index)
        X_length = len(X_index)
        y_length = len(y_index)
        if X_length != y_length:
            results["warnings"].append(
                DataCheckWarning(
                    message=
                    "Input target and features have different lengths",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.MISMATCHED_LENGTHS,
                    details={
                        "features_length": X_length,
                        "target_length": y_length
                    }).to_dict())

        if X_index != y_index:
            # Same labels in a different order vs. genuinely different labels.
            if set(X_index) == set(y_index):
                results["warnings"].append(
                    DataCheckWarning(
                        message=
                        "Input target and features have mismatched indices order",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        MISMATCHED_INDICES_ORDER,
                        details={}).to_dict())
            else:
                # Report at most 10 differing labels from each side.
                index_diff_not_in_X = list(set(y_index) -
                                           set(X_index))[:10]
                index_diff_not_in_y = list(set(X_index) -
                                           set(y_index))[:10]
                results["warnings"].append(
                    DataCheckWarning(
                        message=
                        "Input target and features have mismatched indices",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        MISMATCHED_INDICES,
                        details={
                            "indices_not_in_features": index_diff_not_in_X,
                            "indices_not_in_target": index_diff_not_in_y
                        }).to_dict())

    return results
def _best_worst_predicted_values_section(data, regression, classification):
    """Build the predicted-values section maker that matches the pipeline's problem type.

    ``regression`` is used when the pipeline is a regression problem and
    ``classification`` otherwise; the chosen callable is invoked with the
    metric's name and the predicted values stored on ``data``.
    """
    if is_regression(data.pipeline.problem_type):
        section_maker = regression
    else:
        section_maker = classification
    metric_name = data.metric.__name__
    return section_maker(metric_name, data.y_pred_values)
def _compute_shap_values(pipeline, features, training_data=None):
    """Compute SHAP values for every feature using the pipeline's estimator.

    Tree-based estimators are explained with shap.TreeExplainer; all other
    estimators use shap.KernelExplainer on a sample of the training data.

    Arguments:
        pipeline (PipelineBase): Trained pipeline whose predictions we want to explain with SHAP.
        features (pd.DataFrame): Dataframe of features - needs to correspond to data the pipeline was fit on.
        training_data (pd.DataFrame): Training data the pipeline was fit on.
            For non-tree estimators, we need a sample of training data for the KernelSHAP algorithm.

    Returns:
        dict or list(dict): For regression problems, a dictionary mapping a feature name to a list of SHAP values.
            For classification problems, returns a list of dictionaries. One for each class.

    Raises:
        ValueError: if the estimator belongs to the baseline model family.
        ValueError: if the estimator is not tree-based and training_data is None.
        ValueError: if SHAP returns something other than a list or a numpy array.
    """
    estimator = pipeline.estimator
    family = estimator.model_family
    if family == ModelFamily.BASELINE:
        raise ValueError(
            "You passed in a baseline pipeline. These are simple enough that SHAP values are not needed."
        )

    feature_names = features.columns

    # This is to make sure all dtypes are numeric - SHAP algorithms will complain otherwise.
    # Sklearn components do this under-the-hood so we're not changing the data the model was trained on.
    # Catboost can naturally handle string-encoded categorical features so we don't need to convert to numeric.
    if family != ModelFamily.CATBOOST:
        features = check_array(features.values)

    if estimator.model_family.is_tree_estimator():
        # Use tree_path_dependent to avoid linear runtime with dataset size
        with warnings.catch_warnings(record=True) as caught:
            explainer = shap.TreeExplainer(estimator._component_obj,
                                           feature_perturbation="tree_path_dependent")
        if caught:
            logger.debug(f"_compute_shap_values TreeExplainer: {caught[0].message}")
        shap_values = explainer.shap_values(features, check_additivity=False)

        # shap only outputs values for positive class for Catboost/Xgboost binary estimators.
        # this modifies the output to match the output format of other binary estimators.
        # Ok to fill values of negative class with zeros since the negative class will get dropped
        # in the UI anyways.
        positive_class_only = estimator.model_family in {ModelFamily.CATBOOST, ModelFamily.XGBOOST}
        if positive_class_only and is_binary(pipeline.problem_type):
            shap_values = [np.zeros(shap_values.shape), shap_values]
    else:
        if training_data is None:
            raise ValueError(
                "You must pass in a value for parameter 'training_data' when the pipeline "
                "does not have a tree-based estimator. "
                f"Current estimator model family is {estimator.model_family}.")

        # More than 100 datapoints can negatively impact runtime according to SHAP
        # https://github.com/slundberg/shap/blob/master/shap/explainers/kernel.py#L114
        background = shap.sample(training_data, 100)
        background = check_array(background)

        if is_regression(pipeline.problem_type):
            link_function = "identity"
            decision_function = estimator._component_obj.predict
        else:
            link_function = "logit"
            decision_function = estimator._component_obj.predict_proba

        with warnings.catch_warnings(record=True) as caught:
            explainer = shap.KernelExplainer(decision_function, background, link_function)
            shap_values = explainer.shap_values(features)
        if caught:
            logger.debug(f"_compute_shap_values KernelExplainer: {caught[0].message}")

    # classification problem
    if isinstance(shap_values, list):
        return [
            _create_dictionary(class_shap_values, feature_names)
            for class_shap_values in shap_values
        ]
    # regression problem
    if isinstance(shap_values, np.ndarray):
        return _create_dictionary(shap_values, feature_names)
    raise ValueError(f"Unknown shap_values datatype {str(type(shap_values))}!")
def test_explain_predictions_best_worst_and_explain_predictions(mock_make_table, mock_default_metrics,
                                                                problem_type, output_format, answer,
                                                                explain_predictions_answer, custom_index):
    """Both explain_predictions and explain_predictions_best_worst produce the expected report.

    The pipeline, the SHAP table maker and the default metric are mocked, so
    only report assembly (including custom index labels) is exercised.
    """
    if output_format == "text":
        mock_make_table.return_value = "table goes here"
    elif output_format == "dataframe":
        shap_table = pd.DataFrame({
            "feature_names": [0],
            "feature_values": [0],
            "qualitative_explanation": [0],
            "quantitative_explanation": [0],
        })
        # Use side effect so that we always get a new copy of the dataframe
        mock_make_table.side_effect = lambda *args, **kwargs: shap_table.copy()
    else:
        mock_make_table.return_value = {"explanations": ["explanation_dictionary_goes_here"]}

    input_features = pd.DataFrame({"a": [3, 4]}, index=custom_index)
    pipeline = MagicMock()
    pipeline.parameters = "Parameters go here"
    pipeline.problem_type = problem_type
    pipeline.name = "Test Pipeline Name"
    pipeline.compute_estimator_features.return_value = ww.DataTable(input_features)

    def _add_custom_index(answer, index_best, index_worst, output_format):
        """Write the best/worst index labels into the expected answer."""
        if output_format == "text":
            return answer.format(index_0=index_best, index_1=index_worst)
        if output_format == "dataframe":
            col_name = "prefix" if "prefix" in answer.columns else "rank"
            n_repeats = answer[col_name].value_counts().tolist()[0]
            answer['index_id'] = [index_best] * n_repeats + [index_worst] * n_repeats
            return answer
        explanations = answer["explanations"]
        explanations[0]["predicted_values"]["index_id"] = index_best
        explanations[1]["predicted_values"]["index_id"] = index_worst
        return answer

    if is_regression(problem_type):
        abs_error_mock = MagicMock(__name__="abs_error")
        abs_error_mock.return_value = pd.Series([4., 1.], dtype="float64")
        mock_default_metrics.__getitem__.return_value = abs_error_mock
        pipeline.predict.return_value = ww.DataColumn(pd.Series([2, 1]))
        y_true = pd.Series([3, 2], index=custom_index)
        # The second row has the lower error, so it is the best point.
        best_position, worst_position = 1, 0
    elif is_binary(problem_type):
        pipeline.classes_.return_value = ["benign", "malignant"]
        cross_entropy_mock = MagicMock(__name__="cross_entropy")
        mock_default_metrics.__getitem__.return_value = cross_entropy_mock
        cross_entropy_mock.return_value = pd.Series([0.2, 0.78])
        pipeline.predict_proba.return_value = ww.DataTable(
            pd.DataFrame({"benign": [0.05, 0.1], "malignant": [0.95, 0.9]}))
        pipeline.predict.return_value = ww.DataColumn(pd.Series(["malignant"] * 2))
        y_true = pd.Series(["malignant", "benign"], index=custom_index)
        best_position, worst_position = 0, 1
    else:
        # Multiclass text output is formatted slightly different so need to account for that
        if output_format == "text":
            mock_make_table.return_value = multiclass_table
        pipeline.classes_.return_value = ["setosa", "versicolor", "virginica"]
        cross_entropy_mock = MagicMock(__name__="cross_entropy")
        mock_default_metrics.__getitem__.return_value = cross_entropy_mock
        cross_entropy_mock.return_value = pd.Series([0.15, 0.34])
        pipeline.predict_proba.return_value = ww.DataTable(
            pd.DataFrame({
                "setosa": [0.8, 0.2],
                "versicolor": [0.1, 0.75],
                "virginica": [0.1, 0.05],
            }))
        pipeline.predict.return_value = ww.DataColumn(pd.Series(["setosa", "versicolor"]))
        y_true = pd.Series(["setosa", "versicolor"], index=custom_index)
        best_position, worst_position = 0, 1

    answer = _add_custom_index(answer,
                               index_best=custom_index[best_position],
                               index_worst=custom_index[worst_position],
                               output_format=output_format)

    report = explain_predictions(pipeline, input_features, y=y_true, indices_to_explain=[0, 1],
                                 output_format=output_format)
    if output_format == "text":
        compare_two_tables(report.splitlines(), explain_predictions_answer.splitlines())
    elif output_format == "dataframe":
        assert report.columns.tolist() == explain_predictions_answer.columns.tolist()
        pd.testing.assert_frame_equal(report, explain_predictions_answer[report.columns])
    else:
        assert report == explain_predictions_answer

    best_worst_report = explain_predictions_best_worst(pipeline, input_features, y_true=y_true,
                                                       num_to_explain=1, output_format=output_format)
    if output_format == "text":
        compare_two_tables(best_worst_report.splitlines(), answer.splitlines())
    elif output_format == "dataframe":
        # Check dataframes equal without caring about column order
        report_columns = best_worst_report.columns
        assert sorted(report_columns.tolist()) == sorted(answer.columns.tolist())
        pd.testing.assert_frame_equal(best_worst_report, answer[report_columns])
    else:
        assert best_worst_report == answer