def test_invalid_target_data_action_for_data_with_null(problem_type):
    y = pd.Series([None, None, None, 0, 0, 0, 0, 0, 0, 0])
    X = pd.DataFrame({"col": range(len(y))})
    invalid_targets_check = InvalidTargetDataCheck(problem_type, get_default_primary_search_objective(problem_type))
    impute_strategy = "mean" if is_regression(problem_type) else "most_frequent"

    expected = {
        "warnings": [],
        "errors": [DataCheckError(message="3 row(s) (30.0%) of target values are null",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                  details={"num_null_rows": 3, "pct_null_rows": 30.0}).to_dict()],
        "actions": [DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                                    metadata={"column": None, "is_target": True,
                                              "impute_strategy": impute_strategy}).to_dict()]
    }
    if is_binary(problem_type):
        expected["errors"].append(DataCheckError(message="Binary class targets require exactly two unique values.",
                                                 data_check_name=invalid_targets_data_check_name,
                                                 message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                                                 details={"target_values": [0]}).to_dict())
    elif is_multiclass(problem_type):
        expected["errors"].append(DataCheckError(message="Target has two or less classes, which is too few for multiclass problems. Consider changing to binary.",
                                                 data_check_name=invalid_targets_data_check_name,
                                                 message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                                                 details={"num_classes": 1}).to_dict())
        expected["warnings"].append(DataCheckWarning(message="Target has a large number of unique values, could be regression type problem.",
                                                     data_check_name=invalid_targets_data_check_name,
                                                     message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                                                     details={"class_to_value_ratio": 0.1}).to_dict())

    messages = invalid_targets_check.validate(X, y)
    assert messages == expected
def _find_best_pipeline(self):
    """Finds the best pipeline in the rankings.

    If self._best_pipeline already exists, check to make sure it is different from the current
    best pipeline before training and thresholding.
    """
    if len(self.rankings) == 0:
        return
    best_pipeline = self.rankings.iloc[0]
    if not (self._best_pipeline and self._best_pipeline == self.get_pipeline(best_pipeline['id'])):
        self._best_pipeline = self.get_pipeline(best_pipeline['id'])
        if self._train_best_pipeline:
            X_threshold_tuning = None
            y_threshold_tuning = None
            X_train, y_train = self.X_train, self.y_train
            if (is_binary(self.problem_type) and self.objective.is_defined_for_problem_type(self.problem_type)
                    and self.optimize_thresholds and self.objective.can_optimize_threshold):
                X_train, X_threshold_tuning, y_train, y_threshold_tuning = split_data(
                    X_train, y_train, self.problem_type, test_size=0.2, random_seed=self.random_seed)
            self._best_pipeline.fit(X_train, y_train)
            tune_binary_threshold(self._best_pipeline, self.objective, self.problem_type,
                                  X_threshold_tuning, y_threshold_tuning)
def test_split_data(problem_type, data_type, X_y_binary, X_y_multi, X_y_regression, make_data_type):
    if is_binary(problem_type):
        X, y = X_y_binary
    if is_multiclass(problem_type):
        X, y = X_y_multi
    if is_regression(problem_type):
        X, y = X_y_regression
    problem_configuration = None
    if is_time_series(problem_type):
        problem_configuration = {'gap': 1, 'max_delay': 7}

    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)

    test_pct = 0.25
    X_train, X_test, y_train, y_test = split_data(X, y, test_size=test_pct, problem_type=problem_type,
                                                  problem_configuration=problem_configuration)
    test_size = len(X) * test_pct
    train_size = len(X) - test_size
    assert len(X_train) == train_size
    assert len(X_test) == test_size
    assert len(y_train) == train_size
    assert len(y_test) == test_size
    assert isinstance(X_train, ww.DataTable)
    assert isinstance(X_test, ww.DataTable)
    assert isinstance(y_train, ww.DataColumn)
    assert isinstance(y_test, ww.DataColumn)
def test_explain_predictions_best_worst_time_series(output_format, pipeline_class, estimator, ts_data):
    X, y = ts_data
    if is_binary(pipeline_class.problem_type):
        y = y % 2

    class TSPipeline(pipeline_class):
        component_graph = ["Delayed Feature Transformer", estimator]
        name = "time series pipeline"

    tspipeline = TSPipeline({"pipeline": {"gap": 1, "max_delay": 2}})
    tspipeline.fit(X, y)
    exp = explain_predictions_best_worst(pipeline=tspipeline, input_features=X, y_true=y,
                                         output_format=output_format)
    if output_format == "dict":
        # Check that the computed features to be explained aren't NaN.
        for exp_idx in range(len(exp["explanations"])):
            assert not np.isnan(np.array(exp["explanations"][exp_idx]["explanations"][0]["feature_values"])).any()
def test_explain_predictions_stacked_ensemble(problem_type, dummy_stacked_ensemble_binary_estimator,
                                              dummy_stacked_ensemble_multiclass_estimator,
                                              dummy_stacked_ensemble_regressor_estimator,
                                              X_y_binary, X_y_multi, X_y_regression):
    if is_binary(problem_type):
        X, y = X_y_binary
        pipeline = dummy_stacked_ensemble_binary_estimator
    elif is_multiclass(problem_type):
        X, y = X_y_multi
        pipeline = dummy_stacked_ensemble_multiclass_estimator
    else:
        X, y = X_y_regression
        pipeline = dummy_stacked_ensemble_regressor_estimator

    with pytest.raises(ValueError, match="Cannot explain predictions for a stacked ensemble pipeline"):
        explain_predictions(pipeline, X, y, indices_to_explain=[0])

    with pytest.raises(ValueError, match="Cannot explain predictions for a stacked ensemble pipeline"):
        explain_predictions_best_worst(pipeline, X, y)
def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi, X_y_regression, ts_data, helper_functions):
    for estimator_class in _all_estimators_used_in_search():
        if estimator_class.__name__ == 'ARIMARegressor':
            continue
        supported_problem_types = [handle_problem_types(pt) for pt in estimator_class.supported_problem_types]
        for problem_type in supported_problem_types:
            clf = helper_functions.safe_init_component_with_njobs_1(estimator_class)
            if is_binary(problem_type):
                X, y = X_y_binary
            elif is_multiclass(problem_type):
                X, y = X_y_multi
            elif is_regression(problem_type):
                X, y = X_y_regression

            X = get_random_state(clf.random_seed).random((X.shape[0], len(string.printable)))
            col_names = ['column_{}'.format(ascii_char) for ascii_char in string.printable]
            X = pd.DataFrame(X, columns=col_names)
            assert clf.input_feature_names is None
            clf.fit(X, y)
            assert len(clf.feature_importance) == len(X.columns)
            assert not np.isnan(clf.feature_importance).all().all()
            predictions = clf.predict(X).to_series()
            assert len(predictions) == len(y)
            assert not np.isnan(predictions).all()
            assert clf.input_feature_names == col_names
def can_tune_threshold_with_objective(self, objective):
    """Determine whether the threshold of this binary classification pipeline can be tuned.

    Arguments:
        objective (ObjectiveBase): Primary AutoMLSearch objective.

    Returns:
        bool: True if the pipeline threshold can be tuned.
    """
    return (objective.is_defined_for_problem_type(self.problem_type)
            and objective.can_optimize_threshold and is_binary(self.problem_type))
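# A minimal eligibility sketch (not from the source), assuming evalml-style
# objective classes where F1 supports threshold optimization and Log Loss Binary
# does not. `pipeline` is a hypothetical fitted binary classification pipeline.
from evalml.objectives import F1, LogLossBinary

def _example_threshold_eligibility(pipeline):
    # F1 is defined for binary problems and can optimize a threshold ...
    tunable = pipeline.can_tune_threshold_with_objective(F1())
    # ... while Log Loss Binary is defined for binary problems but is not threshold-tunable.
    not_tunable = pipeline.can_tune_threshold_with_objective(LogLossBinary())
    return tunable, not_tunable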
def tune_binary_threshold(pipeline, objective, problem_type, X_threshold_tuning, y_threshold_tuning):
    """Tunes the threshold of a binary pipeline to the X and y thresholding data.

    Arguments:
        pipeline (Pipeline): Pipeline instance to threshold.
        objective (ObjectiveBase): The objective to optimize the threshold against.
        problem_type (ProblemType): The problem type of the pipeline.
        X_threshold_tuning (ww.DataTable): Features to tune pipeline to.
        y_threshold_tuning (ww.DataColumn): Target data to tune pipeline to.
    """
    if is_binary(problem_type) and objective.is_defined_for_problem_type(problem_type) and objective.can_optimize_threshold:
        pipeline.threshold = 0.5
        if X_threshold_tuning:
            y_predict_proba = pipeline.predict_proba(X_threshold_tuning)
            y_predict_proba = y_predict_proba.iloc[:, 1]
            pipeline.threshold = objective.optimize_threshold(y_predict_proba, y_threshold_tuning, X=X_threshold_tuning)
def test_type_checks(problem_type):
    assert is_regression(problem_type) == (problem_type in [ProblemTypes.REGRESSION,
                                                            ProblemTypes.TIME_SERIES_REGRESSION])
    assert is_binary(problem_type) == (problem_type in [ProblemTypes.BINARY,
                                                        ProblemTypes.TIME_SERIES_BINARY])
    assert is_multiclass(problem_type) == (problem_type in [ProblemTypes.MULTICLASS,
                                                            ProblemTypes.TIME_SERIES_MULTICLASS])
    assert is_classification(problem_type) == (problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS,
                                                                ProblemTypes.TIME_SERIES_BINARY,
                                                                ProblemTypes.TIME_SERIES_MULTICLASS])
    assert is_time_series(problem_type) == (problem_type in [ProblemTypes.TIME_SERIES_BINARY,
                                                             ProblemTypes.TIME_SERIES_MULTICLASS,
                                                             ProblemTypes.TIME_SERIES_REGRESSION])
def tune_binary_threshold(pipeline, objective, problem_type, X_threshold_tuning, y_threshold_tuning):
    """Tunes the threshold of a binary pipeline to the X and y thresholding data.

    Arguments:
        pipeline (Pipeline): Pipeline instance to threshold.
        objective (ObjectiveBase): The objective we want to tune with. If not tuneable and best_pipeline is True, will use F1.
        problem_type (ProblemType): The problem type of the pipeline.
        X_threshold_tuning (ww.DataTable): Features to tune pipeline to.
        y_threshold_tuning (ww.DataColumn): Target data to tune pipeline to.
    """
    if is_binary(problem_type) and objective.is_defined_for_problem_type(problem_type) and objective.can_optimize_threshold:
        pipeline.threshold = 0.5
        if X_threshold_tuning:
            y_predict_proba = pipeline.predict_proba(X_threshold_tuning)
            y_predict_proba = y_predict_proba.iloc[:, 1]
            pipeline.optimize_threshold(X_threshold_tuning, y_threshold_tuning, y_predict_proba, objective)
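# A minimal end-to-end sketch mirroring _find_best_pipeline above, assuming a
# binary pipeline `pipeline` and in-memory data `X`, `y` (all hypothetical):
# hold out 20% of the training data for threshold tuning, fit on the rest, then tune.
from evalml.objectives import F1

def _example_tune_threshold(pipeline, X, y):
    X_train, X_tune, y_train, y_tune = split_data(X, y, problem_type="binary",
                                                  test_size=0.2, random_seed=0)
    pipeline.fit(X_train, y_train)
    # Sets pipeline.threshold to 0.5 first, then optimizes it against F1.
    tune_binary_threshold(pipeline, F1(), "binary", X_tune, y_tune)
    return pipeline.threshold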
def train_and_score_pipeline(pipeline, automl, full_X_train, full_y_train):
    """Given a pipeline, config and data, train and score the pipeline and return the CV or TV scores.

    Arguments:
        pipeline (PipelineBase): The pipeline to score.
        automl (AutoMLSearch): The AutoML search, used to access config and for the error callback.
        full_X_train (ww.DataTable): Training features.
        full_y_train (ww.DataColumn): Training target.

    Returns:
        dict: A dict containing cv_score_mean, cv_scores, training_time and a cv_data structure with details.
    """
    start = time.time()
    cv_data = []
    logger.info("\tStarting cross validation")
    X_pd = _convert_woodwork_types_wrapper(full_X_train.to_dataframe())
    y_pd = _convert_woodwork_types_wrapper(full_y_train.to_series())
    y_pd_encoded = y_pd
    # Encode target for classification problems so that we can support float targets.
    # This is okay because we only use split to get the indices to split on.
    if is_classification(automl.problem_type):
        y_mapping = {original_target: encoded_target for (encoded_target, original_target)
                     in enumerate(y_pd.value_counts().index)}
        y_pd_encoded = y_pd.map(y_mapping)
    for i, (train, valid) in enumerate(automl.data_splitter.split(X_pd, y_pd_encoded)):
        if pipeline.model_family == ModelFamily.ENSEMBLE and i > 0:
            # Stacked ensembles do CV internally, so we do not run CV here for performance reasons.
            logger.debug(f"Skipping fold {i} because CV for stacked ensembles is not supported.")
            break
        logger.debug(f"\t\tTraining and scoring on fold {i}")
        X_train, X_valid = full_X_train.iloc[train], full_X_train.iloc[valid]
        y_train, y_valid = full_y_train.iloc[train], full_y_train.iloc[valid]
        if is_binary(automl.problem_type) or is_multiclass(automl.problem_type):
            diff_train = set(np.setdiff1d(full_y_train.to_series(), y_train.to_series()))
            diff_valid = set(np.setdiff1d(full_y_train.to_series(), y_valid.to_series()))
            diff_string = f"Missing target values in the training set after data split: {diff_train}. " if diff_train else ""
            diff_string += f"Missing target values in the validation set after data split: {diff_valid}." if diff_valid else ""
            if diff_string:
                raise Exception(diff_string)
        objectives_to_score = [automl.objective] + automl.additional_objectives
        cv_pipeline = None
        try:
            logger.debug(f"\t\t\tFold {i}: starting training")
            cv_pipeline = EngineBase.train_pipeline(pipeline, X_train, y_train,
                                                    automl.optimize_thresholds, automl.objective)
            logger.debug(f"\t\t\tFold {i}: finished training")
            if (automl.optimize_thresholds and pipeline.can_tune_threshold_with_objective(automl.objective)
                    and automl.objective.can_optimize_threshold):
                logger.debug(f"\t\t\tFold {i}: Optimal threshold found ({cv_pipeline.threshold:.3f})")
            logger.debug(f"\t\t\tFold {i}: Scoring trained pipeline")
            scores = cv_pipeline.score(X_valid, y_valid, objectives=objectives_to_score)
            logger.debug(f"\t\t\tFold {i}: {automl.objective.name} score: {scores[automl.objective.name]:.3f}")
            score = scores[automl.objective.name]
        except Exception as e:
            if automl.error_callback is not None:
                automl.error_callback(exception=e, traceback=traceback.format_tb(sys.exc_info()[2]),
                                      automl=automl, fold_num=i, pipeline=pipeline)
            if isinstance(e, PipelineScoreError):
                nan_scores = {objective: np.nan for objective in e.exceptions}
                scores = {**nan_scores, **e.scored_successfully}
                scores = OrderedDict({o.name: scores[o.name]
                                      for o in [automl.objective] + automl.additional_objectives})
                score = scores[automl.objective.name]
            else:
                score = np.nan
                scores = OrderedDict(zip([n.name for n in automl.additional_objectives],
                                         [np.nan] * len(automl.additional_objectives)))

        ordered_scores = OrderedDict()
        ordered_scores.update({automl.objective.name: score})
        ordered_scores.update(scores)
        ordered_scores.update({"# Training": y_train.shape[0]})
        ordered_scores.update({"# Validation": y_valid.shape[0]})

        evaluation_entry = {"all_objective_scores": ordered_scores, "score": score,
                            'binary_classification_threshold': None}
        if is_binary(automl.problem_type) and cv_pipeline is not None and cv_pipeline.threshold is not None:
            evaluation_entry['binary_classification_threshold'] = cv_pipeline.threshold
        cv_data.append(evaluation_entry)

    training_time = time.time() - start
    cv_scores = pd.Series([fold['score'] for fold in cv_data])
    cv_score_mean = cv_scores.mean()
    logger.info(f"\tFinished cross validation - mean {automl.objective.name}: {cv_score_mean:.3f}")
    return {'cv_data': cv_data, 'training_time': training_time,
            'cv_scores': cv_scores, 'cv_score_mean': cv_score_mean}
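# Hedged sketch of the structure train_and_score_pipeline returns (values are
# illustrative, not real results): one cv_data entry per completed fold.
# {
#     'cv_data': [
#         {'all_objective_scores': OrderedDict([('Log Loss Binary', 0.42), ...,
#                                               ('# Training', 66), ('# Validation', 34)]),
#          'score': 0.42,
#          'binary_classification_threshold': 0.48},
#         ...
#     ],
#     'training_time': 12.3,          # seconds, wall clock
#     'cv_scores': pd.Series([...]),  # one score per fold
#     'cv_score_mean': 0.44,
# }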
def validate(self, X, y):
    """Checks if the target data contains missing or invalid values.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Ignored.
        y (ww.DataColumn, pd.Series, np.ndarray): Target data to check for invalid values.

    Returns:
        dict: Dictionary with DataCheckErrors if any invalid values are found in the target data.

    Example:
        >>> import pandas as pd
        >>> X = pd.DataFrame({"col": [1, 2, 3, 1]})
        >>> y = pd.Series([0, 1, None, None])
        >>> target_check = InvalidTargetDataCheck('binary', 'Log Loss Binary')
        >>> assert target_check.validate(X, y) == {"errors": [{"message": "2 row(s) (50.0%) of target values are null",\
                                                               "data_check_name": "InvalidTargetDataCheck",\
                                                               "level": "error",\
                                                               "code": "TARGET_HAS_NULL",\
                                                               "details": {"num_null_rows": 2, "pct_null_rows": 50}}],\
                                                   "warnings": [],\
                                                   "actions": [{'code': 'IMPUTE_COL', 'metadata': {'column': None, 'impute_strategy': 'most_frequent', 'is_target': True}}]}
    """
    results = {"warnings": [], "errors": [], "actions": []}

    if y is None:
        results["errors"].append(DataCheckError(message="Target is None",
                                                data_check_name=self.name,
                                                message_code=DataCheckMessageCode.TARGET_IS_NONE,
                                                details={}).to_dict())
        return results

    y = infer_feature_types(y)
    is_supported_type = y.logical_type in numeric_and_boolean_ww + [ww.logical_types.Categorical]
    if not is_supported_type:
        results["errors"].append(DataCheckError(message="Target is unsupported {} type. Valid Woodwork logical types include: {}"
                                                .format(y.logical_type,
                                                        ", ".join([ltype.type_string for ltype in numeric_and_boolean_ww])),
                                                data_check_name=self.name,
                                                message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                                                details={"unsupported_type": y.logical_type.type_string}).to_dict())

    y_df = _convert_woodwork_types_wrapper(y.to_series())
    null_rows = y_df.isnull()
    if null_rows.all():
        results["errors"].append(DataCheckError(message="Target is either empty or fully null.",
                                                data_check_name=self.name,
                                                message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
                                                details={}).to_dict())
        return results
    elif null_rows.any():
        num_null_rows = null_rows.sum()
        pct_null_rows = null_rows.mean() * 100
        results["errors"].append(DataCheckError(message="{} row(s) ({}%) of target values are null".format(num_null_rows, pct_null_rows),
                                                data_check_name=self.name,
                                                message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                                details={"num_null_rows": num_null_rows,
                                                         "pct_null_rows": pct_null_rows}).to_dict())
        impute_strategy = "mean" if is_regression(self.problem_type) else "most_frequent"
        results["actions"].append(DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                                                  metadata={"column": None, "is_target": True,
                                                            "impute_strategy": impute_strategy}).to_dict())

    value_counts = y_df.value_counts()
    unique_values = value_counts.index.tolist()

    if is_binary(self.problem_type) and len(value_counts) != 2:
        if self.n_unique is None:
            details = {"target_values": unique_values}
        else:
            details = {"target_values": unique_values[:min(self.n_unique, len(unique_values))]}
        results["errors"].append(DataCheckError(message="Binary class targets require exactly two unique values.",
                                                data_check_name=self.name,
                                                message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                                                details=details).to_dict())

    if self.problem_type == ProblemTypes.REGRESSION and "numeric" not in y.semantic_tags:
        results["errors"].append(DataCheckError(message="Target data type should be numeric for regression type problems.",
                                                data_check_name=self.name,
                                                message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                                                details={}).to_dict())

    if is_multiclass(self.problem_type):
        if value_counts.min() <= 1:
            least_populated = value_counts[value_counts <= 1]
            details = {"least_populated_class_labels": least_populated.index.tolist()}
            results["errors"].append(DataCheckError(message="Target does not have at least two instances per class which is required for multiclass classification",
                                                    data_check_name=self.name,
                                                    message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
                                                    details=details).to_dict())
        if len(unique_values) <= 2:
            details = {"num_classes": len(unique_values)}
            results["errors"].append(DataCheckError(message="Target has two or less classes, which is too few for multiclass problems. Consider changing to binary.",
                                                    data_check_name=self.name,
                                                    message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                                                    details=details).to_dict())

        num_class_to_num_value_ratio = len(unique_values) / len(y)
        if num_class_to_num_value_ratio >= self.multiclass_continuous_threshold:
            details = {"class_to_value_ratio": num_class_to_num_value_ratio}
            results["warnings"].append(DataCheckWarning(message="Target has a large number of unique values, could be regression type problem.",
                                                        data_check_name=self.name,
                                                        message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                                                        details=details).to_dict())

    any_neg = not (y_df > 0).all() if y.logical_type in [ww.logical_types.Integer, ww.logical_types.Double] else None
    if any_neg and self.objective.positive_only:
        details = {"Count of offending values": sum(val <= 0 for val in y_df.values.flatten())}
        results["errors"].append(DataCheckError(message=f"Target has non-positive values which is not supported for {self.objective.name}",
                                                data_check_name=self.name,
                                                message_code=DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE,
                                                details=details).to_dict())

    if X is not None:
        X = infer_feature_types(X)
        X_index = list(X.to_dataframe().index)
        y_index = list(y_df.index)
        X_length = len(X_index)
        y_length = len(y_index)
        if X_length != y_length:
            results["warnings"].append(DataCheckWarning(message="Input target and features have different lengths",
                                                        data_check_name=self.name,
                                                        message_code=DataCheckMessageCode.MISMATCHED_LENGTHS,
                                                        details={"features_length": X_length,
                                                                 "target_length": y_length}).to_dict())

        if X_index != y_index:
            if set(X_index) == set(y_index):
                results["warnings"].append(DataCheckWarning(message="Input target and features have mismatched indices order",
                                                            data_check_name=self.name,
                                                            message_code=DataCheckMessageCode.MISMATCHED_INDICES_ORDER,
                                                            details={}).to_dict())
            else:
                index_diff_not_in_X = list(set(y_index) - set(X_index))[:10]
                index_diff_not_in_y = list(set(X_index) - set(y_index))[:10]
                results["warnings"].append(DataCheckWarning(message="Input target and features have mismatched indices",
                                                            data_check_name=self.name,
                                                            message_code=DataCheckMessageCode.MISMATCHED_INDICES,
                                                            details={"indices_not_in_features": index_diff_not_in_X,
                                                                     "indices_not_in_target": index_diff_not_in_y}).to_dict())

    return results
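# A hedged sketch of consuming the "actions" emitted by validate above; the
# manual fillna mapping is illustrative only and is not the library's
# action-application API. Assumes pandas inputs like the docstring example.
def _example_apply_impute_action(X, y, problem_type, objective):
    result = InvalidTargetDataCheck(problem_type, objective).validate(X, y)
    for action in result["actions"]:
        if action["code"] == "IMPUTE_COL" and action["metadata"]["is_target"]:
            strategy = action["metadata"]["impute_strategy"]
            y = y.fillna(y.mean()) if strategy == "mean" else y.fillna(y.mode()[0])
    return y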
def _compute_shap_values(pipeline, features, training_data=None):
    """Computes SHAP values for each feature.

    Arguments:
        pipeline (PipelineBase): Trained pipeline whose predictions we want to explain with SHAP.
        features (pd.DataFrame): Dataframe of features - needs to correspond to data the pipeline was fit on.
        training_data (pd.DataFrame): Training data the pipeline was fit on. For non-tree estimators,
            we need a sample of training data for the KernelSHAP algorithm.

    Returns:
        dict or list(dict): For regression problems, a dictionary mapping a feature name to a list of SHAP values.
            For classification problems, returns a list of dictionaries. One for each class.
    """
    estimator = pipeline.estimator
    if estimator.model_family == ModelFamily.BASELINE:
        raise ValueError("You passed in a baseline pipeline. These are simple enough that SHAP values are not needed.")

    feature_names = features.columns

    # This is to make sure all dtypes are numeric - SHAP algorithms will complain otherwise.
    # Sklearn components do this under-the-hood so we're not changing the data the model was trained on.
    # Catboost can naturally handle string-encoded categorical features so we don't need to convert to numeric.
    if estimator.model_family != ModelFamily.CATBOOST:
        features = check_array(features.values)

    if estimator.model_family.is_tree_estimator():
        # Use tree_path_dependent to avoid linear runtime with dataset size
        with warnings.catch_warnings(record=True) as ws:
            explainer = shap.TreeExplainer(estimator._component_obj,
                                           feature_perturbation="tree_path_dependent")
        if ws:
            logger.debug(f"_compute_shap_values TreeExplainer: {ws[0].message}")
        shap_values = explainer.shap_values(features, check_additivity=False)
        # shap only outputs values for positive class for Catboost/Xgboost binary estimators.
        # this modifies the output to match the output format of other binary estimators.
        # Ok to fill values of negative class with zeros since the negative class will get dropped
        # in the UI anyways.
        if estimator.model_family in {ModelFamily.CATBOOST, ModelFamily.XGBOOST} and is_binary(pipeline.problem_type):
            shap_values = [np.zeros(shap_values.shape), shap_values]
    else:
        if training_data is None:
            raise ValueError("You must pass in a value for parameter 'training_data' when the pipeline "
                             "does not have a tree-based estimator. "
                             f"Current estimator model family is {estimator.model_family}.")

        # More than 100 datapoints can negatively impact runtime according to SHAP
        # https://github.com/slundberg/shap/blob/master/shap/explainers/kernel.py#L114
        sampled_training_data_features = shap.sample(training_data, 100)
        sampled_training_data_features = check_array(sampled_training_data_features)
        if is_regression(pipeline.problem_type):
            link_function = "identity"
            decision_function = estimator._component_obj.predict
        else:
            link_function = "logit"
            decision_function = estimator._component_obj.predict_proba
        with warnings.catch_warnings(record=True) as ws:
            explainer = shap.KernelExplainer(decision_function, sampled_training_data_features, link_function)
            shap_values = explainer.shap_values(features)
        if ws:
            logger.debug(f"_compute_shap_values KernelExplainer: {ws[0].message}")

    # classification problem
    if isinstance(shap_values, list):
        mappings = []
        for class_shap_values in shap_values:
            mappings.append(_create_dictionary(class_shap_values, feature_names))
        return mappings
    # regression problem
    elif isinstance(shap_values, np.ndarray):
        return _create_dictionary(shap_values, feature_names)
    else:
        raise ValueError(f"Unknown shap_values datatype {str(type(shap_values))}!")
def test_explain_predictions_best_worst_and_explain_predictions(mock_make_table, mock_default_metrics,
                                                                problem_type, output_format, answer,
                                                                explain_predictions_answer, custom_index):
    if output_format == "text":
        mock_make_table.return_value = "table goes here"
    elif output_format == "dataframe":
        shap_table = pd.DataFrame({
            "feature_names": [0],
            "feature_values": [0],
            "qualitative_explanation": [0],
            "quantitative_explanation": [0],
        })
        # Use side effect so that we always get a new copy of the dataframe
        mock_make_table.side_effect = lambda *args, **kwargs: shap_table.copy()
    else:
        mock_make_table.return_value = {"explanations": ["explanation_dictionary_goes_here"]}

    pipeline = MagicMock()
    pipeline.parameters = "Parameters go here"
    input_features = pd.DataFrame({"a": [3, 4]}, index=custom_index)
    pipeline.problem_type = problem_type
    pipeline.name = "Test Pipeline Name"
    pipeline.compute_estimator_features.return_value = ww.DataTable(input_features)

    def _add_custom_index(answer, index_best, index_worst, output_format):
        if output_format == "text":
            answer = answer.format(index_0=index_best, index_1=index_worst)
        elif output_format == "dataframe":
            col_name = "prefix" if "prefix" in answer.columns else "rank"
            n_repeats = answer[col_name].value_counts().tolist()[0]
            answer['index_id'] = [index_best] * n_repeats + [index_worst] * n_repeats
        else:
            answer["explanations"][0]["predicted_values"]["index_id"] = index_best
            answer["explanations"][1]["predicted_values"]["index_id"] = index_worst
        return answer

    if is_regression(problem_type):
        abs_error_mock = MagicMock(__name__="abs_error")
        abs_error_mock.return_value = pd.Series([4., 1.], dtype="float64")
        mock_default_metrics.__getitem__.return_value = abs_error_mock
        pipeline.predict.return_value = ww.DataColumn(pd.Series([2, 1]))
        y_true = pd.Series([3, 2], index=custom_index)
        answer = _add_custom_index(answer, index_best=custom_index[1],
                                   index_worst=custom_index[0], output_format=output_format)
    elif is_binary(problem_type):
        pipeline.classes_.return_value = ["benign", "malignant"]
        cross_entropy_mock = MagicMock(__name__="cross_entropy")
        mock_default_metrics.__getitem__.return_value = cross_entropy_mock
        cross_entropy_mock.return_value = pd.Series([0.2, 0.78])
        pipeline.predict_proba.return_value = ww.DataTable(pd.DataFrame({"benign": [0.05, 0.1],
                                                                         "malignant": [0.95, 0.9]}))
        pipeline.predict.return_value = ww.DataColumn(pd.Series(["malignant"] * 2))
        y_true = pd.Series(["malignant", "benign"], index=custom_index)
        answer = _add_custom_index(answer, index_best=custom_index[0],
                                   index_worst=custom_index[1], output_format=output_format)
    else:
        # Multiclass text output is formatted slightly different so need to account for that
        if output_format == "text":
            mock_make_table.return_value = multiclass_table
        pipeline.classes_.return_value = ["setosa", "versicolor", "virginica"]
        cross_entropy_mock = MagicMock(__name__="cross_entropy")
        mock_default_metrics.__getitem__.return_value = cross_entropy_mock
        cross_entropy_mock.return_value = pd.Series([0.15, 0.34])
        pipeline.predict_proba.return_value = ww.DataTable(pd.DataFrame({"setosa": [0.8, 0.2],
                                                                         "versicolor": [0.1, 0.75],
                                                                         "virginica": [0.1, 0.05]}))
        pipeline.predict.return_value = ww.DataColumn(pd.Series(["setosa", "versicolor"]))
        y_true = pd.Series(["setosa", "versicolor"], index=custom_index)
        answer = _add_custom_index(answer, index_best=custom_index[0],
                                   index_worst=custom_index[1], output_format=output_format)

    report = explain_predictions(pipeline, input_features, y=y_true, indices_to_explain=[0, 1],
                                 output_format=output_format)
    if output_format == "text":
        compare_two_tables(report.splitlines(), explain_predictions_answer.splitlines())
    elif output_format == "dataframe":
        assert report.columns.tolist() == explain_predictions_answer.columns.tolist()
        pd.testing.assert_frame_equal(report, explain_predictions_answer[report.columns])
    else:
        assert report == explain_predictions_answer

    best_worst_report = explain_predictions_best_worst(pipeline, input_features, y_true=y_true,
                                                       num_to_explain=1, output_format=output_format)
    if output_format == "text":
        compare_two_tables(best_worst_report.splitlines(), answer.splitlines())
    elif output_format == "dataframe":
        # Check dataframes equal without caring about column order
        assert sorted(best_worst_report.columns.tolist()) == sorted(answer.columns.tolist())
        pd.testing.assert_frame_equal(best_worst_report, answer[best_worst_report.columns])
    else:
        assert best_worst_report == answer