def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi, X_y_regression, helper_functions):
    """Fit every search estimator on data whose column names span printable ASCII.

    For each estimator returned by ``_all_estimators_used_in_search`` and each
    problem type it declares, a random feature matrix is built with one column
    per character of ``string.printable`` (named ``column_<char>``). After
    fitting, the test checks that the feature importances and the predictions
    have the expected lengths and are not made up entirely of NaN values.
    """
    # One column per printable character; the names never change between runs.
    ascii_columns = list(map('column_{}'.format, string.printable))
    n_columns = len(string.printable)

    # Checked in order; the first matching problem type supplies the data.
    datasets_by_problem_type = (
        (ProblemTypes.BINARY, X_y_binary),
        (ProblemTypes.MULTICLASS, X_y_multi),
        (ProblemTypes.REGRESSION, X_y_regression),
    )

    for estimator_class in _all_estimators_used_in_search():
        declared_types = list(map(handle_problem_types, estimator_class.supported_problem_types))
        for problem_type in declared_types:
            clf = helper_functions.safe_init_component_with_njobs_1(estimator_class)

            # NOTE: when nothing matches, X and y keep whatever they held from
            # the previous iteration (or are unbound on the very first one).
            for candidate_type, dataset in datasets_by_problem_type:
                if problem_type == candidate_type:
                    X, y = dataset
                    break

            # Only the row count of the fixture is reused; values are random.
            X = pd.DataFrame(
                clf.random_state.random((X.shape[0], n_columns)),
                columns=ascii_columns,
            )

            clf.fit(X, y)
            assert len(clf.feature_importance) == len(X.columns)
            assert not np.isnan(clf.feature_importance).all().all()

            predictions = clf.predict(X)
            assert len(predictions) == len(y)
            assert not np.isnan(predictions).all()
def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi, X_y_regression, ts_data, helper_functions):
    """Fit every search estimator on data whose column names span printable ASCII.

    ``ARIMARegressor`` is skipped by class name. For every other estimator and
    each problem type it declares, a random feature matrix is built with one
    column per character of ``string.printable`` (named ``column_<char>``).
    The test checks that ``input_feature_names`` is ``None`` before fitting
    and equal to the column names afterwards, and that feature importances and
    predictions have the expected lengths and are not entirely NaN.
    """
    # One column per printable character; the names never change between runs.
    ascii_columns = list(map('column_{}'.format, string.printable))
    n_columns = len(string.printable)

    # Checked in order; the first predicate that accepts the problem type wins.
    datasets_by_predicate = (
        (is_binary, X_y_binary),
        (is_multiclass, X_y_multi),
        (is_regression, X_y_regression),
    )

    for estimator_class in _all_estimators_used_in_search():
        if estimator_class.__name__ == 'ARIMARegressor':
            continue

        declared_types = list(map(handle_problem_types, estimator_class.supported_problem_types))
        for problem_type in declared_types:
            clf = helper_functions.safe_init_component_with_njobs_1(estimator_class)

            # NOTE: when no predicate matches, X and y keep whatever they held
            # from the previous iteration (or are unbound on the first one).
            for accepts, dataset in datasets_by_predicate:
                if accepts(problem_type):
                    X, y = dataset
                    break

            # Only the row count of the fixture is reused; values are random.
            X = pd.DataFrame(
                get_random_state(clf.random_seed).random((X.shape[0], n_columns)),
                columns=ascii_columns,
            )

            assert clf.input_feature_names is None
            clf.fit(X, y)
            assert len(clf.feature_importance) == len(X.columns)
            assert not np.isnan(clf.feature_importance).all().all()

            predictions = clf.predict(X).to_series()
            assert len(predictions) == len(y)
            assert not np.isnan(predictions).all()
            assert (clf.input_feature_names == ascii_columns)
def test_binary_classification_estimators_predict_proba_col_order(helper_functions):
    """Check the column order of ``predict_proba`` for binary estimators.

    The single feature is -1 for the first 100 rows and 1 for the last 100,
    while the target is 0 and then 1 for the same rows. For every search
    estimator that lists ``ProblemTypes.BINARY``, the rounded probabilities
    must equal ``[1 - target, target]`` column-wise.
    """
    target = np.repeat([0.0, 1.0], 100)
    X = pd.DataFrame({'input': np.repeat([-1, 1], 100)})
    y = pd.Series(target)

    # The expected probabilities do not depend on the estimator under test.
    expected = np.column_stack([1 - target, target])

    for estimator_class in _all_estimators_used_in_search():
        declared_types = list(map(handle_problem_types, estimator_class.supported_problem_types))
        if ProblemTypes.BINARY not in declared_types:
            continue

        estimator = helper_functions.safe_init_component_with_njobs_1(estimator_class)
        estimator.fit(X, y)
        predicted_proba = estimator.predict_proba(X).to_dataframe()
        np.testing.assert_allclose(expected, np.round(predicted_proba).values)
def test_estimator_predict_output_type(X_y_binary, helper_functions):
    """Check the output types of ``predict`` / ``predict_proba`` across input types.

    Each search estimator is fit on four input combinations (numpy/numpy,
    numpy/list, unnamed DataFrame/Series, named DataFrame/Series). ``predict``
    must return an unnamed ``ww.DataColumn`` as long as ``y``. For components
    whose ``supported_problem_types`` contain ``ProblemTypes.BINARY`` or
    ``ProblemTypes.MULTICLASS``, ``predict_proba`` must return a
    ``ww.DataTable`` with one row per sample and one column per unique target
    value, with columns matching those unique values.
    """
    X_np, y_np = X_y_binary
    assert isinstance(X_np, np.ndarray)
    assert isinstance(y_np, np.ndarray)

    n_features = X_np.shape[1]
    default_columns = pd.RangeIndex(start=0, stop=n_features, step=1)

    X_frame_plain = pd.DataFrame(X_np)
    X_frame_named = pd.DataFrame(
        X_np, columns=['x' + str(idx) for idx in range(n_features)])
    y_as_list = list(y_np)
    y_plain = pd.Series(y_np)
    y_named = pd.Series(y_np, name='target')

    # (X, y, expected X columns, expected predict_proba columns)
    datatype_combos = [
        (X_np, y_np, default_columns, np.unique(y_np)),
        (X_np, y_as_list, default_columns, np.unique(y_np)),
        (X_frame_plain, y_plain, default_columns, y_plain.unique()),
        (X_frame_named, y_named, X_frame_named.columns, y_named.unique()),
    ]

    predict_msg = 'Checking output of predict for estimator "{}" on X type {} cols {}, y type {} name {}'
    predict_proba_msg = 'Checking output of predict_proba for estimator "{}" on X type {} cols {}, y type {} name {}'

    for component_class in _all_estimators_used_in_search():
        # The expected X columns are carried in the tuple but never checked.
        for X, y, _, y_cols_expected in datatype_combos:
            # Both progress messages describe the same inputs.
            description = (
                component_class.name,
                type(X),
                X.columns if isinstance(X, pd.DataFrame) else None,
                type(y),
                y.name if isinstance(y, pd.Series) else None,
            )

            print(predict_msg.format(*description))
            component = helper_functions.safe_init_component_with_njobs_1(
                component_class)
            component.fit(X, y=y)

            predict_output = component.predict(X)
            assert isinstance(predict_output, ww.DataColumn)
            assert len(predict_output) == len(y)
            assert predict_output.name is None

            # predict_proba is only checked for classification components.
            if (ProblemTypes.BINARY in component_class.supported_problem_types
                    or ProblemTypes.MULTICLASS in component_class.supported_problem_types):
                print(predict_proba_msg.format(*description))
                predict_proba_output = component.predict_proba(X)
                assert isinstance(predict_proba_output, ww.DataTable)
                assert predict_proba_output.shape == (len(y), len(np.unique(y)))
                assert (list(predict_proba_output.columns) == y_cols_expected).all()
# NOTE(review): this call is the tail of a definition that starts before the
# visible chunk; its enclosing context and real indentation are not shown here.
_create_dictionary([1, 2, 3], ["a", "b", "c"])

# NOTE(review): presumably the class counts and feature count of the test
# fixtures; they are not used in the visible portion of the file — confirm.
N_CLASSES_BINARY = 2
N_CLASSES_MULTICLASS = 3
N_FEATURES = 20


def calculate_shap_for_test(training_data, y, pipeline, n_points_to_explain):
    """Helper function to compute the SHAP values for n_points_to_explain for a given pipeline.

    The first ``n_points_to_explain`` rows of ``training_data`` are the points
    to explain. The pipeline is fit on the full ``training_data`` and ``y``
    (so the passed pipeline is modified), and the result of
    ``_compute_shap_values`` is returned unchanged.
    """
    points_to_explain = training_data[:n_points_to_explain]
    pipeline.fit(training_data, y)
    # The points to explain are wrapped in a DataFrame; the training data is
    # passed through as given.
    return _compute_shap_values(pipeline, pd.DataFrame(points_to_explain), training_data)


# Estimators exercised by test_shap: everything used in search except the
# XGBoost and baseline model families.
interpretable_estimators = [e for e in _all_estimators_used_in_search() if e.model_family not in {ModelFamily.XGBOOST, ModelFamily.BASELINE}]
all_problems = [ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS]
all_n_points_to_explain = [1, 5]


# Parametrized over the full cross product; incompatible combinations are
# skipped inside the test body.
@pytest.mark.parametrize("estimator,problem_type,n_points_to_explain",
                         product(interpretable_estimators, all_problems, all_n_points_to_explain))
def test_shap(estimator, problem_type, n_points_to_explain, X_y_binary, X_y_multi, X_y_regression, helper_functions):
    # NOTE(review): only the skip guards are visible in this chunk; the rest
    # of the test body appears to lie beyond it.
    if problem_type not in estimator.supported_problem_types:
        pytest.skip("Skipping because estimator and pipeline are not compatible.")
    if problem_type == ProblemTypes.MULTICLASS and estimator.model_family == ModelFamily.CATBOOST:
        pytest.skip("Skipping Catboost for multiclass problems.")
# NOTE(review): presumably the class counts and feature count of the test
# fixtures; they are not used in the visible portion of the file — confirm.
N_CLASSES_BINARY = 2
N_CLASSES_MULTICLASS = 3
N_FEATURES = 20


def calculate_shap_for_test(training_data, y, pipeline, n_points_to_explain):
    """Helper function to compute the SHAP values for n_points_to_explain for a given pipeline.

    The first ``n_points_to_explain`` rows of ``training_data`` are the points
    to explain. The pipeline is fit on the full ``training_data`` and ``y``
    (so the passed pipeline is modified), and the result of
    ``_compute_shap_values`` is returned unchanged.
    """
    points_to_explain = training_data[:n_points_to_explain]
    pipeline.fit(training_data, y)
    # The points to explain are wrapped in a DataFrame; the training data is
    # passed through as given.
    return _compute_shap_values(pipeline, pd.DataFrame(points_to_explain), training_data)


# Estimators exercised by test_shap: everything used in search except the
# XGBoost and baseline model families.
interpretable_estimators = [
    e for e in _all_estimators_used_in_search() if e.model_family not in {ModelFamily.XGBOOST, ModelFamily.BASELINE}
]
all_problems = [
    ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS
]
all_n_points_to_explain = [1, 5]


# Parametrized over the full cross product of estimators, problem types and
# numbers of points to explain.
@pytest.mark.parametrize("estimator,problem_type,n_points_to_explain",
                         product(interpretable_estimators, all_problems, all_n_points_to_explain))
def test_shap(estimator, problem_type, n_points_to_explain, X_y_binary, X_y_multi, X_y_regression, helper_functions):
    # NOTE(review): the chunk is cut off right after this condition; the body
    # of the branch is not visible here.
    if problem_type not in estimator.supported_problem_types:
# NOTE(review): this call is the tail of a definition that starts before the
# visible chunk; its enclosing context and real indentation are not shown here.
_create_dictionary([1, 2, 3], ["a", "b", "c"])

# NOTE(review): presumably the class counts and feature count of the test
# fixtures; they are not used in the visible portion of the file — confirm.
N_CLASSES_BINARY = 2
N_CLASSES_MULTICLASS = 3
N_FEATURES = 20


def calculate_shap_for_test(training_data, y, pipeline, n_points_to_explain):
    """Helper function to compute the SHAP values for n_points_to_explain for a given pipeline.

    The first ``n_points_to_explain`` rows of ``training_data`` are the points
    to explain. The pipeline is fit on the full ``training_data`` and ``y``
    (so the passed pipeline is modified), and the result of
    ``_compute_shap_values`` is returned unchanged.
    """
    points_to_explain = training_data[:n_points_to_explain]
    pipeline.fit(training_data, y)
    # The points to explain are wrapped in a DataFrame; the training data is
    # passed through as given.
    return _compute_shap_values(pipeline, pd.DataFrame(points_to_explain), training_data)


# Estimators exercised by test_shap: everything used in search except the
# baseline model family.
interpretable_estimators = [e for e in _all_estimators_used_in_search() if e.model_family != ModelFamily.BASELINE]
all_problems = [ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS]
all_n_points_to_explain = [1, 5]


# Parametrized over the full cross product; incompatible combinations are
# skipped inside the test body.
@pytest.mark.parametrize("estimator,problem_type,n_points_to_explain",
                         product(interpretable_estimators, all_problems, all_n_points_to_explain))
def test_shap(estimator, problem_type, n_points_to_explain, X_y_binary, X_y_multi, X_y_regression, helper_functions):
    # NOTE(review): only the start of the body is visible in this chunk; the
    # handling of the other problem types appears to lie beyond it.
    if problem_type not in estimator.supported_problem_types:
        pytest.skip("Skipping because estimator and pipeline are not compatible.")
    if problem_type == ProblemTypes.BINARY:
        training_data, y = X_y_binary
        is_binary = True