Example #1
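# Fits every estimator returned by _all_estimators_used_in_search() on random
# data with one column per printable ASCII character (named after that
# character), then checks feature_importance and predict output.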
def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi,
                                                   X_y_regression,
                                                   helper_functions):
    for estimator_class in _all_estimators_used_in_search():
        supported_problem_types = [
            handle_problem_types(pt)
            for pt in estimator_class.supported_problem_types
        ]
        for problem_type in supported_problem_types:
            clf = helper_functions.safe_init_component_with_njobs_1(
                estimator_class)
            if problem_type == ProblemTypes.BINARY:
                X, y = X_y_binary
            elif problem_type == ProblemTypes.MULTICLASS:
                X, y = X_y_multi
            elif problem_type == ProblemTypes.REGRESSION:
                X, y = X_y_regression

            X = clf.random_state.random((X.shape[0], len(string.printable)))
            col_names = [
                'column_{}'.format(ascii_char)
                for ascii_char in string.printable
            ]
            X = pd.DataFrame(X, columns=col_names)
            clf.fit(X, y)
            assert len(clf.feature_importance) == len(X.columns)
            assert not np.isnan(clf.feature_importance).all().all()
            predictions = clf.predict(X)
            assert len(predictions) == len(y)
            assert not np.isnan(predictions).all()
Example #2
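# Variant of Example #1 that skips ARIMARegressor, uses the is_binary /
# is_multiclass / is_regression helpers, and also verifies input_feature_names
# before and after fitting.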
def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi,
                                                   X_y_regression, ts_data,
                                                   helper_functions):
    for estimator_class in _all_estimators_used_in_search():
        if estimator_class.__name__ == 'ARIMARegressor':
            continue
        supported_problem_types = [
            handle_problem_types(pt)
            for pt in estimator_class.supported_problem_types
        ]
        for problem_type in supported_problem_types:
            clf = helper_functions.safe_init_component_with_njobs_1(
                estimator_class)
            if is_binary(problem_type):
                X, y = X_y_binary
            elif is_multiclass(problem_type):
                X, y = X_y_multi
            elif is_regression(problem_type):
                X, y = X_y_regression

            X = get_random_state(clf.random_seed).random(
                (X.shape[0], len(string.printable)))
            col_names = [
                'column_{}'.format(ascii_char)
                for ascii_char in string.printable
            ]
            X = pd.DataFrame(X, columns=col_names)
            assert clf.input_feature_names is None
            clf.fit(X, y)
            assert len(clf.feature_importance) == len(X.columns)
            assert not np.isnan(clf.feature_importance).all().all()
            predictions = clf.predict(X).to_series()
            assert len(predictions) == len(y)
            assert not np.isnan(predictions).all()
            assert (clf.input_feature_names == col_names)
Example #3
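# Checks that binary classification estimators return predict_proba columns in
# class order (P(class 0), P(class 1)) on a trivially separable dataset.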
def test_binary_classification_estimators_predict_proba_col_order(helper_functions):
    X = pd.DataFrame({'input': np.concatenate([np.array([-1] * 100), np.array([1] * 100)])})
    data = np.concatenate([np.zeros(100), np.ones(100)])
    y = pd.Series(data)
    for estimator_class in _all_estimators_used_in_search():
        supported_problem_types = [handle_problem_types(pt) for pt in estimator_class.supported_problem_types]
        if ProblemTypes.BINARY in supported_problem_types:
            estimator = helper_functions.safe_init_component_with_njobs_1(estimator_class)
            estimator.fit(X, y)
            predicted_proba = estimator.predict_proba(X).to_dataframe()
            expected = np.concatenate([(1 - data).reshape(-1, 1), data.reshape(-1, 1)], axis=1)
            np.testing.assert_allclose(expected, np.round(predicted_proba).values)
Example #4
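# Verifies that predict returns a ww.DataColumn and predict_proba a
# ww.DataTable with the expected shape and column labels, across numpy, list,
# and pandas inputs with and without column/series names.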
def test_estimator_predict_output_type(X_y_binary, helper_functions):
    X_np, y_np = X_y_binary
    assert isinstance(X_np, np.ndarray)
    assert isinstance(y_np, np.ndarray)
    y_list = list(y_np)
    X_df_no_col_names = pd.DataFrame(X_np)
    range_index = pd.RangeIndex(start=0, stop=X_np.shape[1], step=1)
    X_df_with_col_names = pd.DataFrame(
        X_np, columns=['x' + str(i) for i in range(X_np.shape[1])])
    y_series_no_name = pd.Series(y_np)
    y_series_with_name = pd.Series(y_np, name='target')
    datatype_combos = [(X_np, y_np, range_index, np.unique(y_np)),
                       (X_np, y_list, range_index, np.unique(y_np)),
                       (X_df_no_col_names, y_series_no_name, range_index,
                        y_series_no_name.unique()),
                       (X_df_with_col_names,
                        y_series_with_name, X_df_with_col_names.columns,
                        y_series_with_name.unique())]

    for component_class in _all_estimators_used_in_search():
        for X, y, X_cols_expected, y_cols_expected in datatype_combos:
            print(
                'Checking output of predict for estimator "{}" on X type {} cols {}, y type {} name {}'
                .format(component_class.name, type(X),
                        X.columns if isinstance(X, pd.DataFrame) else None,
                        type(y), y.name if isinstance(y, pd.Series) else None))
            component = helper_functions.safe_init_component_with_njobs_1(
                component_class)
            component.fit(X, y=y)
            predict_output = component.predict(X)
            assert isinstance(predict_output, ww.DataColumn)
            assert len(predict_output) == len(y)
            assert predict_output.name is None

            if not ((ProblemTypes.BINARY
                     in component_class.supported_problem_types) or
                    (ProblemTypes.MULTICLASS
                     in component_class.supported_problem_types)):
                continue
            print(
                'Checking output of predict_proba for estimator "{}" on X type {} cols {}, y type {} name {}'
                .format(component_class.name, type(X),
                        X.columns if isinstance(X, pd.DataFrame) else None,
                        type(y), y.name if isinstance(y, pd.Series) else None))
            predict_proba_output = component.predict_proba(X)
            assert isinstance(predict_proba_output, ww.DataTable)
            assert predict_proba_output.shape == (len(y), len(np.unique(y)))
            assert (list(
                predict_proba_output.columns) == y_cols_expected).all()
Example #5
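# Parametrized SHAP test over every interpretable estimator (the XGBoost and
# baseline model families are excluded), all three problem types, and 1 or 5
# points to explain.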


N_CLASSES_BINARY = 2
N_CLASSES_MULTICLASS = 3
N_FEATURES = 20


def calculate_shap_for_test(training_data, y, pipeline, n_points_to_explain):
    """Helper function to compute the SHAP values for n_points_to_explain for a given pipeline."""
    points_to_explain = training_data[:n_points_to_explain]
    pipeline.fit(training_data, y)
    return _compute_shap_values(pipeline, pd.DataFrame(points_to_explain), training_data)


interpretable_estimators = [e for e in _all_estimators_used_in_search() if e.model_family not in {ModelFamily.XGBOOST, ModelFamily.BASELINE}]
all_problems = [ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS]
all_n_points_to_explain = [1, 5]


@pytest.mark.parametrize("estimator,problem_type,n_points_to_explain",
                         product(interpretable_estimators, all_problems, all_n_points_to_explain))
def test_shap(estimator, problem_type, n_points_to_explain, X_y_binary, X_y_multi, X_y_regression,
              helper_functions):

    if problem_type not in estimator.supported_problem_types:
        pytest.skip("Skipping because estimator and pipeline are not compatible.")

    if problem_type == ProblemTypes.MULTICLASS and estimator.model_family == ModelFamily.CATBOOST:
        pytest.skip("Skipping Catboost for multiclass problems.")
Example #6
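# Same SHAP test setup as Example #5, with the code wrapped to a narrower
# line width.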
N_CLASSES_BINARY = 2
N_CLASSES_MULTICLASS = 3
N_FEATURES = 20


def calculate_shap_for_test(training_data, y, pipeline, n_points_to_explain):
    """Helper function to compute the SHAP values for n_points_to_explain for a given pipeline."""
    points_to_explain = training_data[:n_points_to_explain]
    pipeline.fit(training_data, y)
    return _compute_shap_values(pipeline, pd.DataFrame(points_to_explain),
                                training_data)


interpretable_estimators = [
    e for e in _all_estimators_used_in_search()
    if e.model_family not in {ModelFamily.XGBOOST, ModelFamily.BASELINE}
]
all_problems = [
    ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS
]
all_n_points_to_explain = [1, 5]


@pytest.mark.parametrize("estimator,problem_type,n_points_to_explain",
                         product(interpretable_estimators, all_problems,
                                 all_n_points_to_explain))
def test_shap(estimator, problem_type, n_points_to_explain, X_y_binary,
              X_y_multi, X_y_regression, helper_functions):

    if problem_type not in estimator.supported_problem_types:
Example #7
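# SHAP test as in Example #5, but only the baseline model family is excluded
# from interpretable_estimators, so XGBoost estimators are included.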


N_CLASSES_BINARY = 2
N_CLASSES_MULTICLASS = 3
N_FEATURES = 20


def calculate_shap_for_test(training_data, y, pipeline, n_points_to_explain):
    """Helper function to compute the SHAP values for n_points_to_explain for a given pipeline."""
    points_to_explain = training_data[:n_points_to_explain]
    pipeline.fit(training_data, y)
    return _compute_shap_values(pipeline, pd.DataFrame(points_to_explain), training_data)


interpretable_estimators = [e for e in _all_estimators_used_in_search() if e.model_family != ModelFamily.BASELINE]
all_problems = [ProblemTypes.REGRESSION, ProblemTypes.BINARY, ProblemTypes.MULTICLASS]
all_n_points_to_explain = [1, 5]


@pytest.mark.parametrize("estimator,problem_type,n_points_to_explain",
                         product(interpretable_estimators, all_problems, all_n_points_to_explain))
def test_shap(estimator, problem_type, n_points_to_explain, X_y_binary, X_y_multi, X_y_regression,
              helper_functions):

    if problem_type not in estimator.supported_problem_types:
        pytest.skip("Skipping because estimator and pipeline are not compatible.")

    if problem_type == ProblemTypes.BINARY:
        training_data, y = X_y_binary
        is_binary = True
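
The examples above repeat the same problem-type-to-fixture mapping inline. Below is a minimal sketch of how that mapping could be factored into a shared helper; the helper name is hypothetical, while ProblemTypes and the fixture names are the ones used in the examples.

from evalml.problem_types import ProblemTypes


def fixture_for_problem_type(problem_type, X_y_binary, X_y_multi, X_y_regression):
    # Hypothetical helper: map a ProblemTypes value to the matching (X, y)
    # fixture pair, mirroring the if/elif chains in the tests above.
    if problem_type == ProblemTypes.BINARY:
        return X_y_binary
    if problem_type == ProblemTypes.MULTICLASS:
        return X_y_multi
    if problem_type == ProblemTypes.REGRESSION:
        return X_y_regression
    raise ValueError("No fixture available for problem type {}".format(problem_type))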