Example #1
    def test_get_global_raw_explanations_regression(self, boston,
                                                    tabular_explainer):
        model = create_sklearn_random_forest_regressor(
            boston[DatasetConstants.X_TRAIN], boston[DatasetConstants.Y_TRAIN])

        exp = tabular_explainer(model,
                                boston[DatasetConstants.X_TRAIN],
                                features=boston[DatasetConstants.FEATURES])

        global_explanation = exp.explain_global(
            boston[DatasetConstants.X_TEST])
        assert not global_explanation.is_raw
        assert not global_explanation.is_engineered
        num_engineered_feats = len(boston[DatasetConstants.FEATURES])
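        # Build a (num_raw x num_engineered) feature map; an identity-like
        # matrix with one fewer row than columns stands in for a real
        # raw-to-engineered mapping with one raw feature dropped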
        feature_map = np.eye(num_engineered_feats - 1, num_engineered_feats)

        global_raw_explanation = global_explanation.get_raw_explanation(
            [feature_map])
        assert not global_explanation.is_raw
        assert global_explanation.is_engineered

        assert np.array(global_raw_explanation.local_importance_values
                        ).shape[-1] == feature_map.shape[0]

        assert global_raw_explanation.is_raw
        assert not global_raw_explanation.is_engineered
        assert np.array(global_raw_explanation.global_importance_values
                        ).shape[-1] == feature_map.shape[0]
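A note on the feature map used above: get_raw_explanation takes a list of matrices of shape (num_raw_features, num_engineered_features), where entry [i, j] links raw feature i to engineered feature j. The sketch below is illustrative only (not from the test suite) and assumes raw importances are recovered by multiplying the map with the engineered importances:

import numpy as np

# Hypothetical example: 3 raw features expand to 4 engineered columns
# because raw feature 2 is one-hot encoded into columns 2 and 3.
feature_map = np.zeros((3, 4))
feature_map[0, 0] = 1
feature_map[1, 1] = 1
feature_map[2, 2] = 1
feature_map[2, 3] = 1

# Raw importances aggregate the engineered importances per raw feature.
engineered_importances = np.array([0.5, 0.1, 0.2, 0.2])
raw_importances = feature_map @ engineered_importances  # [0.5, 0.1, 0.4]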
Example #2
    def test_cohort_filter_regression_error(self):
        X_train, X_test, y_train, y_test, feature_names = \
            create_diabetes_data()
        X_train = pd.DataFrame(X_train, columns=feature_names)
        X_test = pd.DataFrame(X_test, columns=feature_names)

        # filter on regression error, which can be done from the
        # RAI dashboard
        filters = [{
            'arg': [40],
            'column': REGRESSION_ERROR,
            'method': 'less and equal'
        }]

        model = create_sklearn_random_forest_regressor(X_train, y_train)
        pred_y = model.predict(X_test)

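        # Reproduce the dashboard filter manually: keep only rows whose
        # absolute regression error is at most 40, then drop the
        # prediction column before comparing cohorts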
        validation_data = create_validation_data(X_test, y_test, pred_y)
        validation_filter = abs(validation_data[PRED_Y] -
                                validation_data[TRUE_Y]) <= 40.0
        validation_data = validation_data.loc[validation_filter]
        validation_data = validation_data.drop(columns=PRED_Y)

        model_task = ModelTask.REGRESSION
        categorical_features = []
        run_error_analyzer(validation_data,
                           model,
                           X_test,
                           y_test,
                           feature_names,
                           categorical_features,
                           model_task,
                           filters=filters)
Example #3
    def test_get_local_raw_explanations_regression(self, boston,
                                                   tabular_explainer):
        model = create_sklearn_random_forest_regressor(
            boston[DatasetConstants.X_TRAIN], boston[DatasetConstants.Y_TRAIN])

        exp = tabular_explainer(model,
                                boston[DatasetConstants.X_TRAIN],
                                features=boston[DatasetConstants.FEATURES])

        num_engineered_feats = len(boston[DatasetConstants.FEATURES])
        feature_map = np.eye(num_engineered_feats - 1, num_engineered_feats)

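        # Explain a single test row, then convert the local explanation
        # to raw feature space using the feature map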
        local_explanation = exp.explain_local(
            boston[DatasetConstants.X_TEST][0])

        local_raw_explanation = local_explanation.get_raw_explanation(
            [feature_map])

        assert len(local_raw_explanation.local_importance_values
                   ) == feature_map.shape[0]

        local_rank = local_raw_explanation.get_local_importance_rank()
        assert len(local_rank) == feature_map.shape[0]

        ranked_names = local_raw_explanation.get_ranked_local_names()
        assert len(ranked_names) == feature_map.shape[0]

        ranked_values = local_raw_explanation.get_ranked_local_values()
        assert len(ranked_values) == feature_map.shape[0]
Example #4
    def test_get_global_raw_explanations_regression_eval_data(
            self, boston, tabular_explainer):
        model = create_sklearn_random_forest_regressor(
            boston[DatasetConstants.X_TRAIN], boston[DatasetConstants.Y_TRAIN])

        exp = tabular_explainer(model,
                                boston[DatasetConstants.X_TRAIN],
                                features=boston[DatasetConstants.FEATURES])

        global_explanation = exp.explain_global(
            boston[DatasetConstants.X_TEST])
        assert not global_explanation.is_raw
        assert not global_explanation.is_engineered
        num_engineered_feats = len(boston[DatasetConstants.FEATURES])
        feature_map = np.eye(num_engineered_feats - 1, num_engineered_feats)

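        # Supply synthetic raw evaluation data; it should be stored on the
        # resulting raw explanation as-is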
        raw_eval_data = np.ones_like(boston[DatasetConstants.X_TRAIN])
        global_raw_explanation = global_explanation.get_raw_explanation(
            [feature_map], eval_data=raw_eval_data)

        assert np.array_equal(raw_eval_data, global_raw_explanation.eval_data)

        self.validate_global_explanation_regression(global_explanation,
                                                    global_raw_explanation,
                                                    feature_map,
                                                    has_raw_eval_data=True)
Example #5
    def test_get_raw_explanation_no_datasets_mixin(self, boston,
                                                   mimic_explainer):
        model = create_sklearn_random_forest_regressor(
            boston[DatasetConstants.X_TRAIN], boston[DatasetConstants.Y_TRAIN])

        explainer = mimic_explainer(model, boston[DatasetConstants.X_TRAIN],
                                    LGBMExplainableModel)
        global_explanation = explainer.explain_global(
            boston[DatasetConstants.X_TEST])
        assert global_explanation.method == LIGHTGBM_METHOD

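        # Hand-build a synthetic local explanation without any dataset
        # information so it fails the _DatasetsMixin duck-typing check below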
        kwargs = {ExplainParams.METHOD: global_explanation.method}
        kwargs[ExplainParams.FEATURES] = global_explanation.features
        kwargs[ExplainParams.MODEL_TASK] = ExplainType.REGRESSION
        kwargs[ExplainParams.LOCAL_IMPORTANCE_VALUES] = \
            global_explanation._local_importance_values
        kwargs[ExplainParams.EXPECTED_VALUES] = 0
        kwargs[ExplainParams.CLASSIFICATION] = False
        kwargs[ExplainParams.IS_ENG] = True
        synthetic_explanation = _create_local_explanation(**kwargs)

        num_engineered_feats = boston[DatasetConstants.X_TRAIN].shape[1]
        feature_map = np.eye(5, num_engineered_feats)
        feature_names = [str(i) for i in range(feature_map.shape[0])]
        raw_names = feature_names[:feature_map.shape[0]]
        assert not _DatasetsMixin._does_quack(synthetic_explanation)
        global_raw_explanation = synthetic_explanation.get_raw_explanation(
            [feature_map], raw_feature_names=raw_names)
        self.validate_local_explanation_regression(synthetic_explanation,
                                                   global_raw_explanation,
                                                   feature_map,
                                                   has_eng_eval_data=False,
                                                   has_raw_eval_data=False,
                                                   has_dataset_data=False)
Example #6
    def test_explain_model_random_forest_regression(self, boston, tabular_explainer):
        # Fit a random forest regression model
        model = create_sklearn_random_forest_regressor(boston[DatasetConstants.X_TRAIN],
                                                       boston[DatasetConstants.Y_TRAIN])

        # Create tabular explainer
        exp = tabular_explainer(model, boston[DatasetConstants.X_TRAIN], features=boston[DatasetConstants.FEATURES])
        test_logger.info('Running explain global for test_explain_model_random_forest_regression')
        explanation = exp.explain_global(boston[DatasetConstants.X_TEST])
        self.verify_boston_overall_features_rf(explanation.get_ranked_global_names(),
                                               explanation.get_ranked_global_values())
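For context, the ranked getters used above return the per-feature importances sorted in descending order, with names reordered to match. A minimal illustrative sketch (the feature names here are made up, not from the dataset):

import numpy as np

values = np.array([0.2, 0.7, 0.1])
names = np.array(['feat_a', 'feat_b', 'feat_c'])
order = np.argsort(values)[::-1]   # feature indices, most important first
ranked_values = values[order]      # [0.7, 0.2, 0.1]
ranked_names = names[order]        # ['feat_b', 'feat_a', 'feat_c']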
Example #7
    def test_explain_model_npz_tree(self, tabular_explainer):
        # run explain global on a real sparse dataset from the field
        x_train, x_test, y_train, _ = self.create_msx_data(0.1)
        x_train = x_train[DATA_SLICE]
        x_test = x_test[DATA_SLICE]
        y_train = y_train[DATA_SLICE]
        # Fit a random forest regression model
        model = create_sklearn_random_forest_regressor(x_train, y_train.toarray().flatten())
        # Create tabular explainer
        exp = tabular_explainer(model, x_train)
        test_logger.info('Running explain global for test_explain_model_npz_tree')
        exp.explain_global(x_test)
Example #8
    def test_large_data_surrogate_error_tree(self):
        # validate tree trains quickly for large data
        X_train, y_train, X_test, y_test, _ = \
            create_binary_classification_dataset(100)
        feature_names = list(X_train.columns)
        model = create_sklearn_random_forest_regressor(X_train, y_train)
        X_test, y_test = replicate_dataset(X_test, y_test)
        assert X_test.shape[0] > 1000000
        t0 = time.time()
        categorical_features = []
        model_analyzer = ModelAnalyzer(model, X_test, y_test, feature_names,
                                       categorical_features)
        max_depth = 3
        num_leaves = 31
        min_child_samples = 20
        categories_reindexed = []
        cat_ind_reindexed = []
        diff = model_analyzer.get_diff()
        surrogate = create_surrogate_model(model_analyzer, X_test, diff,
                                           max_depth, num_leaves,
                                           min_child_samples,
                                           cat_ind_reindexed)
        t1 = time.time()
        execution_time = t1 - t0
        print(
            "creating surrogate model took {} seconds".format(execution_time))
        # assert we don't take too long to train the tree on 1 million rows
        # note we train on >1 million rows in ~1 second
        assert execution_time < 20
        model_json = surrogate._Booster.dump_model()
        tree_structure = model_json["tree_info"][0]['tree_structure']
        max_split_index = get_max_split_index(tree_structure) + 1
        assert max_split_index == 3
        cache_subtree_features(tree_structure, feature_names)
        pred_y = model_analyzer.model.predict(X_test)
        traversed_X_test = X_test.copy()
        traversed_X_test[DIFF] = diff
        traversed_X_test[TRUE_Y] = y_test
        traversed_X_test[PRED_Y] = pred_y
        t2 = time.time()
        tree = traverse(traversed_X_test,
                        tree_structure,
                        max_split_index,
                        (categories_reindexed, cat_ind_reindexed), [],
                        feature_names,
                        metric=model_analyzer.metric,
                        classes=model_analyzer.classes)
        t3 = time.time()
        execution_time = t3 - t2
        print("traversing tree took {} seconds".format(execution_time))
        assert tree is not None
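The helper get_max_split_index is not shown in the snippet; presumably it recurses over LightGBM's dump_model() tree structure, where internal nodes carry a split_index and nest their children under left_child/right_child. A sketch under that assumption:

def get_max_split_index_sketch(tree):
    # Leaves have no 'split_index' in LightGBM's dump format, so the
    # recursion bottoms out there (and on missing children).
    if 'split_index' not in tree:
        return 0
    return max(tree['split_index'],
               get_max_split_index_sketch(tree.get('left_child', {})),
               get_max_split_index_sketch(tree.get('right_child', {})))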
Example #9
    def test_explain_model_sparse_tree(self, tabular_explainer):
        X, y = retrieve_dataset('a1a.svmlight')
        x_train, x_test, y_train, _ = train_test_split(X, y, test_size=0.002, random_state=7)
        # Fit a random forest regression model
        model = create_sklearn_random_forest_regressor(x_train, y_train)
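        # Use a single all-zero sparse row as the background dataset;
        # csr_matrix called with a shape tuple allocates an empty matrix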
        _, cols = x_train.shape
        shape = 1, cols
        background = csr_matrix(shape, dtype=x_train.dtype)

        # Create tabular explainer
        exp = tabular_explainer(model, background)
        test_logger.info('Running explain global for test_explain_model_sparse_tree')
        policy = SamplingPolicy(allow_eval_sampling=True)
        exp.explain_global(x_test, sampling_policy=policy)
Example #10
    def test_explain_model_local_tree_regression(self, boston, tabular_explainer):
        # Fit a random forest regression model
        model = create_sklearn_random_forest_regressor(boston[DatasetConstants.X_TRAIN],
                                                       boston[DatasetConstants.Y_TRAIN])

        # Create tabular explainer
        exp = tabular_explainer(model, boston[DatasetConstants.X_TRAIN], features=boston[DatasetConstants.FEATURES])
        test_logger.info('Running explain local for test_explain_model_local_tree_regression')
        explanation = exp.explain_local(boston[DatasetConstants.X_TEST])
        assert explanation.local_importance_values is not None
        assert len(explanation.local_importance_values) == len(boston[DatasetConstants.X_TEST])
        assert explanation.num_examples == len(boston[DatasetConstants.X_TEST])
        assert len(explanation.local_importance_values[0]) == len(boston[DatasetConstants.FEATURES])
        assert explanation.num_features == len(boston[DatasetConstants.FEATURES])
        self.verify_top_rows_local_features_with_and_without_top_k(explanation,
                                                                   self.boston_local_features_first_five_rf)
Example #11
    def test_explain_single_local_instance_regression(self, boston, tabular_explainer):
        # Fit a random forest regression model
        model = create_sklearn_random_forest_regressor(boston[DatasetConstants.X_TRAIN],
                                                       boston[DatasetConstants.Y_TRAIN])

        exp = tabular_explainer(model, boston[DatasetConstants.X_TRAIN], features=boston[DatasetConstants.FEATURES])

        local_explanation = exp.explain_local(boston[DatasetConstants.X_TEST][0])

        assert len(local_explanation.local_importance_values) == len(boston[DatasetConstants.FEATURES])
        assert local_explanation.num_features == len(boston[DatasetConstants.FEATURES])

        local_rank = local_explanation.get_local_importance_rank()
        assert len(local_rank) == len(boston[DatasetConstants.FEATURES])

        ranked_names = local_explanation.get_ranked_local_names()
        assert len(ranked_names) == len(boston[DatasetConstants.FEATURES])

        ranked_values = local_explanation.get_ranked_local_values()
        assert len(ranked_values) == len(boston[DatasetConstants.FEATURES])
Example #12
    def test_large_data_importances(self):
        # mutual information can be very costly for a large number of rows,
        # hence assert we downsample to compute importances for large data
        X_train, y_train, X_test, y_test, _ = \
            create_binary_classification_dataset(100)
        feature_names = list(X_train.columns)
        model = create_sklearn_random_forest_regressor(X_train, y_train)
        X_test, y_test = replicate_dataset(X_test, y_test)
        assert X_test.shape[0] > 1000000
        t0 = time.time()
        categorical_features = []
        model_analyzer = ModelAnalyzer(model, X_test, y_test,
                                       feature_names,
                                       categorical_features)
        model_analyzer.compute_importances()
        t1 = time.time()
        execution_time = t1 - t0
        print(execution_time)
        # assert we don't take too long, which shows the dataset was downsampled
        # note execution time is in seconds
        assert execution_time < 20
Example #14
    def test_validate_against_shap(self):
        # Validate our explainer against shap library directly
        X, y = shap.datasets.adult()
        x_train, x_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.02,
                                                            random_state=7)
        # Fit several classifiers
        tree_classifiers = [
            create_sklearn_random_forest_classifier(x_train, y_train)
        ]
        non_tree_classifiers = [
            create_sklearn_logistic_regressor(x_train, y_train)
        ]
        tree_regressors = [
            create_sklearn_random_forest_regressor(x_train, y_train)
        ]
        non_tree_regressors = [
            create_sklearn_linear_regressor(x_train, y_train)
        ]
        # For each model, validate we get the same results as calling shap directly
        test_logger.info(
            "Running tree classifiers in test_validate_against_shap")
        for model in tree_classifiers:
            # Run shap directly for comparison
            exp = shap.TreeExplainer(model)
            explanation = exp.shap_values(x_test)
            shap_overall_imp = get_shap_imp_classification(explanation)
            overall_imp = tabular_explainer_imp(model, x_train, x_test)
            validate_correlation(overall_imp, shap_overall_imp, 0.95)

        test_logger.info(
            "Running non tree classifiers in test_validate_against_shap")
        for model in non_tree_classifiers:
            # Run shap directly for comparison
            clustered = shap.kmeans(x_train, 10)
            exp = shap.KernelExplainer(model.predict_proba, clustered)
            explanation = exp.shap_values(x_test)
            shap_overall_imp = get_shap_imp_classification(explanation)
            overall_imp = tabular_explainer_imp(model, x_train, x_test)
            validate_correlation(overall_imp, shap_overall_imp, 0.95)

        test_logger.info(
            "Running tree regressors in test_validate_against_shap")
        for model in tree_regressors:
            # Run shap directly for comparison
            exp = shap.TreeExplainer(model)
            explanation = exp.shap_values(x_test)
            shap_overall_imp = get_shap_imp_regression(explanation)
            overall_imp = tabular_explainer_imp(model, x_train, x_test)
            validate_correlation(overall_imp, shap_overall_imp, 0.95)

        test_logger.info(
            "Running non tree regressors in test_validate_against_shap")
        for model in non_tree_regressors:
            # Run shap directly for comparison
            clustered = shap.kmeans(x_train, 10)
            exp = shap.KernelExplainer(model.predict, clustered)
            explanation = exp.shap_values(x_test)
            shap_overall_imp = get_shap_imp_regression(explanation)
            overall_imp = tabular_explainer_imp(model, x_train, x_test)
            validate_correlation(overall_imp, shap_overall_imp, 0.95)

        if not rapids_installed:
            pytest.skip("cuML not installed; will skip testing GPU Explainer")
        else:
            test_logger.info(
                "Running GPU non tree classifiers in test_validate_against_shap"
            )
            x_train, x_test, y_train, y_validation, _, _ = create_cancer_data()
            gpu_non_tree_classifiers = [
                create_cuml_svm_classifier(x_train.astype(np.float32),
                                           y_train.astype(np.float32))
            ]
            for model in gpu_non_tree_classifiers:
                exp = KernelExplainer(model=model.predict_proba,
                                      data=x_train.astype(np.float32))
                explanation = exp.shap_values(x_test.astype(np.float32))
                shap_overall_imp = get_shap_imp_classification(explanation)
                overall_imp = tabular_explainer_imp(model,
                                                    x_train.astype(np.float32),
                                                    x_test.astype(np.float32),
                                                    use_gpu=True)
                validate_correlation(overall_imp, shap_overall_imp, 0.95)
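The rapids_installed flag and the GPU KernelExplainer used above are defined outside the snippet; a plausible setup (an assumption, not code from the original) is an import guard around cuML:

# Hypothetical setup for the GPU branch above; assumes cuML provides
# both the availability flag and the KernelExplainer import.
try:
    from cuml.explainer import KernelExplainer  # noqa: F401
    rapids_installed = True
except ImportError:
    rapids_installed = False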