def test_explain_raw_feats_regression(self, boston, tabular_explainer):
    # verify that no errors get thrown when calling get_raw_feat_importances
    x_train = boston[DatasetConstants.X_TRAIN][DATA_SLICE]
    x_test = boston[DatasetConstants.X_TEST][DATA_SLICE]
    y_train = boston[DatasetConstants.Y_TRAIN][DATA_SLICE]

    model = create_sklearn_linear_regressor(x_train, y_train)

    explainer = tabular_explainer(model, x_train)
    global_explanation = explainer.explain_global(x_test)
    local_explanation = explainer.explain_local(x_test)
    # 0th raw feature maps to the 1st and 3rd generated features,
    # 1st raw feature maps to the 0th and 2nd generated features
    raw_feat_indices = [[1, 3], [0, 2]]
    num_generated_cols = x_train.shape[1]
    feature_map = _get_feature_map_from_indices_list(raw_feat_indices, num_raw_cols=2,
                                                     num_generated_cols=num_generated_cols)
    global_raw_importances = global_explanation.get_raw_feature_importances([feature_map])
    assert len(global_raw_importances) == len(raw_feat_indices), \
        'length of global importances does not match number of features'
    local_raw_importances = local_explanation.get_raw_feature_importances([feature_map])
    assert len(local_raw_importances) == x_test.shape[0], \
        'length of local importances does not match number of samples'
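# Illustration only (not part of the test): assuming _get_feature_map_from_indices_list
# builds a (num_raw_cols x num_generated_cols) 0/1 matrix with entry [i, j] set when raw
# feature i produced generated feature j, the map above (shown with 4 generated columns
# for brevity) would look like:
#
#     [[0., 1., 0., 1.],    # raw feature 0 -> generated features 1 and 3
#      [1., 0., 1., 0.]]    # raw feature 1 -> generated features 0 and 2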
def test_get_local_raw_explanations_sparse_regression(self, mimic_explainer):
    X, y = retrieve_dataset('a1a.svmlight')
    x_train, x_test, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=7)
    # Fit a linear regression model
    model = create_sklearn_linear_regressor(x_train, y_train)

    explainer = mimic_explainer(model, x_train, LinearExplainableModel,
                                explainable_model_args={'sparse_data': True})
    global_explanation = explainer.explain_global(x_test)
    assert global_explanation.method == LINEAR_METHOD

    num_engineered_feats = x_train.shape[1]
    feature_map = np.eye(5, num_engineered_feats)

    global_raw_explanation = global_explanation.get_raw_explanation([feature_map])
    self.validate_global_raw_explanation_regression(global_explanation,
                                                    global_raw_explanation, feature_map)
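# Note on the feature map above: np.eye(5, num_engineered_feats) is an identity-style
# map treating the first 5 engineered features as 5 raw features mapped one-to-one
# (row i has a single 1 in column i); the remaining engineered columns belong to no
# raw feature. A minimal equivalent construction by hand:
#
#     feature_map = np.zeros((5, num_engineered_feats))
#     for i in range(5):
#         feature_map[i, i] = 1.0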
def test_save_and_load_sparse_explanation(self, mimic_explainer):
    x_train, x_test, y_train, y_test = create_msx_data(0.05)
    # Fit a linear regression model
    model = create_sklearn_linear_regressor(x_train, y_train.toarray().flatten())
    explainable_model = LGBMExplainableModel
    explainer = mimic_explainer(model, x_train, explainable_model, augment_data=False)
    explanation = explainer.explain_global(x_test)
    verify_serialization(explanation)
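# A minimal sketch of what a round-trip check like verify_serialization might do
# (the helper's actual implementation may differ, e.g. it may use a JSON-based
# serializer rather than pickle):
#
#     import pickle
#     restored = pickle.loads(pickle.dumps(explanation))
#     assert np.allclose(restored.global_importance_values,
#                        explanation.global_importance_values)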
def test_explain_model_linear_regression(self, boston, tabular_explainer):
    # Fit a linear regression model
    model = create_sklearn_linear_regressor(boston[DatasetConstants.X_TRAIN],
                                            boston[DatasetConstants.Y_TRAIN],
                                            pipeline=True)
    # Create tabular explainer
    exp = tabular_explainer(model, boston[DatasetConstants.X_TRAIN],
                            features=boston[DatasetConstants.FEATURES])
    test_logger.info('Running explain global for test_explain_model_linear_regression')
    explanation = exp.explain_global(boston[DatasetConstants.X_TEST])
    self.verify_boston_overall_features_lr(explanation.get_ranked_global_names(),
                                           explanation.get_ranked_global_values())
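# The pipeline=True flag presumably wraps the regressor in an sklearn Pipeline; a
# minimal sketch of the equivalent construction (LinearRegression assumed as the
# underlying estimator):
#
#     from sklearn.linear_model import LinearRegression
#     from sklearn.pipeline import Pipeline
#     model = Pipeline([('regressor', LinearRegression())])
#     model.fit(boston[DatasetConstants.X_TRAIN], boston[DatasetConstants.Y_TRAIN])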
def test_explain_model_local_kernel_regression(self, boston, tabular_explainer):
    # Fit a linear regression model
    model = create_sklearn_linear_regressor(boston[DatasetConstants.X_TRAIN],
                                            boston[DatasetConstants.Y_TRAIN])
    # Create tabular explainer
    exp = tabular_explainer(model, boston[DatasetConstants.X_TRAIN],
                            features=boston[DatasetConstants.FEATURES])
    test_logger.info('Running explain local for test_explain_model_local_kernel_regression')
    explanation = exp.explain_local(boston[DatasetConstants.X_TEST])
    assert explanation.local_importance_values is not None
    assert len(explanation.local_importance_values) == len(boston[DatasetConstants.X_TEST])
    assert explanation.num_examples == len(boston[DatasetConstants.X_TEST])
    assert len(explanation.local_importance_values[0]) == len(boston[DatasetConstants.FEATURES])
    assert explanation.num_features == len(boston[DatasetConstants.FEATURES])
    self.verify_top_rows_local_features_with_and_without_top_k(
        explanation, self.boston_local_features_first_five_lr)
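# Conceptually, the per-row feature ranking verified by the helper above is the
# argsort of the absolute local importance values (helper internals assumed):
#
#     ranked = np.argsort(-np.abs(np.asarray(explanation.local_importance_values)), axis=1)
#     # ranked[i] lists feature indices for sample i, highest |importance| first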
def test_explain_model_serialization_regression(self, mimic_explainer):
    x_train, x_test, y_train, _, feature_names = create_energy_data()

    # Fit a linear model
    model = create_sklearn_linear_regressor(x_train, y_train)

    self._validate_model_serialization(model, x_train, x_test, mimic_explainer)
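# A plausible shape for the _validate_model_serialization helper (internals assumed):
# persist the fitted model, reload it, and confirm an explainer built on the reloaded
# model still produces an explanation:
#
#     import joblib
#     joblib.dump(model, 'model.pkl')
#     reloaded = joblib.load('model.pkl')
#     mimic_explainer(reloaded, x_train, LGBMExplainableModel).explain_global(x_test)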
def test_validate_against_shap(self):
    # Validate our explainer against the shap library directly
    X, y = shap.datasets.adult()
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.02, random_state=7)
    # Fit several classifiers and regressors
    tree_classifiers = [create_sklearn_random_forest_classifier(x_train, y_train)]
    non_tree_classifiers = [create_sklearn_logistic_regressor(x_train, y_train)]
    tree_regressors = [create_sklearn_random_forest_regressor(x_train, y_train)]
    non_tree_regressors = [create_sklearn_linear_regressor(x_train, y_train)]
    # For each model, validate we get the same results as calling shap directly
    test_logger.info("Running tree classifiers in test_validate_against_shap")
    for model in tree_classifiers:
        # Run shap directly for comparison
        exp = shap.TreeExplainer(model)
        explanation = exp.shap_values(x_test)
        shap_overall_imp = get_shap_imp_classification(explanation)
        overall_imp = tabular_explainer_imp(model, x_train, x_test)
        validate_correlation(overall_imp, shap_overall_imp, 0.95)

    test_logger.info("Running non tree classifiers in test_validate_against_shap")
    for model in non_tree_classifiers:
        # Run shap directly for comparison
        clustered = shap.kmeans(x_train, 10)
        exp = shap.KernelExplainer(model.predict_proba, clustered)
        explanation = exp.shap_values(x_test)
        shap_overall_imp = get_shap_imp_classification(explanation)
        overall_imp = tabular_explainer_imp(model, x_train, x_test)
        validate_correlation(overall_imp, shap_overall_imp, 0.95)

    test_logger.info("Running tree regressors in test_validate_against_shap")
    for model in tree_regressors:
        # Run shap directly for comparison
        exp = shap.TreeExplainer(model)
        explanation = exp.shap_values(x_test)
        shap_overall_imp = get_shap_imp_regression(explanation)
        overall_imp = tabular_explainer_imp(model, x_train, x_test)
        validate_correlation(overall_imp, shap_overall_imp, 0.95)

    test_logger.info("Running non tree regressors in test_validate_against_shap")
    for model in non_tree_regressors:
        # Run shap directly for comparison
        clustered = shap.kmeans(x_train, 10)
        exp = shap.KernelExplainer(model.predict, clustered)
        explanation = exp.shap_values(x_test)
        shap_overall_imp = get_shap_imp_regression(explanation)
        overall_imp = tabular_explainer_imp(model, x_train, x_test)
        validate_correlation(overall_imp, shap_overall_imp, 0.95)

    if not rapids_installed:
        pytest.skip("cuML not installed; will skip testing GPU Explainer")
    else:
        test_logger.info("Running GPU non tree classifiers in test_validate_against_shap")
        x_train, x_test, y_train, y_validation, _, _ = create_cancer_data()
        gpu_non_tree_classifiers = [
            create_cuml_svm_classifier(x_train.astype(np.float32), y_train.astype(np.float32))
        ]
        for model in gpu_non_tree_classifiers:
            exp = KernelExplainer(model=model.predict_proba, data=x_train.astype(np.float32))
            explanation = exp.shap_values(x_test.astype(np.float32))
            shap_overall_imp = get_shap_imp_classification(explanation)
            overall_imp = tabular_explainer_imp(model, x_train.astype(np.float32),
                                                x_test.astype(np.float32), use_gpu=True)
            validate_correlation(overall_imp, shap_overall_imp, 0.95)
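# A common guard for the optional cuML dependency exercised above (module-level
# import sketch; the actual imports in this file may differ):
#
#     try:
#         from cuml.explainer import KernelExplainer
#         rapids_installed = True
#     except ImportError:
#         rapids_installed = False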