def explain(self, images: np.ndarray, texts: np.ndarray, mode: str):
    """
    Main API to calculate SHAP values.

    Args:
        images: np.ndarray of shape (N, D1, D2, C);
            N = number of samples, (D1, D2, C) = a three-channel image
        texts: np.ndarray of shape (N,)
        mode: one of the supported modes, e.g. "text_only" or "image_only"

    Returns:
        The calculated SHAP values.
    """
    # input validations
    if mode not in self._supported_modes:
        raise ValueError(f"This mode {mode} is not supported!")
    if images.shape[0] != texts.shape[0]:
        raise ValueError(
            "Shape mismatch, inputs' first dimensions should be equal!")

    if mode == "text_only":
        self._fixed_images = images
        if not isinstance(images[0], Image.Image):
            self._fixed_images = utils.arr_to_img(images)
        # tokenizer and masker
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
        text_masker = shap.maskers.Text(tokenizer)
        # NOTE: the text heatmap plot needs the `output_names` arg so it knows
        # the model produces text output!
        explainer = shap.Explainer(self._f_text, text_masker, algorithm=self.algorithm)
        shap_values = explainer(texts, max_evals=self.max_evals, batch_size=self.batch_size)
    elif mode == "image_only":
        self._fixed_texts = texts
        image_masker = shap.maskers.Image("inpaint_telea", images[0].shape)
        image_explainer = shap.Explainer(self._f_image, image_masker, algorithm=self.algorithm)
        shap_values = image_explainer(images, max_evals=self.max_evals, batch_size=self.batch_size)
    return shap_values
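# A minimal, self-contained sketch of the "text_only" path above, using only
# public shap/transformers APIs. The sentiment-analysis pipeline is an
# illustrative stand-in for `self._f_text`; it is not part of the original class.
def _text_only_usage_sketch():
    from transformers import pipeline
    classifier = pipeline("sentiment-analysis", return_all_scores=True)
    # shap can wrap a transformers pipeline directly and builds a Text masker
    # from the pipeline's own tokenizer
    explainer = shap.Explainer(classifier)
    shap_values = explainer(["What a great movie!"])
    shap.plots.text(shap_values)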
def display_hover_data(n_clicks, value):
    # explain the model on the sample input
    explainer = shap.Explainer(model)
    shap_values = explainer([value])
    # visualize the first prediction's explanation for the POSITIVE output class
    return text_plot(shap_values[0, :, "POSITIVE"])._repr_html_()
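# A hedged sketch of how display_hover_data could be wired into a Dash app;
# the component ids and layout here are assumptions for illustration, not
# taken from the original application.
from dash import Dash, Input, Output, State, dcc, html

app = Dash(__name__)
app.layout = html.Div([
    dcc.Input(id="text-input", value="What a great movie!"),
    html.Button("Explain", id="explain-button"),
    html.Iframe(id="shap-text-plot", style={"width": "100%", "height": "300px"}),
])
app.callback(
    Output("shap-text-plot", "srcDoc"),
    Input("explain-button", "n_clicks"),
    State("text-input", "value"),
)(display_hover_data)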
def test_pyfunc_serve_and_score():
    X, y = shap.datasets.boston()
    reg = sklearn.ensemble.RandomForestRegressor(n_estimators=10).fit(X, y)
    model = shap.Explainer(
        reg.predict,
        masker=X,
        algorithm="permutation",
        # `link` defaults to `shap.links.identity`, which is decorated by `numba.jit` and causes
        # the following error when loading the explainer for serving:
        # ```
        # Exception: The passed link function needs to be callable and have a callable
        # .inverse property!
        # ```
        # As a workaround, use an identity function that's NOT decorated by `numba.jit`.
        link=create_identity_function(),
    )
    artifact_path = "model"
    with mlflow.start_run():
        mlflow.shap.log_explainer(model, artifact_path)
        model_uri = mlflow.get_artifact_uri(artifact_path)

    resp = pyfunc_serve_and_score_model(
        model_uri,
        data=pd.DataFrame(X[:3]),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED,
    )
    scores = pd.read_json(resp.content, orient="records").values
    np.testing.assert_allclose(scores, model(X[:3]).values, rtol=100, atol=100)
def test_sklearn_log_explainer():
    """
    Tests mlflow.shap log_explainer with mlflow
    serialization of the underlying model
    """
    with mlflow.start_run() as run:
        run_id = run.info.run_id
        X, y = shap.datasets.boston()
        model = sklearn.ensemble.RandomForestRegressor(n_estimators=100)
        model.fit(X, y)

        explainer_original = shap.Explainer(model.predict, X, algorithm="permutation")
        shap_values_original = explainer_original(X[:5])

        mlflow.shap.log_explainer(explainer_original, "test_explainer")

        explainer_uri = "runs:/" + run_id + "/test_explainer"
        explainer_loaded = mlflow.shap.load_explainer(explainer_uri)
        shap_values_new = explainer_loaded(X[:5])

        explainer_path = _download_artifact_from_uri(artifact_uri=explainer_uri)
        flavor_conf = _get_flavor_configuration(
            model_path=explainer_path, flavor_name=mlflow.shap.FLAVOR_NAME
        )
        underlying_model_flavor = flavor_conf["underlying_model_flavor"]

        assert underlying_model_flavor == mlflow.sklearn.FLAVOR_NAME
        np.testing.assert_array_equal(shap_values_original.base_values, shap_values_new.base_values)
        np.testing.assert_allclose(
            shap_values_original.values, shap_values_new.values, rtol=100, atol=100
        )
def test_sklearn_log_explainer_pyfunc():
    """
    Tests mlflow.shap log_explainer with mlflow serialization
    of the underlying model using the pyfunc flavor
    """
    with mlflow.start_run() as run:
        run_id = run.info.run_id
        X, y = shap.datasets.boston()
        model = sklearn.ensemble.RandomForestRegressor(n_estimators=100)
        model.fit(X, y)

        explainer_original = shap.Explainer(model.predict, X, algorithm="permutation")
        shap_values_original = explainer_original(X[:2])

        mlflow.shap.log_explainer(explainer_original, "test_explainer")

        explainer_pyfunc = mlflow.pyfunc.load_model("runs:/" + run_id + "/test_explainer")
        shap_values_new = explainer_pyfunc.predict(X[:2])

        np.testing.assert_allclose(shap_values_original.values, shap_values_new, rtol=100, atol=100)
def test_serialization_exact():
    xgboost = pytest.importorskip('xgboost')

    # get a dataset on income prediction
    X, y = shap.datasets.adult()

    # train an XGBoost model (but any other model type would also work)
    model = xgboost.XGBClassifier()
    model.fit(X, y)

    explainer_original = shap.Explainer(model.predict_proba, X, algorithm='exact')
    shap_values_original = explainer_original(X[:1])

    temp_serialization_file = tempfile.TemporaryFile()

    # Serialization
    explainer_original.save(temp_serialization_file)

    temp_serialization_file.seek(0)

    # Deserialization
    explainer_new = shap.Explainer.load(temp_serialization_file)

    temp_serialization_file.close()

    shap_values_new = explainer_new(X[:1])

    for i in range(len(explainer_original.masker.feature_names)):
        assert explainer_original.masker.feature_names[i] == explainer_new.masker.feature_names[i]
    assert np.array_equal(shap_values_original.base_values, shap_values_new.base_values)
    assert type(explainer_original) == type(explainer_new)
    assert type(explainer_original.masker) == type(explainer_new.masker)
def get_shap_explanations(model, data):
    '''
    Plot SHAP's output explanations.

    "SHAP (SHapley Additive exPlanations) is a game theoretic approach to explain the
    output of any machine learning model. It connects optimal credit allocation with
    local explanations using the classic Shapley values from game theory and their
    related extensions." https://github.com/slundberg/shap

    :param model: the model to explain
    :param data: data to explain
    :return: None
    '''
    # explain the model's predictions using SHAP
    explainer = shap.Explainer(model)
    shap_values = explainer(data)

    # visualize the first prediction's explanation
    shap.plots.waterfall(shap_values[0])

    # visualize the first prediction's explanation with a force plot
    shap.plots.force(shap_values[0])

    # visualize all the training set predictions
    shap.plots.force(shap_values)

    # create a dependence scatter plot to show the effect of a single feature
    # across the whole dataset
    # shap.plots.scatter(shap_values[:, "RM"], color=shap_values)

    # summarize the effects of all the features
    shap.plots.beeswarm(shap_values)
    shap.plots.bar(shap_values)
    shap.summary_plot(shap_values, data)
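# Hedged usage sketch for get_shap_explanations: the XGBoost regressor on the
# Boston housing data is an assumption for illustration (it matches the "RM"
# feature referenced in the commented-out scatter plot above).
def _get_shap_explanations_example():
    import xgboost
    X, y = shap.datasets.boston()
    model = xgboost.XGBRegressor().fit(X, y)
    get_shap_explanations(model, X)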
def test_serialization_exact_numpy_custom_model_save():
    import pickle
    xgboost = pytest.importorskip('xgboost')

    # get a dataset on income prediction
    X, y = shap.datasets.adult()
    X = X.values

    # train an XGBoost model (but any other model type would also work)
    model = xgboost.XGBClassifier()
    model.fit(X, y)

    explainer_original = shap.Explainer(model.predict_proba, X, algorithm='exact')
    shap_values_original = explainer_original(X[:1])

    temp_serialization_file = tempfile.TemporaryFile()

    # Serialization with a custom model saver
    explainer_original.model.save = lambda out_file, model: pickle.dump(model, out_file)
    explainer_original.save(temp_serialization_file)

    temp_serialization_file.seek(0)

    # Deserialization with a matching custom model loader
    model_loader = lambda in_file: pickle.load(in_file)
    explainer_new = shap.Explainer.load(temp_serialization_file, model_loader=model_loader)

    temp_serialization_file.close()

    shap_values_new = explainer_new(X[:1])

    assert np.array_equal(shap_values_original.base_values, shap_values_new.base_values)
    assert type(explainer_original) == type(explainer_new)
    assert type(explainer_original.masker) == type(explainer_new.masker)
def get_reg_shap_explainer_global_and_local(model: object, X_train):
    """Return the SHAP explainer object and SHAP values for global and local plots.

    Args:
        model (object): a trained pycaret model
        X_train (pd.DataFrame): the X training data

    Returns:
        tuple: (explainer, shap_values, sample_values)
    """
    sample_values = None
    model_name = model.__class__.__name__
    if model_name == "CatBoostRegressor":
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_train)
    elif model_name in (
        "RANSACRegressor",
        "KernelRidge",
        "SVR",
        "MLPRegressor",
        "KNeighborsRegressor",
        "AdaBoostRegressor",
    ):
        explainer, shap_values, sample_values = get_shap_kernel(model, X_train)
    else:
        explainer = shap.Explainer(model, X_train)
        shap_values = explainer(X_train)
    return explainer, shap_values, sample_values
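# A minimal sketch of what the `get_shap_kernel` helper used above might look
# like — an assumption, not the original implementation: it wraps
# shap.KernelExplainer around model.predict with a small background sample,
# since the listed estimators have no model-specific explainer.
def get_shap_kernel_sketch(model, X_train, n_background=50):
    sample_values = shap.sample(X_train, n_background)
    explainer = shap.KernelExplainer(model.predict, sample_values)
    # kernel SHAP is slow, so explain only the subsample
    shap_values = explainer.shap_values(sample_values)
    return explainer, shap_values, sample_values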
def test_sklearn_log_explainer_self_serialization():
    """
    Tests mlflow.shap log_explainer with SHAP internal
    serialization of the underlying model
    """
    with mlflow.start_run() as run:
        run_id = run.info.run_id
        X, y = shap.datasets.boston()
        model = sklearn.ensemble.RandomForestRegressor(n_estimators=100)
        model.fit(X, y)

        explainer_original = shap.Explainer(model.predict, X, algorithm="permutation")
        shap_values_original = explainer_original(X[:5])

        mlflow.shap.log_explainer(
            explainer_original, "test_explainer", serialize_model_using_mlflow=False
        )

        explainer_loaded = mlflow.shap.load_explainer("runs:/" + run_id + "/test_explainer")
        shap_values_new = explainer_loaded(X[:5])

        np.testing.assert_array_equal(shap_values_original.base_values, shap_values_new.base_values)
        np.testing.assert_allclose(
            shap_values_original.values, shap_values_new.values, rtol=100, atol=100
        )
def _get_shap_values(
    self,
    estimator,
    X,
    shap_kwargs,
):
    """
    FOR INTERNAL PURPOSES ONLY.
    """
    masker = shap_kwargs.get('masker')
    algorithm = shap_kwargs.get('algorithm', 'auto')
    if self.estimator_output == "probability":
        model = estimator.predict_proba
    else:
        model = estimator.predict

    explainer = shap.Explainer(
        model=model,
        masker=masker,
        algorithm=algorithm,
    )
    shap_results = explainer(X)
    if self.estimator_output == "probability":
        # keep only the positive-class slice of the explanation
        shap_results = shap_results[..., 1]
    contributions = shap_results.values
    bias = shap_results.base_values
    return contributions, bias
def test_serialization_permutation():
    import numpy as np
    import shap
    import xgboost

    # get a dataset on income prediction
    X, y = shap.datasets.adult()

    # train an XGBoost model (but any other model type would also work)
    model = xgboost.XGBClassifier()
    model.fit(X, y)

    explainer_original = shap.Explainer(model.predict_proba, X, algorithm='permutation')
    shap_values_original = explainer_original(X[:1])

    # Serialization
    with open('test_serialization_permutation_dataframe_scratch_file.bin', "wb") as out_file:
        explainer_original.save(out_file)

    # Deserialization
    with open('test_serialization_permutation_dataframe_scratch_file.bin', "rb") as in_file:
        explainer_new = shap.Explainer.load(in_file)

    shap_values_new = explainer_new(X[:1])

    for i in range(len(explainer_original.masker.feature_names)):
        assert explainer_original.masker.feature_names[i] == explainer_new.masker.feature_names[i]
    assert np.array_equal(shap_values_original.base_values, shap_values_new.base_values)
    assert type(explainer_original) == type(explainer_new)
    assert type(explainer_original.masker) == type(explainer_new.masker)
def get_feature_importance(self, input_data) -> Dict[str, float]:
    """Computes feature importance for each feature based on input data.

    Most models are supported by SHAP (https://github.com/slundberg/shap).
    For unsupported models, please override this method with a workable solution.
    """
    explainer = shap.Explainer(self.model)
    shap_values = explainer(input_data)

    def _get_shap_values_one_sample(shap_values, index: int):
        # For LightGBM and XGBoost, shap_values[index].values is a 2d array
        # representing the logits of the two classes. They are basically the
        # negative of each other, so we only need one.
        # Related issue: https://github.com/slundberg/shap/issues/526.
        if len(shap_values[index].values.shape) == 2:  # binary classification
            return shap_values[index].values[:, 0]
        assert len(shap_values[index].values.shape) == 1, len(shap_values[index].values)
        return shap_values[index].values

    feature_importances = np.mean(
        [
            np.abs(_get_shap_values_one_sample(shap_values, i))
            for i in range(len(shap_values))
        ],
        axis=0,
    ).tolist()
    feature_names = input_data.columns.tolist()
    feature_importance_dict = dict(zip(feature_names, feature_importances))
    return feature_importance_dict
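# A self-contained variant of the mean-|SHAP| aggregation used above, assuming
# an XGBoost binary classifier on the adult dataset purely for illustration.
def _mean_abs_shap_example():
    import xgboost
    X, y = shap.datasets.adult()
    model = xgboost.XGBClassifier().fit(X, y)
    shap_values = shap.Explainer(model)(X.iloc[:100])
    vals = shap_values.values
    if vals.ndim == 3:  # (samples, features, classes): keep a single class
        vals = vals[:, :, 0]
    # mean absolute SHAP value per feature, as in get_feature_importance
    return dict(zip(X.columns, np.abs(vals).mean(axis=0)))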
def __init__(self, X, y, model, n_samples=1000):
    store_attr()
    model.fit(X, y)
    self.samples = samples = X.iloc[:n_samples]
    self.explainer = shap.Explainer(model, samples)
    self.shap_values = self.explainer(samples)
def shap_calc(model, X, approximate=False, return_explainer=False, verbose=0,
              sample_size=100, **shap_kwargs):
    """
    Helper function to calculate the Shapley values for a given model.

    Args:
        model (binary model): Trained model.
        X (pd.DataFrame or np.ndarray): Features set.
        approximate (boolean): if True, uses shap approximations - less accurate, but very fast.
        return_explainer (boolean): if True, returns a tuple (shap_values, explainer).
        verbose (int, optional): Controls verbosity of the output:

            - 0 - neither prints nor warnings are shown
            - 1 - 50 - only the most important warnings
            - 51 - 100 - shows other warnings and prints
            - above 100 - presents all prints and all warnings (including SHAP warnings).

        sample_size (int, optional): Number of samples drawn for the background data.
        **shap_kwargs: kwargs of the shap.Explainer.

    Returns:
        (np.ndarray or tuple(np.ndarray, shap.Explainer)):
            Shapley values for the model; optionally also returns the explainer.
    """
    # Suppress warnings regarding XGBoost and LightGBM models.
    with warnings.catch_warnings():
        if verbose <= 100:
            warnings.simplefilter("ignore")

        # Create the background data, required for non-tree-based models.
        # A single datapoint can be passed as the mask
        # (https://github.com/slundberg/shap/issues/955#issuecomment-569837201).
        if X.shape[0] < sample_size:
            sample_size = int(np.ceil(X.shape[0] * 0.2))
        mask = shap.utils.sample(X, sample_size)
        explainer = shap.Explainer(model, masker=mask, **shap_kwargs)

        # Calculate Shap values.
        shap_values = explainer.shap_values(X)

        if isinstance(shap_values, list) and len(shap_values) == 2:
            warnings.warn('Shap values are related to the output probabilities of class 1 for '
                          'this model, instead of log odds.')
            shap_values = shap_values[1]

    if return_explainer:
        return shap_values, explainer
    return shap_values
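# Hedged usage sketch for shap_calc, assuming a fitted random forest on the
# adult dataset for illustration; any binary model accepted by shap.Explainer
# would work the same way.
def _shap_calc_example():
    from sklearn.ensemble import RandomForestClassifier
    X, y = shap.datasets.adult()
    clf = RandomForestClassifier(n_estimators=10).fit(X, y)
    shap_values, explainer = shap_calc(clf, X, return_explainer=True, verbose=0)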
def explainer():
    df_new = dataLoader()

    # build a binary "gradient" target: 1 if the next y value is higher, else 0
    gradY_df = [0]
    for i in range(250):
        gradY_df.append(1 if df_new['y'][i + 1] > df_new['y'][i] else 0)
    gradY_df.append(0)

    df_new.insert(6, 'gradY_df', gradY_df)
    shap_df = df_new[['x1', 'x2', 'x3', 'x4', 'x5', 'gradY_df']]

    n_train_time = int(len(shap_df) * 0.90)
    train = shap_df[:n_train_time]
    test = shap_df[n_train_time:]
    train_x, train_y = train[['x1', 'x2', 'x3', 'x4', 'x5']], train[['gradY_df']]
    test_x, test_y = test[['x1', 'x2', 'x3', 'x4', 'x5']], test[['gradY_df']]
    yrr = shap_df[['gradY_df']].values.reshape(252,)

    # train an XGBoost model
    X = shap_df[['x1', 'x2', 'x3', 'x4', 'x5']]
    y = yrr
    model = xgboost.XGBRegressor().fit(X, y)

    # explain the model's predictions using SHAP
    # (same syntax works for LightGBM, CatBoost, scikit-learn, transformers, Spark, etc.)
    explainer = shap.Explainer(model)
    shap_values = explainer(X)

    # visualize the first prediction's explanation
    shap.plots.waterfall(shap_values[0], show=False)
    plt.savefig("static/img/img_number1.png")
    plt.close()

    shap.summary_plot(shap_values, X, show=False)
    plt.savefig("static/img/img_number2.png")
    plt.close()

    shap.summary_plot(shap_values, X, plot_type="bar", show=False)
    plt.savefig('static/img/img_number3.png')
    plt.close()

    shap.plots.bar(shap_values, show=False)
    plt.savefig('static/img/img_number4.png')
    plt.close()
def __init__(self, clf, X, X_df=None):
    self.clf = clf
    self.X = X
    self.explainer = shap.Explainer(clf.steps[-1][1])
    if X_df is None:
        self.X_df = DataReconstructor(self.X, self.clf).make()
    else:
        self.X_df = X_df
    self.shap_values = self.explainer(self.X_df)
def __init__(self, model):
    '''
    Only with BERT
    '''
    label2id = model.model.config.label2id
    labels = sorted(label2id, key=label2id.get)
    self.tokenizer = model.tokenizer
    self.predict_proba = lambda s: model.predict_proba_batch(s)
    self.exp = shap.Explainer(
        self.predict_proba, self.tokenizer, output_names=labels)
    self.model = model
def shap_plots(model, train_features, test_features, test_labels):
    print("Computing shapley values..")
    # compute SHAP values
    if isinstance(
            model,
            (MLP, MLPRegressor, MLPClassifier, ElasticNet, LogisticRegression)):
        train_sample = shap.sample(train_features, 10)
        explainer = shap.Explainer(model.predict, train_sample)
    elif isinstance(model, (RandomForestRegressor, RandomForestClassifier)):
        explainer = shap.TreeExplainer(model, train_features)
    else:
        explainer = shap.Explainer(model, train_features)
    shap_values = explainer(test_features)

    shap.plots.bar(shap_values, max_display=10)
    # shap.plots.bar(shap_values[0])  # Local

    # beeswarm plot
    shap.plots.beeswarm(shap_values)

    # Decision plot
    expected_value = explainer.expected_value
    select = range(20)
    features_sample = test_features.iloc[select]
    shap.decision_plot(expected_value, explainer.shap_values(features_sample),
                       features_sample)

    # Heatmap
    shap.plots.heatmap(shap_values, max_display=10)

    # Scatter
    shap.plots.scatter(shap_values[:, "hs_child_age_None"],
                       color=shap_values,
                       alpha=0.8)

    # Feature clustering (redundant feature detection)
    clustering = shap.utils.hclust(
        test_features, test_labels
    )  # by default this trains (X.shape[1] choose 2) 2-feature XGBoost models
    shap.plots.bar(shap_values, clustering=clustering, clustering_cutoff=0.5)
def pickle(target, title, max_depth=3, n_esti=160, lr=0.1, withexperience=False, color='YlGnBu'):
    matrics = []
    seed(2145)
    df_small = df_model_draft[df_model_draft['surgyear'].isin([2015])]
    print(df_small.shape)
    groups = df_small['HospID']
    print(groups)
    if withexperience is False:
        X = df_small.drop(
            ['SiteID', 'surgid', 'Complics', 'STSRCHOSPD', 'STSRCOM'], axis=1)
        y = df_small[target]
    else:
        X = df_small.drop(
            ['SiteID', 'surgid', 'Complics', 'STSRCHOSPD', 'STSRCOM', 'HospID_Reop_CABG',
             'HospID_total_CABG', 'surgyear', 'HospID_total_cardiac_surgery',
             'surgid_total_cardiac_surgery', 'surgid_total_CABG', 'surgid_Reop_CABG'], axis=1)
        y = df_small[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    sm = SMOTE()  # SVMSMOTE(random_state=21)
    # fit and apply the transform
    X_over, y_over = sm.fit_resample(X_train, y_train)
    # summarize class distribution
    print("after oversampling")
    counter = Counter(y_over)
    print(counter)
    estimate = counter[0] / counter[1]
    print('Estimate: %.3f' % estimate)

    model = XGBClassifier(objective='binary:logistic', eval_metric='logloss',
                          max_depth=max_depth, learning_rate=lr, n_estimators=n_esti)
    model.fit(X_over, y_over)
    y_pred = model.predict(X_test)

    explainer = shap.Explainer(model)
    shap_values = explainer(X)
    # summarize feature effects across all predictions
    shap.plots.beeswarm(shap_values, max_display=50, show=False)
    shaptitle = path + 'SHAP ' + title + '.png'
    print(shaptitle)
    plt.savefig(shaptitle, bbox_inches='tight')
    # plt.show()

    auc = roc_auc_score(y_test, model.predict_proba(X_test.values)[:, 1])
    cm = confusion_matrix(y_test, y_pred)
    mats = Make_Confusion_Matrix(cm, roc=auc, categories=categories, cmap=color, title=title,
                                 group_names=labels, y_pred=y_pred, y_test=y_test)
    mats['AUROC'] = auc
    matrics.append(mats)
    return matrics, title
def test_wrapping_for_topk_lm_model():
    """ This tests using the Explainer class to auto wrap a masker in a language modelling scenario.
    """
    transformers = pytest.importorskip("transformers")

    tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
    model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
    wrapped_model = shap.models.TopKLM(model, tokenizer)
    masker = shap.maskers.Text(tokenizer, mask_token="...")

    explainer = shap.Explainer(wrapped_model, masker, seed=1)

    assert shap.utils.safe_isinstance(explainer.masker, "shap.maskers.FixedComposite")
def _fixed_mode_shap_vals(self, X, to_fix, func, masker):
    """
    Helper stub to compute shap values in single-modal mode
    where one sample is fixed
    """
    assert len(X) == len(to_fix)
    out = []
    explainer = shap.Explainer(func, masker, algorithm=self.algorithm)
    last_shape = X[0].shape
    # loop through samples
    for i in range(len(X)):
        self._curr_fixed = to_fix[i]
        if self._mode == "fix_text" and X[i].shape != last_shape:
            # reinitialise the explainer with a new masker if the image shape changed
            masker = shap.maskers.Image("inpaint_telea", X[i].shape)
            explainer = shap.Explainer(func, masker, algorithm=self.algorithm)
            last_shape = X[i].shape
        if self._mode == "fix_image" and self.algorithm == "permutation":
            # Workaround: if the given max_evals was too low, increase it to an acceptable value
            self.max_evals = self._min_acceptable_evals(
                len(self.tokenizer.tokenize(X[i]))
            )
        values = explainer(
            X[i : i + 1], max_evals=self.max_evals, batch_size=self.batch_size
        )
        out.append(values[0])  # select the first (and only) element
    return out
def test_load_pyfunc(tmpdir):
    X, y = shap.datasets.boston()
    model = sklearn.ensemble.RandomForestRegressor(n_estimators=100)
    model.fit(X, y)

    explainer_original = shap.Explainer(model.predict, X, algorithm="permutation")
    shap_values_original = explainer_original(X[:2])
    path = tmpdir.join("pyfunc_test").strpath
    mlflow.shap.save_explainer(explainer_original, path)

    explainer_pyfunc = mlflow.shap._load_pyfunc(path)
    shap_values_new = explainer_pyfunc.predict(X[:2])

    np.testing.assert_allclose(shap_values_original.values, shap_values_new, rtol=100, atol=100)
def main():
    df = 'features_gfa_cts.csv'
    save = './'
    seed = 22478
    model = 'xgboost_model.pickle'

    os.makedirs(save, exist_ok=True)

    with open(model, 'rb') as f:
        model = pickle.load(f)

    # Setting seed for reproducibility
    np.random.seed(seed)
    np.random.RandomState(seed)
    model.random_state = seed

    df = pd.read_csv(df)
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    y = df['gfa'].values
    y = [1 if i == 'good' else 0 for i in y]

    feats = df.drop(['comp', 'gfa'], axis=1)
    X = feats.values

    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)

    feat_names = np.array(feats.columns)
    names = [label[i] for i in list(feats.columns)]

    model.fit(X, y)
    pickle.dump(model, open(save + '/xgboost_model.pickle', 'wb'))

    explainer = shap.Explainer(model, X, feature_names=names)
    ranks = explainer(X)

    with open(save + '/shap.pickle', 'wb') as f:
        pickle.dump(ranks, f)

    shap.summary_plot(ranks, X, class_names=['Poor GFA', 'Good GFA'], show=False)
    pl.savefig(save + '/shap')
    pl.close('all')
def test_serialization_permutation_no_model_or_masker():
    import tempfile
    import numpy as np
    import shap
    import xgboost

    # get a dataset on income prediction
    X, y = shap.datasets.adult()

    # train an XGBoost model (but any other model type would also work)
    model = xgboost.XGBClassifier()
    model.fit(X, y)

    explainer_original = shap.Explainer(model.predict_proba, X, algorithm='permutation')
    shap_values_original = explainer_original(X[:1])

    temp_serialization_file = tempfile.TemporaryFile()

    # Serialization without the model or masker
    explainer_original.model.save = None
    explainer_original.masker.save = None
    explainer_original.save(temp_serialization_file)

    temp_serialization_file.seek(0)

    # Deserialization
    explainer_new = shap.Explainer.load(temp_serialization_file)

    temp_serialization_file.close()

    # manually insert the model and masker
    explainer_new.model = explainer_original.model
    explainer_new.masker = explainer_original.masker

    shap_values_new = explainer_new(X[:1])

    for i in range(len(explainer_original.masker.feature_names)):
        assert explainer_original.masker.feature_names[i] == explainer_new.masker.feature_names[i]
    assert np.array_equal(shap_values_original.base_values, shap_values_new.base_values)
    assert type(explainer_original) == type(explainer_new)
    assert type(explainer_original.masker) == type(explainer_new.masker)
def grid_search_fit_svc(self, c=None, scale=False):
    if scale:
        s = StandardScaler()
        s.fit(self.x_train)
        self.x_train = s.transform(self.x_train)
        self.x_pool = s.transform(self.x_pool)
        self.x_test = s.transform(self.x_test)
    if c is None:
        c = [0.8, 1, 2]
    max_iter = 1000
    best_f1 = 0
    for c_option in c:
        m = SVC(max_iter=max_iter, C=c_option, kernel='linear',
                class_weight='balanced', probability=True)
        m.fit(self.x_train, self.y_train)
        predictions = m.predict(self.x_test)
        f1 = f1_score(self.y_test, predictions)
        if f1 > best_f1:
            self.model = m
            best_f1 = f1

    pred = self.model.predict(self.x_test)
    print("F1 score on test set ", f1_score(self.y_test, pred))
    print("Confusion matrix on test set ", confusion_matrix(self.y_test, pred))
    print("Accuracy test set", accuracy_score(self.y_test, pred))

    pred = self.model.predict(self.x_pool)
    print("F1 score on pool ", f1_score(self.y_pool, pred))
    print("Confusion matrix of final model on pool ", confusion_matrix(self.y_pool, pred))
    print("Accuracy of final model on pool", accuracy_score(self.y_pool, pred))

    explainer = shap.Explainer(self.model, self.x_train, feature_perturbation="independent")
    # TODO extract feature importance value of each feature
    self.shap_values_train = explainer.shap_values(self.x_train)
    self.shap_values_pool = explainer.shap_values(self.x_pool)
    feature_names = np.array(
        self.tfid.get_feature_names())  # len(feature_names) = #cols in shap_values_pool
    shap.summary_plot(self.shap_values_train, self.x_train, feature_names=feature_names)
    return self.model, explainer
def test_wrapping_for_text_to_text_teacher_forcing_model():
    """ This tests using the Explainer class to auto wrap a masker in a text to text scenario.
    """
    transformers = pytest.importorskip("transformers")

    def f(x):  # pylint: disable=unused-argument
        pass

    tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
    model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
    wrapped_model = shap.models.TeacherForcing(f, similarity_model=model, similarity_tokenizer=tokenizer)
    masker = shap.maskers.Text(tokenizer, mask_token="...")

    explainer = shap.Explainer(wrapped_model, masker, seed=1)

    assert shap.utils.safe_isinstance(explainer.masker, "shap.maskers.OutputComposite")
def test_single_class_independent_auto_api():
    xgboost = pytest.importorskip('xgboost')

    # get a dataset on income prediction
    X, y = shap.datasets.adult()
    X = X.iloc[:100]
    y = y[:100]

    # train an XGBoost model (but any other model type would also work)
    model = xgboost.XGBClassifier()
    model.fit(X, y)

    # build a permutation explainer and explain the model predictions on the given dataset
    explainer = shap.Explainer(model.predict, X, algorithm="permutation")
    shap_values = explainer(X)

    # additivity: base values plus per-feature attributions should recover the predictions
    assert np.max(np.abs(shap_values.base_values + shap_values.values.sum(1) - model.predict(X[:100]))) < 1e-6
def Explain_model(Model, X_train):
    '''Get SHAP values for linear models.'''
    explainer = shap.Explainer(Model, X_train, feature_names=X_train.columns)
    shap_values = explainer(X_train)

    # Plot the explanation of the second sample
    fig = shap.plots.waterfall(shap_values[1], show=False)
    plt.savefig('scratch.png')

    # Importance values: mean absolute SHAP value per feature across all samples
    vals = np.abs(shap_values.values).mean(0)
    feature_importance = pd.DataFrame(
        list(zip(X_train.columns, vals)),
        columns=['col_name', 'feature_importance_vals'])
    feature_importance.sort_values(by=['feature_importance_vals'],
                                   ascending=False,
                                   inplace=True)
    print(feature_importance)
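# Hedged usage sketch for Explain_model, assuming a scikit-learn linear
# regression on the diabetes dataset; the dataset choice is an illustration,
# not taken from the original code.
def _explain_model_example():
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import LinearRegression
    data = load_diabetes(as_frame=True)
    X_train = data.data
    model = LinearRegression().fit(X_train, data.target)
    Explain_model(model, X_train)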
def test_raw_function():
    """ Make sure passing a simple masking function works.
    """
    X, _ = shap.datasets.boston()

    def test(X):
        return np.sum(X, 1)

    def custom_masker(mask, x):
        # just zero out the features we are masking
        return (x * mask).reshape(1, len(x))

    explainer = shap.Explainer(test, custom_masker)
    shap_values = explainer(X[:100])

    assert np.var(shap_values.values - shap_values.data) < 1e-6
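# A hedged variant of the custom masker above: instead of zeroing masked
# features, fill them with the per-feature means of a background sample. The
# background choice is an illustrative assumption, not part of the original test.
def make_mean_fill_masker(background):
    means = np.asarray(background).mean(axis=0)

    def mean_fill_masker(mask, x):
        # keep unmasked features, replace masked ones with the background means
        return (x * mask + means * (1 - mask)).reshape(1, len(x))

    return mean_fill_masker

# explainer = shap.Explainer(test, make_mean_fill_masker(X[:100]))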