def dependence(shap_values, X_vald, model_file_path, learner_name, file_postfix=""):
    """Save a grid of SHAP dependence plots, one per importance rank.

    At most nine plots are drawn (fewer when ``X_vald`` has fewer columns),
    laid out three per row. Plot ``i`` asks shap for the feature at
    ``rank(i)`` and is titled ``Importance #i+1``. The figure is written into
    ``model_file_path`` under the name
    ``{learner_name}_shap_dependence{file_postfix}.png`` and afterwards every
    open matplotlib figure is closed.

    Warnings emitted while plotting and saving are silenced.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        figure = plt.figure(figsize=(14, 7))
        n_plots = np.min([9, X_vald.shape[1]])
        n_cols = 3

        # One row per started group of three plots, never more than three rows.
        if n_plots < 4:
            n_rows = 1
        elif n_plots < 7:
            n_rows = 2
        else:
            n_rows = 3

        for rank in range(n_plots):
            axis = figure.add_subplot(n_rows, n_cols, rank + 1)
            shap.dependence_plot(
                f"rank({rank})",
                shap_values,
                X_vald,
                show=False,
                title=f"Importance #{rank+1}",
                ax=axis,
            )

        figure.tight_layout(pad=2.0)
        target_file = os.path.join(
            model_file_path, f"{learner_name}_shap_dependence{file_postfix}.png"
        )
        figure.savefig(target_file)
        plt.close("all")
def test_lightgbm_binary():
    """Check the SHAP value structure for a binary LightGBM classifier.

    The test is skipped (prints a message and returns) when ``lightgbm``
    cannot be imported. Otherwise it trains an ``LGBMClassifier`` on the
    adult dataset, asserts that ``TreeExplainer.shap_values`` returns a list
    with one entry per class, and makes sure a dependence plot can be drawn
    for the first class.
    """
    try:
        import lightgbm
    except Exception:
        # lightgbm is an optional dependency. ``Exception`` (rather than a
        # bare ``except``) still covers loader errors from a broken install
        # but no longer swallows KeyboardInterrupt / SystemExit.
        print("Skipping test_lightgbm_binary!")
        return
    import shap
    from sklearn.model_selection import train_test_split

    # train lightgbm model
    X_train, X_test, Y_train, Y_test = train_test_split(
        *shap.datasets.adult(), test_size=0.2, random_state=0
    )
    model = lightgbm.sklearn.LGBMClassifier()
    model.fit(X_train, Y_train)

    # explain the model's predictions using SHAP values
    shap_values = shap.TreeExplainer(model).shap_values(X_test)

    # validate structure of shap values, must be a list of ndarray for both classes
    assert isinstance(shap_values, list)
    assert len(shap_values) == 2

    # ensure plot works for first class
    shap.dependence_plot(0, shap_values[0], X_test, show=False)
def plot_shap(model, test, instance=None, feature=None, dataset=False):
    """
    Draw SHAP plots that explain the predictions of a fitted model.

    :param model: the fitted model to explain; it is handed to ``TreeExplainer``,
        so only models that explainer supports will work.
    :param test: test dataset the SHAP values are computed on.
    :param instance: row position in ``test`` to explain with a force plot.
        default_value=None
    :param feature: feature of ``test`` to explain with a dependence plot.
        default_value=None
    :param dataset: if True, summary plots (default and bar) are drawn for the
        whole dataset. default_value=False
    :return: None
    """
    tree_explainer = TreeExplainer(model)
    contributions = tree_explainer.shap_values(test)
    initjs()

    # Single prediction: force plot rendered with matplotlib.
    if instance is not None:
        force_plot(
            tree_explainer.expected_value,
            contributions[instance, :],
            test.iloc[instance, :],
            matplotlib=True,
        )

    # Single feature: dependence plot on a dedicated axes.
    if feature is not None:
        _, axis = plt.subplots(figsize=(13, 10))
        dependence_plot(feature, contributions, test, ax=axis)

    # Whole dataset: default summary plot followed by the bar variant.
    if dataset:
        for extra in ({}, {"plot_type": "bar"}):
            summary_plot(contributions, test, plot_size=(8, 8), **extra)
def test_front_page_sklearn():
    """Run the front-page example against two scikit-learn tree ensembles.

    For a random forest and an extra-trees regressor fitted on the Boston
    data, this exercises the force, dependence and summary plots.
    """
    import sklearn.ensemble
    import shap

    # Enable the JS visualisation code for notebooks.
    shap.initjs()

    X, y = shap.datasets.boston()
    estimator_types = (
        sklearn.ensemble.RandomForestRegressor,
        sklearn.ensemble.ExtraTreesRegressor,
    )
    models = [estimator(n_estimators=100) for estimator in estimator_types]

    for model in models:
        model.fit(X, y)

        # Explain the model's predictions with SHAP values.
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X)
        base_value = explainer.expected_value

        # Force plot for the first prediction, then for the whole training set.
        shap.force_plot(base_value, shap_values[0, :], X.iloc[0, :])
        shap.force_plot(base_value, shap_values, X)

        # Dependence plots addressed by position and by column name.
        for feature in (5, "RM"):
            shap.dependence_plot(feature, shap_values, X, show=False)

        # Summary of the effect of every feature.
        shap.summary_plot(shap_values, X, show=False)
def test_front_page_xgboost():
    """Run the front-page XGBoost example using the booster's own contributions.

    SHAP values are taken from ``predict(..., pred_contribs=True)`` and fed
    to the force, dependence and summary plots.
    """
    import xgboost
    import shap

    # Enable the JS visualisation code for notebooks.
    shap.initjs()

    # Fit an XGBoost model on the Boston data.
    X, y = shap.datasets.boston()
    training_matrix = xgboost.DMatrix(X, label=y)
    bst = xgboost.train({"learning_rate": 0.01}, training_matrix, 100)

    # Explain the predictions via prediction contributions
    # (use pred_contrib in LightGBM).
    shap_values = bst.predict(xgboost.DMatrix(X), pred_contribs=True)

    # Force plot for the first prediction, then for the whole training set.
    shap.force_plot(shap_values[0, :], X.iloc[0, :])
    shap.force_plot(shap_values, X)

    # Dependence plots addressed by position and by column name.
    for feature in (5, "RM"):
        shap.dependence_plot(feature, shap_values, X, show=False)

    # Summary of the effect of every feature.
    shap.summary_plot(shap_values, X, show=False)
def shap_dependence_plot(self, ind, interaction_index, interaction=False):
    """Draw a SHAP dependence plot and save it through ``save_fig``.

    Args:
        ind: feature passed to ``shap.dependence_plot`` as the plotted feature.
        interaction_index: feature used as the interaction partner.
        interaction: when False, plot the stored SHAP values ``self.shap_v``
            with ``interaction_index`` as the colouring feature; when True,
            plot the SHAP interaction values returned by
            ``self.calc_shap_inter_values()`` for the pair
            ``(ind, interaction_index)``.

    Returns:
        Whatever ``save_fig`` returns for the generated figure name
        (presumably the saved file path - confirm against ``save_fig``).

    Raises:
        Exception: wraps any error raised while plotting or saving; the
            original error is logged and kept as ``__cause__``.
    """
    try:
        if not interaction:
            shap.dependence_plot(ind=ind, interaction_index=interaction_index,
                                 shap_values=self.shap_v,
                                 features=self.x_train,
                                 display_features=self.x_train,
                                 show=False)
            name_prefix = 'dependence_plot'
        else:
            # Only the interaction values are needed from the helper's result.
            _, shap_inter_values, _ = self.calc_shap_inter_values()
            shap_inter_values = np.array(shap_inter_values)
            shap.dependence_plot((ind, interaction_index), shap_inter_values,
                                 features=self.x_train,
                                 display_features=self.x_train,
                                 show=False)
            name_prefix = 'inter_dependence'

        # Whole-second timestamp keeps successive figure names distinct.
        fig_id = str(time.time()).split('.')[0]
        return save_fig('{}_{}_{}'.format(name_prefix, ind, fig_id))
    except Exception as err:
        print('Error: model is not supported by SHAP dependence plot')
        err_logging(err)
        # Chain the cause so the original traceback is not lost.
        raise Exception(err) from err
def test_front_page_xgboost():
    """Run the front-page XGBoost example through ``TreeExplainer``.

    Skipped (message printed, early return) when ``xgboost`` cannot be
    imported.
    """
    try:
        import xgboost
    except Exception:
        print("Skipping test_front_page_xgboost!")
        return
    import shap

    # Enable the JS visualisation code for notebooks.
    shap.initjs()

    # Fit an XGBoost model on the Boston data.
    X, y = shap.datasets.boston()
    training_matrix = xgboost.DMatrix(X, label=y)
    model = xgboost.train({"learning_rate": 0.01}, training_matrix, 100)

    # Explain the model's predictions with SHAP values.
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    base_value = explainer.expected_value

    # Force plot for the first prediction, then for the whole training set.
    shap.force_plot(base_value, shap_values[0, :], X.iloc[0, :])
    shap.force_plot(base_value, shap_values, X)

    # Dependence plots addressed by position and by column name.
    for feature in (5, "RM"):
        shap.dependence_plot(feature, shap_values, X, show=False)

    # Summary of the effect of every feature.
    shap.summary_plot(shap_values, X, show=False)
def test_front_page_xgboost():
    """Run the front-page XGBoost example through ``GPUTreeExplainer``.

    Skipped by pytest when ``xgboost`` is not installed.
    """
    xgboost = pytest.importorskip("xgboost")

    # Enable the JS visualisation code for notebooks.
    shap.initjs()

    # Fit an XGBoost model on 500 points of the California data.
    X, y = shap.datasets.california(n_points=500)
    training_matrix = xgboost.DMatrix(X, label=y)
    model = xgboost.train({"learning_rate": 0.01}, training_matrix, 100)

    # Explain the model's predictions with SHAP values.
    explainer = shap.GPUTreeExplainer(model)
    shap_values = explainer.shap_values(X)
    base_value = explainer.expected_value

    # Force plot for the first prediction, then for the whole training set.
    shap.force_plot(base_value, shap_values[0, :], X.iloc[0, :])
    shap.force_plot(base_value, shap_values, X)

    # Dependence plots addressed by position and by column name.
    for feature in (5, "Longitude"):
        shap.dependence_plot(feature, shap_values, X, show=False)

    # Summary of the effect of every feature.
    shap.summary_plot(shap_values, X, show=False)
def test_random_dependence():
    """Draw a dependence plot of random data onto a caller-supplied axes."""
    _, axis = plt.subplots(1, 1)
    # SHAP values are drawn first, then the feature matrix (both 20 x 5).
    random_shap = np.random.randn(20, 5)
    random_features = np.random.randn(20, 5)
    shap.dependence_plot(0, random_shap, random_features, show=False, ax=axis)
def test_random_dependence():
    """ Check that a dependence plot of random data does not crash.
    """
    # SHAP values are drawn first, then the feature matrix (both 20 x 5).
    random_shap = np.random.randn(20, 5)
    random_features = np.random.randn(20, 5)
    shap.dependence_plot(0, random_shap, random_features, show=False)
def shapear(test):
    """ Explain features

    Loads the pickled model for the module-level ``target``, computes SHAP
    values for ``test`` with ``TreeExplainer``, writes the mean and standard
    deviation of the absolute SHAP value per feature to ``shaps.csv`` (sorted
    by decreasing mean), and saves one dependence plot per feature to
    ``<target>_shap.pdf`` in order of decreasing total absolute SHAP value.

    Note: the ``target`` column is deleted from ``test`` in place.
    """
    del test[target]

    # Open model
    # NOTE(review): unpickling executes arbitrary code - only load model files
    # that come from a trusted source.
    with open(target + '_model.pkl', 'rb') as f:
        model = pickle.load(f)

    shap.initjs()
    shap_values = shap.TreeExplainer(model).shap_values(test)
    abs_shap = np.abs(shap_values)

    # Global importance table
    df = pd.DataFrame()
    df['features'] = test.columns
    df['shap'] = abs_shap.mean(0)
    df['shap_std'] = abs_shap.std(0)
    df = df.sort_values(by='shap', ascending=False)
    df.index = range(len(df))
    df.to_csv('shaps.csv')

    # Dependence plots, most important feature first
    top_inds = np.argsort(-np.sum(abs_shap, 0))
    # The context manager closes the PDF even if a plot raises.
    with PdfPages(target + '_shap.pdf') as pdf_shap:
        # top_inds already holds feature indices; indexing it again with its
        # own elements (as the previous code did) scrambled the page order.
        for feature_idx in top_inds:
            plt.figure()
            shap.dependence_plot(feature_idx, shap_values, test, show=False,
                                 interaction_index=None, alpha=0.2)
            pdf_shap.savefig()
            plt.close()
def plot_shap_dependence(self, treatment_group, feature_idx, shap_dict=None, interaction_idx='auto', **kwargs):
    """
    Plot the SHAP dependence of one feature, coloured by an interaction feature.

    The feature's value is on the x-axis and the SHAP value of that same
    feature on the y-axis, showing how the model depends on the feature (a
    richer extension of the classical partial dependence plot). Vertical
    dispersion of the points represents interaction effects. When
    ``shap_dict`` is supplied the SHAP computation is skipped.

    Args:
        treatment_group (str or int): name of treatment group to create dependency plot on
        feature_idx (str or int): feature index / name to create dependency plot on
        shap_dict (optional, dict): a dict of shapley value matrices. If None, it is
            computed with ``self.get_shap_values()``.
        interaction_idx (optional, str or int): feature index / name used in the colouring
            scheme as interaction feature. If "auto" then shap.common.approximate_interactions
            is used to pick what seems to be the strongest interaction (note that to find
            the true strongest interaction you need to compute the SHAP interaction values).
        **kwargs: forwarded unchanged to ``shap.dependence_plot``.
    """
    group_values = (self.get_shap_values() if shap_dict is None else shap_dict)[treatment_group]

    shap.dependence_plot(
        feature_idx,
        group_values,
        self.X,
        interaction_index=interaction_idx,
        feature_names=self.features,
        **kwargs
    )
def test_random_dependence_no_interaction():
    """ Check that a dependence plot does not crash when no interaction is shown.
    """
    # SHAP values are drawn first, then the feature matrix (both 20 x 5).
    random_shap = np.random.randn(20, 5)
    random_features = np.random.randn(20, 5)
    shap.dependence_plot(0, random_shap, random_features, show=False, interaction_index=None)
def SHAP_DepenContrib(X,feature_names,feature,shap_values,interact='auto'):
    """Draw a SHAP dependence contribution plot for one feature.

    The plot uses ``shap_values[1]`` and is left unshown (``show=False``) so
    the caller can save or display the figure.
    """
    import shap

    print('Creating SHAP dependence contribution plot')

    plot_options = dict(
        features=X,
        feature_names=feature_names,
        show=False,
        interaction_index=interact,
    )
    shap.dependence_plot(feature, shap_values[1], **plot_options)
def test_random_dependence_no_interaction():
    """Draw an interaction-free dependence plot of random data onto a given axes."""
    _, axis = plt.subplots(1, 1)
    # SHAP values are drawn first, then the feature matrix (both 20 x 5).
    random_shap = np.random.randn(20, 5)
    random_features = np.random.randn(20, 5)
    shap.dependence_plot(0, random_shap, random_features, show=False,
                         interaction_index=None, ax=axis)
def generate_shap_dependency_plot(self):
    """Save one SHAP dependence plot per feature of the training data.

    SHAP values are computed once with ``self.shapexplainer`` on
    ``self.training_data``. Each feature's plot is written to
    ``shap_dependency_plot_<feature name>.png`` in the current working
    directory, and its figure is closed afterwards.
    """
    shap_values = self.shapexplainer.shap_values(self.training_data)
    plt.clf()
    for ftr, feature_name in enumerate(self.feature_names):
        shap.dependence_plot(ftr, shap_values, self.training_data,
                             feature_names=self.feature_names, show=False)
        plt.savefig('shap_dependency_plot_' + feature_name + '.png')
        # Close the figure just saved; otherwise one open figure per feature
        # accumulates for the lifetime of the process.
        plt.close()
def show_SHAP_PDP_interaction(self, features=()):
    """Show SHAP interaction dependence plots for pairs of features.

    Args:
        features: iterable of feature pairs; each pair is converted to a
            tuple and passed to ``shap.dependence_plot`` together with the
            SHAP interaction values of ``self.X_train``.

    If computing the interaction values or plotting raises, a message is
    printed and the remaining pairs are skipped.
    """
    # Computed lazily on the first pair and reused afterwards: the values do
    # not depend on the pair, and nothing is computed for an empty input.
    interaction_values = None
    for pair in features:
        try:
            if interaction_values is None:
                interaction_values = self.explainer.shap_interaction_values(self.X_train)
            shap.dependence_plot(tuple(pair), interaction_values, self.X_train)
        except Exception:
            # Best-effort by design; ``Exception`` instead of a bare except
            # so KeyboardInterrupt / SystemExit still propagate.
            print("Linear estimators don't have interaction values.")
            return
def test_xgboost_mixed_types():
    """Dependence plot on a frame mixing float, integer and boolean columns.

    Skipped by pytest when ``xgboost`` is not installed.
    """
    xgboost = pytest.importorskip('xgboost')

    X, y = shap.datasets.california(n_points=500)
    # Make one column integer-typed and add a boolean column derived from it.
    X["HouseAge"] = X["HouseAge"].astype(np.int64)
    X['IsOld'] = (X['HouseAge'] > 30)

    booster_params = {"learning_rate": 0.01, "silent": 1}
    training_matrix = xgboost.DMatrix(X, label=y)
    bst = xgboost.train(booster_params, training_matrix, 1000)

    explainer = shap.TreeExplainer(bst)
    shap_values = explainer.shap_values(X)
    shap.dependence_plot(0, shap_values, X, show=False)
def plot_print_feature_shap(model_path, data_feats, type):
    '''
    Plot feature importance with SHAP for a saved LightGBM model.

    :param model_path: path of the LightGBM model file.
    :param data_feats: path of the svmlight-format feature file.
    :param type: plot selector. 1: summary plots (bar chart and default);
        2: dependence plot of 'feat3name' without interaction colouring;
        3: dependence plot of 'feat3name' coloured by 'feat5name';
        4: summary plot of the SHAP interaction values.
    :return: None. The process exits if either path does not exist.
    '''
    if not os.path.exists(model_path) or not os.path.exists(data_feats):
        print("file no exists! {}, {}".format(model_path, data_feats))
        sys.exit(0)

    gbm = lgb.Booster(model_file=model_path)
    gbm.params["objective"] = "regression"

    # Feature column names
    feats_col_name = ['feat' + str(feat_index) + 'name' for feat_index in range(46)]

    # Load the features into a DataFrame and attach the header
    X_train, _ = ds.load_svmlight_file(data_feats)
    df_feature = pd.DataFrame(X_train.todense())
    df_feature.columns = feats_col_name
    feature_frame = df_feature[feats_col_name]

    explainer = shap.TreeExplainer(gbm)
    shap_values = explainer.shap_values(feature_frame)

    if type == 1:
        # Overall feature analysis, drawn as a bar chart and as a scatter plot.
        # The bar chart uses the mean absolute impact of a feature on the
        # target as its importance (unlike feature_importance).
        shap.summary_plot(shap_values, feature_frame, plot_type="bar")
        shap.summary_plot(shap_values, feature_frame)
    elif type == 2:
        # Partial-dependence style plot; unlike a classical one, the y-axis
        # is the SHAP value rather than the target value.
        shap.dependence_plot('feat3name', shap_values, feature_frame,
                             interaction_index=None, show=True)
    elif type == 3:
        # Effect of one variable on the target under interaction with another.
        shap.dependence_plot('feat3name', shap_values, feature_frame,
                             interaction_index='feat5name', show=True)
    elif type == 4:
        # Analysis of the interactions among several variables.
        shap_interaction_values = explainer.shap_interaction_values(feature_frame)
        shap.summary_plot(shap_interaction_values, feature_frame,
                          max_display=4, show=True)
def test_xgboost_mixed_types():
    """Dependence plot on a frame mixing float, integer and boolean columns.

    Skipped by pytest when ``xgboost`` is not installed.
    """
    xgboost = pytest.importorskip('xgboost')
    X, y = shap.datasets.boston()
    X["LSTAT"] = X["LSTAT"].astype(np.int64)
    # The ``np.bool`` alias was deprecated in NumPy 1.20 and later removed;
    # the builtin ``bool`` is the equivalent dtype.
    X["B"] = X["B"].astype(bool)
    bst = xgboost.train({"learning_rate": 0.01, "silent": 1},
                        xgboost.DMatrix(X, label=y), 1000)
    shap_values = shap.TreeExplainer(bst).shap_values(X)
    shap.dependence_plot(0, shap_values, X, show=False)
def plot_shap_values(
    shap_values: dict,
    raw_data: pd.core.frame.DataFrame,
    processed_data: Union[None, pd.core.frame.DataFrame] = None,
    no_summary_col: Union[None, str] = None,
    alpha: float = 0.5,
    path: str = "",
) -> None:
    """Make plots of SHAP values.

    SHAP values quantify feature contributions to predictions. For every
    outcome in ``shap_values`` three plots are produced and handed to
    ``save_plot``: a bar importance plot, a summary plot, and a dependence
    plot of the top-ranked feature.

    Args:
        shap_values: A dictionary of numpy arrays, each of which contains
            SHAP values for the outcome given by its key.
        raw_data: Feature values prior to processing into model input. Used
            for feature names and as the displayed values.
        processed_data: Feature values used as model input. Defaults to
            ``raw_data``. Categorical columns are replaced by their integer
            codes on a copy; the caller's frame is left untouched.
        no_summary_col: The name of a column never to use for the dependence
            plot. If it is the feature with the largest mean absolute SHAP
            value, the second-ranked feature is plotted instead.
        alpha: The opacity of plotted points (1 is opaque).
        path: The path passed to ``save_plot`` for each saved plot.
    """
    shap.initjs()
    if processed_data is None:
        processed_data = raw_data

    # Encode categoricals on a copy: without it ``raw_data`` itself would be
    # overwritten whenever ``processed_data`` is (or defaults to) the same
    # object, and the displayed values would turn into codes.
    category_cols = list(processed_data.select_dtypes("category"))
    if category_cols:
        processed_data = processed_data.copy()
        for col in category_cols:
            processed_data[col] = processed_data[col].cat.codes

    for key, arr in shap_values.items():
        shap.summary_plot(arr, plot_type="bar",
                          feature_names=raw_data.columns, show=False)
        save_plot(f"Importance_{key}", path=path)

        shap.summary_plot(arr, raw_data, alpha=alpha, show=False)
        save_plot(f"Summary_{key}", path=path)

        # Fall back to the runner-up when the top feature is excluded.
        top_feature = raw_data.columns[np.argmax(np.abs(arr).mean(axis=0))]
        rank = "rank(1)" if top_feature == no_summary_col else "rank(0)"
        shap.dependence_plot(
            rank,
            arr,
            processed_data,
            display_features=raw_data,
            alpha=alpha,
            show=False,
        )
        save_plot(f"Dependence_{key}", path=path)
def plot_shap(X, model, label):
    """Show a SHAP summary plot and per-column dependence plots for a model.

    SHAP values are computed with ``TreeExplainer`` on ``X.values``. A
    summary plot is shown first, followed by one dependence plot (without
    interaction colouring) for every column of ``X`` except
    'PATIENT_AGE_YEARS' and 'PATIENT_GNDR'.

    Args:
        X: DataFrame of features.
        model: fitted model handed to ``shap.TreeExplainer``.
        label: text inserted into the plot titles.
    """
    shap_values = shap.TreeExplainer(model).shap_values(X.values)

    pyplot.figure()  # figsize=(10, 15))
    title = f'SHAP summary {label}'
    pyplot.title(title)
    shap.summary_plot(shap_values, X)
    pyplot.show()

    # This title used to be built and then dropped; pass it to each plot.
    title = f'SHAP dependence for {label}'
    for col in X.columns.difference(['PATIENT_AGE_YEARS', 'PATIENT_GNDR']):
        shap.dependence_plot(col, shap_values, X, interaction_index=None,
                             title=title)
def makeDependence(X_train, shap_values):
    """Save one SHAP dependence plot per column of ``X_train``.

    Each plot uses ``shap_values[1]`` (presumably the values of the second
    class - confirm against the caller) and is written to
    ``<column>_dependence.png`` in the current working directory.

    Args:
        X_train: DataFrame whose columns are plotted one by one.
        shap_values: indexable collection of SHAP value arrays.
    """
    # The previous inner loop, ``for i in len(shap.values)``, raised before
    # any plot was made and its index was never used, so it is dropped.
    for col in X_train.columns:
        # Draw on our own axes so the figure that is saved is the one shap
        # plotted on, and keep it unshown until it has been written.
        fig, ax = plt.subplots()
        shap.dependence_plot(col, shap_values[1], X_train, ax=ax, show=False)
        fig.savefig(col + "_dependence.png", bbox_inches='tight', dpi=600)
        plt.close(fig)
def shap_dep_plot(self, top_features, outcome):
    """Save a dependence plot of the first top feature coloured by the second.

    SHAP values are computed on ``self.x``; the entry for ``outcome`` is
    plotted and written to ``shap_dependence.jpg`` under ``self.out``.
    """
    all_values = self.explainer.shap_values(self.x)
    outcome_values = all_values[outcome]

    shap.dependence_plot(
        top_features[0],
        outcome_values,
        self.x,
        interaction_index=top_features[1],
        show=False,
    )

    plt.tight_layout()
    output_file = self.out + "/shap_dependence.jpg"
    plt.savefig(output_file, dpi=400)
    plt.close()
def test_dependence_one_string_feature_auto_interaction():
    """Dependence plot of a string feature with automatic interaction choice."""
    X = _create_sample_dataset(string_features={"Sex"})
    # Random SHAP values with the same shape as the feature table.
    random_shap = np.random.randn(*X.values.shape)
    shap.dependence_plot("Sex", random_shap, X, interaction_index='auto', show=False)
def test_dependence_one_string_feature():
    """ Check the dependence plot when the plotted feature holds strings.
    """
    X = _create_sample_dataset(string_features={"Sex"})
    # Random SHAP values with the same shape as the feature table.
    random_shap = np.random.randn(*X.values.shape)
    shap.dependence_plot("Sex", random_shap, X, interaction_index="Age", show=False)
def plot_dependency(self, feature: str, interaction_index: str = "auto", save: Path = None):
    """Draw a SHAP dependence plot for ``feature`` and build the figure.

    The plot uses ``self.stable_shap_values`` and ``self.partitions.X``.
    The result of ``self.first.make_figure(save)`` is returned unchanged.
    """
    plot_options = dict(
        feature_names=self.feature_names,
        interaction_index=interaction_index,
    )
    shap.dependence_plot(
        feature, self.stable_shap_values, self.partitions.X, **plot_options
    )
    figure = self.first.make_figure(save)
    return figure
def test_dependence_two_string_features():
    """ Check the dependence plot when both the plotted and the interaction feature hold strings.
    """
    X = _create_sample_dataset(string_features={"Sex", "Blood group"})
    # Random SHAP values with the same shape as the feature table.
    random_shap = np.random.randn(*X.values.shape)
    shap.dependence_plot("Sex", random_shap, X,
                         interaction_index="Blood group", show=False)
def shap_dependence_viz(self, shap_df, features_df, model_dict, i, name):
    """Draw a SHAP dependence plot for feature ``i`` and optionally save it.

    Only the columns listed in ``model_dict['features_list']`` are used from
    both frames. The plot is shown interactively unless
    ``self.plots_dict['save']['plots']`` is truthy; when that flag is exactly
    ``True`` the plot is saved as ``dependence_plot_<name>_<i>.png`` in the
    ``dependence_plots`` folder under ``self.plots_dir`` and the current
    figure is cleared.

    Args:
        shap_df: DataFrame of SHAP values.
        features_df: DataFrame of feature values.
        model_dict: mapping that provides ``'features_list'``.
        i: feature passed to ``shap.dependence_plot``.
        name: label embedded in the output file name.
    """
    feature_cols = model_dict['features_list']
    save_plots = self.plots_dict['save']['plots']

    shap.dependence_plot(i, shap_df.loc[:, feature_cols].values,
                         features_df.loc[:, feature_cols],
                         show=(not save_plots))

    if save_plots is True:
        dependence_path = '{}/dependence_plots'.format(self.plots_dir)
        # exist_ok avoids the check-then-create race and also creates any
        # missing parent directories.
        os.makedirs(dependence_path, exist_ok=True)
        plt.savefig(f'{dependence_path}/dependence_plot_{name}_{i}.png')
        plt.clf()
def test_lightgbm_multiclass():
    """Dependence plot for the first class of a multiclass LightGBM model.

    Skipped by pytest when ``lightgbm`` is not installed.
    """
    lightgbm = pytest.importorskip("lightgbm")

    # Fit a LightGBM classifier on the iris data.
    X, Y = shap.datasets.iris()
    classifier = lightgbm.sklearn.LGBMClassifier()
    classifier.fit(X, Y)

    # Explain the model's predictions with SHAP values.
    explainer = shap.TreeExplainer(classifier)
    shap_values = explainer.shap_values(X)

    # The plot must work for the first class.
    first_class_values = shap_values[0]
    shap.dependence_plot(0, first_class_values, X, show=False)