def decisions_regression(
    df_preds,
    shap_values,
    expected_value,
    X_vald,
    y_vald,
    model_file_path,
    learner_name,
):
    """Write SHAP decision plots for the first and last ten rows of ``df_preds``.

    ``df_preds.lp`` is used to index rows of ``shap_values`` and
    ``df_preds.index`` to select the matching rows of ``X_vald``. Two PNG
    files are written into ``model_file_path``: one for the first ten rows
    ("worst") and one for the last ten rows ("best"). ``y_vald`` is accepted
    but not used.
    """
    # NOTE(review): the worst/best naming presumably relies on the caller
    # having sorted df_preds -- confirm.

    def _save_decisions(window, file_name):
        # Grab the current figure first; shap draws onto it.
        figure = plt.gcf()
        shap.decision_plot(
            expected_value,
            shap_values[df_preds.lp[window], :],
            X_vald.loc[df_preds.index[window]],
            show=False,
        )
        figure.tight_layout(pad=2.0)
        figure.savefig(os.path.join(model_file_path, file_name))
        # Start from a clean slate for the next plot.
        plt.close("all")

    _save_decisions(slice(None, 10), f"{learner_name}_shap_worst_decisions.png")
    _save_decisions(slice(-10, None), f"{learner_name}_shap_best_decisions.png")
def explain_row_shap(scaled_row, explainer, nsamples=100, verbose=0):
    """Return, per class, the positively contributing features of one row.

    The row is reshaped to ``(1, shape_size)`` (module-level ``shape_size``)
    and passed to ``explainer.shap_values``. When ``verbose == 1`` a decision
    plot for the first class is drawn as well.

    Returns a dict mapping each class position to a list of
    ``(feature_position, shap_value)`` tuples, ordered by descending SHAP
    value and restricted to strictly positive values.
    """
    shap_values = explainer.shap_values(
        scaled_row.reshape(1, shape_size),
        nsamples=nsamples,
        l1_reg="num_features(32)",
    )

    if verbose == 1:
        column_names = list(df.drop('loan_repaid', axis=1).columns)
        shap.decision_plot(
            explainer.expected_value[0],
            shap_values[0][0, :],
            scaled_row,
            feature_names=column_names,
            link="logit",
        )

    map_values = {}
    for class_value, class_shap in enumerate(shap_values):
        contributions = class_shap[0]
        # Stable descending sort keeps ties in their original feature order.
        ranked = sorted(
            enumerate(contributions), key=lambda pair: pair[1], reverse=True
        )
        map_values[class_value] = [
            (position, value) for position, value in ranked if value > 0
        ]
    return map_values
def get_decision_plot(patente, step_id_week):
    """Build a SHAP decision plot for one ranking entry and return it as base64.

    The ranker is loaded from disk, the stored instance for
    (``patente``, ``step_id_week``) is fetched through ``RankingEntry``, its
    per-feature contributions are obtained from the booster with
    ``pred_contribs=True`` (the last column is dropped), and the resulting
    decision plot is rendered to an in-memory PNG.

    Returns:
        A ``(response, 200)`` tuple where ``response["image"]`` is the
        base64-encoded PNG as text.

    Raises:
        Whatever ``joblib.load`` or the ``.one()`` query raise when the model
        file or the ranking entry cannot be found.
    """
    # NOTE(review): the model path is machine-specific and the model is
    # reloaded on every call -- consider moving it to configuration.
    ranker = joblib.load(
        'C:/Users/raskolnnikov/Desktop/projects/samtech/samtech_entrega/modelos/ranker_v_1.0.joblib'
    )
    explainer = shap.TreeExplainer(ranker)
    ranker_features = ranker.feature_names
    expected_value = explainer.expected_value

    entry = RankingEntry.query.filter_by(
        patente=patente, step_id_week=step_id_week).one().instance_json
    instance = pd.read_json(entry, orient='records').T
    instance.columns = ranker_features

    shap_value = ranker.predict(
        xgb.DMatrix(instance[ranker_features]),
        pred_contribs=True,
    )[0, :-1]

    # Create the figure only once everything that can fail beforehand has
    # succeeded, and always release it: previously a failed lookup or a
    # plotting error left the figure registered with pyplot.
    f = plt.figure()
    buf = BytesIO()
    try:
        shap.decision_plot(expected_value,
                           shap_value,
                           instance.iloc[0, :],
                           link='logit',
                           highlight=0,
                           show=False)
        f.savefig(buf, format="png", dpi=150, bbox_inches='tight')
    finally:
        f.clear()
        plt.close(f)

    img_base64 = base64.b64encode(buf.getvalue())
    response = {
        "image": img_base64.decode(),
    }
    return response, 200
def shap_decision_plot(expected_value, shap_values, samples, save=True, crop_feature_names=20):
    """Draw a SHAP decision plot for ``samples`` and optionally save it.

    Args:
        expected_value: Base value passed to ``shap.decision_plot``.
        shap_values: SHAP values passed to ``shap.decision_plot``.
        samples: Object with a ``columns`` attribute, passed as ``features``.
        save: When true, the figure is written to ``shap_decision_plot.png``.
        crop_feature_names: Maximum length of the displayed feature names.
            A falsy value disables cropping and lets shap pick the names.
    """
    plot_kwargs = {"features": samples, "show": False}
    if crop_feature_names:
        # Honour the requested length; the crop used to be hard-coded to 20
        # characters regardless of this argument. str() also lets
        # non-string column labels through instead of failing on len().
        plot_kwargs["feature_names"] = [
            str(col)[:crop_feature_names] for col in samples.columns
        ]
    shap.decision_plot(expected_value, shap_values, **plot_kwargs)

    f = plt.gcf()
    f.show()
    if save:
        f.savefig("shap_decision_plot.png")
def combine_summary_decision_curve(shap_value, expected_value, features, feature_names, n_features, examples_subset_index, misclassified, link, save_path):
    """
    Draw a SHAP summary plot and a SHAP decision plot side by side and save them.

    Parameters
    ----------
    shap_value: np.ndarray
        SHAP values for one output class
    expected_value: float
        Mean model output over the training dataset
    features: np.ndarray
        Feature matrix of the testing dataset
    feature_names: np.array
        Names of the features
    n_features: int
        Upper bound on the number of features shown in either plot
    examples_subset_index: list of indices
        Rows to include in the decision plot
    misclassified: list of Boolean values
        Flags the selected rows that are misclassified (true = highlighted)
    link: string
        Link function name handed to the decision plot
    save_path: string
        Destination file for the combined figure
    """
    canvas_size = (12, 6)
    _, (summary_ax, decision_ax) = plt.subplots(1, 2, figsize=canvas_size)

    # Left panel: summary plot
    plt.sca(summary_ax)
    shap.summary_plot(
        shap_value,
        features=features,
        feature_names=feature_names,
        sort=True,
        show=False,
        max_display=n_features,
    )

    # The figure size is re-applied after the summary plot has been drawn.
    plt.gcf().set_size_inches(canvas_size)

    # Right panel: decision plot, features ordered by total absolute SHAP value
    plt.sca(decision_ax)
    magnitudes = np.abs(shap_value)
    feature_order = np.argsort(np.sum(magnitudes, axis=0))
    selected_shap = shap_value[examples_subset_index]
    selected_features = features[examples_subset_index, :]
    top_features = slice(None, -(n_features + 1), -1)
    shap.decision_plot(
        expected_value,
        selected_shap,
        selected_features,
        feature_names=list(feature_names),
        feature_display_range=top_features,
        ignore_warnings=True,
        highlight=misclassified,
        show=False,
        plot_color='viridis',
        feature_order=feature_order,
        link=link,
    )

    # Dotted vertical reference line at x = 0.5
    plt.plot([0.5, 0.5], [0, n_features], ':k', alpha=0.3)
    decision_ax.set_yticklabels([])

    plt.savefig(save_path)
    plt.close()
def decision_plot(self, class_id=0, row_idx=-1, **kwargs):
    "Plot cumulative `SHAP` values for the first ten rows, or for the single row `row_idx`."
    shap_vals, exp_val = _get_values(self, class_id)
    n_rows = shap_vals.shape[0]

    # -1 is the "no row chosen" sentinel: fall back to the first ten rows.
    if row_idx == -1:
        message = f'Displaying rows 0-9 of {n_rows} (use `row_idx` to specify another row)'
        selection = slice(None, 10)
    else:
        message = f'Displaying row {row_idx} of {n_rows} (use `row_idx` to specify another row)'
        selection = row_idx

    print(message)
    return shap.decision_plot(
        exp_val, shap_vals[selection], self.test_data.iloc[selection], **kwargs
    )
def plot_shap_force(drug_id, expected_value, shap_values_test, data_for_shap, drug_names, X_train, force_plot_file_type, dpi, eval_label):
    """Save a SHAP force plot and a SHAP decision plot for one drug.

    Args:
        drug_id: Row label used to look up the drug in ``shap_values_test``,
            ``data_for_shap`` and ``drug_names``; also used in the file names.
        expected_value: Base value passed to both plots.
        shap_values_test: Table of SHAP values indexed by drug id.
        data_for_shap: Table of feature values indexed by drug id.
        drug_names: Table with a ``'Drug name'`` column indexed by drug id.
        X_train: Only its ``columns`` are used, to derive feature names.
        force_plot_file_type: File extension for both output files.
        dpi: Resolution passed to ``savefig``.
        eval_label: Suffix of the ``output/SHAP_<eval_label>`` directory.

    Both files are written below ``output/SHAP_<eval_label>``; the directory
    is not created here.
    """
    curr_shap_value = shap_values_test.loc[drug_id, :]
    curr_features = data_for_shap.loc[drug_id, :]
    drug_name = drug_names.loc[drug_id, 'Drug name']
    title = 'Probability higher risk (%s)' % (drug_name)

    columns = X_train.columns
    # Every feature is shown as Yes/No except 'Number of Category', which
    # keeps its value. The previous chained conditional bound the other way
    # round, so a non-zero 'Number of Category' was displayed as 'Yes'.
    curr_feature_vals = np.array([
        x if columns[i] == 'Number of Category' else
        ('Yes' if bool(x) else "No")
        for i, x in enumerate(curr_features.values)
    ])

    # Drop the "<prefix>: " part of a column name when present; shared by
    # both plots.
    feature_names = [x.split(': ')[1] if ': ' in x else x for x in columns]
    out_dir = os.path.join('output', 'SHAP' + "_" + eval_label)

    p = shap.force_plot(
        base_value=expected_value,
        shap_values=curr_shap_value.values,
        feature_names=feature_names,
        features=curr_feature_vals,
        out_names=[title],
        figsize=(20, 4),
        show=False,
        matplotlib=True,
        text_rotation=int(45 / 2))
    p.savefig(os.path.join(out_dir, drug_id + '.' + force_plot_file_type),
              dpi=dpi,
              bbox_inches='tight')
    plt.close('all')

    shap.decision_plot(
        base_value=expected_value,
        shap_values=curr_shap_value.values,
        feature_names=feature_names,
        features=curr_feature_vals,
        feature_display_range=slice(-1, -11, -1),
        title=title,
        show=False,
    )
    # The decision plot is drawn on the current figure.
    p = plt.gcf()
    p.savefig(os.path.join(out_dir,
                           drug_id + '_decision_plot.' + force_plot_file_type),
              dpi=dpi,
              bbox_inches='tight')
    plt.close('all')
def decision_plot(self, X, y):
    """Draw one SHAP decision plot per target after refitting on that target."""
    # The parent class validates/slices the target before it is used.
    y = super()._check_target_index(y=y)

    target_count = _n_targets(y)
    for target in range(target_count):
        # Refit for the current target, then explain the same X.
        self.fit(X=X, y=y, index=target)
        fitted_explainer, values = self.explainer(X=X)
        shap.decision_plot(
            base_value=fitted_explainer.expected_value,
            shap_values=values,
            feature_names=list(X.columns),
            show=self.show,
        )
def _label_and_show(fig, ax, title):
    """Apply the y-label and title to a finished SHAP figure, then display it."""
    ax.set_ylabel("Feature")
    ax.set_title(title)
    fig.tight_layout()
    plt.show()


def visualise_explanation(explanation, per_class=True):
    """
    Visualises an explanation of classification performance.

    Each SHAP plot is drawn with ``show=False`` so the axis label and title
    can be applied before the figure is displayed. With SHAP's default
    ``show=True`` the figure was displayed first and the labels were set on
    an already shown figure.

    Parameters
    ----------
    explanation
        Output of `classification.explain_classifier()`. The attributes
        ``shap_values``, ``data``, ``clf_categories`` and
        ``explainer.expected_value`` are read.
    per_class : bool, optional
        Whether to also plot explanations at the level of the individual
        classes, or just the summary.
    """
    # Summarise across all classes
    fig, ax = plt.subplots()
    shap.summary_plot(explanation.shap_values,
                      explanation.data,
                      class_names=explanation.clf_categories,
                      max_display=15,
                      show=False)
    _label_and_show(fig, ax, "Classifier feature weightings (all classes)")

    if not per_class:
        return

    # And then break down by class
    for i in range(len(explanation.shap_values)):
        class_name = explanation.clf_categories[i]

        # Summary plot: break down by feature
        fig, ax = plt.subplots()
        shap.summary_plot(explanation.shap_values[i],
                          explanation.data,
                          max_display=10,
                          show=False)
        _label_and_show(
            fig, ax,
            "Classifier feature weightings (class: {})".format(class_name))

        # Decision plot: break down by observation
        fig, ax = plt.subplots()
        shap.decision_plot(explanation.explainer.expected_value[i],
                           explanation.shap_values[i],
                           link='logit',
                           features=explanation.data,
                           show=False)
        _label_and_show(
            fig, ax,
            "Classifier decision weightings (class: {})".format(class_name))
def shap_explain(booster,
                 datasource,
                 dataset,
                 summary_params,
                 result_table="",
                 is_pai=False,
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None):
    """Compute SHAP values for ``booster`` on ``dataset``, store and plot them.

    When ``result_table`` is non-empty the SHAP values are written to it, one
    row per sample, using the dataset's column names. Afterwards either a
    decision plot (``summary_params["plot_type"] == "decision"``) or a
    summary plot is produced via ``explainer.plot_and_save`` under the file
    name ``summary``, forwarding the ``oss_*`` arguments.
    """
    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)

    if result_table:
        if is_pai:
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        # Close the connection even when writing fails; previously an error
        # in the writer left it open.
        try:
            # TODO(typhoonzero): the shap_values is may be a
            # list of shape [3, num_samples, num_features],
            # use the first dimension here, should find out
            # when to use the other two. When shap_values is
            # not a list it can be directly used.
            if isinstance(shap_values, list):
                to_write = shap_values[0]
            else:
                to_write = shap_values

            columns = list(dataset.columns)
            with db.buffered_db_writer(conn, result_table, columns) as w:
                for row in to_write:
                    w.write(list(row))
        finally:
            conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        # As with shap_values above, only the first list entry is plotted.
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    explainer.plot_and_save(plot_func,
                            oss_dest=oss_dest,
                            oss_ak=oss_ak,
                            oss_sk=oss_sk,
                            oss_endpoint=oss_endpoint,
                            oss_bucket_name=oss_bucket_name,
                            filename='summary')
def decision_plot(self, fold, supersample_frac=.1, feature_display_range=None, cmap=None):
    """
    Draw a SHAP decision plot for a random subset of the sampled data.

    Starting from the model's base prediction value, features are added
    incrementally to each prediction and the path of the model score is
    plotted as a continuous line. Good for visualising how features across
    the model combine to create a diverse range of model scores.

    Parameters
    ----------
    fold
        Selects ``self.fold_expected_values[fold]`` and the matching slice
        ``self.shapley_values_array[:, :, fold]``.
    supersample_frac
        Fraction of ``self.SHAP_X_sample`` rows to plot (sampled without
        replacement).
    feature_display_range, cmap
        Forwarded to ``shap.decision_plot`` as ``feature_display_range`` and
        ``plot_color``.

    Returns
    -------
    Whatever ``shap.decision_plot`` returns.
    """
    expected_value = self.fold_expected_values[fold]
    shap_values = self.shapley_values_array[:, :, fold]
    X = self.SHAP_X_sample

    # Sample row *positions* so the SHAP rows and the feature rows refer to
    # the same observations. The previous code reset the index of the sample
    # itself, which always yielded 0..k-1 and therefore paired the first k
    # SHAP rows with k unrelated random feature rows.
    # NOTE(review): assumes row i of shapley_values_array corresponds to row
    # i of SHAP_X_sample -- confirm.
    sampled_positions = X.reset_index(drop=True).sample(
        frac=supersample_frac, replace=False).index.values
    super_sample = X.iloc[sampled_positions]
    super_sampled_shapley_values = shap_values[sampled_positions, :]

    return shap.decision_plot(expected_value,
                              super_sampled_shapley_values,
                              super_sample,
                              feature_display_range=feature_display_range,
                              plot_color=cmap)
def local_plot(name, explainer, shap_values, feature_names, chosen_sample, estimand_name, X_test, plot_type='force_plot'):
    """Create a local SHAP explanation plot for one sample and save it.

    Args:
        name: Passed to ``save_plot`` together with the plot object.
        explainer: Object exposing ``expected_value``.
        shap_values: SHAP values, indexed with ``chosen_sample``.
        feature_names: Feature names shown in the plot.
        chosen_sample: Index of the sample to explain.
        estimand_name: Output name shown in the force plot.
        X_test: Feature values, indexed with ``chosen_sample``.
        plot_type: ``'force_plot'`` or ``'decision_plot'``.

    Returns:
        The object returned by the SHAP plotting function.

    Raises:
        ValueError: If ``plot_type`` is not one of the supported values
            (this used to surface as an ``UnboundLocalError``).
    """
    if plot_type == 'force_plot':
        h = shap.force_plot(base_value=explainer.expected_value,
                            shap_values=shap_values[chosen_sample],
                            features=X_test[chosen_sample],
                            feature_names=feature_names,
                            link="identity",
                            out_names=estimand_name,
                            matplotlib=False,
                            show=False)
    elif plot_type == 'decision_plot':
        # shap.decision_plot has no `matplotlib` keyword (it always renders
        # with matplotlib); passing it raised a TypeError.
        # NOTE(review): with show=False and the default return_objects=False
        # the call presumably returns None -- confirm save_plot copes.
        h = shap.decision_plot(base_value=explainer.expected_value,
                               shap_values=shap_values[chosen_sample],
                               features=X_test[chosen_sample],
                               feature_names=feature_names,
                               link="identity",
                               show=False)
    else:
        raise ValueError(
            "plot_type must be 'force_plot' or 'decision_plot', got %r" %
            (plot_type, ))

    save_plot(h, name)
    return h
def decision_plot(self, X, y):
    """Draw one SHAP decision plot per target, refitting before each plot."""
    y = self._slice_target_index(y=y)

    for index in range(_n_targets(y)):
        target_kind = sklearn.utils.multiclass.type_of_target(y)
        if target_kind != 'continuous-multioutput':
            # Any other target kind: fit on y as a whole.
            self.fit(X, y)
        else:
            # Multi-output regression: fit on the current column only.
            single_target = y.iloc[:, index].values.ravel(order='K')
            self.fit(X, single_target)

        fitted_explainer, values = self.explainer(X=X)
        shap.decision_plot(
            base_value=fitted_explainer.expected_value,
            shap_values=values,
            feature_names=list(X.columns),
            show=self.show,
        )
def shap_explain(datasource,
                 select,
                 feature_field_meta,
                 feature_column_names,
                 label_meta,
                 summary_params,
                 result_table="",
                 is_pai=False,
                 pai_explain_table="",
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None,
                 transform_fn=None,
                 feature_column_code=""):
    """Load the explain dataset, compute SHAP values, store and plot them.

    The dataset comes from ``xgb_shap_dataset`` and the SHAP values,
    interaction values and expected value from ``xgb_shap_values``. When
    ``result_table`` is not empty the SHAP values are written to it. Finally a
    decision plot (``plot_type == "decision"``) or a summary plot is rendered
    through ``explainer.plot_and_save`` with the ``oss_*`` arguments.
    """
    x = xgb_shap_dataset(datasource,
                         select,
                         feature_column_names,
                         label_meta,
                         feature_field_meta,
                         is_pai,
                         pai_explain_table,
                         transform_fn=transform_fn,
                         feature_column_code=feature_column_code)
    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)

    if result_table != "":
        if is_pai:
            from runtime.dbapi.paiio import PaiIOConnection
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)

        # TODO(typhoonzero): the shap_values is may be a
        # list of shape [3, num_samples, num_features],
        # use the first dimension here, should find out
        # when to use the other two. When shap_values is
        # not a list it can be directly used.
        to_write = shap_values[0] if isinstance(shap_values,
                                                list) else shap_values
        write_shap_values(to_write, conn, result_table, feature_column_names)

    wants_decision = summary_params.get("plot_type") == "decision"
    if wants_decision:

        def draw():
            return shap.decision_plot(expected_value,
                                      shap_interaction_values,
                                      x,
                                      show=False,
                                      feature_display_range=slice(
                                          None, -40, -1),
                                      alpha=1)
    else:

        def draw():
            return shap.summary_plot(shap_values,
                                     x,
                                     show=False,
                                     **summary_params)

    explainer.plot_and_save(draw, oss_dest, oss_ak, oss_sk, oss_endpoint,
                            oss_bucket_name)
def shap_explain(booster, datasource, dataset, summary_params, result_table):
    """Compute SHAP values for ``booster``, store them and print a plot as HTML.

    When ``result_table`` is truthy the table is created with one FLOAT32
    column per dataset column and the SHAP values are written to it. A
    decision plot (``summary_params["plot_type"] == "decision"``) or a
    summary plot is then rendered to ``summary.png`` inside a temporary
    directory, and printed as a base64 ``<img>`` tag.
    """
    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)

    if result_table:
        conn = db.connect_with_data_source(datasource)
        # Close the connection even when table creation or writing fails;
        # previously an error here left it open.
        try:
            # TODO(typhoonzero): the shap_values is may be a
            # list of shape [3, num_samples, num_features],
            # use the first dimension here, should find out
            # when to use the other two. When shap_values is
            # not a list it can be directly used.
            if isinstance(shap_values, list):
                to_write = shap_values[0]
            else:
                to_write = shap_values

            columns = list(dataset.columns)
            dtypes = [
                DataType.to_db_field_type(conn.driver, DataType.FLOAT32)
            ] * len(columns)
            _create_table(conn, result_table, columns, dtypes)
            with db.buffered_db_writer(conn, result_table, columns) as w:
                for row in to_write:
                    w.write(list(row))
        finally:
            conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        # As with shap_values above, only the first list entry is plotted.
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    filename = 'summary.png'
    # The image is written to and read from the temporary working directory.
    with temp_file.TemporaryDirectory(as_cwd=True):
        explainer.plot_and_save(plot_func, filename=filename)
        with open(filename, 'rb') as f:
            img = f.read()

    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
          % img
    print(img)
def decision_plot(
    self, num_samples=0.25, sample_no=None, output_file="", **decisionplot_kwargs
):
    """
    Plots a SHAP decision plot.

    Parameters
    ----------
    num_samples : int, float, or 'all', optional
        Number of samples to display, if less than 1 it will treat it as a percentage, 'all' will include all samples
        , by default 0.25

    sample_no : int, optional
        1-based sample number to isolate and analyze, if provided it overrides num_samples, by default None

    output_file : str, optional
        If given, the figure is saved under IMAGE_DIR/<model_name>/<output_file>.

    **decisionplot_kwargs
        Forwarded to ``shap.decision_plot``. ``return_objects`` (default
        True) and ``highlight`` are handled here; ``highlight`` is sliced to
        the selected samples.

    Returns
    -------
    DecisionPlotResult
        If return_objects=True (the default). Returns None otherwise.

    Raises
    ------
    ValueError
        If ``sample_no`` is not an integer >= 1, or ``num_samples`` is not
        'all' or a positive number.
    """
    return_objects = decisionplot_kwargs.pop("return_objects", True)
    highlight = decisionplot_kwargs.pop("highlight", None)

    if sample_no is not None:
        # Type check first: comparing a non-number with 1 raised a TypeError
        # before the intended ValueError could be reported.
        if not isinstance(sample_no, int) or sample_no < 1:
            raise ValueError(
                "Sample number must be an integer greater than or equal to 1."
            )

        # sample_no is 1-based; select exactly that row.
        samples = slice(sample_no - 1, sample_no)
    elif isinstance(num_samples, str):
        if num_samples != "all":
            raise ValueError(
                "Number of samples must be a positive number or 'all'."
            )
        samples = slice(0, len(self.x_test_array))
    elif num_samples <= 0:
        raise ValueError(
            "Number of samples must be greater than 0. If it is less than 1, it will be treated as a percentage."
        )
    elif num_samples < 1:
        # Fraction of the test set.
        samples = slice(0, int(num_samples * len(self.x_test_array)))
    else:
        # int() keeps the slice bound valid for float counts such as 5.0.
        samples = slice(0, int(num_samples))

    if highlight is not None:
        highlight = highlight[samples]

    s = shap.decision_plot(
        self.expected_value,
        self.shap_values[samples],
        self.x_train.columns,
        return_objects=return_objects,
        highlight=highlight,
        show=False,
        **decisionplot_kwargs,
    )

    if output_file:  # pragma: no cover
        pl.savefig(os.path.join(IMAGE_DIR, self.model_name, output_file))

    return s
def get_shap(model, exp_data, i):
    """Explain the first row of ``exp_data`` for output ``i`` of a tree model.

    A force plot and a decision plot are both created for that row; only the
    force plot object is returned.
    """
    shap.initjs()
    tree_explainer = shap.TreeExplainer(model)
    all_values = tree_explainer.shap_values(exp_data)

    # First row only, for output i.
    first_row = exp_data.iloc[0, :]
    force_plot = shap.force_plot(
        tree_explainer.expected_value[i], all_values[i][0, :], first_row
    )
    # Drawn for its side effect; its return value is not used.
    shap.decision_plot(
        tree_explainer.expected_value[i], all_values[i][0, :], exp_data.iloc[0, :]
    )
    return force_plot
def explain(self, options, instance=None):
    """Explain ``instance`` with a KernelExplainer and draw the requested plots.

    ``options`` supplies the background data choice (``'background_data'``,
    ``'kmeans_count'``, ``'data'``), the sampling settings
    (``'auto_nsamples'``, ``'nsamples'``, ``'l1_reg'``), the ``'link'``, the
    ``'class_to_explain'`` for classifiers and the ``'plot_type'``
    (``'force'``, ``'decision'`` or ``'both'``).

    Raises:
        ValueError: If ``instance`` is None.
    """
    if instance is None:
        raise ValueError("Instance was not provided")

    initjs()
    instance = instance.to_numpy()

    # Background data: either a k-means summary or the data passed in.
    if options['background_data'] == 'kmeans':
        data = self._kmeans(options['kmeans_count'])
    else:
        data = options['data']

    if options['auto_nsamples']:
        nsamples = 'auto'
    else:
        nsamples = options['nsamples']

    explainer = KernelExplainer(model=self.predict_function,
                                data=data,
                                link=options['link'])
    shap_values = explainer.shap_values(X=instance,
                                        nsamples=nsamples,
                                        l1_reg=options['l1_reg'])

    if not self.is_classification:
        base_value = explainer.expected_value
    else:
        # Keep only the class of interest; the base value stays array-like.
        shap_values = shap_values[options['class_to_explain']]
        base_value = explainer.expected_value[[options['class_to_explain']]]

    if options['plot_type'] in ('force', 'both'):
        force = force_plot(base_value=base_value,
                           shap_values=shap_values,
                           features=instance,
                           feature_names=self.feature_names,
                           show=True,
                           link=options['link'])
        display(force)

    if options['plot_type'] in ('decision', 'both'):
        decision_plot(base_value=base_value,
                      shap_values=shap_values,
                      features=instance,
                      feature_names=list(self.feature_names),
                      show=True,
                      color_bar=True,
                      link=options['link'])
def decisions_binary(
    df_preds,
    shap_values,
    expected_value,
    X_vald,
    y_vald,
    model_file_path,
    learner_name,
):
    """Write per-class SHAP decision plots for the first and last ten rows.

    For every distinct value ``t`` in ``y_vald`` the rows of ``df_preds``
    with ``target == t`` are selected; their first ten rows are saved as the
    "worst" plot and their last ten as the "best" plot in
    ``model_file_path``. ``df_preds.lp`` indexes ``shap_values`` and
    ``df_preds.index`` selects rows of ``X_vald``.
    """

    def _save_decisions(rows, window, file_name):
        # Grab the current figure first; shap draws onto it.
        figure = plt.gcf()
        shap.decision_plot(
            expected_value,
            shap_values[rows.lp[window], :],
            X_vald.loc[rows.index[window]],
            show=False,
        )
        figure.tight_layout(pad=2.0)
        figure.savefig(os.path.join(model_file_path, file_name))
        plt.close("all")

    # classes are from 0 ...
    for t in np.unique(y_vald):
        class_rows = df_preds[df_preds.target == t]
        _save_decisions(
            class_rows,
            slice(None, 10),
            f"{learner_name}_shap_class_{t}_worst_decisions.png",
        )
        _save_decisions(
            class_rows,
            slice(-10, None),
            f"{learner_name}_shap_class_{t}_best_decisions.png",
        )
def explain(datasource,
            select,
            feature_field_meta,
            feature_column_names,
            label_spec,
            summary_params,
            result_table="",
            is_pai=False,
            pai_explain_table="",
            hdfs_namenode_addr="",
            hive_location="",
            hdfs_user="",
            hdfs_pass="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    """Compute SHAP values and either store them or plot them.

    If ``result_table`` is not empty, the first entry of the SHAP values is
    written to that table and the function returns without plotting.
    Otherwise a decision plot (``plot_type == "decision"``) or a summary plot
    is rendered through ``explainer.plot_and_save``.
    """
    x = xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                         feature_field_meta, is_pai, pai_explain_table)
    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)

    if result_table != "":
        # TODO(typhoonzero): the shape of shap_values is (3, num_samples, num_features)
        # use the first dimension here, should find out how to use the other two.
        if is_pai:
            driver, conn = "pai_maxcompute", None
        else:
            conn = connect_with_data_source(datasource)
            driver = conn.driver
        write_shap_values(shap_values[0], driver, conn, result_table,
                          feature_column_names, hdfs_namenode_addr,
                          hive_location, hdfs_user, hdfs_pass)
        # Storing the values and plotting are mutually exclusive.
        return

    if summary_params.get("plot_type") == "decision":

        def draw():
            return shap.decision_plot(expected_value,
                                      shap_interaction_values,
                                      x,
                                      show=False,
                                      feature_display_range=slice(
                                          None, -40, -1),
                                      alpha=1)
    else:

        def draw():
            return shap.summary_plot(shap_values,
                                     x,
                                     show=False,
                                     **summary_params)

    explainer.plot_and_save(draw, is_pai, oss_dest, oss_ak, oss_sk,
                            oss_endpoint, oss_bucket_name)
def shap_plots(model, train_features, test_features, test_labels):
    """Compute SHAP values for ``model`` on ``test_features`` and draw a set of plots.

    Drawn in order: global bar plot, beeswarm, decision plot for the first 20
    test rows, heatmap, scatter plot for the ``hs_child_age_None`` feature,
    and a bar plot using a feature clustering.
    """
    print("Computing shapley values..")

    def _make_explainer():
        # The explainer type depends on the model family.
        if isinstance(
                model,
            (MLP, MLPRegressor, MLPClassifier, ElasticNet,
             LogisticRegression)):
            background = shap.sample(train_features, 10)
            return shap.Explainer(model.predict, background)
        if isinstance(model, (RandomForestRegressor, RandomForestClassifier)):
            return shap.TreeExplainer(model, train_features)
        return shap.Explainer(model, train_features)

    explainer = _make_explainer()
    shap_values = explainer(test_features)

    # Global bar plot
    shap.plots.bar(shap_values, max_display=10)

    # Beeswarm plot
    shap.plots.beeswarm(shap_values)

    # Decision plot on the first 20 test rows
    expected_value = explainer.expected_value
    features_sample = test_features.iloc[range(20)]
    sample_shap = explainer.shap_values(features_sample)
    shap.decision_plot(expected_value, sample_shap, features_sample)

    # Heatmap
    shap.plots.heatmap(shap_values, max_display=10)

    # Scatter
    shap.plots.scatter(shap_values[:, "hs_child_age_None"],
                       color=shap_values,
                       alpha=0.8)

    # Feature clustering (redundant feature detection); by default this
    # trains (X.shape[1] choose 2) 2-feature XGBoost models
    clustering = shap.utils.hclust(test_features, test_labels)
    shap.plots.bar(shap_values, clustering=clustering, clustering_cutoff=0.5)
def decision_plot(self, class_id=0, **kwargs):
    """
    Plot model decisions as cumulative SHAP values.

    Every colored line corresponds to the prediction for one observation, so
    plotting many samples at once can make the figure hard to read.
    `class_id` selects the class of interest for classification models and
    can be an int or a string.

    Extra keyword arguments are forwarded to `shap.decision_plot`. For an
    up-to-date list of the parameters, see:
    https://github.com/slundberg/shap/blob/master/shap/plots/decision.py
    For more information, see:
    https://github.com/slundberg/shap/blob/master/notebooks/plots/decision_plot.ipynb
    """
    # NOTE: there is a shap.multioutput_decision_plot but it uses a single row
    values_for_class, base_value = _get_values(self, class_id)
    return shap.decision_plot(
        base_value, values_for_class, self.test_data, **kwargs
    )
def explain_one_sample(self, X):
    '''
    Draws decision plot for one sample

    @X => one sample dataframe

    Raises ValueError when more than one sample is passed.
    '''
    if X.shape[0] > 1:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError(
            'You need to pass only one sample of data for this function.\nIt means sample size (1, n_features)'
        )

    explainer = shap.TreeExplainer(self.model[0]['clf'])
    shap_values = explainer.shap_values(self.process_data(X))[1]

    # Only the base-value lookup is guarded. The plot call used to sit in the
    # same try block, so an IndexError raised while plotting silently
    # triggered a second plot. A plain float expected_value raises TypeError
    # on indexing, which is handled as well.
    try:
        base_value = explainer.expected_value[1]
    except (IndexError, TypeError):
        base_value = explainer.expected_value

    shap.decision_plot(
        base_value,
        shap_values,
        ignore_warnings=False,
        feature_names=self.model[0]['teach_cols'].tolist())
def _plot_decision_(self, expected_value: float, shap_values: List[np.ndarray] or np.ndarray, title: str = None, gene_names: bool = True, auto_size_plot: bool = True, minimum: int = 0.0, maximum: int = 0.0, feature_display_range=None, save: Path = None):
    """Draw a SHAP decision plot and hand the figure to ``self.make_figure``.

    Args:
        expected_value: Base value of the plot.
        shap_values: SHAP values to plot.
        title: Optional plot title.
        gene_names: When ``False`` no feature names are passed to shap;
            otherwise ``self.feature_names`` is used.
        auto_size_plot: Forwarded to ``shap.decision_plot``.
        minimum: Lower bound candidate for the x-axis; the smaller of this
            and the minimum of ``self.partitions.data.y`` is used.
        maximum: Upper bound candidate for the x-axis; the larger of this
            and the maximum of ``self.partitions.data.y`` is used.
        feature_display_range: Forwarded to ``shap.decision_plot``.
        save: Forwarded to ``self.make_figure``.

    Returns:
        The result of ``self.make_figure(save)``.
    """
    # Convert to a list only when names are requested; calling .tolist() on
    # None crashed with AttributeError whenever gene_names was False.
    feature_names = None if gene_names is False else self.feature_names.tolist()

    min_max = (self.partitions.data.y.min(), self.partitions.data.y.max())
    print(f"min_max dataset values: {min_max}")
    # Widen the x-axis so it covers both the data range and the given bounds.
    xlim = (min(min_max[0], minimum), max(min_max[1], maximum))

    shap.decision_plot(expected_value, shap_values, xlim=xlim,
                       feature_names=feature_names,
                       title=title,
                       auto_size_plot=auto_size_plot,
                       feature_display_range=feature_display_range,
                       show=False)
    return self.make_figure(save)
def explain(datasource, select, feature_field_meta, label_spec, summary_params):
    """Compute SHAP values for the selected data and save a plot of them.

    Feature names and a name-to-spec mapping are derived from
    ``feature_field_meta``. A decision plot is drawn when
    ``summary_params["plot_type"] == "decision"``, otherwise a summary plot
    that receives ``summary_params`` as keyword arguments.
    """
    feature_column_names = []
    for field in feature_field_meta:
        feature_column_names.append(field["name"])
    feature_specs = dict((field['name'], field) for field in feature_field_meta)

    x = xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                         feature_specs)
    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)

    if summary_params.get("plot_type") == "decision":

        def draw():
            return shap.decision_plot(
                expected_value,
                shap_interaction_values,
                x,
                show=False,
                feature_display_range=slice(None, -40, -1),
                alpha=1)
    else:

        def draw():
            return shap.summary_plot(shap_values,
                                     x,
                                     show=False,
                                     **summary_params)

    explainer.plot_and_save(draw)
""") explainer, shap_values = load_shap_explainer( root + 'shap_treeExplainer.bz2', X_test_transformed) st_shap( shap.force_plot(explainer.expected_value, shap_values[0, :], X_test.iloc[0, :], link='logit')) # ----------- shap.decision_plot(base_value=explainer.expected_value, shap_values=shap_values[0], features=X_test.iloc[0, :], link='logit', feature_display_range=slice( None, -X_test.shape[1] - 1, -1), return_objects=True, show=False, y_demarc_color='#00172b') fig = plt.gcf() ax = plt.gca() fig.patch.set_facecolor('#00172b') ax.set_facecolor('#00172b') ax.set_xlabel('Probability', fontsize=16, color='white') ax.tick_params(axis='both', colors='white') ax.grid(axis='both', color='white', linestyle='-', linewidth=0.25) for ln in ax.lines: ln.set_linewidth(3) for text in ax.texts: text.set_color('white')
Next, we use the SHAP values to build up 2D scatter graphs for every feature. They show the effect of a feature on the prediction for every instance. fig, axs = plt.subplots(7,3,figsize=(16,22),squeeze=True) ind = 0 for ax in axs.flat: feat = bst.feature_names[ind] ax.scatter(x_df[feat],shap_values_XGB_test[:,ind],s=1,color='gray') # ax.set_ylim([-0.2,0.2]) ax.set_title(feat) ind+=1 plt.subplots_adjust(hspace=0.8) plt.savefig('shap_sc.png') **Decision_plot()** is interesting as it shows how the prediction is formed from the contributions of different features. shap.decision_plot(explainerXGB.expected_value,shap_values_XGB_test[0:100],features) **Force_plot** is similar to decision_plot. We plot only the first 100 instances because it would be very slow to draw a force_plot with all the instances. shap.force_plot(explainerXGB.expected_value,shap_values_XGB_test[0:100],features,figsize=(20,10)) **Waterfall_plot** is great when you want to analyse one instance. shap.waterfall_plot(explainerXGB.expected_value,shap_values_XGB_test[2000],x_df.iloc[2000],features) ### Other interpretation methods For the following methods, we need to use XGBoost's Scikit-learn wrapper **XGBRegressor()** to make our XGBoost model compatible with the Scikit-learn ecosystem. m_depth = 5
for mode in [df_features_ec_season, df_features_ec_season_permuted]: shap_values = explainer.shap_values(mode, approximate=True, check_additivity=True) # dependence plots for name in mode.columns: shap.dependence_plot(name, shap_values[1], mode) # Summary plots shap.summary_plot(shap_values, mode, plot_type="bar") shap.summary_plot(shap_values[1], mode, plot_type="bar") shap.summary_plot(shap_values[1], mode) # Failure # Decision plots explaining decisions to classify shap.decision_plot(explainer.expected_value[1], shap_values[1], mode) shap.decision_plot(explainer.expected_value[1], shap_values[1][1], mode.iloc[1]) #2012 year # Calculate force plot for a given value 2012 shap.initjs() shap_values_2012 = explainer.shap_values(mode.iloc[[4]]) shap_display = shap.force_plot(explainer.expected_value[1], shap_values_2012[1], mode.iloc[[4]], matplotlib=True) #%% Predictions for 2C degree y_pred_2C = brf_model.predict(df_features_ec_2C_season) score_prc_2C = sum(y_pred_2C) / len(y_pred_2C) print("The ratio of failure seasons by total seasons for 2C is:", score_prc_2C)
# Dependence plots for two features, each drawn with the default interaction
# colouring and again with interaction colouring disabled.
shap.dependence_plot("user_level", shap_values, X_test)
shap.dependence_plot("user_level", shap_values, X_test, interaction_index=None)
shap.dependence_plot("education", shap_values, X_test)
shap.dependence_plot("education", shap_values, X_test, interaction_index=None)
# load JS visualization code to notebook
shap.initjs()
# Force plot for row 1594, rendered with matplotlib and saved as PNG.
plt.clf()
shap.force_plot(explainer.expected_value, shap_values[1594, :], X_test_disp.iloc[1594, :], matplotlib=True, show=False, figsize=(20, 4))
plt.savefig("prediction1.png", bbox_inches='tight')
# Same force plot again, this time saved as EPS.
plt.clf()
shap.force_plot(explainer.expected_value, shap_values[1594, :], X_test_disp.iloc[1594, :], matplotlib=True, show=False, figsize=(20, 4))
plt.savefig("prediction1.eps", bbox_inches='tight', format='eps')
# Decision plot for the same row (shown, not saved).
shap.decision_plot(explainer.expected_value, shap_values[1594, :], X_test_disp.iloc[1594, :])
def _score_classes(models, row_values, feature_names):
    """Score one feature row with every per-class model.

    Args:
        models: Mapping of class name -> trained model exposing ``predict``.
        row_values: 1-D array with the feature values of a single patient.
        feature_names: Column names matching ``row_values``.

    Returns:
        ``(frame, predicted_class)``. ``frame`` has the columns
        ``predicted_probability`` (rescaled so the classes sum to 1),
        ``classname`` and ``color``. ``predicted_class`` is the class with the
        highest raw score; it stays ``-1`` if no score is greater than ``-1``.
    """
    classnames = []
    scores = []
    predicted_class = -1
    best_score = -1
    for classname, model in models.items():
        matrix = xgb.DMatrix(row_values.reshape(1, -1), feature_names=feature_names)
        score = model.predict(matrix)[0]
        classnames.append(classname)
        scores.append(score)
        # Strict comparison: on a tie the first class encountered wins.
        if score > best_score:
            predicted_class, best_score = classname, score

    frame = pd.DataFrame({'predicted_probability': scores, 'classname': classnames})
    frame['predicted_probability'] = frame['predicted_probability'] / frame['predicted_probability'].sum()
    # 'zed' / 'red' are only category labels for the chart's colour encoding
    # (winner vs. the rest), not colour names.
    frame['color'] = ['zed' if name == predicted_class else 'red' for name in classnames]
    return frame, predicted_class


def _probability_chart(frame):
    """Build the horizontal bar chart of per-class probabilities.

    Args:
        frame: Frame returned by ``_score_classes``.

    Returns:
        An Altair chart with classes sorted by descending probability.
    """
    import altair as alt

    chart_frame = frame.rename(
        columns={"classname": "Class Labels", "predicted_probability": "Predicted Probability"})
    return alt.Chart(chart_frame).mark_bar().encode(
        y=alt.Y('Class Labels:N',
                sort=alt.EncodingSortField(field="Predicted Probability", order='descending')),
        x=alt.X('Predicted Probability:Q'),
        color=alt.Color('color', legend=None),
    ).properties(width=500, height=300)


def _render_shap_trajectory(model, base_value, model_row, display_row, feature_order=None):
    """Draw the SHAP force plot and decision plot for one row in Streamlit.

    Args:
        model: Tree model to explain.
        base_value: Base value handed to both SHAP plots.
        model_row: Single-row frame with the values the model consumes.
        display_row: Single-row frame with human-readable values, shown as the
            feature values in the decision plot. Its columns must be in the
            same order as ``model_row``.
        feature_order: Optional explicit feature order for the decision plot;
            when omitted SHAP's default ordering is used.

    Returns:
        The object returned by ``shap.decision_plot(..., return_objects=True)``.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(model_row)

    shap.force_plot(base_value, shap_values, model_row, show=False, matplotlib=True)
    # Hand Streamlit the figure explicitly and close it afterwards: the script
    # reruns on every interaction and pyplot keeps unclosed figures alive.
    force_fig = plt.gcf()
    st.pyplot(force_fig)
    plt.close(force_fig)

    fig, ax = plt.subplots()
    # Only pass feature_order when given, so SHAP's own default applies otherwise.
    order_kwargs = {} if feature_order is None else {'feature_order': feature_order}
    result = shap.decision_plot(
        base_value, shap_values, display_row,
        link='logit', return_objects=True, new_base_value=0, highlight=0,
        show=False,  # the figure is rendered by st.pyplot below
        **order_kwargs)
    st.pyplot(fig)
    plt.close(fig)
    return result


def app():
    """Render the "Model Perturbation Analysis" Streamlit page.

    The page loads the saved per-class models and validation data, shows one
    patient's feature values as editable widgets, and displays the predicted
    class probabilities with SHAP force/decision plots for the stored values.
    When the what-if checkbox is ticked, the same outputs are shown next to
    them for the values currently entered in the widgets, with changed
    features highlighted.

    Relies on the module-level ``feature_mapping`` (column name -> display
    label) and on files under ``saved_models/``. Those pickle/joblib files are
    loaded as-is and must come from a trusted source.
    """
    from collections import defaultdict
    import random

    st.markdown("""<style>.big-font {font-size:100px !important;}</style>""", unsafe_allow_html=True)
    st.markdown(
        """<style> .boxBorder { border: 2px solid #990066; padding: 10px; outline: #990066 solid 5px; outline-offset: 5px; font-size:25px; }</style> """,
        unsafe_allow_html=True)
    st.markdown('<div class="boxBorder"><font color="RED">Disclaimer: This predictive tool is only for research purposes</font></div>', unsafe_allow_html=True)
    st.write("## Model Perturbation Analysis")

    @st.cache(hash_funcs={"MyUnhashableClass": lambda _: None})
    def load_class_names():
        with open('saved_models/trainXGB_class_map.pkl', 'rb') as f:
            return list(pickle.load(f))

    def load_models(names):
        return {name: joblib.load('saved_models/trainXGB_gpu_{}.model'.format(name)) for name in names}

    @st.cache(hash_funcs={"MyUnhashableClass": lambda _: None})
    def load_class_data(classname):
        with open('saved_models/trainXGB_gpu_{}.data'.format(classname), 'rb') as f:
            return pickle.load(f)

    @st.cache(hash_funcs={"MyUnhashableClass": lambda _: None})
    def load_categorical_map():
        with open('saved_models/trainXGB_categorical_map.pkl', 'rb') as f:
            return pickle.load(f)

    class_names = load_class_names()
    M_dict = load_models(class_names)
    train = load_class_data(class_names[0])
    col_dict_map = load_categorical_map()

    # Model-ready validation features, re-indexed by patient ID.
    X = train[1]['X_valid'].copy()
    ids = list(train[3]['ID_test'])
    X.index = ids

    # Display copy: missing values are shown as 'X'.
    X_new = X.fillna('X')
    # A column is categorical exactly when it has an entry in col_dict_map.
    categorical_columns = []
    numerical_columns = []
    for col in X_new.columns:
        if col_dict_map.get(col, None) is not None:
            categorical_columns.append(col)
        else:
            numerical_columns.append(col)

    st.write('### Please enter the following {} factors to perform prediction or select a random patient'.format(len(categorical_columns + numerical_columns)))

    # NOTE(review): the random choice is not stored anywhere, so it only holds
    # for the run in which the button reports a click; otherwise the first
    # patient is used.
    if st.button("Random Patient"):
        select_patient = random.choice(list(X.index))
    else:
        select_patient = list(X.index)[0]

    # column name -> one-element list with the value entered in its widget
    new_feature_input = defaultdict(list)

    # Invert each label -> code map once; used to show labels instead of codes.
    reverse_maps = {
        key: {code: label for label, code in mapping.items()}
        for key, mapping in col_dict_map.items()
    }
    for key, reverse in reverse_maps.items():
        X_new[key] = X_new[key].map(lambda x: reverse.get(x, x))

    st.write('--'*10)
    st.write('##### Note: X denoted NA values')

    # Widgets are dealt round-robin into four columns.
    input_columns = st.beta_columns(4)

    for position, col in enumerate(categorical_columns):
        with input_columns[position % 4]:
            options = list(X_new[col].unique())
            current = options.index(X_new.loc[select_patient, col])
            choice = st.selectbox("{}".format(feature_mapping[col]), options, index=current)
            # Store the model's code for the chosen label; 'X' (or any label
            # without a code) becomes NaN.
            new_feature_input[col].append(col_dict_map[col].get(choice, np.nan))

    for col in numerical_columns:
        X_new[col] = X_new[col].map(lambda x: float(x) if not x == 'X' else np.nan)

    for position, col in enumerate(numerical_columns):
        with input_columns[position % 4]:
            current = X_new.loc[select_patient, col]
            entered = st.number_input(
                "{}".format(feature_mapping[col]),
                min_value=X_new[col].min(), max_value=X_new[col].max(), value=current)
            new_feature_input[col].append(entered)

    st.write('--'*10)
    st.write("### Do you want to see the effect of changing a factor on this patient?")

    show_whatif = st.checkbox("Enable what-if analysis")
    col01, col02 = st.beta_columns(2)

    with col01:
        st.write('### Prediction on actual feature values')
        feature_print = X_new.loc[select_patient, :].fillna('X')
        feature_print.index = feature_print.index.map(lambda x: feature_mapping[x])
        feature_print = feature_print.reset_index()
        feature_print.columns = ["Feature Name", "Feature Value"]
        st.table(feature_print.set_index("Feature Name").astype(str))

        K, predicted_class = _score_classes(M_dict, X.loc[select_patient, :].values, X.columns)
        st.write(_probability_chart(K))

        st.write('#### Model Output Trajectory for {} Class using SHAP values'.format(predicted_class))
        # Stored under the key 'explainer_train' but passed to SHAP as the
        # plots' base value.
        exval = load_class_data(predicted_class)[2]['explainer_train']
        t1 = pd.DataFrame(X.loc[select_patient, :]).T
        t2 = pd.DataFrame(X_new.loc[select_patient, :].fillna('X')).T
        r = _render_shap_trajectory(M_dict[predicted_class], exval, t1, t2)

    if show_whatif:
        with col02:
            # Widgets were created categorical-first, so put the entered row
            # back into the model's column order straight away. Everything
            # below (change detection, scoring, SHAP labels) lines up by
            # position with X.
            dfl = pd.DataFrame(new_feature_input)[X.columns]
            ndfl = dfl.copy()
            for key, reverse in reverse_maps.items():
                ndfl[key] = ndfl[key].map(lambda x: reverse.get(x, x))

            st.write('### Prediction with what-if analysis')
            feature_print_what = ndfl.iloc[0].fillna('X')
            feature_print_what.index = feature_print_what.index.map(lambda x: feature_mapping[x])
            feature_print_what = feature_print_what.reset_index()
            feature_print_what.columns = ["Feature Name", "Feature Value"]

            # Names of the features whose entered value differs from the
            # patient's stored value.
            selected = [
                name
                for name, before, after in zip(
                    feature_print["Feature Name"],
                    feature_print["Feature Value"],
                    feature_print_what["Feature Value"],
                )
                if before != after
            ]
            st.table(feature_print_what.astype(str).set_index("Feature Name").style.apply(lambda x: ['background: yellow' if (x.name in selected) else 'background: lightgreen' for i in x], axis=1))

            dfl = dfl.replace('X', np.nan)
            K, predicted_class = _score_classes(M_dict, dfl.iloc[0, :].values, dfl.columns)
            st.write(_probability_chart(K))

            st.write('#### Model Output Trajectory for {} Class using SHAP values'.format(predicted_class))
            exval = load_class_data(predicted_class)[2]['explainer_train']
            # Reuse the feature order of the left-hand decision plot so the
            # two trajectories can be compared line by line.
            _render_shap_trajectory(
                M_dict[predicted_class], exval, dfl.copy(), ndfl.fillna('X'),
                feature_order=r.feature_idx)