def plot_shap_summary(
    shap_values,
    features,
    plot_type=None,
    figsize=None,
    color=None,
    max_display=None,
    feature_names=None,
    title=None,
    show=True,
    sort=True,
    color_bar=True,
    layered_violin_max_num_bins=None,
    class_names=None,
    class_inds=None,
    color_bar_label=None,
):
    """Plot a SHAP summary plot.

    Helper around ``shap.summary_plot`` that fills in defaults for a few
    arguments, validates their types and forwards everything else unchanged.
    It is meant to work with the SHAP values of any explainer type (tree,
    linear, dnn).

    Parameters
    ----------
    shap_values: Numpy array or Pandas DataFrame
        Calculated SHAP values. For single-output explanations such as binary
        classification this is a matrix of SHAP values
        (n_samples, n_features). For multi-output explanations this is a list
        of such matrices of SHAP values

    features: Numpy array or Pandas DataFrame
        The feature matrix that was used to calculate the SHAP values. For the
        case of a Numpy array it is recommended to pass the feature_names list
        as well

    plot_type: str, optional (single-output default="dot", multi-output default="bar")
        The type of summary plot. Options are "bar", "dot", "violin", and
        "compact_dot" which is recommended for SHAP interactions

    figsize: tuple or list, optional, (default="auto")
        Figure size

    color: str, optional, (default="#D0AAF3")
        Color of the horizontal lines when plot_type="bar"

    max_display: int, optional, (default=20)
        Limit to show the number of features in the plot

    feature_names: list, optional, (default=None)
        List of feature names to pass. It should follow the order of features

    title: str, optional, (default=None)
        Title of the plot

    show: bool, optional, (default=True)
        Flag to show the plot in an interactive environment

    sort: bool, optional, (default=True)
        Flag to plot sorted shap values in descending order

    color_bar: bool, optional, (default=True)
        Flag to show color_bar when plot_type is "dot" or "violin"

    layered_violin_max_num_bins: int, optional, (default=20)
        The number of bins for calculating the violin plots ranges and outliers

    class_names: list, optional, (default=None)
        List of class names for multi-output problems

    class_inds: list, optional, (default=None)
        List of class indices for multi-output problems

    color_bar_label: str, optional, (default="Feature Value")
        Label for color bar

    Raises
    ------
    TypeError
        If figsize is not a tuple/list, or color / color_bar_label is not a
        str, or layered_violin_max_num_bins is not an int.
    """
    # initializing figsize
    if figsize is None:
        figsize = "auto"
    elif not isinstance(figsize, (list, tuple)):
        raise TypeError("Only tuple and list types are allowed for figsize.")

    # initializing color
    if color is None:
        color = "#D0AAF3"
    elif not isinstance(color, str):
        raise TypeError("Only str type is allowed for color.")

    # initializing layered_violin_max_num_bins
    if layered_violin_max_num_bins is None:
        layered_violin_max_num_bins = 20
    elif not isinstance(layered_violin_max_num_bins, int):
        raise TypeError(
            "Only int type is allowed for layered_violin_max_num_bins.")

    # initializing color_bar_label
    # (the label is text: checking for int here rejected every valid label)
    if color_bar_label is None:
        color_bar_label = "Feature Value"
    elif not isinstance(color_bar_label, str):
        raise TypeError("Only str type is allowed for color_bar_label.")

    shap.summary_plot(
        shap_values,
        features,
        plot_type=plot_type,
        plot_size=figsize,
        color=color,
        max_display=max_display,
        feature_names=feature_names,
        title=title,
        show=show,
        sort=sort,
        color_bar=color_bar,
        layered_violin_max_num_bins=layered_violin_max_num_bins,
        class_names=class_names,
        class_inds=class_inds,
        color_bar_label=color_bar_label,
    )
    # Respect show=False so callers can still customise or save the figure.
    if show:
        plt.show()
# Streamlit section gated by the "Afficher les informations du client?"
# sidebar checkbox. It writes selected fields of `selected_id` (family status,
# number of children, DAYS_BIRTH / 365 as an age, phone-change days, credit,
# income and annuity amounts), renders two SHAP summary plots of
# `shap_values[0]` against `X` in the sidebar (bar, then default), and ranks
# the columns of `X` by summed absolute SHAP value, keeping the six top
# column names in `val`.
# NOTE(review): the original indentation is lost, so which statements belong
# to the `if` body cannot be told from here - confirm before editing.
# NOTE(review): `int(selected_id["DAYS_BIRTH"].values / 365)` converts a NumPy
# array to int, which only works when exactly one row is selected - confirm.
# NOTE(review): `sum(vals)` is the builtin sum over the first axis; presumably
# `vals` is (n_samples, n_features), in which case `vals.sum(axis=0)` is the
# direct spelling - confirm.
# NOTE(review): shap.summary_plot is called with its default show=True before
# the figure is handed to st.sidebar.pyplot; consider show=False.
if st.sidebar.checkbox("Afficher les informations du client?"): st.write("Statut famille :**", selected_id["NAME_FAMILY_STATUS"].iloc[0], "**") st.write("Nombre d'enfant(s) :**", selected_id["CNT_CHILDREN"].iloc[0], "**") st.write("Age client :**", int(selected_id["DAYS_BIRTH"].values / 365), "**", "ans.") st.write("DAYS_LAST_PHONE_CHANGE", selected_id['DAYS_LAST_PHONE_CHANGE'].iloc[0]) st.write("AMT CREDIT", selected_id['AMT_CREDIT'].iloc[0]) st.write("AMT INCOME TOTAL", selected_id['AMT_INCOME_TOTAL'].iloc[0]) st.write("AMT ANNUITY", selected_id['AMT_ANNUITY'].iloc[0]) fig, axs = plt.subplots(nrows=1, ncols=1) shap.summary_plot(shap_values[0], X, plot_type='bar') st.sidebar.pyplot(fig) fig1, ax = plt.subplots(nrows=1, ncols=1) shap.summary_plot(shap_values[0], X) st.sidebar.pyplot(fig1) vals = np.abs(shap_values[0]) feature_importance = pd.DataFrame( list(zip(X.columns, sum(vals))), columns=['col_name', 'feature_importance_vals']) feature_importance.sort_values(by=['feature_importance_vals'], ascending=False, inplace=True) val = feature_importance['col_name'].head(6)
# Exploration cell: builds a shap.TreeExplainer for `brf_model` and, for the
# two feature frames (original and permuted), computes approximate SHAP
# values and draws dependence plots per column, summary plots (all outputs as
# bars, then index 1 as bars and as the default plot), decision plots and a
# matplotlib force plot, all using index 1 of the SHAP values / expected value.
# NOTE(review): the original indentation is lost, so the extent of the two
# `for` loops is not visible here - confirm which statements run per `mode`.
# NOTE(review): the "#2012 year" remark sits next to `mode.iloc[1]` while the
# force plot described as 2012 uses `mode.iloc[[4]]` - confirm which row is 2012.
# NOTE(review): presumably index 1 is the failure class (see "# Failure") - confirm.
plt.title('Number of cases above a failure prediction level') #%% FURTHER EXPLORATION: train explainer shap, explore predictions and how decisions are called import shap explainer = shap.TreeExplainer(brf_model) for mode in [df_features_ec_season, df_features_ec_season_permuted]: shap_values = explainer.shap_values(mode, approximate=True, check_additivity=True) # dependence plots for name in mode.columns: shap.dependence_plot(name, shap_values[1], mode) # Summary plots shap.summary_plot(shap_values, mode, plot_type="bar") shap.summary_plot(shap_values[1], mode, plot_type="bar") shap.summary_plot(shap_values[1], mode) # Failure # Decision plots explaining decisions to classify shap.decision_plot(explainer.expected_value[1], shap_values[1], mode) shap.decision_plot(explainer.expected_value[1], shap_values[1][1], mode.iloc[1]) #2012 year # Calculate force plot for a given value 2012 shap.initjs() shap_values_2012 = explainer.shap_values(mode.iloc[[4]]) shap_display = shap.force_plot(explainer.expected_value[1], shap_values_2012[1], mode.iloc[[4]], matplotlib=True)
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import shap
import xgboost as xgb
from pylab import rcParams

# Tall default canvas so every feature row of the summary plot stays legible.
rcParams['figure.figsize'] = (8, 16)

# Held-out features and targets; the first CSV column is the index.
X_test = pd.read_csv('models/X_test.csv', index_col=0)
Y_test = pd.read_csv('models/Y_test.csv', index_col=0)

# Load the trained booster and explain it over the whole test set.
xg_reg = joblib.load('models/XG_boost.model')
shap.initjs()
explainer = shap.TreeExplainer(xg_reg)
shap_values = explainer.shap_values(X_test)

# Summary plot over all test rows; show=False keeps the figure open so it can
# be laid out and written to disk.
shap.summary_plot(shap_values, X_test, show=False)
plt.tight_layout()
plt.savefig("plots/shap_summary_plot.png")
# Plotly bar chart of the precomputed feature-importance table.
fig = px.bar(fi_dt, x='Importance', y='Feature', orientation='h', color='Importance')
st.plotly_chart(fig)

###
###
st.title("Shap Value")
import shap

shap_values = shap.TreeExplainer(model).shap_values(X)
# shap.summary_plot draws onto the current matplotlib figure; its return value
# is not a figure, so passing it straight to st.pyplot made Streamlit fall
# back to the global figure. Create the figure, draw with show=False, then
# hand that figure to Streamlit explicitly.
bar_fig = plt.figure()
shap.summary_plot(shap_values, X, plot_type="bar", show=False)
st.pyplot(bar_fig, clear_figure=True)

###
###
st.title("Shap Summary Plot")
f = plt.figure()
shap.summary_plot(shap_values, X, show=False)
st.pyplot(f, clear_figure=True)

###
###
import shap
import streamlit as st
import streamlit.components.v1 as components
def _event_duration_frame(signed_target):
    """Split a signed target into an ``event`` (1 if > 0) / ``duration`` (abs) frame."""
    return pd.DataFrame(
        np.vstack((np.where(signed_target > 0, 1, 0), np.abs(signed_target))).T,
        columns=["event", "duration"],
    )


def main(args):
    """Run evaluation for the data set.

    1. Loads model from tar.gz
    2. Reads in train and test features
    3. Runs an accuracy report (classification report, accuracy,
       IPCW concordance index and Brier score) and writes it as JSON
    4. Generates feature importance with SHAP (CSV dump and summary plot)

    Args:
        model-name (str): Name of the trained model, default xgboost
        test-features (str): preprocessed test features for evaluation, default test_features.csv
        train-features (str): preprocessed train features for SHAP, default train_features.csv
        report-name (str): Name of the evaluation output, default evaluation.json
        shap-name (str): Name of the SHAP feature importance output file, default shap.csv
        threshold (float): Threshold to cut probabilities at, default 0.5
        tau (int): time range for the c-index will be from 0 to tau, default 100
    """
    model_path = os.path.join("/opt/ml/processing/model", "model.tar.gz")
    logger.info(f"Extracting model from path: {model_path}")
    # NOTE(review): extractall() and pickle.load() both trust their input;
    # presumably the archive always comes from this pipeline's own training
    # job - never point this at an artifact from an untrusted source.
    with tarfile.open(model_path) as tar:
        tar.extractall(path=".")
    logger.info("Loading model")
    with open(args.model_name, "rb") as f:
        model = pickle.load(f)

    logger.info("Loading train and test data")
    test_features_data = os.path.join("/opt/ml/processing/test", args.test_features)
    train_features_data = os.path.join("/opt/ml/processing/train", args.train_features)

    X_test = pd.read_csv(test_features_data, header=0)
    X_train = pd.read_csv(train_features_data, header=0)

    # The first column of each file is the signed target.
    y_test = X_test.iloc[:, 0]
    y_train = X_train.iloc[:, 0]

    # Reverse transform to event and duration columns
    y_test_df = _event_duration_frame(y_test)
    y_train_df = _event_duration_frame(y_train)

    # Drop the target column from each frame using that frame's OWN first
    # column. Reading X_test.columns[0] after X_test had already been trimmed
    # removed the first feature from X_train and left its target in place.
    X_test.drop(X_test.columns[0], axis=1, inplace=True)
    X_train.drop(X_train.columns[0], axis=1, inplace=True)

    logger.info("Running inference")
    # NOTE(review): `[:, 1:]` skips one more column after the target has been
    # dropped; left untouched because the model's expected feature count is
    # not visible here - confirm this is intended.
    predictions = model.predict(xgboost.DMatrix(X_test.values[:, 1:]), output_margin=False)

    logger.info("Creating evaluation report")

    # NOTE: technical evaluation is really not as a classifier
    # TO DO: Normalize to 0 to 1 scale
    report_dict = classification_report(y_test_df["event"], predictions > args.threshold, output_dict=True)
    report_dict["accuracy"] = accuracy_score(y_test_df["event"], predictions > args.threshold)

    _, y_train_tuple = get_x_y(y_train_df, ["event", "duration"], pos_label=True)
    _, y_test_tuple = get_x_y(y_test_df, ["event", "duration"], pos_label=True)

    concordance_index = concordance_index_ipcw(
        y_train_tuple,
        y_test_tuple,
        predictions,
        tau=args.tau,  # default within 100 days
    )

    report_dict["concordance_index"] = {
        "cindex": float(concordance_index[0]),
        "concordant": int(concordance_index[1]),
        "discordant": int(concordance_index[2]),
        "tied_risk": int(concordance_index[3]),
        "tied_time": int(concordance_index[4]),
    }

    times, score = brier_score(y_train_tuple, y_test_tuple, predictions, y_test_df["duration"].max() - 1)

    report_dict["brier_score"] = {
        "times": times.astype(np.int32).tolist(),
        "score": score.astype(np.float32).tolist(),
    }

    logger.info(f"Classification report:\n{report_dict}")

    evaluation_output_path = os.path.join("/opt/ml/processing/evaluation", args.report_name)
    logger.info(f"Saving classification report to {evaluation_output_path}")

    logger.debug(report_dict)

    with open(evaluation_output_path, "w") as f:
        f.write(json.dumps(report_dict))

    # SHAP
    latest_job_debugger_artifacts_path = "/opt/ml/processing/debug/debug-output"
    trial = create_trial(latest_job_debugger_artifacts_path)

    shap_values = trial.tensor("full_shap/f0").value(trial.last_complete_step)

    pd.DataFrame(shap_values).to_csv(
        os.path.join("/opt/ml/processing/evaluation", args.shap_name))

    # presumably the last column is the SHAP base value and the first row is
    # not a data row - TODO confirm against the debugger tensor layout
    shap_no_base = shap_values[1:, :-1]
    feature_names = X_train.columns
    os.makedirs("/opt/ml/processing/plot/", exist_ok=True)
    # The first positional argument of logger.info is the format string;
    # passing three shapes positionally produced a logging formatting error.
    logger.info(
        "SHAP shapes - full: %s, without base: %s, X_train: %s",
        shap_values.shape,
        shap_no_base.shape,
        X_train.shape,
    )
    shap.summary_plot(shap_no_base, features=X_train, feature_names=feature_names, show=False)
    plt.savefig("/opt/ml/processing/plot/feature_importance.png", bbox_inches="tight")
# Just like with the permutation method, we might also want to understand model output in aggregate. Shapley values allow us to do this as well. Run the next cell to initialize the shapley values for each example in the test set (this may also take a few minutes).

# In[41]:


# Build the tree explainer once, then keep index 1 of the values it returns.
rf_explainer = shap.TreeExplainer(rf)
shap_values = rf_explainer.shap_values(X_test)[1]


# You can ignore the `setting feature_perturbation` message.

# Run the next cell to see a summary plot of the shapley values for each feature on each of the test examples. The colors indicate the value of the feature. The features are listed in terms of decreasing absolute average shapley value over all the individuals in the dataset.

# In[42]:


shap.summary_plot(shap_values, features=X_test)


# In the above plot, you might be able to notice a high concentration of points on specific SHAP value ranges. This means that a high proportion of our test set lies on those ranges.
#
# As with the permutation method, age, sex, poverty index, and diastolic BP seem to be the most important features. Being older has a negative impact on mortality, and being a woman (sex=2.0) has a positive effect.

# <a name="2-2-3"></a>
# #### 2.2.3 Visualizing Interactions between Features

# The `shap` library also lets you visualize interactions between features using dependence plots. These plot the Shapley value for a given feature for each data point, and color the points in using the value for another feature. This lets us begin to explain the variation in shapley value for a single value of the main feature.

# Run the next cell to see the interaction between Age and Sex.

# In[43]:
# Cross-validate a random forest wrapped with isolation-forest outlier removal.
clf = RandomForestClassifier(n_estimators=100, max_depth=30, n_jobs=-1)
outlier_detector = IsolationForest(contamination=0.15)
without_outliers_classifier = WithoutOutliersClassifier(outlier_detector, clf)
cross_validate_test(
    X,
    y,
    without_outliers_classifier,
    metric=accuracy_score,
    outlier_detection=True,
)

# %%
import shap

# Rebuild X / y from the raw data without dropping any column.
tf = transformation(raw_data).drop_columns(columns=[])
X, y = tf.create_X_y()

# Impute missing values, then standardise; fitted and applied on the same X.
pipe = make_pipeline(KNNImputer(n_neighbors=7), StandardScaler())
X_train = pipe.fit(X).transform(X)

clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf.fit(X_train, y)

# Explain the forest and plot index 1 of the returned SHAP values, labelling
# the columns with the names of the untransformed frame.
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_train)[1]
shap.summary_plot(shap_values, X_train, feature_names=X.columns.tolist())
# shap.summary_plot(shap_values, X_train, X.columns.tolist(), plot_type="bar")
# Build Regression Model
model = RandomForestRegressor()
model.fit(X, Y)

# Apply Model to Make Prediction
# Unpickle our model RF so we can use it!
if not os.path.isfile("./model.pkl"):
    raise FileNotFoundError("Model file not found: ./model.pkl")
# NOTE: pickle executes arbitrary code on load - only ship a model.pkl you built.
# The context manager closes the file handle the old open(...) call leaked.
with open("./model.pkl", "rb") as model_file:
    mod = pickle.load(model_file)
prediction_RF = mod.predict(df)

st.write("""**Median Predicted value** of owner-occupied homes in $1000s""")
st.write(prediction_RF)
st.write('---')

# Explaining the model's predictions using SHAP values
# NOTE(review): the explainer is built on the freshly fitted `model`, while
# the prediction above comes from the unpickled `mod` - confirm this is intended.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

st.header('Feature Importance')
plt.title('Feature importance based on SHAP values')
# Draw with show=False and pass the figure explicitly: calling st.pyplot()
# without a figure relies on matplotlib's global state. clear_figure=True
# keeps the clearing that the argument-less call performed.
shap.summary_plot(shap_values, X, show=False)
st.pyplot(plt.gcf(), clear_figure=True, bbox_inches='tight')
st.write('---')

plt.title('Feature importance based on SHAP values (Bar)')
shap.summary_plot(shap_values, X, plot_type="bar", show=False)
st.pyplot(plt.gcf(), clear_figure=True, bbox_inches='tight')
Original file is located at https://colab.research.google.com/drive/1qbgfxSs_mTGTnBbBaP87YqOEuOYKxOu- ### Opening the black box ### eli5 """ import eli5 eli5.show_weights(random_forest, feature_names=features) """### SHAP""" import shap shap_values = shap.TreeExplainer(random_forest).shap_values(X_rus) shap.summary_plot(shap_values, X_rus, plot_type="bar", feature_names=features) """### LIME""" import lime predict_fn_xgb = lambda x: random_forest.predict_proba(x).astype(float) explainer = lime.lime_tabular.LimeTabularExplainer(X_rus, feature_names=features, kernel_width=3) observation_1 = 2 exp = explainer.explain_instance(X_rus[observation_1], predict_fn_xgb, num_features=6) exp.show_in_notebook(show_all=False)
def _bar_ranking_plot(mean_shap_values, X, folder, max_feats, ext=".png"):
    """Draw the SHAP bar summary plot, title it and save it into *folder*.

    The plot shows at most *max_feats* features and is written to
    ``shap_bar_rank<ext>`` at 200 dpi with a tight bounding box. The figure
    is neither shown nor closed here.
    """
    summary_options = {
        "plot_type": "bar",
        "max_display": max_feats,
        "show": False,  # keep the figure open so it can be titled and saved
    }
    shap.summary_plot(mean_shap_values, X, **summary_options)
    plt.title("Feature Rankings-All Classes")

    file_name = "shap_bar_rank" + ext
    target = os.path.join(folder, file_name)
    plt.savefig(target, dpi=200, bbox_inches="tight")
def explain(x, model, task, path="outputs/plots/mlflow_artifacts/shap", n_features=5):
    '''
    Explain a model's decisions based on SHAP value approximation.
    SHAP algorithm is quadratic with the depth of trees.
    -> Be careful not to go over 12 for max_depth.

    Any existing directory at `path` is deleted first, then summary plots are
    written to `<path>/summary_plots` and dependence plots for the
    `n_features` most important features to `<path>/dependance_plots`.

    Args:
        x (DataFrame): Input data
        model ([type]): Tree model to explain (passed to shap.TreeExplainer).
            For classification it must expose `classes_`.
        task (str): Task to perform. Available: regression, classification.
            Any other value only creates the empty output folders.
        path (str, optional): Output folder for the plots.
            Defaults to "outputs/plots/mlflow_artifacts/shap".
        n_features (int, optional): Number of most important features for
            which to generate partial dependance plot.
    '''
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(x, check_additivity=False)

    # Start from an empty output tree so stale plots never survive a rerun.
    if os.path.exists(path):
        shutil.rmtree(path)

    # Compute top features: absolute SHAP values summed over the samples.
    abs_shap = np.abs(shap_values)
    if abs_shap.ndim == 3:
        # assumes a per-class layout (n_classes, n_samples, n_features), as
        # implied by the shap_values[i] indexing below - TODO confirm for the
        # shap version in use. Average the classes before summing samples.
        abs_shap = abs_shap.mean(0)
    # For a single 2-D matrix (regression) the previous mean(0) followed by
    # the builtin sum() collapsed to a scalar and zip() raised TypeError.
    importance = abs_shap.sum(axis=0)
    feature_importance = pd.DataFrame(
        list(zip(x.columns, importance)),
        columns=['feature', 'feature_importance_vals'])
    feature_importance.sort_values(by=['feature_importance_vals'],
                                   ascending=False,
                                   inplace=True)
    top_features = feature_importance['feature'].values[:n_features]

    os.makedirs(f"{path}/summary_plots")
    os.makedirs(f"{path}/dependance_plots")
    # os.makedirs("{}/interaction_plots".format(path))

    # Many figures are produced in a row; silence matplotlib's open-figure warning.
    plt.rcParams.update({'figure.max_open_warning': 0})

    if task == 'classification':
        ## Summary plots
        shap.summary_plot(shap_values, x, class_names=model.classes_, show=False)
        plt.savefig(f"{path}/summary_plots/main_summary_plot.png",
                    dpi=150,
                    bbox_inches='tight')
        plt.clf()

        for i, klass in enumerate(model.classes_):
            shap.summary_plot(shap_values[i], x, class_names=model.classes_, show=False)
            plt.savefig(f"{path}/summary_plots/{klass}_summary_plot.png",
                        dpi=150,
                        bbox_inches='tight')
            plt.clf()

        ## Dependance plots
        for feature in top_features:
            for i, klass in enumerate(model.classes_):
                shap.dependence_plot(
                    feature,
                    shap_values[i],
                    x,
                    title=f'Impact of the {feature} variable on the prediction of {klass}',
                    show=False)
                plt.savefig(
                    f"{path}/dependance_plots/{klass}_{feature}_dependance_plot.png",
                    dpi=150,
                    bbox_inches='tight')
                plt.clf()

        ## Interaction plots
        # TO DO: takes the 5 most imporant values and watch for interaction
        # plt.clf()
        # explainer.shap_interaction_values(x)
        # plt.savefig("plots/shap/interaction_plots/interaction_plot.png",dpi=150, bbox_inches='tight')

    elif task == 'regression':
        ## Summary plots
        shap.summary_plot(shap_values, x, show=False)
        plt.savefig(f"{path}/summary_plots/main_summary_plot.png",
                    dpi=150,
                    bbox_inches='tight')
        plt.clf()

        ## Dependance plots
        # Limited to the top features, matching the classification branch and
        # the documented meaning of n_features.
        for feature in top_features:
            shap.dependence_plot(
                feature,
                shap_values,
                x,
                title=f'Impact of the {feature} variable',
                show=False)
            plt.savefig(f"{path}/dependance_plots/{feature}_dependance_plot.png",
                        dpi=150,
                        bbox_inches='tight')
            plt.clf()
# Notebook inspection cell (the "# +" / "# -" markers are cell delimiters).
# It runs `model` on one test item, displays several `interpreter` attributes
# and reshaped array shapes, draws a SHAP bar summary plot from
# `interpreter.feat_scores` and a `[:, :4, 2:]` slice of the test data (both
# flattened to `model.n_inputs+1` columns), then computes the mean absolute
# feature score over all leading axes and its argsort.
# NOTE(review): the shape preview uses `val_features[:, :4, 2:]` while the plot
# uses `interpreter.test_data[:, :4, 2:]` - confirm both refer to the same data.
# NOTE(review): presumably the first two entries of the last axis are
# identifier / timestamp columns skipped by `2:` - confirm.
model(interpreter.test_data[idx, :, 2:].unsqueeze(0)) interpreter.test_data[idx] interpreter.explainer.subject_ids interpreter.feat_names interpreter.feat_scores.reshape(-1, model.n_inputs+1).shape val_features[:, :4, 2:].numpy().reshape(-1, model.n_inputs+1).shape # Summarize the effects of all the features shap.summary_plot(interpreter.feat_scores.reshape(-1, model.n_inputs+1), features=interpreter.test_data[:, :4, 2:].numpy().reshape(-1, model.n_inputs+1), feature_names=interpreter.feat_names, plot_type='bar') # + # [TODO] Do the same bar plot as above but in plotly # - np.abs(interpreter.feat_scores).reshape(-1, interpreter.feat_scores.shape[-1]).shape mean_abs_shap = np.mean(np.abs(interpreter.feat_scores).reshape(-1, interpreter.feat_scores.shape[-1]), axis=0) mean_abs_shap sorted_idx = np.argsort(mean_abs_shap) sorted_idx interpreter.feat_names
def train(self, importance_cutoff=0.15, limit=None):
    """Train the classifier, report its quality and persist the model.

    Extracts features, splits the data, optionally cross-validates and
    resamples, fits ``self.clf``, optionally computes SHAP feature
    importances (saved to ``feature_importance.png``), prints training and
    test reports (also per confidence threshold), optionally retrains on the
    entire dataset and dumps ``self`` (and, if enabled, the dataset) with
    joblib.

    Args:
        importance_cutoff: forwarded to ``self.get_important_features``.
        limit: when truthy, only the first ``limit`` items of X and y are used.

    Returns:
        dict: tracking metrics - cross-validation scores, the feature report,
        the test classification report (non-multilabel only) and the test
        confusion matrix, depending on which steps ran.
    """
    classes, self.class_names = self.get_labels()
    self.class_names = sort_class_names(self.class_names)

    # Get items and labels, filtering out those for which we have no labels.
    X_gen, y = split_tuple_generator(lambda: self.items_gen(classes))

    # Extract features from the items.
    X = self.extraction_pipeline.fit_transform(X_gen)

    # Calculate labels.
    y = np.array(y)

    if limit:
        X = X[:limit]
        y = y[:limit]

    print(f"X: {X.shape}, y: {y.shape}")

    is_multilabel = isinstance(y[0], np.ndarray)
    is_binary = len(self.class_names) == 2

    # Split dataset in training and test.
    X_train, X_test, y_train, y_test = self.train_test_split(X, y)
    if self.sampler is not None:
        pipeline = make_pipeline(self.sampler, self.clf)
    else:
        pipeline = self.clf

    tracking_metrics = {}

    # Use k-fold cross validation to evaluate results.
    if self.cross_validation_enabled:
        scorings = ["accuracy"]
        if len(self.class_names) == 2:
            scorings += ["precision", "recall"]

        scores = cross_validate(pipeline, X_train, y_train, scoring=scorings, cv=5)

        print("Cross Validation scores:")
        for scoring in scorings:
            score = scores[f"test_{scoring}"]
            tracking_metrics[f"test_{scoring}"] = {
                "mean": score.mean(),
                "std": score.std() * 2,
            }
            # A stray literal "f" used to be printed in front of the mean.
            print(
                f"{scoring.capitalize()}: {score.mean()} (+/- {score.std() * 2})"
            )

    print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

    # Training on the resampled dataset if sampler is provided.
    if self.sampler is not None:
        X_train, y_train = self.sampler.fit_resample(X_train, y_train)

        print(
            f"resampled X_train: {X_train.shape}, y_train: {y_train.shape}"
        )

    print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

    self.clf.fit(X_train, y_train)

    print("Model trained")

    feature_names = self.get_human_readable_feature_names()
    if self.calculate_importance and len(feature_names):
        explainer = shap.TreeExplainer(self.clf)
        shap_values = explainer.shap_values(X_train)

        # In the binary case, sometimes shap returns a single shap values matrix.
        if is_binary and not isinstance(shap_values, list):
            shap_values = [-shap_values, shap_values]
            summary_plot_value = shap_values[1]
            summary_plot_type = "layered_violin"
        else:
            summary_plot_value = shap_values
            summary_plot_type = None

        shap.summary_plot(
            summary_plot_value,
            to_array(X_train),
            feature_names=feature_names,
            class_names=self.class_names,
            plot_type=summary_plot_type,
            show=False,
        )

        # Label the axis BEFORE saving; set after savefig it never reached
        # the written image.
        matplotlib.pyplot.xlabel("Impact on model output")
        matplotlib.pyplot.savefig("feature_importance.png", bbox_inches="tight")
        matplotlib.pyplot.clf()

        important_features = self.get_important_features(
            importance_cutoff, shap_values)

        self.print_feature_importances(important_features)

        # Save the important features in the metric report too
        feature_report = self.save_feature_importances(
            important_features, feature_names)

        tracking_metrics["feature_report"] = feature_report

    print("Training Set scores:")
    y_pred = self.clf.predict(X_train)
    if not is_multilabel:
        print(
            classification_report_imbalanced(y_train,
                                             y_pred,
                                             labels=self.class_names))

    print("Test Set scores:")
    # Evaluate results on the test set.
    y_pred = self.clf.predict(X_test)

    if is_multilabel:
        assert isinstance(
            y_pred[0], np.ndarray), "The predictions should be multilabel"

    print(f"No confidence threshold - {len(y_test)} classified")
    if is_multilabel:
        confusion_matrix = metrics.multilabel_confusion_matrix(
            y_test, y_pred)
    else:
        confusion_matrix = metrics.confusion_matrix(
            y_test, y_pred, labels=self.class_names)

        print(
            classification_report_imbalanced(y_test,
                                             y_pred,
                                             labels=self.class_names))
        report = classification_report_imbalanced_values(
            y_test, y_pred, labels=self.class_names)

        tracking_metrics["report"] = report

    print_labeled_confusion_matrix(confusion_matrix,
                                   self.class_names,
                                   is_multilabel=is_multilabel)

    tracking_metrics["confusion_matrix"] = confusion_matrix.tolist()

    confidence_thresholds = [0.6, 0.7, 0.8, 0.9]

    if is_binary:
        confidence_thresholds = [0.1, 0.2, 0.3, 0.4
                                 ] + confidence_thresholds

    # Neither value depends on the threshold, so compute them once instead of
    # once per loop iteration.
    y_pred_probas = self.clf.predict_proba(X_test)
    confidence_class_names = self.class_names + ["__NOT_CLASSIFIED__"]

    # Evaluate results on the test set for some confidence thresholds.
    for confidence_threshold in confidence_thresholds:
        y_pred_filter = []
        classified_indices = []
        for i in range(len(y_test)):
            if not is_binary:
                argmax = np.argmax(y_pred_probas[i])
            else:
                argmax = 1 if y_pred_probas[i][
                    1] > confidence_threshold else 0

            # Below the threshold the item counts as "not classified".
            if y_pred_probas[i][argmax] < confidence_threshold:
                if not is_multilabel:
                    y_pred_filter.append("__NOT_CLASSIFIED__")
                continue

            classified_indices.append(i)
            if is_multilabel:
                y_pred_filter.append(y_pred[i])
            else:
                y_pred_filter.append(argmax)

        if not is_multilabel:
            # Map the encoded class indices of the classified items back to
            # their labels through the label encoder.
            y_pred_filter = np.array(y_pred_filter)
            y_pred_filter[classified_indices] = self.le.inverse_transform(
                np.array(y_pred_filter[classified_indices], dtype=int))

        classified_num = sum(1 for v in y_pred_filter
                             if v != "__NOT_CLASSIFIED__")

        print(
            f"\nConfidence threshold > {confidence_threshold} - {classified_num} classified"
        )
        if is_multilabel:
            confusion_matrix = metrics.multilabel_confusion_matrix(
                y_test[classified_indices],
                np.asarray(y_pred_filter))
        else:
            confusion_matrix = metrics.confusion_matrix(
                y_test.astype(str),
                y_pred_filter.astype(str),
                labels=confidence_class_names,
            )
            print(
                classification_report_imbalanced(
                    y_test.astype(str),
                    y_pred_filter.astype(str),
                    labels=confidence_class_names,
                ))
        print_labeled_confusion_matrix(confusion_matrix,
                                       confidence_class_names,
                                       is_multilabel=is_multilabel)

    self.evaluation()

    if self.entire_dataset_training:
        print("Retraining on the entire dataset...")

        if self.sampler is not None:
            X_train, y_train = self.sampler.fit_resample(X, y)
        else:
            X_train = X
            y_train = y

        print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

        self.clf.fit(X_train, y_train)

    # Persist the whole model object under its lower-cased class name.
    joblib.dump(self, self.__class__.__name__.lower())
    if self.store_dataset:
        joblib.dump(X, f"{self.__class__.__name__.lower()}_data_X")
        joblib.dump(y, f"{self.__class__.__name__.lower()}_data_y")

    return tracking_metrics
dict(enumerate(purchastingpower_categories))) eneder_categories = ['U', 'F', 'M'] channel_categories = ['app', 'wechat', 'pc', 'mobile', 'others'] marital_categories = ['U', 'M', 'S'] X_test_disp['gender'] = X_test_disp['gender'].map( dict(enumerate(geneder_categories))) X_test_disp['channel'] = X_test_disp['channel'].map( dict(enumerate(channel_categories))) X_test_disp['marital_status'] = X_test_disp['marital_status'].map( dict(enumerate(marital_categories))) shap_values = explainer.shap_values(X_test) shap.summary_plot(shap_values, X_test, plot_type="bar", max_display=21) shap.summary_plot(shap_values, X_test, max_display=21) predictions = model.predict(X_test) predictions[predictions < 0] = 0 actual = y_test print("RMSE Error LightGBM:{}".format( np.sqrt(metrics.mean_squared_error(actual, predictions)))) X_test.rename( columns={'ordertable-original_unit_price': 'original_unit_price'}, inplace=True)
# Interior of a method whose header is outside this excerpt (it reads `cls`,
# `mutants`, `mutant`, `x`, `y`, `plot_type` and `top_display`).
# Selects the rows where `mutants == mutant`, asserts that they carry exactly
# one label in `y` and that there is more than one such row, computes
# approximate TreeExplainer SHAP values for them and plots either a bar
# summary of all outputs or a violin summary for the output whose class
# matches that label.
# NOTE(review): the asserts act as input validation and are stripped under
# `python -O` - consider raising explicitly.
# NOTE(review): `y_test` is assigned but not used in the visible code.
# NOTE(review): plt.subplots_adjust / plt.title are applied before
# shap.summary_plot draws - confirm they survive the plot call.
model = cls.get_xgb_model() class_order = model.classes_ explainer = shap.TreeExplainer(model) test = np.where(mutants == mutant)[0] mutant_tag = np.unique(y[test]) assert len(mutant_tag) == 1 mutant_tag = mutant_tag[0] assert len(test) > 1 x_test, y_test = x[test], y[test] dim_names = BulkSignatureGenerator.dim_names() shap_values = explainer.shap_values(x_test, approximate=True) x_test = pd.DataFrame(x_test, columns=dim_names) if plot_type == 'bar': shap.summary_plot(shap_values, x_test, max_display=top_display, plot_type='bar', class_names=['deficient', 'basal', 'enhanced'], color=lambda i: list(["#b09c8599","#dc000099","#8491b4ff",])[i]) else: for i, data_in in enumerate(shap_values): if class_order[i] == mutant_tag: plt.subplots_adjust(left=0.35, right=0.98) plt.title("Mutant: %s (%d)" % (mutant, mutant_tag)) shap.summary_plot(data_in, x_test, max_display=top_display, plot_type='violin', class_names=['deficient', 'basal', 'enhanced'])
# (Continues a dict literal opened before this excerpt; presumably the tail of
# `user_input_features`, which is called right after - confirm.)
# The code that follows wraps the input dict in a one-row DataFrame, defines
# `st_shap` to embed a SHAP plot's HTML (with the SHAP JS in <head>) through
# Streamlit components, and - when the sidebar button is pressed - loads the
# pickled model, writes its prediction for the inputs, and renders a SHAP
# force plot and a bar summary plot.
# NOTE(review): the original indentation is lost, so the extent of the `if`
# body is not visible here - confirm before editing.
# NOTE(review): `pickle.load(open(pickle_file,'rb'))` never closes the file and
# unpickling executes arbitrary code - only load a trusted model file.
# NOTE(review): `st.pyplot(bbox_inches='tight')` is called without a figure and
# so relies on matplotlib's global figure - consider passing one explicitly.
'infections_value': infections_value, 'accumulated': accumulated/100} features = pd.DataFrame(data, index=[0]) return features s = user_input_features() print(shap.__version__) def st_shap(plot, height=None): shap_html = f"<head>{shap.getjs()}</head><body>{plot.html()}</body>" components.html(shap_html, height=height) pickle_file = '../models/model_lgbm_reg' if st.sidebar.button('Calculate Estimated Reproduction Rate'): st.subheader('Specified Input parameters') st.write(s) model = pickle.load(open(pickle_file,'rb')) model_predict = (model.predict(s)).astype(str) st.markdown('**Estimated Reproduction Rate:**') st.write(model_predict[0]) st.markdown('**Feature importance based on SHAP values**') explainerModel = shap.TreeExplainer(model) shap_values_Model = explainerModel.shap_values(s) st_shap(shap.force_plot(explainerModel.expected_value, shap_values_Model[0], s.iloc[[0]]), 125) shap.summary_plot(shap_values_Model, s, plot_type="bar") st.pyplot(bbox_inches='tight')
# Notebook cells ("# +" marks a cell start). After loading and printing
# `model_white`, the first cell computes TreeExplainer SHAP values for
# `model_black`, reads the feature list from ../Results/black.csv and draws an
# unsorted summary plot restricted to those columns; the second cell starts
# the same for `model_white` with the first 20 entries of ../Results/white.csv.
# NOTE(review): the black table is read through column "Feature" while the
# white table uses "Features" - confirm both CSV headers.
# NOTE(review): the remark above the `model_white` cell says "for black" -
# presumably a copy-paste slip for "white".
# NOTE(review): this excerpt ends inside a commented-out feature list, so the
# rest of the second cell is not visible here.
model_white.load_model("../Models/Optimal_XGB_survival_white_model.m") print(model_white) # + #SHAP model to get importance for AA shap_values = shap.TreeExplainer(model_black).shap_values(X_black_survival) #Top pathways for AA people to determine survival black_imp_df = pd.read_csv("../Results/black.csv", header='infer', sep=",") flist = black_imp_df["Feature"].tolist() # "IPA:Myc_Mediated_Apoptosis_Signaling","IPA:EGF_Signaling","HM:Oxidative_phosphorylation", # "TPW:Immunogenic_Cell_Death_(ICD)","ICRscore","IPA:UVB_Induced_MAPK_Signaling","IPA:UVA_Induced_MAPK_Signaling"] fids = [X_black_survival.columns.get_loc(c) for c in flist] shap.summary_plot(shap_values[:, fids], X_black_survival.iloc[:, fids], sort=False) # + #SHAP model to get importance for black shap_values = shap.TreeExplainer(model_white).shap_values(X_white_survival) #Top pathways for white people to determine survival white_imp_df = pd.read_csv("../Results/white.csv", header='infer', sep=",") flist = white_imp_df["Features"][0:20] #flist = ["IPA:Telomere_Extension_by_Telomerase","HM:PI3K_Akt_mTOR_signaling","LM:Proliferation", # "HM:Wnt_beta_catenin_signaling","TBI:Barrier_genes","IPA:AMPK_Signaling","IPA:PI3K_AKT_Signaling", # "HM:Angiogenesis","IPA:ErbB_Signaling","IPA:ERK5_Signaling","HM:G2M_checkpoint", # "HM:p53_pathway","HM:UV_response_down","IPA:UVC_Induced_MAPK_Signaling","IPA:HER_2_Signaling_in_Breast_Cancer", # "HM:Reactive_oxigen_species_pathway","IPA:VEGF_Signaling","IPA:Estrogen_Dependent_Breast_Cancer_Signaling",
# Final design matrices: encoded block A next to block B, column-wise.
X_train_final = np.concatenate((X_train_A_enc, X_train_B), axis=1)
X_test_final = np.concatenate((X_test_A_enc, X_test_B), axis=1)

#%%
# Gradient-boosted binary classifier with the positive class up-weighted.
xgb_classifier = xgb.XGBClassifier(
    n_estimators=90,
    max_depth=4,
    learning_rate=0.075,
    colsample_bytree=0.7,
    subsample=0.8,
    reg_lambda=16,
    gamma=1,
    min_child_weight=1.5,
    objective='binary:logistic',
    scale_pos_weight=20,
)
xgb_classifier.fit(X_train_final, y_train)

# Scores are computed on the data the model was fitted on; the slice keeps the
# second probability column as a 2-D array.
train_probabilities = xgb_classifier.predict_proba(X_train_final)
y_score = train_probabilities[:, 1:]
print(f'ROC AUC: {roc_auc_score(y_train, y_score):0.3f}')
print(f'AUPRC: {auprc(y_train, y_score):0.3f}')

#%% SHAP
import shap
import matplotlib.pyplot as plt

shap.initjs()

# Bar summary of the tree SHAP values on the training matrix.
explainer = shap.TreeExplainer(xgb_classifier)
shap_values = explainer.shap_values(X_train_final)

plt.figure()
shap.summary_plot(shap_values, X_train_final, plot_type='bar')
plt.show()
def train(self, importance_cutoff=0.15):
    """Train the classifier, print evaluation reports and persist the model.

    The method extracts features from the labelled items, holds out 10% of
    them as a test set, optionally cross-validates, fits ``self.clf`` (on
    resampled data when ``self.sampler`` is set), optionally writes a SHAP
    feature-importance plot to ``feature_importance.png``, prints test-set
    reports with and without confidence thresholds, and finally dumps
    ``self`` with joblib to a file named after the lowercased class name.

    Parameters
    ----------
    importance_cutoff : float, optional (default=0.15)
        Cutoff forwarded to ``self.get_important_features``.

    Returns
    -------
    dict
        Tracking metrics: one ``test_<scoring>`` entry (``mean`` and
        ``std`` * 2) per cross-validation scoring, ``report`` (single-label
        case only) and ``confusion_matrix``.
    """
    classes, self.class_names = self.get_labels()
    self.class_names = sort_class_names(self.class_names)

    # Get items and labels, filtering out those for which we have no labels.
    X_iter, y_iter = split_tuple_iterator(self.items_gen(classes))

    # Extract features from the items.
    X = self.extraction_pipeline.fit_transform(list(X_iter))

    # Calculate labels.
    y = np.array(y_iter)

    print(f"X: {X.shape}, y: {y.shape}")

    # A label that is itself an array means one item can carry several labels.
    is_multilabel = isinstance(y[0], np.ndarray)

    # Split dataset in training and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=0
    )

    if self.sampler is not None:
        pipeline = make_pipeline(self.sampler, self.clf)
    else:
        pipeline = self.clf

    tracking_metrics = {}

    # Use k-fold cross validation to evaluate results.
    if self.cross_validation_enabled:
        scorings = ["accuracy"]
        if len(self.class_names) == 2:
            scorings += ["precision", "recall"]

        scores = cross_validate(pipeline, X_train, y_train, scoring=scorings, cv=5)

        print("Cross Validation scores:")
        for scoring in scorings:
            score = scores[f"test_{scoring}"]
            tracking_metrics[f"test_{scoring}"] = {
                "mean": score.mean(),
                "std": score.std() * 2,
            }
            # The original message contained a stray "f" before the mean.
            print(
                f"{scoring.capitalize()}: {score.mean()} (+/- {score.std() * 2})"
            )

    # Training on the resampled dataset if sampler is provided.
    if self.sampler is not None:
        X_train, y_train = self.sampler.fit_resample(X_train, y_train)

    print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

    self.clf.fit(X_train, y_train)

    feature_names = self.get_human_readable_feature_names()
    if self.calculate_importance and len(feature_names):
        explainer = shap.TreeExplainer(self.clf)
        shap_values = explainer.shap_values(X_train)

        # A list of SHAP matrices (one per class) cannot be drawn as a
        # layered violin, so fall back to the library default in that case.
        shap.summary_plot(
            shap_values,
            X_train.toarray(),
            feature_names=feature_names,
            class_names=self.class_names,
            plot_type="layered_violin"
            if not isinstance(shap_values, list)
            else None,
            show=False,
        )

        matplotlib.pyplot.savefig("feature_importance.png", bbox_inches="tight")

        important_features = self.get_important_features(
            importance_cutoff, shap_values
        )

        self.print_feature_importances(important_features, feature_names)

    print("Test Set scores:")
    # Evaluate results on the test set.
    y_pred = self.clf.predict(X_test)

    if is_multilabel:
        assert isinstance(
            y_pred[0], np.ndarray
        ), "The predictions should be multilabel"

    print(f"No confidence threshold - {len(y_test)} classified")
    if is_multilabel:
        confusion_matrix = metrics.multilabel_confusion_matrix(y_test, y_pred)
    else:
        confusion_matrix = metrics.confusion_matrix(
            y_test, y_pred, labels=self.class_names
        )

        print(
            classification_report_imbalanced(
                y_test, y_pred, labels=self.class_names
            )
        )
        report = classification_report_imbalanced_values(
            y_test, y_pred, labels=self.class_names
        )

        tracking_metrics["report"] = report

    print_labeled_confusion_matrix(
        confusion_matrix, self.class_names, is_multilabel=is_multilabel
    )

    tracking_metrics["confusion_matrix"] = confusion_matrix.tolist()

    # Evaluate results on the test set for some confidence thresholds.
    # The probabilities do not depend on the threshold: compute them once.
    y_pred_probas = self.clf.predict_proba(X_test)

    for confidence_threshold in [0.6, 0.7, 0.8, 0.9]:
        y_test_filter = []
        y_pred_filter = []
        for i in range(len(y_test)):
            argmax = np.argmax(y_pred_probas[i])
            # Skip items whose most likely class is below the threshold.
            if y_pred_probas[i][argmax] < confidence_threshold:
                continue

            y_test_filter.append(y_test[i])
            if is_multilabel:
                y_pred_filter.append(y_pred[i])
            else:
                y_pred_filter.append(argmax)

        if not is_multilabel:
            # Map the argmax positions back to the original label values.
            y_pred_filter = self.le.inverse_transform(y_pred_filter)

        print(
            f"\nConfidence threshold > {confidence_threshold} - {len(y_test_filter)} classified"
        )
        if len(y_test_filter) != 0:
            if is_multilabel:
                confusion_matrix = metrics.multilabel_confusion_matrix(
                    np.asarray(y_test_filter), np.asarray(y_pred_filter)
                )
            else:
                confusion_matrix = metrics.confusion_matrix(
                    np.asarray(y_test_filter),
                    np.asarray(y_pred_filter),
                    labels=self.class_names,
                )
                print(
                    classification_report_imbalanced(
                        y_test_filter, y_pred_filter, labels=self.class_names
                    )
                )
            print_labeled_confusion_matrix(
                confusion_matrix, self.class_names, is_multilabel=is_multilabel
            )

    joblib.dump(self, self.__class__.__name__.lower())

    return tracking_metrics
'number_inpatient', 'num_medications', 'number_diagnoses', 'num_lab_procedures', 'num_procedures', 'time_in_hospital', 'number_outpatient', 'number_emergency', 'gender_Female', 'payer_code_?', 'medical_specialty_?', 'diag_1_428', 'diag_1_414', 'diabetesMed_Yes', 'A1Cresult_None' ] # Some versions of shap package error when mixing bools and numerics X = data[base_features].astype(float) train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1) # For speed, we will calculate shap values on smaller subset of the validation data small_val_X = val_X.iloc[:150] my_model = RandomForestClassifier(n_estimators=30, random_state=1).fit(train_X, train_y) data.describe() explainer = shap.TreeExplainer(my_model) shap_values = explainer.shap_values(small_val_X) shap.summary_plot(shap_values[1], small_val_X) feature_with_bigger_range_of_effects = 'diag_1_428' shap.summary_plot(shap_values[1], small_val_X) bigger_effect_when_changed = "diag_1_428" shap.summary_plot(shap_values[1], small_val_X) shap.dependence_plot('num_lab_procedures', shap_values[1], small_val_X) shap.dependence_plot('num_medications', shap_values[1], small_val_X)
def generate_feature_importance_data(self, probs, importance):
    """Build per-feature explanation data and write it to ``importances.json``.

    For each important feature of the predicted class the method renders a
    layered-violin SHAP plot (base64-encoded PNG), computes the Spearman
    correlation between the non-zero feature values and ``self.y``, and the
    medians / median-split percentages for the ``y == 1`` ("buggy") and
    ``y == 0`` ("clean") samples. Features whose names differ only by a
    leading "Total"/"Maximum"/"Minimum"/"Average" are then grouped, one
    representative per group is kept (carrying the summed SHAP value of the
    group), and the result is dumped as JSON.

    Parameters
    ----------
    probs : array-like
        Predicted probabilities; ``probs[0].argmax()`` selects the class.
    importance : dict
        Must provide ``importance["importances"]["classes"]``,
        ``importance["importances"]["values"]`` and
        ``importance["feature_legend"]`` as read below.
    """
    attributes = ["Total", "Maximum", "Minimum", "Average"]

    def strip_attribute_prefix(name):
        # Drop the first matching aggregate prefix plus one separator char.
        for attribute in attributes:
            if name.startswith(attribute):
                return name[len(attribute) + 1:]
        return name

    def feature_sort_key(f):
        # Rank a feature by how well the median split separates the class
        # its SHAP value points to.
        shap_value = f["shap"]
        correlation = f["spearman"][0]
        if shap_value > 0 and correlation > 0:
            return f["perc_buggy_values_higher_than_median"]
        if shap_value > 0 and correlation < 0:
            return f["perc_buggy_values_lower_than_median"]
        if shap_value < 0 and correlation > 0:
            return f["perc_clean_values_lower_than_median"]
        if shap_value < 0 and correlation < 0:
            return f["perc_clean_values_higher_than_median"]
        # Zero/NaN SHAP value or correlation: no direction to explain.
        # Returning None here (as before) made max() raise TypeError as soon
        # as a group had several candidates, so rank such features last.
        return -1.0

    X_shap_values = shap.TreeExplainer(self.model.clf).shap_values(self.X)

    pred_class = self.model.le.inverse_transform([probs[0].argmax()])[0]
    num_samples = self.X.shape[0]

    features = []
    for i, (val, feature_index, is_positive) in enumerate(
            importance["importances"]["classes"][pred_class][0]):
        column = int(feature_index)
        name = importance["feature_legend"][str(i + 1)]
        value = importance["importances"]["values"][0, column]

        # Single-feature SHAP plot, captured in memory as a PNG.
        shap.summary_plot(
            X_shap_values[:, column].reshape(num_samples, 1),
            self.X[:, column].reshape(num_samples, 1),
            feature_names=[""],
            plot_type="layered_violin",
            show=False,
        )
        matplotlib.pyplot.xlabel("Impact on model output")
        img = io.BytesIO()
        matplotlib.pyplot.savefig(img, bbox_inches="tight")
        # Clear the figure so the next feature starts from a blank canvas.
        matplotlib.pyplot.clf()
        img.seek(0)
        base64_img = base64.b64encode(img.read()).decode("ascii")

        # Statistics are computed on samples where the feature is non-zero.
        X = self.X[:, column]
        y = self.y[X != 0]
        X = X[X != 0]
        spearman = spearmanr(X, y)

        buggy_X = X[y == 1]
        clean_X = X[y == 0]
        median = np.median(X)
        median_clean = np.median(clean_X)
        median_buggy = np.median(buggy_X)

        percentages = {
            "perc_buggy_values_higher_than_median":
                (buggy_X >= median).sum() / buggy_X.shape[0],
            "perc_buggy_values_lower_than_median":
                (buggy_X < median).sum() / buggy_X.shape[0],
            "perc_clean_values_higher_than_median":
                (clean_X > median).sum() / clean_X.shape[0],
            "perc_clean_values_lower_than_median":
                (clean_X <= median).sum() / clean_X.shape[0],
        }

        sign = "+" if is_positive else "-"

        logger.info("Feature: {}".format(name))
        logger.info("Shap value: {}{}".format(sign, val))
        logger.info(f"spearman: {spearman}")
        logger.info(f"value: {value}")
        logger.info(f"overall mean: {np.mean(X)}")
        logger.info(f"overall median: {median}")
        logger.info(f"mean for y == 0: {np.mean(clean_X)}")
        logger.info(f"mean for y == 1: {np.mean(buggy_X)}")
        logger.info(f"median for y == 0: {median_clean}")
        logger.info(f"median for y == 1: {median_buggy}")
        for label, percentage in percentages.items():
            logger.info(f"{label}: {percentage}")

        features.append({
            "index": i + 1,
            "name": name,
            "shap": float(f"{sign}{val}"),
            "value": value,
            "spearman": spearman,
            "median": median,
            "median_bug_introducing": median_buggy,
            "median_clean": median_clean,
            **percentages,
            "plot": base64_img,
        })

    # Group together features that are very similar to each other, so we can
    # simplify the explanation to users.
    base_names = [strip_attribute_prefix(f["name"]) for f in features]
    already_added = set()
    feature_groups = []
    for i1, f1 in enumerate(features):
        if i1 in already_added:
            continue

        group = [f1]
        feature_groups.append(group)

        for i2 in range(i1 + 1, len(features)):
            if base_names[i1] != base_names[i2]:
                continue

            already_added.add(i2)
            group.append(features[i2])

    # Pick a representative example from each group.
    features = []
    for feature_group in feature_groups:
        shap_sum = sum(f["shap"] for f in feature_group)

        # Only select easily explainable features from the group: the value
        # must lie closer to the median of the class the SHAP sign points to.
        selected = [
            f for f in feature_group
            if (f["shap"] > 0
                and abs(f["value"] - f["median_bug_introducing"])
                < abs(f["value"] - f["median_clean"]))
            or (f["shap"] < 0
                and abs(f["value"] - f["median_clean"])
                < abs(f["value"] - f["median_bug_introducing"]))
        ]

        # If there are no easily explainable features in the group, select
        # all features of the group.
        if len(selected) == 0:
            selected = feature_group

        feature = max(selected, key=feature_sort_key)
        # The representative carries the combined SHAP value of its group.
        feature["shap"] = shap_sum

        for attribute in attributes:
            if feature["name"].startswith(attribute):
                feature["name"] = feature["name"][len(attribute) + 1:].capitalize()
                break

        features.append(feature)

    with open("importances.json", "w") as f:
        json.dump(features, f)
def persist_shap(model, X_train, output_path='/dbfs/mnt/documents/images/shap.png'):
    """Save the SHAP summary plot of a tree model to an image file.

    Parameters
    ----------
    model
        Fitted tree-based model, passed to ``shap.TreeExplainer``.
    X_train
        Feature matrix the SHAP values are computed and plotted for.
    output_path : str, optional
        Destination of the image. Defaults to the previously hard-coded
        ``/dbfs/mnt/documents/images/shap.png``.
    """
    shap_values = shap.TreeExplainer(model).shap_values(X_train)
    # show=False keeps the figure open so it can be written to disk.
    shap.summary_plot(shap_values, X_train, show=False)
    try:
        plt.savefig(output_path)
    finally:
        # Close the figure: otherwise the next call draws on top of this
        # plot and figures accumulate in memory.
        plt.close()
# Baseline with LGB
import lightgbm as lgb
from math import sqrt
from sklearn.metrics import mean_squared_error

lgb_dtrain = lgb.Dataset(data=train_x, label=train_y)
lgb_param = dict(
    max_depth=10,
    learning_rate=0.01,
    n_estimators=1000,
    objective='regression',
)
lgb_model = lgb.train(params=lgb_param, train_set=lgb_dtrain)

# Root of the mean squared error between predictions and test targets.
lgb_model_predict = lgb_model.predict(test_x)
baseline_rmse = sqrt(mean_squared_error(lgb_model_predict, test_y))
print("RMSE: {}".format(baseline_rmse))

# !pip install shap
# import skimage -> skimage.__version__ (skimage version)
# skimage version upgrade -> !pip install --upgrade scikit-image
import shap

explainer = shap.TreeExplainer(lgb_model)
shap_values = explainer.shap_values(test_x)

# Sample: force plot for the first test row only.
shap.initjs()
first_row_shap = shap_values[0, :]
first_row_features = test_x.iloc[0, :]
shap.force_plot(explainer.expected_value, first_row_shap, first_row_features)

# Higher effect with Red Color, Lower effect with Blue Color
shap.force_plot(explainer.expected_value, shap_values, test_x)

# Summary plots: default style first, then the bar chart.
shap.summary_plot(shap_values, test_x)
shap.summary_plot(shap_values, test_x, plot_type="bar")
categorical_binary = np.array([i in categorical for i in Xcols]) cab.run(X_train, y_train, X_test, y_test, None, categorical_binary) bst = cab.bst with open(join('..', 'data', 'ml_data', model_name), 'wb') as f: pickle.dump(bst, f) else: # shap on a subset of values with open(join('..', 'data', 'ml_data', model_name), 'rb') as f: bst = pickle.load(f) # get a subset of the test set and get shap values # or just load it from the save file if compute_shap: rand_inds = np.random.choice(np.arange(len(X_test)), n_subset_for_shap) X_test_sub = X_test[rand_inds, :] y_test_sub = y_test[rand_inds] shap_values = shap.TreeExplainer(bst).shap_values(X_test_sub) shap.summary_plot(shap_values, X_test_sub, feature_names=Xcols) with open(join('..', 'data', 'ml_data', 'lightgbm_' + str(n_estimators) + '_shap.pickle'), 'wb') as f: pickle.dump((X_test_sub, y_test_sub, shap_values), f) else: with open(join('..', 'data', 'ml_data', 'lightgbm_' + str(n_estimators) + '_shap.pickle'), 'rb') as f: X_test_sub, y_test_sub, shap_values = pickle.load(f) shap.summary_plot(shap_values, X_test_sub, feature_names=Xcols)
pdp_feat = pdp.pdp_isolate(model=lgb_clf, dataset=test_X, model_features=feature_names, feature=feature) pdp.pdp_plot(pdp_feat, feature) plt.show() pdp_plotter('service_to_uza_area', lgb_clf) # SHAP # Re-fit the model and extract the SHAP tree explainer Features # to determine which features fit most often top_feats = shuffle_SHAP(X, y, lgb_clf, n_shuffles=100) # top_feats.to_csv(DATA_PATH + 'top_features.csv') explainer = shap.TreeExplainer(lgb_clf) shap_values = explainer.shap_values(test_X) shap.summary_plot(shap_values, test_X) # SHAP Dependence PLot shap.dependence_plot("Unlinked_Passenger_Trips_FY", shap_values, test_X) # Denver RTD [1 - Ridership is Stable / Increasing] original.loc[original.HQ_City.str.contains('Denver')] data.loc[data['5_digit_NTD_ID'] == 80006] shap.initjs() shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[630,:])
def run_explanations(csv_path, csv_columns, target_column, zero_value):
    """Train an XGBoost classifier on a CSV and write several explanations.

    The target column is binarised (``zero_value`` -> 0, anything else -> 1),
    non-numeric input columns are replaced by their category codes, and an
    XGBoost model is fitted on a 70/30 split. The function then writes:

    * ELI5 weight and prediction explanations, appended to
      ``templates/explanation.html``;
    * the Skater feature-importance plot, ``skater.png``;
    * LIME explanations, ``templates/lime.html`` and ``templates/lime2.html``;
    * SHAP summary plots, ``static/summary_plot.png`` (bar) and
      ``static/summary_plot2.png``.

    Parameters
    ----------
    csv_path : str
        CSV file to read; its first row is skipped.
    csv_columns : list of str
        Column names applied to the CSV.
    target_column : str
        Name of the label column; must be one of ``csv_columns``.
    zero_value
        Target value mapped to class 0. A decimal string is also matched
        against numeric target values.

    Returns
    -------
    str
        ``"target column error"``, ``"zero value error"`` or, on success,
        ``"Everyone Happy"``.
    """
    # Validate the cheap precondition before touching the file.
    if target_column not in csv_columns:
        print("target column error")
        return "target column error"

    # Read the dataset from the provided CSV.
    df = pd.read_csv(csv_path,
                     names=csv_columns,
                     skipinitialspace=True,
                     skiprows=1)
    input_features = [name for name in csv_columns if name != target_column]

    target_values = df[target_column].tolist()
    if zero_value not in target_values:
        # A numeric column never contains the string form of its values, so
        # retry with the numeric interpretation. str.isdecimal raised
        # TypeError for non-str input; guard with isinstance instead.
        is_decimal_text = isinstance(zero_value, str) and zero_value.isdecimal()
        if is_decimal_text and (np.int64(zero_value) in target_values
                                or np.float64(zero_value) in target_values):
            print("happy")
            zero_value = np.int64(zero_value)
        else:
            print(zero_value, target_values, df[target_column].dtype)
            return "zero value error"

    labels = np.array([0 if label == zero_value else 1
                       for label in target_values])

    # Work on a copy: the columns are overwritten below and must not be
    # assigned through a slice of ``df``.
    data = df[input_features].copy()
    numeric_dtypes = (np.dtype(np.int64), np.dtype(np.float64),
                      np.dtype(np.float32))
    for feature in input_features:
        # Equality (not identity) comparison of dtypes.
        if data[feature].dtype not in numeric_dtypes:
            data[feature] = data[feature].astype('category')

    cat_cols = data.select_dtypes(['category']).columns
    data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes)
    feature_names = list(data.columns)

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.3,
                                                        random_state=42)

    xgc = xgb.XGBClassifier(n_estimators=500,
                            max_depth=5,
                            base_score=0.5,
                            objective='binary:logistic',
                            random_state=42)
    xgc.fit(X_train, y_train)
    predictions = xgc.predict(X_test)

    # Built-in XGBoost importances, one subplot per importance type.
    fig = plt.figure(figsize=(16, 12))
    fig.suptitle("Default Feature Importances from XGBoost", fontsize=14)
    importance_panels = (
        ('weight', "Feature Importance - Feature Weight"),
        ('gain', "Feature Importance - Split Mean Gain"),
        ('cover', "Feature Importance - Sample Coverage"),
    )
    for position, (importance_type, panel_title) in enumerate(
            importance_panels, start=1):
        ax = fig.add_subplot(2, 2, position)
        xgb.plot_importance(xgc, importance_type=importance_type, ax=ax)
        ax.set_title(panel_title)
    # This figure is not saved anywhere; close it so that later plots do not
    # end up drawn on top of it.
    plt.close(fig)

    # ELI5: global weights first, then two individual predictions.
    explanation = eli5.explain_weights(xgc.get_booster())
    explanation_html = eli5.formatters.html.format_as_html(explanation)
    print(explanation_html)
    # NOTE(review): append mode makes this file grow on every call —
    # confirm that accumulating explanations across runs is intended.
    with open("templates/explanation.html", "a+") as file:
        file.write(explanation_html)

    for doc_num in (0, 2):
        print('Actual Label:', y_test[doc_num])
        print('Predicted Label:', predictions[doc_num])
        prediction_explanation = eli5.explain_prediction(
            xgc.get_booster(),
            X_test.iloc[doc_num],
            feature_names=feature_names)
        prediction_html = eli5.formatters.html.format_as_html(
            prediction_explanation)
        with open("templates/explanation.html", "a") as file:
            file.write(prediction_html)

    # Skater: model-agnostic feature importance.
    interpreter = Interpretation(training_data=X_test,
                                 training_labels=y_test,
                                 feature_names=feature_names)
    im_model = InMemoryModel(xgc.predict_proba, examples=X_train)
    plots = interpreter.feature_importance.plot_feature_importance(
        im_model, ascending=True, n_samples=23000)
    plots[0].savefig('skater.png')

    # LIME is fed plain arrays, so use a second model fitted on ndarray input.
    xgc_np = xgb.XGBClassifier(n_estimators=500,
                               max_depth=5,
                               base_score=0.5,
                               objective='binary:logistic',
                               random_state=42)
    xgc_np.fit(X_train.values, y_train)

    from skater.core.local_interpretation.lime.lime_tabular import LimeTabularExplainer
    exp = LimeTabularExplainer(X_test.values,
                               feature_names=feature_names,
                               discretize_continuous=True)

    for doc_num, lime_path in ((0, 'templates/lime.html'),
                               (2, 'templates/lime2.html')):
        print('Actual Label:', y_test[doc_num])
        print('Predicted Label:', predictions[doc_num])
        instance = exp.explain_instance(X_test.iloc[doc_num].values,
                                        xgc_np.predict_proba)
        instance.save_to_file(lime_path, show_all=False)

    # SHAP: summary_plot draws on the current matplotlib figure, so give each
    # plot its own figure and close it once saved. Otherwise the second image
    # contains the first plot as well.
    explainer = shap.TreeExplainer(xgc)
    shap_values = explainer.shap_values(X_test)

    plt.figure()
    shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)
    plt.savefig("static/summary_plot.png")
    plt.close()

    plt.figure()
    shap.summary_plot(shap_values, X_test, show=False)
    plt.savefig("static/summary_plot2.png")
    plt.close()

    return "Everyone Happy"
def calculate_average_shap(file_name, treatment, outcome,algorithm, top_features, plot_file = ''):
    """Rank risk factors by mean absolute SHAP value for a pickled model.

    Parameters
    ----------
    file_name : str
        Pickle holding a dict with keys ``model_original``, ``train`` and
        ``test``. An empty string marks an invalid treatment/outcome
        combination.
    treatment, outcome
        Only used in the message printed for an empty ``file_name``.
    algorithm : str
        ``'rf'``, ``'cart'`` or ``'xgboost'`` use ``shap.TreeExplainer``;
        anything else uses ``shap.KernelExplainer`` on a k-means summary of
        the training data.
    top_features : int
        Number of rows to keep in the result.
    plot_file : str, optional
        When non-empty and the model is tree-based, a violin summary plot
        is saved to this path.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``'Risk Factor'`` and ``'Mean Absolute SHAP Value'``, sorted
        in descending order and limited to ``top_features`` rows; ``None``
        when ``file_name`` is empty.
    """
    if file_name == '':
        print("Invalid treatment/outcome combination (" + str(treatment) + ", " + str(outcome)+ ")")
        return None

    # NOTE: pickle executes arbitrary code while loading; only use files
    # produced by a trusted pipeline.
    with open(file_name, 'rb') as file:
        model_file = pickle.load(file)

    model = model_file['model_original']
    data = model_file['train']
    data_test = model_file['test']

    X = data.drop(["COMORB_DEATH"], axis=1, inplace = False)
    X_test = data_test.drop(["COMORB_DEATH"], axis=1, inplace = False)

    ## Calculate SHAP values (for each observation x feature)
    if algorithm in ['rf','cart','xgboost']:
        explainer = shap.TreeExplainer(model,
                                       data=X_test,
                                       model_output="probability",
                                       )
        shap_values = explainer.shap_values(X_test)
    else:
        X_train_summary = shap.kmeans(X, 50)
        # NOTE(review): KernelExplainer is documented with a `link`
        # argument; confirm that `model_output="logit"` has any effect.
        explainer = shap.KernelExplainer(model.predict_proba,
                                         data=X_train_summary,
                                         model_output="logit",
                                         )
        shap_values = explainer.shap_values(X_test)

    # Explainers may return one matrix per class; use the one at index 1.
    if isinstance(shap_values, list):
        class_shap_values = shap_values[1]
    else:
        class_shap_values = shap_values

    if plot_file != '':
        if algorithm in ['rf','cart','xgboost']:
            ## only save plot for tree models
            plt.close()
            shap.summary_plot(class_shap_values, X_test, show=False,
                              max_display=10, plot_size=(10, 5),
                              plot_type="violin")
            plt.xlabel('SHAP value (impact on model output)')
            plt.gcf().savefig(plot_file, bbox_inches='tight')
            plt.close()
        else:
            print('Cannot plot summary plot for non-tree models')

    # DataFrame.append was removed in pandas 2.0: collect the rows first and
    # build the frame in one go.
    rows = [
        {'Risk Factor': column,
         'Mean Absolute SHAP Value':
             pd.Series(class_shap_values[:, i]).abs().mean()}
        for i, column in enumerate(X.columns)
    ]
    df = pd.DataFrame(rows, columns=['Risk Factor', 'Mean Absolute SHAP Value'])
    df = df.sort_values(by='Mean Absolute SHAP Value', ascending=False)
    return df.head(top_features)
def main():
    """Render the Streamlit dashboard explaining an XGBoost model with SHAP.

    The page loads the Boston housing data, fits an ``XGBRegressor`` on a
    75/25 split and shows, in order: the RMSE on the test split, the
    built-in XGBoost importance plot, the SHAP bar and distribution summary
    plots, and a SHAP force plot for one user-selected training record.
    """
    st.title("Feature Interpreation using SHAP")
    st.subheader("Sayantan Ghosh")

    # Loading the Boston data ----------------------------------------------
    @st.cache
    def load_data():
        return load_boston()

    data_load_state = st.text("Loading Data")
    boston = load_data()
    # Update the existing status element instead of adding a second one.
    data_load_state.text("Data Loaded")

    # -----------------------------------------------------------------------
    @st.cache
    def load_dataframe():
        frame = pd.DataFrame(boston.data, columns=boston.feature_names)
        frame['MEDV'] = boston.target
        return frame

    Boston = load_dataframe()

    # Showing the snapshot of the data
    st.write(Boston.head(5))

    user_input = st.text_input(" Give rad value RAD", 1)
    try:
        rad_value = int(user_input)
    except ValueError:
        # Free text that is not an integer used to crash the whole page.
        st.error('RAD must be an integer, got: ' + str(user_input))
    else:
        st.write(Boston.loc[Boston['RAD'] == rad_value])
    # Just pass the record into the Model.predict(record)

    # Defining X and Y -------------------------------------------------------
    y = Boston.loc[:, Boston.columns == 'MEDV'].values
    x_train, x_test, y_train, y_test = train_test_split(
        Boston[boston.feature_names], y, test_size=0.25, random_state=34)

    # Building the dashboard on XGBOOST model:
    st.title('Model the Boston Housing Dataset using XGBOOST')

    loaded_model = xgb.XGBRegressor(
        n_estimators=150,
        reg_lambda=1,
        gamma=0,
        max_depth=8
    )
    loaded_model.fit(x_train, y_train)
    loaded_predictions = loaded_model.predict(x_test)
    st.write('RMSE of the XGBoost model on test set:',
             round(np.sqrt(metrics.mean_squared_error(y_test, loaded_predictions)), 2))

    # Built-in feature importance --------------------------------------------
    try:
        st.write('Using the standard XGBOOST importance plot feature, exposes the fact that the most important feature is not stable, select'
                 ' different importance types using the selectbox below')
        importance_type = st.selectbox('Select the desired importance type',
                                       ('weight', 'gain', 'cover'), index=0)
        xgb.plot_importance(loaded_model, importance_type=importance_type)
        pl.title('xgboost.plot_importance(best XGBoost model) importance type = ' + str(importance_type))
        st.pyplot(bbox_inches='tight')
        pl.clf()
    except Exception as exc:
        # This section stays best-effort, but the failure is reported
        # instead of being silently swallowed.
        st.warning('Could not render the XGBoost importance plot: ' + str(exc))

    # SHAP feature importance ------------------------------------------------
    st.write('To handle this inconsitency, SHAP values give robust details, among which is feature importance')
    explainer = shap.TreeExplainer(loaded_model)
    shap_values = explainer.shap_values(x_train)
    pl.title('Assessing feature importance based on Shap values')
    shap.summary_plot(shap_values, x_train, plot_type="bar", show=False)
    st.pyplot(bbox_inches='tight')
    pl.clf()

    # -------------------------------------------------------------------------
    st.write('SHAP values can also be used to represent the distribution of the training set of the respectable'
             'SHAP value in relation with the Target value, in this case the Median House Value (MEDV)')
    pl.title('Total distribution of observations based on Shap values, colored by Target value')
    shap.summary_plot(shap_values, x_train, show=False)
    st.pyplot(bbox_inches='tight')
    pl.clf()

    # Single-record explanation ----------------------------------------------
    st.write('Another example of SHAP values is for GDPR regulation, one should be able to give detailed information as to'
             ' why a specific prediction was made.')
    expectation = explainer.expected_value

    # Positions are 0-based and must stay inside the training split; the
    # previous fixed range 1..1000 skipped row 0 and could index past the end.
    individual = st.number_input('Select the desired record from the training set for detailed explanation.',
                                 min_value=0,
                                 max_value=len(x_train) - 1)
    predicted_values = loaded_model.predict(x_train)
    real_value = y_train[individual]
    individual_shap = shap_values[individual, :]
    shap_sum = sum(individual_shap)

    st.write('The real median house value for this individual record is: ' + str(real_value))
    st.write('The predicted median house value for this individual record is: ' + str(predicted_values[individual]))
    st.write('This prediction is calculated as follows: '
             'The average median house value: (' + str(expectation) + ')' +
             ' + the sum of the SHAP values. ')
    st.write('For this individual record the sum of the SHAP values is: ' + str(shap_sum))
    st.write('This yields to a predicted value of median house value of:' + str(expectation) + ' + ' + str(shap_sum) +
             '= ' + str(expectation + shap_sum))
    st.write('Which features caused this specific prediction? features in red increased the prediction, in blue decreased them')
    shap.force_plot(explainer.expected_value, individual_shap, x_train.iloc[individual, :],
                    matplotlib=True, show=False, figsize=(16, 5))
    st.pyplot(bbox_inches='tight', dpi=300, pad_inches=0)
    pl.clf()
fts = "" for ft in imp: fts = fts + ft[0] + " " + str(it) + " " + str( vol) + " " + str(no) + " " + str(ft[2]) + " " + str( round(ft[1], 4)) + "\n" print(fts) imp_str = imp_str + fts if get_shap: shap_vals = shap.TreeExplainer(xg_reg).shap_values( train[features]) shap.summary_plot(shap_vals, train[features], plot_type="bar") shap_comb = shap_vals.transpose() shap_mean = [] num_f = len(shap_comb) for fi in range(len(shap_comb)): vabs = abs(shap_comb[fi]) v_mean = stat.mean(vabs) shap_mean.append(v_mean) shapl = list(zip(features, shap_mean, range(1, numf + 1))) shapl.sort(key=lambda tup: tup[1], reverse=True) shps = "" for ft in shapl: