def plot_shap_force_plot(self, games, model, shap_df=None):
    """Build a SHAP force plot for a single game or for several games.

    Args:
        games: One index label or a collection of labels used to select rows
            of ``shap_df`` and ``model.cv_scores``.
        model: Object exposing ``model_dict`` (with a ``'features_list'``
            entry), ``cv_scores`` and ``model_objects``.
        shap_df: Frame holding a ``'bias'`` column plus one column per
            feature. When ``None`` it is obtained from ``self.get_shap_vals``.

    Returns:
        The force plot when ``self.plots_dict['save']['plots']`` is not
        ``True``; otherwise the plot is written as HTML under
        ``self.plots_dir`` and ``None`` is returned.
    """
    model_dict = model.model_dict
    features_df = model.cv_scores
    if shap_df is None:
        shap_df = self.get_shap_vals(model_dict, model.model_objects)

    # A 0-d array has an empty shape tuple, i.e. ``games`` is a single label.
    games_arr = np.array(games)
    multi = bool(games_arr.shape)
    if multi:
        # Several games: use the mean bias as the common base value.
        bias = shap_df.loc[games, 'bias'].mean()
    else:
        games = games_arr.tolist()
        bias = shap_df.loc[games, 'bias']

    force_plot = shap.force_plot(
        bias,
        shap_df.loc[games, model_dict['features_list']].values,
        features_df.loc[games, model_dict['features_list']],
    )

    if self.plots_dict['save']['plots'] is not True:
        return force_plot

    if multi:
        img_path = '{}/force_plot_multi.html'.format(self.plots_dir)
    else:
        img_path = '{}/force_plot_{}.html'.format(self.plots_dir, games)
    shap.save_html(img_path, force_plot)
def print_shap(self, data_for_pred, outcome):
    """Save a SHAP force plot for ``data_for_pred`` as ``individual_shap.html``.

    Args:
        data_for_pred: Input passed to ``self.explainer.shap_values`` and used
            as the feature values of the plot.
        outcome: Index selecting both the expected value and the SHAP values
            to plot (presumably the class index; confirm against callers).

    The file is written to ``self.out + "/individual_shap.html"``.
    """
    shap_values = self.explainer.shap_values(data_for_pred)
    target = self.out + "/individual_shap.html"
    plot = shap.force_plot(
        self.explainer.expected_value[outcome],
        shap_values[outcome],
        data_for_pred,
    )
    shap.save_html(target, plot)
def classify(self, items, probabilities=False, importances=False, importance_cutoff=0.15):
    """Classify one or more items and optionally explain the prediction.

    Args:
        items: A single item or a list of items; the first item must be a
            ``dict`` or a ``tuple``.
        probabilities: When true use ``self.clf.predict_proba`` instead of
            ``self.clf.predict``.
        importances: When true also compute SHAP values and an HTML force
            plot restricted to the most important features.
        importance_cutoff: Forwarded to ``self.get_important_features``.

    Returns:
        ``classes`` when ``importances`` is false, otherwise the tuple
        ``(classes, {"importances": ..., "html": ...})``.
    """
    assert items is not None
    assert (
        self.extraction_pipeline is not None and self.clf is not None
    ), "The module needs to be initialized first"

    if not isinstance(items, list):
        items = [items]
    assert isinstance(items[0], (dict, tuple))

    X = self.extraction_pipeline.transform(items)
    predictor = self.clf.predict_proba if probabilities else self.clf.predict
    classes = self.overwrite_classes(items, predictor(X), probabilities)

    if not importances:
        return classes

    explainer = shap.TreeExplainer(self.clf)
    shap_values = explainer.shap_values(X)

    # TODO: Actually implement feature importance visualization for multiclass problems.
    if isinstance(shap_values, list):
        # One array per class: collapse them by summing absolute values.
        shap_values = np.sum(np.abs(shap_values), axis=0)

    top_importances = self.get_important_features(importance_cutoff, shap_values)
    top_indexes = [int(position) for _score, position, _sign in top_importances]
    feature_names = self.get_feature_names()
    selected_names = [feature_names[i] for i in top_indexes]

    with io.StringIO() as buffer:
        plot = shap.force_plot(
            explainer.expected_value,
            shap_values[:, top_indexes],
            X.toarray()[:, top_indexes],
            feature_names=selected_names,
            matplotlib=False,
            show=False,
        )
        # TODO: use full_html=False
        shap.save_html(buffer, plot)
        html = buffer.getvalue()

    return classes, {"importances": top_importances, "html": html}
def single_force_plot(i, html=True):
    """Draw the SHAP force plot of row ``i`` of ``data_to_explain``.

    Relies on the module-level ``explainer``, ``shap_values``,
    ``data_to_explain`` and ``feat_used``.

    Args:
        i: Row position used for both ``shap_values`` and ``data_to_explain``.
        html: When true the plot is also written to
            ``./result/shap_force_plot_<i>.htm``; otherwise the plot is
            requested with ``matplotlib=True`` and nothing is saved.

    Returns:
        Whatever ``shap.force_plot`` returned.
    """
    backend = {} if html else {'matplotlib': True}
    fig = shap.force_plot(
        explainer.expected_value,
        shap_values[i, :],
        data_to_explain.iloc[i, :],
        feature_names=feat_used,
        show=False,
        link='logit',
        **backend,
    )
    if html:
        shap.save_html('./result/shap_force_plot_' + str(i) + '.htm', fig)
    return fig
def heart_disease_risk_factors(model, patient):
    """Explain a patient's prediction with a SHAP force plot.

    A ``shap.TreeExplainer`` is built for ``model`` and SHAP values are
    computed for ``patient``. The force plot for index 1 of the expected
    values / SHAP values (presumably the positive class; confirm against the
    model) is saved to ``./test.html`` and returned.

    Args:
        model: Tree-based model accepted by ``shap.TreeExplainer``.
        patient: Feature data for the patient to explain.

    Returns:
        The force plot object that was also written to ``./test.html``.
    """
    # Get weights of each feature
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(patient)

    # Plot weights. The plot is built once and reused for both the saved
    # file and the return value instead of being computed twice.
    shap.initjs()
    force_plot = shap.force_plot(explainer.expected_value[1], shap_values[1], patient)
    shap.save_html("./test.html", force_plot)
    return force_plot
def go():
    """Render ``go.html`` with the prediction for the requested patient.

    Reads the ``pid`` query argument, loads the matching row from
    ``config.DB_TABLE``, cleans and transforms it, runs the model, writes the
    SHAP force plot to ``../app/templates/shap.html`` and renders the page.

    Returns:
        The rendered template, ``("Invalid patient id", 400)`` when ``pid`` is
        not an integer, or ``("Patient not found", 404)`` when no row matches.
    """
    # save user input in query
    pid_query = request.args.get('pid', '')

    # The id ends up inside the SQL text, so it must never be passed through
    # verbatim (SQL injection). Converting to int guarantees that only a
    # number is interpolated.
    # NOTE(review): assumes patientId is an integer column; if the driver's
    # placeholder style is known, prefer a parameterized query.
    try:
        patient_id = int(pid_query)
    except ValueError:
        return "Invalid patient id", 400

    # Load data:
    cur = db.cursor()
    cur.execute(f"select * from {config.DB_TABLE} where patientId={patient_id}")
    header = [column[0] for column in cur.description]
    results = cur.fetchall()
    if not results:
        # Previously this surfaced as an IndexError on results[0].
        return "Patient not found", 404

    # Convert to json format:
    results_json = [dict(zip(header, row)) for row in results]

    # Clean data:
    patient = list(results[0])
    patient_clean = process_data.clean_data(data=patient)

    # Data transform:
    X_transformed = predict.transform_data(model=model, X=patient_clean)

    # Prediction:
    pred = predict.prediction(model=model, X=X_transformed)

    # Model insights (currently not forwarded to the template):
    decision = predict.model_decision(pred=pred)

    # Risk:
    risk = 'dropping off' if (pred[0][0] < 0.5) else 'engaged'

    # SHAP force_plot in html format:
    shap_plot = predict.shap_plot(model=model, X=X_transformed)
    shap.save_html('../app/templates/shap.html', shap_plot)

    # This will render the go.html Please see that file.
    return render_template(
        'go.html',
        results=results_json[0],
        pred=f'{round(pred[0][0]*100,3)}%',
        risk=risk,
    )
def generate_shap_html(feature, user_bin, user_id):
    """Return the SHAP force plot of ``feature`` as an HTML string.

    The XGBoost classifier is loaded from ``MODEL_DIRECTORY/xgb.model`` and
    explained with ``shap.TreeExplainer``. The plot uses the expected value
    and SHAP values at index ``user_bin - 1``.

    Args:
        feature: Feature data passed to ``explainer.shap_values``.
        user_bin: 1-based index selecting which output to explain.
        user_id: Kept for backward compatibility; it used to name a
            temporary file and is no longer needed.

    Returns:
        Tuple ``(html, values)`` where ``html`` is the page text and
        ``values`` the SHAP values returned by the explainer.
    """
    import io

    xgb_clf = XGBClassifier()
    xgb_clf.load_model(os.path.join(MODEL_DIRECTORY, "xgb.model"))
    explainer = shap.TreeExplainer(xgb_clf)
    values = explainer.shap_values(feature)
    shap.initjs()

    bin_index = user_bin - 1
    fp = shap.force_plot(explainer.expected_value[bin_index],
                         values[bin_index][0],
                         feature,
                         show=False)

    # Render into memory rather than writing, re-reading and deleting
    # ``User_<id>.html``: nothing is left behind if an error occurs and
    # concurrent calls for the same user cannot clobber each other's file.
    with io.StringIO() as buffer:
        shap.save_html(buffer, fp)
        html = buffer.getvalue()
    return html, values
def _elapsed_microseconds(delta):
    """Return the full length of a ``timedelta`` in whole microseconds."""
    # ``delta.microseconds`` alone is only the sub-second component.
    return (delta.days * 86_400 + delta.seconds) * 1_000_000 + delta.microseconds


def meta_learning(working_dir):
    """Leave-one-dataset-out evaluation of an XGBoost meta-learner.

    Reads ``<working_dir>/RegressionAll.csv``. The last column is the label,
    the ``name`` column identifies the dataset, and the first column is
    dropped before fitting (assumes ``name`` is the first column; confirm).
    For every dataset a classifier is trained on all other datasets and
    evaluated on the held-out one; the per-dataset results are written to
    ``meta_learning_final_results.csv``. A final model is then fitted on all
    rows and its importance plots, SHAP force plot (``SHAP force plot.html``)
    and SHAP summary plot are produced.

    Args:
        working_dir: Directory containing ``RegressionAll.csv``.
    """
    meta_features_df = pd.read_csv(f'{working_dir}/RegressionAll.csv')
    datasets = list(meta_features_df['name'])
    X = meta_features_df.iloc[:, :-1]
    X = X.fillna(0)
    y = meta_features_df.iloc[:, -1]

    total = len(datasets)
    test_data = []
    for position, dataset_name in enumerate(datasets, start=1):
        # The total is taken from the data instead of a hard-coded 100.
        print(f"Dataset {dataset_name}, {position}/{total}")
        test_cross_data = {'Dataset Name': dataset_name,
                           'Algorithm Name': 'XGBoost meta learning',
                           'Hyper-Parameters Values': None,
                           'Accuracy': None,
                           'TPR': None,
                           'FPR': None,
                           'Precision': None,
                           'AUC': ' ',
                           'PR Curve': ' ',
                           'Predict Probability': None,
                           'Predict Model': None,
                           'True Label': None,
                           'Training Time': None,
                           'Inference Time': None}

        # Hold out the current dataset; drop the first column before fitting.
        X_train = X.loc[X['name'] != dataset_name].iloc[:, 1:]
        X_test = X.loc[X['name'] == dataset_name].iloc[:, 1:]
        y_train = y.loc[X_train.index]
        y_test = y.loc[X_test.index]

        meta_learning_model = xgb.XGBClassifier()
        time_before_train = datetime.now()
        meta_learning_model.fit(X_train, y_train)
        train_time = datetime.now() - time_before_train
        test_cross_data['Training Time'] = f"{_elapsed_microseconds(train_time)} microseconds"

        time_before_predict = datetime.now()
        y_pred = meta_learning_model.predict(X_test)
        y_scores = meta_learning_model.predict_proba(X_test)
        predict_time = datetime.now() - time_before_predict
        test_cross_data['Inference Time'] = f"{_elapsed_microseconds(predict_time)} microseconds"

        test_cross_data['Predict Probability'] = f"{y_scores[0][1]:.4f}"
        test_cross_data['Predict Model'] = 'Ensemble Genetic Programming' if y_pred[0] == 1 else 'Extra Tree Regressor'
        test_cross_data['True Label'] = 'Ensemble Genetic Programming' if y_test.values[0] == 1 \
            else 'Extra Tree Regressor'
        test_cross_data['Accuracy'] = accuracy_score(y_test, y_pred)
        # Only the first held-out row is compared here.
        hit = y_pred[0] == y_test.values[0]
        test_cross_data['TPR'] = 1 if hit else 0
        test_cross_data['FPR'] = 0 if hit else 1
        test_cross_data['Precision'] = test_cross_data['TPR']
        test_data.append(test_cross_data)

    pd.DataFrame(test_data).to_csv('meta_learning_final_results.csv', index=False)

    # Final model on every dataset, used for the importance and SHAP plots.
    X = X.iloc[:, 1:]
    meta_learning_model = xgb.XGBClassifier()
    meta_learning_model.fit(X, y)

    importance_types = ['weight', 'cover', 'gain']
    plt.rcParams["figure.figsize"] = (40, 40)
    for imp_type in importance_types:
        plot_importance(meta_learning_model, max_num_features=167, importance_type=imp_type,
                        title=f'Meta Learning XGBoost {imp_type} importance')
        plt.show()

    shap.initjs()
    explainer = shap.TreeExplainer(meta_learning_model)
    shap_values = explainer.shap_values(X)
    shap.save_html('SHAP force plot.html',
                   shap.force_plot(explainer.expected_value, shap_values, X, figsize=(20, 20)))
    shap.summary_plot(shap_values, X, plot_size=(20, 20), title="SHAP summary plot")
def shap_prop(df_cli2_scaled, df_t_scaled, clf_brf_all):
    '''
    Explore a fitted tree model's decisions with SHAP plots.

    Produces, in order: summary plots, one dependence plot per feature, an
    interactive force plot saved to "index.html", decision plots (all rows,
    then the rows for 2012 and 2013) and matplotlib force plots for 2012 and
    2013.

    Input:
        df_cli2_scaled: feature frame (climatic features per the original
            notes); its index must contain the labels 2012 and 2013.
        df_t_scaled: output frame (yield per the original notes), used only to
            derive the severe-loss flag.
        clf_brf_all: fitted model passed to shap.TreeExplainer (a random
            forest per the original notes).
    '''
    # Flag rows where the output is more than one standard deviation below
    # its mean (1 = severe loss, 0 = otherwise).
    df_severe = pd.DataFrame(np.where(
        df_t_scaled < df_t_scaled.mean() - df_t_scaled.std(), True, False),
                             index=df_t_scaled.index,
                             columns=['severe_loss']).astype(int)
    loss_intensity = df_severe
    # NOTE(review): y is assigned but never used below.
    X, y = df_cli2_scaled, loss_intensity
    #divide data train and test
    # X_train, X_test, y_train, y_test = train_test_split(df_cli2_scaled, loss_intensity, test_size=0.3, random_state=0)
    #train explainer shap
    explainer = shap.TreeExplainer(clf_brf_all)
    shap_values = explainer.shap_values(X, approximate=False, check_additivity=True)
    # train for bars and scatters
    explainer_dif = shap.TreeExplainer(clf_brf_all, X)
    shap_values_dif = explainer_dif(X)
    # get just the explanations for the positive class
    # NOTE(review): shap_values_dif_one is only referenced by the commented-out
    # bar plot below.
    shap_values_dif_one = shap_values_dif[..., 1]
    # Summary plots
    shap.summary_plot(shap_values, X, plot_type="bar")
    shap.summary_plot(shap_values[1], X, plot_type="bar")
    shap.summary_plot(shap_values[1], X)  # Failure
    # bar plot priority
    # shap.plots.bar(shap_values_dif_one) # - not sure why it is giving different results
    # plots for dependence plots and scatter + interaction
    # for feature in X_train.columns.values.tolist():
    #     shap.dependence_plot(feature, shap_values[1], X_train, interaction_index=None)
    for name in X.columns:
        shap.dependence_plot(name, shap_values[1], X)
    # HTML to interact with all predictors
    shap_display_all = shap.force_plot(explainer.expected_value[1], shap_values[1], X, show=False)
    shap.save_html("index.html", shap_display_all)
    ## open browser for the interactive model
    # Decision plots explaining decisions to classify
    shap.decision_plot(explainer.expected_value[1], shap_values[1], X)
    # NOTE(review): positions 52 and 53 are hard-coded; presumably they are the
    # rows labelled 2012 and 2013 in X - verify against the data.
    shap.decision_plot(explainer.expected_value[1], shap_values[1][52], X.loc[[2012]])  # 2012 year
    shap.decision_plot(explainer.expected_value[1], shap_values[1][53], X.loc[[2013]])  # 2013 year
    # Calculate force plot for a given value 2012
    shap.initjs()
    shap_values_2012 = explainer.shap_values(X.loc[[2012]])
    shap_display = shap.force_plot(explainer.expected_value[1], shap_values_2012[1], X.loc[[2012]], matplotlib=True)
    # NOTE(review): shap_display2013 is not used after this assignment.
    shap_display2013 = shap.force_plot(explainer.expected_value[1], explainer.shap_values(X.loc[[2013]])[1], X.loc[[2013]], matplotlib=True)
    display(shap_display)
def training(train, test, validation_size, estimator, target_variable, drop_list, target_type, cv_folds, scoring_cv, cv=True, final=False, hypertuning=False):
    """Fit one estimator, export feature-importance artefacts and evaluate it.

    Args:
        train: Training frame containing ``target_variable`` and the features.
        test: Hold-out frame with the same columns.
        validation_size: ``test_size`` of the internal validation split
            (only used for ``estimator="lgbm"`` without hypertuning).
        estimator: ``"log_sk"``, ``"gb"`` or ``"rf"`` for ``target_type="bin"``;
            ``"lgbm"``, ``"lin_reg"``, ``"gb"`` or ``"rf"`` for ``"con"``.
        target_variable: Name of the target column.
        drop_list: Columns dropped to obtain the feature matrix.
        target_type: ``"bin"`` (classification) or ``"con"`` (regression).
        cv_folds: ``cv`` argument for cross validation and grid search.
        scoring_cv: Scoring passed to cross validation, grid search and the
            LightGBM ``eval_metric``.
        cv: Report cross-validation scores (skipped when ``hypertuning``).
        final: Fit on train and test together and skip the evaluation.
        hypertuning: Run the grid search variant where one exists.

    Returns:
        The fitted model.

    Raises:
        ValueError: If the estimator / target_type / hypertuning combination
            does not produce a fitted model.

    Side effects: writes ``fi_selected.xlsx`` and PNG/HTML plots to the
    current working directory for ``gb``, ``rf`` and ``lgbm``.
    """
    # Only what this function uses is imported. lightgbm, optuna and shap are
    # imported lazily in the branches that need them, so the other estimators
    # work without those packages installed.
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from sklearn import ensemble
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import recall_score
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import KFold
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import train_test_split

    train_y = train[target_variable]
    train_x = train.drop(columns=drop_list)
    test_y = test[target_variable]
    test_x = test.drop(columns=drop_list)

    if final:
        # DataFrame.append / Series.append were removed in pandas 2.0.
        train_x = pd.concat([train_x, test_x])
        train_y = pd.concat([train_y, test_y])

    model = None
    fitted_model = None

    def _require_model():
        # Fail with a clear message instead of an unbound-variable error.
        if fitted_model is None:
            raise ValueError(
                "No model was fitted for estimator={!r}, target_type={!r}, "
                "hypertuning={!r}".format(estimator, target_type, hypertuning))

    if target_type == "bin":
        if estimator == "log_sk":
            model = LogisticRegression(max_iter=1000)
            fitted_model = model.fit(train_x, train_y)

        if estimator == "gb" and not hypertuning:
            model = ensemble.GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=100)
            fitted_model = model.fit(train_x, train_y)

        if estimator == "gb" and hypertuning:
            param_grid = {
                'n_estimators': [100, 200, 400],
                'max_depth': [3, 5, 7],
                'learning_rate': [0.1, 0.05, 0.025, 0.01, 0.001, 0.005],
                'random_state': [42]
            }
            gb_grid = GridSearchCV(ensemble.GradientBoostingClassifier(), param_grid,
                                   cv=cv_folds, scoring=scoring_cv)
            gb_grid.fit(train_x, train_y)
            print('Optimal parameters for gradient boosting classifier = ', gb_grid.best_params_)
            fitted_model = gb_grid.best_estimator_

        if estimator == "rf" and not hypertuning:
            model = ensemble.RandomForestClassifier(max_depth=80, max_features=5, min_samples_leaf=3,
                                                    min_samples_split=12, n_estimators=100)
            fitted_model = model.fit(train_x, train_y)

        if estimator == "rf" and hypertuning:
            param_grid = {
                'bootstrap': [True],
                'max_depth': [10, 20, 30],
                'max_features': [2, 3, 5],
                'min_samples_leaf': [3, 5, 10],
                'min_samples_split': [8, 12],
                'n_estimators': [100, 300, 500],
                'n_jobs': [3]
            }
            rf_grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=cv_folds, scoring=scoring_cv)
            rf_grid.fit(train_x, train_y)
            print('Optimal parameters for random forest classifier = ', rf_grid.best_params_)
            fitted_model = rf_grid.best_estimator_

        _require_model()
        if cv and not hypertuning:
            cross_val_accuracy = cross_val_score(estimator=model, X=train_x, y=train_y,
                                                 cv=cv_folds, scoring=scoring_cv)
            print(f'The average cross validation accuracy of the model is {round(cross_val_accuracy.mean(), 2)}')
            print(cross_val_accuracy)

    if target_type == "con":
        if estimator == "lgbm" and not hypertuning:
            import lightgbm as lgbm

            # From here on train_x / train_y are the reduced training split;
            # valid_x is reused below for the SHAP plots.
            train_x, valid_x, train_y, valid_y = train_test_split(
                train_x, train_y, test_size=validation_size, shuffle=True, random_state=42)
            model = lgbm.LGBMRegressor(random_state=42, n_estimators=1000)
            fitted_model = model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)],
                                     eval_metric=scoring_cv, verbose=-1)

        if estimator == "lin_reg" and not hypertuning:
            # LinearRegression was never imported here and does not accept
            # max_iter, so this branch could not run before.
            model = LinearRegression()
            fitted_model = model.fit(train_x, train_y)

        if estimator == "gb" and not hypertuning:
            model = ensemble.GradientBoostingRegressor(learning_rate=0.001, max_depth=5, n_estimators=100)
            fitted_model = model.fit(train_x, train_y)

        if estimator == "rf" and not hypertuning:
            model = ensemble.RandomForestRegressor(max_depth=30, max_features=5, min_samples_leaf=3,
                                                   min_samples_split=8, n_estimators=500, n_jobs=-1)
            fitted_model = model.fit(train_x, train_y)

        if estimator == "gb" and hypertuning:
            param_grid = {
                'n_estimators': [100, 500, 1000],
                'max_features': ["auto", "sqrt", "log2", 0.6, 0.8],
                'min_samples_leaf': [30, 50, 70],
                'min_samples_split': [10, 20, 500, 100],
                'max_depth': [10, 15, 20, 25],
                'learning_rate': [0.1, 0.01, 0.001]
            }
            gb_grid = GridSearchCV(ensemble.GradientBoostingRegressor(), param_grid,
                                   cv=cv_folds, scoring=scoring_cv)
            gb_grid.fit(train_x, train_y)
            print('Optimal parameters for gradient boosting regressor = ', gb_grid.best_params_)
            fitted_model = gb_grid.best_estimator_

        if estimator == "lgbm" and hypertuning:
            # NOTE(review): the tuner only runs when this module is executed
            # as a script, and no model is fitted from the tuned parameters,
            # so this combination ends in the ValueError raised below.
            if __name__ == "__main__":
                import optuna.integration.lightgbm as lgb

                dtrain = lgb.Dataset(train_x, label=train_y)
                params = {
                    "objective": "regression",
                    "metric": "rmse",
                    "verbosity": -1,
                    "boosting_type": "gbdt",
                }
                tuner = lgb.LightGBMTunerCV(
                    params, dtrain, verbose_eval=100, early_stopping_rounds=100, folds=KFold(n_splits=5)
                )
                tuner.run()
                print("Best score:", tuner.best_score)
                best_params = tuner.best_params
                print("Best params:", best_params)
                print(" Params: ")
                for key, value in best_params.items():
                    print(" {}: {}".format(key, value))

        if estimator == "rf" and hypertuning:
            param_grid = {
                'max_depth': [10, 20, 30],
                'max_features': [2, 3, 5],
                'min_samples_leaf': [3, 5, 10],
                'min_samples_split': [8, 12],
                'n_estimators': [100, 300, 500],
                'n_jobs': [4]
            }
            rf_grid = GridSearchCV(RandomForestRegressor(), param_grid, cv=cv_folds, scoring=scoring_cv)
            rf_grid.fit(train_x, train_y)
            print('Optimal parameters for random forest regressor = ', rf_grid.best_params_)
            fitted_model = rf_grid.best_estimator_

        _require_model()
        if cv and not hypertuning:
            cross_val_rmse = cross_val_score(estimator=model, X=train_x, y=train_y,
                                             cv=cv_folds, scoring=scoring_cv)
            print(f'The average cross validation rmsle of the model is {-1*round(cross_val_rmse.mean(), 2)}')
            print(cross_val_rmse)

    # Also covers a target_type that is neither "bin" nor "con".
    _require_model()

    if estimator == "gb" or estimator == "rf" or estimator == "lgbm":
        list_all_Features = train_x.columns.tolist()

        # Feature importance: the 15 most important features go to Excel.
        fi_df = pd.DataFrame({"Feature": list_all_Features,
                              "Importance": fitted_model.feature_importances_}
                             ).sort_values(by="Importance", ascending=False)
        fi_df[:15].to_excel(r'fi_selected.xlsx')
        feat_importances = pd.Series(fitted_model.feature_importances_, index=list_all_Features)

        # Bar chart of the 30 largest importances for the sklearn ensembles.
        bar_charts = {"gb": ("Gradient Boosting", (20, 10)),
                      "rf": ("Random Forest", (20, 20))}
        if estimator in bar_charts:
            label, figsize = bar_charts[estimator]
            plt.figure(figsize=figsize)
            feat_importances.nlargest(30).plot(kind='barh', color="green")
            plt.title(f"Feature Importance from {label}")
            plt.savefig(f'Feature Importance from {label}.png', bbox_inches="tight")

        if estimator == "lgbm":
            import shap

            explainer = shap.TreeExplainer(fitted_model)
            shap_values = explainer.shap_values(valid_x)
            shap.initjs()
            force_plot = shap.force_plot(explainer.expected_value, shap_values[0, :], valid_x.iloc[0, :])
            shap.save_html("index_force_plot.htm", force_plot)
            force_plot_all = shap.force_plot(explainer.expected_value, shap_values, valid_x)
            shap.save_html("index_force_plot_all.htm", force_plot_all)

            plt.figure(figsize=(10, 20))
            shap.summary_plot(shap_values, valid_x, show=False)
            plt.savefig('summary_plot.png', bbox_inches="tight")

            # One dependence plot per top-10 feature.
            for feature_name in feat_importances.nlargest(10).index.tolist():
                plt.figure(figsize=(20, 20))
                shap.dependence_plot(feature_name, shap_values, valid_x, show=False)
                plt.savefig(f"dep_plot_{feature_name}.png", bbox_inches="tight")

    if not final and target_type == "con":
        yhat = fitted_model.predict(test_x).astype(float)
        y_pred = list(yhat)
        y_true = list(test_y)
        print(np.sqrt(mean_squared_error(y_true, y_pred)))

    if not final and target_type == "bin":
        yhat = fitted_model.predict(test_x)
        y_pred = list(map(round, yhat))
        cm = confusion_matrix(test_y, y_pred)
        print("Confusion Matrix : \n", cm)
        # These two lines referenced an undefined name (``prediction``).
        print('Test accuracy = ', accuracy_score(test_y, y_pred))
        print('Test recall = ', recall_score(test_y, y_pred))

    return fitted_model
import warnings warnings.filterwarnings("ignore") # Create SHAP explainer explainer = shap.TreeExplainer(RFModel) # Get shap values for observtation of interest shap_values = explainer.shap_values(data_for_prediction.values, check_additivity=False) decisionhtml = shap.decision_plot(base_value= explainer.expected_value[1], shap_values= shap_values[1], features= data_for_prediction, feature_names=data_for_prediction.columns.tolist(),show = False) plt.savefig('decisionPlot.pdf') plt.close() onedshap_values = shap_values[1].flatten() shap.waterfall_plot(explainer.expected_value[1], onedshap_values, feature_names=data_for_prediction.columns, max_display=10, show=False) plt.savefig('waterfallPlot.pdf') plt.close() # SHAP Plots for Class 1 (sRNA-mRNA Interaction) forcehtml = shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction) shap.save_html(out_file = 'forcePlot.html', full_html=False, plot = forcehtml) elif((len(sys.argv) - 1) < 2): print("Error: Required parameters not passed! Please pass two parameters, sRNA ID and mRNA ID.") else: print("Error: Only two parameters can be passed. sRNA ID and mRNA ID.")
plt.savefig("dB.png", bbox_inches='tight', dpi=600) plt.close('all') shap.dependence_plot("median_stride_length", shap_values, X_train) plt.savefig("sl-dep.pdf", bbox_inches='tight', dpi=600) plt.close('all') shap.force_plot(explainer.expected_value, shap_values[0, :], X_test.iloc[0, :], show=False, matplotlib=True) plt.savefig('tmp.pdf') shap.force_plot(explainer.expected_value, shap_values, X_train) shap.save_html('explainer.html', shap.force_plot(explainer.expected_value, shap_values, X_train)) #XGBoost best_params = { 'XGBRegressor__alpha': 0.8, 'XGBRegressor__colsample_bytree': 0.7, 'XGBRegressor__eta': 0.05, 'XGBRegressor__max_depth': 5, 'XGBRegressor__objective': 'reg:squarederror', 'XGBRegressor__subsample': 0.5 } #This set of parameters is obtained by running the main grid search code on Winter cluster. pipe = make_pipeline( RobustScaler(), XGBRegressor(
def get_shap_force(df, explainer, key):
    """Write the force plot of the first SHAP row of ``df`` to an HTML fragment.

    Args:
        df: Data passed to ``explainer.shap_values``.
        explainer: SHAP explainer exposing ``shap_values`` and
            ``expected_value``.
        key: Name (without extension) of the file created under
            ``templates/force_plots/``.

    Feature names are taken from the booster of the module-level
    ``loaded_model``. The file is saved with ``full_html=False``.
    """
    shap_values = explainer.shap_values(df)
    base_value = explainer.expected_value[0]
    first_row = shap_values[0]
    names = loaded_model.get_booster().feature_names
    plot = shap.force_plot(base_value, first_row, show=False, feature_names=names)
    destination = f'templates/force_plots/{key}.html'
    shap.save_html(destination, plot, full_html=False)
shap.summary_plot(shap_values, X) # # Force Plot - Feature Contribution Visualization Across Observatons # In[ ]: # load JS visualization code to notebook shap.initjs() # visualize the prediction's explanation (use matplotlib=True to avoid Javascript) # Impact on Day 0 Price output = shap.force_plot(explainer.expected_value[0], shap_values[0], X, plot_cmap=["#FF0000", "#008000"]) shap.save_html("Price Influence by Features Across Observations.html", output) shap.force_plot(explainer.expected_value[0], shap_values[0], X, plot_cmap=["#FF0000", "#008000"]) # # Key Price Influencer Daily Slider # In[ ]: import pandas as pd import pickle import numpy as np from sklearn.model_selection import train_test_split from __future__ import print_function from ipywidgets import interact, interactive, fixed, interact_manual
def save_plot(plt, name):
    """Save a SHAP plot object as HTML under the ``plots/`` directory.

    Args:
        plt: Plot object accepted by ``shap.save_html`` (despite the name this
            is the plot itself, not the matplotlib module).
        name: File name appended to ``"plots/"``.
    """
    destination = "plots/" + name
    shap.save_html(destination, plt)
def shap_plot(self, explainer=None, shap_vals=None, specific_var=None, interactions=False, interaction_vars=None, classwise=True, class_ind=1, num_display=20):
    """
    Produce and save the SHAP summary, dependence and force plots.

    Every figure is written to ``self.output_dir`` with a file name starting
    with ``<outcome_var>_<type_>_<class_>``, then shown and closed.

    :param explainer: SHAP explainer (provides ``expected_value`` and, when
        ``interactions`` is set, ``shap_interaction_values``)
    :param shap_vals: values derived from running the explainer
    :param specific_var: if given, also draw the dependence plot of this feature
        (always with ``interaction_index=None``)
    :param interactions: if true, draw the interaction matrix; the interaction
        values are computed once and cached on ``self.shap_interaction_vals``
    :param interaction_vars: optional pair of features for the interaction
        dependence plot; at most two entries
    :param classwise: for classifiers, plot the bar summary from ``shap_vals``
        as given instead of ``shap_vals[class_ind]``
    :param class_ind: when plotting classifier results, class index to plot
    :param num_display: ``max_display`` of the summary plots
    :return: ``self.shap_interaction_vals`` when ``interactions`` is true
        (these are expensive to compute, so only do so once), else ``None``
    :raises Exception: if ``interaction_vars`` has more than two entries
    """
    if interaction_vars is not None and len(interaction_vars) > 2:
        raise Exception('Interaction vars list cannot be greater than 2.')

    def _out_path(suffix):
        # Common file-name prefix shared by every artefact of this method.
        return os.path.join(
            self.output_dir,
            self.outcome_var + '_' + str(self.type_) + '_' + str(self.class_) + suffix)

    def _finish_figure(suffix):
        # Save the current matplotlib figure, display it and release it.
        plt.tight_layout()
        plt.savefig(_out_path(suffix))
        plt.show()
        plt.close()

    def plot_interactions(data, expl=None, vars_=None, class_index=1):
        # Interaction values are expensive: compute them once and cache them.
        if self.shap_interaction_vals is None:
            if self.type_ == 'cls':
                self.shap_interaction_vals = expl.shap_interaction_values(data)[class_index]
            elif self.type_ == 'reg':
                self.shap_interaction_vals = expl.shap_interaction_values(data)

        # Total absolute interaction strength per feature pair, without the
        # main effects on the diagonal; keep the 50 strongest features.
        strength = np.abs(self.shap_interaction_vals).sum(0)
        np.fill_diagonal(strength, 0)
        inds = np.argsort(-strength.sum(0))[:50]
        top = strength[inds, :][:, inds]

        plt.figure(figsize=(12, 12))
        plt.imshow(top)
        plt.yticks(range(top.shape[0]), data.columns[inds], rotation=50.4,
                   horizontalalignment="right")
        plt.xticks(range(top.shape[0]), data.columns[inds], rotation=50.4,
                   horizontalalignment="left")
        plt.gca().xaxis.tick_top()
        _finish_figure('_interaction_matrix_{}.png'.format(class_index))

        if vars_ is not None:
            shap.dependence_plot(vars_, self.shap_interaction_vals, data, show=False)
            _finish_figure('_interaction_{}_{}_{}.png'.format(vars_[0], vars_[1], class_index))

    # NOTE(review): every supported k_cv value plots self.X; any other value
    # leaves X_test_plot unset, as before.
    if self.k_cv in ('split', 'loo_cv', 'k_fold'):
        X_test_plot = self.X

    summary_suffix = '_' + str(num_display) + '_shap_val_summary.png'
    dependence_suffix = ('_' + str(num_display)
                         + '_shap_interaction_summary_{}.png'.format(specific_var))

    if self.type_ == 'cls':
        if interactions:
            plot_interactions(X_test_plot, explainer, interaction_vars, class_ind)

        # Bar summary: all classes at once, or only the selected class.
        if classwise or (self.class_ == 'lin'):
            bar_values = shap_vals
        else:
            bar_values = shap_vals[class_ind]
        shap.summary_plot(shap_values=bar_values, features=X_test_plot,
                          max_display=num_display, plot_type='bar', show=False)
        plt.xlabel('mean(|SHAP value|) (impact on output magnitude)')
        _finish_figure(summary_suffix)

        # Dot summary; a figure is saved even when nothing was drawn.
        if self.class_ == 'RF':
            shap.summary_plot(shap_values=shap_vals[class_ind], features=X_test_plot,
                              max_display=num_display, plot_type='dot', show=False)
        elif self.class_ == 'lin':
            shap.summary_plot(shap_values=shap_vals, features=X_test_plot,
                              max_display=num_display, plot_type='dot', show=False)
        elif self.class_ == 'svm':
            print('not implemented shap for svm yet')
        _finish_figure('_' + str(class_ind) + '_' + str(num_display)
                       + '_shap_effects_summary.png')

        if specific_var is not None:
            dependence_values = shap_vals[class_ind] if self.class_ == 'RF' else shap_vals
            shap.dependence_plot(specific_var, interaction_index=None,
                                 shap_values=dependence_values, features=X_test_plot,
                                 show=False)
            _finish_figure(dependence_suffix)

    elif self.type_ == 'reg':
        if interactions:
            plot_interactions(X_test_plot, explainer, interaction_vars, class_ind)

        shap.summary_plot(shap_values=shap_vals, features=X_test_plot,
                          max_display=num_display, plot_type='bar', show=False)
        plt.xlabel('mean(|SHAP value|) (impact on output magnitude)')
        _finish_figure(summary_suffix)

        # The RF and non-RF branches were identical for regression.
        shap.summary_plot(shap_values=shap_vals, features=X_test_plot,
                          max_display=num_display, plot_type='dot', show=False)
        _finish_figure('_' + str(num_display) + '_shap_effects_summary.png')

        if specific_var is not None:
            shap.dependence_plot(specific_var, interaction_index=None,
                                 shap_values=shap_vals, features=X_test_plot,
                                 show=False)
            _finish_figure(dependence_suffix)

    # visualize the training set predictions
    f = _out_path('shap_forceplot_{}.html'.format(class_ind))
    if self.type_ == 'cls':
        shap.save_html(
            f,
            shap.force_plot(explainer.expected_value[class_ind], shap_vals[class_ind],
                            X_test_plot, show=False))
    elif self.type_ == 'reg':
        shap.save_html(
            f,
            shap.force_plot(explainer.expected_value, shap_vals, X_test_plot, show=False))

    if interactions:
        return self.shap_interaction_vals
def _load_split(pickle_path):
    """Load one pickled data split and remove the columns that are not inputs.

    Drops the column named by the module-level ``name`` as well as
    ``'CM_Label'`` and ``'PrimaryDx'``, prints the class counts of
    ``'PrimaryDx_Label'`` and returns the remaining frame cast to ``int``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    frame = pd.read_pickle(pickle_path)
    for column in (name, 'CM_Label', 'PrimaryDx'):
        frame = frame.drop(column, axis=1)
    print(frame['PrimaryDx_Label'].value_counts())
    return frame.astype('int')


def _save_force_plot(explainer, rows, feature_names, out_file):
    """Compute SHAP values for ``rows`` and save their force plot as HTML."""
    row_shap_values = explainer.shap_values(rows)
    force_plot = shap.force_plot(explainer.expected_value[0],
                                 row_shap_values[0],
                                 rows,
                                 feature_names=feature_names)
    shap.save_html(out_file, force_plot)


def main():
    """Train the neural network and export its SHAP explanations.

    Reads the module-level settings ``test``, ``val``, ``train`` (pickle
    paths), ``name``, ``Label`` and ``path``; writes the trained model
    (``path + name + '_NN.h5'``), several PNG summary/dependence plots and
    HTML force plots under ``path``. Returns ``None``.
    """
    print(tf.__version__)

    # Load the three splits; printing order (test, val, train) is kept.
    test_dat = _load_split(test)
    val_dat = _load_split(val)
    train_dat = _load_split(train)

    # Per the original author: balance_classes upsamples the minority class.
    train_dat = balance_classes(1, train_dat)
    print("Data Loaded")

    # Separate the target column from the input features.
    test_y = np.array(test_dat.pop(Label))
    train_y = np.array(train_dat.pop(Label))
    val_y = np.array(val_dat.pop(Label))

    # Feature names come from the column headers, before scaling turns the
    # frames into plain arrays.
    feature = list(train_dat.columns)

    # Scale every feature to the range learned from the training split.
    sc_X = MinMaxScaler()
    train_x = sc_X.fit_transform(train_dat)
    test_x = sc_X.transform(test_dat)
    val_x = sc_X.transform(val_dat)

    # Share of samples whose label is NOT 1, for the train and test splits.
    for labels in (train_y, test_y):
        print(1 - np.count_nonzero(labels == 1) / len(labels))

    # Replace NaN/inf in every split that is fed to the network. The
    # validation split was previously left out, so NaNs could reach the
    # validation loss watched by early stopping.
    train_x = np.nan_to_num(train_x)
    test_x = np.nan_to_num(test_x)
    val_x = np.nan_to_num(val_x)

    # Note: a pretrained model can be loaded here instead of training one.
    # Neural network model.
    # NOTE(review): the regularizers used to be passed to model.compile(),
    # which does not accept them; they are attached to the hidden layers
    # here -- confirm that this matches the intended set of layers.
    model = keras.Sequential()
    for units in (2048, 1024, 210, 120):
        model.add(
            keras.layers.Dense(
                units,
                activation=tf.nn.relu,
                kernel_regularizer=keras.regularizers.l2(0.05),
                bias_regularizer=keras.regularizers.l2(0.01)))
    model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

    US = 1000
    No_US = 1

    # Optimizer and loss function.
    opt = keras.optimizers.SGD(lr=0.0001,
                               decay=1e-6,
                               momentum=0.9,
                               nesterov=True)
    model.compile(loss="binary_crossentropy",
                  optimizer=opt,
                  metrics=['accuracy'])

    early_stopping_monitor = EarlyStopping(patience=20)
    # class_weight is a fit() argument, not a compile() argument.
    model.fit(train_x,
              train_y,
              epochs=1000,
              batch_size=1000,
              validation_data=(val_x, val_y),
              callbacks=[early_stopping_monitor],
              class_weight={
                  0: US,
                  1: No_US
              },
              verbose=1)
    model.save(path + name + '_NN.h5')  # Saving the NN model

    # Determine the SHAP values and generate SHAP plots.
    # https://github.com/slundberg/shap
    background = train_x[np.random.choice(train_x.shape[0], 10,
                                          replace=False)]
    e = shap.DeepExplainer(model, background)
    shap_values = e.shap_values(train_x)

    shap.summary_plot(shap_values[0],
                      train_x,
                      feature_names=feature,
                      show=False)
    plt.savefig(path + 'NN_Shap_Summary_plot_3.png',
                bbox_inches='tight',
                dpi=600)
    plt.close()

    # The whole list of SHAP arrays is passed here; the output file name
    # suggests this is expected to produce the bar-style summary.
    shap.summary_plot(shap_values,
                      train_x,
                      feature_names=feature,
                      show=False)
    plt.savefig(path + 'NN_Shap_Bar_plot_3.png', bbox_inches='tight', dpi=600)
    plt.close()

    # (feature, interaction feature, output file) for each dependence plot.
    dependence_plots = (
        ("Sex_F", 'Age', 'Gender_NN_D_plot_3.png'),
        ("Age", None, 'Age_NN_D_plot_3.png'),
        ("Pulse", None, 'Pulse_NN_D_plot_3.png'),
        ("Age", 'Pulse', 'Pulse_Age_NN_D_plot_3.png'),
    )
    for variable, interaction, file_name in dependence_plots:
        # show=False keeps the figure open so that savefig writes the plot
        # instead of an empty canvas; close it afterwards to free memory.
        shap.dependence_plot(variable,
                             shap_values[0],
                             train_x,
                             interaction_index=interaction,
                             feature_names=feature,
                             show=False)
        plt.savefig(path + file_name, bbox_inches='tight', dpi=600)
        plt.close()

    # Force plots for selected individual test rows, plus one plot covering
    # the first 100 test rows. All of them share the same background sample,
    # so a single explainer is built and reused.
    explainer = shap.DeepExplainer(model, train_x[0:100, :])
    force_plots = (
        (slice(9, 10), "force_plot.html"),
        (slice(22, 23), "force_plot2.html"),
        (slice(100, 101), "force_plot3.html"),
        (slice(8, 9), "force_plot4.html"),
        (slice(68, 69), "force_plot5.html"),
        (slice(0, 100), "summary_force_plot3.html"),
    )
    for rows, file_name in force_plots:
        _save_force_plot(explainer, test_x[rows, :], feature,
                         path + file_name)
# Write the collected results table to a CSV file.
result_file = 'xgb_results_T.csv'
xgb_results_df = pd.DataFrame(results, columns=columns)
xgb_results_df.to_csv(result_file, index=False)

# Fit a final model on the complete data set.
print('Trainig based on all')
model, _ = train(X, y)

# Show the ten most important features once per importance type.
for importance_type in ('weight', 'gain', 'cover'):
    ax = plot_importance(model,
                         title=importance_type.capitalize(),
                         importance_type=importance_type,
                         max_num_features=10)
    plt.show()

# Drop the first four bytes of the serialized booster and make save_raw()
# hand out the trimmed buffer from now on.
# NOTE(review): presumably a workaround so that shap can parse the raw
# model -- confirm against the installed xgboost/shap versions.
booster = model.get_booster()
model_bytearray = booster.save_raw()[4:]


def fix(self=None):
    # Always return the trimmed copy captured above.
    return model_bytearray


booster.save_raw = fix

# SHAP explanations: one HTML force plot for each of the first four
# outputs, followed by the summary plot.
import shap
explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(X)
shap.initjs()
for i in range(4):
    class_plot = shap.force_plot(explainer.expected_value[i], shap_values[i],
                                 X)
    shap.save_html('shap_{}.html'.format(i), class_plot)
shap.summary_plot(shap_values, X)
def classify(self,
             items,
             probabilities=False,
             importances=False,
             importance_cutoff=0.15):
    """Classify items and optionally explain the prediction with SHAP.

    Args:
        items: A single item or a list of items; the first item must be a
            dict or a tuple.
        probabilities: If true, return ``predict_proba`` output instead of
            ``predict`` output.
        importances: If true, additionally return feature importances and
            an HTML force plot.
        importance_cutoff: Passed through to ``get_important_features``.

    Returns:
        The (possibly overwritten) predictions. When ``importances`` is
        true, a tuple ``(classes, info)`` where ``info`` has the keys
        ``"importances"``, ``"html"`` and ``"feature_legend"``.

    Raises:
        AssertionError: If ``items`` is None, the model is not initialized,
            or the first item is neither a dict nor a tuple.
    """
    assert items is not None
    assert (self.extraction_pipeline is not None and self.clf
            is not None), "The module needs to be initialized first"

    if not isinstance(items, list):
        items = [items]

    assert isinstance(items[0], (dict, tuple))

    X = self.extraction_pipeline.transform(items)

    if probabilities:
        classes = self.clf.predict_proba(X)
    else:
        classes = self.clf.predict(X)

    classes = self.overwrite_classes(items, classes, probabilities)

    if not importances:
        return classes

    explainer = shap.TreeExplainer(self.clf)
    shap_values = explainer.shap_values(X)
    expected_value = explainer.expected_value

    # Importances are computed from the full SHAP output, before it is
    # narrowed down to one class for the force plot below.
    important_features = self.get_important_features(importance_cutoff,
                                                     shap_values)
    important_features["values"] = X

    # Picking the class to explain needs one row of scores per item. When
    # the caller asked for labels only, `classes` holds labels, on which
    # len()/argmax do not work, so compute the probabilities for this step.
    if probabilities:
        class_probs = classes
    else:
        class_probs = self.overwrite_classes(items,
                                             self.clf.predict_proba(X), True)

    # Workaround: handle multi class case for force_plot to work correctly
    if len(class_probs[0]) > 2:
        pred_class_index = class_probs.argmax(axis=-1)[0]
        expected_value = expected_value[pred_class_index]
        shap_values = shap_values[pred_class_index]
    else:
        pred_class_index = 0

    pred_class = self.class_names[pred_class_index]
    top_indexes = [
        int(index) for importance, index, is_positive in
        important_features["classes"][pred_class][0]
    ]

    # The plot labels features "1".."k"; the legend maps those labels back
    # to human readable names.
    feature_names = self.get_human_readable_feature_names()
    feature_legend = {
        str(i + 1): feature_names[feature_i]
        for i, feature_i in enumerate(top_indexes)
    }

    with io.StringIO() as out:
        p = shap.force_plot(
            expected_value,
            shap_values[:, top_indexes],
            X.toarray()[:, top_indexes],
            feature_names=[str(i + 1) for i in range(len(top_indexes))],
            matplotlib=False,
            show=False,
        )

        # TODO: use full_html=False
        shap.save_html(out, p)

        html = out.getvalue()

    return (
        classes,
        {
            "importances": important_features,
            "html": html,
            "feature_legend": feature_legend,
        },
    )
#classifier = model_keras (needs fixing)
# Explain the model with SHAP's KernelExplainer, using the first 100
# training rows as background data.
explainer = shap.KernelExplainer(model=classifier.predict_proba,
                                 data=X_train.iloc[0:100, :])

# Only the first 50 test rows are explained. The slice is kept in one
# variable so that every plot below receives exactly the rows the SHAP
# values were computed for.
X_explained = X_test.iloc[0:50, :]
shap_values = explainer.shap_values(X=X_explained)

# Per the original author: binary classification, so the length is 2.
print(len(shap_values))
# Per the original author: (50, 6) - 50 objects, 6 features.
print(shap_values[0].shape)

# Explaining a single prediction for passing
shap.initjs()
plot = shap.force_plot(explainer.expected_value[1], shap_values[1][0, :],
                       X_explained.iloc[0, :])
shap.save_html('plot_1_instances.html', plot)

# Explaining a single prediction for failing
plot = shap.force_plot(explainer.expected_value[0], shap_values[0][0, :],
                       X_explained.iloc[0, :])
shap.save_html('plot_2_instances.html', plot)

# Explaining predictions for passing for the 50 explained instances
plot = shap.force_plot(explainer.expected_value[1], shap_values[1],
                       X_explained)
shap.save_html('plot_X_test_instances.html', plot)

# SHAP summary plot; the call is no longer wrapped in print(), which only
# emitted a stray "None".
shap.summary_plot(shap_values, X_explained)