# Summarizing the data with k-Means is a trick to speed up the processing
"""
Rather than use the whole training set to estimate expected values, we summarize with
a set of weighted kmeans, each weighted by the number of points they represent.
Running without kmeans took 1 hr 6 mins 7 sec.
Running with kmeans took 2 min 47 sec.
Boston Housing is a small dataset.
Running SHAP on models that require the Kernel method becomes prohibitive.
"""

# build the kmeans summary
X_train_summary = shap.kmeans(X_train, 10)

# using the kmeans summary
# perf_counter() is monotonic, so the measured interval cannot be distorted by
# system clock adjustments the way a time.time() difference can.
t0 = time.perf_counter()
explainerKNN = shap.KernelExplainer(knn.predict, X_train_summary)
shap_values_KNN_test = explainerKNN.shap_values(X_test)
t1 = time.perf_counter()
timeit = t1 - t0  # elapsed seconds for building the explainer and explaining X_test
timeit

# without kmeans
# a test run took 3967.6232330799103 seconds
"""
t0 = time.time()
explainerKNN = shap.KernelExplainer(knn.predict, X_train)
shap_values_KNN_test = explainerKNN.shap_values(X_test)
t1 = time.time()
timeit=t1-t0
timeit
"""

j = 52  # row of X_test to visualise

# now we can plot the SHAP explainer
shap.force_plot(explainerKNN.expected_value, shap_values_KNN_test[j], X_test.iloc[[j]])
def calculate_predictions(img_orig, original_class, explainer_name, num_features, strategy, sigma, verbose=0):
    """Mask an image according to an explanation and classify the masked image.

    An attribution for ``img_orig`` is produced by the explainer selected with
    ``explainer_name`` (or reused from the module-level cache when the same
    image is passed again), post-processed by ``extract_top_ten`` and applied
    to the image with ``mask_image_with_noise``. The masked image is then fed
    to ``model`` again.

    Module-level state read and updated by this function:
    ``prev_shap_values``, ``prev_img_orig``, ``prev_explanation`` and
    ``prev_num_features``.

    :param img_orig: image array; the 'grad' and 'lime' branches reshape
        images to ``(1, 250, 250, 3)``.
    :param original_class: class index whose predicted score is reported.
    :param explainer_name: one of ``'shap'``, ``'grad'``, ``'random'`` or
        ``'lime'``.
    :param num_features: forwarded to ``extract_top_ten`` and to LIME's
        ``get_image_and_mask``.
    :param strategy: ``'top'`` selects LIME's positive regions for the LIME
        prediction; ``'rest'`` inverts the extracted mask before masking.
    :param sigma: forwarded to ``mask_image_with_noise``.
    :param verbose: when truthy, cache hits are printed and intermediate
        images are plotted.
    :return: ``(new_class, new_prediction, new_class_lime,
        new_prediction_lime)``. The two LIME values stay ``None`` unless the
        'lime' explainer is used with a ``num_features`` different from the
        previous call.
    :raises ValueError: if ``explainer_name`` is not a supported value.
    """
    global prev_shap_values
    global prev_img_orig
    global prev_explanation
    global prev_num_features

    # segment the image so we don't have to explain every pixel
    segments_slic = slic(img_orig, n_segments=49, compactness=1000, sigma=3)

    def f(z):
        return model.predict(mask_image(z, segments_slic, img_orig, 255))

    new_class_lime = None
    new_prediction_lime = None

    # array_equal is False (instead of raising or broadcasting) when the
    # cached image is None or has a different shape.
    same_image = np.array_equal(img_orig, prev_img_orig)

    if explainer_name in ('shap', 'random', 'grad') and same_image:
        if verbose:
            print("Hitting cache, returning prev values")
        shap_values = prev_shap_values
    elif explainer_name == 'shap':
        # use Kernel SHAP to explain the network's predictions
        explainer = shap.KernelExplainer(f, np.zeros((1, 49)))
        shap_values = explainer.shap_values(np.ones((1, 49)), nsamples=1000)  # runs model 1000 times
    elif explainer_name == 'grad':
        # use a Grad-CAM heatmap as the attribution
        last_conv_layer_name = "conv2d"
        classifier_layer_names = [
            "global_average_pooling2d",
            "dense_1",
        ]
        heatmap = grad.make_gradcam_heatmap(img_orig.reshape(1, 250, 250, 3), model,
                                            last_conv_layer_name, classifier_layer_names)
        if verbose:
            grad.test_drive_grad(img_orig)
            plt.imshow(heatmap)
            plt.show()
        # the same flattened heatmap is repeated for each of the 5 outputs
        flat_heatmap = [item for sublist in heatmap for item in sublist]
        shap_values = np.array([[flat_heatmap] for _ in range(5)])
        shap_values = shap_values / np.max(shap_values)
    elif explainer_name == 'random':
        shap_values = [
            np.asarray([[random.uniform(0, 1) for _ in range(50)]])
            for _ in range(5)
        ]
    elif explainer_name == 'lime':
        if same_image:
            explanation = prev_explanation
            if verbose:
                print("Hitting LIME cache, returning prev values")
        else:
            explainer = lime_image.LimeImageExplainer()
            explanation = explainer.explain_instance(img_orig.astype("double"),
                                                     model.predict,
                                                     num_samples=1000)
            prev_explanation = explanation
            prev_img_orig = img_orig

        if num_features == prev_num_features:
            new_class_lime = None
            new_prediction_lime = None
        else:
            if strategy == "top":
                lime_img, _ = explanation.get_image_and_mask(
                    explanation.top_labels[original_class],
                    positive_only=True,
                    negative_only=False,
                    hide_rest=True,
                    num_features=num_features,
                    min_weight=0)
            else:
                lime_img, _ = explanation.get_image_and_mask(
                    explanation.top_labels[original_class],
                    positive_only=False,
                    negative_only=True,
                    num_features=1000,
                    hide_rest=True)
            if verbose:
                plt.matshow(lime_img)
                plt.show()
            new_class_lime = model.predict_classes(lime_img.reshape(1, 250, 250, 3))[0]
            new_prediction_lime = model.predict(lime_img.reshape(1, 250, 250, 3))[0][original_class]
            prev_num_features = num_features

        _, mask = explanation.get_image_and_mask(
            explanation.top_labels[original_class],
            positive_only=True,
            negative_only=False,
            hide_rest=True,
            num_features=num_features,
            min_weight=0)
        shap_values = convert_to_shap_values(mask, verbose)
    else:
        # previously this fell through and failed later with UnboundLocalError
        raise ValueError("Unknown explainer_name: {!r}".format(explainer_name))

    prev_shap_values = shap_values.copy()
    prev_img_orig = img_orig.copy()

    # get the top predictions from the model
    preds = model.predict(np.expand_dims(img_orig.copy(), axis=0))
    top_preds = np.argsort(-preds)
    inds = top_preds[0]

    # show_cut_image(shap_values, img_orig, num_features)
    # keep only non-negative attributions
    shap_values = [np.where(a < 0, 0, a) for a in shap_values]
    shap_values = extract_top_ten(shap_values, num_features)
    if strategy == 'rest':
        # invert the mask: 1 -> 0 and 0 -> 1
        shap_values = (shap_values - 1) * -1

    masked_image = mask_image_with_noise(shap_values[inds[0]], segments_slic, img_orig, sigma)
    if verbose:
        plt.imshow(masked_image[0])
        plt.show()

    new_class = model.predict_classes(masked_image)[0]
    new_prediction = model.predict(masked_image)[0][original_class]
    return new_class, new_prediction, new_class_lime, new_prediction_lime
def test_null_model_small():
    """A model that always outputs zero must get (numerically) zero effects."""

    def null_model(x):
        # one zero per input row
        return np.zeros(x.shape[0])

    background = np.ones((2, 4))
    instance = np.ones((1, 4))

    explainer = shap.KernelExplainer(null_model, background, nsamples=100)
    explanation = explainer.explain(instance)

    total_effect = np.sum(np.abs(explanation.effects))
    assert total_effect < 1e-8
# For context, we'll look at the raw predictions before looking at the SHAP values
my_model.predict_proba(data_for_prediction_array)

# Create object that can calculate shap values
explainer = shap.TreeExplainer(my_model)

### Calculate Shap values
shap_values = explainer.shap_values(data_for_prediction)

# It's cumbersome to review raw arrays, but the shap package has a nice way to visualize the results.
shap.initjs()
shap.force_plot(
    base_value=explainer.expected_value[1],
    shap_values=shap_values[1],
    features=data_for_prediction,
)

# Here is an example using KernelExplainer to get similar results.
# The results aren't identical because kernelExplainer gives an approximate result.

# use Kernel SHAP to explain test set predictions
k_explainer = shap.KernelExplainer(my_model.predict_proba, train_X)

### Calculate Shap values
k_shap_values = k_explainer.shap_values(data_for_prediction)
shap.force_plot(
    base_value=k_explainer.expected_value[1],
    shap_values=k_shap_values[1],
    features=data_for_prediction,
)

# or
# **Calculate and show Shap Values for One Prediction:**
# ```
# import shap  # package used to calculate Shap values
# data_for_prediction = val_X.iloc[0,:]  # use 1 row of data here. Could use multiple rows if desired
from sklearn.model_selection import train_test_split
# `sklearn.linear_model.LogisticRegression` is referenced below, so the name
# `sklearn` (and that submodule) must be bound explicitly; the `from ...`
# import above does not bind it.
import sklearn.linear_model
import numpy as np
import time
import shap
from warnings import simplefilter

simplefilter(action='ignore', category=FutureWarning)

# In[2]:

X_train, X_test, Y_train, Y_test = train_test_split(*shap.datasets.iris(), test_size=0.2, random_state=0)


def print_accuracy(f):
    """Print the accuracy of predictor ``f`` on the held-out iris split."""
    print("Accuracy = {0}%".format(100 * np.sum(f(X_test) == Y_test) / len(Y_test)))
    time.sleep(0.5)


shap.initjs()

# In[9]:

linear_lr = sklearn.linear_model.LogisticRegression()
linear_lr.fit(X_train, Y_train)
print_accuracy(linear_lr.predict)

# explain the class probabilities, using the training set as background data
explainer = shap.KernelExplainer(linear_lr.predict_proba, X_train)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)
# In[23]: joblib.dump(regression, open(os.path.join(MODEL_FOLDER, 'regression_model'), 'wb')) # ## Model explanation # In[24]: import shap # In[25]: # create kernel explaner object explainer = shap.KernelExplainer(model=regression.predict, data=test[X_FEATURES][:40], link="identity") # In[28]: idx = 22 shap_value_single = explainer.shap_values( X=test[X_FEATURES].iloc[idx], nsamples=100, ) shap.initjs() shap.force_plot( base_value=explainer.expected_value, shap_values=shap_value_single[0], features=test[X_FEATURES].iloc[idx],
pdp.pdp_interact_plot(
    pdp_data,
    ['x4_plot', 'x5_plot'],
    plot_type='contour',
    x_quantile=False,
    plot_pdp=False,
    which_classes=None,
    ncols=2,
    plot_params=None,
)

#%%
#y = x1**2 + x2**3 + x3**4 + 2*x4**2 - x5**3
i = 1.3
print(i, 2*i, 3*i, 4*i, -5*i)
print(i + 2*i + 3*i + 4*i - 5*i)

import shap

# a single row in which every one of the five features equals i
data_for_prediction = np.array([[i] * 5])
print(model.predict(data_for_prediction))

# model-agnostic explanation, with X as background data
k_explainer = shap.KernelExplainer(model.predict, X)
k_shap_values = k_explainer.shap_values(data_for_prediction)
shap.force_plot(
    base_value=k_explainer.expected_value[1],
    shap_values=k_shap_values[1],
    features=data_for_prediction,
    matplotlib=True,
)

#explainer = shap.TreeExplainer(model)
l_explainer = shap.LinearExplainer(model, X_train)
#shap_values = explainer.shap_values(data_for_prediction)
l_shap_values = l_explainer.shap_values(data_for_prediction)
#shap.initjs()
shap.force_plot(
    base_value=l_explainer.expected_value[0],
    shap_values=l_shap_values[0],
    features=data_for_prediction,
    matplotlib=True,
)

#%%
rounded=True, out_file=None) graph = graphviz.Source(dot_data) graph.render("iris-tree", format="png") if method_flag == 7: # 説明変数の重要度 x_importances = pd.DataFrame(clf.feature_importances_, index=pd.DataFrame(train_x).columns, columns=['importance']) x_importances.to_csv( 'rf_x_importances.csv') # csv ファイルに保存。同じ名前のファイルがあるときは上書きされますので注意してください #SHAP visualization if method_flag == 3 or method_flag == 4 or method_flag == 5 or method_flag == 9: explainer = shap.KernelExplainer(clf.predict, train_x) shap_values = explainer.shap_values(train_x.loc[[0]]) shap.force_plot(explainer.expected_value, shap_values[0], train_x.loc[[0]], matplotlib=True) """ shap_values = explainer.shap_values(train_x) shap.summary_plot(shap_values, features = train_x, #plot_type = 'bar' ) shap.dependence_plot(ind="RM", shap_values=shap_values, features = train_x, interaction_index = 'TSTAT', ) """
plt.figure(figsize=(10, 4))
# plt.set_title('Training & Validation Loss')
no_of_features = list(range(0, len(avg)))
plt.plot(no_of_features,
         avg,
         color='navy',
         marker='o',
         linestyle='-',
         label='Confidence level change vs no of features')
plt.legend(loc='best')

# In[ ]:

import shap

# use Kernel SHAP to explain test set predictions
explainer = shap.KernelExplainer(model.predict_proba, X_train, link="logit")
shap.initjs()

# In[ ]:

# In[ ]:

# In[ ]:

# In[ ]:

# NOTE(review): a stray bare expression `c` used to sit in the cell above; it
# had no effect (or raised NameError if `c` is undefined) and was removed.

# # Section 3: Evaluating Model Performance.
#
# **TASK: Plot out the validation loss versus the training loss.**
def interpret(self, raw_input):
    """
    Compute interpretation scores for one set of raw inputs.

    When ``self.interpretation`` is a list, its entries are paired with the
    raw inputs and each selects a built-in method for the input component at
    that position: "default" (score neighbouring inputs), "shap"/"shapley"
    (Kernel SHAP over token masks) or None (no interpretation). Otherwise
    ``self.interpretation`` is called as a custom function on the
    preprocessed inputs.

    :param raw_input: a list of raw inputs to apply the interpretation(s) on.
    :return: ``(scores, alternative_outputs)`` for the built-in methods, or
        ``(interpretation, [])`` for a custom function (a single-input result
        is wrapped in a list).
    :raises ValueError: for an unknown method name, when `shap` is requested
        but not installed or not supported by the component, or when the
        custom function fails with the TF1 graph error.
    """
    uses_builtin_methods = isinstance(self.interpretation, list)

    def preprocess_all(raw_values):
        # run every input component's preprocessing on its own raw value
        return [
            component.preprocess(raw_values[pos])
            for pos, component in enumerate(self.input_components)
        ]

    processed_input = preprocess_all(raw_input)

    if not uses_builtin_methods:  # custom interpretation function
        interpreter = self.interpretation
        if self.capture_session and self.session is not None:
            graph, sess = self.session
            with graph.as_default(), sess.as_default():
                interpretation = interpreter(*processed_input)
        else:
            try:
                interpretation = interpreter(*processed_input)
            except ValueError as exception:
                if not str(exception).endswith(
                        "is not an element of this graph."):
                    raise exception
                raise ValueError(strings.en["TF1_ERROR"])
        if len(raw_input) == 1:
            interpretation = [interpretation]
        return interpretation, []

    # Either "default" or "shap"
    original_output = self.run_prediction(processed_input)
    scores, alternative_outputs = [], []

    def evaluate_neighbors(position, neighbor_values, neighbor_raw_input):
        # Substitute each neighbour at `position`, re-run the prediction and
        # collect (score per neighbour, postprocessed outputs per neighbour).
        neighbor_scores, neighbor_outputs = [], []
        for neighbor_input in neighbor_values:
            neighbor_raw_input[position] = neighbor_input
            processed_neighbor_input = preprocess_all(neighbor_raw_input)
            neighbor_output = self.run_prediction(processed_neighbor_input)
            neighbor_outputs.append([
                output_component.postprocess(neighbor_output[pos])
                for pos, output_component in enumerate(self.output_components)
            ])
            neighbor_scores.append(
                quantify_difference_in_label(self, original_output,
                                             neighbor_output))
        return neighbor_scores, neighbor_outputs

    for i, (x, interp) in enumerate(zip(raw_input, self.interpretation)):
        if interp == "default":
            input_component = self.input_components[i]
            neighbor_raw_input = list(raw_input)
            if input_component.interpret_by_tokens:
                tokens, neighbor_values, masks = input_component.tokenize(x)
                interface_scores, alternative_output = evaluate_neighbors(
                    i, neighbor_values, neighbor_raw_input)
                alternative_outputs.append(alternative_output)
                scores.append(
                    input_component.get_interpretation_scores(
                        raw_input[i],
                        neighbor_values,
                        interface_scores,
                        masks=masks,
                        tokens=tokens))
            else:
                neighbor_values, interpret_kwargs = \
                    input_component.get_interpretation_neighbors(x)
                interface_scores, alternative_output = evaluate_neighbors(
                    i, neighbor_values, neighbor_raw_input)
                alternative_outputs.append(alternative_output)
                # this branch passes the negated scores on
                interface_scores = [-score for score in interface_scores]
                scores.append(
                    input_component.get_interpretation_scores(
                        raw_input[i], neighbor_values, interface_scores,
                        **interpret_kwargs))
            continue

        if interp == "shap" or interp == "shapley":
            try:
                import shap
            except ImportError:  # also covers ModuleNotFoundError
                raise ValueError(
                    "The package `shap` is required for this interpretation method. Try: `pip install shap`"
                )
            input_component = self.input_components[i]
            if not input_component.interpret_by_tokens:
                raise ValueError(
                    "Input component {} does not support `shap` interpretation"
                    .format(input_component))

            tokens, _, masks = input_component.tokenize(x)

            # construct a masked version of the input
            def get_masked_prediction(binary_mask):
                masked_xs = input_component.get_masked_inputs(
                    tokens, binary_mask)
                preds = []
                for masked_x in masked_xs:
                    processed_masked_input = copy.deepcopy(processed_input)
                    processed_masked_input[i] = input_component.preprocess(
                        masked_x)
                    new_output = self.run_prediction(processed_masked_input)
                    preds.append(
                        get_regression_or_classification_value(
                            self, original_output, new_output))
                return np.array(preds)

            num_total_segments = len(tokens)
            # background = all segments masked; explained point = all present
            explainer = shap.KernelExplainer(
                get_masked_prediction, np.zeros((1, num_total_segments)))
            shap_values = explainer.shap_values(
                np.ones((1, num_total_segments)),
                nsamples=int(self.num_shap * num_total_segments),
                silent=True)
            scores.append(
                input_component.get_interpretation_scores(raw_input[i],
                                                          None,
                                                          shap_values[0],
                                                          masks=masks,
                                                          tokens=tokens))
            alternative_outputs.append([])
            continue

        if interp is None:
            scores.append(None)
            alternative_outputs.append([])
            continue

        raise ValueError("Uknown intepretation method: {}".format(interp))

    return scores, alternative_outputs
def calculate_shap_values(data, max_lag_steps, input_columns, output_columns,
                          include_output_column=default_include_output_column,
                          include_t0=default_include_t0,
                          model_complexity=default_model_complexity,
                          batch_size=default_batch_size,
                          num_epochs=default_num_epochs,
                          learning_rate=default_learning_rate,
                          num_dense_layers=default_dense_layers,
                          dense_layer_activation=default_dense_activation,
                          output_layer_activation=default_output_activation,
                          input_scaling=default_input_scaling,
                          num_shap_samples=default_shap_samples,
                          nan_percent_cutoff=default_nan_percent_cutoff,
                          verbose=False):
    """Train an LSTM on lagged samples and prepare a Kernel SHAP summary plot.

    The data is reshaped into windows (``reshape_and_pad``), split and
    filtered (``filter_samples``) and used to fit a model
    (``build_and_run_lstm``). A ``shap.KernelExplainer`` is then built over
    the first ``num_shap_samples`` flattened training windows.

    Returns a zero-argument callable; calling it computes the SHAP values for
    those flattened windows and draws ``shap.summary_plot`` with feature
    names of the form ``"<element>/<item> : t-<lag>"``.

    ``include_output_column`` and ``include_t0`` must not both be set,
    ``output_columns`` must hold exactly one entry and ``max_lag_steps`` must
    be at least 1 (checked with ``assert``).
    """
    assert not (include_output_column and include_t0)
    assert len(output_columns) == 1
    assert max_lag_steps >= 1

    if include_t0:
        time_series_length = max_lag_steps
    else:
        time_series_length = max_lag_steps + 1

    samples = reshape_and_pad(data,
                              time_series_length,
                              input_columns + output_columns,
                              verbose=verbose)

    train_X, train_Y, test_X, test_Y = filter_samples(
        samples,
        include_output_column=include_output_column,
        include_t0=include_t0,
        input_scaling=input_scaling,
        nan_percent_cutoff=nan_percent_cutoff,
        verbose=verbose)

    model, history = build_and_run_lstm(
        train_X, train_Y, test_X, test_Y,
        model_complexity=model_complexity,
        batch_size=batch_size,
        num_epochs=num_epochs,
        learning_rate=learning_rate,
        num_dense_layers=num_dense_layers,
        dense_layer_activation=dense_layer_activation,
        output_layer_activation=output_layer_activation,
        verbose=verbose)

    if verbose:
        print("Validation losses:", history.history['val_loss'][-1],
              " / Training losses:", history.history['loss'][-1],
              " Test Y std_dev:", np.std(test_Y))

    def predict_flattened(X):
        # KernelExplainer passes flat rows; restore the windowed shape the
        # model was trained on and predict one row at a time.
        window_shape = (1, time_series_length - 1, len(input_columns))
        return np.array([model.predict(np.reshape(row, window_shape)) for row in X])

    flat_width = train_X.shape[1] * train_X.shape[2]
    flattened_X = np.reshape(train_X[:num_shap_samples], (-1, flat_width))
    kernel_explainer = shap.KernelExplainer(predict_flattened, flattened_X)

    def element_name(elem_c):
        # first `element` value among the rows carrying this element code
        return data[data.elementcode == elem_c].element.tolist()[0]

    def item_name(item_c):
        # first `item` value among the rows carrying this item code
        return data[data.itemcode == item_c].item.tolist()[0]

    feature_names = []
    for lag in range(max_lag_steps - 1, 0, -1):
        for element_c, item_c in input_columns:
            feature_names.append(
                element_name(element_c) + '/' + item_name(item_c) + " : t-" +
                str(lag - 1 if include_t0 else lag))

    def plot_summary():
        return shap.summary_plot(kernel_explainer.shap_values(flattened_X),
                                 flattened_X, feature_names)

    return plot_summary
classificator.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) #Train model history = classificator.fit(train_data, train_label, batch_size=batch_size_classification, epochs=num_epochs_classification, verbose=0) loss, accuracy = classificator.evaluate(test_data, test_label, verbose=1) #SHAP elements = np.random.choice(len(train_data), int(0.3 * len(train_data)), False) explainer = shap.KernelExplainer(classificator.predict, train_data[elements]) #Apply aggregation function to test if necessary if num_epochs_detection == 0: matrix_metadata = metadata_to_matrix(TMP_TEST, "json") names = matrix_metadata[:, -1] test_data = np.zeros((len(names), num_archi_features)) test_label = np.zeros(len(names)) if data == "MonumenAI": for i in range(len(names)): im_name = names[i][2:-4] idx = test_loader.images_loc['path'].str.contains(im_name) test_data[idx] = matrix_metadata[i, :num_archi_features] test_label[idx] = matrix_metadata[i, num_archi_features] if data == "PascalPart": for i in range(len(names)): im_name = os.path.join(
"""
Notebook code for shapley values.
"""
import sklearn
# `import sklearn` alone does not guarantee that the `svm` submodule is
# loaded; import it explicitly instead of relying on another import having
# pulled it in as a side effect.
import sklearn.svm
import shap
from sklearn.model_selection import train_test_split

# print the JS visualization code to the notebook
shap.initjs()

# train a SVM classifier
X_train, X_test, Y_train, Y_test = train_test_split(*shap.datasets.iris(), test_size=0.2, random_state=0)
svm = sklearn.svm.SVC(kernel='rbf', probability=True)
svm.fit(X_train, Y_train)

# use Kernel SHAP to explain test set predictions
explainer = shap.KernelExplainer(svm.predict_proba, X_train, link="logit")
shap_values = explainer.shap_values(X_test, nsamples=100)

# plot the SHAP values for the Setosa output of the first instance
shap.force_plot(explainer.expected_value[0], shap_values[0][0, :], X_test.iloc[0, :], link="logit")
st.text('ICE Plot') features = [0, 1] fig, ax = plt.subplots(figsize=(7, 6)) plot_partial_dependence(gscv, X_valid, features, kind='both', target=0, ax=ax) plt.tight_layout() st.pyplot(fig) st.text('Prediction on Test file') df_test['Predicted'] = gscv.predict(df_test) st.write(df_test) st.text('Shapley Explainer') # X_test = df_test.drop('Predicted', axis = 1) explainer = shap.KernelExplainer(gscv.predict_proba, X_valid) shap_values = explainer.shap_values(X_valid.iloc[2,:]) st.pyplot(shap.force_plot(explainer.expected_value[0], shap_values[0], X_valid.iloc[2,:], matplotlib=True, text_rotation=8)) st.text('Shapley Explainer WaterFall Plot') f = lambda x: gscv.predict_proba(x)[:,1] med = X_train.median().values.reshape((1,X_train.shape[1])) explainer = shap.Explainer(f, med) shap_values = explainer(X_train.iloc[0:100,:]) st.pyplot(shap.plots.waterfall(shap_values[2], max_display=7)) st.text('Partial Dependence Plot from pdp_box') pdp_ = pdp.pdp_isolate(model=gscv, dataset=X_valid, model_features=X_valid.columns,
def test_single_tree_nonlinear_transformations():
    """ Check Independent Tree SHAP on a single tree with non-linear transformations.

    For one sample of a single-tree XGBoost classifier this verifies that
    interventional Tree SHAP values plus the expected value reproduce the
    margin output, that they match Kernel SHAP on the margin, and that they
    reproduce the probability output when ``model_output="probability"``.
    The test is skipped (prints a message and returns) when xgboost is not
    installed.
    """

    # Supported non-linear transforms
    # NOTE(review): these helpers are not referenced in the visible body of
    # this test; kept in case a later part of the test relies on them.
    def sigmoid(x):
        return (1 / (1 + np.exp(-x)))

    def log_loss(yt, yp):
        return (-(yt * np.log(yp) + (1 - yt) * np.log(1 - yp)))

    def mse(yt, yp):
        return (np.square(yt - yp))

    try:
        import xgboost
    except ImportError:
        # only a missing dependency is a reason to skip; anything else should surface
        print("Skipping test_single_tree_nonlinear_transformations!")
        return

    np.random.seed(10)

    n = 1000
    X = np.random.normal(size=(n, 7))
    b = np.array([-2, 1, 3, 5, 2, 20, -5])
    y = np.matmul(X, b)
    y = y + abs(min(y))  # shift so the smallest value is 0
    y = np.random.binomial(n=1, p=y / max(y))  # binary labels with p scaled into [0, 1]
    max_depth = 6

    # train a model with single tree
    Xd = xgboost.DMatrix(X, label=y)
    model = xgboost.train(
        {
            'eta': 1,
            'max_depth': max_depth,
            'base_score': y.mean(),
            "lambda": 0,
            "objective": "binary:logistic"
        }, Xd, 1)
    pred = model.predict(Xd, output_margin=True)  # In margin space (log odds)
    trans_pred = model.predict(Xd)  # In probability space

    expl = shap.TreeExplainer(model, X, feature_perturbation="interventional")
    f = lambda inp: model.predict(xgboost.DMatrix(inp), output_margin=True)
    expl_kern = shap.KernelExplainer(f, X)

    x_ind = 0
    x = X[x_ind:x_ind + 1, :]
    itshap = expl.shap_values(x)
    kshap = expl_kern.shap_values(x, nsamples=300)
    assert np.allclose(itshap.sum() + expl.expected_value, pred[x_ind]), \
        "SHAP values don't sum to model output on explaining margin!"
    assert np.allclose(itshap, kshap), \
        "Independent Tree SHAP doesn't match Kernel SHAP on explaining margin!"

    model.set_attr(objective="binary:logistic")
    expl = shap.TreeExplainer(model, X, feature_perturbation="interventional", model_output="probability")
    itshap = expl.shap_values(x)
    assert np.allclose(itshap.sum() + expl.expected_value, trans_pred[x_ind]), \
        "SHAP values don't sum to model output on explaining logistic!"
model.fit(dataframe_train, dataframe_label.astype('float32')) dataframe_train = dataframe_train.to_pandas() dataframe_label = dataframe_label.to_pandas() else: model.fit(dataframe_train.values, dataframe_label.values.ravel()) # ------------------------------------------------------------- # Check if cv score should be calculated for the AutoML workflow # if alg.automl: if alg.type == 'classification': scores = cross_val_score(model, dataframe_train.values, dataframe_label.values.ravel(), cv=int(variables.get("N_SPLITS")), scoring=alg.scoring) loss = 1 - np.mean(scores) if (not alg.name.startswith("TPOT") and not alg.name.startswith("AutoSklearn")): model_explainer = shap.KernelExplainer(model.predict_proba, dataframe_train) # feature importance if alg.type == 'anomaly': scores = cross_val_score(model, dataframe_train.values, dataframe_label.values.ravel(), cv=int(variables.get("N_SPLITS")), scoring=alg.scoring) loss = 1 - np.mean(scores) model_explainer = shap.KernelExplainer(model.predict, dataframe_train) # feature importance if alg.type == 'regression': scores = cross_val_score(model, dataframe_train.values, dataframe_label.values.ravel(), cv=int(variables.get("N_SPLITS")), scoring=alg.scoring) loss = np.abs(np.mean(scores)) if alg.name == 'BayesianRidgeRegression' or alg.name == 'LinearRegression': model_explainer = shap.LinearExplainer(model, dataframe_train) else: if (not alg.name.startswith("TPOT") and not alg.name.startswith("AutoSklearn")): model_explainer = shap.KernelExplainer(model.predict, dataframe_train) # -------------------------------------------------------------
def log_explanation(predict_function, features, artifact_path=None):
    r"""
    Given a ``predict_function`` capable of computing ML model output on the provided ``features``,
    computes and logs explanations of an ML model's output. Explanations are logged as a directory
    of artifacts containing the following items generated by `SHAP`_ (SHapley Additive
    exPlanations).

        - Base values
        - SHAP values (computed using `shap.KernelExplainer`_)
        - Summary bar plot (shows the average impact of each feature on model output)

    :param predict_function:
        A function to compute the output of a model (e.g. ``predict_proba`` method of
        scikit-learn classifiers). Must have the following signature:

        .. code-block:: python

            def predict_function(X) -> pred: ...

        - ``X``: An array-like object whose shape should be (# samples, # features).
        - ``pred``: An array-like object whose shape should be (# samples) for a regressor or
          (# samples, # classes) for a classifier. For a classifier, the values in ``pred``
          should correspond to the predicted probability of each class.

        Acceptable array-like object types:

            - ``numpy.array``
            - ``pandas.DataFrame``
            - ``shap.common.DenseData``
            - ``scipy.sparse matrix``

    :param features:
        A matrix of features to compute SHAP values with. The provided features should
        have shape (# samples, # features), and can be either of the array-like object
        types listed above.

        .. note::
            Background data for `shap.KernelExplainer`_ is generated by subsampling ``features``
            with `shap.kmeans`_. The background data size is limited to 100 rows for performance
            reasons.

    :param artifact_path:
        The run-relative artifact path to which the explanation is saved.
        If unspecified, defaults to "model_explanations_shap".

    :return: Artifact URI of the logged explanations.

    .. _SHAP: https://github.com/slundberg/shap

    .. _shap.KernelExplainer: https://shap.readthedocs.io/en/latest/generated
        /shap.KernelExplainer.html#shap.KernelExplainer

    .. _shap.kmeans: https://github.com/slundberg/shap/blob/v0.36.0/shap/utils/_legacy.py#L9

    .. code-block:: python
        :caption: Example

        import os

        import numpy as np
        import pandas as pd
        from sklearn.datasets import load_boston
        from sklearn.linear_model import LinearRegression

        import mlflow

        # prepare training data
        dataset = load_boston()
        X = pd.DataFrame(dataset.data[:50, :8], columns=dataset.feature_names[:8])
        y = dataset.target[:50]

        # train a model
        model = LinearRegression()
        model.fit(X, y)

        # log an explanation
        with mlflow.start_run() as run:
            mlflow.shap.log_explanation(model.predict, X)

        # list artifacts
        client = mlflow.tracking.MlflowClient()
        artifact_path = "model_explanations_shap"
        artifacts = [x.path for x in client.list_artifacts(run.info.run_id, artifact_path)]
        print("# artifacts:")
        print(artifacts)

        # load back the logged explanation
        dst_path = client.download_artifacts(run.info.run_id, artifact_path)
        base_values = np.load(os.path.join(dst_path, "base_values.npy"))
        shap_values = np.load(os.path.join(dst_path, "shap_values.npy"))

        print("\n# base_values:")
        print(base_values)
        print("\n# shap_values:")
        print(shap_values[:3])

    .. code-block:: text
        :caption: Output

        # artifacts:
        ['model_explanations_shap/base_values.npy',
         'model_explanations_shap/shap_values.npy',
         'model_explanations_shap/summary_bar_plot.png']

        # base_values:
        20.502000000000002

        # shap_values:
        [[ 2.09975523  0.4746513   7.63759026  0.        ]
         [ 2.00883109 -0.18816665 -0.14419184  0.        ]
         [ 2.00891772 -0.18816665 -0.14419184  0.        ]]

    .. figure:: ../_static/images/shap-ui-screenshot.png

        Logged artifacts
    """
    # imported lazily so the optional plotting/explanation packages are only
    # needed when this function is actually called
    import matplotlib.pyplot as plt
    import shap

    artifact_path = _DEFAULT_ARTIFACT_PATH if artifact_path is None else artifact_path

    # summarise the features with k-means; the number of centroids is capped
    # by both the configured maximum and the number of rows available
    background_data = shap.kmeans(
        features, min(_MAXIMUM_BACKGROUND_DATA_SIZE, len(features)))
    explainer = shap.KernelExplainer(predict_function, background_data)
    shap_values = explainer.shap_values(features)

    _log_numpy(explainer.expected_value, _BASE_VALUES_FILE_NAME, artifact_path)
    _log_numpy(shap_values, _SHAP_VALUES_FILE_NAME, artifact_path)

    # show=False keeps the figure open so it can be fetched with plt.gcf()
    shap.summary_plot(shap_values, features, plot_type="bar", show=False)
    fig = plt.gcf()
    try:
        fig.tight_layout()
        _log_matplotlib_figure(fig, _SUMMARY_BAR_PLOT_FILE_NAME, artifact_path)
    finally:
        # release the figure even if layout or logging fails
        plt.close(fig)

    return append_to_uri_path(mlflow.active_run().info.artifact_uri, artifact_path)
def _run_cc_shap_experiment(n_clusters, innocuous_model_factory, unrelated_label, summary_prefix):
    """Run every adversarial-model / explainer combination against one innocuous model.

    :param n_clusters: number of k-means centroids used as the background
        distribution of the "Perturbation" explainer.
    :param innocuous_model_factory: zero-argument callable building the
        innocuous model paired with ``racist_model_f()``.
    :param unrelated_label: text inserted into the printed header
        (e.g. ``"one unrelated feature"``).
    :param summary_prefix: file-name prefix of the CSV summaries written to
        ``../Results/CCShap/``.
    """
    background_distribution = shap.kmeans(xtrain, n_clusters)
    generator_specs = {
        "original_dim": original_dim,
        "intermediate_dim": 8,
        "latent_dim": latent_dim,
        "epochs": 100,
        "dropout": 0.2,
        "experiment": "CC",
        "feature_names": features,
    }

    # Adversarial models
    adv_models = {}
    adv_models["Perturbation"] = Adversarial_Kernel_SHAP_Model(
        racist_model_f(), innocuous_model_factory()
    ).train(xtrain, ytrain, feature_names=features)
    adv_models["DropoutVAE"] = Adversarial_Kernel_SHAP_Model(
        racist_model_f(), innocuous_model_factory(),
        generator="DropoutVAE", generator_specs=generator_specs
    ).train(xtrain, ytrain, feature_names=features, dummy_idcs=dummy_idcs,
            integer_idcs=integer_attributes, n_samples=10 * xtrain.shape[0])
    adv_models["RBF"] = Adversarial_Kernel_SHAP_Model(
        racist_model_f(), innocuous_model_factory(),
        generator="RBF", generator_specs=generator_specs
    ).train(xtrain, ytrain, feature_names=features, dummy_idcs=dummy_idcs,
            integer_idcs=integer_attributes)
    adv_models["Forest"] = Adversarial_Kernel_SHAP_Model(
        racist_model_f(), innocuous_model_factory(),
        generator="Forest", generator_specs=generator_specs
    ).train(xtrain, ytrain, feature_names=features, dummy_idcs=dummy_idcs,
            integer_idcs=integer_attributes)

    for adversarial in ["Perturbation", "DropoutVAE", "RBF", "Forest"]:
        adv_shap = adv_models[adversarial]

        # Explainers
        adv_kernel_explainers = {}
        adv_kernel_explainers["Perturbation"] = shap.KernelExplainer(
            adv_shap.predict, background_distribution)
        adv_kernel_explainers["DropoutVAE"] = shap.KernelExplainer(
            adv_shap.predict, xtrain, generator="DropoutVAE",
            generator_specs=generator_specs, dummy_idcs=dummy_idcs,
            integer_idcs=integer_attributes, instance_multiplier=100)
        adv_kernel_explainers["RBF"] = shap.KernelExplainer(
            adv_shap.predict, xtrain, generator="RBF",
            generator_specs=generator_specs, dummy_idcs=dummy_idcs)
        adv_kernel_explainers["Forest"] = shap.KernelExplainer(
            adv_shap.predict, xtrain, generator="Forest",
            generator_specs=generator_specs, dummy_idcs=dummy_idcs)
        adv_kernel_explainers["ForestFill"] = shap.KernelExplainer(
            adv_shap.predict, xtrain, generator="Forest",
            generator_specs=generator_specs, dummy_idcs=dummy_idcs)

        for explainer in ["Perturbation", "DropoutVAE", "RBF", "Forest", "ForestFill"]:
            adv_kernel_explainer = adv_kernel_explainers[explainer]
            if explainer == "ForestFill":
                # "\\D" spells the same path the original "\D" produced,
                # without relying on an invalid escape sequence.
                explanations = adv_kernel_explainer.shap_values(
                    xtest, fill_data=True,
                    data_location="..\\Data/cc_forest_shap.csv")
            else:
                explanations = adv_kernel_explainer.shap_values(xtest)

            # format for display; only the "Perturbation" explainer is
            # labelled with `features`, the others with `original_names`
            names = features if explainer == "Perturbation" else original_names
            formatted_explanations = [
                [(names[i], value) for i, value in enumerate(exp)]
                for exp in explanations
            ]

            print(
                f"SHAP Ranks and Pct Occurances {unrelated_label}, adversarial: {adversarial}, explainer: {explainer}:"
            )
            summary = experiment_summary(formatted_explanations, names)
            print(summary)
            print("Fidelity:", round(adv_shap.fidelity(xtest), 2))

            file_name = f"../Results/CCShap/{summary_prefix}_adversarial_{adversarial}_explainer_{explainer}.csv"
            # newline="" lets the csv module control line endings (otherwise
            # extra blank rows appear on platforms that translate "\n").
            with open(file_name, "w", newline="") as output:
                w = csv.writer(output)
                for key, val in summary.items():
                    w.writerow([key] + list(val))


def experiment_main():
    """
    Run through experiments for SHAP on CC using both one and two unrelated features.
    * This may take some time given that we iterate through every point in the test set
    * We print out the rate at which features occur in the top three features

    The number of k-means clusters for the background distribution is read
    interactively from standard input after ``s_score`` has been run on the
    candidate cluster counts.
    """
    # Setup SHAP
    # Choose the optimal number of clusters
    candidates = [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 32, 64, 100]
    s_score(xtrain, candidates)
    n_clusters = int(input("Please enter the optimal number of clusters: "))

    ##############################################
    # One unrelated (innocuous_model_psi is used)
    ##############################################
    _run_cc_shap_experiment(n_clusters, innocuous_model_psi,
                            "one unrelated feature", "ccShapSummary")

    ##################################################
    # Two unrelated (innocuous_model_psi_two is used)
    ##################################################
    _run_cc_shap_experiment(n_clusters, innocuous_model_psi_two,
                            "two unrelated features", "ccShapSummary2")

    print('---------------------')
import sklearn
# Import the submodule explicitly: a bare `import sklearn` is not guaranteed to
# expose `sklearn.neighbors` unless some other import loaded it as a side effect.
import sklearn.neighbors
from sklearn.model_selection import train_test_split
import numpy as np
import shap
import time

# Hold out 20% of the iris data; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(*shap.datasets.iris(),
                                                    test_size=0.2,
                                                    random_state=0)

# rather than use the whole training set to estimate expected values, we could summarize with
# a set of weighted kmeans, each weighted by the number of points they represent. But this dataset
# is so small we don't worry about it
#X_train_summary = shap.kmeans(X_train, 50)


def print_accuracy(f):
    """Print the percentage of held-out rows for which ``f(X_test)`` equals ``Y_test``.

    Args:
        f: Callable that takes ``X_test`` and returns one predicted label per row.
    """
    print("Accuracy = {0}%".format(100 * np.sum(f(X_test) == Y_test) /
                                   len(Y_test)))
    time.sleep(0.5)  # to let the print get out before any progress bars


shap.initjs()

# Fit a k-nearest-neighbours classifier and report its held-out accuracy.
knn = sklearn.neighbors.KNeighborsClassifier()
knn.fit(X_train, Y_train)
print_accuracy(knn.predict)

# Explain the class probabilities for the first test row, using the full
# training set as the background data.
explainer = shap.KernelExplainer(knn.predict_proba, X_train)
shap_values = explainer.shap_values(X_test.iloc[0, :])
# Index 0 selects the first model output for both the base value and the
# attributions.
shap.force_plot(explainer.expected_value[0], shap_values[0], X_test.iloc[0, :])
def test_null_model():
    """Check that a model which always outputs zero receives zero attributions.

    The model ignores its input and returns a zero per row, so every value in
    the explanation must be (numerically) zero.
    """
    explainer = shap.KernelExplainer(lambda x: np.zeros(x.shape[0]),
                                     np.ones((2, 10)),
                                     nsamples=100)
    e = explainer.explain(np.ones((1, 10)))
    # Without this assertion the test could never fail on wrong attributions.
    # NOTE(review): assumes `explain` returns a numeric array — confirm against
    # the shap version this test suite targets.
    assert np.sum(np.abs(e)) < 1e-8
# Works for [svc] # If too many examples (pass aux to explainer). aux = shap.sample(X_train, 100) # Set generic kernel explainer explainer = shap.KernelExplainer(predict_proba, aux) """ # Sample to speed up processing. sample = shap.sample(X_train, 100) if isinstance(clf, XGBClassifier): # Works for [llr, dtc, etc, xgb] explainer = shap.Explainer(clf, sample) else: # Works for all but [xgb] explainer = shap.KernelExplainer(predict_proba, sample) # Show kernel type print("Kernel type: %s" % type(explainer)) # Get shap values #shap_values = explainer(X) shap_values = explainer.shap_values(X_train) # Show information print("base value: %s" % \ explainer.expected_value) #print("shap_values: %s" % \ # str(shap_values.shape)) # Summary plot
# X_train = X_train.sample(X_train.shape[0]//160, random_state=2020)
# X_test = X_train.sample(X_test.shape[0]//150, random_state=2020)

# Work on plain float arrays from here on.
X_train = np.array(X_train, dtype=float)
X_test = np.array(X_test, dtype=float)

# Persist the inputs first, then the explainer and the SHAP values it produces.
for pkl_path, payload in (('output/X_train.pkl', X_train),
                          ('output/X_test.pkl', X_test)):
    with open(pkl_path, 'wb') as out_file:
        pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# Kernel SHAP over `find_cluster_matrix`, with the whole training array as
# background data and the test array as the rows to explain.
explainer = shap.KernelExplainer(find_cluster_matrix, X_train)
shap_values = explainer.shap_values(X_test)

for pkl_path, payload in (('output/explainer.pkl', explainer),
                          ('output/shap_values.pkl', shap_values)):
    with open(pkl_path, 'wb') as out_file:
        pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# shap_importance = pd.DataFrame(
#     {"feature_name": list(feature_cols), "shap_value": np.abs(shap_values).sum(axis=0)}).sort_values("shap_value",
#                                                                                                       ascending=False)
# shap_importance["shap_value"] = shap_importance["shap_value"] * 100 / max(shap_importance["shap_value"])
#
# shap_importance.to_csv("output/shap_variable_full.csv")
#
def upload(): print('eer 0', request.form) dropdown_selection = str(request.form) dropdown_selection = dropdown_selection.split() print(dropdown_selection) model_type = dropdown_selection[3] dropdown_selection = dropdown_selection[1] print('model type ji ', model_type) print(dropdown_selection, " nuna bhai") global id_name target = 'images/' print('tt', target) if not os.path.isdir(target): os.mkdir(target) global ff ff = [] for file in request.files.getlist("file"): print(file) filename = file.filename destination = "/".join([target, filename]) print('des', destination) file.save(destination) ff.append(destination) mypath = os.getcwd() onlyfiles = [ os.path.join(mypath, f) for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f)) ] print('raJA ', ff) import warnings warnings.filterwarnings("ignore") with open(ff[0], 'rb') as file: model = pickle.load(file) with open(ff[1], 'rb') as file: X_data = pickle.load(file) with open(ff[2], 'rb') as file: y_data = pickle.load(file) if 'GL' in dropdown_selection: if 'RR' in model_type: PI = permutation_importance(model, X_data, y_data) row_to_show = 5 data_for_prediction = X_data.iloc[row_to_show] explainer = shap.Explainer(model, X_data, feature_names=X_data.columns) shap_values = explainer.shap_values(X_data) shap.summary_plot(shap_values, X_data) import matplotlib.pyplot as pl pl.savefig('static/img/new_plot.png') pl.close() ICE = ind_cond_exp(model, X_data, y_data) #global surgat from sklearn.tree import DecisionTreeRegressor from sklearn.tree import plot_tree predictions = model.predict(X_data) dt = DecisionTreeRegressor(random_state=100, max_depth=3) # We fit the shallow tree to the matrix X and the predictions of the random forest model dt.fit(X_data, predictions) fig, ax = plt.subplots(figsize=(20, 10)) plot_tree(dt, feature_names=list(X_data.columns), precision=3, filled=True, fontsize=12, impurity=True) pl.savefig('static/img/new2_plot.png') pl.close() return 
render_template('model_explanation_result.html', PI=PI, ICE=ICE, SH="static/img/new_plot.png", SM="static/img/new2_plot.png") if 'RF' in model_type: PI = permutation_importance(model, X_data, y_data) explainer = shap.TreeExplainer(model, X_data, feature_names=X_data.columns) shap_values = explainer.shap_values(X_data) shap.summary_plot(shap_values, X_data) import matplotlib.pyplot as pl pl.savefig('static/img/new_plot.png') pl.close() ICE = ind_cond_exp(model, X_data, y_data) #global surgat from sklearn.tree import DecisionTreeRegressor from sklearn.tree import plot_tree predictions = model.predict(X_data) dt = DecisionTreeRegressor(random_state=100, max_depth=3) # We fit the shallow tree to the matrix X and the predictions of the random forest model dt.fit(X_data, predictions) fig, ax = plt.subplots(figsize=(20, 10)) plot_tree(dt, feature_names=list(X_data.columns), precision=3, filled=True, fontsize=12, impurity=True) pl.savefig('static/img/new2_plot.png') pl.close() return render_template('model_explanation_result.html', PI=PI, ICE=ICE, SH="static/img/new_plot.png", SM="static/img/new2_plot.png") if 'CC' in model_type: PI = permutation_importance(model, X_data, y_data) explainer = shap.KernelExplainer(model.predict_proba, X_data) shap_values = explainer.shap_values(X_data) shap.summary_plot(shap_values, X_data) import matplotlib.pyplot as pl pl.savefig('static/img/new_plot.png') pl.close() #ICE = ind_cond_exp(model,X_data,y_data) #global surgat from sklearn.tree import DecisionTreeRegressor from sklearn.tree import plot_tree predictions = model.predict(X_data) return render_template( 'model_explanation_result_classification.html', PI=PI, SH="static/img/new_plot.png") if 'WI' in dropdown_selection: # print(res," resss") # import dash from dash.dependencies import Input, Output import dash_table import dash_core_components as dcc import dash_html_components as html app = dash.Dash(__name__) import pandas as pd #should be X data mean_list = [] features = 
X_data.columns.tolist() for i in features: mean_list.append(round(X_data[i].mean())) explainer = shap.TreeExplainer(model) shap.initjs() params = features id_name_str = "my_graph" + str(id_name) print('---------------', id_name_str) id_name = id_name + 1 what_plot.layout = html.Div([ dash_table.DataTable( id='table-editing-simple', columns=([{ 'id': 'Model', 'name': 'Model' }] + [{ 'id': p, 'name': p } for p in params]), data=[ dict(zip(features, mean_list)) #dict(Model=i, **{param: mean_list[i] for param in params}) # for i in range(0, len(mean_list)) ], editable=True), html.Div(id=id_name_str) ]) @what_plot.callback(Output(id_name_str, "children"), Input('table-editing-simple', 'data'), Input('table-editing-simple', 'columns')) def update_graphs(rows, columns): df = pd.DataFrame(rows, columns=[c['name'] for c in columns]) print(rows) # rows = rows[0] col = [] vvalue = [] for key in rows: print(key, '->', int(rows[key])) col.append(key) vvalue.append([int(rows[key])]) ik = dict(zip(col, vvalue)) instance = pd.DataFrame.from_dict(ik) print('instancceee ', instance) from shap.plots._force_matplotlib import draw_additive_plot # explain the model's predictions using SHAP values(same syntax works for LightGBM, CatBoost, and scikit-learn models) #explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(instance) shap.initjs() #plt.style.use("_classic_test_patch") ytu = model.predict(instance) print('ress ', ytu) koko = _force_plot_html2(explainer.expected_value, shap_values, instance) #print('kkkk ',koko) print('Done') return koko # return render_template('local_explain_lime.html', LL=what_plot.index()) if 'LL' in dropdown_selection: None #table and plots ======================================================== import dash from dash.dependencies import Input, Output import dash_table import dash_core_components as dcc import dash_html_components as html import pandas as pd id_name_str = "my_graph" + str(id_name) print('---------------', id_name_str) 
id_name = id_name + 1 print('in LL') # make graph=============================================================== table_plot.layout = html.Div([ dash_table.DataTable( id='datatable-interactivity', columns=[{ "name": i, "id": i, "deletable": True, "selectable": True } for i in X_data.columns], data=X_data.to_dict('records'), editable=True, filter_action="native", sort_action="native", sort_mode="multi", column_selectable="single", row_selectable="single", row_deletable=True, selected_columns=[], selected_rows=[], page_action="native", page_current=0, page_size=10, ), html.Div(id=id_name_str) ]) print('miod LL') @table_plot.callback(Output(id_name_str, "children"), Input('datatable-interactivity', "derived_virtual_data"), Input('datatable-interactivity', "derived_virtual_selected_rows")) def update_graphs(rows, derived_virtual_selected_rows): # When the table is first rendered, `derived_virtual_data` and # `derived_virtual_selected_rows` will be `None`. This is due to an # idiosyncrasy in Dash (unsupplied properties are always None and Dash # calls the dependent callbacks when the component is first rendered). # So, if `rows` is `None`, then the component was just rendered # and its value will be the same as the component's dataframe. # Instead of setting `None` in here, you could also set # `derived_virtual_data=df.to_rows('dict')` when you initialize # the component. 
if derived_virtual_selected_rows is None: derived_virtual_selected_rows = [] dff = X_data if rows is None else pd.DataFrame(rows) colors = [ '#7FDBFF' if i in derived_virtual_selected_rows else '#0074D9' for i in range(len(dff)) ] print('my value', derived_virtual_selected_rows) print('i am row ', X_data.iloc[derived_virtual_selected_rows]) print(type(derived_virtual_selected_rows)) from shap.plots._force_matplotlib import draw_additive_plot ttt = X_data.loc[derived_virtual_selected_rows] # explain the model's predictions using SHAP values(same syntax works for LightGBM, CatBoost, and scikit-learn models) explainer = shap.TreeExplainer(model) shap_values = explainer.shap_values(ttt) shap.initjs() plt.style.use("_classic_test_patch") bubu = _force_plot_html(explainer.expected_value, shap_values, ttt) shap_values = explainer.shap_values(X_data) #shap.force_plot(explainer.expected_value, shap_values, X_data) explain_all = _force_plot_html(explainer.expected_value, shap_values, X_data) print('bubu ', bubu) return bubu, explain_all return render_template('local_explain_lime.html', LL=table_plot.index()) if 'BD' in dropdown_selection: None #FI if 'DB' in dropdown_selection: # if 'CC' in model_type: # from explainerdashboard import ClassifierExplainer, ExplainerDashboard # ExplainerDashboard(ClassifierExplainer(model, X_data, y_data)).run() if 'RF' in model_type: import threading import time def dashboard_exp(model, X_data, y_data): import dash_bootstrap_components as dbc from explainerdashboard import RegressionExplainer, ExplainerDashboard ExplainerDashboard( RegressionExplainer(model, X_data, y_data), bootstrap=dbc.themes.SANDSTONE, importances=True, model_summary=False, contributions=True, whatif=True, shap_dependence=False, shap_interaction=False, decision_trees=False, hide_whatifindexselector=True, hide_whatifprediction=True, hide_inputeditor=False, hide_whatifcontributiongraph=False, hide_whatifcontributiontable=True, hide_whatifpdp=False, 
hide_predindexselector=True, hide_predictionsummary=True, hide_contributiongraph=False, hide_pdp=False, hide_contributiontable=True, hide_dropna=True, hide_range=True, hide_depth=True, hide_sort=True, hide_sample=True, # hide sample size input on pdp component hide_gridlines=True, # hide gridlines on pdp component hide_gridpoints=True, hide_cats_sort= True, # hide the sorting option for categorical features hide_cutoff= True, # hide cutoff selector on classification components hide_percentage= True, # hide percentage toggle on classificaiton components hide_log_x= True, # hide x-axis logs toggle on regression plots hide_log_y= True, # hide y-axis logs toggle on regression plots hide_ratio=True, # hide the residuals type dropdown hide_points= True, # hide the show violin scatter markers toggle hide_winsor=True, # hide the winsorize input hide_wizard= True, # hide the wizard toggle in lift curve component hide_star_explanation=True, ).run() t1 = threading.Thread(target=dashboard_exp, args=(model, X_data, y_data)) t1.start() return '''<H2>
from sklearn.ensemble import RandomForestRegressor

# The target variable is 'quality'.
Y = df['quality']
X = df[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
        'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
        'pH', 'sulphates', 'alcohol']]

# Split the data into train and test data:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# Build the model with the random forest regression algorithm:
model = RandomForestRegressor(max_depth=6, random_state=0, n_estimators=10)
model.fit(X_train, Y_train)

# `rf` is configured and trained exactly like `model`; both names are kept
# because code outside this section may refer to either.
rf = RandomForestRegressor(max_depth=6, random_state=0, n_estimators=10)
rf.fit(X_train, Y_train)

importances = rf.feature_importances_
print(importances)
indices = np.argsort(importances)
features = X_train.columns
# plt.title('Feature Importances')
# plt.barh(range(len(indices)), importances[indices], color='b', align='center')
# plt.yticks(range(len(indices)), [features[i] for i in indices])
# plt.xlabel('Relative Importance')
# plt.show()

import shap

# Build the explainer first, then compute the SHAP values from it. The plotting
# call below needs the values, not the explainer object.
rf_explainer = shap.KernelExplainer(rf.predict, X_test)
rf_shap_values = rf_explainer.shap_values(X_test)
# shap.summary_plot(rf_shap_values, X_test)
shap.dependence_plot("alcohol", rf_shap_values, X_test)
# plot the SHAP values for the 10th observation
# shap.force_plot(rf_explainer.expected_value, rf_shap_values[10,:], X_test.iloc[10,:])
out[i][segmentation == j, :] = background return out def f(z): # print("Call") # for i in z: # print (i) # print(img_orig.shape) return model.predict(mask_image(z, segments_slic, img_orig, 250)) # In[ ]: # use Kernel SHAP to explain the network's predictions explainer = shap.KernelExplainer(f, np.zeros((1, 50))) shap_values = explainer.shap_values(np.ones((1, 50)), nsamples=1000) # runs VGG16 1000 times # In[ ]: list(test_image_gen.class_indices.keys()) # In[ ]: # get the top predictions from the model preds = model.predict(np.expand_dims(img_orig.copy(), axis=0)) top_preds = np.argsort(-preds) # In[ ]:
best_xgb_model = xgboost.XGBRegressor(colsample_bytree=0.4,
                                      gamma=0,
                                      learning_rate=0.07,
                                      max_depth=3,
                                      min_child_weight=1.5,
                                      n_estimators=10000,
                                      reg_alpha=0.75,
                                      reg_lambda=0.45,
                                      subsample=0.6,
                                      seed=42)
best_xgb_model.fit(trainX, trainY)

# Predict once and reuse the result for every metric instead of running the
# 10,000-estimator model over the validation set three times.
val_predictions = best_xgb_model.predict(valX)

print("\n xgboost: \n")
print('MAE: \n',
      mean_absolute_error(valY, val_predictions, multioutput='raw_values'))
# The mean squared error
print('MSE: %.5f' % mean_squared_error(valY, val_predictions))
# The coefficient of determination: 1 is perfect prediction
print('R-Sqrd: %.2f' % r2_score(valY, val_predictions))

import shap

shap.initjs()
# NOTE(review): this explains `br.predict`, not `best_xgb_model`, although it
# follows the xgboost evaluation — confirm that `br` is the intended model.
explainer = shap.KernelExplainer(br.predict, trainX)
shap_values = explainer.shap_values(valX, nsamples=5)
shap.summary_plot(shap_values, valX, plot_type="bar")
relevant_listing = ', '.join(relevant_columns)
print(f"Relevant columns are: {relevant_listing}")
print('Classification summary:')
print(classification_report(y, est.predict(X)))
# print('Classification summary:')
# print(classification_report(y, est.predict(X)))

# The base classifier only uses age, avg_glucose_level, bmi.
# Look for explanations in which variables other than these three appear.
# Use X and y as the data (the column names are in X_features); est is the
# final classifier.
from sklearn.metrics import classification_report
import numpy as np
#import dalex as dx
import shap

i = 99
class_names = ['healthy', 'stroke']
#print(X[i].mean)

# Convert the data and the feature names to NumPy arrays.
X = np.array(X)
X_features = np.array(X_features)

# SHAP explainer
#explainer = shap.LinearExplainer(est, X)
explainer = shap.KernelExplainer(est.predict, X)
shap_values = explainer.shap_values(X)

import pickle

# Write the explainer and its SHAP values to disk, then read them straight back.
with open('stroke-shap.pkl', 'wb') as pkl_out:
    pickle.dump([explainer, shap_values], pkl_out)

with open('stroke-shap.pkl', 'rb') as pkl_in:
    explainer, shap_values = pickle.load(pkl_in)
def _compute_shap_values(pipeline, features, training_data=None):
    """Compute SHAP values for every feature of a fitted pipeline's input.

    Tree-based estimators are explained with ``shap.TreeExplainer``; all other
    estimators are explained with ``shap.KernelExplainer`` against a sample of
    the training data.

    Arguments:
        pipeline (PipelineBase): Trained pipeline whose predictions are explained.
        features (pd.DataFrame): Features to explain; must correspond to the data
            the pipeline was fit on.
        training_data (pd.DataFrame): Data the pipeline was fit on. Required for
            non-tree estimators, where it supplies the KernelSHAP background sample.

    Returns:
        dict or list(dict): A dictionary mapping feature names to SHAP values when
            SHAP returns a single array (regression), or a list of such
            dictionaries, one per class, when SHAP returns a list (classification).

    Raises:
        ValueError: If the pipeline is a baseline pipeline, if ``training_data`` is
            missing for a non-tree estimator, or if SHAP returns an unexpected type.
        NotImplementedError: For xgboost estimators and for catboost estimators on
            multiclass problems.
    """
    estimator = pipeline.estimator
    family = estimator.model_family
    if family == ModelFamily.BASELINE:
        raise ValueError(
            "You passed in a baseline pipeline. These are simple enough that SHAP values are not needed."
        )

    feature_names = features.columns

    # SHAP algorithms complain about non-numeric dtypes, so coerce everything here.
    # Sklearn components do the same under the hood, so the data the model sees is unchanged.
    # Catboost handles string-encoded categoricals natively and is left untouched.
    if family != ModelFamily.CATBOOST:
        features = check_array(features.values)

    if not family.is_tree_estimator():
        if training_data is None:
            raise ValueError(
                "You must pass in a value for parameter 'training_data' when the pipeline "
                "does not have a tree-based estimator. "
                f"Current estimator model family is {family}.")

        # More than 100 datapoints can negatively impact runtime according to SHAP
        # https://github.com/slundberg/shap/blob/master/shap/explainers/kernel.py#L114
        background_sample = shap.sample(training_data, 100)
        background = pipeline.compute_estimator_features(
            background_sample).to_dataframe()
        background = check_array(background)

        # Regression explains raw predictions; otherwise explain probabilities in logit space.
        component = estimator._component_obj
        if pipeline.problem_type == ProblemTypes.REGRESSION:
            link_function, decision_function = "identity", component.predict
        else:
            link_function, decision_function = "logit", component.predict_proba

        with warnings.catch_warnings(record=True) as caught:
            explainer = shap.KernelExplainer(decision_function, background,
                                             link_function)
            shap_values = explainer.shap_values(features)
        if caught:
            logger.debug(
                f"_compute_shap_values KernelExplainer: {caught[0].message}")
    else:
        # Because of this issue: https://github.com/slundberg/shap/issues/1215
        if family == ModelFamily.XGBOOST:
            raise NotImplementedError(
                "SHAP values cannot currently be computed for xgboost models.")
        is_catboost = family == ModelFamily.CATBOOST
        if is_catboost and pipeline.problem_type == ProblemTypes.MULTICLASS:
            # Will randomly segfault
            raise NotImplementedError(
                "SHAP values cannot currently be computed for catboost models for multiclass problems."
            )

        # Use tree_path_dependent to avoid linear runtime with dataset size
        with warnings.catch_warnings(record=True) as caught:
            explainer = shap.TreeExplainer(
                estimator._component_obj,
                feature_perturbation="tree_path_dependent")
        if caught:
            logger.debug(
                f"_compute_shap_values TreeExplainer: {caught[0].message}")
        shap_values = explainer.shap_values(features, check_additivity=False)

        # For Catboost binary estimators shap only returns values for the positive class.
        # Prepend zeros for the negative class so the output matches other binary estimators;
        # this is fine because the negative class gets dropped in the UI anyway.
        if is_catboost and pipeline.problem_type == ProblemTypes.BINARY:
            shap_values = [np.zeros(shap_values.shape), shap_values]

    # A list means one array per class (classification).
    if isinstance(shap_values, list):
        return [
            _create_dictionary(per_class_values, feature_names)
            for per_class_values in shap_values
        ]
    # A single array means regression.
    if isinstance(shap_values, np.ndarray):
        return _create_dictionary(shap_values, feature_names)
    raise ValueError(
        f"Unknown shap_values datatype {str(type(shap_values))}!")
np.zeros(shape=(predictions.shape[0], 2))) class_probabilities.iloc[:, POSITIVE_CLASS_PROB_INDEX] = predictions class_probabilities.iloc[:, NEGATIVE_CLASS_PROB_INDEX] = pd.DataFrame(predictions).apply(lambda x: 1 - x) return class_probabilities # use SHAP KernelExplainer to explain test set predictions USE_MATPLOTLIB = True # False requires IPython and likely additional setup TRAIN_SUMMARY_N_SAMPLES = 100 MULTISAMPLE_PLOTS_N_TEST_SAMPLES = 100 # provided for IPython / Jupyter consistency if not USE_MATPLOTLIB: shap.initjs() X_train_samples = shap.sample(X_train, nsamples=TRAIN_SUMMARY_N_SAMPLES) # summarize the X_train with a total of 100 samples explainer = shap.KernelExplainer(predict_proba, X_train_samples, link="logit") # find first true positive and true negative samples Y_test_proba = predict_proba(X_test) tp_index = -1 tn_index = -1 for i in range(len(Y_test_proba)): if Y_test_proba.iloc[i, POSITIVE_CLASS_PROB_INDEX] >= 0.5 and Y_test.iloc[i] == "yes": tp_index = i if Y_test_proba.iloc[i, POSITIVE_CLASS_PROB_INDEX] < 0.5 and Y_test.iloc[i] == "no": tn_index = i if tp_index != -1 and tn_index != -1: break # force_plot for single TP sample, if it exists if tp_index != -1:
def explain(self,
            test_df,
            row_index=None,
            row_num=None,
            class_id=None,
            background_size=50,
            nsamples=500):
    """
    Explain the prediction of an example using SHAP.

    Args:
      test_df(pd.DataFrame): a pd.DataFrame of test data in the same format as the original training data DataFrame.
                        The DataFrame does NOT need to contain all the original label columns
                        (e.g., the Survived column in Kaggle's Titanic dataset) but MUST contain
                        all the original predictor columns (e.g., un-normalized numerical variables, categorical
                        variables as strings).
      row_index(int): index of row in DataFrame to explain (e.g., PassengerID in Titanic dataset).
                      mutually-exclusive with row_num
      row_num(int): raw row number in DataFrame to explain (i.e., 0=first row, 1=second row, etc.)
                    mutually-exclusive with row_index
      class_id(int): Only required for classification
      background_size(int): size of background data (SHAP parameter)
      nsamples(int): number of samples (SHAP parameter)

    Returns:
      None. The explanation is rendered as a matplotlib force plot. If shap is
      not installed, a warning is issued and nothing is plotted.

    Raises:
      ValueError: if class_id is missing for a classification model, if both
                  row_index and row_num are supplied, or if neither is supplied.
    """
    try:
        import shap
    except ImportError:
        msg = 'TabularPredictor.explain requires shap library. Please install with: pip install shap. '+\
              'Conda users should use this command instead: conda install -c conda-forge shap'
        warnings.warn(msg)
        return

    classification, _ = U.is_classifier(self.model)
    if classification and class_id is None:
        raise ValueError('For classification models, please supply the class_id of the class you would like to explain.' + \
                         'It should be an index into the list returned by predictor.get_classes().')

    f = self._predict_shap

    # prune dataframe
    df_display = test_df.copy()
    df_display = df_display[self.preproc.pc]

    # add synthetic labels
    for lab in self.preproc.lc:
        df_display[lab] = np.zeros(df_display.shape[0], dtype=int)

    # convert DataFrame to TabularDataset with processed/normalized independent variables
    tabseq = self.preproc.preprocess_test(df_display, verbose=0)
    tabseq.batch_size = df_display.shape[0]
    df = pd.DataFrame(data=np.concatenate(tabseq[0][0], axis=1),
                      columns=tabseq.cat_columns + tabseq.cont_columns,
                      index=df_display.index)

    # add new auto-engineered feature columns (copied as one multi-column assignment)
    engineered_cols = self.preproc.na_names + self.preproc.date_names
    df_display[engineered_cols] = df[engineered_cols]

    # sort display df correctly
    df_display = df_display[tabseq.cat_columns + tabseq.cont_columns]

    # select row
    if row_num is not None and row_index is not None:
        raise ValueError(
            'row_num and row_index are mutually exclusive with eachother.')
    if row_num is None and row_index is None:
        # Fail with a clear message rather than an unbound-variable error later.
        raise ValueError(
            'Either row_num or row_index must be supplied to select the row to explain.'
        )
    if row_index is not None:
        df_row = df[df.index.isin([row_index])].iloc[0, :]
        df_display_row = df_display[df_display.index.isin([row_index
                                                           ])].iloc[0, :]
        r_key = 'row_index' if df.index.name is None else df.index.name
        r_val = row_index
    else:
        df_row = df.iloc[row_num, :]
        df_display_row = df_display.iloc[row_num, :]
        r_key = 'row_num'
        r_val = row_num

    # shap: the first `background_size` processed rows serve as background data
    explainer = shap.KernelExplainer(f, df.iloc[:background_size, :])
    shap_values = explainer.shap_values(df_row,
                                        nsamples=nsamples,
                                        l1_reg='aic')

    # When SHAP reports one entry per output, keep the one for the requested
    # class (or the first when no class_id was given).
    output_idx = 0 if class_id is None else class_id
    expected_value = explainer.expected_value
    if not np.issubdtype(type(explainer.expected_value), np.floating):
        expected_value = explainer.expected_value[output_idx]
    if isinstance(shap_values, list):
        shap_values = shap_values[output_idx]

    if classification:
        print('Explanation for class = %s (%s=%s): ' %
              (self.get_classes()[class_id], r_key, r_val))
    plt.show(
        shap.force_plot(expected_value,
                        shap_values,
                        df_display_row,
                        matplotlib=True))