Example #1
 def __init__(self, model, data, feature_names, mode, algorithm='tree'):
     self.model = model
     self.data = np.vstack(np.array(data)).astype(float)  # np.float was removed in NumPy 1.24
     self.feature_names = feature_names
     self.algorithm = algorithm
     self.mode = mode
     if len(self.data) > 2000:
         self.data = shap.sample(self.data, 2000)
     # use at most 10 samples when estimating SHAP values
     self.nsamples = min(len(self.data), 10)
     if algorithm == 'tree':
         self.explainer = shap.TreeExplainer(
             model,
             data=shap.sample(self.data, 50),
             feature_perturbation='interventional')
         self.shap_values = self.explainer.shap_values(
             self.data, check_additivity=False)
     else:
         if self.mode == 'classification':
             self.explainer = shap.KernelExplainer(self.model.predict_proba,
                                                   data=shap.sample(
                                                       self.data,
                                                       self.nsamples),
                                                   link="logit")
         elif self.mode == 'regression':
             self.explainer = shap.KernelExplainer(self.model.predict,
                                                   data=shap.sample(
                                                       self.data,
                                                       self.nsamples))
         self.shap_values = self.explainer.shap_values(
             self.data, check_additivity=False, nsamples=self.nsamples)
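For context, a minimal self-contained sketch of the same background-sampling pattern (the dataset and model below are illustrative placeholders, not taken from the example above):

import shap
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=5000, n_features=8, random_state=0)
model = LogisticRegression(max_iter=1000).fit(X, y)

# cap the background set: KernelExplainer's cost grows with its size
background = shap.sample(X, 100, random_state=0)
explainer = shap.KernelExplainer(model.predict_proba, background, link="logit")
shap_values = explainer.shap_values(X[:10], nsamples=200)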
Example #2
def explain_model(model, train_data, test_data, samples):
    """
    Function that computes and displays SHAP model explanations
    """
    model_name = type(model).__name__
    random.seed(13)
    samples_to_explain = samples
    if model_name not in ["RandomForestClassifier", "XGBClassifier"]:
        explainer = shap.KernelExplainer(model.predict_proba,
                                         train_data[:50],
                                         link="identity")
        shap_values = explainer.shap_values(train_data[:50],
                                            nsamples=200,
                                            l1_reg="num_features(100)")

    else:
        explainer = shap.TreeExplainer(model,
                                       data=shap.sample(
                                           train_data, samples_to_explain),
                                       feature_perturbation='interventional')
        shap_values = explainer.shap_values(shap.sample(
            train_data, samples_to_explain),
                                            check_additivity=False)

    # summary_plot draws on the current matplotlib figure and returns None,
    # so grab the figure handle explicitly before returning it
    shap.summary_plot(shap_values, test_data, max_display=5, show=False)
    fig = plt.gcf()
    return fig
Example #3
    def data_for_shap(self, input_data):
        if is_classification(self.model):
            explainer, pred, pred_fcn = self.shap_explainer()
            if isinstance(explainer, shap.explainers._tree.Tree):
                global_shap_values = explainer.shap_values(input_data)
                data_with_shap = self.append_shap_values_to_df(input_sv=global_shap_values[0],
                                                               in_data=input_data.copy(),
                                                               scope="local")
                prediction = pred(input_data)
                probabilities = pred_fcn(input_data)

                data_with_shap['Model Decision'] = prediction[0]
                #data_with_shap['True Values'] = self.actual_data

                for i in range(len(np.unique(self.actual_data))):
                    data_with_shap['Probability: {}'.format(np.unique(self.actual_data)[i])] = probabilities[:, i][0]
                return data_with_shap
            else:
                # draw the subsample once and reuse it so the SHAP values,
                # predictions and dataframe rows all refer to the same records
                sampled_data = shap.sample(input_data, 100)
                global_shap_values = explainer.shap_values(sampled_data)
                data_with_shap = self.append_shap_values_to_df(input_sv=global_shap_values[0],
                                                               in_data=sampled_data.copy(),
                                                               scope='local')
                prediction = pred(sampled_data)
                probabilities = pred_fcn(sampled_data)
                data_with_shap['Model Decision'] = prediction[0]
                #data_with_shap['True Values'] = self.actual_data

                for i in range(len(np.unique(self.actual_data))):
                    data_with_shap['Probability: {}'.format(np.unique(self.actual_data)[i])] = probabilities[:, i][0]

                return data_with_shap
        else:
            explainer, pred = self.shap_explainer()
            if isinstance(explainer, shap.explainers._tree.Tree):
                global_shap_values = explainer.shap_values(input_data)
                data_with_shap = self.append_shap_values_to_df(input_sv=global_shap_values,
                                                               in_data=input_data.copy(),
                                                               scope="local")
                data_with_shap['Model Decision'] = pred(input_data)
                #data_with_shap['True Values'] = self.actual_data

                return data_with_shap

            else:
                sampled_data = shap.sample(input_data, 100)
                global_shap_values = explainer.shap_values(sampled_data)
                data_with_shap = self.append_shap_values_to_df(input_sv=global_shap_values,
                                                               in_data=sampled_data.copy(),
                                                               scope="local")
                data_with_shap['Model Decision'] = pred(sampled_data)
                #data_with_shap['True Values'] = self.actual_data

                return data_with_shap
Example #4
def shap_compute(X, model, model_name):
    if model_name == 'xgboost':
        explainer = shap.TreeExplainer(model)
    else:
        # KernelExplainer is model-agnostic but slow, so explain only a 100-row sample
        explainer = shap.KernelExplainer(model.predict, X)
        X = shap.sample(X, 100)
    shap_values = explainer.shap_values(X)
    # fig1=shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:])
    # mean absolute SHAP value per feature as a global importance score
    shap_mean = np.mean(np.abs(shap_values), axis=0)
    shap_mean_df = pd.DataFrame({
        'columns_name': X.columns,
        'shap_mean_values': shap_mean
    }).sort_values(by='shap_mean_values').reset_index(drop=True)
    index = np.arange(len(X.columns))
    return index, shap_mean_df
Example #5
def shap_select(model,
                X_train,
                X_test,
                feature_names,
                task='classification',
                agnostic=False):
    """
    Return the feature ordering of a multidimensional dataset based on feature importance.
    Importance is computed from SHAP values, which take the fitted model into account.

    :param model: a fitted model
    :param X_train: training data
    :param X_test: test data
    :param feature_names: feature names
    :param task: learning task type (default 'classification')
    :param agnostic: if True, use the model-agnostic KernelExplainer instead of
        a TreeExplainer
    :return: Ordered feature names based on the importance computed using SHAP values
    """

    explainer = None

    if not agnostic:
        explainer = shap.TreeExplainer(model)
    else:
        background = None
        if len(X_train) < 500:
            background = X_train
        else:
            background = shap.sample(X_train, int(len(X_train) * 0.05))
        explainer = shap.KernelExplainer(model.predict_proba, background)

    shap_values = explainer.shap_values(X_test)
    ordering = _shap_ordering(feature_names, shap_values)

    return ordering
Example #6
    def create_blurred_baseline(self, X, sigma, iterations=1000):
        shuffled_gaussian_df = pd.DataFrame().reindex_like(X).fillna(0)
        features_to_shuffle = list(X.columns)
        df_to_shuffle = X.copy(deep=True)
        permutations = []
        for i in range(iterations):
            gaussian_filter_df = pd.DataFrame(gaussian_filter(df_to_shuffle,
                                                              sigma=sigma),
                                              columns=features_to_shuffle)
            for feature in features_to_shuffle:
                shuffled_gaussian_df[feature] += gaussian_filter_df[feature]

            permutations.append(features_to_shuffle[:])
            random.shuffle(features_to_shuffle)

            # reshuffle (a bounded number of times) until the column order has
            # not been used before; with few columns every permutation may
            # already be taken, so give up after 100 attempts
            for _ in range(100):
                if features_to_shuffle not in permutations:
                    break
                random.shuffle(features_to_shuffle)

            df_to_shuffle = df_to_shuffle[features_to_shuffle]

        shuffled_gaussian_df = shuffled_gaussian_df.div(iterations)
        return shap.sample(np.asarray(shuffled_gaussian_df),
                           self.shap_sample_size)
Example #7
 def shap_collective(self):
     shap.initjs()
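     # initjs() loads the JS visualization library so the interactive
     # force_plot below renders inside a notebook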
     z = shap.sample(self.X_test, nsamples=100)
     explainer = shap.KernelExplainer(self.cat.predict, z)
     k_shap_values = explainer.shap_values(self.X_test)
     return shap.force_plot(explainer.expected_value, k_shap_values,
                            self.X_test)
Example #8
    def avatar_q_model(
        self,
        X_train,
        X_test,
        l1_reg="num_features(10)",
        check_additivity=False,
        n_samples=20,
        silent=True,
    ):
        assert shap is not None, "SHAP not found, so cannot do anything here."

        # Extract function to explain
        m = self.q_model
        f = self._extract_function_to_explain(self.q_model)

        # Data
        assert (
            X_train.shape[1] == X_test.shape[1]
        ), "Inconsistent attribute count. Your carelessness is disappointing."
        if X_train.shape[1] != len(m.desc_ids):
            attribute_filter = m.desc_ids
            X_train = X_train[:, attribute_filter]
            X_test = X_test[:, attribute_filter]

        explainer = shap.KernelExplainer(f, shap.sample(X_train, n_samples))
        raw_shaps = explainer.shap_values(
            X_test, l1_reg=l1_reg, check_additivity=check_additivity, silent=silent
        )

        # Process Shap values
        abs_shaps = self._raw_to_abs_shaps(raw_shaps)
        nrm_shaps = self._abs_to_nrm_shaps(abs_shaps)

        return nrm_shaps
Example #9
def shap_multiprocessing(patient, model, X_train, y_train, X_test, y_test):
    shap_list = []
    print(patient)
    # training
    model_list={}
    #model_list[patient]=clone(model_pat)
    #model_list[patient].fit(X_train, y_train)
    model.fit(X_train, y_train)
    # create explainer
    #shap_explainer = shap.KernelExplainer(model_list[patient].predict_proba, X_train.iloc[0:500, :])
    shap_explainer = shap.KernelExplainer(model.predict_proba, shap.sample(X_train, 500))
    shap_values = shap_explainer.shap_values(X_test)
    # for i in range(len(shap_values)):
    #     shap_df = pd.DataFrame(data=shap_values[i], columns=X_test[i].columns.values)
    #     shap_list.append(shap_df)
    # path_img_bar = path_shap + "model{}_patient{}_allclasses_tmp.png".format(model_name,patient)
    # plt.figure()
    # shap.summary_plot(shap_values, X_test[patient], plot_type="bar", show=False)
    # plt.savefig(path_img_bar)

    with open(
            '../resources/results_ordered/SHAP_no_correlated_features/SHAP_SVM_{}.pkl'.format(patient),
            'wb') as f:
        pickle.dump(shap_values, f)
    return 1
Example #10
 def create_gaussian_baseline(self, X, sigma):
     gaussian_baseline = np.random.randn(*X.shape) * sigma + X
     # Clip so the perturbed values stay within the min/max range of X
     return shap.sample(
         np.clip(gaussian_baseline,
                 a_min=X.min().min(),
                 a_max=X.max().max()), self.shap_sample_size)
Example #11
    def bootstrap(self, X):
        """
        Function that performs the bootstrapping. 
        Randomly changes the background dataset and records the new attributions
        Parameters:
        X : instances to explain
        Returns:
        array of mean feature attributions for the samples in X
        """
        self.values = np.empty(
            (self.num_straps, len(self.labels), len(self.features)))
        self.mean_std_arr = np.empty((2, len(self.labels), len(self.features)))
        for i in range(self.num_straps):
            background_i = shap.sample(self.data,
                                       self.back_size,
                                       random_state=np.random.randint(100))
            if self.explainer_type == 'kernel':
                exp_i = shap.KernelExplainer(self.model, background_i)
                shapper = exp_i.shap_values(X)
            elif self.explainer_type == 'sample':
                exp_i = shap.SamplingExplainer(self.model, background_i)
                shapper = exp_i.shap_values(X)
            elif self.explainer_type == 'lime':
                exp_i = MyLime(self.model, background_i, mode="regression")
                shapper = exp_i.attributions(X)
            self.values[i, 0, :] = shapper[0]
            self.values[i, 1, :] = shapper[1]
        for i in range(len(self.labels)):
            for j in range(len(self.features)):
                self.mean_std_arr[0, i, j] = np.mean(self.values[:, i, j])
                self.mean_std_arr[1, i, j] = np.std(self.values[:, i, j])

        return self.mean_std_arr
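A condensed, hedged illustration of the same bootstrap idea (the function and argument names here are hypothetical, and a single-output model_fn is assumed):

import numpy as np
import shap

def bootstrap_shap(model_fn, data, X, num_straps=10, back_size=50):
    # re-draw the background set on every round and record how the
    # attributions vary with it
    runs = []
    for seed in range(num_straps):
        background = shap.sample(data, back_size, random_state=seed)
        explainer = shap.KernelExplainer(model_fn, background)
        runs.append(explainer.shap_values(X))
    runs = np.asarray(runs)
    return runs.mean(axis=0), runs.std(axis=0)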
Example #12
def get_shap_kernel(estimator: object, X_train):
    """compute the shap value importance for non-tree based model

    Args:
        estimator (a none tree based sklearn estimator): a sklearn non tree based estimator
        x_train ((pd.DataFrame, np.ndarray),): X training data
        x_test ((pd.DataFrame, np.ndarray),): X testing data

    Returns:
        shap plot
    """
    warnings.filterwarnings("ignore")
    # KernelExplainer is extremely slow for non-tree models, so summarise
    # X_train with k-means to speed up the calculation
    if X_train.shape[1] > 3:
        x_train_summary = shap.kmeans(X_train, 3)
    else:
        x_train_summary = shap.kmeans(X_train, X_train.shape[1])
    explainer = shap.KernelExplainer(estimator.predict, x_train_summary)

    size = len(X_train)
    # explain at most 50 rows (20% of the data for mid-sized sets)
    if size >= 50:
        size = min(50, int(size * 0.2))
    sample_values = shap.sample(X_train, size)
    shap_values = explainer.shap_values(sample_values,
                                        l1_reg='num_features(10)')

    return explainer, shap_values, sample_values
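A runnable sketch of the kmeans-background pattern this helper relies on (dataset and model chosen only for illustration):

import shap
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

X, y = load_breast_cancer(return_X_y=True, as_frame=True)
model = LogisticRegression(max_iter=5000).fit(X, y)

# kmeans compresses the background into weighted centroids,
# while shap.sample() merely subsamples rows
background = shap.kmeans(X, 10)
explainer = shap.KernelExplainer(model.predict, background)
shap_values = explainer.shap_values(shap.sample(X, 25, random_state=0),
                                    l1_reg='num_features(10)')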
Example #13
 def shap_summary(self):
     z = shap.sample(self.X_test, nsamples=100)
     explainer = shap.KernelExplainer(self.cat.predict, z)
     k_shap_values = explainer.shap_values(self.X_test)
     print("Shap Summary Plot")
     plt.figure()
     shap.summary_plot(k_shap_values, self.X_test, show=False)
     plt.savefig('shap_summary.png')
Example #14
def shapley_tree(model_predict, obs, dataset, column_names, plot_draw=False):
    explainer = shap.KernelExplainer(model_predict, shap.sample(dataset, 100))
    shap_values = explainer.shap_values(obs)
    if plot_draw:
        shap.waterfall_plot(explainer.expected_value,
                            shap_values,
                            feature_names=column_names)
    return shap_values, explainer.expected_value
Example #15
def test_HistGradientBoostingClassifier_proba():
    # train a tree-based model
    X, y = shap.datasets.adult()
    model = sklearn.ensemble.HistGradientBoostingClassifier(max_iter=10, max_depth=6).fit(X, y)
    explainer = shap.TreeExplainer(model, shap.sample(X, 10), model_output="predict_proba")
    shap_values = explainer.shap_values(X)
    assert np.max(np.abs(
        shap_values[0].sum(1) + explainer.expected_value[0] - model.predict_proba(X)[:, 0])) < 1e-4
Example #16
 def kernel_explainer_with_ct(self):
     try:
         #classification case
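         # KernelExplainer evaluates the prediction function on the background
         # data at construction time, so a model without predict_proba raises
         # here and falls through to the regression branch below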
         pred_fcn = lambda x : self.model.predict_proba(self.ct.transform(x))
         explainer = shap.KernelExplainer(pred_fcn, shap.sample(self.input_data, 100), 
                                                              link='logit',
                                                              feature_names=self.input_data.columns,
                                                              seed=0)
         pred = lambda x : self.model.predict(self.ct.transform(x))
         return explainer, pred, pred_fcn
     
     except Exception:
         #regression case
         pred_fcn = lambda x : self.model.predict(self.ct.transform(x))
         explainer = shap.KernelExplainer(pred_fcn, shap.sample(self.input_data, 100), 
                                                              link='identity',
                                                              feature_names=self.input_data.columns,
                                                              seed=0)
         return explainer, pred_fcn
Example #17
    def __init__(self, dt_model, X, num_samples=50):
        self.dt_model = dt_model
        self.num_samples = num_samples
        self.feature_names = X.columns.to_list()

        if num_samples is not None:
            samples = shap.sample(X, num_samples)
        else:
            samples = X
        self.explainer = shap.KernelExplainer(self.model_fn, samples)
Example #18
 def kernel_explainer(self):
     try:
         #classification case
         explainer = shap.KernelExplainer(self.model.predict_proba, 
                                                          shap.sample(self.input_data, 100),
                                                          link='logit',
                                                          feature_names=self.input_data.columns,
                                                         seed=0)
         predictions = self.model.predict
         prediction_probabilities = self.model.predict_proba
         return explainer, predictions, prediction_probabilities
     except Exception:
         #regression case
         explainer = shap.KernelExplainer(self.model.predict, 
                                                          shap.sample(self.input_data, 100), 
                                                          link='identity',
                                                          feature_names=self.input_data.columns,
                                                         seed=0)
         predictions = self.model.predict
         return explainer, predictions
Example #19
def test_HistGradientBoostingClassifier_multidim():
    # train a tree-based model
    X, y = shap.datasets.adult()
    X = X[:100]
    y = y[:100]
    y = np.random.randint(0, 3, len(y))
    model = sklearn.ensemble.HistGradientBoostingClassifier(max_iter=10, max_depth=6).fit(X, y)
    explainer = shap.TreeExplainer(model, shap.sample(X, 10), model_output="raw")
    shap_values = explainer.shap_values(X)
    assert np.max(np.abs(shap_values[0].sum(1) +
                         explainer.expected_value[0] - model.decision_function(X)[:, 0])) < 1e-4
Example #20
    def setUp(self):
        X_train, y_train, X_test, y_test = titanic_fare()
        self.test_len = len(X_test)

        model = LinearRegression().fit(X_train, y_train)
        self.explainer = RegressionExplainer(model,
                                             X_test.iloc[:20],
                                             y_test.iloc[:20],
                                             shap='kernel',
                                             X_background=shap.sample(
                                                 X_train, 5))
Example #21
def get_feature_importances(models, test_data, complication, train_columns):
    """ This function calculates the shap values of the top models for each of the investigated complication"""

    tree_explain = [LGBMClassifier]

    features = []
    avg_shap_values = []

    test_data_i = test_data[complication]
    importance_dfs = {}
    counter = 0

    for x in models[complication]:
        # Get feature importances
        X_importance = test_data_i[train_columns]
        if (type(x) in tree_explain):
            explainer = shap.TreeExplainer(x)
            shap_values = explainer.shap_values(X_importance,
                                                check_additivity=False)

        else:
            model_results = x.predict_proba
            X_importance = X_importance.fillna(X_importance.median())
            X_importance_ = shap.sample(X_importance, 50)
            explainer = shap.KernelExplainer(model_results, X_importance_)
            shap_values = explainer.shap_values(X_importance_,
                                                check_additivity=False)[0]
        values = np.abs(shap_values).mean(0)

        importance_df = pd.DataFrame()
        importance_df['column_name'] = train_columns
        importance_df["importance"] = values
        importance_df = importance_df.sort_values('column_name')
        importance_dfs[counter] = importance_df
        counter = counter + 1

    importance_df = pd.DataFrame([X_importance.columns.tolist()]).T
    importance_df.columns = ['column_name']
    for x in range(6):
        importance_df[f"importance_{str(x)}"] = importance_dfs[x]["importance"]

    # six model columns exist: importance_0 .. importance_5
    col = importance_df.loc[:, "importance_0":"importance_5"]

    importance_df["avg"] = col.mean(axis=1)

    features.append(
        importance_df.sort_values(by="avg",
                                  ascending=False).column_name[:4].values)
    avg_shap_values.append(
        importance_df.sort_values(by="avg", ascending=False).avg[:4].values)

    display_top_features(features, complication)

    return (features, avg_shap_values)
Example #22
def plot_prediction_desicion(model, X_test, pred, row_idx):
    #The decision plot below shows the model’s multiple outputs for a single observation
    #the dashed line is the prediction of our classifier
    explainer = shap.TreeExplainer(model, data=shap.sample(X_test, 100), feature_perturbation="interventional")
    shap_values = explainer.shap_values(X_test)
    # base_values must be the per-class expected values, not arbitrary constants
    shap.multioutput_decision_plot(list(explainer.expected_value), shap_values,
                                   row_index=row_idx,
                                   feature_names=list(X_test.columns) ,
                                   highlight=int(pred[row_idx]),
                                   legend_labels=["0-18", "19-70", "70+"], #legend_labels=["0-23", "24-50", "50+"],
                                   legend_location='lower right')
    plt.show()
Example #23
def shap_importances(model, X_train, X_test, n_shap, normalize=True, sort=True):
    start = timer()
    # only use n_shap from X_test
    X_test = X_test.sample(n=min(n_shap, len(X_test)), replace=False)
    if isinstance(model, RandomForestRegressor) or \
        isinstance(model, GradientBoostingRegressor) or \
        isinstance(model, xgb.XGBRegressor):
        """
        We get this warning for big X_train so choose smaller
        'Passing 20000 background samples may lead to slow runtimes. Consider using shap.sample(data, 100) to create a smaller background data set.'
        """
        explainer = shap.TreeExplainer(model,
                                       data=shap.sample(X_train, 100),
                                       feature_perturbation='interventional')
        shap_values = explainer.shap_values(X_test, check_additivity=False)
    elif isinstance(model, Lasso) or isinstance(model, LinearRegression):
        explainer = shap.LinearExplainer(model,
                                         shap.sample(X_train, 100),
                                         feature_perturbation='interventional')
        shap_values = explainer.shap_values(X_test)
    else:
        # gotta use really small sample; verrry slow
        explainer = shap.KernelExplainer(model.predict, shap.sample(X_train, 100))
        shap_values = explainer.shap_values(X_test, nsamples='auto')
    shapimp = np.mean(np.abs(shap_values), axis=0)
    stop = timer()
    print(f"SHAP time for {len(X_test)} test records using {model.__class__.__name__} = {(stop - start):.1f}s")

    total_imp = np.sum(shapimp)
    normalized_shap = shapimp
    if normalize:
        normalized_shap = shapimp / total_imp

    # print("SHAP", normalized_shap)
    shapI = pd.DataFrame(data={'Feature': X_test.columns, 'Importance': normalized_shap})
    shapI = shapI.set_index('Feature')
    if sort:
        shapI = shapI.sort_values('Importance', ascending=False)
    # plot_importances(shapI)
    return shapI
Example #24
def plot_feature_importance_for_class(model, X_train):
    explainer = shap.TreeExplainer(model, data=shap.sample(X_train, 100), feature_perturbation="interventional")
    shap_values = explainer.shap_values(X_train)
    #shap.dependence_plot("rank(10)", shap_values, X_train)  # raises "The passed shap_values are a list not an array" for multi-output models; pass shap_values[0] to explain the first output class
    shap.summary_plot(shap_values, X_train, plot_type="bar", class_names=model.classes_, color=pl.get_cmap("tab10")) #labels=model.classes_
    # IMPORTANT: for some reason the three lines below might break if the line above isn't commented out
    #shap.summary_plot(shap_values[0], X_train, class_names=model.classes_)
    #shap.summary_plot(shap_values[1], X_train, class_names=model.classes_)
    #shap.summary_plot(shap_values[2], X_train, class_names=model.classes_) #show=False
    #Feature values in pink push the prediction higher.
    #Size of the bar shows the magnitude of the feature's effect.
    #Feature values in blue push the prediction lower.
    plt.show()
Example #25
def RF():
    rf = RandomForestRegressor(n_estimators=30, oob_score=True, n_jobs=-1)
    rf.fit(X, y)
    print("OOB", rf.oob_score_)

    explainer = shap.TreeExplainer(rf,
                                   data=shap.sample(X, 300),
                                   feature_perturbation='interventional')
    shap_values = explainer.shap_values(X[:shap_test_size],
                                        check_additivity=False)
    print("shap_j averages:", np.mean(shap_values, axis=0))
    shapimp = np.mean(np.abs(shap_values), axis=0)
    s = np.sum(shapimp)
    print("\nRF SHAP importances", list(shapimp), shapimp * xrange,
          list(shapimp / s))
    return shap_values
Example #26
    def setUp(self):
        X_train, y_train, X_test, y_test = titanic_survive()
        train_names, test_names = titanic_names()

        model = LogisticRegression()
        model.fit(X_train, y_train)

        self.explainer = ClassifierExplainer(
            model,
            X_test.iloc[:20],
            y_test.iloc[:20],
            shap='kernel',
            model_output='probability',
            X_background=shap.sample(X_train, 5),
            cats=[{
                'Gender': ['Sex_female', 'Sex_male', 'Sex_nan']
            }, 'Deck', 'Embarked'],
            labels=['Not survived', 'Survived'])
Example #27
    def _summarise_background(self,
                              background_data: Union[shap.common.Data, pd.DataFrame, np.ndarray, sparse.spmatrix],
                              n_background_samples: int) -> \
            Union[shap.common.Data, pd.DataFrame, np.ndarray, sparse.spmatrix]:
        """
        Summarises the background data to n_background_samples in order to reduce the computational cost. If the
        background data is a `shap.common.Data` object, no summarisation is performed.

        Returns
        -------
            If the user has specified grouping, then the input object is subsampled and an object of the same
            type is returned. Otherwise, a `shap.common.Data` object containing the result of a k-means algorithm
            is wrapped in a `shap.common.DenseData` object and returned. The samples are weighted according to the
            frequency of the occurrence of the clusters in the original data.
        """

        if isinstance(background_data, shap.common.Data):
            msg = "Received option to summarise the data but the background_data object " \
                  "was an instance of shap.common.Data. No summarisation will take place!"
            logger.warning(msg)
            return background_data

        if background_data.ndim == 1:
            msg = "Received option to summarise the data but the background_data object only had " \
                  "one record with {} features. No summarisation will take place!"
            logger.warning(msg.format(len(background_data)))
            return background_data

        self.summarise_background = True

        # if the input is sparse, we assume there are categorical variables and use random sampling, not kmeans
        if self.use_groups or self.categorical_names or isinstance(
                background_data, sparse.spmatrix):
            return shap.sample(background_data, nsamples=n_background_samples)
        else:
            logger.info(
                "When summarising with kmeans, the samples are weighted in proportion to their "
                "cluster occurrence frequency. To weight the samples differently, pass a weights "
                "array of length n_background_samples to the constructor!"
            )
            return shap.kmeans(background_data, n_background_samples)
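A side-by-side reminder of the two summarisation routes used above (the array shape is illustrative):

import numpy as np
import shap

X = np.random.RandomState(0).rand(1000, 5)
subsampled = shap.sample(X, nsamples=100, random_state=0)  # plain row subsampling
summarised = shap.kmeans(X, 100)  # weighted centroids wrapped in a DenseData object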
Example #28
def shap_plots(model, train_features, test_features, test_labels):
    print("Computing shapley values..")
    # compute SHAP values
    if isinstance(
            model,
        (MLP, MLPRegressor, MLPClassifier, ElasticNet, LogisticRegression)):
        train_sample = shap.sample(train_features, 10)
        explainer = shap.Explainer(model.predict, train_sample)
    elif isinstance(model, (RandomForestRegressor, RandomForestClassifier)):
        explainer = shap.TreeExplainer(model, train_features)
    else:
        explainer = shap.Explainer(model, train_features)

    shap_values = explainer(test_features)
    shap.plots.bar(shap_values, max_display=10)
    # shap.plots.bar(shap_values[0]) # Local

    # beeswarm plot
    shap.plots.beeswarm(shap_values)

    # Decision plot
    expected_value = explainer.expected_value
    select = range(20)
    features_sample = test_features.iloc[select]
    shap.decision_plot(expected_value, explainer.shap_values(features_sample),
                       features_sample)

    # Heatmap
    shap.plots.heatmap(shap_values, max_display=10)

    # Scatter
    shap.plots.scatter(shap_values[:, "hs_child_age_None"],
                       color=shap_values,
                       alpha=0.8)

    # Feature clustering (redundant feature detection)
    clustering = shap.utils.hclust(
        test_features, test_labels
    )  # by default this trains (X.shape[1] choose 2) 2-feature XGBoost models
    shap.plots.bar(shap_values, clustering=clustering, clustering_cutoff=0.5)
Example #29
    def compute_shap_values(self) -> None:
        """Shap values depending on what model we are using

        `shap.TreeExplainer` by default and if not it uses
        `KernelExplainer`

        Also provides compatibility with sklearn pipelines

        `shap_values` are stored in `self.shap_values`
        """
        with warnings.catch_warnings():
            # Some `shap` warnings are not useful for this implementation
            warnings.simplefilter("ignore")
            try:
                explainer = shap.TreeExplainer(
                    model=self.model,
                    feature_perturbation='tree_path_dependent'
                )

                shap_values_arguments = dict(X=self.X_test_to_shap)
            except Exception:
                def model_predict(data_array):
                    data_frame = pd.DataFrame(data_array,
                                              columns=self.column_names)
                    return self.model.predict_proba(data_frame)[:, 1]

                explainer = shap.KernelExplainer(model=model_predict,
                                                 data=shap.sample(
                                                     self.X_train_to_shap,
                                                     100
                                                 ),
                                                 link='logit')

                shap_values_arguments = dict(X=self.X_test_to_shap,
                                             l1_reg='aic')

        self.shap_values = explainer.shap_values(**shap_values_arguments)
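A minimal version of the same fallback pattern (the names are illustrative, not from this class):

import shap

def make_explainer(model, X_background):
    # try the fast tree explainer first; fall back to the model-agnostic
    # KernelExplainer for models TreeExplainer rejects at construction
    try:
        return shap.TreeExplainer(model, feature_perturbation='tree_path_dependent')
    except Exception:
        return shap.KernelExplainer(model.predict, shap.sample(X_background, 100))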
Example #30
    #    continue
    parameters = {
        'time': time,
        'target': target,
        'drop_opt': drop_opt,
    }
    X, y, info = _load_train_data(return_info=True, **parameters)

    estimators = load_models(time, target, drop_opt, model_names)
    # Subsample time indices to reduce autocorrelations
    X_subset, y_subset = get_independent_samples(X, y, info)
    explainer = InterpretToolkit(estimators=estimators,
                                 estimator_names=model_names,
                                 X=X_subset.copy(),
                                 y=y_subset.copy())
    background_dataset = shap.sample(X, 100)

    results = explainer.local_contributions(
        method='shap',
        background_dataset=background_dataset,
        performance_based=True,
        n_samples=n_samples)

    results = explainer.save(fname=save_fname, data=results)

    duration = datetime.datetime.now() - start_time
    seconds = duration.total_seconds()
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds = seconds % 60