Example #1
    def test_lime_explainer_entropy_discretizer(self):
        np.random.seed(1)

        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(self.train, self.labels_train)
        i = np.random.randint(0, self.test.shape[0])

        explainer = LimeTabularExplainer(self.train,
                                         feature_names=self.feature_names,
                                         class_names=self.target_names,
                                         training_labels=self.labels_train,
                                         discretize_continuous=True,
                                         discretizer='entropy')

        exp = explainer.explain_instance(self.test[i],
                                         rf.predict_proba,
                                         num_features=2)
        self.assertIsNotNone(exp)
        keys = [x[0] for x in exp.as_list()]
        print(keys)
        self.assertEqual(1,
                         sum([1 if 'petal width' in x else 0 for x in keys]),
                         "Petal Width is a major feature")
        self.assertEqual(1,
                         sum([1 if 'petal length' in x else 0 for x in keys]),
                         "Petal Length is a major feature")
Example #2
    def test_lime_explainer_good_regressor(self):
        np.random.seed(1)

        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(self.train, self.labels_train)
        i = np.random.randint(0, self.test.shape[0])

        explainer = LimeTabularExplainer(self.train,
                                         mode="classification",
                                         feature_names=self.feature_names,
                                         class_names=self.target_names,
                                         discretize_continuous=True)

        exp = explainer.explain_instance(self.test[i],
                                         rf.predict_proba,
                                         num_features=2,
                                         model_regressor=LinearRegression())

        self.assertIsNotNone(exp)
        keys = [x[0] for x in exp.as_list()]
        self.assertEqual(1,
                         sum([1 if 'petal width' in x else 0 for x in keys]),
                         "Petal Width is a major feature")
        self.assertEqual(1,
                         sum([1 if 'petal length' in x else 0 for x in keys]),
                         "Petal Length is a major feature")
Example #3
def interpret_data(X, y, func):
    explainer = LimeTabularExplainer(X, discretize_continuous=False, kernel_width=3)
    times, scores = [], []
    for r_idx in range(100):
        start_time = time.time()
        explanation = explainer.explain_instance(X[r_idx, :], func)
        times.append(time.time() - start_time)
        scores.append(explanation.score)
        print('...')

    return times, scores
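
A quick, hypothetical driver for interpret_data (the synthetic data and classifier below are stand-ins; any model exposing predict_proba over a 2-D array with at least 100 rows would do, and `time` plus LimeTabularExplainer are assumed imported at module level as above):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Build a toy dataset and black-box model, then time 100 local explanations.
X, y = make_classification(n_samples=500, n_features=10, random_state=0)
clf = RandomForestClassifier(n_estimators=100).fit(X, y)
times, scores = interpret_data(X, y, clf.predict_proba)
print(np.mean(times), np.mean(scores))  # average runtime and local-fit score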
Example #4
    def test_lime_explainer_no_regressor(self):
        np.random.seed(1)
        iris = load_iris()
        train, test, labels_train, labels_test = sklearn.model_selection.train_test_split(
            iris.data, iris.target, train_size=0.80)

        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(train, labels_train)
        i = np.random.randint(0, test.shape[0])

        explainer = LimeTabularExplainer(train, feature_names=iris.feature_names,
                                         class_names=iris.target_names, discretize_continuous=True)

        exp = explainer.explain_instance(test[i], rf.predict_proba, num_features=2)
        self.assertIsNotNone(exp)
Example #5
    def test_lime_explainer_bad_regressor(self):
        iris = load_iris()
        train, test, labels_train, labels_test = sklearn.model_selection.train_test_split(
            iris.data, iris.target, train_size=0.80)

        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(train, labels_train)
        lasso = Lasso(alpha=1, fit_intercept=True)
        i = np.random.randint(0, test.shape[0])
        with self.assertRaises(TypeError):
            explainer = LimeTabularExplainer(train, feature_names=iris.feature_names,
                                             class_names=iris.target_names,
                                             discretize_continuous=True)

            exp = explainer.explain_instance(test[i], rf.predict_proba,
                                             num_features=2, top_labels=1,
                                             model_regressor=lasso)
Example #6
    def test_lime_explainer_bad_regressor(self):

        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(self.train, self.labels_train)
        lasso = Lasso(alpha=1, fit_intercept=True)
        i = np.random.randint(0, self.test.shape[0])
        with self.assertRaises(TypeError):
            explainer = LimeTabularExplainer(self.train,
                                             mode="classification",
                                             feature_names=self.feature_names,
                                             class_names=self.target_names,
                                             discretize_continuous=True)
            exp = explainer.explain_instance(self.test[i],  # noqa:F841
                                             rf.predict_proba,
                                             num_features=2, top_labels=1,
                                             model_regressor=lasso)
Example #7
    def explain_tabular(self, trainset, labels, instance, num_features=5, kernel_width=3):
        """Explain categorical and numeric features for a prediction.

        It analyzes the prediction with LIME and returns a report of the most
        impactful tabular features contributing to certain labels.

        Args:
          trainset: a DataFrame representing the training features that LIME can use to decide
              value distributions.
          labels: a list of labels to explain.
          instance: the prediction instance. It needs to conform to the model's input.
              Can be a CSV line string or a dict.
          num_features: maximum number of features to show.
          kernel_width: passed directly to LIME's LimeTabularExplainer.

        Returns:
          A lime.explanation.Explanation object.
        """
        from lime.lime_tabular import LimeTabularExplainer

        if isinstance(instance, six.string_types):
            instance = next(csv.DictReader([instance], fieldnames=self._headers))

        categories = self._get_unique_categories(trainset)
        np_trainset = self._preprocess_data_for_tabular_explain(trainset, categories)
        predict_fn = self._make_tabular_predict_fn(labels, instance, categories)
        prediction_df = pd.DataFrame([instance])
        prediction_instance = self._preprocess_data_for_tabular_explain(prediction_df, categories)

        explainer = LimeTabularExplainer(
            np_trainset,
            feature_names=(self._categorical_columns + self._numeric_columns),
            class_names=labels,
            categorical_features=range(len(categories)),
            categorical_names={i: v for i, v in enumerate(categories)},
            kernel_width=kernel_width)

        exp = explainer.explain_instance(
            prediction_instance[0],
            predict_fn,
            num_features=num_features,
            labels=range(len(labels)))
        return exp
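
A hedged call sketch for explain_tabular; `tabular_model`, the label names, and the CSV line are invented for illustration and stand in for an instance of the enclosing class:

# Hypothetical usage: `tabular_model` is an instance of the class above and
# `train_df` is its training DataFrame; the instance may be a CSV line or dict.
exp = tabular_model.explain_tabular(train_df,
                                    labels=['not_approved', 'approved'],
                                    instance='35,owner,monthly,2000',
                                    num_features=3)
print(exp.as_list(label=1))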
Example #8
    def test_lime_explainer_good_regressor_synthetic_data(self):
        X, y = make_classification(n_samples=1000,
                                   n_features=20,
                                   n_informative=2,
                                   n_redundant=2,
                                   random_state=10)

        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(X, y)
        instance = np.random.randint(0, X.shape[0])
        feature_names = ["feature" + str(i) for i in range(20)]
        explainer = LimeTabularExplainer(X,
                                         feature_names=feature_names,
                                         discretize_continuous=True)

        exp = explainer.explain_instance(X[instance], rf.predict_proba)

        self.assertIsNotNone(exp)
        self.assertEqual(10, len(exp.as_list()))
Example #9
    def test_lime_explainer_no_regressor(self):
        np.random.seed(1)

        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(self.train, self.labels_train)
        i = np.random.randint(0, self.test.shape[0])

        explainer = LimeTabularExplainer(self.train,
                                         feature_names=self.feature_names,
                                         class_names=self.target_names,
                                         discretize_continuous=True)

        exp = explainer.explain_instance(self.test[i],
                                         rf.predict_proba,
                                         num_features=2)
        self.assertIsNotNone(exp)
        keys = [x[0] for x in exp.as_list()]
        self.assertEqual(1,
                         sum([1 if 'petal width' in x else 0 for x in keys]),
                         "Petal Width is a major feature")
        self.assertEqual(1,
                         sum([1 if 'petal length' in x else 0 for x in keys]),
                         "Petal Length is a major feature")
Example #10
class LIME:
    def __init__(self,
                 X,
                 predict_fn,
                 num_features=5,
                 features_names=None,
                 result_label='score',
                 categorical_features=None):

        self.explainer = LimeTabularExplainer(
            X,
            feature_names=features_names,
            class_names=[result_label],
            mode='regression',
            categorical_features=categorical_features)

        self.predict_fn = predict_fn
        self.num_features = num_features
        self.splime = None

    def explain_instance(self, x):
        return self.explainer.explain_instance(x,
                                               self.predict_fn,
                                               num_features=self.num_features)

    def fit(self, X, sample_size=20, num_expected_examples=15):
        # https://github.com/marcotcr/lime/blob/master/doc/notebooks/Submodular%20Pick%20examples.ipynb
        self.splime = submodular_pick.SubmodularPick(
            self.explainer,
            X,
            self.predict_fn,
            sample_size=sample_size,
            num_features=self.num_features,
            num_exps_desired=num_expected_examples)

    def get_explanations(self):
        return self.splime.sp_explanations
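
A short usage sketch for the wrapper above; the matrix X and fitted regressor `reg` are hypothetical, and `from lime import submodular_pick` is assumed at module level since the fit method relies on it:

# Hypothetical usage of the LIME wrapper class in regression mode.
lime_wrapper = LIME(X, reg.predict,
                    num_features=5,
                    features_names=["f%d" % i for i in range(X.shape[1])])
print(lime_wrapper.explain_instance(X[0]).as_list())  # one local explanation
lime_wrapper.fit(X, sample_size=20)                   # SP-LIME over the data
for e in lime_wrapper.get_explanations():
    print(e.as_list())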
Example #11
def explain_certainty(model,
                      X_train,
                      y_train,
                      feature_names,
                      instance,
                      num_features=NUM_EXPLATIONS,
                      silent=False,
                      html=False):
    """ |explain_certainty| is a legacy function left for compatibility.
  """
    class_names = ["uncertain", "certain"]
    explainer = LimeTabularExplainer(X_train,
                                     training_labels=y_train,
                                     feature_names=feature_names,
                                     class_names=class_names,
                                     discretize_continuous=True,
                                     discretizer=DISCRETIZER)

    explanation = explainer.explain_instance(instance,
                                             model.predict_proba,
                                             num_features=num_features,
                                             top_labels=None)
    if html:
        exp_html = explanation.as_html()
        write("uncertainty-exp.html", exp_html)

    if not silent:
        print_lime_model_prediction(model.predict_proba, instance)
        print_explanation(explanation)
        print_instance_values(feature_names, explanation, instance)

    exp_map = explanation.as_map()[1]
    exp_feature_ids = map(lambda x: x[0], exp_map)
    exp_list = explanation.as_list()
    exps = map(lambda x: parse_discretized_feature(x[0]), exp_list)
    return zip(exp_feature_ids, exps)
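
A hedged usage sketch; `clf` and the arrays below are placeholders, and the module-level DISCRETIZER and NUM_EXPLATIONS constants plus the parse_discretized_feature helper are assumed to exist as in the original module:

# Hypothetical usage: `clf` is a fitted binary classifier whose two classes
# map to ["uncertain", "certain"]; silent=True skips the console helpers.
pairs = list(explain_certainty(clf, X_train, y_train, feature_names,
                               X_test[0], silent=True))
for feature_id, parsed_exp in pairs:
    print(feature_id, parsed_exp)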
Example #12
# For this, separate out the categorical features
# (assumes `boston` from sklearn.datasets.load_boston and a fitted `regressor`):
import numpy as np
categorical_features = [i for i, col in enumerate(boston.data.T)
                        if np.unique(col).size < 10]

##########################################################
# Now use a lime explainer for tabular data
from lime.lime_tabular import LimeTabularExplainer
explainer = LimeTabularExplainer(X_train,
    feature_names=boston.feature_names,
    class_names=['price'],
    categorical_features=categorical_features,
    mode='regression')

# Now explain a prediction
exp = explainer.explain_instance(X_test[25], regressor.predict,
        num_features=10)

from matplotlib import pyplot as plt
exp.as_pyplot_figure()
plt.tight_layout()
##########################################################
print(exp.as_list())

##########################################################
# Explain a few more predictions
for i in [7, 50, 66]:
    exp = explainer.explain_instance(X_test[i], regressor.predict,
            num_features=10)
    exp.as_pyplot_figure()
    plt.tight_layout()
Example #13
    def test_lime_explainer_with_data_stats(self):
        np.random.seed(1)

        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(self.train, self.labels_train)
        i = np.random.randint(0, self.test.shape[0])

        # Generate stats using a quartile discretizer
        discretizer = QuartileDiscretizer(self.train, [],
                                          self.feature_names,
                                          self.target_names,
                                          random_state=20)

        d_means = discretizer.means
        d_stds = discretizer.stds
        d_mins = discretizer.mins
        d_maxs = discretizer.maxs
        d_bins = discretizer.bins(self.train, self.target_names)

        # Compute feature values and frequencies of all columns
        cat_features = np.arange(self.train.shape[1])
        discretized_training_data = discretizer.discretize(self.train)

        feature_values = {}
        feature_frequencies = {}
        for feature in cat_features:
            column = discretized_training_data[:, feature]
            feature_count = collections.Counter(column)
            values, frequencies = map(list, zip(*(feature_count.items())))
            feature_values[feature] = values
            feature_frequencies[feature] = frequencies

        # Convert bins from arrays to lists
        d_bins_revised = {index: bin_.tolist()
                          for index, bin_ in enumerate(d_bins)}

        # Discretized stats
        data_stats = {}
        data_stats["means"] = d_means
        data_stats["stds"] = d_stds
        data_stats["maxs"] = d_maxs
        data_stats["mins"] = d_mins
        data_stats["bins"] = d_bins_revised
        data_stats["feature_values"] = feature_values
        data_stats["feature_frequencies"] = feature_frequencies

        data = np.zeros((2, len(self.feature_names)))
        explainer = LimeTabularExplainer(data,
                                         feature_names=self.feature_names,
                                         random_state=10,
                                         training_data_stats=data_stats,
                                         training_labels=self.target_names)

        exp = explainer.explain_instance(self.test[i],
                                         rf.predict_proba,
                                         num_features=2,
                                         model_regressor=LinearRegression())

        self.assertIsNotNone(exp)
        keys = [x[0] for x in exp.as_list()]
        self.assertEqual(1,
                         sum([1 if 'petal width' in x else 0 for x in keys]),
                         "Petal Width is a major feature")
        self.assertEqual(1,
                         sum([1 if 'petal length' in x else 0 for x in keys]),
                         "Petal Length is a major feature")
Example #14
# X = array[:,0:number_of_features]
# print X[:5]
# array_train = np.array(X_train)
# array_test = np.array(X_validation)
# print array_train
# print X_train[:, '110']
explainer = LimeTabularExplainer(X_train,
                                 feature_names=cols,
                                 class_names=['0', '1'],
                                 discretize_continuous=True)

observation_1 = 2
# print array_test[observation_1]
# print predict_fn_log
exp = explainer.explain_instance(X_validation[observation_1],
                                 predict_fn_log,
                                 num_features=100)
a = exp.as_list()
# print model_log.predict_proba(X_validation[observation_1]).astype(float)
# print exp.local_pred
for li in a:
    print(li)
# print X_train['453'].value_counts()

exp = explainer.explain_instance(X_validation[observation_1],
                                 predict_fn_rf,
                                 num_features=100)
a = exp.as_list()
# print model_log.predict_proba(X_validation[observation_1]).astype(float)
# print exp.local_pred
for li in a:
    print(li)
Example #15
class CSModel(object):
    """docstring for CSModel"""
    def __init__(self, datapath, logger):
        super(CSModel, self).__init__()
        self.datapath = datapath
        self.logger = logger

    def load_raw_data(self):
        with open(
                os.path.join(self.datapath, "german_credit_preprocessed.csv"),
                "r") as infile:
            raw = pd.read_csv(infile)
        return raw

    def preprocess(self):
        self.raw = self.load_raw_data()
        self.labels = self.raw['Creditability']
        self.features = self.raw[self.raw.columns[1:]]
        self.feature_names = self.features.columns
        self.class_names = ['Not Credit Worthy', 'Credit Worthy']  # [0, 1]
        self.categorical_features = [
            'Account Balance', 'Payment Status of Previous Credit', 'Purpose',
            'Value Savings/Stocks', 'Length of current employment',
            'Sex & Marital Status', 'Most valuable available asset',
            'Concurrent Credits', 'Type of apartment', 'Occupation',
            'Telephone', 'Foreign Worker', 'Guarantors'
        ]
        self.categorical_feature_indices = [
            self.features.columns.tolist().index(cf)
            for cf in self.categorical_features
        ]

    def create_model(self):
        features = self.features.to_numpy()
        self.categorical_names = {}
        self.encoders = {}
        for feature in self.categorical_feature_indices:
            le = LabelEncoder()
            le.fit(features[:, feature])
            features[:, feature] = le.transform(features[:, feature])
            self.encoders[feature] = le
            self.categorical_names[feature] = le.classes_
        train, test, labels_train, labels_test = train_test_split(
            features, self.labels, train_size=0.80)
        self.train = train
        self.test = test
        self.labels_train = labels_train.to_numpy().reshape(-1, 1)
        self.labels_test = labels_test.to_numpy().reshape(-1, 1)
        self.classifier = RandomForestClassifier(n_estimators=500)
        self.classifier.fit(self.train, self.labels_train.ravel())

    def create_model_explainer(self):
        self.explainer = LimeTabularExplainer(
            self.train,
            feature_names=self.feature_names,
            training_labels=self.labels_train,
            class_names=self.class_names,
            categorical_features=self.categorical_feature_indices,
            categorical_names=self.categorical_names,
            discretize_continuous=True)

    def get_form_content(self):
        forms = []
        for i, fn in enumerate(self.feature_names):
            if i in self.categorical_feature_indices:
                forms.append({
                    "name": fn,
                    "options": list(enumerate(self.categorical_names.get(i)))
                })
            else:
                forms.append({
                    "name": fn,
                    "options": list(enumerate(sorted(self.raw[fn].unique().tolist())))
                })
        return forms

    def get_explanation(self, i):
        exp = self.explainer.explain_instance(self.test[i],
                                              self.classifier.predict_proba,
                                              num_features=3,
                                              top_labels=1)
        return (exp.as_html(show_table=True,
                            show_all=False,
                            show_predicted_value=True,
                            predict_proba=False), self.labels_test[i])

    def get_custom_explanation(self, instance):
        exp = self.explainer.explain_instance(instance,
                                              self.classifier.predict_proba,
                                              num_features=3,
                                              top_labels=1)
        return exp.as_html(show_table=True,
                           show_all=False,
                           show_predicted_value=True,
                           predict_proba=False)

    def remap_categoricals(self, df):
        for feature, categories in self.categorical_names.items():
            featurename = self.feature_names[feature]
            df[featurename] = df[featurename].map(
                lambda x: list(categories)[x])
        return df
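
A minimal end-to-end sketch for CSModel; the datapath (which must contain german_credit_preprocessed.csv) and the logger are assumptions:

# Hypothetical usage of CSModel from preprocessing through one explanation.
import logging

model = CSModel(datapath="./data", logger=logging.getLogger(__name__))
model.preprocess()
model.create_model()
model.create_model_explainer()
html_report, true_label = model.get_explanation(0)  # first test-set row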
Example #16
def FidelityCoverageExperiments(blackbox, X_explain, y_explain, index, dataset,
                                anchor_explainer, path_data, verbose=False):
    # Reading data set information
    feature_names = dataset['feature_names']
    possible_outcomes = dataset['possible_outcomes']
    discrete_indices = dataset['discrete_indices']
    discrete_names = dataset['discrete_names']

    # Creating a data frame of the explanation data
    dfX_explain = build_df2explain(blackbox, X_explain, dataset).to_dict('records')

    # Variable initialization
    fidelity_x_EXPLAN = exp_size_EXPLAN = cv_cv_EXPLAN = precision_EXPLAN = fidelity_X_EXPLAN = \
    coverage_EXPLAN = coverage_X_EXPLAN = n_samples_EXPLAN = distance_EXPLAN = balance_rate_X_EXPLAN = \
    fidelity_x_LORE = exp_size_LORE = cv_cv_LORE = precision_LORE = fidelity_X_LORE = coverage_LORE = \
    coverage_X_LORE = n_samples_LORE = distance_LORE = balance_rate_X_LORE = fidelity_x_Anchor = \
    exp_size_Anchor = cv_cv_Anchor = precision_Anchor = fidelity_X_Anchor = coverage_Anchor = \
    coverage_X_Anchor = n_samples_Anchor = distance_Anchor = balance_rate_X_Anchor = fidelity_x_LIME = \
    exp_size_LIME = fidelity_X_LIME = rule_LORE = rule_Anchor = rule_EXPLAN = 0

    # Hit evaluation function
    def hit_outcome(x, y):
        return 1 if x == y else 0

    # EXPLAN
    print(datetime.datetime.now(), '\tEXPLAN')
    start_time = time.time()
    try:
        # Explaining the instance specified by index
        exp_EXPLAN, info_EXPLAN = explan.Explainer(X_explain[index],
                                                   blackbox,
                                                   dataset,
                                                   N_samples=3000,
                                                   tau=250)

        # Calculating the overall neighborhood distance w.r.t instance2explain
        X = info_EXPLAN['X']
        X_hat = np.r_[X_explain[index].reshape(1, -1), X]
        distances = pairwise_distances(
            X_hat,
            X_hat[0, :].reshape(1, -1),
            metric='euclidean').ravel()
        distance_EXPLAN = np.sum(distances)

        # Calculating the feature frequency variance of neighborhood
        X_hat = X_hat[np.random.choice(range(X_hat.shape[0]), np.min([X_hat.shape[0], 1000]), replace=False)]
        cv_EXPLAN = variation(X_hat, axis=0)
        cv_EXPLAN[np.isnan(cv_EXPLAN)] = 0
        cv_cv_EXPLAN = variation(cv_EXPLAN)

        # Measuring the balance rate of neighborhood samples
        n_samples_EXPLAN = X.shape[0]
        predictions = blackbox.predict(X)
        ones = np.sum(predictions)
        balance_rate_X_EXPLAN = 1 - np.abs(0.5 - (ones / n_samples_EXPLAN))

        # Extracting the predicted labels by black-box and interpretable model
        y_x_bb_EXPLAN = y_explain[index]
        y_x_dt_EXPLAN = exp_EXPLAN[0][dataset['class_name']]
        y_X_bb_EXPLAN = info_EXPLAN['y_X_bb']
        y_X_dt_EXPLAN = info_EXPLAN['y_X_dt']

        # Calculating fidelity metrics for the explained instance and its neighborhood samples
        fidelity_x_EXPLAN = hit_outcome(y_x_bb_EXPLAN, y_x_dt_EXPLAN)
        fidelity_X_EXPLAN = f1_score(y_X_bb_EXPLAN, y_X_dt_EXPLAN)

        # Printing the explanation rule
        rule_EXPLAN = exp_EXPLAN[1]
        print(rule_EXPLAN)

        # Calculating the global coverage
        covered_EXPLAN = get_covered(rule_EXPLAN, dfX_explain, dataset)
        coverage_EXPLAN = (len(covered_EXPLAN) / len(dfX_explain))

        # Calculating the local coverage
        covered_X_EXPLAN = get_covered(rule_EXPLAN, info_EXPLAN['dfX'].to_dict('records'), dataset)
        coverage_X_EXPLAN = (len(covered_X_EXPLAN) / len(info_EXPLAN['dfX']))

        # Measuring the precision score based on the global coverage
        precision_EXPLAN = [hit_outcome(c, y_x_dt_EXPLAN) for c in y_explain[covered_EXPLAN]]
        precision_EXPLAN = 0 if precision_EXPLAN == [] else precision_EXPLAN

        # Calculating the explanation size
        exp_size_EXPLAN = len(info_EXPLAN['tree_path']) - 1

    except Exception:
        pass

    time_EXPLAN = time.time() - start_time

    # LORE
    print(datetime.datetime.now(), '\tLORE')
    start_time = time.time()

    try:
        # Explaining the instance specified by index
        exp_LORE, info_LORE = lore.explain(index, X_explain,
                                           dataset, blackbox,
                                           ng_function=genetic_neighborhood,
                                           discrete_use_probabilities=True,
                                           continuous_function_estimation=False,
                                           returns_infos=True, path=path_data,
                                           sep=';', log=verbose)

        # Calculating the overall neighborhood distance w.r.t instance2explain
        Z = info_LORE['Z']
        Z_hat = np.r_[X_explain[index].reshape(1, -1), Z]
        distances = pairwise_distances(
            Z_hat,
            Z_hat[0, :].reshape(1, -1),
            metric='euclidean').ravel()
        distance_LORE = np.sum(distances)

        # Calculating the feature frequency variance of neighborhood
        Z_hat = Z_hat[np.random.choice(range(Z_hat.shape[0]), np.min([Z_hat.shape[0], 1000]), replace=False)]
        cv_LORE = variation(Z_hat, axis=0)
        cv_LORE[np.isnan(cv_LORE)] = 0
        cv_cv_LORE = variation(np.abs(cv_LORE))

        # Measuring the balance rate of neighborhood samples
        n_samples_LORE = Z.shape[0]
        predictions = blackbox.predict(Z)
        ones = np.sum(predictions)
        balance_rate_X_LORE = 1 - np.abs(0.5 - (ones / n_samples_LORE))

        # Extracting the predicted labels by black-box and interpretable model
        y_x_bb_LORE = y_explain[index]
        y_x_dt_LORE = exp_LORE[0][0][dataset['class_name']]
        y_X_bb_LORE = info_LORE['y_pred_bb']
        y_X_dt_LORE = info_LORE['y_pred_cc']

        # Calculating fidelity metrics for the explained instance and its neighborhood samples
        fidelity_x_LORE = hit_outcome(y_x_bb_LORE, y_x_dt_LORE)
        fidelity_X_LORE = f1_score(y_X_bb_LORE, y_X_dt_LORE)

        # Printing the explanation rule
        rule_LORE = exp_LORE[0][1]
        print(rule_LORE)

        # Calculating the global coverage
        covered_LORE = get_covered(rule_LORE, dfX_explain, dataset)
        coverage_LORE = (len(covered_LORE) / len(dfX_explain))

        # Calculating the local coverage
        covered_X_LORE = get_covered(rule_LORE, info_LORE['dfZ'].to_dict('records'), dataset)
        coverage_X_LORE = (len(covered_X_LORE) / len(info_LORE['dfZ']))

        # Measuring the precision score based on the global coverage
        precision_LORE = [hit_outcome(c, y_x_dt_LORE) for c in y_explain[covered_LORE]]
        precision_LORE = 0 if precision_LORE == [] else precision_LORE

        # Calculating the explanation size
        exp_size_LORE = len(info_LORE['tree_path']) - 1

    except Exception:
        pass

    time_LORE = time.time() - start_time

    # Anchor
    print(datetime.datetime.now(), '\tAnchor')
    start_time = time.time()
    try:
        # Explaining the instance specified by index
        exp_Anchor, info_Anchor = anchor_explainer.explain_instance(X_explain[index].reshape(1, -1),
                                                                    blackbox.predict, threshold=0.95)

        # Calculating the overall neighborhood distance w.r.t instance2explain
        Z = info_Anchor['state']['raw_data']
        Z = Z[:info_Anchor['state']['current_idx'] - 1, :]
        Z_hat = np.r_[X_explain[index].reshape(1, -1), Z]
        distances = pairwise_distances(
            Z_hat,
            Z_hat[0, :].reshape(1, -1),
            metric='euclidean').ravel()
        distance_Anchor = np.sum(distances)

        # Calculating the feature frequency variance of neighborhood
        Z_hat = Z_hat[np.random.choice(range(Z_hat.shape[0]), np.min([Z_hat.shape[0], 1000]), replace=False)]
        cv_Anchor = variation(Z_hat, axis=0)
        cv_Anchor[np.isnan(cv_Anchor)] = 0
        cv_cv_Anchor = variation(cv_Anchor)

        # Measuring the balance rate of neighborhood samples
        n_samples_Anchor = Z.shape[0]
        predictions = blackbox.predict(Z)
        ones = np.sum(predictions)
        balance_rate_X_Anchor = 1 - np.abs(0.5 - (ones / n_samples_Anchor))

        # Extracting the predicted labels by black-box and interpretable model
        y_X_bb_Anchor = blackbox.predict(Z)
        y_X_dt_Anchor = blackbox.predict(Z)
        y_x_bb_Anchor = y_explain[index]

        # Printing the explanation rule
        rule_Anchor = anchor2arule(exp_Anchor)
        print(rule_Anchor)

        # Calculating the global coverage
        covered_Anchor = get_covered(rule_Anchor, dfX_explain, dataset)
        coverage_Anchor = (len(covered_Anchor) / len(dfX_explain))

        # Calculating fidelity metrics for the explained instance and its neighborhood samples
        if len(covered_Anchor) > 0:
            if isinstance(y_explain[0], str):
                y_x_dt_Anchor = mode(y_explain[covered_Anchor])
            else:
                y_x_dt_Anchor = int(np.round(y_explain[covered_Anchor].mean()))
        else:
            y_x_dt_Anchor = y_x_bb_Anchor
        fidelity_x_Anchor = hit_outcome(y_x_bb_Anchor, y_x_dt_Anchor)
        fidelity_X_Anchor = f1_score(y_X_bb_Anchor, y_X_dt_Anchor)

        # Calculating the local coverage
        dfZ = build_df2explain(blackbox, Z, dataset).to_dict('records')
        covered_X_Anchor = get_covered(rule_Anchor, dfZ, dataset)
        coverage_X_Anchor = (len(covered_X_Anchor) / len(Z))

        # Measuring the precision score based on the global coverage
        precision_Anchor = [hit_outcome(v, y_x_dt_Anchor) for v in y_explain[covered_Anchor]]
        precision_Anchor = 0 if precision_Anchor == [] else precision_Anchor

        # Calculating the explanation size
        exp_size_Anchor = len(rule_Anchor)

    except Exception:
        pass

    time_Anchor = time.time() - start_time

    # LIME
    print(datetime.datetime.now(), '\tLIME')
    start_time = time.time()
    try:
        # Creating LIME tabular explainer
        exp_LIME = LimeTabularExplainer(X_explain,
                                        feature_names=feature_names,
                                        class_names=possible_outcomes,
                                        categorical_features=discrete_indices,
                                        categorical_names=discrete_names,
                                        verbose=False)
        # Finding the number of explanation features that results
        # in the highest score of the interpretable model
        score = []
        for i in range(2, 11):
            exp, Zlr, Z, lr = exp_LIME.explain_instance(X_explain[index],
                                                        blackbox.predict_proba,
                                                        num_features=i,
                                                        num_samples=5000)
            score.append(exp.score)
        num_features = score.index(max(score)) + 2

        # Explaining the instance using the best number of features
        exp, Zlr, Z, lr = exp_LIME.explain_instance(X_explain[index],
                                                    blackbox.predict_proba,
                                                    num_features=num_features,
                                                    num_samples=5000)

        # Extracting the information provided by the feature importance explanation
        used_features_idx = list()
        used_features_importance = list()
        logic_explanation = list()
        for idx, weight in exp.local_exp[1]:
            used_features_idx.append(idx)
            used_features_importance.append(weight)
            logic_explanation.append(exp.domain_mapper.discretized_feature_names[idx])

        # Printing the feature importance explanation
        for feature, weight in zip(logic_explanation, used_features_importance):
            print(feature, weight)

        # Extracting the predicted labels by black-box and interpretable model
        y_x_bb_LIME = blackbox.predict(Z[0].reshape(1, -1))[0]
        y_x_lr_LIME = np.round(lr.predict(Zlr[0, used_features_idx].reshape(1, -1))).astype(int)[0]
        y_X_bb_LIME = blackbox.predict(Z)
        y_X_lr_LIME = np.round(lr.predict(Zlr[:, used_features_idx])).astype(int)

        # Calculating fidelity metrics for the explained instance and its neighborhood samples
        fidelity_x_LIME = hit_outcome(y_x_bb_LIME, y_x_lr_LIME)
        fidelity_X_LIME = f1_score(y_X_bb_LIME, y_X_lr_LIME)

        # Calculating the explanation size
        exp_size_LIME = num_features

    except Exception:
        pass

    time_LIME = time.time() - start_time

    # Returning the achieved results
    results = '%d,%d,%.3f,%.3f,%.3f,%.3f,%.3f,%d,%d,%.3f,%.3f,%d,%d,%.3f,%.3f,%.3f,%.3f,%.3f,%d,%d,%.3f,%.3f,' \
          '%d,%d,%.3f,%.3f,%.3f,%.3f,%.3f,%d,%d,%.3f,%.3f,%d,%d,%.3f,%.3f,%s,%s,%s,%s,%s,%s' \
          % (fidelity_x_EXPLAN, exp_size_EXPLAN, cv_cv_EXPLAN, np.mean(precision_EXPLAN), fidelity_X_EXPLAN,
            coverage_EXPLAN, coverage_X_EXPLAN, n_samples_EXPLAN, distance_EXPLAN, balance_rate_X_EXPLAN, time_EXPLAN,
            fidelity_x_LORE, exp_size_LORE, cv_cv_LORE, np.mean(precision_LORE), fidelity_X_LORE,
            coverage_LORE, coverage_X_LORE, n_samples_LORE, distance_LORE, balance_rate_X_LORE, time_LORE,
            fidelity_x_Anchor, exp_size_Anchor, cv_cv_Anchor, np.mean(precision_Anchor), fidelity_X_Anchor,
            coverage_Anchor, coverage_X_Anchor, n_samples_Anchor, distance_Anchor, balance_rate_X_Anchor, time_Anchor,
            fidelity_x_LIME, exp_size_LIME, fidelity_X_LIME, time_LIME,
            'EXPLAN Rule -> ', rule_EXPLAN, 'LORE Rule ->', rule_LORE, 'Anchor Rule ->', rule_Anchor)
    return results
Example #17
from lime.lime_tabular import LimeTabularExplainer
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston

# Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this example needs an older scikit-learn.
boston = load_boston()

x_train, x_test, y_train, y_test = train_test_split(boston.data,
                                                    boston.target,
                                                    train_size=0.8)

rf = RandomForestRegressor(n_estimators=1000)

rf.fit(x_train, y_train)

categorical_features = np.argwhere(
    np.array(
        [len(set(boston.data[:, x]))
         for x in range(boston.data.shape[1])]) <= 10).flatten()
explainer = LimeTabularExplainer(x_train,
                                 categorical_features=categorical_features,
                                 feature_names=boston.feature_names,
                                 class_names=['price'],
                                 verbose=True,
                                 mode='regression')

exp = explainer.explain_instance(x_test[0], rf.predict, num_features=5)

print(exp.as_list())
Example #18
class GLADEnsembleLimeExplainer(object):
    """ Generates explanations with LIME

    Explainer and Describer are two different concepts:
        - Explainer tells us why the detector considers an instance an anomaly.
        - Describer generates a compact description for one (or more) instances.

    Must install LIME. Use the following command:
        pip install lime

    References:
        "Why Should I Trust You?" Explaining the Predictions of Any Classifier
            by Marco Tulio Ribeiro, Sameer Singh and Carlos Guestrin, KDD, 2016.
            https://marcotcr.github.io/lime
    """
    def __init__(self, x, y, ensemble, afss, feature_names=None):
        self.members = ensemble.get_members()
        self.afss = afss
        self.describer = GLADRelevanceDescriber(x, y, model=afss, opts=None)
        logger.debug("#ensemble members: %d" % len(self.members))
        try:
            import lime
            from lime.lime_tabular import LimeTabularExplainer
            logger.debug("loaded LIME")
            self.explainer = LimeTabularExplainer(x,
                                                  mode="regression",
                                                  feature_names=feature_names,
                                                  random_state=42)
        except Exception:
            self.explainer = None
            logger.warning(
                "Failed to load LIME. Install LIME with command: 'pip install lime' or "
                "see: https://marcotcr.github.io/lime")
            print(
                "WARNING: Failed to load LIME. Install LIME with command: 'pip install lime' or "
                "see: https://marcotcr.github.io/lime")

    def explain(self, inst, member_index=-1):
        """ Generates explanation with a single ensemble member

        First, finds the best member for the instance using AFSS relevance scores.
        Then employs LIME to generate the explanation.

        :param inst: 1d array of instance features
        :param member_index: int
            The index of the anomaly detector in the ensemble.
            If -1, the most relevant member as per AFSS will be used
        :return: LIME Explanation, best ensemble, member relevance
        """
        if self.explainer is None:
            print(
                "WARNING: Explainer is not initialized. No explanations generated."
            )
            logger.warning(
                "Explainer is not initialized. No explanations generated.")
            return None, None, None
        member_relevance = None
        if member_index < 0:
            member_relevance, ranks_all, best_member = self.describer.get_member_relevance_scores_ranks(
                np.reshape(inst, (1, -1)))
            member_index = best_member[0]
            # logger.debug("best member index: %d" % member_index)
        explanation = self.explainer.explain_instance(
            inst, predict_fn=self.members[member_index].decision_function)
        return explanation, member_index, member_relevance
Example #19
predict_fn = lambda x: blackbox_model.predict_proba(oh_enc.transform(x))
np.random.seed(1)
explainer = LimeTabularExplainer(X_train, class_names=['no_weapon', 'weapon'], feature_names=use_names,
                                 categorical_features=range(len(use_names)), categorical_names=categorical_names,
                                 mode='classification')
y_pred = predict_fn(X_test)
has_weapon_idx = []
for i in range(y_pred.shape[0]):
    if y_pred[i, 0] < y_pred[i, 1]:
        has_weapon_idx.append(i)

idx = has_weapon_idx[100]
idx = has_weapon_idx[350]
fi = dict()
for i in range(100):
    exp = explainer.explain_instance(X_test[idx], predict_fn, num_features=8, labels=[0, 1])
    #print('test sample {} prediction: {}, explanation for its predicted class:'.format(i, y_pred[idx]), 
    #       exp.as_list(y_pred[idx, 0] < y_pred[idx, 1]))
    for item in exp.as_list(1):
        if item[0] not in fi:
            fi[item[0]] = []
        fi[item[0]].append(item[1])

for feature in fi.keys():
    temp = np.array(fi[feature])
    print('*' * 20)
    print('feature name:{}'.format(feature))
    print('count:{}'.format(temp.shape[0]))
    print('std:{}'.format(temp.std()))
    print('max:{}'.format(temp.max()))
    print('min:{}'.format(temp.min()))
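
The loop above reports count, std, max and min per feature but not the central tendency; a small follow-up sketch (using only the fi dict and numpy already in scope) averages each feature's weights across the repeated runs:

# Mean LIME weight per feature across the 100 repeated explanations.
mean_fi = {feature: float(np.mean(vals)) for feature, vals in fi.items()}
for feature, m in sorted(mean_fi.items(), key=lambda kv: -abs(kv[1])):
    print('{}: mean weight {:.4f}'.format(feature, m))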
Example #20
class interactive_iml_tool:
    def __init__(self, predictor, iml_methods, nn_forecasters, data, target,
                 features, target_scaler):
        """Init function
        Args:
            predictor:
            iml_methods:
            nn_forecasters:
            data:
            target:
            features:
            scaler:
        """
        self.lionet = iml_methods['LioNets']
        self.lime = LimeTabularExplainer(
            training_data=data.reshape(len(data), -1),
            discretize_continuous=False,
            mode="regression",
            random_state=0)
        self.ipca = {}
        self.ipca['LioNets'] = iPCA(self.lionet.give_me_the_neighbourhood,
                                    'local')
        self.ipca['Lime'] = iPCA(self.lime.give_me_the_neighbourhood, 'local',
                                 self._lime_predict)

        self.predictor = predictor
        self.forecaster = nn_forecasters['forecaster']
        self.nbeats = nn_forecasters['nbeats']
        self.xyz7_model = nn_forecasters['xyz7_model']
        self.features = features
        self.target_scaler = target_scaler

        self.num_features = data.shape[-1]
        self.time_window = data.shape[-2]
        self.input_dim = (self.time_window, self.num_features)
        self.forecast_window = self.forecaster.output_shape[-2]

        temp_train = data.reshape(-1, self.num_features)
        self.global_mean, self.global_std = [], []
        for i in range(self.num_features):
            self.global_mean.append(temp_train[:, i].mean())
            self.global_std.append(temp_train[:, i].std())

        self.min_target = target.min()
        self.max_target = target.max()

    def _lime_predict(self, instance):
        t_instance = np.array([instance]).reshape(
            (len(instance), self.time_window, self.num_features))
        a = self.predictor.predict(t_instance)
        a = np.array([i[0] for i in a])
        return a

    def load_instance(self, instance):

        self.instance = instance
        model = Ridge(alpha=0.0001, fit_intercept=True, random_state=0)

        # Lionets weights
        lionet_weights, real_prediction, local_prediction = self.lionet.explain_instance(
            self.instance, 200, model)

        # Lime weights
        explanation, _, _, _ = self.lime.explain_instance(
            self.instance.flatten(),
            predict_fn=self._lime_predict,
            num_features=700)
        weights = OrderedDict(explanation.as_list())
        lime_w = dict(
            sorted(
                zip(list([int(wk) for wk in weights.keys()]),
                    list(weights.values()))))
        lime_weights = np.array([lime_w[o] for o in lime_w.keys()])

        # iPCA
        pca_weights = {}
        for method in self.ipca.keys():
            [timestep_weights, feature_weights
             ], _ = self.ipca[method].find_importance(self.instance, 300,
                                                      model)
            timestep_weights = timestep_weights.flatten()
            pca_weights[method] = [timestep_weights, feature_weights]

        self.weights_dict = {
            'LioNets': {
                False: [lionet_weights],
                True: pca_weights['LioNets']
            },
            'Lime': {
                False: [lime_weights],
                True: pca_weights['Lime']
            }
        }

        ipca_options = [False, True]

        # Original stats
        self.original_instance_statistics = {'LioNets': {}, 'Lime': {}}
        for method, enable_ipca in product(self.weights_dict.keys(),
                                           ipca_options):
            self.original_instance_statistics[method][
                enable_ipca] = self.moded_instance_statistics(
                    self.instance, method, enable_ipca)

        # Recommend modifications
        self.recommendation = {'LioNets': {}, 'Lime': {}}
        for method, enable_ipca in product(self.weights_dict.keys(),
                                           ipca_options):
            self.recommendation[method][
                enable_ipca] = self.recommend_modifications(
                    method, enable_ipca)

        self.seeFtr = 1
        self.original_preds, self.original_ftr_all, self.original_ftr_stats = [], [], []
        self.mod_preds, self.mod_ftr_all, self.mod_ftr_stats = [], [], []
        self.expected_preds, self.expected_ftr_all, self.expected_ftr_stats = [], [], []
        self.load_UI()

    def modify(self,
               weights,
               ftr,
               mod,
               mod_range,
               uni_mod_val=0,
               uni_wght_sign=1,
               select_target=0,
               xyz7_tm=0,
               forecast_option=6):

        start, end = mod_range[0], mod_range[1]

        mod_instance = self.instance.copy()
        local_mean = self.instance[start - 1:end, ftr].mean()

        # ---MODS---
        if mod == 1:  # Uniform
            for i in range(start - 1, end):
                if weights.reshape(
                        self.input_dim)[i, ftr] > 0 and uni_wght_sign > 0:
                    mod_instance[i, ftr] = mod_instance[i, ftr] + uni_mod_val
                if weights.reshape(
                        self.input_dim)[i, ftr] < 0 and uni_wght_sign < 0:
                    mod_instance[i, ftr] = mod_instance[i, ftr] + uni_mod_val
        elif mod == 2:  # Local Mean
            mod_instance[start - 1:end, ftr] = local_mean
        elif mod == 3:  # Global Mean
            mod_instance[start - 1:end, ftr] = self.global_mean[ftr]
        elif mod == 4:  # Zeros
            mod_instance[start - 1:end, ftr] = 0
        elif mod == 5:  # Gaussian Noise
            for i in range(start - 1, end):
                np.random.seed(2000 + i)
                gaussian_noise = np.random.normal(self.global_mean[ftr],
                                                  self.global_std[ftr], 1) / 10
                mod_instance[i, ftr] += gaussian_noise[0]
            np.clip(mod_instance, 0, 1, out=mod_instance)
        elif mod == 6:  # Neural Forecaster
            prediction = self.forecaster.predict(
                np.expand_dims(mod_instance, axis=0))
            prediction = prediction.squeeze()
            mod_instance = np.append(mod_instance, prediction, axis=0)
            mod_instance = mod_instance[self.forecast_window:]
        elif mod == 7:  # Static Forecaster
            for i in range(mod_instance.shape[1]):
                dif = mod_instance[-1, i] - mod_instance[
                    -(self.forecast_window + 1):-1, i]
                temp = np.flip(dif) + mod_instance[-1, i]
                mod_instance[:, i] = np.append(
                    mod_instance[self.forecast_window:, i], temp)
                np.clip(mod_instance[:, i], 0, 1, out=mod_instance[:, i])
        elif mod == 8:  # NBeats Forecaster
            prediction = self.nbeats.predict(
                np.expand_dims(mod_instance, axis=0))
            prediction = prediction.squeeze()
            mod_instance = np.append(mod_instance, prediction, axis=0)
            mod_instance = mod_instance[self.forecast_window:]
        elif mod == 9:  # XYZ7 Forecaster
            start = xyz7_tm * self.forecast_window
            end = self.time_window - self.forecast_window + start
            mod_instance = mod_instance[start:end]
            prediction = self.xyz7_model.predict([
                np.expand_dims(mod_instance, axis=0),
                np.array(self.target_scaler.transform([[select_target]]))
            ])
            prediction = prediction.squeeze()
            mod_instance = np.append(mod_instance, prediction, axis=0)

        return mod_instance

    def moded_instance_statistics(self, temp_instance, iml_method,
                                  enable_ipca):

        model = Ridge(alpha=0.0001, fit_intercept=True, random_state=0)

        if enable_ipca:
            real_prediction = self.predictor.predict(
                np.expand_dims(temp_instance, axis=0)).squeeze()
            [timestep_weights, feature_weights
             ], local_prediction = self.ipca[iml_method].find_importance(
                 temp_instance, 300, model)
            weights = timestep_weights.flatten()
        elif iml_method == 'LioNets':
            weights, real_prediction, local_prediction = self.lionet.explain_instance(
                temp_instance, 200, model)
        else:
            real_prediction = self.predictor.predict(
                np.expand_dims(temp_instance, axis=0)).squeeze()
            explanation, _, _, _ = self.lime.explain_instance(
                temp_instance.flatten(),
                predict_fn=self._lime_predict,
                num_features=700)
            local_prediction = explanation.local_pred[0]
            weights = OrderedDict(explanation.as_list())
            lime_w = dict(
                sorted(
                    zip(list([int(wk) for wk in weights.keys()]),
                        list(weights.values()))))
            weights = np.array([lime_w[o] for o in lime_w.keys()])

        features_all = {}
        count = 0
        for j in range(self.time_window):
            count2 = 0
            for i in self.features:
                features_all.setdefault(i, []).append([
                    j, weights[count + count2], temp_instance[j][count2],
                    weights[count + count2] * temp_instance[j][count2]
                ])
                count2 = count2 + 1
            count = count + self.num_features

        if enable_ipca:
            ftr_stats = [feature_weights]
        else:
            features_std, features_mean, features_max, features_min = [], [], [], []
            for i in features_all:
                naa = np.array(features_all[i])[:, 3]
                features_std.append(naa.std())
                features_mean.append(naa.mean())
                features_max.append(naa.max())
                features_min.append(naa.min())
            ftr_stats = [
                features_mean, features_std, features_min, features_max
            ]

        return [real_prediction, local_prediction], features_all, ftr_stats

    def recommend_modifications(self, iml_method, enable_ipca):

        _, _, original_ftr_stats = self.original_instance_statistics[
            iml_method][enable_ipca]
        ftr_importance = original_ftr_stats[0]  # the mean of weights per feature

        indexed = list(enumerate(ftr_importance))
        indexed.sort(key=lambda tup: tup[1])
        cls0_ftr = list([i for i, v in indexed[:2]])
        cls1_ftr = list(reversed([i for i, v in indexed[-2:]]))
        #print("Class 0 important features:",features[cls0_ftr[0]], features[cls0_ftr[1]])
        #print("Class 1 important features:",features[cls1_ftr[0]], features[cls1_ftr[1]])

        mods = ['Original', 'Uniform', 'Mean(Local)', 'Mean(Global)', 'Zeros', \
                'Noise', 'Forecast (Neural)', 'Forecast (Static)', 'Forecast (N-Beats)']
        wghts = ['Negative Weights', 'Positive Weights']

        cls0_mod_results = []
        cls1_mod_results = []
        unif_tests = [0.1, 0.5, -0.1, -0.5]

        weights = self.weights_dict[iml_method][enable_ipca][0]
        for ftr in cls0_ftr:
            temp = []
            for v, w in zip(unif_tests, np.sign(unif_tests)):
                mod_inst = self.modify(weights, ftr, 1, (1, self.time_window),
                                       v, w)
                mod_preds = self.predictor.predict(
                    np.array([mod_inst, mod_inst]))[0]
                temp.append((mod_preds[0], ftr, 1, v, w))
            for mod in range(2, len(mods)):
                mod_inst = self.modify(weights, ftr, mod,
                                       (1, self.time_window))
                mod_preds = self.predictor.predict(
                    np.array([mod_inst, mod_inst]))[0]
                temp.append((mod_preds[0], ftr, mod))
            cls0_mod_results.append(max(temp))

        for ftr in cls1_ftr:
            temp = []
            for v, w in zip(unif_tests, -np.sign(unif_tests)):
                mod_inst = self.modify(weights, ftr, 1, (1, self.time_window),
                                       v, w)
                mod_preds = self.predictor.predict(
                    np.array([mod_inst, mod_inst]))[0]
                temp.append((mod_preds[0], ftr, 1, v, w))
            for mod in range(2, len(mods)):
                mod_inst = self.modify(weights, ftr, mod,
                                       (1, self.time_window))
                mod_preds = self.predictor.predict(
                    np.array([mod_inst, mod_inst]))[0]
                temp.append((mod_preds[0], ftr, mod))
            cls1_mod_results.append(min(temp))

        recommendation = "\t\t\t\t\t\t<<< Recommendations >>>\n\n"
        for e0, rec in enumerate(cls0_mod_results):
            if rec[2] == 1:
                recommendation += str(e0+1)+") Try the Uniform modification on feature "+str(self.features[rec[1]])+\
                " with Value: "+str(rec[3])+" on the "+str(wghts[int((1+rec[4])/2)])+" to increase the target value.\n"
            else:
                recommendation += str(e0+1)+") Try the "+str(mods[rec[2]])+" modification on feature "+str(self.features[rec[1]])+ \
                " to increase the target value.\n"

        for e1, rec in enumerate(cls1_mod_results):
            if rec[2] == 1:
                recommendation += str(e1+e0+2)+") Try the Uniform modification on feature "+str(self.features[rec[1]])+\
                " with Value: "+str(rec[3])+" on the "+str(wghts[int((1+rec[4])/2)])+" to decrease the target value.\n"
            else:
                recommendation += str(e1+e0+2)+") Try the "+str(mods[rec[2]])+" modification on feature "+str(self.features[rec[1]])+ \
                " to decrease the target value.\n"

        return recommendation

    def plot_feature(self, ftr_i, mod_ftr_i, mod, rng_sldr, uni_sldr,
                     rd_btn_uni, select_target, rd_btn_xyz7, forecast_optns,
                     iml_method, enable_ipca):

        # Recommend modifications
        print(self.recommendation[iml_method][enable_ipca])

        # Disable/Enable UI elements
        if mod >= 2 and mod <= 6:
            self.mod_settings.children = ([self.opt2_settings])
            self.modify_ftr_i.disabled = False
        elif mod == 1:
            self.mod_settings.children = ([self.opt3_settings])
            self.modify_ftr_i.disabled = False
        elif mod == 9:
            self.mod_settings.children = ([self.opt4_settings])
            self.forecast.disabled = False if rd_btn_xyz7 else True
            self.modify_ftr_i.disabled = True
        else:
            self.mod_settings.children = ([self.opt1_settings])
            self.modify_ftr_i.disabled = True

        # If a UI element has been changed other than the Feature View proceed to the modification
        if self.seeFtr == ftr_i:
            inst_mod = self.modify(
                self.weights_dict[iml_method][enable_ipca][0], mod_ftr_i - 1,
                mod, rng_sldr, uni_sldr, rd_btn_uni, select_target,
                rd_btn_xyz7, forecast_optns)
            self.mod_preds, self.mod_ftr_all, self.mod_ftr_stats = self.moded_instance_statistics(
                inst_mod, iml_method, enable_ipca)
            self.original_preds, self.original_ftr_all, self.original_ftr_stats = self.original_instance_statistics[
                iml_method][enable_ipca]
            if mod == 9 and rd_btn_xyz7:
                inst_mod = self.modify(
                    self.weights_dict[iml_method][enable_ipca][0],
                    mod_ftr_i - 1, forecast_optns, rng_sldr)
                self.expected_preds, self.expected_ftr_all, self.expected_ftr_stats = \
                    self.moded_instance_statistics(inst_mod, iml_method, enable_ipca)
        else:
            self.seeFtr = ftr_i

        # Print the predictions of Target for the original and modified instance
        print("ORIGINAL -> Real prediction: " + str(self.target_scaler.inverse_transform([[self.original_preds[0]]]).squeeze())[:7] + \
              ", Local prediction: " + str(self.target_scaler.inverse_transform([[self.original_preds[1]]]).squeeze())[:7])
        if mod == 9 and rd_btn_xyz7:
            print("EXPECTED -> Real prediction: " + str(self.target_scaler.inverse_transform([[self.expected_preds[0]]]).squeeze())[:7] + \
                  ", Local prediction: " + str(self.target_scaler.inverse_transform([[self.expected_preds[1]]]).squeeze())[:7])
        print("  MOD    -> Real prediction: " + str(self.target_scaler.inverse_transform([[self.mod_preds[0]]]).squeeze())[:7] + \
              ", Local prediction: " + str(self.target_scaler.inverse_transform([[self.mod_preds[1]]]).squeeze())[:7])

        # Plotting the figures
        to_vis = self.features
        x = np.arange(len(to_vis))
        width = 0.25 if (mod == 9 and rd_btn_xyz7) else 0.35
        align = 'edge'

        if enable_ipca:
            fig, axs = plt.subplots(1, 3, figsize=(18, 4))
            axs[1].bar(x - width,
                       self.original_ftr_stats[0],
                       width=width,
                       tick_label=to_vis,
                       align=align,
                       color='C0')
            axs[1].bar(x,
                       self.mod_ftr_stats[0],
                       width=width,
                       tick_label=to_vis,
                       align=align,
                       color='C1')
            axs[1].set_title('Feature Importance ')
            axs[1].legend(('Original', 'Modded'))
            axs[1].set_xticklabels(to_vis, rotation=45)
            if mod == 9 and rd_btn_xyz7:
                axs[1].bar(x + width,
                           self.expected_ftr_stats[0],
                           width=width,
                           tick_label=to_vis,
                           align=align,
                           color='C2')
                axs[1].legend(('Original', 'Modded', 'Expected'))
            axs[0].axis('off')
            axs[2].axis('off')
            plt.show()
        else:
            fig, axs = plt.subplots(1, 3, figsize=(18, 4), dpi=200)
            axs[0].bar(x - width,
                       self.original_ftr_stats[0],
                       width=width,
                       tick_label=to_vis,
                       align=align,
                       color='C0')
            axs[0].bar(x,
                       self.mod_ftr_stats[0],
                       width=width,
                       tick_label=to_vis,
                       align=align,
                       color='C1')
            axs[0].set_xticklabels(to_vis, rotation=45)
            axs[0].set_title('Mean')
            axs[0].legend(('Original', 'Modded'))
            axs[1].bar(x - width,
                       self.original_ftr_stats[1],
                       width=width,
                       tick_label=to_vis,
                       align=align,
                       color='C0')
            axs[1].bar(x,
                       self.mod_ftr_stats[1],
                       width=width,
                       tick_label=to_vis,
                       align=align,
                       color='C1')
            axs[1].set_xticklabels(to_vis, rotation=45)
            axs[1].set_title('STD')
            axs[2].bar(
                x - width,
                self.original_ftr_stats[2],
                width=width,
                tick_label=to_vis,
                align=align,
                color='C0',
            )
            axs[2].bar(x - width,
                       self.original_ftr_stats[3],
                       width=width,
                       tick_label=to_vis,
                       align=align,
                       color='C0')
            axs[2].bar(x,
                       self.mod_ftr_stats[2],
                       width=width,
                       tick_label=to_vis,
                       align=align,
                       color='C1')
            axs[2].bar(x,
                       self.mod_ftr_stats[3],
                       width=width,
                       tick_label=to_vis,
                       align=align,
                       color='C1')
            axs[2].set_xticklabels(to_vis, rotation=45)
            axs[2].set_title('Max and Min')
            if mod == 9 and rd_btn_xyz7:
                axs[0].bar(x + width,
                           self.expected_ftr_stats[0],
                           width=width,
                           tick_label=to_vis,
                           align=align,
                           color='C2')
                axs[1].bar(x + width,
                           self.expected_ftr_stats[1],
                           width=width,
                           tick_label=to_vis,
                           align=align,
                           color='C2')
                axs[2].bar(x + width,
                           self.expected_ftr_stats[2],
                           width=width,
                           tick_label=to_vis,
                           align=align,
                           color='C2')
                axs[2].bar(x + width,
                           self.expected_ftr_stats[3],
                           width=width,
                           tick_label=to_vis,
                           align=align,
                           color='C2')
                axs[0].legend(('Original', 'Modded', 'Expected'))
            plt.show()

        main_ftr_all = self.expected_ftr_all if mod == 9 and rd_btn_xyz7 else self.original_ftr_all
        TIMESTEPS = np.arange(self.instance.shape[0])
        plt.figure(figsize=(16, 4), dpi=200, facecolor='w', edgecolor='k')
        plt.subplot(131)
        plt.plot(TIMESTEPS,
                 np.array(main_ftr_all[self.features[ftr_i - 1]])[:, 1],
                 color='grey',
                 linestyle=':')
        plt.plot(TIMESTEPS,
                 np.array(self.mod_ftr_all[self.features[ftr_i - 1]])[:, 1],
                 color='tab:blue')
        plt.hlines(y=np.array(self.mod_ftr_all[self.features[ftr_i -
                                                             1]])[:, 1].mean(),
                   xmin=0,
                   xmax=self.time_window,
                   label='mean')
        plt.title(str("Feature\'s " + self.features[ftr_i - 1] + " influence"))
        plt.subplot(132)
        plt.plot(TIMESTEPS,
                 np.array(main_ftr_all[self.features[ftr_i - 1]])[:, 2],
                 color='grey',
                 linestyle=':')
        plt.plot(TIMESTEPS,
                 np.array(self.mod_ftr_all[self.features[ftr_i - 1]])[:, 2],
                 color='g')
        plt.hlines(y=np.array(self.mod_ftr_all[self.features[ftr_i -
                                                             1]])[:, 2].mean(),
                   xmin=0,
                   xmax=self.time_window,
                   label='mean')
        plt.title(str("Feature\'s " + self.features[ftr_i - 1] + " value"))
        plt.subplot(133)
        plt.plot(TIMESTEPS,
                 np.array(main_ftr_all[self.features[ftr_i - 1]])[:, 3],
                 color='grey',
                 linestyle=':')
        plt.plot(TIMESTEPS,
                 np.array(self.mod_ftr_all[self.features[ftr_i - 1]])[:, 3],
                 color='r')
        plt.hlines(y=np.array(self.mod_ftr_all[self.features[ftr_i -
                                                             1]])[:, 3].mean(),
                   xmin=0,
                   xmax=self.time_window,
                   label='mean')
        plt.title("Feature's " + self.features[ftr_i - 1] +
                  " influence * value")
        plt.show()

    def load_UI(self):
        '''Setting up the interactive visualization tool'''

        # UI elements
        range_slider = IntRangeSlider(value=[1, self.time_window],
                                      min=1,
                                      max=self.time_window,
                                      description="Range: ",
                                      continuous_update=False)
        view_ftr_i = IntSlider(min=1,
                               max=self.num_features,
                               value=2,
                               description="View Feature: ",
                               continuous_update=False)
        self.modify_ftr_i = IntSlider(min=1,
                                      max=self.num_features,
                                      value=2,
                                      description="Mod Feature: ",
                                      continuous_update=False)
        uniform_slider = FloatSlider(value=0,
                                     min=-1,
                                     max=1,
                                     step=0.05,
                                     description='Value:',
                                     continuous_update=False)
        radio_button_uni = RadioButtons(options=[('Positive Weights', 1),
                                                 ('Negative Weights', -1)],
                                        description='Affect:')
        select_target = BoundedFloatText(
            value=(self.min_target + self.max_target) / 2,
            min=self.min_target,
            max=self.max_target,
            layout={'width': '150px'})
        radio_button_xyz7 = RadioButtons(options=[
            ('Present (' + str(self.forecast_window) + '-last values)', 0),
            ('Future (' + str(self.forecast_window) + '-next values)', 1)
        ],
                                         description='Affect:')
        enable_iPCA = Checkbox(value=False, description='Enable iPCA')
        iml_method = ToggleButtons(options=['LioNets', 'Lime'])
        self.forecast = Dropdown(options=[('Neural', 6), ('Static', 7),
                                          ('N-Beats', 8)],
                                 description="Forecast: ")
        mod = Dropdown(options=[('Original', 0), ('Uniform', 1),
                                ('Mean (Local)', 2), ('Mean (Global)', 3),
                                ('Zeros', 4), ('Noise', 5),
                                ('Forecast (Neural)', 6),
                                ('Forecast (Static)', 7),
                                ('Forecast (N-Beats)', 8),
                                ('Forecast (XYZ7)', 9)],
                       description="Mods: ")
        jsdlink((self.modify_ftr_i, 'value'), (view_ftr_i, 'value'))

        # UI layout
        interpretable_settings = HBox(
            [Label('Interpretation method:'), iml_method, enable_iPCA])
        enable_iPCA.layout.margin = '0 0 0 -50px'
        interpretable_settings.layout.margin = '20px 0 20px 0'
        standard_settings = VBox([self.modify_ftr_i, view_ftr_i])
        xyz7_settings = VBox([
            HBox([Label('Desired Target:'), select_target]), radio_button_xyz7
        ])
        xyz7_settings.layout.margin = '0 0 0 30px'
        self.opt1_settings = VBox([mod])
        self.opt2_settings = VBox([mod, range_slider])
        self.opt3_settings = HBox([
            VBox([mod, range_slider]),
            VBox([uniform_slider, radio_button_uni])
        ])
        self.opt4_settings = HBox([VBox([mod, self.forecast]), xyz7_settings])
        self.mod_settings = VBox([])
        ui = VBox([
            interpretable_settings,
            HBox([standard_settings, self.mod_settings])
        ])

        # Starting the interactive tool
        inter = interactive_output(
            self.plot_feature, {
                'ftr_i': view_ftr_i,
                'mod_ftr_i': self.modify_ftr_i,
                'mod': mod,
                'rng_sldr': range_slider,
                'uni_sldr': uniform_slider,
                'rd_btn_uni': radio_button_uni,
                'select_target': select_target,
                'rd_btn_xyz7': radio_button_xyz7,
                'forecast_optns': self.forecast,
                'iml_method': iml_method,
                'enable_ipca': enable_iPCA
            })
        display(ui, inter)
Example #21
# Ideally these two should be equal
assert test_sample_pred == test_sample2_pred

# We could also check for drastic changes in prediction probabilities
# by putting a threshold for change in pred_prob
# print(xgbModel.predict_proba(scaler.transform(test_sample))
# print(xgbModel.predict_proba(scaler.transform(test_sample2)))
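
# A minimal sketch of that threshold check (assumes the xgbModel, scaler,
# test_sample and test_sample2 defined above; the 0.1 threshold is an
# illustrative choice, not from the original):
proba_before = xgbModel.predict_proba(scaler.transform(test_sample))[0]
proba_after = xgbModel.predict_proba(scaler.transform(test_sample2))[0]
assert np.abs(proba_before - proba_after).max() < 0.1, \
    "Drastic change in prediction probabilities"
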
"""
Explainability: We could use LIME for generating explanations for individual instances
"""
np.random.seed(1)

explainer = LimeTabularExplainer(train_impute,
                                 class_names=['pos', 'neg'],
                                 feature_names=features_impute,
                                 kernel_width=3,
                                 verbose=False)

# Choose a sample instance
instance_to_explain = test_impute[0]

exp = explainer.explain_instance(instance_to_explain,
                                 xgbModel.predict_proba,
                                 num_features=5)

assert len(exp.as_list()) == 5
print('Features responsible for prediction of instance_to_explain: ',
      exp.as_list())
exp.as_pyplot_figure()
Example #22
    def analyze_lime(self):
        "Local Interpretable Model-agnostic Explanamtions"

        train_X_imp = self.imputer.transform(self.X)

        train_X_imp_df = pd.DataFrame(train_X_imp, columns=self.features)

        # create the explainer by passing our training data,
        # setting the correct modeling mode, pass in feature names and
        # make sure we don't discretize the continuous features
        explainer = LimeTabularExplainer(train_X_imp_df,
                                         mode='regression',
                                         feature_names=self.features,
                                         random_state=RANDOM_STATE,
                                         discretize_continuous=False)

        test_X_imp = self.imputer.transform(self.X_test)

        test_X_imp_df = pd.DataFrame(test_X_imp, columns=self.features)

        # the number of features to include in our predictions
        num_features = len(self.features)
        # the index of the instance we want to explain
        exp_idx = 2
        exp = explainer.explain_instance(test_X_imp_df.iloc[exp_idx, :].values,
                                         self.estimator.predict,
                                         num_features=num_features)

        # a plot of the weights for each feature
        exp.as_pyplot_figure()

        plt.show()

        lime_expl = test_X_imp_df.apply(explainer.explain_instance,
                                        predict_fn=self.estimator.predict,
                                        num_features=num_features,
                                        axis=1)

        # get all the lime predictions
        lime_pred = lime_expl.apply(lambda x: x.local_pred[0])
        # RMSE of lime pred
        mean_squared_error(self.y_pred, lime_pred)**0.5

        # r^2 of lime predictions
        r2_score(self.y_pred, lime_pred)

        # new explainer with smaller kernel_width
        better_explainer = LimeTabularExplainer(train_X_imp_df,
                                                mode='regression',
                                                feature_names=self.features,
                                                random_state=RANDOM_STATE,
                                                discretize_continuous=False,
                                                kernel_width=1)

        better_lime_expl = test_X_imp_df.apply(
            better_explainer.explain_instance,
            predict_fn=self.estimator.predict,
            num_features=num_features,
            axis=1)

        # get all the lime predictions
        better_lime_pred = better_lime_expl.apply(lambda x: x.local_pred[0])
        # RMSE of lime pred
        mean_squared_error(self.y_pred, better_lime_pred)**0.5

        # r^2 of lime predictions
        r2_score(self.y_pred, better_lime_pred)

        # construct a DataFrame with all the feature weights and bias terms from LIME
        # create an individual dataframe for each explanation
        lime_dfs = [
            pd.DataFrame(dict(expl.as_list() + [('bias', expl.intercept[0])]),
                         index=[0]) for expl in better_lime_expl
        ]
        # then concatenate them into one big DataFrame
        lime_expl_df = pd.concat(lime_dfs, ignore_index=True)

        lime_expl_df.head()

        # scale the data
        scaled_X = (test_X_imp_df -
                    explainer.scaler.mean_) / explainer.scaler.scale_
        # calc the lime feature contributions
        lime_feat_contrib = lime_expl_df[self.features] * scaled_X

        # get the prediction and actual target values to plot
        y_test_and_pred_df = pd.DataFrame(
            np.column_stack((self.y_test, self.y_pred)),
            index=self.test_df.Player,
            columns=['true_AV_pctile', 'pred_AV_pctile'])

        # add on bias term, actual av %ile and predicted %ile
        other_lime_cols = ['bias', 'true_AV_pctile', 'pred_AV_pctile']
        lime_feat_contrib[other_lime_cols] = pd.DataFrame(
            np.column_stack((lime_expl_df.bias, y_test_and_pred_df)))

        lime_feat_contrib.sort_values('pred_AV_pctile', inplace=True)

        lime_feat_contrib.head()

        title = 'LIME Feature Contributions for each prediction in the testing data'
        fig = double_heatmap(
            lime_feat_contrib[['true_AV_pctile', 'pred_AV_pctile']].T,
            lime_feat_contrib.loc[:, :'bias'].T,
            title=title,
            cbar_label1='%ile',
            cbar_label2='contribution',
            subplot_top=0.9)
        # set the x-axis label for the bottom heatmap
        # fig has 4 axes object, the first 2 are the heatmaps, the other 2 are the colorbars
        fig.axes[1].set_xlabel('Player')
Example #23
class ModelInterface(object):

    def __init__(self, model_path, train_set_path, input_cols, classes, cat_map_path, categorical_features):
        
        # initialize model
        self._define_model(model_path)
        self._define_input_cols(input_cols)
        self._define_train_set(train_set_path)
        self._define_classes(classes)
        self._define_cat_mappers(cat_map_path, categorical_features)
        self._define_explainer()

    def _define_model(self, model_path):
        # load pickled model
        with open(model_path,'rb') as f:
            self.model = pickle.load(f)

    def _define_train_set(self, train_set_path):
        # load pickled train set
        with open(train_set_path,'rb') as f:
            self.train_set = pickle.load(f).values
                
    def _define_input_cols(self, input_cols):
        # set ordered input columns
        self.input_cols = input_cols

    def _define_cat_mappers(self, mapper_path, categorical_features):
        # define categorical mapper dictionary
        with open(mapper_path, 'rb') as f:
            cat_val_dict = pickle.load(f)
        self.categorical_features = []
        self.cat_mappers = {}
        self.cat_names = {}
        for key in categorical_features:
            self.cat_mappers[key] =  {k:v for v,k in enumerate(cat_val_dict[key])}
        for i, el in enumerate(self.input_cols):
            if el in categorical_features:
                self.cat_names[i] = cat_val_dict[el]
                self.categorical_features.append(i)

    def _define_classes(self, classes):
        # set prediction class names
        self.prediction_classes = classes

    def _define_explainer(self):
        # define explainer
        self.explainer = LimeTabularExplainer(
            training_data=self.train_set,
            feature_names=self.input_cols,
            class_names=self.prediction_classes,
            categorical_features=self.categorical_features,
            categorical_names=self.cat_names,
            discretize_continuous=True
                )

    def get_prediction(self, input_params):
        type_dict = {'time':int,
                     'day_of_week':float,
                     'source_area_code':str,
                     'recipient_area_code':float,
                     'month':float,
                     'day':float}
        # build prediction list in correct order
        predict_arr = []
        for k in self.input_cols:
            if k in input_params.keys():
                if k in self.cat_mappers.keys(): 
                    # convert type
                    label_to_lookup = input_params[k]
                    label_to_lookup = type_dict[k](label_to_lookup)
                    predict_arr.append(int(self.cat_mappers[k][label_to_lookup]))
                else:
                    predict_arr.append(type_dict[k](input_params[k]))

            else:
                print(input_params)
                return f'Error: Missing column {k} for prediction.'
        
        # convert to an array; the explainer expects a single 1-d instance
        predict_arr = np.array(predict_arr)

        # create explanation of instance
        explanation = self.explainer.explain_instance(predict_arr, self.model.predict_proba, top_labels=len(self.prediction_classes)) 

        # get prediction probabilities for each class
        predict_probs = explanation.predict_proba.tolist()
        predictions = {k:v for k,v in zip(self.prediction_classes,predict_probs)}

        # get top explanation for instance
        explanation = {k:v for (k,v) in explanation.as_list(label=explanation.top_labels[0])}
        
        # format as dictionary
        pred_dict = {'predictions':predictions,
                     'explanation':explanation}
        return pred_dict
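
# A hedged usage sketch for ModelInterface above; every path and value below
# is an illustrative placeholder, not from the original code:
# iface = ModelInterface('model.pkl', 'train.pkl',
#                        input_cols=['time', 'day_of_week', 'month', 'day'],
#                        classes=['short_call', 'long_call'],
#                        cat_map_path='cat_map.pkl',
#                        categorical_features=['day_of_week'])
# print(iface.get_prediction({'time': 13, 'day_of_week': 2.0,
#                             'month': 6.0, 'day': 15.0}))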
Example #24
feat_imp_df = explain_weights_df(model, feature_names=all_cols)
feat_imp_df.head(10)

X_train = train.values
exp_pred_df = explain_prediction_df(estimator=model,
                                    doc=X_train[0],
                                    feature_names=all_cols)

# lime
explainer = LimeTabularExplainer(X_train,
                                 mode='regression',
                                 feature_names=all_cols,
                                 categorical_features=cat_cols,
                                 random_state=1981,
                                 discretize_continuous=True)
exp = explainer.explain_instance(X_valid[10], model.predict, num_features=20)

# Shap
X_valid_rn = X_valid[random.sample(range(X_valid.shape[0]), 10000)]
shap_explainer = shap.TreeExplainer(model)
valid_shap_vals = shap_explainer.shap_values(X_valid_rn)
shap.force_plot(shap_explainer.expected_value, valid_shap_vals[0, :],
                feature_names=all_cols)
shap.force_plot(shap_explainer.expected_value, valid_shap_vals,
                feature_names=all_cols)
shap.summary_plot(valid_shap_vals,
                  feature_names=all_cols,
                  auto_size_plot=False)
shap.dependence_plot('discount_price_mean',
                     valid_shap_vals,
                     feature_names=all_cols,
                     dot_size=100)
Example #25
class XDeepLimeTabularExplainer(Explainer):
    def __init__(self,
                 predict_proba,
                 class_names,
                 feature_names,
                 train,
                 categorical_features=None,
                 categorical_names=None):
        """Init function.

        # Arguments
            predict_proba: Function. A classifier prediction probability function.
            class_names: List. A list of class names, ordered according to whatever the classifier is using.
            feature_names: List. A list of names (strings) corresponding to the columns in the training data.
            train: Array. Train data.
            categorical_features: List. A list of indices (ints) corresponding to the categorical columns. Everything else will be considered continuous. Values in these columns MUST be integers.
            categorical_names: Dict. A dict which maps from int to list of names, where categorical_names[x][y] represents the name of the yth value of column x.
        """
        Explainer.__init__(self, predict_proba, class_names)
        self.train = train
        self.feature_names = feature_names
        self.categorical_features = categorical_features
        self.categorical_names = categorical_names
        # Initialize explainer
        self.set_parameters()

    def set_parameters(self, **kwargs):
        """Parameter setter for lime_tabular.

        # Arguments
            **kwargs: Parameters setter. For more detail, please check https://lime-ml.readthedocs.io/en/latest/index.html.
        """
        train = kwargs.pop("train", self.train)
        class_names = kwargs.pop("class_names", self.class_names)
        feature_names = kwargs.pop("feature_names", self.feature_names)
        categorical_features = kwargs.pop("categorical_features",
                                          self.categorical_features)
        categorical_names = kwargs.pop("categorical_names",
                                       self.categorical_names)
        self.explainer = LimeTabularExplainer(
            train,
            class_names=class_names,
            feature_names=feature_names,
            categorical_features=categorical_features,
            categorical_names=categorical_names,
            **kwargs)

    def explain(self, instance, top_labels=None, labels=(1, ), **kwargs):
        """Generate explanation for a prediction using certain method.

        # Arguments
            instance: Array. One row of tabular data to be explained.
            top_labels: Integer. Number of labels you care about.
            labels: Tuple. Labels you care about, if top_labels is not none, it will be replaced by the predicted top labels.
            **kwarg: Parameters setter. For more detail, please check https://lime-ml.readthedocs.io/en/latest/index.html.
        """
        Explainer.explain(self, instance, top_labels=top_labels, labels=labels)
        self.explanation = self.explainer.explain_instance(
            instance,
            self.predict_proba,
            top_labels=top_labels,
            labels=self.labels,
            **kwargs)

    def show_explanation(self, span=3):
        """Visualization of explanation of lime_text.

        # Arguments
            span: Integer. Each row shows how many features.
        """
        Explainer.show_explanation(self)
        exp = self.explanation
        labels = self.labels

        print()
        print("LIME Explanation")
        print("Instance: {}".format(self.instance))
        print()

        assert hasattr(labels, '__len__')
        for label in labels:
            result = exp.intercept[label]
            local_exp = exp.local_exp[label]
            for item in local_exp:
                result += item[1]
            print("Explanation for label {}:".format(self.class_names[label]))
            print("Local Prediction:     {:.3f}".format(result))
            print("Original Prediction:  {:.3f}".format(
                self.original_pred[label]))
            print()
            exp_list = exp.as_list(label=label)
            for idx in range(len(exp_list)):
                print("  {:20} : {:.3f}  |".format(exp_list[idx][0],
                                                   exp_list[idx][1]),
                      end="")
                if idx % span == span - 1:
                    print()
            print()
            exp.as_pyplot_figure(label=label)
        plt.show()
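
# A hedged usage sketch for the wrapper above (assumes the surrounding xdeep
# module is importable; the classifier and data below are illustrative):
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
clf = RandomForestClassifier(n_estimators=100).fit(iris.data, iris.target)
xdeep_expl = XDeepLimeTabularExplainer(clf.predict_proba, iris.target_names,
                                       iris.feature_names, iris.data)
xdeep_expl.explain(iris.data[0], top_labels=1)
xdeep_expl.show_explanation(span=2)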
Example #26
def lime_inspection(X,
                    ys,
                    vectorizer,
                    summary,
                    target,
                    label,
                    num_features=5,
                    top_labels=1):
    """Inspect a random example with label `label` using LIME.

    Parameters
    ----------
    X : array-like (dense or sparse matrix)
        The design matrix.

    ys : a pd.DataFrame
        As produced by `build_experimental_dataset`.

    vectorizer : `DictVectorizer`
        Must have a `get_feature_names` method, and the resulting
        list must be aligned with the `X`.

    summary : list of dict
        As produced by `run_all_experiments`.

    target : str
        A value of 'target' in one of the dicts in `summary`.

    label : str
        A value that `target` can take on.

    num_features : int
        Number of features to display.

    top_labels : int
        Number of classes to display -- the top classes predicted
        by the model.

    Displays
    --------
    A LimeTabularExplainer summary of a randomly chosen
    example with label `label`. This is HTML displayed
    using IPython specials.
    """
    X = X.toarray()
    target_summary = next(d for d in summary if d['target'] == target)
    # The first of the models is as good as any, since
    # they were trained on random folds.
    mod = target_summary['models'][0]
    explainer = LimeTabularExplainer(
        X,
        feature_names=vectorizer.get_feature_names(),
        class_names=mod.classes_,
        discretize_continuous=True)
    # TODO: Should take care not to sample an example that was used to
    # train `mod`, but we can set this aside for now, since we're
    # basically just demoing LIME:
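    # (A sketch of such a guard follows this function.)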
    index = np.random.choice(
        [i for i, cls in enumerate(ys[target].values) if cls == label])
    exp = explainer.explain_instance(X[index],
                                     mod.predict_proba,
                                     num_features=num_features,
                                     top_labels=top_labels)
    exp.show_in_notebook(show_table=True, show_all=False)
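
# A hedged sketch of the guard mentioned in the TODO above: exclude rows that
# were used to train `mod` before sampling. `train_indices` is an assumed
# iterable of training row indices; it is not part of the original function.
def choose_unseen_index(ys, target, label, train_indices):
    """Pick a random row with the given label that `mod` was not trained on."""
    train_indices = set(train_indices)
    candidates = [i for i, cls in enumerate(ys[target].values)
                  if cls == label and i not in train_indices]
    return np.random.choice(candidates)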
Example #27
class LEAF:
    def __init__(self,
                 bb_classifier,
                 X,
                 class_names,
                 explanation_samples=5000):
        self.bb_classifier = bb_classifier
        self.X = X  # kept for the optional covariance estimate in explain_instance
        self.EX, self.StdX = np.mean(X), np.array(np.std(X, axis=0, ddof=0))
        self.class_names = class_names
        self.F = X.shape[1]  # number of features
        self.explanation_samples = explanation_samples

        # SHAP Kernel
        self.SHAPEXPL = shap.KernelExplainer(self.bb_classifier.predict_proba,
                                             self.EX,
                                             nsamples=explanation_samples)

        # LIME Kernel
        self.LIMEEXPL = LimeTabularExplainer(
            X.astype('float'),
            feature_names=X.columns.tolist(),
            class_names=self.class_names,
            discretize_continuous=False,
            sample_around_instance=True,
            # categorical_features=categorical_features,
            # feature_selection='highest_weights',
            # sample_using_pca=False,
            # weight_classifier_labels=False,
            random_state=10)
        self.metrics = None
        self.lime_avg_jaccard_bin = self.lime_std_jaccard_bin = None
        self.shap_avg_jaccard_bin = self.shap_std_jaccard_bin = None

    def explain_instance(self,
                         instance,
                         num_reps=50,
                         num_features=4,
                         neighborhood_samples=10000,
                         use_cov_matrix=False,
                         verbose=False,
                         figure_dir=None):
        npEX = np.array(self.EX)
        cls_proba = self.bb_classifier.predict_proba

        x0 = copy.deepcopy(instance)  # instance to be explained
        mockobj = mock.Mock()

        # Neighborhood random samples
        cov_matrix = np.cov(
            ((self.X - npEX) / self.StdX).T) if use_cov_matrix else 1.0
        NormV = scipy.stats.multivariate_normal.rvs(mean=np.zeros(self.F),
                                                    cov=cov_matrix,
                                                    size=neighborhood_samples,
                                                    random_state=10)

        # Get the output of the black-box classifier on x0
        output = cls_proba([x0])[0]
        label_x0 = 1 if output[1] >= output[0] else 0
        prob_x0 = output[label_x0]
        prob_x0_F, prob_x0_T = output[0], output[1]
        if verbose:
            print('prob_x0', prob_x0, '   label_x0',
                  self.class_names[label_x0])

        # Prepare instance for LIME
        lime_x0 = np.divide((x0 - npEX),
                            self.StdX,
                            where=np.logical_not(np.isclose(self.StdX, 0)))
        shap_x0 = (x0 - npEX)

        rows = None
        progbar = IntProgress(min=0, max=num_reps)
        label = Label(value="")
        display(HBox([Label("K=%d " % (num_features)), progbar, label]))

        # Explain the same instance x0 multiple times
        for rnum in range(num_reps):
            label.value = "%d/%d" % (rnum + 1, num_reps)
            R = mock.Mock()  # store all the computed metrics
            R.rnum, R.prob_x0 = rnum, prob_x0

            # Explain the instance x0 with LIME
            lime_expl = self.LIMEEXPL.explain_instance(
                np.array(x0),
                cls_proba,
                num_features=num_features,
                top_labels=1,
                num_samples=self.explanation_samples)

            # Explain x0 using SHAP
            shap_phi = self.SHAPEXPL.shap_values(x0, l1_reg="num_features(10)")
            shap_phi0 = self.SHAPEXPL.expected_value

            # Take only the top @num_features from shap_phi
            argtop = np.argsort(np.abs(shap_phi[0]))
            for k in range(len(shap_phi)):
                shap_phi[k][argtop[:(self.F - num_features)]] = 0

            # Recover both the LIME and the SHAP classifiers
            R.lime_g = get_LIME_classifier(lime_expl, label_x0, x0)
            R.shap_g = get_SHAP_classifier(label_x0, shap_phi, shap_phi0, x0,
                                           self.EX)

            #----------------------------------------------------------
            # Evaluate the white box classifiers
            EL = eval_whitebox_classifier(R,
                                          R.lime_g,
                                          npEX,
                                          self.StdX,
                                          NormV,
                                          x0,
                                          label_x0,
                                          cls_proba,
                                          "lime",
                                          precision_recalls=True)
            ES = eval_whitebox_classifier(R,
                                          R.shap_g,
                                          npEX,
                                          np.ones(len(x0)),
                                          NormV * self.StdX,
                                          x0,
                                          label_x0,
                                          cls_proba,
                                          "shap",
                                          precision_recalls=True)

            R.lime_local_discr = np.abs(
                R.lime_g.predict([lime_x0])[0] - prob_x0)
            R.shap_local_discr = np.abs(
                R.shap_g.predict([shap_x0])[0] - prob_x0)

            # Indices of the most important features, ordered by their absolute value
            R.lime_argtop = np.argsort(np.abs(R.lime_g.coef_))
            R.shap_argtop = np.argsort(np.abs(R.shap_g.coef_))

            # get the K most common features in the explanation of x0
            R.mcf_lime = tuple(
                [R.lime_argtop[-k] for k in range(1, num_features + 1)])
            R.mcf_shap = tuple(
                [R.shap_argtop[-k] for k in range(1, num_features + 1)])

            # Binary masks of the argtops
            R.lime_bin_expl, R.shap_bin_expl = np.zeros(self.F), np.zeros(
                self.F)
            R.lime_bin_expl[np.array(R.mcf_lime)] = 1
            R.shap_bin_expl[np.array(R.mcf_shap)] = 1

            # Save the Ridge regressors built by LIME and SHAP
            # lime_g_W, shap_g_W = tuple(lime_g.coef_), tuple(shap_g.coef_)
            # lime_g_w0, shap_g_w0 = lime_g.intercept_, shap_g.intercept_

            # get the appropriate R keys
            R_keys = copy.copy(R.__dict__)
            for key in copy.copy(list(R_keys.keys())):
                if key.startswith("wb_"):
                    R_keys[wb_name + key[2:]] = R_keys.pop(key)
                elif key in mockobj.__dict__:
                    del R_keys[key]

            rows = pd.DataFrame(columns=R_keys) if rows is None else rows
            rows = pd.concat(
                [rows, pd.DataFrame([{k: R.__dict__[k] for k in R_keys}])],
                ignore_index=True)
            progbar.value += 1

        label.value += " Done."

        # use the multiple explanations to compute the LEAF metrics
        # display(rows)

        # Jaccard distances between the various explanations (stability)
        lime_jaccard_mat = 1 - pdist(np.stack(rows.lime_bin_expl, axis=0),
                                     'jaccard')
        shap_jaccard_mat = 1 - pdist(np.stack(rows.shap_bin_expl, axis=0),
                                     'jaccard')
        self.lime_avg_jaccard_bin, self.lime_std_jaccard_bin = np.mean(
            lime_jaccard_mat), np.std(lime_jaccard_mat)
        self.shap_avg_jaccard_bin, self.shap_std_jaccard_bin = np.mean(
            shap_jaccard_mat), np.std(shap_jaccard_mat)

        # LIME/SHAP explanation comparisons
        lime_shap_jaccard_mat = 1 - cdist(np.stack(rows.lime_bin_expl, axis=0),
                                          np.stack(rows.shap_bin_expl, axis=0),
                                          'jaccard')
        lime_shap_avg_jaccard_bin, lime_shap_std_jaccard_bin = np.mean(
            lime_shap_jaccard_mat), np.std(lime_shap_jaccard_mat)

        # store the metrics for later use
        self.metrics = rows

        def leaf_plot(stability, method):
            fig, ax1 = plt.subplots(figsize=(6, 2.2))
            data = [
                stability.flatten(),
                1 - rows[method + '_local_discr'],
                rows[method + '_fidelity_f1'],
                # rows[method + '_prescriptivity_f1'],
                # rows[method + '_bal_prescriptivity' ],
                1 - 2 * np.abs(rows[method + '_boundary_discr'])
            ]

            # color = 'tab:red'
            ax1.tick_params(axis='both', which='major', labelsize=12)
            ax1.set_xlabel('distribution')
            ax1.set_ylabel('LEAF metrics', color='black', fontsize=15)
            ax1.boxplot(data, vert=False, widths=0.7)
            ax1.tick_params(axis='y', labelcolor='#500000')
            ax1.set_yticks(np.arange(1, len(data) + 1))
            ax1.set_yticklabels([
                'Stability', 'Local Concordance', 'Fidelity', 'Prescriptivity'
            ])
            ax1.set_xlim([-0.05, 1.05])
            ax1.invert_yaxis()

            ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
            ax2.tick_params(axis='both', which='major', labelsize=12)
            ax2.set_ylabel(
                'Values',
                color='#000080')  # we already handled the x-label with ax1
            ax2.boxplot(data, vert=False, widths=0.7)
            # ax2.boxplot([np.mean(d) for d in data], color=color)
            ax2.tick_params(axis='y', labelcolor='#000080')
            ax2.set_yticks(np.arange(1, len(data) + 1))
            ax2.set_yticklabels(
                ["  %.3f ± %.3f  " % (np.mean(d), np.std(d)) for d in data])
            ax2.invert_yaxis()

            fig.tight_layout()  # otherwise the right y-label is slightly clipped
            if figure_dir is not None:
                imgname = figure_dir + method + "_leaf.pdf"
                print('Saving', imgname)
                plt.savefig(imgname, dpi=150, bbox_inches='tight')
            plt.show()

        # Show LIME explanation
        display(HTML("<h2>LIME</h2>"))
        lime_expl.show_in_notebook(show_table=True, show_all=False)
        leaf_plot(lime_jaccard_mat, 'lime')

        # Show SHAP explanation
        display(HTML("<h2>SHAP</h2>"))
        display(shap.force_plot(shap_phi0[label_x0], shap_phi[label_x0], x0))
        leaf_plot(shap_jaccard_mat, 'shap')

        prescription = False
        if prescription:
            print("====================================================")
            lime_x1, lime_sx1 = EL
            shap_x1, shap_sx1 = ES

            print(
                'SHAP accuracy %f balanced_accuracy %f precision %f recall %f'
                % (rows.shap_prescriptivity.mean(),
                   rows.shap_bal_prescriptivity.mean(),
                   rows.shap_precision_x1.mean(), rows.shap_recall_x1.mean()))

            lime_diff = (rows.iloc[-1].lime_g.coef_ != 0) * (lime_x1 - x0)
            shap_diff = (rows.iloc[-1].shap_g.coef_ != 0) * (shap_x1 - x0)

            print(np.array(rows.iloc[-1].lime_g.coef_ != 0))
            print('lime_diff\n', lime_diff)
            print('shap_diff\n', shap_diff)

            lime_output_x1 = cls_proba([lime_x1])[0]
            shap_output_x1 = cls_proba([shap_x1])[0]
            lime_label_x1 = 1 if lime_output_x1[1] >= lime_output_x1[0] else 0
            shap_label_x1 = 1 if shap_output_x1[1] >= shap_output_x1[0] else 0

            print("LIME(x1) prob =", lime_output_x1)
            print("SHAP(x1) prob =", shap_output_x1)

            # df = pd.DataFrame([x0, x0 + shap_diff], index=['x', 'x\'']).round(2)
            # display(df.T.iloc[:math.ceil(F/2),:])
            # display(df.T.iloc[math.ceil(F/2):,:])

            # Show LIME explanation
            lime_expl = self.LIMEEXPL.explain_instance(
                np.array(shap_x1),
                cls_proba,
                num_features=num_features,
                top_labels=1,
                num_samples=self.explanation_samples)
            lime_expl.show_in_notebook(show_table=True, show_all=False)
            # leaf_plot(lime_jaccard_mat, 'lime')

            # Show SHAP explanation
            shap_phi = self.SHAPEXPL.shap_values(shap_x1,
                                                 l1_reg="num_features(10)")
            shap_phi0 = self.SHAPEXPL.expected_value
            argtop = np.argsort(np.abs(shap_phi[0]))
            for k in range(len(shap_phi)):
                shap_phi[k][argtop[:(self.F - num_features)]] = 0
            display(
                shap.force_plot(shap_phi0[shap_label_x1],
                                shap_phi[shap_label_x1], shap_x1))

    def get_R(self):
        return self.metrics

    #------------------------------------------#

    def get_lime_stability(self):
        assert self.metrics is not None
        return self.lime_avg_jaccard_bin

    def get_lime_local_concordance(self):
        assert self.metrics is not None
        return hinge_loss(np.mean(self.metrics.lime_local_discr))

    def get_lime_fidelity(self):
        assert self.metrics is not None
        return np.mean(self.metrics.lime_fidelity_f1)

    def get_lime_prescriptivity(self):
        assert self.metrics is not None
        return hinge_loss(np.mean(2 *
                                  np.abs(self.metrics.lime_boundary_discr)))

    #------------------------------------------#

    def get_shap_stability(self):
        assert self.metrics is not None
        return self.shap_avg_jaccard_bin

    def get_shap_local_concordance(self):
        assert self.metrics is not None
        return hinge_loss(np.mean(self.metrics.shap_local_discr))

    def get_shap_fidelity(self):
        assert self.metrics is not None
        return np.mean(self.metrics.shap_fidelity_f1)

    def get_shap_prescriptivity(self):
        assert self.metrics is not None
        return hinge_loss(np.mean(2 *
                                  np.abs(self.metrics.shap_boundary_discr)))
Example #28
class LimeExplainer():
    def __init__(self,
                 kernel_width=3,
                 n_features_to_plot=None,
                 tol=1e-2,
                 max_samples=256000,
                 fillna=0):
        """

        :param kernel_width: Lime parameter
        :param n_features_to_plot: # of features to show/plot
        :param tol: desired convergence tol on explanation
        :param max_samples: limit on # of samples used to create explanation
        """

        self.kernel_width = kernel_width
        self.n_features_to_plot = n_features_to_plot
        self.predict_function = None
        self.tol = tol
        self.max_samples = max_samples
        self.fillna = fillna

    def fit(self, dmd_train: DMD, model):
        is_classification = GeneralUtils.is_classification(model)

        x = dmd_train.values.astype(float, copy=True)
        nan_mask = ~numpy.isfinite(x)
        if numpy.any(nan_mask):
            logging.warning(
                "Lime cannot handle missing values; fillna={} was applied "
                "to work around this.".format(self.fillna))
            x[nan_mask] = self.fillna

        self.explainer = LimeTabularExplainer(
            training_data=x,
            mode="classification" if is_classification else "regression",
            training_labels=None,  # ???
            feature_names=dmd_train.feature_names,
            categorical_features=dmd_train.categorical_features
            if dmd_train.categorical_features is not None else [],
            ###
            categorical_names=dmd_train.categorical_encoding_by_icols,  ###
            kernel_width=self.kernel_width,
            kernel=None,
            # default is np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2))
            verbose=False,
            class_names=dmd_train.labels,
            feature_selection='auto',
            # ??? options are 'forward_selection', 'lasso_path', 'none' or 'auto'.
            discretize_continuous=True,
            discretizer='decile',
            # -- Lime discretizers do not support nans (options are 'quartile', 'decile', 'entropy')
            sample_around_instance=True,  # default is False
            random_state=0,
            training_data_stats=None)

        self.model = model
        self.predict_function = self.model.predict_proba if is_classification else self.model.predict

        if is_classification:
            labels = dmd_train.labels
            if labels is None:
                labels = set(unique_labels(dmd_train.target))
            self.labels = numpy.arange(len(labels))
        else:
            self.labels = None

        self.n_features = dmd_train.n_features
        self.n_features_to_plot = self.n_features_to_plot or dmd_train.n_features
        self.n_features_to_plot = min(self.n_features_to_plot, self.n_features)

    def explain(self, sample: numpy.ndarray) -> "dict | None":
        try:
            exp = self._converged_lime_explanation(sample)

            label = self.model.predict(sample.reshape(1, -1))
            return dict(exp.as_list(label=int(label)))
        except Exception:
            logging.exception(
                "Failed to produce lime explanation for sample {}".format(
                    sample))
            return None

    def plot(self, sample: numpy.ndarray):
        try:
            exp = self._converged_lime_explanation(sample)

            label = self.model.predict(sample.reshape(1, -1))

            if GeneralUtils.is_classification(self.model):
                exp.as_pyplot_figure(label=int(label))
            else:
                exp.as_pyplot_figure(label=None)
                plt.title("Local explanation for predicted value of %.3g" %
                          label)

            plt.tight_layout()
            plt.draw()
        except ValueError as e:
            logging.exception(
                "Failed to plot Lime for instance\n{}".format(sample))

    def _lime_explanation(self, sample, num_samples=16000):
        model_regressor = ElasticNetWrapper(random_state=0,
                                            l1_ratio=0.9,
                                            alpha=1e-3,
                                            warm_start=True,
                                            copy_X=False,
                                            selection='random',
                                            tol=1e-4)

        exp = self.explainer.explain_instance(sample.ravel(),
                                              self.predict_function,
                                              labels=self.labels,
                                              num_features=self.n_features,
                                              num_samples=num_samples,
                                              model_regressor=model_regressor)

        return exp

    def _convergence_achieved(self, lower_exp, higher_exp):
        features_to_show = sorted(higher_exp.keys(),
                                  key=lambda key: abs(higher_exp[key]),
                                  reverse=True)[:self.n_features_to_plot]
        # inefficient diff, but self.n_features_to_plot is expected to be <20
        diff = {
            k: abs(lower_exp[k] - higher_exp[k])
            for k in lower_exp if k in features_to_show
        }

        max_value = numpy.max(numpy.abs(list(higher_exp.values())))
        delta = numpy.array(list(diff.values())) / max_value
        if max(delta) < self.tol:
            converged = True
        else:
            converged = False
        return converged

    def _converged_lime_explanation(self, sample):
        def as_dict(exp):
            return {k: numpy.round(v, 5) for k, v in exp.as_list()}

        sample = numpy.array(sample, dtype=float, copy=True)
        nan_mask = ~numpy.isfinite(sample)
        if numpy.any(nan_mask):
            logging.warning(
                "Lime cannot handle missing values; fillna({}) was applied "
                "to work around this.".format(self.fillna))
            sample = numpy.copy(sample)
            sample[nan_mask] = self.fillna

        try:

            num_samples = min(16000, self.max_samples // 2)

            exp = self._lime_explanation(sample=sample,
                                         num_samples=num_samples)
            higher_exp = as_dict(exp)

            converged = False
            while not converged and num_samples < self.max_samples:
                num_samples *= 2
                lower_exp = higher_exp
                exp = self._lime_explanation(sample=sample,
                                             num_samples=num_samples)
                higher_exp = as_dict(exp)

                converged = self._convergence_achieved(lower_exp=lower_exp,
                                                       higher_exp=higher_exp)

            if not converged:
                logging.warning(
                    "Lime explainer did not converge with {} samples".format(
                        num_samples))

            return exp

        except ValueError as e:
            logging.exception(
                "Failed to explain Lime for instance\n{}".format(sample))
            raise
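
# A hedged usage sketch for LimeExplainer above (`dmd_train` is a DMD data
# wrapper and `model` a fitted estimator from the surrounding project; both
# names are assumptions here):
# lime_expl = LimeExplainer(kernel_width=3, n_features_to_plot=10, tol=1e-2)
# lime_expl.fit(dmd_train, model)
# weights = lime_expl.explain(dmd_train.values[0])  # {feature: weight}
# lime_expl.plot(dmd_train.values[0])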
Example #29
from lime.lime_tabular import LimeTabularExplainer


# Create Lime Explainer for tabular data, classification task
explainer = LimeTabularExplainer(X_train, mode='classification',
                                 class_names=['NOT churn', 'churn'],
                                 feature_names=features,
                                 categorical_names=cat_cols,
                                 categorical_features=[0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11,
                                                       12, 13, 16, 17, 18, 19, 20, 21, 22,
                                                       23, 24, 25],
                                 discretize_continuous=True)

### Explain a particular instance 
i = 10  # instance studied
expl = explainer.explain_instance(X_train2.iloc[i, :].values,  # the instance we want to explain
                                  classifier.predict_proba,
                                  num_features=len(features),  # get all features
                                  top_labels=0)  # only the most relevant labels

# Vizualisation of each feature's contribution 
expl.show_in_notebook(show_table=True, show_all=False)
# Note that the row we are explaining is displayed on the right side, in table format. More precisely, only the features used in the explanation are displayed.
# The left-most numbers reflect the predictions of the classifier used. 
# The central part reveals the average influence of that particular feature value in the final predictions. 
# These two sets of numbers should indeed convey similar information but they do not need to be exactly the same.


# Checking predictions of lime (cf true predictions)
print('True value of the observation (churn = 1)', y_train[i])
# Prediction of my model 
print('Predicted proba that customer will churn, with true model', y_proba2[i,1])
Example #30
class LIMEExplainer(BlackBoxExplainer):
    available_explanations = [Extension.GLOBAL, Extension.LOCAL]
    explainer_type = Extension.BLACKBOX
    """Defines the LIME Explainer for explaining black box models or functions.

    :param model: The model to explain or function if is_function is True.
    :type model: model that implements sklearn.predict or sklearn.predict_proba or function that accepts a 2d
        ndarray
    :param initialization_examples: A matrix of feature vector examples (# examples x # features) for
        initializing the explainer.
    :type initialization_examples: numpy.array or pandas.DataFrame or iml.datatypes.DenseData or
        scipy.sparse.csr_matrix
    :param is_function: Default set to false, set to True if passing function instead of model.
    :type is_function: bool
    :param explain_subset: List of feature indices. If specified, only selects a subset of the
        features in the evaluation dataset for explanation. The subset can be the top-k features
        from the model summary.
    :type explain_subset: list[int]
    :param nclusters: Number of means to use for approximation. A dataset is summarized with nclusters mean
        samples weighted by the number of data points they each represent. When the number of initialization
        examples is larger than (10 x nclusters), those examples will be summarized with k-means where
        k = nclusters.
    :type nclusters: int
    :param features: A list of feature names.
    :type features: list[str]
    :param classes: Class names as a list of strings. The order of the class names should match
        that of the model output.  Only required if explaining classifier.
    :type classes: list[str]
    :param verbose: If true, uses verbose logging in LIME.
    :type verbose: bool
    :param categorical_features: Categorical feature names or indexes.
        If names are passed, they will be converted into indexes first.
    :type categorical_features: Union[list[str], list[int]]
    :param show_progress: Default to 'True'.  Determines whether to display the explanation status bar
        when using LIMEExplainer.
    :type show_progress: bool
    :param transformations: sklearn.compose.ColumnTransformer or a list of tuples describing the column name and
    transformer. When transformations are provided, explanations are of the features before the transformation.
    The format for list of transformations is same as the one here:
    https://github.com/scikit-learn-contrib/sklearn-pandas.

    If the user is using a transformation that is not in the list of sklearn.preprocessing transformations that
    we support then we cannot take a list of more than one column as input for the transformation.
    A user can use the following sklearn.preprocessing  transformations with a list of columns since these are
    already one to many or one to one: Binarizer, KBinsDiscretizer, KernelCenterer, LabelEncoder, MaxAbsScaler,
    MinMaxScaler, Normalizer, OneHotEncoder, OrdinalEncoder, PowerTransformer, QuantileTransformer, RobustScaler,
    StandardScaler.

    Examples for transformations that work::

        [
            (["col1", "col2"], sklearn_one_hot_encoder),
            (["col3"], None) #col3 passes as is
        ]
        [
            (["col1"], my_own_transformer),
            (["col2"], my_own_transformer),
        ]

    Example of transformations that would raise an error since it cannot be interpreted as one to many::

        [
            (["col1", "col2"], my_own_transformer)
        ]

    This would not work, since it is ambiguous whether my_own_transformer gives a many-to-many or one-to-many
    mapping when taking a sequence of columns.
    :type transformations: sklearn.compose.ColumnTransformer or list[tuple]
    :param allow_all_transformations: Whether to allow many-to-many and many-to-one transformations.
    :type allow_all_transformations: bool
    :param model_task: Optional parameter to specify whether the model is a classification or regression model.
        In most cases, the type of the model can be inferred from the shape of the output: a classifier
        has a predict_proba method and outputs a 2-dimensional array, while a regressor has a predict method
        and outputs a 1-dimensional array.
    :type model_task: str
    """
    @init_tabular_decorator
    @init_blackbox_decorator
    def __init__(self,
                 model,
                 initialization_examples,
                 is_function=False,
                 explain_subset=None,
                 nclusters=10,
                 features=None,
                 classes=None,
                 verbose=False,
                 categorical_features=None,
                 show_progress=True,
                 transformations=None,
                 allow_all_transformations=False,
                 model_task=ModelTask.Unknown,
                 **kwargs):
        """Initialize the LIME Explainer.

        :param model: The model to explain or function if is_function is True.
        :type model: model that implements sklearn.predict or sklearn.predict_proba or function that accepts a 2d
            ndarray
        :param initialization_examples: A matrix of feature vector examples (# examples x # features) for
            initializing the explainer.
        :type initialization_examples: numpy.array or pandas.DataFrame or iml.datatypes.DenseData or
            scipy.sparse.csr_matrix
        :param is_function: Defaults to False. Set to True if passing a function instead of a model.
        :type is_function: bool
        :param explain_subset: List of feature indices. If specified, only selects a subset of the
            features in the evaluation dataset for explanation. The subset can be the top-k features
            from the model summary.
        :type explain_subset: list[int]
        :param nclusters: Number of means to use for approximation. A dataset is summarized with nclusters mean
            samples weighted by the number of data points they each represent. When the number of initialization
            examples is larger than (10 x nclusters), those examples will be summarized with k-means where
            k = nclusters.
        :type nclusters: int
        :param features: A list of feature names.
        :type features: list[str]
        :param classes: Class names as a list of strings. The order of the class names should match
            that of the model output. Only required when explaining a classifier.
        :type classes: list[str]
        :param verbose: If true, uses verbose logging in LIME.
        :type verbose: bool
        :param categorical_features: Categorical feature names or indexes.
            If names are passed, they will be converted into indexes first.
        :type categorical_features: Union[list[str], list[int]]
        :param show_progress: Defaults to True. Determines whether to display the explanation status bar
            when using LIMEExplainer.
        :type show_progress: bool
        :param transformations: sklearn.compose.ColumnTransformer or a list of tuples describing the column name and
            transformer. When transformations are provided, explanations are of the features before the transformation.
            The format for the list of transformations is the same as the one here:
            https://github.com/scikit-learn-contrib/sklearn-pandas.

        If a transformation is not in the list of sklearn.preprocessing transformations that we support, it cannot
        take a list of more than one column as input. The following sklearn.preprocessing transformations can be
        used with a list of columns, since they are already one-to-many or one-to-one: Binarizer, KBinsDiscretizer,
        KernelCenterer, LabelEncoder, MaxAbsScaler, MinMaxScaler, Normalizer, OneHotEncoder, OrdinalEncoder,
        PowerTransformer, QuantileTransformer, RobustScaler, StandardScaler.

        Examples of transformations that work::

            [
                (["col1", "col2"], sklearn_one_hot_encoder),
                (["col3"], None) #col3 passes as is
            ]
            [
                (["col1"], my_own_transformer),
                (["col2"], my_own_transformer),
            ]

        Example of a transformation that would raise an error, since it cannot be interpreted as one-to-many::

            [
                (["col1", "col2"], my_own_transformer)
            ]

        This would not work, since it is ambiguous whether my_own_transformer gives a many-to-many or one-to-many
        mapping when taking a sequence of columns.
        :type transformations: sklearn.compose.ColumnTransformer or list[tuple]
        :param allow_all_transformations: Whether to allow many-to-many and many-to-one transformations.
        :type allow_all_transformations: bool
        :param model_task: Optional parameter to specify whether the model is a classification or regression model.
            In most cases, the type of the model can be inferred from the shape of the output: a classifier
            has a predict_proba method and outputs a 2-dimensional array, while a regressor has a predict method
            and outputs a 1-dimensional array.
        :type model_task: str
        """
        # Avoid the mutable-default-argument pitfall by normalizing to a fresh list
        if categorical_features is None:
            categorical_features = []
        self._datamapper = None
        if transformations is not None:
            self._datamapper, initialization_examples = get_datamapper_and_transformed_data(
                examples=initialization_examples,
                transformations=transformations,
                allow_all_transformations=allow_all_transformations)
        wrapped_model, eval_ml_domain = _wrap_model(model,
                                                    initialization_examples,
                                                    model_task, is_function)
        super(LIMEExplainer, self).__init__(wrapped_model,
                                            is_function=is_function,
                                            model_task=eval_ml_domain,
                                            **kwargs)
        self._logger.debug('Initializing LIMEExplainer')

        self._method = 'lime'
        self.initialization_examples = initialization_examples
        self.classification = False
        self.features = initialization_examples.get_features(features=features)
        self.classes = classes
        self.nclusters = nclusters
        self.explain_subset = explain_subset
        self.show_progress = show_progress
        self.transformations = transformations
        # If categorical_features is a list of string column names instead of indexes, make sure to convert to indexes
        if not all(
                isinstance(categorical_feature, int)
                for categorical_feature in categorical_features):
            categorical_features = initialization_examples.get_column_indexes(
                self.features, categorical_features)
        # Index the categorical string columns
        self._column_indexer = initialization_examples.string_index(
            columns=categorical_features)
        function, summary = self._prepare_function_and_summary(
            self.function,
            self.original_data_ref,
            self.current_index_list,
            nclusters=nclusters,
            explain_subset=explain_subset,
            **kwargs)
        if isinstance(summary, DenseData):
            summary = summary.data
        self._lime_feature_names = [str(i) for i in range(summary.shape[1])]
        result = function(summary[0].reshape((1, -1)))
        # If result is 2D array, this is classification scenario, otherwise regression
        if len(result.shape) == 2:
            self.classification = True
            mode = ExplainType.CLASSIFICATION
        elif len(result.shape) == 1:
            self.classification = False
            mode = ExplainType.REGRESSION
        else:
            raise Exception(
                'Invalid function specified, does not conform to specifications on prediction'
            )
        self.explainer = LimeTabularExplainer(
            summary,
            feature_names=self._lime_feature_names,
            class_names=classes,
            categorical_features=categorical_features,
            verbose=verbose,
            mode=mode,
            discretize_continuous=False)
        self.explainer.function = function
        if self.classes is None and self.classification:
            raise ValueError(
                'LIME Explainer requires classes to be specified if using a classification model'
            )
        if self.classes is not None and not self.classification:
            if self.model is None:
                error = 'Classes is specified but function was predict, not predict_proba.'
            else:
                error = 'Classes is specified but model does not define predict_proba, only predict.'
            raise ValueError(error)

    @tabular_decorator
    def explain_global(self,
                       evaluation_examples,
                       sampling_policy=None,
                       include_local=True,
                       batch_size=Defaults.DEFAULT_BATCH_SIZE):
        """Explain the model globally by aggregating local explanations to global.

        :param evaluation_examples: A matrix of feature vector examples (# examples x # features) on which
            to explain the model's output.
        :type evaluation_examples: numpy.array or pandas.DataFrame or scipy.sparse.csr_matrix
        :param sampling_policy: Optional policy for sampling the evaluation examples.  See documentation on
            SamplingPolicy for more information.
        :type sampling_policy: SamplingPolicy
        :param include_local: Include the local explanations in the returned global explanation.
            If include_local is False, the local explanations are streamed and aggregated into
            the global explanation.
        :param batch_size: If include_local is False, specifies the batch size for aggregating
            local explanations to global.
        :type batch_size: int
        :return: A model explanation object containing the global explanation.
        :rtype: GlobalExplanation
        """
        kwargs = {
            ExplainParams.METHOD: ExplainType.LIME,
            ExplainParams.SAMPLING_POLICY: sampling_policy,
            ExplainParams.INCLUDE_LOCAL: include_local,
            ExplainParams.BATCH_SIZE: batch_size
        }

        if self.classification:
            kwargs[ExplanationParams.CLASSES] = self.classes
            kwargs[ExplainType.MODEL_TASK] = ExplainType.CLASSIFICATION
        else:
            kwargs[ExplainType.MODEL_TASK] = ExplainType.REGRESSION

        kwargs[ExplainParams.EVAL_DATA] = evaluation_examples.typed_dataset

        return self._explain_global(evaluation_examples, **kwargs)

    @tabular_decorator
    def explain_local(self, evaluation_examples):
        """Explain the function locally by using LIME.

        :param evaluation_examples: A matrix of feature vector examples (# examples x # features) on which
            to explain the model's output.
        :type evaluation_examples: DatasetWrapper
        :return: A model explanation object containing the local explanation.
        :rtype: LocalExplanation
        """
        if self._datamapper is not None:
            evaluation_examples = transform_with_datamapper(
                evaluation_examples, self._datamapper)

        if self._column_indexer:
            evaluation_examples.apply_indexer(self._column_indexer)

        # Take the requested feature subset before computing the explanation
        if self.explain_subset:
            evaluation_examples.take_subset(self.explain_subset)

        # sample the evaluation examples
        # note: the sampled data is also used by KNN
        if self.sampling_policy is not None and self.sampling_policy.allow_eval_sampling:
            sampling_method = self.sampling_policy.sampling_method
            max_dim_clustering = self.sampling_policy.max_dim_clustering
            evaluation_examples.sample(max_dim_clustering,
                                       sampling_method=sampling_method)
        features = self.features
        if self.explain_subset:
            features = [features[i] for i in self.explain_subset]
        kwargs = {ExplainParams.METHOD: ExplainType.LIME}
        kwargs[ExplainParams.FEATURES] = features
        kwargs[ExplainParams.NUM_FEATURES] = evaluation_examples.num_features
        original_evaluation = evaluation_examples.original_dataset
        evaluation_examples = evaluation_examples.dataset
        if len(evaluation_examples.shape) == 1:
            evaluation_examples = evaluation_examples.reshape(1, -1)

        self._logger.debug('Running LIMEExplainer')
        if self.classification:
            kwargs[ExplanationParams.CLASSES] = self.classes
            kwargs[ExplainType.MODEL_TASK] = ExplainType.CLASSIFICATION
            num_classes = len(self.classes)
            labels = list(range(num_classes))
        else:
            kwargs[ExplainType.MODEL_TASK] = ExplainType.REGRESSION
            num_classes = 1
            labels = None
        lime_explanations = []

        tqdm = get_tqdm(self._logger, self.show_progress)

        if self.explain_subset:
            self.original_data_ref[0] = original_evaluation
            self.current_index_list.append(0)
            for ex_idx, example in tqdm(enumerate(evaluation_examples)):
                self.current_index_list[0] = ex_idx
                lime_explanations.append(
                    self.explainer.explain_instance(example,
                                                    self.explainer.function,
                                                    labels=labels))
            self.current_index_list = [0]
        else:
            for ex_idx, example in tqdm(enumerate(evaluation_examples)):
                lime_explanations.append(
                    self.explainer.explain_instance(example,
                                                    self.explainer.function,
                                                    labels=labels))
        if self.classification:
            lime_values = [None] * num_classes
            for lime_explanation in lime_explanations:
                for label in labels:
                    map_values = dict(lime_explanation.as_list(label=label))
                    if lime_values[label - 1] is None:
                        lime_values[label - 1] = [[
                            map_values.get(feature, 0.0)
                            for feature in self._lime_feature_names
                        ]]
                    else:
                        lime_values[label - 1].append([
                            map_values.get(feature, 0.0)
                            for feature in self._lime_feature_names
                        ])
        else:
            lime_values = None
            for lime_explanation in lime_explanations:
                map_values = dict(lime_explanation.as_list())
                if lime_values is None:
                    lime_values = [[
                        map_values.get(feature, 0.0)
                        for feature in self._lime_feature_names
                    ]]
                else:
                    lime_values.append([
                        map_values.get(feature, 0.0)
                        for feature in self._lime_feature_names
                    ])
        expected_values = None
        if self.model is not None:
            kwargs[ExplainParams.MODEL_TYPE] = str(type(self.model))
        else:
            kwargs[ExplainParams.MODEL_TYPE] = ExplainType.FUNCTION

        kwargs[ExplainParams.CLASSIFICATION] = self.classification
        kwargs[ExplainParams.LOCAL_IMPORTANCE_VALUES] = np.array(lime_values)
        kwargs[ExplainParams.EXPECTED_VALUES] = np.array(expected_values)
        kwargs[ExplainParams.EVAL_DATA] = original_evaluation

        explanation = _create_local_explanation(**kwargs)

        # if transformations have been passed, then return raw features explanation
        raw_kwargs = _get_raw_explainer_create_explanation_kwargs(
            kwargs=kwargs)
        return explanation if self._datamapper is None else _create_raw_feats_local_explanation(
            explanation,
            feature_maps=[self._datamapper.feature_map],
            features=self.features,
            **raw_kwargs)
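
For reference, here is a minimal usage sketch for the class above. It is not part of the original source: it assumes LIMEExplainer is importable as defined here, and every other name (the iris data, the model) is an illustrative placeholder.

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hedged sketch: fit a throwaway classifier, then drive LIMEExplainer with it.
iris = load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    train_size=0.8)
model = RandomForestClassifier(n_estimators=100).fit(x_train, y_train)

# classes is mandatory here: the constructor above raises a ValueError if a
# classification model is passed without class names.
explainer = LIMEExplainer(model,
                          x_train,
                          features=iris.feature_names,
                          classes=list(iris.target_names))
local_explanation = explainer.explain_local(x_test)
global_explanation = explainer.explain_global(x_test)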
Example #31
    def test_lime_tabular_explainer_not_equal_random_state(self):
        X, y = make_classification(n_samples=1000,
                                   n_features=20,
                                   n_informative=2,
                                   n_redundant=2,
                                   random_state=10)

        rf = RandomForestClassifier(n_estimators=500, random_state=10)
        rf.fit(X, y)
        instance = np.random.RandomState(10).randint(0, X.shape[0])
        feature_names = ["feature" + str(i) for i in range(20)]

        # ----------------------------------------------------------------------
        # -------------------------Quartile Discretizer-------------------------
        # ----------------------------------------------------------------------

        # ---------------------------------[1]----------------------------------
        discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                          random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                          random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[2]----------------------------------
        discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                          random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                          random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[3]----------------------------------
        discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                          random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                          random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[4]----------------------------------
        discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                          random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        discretizer = QuartileDiscretizer(X, [], feature_names, y,
                                          random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        self.assertFalse(exp_1.as_map() != exp_2.as_map())

        # ----------------------------------------------------------------------
        # --------------------------Decile Discretizer--------------------------
        # ----------------------------------------------------------------------

        # ---------------------------------[1]----------------------------------
        discretizer = DecileDiscretizer(X, [], feature_names, y,
                                        random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        discretizer = DecileDiscretizer(X, [], feature_names, y,
                                        random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[2]----------------------------------
        discretizer = DecileDiscretizer(X, [], feature_names, y,
                                        random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        discretizer = DecileDiscretizer(X, [], feature_names, y,
                                        random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[3]----------------------------------
        discretizer = DecileDiscretizer(X, [], feature_names, y,
                                        random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        discretizer = DecileDiscretizer(X, [], feature_names, y,
                                        random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[4]----------------------------------
        discretizer = DecileDiscretizer(X, [], feature_names, y,
                                        random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        discretizer = DecileDiscretizer(X, [], feature_names, y,
                                        random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        self.assertFalse(exp_1.as_map() != exp_2.as_map())

        # ----------------------------------------------------------------------
        # --------------------------Entropy Discretizer-------------------------
        # ----------------------------------------------------------------------

        # ---------------------------------[1]----------------------------------
        discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                         random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                         random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[2]----------------------------------
        discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                         random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                         random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[3]----------------------------------
        discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                         random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                         random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[4]----------------------------------
        discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                         random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        discretizer = EntropyDiscretizer(X, [], feature_names, y,
                                         random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_2 = explainer_2.explain_instance(X[instance], rf.predict_proba,
                                             num_samples=500)

        self.assertFalse(exp_1.as_map() != exp_2.as_map())
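
The flip side of the test above, shown as a hedged sketch rather than original source: to make two LIME explanations reproducible, fix the same random_state on both the discretizer and the explainer, which is exactly what case [4] of each block asserts.

# Hedged sketch reusing X, y, rf, feature_names and instance from the test
# above; QuartileDiscretizer lives in lime.discretize.
seed = 10
disc_a = QuartileDiscretizer(X, [], feature_names, y, random_state=seed)
exp_a = LimeTabularExplainer(X,
                             feature_names=feature_names,
                             discretize_continuous=True,
                             discretizer=disc_a,
                             random_state=seed).explain_instance(
                                 X[instance], rf.predict_proba, num_samples=500)

disc_b = QuartileDiscretizer(X, [], feature_names, y, random_state=seed)
exp_b = LimeTabularExplainer(X,
                             feature_names=feature_names,
                             discretize_continuous=True,
                             discretizer=disc_b,
                             random_state=seed).explain_instance(
                                 X[instance], rf.predict_proba, num_samples=500)

# Matching seeds everywhere -> identical explanations (case [4] above).
assert exp_a.as_map() == exp_b.as_map()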
Example #32
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from lime.lime_tabular import LimeTabularExplainer
from sklearn.neighbors import NearestNeighbors


def lime_explaination(inputs, results, select_sk_id):
    '''Compute and display the LIME explanation for the selected client.

    Relies on a fitted classification pipeline `pipe` defined at module level.
    '''
    st.write(
        '*Please set the number of __features__ you want to analyse (LIME selects the most important ones first)*'
    )
    nb_features = st.slider(label='Number of Features to analyse',
                            min_value=7,
                            value=10,
                            max_value=15)
    st.write(
        '*Please set the number of __similar applications__ you want to compare with (similarity according to the most important features)*'
    )
    nb_neighbors = st.slider(
        label='Number of similar applications to consider',
        min_value=10,
        value=20,
        max_value=50)

    if st.button("Explain Results by LIME"):
        with st.spinner('Calculating...'):
            lime_explainer = LimeTabularExplainer(
                training_data=inputs.values,
                mode='classification',
                training_labels=results[['RISK_FLAG']],
                feature_names=inputs.columns)
            exp = lime_explainer.explain_instance(
                inputs.loc[select_sk_id].values,
                pipe.predict_proba,
                num_features=nb_features)
            # introduce next step
            st.write('__ - LIME explanation for the selected Client:__')
            st.write(
                '*A positive value (__Red__) __Supports__ Class 1: Failure Risk*'
            )
            st.write(
                '*A negative value (__Green__) __Contradicts__ Class 1: Failure Risk*'
            )
            # Get features_to_show list
            id_cols = [item[0] for item in exp.as_map()[1]]
            # Create inputs restricted to the features_to_show
            df_lime = inputs.filter(inputs.columns[id_cols].tolist())
            # sk_id_row = df_lime.loc[[select_sk_id]]
            # compute inputs for plots
            exp_list = exp.as_list()
            vals = [x[1] for x in exp_list]
            names = [x[0] for x in exp_list]
            axisgb_colors = ['#fee0d2' if x > 0 else '#c7e9c0' for x in vals]
            vals.reverse()
            names.reverse()
            colors = ['red' if x > 0 else 'green' for x in vals]
            pos = np.arange(len(exp_list)) + .5
            # create tab plot
            tab = plt.figure()
            plt.barh(pos, vals, align='center', color=colors)
            plt.yticks(pos, names)
            plt.title('Local explanation for Class 1: Failure Risk')
            st.pyplot(tab)
            # st.write(sk_id_row)
            # find nb_neighbors nearest neighbors to catch anomaly
            nearest_neighbors = NearestNeighbors(n_neighbors=nb_neighbors,
                                                 radius=0.4)
            nearest_neighbors.fit(df_lime)
            neighbors = nearest_neighbors.kneighbors(
                df_lime.loc[[select_sk_id]],
                nb_neighbors + 1,
                return_distance=False)[0]
            neighbors = np.delete(neighbors, 0)
            # compute values for neighbors, class0 and class1
            df_lime['RISK_FLAG'] = results['RISK_FLAG']
            neighbors_values = pd.DataFrame(df_lime.iloc[neighbors].mean(),
                                            index=df_lime.columns,
                                            columns=['Neighbors_Mean'])
            st.write('__ - Average Risk Flag of the Neighbors:__',
                     neighbors_values.Neighbors_Mean.tail(1).values[0])
            st.write(
                '*N.B. The neighborhood __does not__ take Risk prediction values into account*'
            )
            client_values = df_lime.loc[[select_sk_id]].T
            client_values.columns = ['Client_Value']
            class1_values = pd.DataFrame(
                df_lime[df_lime['RISK_FLAG'] == 1].mean(),
                index=df_lime.columns,
                columns=['Class_1_Mean'])
            class0_values = pd.DataFrame(
                df_lime[df_lime['RISK_FLAG'] == 0].mean(),
                index=df_lime.columns,
                columns=['Class_0_Mean'])
            any_values = pd.concat([
                class0_values.iloc[:-1], class1_values.iloc[:-1],
                neighbors_values.iloc[:-1], client_values
            ],
                                   axis=1)
            colorsList = ('tab:green', 'tab:red', 'tab:cyan', 'tab:blue')
            fig, axs = plt.subplots(nb_features,
                                    sharey='row',
                                    figsize=(8, 4 * nb_features))
            for i in np.arange(0, nb_features):
                axs[i].barh(any_values.T.index,
                            any_values.T.iloc[:, i],
                            color=colorsList)
                axs[i].set_title(str(any_values.index[i]), fontweight="bold")
                axs[i].patch.set_facecolor(axisgb_colors[i])
            st.write('__ - Details of the LIME explanation for each feature: __')
            st.write(
                '*N.B. You may compare the Client value with the mean of its Neighbors, Class 1 and Class 0*'
            )
            st.write(
                '*A light red / light green background indicates Support / Contradiction of Class 1: Failure Risk*'
            )
            st.pyplot(fig)
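
A side note, not part of the original app: exp.as_list() returns plain (feature description, weight) tuples, so the values plotted above can also be tabulated directly. A small sketch, assuming the exp object produced inside the function:

# Hedged sketch: tabulate the LIME weights next to (or instead of) the bar plot.
import pandas as pd

weights = pd.DataFrame(exp.as_list(), columns=['feature', 'weight'])
weights['supports_class_1'] = weights['weight'] > 0
print(weights.sort_values('weight', key=abs, ascending=False))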
Example #33
from lime.lime_tabular import LimeTabularExplainer
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston

# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this example requires an older scikit-learn release.
boston = load_boston()

x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, train_size=0.8)

rf = RandomForestRegressor(n_estimators=1000)

rf.fit(x_train, y_train)

categorical_features = np.argwhere(
    np.array([len(set(boston.data[:, x])) for x in range(boston.data.shape[1])]) <= 10
).flatten()
explainer = LimeTabularExplainer(x_train,
                                 categorical_features=categorical_features,
                                 feature_names=boston.feature_names,
                                 class_names=['price'],
                                 verbose=True,
                                 mode='regression')

exp = explainer.explain_instance(x_test[0], rf.predict, num_features=5)

print(exp.as_list())
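
One extra inspection worth knowing about (a sketch, not part of the original snippet): besides as_list(), the explanation object exposes the quality of the local surrogate fit, which helps judge how much to trust the listed weights. exp.score and exp.local_pred are assumed here to behave as in recent lime releases.

# Hedged sketch: check the local surrogate fit for the regression explanation.
print('Local surrogate R^2:', exp.score)            # R^2 of LIME's local linear model
print('Local surrogate prediction:', exp.local_pred)
print('Black-box prediction:', rf.predict(x_test[0].reshape(1, -1))[0])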
Example #34
    def explain_lime(self,
                     model,
                     known_examples,
                     target_example,
                     n_repeats=10,
                     n_samples=100,
                     n_features=None,
                     metric='euclidean',
                     kernel_width=10.0):
        CLASS_NAMES = ['negative', 'positive']
        FEATURE_NAMES = [
            '{r}_{c}'.format(**locals())
            for r, c in product(range(5), repeat=2)
        ]
        FEATURES = list(range(len(FEATURE_NAMES)))

        if n_features is None:
            n_features = 4 if self.rule == 0 else 3

        lime = LimeTabularExplainer(self.flat_images[known_examples],
                                    class_names=CLASS_NAMES,
                                    feature_names=FEATURE_NAMES,
                                    categorical_features=FEATURES,
                                    discretize_continuous=False,
                                    feature_selection='forward_selection',
                                    kernel_width=kernel_width,
                                    verbose=True)

        def flat_to_x(flat_images):
            n_examples = len(flat_images)
            X = np.array([
                self._flat_to_ohe(fi.reshape(5, 5))
                for fi in flat_images.astype(int)
            ],
                         dtype=np.float32)
            return np.hstack([X, np.ones((n_examples, 1))])

        pipeline = make_pipeline(PipeStep(flat_to_x), model)

        local_model = Ridge(alpha=1, fit_intercept=True, random_state=0)

        runtime = 0
        Z = np.zeros((n_repeats, 5, 5, 4))
        for i in range(n_repeats):
            t = time()
            explanation = lime.explain_instance(
                self.flat_images[target_example],
                pipeline.predict_proba,
                model_regressor=local_model,
                num_samples=n_samples,
                num_features=n_features,
                distance_metric=metric)
            runtime += time() - t

            # XXX technically the same feature can appear both as positive and
            # negative; we ignore this for now

            for feat, coeff in explanation.as_list():
                r_c, value = feat.split('=')
                r, c = r_c.split('_')
                value = _TO_OHE[_COLORS_RGB[int(value)]]
                Z[i, int(r), int(c), :] = np.array(value) * np.sign(coeff)

        Z = np.hstack([Z.reshape((n_repeats, -1)), np.ones((n_repeats, 1))])
        z = np.sum(Z, axis=0)

        return z, Z, runtime
Example #35
class LimeTabular(ExplainerMixin):
    available_explanations = ['local']
    explainer_type = 'blackbox'

    def __init__(self,
                 predict_fn,
                 data,
                 sampler=None,
                 feature_names=None,
                 feature_types=None,
                 explain_kwargs={},
                 **kwargs):

        self.data, _, self.feature_names, self.feature_types = unify_data(
            data, None, feature_names, feature_types)
        self.predict_fn = unify_predict_fn(predict_fn, self.data)

        if sampler is not None:  # pragma: no cover
            warnings.warn('Sampler interface not currently supported.')

        self.sampler = sampler
        self.explain_kwargs = explain_kwargs

        self.kwargs = kwargs
        final_kwargs = {'mode': 'regression'}
        if self.feature_names:
            final_kwargs['feature_names'] = self.feature_names
        final_kwargs.update(self.kwargs)

        self.lime = LimeTabularExplainer(self.data, **final_kwargs)

    def explain_local(self, X, y=None, name=None):
        if name is None:
            name = gen_name_from_class(self)
        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)

        predictions = self.predict_fn(X)
        pred_fn = self.predict_fn

        data_dicts = []
        for i, instance in enumerate(X):
            lime_explanation = self.lime.explain_instance(
                instance, pred_fn, **self.explain_kwargs)

            names = []
            scores = []
            values = []
            feature_idx_imp_pairs = lime_explanation.as_map()[1]
            for feat_idx, imp in feature_idx_imp_pairs:
                names.append(self.feature_names[feat_idx])
                scores.append(imp)
                values.append(instance[feat_idx])
            intercept = lime_explanation.intercept[1]

            data_dict = {
                'type': 'univariate',
                'names': names,
                'perf': perf_dict(y, predictions, i),
                'scores': scores,
                'values': values,
                'extra': {
                    'names': ['Intercept'],
                    'scores': [intercept],
                    'values': [1],
                }
            }
            data_dicts.append(data_dict)

        internal_obj = {
            'overall': None,
            'specific': data_dicts,
        }
        selector = gen_local_selector(X, y, predictions)

        return FeatureValueExplanation('local',
                                       internal_obj,
                                       feature_names=self.feature_names,
                                       feature_types=self.feature_types,
                                       name=name,
                                       selector=selector)
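
A hedged usage sketch for the wrapper above, assuming it matches the interpret package's blackbox API; the dataset and model below are illustrative placeholders. Note that predict_proba can be passed directly, since unify_predict_fn normalizes the prediction function before it reaches LIME.

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                    random_state=0)
model = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

# explain_kwargs is forwarded verbatim to LIME's explain_instance.
lime = LimeTabular(predict_fn=model.predict_proba,
                   data=X_train,
                   explain_kwargs={'num_features': 10})
local_expl = lime.explain_local(X_test[:5], y_test[:5], name='LIME')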
    def test_lime_tabular_explainer_not_equal_random_state(self):
        X, y = make_classification(n_samples=1000,
                                   n_features=20,
                                   n_informative=2,
                                   n_redundant=2,
                                   random_state=10)

        rf = RandomForestClassifier(n_estimators=500, random_state=10)
        rf.fit(X, y)
        instance = np.random.RandomState(10).randint(0, X.shape[0])
        feature_names = ["feature" + str(i) for i in range(20)]

        # ----------------------------------------------------------------------
        # -------------------------Quartile Discretizer-------------------------
        # ----------------------------------------------------------------------

        # ---------------------------------[1]----------------------------------
        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[2]----------------------------------
        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[3]----------------------------------
        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[4]----------------------------------
        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = QuartileDiscretizer(X, [],
                                          feature_names,
                                          y,
                                          random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertFalse(exp_1.as_map() != exp_2.as_map())

        # ----------------------------------------------------------------------
        # --------------------------Decile Discretizer--------------------------
        # ----------------------------------------------------------------------

        # ---------------------------------[1]----------------------------------
        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[2]----------------------------------
        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[3]----------------------------------
        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[4]----------------------------------
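        # Identical seeds everywhere (20/20): the two explanations should match.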
        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = DecileDiscretizer(X, [],
                                        feature_names,
                                        y,
                                        random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertFalse(exp_1.as_map() != exp_2.as_map())

        # ----------------------------------------------------------------------
        # --------------------------Entropy Discretizer-------------------------
        # ----------------------------------------------------------------------

        # ---------------------------------[1]----------------------------------
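        # Different discretizer seeds (20 vs 10), same explainer seed (10):
        # the explanations should differ.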
        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[2]----------------------------------
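        # Discretizer seeds differ (20 vs 10) and explainer seeds differ
        # (20 vs 10): the resulting explanations should differ.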
        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=10)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[3]----------------------------------
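        # Same discretizer seed (20) but different explainer seeds (20 vs 10):
        # the explanations should still differ.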
        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=10)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertTrue(exp_1.as_map() != exp_2.as_map())

        # ---------------------------------[4]----------------------------------
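        # Identical seeds everywhere (20/20): the two explanations should match.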
        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=20)
        explainer_1 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_1 = explainer_1.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        discretizer = EntropyDiscretizer(X, [],
                                         feature_names,
                                         y,
                                         random_state=20)
        explainer_2 = LimeTabularExplainer(X,
                                           feature_names=feature_names,
                                           discretize_continuous=True,
                                           discretizer=discretizer,
                                           random_state=20)
        exp_2 = explainer_2.explain_instance(X[instance],
                                             rf.predict_proba,
                                             num_samples=500)

        self.assertFalse(exp_1.as_map() != exp_2.as_map())
Example #37
    def test_lime_explainer_with_data_stats(self):
        np.random.seed(1)

        rf = RandomForestClassifier(n_estimators=500)
        rf.fit(self.train, self.labels_train)
        i = np.random.randint(0, self.test.shape[0])

        # Generate stats using a quartile discretizer
        discretizer = QuartileDiscretizer(self.train, [], self.feature_names, self.target_names,
                                          random_state=20)

        d_means = discretizer.means
        d_stds = discretizer.stds
        d_mins = discretizer.mins
        d_maxs = discretizer.maxs
        d_bins = discretizer.bins(self.train, self.target_names)

        # Compute feature values and frequencies of all columns
        cat_features = np.arange(self.train.shape[1])
        discretized_training_data = discretizer.discretize(self.train)

        feature_values = {}
        feature_frequencies = {}
        for feature in cat_features:
            column = discretized_training_data[:, feature]
            feature_count = collections.Counter(column)
            values, frequencies = map(list, zip(*(feature_count.items())))
            feature_values[feature] = values
            feature_frequencies[feature] = frequencies

        # Convert each bin array to a plain list, keyed by feature index
        d_bins_revised = {index: bin_.tolist()
                          for index, bin_ in enumerate(d_bins)}

        # Discretized stats
        data_stats = {}
        data_stats["means"] = d_means
        data_stats["stds"] = d_stds
        data_stats["maxs"] = d_maxs
        data_stats["mins"] = d_mins
        data_stats["bins"] = d_bins_revised
        data_stats["feature_values"] = feature_values
        data_stats["feature_frequencies"] = feature_frequencies

        data = np.zeros((2, len(self.feature_names)))
        explainer = LimeTabularExplainer(
            data, feature_names=self.feature_names, random_state=10,
            training_data_stats=data_stats, training_labels=self.target_names)

        exp = explainer.explain_instance(self.test[i],
                                         rf.predict_proba,
                                         num_features=2,
                                         model_regressor=LinearRegression())

        self.assertIsNotNone(exp)
        keys = [x[0] for x in exp.as_list()]
        self.assertEqual(1,
                         sum([1 if 'petal width' in x else 0 for x in keys]),
                         "Petal Width is a major feature")
        self.assertEqual(1,
                         sum([1 if 'petal length' in x else 0 for x in keys]),
                         "Petal Length is a major feature")
Example #38
File: lime.py  Project: zzzace2000/GAMs
class LimeTabular(ExplainerMixin):
    available_explanations = ["local"]
    explainer_type = "blackbox"

    def __init__(
        self,
        predict_fn,
        data,
        sampler=None,
        feature_names=None,
        feature_types=None,
        explain_kwargs={},
        n_jobs=1,
        **kwargs
    ):

        self.data, _, self.feature_names, self.feature_types = unify_data(
            data, None, feature_names, feature_types
        )
        self.predict_fn = unify_predict_fn(predict_fn, self.data)
        self.n_jobs = n_jobs

        if sampler is not None:  # pragma: no cover
            warnings.warn("Sampler interface not currently supported.")

        self.sampler = sampler
        self.explain_kwargs = explain_kwargs

        self.kwargs = kwargs
        final_kwargs = {"mode": "regression"}
        if self.feature_names:
            final_kwargs["feature_names"] = self.feature_names
        final_kwargs.update(self.kwargs)

        self.lime = LimeTabularExplainer(self.data, **final_kwargs)

    def explain_local(self, X, y=None, name=None):
        if name is None:
            name = gen_name_from_class(self)
        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)

        predictions = self.predict_fn(X)
        pred_fn = self.predict_fn

        data_dicts = []
        scores_list = []
        perf_list = []
        for i, instance in enumerate(X):
            lime_explanation = self.lime.explain_instance(
                instance, pred_fn, **self.explain_kwargs
            )

            names = []
            scores = []
            values = []
            feature_idx_imp_pairs = lime_explanation.as_map()[1]
            for feat_idx, imp in feature_idx_imp_pairs:
                names.append(self.feature_names[feat_idx])
                scores.append(imp)
                values.append(instance[feat_idx])
            intercept = lime_explanation.intercept[1]

            perf_dict_obj = perf_dict(y, predictions, i)

            scores_list.append(scores)
            perf_list.append(perf_dict_obj)

            data_dict = {
                "type": "univariate",
                "names": names,
                "perf": perf_dict_obj,
                "scores": scores,
                "values": values,
                "extra": {"names": ["Intercept"], "scores": [intercept], "values": [1]},
            }
            data_dicts.append(data_dict)

        internal_obj = {
            "overall": None,
            "specific": data_dicts,
            "mli": [
                {
                    "explanation_type": "local_feature_importance",
                    "value": {
                        "scores": scores_list,
                        "intercept": intercept,
                        "perf": perf_list,
                    },
                }
            ],
        }
        internal_obj["mli"].append(
            {
                "explanation_type": "evaluation_dataset",
                "value": {"dataset_x": X, "dataset_y": y},
            }
        )
        selector = gen_local_selector(X, y, predictions)

        return FeatureValueExplanation(
            "local",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=selector,
        )
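A minimal usage sketch for the wrapper above, assuming a fitted scikit-learn regressor `model` and arrays `X_train`, `X_test`, `y_test` (all placeholder names); `show` is interpret's standard dashboard helper:

    from interpret import show

    lime = LimeTabular(predict_fn=model.predict, data=X_train)
    show(lime.explain_local(X_test[:5], y_test[:5], name="LIME"))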
Example #39
def upload2():
    # ``ff`` is a module-level list of three pickle paths set elsewhere
    # (the model, the X data and the y data)
    with open(ff[0], 'rb') as file:
        model = pickle.load(file)

    with open(ff[1], 'rb') as file:
        X_data = pickle.load(file)

    with open(ff[2], 'rb') as file:
        y_data = pickle.load(file)

    # The submitted form carries the feature values for one instance
    form_values = request.form.to_dict(flat=False)
    print('form values:', form_values)
    for file in request.files.getlist("gg"):
        print(file)
    print(list(X_data.columns))

    series = pd.Series(form_values)

    import shap

    # SHAP values are computed here, but the force-plot rendering that once
    # followed was abandoned; only the LIME explanation is returned below.
    shap_explainer = shap.TreeExplainer(model)
    shap_values = shap_explainer.shap_values(X_data)

    # load the shap JS visualization code (needed for HTML force plots)
    shap.initjs()
    # Flatten the form values ({field: [value, ...]}) into a single float vector
    flat_values = [float(v) for values in series.tolist() for v in values]
    series = np.array(flat_values)
    print('instance vector:', series)

    # LIME
    from lime.lime_tabular import LimeTabularExplainer
    explainer = LimeTabularExplainer(X_data.values,  # lime expects a numpy array
                                     mode='regression',
                                     feature_names=list(X_data.columns),
                                     random_state=42,
                                     discretize_continuous=False,
                                     kernel_width=0.2)

    exp = explainer.explain_instance(series, model.predict)

    print(exp.local_pred)

    # In regression mode the default label applies; as_pyplot_figure/as_html
    # expect label indices, not column names.
    fig = exp.as_pyplot_figure()
    fig_2 = exp.as_html()
    plt.tight_layout()

    # Convert the matplotlib figure to embeddable HTML and cache a copy on disk
    html_str = mpld3.fig_to_html(fig)
    with open("templates/lime.html", "w") as html_file:
        html_file.write(html_str)
    return render_template('local_result.html',
                           LIME=html_str,
                           SH=fig_2,
                           gh=html_str)
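A tidier, hypothetical helper for the form parsing above (the name `form_to_instance` is illustrative, and it assumes the form fields arrive in the same order as `X_data.columns`):

    import numpy as np

    def form_to_instance(form):
        """Flatten a Flask form ({field: [value, ...]}) into a float vector."""
        return np.array([float(v)
                         for values in form.to_dict(flat=False).values()
                         for v in values])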
Example #40
class LimeTabular(ExplainerMixin):
    """ Exposes LIME tabular explainer from lime package, in interpret API form.
    If using this please cite the original authors as can be found here: https://github.com/marcotcr/lime/blob/master/citation.bib
    """

    available_explanations = ["local"]
    explainer_type = "blackbox"

    def __init__(self,
                 predict_fn,
                 data,
                 sampler=None,
                 feature_names=None,
                 feature_types=None,
                 explain_kwargs={},
                 n_jobs=1,
                 **kwargs):
        """ Initializes class.

        Args:
            predict_fn: Function of blackbox that takes input, and returns prediction.
            data: Data used to initialize LIME with.
            sampler: Currently unused. Due for deprecation.
            feature_names: List of feature names.
            feature_types: List of feature types.
            explain_kwargs: Kwargs that will be sent to lime's explain_instance.
            n_jobs: Number of jobs to run in parallel.
            **kwargs: Kwargs that will be sent to lime at initialization time.
        """
        from lime.lime_tabular import LimeTabularExplainer

        self.data, _, self.feature_names, self.feature_types = unify_data(
            data, None, feature_names, feature_types)
        self.predict_fn = unify_predict_fn(predict_fn, self.data)
        self.n_jobs = n_jobs

        if sampler is not None:  # pragma: no cover
            warnings.warn("Sampler interface not currently supported.")

        self.sampler = sampler
        self.explain_kwargs = explain_kwargs

        self.kwargs = kwargs
        final_kwargs = {"mode": "regression"}
        if self.feature_names:
            final_kwargs["feature_names"] = self.feature_names
        final_kwargs.update(self.kwargs)

        self.lime = LimeTabularExplainer(self.data, **final_kwargs)

    def explain_local(self, X, y=None, name=None):
        """ Generates local explanations for provided instances.

        Args:
            X: Numpy array for X to explain.
            y: Numpy vector for y to explain.
            name: User-defined explanation name.

        Returns:
            An explanation object, visualizing feature-value pairs
            for each instance as horizontal bar charts.
        """
        if name is None:
            name = gen_name_from_class(self)
        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)

        predictions = self.predict_fn(X)
        pred_fn = self.predict_fn

        data_dicts = []
        scores_list = []
        perf_list = []
        perf_dicts = gen_perf_dicts(predictions, y, False)
        for i, instance in enumerate(X):
            lime_explanation = self.lime.explain_instance(
                instance, pred_fn, **self.explain_kwargs)

            names = []
            scores = []
            values = []
            feature_idx_imp_pairs = lime_explanation.as_map()[1]
            for feat_idx, imp in feature_idx_imp_pairs:
                names.append(self.feature_names[feat_idx])
                scores.append(imp)
                values.append(instance[feat_idx])
            intercept = lime_explanation.intercept[1]

            perf_dict_obj = None if perf_dicts is None else perf_dicts[i]

            scores_list.append(scores)
            perf_list.append(perf_dict_obj)

            data_dict = {
                "type": "univariate",
                "names": names,
                "perf": perf_dict_obj,
                "scores": scores,
                "values": values,
                "extra": {
                    "names": ["Intercept"],
                    "scores": [intercept],
                    "values": [1]
                },
            }
            data_dicts.append(data_dict)

        internal_obj = {
            "overall": None,
            "specific": data_dicts,
            "mli": [{
                "explanation_type": "local_feature_importance",
                "value": {
                    "scores": scores_list,
                    "intercept": intercept,
                    "perf": perf_list,
                },
            }],
        }
        internal_obj["mli"].append({
            "explanation_type": "evaluation_dataset",
            "value": {
                "dataset_x": X,
                "dataset_y": y
            },
        })
        selector = gen_local_selector(data_dicts, is_classification=False)

        return FeatureValueExplanation(
            "local",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=selector,
        )
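A detail both wrapper classes above depend on: in regression mode, lime stores its single explanation under label 1, which is why `as_map()[1]` and `intercept[1]` appear in `explain_local`. A quick sanity sketch (`lime_explainer`, `x_instance`, and `predict_fn` are placeholder names):

    exp = lime_explainer.explain_instance(x_instance, predict_fn)
    assert set(exp.as_map()) == {1}  # regression explanations live under label 1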
Example #41
class Explainer:
    def __init__(self,
                 model,
                 df_train,
                 categorical_inputs,
                 categorical_imputer,
                 numeric_inputs,
                 numeric_imputer,
                 input_preproc,
                 class_names=None,
                 **kwargs):
        """
        Args:
            categorical_imputer: The imputer that is to be used for categorical columns.
                The imputer is not allowed to add new columns or change order of the
                existing ones.

            numeric_imputer: The imputer that is to be used for numeric columns.
                The imputer is not allowed to add new columns or change order of the
                existing ones.
        """
        self.model = model
        self.categorical_inputs = categorical_inputs
        self.categorical_imputer = categorical_imputer
        self.numeric_inputs = numeric_inputs
        self.numeric_imputer = numeric_imputer
        self.input_preproc = input_preproc
        if class_names is not None:
            class_names = [str(c) for c in class_names]

        self.interpret_preproc = make_column_transformer(
            (
                make_pipeline(
                    # wrap in a function transformer to prevent being refitted
                    FunctionTransformer(categorical_imputer.transform,
                                        validate=False),
                    OrdinalEncoder()),
                categorical_inputs),

            # wrap in a function transformer to prevent being refitted
            (FunctionTransformer(numeric_imputer.transform,
                                 validate=False), numeric_inputs))

        xx_train = self.interpret_preproc.fit_transform(
            df_train[self.categorical_inputs + self.numeric_inputs])

        if xx_train.shape[1] != len(categorical_inputs) + len(numeric_inputs):
            raise ValueError(
                "Imputers are not allowed to add new columns or to change their order."
            )

        self.ordenc = self.interpret_preproc.transformers_[0][1][1]

        try:
            cat_name_idx = dict(enumerate(self.ordenc.categories_))
            self.categorical_names = dict(
                zip(categorical_inputs, self.ordenc.categories_))
        except AttributeError:
            cat_name_idx = {}
            self.categorical_names = {}

        self.explainer = LimeTabularExplainer(
            xx_train,
            feature_names=categorical_inputs + numeric_inputs,
            class_names=class_names,
            categorical_features=range(len(categorical_inputs)),
            categorical_names=cat_name_idx,
            mode="classification"
            if is_classifier(self.model) else "regression",
            **kwargs)

        self.full_model = make_pipeline(FunctionTransformer(self._preproc_fn),
                                        self.model)

    def _preproc_fn(self, x):
        df_inst = pd.DataFrame(x,
                               columns=self.categorical_inputs +
                               self.numeric_inputs)

        if self.categorical_inputs:
            df_inst[self.categorical_inputs] = self.ordenc.inverse_transform(
                df_inst[self.categorical_inputs].values)

        x_preproc = self.input_preproc.transform(df_inst)

        return x_preproc

    def explain(self, df_inst):
        if len(df_inst.shape) == 1:
            df_inst = df_inst.to_frame().transpose()

        x_inst = self.interpret_preproc.transform(df_inst)[0]

        exp = self.explainer.explain_instance(
            x_inst, self.full_model.predict_proba if hasattr(
                self.full_model, 'predict_proba') else self.full_model.predict)

        # replace show_in_notebook with a version that works correctly in
        # Google Colab; to activate the compatible mode, pass colab_mode=True
        # when calling show_in_notebook
        exp._orig_show_in_notebook = exp.show_in_notebook
        exp.show_in_notebook = MethodType(show_in_notebook_patch, exp)

        return exp

    def pdp_plot(self, df_inputs, feature_name, **kwargs):
        xx_test = pd.DataFrame(self.interpret_preproc.transform(df_inputs),
                               columns=self.categorical_inputs +
                               self.numeric_inputs)

        pdp_goals = pdp.pdp_isolate(model=self.full_model,
                                    dataset=xx_test,
                                    model_features=self.categorical_inputs +
                                    self.numeric_inputs,
                                    feature=feature_name)

        pdp.pdp_plot(pdp_goals, feature_name, **kwargs)

        try:
            t = self.categorical_names[feature_name]
            plt.xticks(pdp_goals.feature_grids, t)
        except KeyError:
            pass

        return pdp_goals

    def permutation_importance(self, df_inputs, df_outputs, **kwargs):
        xx = self.interpret_preproc.transform(
            df_inputs[self.categorical_inputs + self.numeric_inputs])
        yy = df_outputs

        perm = eli5.sklearn.PermutationImportance(self.full_model,
                                                  **kwargs).fit(xx, yy)

        display(
            eli5.show_weights(perm,
                              feature_names=self.categorical_inputs +
                              self.numeric_inputs))

        return perm
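A hypothetical end-to-end use of this wrapper, assuming already-fitted imputers and an `input_preproc` pipeline (every name below is a placeholder):

    expl = Explainer(model, df_train,
                     categorical_inputs=cat_cols,
                     categorical_imputer=cat_imputer,
                     numeric_inputs=num_cols,
                     numeric_imputer=num_imputer,
                     input_preproc=preproc,
                     class_names=model.classes_)

    exp = expl.explain(df_test.iloc[0])
    exp.show_in_notebook(colab_mode=True)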