def fit(self, X, y, lemna_component, predict_fn, labels_num):
    # Cluster the training data, then fit one LEMNA surrogate (a mixture of
    # `lemna_component` linear models) per cluster, explained around the
    # cluster centroid.  `y` is unused here; clustering is unsupervised.
    self.cluster_labels = self.cluster_method.fit_predict(X)
    self.num_features = X.shape[1]
    for i in range(self.cluster_num):
        inds = np.where(self.cluster_labels == i)
        explainer = LimeTabularExplainer(np.squeeze(X[inds, :]),
                                         discretize_continuous=False,
                                         sample_around_instance=True)
        # retrive_model (sic — the keyword spelling belongs to the lime fork)
        # returns the fitted per-label surrogate models instead of an
        # Explanation object.
        simplified_models = explainer.explain_instance_with_lemna(
            self.cluster_method.cluster_centers_[i],
            predict_fn,
            lemna_component=lemna_component,
            num_samples=5000,
            labels=range(labels_num),
            num_features=X.shape[1],
            retrive_model=True)
        # coef_ is a 3-d matrix feature_num * lemna_component * labels_num
        # intercept is a 2-d matrix lemna_component * labels_num
        coef_ = np.zeros((X.shape[1], lemna_component, labels_num))
        intercept_ = np.zeros((1, lemna_component, labels_num))
        for idx in range(labels_num):
            coef_[:, :, idx] = simplified_models[idx].coef_
            intercept_[0, :, idx] = simplified_models[idx].intercept_
            # NOTE(review): pi_ is overwritten each iteration, so only the
            # mixture weights of the *last* label survive — confirm intent.
            pi_ = simplified_models[idx].pi_
        self.models.append((coef_, intercept_, pi_))
def fit(self, X: Any, class_names: List[str] = None) -> None:
    """Build the LIME tabular explainer over the training matrix ``X``.

    Falls back to the binary labels ``['0', '1']`` when no class names
    are supplied.
    """
    labels = ['0', '1'] if class_names is None else class_names
    self._explainer = LimeTabularExplainer(
        training_data=X,
        feature_names=list(range(X.shape[1])),
        class_names=labels,
        discretize_continuous=False,
        random_state=self._seed,
    )
def test_lime_explainer_no_regressor(self):
    """Top-2 LIME features for an RF on iris must be petal width/length."""
    np.random.seed(1)
    iris = load_iris()
    # NOTE(review): sklearn.cross_validation was removed in scikit-learn
    # 0.20; modern code should call sklearn.model_selection.train_test_split.
    train, test, labels_train, labels_test = (
        sklearn.cross_validation.train_test_split(iris.data,
                                                  iris.target,
                                                  train_size=0.80))
    rf = RandomForestClassifier(n_estimators=500)
    rf.fit(train, labels_train)
    i = np.random.randint(0, test.shape[0])
    explainer = LimeTabularExplainer(train,
                                     feature_names=iris.feature_names,
                                     class_names=iris.target_names,
                                     discretize_continuous=True)
    exp = explainer.explain_instance(test[i], rf.predict_proba,
                                     num_features=2)
    self.assertIsNotNone(exp)
    keys = [x[0] for x in exp.as_list()]
    # assertEquals is a long-deprecated alias (removed in Python 3.12);
    # use assertEqual.
    self.assertEqual(1,
                     sum([1 if 'petal width' in x else 0 for x in keys]),
                     "Petal Width is a major feature")
    self.assertEqual(1,
                     sum([1 if 'petal length' in x else 0 for x in keys]),
                     "Petal Length is a major feature")
def calcul_interpretation(clf, client_id):
    """Build a LIME explanation table for one client.

    Relies on module-level ``test_features`` / ``test_corrs_removed``.

    Args:
        clf: fitted classifier exposing ``predict_proba``.
        client_id: ``SK_ID_CURR`` identifier of the client to explain.

    Returns:
        DataFrame with columns ``exp_keys`` (feature description),
        ``exp_values`` (LIME weight) and ``color`` ('red' for positive
        contributions, 'green' otherwise), sorted by absolute weight.
    """
    test_features_filled = test_features.fillna(test_features.median())
    lime1 = LimeTabularExplainer(test_features_filled,
                                 feature_names=test_features_filled.columns,
                                 discretize_continuous=False)
    explain_data = test_features_filled.iloc[test_corrs_removed.index[
        test_corrs_removed['SK_ID_CURR'] == int(client_id)]].T.squeeze()
    exp = lime1.explain_instance(explain_data, clf.predict_proba,
                                 num_samples=1000)
    exp_list = exp.as_list()
    # Unpack (key, value) pairs directly; the original manual loop also kept
    # an unused ``exp_positives`` accumulator, which has been removed.
    exp_keys = [item[0] for item in exp_list]
    exp_values = [item[1] for item in exp_list]
    df_data = pd.DataFrame(data=[exp_keys, exp_values])
    df_data = df_data.T
    df_data.columns = ['exp_keys', 'exp_values']
    # Order rows by absolute contribution so the most influential come last.
    df_data = df_data.iloc[np.abs(df_data['exp_values'].values).argsort()]
    df_data['color'] = df_data.exp_values.apply(
        lambda x: 'red' if x > 0 else 'green')
    return df_data
def lime_tabular_global():
    """Explain every sample with LIME and dump per-class explanation pickles.

    Correctly-predicted samples are written to ``lime_<class>.pkl`` files,
    mispredicted ones to ``lime_<class>_wrong.pkl``; the two concatenations
    go to ``lime_all.pkl`` / ``lime_all_wrong.pkl``.

    Returns:
        1 if an unexpected target label (not 0-3) is encountered,
        otherwise None.
    """
    targets = ['academic', 'fiction', 'magazine', 'newspaper']
    data = pd.read_pickle('data_explain_tabular.pkl')
    clf = joblib.load('model_forest_tabular.pkl')
    feature_names = list(data)
    target = np.array(data['target'])
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # long-supported equivalent with identical output.
    data = data.drop(['target', 'year', 'ID'], axis=1).values
    explainer = LimeTabularExplainer(data, feature_names=feature_names,
                                     class_names=targets)
    N = data.shape[0]
    # One bucket list per class, split by prediction correctness; the
    # original duplicated the whole explain branch for each case.
    right = [[], [], [], []]
    wrong = [[], [], [], []]
    for i in range(N):
        pred = clf.predict(data[i].reshape(1, -1))[0]
        explanation = explainer.explain_instance(data[i], clf.predict_proba,
                                                 num_features=10,
                                                 top_labels=4)
        result = explanation.as_list(label=pred)
        if target[i] not in (0, 1, 2, 3):
            return 1
        bucket = right if pred == target[i] else wrong
        bucket[target[i]].append((result, pred))
    academic, fiction, magazine, newspaper = right
    academic_w, fiction_w, magazine_w, newspaper_w = wrong
    joblib.dump(academic, 'lime_academic.pkl')
    joblib.dump(fiction, 'lime_fiction.pkl')
    joblib.dump(magazine, 'lime_magazine.pkl')
    joblib.dump(newspaper, 'lime_newspaper.pkl')
    all_explanations = academic + fiction + magazine + newspaper
    joblib.dump(all_explanations, 'lime_all.pkl')
    joblib.dump(academic_w, 'lime_academic_wrong.pkl')
    joblib.dump(fiction_w, 'lime_fiction_wrong.pkl')
    joblib.dump(magazine_w, 'lime_magazine_wrong.pkl')
    joblib.dump(newspaper_w, 'lime_newspaper_wrong.pkl')
    all_explanations_w = academic_w + fiction_w + magazine_w + newspaper_w
    joblib.dump(all_explanations_w, 'lime_all_wrong.pkl')
def explain():
    # Flask endpoint: explain a single prediction with LIME and return the
    # rendered matplotlib figure as a PNG attachment.
    try:
        with open(CFG.TRAINING, 'rb') as f:
            training = pickle.load(f)
        my_json = request.get_json()
        encoded_dict = convert_json(my_json)
        # SECURITY: eval() on request-derived text executes arbitrary code;
        # ast.literal_eval or json.loads should be used instead.
        dictionary = eval(encoded_dict)
        # Normalize the age feature with the fitted preprocessing transform.
        normalize_age_mons = age_mons_preprocessing.transform(
            [[dictionary['age_month']]])[0, 0]
        dictionary['age_month'] = normalize_age_mons
        # Feature vector in the dict's insertion order.
        pred = np.array([x[1] for x in dictionary.items()])
        exp = LimeTabularExplainer(training.values,
                                   feature_names=training.columns,
                                   discretize_continuous=True)
        fig = exp.explain_instance(pred,
                                   model.predict_proba).as_pyplot_figure()
        # NOTE(review): assigning fig.figsize after creation has no effect;
        # fig.set_size_inches((30, 10)) would be needed — confirm intent.
        fig.figsize = (30, 10)
        plt.tight_layout()
        plt.savefig('explain.png')
        return send_file('explain.png', mimetype='image/png',
                         as_attachment=True)
    except ValueError:
        return 'Bad Request', 400
def __init__(self, bb_classifier, X, class_names, explanation_samples=5000):
    # Wrap a black-box classifier with paired SHAP and LIME explainers
    # built over the same training data X.
    self.bb_classifier = bb_classifier
    # NOTE(review): np.mean(X) has no axis argument — for a DataFrame (as
    # the .columns use below implies) this yields per-column means; verify.
    self.EX, self.StdX = np.mean(X), np.array(np.std(X, axis=0, ddof=0))
    self.class_names = class_names
    self.F = X.shape[1]  # number of features
    self.explanation_samples = explanation_samples
    # SHAP Kernel
    self.SHAPEXPL = shap.KernelExplainer(self.bb_classifier.predict_proba,
                                         self.EX,
                                         nsamples=explanation_samples)
    # LIME Kernel
    self.LIMEEXPL = LimeTabularExplainer(
        X.astype('float'),
        feature_names=X.columns.tolist(),
        class_names=self.class_names,
        discretize_continuous=False,
        sample_around_instance=True,
        # categorical_features=categorical_features,
        # feature_selection='highest_weights',
        # sample_using_pca=False,
        # weight_classifier_labels=False,
        random_state=10)
    # Jaccard-stability metrics, populated later by the evaluation code.
    self.metrics = None
    self.lime_avg_jaccard_bin = self.lime_std_jaccard_bin = None
    self.shap_avg_jaccard_bin = self.shap_std_jaccard_bin = None
def lime(self, instance=None, html_file=False, num_features=2):
    """Explain one test-set row with LIME.

    :param instance: index into x_test; a random index is drawn when None.
    :param html_file: when truthy, also save the explanation as HTML.
    :param num_features: number of features shown in the explanation.
    :return: None (prints the case and shows a pyplot figure).
    """
    explainer = LimeTabularExplainer(self.x_train.values,
                                     mode="classification",
                                     feature_names=self.x_train.columns,
                                     class_names=['false', 'true'],
                                     training_labels=self.y_train,
                                     discretize_continuous=True)
    # BUGFIX: `if not instance` also fired for instance == 0, silently
    # replacing an explicitly requested index 0 with a random row.
    if instance is None:
        instance = np.random.randint(0, self.x_test.shape[0])
    print('Case: ' + str(instance))
    print('Label: ' + str(self.y_test.iloc[instance]))
    exp = explainer.explain_instance(self.x_test.values[instance],
                                     self.model.predict_proba,
                                     num_features=num_features)
    print("Lime explanation: ")
    exp.as_pyplot_figure(label=1).show()
    if html_file:
        exp.save_to_file(
            str(instance) + "_" + str(self.y_test.iloc[instance]) +
            "_explain.html")
def test_lime_explainer_entropy_discretizer(self):
    """The entropy discretizer should still surface petal width/length."""
    np.random.seed(1)
    rf = RandomForestClassifier(n_estimators=500)
    rf.fit(self.train, self.labels_train)
    i = np.random.randint(0, self.test.shape[0])
    explainer = LimeTabularExplainer(self.train,
                                     feature_names=self.feature_names,
                                     class_names=self.target_names,
                                     training_labels=self.labels_train,
                                     discretize_continuous=True,
                                     discretizer='entropy')
    exp = explainer.explain_instance(self.test[i],
                                     rf.predict_proba,
                                     num_features=2)
    self.assertIsNotNone(exp)
    keys = [pair[0] for pair in exp.as_list()]
    print(keys)
    checks = (('petal width', "Petal Width is a major feature"),
              ('petal length', "Petal Length is a major feature"))
    for needle, message in checks:
        hits = sum(1 for key in keys if needle in key)
        self.assertEqual(1, hits, message)
def test_lime_explainer_good_regressor(self):
    """A plain LinearRegression works as the local surrogate model."""
    np.random.seed(1)
    rf = RandomForestClassifier(n_estimators=500)
    rf.fit(self.train, self.labels_train)
    idx = np.random.randint(0, self.test.shape[0])
    explainer = LimeTabularExplainer(self.train,
                                     mode="classification",
                                     feature_names=self.feature_names,
                                     class_names=self.target_names,
                                     discretize_continuous=True)
    exp = explainer.explain_instance(self.test[idx],
                                     rf.predict_proba,
                                     num_features=2,
                                     model_regressor=LinearRegression())
    self.assertIsNotNone(exp)
    keys = [pair[0] for pair in exp.as_list()]
    checks = (('petal width', "Petal Width is a major feature"),
              ('petal length', "Petal Length is a major feature"))
    for needle, message in checks:
        self.assertEqual(1, sum(1 for key in keys if needle in key),
                         message)
def generate_neighborhood_data(self, sample, predict_fn,
                               distance_metric='euclidean', n_samples=500,
                               seed=1, **kwargs):
    '''Generate neighborhood data for a given point (currently using LIME)

    Args:
        train_data: Training data predict_fn was trained on
        sample: Observed sample
        predict_fn: Black box predictor to predict all points
        distance_metric: Distance metric used for weights
        n_samples: Number of samples to generate

    Returns:
        neighor_data (xs around sample), weights (weights of instances
        in xs), neighor_data_labels (ys around sample, corresponding to xs)
    '''
    from lime.lime_tabular import LimeTabularExplainer
    e = LimeTabularExplainer(
        self.train_data,
        categorical_features=self.categorical_features,
        discretize_continuous=False)
    # Reaches into LIME's name-mangled private sampler (__data_inverse);
    # fragile across lime versions — pins behavior to the installed release.
    _, neighbor_data = e._LimeTabularExplainer__data_inverse(
        sample, n_samples)
    # Standardize with the explainer's fitted scaler (zero mean, unit var).
    scaled_data = (neighbor_data - e.scaler.mean_) / e.scaler.scale_
    # NOTE(review): `seed` and **kwargs are accepted but never used here.
    return (*self._data(neighbor_data, scaled_data, distance_metric,
                        predict_fn), sample)
def lime():
    # Explain every fraud-net test prediction with LIME and pickle the
    # preprocessed attribution list.
    print('Loading dataset ...')
    X_train, Y_train, X_val, Y_val, X_test, Y_test = get_dataset(
        minibatch_size=32, sampling='None', numpy='True')
    net = load_fraudnet()
    lime_list = []
    explainer = LimeTabularExplainer(X_train, training_labels=Y_train)

    def func_call(x):
        # Adapt the torch network to LIME's predict_proba contract:
        # return probabilities for both classes, shape (n, 2).
        print(x.shape)
        input_ = torch.from_numpy(x).to(device=device).float()
        prob_1 = net(input_).view(-1, 1).cpu().data.numpy()
        prob_0 = 1 - prob_1
        prob = np.concatenate([prob_0, prob_1], axis=1)
        return prob

    for i in range(X_test.shape[0]):
        exp = explainer.explain_instance(X_test[i, :], func_call,
                                         labels=(0, 1), num_features=50)
        lime_list.append(exp)
    lime_list = np.array(lime_list)
    lime_list = preprocess(lime_list)
    # NOTE(review): the file handle is never closed; a with-block would
    # be safer.
    pickle.dump(lime_list, open('./saved_attributions/lime.pkl', 'wb'))
def __init__(
    self,
    predict_fn,
    data,
    sampler=None,
    feature_names=None,
    feature_types=None,
    explain_kwargs=None,
    n_jobs=1,
    **kwargs
):
    """Wrap a regression predict function with a LIME tabular explainer.

    Args:
        predict_fn: Black-box prediction function to explain.
        data: Background/training data for the explainer.
        sampler: Unsupported; a warning is emitted when provided.
        feature_names: Optional feature names.
        feature_types: Optional feature types.
        explain_kwargs: Extra kwargs stored for later explain calls
            (defaults to an empty dict).
        n_jobs: Parallelism for explanation jobs.
        **kwargs: Forwarded to LimeTabularExplainer.
    """
    self.data, _, self.feature_names, self.feature_types = unify_data(
        data, None, feature_names, feature_types
    )
    self.predict_fn = unify_predict_fn(predict_fn, self.data)
    self.n_jobs = n_jobs
    if sampler is not None:  # pragma: no cover
        warnings.warn("Sampler interface not currently supported.")
    self.sampler = sampler
    # BUGFIX: the previous `explain_kwargs={}` mutable default was shared
    # across instances; normalize None to a fresh dict per instance.
    self.explain_kwargs = {} if explain_kwargs is None else explain_kwargs
    self.kwargs = kwargs
    final_kwargs = {"mode": "regression"}
    if self.feature_names:
        final_kwargs["feature_names"] = self.feature_names
    final_kwargs.update(self.kwargs)
    self.lime = LimeTabularExplainer(self.data, **final_kwargs)
def fit(self, X, y, predict_fn, labels_num): self.cluster_labels = self.cluster_method.fit_predict(X) #print(X.shape[1]) for i in range(self.cluster_num): inds = np.where(self.cluster_labels == i) explainer = LimeTabularExplainer(X[inds], discretize_continuous=False, sample_around_instance=True) #print(np.squeeze(X[inds, :])) #print (self.cluster_method.cluster_centers_[i]) #time1=time.clock() simplified_models = explainer.explain_instance( self.cluster_method.cluster_centers_[i], predict_fn, num_samples=10000, labels=range(labels_num), num_features=X.shape[1], retrive_model=True) #print(type(simplified_models)) coef_ = np.zeros((X.shape[1], labels_num)) intercept_ = np.zeros((1, labels_num)) #time2=time.clock() #time3 = time2-time1 #print("explain_instance") #print(time3) for idx in range(labels_num): coef_[:, idx] = simplified_models[idx].coef_ intercept_[0, idx] = simplified_models[idx].intercept_ self.models.append((coef_, intercept_))
def fit(self, X, y=None):
    """Fit a LIME tabular explainer on X (y is ignored, sklearn-style)."""
    options = {
        'feature_names': self.feature_names,
        'class_names': self.class_names,
        'discretize_continuous': True,
    }
    self.explainer_ = LimeTabularExplainer(X, **options)
    return self
def explain_with_lime(X_test, model, model_name, encoder,
                      categorical_features_indices, categorical_encoding,
                      class_names, feature_names, test_instance=10):
    """Explain a prediction from the test set with a trained model."""
    columns = X_test.columns.tolist()

    # Wrap the model so LIME's raw arrays are re-framed and encoded before
    # predict_proba is called.
    def predict_fn(x):
        frame = pd.DataFrame(x, columns=columns)
        return model.predict_proba(encoder.transform(frame).astype(float))

    explainer = LimeTabularExplainer(
        X_test.to_numpy(),
        mode="classification",
        feature_names=feature_names,
        class_names=class_names,
        categorical_features=categorical_features_indices,
        categorical_names=categorical_encoding,
        kernel_width=3)  # might set seed?
    explanation = explainer.explain_instance(X_test.iloc[test_instance, :],
                                             predict_fn,
                                             num_features=5)
    # Show and save explanation
    # explanation.save_to_file(PATHS["03_data_outputs"] + "lime.html")
    explanation.as_pyplot_figure()
    plt.tight_layout()
    plt.savefig(PATHS["03_data_outputs"] + model_name + "_lime_plot.png")
    plt.close()
    # Inspect the fitted local linear model: coefficients, intercept, R^2.
    print("Coefficients of linear model: ", explanation.local_exp)
    print("\n")
    print("Intercept: ", explanation.intercept)
    print("\n")
    print("R-squared: ", explanation.score)
def get_local_interpretation(ID_client, dataframe, modelname,
                             features_importances, label):
    # Build a per-client LIME explanation against a 10% sample of the data.
    # `features_importances` and `label` are accepted but unused here.
    model = load_model(modelname)
    X = dataframe[dataframe['SK_ID_CURR'] == int(ID_client)]
    X = X.drop(['SK_ID_CURR', 'TARGET'], axis=1)
    dataframe = dataframe.drop(['SK_ID_CURR', 'TARGET'], axis=1)
    X_train = dataframe.sample(frac=0.1, random_state=42).values
    # NOTE(review): training_labels is given the *column names*, not the
    # target labels — looks like a bug; confirm against LIME's API.
    explainer = LimeTabularExplainer(
        training_data=X_train,
        mode='classification',
        feature_names=dataframe.columns,
        training_labels=dataframe.columns.tolist(),
        verbose=1,
        random_state=42)
    # Flatten the single client row to the 1-d vector LIME expects.
    explanation = explainer.explain_instance(
        np.ravel(np.array(X)),
        predict_fn=model.predict_proba,
        labels=[0, 1],
        num_features=len(dataframe.columns))
    return explanation
def lime_interpreter(dataset_features, x_train, x_test, classifier,
                     model_name, rng=True, instance=None):
    """Explain one test row with LIME and save the report to <model_name>.html.

    Args:
        dataset_features: number of input features (used for naming).
        x_train: training matrix the explainer samples around.
        x_test: test matrix to pick the explained row from.
        classifier: model exposing predict_proba (sparse output).
        model_name: base name for the saved HTML report.
        rng: when True a random test row is explained; otherwise `instance`.
        instance: explicit row index, used only when rng is False.
    """
    feature_names = ["f" + str(i) for i in range(dataset_features)]
    # BUGFIX: the explainer construction was commented out, leaving
    # `explainer` undefined and raising NameError on explain_instance.
    explainer = LimeTabularExplainer(x_train,
                                     feature_names=feature_names,
                                     discretize_continuous=True)

    def wrapped_fn(x_test):
        # Densify the sparse probability output and renormalize it.
        p = classifier.predict_proba(x_test).toarray()
        p_norm = norm_probabilities(p)
        return p_norm

    if rng:
        idx = np.random.randint(0, x_test.shape[0])
    else:
        idx = instance
    exp = explainer.explain_instance(x_test[idx], predict_fn=wrapped_fn)
    exp.save_to_file(model_name + '.html')
    print(
        "Iterpretation can be found as an HTML file in the currect directory, named :"
    )
    print(model_name)
    print("")
def run(self, load_data=True, tune_parameter=True):
    # End-to-end pipeline: (re)load data, choose hyper-parameters, train,
    # evaluate, then render a LIME plot for one test instance.
    if load_data:
        lines, values = self.data(0, self.num_samples)
        self.vectorize_text(lines, values)
    # If tune_parameter is false, we run with our experimented parameters
    if tune_parameter:
        self.tune_parameters()
    else:
        self.index = 0
        self.param = {
            "alpha": 0.1,
            "learning_rate": "invscaling",
            "penalty": "l2"
        }
    reg = self.train()
    print(reg.densify())
    y_pred = self.test(reg)
    y_test = np.load(self.Y_test, mmap_mode='r')
    print(y_pred.shape)
    self.print_stats(y_pred, y_test)
    # Show a Lime plot of the regression. The labelings will not be correct
    # since we are using a regression model.
    X_train = np.load(self.X_train[self.index], mmap_mode='r')
    X_test = np.load(self.X_test[self.index], mmap_mode='r')
    explainer = LimeTabularExplainer(X_train, mode="regression")
    exp = explainer.explain_instance(X_test[self.text_index], reg.predict)
    exp.as_pyplot_figure()
def get_lime_scores(predictive_model, x_train, x_test):
    """Return |LIME weight| per feature for each test row.

    Args:
        predictive_model: model exposing predict_proba.
        x_train: training matrix used to build the explainer.
        x_test: rows to explain.

    Returns:
        ndarray of shape (n_test, n_features) with absolute LIME weights,
        zeros where a feature received no weight.
    """
    lime_scores = []
    FEATS = len(x_train[0])
    feat_names = ["X" + str(i) for i in range(len(x_train[0]))]
    explainer = LimeTabularExplainer(x_train, feature_names=feat_names)
    for w in range(x_test.shape[0]):
        exp = explainer.explain_instance(x_test[w],
                                         predictive_model.predict_proba,
                                         num_features=FEATS)
        rank_list = exp.as_list()
        # Map each ranked feature description back to its feature index.
        # BUGFIX: the previous plain substring test made 'X1' also match
        # 'X10', 'X11', ... for >= 11 features; anchor the name with a
        # regex word boundary so only the exact feature number matches.
        curr_scores = [
            np.where(
                np.array([
                    pd.Series(rank_list[v][0]).str.contains(
                        'X' + str(k) + r'\b')[0] * 1
                    for k in range(FEATS)
                ]) == 1)[0][0] for v in range(len(rank_list))
        ]
        # Scatter the absolute weights into a dense per-feature row.
        lime_score_ = np.zeros((1, x_train.shape[1]))
        lime_score_[0, np.array(curr_scores)] = np.array(
            [np.abs(rank_list[v][1]) for v in range(len(rank_list))])
        lime_scores.append(lime_score_)
    lime_scores = np.array(lime_scores).reshape(-1, x_train.shape[1])
    return lime_scores
def run(self, load_data=True, tune_parameter=True):
    # Regression pipeline: the model predicts log1p(y); predictions are
    # mapped back to the original scale with expm1 before scoring.
    if load_data:
        lines, values = self.data(0, self.num_samples)
        self.vectorize_text(lines, values)
    # If tune_parameter is false, we run with our experimented parameters
    if tune_parameter:
        self.tune_parameters()
    else:
        self.index = 1
        self.param = {"alpha": 0.05,
                      "learning_rate": "invscaling",
                      "penalty": "l2"}
    reg = self.train()
    y_pred = self.test(reg)
    print(max(y_pred))
    # Using log(y) so convert back to seconds with exp(y_pred)
    y_pred = np.expm1(y_pred)
    y_test = np.load(self.Y_test, mmap_mode='r')
    self.print_stats(y_pred, y_test)
    # Render a LIME plot for one test instance of the regression model.
    X_train = np.load(self.X_train[self.index], mmap_mode='r')
    X_test = np.load(self.X_test[self.index], mmap_mode='r')
    explainer = LimeTabularExplainer(X_train, mode="regression")
    exp = explainer.explain_instance(X_test[self.text_index], reg.predict)
    exp.as_pyplot_figure()
def create_model_explainer(self):
    """Instantiate the LIME tabular explainer from the stored training data."""
    explainer_options = dict(
        feature_names=self.feature_names,
        training_labels=self.labels_train,
        class_names=self.class_names,
        categorical_features=self.categorical_feature_indices,
        categorical_names=self.categorical_names,
        discretize_continuous=True,
    )
    self.explainer = LimeTabularExplainer(self.train, **explainer_options)
def explain():
    """Explain the first row of X with the global random-forest model."""
    lime_explainer = LimeTabularExplainer(
        train,
        class_names=class_names,
        feature_names=feature_names,
        categorical_features=categorical_features)
    first_row = X.iloc[0]
    return lime_explainer.explain_instance(first_row,
                                           rf.predict_proba,
                                           num_features=4)
def _define_explainer(self):
    """Create the LIME tabular explainer used for per-instance explanations."""
    lime_config = {
        'feature_names': self.input_cols,
        'class_names': self.prediction_classes,
        'categorical_features': self.categorical_features,
        'categorical_names': self.cat_names,
        'discretize_continuous': True,
    }
    self.explainer = LimeTabularExplainer(training_data=self.train_set,
                                          **lime_config)
def __init__(self, model, feature_names, classes, training_data):
    """Store model metadata and build a classification LIME explainer."""
    self.model = model
    self.feature_names = feature_names
    self.classes = classes
    self.training_data = training_data
    lime_kwargs = dict(training_data=training_data,
                       mode='classification',
                       feature_names=self.feature_names,
                       class_names=self.classes)
    self.explainer = LimeTabularExplainer(**lime_kwargs)
def create_lime_explanation(explainer, new_observation, **kwargs):
    # utility function for predict_surrogate(type='lime')
    # Splits kwargs into explainer-constructor vs explain_instance argument
    # dicts, runs LIME, then attaches a custom plot method and a tidy
    # result DataFrame onto the returned explanation.
    from lime.lime_tabular import LimeTabularExplainer
    explainer_dict, explanation_dict = unpack_kwargs_lime(explainer,
                                                          new_observation,
                                                          **kwargs)
    lime_tabular_explainer = LimeTabularExplainer(**explainer_dict)
    explanation = lime_tabular_explainer.explain_instance(**explanation_dict)
    # Bind the custom plotting function as a bound method on this instance.
    explanation.plot = types.MethodType(plot_lime_custom, explanation)
    explanation.result = pd.DataFrame(explanation.as_list(),
                                      columns=['variable', 'effect'])
    return explanation
def interpret_data(X, y, func):
    """Time LIME explanations over up to the first 100 rows of X.

    Args:
        X: 2-d array of samples to explain.
        y: unused; kept for interface symmetry with sibling interpreters.
        func: prediction function handed to LIME.

    Returns:
        (times, scores): per-row wall-clock seconds and local-fit scores.
    """
    explainer = LimeTabularExplainer(X, discretize_continuous=False,
                                     kernel_width=3)
    times, scores = [], []
    # Robustness: don't index past the end when X has fewer than 100 rows.
    for r_idx in range(min(100, X.shape[0])):
        start_time = time.time()
        explanation = explainer.explain_instance(X[r_idx, :], func)
        times.append(time.time() - start_time)
        scores.append(explanation.score)
        print('...')  # progress heartbeat
    return times, scores
def __init__(self, random_forest_model, x_train, y_train):
    """Keep the model/data and set up the LIME and Skater interpreters."""
    self.rf_model = random_forest_model
    self.x_train = x_train
    self.y_train = y_train
    feature_cols = list(x_train.columns)
    self.columns = feature_cols
    self.explainer = LimeTabularExplainer(x_train.values,
                                          feature_names=feature_cols)
    self.model = InMemoryModel(self.rf_model.predict_proba,
                               examples=self.x_train)
    self.interpreter = Interpretation(training_data=self.x_train,
                                      feature_names=feature_cols,
                                      training_labels=self.y_train)
def explain_tabular(self, trainset, labels, instance, num_features=5,
                    kernel_width=3):
    """Explain categorical and numeric features for a prediction.

    It analyze the prediction by LIME, and returns a report of the most
    impactful tabular features contributing to certain labels.

    Args:
      trainset: a DataFrame representing the training features that LIME
          can use to decide value distributions.
      labels: a list of labels to explain.
      instance: the prediction instance. It needs to conform to model's
          input. Can be a csv line string, or a dict.
      num_features: maximum number of features to show.
      kernel_width: Passed to LIME LimeTabularExplainer directly.

    Returns:
      A LIME's lime.explanation.Explanation.
    """
    from lime.lime_tabular import LimeTabularExplainer

    # A csv line is parsed into a dict keyed by the model's headers.
    if isinstance(instance, six.string_types):
        instance = next(
            csv.DictReader([instance], fieldnames=self._headers))

    categories = self._get_unique_categories(trainset)
    np_trainset = self._preprocess_data_for_tabular_explain(
        trainset, categories)
    predict_fn = self._make_tabular_predict_fn(labels, instance,
                                               categories)
    prediction_df = pd.DataFrame([instance])
    # Encode the single instance with the same category mapping as trainset.
    prediction_instance = self._preprocess_data_for_tabular_explain(
        prediction_df, categories)

    explainer = LimeTabularExplainer(
        np_trainset,
        feature_names=(self._categorical_columns + self._numeric_columns),
        class_names=labels,
        # Categorical columns occupy the leading indices of the matrix.
        categorical_features=range(len(categories)),
        categorical_names={i: v for i, v in enumerate(categories)},
        kernel_width=kernel_width)

    exp = explainer.explain_instance(prediction_instance[0],
                                     predict_fn,
                                     num_features=num_features,
                                     labels=range(len(labels)))
    return exp
def _get_lime_coefficients(
    self, factuals: pd.DataFrame
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Actionable Recourse is only defined on linear models. To make it work for arbitrary non-linear networks
    we need to find the lime coefficients for every instance.

    Parameters
    ----------
    factuals : pd.DataFrame
        Instances we want to get lime coefficients

    Returns
    -------
    coeffs : np.ndArray
    intercepts : np.ndArray
    """
    coeffs = np.zeros(factuals.shape)
    intercepts = []
    lime_data = self._data.df[self._mlmodel.feature_input_order]
    lime_label = self._data.df[self._data.target]

    lime_exp = LimeTabularExplainer(
        training_data=lime_data.values,
        training_labels=lime_label,
        feature_names=self._mlmodel.feature_input_order,
        discretize_continuous=self._discretize_continuous,
        sample_around_instance=self._sample_around_instance,
        # NOTE(review): LIME documents categorical_names as a dict of
        # {column_index: value_names}; a list of column names is passed
        # here — confirm behavior with the installed lime version.
        categorical_names=[
            cat
            for cat in self._mlmodel.feature_input_order
            if cat not in self._data.continuous
        ]
        # self._data.encoded_normalized's categorical features contain feature name and value, separated by '_'
        # while self._data.categorical do not contain those additional values.
    )
    for index, row in factuals.iterrows():
        factual = row.values
        # Explain w.r.t. class 1; intercept/local_exp are keyed by label.
        explanations = lime_exp.explain_instance(
            factual,
            self._mlmodel.predict_proba,
            num_features=len(self._mlmodel.feature_input_order),
        )
        intercepts.append(explanations.intercept[1])
        # NOTE(review): `index` is the DataFrame label used as a positional
        # row index into coeffs — assumes factuals has a 0..n-1 RangeIndex.
        for tpl in explanations.local_exp[1]:
            coeffs[index][tpl[0]] = tpl[1]
    return coeffs, np.array(intercepts)
def test_lime_explainer_bad_regressor(self):
    """An unsupported model_regressor should raise TypeError."""
    iris = load_iris()
    split = sklearn.cross_validation.train_test_split(iris.data,
                                                      iris.target,
                                                      train_size=0.80)
    train, test, labels_train, labels_test = split
    rf = RandomForestClassifier(n_estimators=500)
    rf.fit(train, labels_train)
    bad_regressor = Lasso(alpha=1, fit_intercept=True)
    sample = np.random.randint(0, test.shape[0])
    with self.assertRaises(TypeError):
        explainer = LimeTabularExplainer(train,
                                         feature_names=iris.feature_names,
                                         class_names=iris.target_names,
                                         discretize_continuous=True)
        exp = explainer.explain_instance(test[sample],
                                         rf.predict_proba,
                                         num_features=2,
                                         top_labels=1,
                                         model_regressor=bad_regressor)
def test_lime_explainer_no_regressor(self):
    """The default local regressor should produce a non-None explanation."""
    np.random.seed(1)
    iris = load_iris()
    split = sklearn.cross_validation.train_test_split(iris.data,
                                                      iris.target,
                                                      train_size=0.80)
    train, test, labels_train, labels_test = split
    rf = RandomForestClassifier(n_estimators=500)
    rf.fit(train, labels_train)
    sample = np.random.randint(0, test.shape[0])
    explainer = LimeTabularExplainer(train,
                                     feature_names=iris.feature_names,
                                     class_names=iris.target_names,
                                     discretize_continuous=True)
    explanation = explainer.explain_instance(test[sample],
                                             rf.predict_proba,
                                             num_features=2)
    self.assertIsNotNone(explanation)
def test_lime_explainer_bad_regressor(self):
    """An unsupported model_regressor should raise TypeError."""
    rf = RandomForestClassifier(n_estimators=500)
    rf.fit(self.train, self.labels_train)
    bad_regressor = Lasso(alpha=1, fit_intercept=True)
    sample = np.random.randint(0, self.test.shape[0])
    with self.assertRaises(TypeError):
        explainer = LimeTabularExplainer(self.train,
                                         mode="classification",
                                         feature_names=self.feature_names,
                                         class_names=self.target_names,
                                         discretize_continuous=True)
        exp = explainer.explain_instance(self.test[sample],  # noqa:F841
                                         rf.predict_proba,
                                         num_features=2,
                                         top_labels=1,
                                         model_regressor=bad_regressor)
def explain_tabular(self, trainset, labels, instance, num_features=5,
                    kernel_width=3):
    """Explain categorical and numeric features for a prediction.

    It analyze the prediction by LIME, and returns a report of the most
    impactful tabular features contributing to certain labels.

    Args:
      trainset: a DataFrame representing the training features that LIME
          can use to decide value distributions.
      labels: a list of labels to explain.
      instance: the prediction instance. It needs to conform to model's
          input. Can be a csv line string, or a dict.
      num_features: maximum number of features to show.
      kernel_width: Passed to LIME LimeTabularExplainer directly.

    Returns:
      A LIME's lime.explanation.Explanation.
    """
    from lime.lime_tabular import LimeTabularExplainer

    # A csv line is parsed into a dict keyed by the model's headers.
    if isinstance(instance, six.string_types):
        instance = next(csv.DictReader([instance],
                                       fieldnames=self._headers))

    categories = self._get_unique_categories(trainset)
    np_trainset = self._preprocess_data_for_tabular_explain(trainset,
                                                            categories)
    predict_fn = self._make_tabular_predict_fn(labels, instance, categories)
    prediction_df = pd.DataFrame([instance])
    # Encode the single instance with the same category mapping as trainset.
    prediction_instance = self._preprocess_data_for_tabular_explain(
        prediction_df, categories)

    explainer = LimeTabularExplainer(
        np_trainset,
        feature_names=(self._categorical_columns + self._numeric_columns),
        class_names=labels,
        # Categorical columns occupy the leading indices of the matrix.
        categorical_features=range(len(categories)),
        categorical_names={i: v for i, v in enumerate(categories)},
        kernel_width=kernel_width)

    exp = explainer.explain_instance(
        prediction_instance[0],
        predict_fn,
        num_features=num_features,
        labels=range(len(labels)))
    return exp
def test_lime_explainer_good_regressor_synthetic_data(self):
    """The default explanation on synthetic data should list 10 features."""
    X, y = make_classification(n_samples=1000,
                               n_features=20,
                               n_informative=2,
                               n_redundant=2,
                               random_state=10)
    rf = RandomForestClassifier(n_estimators=500)
    rf.fit(X, y)
    row = np.random.randint(0, X.shape[0])
    names = ["feature" + str(i) for i in range(20)]
    explainer = LimeTabularExplainer(X,
                                     feature_names=names,
                                     discretize_continuous=True)
    explanation = explainer.explain_instance(X[row], rf.predict_proba)
    self.assertIsNotNone(explanation)
    self.assertEqual(10, len(explanation.as_list()))
# Minimal end-to-end LIME regression demo on the Boston housing data.
from lime.lime_tabular import LimeTabularExplainer
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2; this script requires an older scikit-learn release.
from sklearn.datasets import load_boston

boston = load_boston()
x_train, x_test, y_train, y_test = train_test_split(boston.data,
                                                    boston.target,
                                                    train_size=0.8)
rf = RandomForestRegressor(n_estimators=1000)
rf.fit(x_train, y_train)
# Treat any column with <= 10 distinct values as categorical.
categorical_features = np.argwhere(
    np.array([len(set(boston.data[:, x]))
              for x in range(boston.data.shape[1])]) <= 10).flatten()
explainer = LimeTabularExplainer(x_train,
                                 categorical_features=categorical_features,
                                 feature_names=boston.feature_names,
                                 class_names=['price'],
                                 verbose=True,
                                 mode='regression')
exp = explainer.explain_instance(x_test[0], rf.predict, num_features=5)
print(exp.as_list())
def test_lime_tabular_explainer_not_equal_random_state(self):
    """Explanations must differ whenever the discretizer or explainer seeds
    differ, and be identical only when both seeds match.

    The original implementation repeated the same 4-case seed matrix
    verbatim for each of the three discretizers; this version drives the
    identical sequence of explanations from a parameter table.
    """
    X, y = make_classification(n_samples=1000, n_features=20,
                               n_informative=2, n_redundant=2,
                               random_state=10)
    rf = RandomForestClassifier(n_estimators=500, random_state=10)
    rf.fit(X, y)
    instance = np.random.RandomState(10).randint(0, X.shape[0])
    feature_names = ["feature" + str(i) for i in range(20)]

    def explain(discretizer_cls, discretizer_seed, explainer_seed):
        # A fresh discretizer per explainer, exactly as the unrolled
        # version did (discretizers are seeded and stateful).
        discretizer = discretizer_cls(X, [], feature_names, y,
                                      random_state=discretizer_seed)
        explainer = LimeTabularExplainer(X,
                                         feature_names=feature_names,
                                         discretize_continuous=True,
                                         discretizer=discretizer,
                                         random_state=explainer_seed)
        return explainer.explain_instance(X[instance], rf.predict_proba,
                                          num_samples=500)

    # (discretizer seed 1, explainer seed 1,
    #  discretizer seed 2, explainer seed 2, expect maps to differ)
    cases = [
        (20, 10, 10, 10, True),   # [1] both seeds differ
        (20, 20, 10, 10, True),   # [2] both seeds differ
        (20, 20, 20, 10, True),   # [3] only explainer seed differs
        (20, 20, 20, 20, False),  # [4] identical seeds -> identical maps
    ]
    # Same discretizer order and case order as the unrolled original.
    for discretizer_cls in (QuartileDiscretizer,
                            DecileDiscretizer,
                            EntropyDiscretizer):
        for d1, e1, d2, e2, should_differ in cases:
            exp_1 = explain(discretizer_cls, d1, e1)
            exp_2 = explain(discretizer_cls, d2, e2)
            if should_differ:
                self.assertTrue(exp_1.as_map() != exp_2.as_map())
            else:
                self.assertFalse(exp_1.as_map() != exp_2.as_map())
def test_lime_explainer_with_data_stats(self):
    """An explainer fed precomputed training-data statistics (instead of
    real training data) should still rank the known important iris
    features highly.

    Fixes in this revision: the bin-conversion loop no longer shadows the
    builtin `bin` and uses a dict comprehension with enumerate instead of
    a manual counter; the stats dict is assembled as a single literal;
    the misspelled local `descritizer` is renamed.
    """
    np.random.seed(1)
    rf = RandomForestClassifier(n_estimators=500)
    rf.fit(self.train, self.labels_train)
    i = np.random.randint(0, self.test.shape[0])

    # Generate stats using a quartile discretizer.
    discretizer = QuartileDiscretizer(self.train, [], self.feature_names,
                                      self.target_names, random_state=20)
    d_means = discretizer.means
    d_stds = discretizer.stds
    d_mins = discretizer.mins
    d_maxs = discretizer.maxs
    d_bins = discretizer.bins(self.train, self.target_names)

    # Compute feature values and frequencies of all columns.
    cat_features = np.arange(self.train.shape[1])
    discretized_training_data = discretizer.discretize(self.train)
    feature_values = {}
    feature_frequencies = {}
    for feature in cat_features:
        column = discretized_training_data[:, feature]
        feature_count = collections.Counter(column)
        values, frequencies = map(list, zip(*(feature_count.items())))
        feature_values[feature] = values
        feature_frequencies[feature] = frequencies

    # Convert each bin array to a plain list, keyed by feature index.
    d_bins_revised = {idx: edges.tolist() for idx, edges in enumerate(d_bins)}

    # Discretized stats.
    data_stats = {
        "means": d_means,
        "stds": d_stds,
        "maxs": d_maxs,
        "mins": d_mins,
        "bins": d_bins_revised,
        "feature_values": feature_values,
        "feature_frequencies": feature_frequencies,
    }

    # Dummy training data: the explainer must rely on data_stats instead.
    data = np.zeros((2, len(self.feature_names)))
    explainer = LimeTabularExplainer(
        data, feature_names=self.feature_names, random_state=10,
        training_data_stats=data_stats, training_labels=self.target_names)
    exp = explainer.explain_instance(
        self.test[i], rf.predict_proba, num_features=2,
        model_regressor=LinearRegression())

    self.assertIsNotNone(exp)
    keys = [x[0] for x in exp.as_list()]
    self.assertEqual(1,
                     sum([1 if 'petal width' in x else 0 for x in keys]),
                     "Petal Width is a major feature")
    self.assertEqual(1,
                     sum([1 if 'petal length' in x else 0 for x in keys]),
                     "Petal Length is a major feature")