def summarize(self):
    """
    Build the summary of local explainability for the current input data.

    The output can be tuned beforehand with the modify_mask method. If
    modify_mask has not been used, the mask_params parameters specified
    during the initialisation of the SmartPredictor are applied.

    In the classification case, the summary contains:

    - the predicted values specified by the user or automatically computed
      (with the add_input method)
    - the probabilities from predict_proba associated with those predicted
      values
    - the contributions, ranked and filtered as specified with the
      modify_mask method

    As a side effect, this method sets ``self.summary``, calls
    ``self.filter()`` and stores the result under ``self.data['summary']``.

    Returns
    -------
    pandas.DataFrame
        ``self.data['ypred']`` and the summarized explanation of each row,
        concatenated column-wise.

    Raises
    ------
    ValueError
        If the instance has no ``data`` attribute, i.e. add_input() has not
        been called yet.

    Examples
    --------
    >>> summary_df = predictor.summarize()
    >>> summary_df
       pred     proba feature_1  value_1  contribution_1 feature_2  value_2  contribution_2
    0     0  0.756416       Sex      1.0        0.322308    Pclass      3.0        0.155069
    1     3  0.628911       Sex      2.0        0.585475    Pclass      1.0        0.370504
    2     0  0.543308       Sex      2.0       -0.486667    Pclass      3.0        0.255072

    >>> predictor.modify_mask(max_contrib=1)
    >>> summary_df = predictor.summarize()
    >>> summary_df
       pred     proba feature_1  value_1  contribution_1
    0     0  0.756416       Sex      1.0        0.322308
    1     3  0.628911       Sex      2.0        0.585475
    2     0  0.543308       Sex      2.0       -0.486667
    """
    # Data is needed: add_input() must have been called at least once.
    if not hasattr(self, "data"):
        # One logical message, written as two adjacent literals so that it
        # stays a valid string while fitting on the line.
        raise ValueError(
            "You have to specify dataset x and y_pred arguments. "
            "Please use add_input() method."
        )

    ranked = rank_contributions(
        self.data["contributions"],
        self.data["x_postprocessed"],
    )
    self.summary = assign_contributions(ranked)

    # Apply filter method with mask_params attributes parameters
    self.filter()

    # Summarize information.
    # NOTE: when this is defined as a method inside a class, the bare name
    # `summarize` below resolves to the function in module scope, not to
    # this method.
    self.data['summary'] = summarize(
        self.summary['contrib_sorted'],
        self.summary['var_dict'],
        self.summary['x_sorted'],
        self.mask,
        self.columns_dict,
        self.features_dict,
    )

    # Matching with y_pred
    return pd.concat([self.data["ypred"], self.data['summary']], axis=1)
def test_rank_contributions_1(self):
    """
    Unit test rank contributions 1

    Feeds rank_contributions a hand-built 2x4 contributions frame and the
    matching 2x4 feature-values frame, then checks the three returned
    frames (values, column names and index) against expected results:
    contributions ordered by decreasing absolute value, feature values
    reordered the same way, and the positions of the original columns.
    """
    row_labels = ['raw_1', 'raw_2']

    def labels(prefix):
        # Four numbered column names sharing the given prefix.
        return [prefix + str(i) for i in range(4)]

    dataframe_s = pd.DataFrame(
        [[3.4, 1, -9, 4],
         [-45, 3, 43, -9]],
        columns=labels("Phi_"),
        index=row_labels,
    )
    dataframe_x = pd.DataFrame(
        [['Male', 'House', 'Married', 'PhD'],
         ['Female', 'Flat', 'Married', 'Master']],
        columns=labels("X"),
        index=row_labels,
    )

    expected_s_ord = pd.DataFrame(
        data=[[-9, 4, 3.4, 1],
              [-45, 43, -9, 3]],
        columns=labels('contribution_'),
        index=row_labels,
    )
    expected_x_ord = pd.DataFrame(
        data=[['Married', 'PhD', 'Male', 'House'],
              ['Female', 'Married', 'Master', 'Flat']],
        columns=labels('feature_'),
        index=row_labels,
    )
    expected_s_dict = pd.DataFrame(
        data=[[2, 3, 0, 1],
              [0, 2, 3, 1]],
        columns=labels('feature_'),
        index=row_labels,
    )

    s_ord, x_ord, s_dict = rank_contributions(dataframe_s, dataframe_x)

    checks = [
        (s_ord, expected_s_ord),
        (x_ord, expected_x_ord),
        (s_dict, expected_s_dict),
    ]
    # Cell values first, then column names, then row index.
    for actual, expected in checks:
        assert np.array_equal(actual.values, expected.values)
    for actual, expected in checks:
        assert list(actual.columns) == list(expected.columns)
    for actual, expected in checks:
        assert pd.Index.equals(actual.index, expected.index)
def test_rank_contributions_1(self):
    """
    Unit test rank contributions 1

    Fits a 3-tree random forest on self.x_train / self.y_train, computes
    SHAP values for self.x_test with a TreeExplainer and wraps each SHAP
    array in a DataFrame aligned on self.x_test. For the first three of
    those frames (presumably one per class - TODO confirm), it checks that
    rank_contributions returns contributions whose absolute values never
    increase along a row, and feature values consistent with the returned
    positions.
    """
    forest = RandomForestClassifier(n_estimators=3)
    forest.fit(self.x_train, self.y_train)
    tree_explainer = shap.TreeExplainer(forest)
    shap_values = tree_explainer.shap_values(self.x_test)

    slist = []
    for tab in shap_values:
        slist.append(
            pd.DataFrame(
                data=tab,
                index=self.x_test.index,
                columns=self.x_test.columns,
            )
        )

    for frame_pos in range(3):
        contributions = slist[frame_pos]
        s_ord, x_ord, s_dict = rank_contributions(
            contributions, pd.DataFrame(data=self.x_test))

        # Absolute contributions must be non-increasing from left to right.
        abs_steps = np.diff(np.abs(s_ord), axis=1)
        assert np.all(abs_steps <= 0)

        # Sorted feature values must be x_test reordered by s_dict positions.
        reordered = np.take_along_axis(
            self.x_test.values, s_dict.values, axis=1)
        assert np.array_equal(x_ord.values, reordered)
def rank_contributions(self, contributions, x_pred):
    """
    Sort contributions row by row and keep a reference back to the
    prediction set.

    This method only forwards its two arguments to the
    ``rank_contributions`` function found in the enclosing module scope
    and returns whatever that function returns; no instance state is used.

    Parameters
    ----------
    contributions : pandas.DataFrame
        Local contributions to sort.
    x_pred : pandas.DataFrame
        Prediction set.

    Returns
    -------
    pandas.DataFrame
        Local contributions sorted by decreasing absolute values.
    pandas.DataFrame
        Input features sorted by decreasing contributions absolute values.
    pandas.DataFrame
        Input features names sorted for each observation
        by decreasing contributions absolute values.
    """
    ranked = rank_contributions(contributions, x_pred)
    return ranked
def summarize(self, use_groups=None):
    """
    Build the summary of local explainability for the current input data.

    The output can be tuned beforehand with the modify_mask method. If
    modify_mask has not been used, the mask_params parameters specified
    during the initialisation of the SmartPredictor are applied.

    In the classification case, the summary contains:

    - the predicted values specified by the user or automatically computed
      (with the add_input method)
    - the probabilities from predict_proba associated with those predicted
      values
    - the contributions, ranked and filtered as specified with the
      modify_mask method

    As a side effect, this method sets ``self.summary``, calls
    ``self.filter()`` and stores the result under the ``'summary'`` key of
    the data dictionary it worked on.

    Parameters
    ----------
    use_groups : bool (optional)
        Whether or not to compute groups of features contributions.
        Groups are used unless this is False, and only when
        ``self.features_groups`` is not None.

    Returns
    -------
    pandas.DataFrame
        The ``'ypred'`` entry of the data and the summarized explanation
        of each row, concatenated column-wise.

    Raises
    ------
    ValueError
        If the instance has no ``data`` attribute, i.e. add_input() has not
        been called yet.

    Examples
    --------
    >>> summary_df = predictor.summarize()
    >>> summary_df
       pred     proba feature_1  value_1  contribution_1 feature_2  value_2  contribution_2
    0     0  0.756416       Sex      1.0        0.322308    Pclass      3.0        0.155069
    1     3  0.628911       Sex      2.0        0.585475    Pclass      1.0        0.370504
    2     0  0.543308       Sex      2.0       -0.486667    Pclass      3.0        0.255072

    >>> predictor.modify_mask(max_contrib=1)
    >>> summary_df = predictor.summarize()
    >>> summary_df
       pred     proba feature_1  value_1  contribution_1
    0     0  0.756416       Sex      1.0        0.322308
    1     3  0.628911       Sex      2.0        0.585475
    2     0  0.543308       Sex      2.0       -0.486667
    """
    # Resolve the grouping switch first: an explicit False always disables
    # groups, anything else enables them only if groups are defined.
    use_groups = use_groups is not False and self.features_groups is not None

    # Data is needed: add_input() must have been called at least once.
    if not hasattr(self, "data"):
        raise ValueError(
            "You have to specify dataset x and y_pred arguments. Please use add_input() method."
        )

    data = self.data_groups if use_groups else self.data

    if self._drop_option is None:
        x_selected = data["x_postprocessed"]
    else:
        # Keep only the columns named in the drop option that are actually
        # present, plus the group names when working on grouped data.
        available = data["x_postprocessed"].columns
        columns_to_keep = []
        for name in self._drop_option["columns_dict_op"].values():
            if name in available:
                columns_to_keep.append(name)
        if use_groups:
            columns_to_keep.extend(self.features_groups.keys())
        x_selected = data["x_postprocessed"][columns_to_keep]

    # Position -> column name, and the labels restricted to kept columns.
    columns_dict = dict(enumerate(x_selected.columns))
    features_dict = {
        name: label
        for name, label in self.features_dict.items()
        if name in x_selected.columns
    }

    ranked = rank_contributions(data["contributions"], x_selected)
    self.summary = assign_contributions(ranked)

    # Apply filter method with mask_params attributes parameters
    self.filter()

    # Summarize information
    data['summary'] = summarize(
        self.summary['contrib_sorted'],
        self.summary['var_dict'],
        self.summary['x_sorted'],
        self.mask,
        columns_dict,
        features_dict,
    )

    # Matching with y_pred
    return pd.concat([data["ypred"], data['summary']], axis=1)