示例#1
0
    def summarize(self):
        """
        Build and return the summary of local explainability.

        The output can be tuned beforehand with the modify_mask method. When
        modify_mask has not been used, the mask_params given when the
        SmartPredictor was initialised drive the filtering.

        For classification, the summary gathers:
            - the predicted values, either given by the user or computed
              automatically (with the add_input method)
            - the probabilities from predict_proba matching those predictions
            - the contributions, ranked and filtered as configured through
              modify_mask

        Side effects: ``self.summary`` is overwritten, ``self.filter()`` is
        called, and the result is stored under ``self.data['summary']``.

        Returns
        -------
        pandas.DataFrame
            ``self.data["ypred"]`` and the computed summary, concatenated
            column-wise (selected explanation of each row).

        Raises
        ------
        ValueError
            If the instance has no ``data`` attribute, i.e. add_input() has
            not been called yet.

        Examples
        --------
        >>> summary_df = predictor.summarize()
        >>> summary_df
           pred  proba     feature_1  value_1  contribution_1  feature_2  value_2  contribution_2
        0  0     0.756416  Sex        1.0      0.322308        Pclass     3.0      0.155069
        1  3     0.628911  Sex        2.0      0.585475        Pclass     1.0      0.370504
        2  0     0.543308  Sex        2.0      -0.486667       Pclass     3.0      0.255072

        >>> predictor.modify_mask(max_contrib=1)
        >>> summary_df = predictor.summarize()
        >>> summary_df
           pred  proba     feature_1  value_1  contribution_1
        0  0     0.756416  Sex        1.0      0.322308
        1  3     0.628911  Sex        2.0      0.585475
        2  0     0.543308  Sex        2.0      -0.486667
        """
        # Nothing can be summarized before add_input() has provided the data.
        if not hasattr(self, "data"):
            raise ValueError("You have to specify dataset x and y_pred arguments. Please use add_input() method.")

        # Rank the contributions against the postprocessed inputs, then keep
        # the assigned result on the instance for the filtering step.
        ranked = rank_contributions(
            self.data["contributions"],
            self.data["x_postprocessed"],
        )
        self.summary = assign_contributions(ranked)

        # Apply the filter method with the mask_params attributes.
        self.filter()

        # NOTE: the bare name ``summarize`` below is the module-level helper,
        # not this method (method names are not in scope inside the body).
        ranked_summary = self.summary
        self.data['summary'] = summarize(
            ranked_summary['contrib_sorted'],
            ranked_summary['var_dict'],
            ranked_summary['x_sorted'],
            self.mask,
            self.columns_dict,
            self.features_dict,
        )

        # Put the predictions side by side with their explanation.
        return pd.concat([self.data["ypred"], self.data['summary']], axis=1)
    def test_rank_contributions_1(self):
        """
        Unit test rank contributions 1

        Feeds rank_contributions a small hand-written contributions frame and
        its matching features frame, then checks values, column names and
        index of the three returned frames against hard-coded expectations
        (contributions ordered by decreasing absolute value on each row).
        """
        row_labels = ['raw_1', 'raw_2']
        n_cols = 4

        contributions = pd.DataFrame(
            [[3.4, 1, -9, 4],
             [-45, 3, 43, -9]],
            columns=[f"Phi_{i}" for i in range(n_cols)],
            index=row_labels,
        )
        features = pd.DataFrame(
            [['Male', 'House', 'Married', 'PhD'],
             ['Female', 'Flat', 'Married', 'Master']],
            columns=[f"X{i}" for i in range(n_cols)],
            index=row_labels,
        )

        # Contributions reordered by decreasing absolute value, row by row.
        expected_s_ord = pd.DataFrame(
            data=[[-9, 4, 3.4, 1],
                  [-45, 43, -9, 3]],
            columns=[f"contribution_{i}" for i in range(n_cols)],
            index=row_labels,
        )
        # Feature values reordered the same way as the contributions.
        expected_x_ord = pd.DataFrame(
            data=[['Married', 'PhD', 'Male', 'House'],
                  ['Female', 'Married', 'Master', 'Flat']],
            columns=[f"feature_{i}" for i in range(n_cols)],
            index=row_labels,
        )
        # Original column positions, in ranked order.
        expected_s_dict = pd.DataFrame(
            data=[[2, 3, 0, 1],
                  [0, 2, 3, 1]],
            columns=[f"feature_{i}" for i in range(n_cols)],
            index=row_labels,
        )

        s_ord, x_ord, s_dict = rank_contributions(contributions, features)

        outputs = (s_ord, x_ord, s_dict)
        expectations = (expected_s_ord, expected_x_ord, expected_s_dict)

        # Cell values first ...
        for actual, expected in zip(outputs, expectations):
            assert np.array_equal(actual.values, expected.values)

        # ... then column labels ...
        for actual, expected in zip(outputs, expectations):
            assert list(actual.columns) == list(expected.columns)

        # ... and finally the row index.
        for actual, expected in zip(outputs, expectations):
            assert pd.Index.equals(actual.index, expected.index)
示例#3
0
    def test_rank_contributions_1(self):
        """
        Unit test rank contributions 1

        Fits a small random forest on the fixture data, turns each entry of
        the shap values into a DataFrame, and checks for the first three of
        them that rank_contributions returns:
            - contributions whose absolute values never increase along a row
            - feature values equal to x_test reordered by the returned
              positions
        """
        forest = RandomForestClassifier(n_estimators=3)
        forest.fit(self.x_train, self.y_train)
        shap_values = shap.TreeExplainer(forest).shap_values(self.x_test)

        # One contributions frame per entry yielded by the shap values.
        contribution_frames = []
        for tab in shap_values:
            contribution_frames.append(
                pd.DataFrame(data=tab,
                             index=self.x_test.index,
                             columns=self.x_test.columns)
            )

        for position in range(3):
            s_ord, x_ord, s_dict = rank_contributions(
                contribution_frames[position], pd.DataFrame(data=self.x_test))

            # Successive differences of |contribution| must all be <= 0.
            abs_steps = np.diff(np.abs(s_ord), axis=1)
            assert np.all(abs_steps <= 0)

            # Reordering x_test with the returned positions must give x_ord.
            reordered_x = np.take_along_axis(
                self.x_test.values, s_dict.values, axis=1)
            assert np.array_equal(x_ord.values, reordered_x)
示例#4
0
    def rank_contributions(self, contributions, x_pred):
        """
        Rank the contributions of each row and build the matching reference
        to the prediction set.

        This method only forwards its arguments to the function of the same
        name that is expected to be in scope at module level.

        Parameters
        ----------
        contributions : pandas.DataFrame
            Local contributions to sort.
        x_pred : pandas.DataFrame
            Prediction set.

        Returns
        -------
        pandas.DataFrame
            Local contributions sorted by decreasing absolute values.
        pandas.DataFrame
            Input features sorted by decreasing contributions absolute values.
        pandas.DataFrame
            For each observation, the reference to the input features in
            order of decreasing contributions absolute values (presumably the
            original column positions; confirm against the helper).
        """
        # Inside a method body the bare name is looked up in the module
        # namespace, so this is a delegation and not a recursive call.
        ranked = rank_contributions(contributions, x_pred)
        return ranked
示例#5
0
    def summarize(self, use_groups=None):
        """
        Build and return the summary of local explainability.

        The output can be tuned beforehand with the modify_mask method. When
        modify_mask has not been used, the mask_params given when the
        SmartPredictor was initialised drive the filtering.

        For classification, the summary gathers:
            - the predicted values, either given by the user or computed
              automatically (with the add_input method)
            - the probabilities from predict_proba matching those predictions
            - the contributions, ranked and filtered as configured through
              modify_mask

        Side effects: ``self.summary`` is overwritten, ``self.filter()`` is
        called, and the result is stored under the ``'summary'`` key of the
        data dictionary in use (``self.data_groups`` or ``self.data``).

        Parameters
        ----------
        use_groups : bool (optional)
            Whether or not to compute groups of features contributions.
            Groups are used unless this is ``False`` or
            ``self.features_groups`` is ``None``.

        Returns
        -------
        pandas.DataFrame
            The ``"ypred"`` entry and the computed summary, concatenated
            column-wise (selected explanation of each row).

        Raises
        ------
        ValueError
            If the instance has no ``data`` attribute, i.e. add_input() has
            not been called yet.

        Examples
        --------
        >>> summary_df = predictor.summarize()
        >>> summary_df
           pred  proba     feature_1  value_1  contribution_1  feature_2  value_2  contribution_2
        0  0     0.756416  Sex        1.0      0.322308        Pclass     3.0      0.155069
        1  3     0.628911  Sex        2.0      0.585475        Pclass     1.0      0.370504
        2  0     0.543308  Sex        2.0      -0.486667       Pclass     3.0      0.255072

        >>> predictor.modify_mask(max_contrib=1)
        >>> summary_df = predictor.summarize()
        >>> summary_df
           pred  proba     feature_1  value_1  contribution_1
        0  0     0.756416  Sex        1.0      0.322308
        1  3     0.628911  Sex        2.0      0.585475
        2  0     0.543308  Sex        2.0      -0.486667
        """
        # Groups apply only when not explicitly disabled and when some exist.
        use_groups = use_groups is not False and self.features_groups is not None

        # Nothing can be summarized before add_input() has provided the data.
        if not hasattr(self, "data"):
            raise ValueError(
                "You have to specify dataset x and y_pred arguments. Please use add_input() method."
            )

        data = self.data_groups if use_groups else self.data

        if self._drop_option is None:
            x_selected = data["x_postprocessed"]
        else:
            # Keep only the columns listed in the drop option that are really
            # present, plus the group names when groups are in use.
            candidates = self._drop_option["columns_dict_op"].values()
            kept_columns = [
                name for name in candidates
                if name in data["x_postprocessed"].columns
            ]
            if use_groups:
                kept_columns += list(self.features_groups.keys())
            x_selected = data["x_postprocessed"][kept_columns]

        # Position -> column name, and the labels restricted to kept columns.
        columns_dict = dict(enumerate(x_selected.columns))
        features_dict = {
            name: label
            for name, label in self.features_dict.items()
            if name in x_selected.columns
        }

        ranked = rank_contributions(data["contributions"], x_selected)
        self.summary = assign_contributions(ranked)

        # Apply the filter method with the mask_params attributes.
        self.filter()

        # NOTE: the bare name ``summarize`` below is the module-level helper,
        # not this method (method names are not in scope inside the body).
        ranked_summary = self.summary
        data['summary'] = summarize(
            ranked_summary['contrib_sorted'],
            ranked_summary['var_dict'],
            ranked_summary['x_sorted'],
            self.mask,
            columns_dict,
            features_dict,
        )

        # Put the predictions side by side with their explanation.
        return pd.concat([data["ypred"], data['summary']], axis=1)