    def compute(self, return_scores=False):
        """
        Computes the DataFrame that presents the importance of each feature.

        Args:
            return_scores (bool, optional):
                Flag indicating whether the method should return the train and test scores of the model, together with
                the model interpretation report. If True, the output of this method is a tuple of DataFrame, float,
                float.

        Returns:
            (pd.DataFrame or tuple(pd.DataFrame, float, float)):
                Dataframe with SHAP feature importance, or tuple containing the dataframe, train and test scores of the
                model.
        """
        self._check_if_fitted()

        # Compute SHAP importance
        self.importance_df_train = calculate_shap_importance(
            self.shap_values_train, self.column_names, output_columns_suffix="_train"
        )

        self.importance_df_test = calculate_shap_importance(
            self.shap_values_test, self.column_names, output_columns_suffix="_test"
        )

        # Concatenate the train and test, sort by test set importance and reorder the columns
        self.importance_df = pd.concat(
            [self.importance_df_train, self.importance_df_test], axis=1
        ).sort_values("mean_abs_shap_value_test", ascending=False)[
            [
                "mean_abs_shap_value_test",
                "mean_abs_shap_value_train",
                "mean_shap_value_test",
                "mean_shap_value_train",
            ]
        ]

        if return_scores:
            return self.importance_df, self.train_score, self.test_score
        else:
            return self.importance_df
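A minimal usage sketch of the compute method above. The import path, the class name ShapModelInterpreter, its constructor arguments and the fit signature are assumptions not shown in this snippet; only the compute calls mirror the method above.

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from probatus.interpret import ShapModelInterpreter  # assumed import path and class name

# Toy binary-classification data.
X, y = make_classification(n_samples=500, n_features=10, random_state=42)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
y = pd.Series(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Assumed constructor and fit signature.
interpreter = ShapModelInterpreter(RandomForestClassifier(random_state=42))
interpreter.fit(X_train, X_test, y_train, y_test)

# compute() alone returns the SHAP importance DataFrame ...
importance_df = interpreter.compute()

# ... while return_scores=True also returns the train and test scores.
importance_df, train_score, test_score = interpreter.compute(return_scores=True)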
    def fit(self, X, y, column_names=None):
        """
        Fits the object with the provided data. The algorithm starts with the entire dataset, and then sequentially
             eliminates features. If a [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
             or [RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
             object is assigned as clf, the hyperparameter optimization is applied first. Then, the SHAP feature importance
             is calculated using Cross-Validation, and the `step` lowest-importance features are removed.

        Args:
            X (pd.DataFrame):
                Provided dataset.

            y (pd.Series):
                Binary labels for X.

            column_names (list of str, optional):
                List of feature names of the provided samples. If provided, it will be used to overwrite the existing
                feature names. If not provided, the existing feature names are used or default feature names are
                generated.
        Returns:
            (ShapRFECV): Fitted object.
        """
        # Set seed for results reproducibility
        if self.random_state is not None:
            np.random.seed(self.random_state)

        self.X, self.column_names = preprocess_data(X,
                                                    X_name='X',
                                                    column_names=column_names,
                                                    verbose=self.verbose)
        self.y = preprocess_labels(y,
                                   y_name='y',
                                   index=self.X.index,
                                   verbose=self.verbose)
        self.cv = check_cv(self.cv, self.y, classifier=is_classifier(self.clf))

        remaining_features = current_features_set = self.column_names
        round_number = 0

        while len(current_features_set) > self.min_features_to_select:
            round_number += 1

            # Get current dataset info
            current_features_set = remaining_features
            current_X = self.X[current_features_set]

            # Set seed for results reproducibility
            if self.random_state is not None:
                np.random.seed(self.random_state)

            # Optimize parameters
            if self.search_clf:
                current_search_clf = clone(self.clf).fit(current_X, self.y)
                current_clf = current_search_clf.estimator.set_params(
                    **current_search_clf.best_params_)
            else:
                current_clf = clone(self.clf)

            # Perform CV to estimate feature importance with SHAP
            results_per_fold = Parallel(n_jobs=self.n_jobs)(
                delayed(self._get_feature_shap_values_per_fold)(
                    X=current_X,
                    y=self.y,
                    clf=current_clf,
                    train_index=train_index,
                    val_index=val_index,
                    scorer=self.scorer.scorer,
                    verbose=self.verbose)
                for train_index, val_index in self.cv.split(current_X, self.y))

            shap_values = np.vstack(
                [current_result[0] for current_result in results_per_fold])
            scores_train = [
                current_result[1] for current_result in results_per_fold
            ]
            scores_val = [
                current_result[2] for current_result in results_per_fold
            ]

            shap_importance_df = calculate_shap_importance(
                shap_values, remaining_features)

            # Get features to remove
            features_to_remove = self._get_current_features_to_remove(
                shap_importance_df)
            remaining_features = list(
                set(current_features_set) - set(features_to_remove))

            # Report results
            self._report_current_results(
                round_number=round_number,
                current_features_set=current_features_set,
                features_to_remove=features_to_remove,
                train_metric_mean=np.round(np.mean(scores_train), 3),
                train_metric_std=np.round(np.std(scores_train), 3),
                val_metric_mean=np.round(np.mean(scores_val), 3),
                val_metric_std=np.round(np.std(scores_val), 3))
            if self.verbose > 50:
                print(
                    f'Round: {round_number}, Current number of features: {len(current_features_set)}, '
                    f'Current performance: Train {self.report_df.loc[round_number]["train_metric_mean"]} '
                    f'+/- {self.report_df.loc[round_number]["train_metric_std"]}, CV Validation '
                    f'{self.report_df.loc[round_number]["val_metric_mean"]} '
                    f'+/- {self.report_df.loc[round_number]["val_metric_std"]}. \n'
                    f'Num of features left: {len(remaining_features)}. '
                    f'Removed features at the end of the round: {features_to_remove}'
                )
        self.fitted = True
        return self
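An illustrative call of the fit method above. The import path and the constructor arguments are assumptions: clf, min_features_to_select, cv and random_state appear as attributes in the code, but the __init__ signature itself is not shown, and scoring is a hypothetical parameter name.

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

from probatus.feature_elimination import ShapRFECV  # assumed import path

X, y = make_classification(n_samples=500, n_features=15, random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
y = pd.Series(y)

# Assumed constructor; the exact __init__ signature is not part of this snippet.
shap_rfecv = ShapRFECV(
    clf=RandomForestClassifier(random_state=0),
    step=2,
    min_features_to_select=5,
    cv=5,
    scoring="roc_auc",  # hypothetical parameter name
    random_state=0,
)
shap_rfecv.fit(X, y)

# Per-round train/validation metrics are collected in report_df, which the
# verbose branch of the method above reads from.
print(shap_rfecv.report_df)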
    def fit(self,
            X,
            y,
            sample_weight=None,
            columns_to_keep=None,
            column_names=None,
            **shap_kwargs):
        """
        Fits the object with the provided data.

        The algorithm starts with the entire dataset, and then sequentially
             eliminates features. If an sklearn-compatible search CV is passed as clf, e.g.
             [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html),
             [RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
             or [BayesSearchCV](https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html),
             the hyperparameter optimization is applied at each step of the elimination.
             Then, the SHAP feature importance is calculated using Cross-Validation,
             and the `step` lowest-importance features are removed.

        Args:
            X (pd.DataFrame):
                Provided dataset.

            y (pd.Series):
                Binary labels for X.

            sample_weight (pd.Series, np.ndarray, list, optional):
                array-like of shape (n_samples,) - only use if the model you're using supports
                sample weighting (check the corresponding scikit-learn documentation).
                Array of weights that are assigned to individual samples.
                Note that they are only used for fitting the model, not during the evaluation of metrics.
                If not provided, each sample is given unit weight.

            columns_to_keep (list of str, optional):
                List of column names to keep. If given,
                these columns will not be eliminated by the feature elimination process.
                However, these features will still be used in the calculation of the SHAP values.

            column_names (list of str, optional):
                List of feature names of the provided samples. If provided, it will be used to overwrite the existing
                feature names. If not provided, the existing feature names are used or default feature names are
                generated.

            **shap_kwargs:
                keyword arguments passed to
                [shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
                It also enables `approximate` and `check_additivity` parameters, passed while calculating SHAP values.
                The `approximate=True` causes less accurate, but faster SHAP values calculation, while
                `check_additivity=False` disables the additivity check inside SHAP.

        Returns:
            (ShapRFECV): Fitted object.
        """
        # Set seed for results reproducibility
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # If columns_to_keep is not provided, treat it as an empty list of columns.
        # If provided, check that all the elements in columns_to_keep are of type string.
        if columns_to_keep is None:
            len_columns_to_keep = 0
        else:
            if all(isinstance(x, str) for x in columns_to_keep):
                len_columns_to_keep = len(columns_to_keep)
            else:
                raise ValueError(
                    "The current values of columns_to_keep are not allowed. All the elements should be strings."
                )

        # If column_names is provided, check that it covers all the column names in X.
        if column_names is not None:
            if not all(x in column_names for x in list(X.columns)):
                raise ValueError(
                    "The column names in the parameters columns_to_keep and column_names are not matching."
                )

        # Check that the total number of columns to select is less than the total number of columns in the data,
        # only when both parameters are provided.
        if column_names is not None and columns_to_keep is not None:
            if (self.min_features_to_select + len_columns_to_keep) > len(
                    self.column_names):
                raise ValueError(
                    "Minimum features to select is greater than the number of features. "
                    "Lower the value of min_features_to_select or the number of columns in columns_to_keep."
                )

        self.X, self.column_names = preprocess_data(X,
                                                    X_name="X",
                                                    column_names=column_names,
                                                    verbose=self.verbose)
        self.y = preprocess_labels(y,
                                   y_name="y",
                                   index=self.X.index,
                                   verbose=self.verbose)
        if sample_weight is not None:
            if self.verbose > 0:
                warnings.warn(
                    "sample_weight is passed only to the fit method of the model, not the evaluation metrics."
                )
            sample_weight = assure_pandas_series(sample_weight,
                                                 index=self.X.index)
        self.cv = check_cv(self.cv, self.y, classifier=is_classifier(self.clf))

        remaining_features = current_features_set = self.column_names
        round_number = 0

        # Stop when the stopping criterion is met.
        stopping_criteria = np.max(
            [self.min_features_to_select, len_columns_to_keep])

        # Setting up the min_features_to_select parameter.
        if columns_to_keep is not None:
            self.min_features_to_select = 0
            # This ensures that, if columns_to_keep is provided,
            # the last features remaining are only the columns_to_keep.
            if self.verbose > 50:
                warnings.warn(
                    f"Minimum features to select: {stopping_criteria}")

        while len(current_features_set) > stopping_criteria:
            round_number += 1

            # Get current dataset info
            current_features_set = remaining_features
            if columns_to_keep is None:
                remaining_removeable_features = list(set(current_features_set))
            else:
                remaining_removeable_features = list(
                    set(current_features_set) | set(columns_to_keep))
            current_X = self.X[remaining_removeable_features]

            # Set seed for results reproducibility
            if self.random_state is not None:
                np.random.seed(self.random_state)

            # Optimize parameters
            if self.search_clf:
                current_search_clf = clone(self.clf).fit(current_X, self.y)
                current_clf = current_search_clf.estimator.set_params(
                    **current_search_clf.best_params_)
            else:
                current_clf = clone(self.clf)

            # Perform CV to estimate feature importance with SHAP
            results_per_fold = Parallel(n_jobs=self.n_jobs)(
                delayed(self._get_feature_shap_values_per_fold)(
                    X=current_X,
                    y=self.y,
                    clf=current_clf,
                    train_index=train_index,
                    val_index=val_index,
                    sample_weight=sample_weight,
                    **shap_kwargs,
                )
                for train_index, val_index in self.cv.split(current_X, self.y))

            shap_values = np.vstack(
                [current_result[0] for current_result in results_per_fold])
            scores_train = [
                current_result[1] for current_result in results_per_fold
            ]
            scores_val = [
                current_result[2] for current_result in results_per_fold
            ]

            # Calculate the SHAP importance using the remaining features and the features to keep.
            shap_importance_df = calculate_shap_importance(
                shap_values, remaining_removeable_features)

            # Get features to remove
            features_to_remove = self._get_current_features_to_remove(
                shap_importance_df, columns_to_keep=columns_to_keep)
            remaining_features = list(
                set(current_features_set) - set(features_to_remove))

            # Report results
            self._report_current_results(
                round_number=round_number,
                current_features_set=current_features_set,
                features_to_remove=features_to_remove,
                train_metric_mean=np.round(np.mean(scores_train), 3),
                train_metric_std=np.round(np.std(scores_train), 3),
                val_metric_mean=np.round(np.mean(scores_val), 3),
                val_metric_std=np.round(np.std(scores_val), 3),
            )
            if self.verbose > 50:
                print(
                    f"Round: {round_number}, Current number of features: {len(current_features_set)}, "
                    f'Current performance: Train {self.report_df.loc[round_number]["train_metric_mean"]} '
                    f'+/- {self.report_df.loc[round_number]["train_metric_std"]}, CV Validation '
                    f'{self.report_df.loc[round_number]["val_metric_mean"]} '
                    f'+/- {self.report_df.loc[round_number]["val_metric_std"]}. \n'
                    f"Features left: {remaining_features}. "
                    f"Removed features at the end of the round: {features_to_remove}"
                )
        self.fitted = True
        return self
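A sketch of how this fit variant might be called with sample_weight, columns_to_keep and SHAP keyword arguments, all of which are documented in the docstring above; the import path and the constructor arguments are again assumptions, not taken from this snippet.

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

from probatus.feature_elimination import ShapRFECV  # assumed import path

X, y = make_classification(n_samples=500, n_features=15, random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
y = pd.Series(y)

# Uniform weights here; in practice these could encode e.g. class imbalance.
weights = np.ones(len(y))

shap_rfecv = ShapRFECV(clf=RandomForestClassifier(random_state=0), step=2)  # assumed constructor

shap_rfecv.fit(
    X,
    y,
    sample_weight=weights,         # passed only to the model's fit, not to the evaluation metrics
    columns_to_keep=["f0", "f1"],  # never eliminated, but still included in the SHAP calculation
    approximate=True,              # faster, less accurate SHAP values (forwarded via **shap_kwargs)
    check_additivity=False,        # disables SHAP's additivity check
)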
    def fit(self, X, y, columns_to_keep=None, column_names=None):
        """
        Fits the object with the provided data. The algorithm starts with the entire dataset, and then sequentially
             eliminates features. If a [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
             or [RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
             object is assigned as clf, the hyperparameter optimization is applied first. Then, the SHAP feature importance
             is calculated using Cross-Validation, and the `step` lowest-importance features are removed.

        Args:
            X (pd.DataFrame):
                Provided dataset.

            y (pd.Series):
                Binary labels for X.

            columns_to_keep (list of str, optional):
                List of column names to keep. If given, these columns will not be eliminated by the feature elimination process.
                However, these features will still be used in the calculation of the SHAP values.

            column_names (list of str, optional):
                List of feature names of the provided samples. If provided, it will be used to overwrite the existing
                feature names. If not provided, the existing feature names are used or default feature names are
                generated.
        Returns:
            (ShapRFECV): Fitted object.
        """
        # Set seed for results reproducibility
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # If columns_to_keep is not provided, treat it as an empty list of columns.
        # If provided, check that all the elements in columns_to_keep are of type string.
        if columns_to_keep is None:
            len_columns_to_keep = 0
        else:
            if all(isinstance(x, str) for x in columns_to_keep):
                len_columns_to_keep = len(columns_to_keep)
            else:
                raise ValueError(
                    'The current values of columns_to_keep are not allowed. All the elements should be strings.'
                )

        # If column_names is provided, check that it covers all the column names in X.
        if column_names is not None:
            if not all(x in column_names for x in list(X.columns)):
                raise ValueError(
                    'The column names in the parameters columns_to_keep and column_names are not matching.'
                )

        # Check that the total number of columns to select is less than the total number of columns in the data,
        # only when both parameters are provided.
        if column_names is not None and columns_to_keep is not None:
            if (self.min_features_to_select + len_columns_to_keep) > len(
                    self.column_names):
                raise ValueError(
                    'Minimum features to select is greater than the number of features. '
                    'Lower the value of min_features_to_select or the number of columns in columns_to_keep.'
                )

        self.X, self.column_names = preprocess_data(X,
                                                    X_name='X',
                                                    column_names=column_names,
                                                    verbose=self.verbose)
        self.y = preprocess_labels(y,
                                   y_name='y',
                                   index=self.X.index,
                                   verbose=self.verbose)
        self.cv = check_cv(self.cv, self.y, classifier=is_classifier(self.clf))

        remaining_features = current_features_set = self.column_names
        round_number = 0

        # Stop when the stopping criterion is met.
        stopping_criteria = np.max(
            [self.min_features_to_select, len_columns_to_keep])

        # Setting up the min_features_to_select parameter.
        if columns_to_keep is not None:
            self.min_features_to_select = 0
            # This ensures that, if columns_to_keep is provided,
            # the last features remaining are only the columns_to_keep.
            if self.verbose > 50:
                warnings.warn(
                    f'Minimum features to select: {stopping_criteria}')

        while len(current_features_set) > stopping_criteria:
            round_number += 1

            # Get current dataset info
            current_features_set = remaining_features
            if columns_to_keep is None:
                remaining_removeable_features = list(set(current_features_set))
            else:
                remaining_removeable_features = list(
                    set(current_features_set) | set(columns_to_keep))
            current_X = self.X[remaining_removeable_features]

            # Set seed for results reproducibility
            if self.random_state is not None:
                np.random.seed(self.random_state)

            # Optimize parameters
            if self.search_clf:
                current_search_clf = clone(self.clf).fit(current_X, self.y)
                current_clf = current_search_clf.estimator.set_params(
                    **current_search_clf.best_params_)
            else:
                current_clf = clone(self.clf)

            # Perform CV to estimate feature importance with SHAP
            results_per_fold = Parallel(n_jobs=self.n_jobs)(
                delayed(self._get_feature_shap_values_per_fold)(
                    X=current_X,
                    y=self.y,
                    clf=current_clf,
                    train_index=train_index,
                    val_index=val_index,
                    scorer=self.scorer.scorer,
                    verbose=self.verbose)
                for train_index, val_index in self.cv.split(current_X, self.y))

            shap_values = np.vstack(
                [current_result[0] for current_result in results_per_fold])
            scores_train = [
                current_result[1] for current_result in results_per_fold
            ]
            scores_val = [
                current_result[2] for current_result in results_per_fold
            ]

            # Calculate the SHAP importance using the remaining features and the features to keep.
            shap_importance_df = calculate_shap_importance(
                shap_values, remaining_removeable_features)

            # Get features to remove
            features_to_remove = self._get_current_features_to_remove(
                shap_importance_df, columns_to_keep=columns_to_keep)
            remaining_features = list(
                set(current_features_set) - set(features_to_remove))

            # Report results
            self._report_current_results(
                round_number=round_number,
                current_features_set=current_features_set,
                features_to_remove=features_to_remove,
                train_metric_mean=np.round(np.mean(scores_train), 3),
                train_metric_std=np.round(np.std(scores_train), 3),
                val_metric_mean=np.round(np.mean(scores_val), 3),
                val_metric_std=np.round(np.std(scores_val), 3))
            if self.verbose > 50:
                print(
                    f'Round: {round_number}, Current number of features: {len(current_features_set)}, '
                    f'Current performance: Train {self.report_df.loc[round_number]["train_metric_mean"]} '
                    f'+/- {self.report_df.loc[round_number]["train_metric_std"]}, CV Validation '
                    f'{self.report_df.loc[round_number]["val_metric_mean"]} '
                    f'+/- {self.report_df.loc[round_number]["val_metric_std"]}. \n'
                    f'Features left: {remaining_features}. '
                    f'Removed features at the end of the round: {features_to_remove}'
                )
        self.fitted = True
        return self
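Since the docstring above allows a GridSearchCV or RandomizedSearchCV object to be assigned as clf, here is an illustrative sketch of that usage; the import path and the constructor arguments are assumptions, and only the fit call reflects the method above.

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

from probatus.feature_elimination import ShapRFECV  # assumed import path

X, y = make_classification(n_samples=500, n_features=15, random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
y = pd.Series(y)

# A hyperparameter search object passed as clf: per the code above, the search is
# refit on the current feature set each round and its best_params_ are applied to
# the underlying estimator before the SHAP cross-validation.
search = RandomizedSearchCV(
    RandomForestClassifier(random_state=0),
    param_distributions={"n_estimators": [50, 100, 200], "max_depth": [3, 5, None]},
    n_iter=5,
    random_state=0,
)

shap_rfecv = ShapRFECV(clf=search, step=2, min_features_to_select=5)  # assumed constructor
shap_rfecv.fit(X, y, columns_to_keep=["f0"])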