Example #2
    def boruta_shap(self, X, y):
        """
        Wrapper around the BorutaShap package, which combines the Boruta
        feature selection algorithm [1] with feature importances assessed
        using SHAP values [2]. Tree-based models are used so that
        TreeExplainer can compute SHAP values in time that scales linearly
        with the number of observations [3].

        https://github.com/Ekeany/Boruta-Shap

        [1] Kursa and Rudnicki, Feature Selection with the Boruta Package.
            Journal of Statistical Software, 2010.
        [2] Lundberg and Lee, A Unified Approach to Interpreting Model
            Predictions. arXiv:1705.07874, 2017.
        [3] Lundberg et al., Consistent Individualized Feature Attribution
            for Tree Ensembles. arXiv:1802.03888, 2019.
        """
        # Requires: import pandas as pd; from BorutaShap import BorutaShap
        # Split self.fit_params into BorutaShap() constructor arguments...
        init_params_dic = {
            'model': self.fit_params.get('model', None),
            'importance_measure': self.fit_params.get('importance_measure',
                                                      'shap'),
            'classification': self.fit_params.get('classification',
                                                  self.classification),
            'percentile': self.fit_params.get('percentile', 100),
            'pvalue': self.fit_params.get('pvalue', 0.05),
        }
        # ...and pass everything else through to BorutaShap.fit().
        fit_params = self.fit_params.copy()
        for init_param in init_params_dic:
            fit_params.pop(init_param, None)
        feature_selector = BorutaShap(**init_params_dic)
        X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
        fit_params['random_state'] = fit_params.get('random_state',
                                                    self.random_state)
        fit_params['n_trials'] = fit_params.get('n_trials', self.n_bsamples)
        fit_params['verbose'] = fit_params.get('verbose', False)
        feature_selector.fit(X, y, **fit_params)
        if feature_selector.tentative:
            # Some features may still be undecided after the fit;
            # TentativeRoughFix() compares their median importance with the
            # median of the max shadow feature to make a final decision.
            feature_selector.TentativeRoughFix()
        self.accepted_features_index = [
            int(col) for col in feature_selector.accepted
        ]
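
A hedged usage sketch (not from the source project): the host class below is hypothetical, with its attribute names (fit_params, classification, random_state, n_bsamples) inferred from what boruta_shap() reads.

from sklearn.datasets import make_classification

class ShapFeatureSelector:
    """Hypothetical host class; boruta_shap() above would be defined here."""

    def __init__(self, fit_params=None, classification=True,
                 random_state=0, n_bsamples=20):
        self.fit_params = fit_params if fit_params is not None else {}
        self.classification = classification
        self.random_state = random_state
        self.n_bsamples = n_bsamples  # default number of Boruta trials

# Illustrative call, assuming boruta_shap() is attached to the class:
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
selector = ShapFeatureSelector(fit_params={'n_trials': 10})
selector.boruta_shap(X, y)
print(selector.accepted_features_index)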
Example #3
from BorutaShap import BorutaShap, load_data


def Test_Models(data_type, models):

    X, y = load_data(data_type=data_type)

    for key, value in models.items():

        print('Testing: ' + str(key))
        # If no model is given, BorutaShap defaults to a Random Forest;
        # classification=False would treat the problem as regression.
        Feature_Selector = BorutaShap(model=value,
                                      importance_measure='shap',
                                      classification=True)

        Feature_Selector.fit(X=X,
                             y=y,
                             n_trials=5,
                             random_state=0,
                             train_or_test='train')

        # plot() draws a boxplot of the feature importances; set display=True
        # to show the plots interactively (False here for automation).
        Feature_Selector.plot(X_size=12,
                              figsize=(12, 8),
                              y_scale='log',
                              which_features='all',
                              display=False)
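
A hedged driver sketch: load_data is the helper bundled with the BorutaShap package (imported above), and the model choices here are illustrative.

if __name__ == '__main__':
    from sklearn.ensemble import RandomForestClassifier
    from xgboost import XGBClassifier

    # Any tree-based estimators work; these two are only examples.
    models = {
        'random_forest': RandomForestClassifier(random_state=0),
        'xgboost': XGBClassifier(),
    }
    Test_Models(data_type='classification', models=models)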
Example #4
import pandas as pd

from sklearn.ensemble import RandomForestClassifier as RF


class PipeBorutaSHAP:
    """
    BorutaSHAP feature selector for pipelines.

    Create a BorutaSHAP instance that is compatible with
    scikit-learn's estimator API and can be used in sklearn and
    imblearn's pipelines.

    This is essentially a wrapper for
    [BorutaSHAP](https://github.com/Ekeany/Boruta-Shap); see the
    documentation there for additional details. BorutaSHAP requires its
    input as a pandas DataFrame, so an internal conversion is performed.
    Also, you must provide the names of the original columns (in order) at
    instantiation.

    BorutaSHAP works with tree-based models, which do not require scaling or
    other preprocessing, so this stage can be placed in the pipeline either
    before or after standard scaling (see the example below).

    Notes
    -----
    BorutaSHAP is expensive; the default parameters here are set to be
    gentle, but it can still dramatically increase the cost of nested CV or
    grid searching.

    Leave `column_names` as None in pipelines whose feature-engineering steps
    can change the number of columns; PipeBorutaSHAP will then label the
    columns with integers internally for consistency.

    Example
    -------
    >>> X, y = pd.read_csv(...), pd.read_csv(...)
    >>> pipeline = imblearn.pipeline.Pipeline(steps=[
    ...     ("smote", ScaledSMOTEENN(k_enn=5, kind_sel_enn='mode')),
    ...     ("scaler", StandardScaler()),
    ...     ("boruta", PipeBorutaSHAP(column_names=X.columns)),
    ...     ('tree', DecisionTreeClassifier(random_state=0))
    ...     ])
    >>> param_grid = [
    ...     {'smote__k_enn':[3, 5],
    ...     'smote__kind_sel_enn':['all', 'mode'],
    ...     'tree__max_depth':[3,5],
    ...     'boruta__pvalue':[0.05, 0.1]
    ...     }]
    >>> gs = GridSearchCV(estimator=pipeline,
    ...     param_grid=param_grid,
    ...     n_jobs=-1,
    ...     cv=StratifiedKFold(n_splits=2, random_state=1, shuffle=True)
    ...     )
    >>> gs.fit(X.values, y.values)
    >>> # OR, ...
    >>> NestedCV().grid_search(pipeline, param_grid, X.values, y.values)
    """
    def __init__(
        self,
        column_names=None,
        model=RF(
            n_estimators=100,
            criterion="entropy",
            random_state=0,
            class_weight="balanced",
        ),
        classification=True,
        percentile=100,
        pvalue=0.05,
    ):
        """Instantiate the class."""
        self.set_params(
            **{
                "column_names": column_names,
                "model": model,
                "classification": classification,
                "percentile": percentile,
                "pvalue": pvalue,
            })
        return

    def set_params(self, **parameters):
        """Set parameters; for consistency with sklearn's estimator API."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def get_params(self, deep=True):
        """Get parameters; for consistency with sklearn's estimator API."""
        return {
            "column_names": self.column_names,
            "model": self.model,
            "classification": self.classification,
            "percentile": self.percentile,
            "pvalue": self.pvalue,
        }

    def fit(self, X, y):
        """Fit BorutaSHAP to data."""
        # Convert X and y to pandas.DataFrame and series
        from BorutaShap import BorutaShap

        self.__boruta_ = BorutaShap(
            model=self.model,
            importance_measure="shap",
            classification=self.classification,
            percentile=self.percentile,
            pvalue=self.pvalue,
        )

        if self.column_names is None:
            self.column_names = [str(i) for i in range(X.shape[1])]
        else:
            assert X.shape[1] == len(self.column_names), \
                "X is not compatible with column names provided."

        # BorutaSHAP is expensive so try to keep these to reasonable values.
        # If used in kfold CV the cost goes up very quickly.
        self.__boruta_.fit(
            X=pd.DataFrame(data=X, columns=self.column_names),
            y=pd.Series(data=y),
            n_trials=20,
            sample=False,
            train_or_test="test",  # Does internal 70:30 train/test
            normalize=True,
            verbose=False,
            random_state=0,
        )

        return self

    def transform(self, X):
        """Select the columns that were deemed important."""
        # Could reorder X relative to original input?
        return pd.DataFrame(data=X,
                            columns=self.column_names)[self.accepted].values

    @property
    def accepted(self):
        """Get the columns that are important."""
        return self.__boruta_.accepted

    @property
    def rejected(self):
        """Get the columns that are not important."""
        return self.__boruta_.rejected
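
A minimal standalone sketch (outside a pipeline); the breast cancer dataset is illustrative only and not part of the class above.

if __name__ == '__main__':
    from sklearn.datasets import load_breast_cancer

    # Fit the selector directly and keep only the accepted columns.
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    selector = PipeBorutaSHAP(column_names=list(X.columns))
    X_selected = selector.fit(X.values, y.values).transform(X.values)
    print('accepted:', selector.accepted)
    print('rejected:', selector.rejected)
    print(X_selected.shape)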
Example #5
    # Requires: json, numpy as np, pandas as pd, BorutaShap, and from
    # sklearn/imblearn: StratifiedKFold, SimpleImputer, roc_auc_score,
    # BalancedRandomForestClassifier.
    def random_boruta(self):

        with open(self.result_folder +
                  '/param_RF_{}.json'.format(self.epoch)) as f:

            dati = json.load(f)
            for data in dati:
                del data['value']

                brfmodel = BalancedRandomForestClassifier(**data)

                cv = StratifiedKFold(n_splits=5, shuffle=True)

                for train_index, test_index in cv.split(self.X, self.y):

                    X_train = self.X.iloc[train_index]
                    X_test = self.X.iloc[test_index]
                    y_train = np.take(self.y, train_index)
                    y_test = np.take(self.y, test_index)

                    # Learn the median imputation on the training fold only,
                    # then apply it to the test fold to avoid leakage.
                    median_imputer = SimpleImputer(missing_values=np.nan,
                                                   strategy='median')
                    imputer = median_imputer.fit(X_train)
                    vX_train = imputer.transform(X_train)
                    vX_test = imputer.transform(X_test)

                    X_train = pd.DataFrame(vX_train,
                                           columns=X_train.columns,
                                           index=X_train.index)
                    X_test = pd.DataFrame(vX_test,
                                          columns=X_test.columns,
                                          index=X_test.index)
                    Feature_Selector = BorutaShap(model=brfmodel,
                                                  importance_measure='shap',
                                                  percentile=85,
                                                  pvalue=0.08,
                                                  classification=True)

                    Feature_Selector.fit(X_train,
                                         y_train,
                                         n_trials=200,
                                         random_state=0)
                    Feature_Selector.TentativeRoughFix()

                    Feature_Selector.plot(X_size=12,
                                          figsize=(12, 8),
                                          y_scale='log',
                                          which_features='all')

                    Xstrain = Feature_Selector.Subset()
                    selected = list(Xstrain.columns)
                    print('features selected', selected)

                    # Apply the train-fold imputer to the held-out test set
                    # as well (fitting on it would leak its statistics).
                    v_test_X = imputer.transform(self.X_test)
                    test_X = pd.DataFrame(v_test_X,
                                          columns=self.X_test.columns,
                                          index=self.X_test.index)

                    valx = self.X_validation
                    valy = self.y_validation
                    vvalX = imputer.transform(valx)
                    valx = pd.DataFrame(vvalX,
                                        columns=valx.columns,
                                        index=valx.index)

                    print('AUC')
                    brfmodel.fit(X_train, y_train)
                    roc = roc_auc_score(y_test,
                                        brfmodel.predict_proba(X_test)[:, 1])
                    print(roc)

                    print('AUC Validation')
                    roc_test = roc_auc_score(
                        self.y_validation,
                        brfmodel.predict_proba(valx)[:, 1])

                    print(roc_test)

                    print('AUC (reduced features)')
                    brfmodel.fit(Xstrain, y_train)
                    roc = roc_auc_score(
                        y_test,
                        brfmodel.predict_proba(X_test[selected])[:, 1])

                    print(roc)
                    roc_test = roc_auc_score(
                        self.y_validation,
                        brfmodel.predict_proba(valx[selected])[:, 1])

                    print(roc_test)
Example #6
def test_class_constructs():
    BorutaShap()
Example #7
X_filtered.head()

# + hidden=true
X_new = X_one_hot[['age', 'avg_glucose_level', 'heart_disease_1']]
X_new.head()

# + [markdown] heading_collapsed=true hidden=true
# ### BorutaShap

# + [markdown] hidden=true
# The initialization takes up to five parameters: a tree-based model of your choice, e.g. a **"Decision Tree", "XGBoost", or "CatBoost" (the default is a "Random Forest")**; the importance metric used to evaluate the features, either **Shapley values (default) or Gini importance**; a flag specifying whether the problem is classification or regression; a percentile parameter that takes a percentage of the max shadow feature, making the selector less strict; and finally a p-value (significance level) at which a feature is either rejected or accepted.

# + hidden=true
model_xgb = XGBClassifier(objective='binary:logistic')
Feature_Selector_xgb = BorutaShap(model=model_xgb,
                                  importance_measure='shap',
                                  classification=True)
Feature_Selector_xgb.fit(X=X_one_hot, y=y, n_trials=25, random_state=0)

# + hidden=true
# Returns a subset of the original data containing only the selected features
X_subset_xgb = Feature_Selector_xgb.Subset()
print(X_subset_xgb.shape)
X_subset_xgb.head()

# + hidden=true
model_cat = CatBoostClassifier()
Feature_Selector_cat = BorutaShap(model=model_cat,
                                  importance_measure='shap',
                                  classification=True)
Feature_Selector_cat.fit(X=X_one_hot, y=y, n_trials=25, random_state=0)
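
# + [markdown] hidden=true
# As with the XGBoost selector above, `Subset()` returns the data restricted to the accepted features; this closing cell is an illustrative sketch mirroring the earlier XGBoost cell.

# + hidden=true
X_subset_cat = Feature_Selector_cat.Subset()
print(X_subset_cat.shape)
X_subset_cat.head()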