Example #1
0
    def random_boruta(self):
        """Evaluate BorutaShap feature selection for each stored RF config.

        Loads hyper-parameter sets from ``param_RF_<epoch>.json`` in
        ``self.result_folder`` and, for each set, runs a 5-fold stratified
        CV.  In every fold the data are median-imputed with statistics
        learned on the *training* fold only, BorutaShap selects a feature
        subset, and ROC-AUC is printed for the test fold and for the
        held-out validation set, both with all features and with the
        selected subset.  Results are printed; nothing is returned.
        """
        with open(self.result_folder +
                  '/param_RF_{}.json'.format(self.epoch)) as f:
            dati = json.load(f)

        for data in dati:
            # 'value' is the stored objective score, not an RF parameter.
            del data['value']

            brfmodel = BalancedRandomForestClassifier(**data)
            cv = StratifiedKFold(n_splits=5, shuffle=True)

            for train_index, test_index in cv.split(self.X, self.y):
                # Plain positional indexing; the previous
                # ``iloc[lambda x: train_index]`` only worked incidentally.
                X_train = self.X.iloc[train_index]
                X_test = self.X.iloc[test_index]
                y_train = np.take(self.y, train_index)
                y_test = np.take(self.y, test_index)

                # Fit the imputer on the training fold ONLY and reuse it
                # for the test fold and the validation set.  The previous
                # code re-fit the same imputer object on the test fold and
                # on self.X_test, which both leaked test statistics and
                # silently changed the medians used for the validation set.
                imputer = SimpleImputer(missing_values=np.nan,
                                        strategy='median').fit(X_train)
                X_train = pd.DataFrame(imputer.transform(X_train),
                                       columns=X_train.columns,
                                       index=X_train.index)
                X_test = pd.DataFrame(imputer.transform(X_test),
                                      columns=X_test.columns,
                                      index=X_test.index)

                Feature_Selector = BorutaShap(model=brfmodel,
                                              importance_measure='shap',
                                              percentile=85,
                                              pvalue=0.08,
                                              classification=True)
                Feature_Selector.fit(X_train,
                                     y_train,
                                     n_trials=200,
                                     random_state=0)
                # Resolve features still flagged 'tentative' after n_trials.
                Feature_Selector.TentativeRoughFix()

                Feature_Selector.plot(X_size=12,
                                      figsize=(12, 8),
                                      y_scale='log',
                                      which_features='all')

                Xstrain = Feature_Selector.Subset()
                selected = list(Xstrain.columns)
                print('features selected', selected)

                # Impute the validation set with the train-fold imputer.
                valx = pd.DataFrame(imputer.transform(self.X_validation),
                                    columns=self.X_validation.columns,
                                    index=self.X_validation.index)

                print('AUC')
                brfmodel.fit(X_train, y_train)
                roc = roc_auc_score(y_test,
                                    brfmodel.predict_proba(X_test)[:, 1])
                print(roc)

                print('AUC Validation')
                roc_test = roc_auc_score(
                    self.y_validation,
                    brfmodel.predict_proba(valx)[:, 1])
                print(roc_test)

                # Same evaluation restricted to the BorutaShap subset.
                print('AUC ridotte')
                brfmodel.fit(Xstrain, y_train)
                roc = roc_auc_score(
                    y_test,
                    brfmodel.predict_proba(X_test[selected])[:, 1])
                print(roc)

                roc_test = roc_auc_score(
                    self.y_validation,
                    brfmodel.predict_proba(valx[selected])[:, 1])
                print(roc_test)
Example #2
0
# + [markdown] heading_collapsed=true hidden=true
# ### BorutaShap

# + [markdown] hidden=true
# This initialization takes a maximum of 5 parameters, including a tree-based model of your choice — for example a **"Decision Tree", "XGBoost", or "CatBoost"; the default is a "Random Forest"**. You can choose which importance metric to evaluate feature importance with: either **Shapley values (default) or Gini importance**. There is also a flag to specify whether the problem is classification or regression, a percentile parameter that takes a percentage of the max shadow feature (thus making the selector less strict), and finally a p-value or significance level at which a feature will be either rejected or accepted.

# + hidden=true
# Feature selection with an XGBoost base model: BorutaShap compares the
# SHAP importance of each real feature against randomly shuffled "shadow"
# features over 25 trials, keeping only features that beat the shadows.
model_xgb = XGBClassifier(objective='binary:logistic')
Feature_Selector_xgb = BorutaShap(model=model_xgb,
                                  importance_measure='shap',
                                  classification=True)
Feature_Selector_xgb.fit(X=X_one_hot, y=y, n_trials=25, random_state=0)

# + hidden=true
# Returns a subset of the original data with only the selected features.
X_subset_xgb = Feature_Selector_xgb.Subset()
print(X_subset_xgb.shape)
X_subset_xgb.head()

# + hidden=true
# Same selection procedure with a CatBoost base model (default parameters),
# to compare which features survive under a different tree ensemble.
model_cat = CatBoostClassifier()
Feature_Selector_cat = BorutaShap(model=model_cat,
                                  importance_measure='shap',
                                  classification=True)
Feature_Selector_cat.fit(X=X_one_hot, y=y, n_trials=25, random_state=0)

# + hidden=true
# Returns a subset of the original data with only the selected features.
X_subset_cat = Feature_Selector_cat.Subset()
print(X_subset_cat.shape)
X_subset_cat.head()