def random_boruta(self):
    """Run BorutaShap feature selection on top of a tuned BalancedRandomForest.

    Loads the hyper-parameter sets saved by the RF optimisation step
    (``param_RF_<epoch>.json`` in ``self.result_folder``) and, for each set,
    runs a 5-fold stratified CV: median-imputes each fold, selects features
    with BorutaShap (SHAP importances), then reports ROC-AUC on the fold's
    test split and on the external validation set, both with the full
    feature set and with the selected subset.

    Side effects: prints the selected features and AUC scores, and shows
    the BorutaShap importance plot for every fold.

    NOTE(review): assumes self.X / self.X_validation are DataFrames and
    self.y / self.y_validation are array-likes -- confirm against callers.
    """
    with open(self.result_folder + '/param_RF_{}.json'.format(self.epoch)) as f:
        dati = json.load(f)

    for params in dati:
        # 'value' is the stored optimisation score, not an estimator kwarg.
        del params['value']
        brfmodel = BalancedRandomForestClassifier(**params)

        cv = StratifiedKFold(n_splits=5, shuffle=True)
        for train_index, test_index in cv.split(self.X, self.y):
            # Positional indexing: pass the index arrays to iloc directly
            # (the previous lambda-based callable indexing was needless).
            X_train = self.X.iloc[train_index]
            X_test = self.X.iloc[test_index]
            y_train = np.take(self.y, train_index)
            y_test = np.take(self.y, test_index)

            # Fit the imputer on the training fold ONLY and reuse it for
            # every other split: re-fitting it on test/validation data
            # (as the previous version did) leaks their statistics.
            # np.nan replaces the np.NaN alias removed in NumPy 2.0.
            imputer = SimpleImputer(missing_values=np.nan,
                                    strategy='median').fit(X_train)
            X_train = pd.DataFrame(imputer.transform(X_train),
                                   columns=X_train.columns,
                                   index=X_train.index)
            X_test = pd.DataFrame(imputer.transform(X_test),
                                  columns=X_test.columns,
                                  index=X_test.index)

            Feature_Selector = BorutaShap(model=brfmodel,
                                          importance_measure='shap',
                                          percentile=85, pvalue=0.08,
                                          classification=True)
            Feature_Selector.fit(X_train, y_train, n_trials=200,
                                 random_state=0)
            # Resolve features left tentative after the trials.
            Feature_Selector.TentativeRoughFix()
            Feature_Selector.plot(X_size=12, figsize=(12, 8),
                                  y_scale='log', which_features='all')
            Xstrain = Feature_Selector.Subset()
            selected = list(Xstrain.columns)
            print('features selected', selected)

            # Validation set imputed with the train-fold imputer as well.
            valx = pd.DataFrame(imputer.transform(self.X_validation),
                                columns=self.X_validation.columns,
                                index=self.X_validation.index)

            print('AUC')
            brfmodel.fit(X_train, y_train)
            roc = roc_auc_score(y_test,
                                brfmodel.predict_proba(X_test)[:, 1])
            print(roc)
            print('AUC Validation')
            roc_test = roc_auc_score(
                self.y_validation, brfmodel.predict_proba(valx)[:, 1])
            print(roc_test)

            # Re-fit on the reduced feature set and score again.
            print('AUC ridotte')
            brfmodel.fit(Xstrain, y_train)
            roc = roc_auc_score(
                y_test, brfmodel.predict_proba(X_test[selected])[:, 1])
            print(roc)
            roc_test = roc_auc_score(
                self.y_validation,
                brfmodel.predict_proba(valx[selected])[:, 1])
            print(roc_test)
# + [markdown] heading_collapsed=true hidden=true # ### BorutaShap # + [markdown] hidden=true # This initialization takes a maximum of 5 parameters including a tree based model of your choice example a **“Decision Tree” or “XGBoost” or "CatBoost" default is a “Random Forest”**. Which importance metric you would like to evaluate the features importance with either **Shapley values (default) or Gini importance**, A flag to specify if the problem is either classification or regression, a percentile parameter which will take a percentage of the max shadow feature thus making the selector less strict and finally a p-value or significance level which a after a feature will be either rejected or accepted. # + hidden=true model_xgb = XGBClassifier(objective='binary:logistic') Feature_Selector_xgb = BorutaShap(model=model_xgb, importance_measure='shap', classification=True) Feature_Selector_xgb.fit(X=X_one_hot, y=y, n_trials=25, random_state=0) # + hidden=true #Returns a subset of the original data with the selected features X_subset_xgb = Feature_Selector_xgb.Subset() print(X_subset_xgb.shape) X_subset_xgb.head() # + hidden=true model_cat = CatBoostClassifier() Feature_Selector_cat = BorutaShap(model=model_cat, importance_measure='shap', classification=True) Feature_Selector_cat.fit(X=X_one_hot, y=y, n_trials=25, random_state=0) # + hidden=true #Returns a subset of the original data with the selected features X_subset_cat = Feature_Selector_cat.Subset() print(X_subset_cat.shape) X_subset_cat.head()