def boruta_shap(self, X, y):
    """
    Wrapper around the BorutaShap package, which is based on the Boruta
    feature selection algorithm [1] with the addition of feature importance
    assessed using SHAP values [2]. Tree-based models are used so that
    TreeExplainer can compute SHAP values in time that scales linearly with
    the number of observations [3].

    https://github.com/Ekeany/Boruta-Shap

    [1] Kursa and Rudnicki, Feature Selection with the Boruta Package.
        Journal of Statistical Software, 2010.
    [2] Lundberg and Lee, A Unified Approach to Interpreting Model
        Predictions. arXiv:1705.07874, 2017.
    [3] Lundberg et al., Consistent Individualized Feature Attribution for
        Tree Ensembles. arXiv:1802.03888, 2019.
    """
    init_params_dic = {
        'model': self.fit_params.get('model', None),
        'importance_measure': self.fit_params.get('importance_measure', 'shap'),
        'classification': self.fit_params.get('classification', self.classification),
        'percentile': self.fit_params.get('percentile', 100),
        'pvalue': self.fit_params.get('pvalue', 0.05),
    }

    # Remove constructor arguments from the kwargs forwarded to fit().
    fit_params = self.fit_params.copy()
    for init_param in init_params_dic:
        fit_params.pop(init_param, None)

    feature_selector = BorutaShap(**init_params_dic)

    X = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
    fit_params['random_state'] = fit_params.get('random_state', self.random_state)
    fit_params['n_trials'] = fit_params.get('n_trials', self.n_bsamples)
    fit_params['verbose'] = fit_params.get('verbose', False)
    feature_selector.fit(X, y, **fit_params)

    if feature_selector.tentative:
        # Some features may remain undecided after fit(). TentativeRoughFix()
        # compares the median importance of each undecided feature against
        # the median of the max shadow feature to make a final call.
        feature_selector.TentativeRoughFix()

    self.accepted_features_index = [int(i) for i in feature_selector.accepted]
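# What follows is a minimal sketch of how boruta_shap() might be invoked.
# The host class (ShapSelector) and its attribute values are assumptions
# made for illustration; the method only relies on the attributes referenced
# above: fit_params, classification, random_state, and n_bsamples.
import pandas as pd
from BorutaShap import BorutaShap
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier


class ShapSelector:
    boruta_shap = boruta_shap  # reuse the module-level method defined above

    def __init__(self):
        # Hypothetical defaults; any tree-based model with
        # feature_importances_ works here.
        self.fit_params = {'model': RandomForestClassifier(n_estimators=50)}
        self.classification = True
        self.random_state = 0
        self.n_bsamples = 20  # forwarded to BorutaShap.fit() as n_trials


X, y = make_classification(n_samples=200, n_features=10, random_state=0)
selector = ShapSelector()
selector.boruta_shap(X, y)
print(selector.accepted_features_index)  # column indices kept by BorutaSHAP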
def Test_Models(data_type, models):
    X, y = load_data(data_type=data_type)
    for key, value in models.items():
        print('Testing: ' + str(key))
        # If no model is selected the default is a Random Forest; if
        # classification is False it is treated as a regression problem.
        Feature_Selector = BorutaShap(model=value,
                                      importance_measure='shap',
                                      classification=True)
        Feature_Selector.fit(X=X, y=y, n_trials=5, random_state=0,
                             train_or_test='train')
        # Returns a boxplot of the features; set display=True to see the
        # plots, False for automation.
        Feature_Selector.plot(X_size=12, figsize=(12, 8),
                              y_scale='log', which_features='all',
                              display=False)
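# A possible invocation of Test_Models(). The model dictionary below is an
# illustrative assumption; load_data() belongs to the surrounding test
# harness, and 'classification' is only a guess at its data_type convention.
from xgboost import XGBClassifier

models = {
    'Random Forest (default)': None,  # BorutaShap falls back to a Random Forest
    'XGBoost': XGBClassifier(),
}
Test_Models(data_type='classification', models=models)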
class PipeBorutaSHAP:
    """
    BorutaSHAP feature selector for pipelines.

    Create a BorutaSHAP instance that is compatible with scikit-learn's
    estimator API and can be used in sklearn and imblearn's pipelines.
    This is essentially a wrapper for
    [BorutaSHAP](https://github.com/Ekeany/Boruta-Shap). See documentation
    therein for additional details.

    This requires input as a Pandas DataFrame so an internal conversion will
    be performed. Also, you must provide the names of the original columns
    (in order) at instantiation.

    BorutaSHAP works with tree-based models which do not require scaling or
    other preprocessing, therefore this stage can actually be put in the
    pipeline either before or after standard scaling (see example below).

    Notes
    -----
    BorutaSHAP is expensive; default parameters are set to be gentle but it
    can dramatically increase the cost of nested CV or grid searching.

    Leave `column_names` as None in pipelines which have feature engineers
    that can change the number of components. PipeBorutaSHAP will just label
    columns with integers to handle things consistently internally.

    Example
    -------
    >>> X, y = pd.read_csv(...), pd.read_csv(...)
    >>> pipeline = imblearn.pipeline.Pipeline(steps=[
    ...     ("smote", ScaledSMOTEENN(k_enn=5, kind_sel_enn='mode')),
    ...     ("scaler", StandardScaler()),
    ...     ("boruta", PipeBorutaSHAP(column_names=X.columns)),
    ...     ('tree', DecisionTreeClassifier(random_state=0))
    ... ])
    >>> param_grid = [
    ...     {'smote__k_enn': [3, 5],
    ...      'smote__kind_sel_enn': ['all', 'mode'],
    ...      'tree__max_depth': [3, 5],
    ...      'boruta__pvalue': [0.05, 0.1]
    ...      }]
    >>> gs = GridSearchCV(estimator=pipeline,
    ...                   param_grid=param_grid,
    ...                   n_jobs=-1,
    ...                   cv=StratifiedKFold(n_splits=2, random_state=1, shuffle=True)
    ...                   )
    >>> gs.fit(X.values, y.values)
    >>> # OR, ...
    >>> NestedCV().grid_search(pipeline, param_grid, X.values, y.values)
    """

    def __init__(
        self,
        column_names=None,
        model=RF(
            n_estimators=100,
            criterion="entropy",
            random_state=0,
            class_weight="balanced",
        ),
        classification=True,
        percentile=100,
        pvalue=0.05,
    ):
        """Instantiate the class."""
        self.set_params(
            **{
                "column_names": column_names,
                "model": model,
                "classification": classification,
                "percentile": percentile,
                "pvalue": pvalue,
            }
        )

    def set_params(self, **parameters):
        """Set parameters; for consistency with sklearn's estimator API."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def get_params(self, deep=True):
        """Get parameters; for consistency with sklearn's estimator API."""
        return {
            "column_names": self.column_names,
            "model": self.model,
            "classification": self.classification,
            "percentile": self.percentile,
            "pvalue": self.pvalue,
        }

    def fit(self, X, y):
        """Fit BorutaSHAP to data."""
        # Convert X and y to pandas.DataFrame and Series internally.
        from BorutaShap import BorutaShap

        self.__boruta_ = BorutaShap(
            model=self.model,
            importance_measure="shap",
            classification=self.classification,
            percentile=self.percentile,
            pvalue=self.pvalue,
        )

        if self.column_names is None:
            self.column_names = [str(i) for i in range(X.shape[1])]
        else:
            assert X.shape[1] == len(self.column_names), \
                "X is not compatible with column names provided."

        # BorutaSHAP is expensive so try to keep these to reasonable values.
        # If used in k-fold CV the cost goes up very quickly.
        self.__boruta_.fit(
            X=pd.DataFrame(data=X, columns=self.column_names),
            y=pd.Series(data=y),
            n_trials=20,
            sample=False,
            train_or_test="test",  # Does an internal 70:30 train/test split
            normalize=True,
            verbose=False,
            random_state=0,
        )

        return self

    def transform(self, X):
        """Select the columns that were deemed important."""
        # Could reorder X relative to original input?
        return pd.DataFrame(data=X, columns=self.column_names)[self.accepted].values

    @property
    def accepted(self):
        """Get the columns that are important."""
        return self.__boruta_.accepted

    @property
    def rejected(self):
        """Get the columns that are not important."""
        return self.__boruta_.rejected
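# A minimal standalone usage sketch of PipeBorutaSHAP outside a pipeline.
# make_classification and all parameter values here are illustrative
# assumptions, not recommendations.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=8, n_informative=3,
                           random_state=0)
selector = PipeBorutaSHAP(column_names=[f"f{i}" for i in range(X.shape[1])])
X_reduced = selector.fit(X, y).transform(X)
print(selector.accepted)  # names of the retained columns
print(X_reduced.shape)    # (200, number of accepted features)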
def random_boruta(self):
    with open(self.result_folder + '/param_RF_{}.json'.format(self.epoch)) as f:
        dati = json.load(f)
    for data in dati:
        del data['value']
        brfmodel = BalancedRandomForestClassifier(**data)
        cv = StratifiedKFold(n_splits=5, shuffle=True)
        for train_index, test_index in cv.split(self.X, self.y):
            X_train = self.X.iloc[train_index]
            X_test = self.X.iloc[test_index]
            y_train = np.take(self.y, train_index)
            y_test = np.take(self.y, test_index)

            # Impute with the median of the training fold only; applying the
            # train-fitted imputer to the test fold avoids data leakage.
            median_imputer = SimpleImputer(missing_values=np.nan,
                                           strategy='median')
            imputer = median_imputer.fit(X_train)
            vX_train = imputer.transform(X_train)
            vX_test = imputer.transform(X_test)
            X_train = pd.DataFrame(vX_train, columns=X_train.columns,
                                   index=X_train.index)
            X_test = pd.DataFrame(vX_test, columns=X_test.columns,
                                  index=X_test.index)

            Feature_Selector = BorutaShap(model=brfmodel,
                                          importance_measure='shap',
                                          percentile=85, pvalue=0.08,
                                          classification=True)
            Feature_Selector.fit(X_train, y_train, n_trials=200,
                                 random_state=0)
            Feature_Selector.TentativeRoughFix()
            Feature_Selector.plot(X_size=12, figsize=(12, 8),
                                  y_scale='log', which_features='all')
            Xstrain = Feature_Selector.Subset()
            selected = [x for x in Xstrain.columns]
            print('features selected', selected)

            # Note: test_X is computed here but not used below.
            v_test_X = imputer.transform(self.X_test)
            test_X = pd.DataFrame(v_test_X, columns=self.X_test.columns,
                                  index=self.X_test.index)

            # Impute the held-out validation set with the same train-fitted
            # imputer.
            valx = self.X_validation
            valy = self.y_validation
            vvalX = imputer.transform(valx)
            valx = pd.DataFrame(vvalX, columns=valx.columns, index=valx.index)

            print('AUC')
            brfmodel.fit(X_train, y_train)
            roc = roc_auc_score(y_test, brfmodel.predict_proba(X_test)[:, 1])
            print(roc)
            print('AUC Validation')
            roc_test = roc_auc_score(
                self.y_validation, brfmodel.predict_proba(valx)[:, 1])
            print(roc_test)
            print('AUC (reduced features)')
            brfmodel.fit(Xstrain, y_train)
            roc = roc_auc_score(
                y_test, brfmodel.predict_proba(X_test[selected])[:, 1])
            print(roc)
            roc_test = roc_auc_score(
                self.y_validation, brfmodel.predict_proba(valx[selected])[:, 1])
            print(roc_test)
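# A self-contained sketch of the leakage-safe imputation pattern used above:
# the imputer is fit on the training fold only, and its medians are applied
# to every other split. The data here is synthetic and purely illustrative.
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

train = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]})
test = pd.DataFrame({"a": [np.nan, 2.0], "b": [np.nan, 6.0]})

imputer = SimpleImputer(strategy="median").fit(train)  # train statistics only
test_filled = pd.DataFrame(imputer.transform(test), columns=test.columns)
print(test_filled)  # NaNs in `test` replaced with train medians (a=2.0, b=4.5)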
def test_class_constructs():
    BorutaShap()
X_filtered.head()

# + hidden=true
X_new = X_one_hot[['age', 'avg_glucose_level', 'heart_disease_1']]
X_new.head()

# + [markdown] heading_collapsed=true hidden=true
# ### BorutaShap

# + [markdown] hidden=true
# This initialization takes a maximum of 5 parameters: a tree-based model of
# your choice, for example a **“Decision Tree”, “XGBoost”, or “CatBoost”
# (default is a “Random Forest”)**; which importance metric to evaluate
# feature importance with, either **Shapley values (default) or Gini
# importance**; a flag to specify whether the problem is classification or
# regression; a percentile parameter, which takes a percentage of the max
# shadow feature, thus making the selector less strict; and finally a
# p-value or significance level at which a feature will be either rejected
# or accepted.

# + hidden=true
model_xgb = XGBClassifier(objective='binary:logistic')

Feature_Selector_xgb = BorutaShap(model=model_xgb,
                                  importance_measure='shap',
                                  classification=True)

Feature_Selector_xgb.fit(X=X_one_hot, y=y, n_trials=25, random_state=0)

# + hidden=true
# Returns a subset of the original data with the selected features.
X_subset_xgb = Feature_Selector_xgb.Subset()
print(X_subset_xgb.shape)
X_subset_xgb.head()

# + hidden=true
model_cat = CatBoostClassifier()

Feature_Selector_cat = BorutaShap(model=model_cat,
                                  importance_measure='shap',
                                  classification=True)

Feature_Selector_cat.fit(X=X_one_hot, y=y, n_trials=25, random_state=0)
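# + hidden=true
# A sketch spelling out all five constructor parameters described above; the
# DecisionTreeClassifier and the percentile/pvalue values are illustrative
# assumptions, not recommendations.
from sklearn.tree import DecisionTreeClassifier

Feature_Selector_tree = BorutaShap(
    model=DecisionTreeClassifier(),  # any tree-based model; default is a Random Forest
    importance_measure='shap',       # or 'gini'
    classification=True,             # False for regression problems
    percentile=100,                  # <100 relaxes the max shadow feature threshold
    pvalue=0.05,                     # significance level for accept/reject
)
Feature_Selector_tree.fit(X=X_one_hot, y=y, n_trials=25, random_state=0)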