def select_features(X: pd.DataFrame, y: pd.Series, mode: str,
                    n_estimators: int = 50, max_iter: int = 100,
                    perc: int = 75, learning_rate: float = 0.01,
                    verbosity: int = -1, seed: int = 1, max_depth: int = -1,
                    random_state: int = 1, verbose: int = 2) -> List[str]:
    """Select columns of ``X`` with Boruta driven by an LGBM-based estimator.

    ``mode == "regression"`` picks the regression objective with RMSE; any
    other value picks the binary objective with AUC.  A ``TypeError`` raised
    while fitting is deliberately ignored.

    Returns:
        Names of the columns flagged in the selector's ``support_`` mask.
    """
    is_regression = mode == "regression"
    lgbm_params = {
        "objective": "regression" if is_regression else "binary",
        "metric": "rmse" if is_regression else "auc",
        "learning_rate": learning_rate,
        "verbosity": verbosity,
        "seed": seed,
        "max_depth": max_depth,
    }
    estimator = LGBMFeatureEstimator(lgbm_params, n_estimators)
    selector = BorutaPy(
        estimator,
        n_estimators=n_estimators,
        max_iter=max_iter,
        verbose=verbose,
        random_state=random_state,
        perc=perc,
    )

    try:
        selector.fit(X.values, y.values.ravel())
    except TypeError:
        # Best effort: the mask is read below regardless of this error.
        pass

    supported_columns = X.columns[selector.support_]
    return supported_columns.tolist()
def test_if_boruta_extracts_relevant_features(self):
    """Boruta must confirm exactly the five informative synthetic columns."""
    # NOTE: the order of the np.random draws below is significant -- every
    # draw advances the global generator seeded here.
    np.random.seed(42)
    n = 1000
    y = np.random.binomial(1, 0.5, n)

    # Label with some entries nudged down/up, clamped back into {0, 1}.
    nudge_down = np.random.binomial(1, 0.1, n)
    nudge_up = np.random.binomial(1, 0.1, n)
    z = np.clip(y - nudge_down + nudge_up, 0, 1)

    # 5 relevant features
    magnitude = np.abs(np.random.normal(0, 1, n))
    jitter = np.random.normal(0, 0.1, n)
    relevant = [
        z,
        y * magnitude + jitter,
        y + np.random.normal(0, 1, n),
        y**2 + np.random.normal(0, 1, n),
        np.sqrt(y) + np.random.binomial(2, 0.1, n),
    ]

    # 5 irrelevant features
    irrelevant = [
        np.random.normal(0, 1, n),
        np.random.poisson(1, n),
        np.random.binomial(1, 0.3, n),
        np.random.normal(0, 1, n),
        np.random.poisson(1, n),
    ]

    X = np.zeros((n, 10))
    for position, column in enumerate(relevant + irrelevant):
        X[:, position] = column

    bt = BorutaPy(RandomForestClassifier())
    bt.fit(X, y)

    # make sure that only all the relevant features are returned
    confirmed = list(np.flatnonzero(bt.support_))
    self.assertListEqual(list(range(5)), confirmed)
def test_if_boruta_extracts_relevant_features(self):
    """Check that Boruta confirms exactly the five informative columns.

    The data is synthetic: columns 0-4 are derived from the binary label
    ``y``; columns 5-9 are drawn independently of it.
    """
    np.random.seed(42)
    y = np.random.binomial(1, 0.5, 1000)
    X = np.zeros((1000, 10))

    z = y - np.random.binomial(1, 0.1, 1000) + np.random.binomial(1, 0.1, 1000)
    z[z == -1] = 0
    z[z == 2] = 1

    # 5 relevant features
    X[:, 0] = z
    X[:, 1] = y * np.abs(np.random.normal(0, 1, 1000)) + np.random.normal(0, 0.1, 1000)
    X[:, 2] = y + np.random.normal(0, 1, 1000)
    X[:, 3] = y ** 2 + np.random.normal(0, 1, 1000)
    X[:, 4] = np.sqrt(y) + np.random.binomial(2, 0.1, 1000)

    # 5 irrelevant features
    X[:, 5] = np.random.normal(0, 1, 1000)
    X[:, 6] = np.random.poisson(1, 1000)
    X[:, 7] = np.random.binomial(1, 0.3, 1000)
    X[:, 8] = np.random.normal(0, 1, 1000)
    X[:, 9] = np.random.poisson(1, 1000)

    rfc = RandomForestClassifier()
    bt = BorutaPy(rfc)
    bt.fit(X, y)

    # make sure that only all the relevant features are returned.
    # assertItemsEqual exists only on Python 2's TestCase; comparing sorted
    # lists keeps the order-insensitive check and runs on Python 2 and 3.
    selected = sorted(np.where(bt.support_)[0].tolist())
    self.assertListEqual(list(range(5)), selected)
def perform_boruta_fs(self):
    """Reduce the pipeline's train/test frames to the Boruta-supported columns.

    Does nothing unless ``self.ml_pipeline.config.fs_boruta_flg`` is truthy.
    Otherwise fits Boruta on the training data and replaces
    ``ml_pipeline.x_train`` / ``ml_pipeline.x_test`` with new DataFrames that
    contain only the supported columns.
    """
    if not self.ml_pipeline.config.fs_boruta_flg:
        return

    train_in = self.ml_pipeline.x_train
    test_in = self.ml_pipeline.x_test
    target = self.ml_pipeline.y_train

    self.jlogger.info(
        "Inside BorutaFS, Before Shape Train: {}".format(train_in.shape))
    self.jlogger.info(
        "Inside BorutaFS, Before Shape Test: {}".format(test_in.shape))

    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    selector = BorutaPy(forest, n_estimators='auto', random_state=50)
    selector.fit(train_in.values, target)

    train_values = selector.transform(train_in.values)
    test_values = selector.transform(test_in.values)
    kept_columns = train_in.columns[selector.support_]

    # The rebuilt frames get a fresh default index.
    train_out = pd.DataFrame(train_values, columns=kept_columns)
    test_out = pd.DataFrame(test_values, columns=kept_columns)
    self.ml_pipeline.x_train = train_out
    self.ml_pipeline.x_test = test_out

    self.jlogger.info(
        "Inside BorutaFS, After Shape Train: {}".format(train_out.shape))
    self.jlogger.info(
        "Inside BorutaFS, After Shape Test: {}".format(test_out.shape))
def run(self):
    """Rank features with Boruta and write the top-ranked subset to CSV.

    Reads the three outputs of ``data_transformation``, fits Boruta on the
    feature matrix, keeps the columns whose Boruta rank is 1 or 2 and writes
    them (``output1``) and the ``Churn`` column (``output2``) of this task.
    """
    print("Here : ")
    dummies = pd.read_csv(data_transformation().output()['output1'].path)
    features = pd.read_csv(data_transformation().output()['output2'].path)
    labels = pd.read_csv(data_transformation().output()['output3'].path)

    # Inserts the literal label 'NO' at flat position 7031 of the labels.
    # NOTE(review): presumably pads a missing row -- confirm against the data.
    label_values = np.insert(labels.values, 7031, 'NO')

    # Random forest using all cores, wrapped by the Boruta selector.
    selector = BorutaPy(RandomForestClassifier(n_jobs=-1),
                        n_estimators='auto', random_state=1)
    selector.fit(features.values, label_values)

    # One row per feature: transpose, then attach rank and feature name.
    rank_table = dummies.drop(['Churn'], axis=1).T
    rank_table['Boruta_Rank'] = selector.ranking_
    rank_table['Feature'] = rank_table.index
    rank_table = rank_table.sort_values('Boruta_Rank')

    # Keep only features ranked 1 or 2.
    top_ranked = rank_table[rank_table['Boruta_Rank'].isin([1, 2])]
    chosen = top_ranked.index

    X_selected = dummies[chosen]
    y_selected = dummies["Churn"]

    print(self.output())
    X_selected.to_csv(self.output()['output1'].path, index=False)
    y_selected.to_csv(self.output()['output2'].path, index=False)
def boruta_selector(df, y=None):
    """Return the feature columns of ``df`` that Boruta does not confirm.

    Numeric columns are median-imputed, object columns are imputed with the
    most frequent value and ordinal-encoded, and Boruta (random forest,
    ``max_iter=100``) is fitted on the result.

    Args:
        df: DataFrame holding both the features and the target column.
        y: Name of the target column in ``df``.

    Returns:
        List of column names to drop, in ``df``'s column order.
    """
    target = df[y]
    features = df.drop(y, axis=1)

    num_feat = features.select_dtypes(include=['int', 'float']).columns.tolist()
    cat_feat = features.select_dtypes(include=['object']).columns.tolist()

    pipe_num_tree = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='median'))])
    pipe_cat_tree = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='most_frequent')),
               ('cat_transformer', OrdinalEncoder())])
    preprocessor_tree = ColumnTransformer(
        transformers=[('num_preprocessor', pipe_num_tree, num_feat),
                      ('cat_preprocessor', pipe_cat_tree, cat_feat)])

    X = preprocessor_tree.fit_transform(features)

    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced',
                                max_depth=5)

    # Create the Boruta selector (at most 100 iterations).
    feat_selector = BorutaPy(rf, n_estimators='auto', random_state=123,
                             max_iter=100)
    feat_selector.fit(X, target)

    # The transformer emits the numeric columns first, then the categorical
    # ones, and drops every other column, so the support mask must be paired
    # with that order rather than with the original column order of ``df``.
    transformed_order = num_feat + cat_feat
    rejected = {
        name
        for name, confirmed in zip(transformed_order, feat_selector.support_)
        if not confirmed
    }
    return [column for column in features.columns if column in rejected]
def fitBorutaRF():
    """Filter the module-level train/test frames down to Boruta's selection.

    Reads ``X_train``, ``y_train`` and ``X_test`` from the enclosing scope.

    Returns:
        ``(X_important_train, X_important_test)`` restricted to the columns
        flagged in the selector's ``support_`` mask.
    """
    # Boruta documentation: https://pypi.python.org/pypi/Boruta/0.1.5
    print('Feature selection from Boruta RandomForestClassifier: ')

    forest = RandomForestClassifier(n_jobs=-1, random_state=0, max_depth=5,
                                    class_weight='balanced')
    selector = BorutaPy(forest, n_estimators='auto', verbose=0,
                        random_state=0)
    selector.fit(X_train.values, y_train.values)

    mask = selector.support_
    train_selected = X_train.iloc[:, mask]
    test_selected = X_test.iloc[:, mask]

    print("Boruta selected features for the model: ",
          list(train_selected.columns))
    return train_selected, test_selected
def feature_engineering(X_all, y_all, df_dummies):
    """Keep the features Boruta ranks 1 or 2 and pickle their index.

    Args:
        X_all: Feature DataFrame used to fit Boruta.
        y_all: Target values (anything exposing ``.values``).
        df_dummies: DataFrame containing the features plus a ``Churn`` column.

    Returns:
        ``(X_selected, y_selected, upload_featuredIndexFilePath)``.
    """
    # Random forest using all cores, wrapped by the Boruta selector.
    selector = BorutaPy(RandomForestClassifier(n_jobs=-1),
                        n_estimators='auto', random_state=1)
    selector.fit(X_all.values, y_all.values)

    # One row per feature: transpose, then attach rank and feature name.
    rank_table = df_dummies.drop(['Churn'], axis=1).T
    rank_table['Boruta_Rank'] = selector.ranking_
    rank_table['Feature'] = rank_table.index
    rank_table = rank_table.sort_values('Boruta_Rank')

    # Keep only features ranked 1 or 2.
    is_top = rank_table['Boruta_Rank'].isin([1, 2])
    chosen = rank_table[is_top].index

    X_selected = df_dummies[chosen]
    y_selected = df_dummies["Churn"]

    # Pickle the selected features for Form Uploads
    upload_featuredIndexFilePath = pickle_df_index(
        X_selected, 'featured_index_dict.pkl')
    return X_selected, y_selected, upload_featuredIndexFilePath
def select_features_by_boruta(X_train, X_test, y_train):
    """Fit Boruta with a random-forest regressor and filter both frames.

    Returns:
        ``(feature_selected_cols, X_train_selected, X_test_selected)`` where
        the first item is the list of supported column names.
    """
    forest_options = dict(n_estimators=50, max_depth=5, max_features='sqrt',
                          n_jobs=-1, verbose=True, random_state=1)
    boruta_options = dict(n_estimators='auto', perc=80, verbose=2,
                          two_step=False, max_iter=100, random_state=1)

    selector = BorutaPy(RandomForestRegressor(**forest_options),
                        **boruta_options)
    selector.fit(X_train.values, y_train.values)

    mask = selector.support_
    train_kept = X_train.iloc[:, mask]
    test_kept = X_test.iloc[:, mask]
    kept_names = list(train_kept.columns)

    print('Selected features are: ', kept_names)
    return kept_names, train_kept, test_kept
def get_boruta(X, y):
    """
    Fit Boruta on the passed dataset, print its selection and return it.

    :param X: pandas DataFrame of features (its ``columns`` are used to name
        the selected features)
    :param y: array-like target
    :return: the fitted ``BorutaPy`` instance
    """
    from boruta import BorutaPy
    from sklearn.ensemble import RandomForestRegressor
    import numpy as np

    # Initialize Boruta
    forest = RandomForestRegressor(n_jobs=-1, max_depth=5)
    boruta = BorutaPy(
        estimator=forest,
        n_estimators='auto',
        max_iter=100  # number of trials to perform
    )

    # fit Boruta (it accepts np.array, not pd.DataFrame)
    boruta.fit(np.array(X), np.array(y))

    # print results
    green_area = X.columns[boruta.support_].to_list()
    blue_area = X.columns[boruta.support_weak_].to_list()
    print('features in the green area:', green_area)
    print('features in the blue area:', blue_area)
    # The fitted attribute is ``ranking_``; ``_rankings`` does not exist and
    # raised AttributeError after a successful fit.
    print('features ranking :', boruta.ranking_)
    return boruta
def by_boruta(data):
    """Return the feature names of ``data`` ordered by descending Boruta rank.

    ``data`` must contain a ``type`` column, which is used as the target.
    Both the target and the features are cast to ``int`` before fitting.
    NOTE(review): descending rank puts the largest rank numbers first --
    confirm that this is the order callers expect.
    """
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from boruta import BorutaPy

    target = data['type'].values.astype(int)
    feature_frame = data.drop(columns=['type'])
    names = feature_frame.columns.to_list()
    matrix = feature_frame.values.astype(int)

    forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced')
    selector = BorutaPy(forest, n_estimators='auto', verbose=2)
    selector.fit(matrix, target)

    ranked = pd.DataFrame(data={
        'features': names,
        'ranking': selector.ranking_
    })
    ranked.sort_values(["ranking"], axis="rows", ascending=[False],
                       inplace=True)
    return ranked['features'].to_list()
def Boruta_fs(x_train, y_train):
    """Rank the columns of ``x_train`` with Boruta.

    Arguments:
        x_train: feature DataFrame
        y_train: target exposing ``.values``

    Returns:
        ``(df_rank, df_rank_confirmed)``: the rank of every feature, and a
        1-indexed single-column frame with the names of the rank-1 features.
    """
    forest = RandomForestClassifier(n_jobs=-1, random_state=0,
                                    class_weight='balanced')
    boruta = BorutaPy(forest, n_estimators='auto', verbose=2, random_state=0)
    boruta.fit(x_train.values, y_train.values)

    names = x_train.columns.values
    ranks = boruta.ranking_

    # Full ranked list.
    df_rank = pd.DataFrame({'Rank': ranks, 'Features': names})

    # Confirmed features are exactly those with rank 1.
    df_rank_confirmed = pd.DataFrame(names[ranks == 1])
    df_rank_confirmed.index = df_rank_confirmed.index + 1

    return df_rank, df_rank_confirmed
def Feature_sort(Feat_scale, Label, threads=4):
    """Rank the features with three different feature-selection methods.

    Returns a dict mapping the method name to an array of feature indices
    ordered by that method (``np.argsort`` output).
    """
    # Univariate feature selection: order by ascending p-value.
    univariate = SelectKBest(f_classif, k='all')
    univariate.fit_transform(Feat_scale, Label)
    univariate_order = np.argsort(univariate.pvalues_)

    # Randomized logistic regression; a bigger n_resampling gives a more
    # robust result.  From about position 1900 onward the ordering of the
    # remaining features is rather doubtful.
    # DeprecationWarning: RandomizedLogisticRegression is deprecated in 0.19
    # and will be removed in 0.21.
    randomized = RandomizedLogisticRegression(n_jobs=1,
                                              n_resampling=2000,
                                              selection_threshold=0,
                                              verbose=False,
                                              random_state=0)
    randomized.fit(Feat_scale, Label)
    randomized_order = np.argsort(-abs(randomized.scores_))

    # Boruta on top of a random forest.
    forest = RandomForestClassifier(random_state=0, n_jobs=threads,
                                    max_features='auto')
    boruta = BorutaPy(forest, n_estimators='auto', perc=80, random_state=0)
    boruta.fit(Feat_scale, Label)
    boruta_order = np.argsort(boruta.ranking_)

    return {
        "Univariate_f": univariate_order,
        "Randomized_Logistic_f": randomized_order,
        "Boruta_f": boruta_order,
    }
class DataTransformerBoruta:
    """Boruta feature selection followed by a correlation-removal step."""

    def __init__(self, corr_th, n_est=500, seed=123):
        """Build the Boruta selector and the ``CorrelationRemover``."""
        self.boruta = True
        forest = RandomForestClassifier(n_estimators=n_est,
                                        class_weight="balanced",
                                        n_jobs=6)
        self.feature_selector = BorutaPy(forest,
                                         n_estimators="auto",
                                         verbose=0,
                                         random_state=seed,
                                         max_iter=100)
        self.corr_rem = CorrelationRemover(corr_th)

    def _supported(self, X):
        """Return ``X`` restricted to the columns in the Boruta support mask."""
        kept = X.columns[self.feature_selector.support_]
        return X[kept]

    def fit_transform(self, X, y):
        """Fit Boruta and the correlation remover on ``X``; return the result."""
        features = np.array(X)
        target = np.array(y).reshape(-1)
        self.feature_selector.fit(features, target)
        return self.corr_rem.fit_transform(self._supported(X))

    def transform(self, X):
        """Apply the already-fitted selection and correlation removal to ``X``."""
        return self.corr_rem.transform(self._supported(X))

    def get_selected_num(self):
        """Return Boruta's ``n_features_`` minus the number of removed columns."""
        return (self.feature_selector.n_features_
                - self.corr_rem.get_removed_num())

    def get_selected_vec(self, X):
        """Return the surviving column labels of ``X``, each incremented by one."""
        supported = X.columns[self.feature_selector.support_]
        removed = self.corr_rem.get_removed_vec()
        survivors = set(supported) - set(removed)
        # +1 because columns are counted from 1.
        return np.array(list(survivors)) + 1
def main():
    """Run Boruta on the training CSV and write support, ranking and data.

    Reads ``./data/my_midterm_train.csv`` (target column ``y``), fits Boruta
    with a random forest, then writes the support mask, the feature ranking
    and the filtered training data (selected features plus ``y``) to CSV.
    """
    print("Begin Feature Selection Step...")
    print('-' * 60)
    print('Loading Data...')
    df = pd.read_csv("./data/my_midterm_train.csv")

    y = df['y']
    X = df.drop(['y'], axis=1)

    # define random forest classifier, with utilising all cores and
    # class weights derived from the y labels.  'balanced' replaces the
    # former 'auto' option, which scikit-learn no longer accepts.
    rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced',
                                max_depth=5)

    # define Boruta feature selection method
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2)

    print("Fitting Boruta...")
    # find all relevant features
    feat_selector.fit(X.values, y)

    print("Selected Features:")
    # check selected features
    print(feat_selector.support_)
    support = pd.DataFrame(feat_selector.support_)

    print("Selected Feature Rank:")
    # check ranking of features
    print(feat_selector.ranking_)
    # The ranking file was previously filled with the support mask.
    ranking = pd.DataFrame(feat_selector.ranking_)

    # filter X down to the selected features
    print("Transforming X...")
    # Boolean-mask column selection; ``.ix`` no longer exists in pandas.
    X_filtered = X.loc[:, feat_selector.support_]

    print("Writing Data...")
    support.to_csv("./work_dir/feature_support.csv", index=False)
    ranking.to_csv("./work_dir/feature_ranking.csv", index=False)

    combined_df = pd.concat([X_filtered, y], axis=1)
    combined_df.to_csv("./data/boruta_filtered_stacked_train.csv",
                       index=False)
def select_features(X: pd.DataFrame, y: pd.Series, mode: str,
                    n_estimators: int = 50, max_iter: int = 50,
                    perc: int = 75) -> List[str]:
    """Select columns of ``X`` with Boruta driven by an LGBM-based estimator.

    Args:
        X: Feature frame.
        y: Target series.
        mode: ``"regression"`` selects the regression objective with RMSE;
            any other value selects the binary objective with AUC.
        n_estimators: Passed to both the estimator wrapper and Boruta.
        max_iter: Maximum number of Boruta iterations.
        perc: Percentile passed to Boruta as ``perc``.

    Returns:
        Names of the columns flagged in the selector's ``support_`` mask.

    Raises:
        Exception: The fitting error is re-raised when it happened before
            Boruta produced a ``support_`` mask.
    """
    is_regression = mode == "regression"
    feat_estimator = LGBMFeatureEstimator(
        {
            "objective": "regression" if is_regression else "binary",
            "metric": "rmse" if is_regression else "auc",
            "learning_rate": 0.01,
            "verbosity": -1,
            "seed": 1,
            "max_depth": 7,
            "min_data_in_leaf": 3,
        }, n_estimators)

    feat_selector = BorutaPy(feat_estimator,
                             n_estimators=n_estimators,
                             max_iter=max_iter,
                             verbose=2,
                             random_state=1,
                             perc=perc)

    try:
        feat_selector.fit(X.values, y.values.ravel())
    except Exception:
        # Best effort: errors raised after the mask was computed are still
        # ignored.  ``Exception`` (instead of a bare ``except``) lets
        # KeyboardInterrupt/SystemExit propagate, and a failure that left no
        # mask is re-raised rather than surfacing as an AttributeError below.
        if not hasattr(feat_selector, "support_"):
            raise

    return X.columns[feat_selector.support_].tolist()
def boruta_select(X_df, Y, perc_list=(20,), allowed_perc_good=.5, allowed_perc_med=.70, samples=(1,),
                  multiclass=False):
    """
    Runs the Boruta selector once per percentile and combines the results.

    :param X_df: The X Dataframe that the selector will run on
    :param Y: The y for the training of the selector
    :param perc_list: The percentiles Boruta will be run with (non-empty sequence)
    :param allowed_perc_good: Fraction of the best confirmed count a variable must
        reach to be kept
    :param allowed_perc_med: Fraction of the reference count a variable's tentative
        count must reach to be kept
    :param samples: unused at this moment, possible expansion into sampling
    :param multiclass: currently has no effect; the same estimator parameters are
        used either way
    :return: first dataframe is if the variable should be used, second is what
        variables were relevant at each percentile, third is what variables were
        tentative at each percentile
    :raises ValueError: if ``perc_list`` is empty
    """
    if len(perc_list) == 0:
        raise ValueError("perc_list must contain at least one percentile")

    y = Y.values.ravel()
    res_df_good = pd.DataFrame(index=X_df.columns)
    res_df_med = pd.DataFrame(index=X_df.columns)
    use_df = pd.DataFrame(index=X_df.columns)

    params_bor = {'num_leaves': 20, 'n_estimators': 100, 'boosting_type': 'rf',
                  'bagging_fraction': .8, 'bagging_freq': 1}
    rf_bor = lgb.LGBMClassifier(**params_bor)

    # Accumulators start as None so that a later percentile equal to the
    # first one adds to the counts instead of resetting them.
    times_good = None
    times_kinda_good = None
    for perc_ in perc_list:
        print('Starting on {}'.format(perc_))
        feat_selector = BorutaPy(rf_bor, n_estimators=100, verbose=0, random_state=None, max_iter=10,
                                 perc=perc_)
        feat_selector.fit(X_df.values, y)

        # ``* 1`` turns the boolean masks into 0/1 integer arrays.
        confirmed = feat_selector.support_ * 1
        tentative = feat_selector.support_weak_ * 1
        if times_good is None:
            times_good = confirmed.copy()
            times_kinda_good = tentative.copy()
        else:
            times_good += confirmed
            times_kinda_good += tentative

        res_df_good[str(perc_)] = confirmed
        res_df_med[str(perc_)] = tentative

    times_good_max = times_good.max()
    # NOTE(review): the tentative threshold is scaled by the *confirmed*
    # maximum, exactly as before; confirm whether times_kinda_good.max()
    # was intended.
    times_med_max = times_good.max()

    keep = (((times_good >= allowed_perc_good * times_good_max)
             | (times_kinda_good >= allowed_perc_med * times_med_max))
            & (times_good + times_kinda_good > 0))

    use_df['use'] = keep
    return (use_df, res_df_good, res_df_med)
def _boruta(self):
    """Fit Boruta on the training data with a factory-built model.

    Returns:
        The fitted ``BorutaPy`` instance.
    """
    self._info(f"Feature importance {self.tag}: Boruta algorithm")
    factory = ModelFactoryRandomForest(self.config, self.datasets,
                                       self.model_type)
    selector = BorutaPy(factory.get(), n_estimators='auto', verbose=2)
    selector.fit(self.x_train, self.y_train)
    return selector
def do_boruta(model, X, y, max_iter=500, random_state=42):
    """Run Boruta on a clone of ``model`` and return the supported columns.

    Returns:
        Array with the names of the columns of ``X`` flagged in ``support_``.
    """
    boruta_options = {
        'n_estimators': 'auto',
        'verbose': 0,
        'random_state': random_state,
        'max_iter': max_iter,
    }
    boruta = BorutaPy(clone(model), **boruta_options)
    boruta.fit(X.values, y.values)
    print('do_feat_boruta: Done')

    column_names = X.columns.values
    return column_names[boruta.support_]
def find_subsystems_of_interest(studyName, groupsList, geneCounts, level,
                                percentage):
    """
    Summary:
        uses Boruta machine learning method to roughly determine potential
        genes of interest.
        requires tab-separated matrix from MG-RAST analysis page

    Args:
        studyName (str): directory (study name); output files are written
            inside it
        groupsList (list): list of group names (not used by the computation)
        geneCounts (DataFrame): count matrix with one numeric column per
            sample plus ``level1``, ``level2``, ``level3`` and ``function``
            columns; the class label of a sample is the part of its column
            name before the first ``_``
        level (str): subsystems level at which to run Boruta
        percentage (int): threshold for Boruta feature selection

    Returns:
        None, outputs files with tentative genes/gene families of interest
    """
    numGeneCounts = geneCounts.select_dtypes(include=[np.number])
    Y = numGeneCounts.transpose().index.str.split('_').str[0].values
    samplingDepth = numGeneCounts.sum().median()

    # Remember the starting directory and restore it even on failure;
    # a plain chdir('..') is wrong for nested or absolute study paths.
    original_cwd = os.getcwd()
    os.chdir(studyName)
    try:
        for i, column in enumerate(numGeneCounts.columns):
            if int(numGeneCounts[column].sum()) < samplingDepth:
                # Shallower than the target depth: use the counts unchanged.
                meanSubsample = numGeneCounts[column]
            else:
                # Average 100 random subsamples drawn at the target depth.
                subsampleList = []
                for _ in range(100):
                    sample = subsample_counts(
                        numGeneCounts[column].transpose().values,
                        int(samplingDepth))
                    subsampleList.append(sample)
                print("completed 100 subsamples for sample number " + str(i))
                meanSubsample = pd.Series(subsampleList).mean()

            # recodification: setting all values less than 1.01 to zero
            meanSubsample[meanSubsample < 1.01] = 0
            # convert to a percentage of the sample total
            meanSubsample = 100 * meanSubsample / meanSubsample.sum()
            numGeneCounts[column] = meanSubsample

        numGeneCounts['level1'] = geneCounts['level1']
        numGeneCounts['level2'] = geneCounts['level2']
        numGeneCounts['level3'] = geneCounts['level3']
        numGeneCounts['function'] = geneCounts['function']
        countsLvl = numGeneCounts.groupby(level).sum()

        rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced',
                                    max_depth=3)
        # Boruta expects samples as rows and features as columns.
        X = countsLvl.transpose().values
        feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2,
                                 perc=int(percentage))
        feat_selector.fit(X, Y)

        if len(countsLvl[feat_selector.support_]) > 0:
            countsLvl[feat_selector.support_].to_csv(
                str(level) + '_tentative.csv')
            countsLvl[feat_selector.support_weak_].to_csv(
                str(level) + '_tentative_weak.csv')
    finally:
        os.chdir(original_cwd)
def boruta_fs(X, y, feat_names):
    """Return ``feat_names`` ordered by ascending Boruta rank.

    Features sharing a rank are ordered by comparing their names.
    """
    forest = RandomForestClassifier(n_estimators=10000, n_jobs=4,
                                    max_depth=1)
    selector = BorutaPy(forest, n_estimators='auto', verbose=2, max_iter=50)
    selector.fit(X, y)

    rank_name_pairs = list(zip(selector.ranking_, feat_names))
    rank_name_pairs.sort()
    return [name for _rank, name in rank_name_pairs]
def get_boruta_features(est, X, y, mode):
    """Return the columns of ``X`` that Boruta confirms.

    Args:
        est: Unused; kept so existing callers keep working.
        X: 2-D array of features (indexed as ``X[:, mask]``).
        y: Target array.
        mode: ``'regression'`` or ``'classification'``; selects the kind of
            random forest Boruta wraps.

    Returns:
        ``X`` restricted to the columns flagged in ``support_``.

    Raises:
        ValueError: If ``mode`` is neither of the two supported values.
    """
    if mode == 'regression':
        rf = RandomForestRegressor(n_estimators=500, random_state=SEED)
    elif mode == 'classification':
        rf = RandomForestClassifier(n_estimators=500, random_state=SEED)
    else:
        # Previously fell through and failed later with UnboundLocalError.
        raise ValueError(
            "mode must be 'regression' or 'classification', got {!r}".format(
                mode))

    boruta = BorutaPy(rf, n_estimators='auto')
    boruta.fit(X, y)
    X_features = X[:, boruta.support_]
    return X_features
def select_features(X, y, X_sub, feature_name, perc=10, max_depth=5,
                    verbose=2):
    """Fit Boruta on ``X`` and apply the selection to ``X`` and ``X_sub``.

    Prints the entries of ``feature_name`` at the supported positions.

    Returns:
        ``(transformed X.values, transformed X_sub.values)``.
    """
    import sklearn.ensemble
    from boruta import BorutaPy

    forest = sklearn.ensemble.RandomForestRegressor(max_depth=max_depth)
    selector = BorutaPy(forest, n_estimators='auto', perc=perc,
                        verbose=verbose)
    selector.fit(X.values, y)

    used_features = []
    for position, is_supported in enumerate(selector.support_):
        if is_supported:
            used_features.append(feature_name[position])
    print(used_features)

    train_selected = selector.transform(X.values)
    sub_selected = selector.transform(X_sub.values)
    return train_selected, sub_selected
def cal_boruta(df, target, n=50):
    """Return a frame of feature names and Boruta ranks, best rank first.

    Args:
        df: DataFrame holding the features and the ``target`` column.
        target: Name of the target column.
        n: Maximum number of Boruta iterations.

    Returns:
        DataFrame with columns ``features`` and ``rank``, sorted by ascending
        rank with a fresh index.
    """
    predictors = df.drop([target], axis=1)
    labels = df[target].ravel()

    forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced',
                                    max_depth=10)
    selector = BorutaPy(forest, n_estimators='auto', max_iter=n, verbose=2,
                        random_state=1)
    selector.fit(predictors.values, labels)

    feature_df = pd.DataFrame(predictors.columns.tolist(),
                              columns=['features'])
    feature_df['rank'] = selector.ranking_
    ordered = feature_df.sort_values('rank', ascending=True)
    return ordered.reset_index(drop=True)
def get_features_filter(X: pd.DataFrame, y: pd.DataFrame) -> BorutaPy:
    """Fit and return a Boruta selector for ``X`` against ``y``.

    Uses a balanced random forest of depth 5 on all CPUs, ``alpha=0.05`` and
    only 10 iterations (in practice one would run at least 100-200).
    """
    forest = RandomForestClassifier(n_jobs=cpu_count(),
                                    class_weight='balanced',
                                    max_depth=5)
    selector = BorutaPy(
        forest,
        n_estimators='auto',
        verbose=2,
        alpha=0.05,  # p_value
        max_iter=10,
        random_state=42)
    selector.fit(X.values, y.values.ravel())
    return selector
def get_features_filter(X: pd.DataFrame, y: pd.DataFrame, name: str,
                        cicli: int) -> BorutaPy:
    """Fit and return a Boruta selector for ``X`` against ``y``.

    ``cicli`` is the maximum number of Boruta iterations; ``name`` is not
    used by this function.
    """
    forest = RandomForestClassifier(n_jobs=cpu_count(),
                                    class_weight='balanced',
                                    max_depth=5)
    selector = BorutaPy(forest,
                        n_estimators='auto',
                        verbose=2,
                        alpha=0.05,
                        max_iter=cicli,
                        random_state=42)
    features = X.values
    target = y.values.ravel()
    selector.fit(features, target)
    return selector
def fit(self, df, cfg):
    """
    Performs Boruta feature selection.

    Parameters:
        df (dataframe): dataframe.
        cfg (dict): configuration dictionary. Reads 'drop_cols', 'ID_COL',
            'CTU_COL', 'TE_TARGET_COL', 'seed', 'max_iter',
            'z_score_percentile' and 'two_step'.

    Returns:
        selected_features: list of selected variable names.
    """
    # Build the exclusion set once instead of concatenating per column.
    excluded = set(cfg['drop_cols'] + [cfg['ID_COL'], cfg['CTU_COL']])
    all_features = [x for x in df.columns if x not in excluded]

    X = df[all_features].values
    y = df[cfg['TE_TARGET_COL']].values.ravel()

    positives = sum(y)
    if (positives / len(y)) < 0.1:
        # Rare positive class: weight it by the negative/positive ratio and
        # use a deeper, larger forest.
        class_ratio = (len(y) - positives) / positives
        print ("Class Ratio:", class_ratio)
        class_weight = dict({1: class_ratio, 0: 1.5})
        max_depth = 8
        n_estimators = 400
    else:
        class_weight = None
        max_depth = 5
        n_estimators = 200

    # 'min_impurity_split' is no longer passed (it was only ever None, its
    # default, and current scikit-learn rejects the keyword); 'sqrt' is what
    # 'auto' meant for classifiers.
    param = {
        'bootstrap': True,
        'class_weight': class_weight,
        'criterion': 'gini',
        'max_depth': max_depth,
        'max_features': 'sqrt',
        'max_leaf_nodes': None,
        'min_impurity_decrease': 0.0,
        'min_samples_leaf': 2,
        'min_samples_split': 10,
        'min_weight_fraction_leaf': 0.0,
        'n_estimators': n_estimators,
        'oob_score': False,
        'random_state': 121,
        'verbose': 0,
        'warm_start': False
    }

    rf = RandomForestClassifier(**param)

    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2,
                             random_state=cfg['seed'],
                             max_iter=cfg['max_iter'],
                             perc=cfg['z_score_percentile'],
                             two_step=cfg['two_step'])
    feat_selector.fit(X, y)

    selected_features = [
        col for (col, id_bool) in zip(all_features, feat_selector.support_)
        if id_bool
    ]
    return selected_features
def boruta(dataset: pd.DataFrame, labels: np.array, max_iter: int, p_value_threshold: float, random_state: int) \
        -> pd.DataFrame:
    """Return ``dataset`` restricted to the columns Boruta confirms.

    ``p_value_threshold`` is passed to Boruta as ``alpha``.
    """
    classifier = RandomForestClassifier(n_jobs=cpu_count(),
                                        class_weight='balanced',
                                        max_depth=5)
    selector = BorutaPy(classifier,
                        n_estimators='auto',
                        verbose=2,
                        alpha=p_value_threshold,
                        max_iter=max_iter,
                        random_state=random_state)
    selector.fit(dataset.values, labels)

    confirmed_positions = np.flatnonzero(selector.support_)
    confirmed_names = dataset.columns[confirmed_positions]
    return dataset[confirmed_names]
def boruta_algorithm(dataset, target_name):
    '''
    Select features of the dataset with the Boruta algorithm.

    The selection is generous: a feature is returned when Boruta either
    confirmed it or left it tentative.

    :param dataset: DataFrame holding the features and the target column
    :param target_name: name of the target column
    :return: list with the names of the selected feature columns
    '''
    print('USING BORUTA')
    features = dataset.drop([target_name], axis=1)

    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    feat_selector = BorutaPy(rf, n_estimators='auto', random_state=1)
    feat_selector.fit(features.values, dataset[target_name].values.ravel())

    # support_weak_ flags only the tentative features, so on its own it
    # leaves out everything Boruta actually confirmed; combine both masks.
    generously_selected = feat_selector.support_ | feat_selector.support_weak_

    return [
        name
        for name, mask in zip(features.columns, generously_selected)
        if mask
    ]
def boruta_tree(self, X_train_smote, y_train_res, X_test, y_test, n_features):
    """
    - Preselect features with a single Boruta pass.
    - Keep the ``n_features`` most important of them according to a
      Random Forest Classifier.

    Prints the cross-validated score of a Logistic Regression on the Boruta
    selection and the test accuracy of the Random Forest.

    Returns a dictionary ``{"Boruta Tree": [feature names, importances]}``.
    """
    from sklearn.metrics import f1_score  # import again to avoid error...

    # Boruta on top of a Random Forest (a single pass; it sometimes fails
    # with so little data when run twice).
    boruta_forest = RandomForestClassifier(n_jobs=-1, random_state=self.seed)
    selector = BorutaPy(boruta_forest, n_estimators='auto', verbose=0,
                        alpha=0.005, max_iter=30, perc=100,
                        random_state=self.seed)
    selector.fit(X_train_smote.values, y_train_res)

    # Restrict the training data to the supported columns and score a
    # Logistic Regression on it.
    cols = X_train_smote.columns[selector.support_]
    boruta_train = X_train_smote[cols]
    logistic = LogisticRegression(random_state=self.seed)
    logistic.fit(boruta_train, y_train_res)
    scores = cross_val_score(logistic, boruta_train, y_train_res, cv=5)
    print("Accuracy of Boruta: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

    # Random Forest used to rank the preselected features.
    X_filtered = boruta_train[cols]
    ranker = RandomForestClassifier(n_estimators=10, criterion='gini',
                                    random_state=self.seed)
    ranker.fit(X_filtered, y_train_res)
    predictions = ranker.predict(X_test[cols])
    print("Accuracy of Boruta Tree: {:.3f}".format(accuracy_score(y_test, predictions)))

    # Most important features first, truncated to n_features.
    importance_table = pd.DataFrame({"feature": X_filtered.columns,
                                     "coefficient": ranker.feature_importances_})
    top = (importance_table
           .sort_values(by="coefficient", ascending=False)
           .head(n_features)
           .set_index("feature"))

    return {"Boruta Tree": [top.index.tolist(), top.coefficient.tolist()]}
def getFeaturesRanking(X, y):
    """Return the Boruta ranking of the columns of ``X``.

    Args:
        X: Feature DataFrame.
        y: Target exposing ``.values``.

    Returns:
        ``ranking_`` of the fitted selector, or an all-ones integer array
        with one entry per column of ``X`` when Boruta fails.
    """
    rf = RandomForestRegressor(max_depth=5)
    try:
        feat_selector = BorutaPy(rf, n_estimators='auto', verbose=3,
                                 max_iter=20)
        feat_selector.fit(X.values, y.values)
        return feat_selector.ranking_
    except Exception:
        # Fallback: treat every feature as rank 1.  The length comes from X
        # itself (the former ``names`` is not defined in this function) and
        # the builtin ``int`` replaces the removed ``np.int`` alias.
        return np.ones((X.shape[1], ), dtype=int)