def fit(self, scenario: ASlibScenario, fold: int, num_instances: int):
    self._num_algorithms = len(scenario.algorithms)
    self._algorithm_cutoff_time = scenario.algorithm_cutoff_time

    # resample `num_instances` instances and preprocess them accordingly
    features, performances = self._resample_instances(
        scenario.feature_data.values, scenario.performance_data.values,
        num_instances, random_state=fold)
    features, performances = self._preprocess_scenario(
        scenario, features, performances)

    base_model = Ridge(alpha=1.0, random_state=fold)
    scorer = make_scorer(mean_squared_error, greater_is_better=False)
    sfs_params = {'estimator': base_model, 'k_features': 'best',
                  'forward': True, 'scoring': scorer, 'cv': 2}

    for num in range(self._num_algorithms):
        # first pass: select a subset of the raw features
        feature_selector = SequentialFeatureSelector(**sfs_params)
        feature_selector = feature_selector.fit(
            features, performances[:, num])
        self._features[num] = feature_selector.k_feature_idx_

        # second pass: expand the selected features with degree-2 polynomial
        # terms, then run the selection again on the expanded set
        features_tmp = PolynomialFeatures(2).fit_transform(
            features[:, self._features[num]])
        feature_selector = SequentialFeatureSelector(**sfs_params)
        feature_selector = feature_selector.fit(
            features_tmp, performances[:, num])
        self._quad_features[num] = feature_selector.k_feature_idx_
        features_tmp = features_tmp[:, self._quad_features[num]]

        # runs that hit the cutoff are censored observations
        censored = performances[:, num] >= self._algorithm_cutoff_time
        self._models[num] = impute_censored(
            features_tmp, performances[:, num], censored, base_model,
            distr_func, self._algorithm_cutoff_time)
def _fit_linear_approximation(self, run_fffs):
    selected_features_x = list(self.x.columns)
    if run_fffs:
        feature_selector = SequentialFeatureSelector(
            # `normalize=True` was removed in scikit-learn 1.2;
            # standardize the data beforehand if that behavior is needed
            LinearRegression(),
            k_features=max(int(np.sqrt(self.x.shape[1])),
                           self.zeds_df.shape[1]),
            forward=True, verbose=2, cv=5, n_jobs=-1, scoring='r2')
        features = feature_selector.fit(self.x, self.y)
        selected_columns = list(features.k_feature_names_)
        selected_columns.extend([
            list(self.x.columns)[i]
            for i in list(self.zeds_df.columns.astype(int))
        ])
        # deduplicate, but index with a list: pandas does not accept
        # a set as a column indexer
        selected_features_x = pd.DataFrame(self.x)[list(set(selected_columns))]
        m = self.get_best_linear_model(selected_features_x, self.y)
        m.fit(selected_features_x, self.y)
    else:
        m = self.get_best_linear_model(self.x, self.y)
        m.fit(self.x, self.y)
    return m, selected_features_x
def forward_feature_selection_decision_tree(X_train, y_train_binned):
    """
    Selects features using Forward Feature Selection with a Decision Tree
    Classifier.

    -- RATIONALE
    I had aimed to write my own function to let the number of features to
    select be variable, however due to time constraints I did not implement
    such a version. For now I selected the number of features (7), based on
    visual inspection of the Forward Feature Selection plots.
    --
    Parameters
    -----------
    X_train: training split of feature variables with continuous values
    y_train_binned: training split of the target variable with 3 class values

    Returns
    -----------
    selected_feature_names: tuple with the names of the 7 selected features
    """
    clf = tree.DecisionTreeClassifier()

    # Build step forward feature selection; use a classification metric,
    # since 'r2' is a regression score and is not meaningful for a classifier
    sfs = SequentialFeatureSelector(clf,
                                    k_features=7,
                                    forward=True,
                                    floating=False,
                                    verbose=2,
                                    scoring='accuracy',
                                    cv=5)

    # Perform Sequential Feature Selection
    sfs = sfs.fit(X_train, y_train_binned)
    selected_feature_names = sfs.k_feature_names_
    return selected_feature_names
def forward_feature_selection_linear_regression(X_train, y_train):
    """
    Selects features using Forward Feature Selection with a Linear Regression.

    -- RATIONALE
    I had aimed to write my own function to let the number of features to
    select be variable, however due to time constraints I did not implement
    such a version. For now I selected the number of features (9), based on
    visual inspection of the Forward Feature Selection plots.
    --
    Parameters
    -----------
    X_train: training split of feature variables with continuous values
    y_train: training split of the continuous target variable

    Returns
    -----------
    selected_feature_names: tuple with the names of the 9 selected features
    """
    regr = LinearRegression()

    # Build step forward feature selection
    sfs = SequentialFeatureSelector(regr,
                                    k_features=9,
                                    forward=True,
                                    floating=False,
                                    verbose=2,
                                    scoring='r2',
                                    cv=5)

    # Perform Sequential Forward Selection
    sfs = sfs.fit(X_train, y_train)
    selected_feature_names = sfs.k_feature_names_
    return selected_feature_names
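# The two RATIONALE notes above mention wanting a variable number of features.
# A minimal sketch of how that could look (a hypothetical helper, not part of
# the original code): mlxtend accepts a (min, max) tuple as `k_features`, so
# the selector searches the whole range and keeps the best-scoring subset
# instead of a count fixed by visual inspection of the plots.
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

def forward_feature_selection_variable(X_train, y_train, max_features=None):
    """Let SFS pick the best subset size between 1 and max_features."""
    max_features = max_features or X_train.shape[1]
    sfs = SequentialFeatureSelector(LinearRegression(),
                                    k_features=(1, max_features),
                                    forward=True,
                                    scoring='r2',
                                    cv=5)
    sfs = sfs.fit(X_train, y_train)
    return sfs.k_feature_names_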
def apply_SFS(classifiers, X_in, Y_in, sel_feat='best'):
    models_result = {}
    models_result['Forward'] = []
    models_result['Backward'] = []
    for forward in [True, False]:
        for model in classifiers:
            model_name = type(model).__name__
            if model_name not in models_result:
                models_result[model_name] = {
                    'Forward Features': None,
                    'Forward Index': None,
                    'Backward Features': None,
                    'Backward Index': None,
                }
            sfs_obj = SequentialFeatureSelector(model, k_features=sel_feat,
                                                forward=forward)
            sfs = sfs_obj.fit(X_in, Y_in)
            if forward:
                models_result['Forward'].append(sfs.k_score_)
                models_result[model_name]['Forward Features'] = sfs.k_feature_names_
                models_result[model_name]['Forward Index'] = sfs.k_feature_idx_
            else:
                models_result['Backward'].append(sfs.k_score_)
                models_result[model_name]['Backward Features'] = sfs.k_feature_names_
                models_result[model_name]['Backward Index'] = sfs.k_feature_idx_
    return models_result
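# A hedged usage sketch for apply_SFS above: it takes a list of classifiers
# and returns per-model forward/backward subsets. The classifiers and demo
# data below are illustrative, not from the original code.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = load_iris(return_X_y=True)
results = apply_SFS([LogisticRegression(max_iter=1000),
                     DecisionTreeClassifier()],
                    X_demo, y_demo, sel_feat='best')
print(results['LogisticRegression']['Forward Index'])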
def feature_selection(train_X, valid_X, test_X, train_Y, i):
    # base classifiers for the voting ensemble
    c1 = SVC(C=0.01, kernel="linear")
    c2 = RandomForestClassifier(n_estimators=50, max_depth=10)
    c3 = KNeighborsClassifier(n_neighbors=150)
    c4 = SGDClassifier(loss="huber", penalty="l1")
    c5 = DecisionTreeClassifier(criterion="gini", min_samples_split=250)
    c6 = LinearDiscriminantAnalysis(solver="lsqr")
    c7 = naive_bayes.BernoulliNB()
    c8 = MLPClassifier(hidden_layer_sizes=(5, 3))
    c9 = GradientBoostingClassifier(random_state=0, n_estimators=100,
                                    learning_rate=0.1)
    c10 = VotingClassifier(estimators=[('a', c1), ('b', c2), ('c', c3),
                                       ('d', c4), ('e', c5), ('f', c6),
                                       ('g', c7), ('h', c8), ('i', c9)])
    features = set(train_X.columns)

    # backward elimination down to i features, scored by CV accuracy
    fs = SequentialFeatureSelector(c10, k_features=i, forward=False,
                                   verbose=0, scoring='accuracy', cv=4)
    fs.fit(train_X, train_Y)
    selected_features = set(fs.k_feature_names_)
    print(fs.subsets_)
    features_to_drop = list(features - selected_features)
    return (train_X.drop(features_to_drop, axis=1),
            valid_X.drop(features_to_drop, axis=1),
            test_X.drop(features_to_drop, axis=1))
def run_experiment(X, y, clf, protected_groups, unfairness_metric, unfairness_weight):
    metric = unfairness_metrics.UnfairnessMetric(protected_groups, unfairness_metric)
    unfairness_scorer = metrics.make_scorer(metric)

    unfairness_means = []
    auc_means = []
    selected_feature_props = np.zeros([ITERATIONS, X.shape[1]])
    for i in tqdm(range(ITERATIONS), desc=' Training ' + clf.__class__.__name__):
        xval = model_selection.KFold(4, shuffle=True, random_state=i)

        # Make a metric combining accuracy and subtracting unfairness w.r.t.
        # the protected groups
        metric = unfairness_metrics.CombinedMetric(ACCURACY_METRIC,
                                                   protected_groups,
                                                   unfairness_metric,
                                                   unfairness_weight)
        combined_scorer = metrics.make_scorer(metric)

        sfs = SequentialFeatureSelector(clf, k_features='best', verbose=0,
                                        cv=xval, scoring=combined_scorer,
                                        n_jobs=2)
        pipe = pipeline.Pipeline([
            ('standardize', preprocessing.StandardScaler()),
            ('feature_selection', sfs),
            ('model', clf),
        ])
        result = model_selection.cross_validate(
            pipe, X, y, verbose=0, cv=xval,
            scoring={
                'unfairness': unfairness_scorer,
                'auc': metrics.make_scorer(ACCURACY_METRIC),
            },
            return_estimator=True)
        unfairness_means.append(result['test_unfairness'].mean())
        auc_means.append(result['test_auc'].mean())
        for estimator in result['estimator']:
            for feature_i in estimator.named_steps['feature_selection'].k_feature_idx_:
                selected_feature_props[i][feature_i] += 1 / len(result['estimator'])
    return unfairness_means, auc_means, selected_feature_props
def FeatureSelection(pipeline_name, data_dev_mode, tag, train_filepath, test_filepath):
    logger.info('FEATURE SELECTION...')

    if (bool(config.params.clean_experiment_directory_before_training)
            and os.path.isdir(config.params.experiment_dir)):
        logger.info('Cleaning experiment directory...')
        shutil.rmtree(config.params.experiment_dir)

    data = _read_data(data_dev_mode, train_filepath, test_filepath)
    train_set = data['train']
    y = train_set[config.TARGET_COL].values.reshape(-1, )
    train_set = train_set.drop(columns=config.TARGET_COL)

    pipeline = PIPELINES[pipeline_name](so_config=config.SOLUTION_CONFIG, suffix=tag)

    sfs = SequentialFeatureSelector(estimator=pipeline,
                                    k_features=(10, len(train_set.columns)),
                                    forward=False,
                                    verbose=2,
                                    cv=5,
                                    scoring='roc_auc')
    sfs.fit(train_set.to_numpy(), y)

    fig = plot_sequential_feature_selection(sfs.get_metric_dict())
    plt.ylim([0.6, 1])
    plt.title('Sequential Feature Selection')
    plt.grid()
    plt.show()
def doSFS(runDict, save=True):
    # NOTE: relies on module-level `x` (feature DataFrame) and `y` (labels)
    for runName, subDict in runDict.items():
        for forward in [True, False]:
            print(runName, forward)
            featureSelector = SequentialFeatureSelector(subDict['clf'],
                                                        k_features=(1, 50),
                                                        forward=forward,
                                                        verbose=2,
                                                        scoring="accuracy",
                                                        cv=5,
                                                        n_jobs=-1)
            if forward:
                subDict['Ffeatures'] = featureSelector.fit(x, y)
                subDict['FfilteredFeatures'] = x.columns[list(
                    subDict['Ffeatures'].k_feature_idx_)]
            else:
                subDict['Bfeatures'] = featureSelector.fit(x, y)
                subDict['BfilteredFeatures'] = x.columns[list(
                    subDict['Bfeatures'].k_feature_idx_)]
            if save:
                forwardsOrBackwards = 'Ffeatures' if forward else 'Bfeatures'
                saveName = runName + '_' + forwardsOrBackwards
                pickling.save_dill(subDict[forwardsOrBackwards].subsets_, saveName)
    return runDict
def step_forward_selection_by_random_forest(features_to_select=27, df=df_train, to_print=True):
    if to_print:
        print('\nStarting step forward feature selection test using '
              'RandomForest classifier.')
    df_features = drop_label_column(df)
    df_label = get_label_column(df)
    feature_selector = SequentialFeatureSelector(
        RandomForestClassifier(n_jobs=-1, n_estimators=100),
        k_features=features_to_select,
        forward=True,
        verbose=2,
        cv=4)
    features = feature_selector.fit(df_features, df_label)
    filtered_features = df_features.columns[list(features.k_feature_idx_)]
    if to_print:
        # these are the features that were kept, not dropped
        print('Found {} features to keep. Features are: \n{}'.format(
            len(filtered_features), filtered_features))
    return filtered_features
def backward(X_train, Y_train):
    rf_sfs = RandomForestRegressor(n_estimators=100, max_depth=50,
                                   oob_score=False, n_jobs=-1)
    SFS_b = SequentialFeatureSelector(rf_sfs,
                                      forward=False,
                                      k_features=6,
                                      scoring='neg_mean_squared_error',
                                      n_jobs=-1)
    SFS_b = SFS_b.fit(X_train.values, Y_train.values)
    # the selector was fit on .values, so map the selected positional indices
    # back to the original column names (zipping the selected names against
    # all columns paired them up positionally, which was wrong)
    features = [X_train.columns[i] for i in SFS_b.k_feature_idx_]
    print(features)
def smoteenn_sffs_reduction_classify_full():
    (X, Y), feature_names = read_dataset(
        screening='')  # no screening results, only risk factors

    # dataset resampling for imbalanced data compensation
    smoteenn = SMOTEENN()
    Xres, Yres = smoteenn.fit_resample(X, Y)  # resampled dataset
    print('Resampling')
    print('Original dataset size:', Counter(Y))
    print('Resampled dataset size:', Counter(Yres))

    # feature selection using sequential backward floating selection and a tuned SVM;
    # note: the deprecated `iid` parameter was removed in scikit-learn 0.24,
    # and the 'brier_score_loss' scorer was replaced by 'neg_brier_score'
    scoring = [
        'accuracy', 'precision', 'recall', 'balanced_accuracy',
        'average_precision', 'neg_brier_score', 'neg_log_loss'
    ]
    param_grid = {'C': np.logspace(-3, 3, 7), 'kernel': ['rbf']}
    grid = GridSearchCV(estimator=SVC(probability=True, gamma='scale'),
                        param_grid=param_grid,
                        n_jobs=-1,
                        verbose=10,
                        cv=5,
                        scoring=scoring,
                        refit='balanced_accuracy',
                        error_score=0)
    grid.fit(Xres, Yres)
    print(grid.best_params_)

    selector = SequentialFeatureSelector(
        forward=False,
        floating=True,
        k_features='best',
        verbose=2,
        n_jobs=-1,
        scoring='balanced_accuracy',
        cv=5,
        estimator=SVC(probability=True,
                      gamma='scale',
                      kernel=grid.best_params_['kernel'],
                      C=grid.best_params_['C']))
    selector.fit(Xres, Yres, custom_feature_names=feature_names)

    with open('smoteenn_sbfs.pkl', 'wb') as f:
        pickle.dump(selector, f, -1)
    df = pd.DataFrame(selector.subsets_)
    df.to_csv('smoteenn_sbfs.csv')
def classification(df, y):
    feature_selector = SequentialFeatureSelector(
        RandomForestClassifier(n_jobs=-1),
        k_features=len(df.keys()),
        forward=True,
        verbose=2,
        scoring='roc_auc',
        cv=4)
    features = feature_selector.fit(np.array(df), y)
    # index into df (the argument), not an undefined global X
    filtered_features = df.columns[list(features.k_feature_idx_)]
    return filtered_features
def selector(df):
    x_data, y_data = get_data(df)
    x_data_scaled = StandardScaler().fit_transform(x_data)
    selector = SequentialFeatureSelector(LogisticRegression(),
                                         scoring='neg_log_loss',
                                         verbose=2,
                                         k_features=3,
                                         forward=False,
                                         n_jobs=-1)
    selector.fit(x_data_scaled, y_data)
    # report the selected feature indices and return the fitted selector
    # instead of returning nothing
    print(selector.k_feature_idx_)
    return selector
def test8():
    # Example 3 - Majority voting with classifiers trained on different
    # feature subsets
    from sklearn import datasets

    iris = datasets.load_iris()
    X, y = iris.data[:, :], iris.target

    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier
    from mlxtend.classifier import EnsembleVoteClassifier
    from sklearn.pipeline import Pipeline
    from mlxtend.feature_selection import SequentialFeatureSelector

    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()

    # Creating a feature-selection-classifier pipeline
    sfs1 = SequentialFeatureSelector(clf1,
                                     k_features=4,
                                     forward=True,
                                     floating=False,
                                     scoring='accuracy',
                                     verbose=0,
                                     cv=0)
    clf1_pipe = Pipeline([('sfs', sfs1), ('logreg', clf1)])
    eclf = EnsembleVoteClassifier(clfs=[clf1_pipe, clf2, clf3], voting='soft')

    params = {
        'pipeline__sfs__k_features': [1, 2, 3],
        'pipeline__logreg__C': [1.0, 100.0],
        'randomforestclassifier__n_estimators': [20, 200]
    }
    grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
    grid.fit(iris.data, iris.target)

    cv_keys = ('mean_test_score', 'std_test_score', 'params')
    print("test8")
    for r, _ in enumerate(grid.cv_results_['mean_test_score']):
        print("%0.3f +/- %0.2f %r"
              % (grid.cv_results_[cv_keys[0]][r],
                 grid.cv_results_[cv_keys[1]][r] / 2.0,
                 grid.cv_results_[cv_keys[2]][r]))
def plot_feed_forward_models():
    """
    Plots the performance of each iteration of the forward-selection models.
    The numbers of features chosen are 15 and 20, since these showed the
    best results.
    """
    # create Linear Regression model
    regr = LinearRegression()
    sfs_model = SequentialFeatureSelector(regr,
                                          k_features=15,
                                          forward=True,
                                          floating=False,
                                          scoring='neg_mean_squared_error',
                                          cv=10)
    sfs_model = sfs_model.fit(X_train, y_train)
    plot_sfs(sfs_model.get_metric_dict(), kind='std_err')
    plt.title('Sequential Forward Selection Linear Regression (w. StdErr)')
    plt.grid()
    plt.show()

    # Same for the Decision Tree, with some different settings
    clf = tree.DecisionTreeClassifier()
    sfs_model = SequentialFeatureSelector(clf,
                                          k_features=20,
                                          forward=True,
                                          floating=False,
                                          scoring='accuracy',
                                          cv=10)
    sfs_model = sfs_model.fit(X_train, y_train_binned)
    plot_sfs(sfs_model.get_metric_dict(), kind='std_err')
    plt.title('Sequential Forward Selection Decision Tree (w. StdErr)')
    plt.grid()
    plt.show()
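# The plotting helper above assumes module-level X_train / y_train /
# y_train_binned and a `plot_sfs` alias. A minimal sketch of the imports it
# most likely relies on (an assumption, not from the original file):
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import tree
from mlxtend.feature_selection import SequentialFeatureSelector
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs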
def filter_with_sfs(train_X, valid_X, test_X, train_Y, i):
    features = set(train_X.columns)
    fs = SequentialFeatureSelector(RandomForestClassifier(n_estimators=30,
                                                          random_state=0),
                                   k_features=i,
                                   forward=True,
                                   verbose=0,
                                   scoring='accuracy',
                                   cv=4)
    fs.fit(train_X, train_Y)
    selected_features = set(fs.k_feature_names_)
    features_to_drop = list(features - selected_features)
    return (train_X.drop(features_to_drop, axis=1),
            valid_X.drop(features_to_drop, axis=1),
            test_X.drop(features_to_drop, axis=1))
def SFS_test(input, how_many_attrs, cv_scores):
    y = np.array(input[:, -1])
    x = np.array(input[:, :-1])
    sfs = SequentialFeatureSelector(KNeighborsClassifier(n_neighbors=5,
                                                         metric="euclidean"),
                                    k_features=how_many_attrs,
                                    forward=True,
                                    floating=False,
                                    verbose=0,
                                    scoring='accuracy',
                                    n_jobs=-1,
                                    cv=4)
    sfs = sfs.fit(x, y)
    # print(sfs.k_feature_idx_)
    # reshape(-1, 1) instead of a hard-coded row count (was 475), so the
    # function works for any dataset size
    target = np.array(input[:, -1]).reshape(-1, 1)
    return np.hstack((input[:, sfs.k_feature_idx_], target))
def select_features_wrapper(X, y, forward=True, k_features=20):
    # svc = SVC(gamma='auto')
    # linearSVC = LinearSVC(random_state=0, tol=1e-5, class_weight='balanced')
    random_forest_classifier = RandomForestClassifier(max_depth=7, random_state=0)
    sgd = SGDClassifier(max_iter=1000, tol=1e-3)
    # knn = KNeighborsClassifier(n_neighbors=3)
    sfs = SequentialFeatureSelector(sgd,
                                    k_features=k_features,
                                    forward=forward,
                                    floating=False,
                                    verbose=5,
                                    cv=0,
                                    n_jobs=-1)
    sfs.fit(X, y.values.ravel())
    print(sfs.k_feature_names_)
    return sfs
def feature_selection(X, y, method=1, k_features=5, save_params=False, seed=127):
    logit = LogisticRegression(C=1, random_state=seed, solver='liblinear')
    if method == 1:
        rfe = RFE(logit, n_features_to_select=k_features, verbose=2)
        rfe.fit(X, y)
        if save_params:
            with open('rfe.pkl', 'wb') as file:
                pickle.dump(rfe, file, pickle.HIGHEST_PROTOCOL)
        return rfe
    elif method == 2:
        sfs = SequentialFeatureSelector(logit,
                                        cv=0,
                                        k_features=k_features,
                                        forward=False,
                                        scoring='roc_auc',
                                        verbose=2,
                                        n_jobs=-1)
        sfs.fit(X, y)
        if save_params:
            with open('sfs.pkl', 'wb') as file:
                pickle.dump(sfs, file, pickle.HIGHEST_PROTOCOL)
        return sfs
def start_data_pretreatment(train_path, test_path, path, flag, index):
    train = grouping(train_path)
    test = grouping(test_path)
    key_skills_processing(data_train=train, data_test=test, top=100, flag=flag)

    remove_columns = list(range(15))
    train = train.drop(train.columns[remove_columns], axis=1)
    test = test.drop(test.columns[remove_columns], axis=1)

    if index == 3:
        selector = VarianceThreshold(0.009)
        tmp1 = train.group
        train = train.drop(['group'], axis=1)
        tmp2 = test.group
        test = test.drop(['group'], axis=1)
        selector.fit(train)
        col = selector.get_support(True)
        train, test = update_data(train, test, col, train.columns, tmp1, tmp2)

    if index == 4:
        model_rfc = RandomForestClassifier(n_estimators=70)
        selector = SequentialFeatureSelector(model_rfc,
                                             k_features=50,
                                             forward=True,
                                             floating=False,
                                             verbose=2,
                                             scoring='accuracy',
                                             cv=0,
                                             n_jobs=-1)
        tmp1 = train.group
        train = train.drop(['group'], axis=1)
        tmp2 = test.group
        test = test.drop(['group'], axis=1)
        selector = selector.fit(train, tmp1)
        col = selector.k_feature_idx_
        train, test = update_data(train, test, col, train.columns, tmp1, tmp2)

    train.to_csv("train_" + str(index) + "_" + path, sep=';',
                 encoding="utf-8-sig", index=False)
    test.to_csv("test_" + str(index) + "_" + path, sep=';',
                encoding="utf-8-sig", index=False)
    return train, test
def analyze_model(model: sk.base.BaseEstimator, x: pd.DataFrame, y: pd.DataFrame,
                  n_jobs: int = 1) -> SequentialFeatureSelector:
    start_time = time.time()
    logger.info("Starting feature selection")
    sfs = SequentialFeatureSelector(
        estimator=model,
        # 'parsimonious' keeps the smallest subset within one standard error
        # of the best cross-validation score
        k_features="parsimonious",
        cv=None,
        verbose=1,
        forward=True,
        n_jobs=n_jobs,
        # scoring is chosen as a default based on the type of model
    )
    sfs.fit(x, y)
    end_time = time.time()
    logger.info("Feature selection done in %.3f seconds", end_time - start_time)
    return sfs
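# A hedged usage sketch for analyze_model above; the random-forest estimator
# and the demo dataset are illustrative assumptions, not from the original code.
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier

_demo = load_breast_cancer()
_x = pd.DataFrame(_demo.data, columns=_demo.feature_names)
_y = pd.Series(_demo.target)
_sfs = analyze_model(RandomForestClassifier(n_estimators=50), _x, _y, n_jobs=2)
print(_sfs.k_feature_names_)  # smallest subset within 1 std-err of the best score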
def hyper_parameter_tunning(self):
    # tune the feature combination
    self.__lr = LogisticRegression()
    self.__ps = PredefinedSplit(self.__train_us_validation_index)
    self.__sfs = SequentialFeatureSelector(
        estimator=self.__lr,
        k_features=(1, 11),
        forward=True,
        floating=True,
        scoring="roc_auc",
        cv=self.__ps
    )
    self.__sfs.fit(self.__train_us_validation_feature_woe,
                   self.__train_us_validation_label)

    # features used by the final model
    self.__feature_columns = self.__feature_columns[list(self.__sfs.k_feature_idx_)]
    # NumPy uses a different indexing style; after this, OOT no longer
    # contains the features removed by SFFS
    self.__train_feature_woe = self.__train_feature_woe[:, self.__sfs.k_feature_idx_]
    self.__train_us_feature_woe = self.__train_us_feature_woe[:, self.__sfs.k_feature_idx_]
    self.__validation_feature_woe = self.__validation_feature_woe[:, self.__sfs.k_feature_idx_]
    self.__train_us_validation_feature_woe = self.__train_us_validation_feature_woe[:, self.__sfs.k_feature_idx_]

    # with the feature combination fixed, tune the LR hyperparameter C
    def __lr_cv(C):
        clf = LogisticRegression(C=C, random_state=7)
        val = cross_val_score(
            clf,
            self.__train_us_validation_feature_woe,
            self.__train_us_validation_label,
            scoring="roc_auc",
            cv=self.__ps
        ).mean()
        return val

    self.__param = {"C": (0.1, 100)}
    self.__lr_bo = BayesianOptimization(__lr_cv, self.__param, random_state=7)
    self.__lr_bo.maximize(**{"alpha": 1e-5})
def get_features(train_set, target, method=None, model="rf", n_features="auto", verbose=1):
    if model == "rf":
        model = RandomForestClassifier(n_jobs=-1, random_state=1)
    elif model == "gb":
        model = GradientBoostingClassifier(random_state=1)

    if method is None:
        selected_features = train_set.columns.values
    if method == "boruta":
        print("Fitting Boruta...")
        boruta = BorutaPy(model, n_estimators=n_features, verbose=verbose)
        boruta.fit(train_set.values, target.values)
        selected_features = train_set.columns[boruta.support_].values
    if method == "rfe":
        print("Fitting Recursive Feature Elimination...")
        rfe = RFECV(estimator=model, cv=4, scoring='accuracy', verbose=verbose)
        rfe = rfe.fit(train_set, target)
        selected_features = train_set.columns[rfe.support_].values
    if method == "sfs":
        print("Fitting Sequential Feature Selection...")
        if n_features == "auto":
            n_features = "best"
        sfs = SequentialFeatureSelector(model,
                                        k_features=n_features,
                                        verbose=verbose,
                                        n_jobs=-1,
                                        scoring='accuracy',
                                        cv=4)
        sfs.fit(train_set, target)
        selected_features = list(sfs.k_feature_names_)
    return selected_features
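# A hedged usage sketch for get_features: comparing the subsets chosen by the
# three selection methods. X_train_df / y_train_series are hypothetical names
# for a feature DataFrame and target Series, not from the original code.
# for m in ("boruta", "rfe", "sfs"):
#     print(m, get_features(X_train_df, y_train_series, method=m, model="rf"))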
def main():
    loadVariables("Questioned")
    key = pd.read_csv("../Text/key.csv")

    # aggregate the per-document handwriting statistics into feature vectors
    for questioned_iterator in q_transitions.keys():
        data = []
        data.append(np.mean(np.array(q_centroids[questioned_iterator])))
        data.append(np.mean(np.array(q_transitions[questioned_iterator])))
        data.append(np.mean(np.array(q_ratios[questioned_iterator])))
        data.append(np.mean(np.array(q_black_Pixels[questioned_iterator])))
        data.append(np.mean(np.array(q_normalized[questioned_iterator])))
        data.append(np.mean(np.array(q_angles[questioned_iterator])))
        data.append(np.mean(np.array(q_normalized_blacks[questioned_iterator])))
        x.append(data)

    # map each file's decision (F/D/G) to a numeric class label
    for temp, file in enumerate(q_ratios.keys()):
        number = str(re.findall(r'\d+', file)).replace('[', '').replace(
            ']', '').replace("'", '').lstrip('0')
        if key['Decision'].values[int(number) - 1] == 'F':
            y.append(0)
        if key['Decision'].values[int(number) - 1] == 'D':
            y.append(1)
        if key['Decision'].values[int(number) - 1] == 'G':
            y.append(2)

    knnClassifier = KNeighborsClassifier(n_neighbors=4)
    sfs = SequentialFeatureSelector(knnClassifier,
                                    k_features=7,
                                    forward=True,
                                    floating=True,
                                    verbose=2,
                                    scoring='accuracy',
                                    cv=0,
                                    n_jobs=-1)
    sfs = sfs.fit(np.array(x), np.array(y), custom_feature_names=tuple(titles))
    print()
    pprint(sfs.subsets_)
def feature_selection(X, y, model):
    # drop one of each pair of features with |correlation| > 0.8
    correlated_features = set()
    correlation_matrix = X.corr()
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > 0.8:
                colname = correlation_matrix.columns[i]
                correlated_features.add(colname)
    X.drop(labels=correlated_features, axis=1, inplace=True)

    feature_selector = SequentialFeatureSelector(model,
                                                 k_features=11,
                                                 forward=False,
                                                 verbose=2,
                                                 scoring='balanced_accuracy',
                                                 cv=5)
    # feature_selector = ExhaustiveFeatureSelector(model, min_features=5,
    #     max_features=10, scoring='balanced_accuracy', print_progress=True, cv=3)
    features = feature_selector.fit(X, y)
    filtered_features = X.columns[list(features.k_feature_idx_)]
    return filtered_features
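# A hedged usage note for feature_selection above: it drops correlated columns
# from X in place, so pass a copy if the original frame must survive. The model
# choice below is an illustrative assumption, not from the original code.
# from sklearn.linear_model import LogisticRegression
# kept = feature_selection(X_df.copy(), y_series, LogisticRegression(max_iter=1000))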
def forward_floating(data, scoring=None, model=None, k=3, cv=10):
    """A wrapper of the mlxtend Sequential Forward Floating Selection algorithm."""
    X_train, X_test, y_train, y_test = data

    # Z-scores.
    X_train_std, X_test_std = utils.train_test_z_scores(X_train, X_test)

    # NOTE: Nested calls not supported by multiprocessing => joblib converts
    # into sequential code (thus, default n_jobs=1).
    # n_jobs = cpu_count() - 1 if cpu_count() > 1 else cpu_count()
    n_jobs = 1

    # honour the `scoring` argument instead of ignoring it; fall back to the
    # previous hard-coded 'roc_auc' when none is given
    selector = SequentialFeatureSelector(
        model, k_features=k, forward=True, floating=True,
        scoring=scoring or 'roc_auc', cv=cv, n_jobs=n_jobs
    )
    selector.fit(X_train_std, y_train)

    support = _check_support(selector.k_feature_idx_, X_train_std)
    return _check_feature_subset(X_train_std, X_test_std, support)
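# A hedged usage sketch for forward_floating: the split and the logistic-
# regression model below are illustrative assumptions; `utils.train_test_z_scores`,
# `_check_support`, and `_check_feature_subset` come from the original module.
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split
# data = train_test_split(X, y, test_size=0.3, random_state=0)
# X_train_sub, X_test_sub = forward_floating(data, model=LogisticRegression(), k=5)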
# Removing constant features
constant_filter = VarianceThreshold()
constant_filter.fit(X_train)
constant_columns = [col for col in X_train.columns
                    if col not in X_train.columns[constant_filter.get_support()]]
X_train.drop(labels=constant_columns, axis=1, inplace=True)
X_test.drop(labels=constant_columns, axis=1, inplace=True)

from sklearn.neighbors import KNeighborsClassifier

start = time.time()
classifier_ = DecisionTreeClassifier(random_state=100)
knn = KNeighborsClassifier(n_neighbors=2)  # defined but unused; the SFS below wraps the decision tree
feature_selector = SequentialFeatureSelector(classifier_,
                                             k_features=15,
                                             forward=True,
                                             scoring='accuracy',
                                             cv=0)
feature_selector = feature_selector.fit(X_train, y_train)
end = time.time()
print("Execution time: %0.4f seconds" % (float(end) - float(start)))

selected_features = X_train.columns[list(feature_selector.k_feature_idx_)]
X_train = X_train[selected_features]
X_test = X_test[selected_features]

start = time.time()
clf3 = DecisionTreeClassifier(random_state=100)
clf3.fit(X_train, y_train)
end = time.time()
def forward(X_train, Y_train):
    rf_sfs = RandomForestRegressor(n_estimators=100, max_depth=50,
                                   oob_score=False, n_jobs=-1)
    SFS = SequentialFeatureSelector(rf_sfs,
                                    k_features=6,
                                    forward=True,
                                    scoring='neg_mean_squared_error',
                                    n_jobs=-1)
    SFS = SFS.fit(X_train, Y_train)
    print(SFS.k_feature_names_)
X = transformer.fit_transform(X).toarray()

from sklearn.metrics import roc_auc_score
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier

#######
# we want 10 features
sfs = SequentialFeatureSelector(RandomForestClassifier(n_estimators=10, n_jobs=-1),
                                k_features=10,
                                forward=True,
                                floating=False,
                                verbose=2,
                                scoring='accuracy',
                                cv=3)
X = sfs.fit_transform(X, y)
"""
sfs = SequentialFeatureSelector(DecisionTreeClassifier(), k_features=10,
                                forward=True, floating=False, verbose=2,
                                scoring='accuracy', cv=3)
sfs = sfs.fit(X, y)
X = sfs.fit_transform(X, y)
"""
#####################################################################