def SelectBestFeatures(self):
    """Select the features kept by an L1-regularized logistic regression.

    Fits ``LogisticRegression(C=1, penalty='l1', solver='liblinear')`` on
    ``self.features`` / ``self.target`` and wraps the fitted estimator in a
    prefit ``SelectFromModel``.

    Side effects:
        self.features_new: ``self.features`` reduced to the selected columns
            (output of ``SelectFromModel.transform``).
        self.top_selected_columns: ``self.train`` indexed by the selected
            column labels.

    Returns:
        ``self.train`` restricted to the selected columns.
    """
    logreg = LogisticRegression(C=1, penalty='l1', solver='liblinear',
                                random_state=42).fit(self.features,
                                                     self.target)
    model = SelectFromModel(logreg, prefit=True)
    self.features_new = model.transform(self.features)

    # Read the selection mask straight from the selector. The previous
    # inverse_transform + "variance != 0" round trip wrongly discarded any
    # selected column that happened to be constant, and built a full-size
    # throwaway DataFrame just to recover the column names.
    selected_names = self.features.columns[model.get_support()]

    self.top_selected_columns = self.train[selected_names]
    print('Best selected columns :\n{}'.format(
        self.top_selected_columns.columns))
    return self.top_selected_columns
def feature_selection_l1(Xtrain, ytrain, c=0.07):
    """Split the columns of ``Xtrain`` into kept/dropped using an L1 penalty.

    Args:
        Xtrain: training features; must expose ``.columns`` (a DataFrame).
        ytrain: training target passed to ``LogisticRegression.fit``.
        c: inverse regularization strength ``C`` of the logistic regression.

    Returns:
        A ``(selected_columns, dropped_columns)`` pair of column indexes.
    """
    # penalty="l1" needs an L1-capable solver. "liblinear" was the implicit
    # default before scikit-learn 0.22; the current default ("lbfgs") raises
    # a ValueError for an L1 penalty, so name the solver explicitly.
    logistic = LogisticRegression(C=c, penalty="l1", solver="liblinear",
                                  random_state=7).fit(Xtrain, ytrain)
    model = SelectFromModel(logistic, prefit=True)

    # Use the selector's own boolean mask instead of inferring it from the
    # variance of an inverse-transformed copy, which mislabels selected
    # columns that are constant.
    keep_mask = model.get_support()
    selected_columns = Xtrain.columns[keep_mask]
    dropped_columns = Xtrain.columns[~keep_mask]
    return selected_columns, dropped_columns
def select_from_model():
    """Demonstrate ``SelectFromModel`` with an L1 ``LinearSVC`` on iris.

    Prints, in order: the fitted threshold, the indices of the selected
    features, the boolean support mask, and the reduced matrix expanded back
    to the original width by ``inverse_transform``.
    """
    iris = load_iris()
    x = iris.data
    y = iris.target

    estimator = LinearSVC(penalty="l1", dual=False)
    selector = SelectFromModel(estimator=estimator, threshold="mean")
    selector.fit(x, y)

    # Transform once and reuse the result; the original discarded the first
    # transform and recomputed it for inverse_transform.
    reduced = selector.transform(x)

    print(selector.threshold_)
    print(selector.get_support(indices=True))
    print(selector.get_support(indices=False))
    print(selector.inverse_transform(reduced))
def select_features_l1(X, y):
    """Return selected features using logistic regression with an L1 penalty.

    Args:
        X: training features; must expose ``.columns`` (a DataFrame).
        y: training target passed to ``LogisticRegression.fit``.

    Returns:
        The column index of ``X`` restricted to the features kept by
        ``SelectFromModel``.
    """
    # penalty="l1" needs an L1-capable solver. "liblinear" was the implicit
    # default before scikit-learn 0.22; the current default ("lbfgs") raises
    # a ValueError for an L1 penalty.
    logistic = LogisticRegression(C=0.1, penalty="l1", solver="liblinear",
                                  random_state=7).fit(X, y)
    model = SelectFromModel(logistic, prefit=True)

    # Ask the selector which columns it kept rather than inferring it from
    # zero-variance columns of an inverse-transformed copy (that inference
    # drops selected columns that are constant).
    selected = X.columns[model.get_support()]
    return selected
def select_features_l1(X, y):
    """Return the columns of ``X`` kept by an L1-penalized logistic model.

    Args:
        X: training features; must expose ``.columns`` (a DataFrame).
        y: training target passed to ``LogisticRegression.fit``.

    Returns:
        The column index of ``X`` restricted to the features kept by
        ``SelectFromModel``.
    """
    logistic_model = LogisticRegression(C=0.1, penalty="l1", random_state=7,
                                        solver='liblinear').fit(X, y)
    model = SelectFromModel(logistic_model, prefit=True)

    # The selector already knows which columns it keeps. The earlier
    # inverse_transform + "variance != 0" detour allocated a full-size copy
    # of X and misreported selected columns whose values are constant.
    cols_to_keep = X.columns[model.get_support()]
    return cols_to_keep
def feature_selectionfrommodel(data, y, num_feature):
    """Zero out the features rejected by a random-forest ``SelectFromModel``.

    Both ``data`` and ``y`` are sorted by ``'pid'`` before fitting. The label
    column is read as ``y[sep]``; ``sep`` is resolved from the enclosing
    module scope (not visible in this block).

    Args:
        data: feature DataFrame; must be sortable by ``'pid'``.
        y: label DataFrame; must be sortable by ``'pid'`` and contain the
            column named by ``sep``.
        num_feature: upper bound on the number of selected features
            (``max_features`` of ``SelectFromModel``).

    Returns:
        A DataFrame with the index and columns of the sorted ``data`` in
        which the unselected columns are filled with zeros (the result of
        ``inverse_transform``).
    """
    # Sort once and reuse; the original re-sorted `data` three times.
    sorted_data = data.sort_values('pid')
    xx = sorted_data.values
    xx_label = y.sort_values('pid')[sep].values

    # Alternatives previously tried here (kept as a note instead of dead
    # code): SelectKBest(f_classif, k=num_feature) and an L1 LinearSVC
    # (C=0.01, dual=False, max_iter=10000) with a "median" threshold.
    forest = RandomForestClassifier(n_estimators=20000, random_state=0,
                                    n_jobs=-1)
    select = SelectFromModel(forest, threshold="median",
                             max_features=num_feature).fit(xx, xx_label)

    reduced_xx = select.transform(xx)
    new_data = pd.DataFrame(select.inverse_transform(reduced_xx),
                            index=sorted_data.index,
                            columns=sorted_data.columns)
    return new_data
def l1_regularization_selection(X_train, y_train, features, reg_parameter,
                                rand_state):
    """Select features with an L1-regularized logistic regression.

    Note: this does not return the best *k* features; it returns the
    features that remain selected as most relevant once the regularization
    has been applied.

    Args:
        X_train: training features; must expose ``.columns`` (a DataFrame).
        y_train: training target passed to ``LogisticRegression.fit``.
        features: not used by this function; kept so existing callers keep
            working.
        reg_parameter: inverse regularization strength ``C``.
        rand_state: ``random_state`` of the logistic regression.

    Returns:
        The column index of ``X_train`` restricted to the selected features.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.feature_selection import SelectFromModel

    # penalty='l1' needs an L1-capable solver. 'liblinear' was the implicit
    # default before scikit-learn 0.22; the current default ('lbfgs') raises
    # a ValueError for an L1 penalty.
    logistic = LogisticRegression(C=reg_parameter, penalty='l1',
                                  solver='liblinear',
                                  random_state=rand_state).fit(X_train,
                                                               y_train)
    model = SelectFromModel(logistic, prefit=True)

    # Take the mask from the selector itself instead of inferring it from
    # zero-variance columns of an inverse-transformed copy, which drops
    # selected columns that are constant.
    cols_to_keep = X_train.columns[model.get_support()]
    return cols_to_keep
def lesson_4():
    """Walk through the "Feature Selection" lesson on the Kickstarter data.

    Reads the CSV at ``ks_projects_file_path``, builds the baseline feature
    table (timestamp parts, label-encoded categoricals, pairwise categorical
    interactions, 7-day launch counts, hours since the previous project in
    the same category) and then demonstrates two selection techniques,
    printing intermediate results through ``print_``:

    * univariate selection with ``SelectKBest(f_classif, k=5)``, first on
      the whole dataset (the leaky variant) and then on the training split;
    * L1-regularized logistic regression with ``SelectFromModel``.

    ``print_``, ``ks_projects_file_path`` and the sklearn/lightgbm names are
    resolved from module scope. The function returns nothing.
    """
    print_("Lesson 4: Feature Selection", 0, 1)

    ks = pd.read_csv(ks_projects_file_path,
                     parse_dates=['deadline', 'launched'])

    # Drop live projects
    ks = ks.query('state != "live"')

    # Add outcome column, "successful" == 1, others are 0
    ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

    # Timestamp features
    ks = ks.assign(hour=ks.launched.dt.hour,
                   day=ks.launched.dt.day,
                   month=ks.launched.dt.month,
                   year=ks.launched.dt.year)

    # Label encoding
    cat_features = ['category', 'currency', 'country']
    encoder = LabelEncoder()
    encoded = ks[cat_features].apply(encoder.fit_transform)

    data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
    baseline_data = ks[data_cols].join(encoded)

    # Pairwise interactions between the categorical features
    interactions = pd.DataFrame(index=ks.index)
    for col1, col2 in itertools.combinations(cat_features, 2):
        new_col_name = '_'.join([col1, col2])
        # Convert to strings and combine
        new_values = ks[col1].map(str) + "_" + ks[col2].map(str)
        label_enc = LabelEncoder()
        interactions[new_col_name] = label_enc.fit_transform(new_values)
    baseline_data = baseline_data.join(interactions)

    # Number of projects launched in the preceding 7 days (the rolling
    # window includes the project itself, hence the "- 1").
    launched = pd.Series(ks.index, index=ks.launched,
                         name="count_7_days").sort_index()
    count_7_days = launched.rolling('7d').count() - 1
    count_7_days.index = launched.values
    count_7_days = count_7_days.reindex(ks.index)
    baseline_data = baseline_data.join(count_7_days)

    def time_since_last_project(series):
        # Return the time in hours
        return series.diff().dt.total_seconds() / 3600.

    df = ks[['category', 'launched']].sort_values('launched')
    timedeltas = df.groupby('category').transform(time_since_last_project)
    timedeltas = timedeltas.fillna(timedeltas.max())
    baseline_data = baseline_data.join(
        timedeltas.rename({'launched': 'time_since_last_project'}, axis=1))

    def get_data_splits(dataframe, valid_fraction=0.1):
        """Split row-wise into train / valid / test, in that order.

        The validation and test sets each hold ``valid_fraction`` of the
        rows and are taken from the end of ``dataframe``.
        """
        # The original reassigned valid_fraction = 0.1 here, silently
        # ignoring the caller's argument; the parameter is now honoured.
        valid_size = int(len(dataframe) * valid_fraction)

        train = dataframe[:-valid_size * 2]
        # valid size == test size, last two sections of the data
        valid = dataframe[-valid_size * 2:-valid_size]
        test = dataframe[-valid_size:]
        return train, valid, test

    def train_model(train, valid):
        """Train a LightGBM binary classifier and print its validation AUC."""
        feature_cols = train.columns.drop('outcome')

        dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
        dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

        param = {
            'num_leaves': 64,
            'objective': 'binary',
            'metric': 'auc',
            'seed': 7
        }
        print("Training model!")
        # NOTE(review): early_stopping_rounds / verbose_eval are keyword
        # arguments of older LightGBM releases; newer ones expect callbacks.
        # Confirm against the pinned LightGBM version.
        bst = lgb.train(param, dtrain, num_boost_round=1000,
                        valid_sets=[dvalid], early_stopping_rounds=10,
                        verbose_eval=False)

        valid_pred = bst.predict(valid[feature_cols])
        valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)
        print(f"Validation AUC score: {valid_score:.4f}")
        return bst

    # ----------------------------
    # Univariate Feature Selection
    # ----------------------------
    feature_cols = baseline_data.columns.drop('outcome')

    # Keep 5 features
    selector = SelectKBest(f_classif, k=5)

    # NOTE: we should select features using only a training set, not the
    # whole dataset we are doing here (which will be fixed next)
    X_new = selector.fit_transform(baseline_data[feature_cols],
                                   baseline_data['outcome'])
    print_("X_new (after selecting 5 best features)", 0)
    print_(X_new)

    # Fix: select features using only a training set
    feature_cols = baseline_data.columns.drop('outcome')
    train, valid, _ = get_data_splits(baseline_data)

    # Keep 5 features
    selector = SelectKBest(f_classif, k=5)
    X_new = selector.fit_transform(train[feature_cols], train['outcome'])
    print_("X_new FIXED [Using Train Only]", 0)
    print_(X_new)

    # Get back the features we've kept, zero out all other features
    selected_features = pd.DataFrame(selector.inverse_transform(X_new),
                                     index=train.index,
                                     columns=feature_cols)
    print_(
        "First 5 rows from the train set including the 5 best features only (others set at 0)",
        0)
    print_(selected_features.head())

    # Dropped columns have values of all 0s, so var is 0, drop them
    selected_columns = selected_features.columns[selected_features.var() != 0]

    # Get the valid dataset with the selected features.
    print_("Valid dataset with the selected features only", 0)
    print_(valid[selected_columns].head())

    # -----------------
    # L1 regularization
    # -----------------
    train, valid, _ = get_data_splits(baseline_data)
    X, y = train[train.columns.drop("outcome")], train['outcome']

    # Set the regularization parameter C=1
    logistic = LogisticRegression(C=1, penalty="l1", solver='liblinear',
                                  random_state=7).fit(X, y)
    model = SelectFromModel(logistic, prefit=True)

    X_new = model.transform(X)
    print_("X_new with L1 regularization", 0)
    print_(X_new)

    # Get back the kept features as a DataFrame with dropped columns as all 0s
    selected_features = pd.DataFrame(model.inverse_transform(X_new),
                                     index=X.index,
                                     columns=X.columns)

    # Dropped columns have values of all 0s, keep other columns
    selected_columns = selected_features.columns[selected_features.var() != 0]
    print_("Rejected columns: {}".format(
        selected_features.columns.difference(selected_columns).to_list()))

    # Get the valid dataset with the selected features.
    print_("Valid dataset with the selected features using L1 regularization",
           0)
    print_(valid[selected_columns].head())
# (4) apply the feature selector to the training dataset
# (5) get a dataframe with the same index and columns as the training data
#     but the unselected columns are filled with zeros
# (6) find selected columns by choosing features with nonzero variance
feature_cols = data.columns.drop('outcome')
train, valid, test = get_data_splits(data)
selector = SelectKBest(f_classif, k=6)
X_new = selector.fit_transform(train[feature_cols], train['outcome'])
selected_features = pd.DataFrame(selector.inverse_transform(X_new),
                                 index=train.index,
                                 columns=feature_cols)
selected_columns = selected_features.columns[selected_features.var() != 0]

# L1 regularization
# feature selection using L1 regularization should use training data only
# (1) split the data into training, validation and testing
# (2) drop the target column
# (3) fit a logistic regression model to the training dataset (the smaller
#     the parameter C, the stronger the penalty)
# (4) select the nonzero coefficients using the SelectFromModel class
# (5) select features based on the nonzero coefficients
# (6) get a dataframe with the same index and columns as the training data
#     but the unselected columns are filled with zeros
# (7) find selected columns by choosing features with nonzero variance
train, valid, test = get_data_splits(data)
X, y = train[train.columns.drop("outcome")], train['outcome']
# penalty="l1" needs an L1-capable solver: "liblinear" was the implicit
# default before scikit-learn 0.22, while the current default ("lbfgs")
# raises a ValueError for an L1 penalty.
logistic = LogisticRegression(C=0.00001, penalty="l1", solver="liblinear",
                              random_state=7).fit(X, y)
model = SelectFromModel(logistic, prefit=True)
X_new = model.transform(X)
selected_features = pd.DataFrame(model.inverse_transform(X_new),
                                 index=X.index,
                                 columns=X.columns)
selected_columns = selected_features.columns[selected_features.var() != 0]
index_col=0) y_train = pd.read_csv('data/y_train.csv', index_col=0) test_df = pd.read_csv('data/test_df_with_division_non_unique_words.csv', index_col=0) # Set the regularization parameter C=1 logistic = LogisticRegression(C=1, penalty="l1", random_state=7).fit(X_train, y_train) model = SelectFromModel(logistic, prefit=True) print("Model trained") X_new = model.transform(X_train) test_new = model.transform(test_df) # Get back the kept features as a DataFrame with dropped columns as all 0s selected_features_train = pd.DataFrame(model.inverse_transform(X_new), index=X_train.index, columns=X_train.columns) selected_features_test = pd.DataFrame(model.inverse_transform(test_new), index=test_df.index, columns=X_train.columns) print("Features selected") # Dropped columns have values of all 0s, keep other columns selected_columns = selected_features_train.columns[ selected_features_train.var() != 0] selected_features_train = selected_features_train[selected_columns] selected_features_test = selected_features_test[selected_columns] selected_features_train.to_csv('data/selected_features_train.csv')
#train = train.loc[:,(train != -1).any(axis=0)] label = train.WnvPresent train = train.drop('WnvPresent', axis=1) sfm = SelectFromModel(LinearSVC(penalty='l1', loss='squared_hinge', dual=False)) data = sfm.fit_transform(train, label) data = preprocessing.scale(data) #data = preprocessing.scale(train) transformer = FunctionTransformer(np.log1p, validate=True) transformer.transform(data) data = preprocessing.normalize(data, norm='l2') feature_cols = train.columns databackup = data data = pd.DataFrame(sfm.inverse_transform(data), index=train.index, columns=feature_cols) selCols = data.columns[data.var() != 0] data = data[selCols] TrainX, TestX, TrainY, TestY = train_test_split(data, label, test_size=0.2, random_state=1) ######################################################################################################################## def plotCurves(model): results = model.evals_result() epochs = len(results['validation_0']['auc'])
def run_grid_pipeline(self, features, labels, standardization_colms,
                      parameters, estimator,
                      feature_selection_threshold_type):
    """Fit a scale -> select-features -> grid-search pipeline.

    Steps of the returned pipeline:
        ``'preprocessor'``: ``StandardScaler`` on ``standardization_colms``;
            all other columns are passed through.
        ``'feature_selection'``: ``SelectFromModel`` around a
            ``RandomForestClassifier``.
        ``'grid_search'``: ``GridSearchCV`` (5-fold, accuracy, refit) over
            ``estimator`` / ``parameters``.

    Args:
        features: feature DataFrame (its ``columns`` are used to name the
            selected features).
        labels: target passed to ``Pipeline.fit``.
        standardization_colms: columns to standardize. Assumed to be a list
            of column labels of ``features`` — TODO confirm against callers.
        parameters: ``param_grid`` for the grid search.
        estimator: estimator tuned by the grid search.
        feature_selection_threshold_type: ``threshold`` of
            ``SelectFromModel``.

    Side effects:
        Prints the grid-search results and the selected columns; stores the
        selected column labels in ``self.selected_columns``.

    Returns:
        The fitted ``Pipeline``.
    """
    # Categorical columns are not encoded here: everything that is not
    # standardized is passed through unchanged.
    preprocessor = ColumnTransformer(
        transformers=[('num', StandardScaler(), standardization_colms)],
        n_jobs=self.n_jobs,
        remainder='passthrough')

    feature_selection_clf = RandomForestClassifier(
        random_state=self.random_state, n_jobs=self.n_jobs)
    feature_selection_model = SelectFromModel(
        feature_selection_clf, threshold=feature_selection_threshold_type)

    grid = GridSearchCV(estimator=estimator, param_grid=parameters, cv=5,
                        scoring='accuracy', refit=True, n_jobs=-1)

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', feature_selection_model),
        ('grid_search', grid),
    ])
    pipeline.fit(features, labels)

    def print_results(results):
        print('BEST PARAMS: {}\n'.format(results.best_params_))
        means = results.cv_results_['mean_test_score']
        stds = results.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     results.cv_results_['params']):
            print('{} (+/-{}) for {}'.format(round(mean, 3),
                                             round(std * 2, 3), params))

    print_results(pipeline['grid_search'])

    # The selector was fitted on the ColumnTransformer output, whose columns
    # are ordered "standardized columns first, passthrough columns after".
    # The original applied the selector's mask positionally to the raw
    # `features` columns, which mislabels the selected features whenever the
    # standardized columns are not the leading columns. Rebuild the
    # transformed column order and index it with the selector's own mask.
    scaled_colms = list(standardization_colms)
    passthrough_colms = [c for c in features.columns
                         if c not in scaled_colms]
    transformed_colms = pd.Index(scaled_colms + passthrough_colms)
    kept = transformed_colms[pipeline['feature_selection'].get_support()]

    # Report the selected labels in the column order of `features`.
    self.selected_columns = features.columns[features.columns.isin(kept)]
    print(
        '\nColumns selected for {0} threshold'.format(
            feature_selection_threshold_type), self.selected_columns)

    return pipeline
def feature_selection(self):
    """Report the features a random forest selects at two thresholds.

    One-hot encodes ``self.onehot_colms`` of ``self.original_features``,
    draws a single stratified training split (``self.train_ratio``), fits a
    ``RandomForestClassifier`` on it and records which columns
    ``SelectFromModel`` keeps with the ``'mean'`` and the ``'median'``
    importance thresholds.

    Side effects:
        Sets ``self.selected_columns_mean`` and
        ``self.selected_columns_median`` and prints both.
    """
    onehot_features = self.original_features
    onehot_labels = self.original_labels

    # NOTE(review): `sparse=False` was renamed `sparse_output` in newer
    # scikit-learn releases — confirm against the pinned version.
    onehot_encoder = OneHotEncoder(handle_unknown='error', sparse=False)
    onehot_encoder.fit(onehot_features[self.onehot_colms])

    # get_feature_names was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out; use whichever this installation provides.
    get_names = (getattr(onehot_encoder, 'get_feature_names_out', None)
                 or onehot_encoder.get_feature_names)
    onehot_transformed_colms = get_names(self.onehot_colms)
    onehot_transformed_features = onehot_encoder.transform(
        onehot_features[self.onehot_colms])

    # Append the encoded columns and drop the raw categorical ones.
    onehot_features = onehot_features.join(
        pd.DataFrame(onehot_transformed_features,
                     index=onehot_features.index,
                     columns=onehot_transformed_colms),
        how='inner')
    onehot_features = onehot_features.drop(columns=self.onehot_colms)

    sss = StratifiedShuffleSplit(n_splits=1, train_size=self.train_ratio,
                                 random_state=self.random_state)

    # n_splits=1, so this loop body runs once; only the training indices
    # are used.
    for train_indx, _ in sss.split(onehot_features, onehot_labels):
        train_features = onehot_features.iloc[train_indx]

        # RandomForestClassifier gives a non-linear decision boundary; a
        # LogisticRegression would give a linear one instead.
        clf = RandomForestClassifier(random_state=self.random_state,
                                     n_jobs=self.n_jobs)
        clf.fit(train_features, onehot_labels.iloc[train_indx])

        # Read each selector's mask directly. The previous
        # inverse_transform + "variance != 0" round trip dropped selected
        # columns that are constant within the training split (possible for
        # rare one-hot categories when the median importance is 0).
        mean_selector = SelectFromModel(clf, prefit=True, threshold='mean')
        self.selected_columns_mean = train_features.columns[
            mean_selector.get_support()]
        print('Mean threshold:', self.selected_columns_mean)

        median_selector = SelectFromModel(clf, prefit=True,
                                          threshold='median')
        self.selected_columns_median = train_features.columns[
            median_selector.get_support()]
        print('Median threshold', self.selected_columns_median)
test_size=0.2, random_state=0) N, d = np.shape(x_train) #METHOD 0: no feature selection; all features are used in the ML model print('No feature selection:') train_eval_tree(x_train, y_train, x_devel, y_devel) #METHOD 1: LASSO/L1 regularization lsvc = LinearSVC(C=1.0, penalty='l1', dual=False, max_iter=1000).fit(x_train, y_train) svc_mod = SelectFromModel(lsvc, prefit=True) x_train_new = svc_mod.transform(x_train) #get the selected/most important features and extract from validation set selected_feats = pd.DataFrame(svc_mod.inverse_transform(x_train_new), index=x_train.index, columns=x_train.columns) selected_cols = selected_feats.columns[selected_feats.var() != 0] x_devel_new = x_devel[selected_cols] #now train and test a decision tree using these selected features print('L1 regularization:') train_eval_tree(x_train_new, y_train, x_devel_new, y_devel) #METHOD 2: SelectKBest using the f_classif score select_feats = SelectKBest(f_classif, k=10) x_train_new = select_feats.fit_transform(x_train, y_train) selected_feats = pd.DataFrame(select_feats.inverse_transform(x_train_new), index=x_train.index, columns=x_train.columns)
# Get back the features we've kept, zero out all other features
selected_features = pd.DataFrame(selector.inverse_transform(X_new),
                                 index=train.index,
                                 columns=feature_cols)

# Dropped columns have values of all 0s, so var is 0, drop them
selected_columns = selected_features.columns[selected_features.var() != 0]

# =================================================================================
# L1 regularization
#
# Univariate methods consider only one feature at a time when making a
# selection decision. Instead, we can make our selection using all of the
# features by including them in a linear model with L1 regularization. This
# type of regularization (sometimes called Lasso) penalizes the absolute
# magnitude of the coefficients, as compared to L2 (Ridge) regression which
# penalizes the square of the coefficients.
#
# As the strength of regularization is increased, the coefficients of
# features which are less important for predicting the target are set to 0.
# This allows us to perform feature selection by adjusting the
# regularization parameter. We choose the parameter by finding the best
# performance on a hold-out set, or decide ahead of time how many features
# to keep.
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

train, valid, _ = get_data_splits(baseline_data)
X, y = train[train.columns.drop("outcome")], train['outcome']

# Set the regularization parameter C=1.
# penalty="l1" needs an L1-capable solver: "liblinear" was the implicit
# default before scikit-learn 0.22, while the current default ("lbfgs")
# raises a ValueError for an L1 penalty.
logistic = LogisticRegression(C=1, penalty="l1", solver="liblinear",
                              random_state=7).fit(X, y)
model = SelectFromModel(logistic, prefit=True)
X_new = model.transform(X)

# Get back the kept features as a DataFrame with dropped columns as all 0s
selected_features = pd.DataFrame(model.inverse_transform(X_new),
                                 index=X.index,
                                 columns=X.columns)

# Dropped columns have values of all 0s, keep other columns
selected_columns = selected_features.columns[selected_features.var() != 0]