def fval(df, y, alpha, k):
    """Feature Selection based on F-Value

    :param df: dataframe
    :param y: label
    :param alpha: hyper-parameter [alpha]
    :param k: number of features to select (currently unused)
    :return: dataframe of the selected features
    """
    x_bin = MinMaxScaler().fit_transform(scale(df))
    select_chi2 = SelectFpr(chi2, alpha=alpha).fit(x_bin, y)
    select_f_classif = SelectFpr(f_classif, alpha=alpha).fit(df, y)

    chi2_selected = select_chi2.get_support()
    f_classif_selected = select_f_classif.get_support()

    chi2_selected_features = [
        f for i, f in enumerate(df.columns) if chi2_selected[i]
    ]
    logging.info('Chi2 selected {} features {}.'.format(
        chi2_selected.sum(), chi2_selected_features))

    f_classif_selected_features = [
        f for i, f in enumerate(df.columns) if f_classif_selected[i]
    ]
    logging.info('F_classif selected {} features {}.'.format(
        f_classif_selected.sum(), f_classif_selected_features))

    # keep only the features that pass both tests
    selected = chi2_selected & f_classif_selected
    logging.info('Chi2 & F_classif selected {} features'.format(
        selected.sum()))
    features = [f for f, s in zip(df.columns, selected) if s]
    logging.info(features)
    return df[features]
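A minimal usage sketch for fval, under assumptions not in the original: the imports below are the ones its body relies on, and scikit-learn's breast-cancer toy set stands in for real data.

import logging
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectFpr, chi2, f_classif
from sklearn.preprocessing import MinMaxScaler, scale

logging.basicConfig(level=logging.INFO)

data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
# k is accepted but unused by fval; alpha is the per-feature p-value cutoff
df_selected = fval(df, data.target, alpha=0.05, k=10)
print(df_selected.shape)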
def fit(self, X, y, sample_weight=None):
    if self.allow_missing_ids is None:
        self.allow_missing_ids = np.zeros(X.shape[1]).astype(bool)
    if self.univariate_feature_selection:
        # univariate feature selection
        feature_selector = SelectFpr(alpha=0.05).fit(
            X[:, ~self.allow_missing_ids], y)
        self.support = np.ones(X.shape[1]).astype(bool)
        self.support[~self.allow_missing_ids] = feature_selector.get_support()
        X = X[:, self.support]
    else:
        self.support = np.ones(X.shape[1]).astype(bool)

    # fit the model
    super().fit(X, y, [len(X)], sample_weight=sample_weight)

    # get the mean of z for each level of y
    self.label_encoder = LabelEncoder().fit(y)
    self.classes_ = self.label_encoder.classes_
    z = super().predict(X).astype(float)
    self.z_means = np.array(
        [z[y == cl].mean() for cl in self.label_encoder.classes_])
    return self
class f_regressionFPRPrim(primitive):
    def __init__(self, random_state=0):
        super(f_regressionFPRPrim, self).__init__(name='f_regressionFPR')
        self.id = 29
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = ("Filter: Select the pvalues below alpha based on a "
                            "FPR test with F-value between label/feature for "
                            "regression tasks. FPR test stands for False Positive "
                            "Rate test. It controls the total amount of false "
                            "detections.")
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFpr(f_regression)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        mask = self.selector.get_support(indices=False)
        final_cols = list(compress(cols, mask))
        output['X'] = pd.DataFrame(self.selector.transform(output['X']),
                                   columns=final_cols)
        final_output = {0: output}
        return final_output
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
class UnivariateSelectChiFPRPrim(primitive):
    def __init__(self, random_state=0):
        super(UnivariateSelectChiFPRPrim, self).__init__(name='UnivariateSelectChiFPR')
        self.id = 27
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = ("Filter: Select the pvalues below alpha based on a "
                            "FPR test with Chi-square. FPR test stands for False "
                            "Positive Rate test. It controls the total amount of "
                            "false detections.")
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'd'

    def can_accept(self, data):
        return self.can_accept_d(data, 'Classification')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFpr(chi2, alpha=0.05)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        try:
            mask = self.selector.get_support(indices=False)
            final_cols = list(compress(cols, mask))
            output['X'] = pd.DataFrame(self.selector.transform(output['X']),
                                       columns=final_cols)
        except Exception as e:
            print(e)
        final_output = {0: output}
        return final_output
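For reference, a self-contained sketch of what fit/produce above boil down to in plain scikit-learn and pandas; handle_data and the primitive base class belong to the surrounding project, so the toy data and variable names here are illustrative assumptions, not the project's API.

import numpy as np
import pandas as pd
from itertools import compress
from sklearn.feature_selection import SelectFpr, chi2

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.randint(0, 5, size=(100, 6)),
                 columns=['f{}'.format(i) for i in range(6)])
y = rng.randint(0, 2, size=100)

# fit the FPR filter, then rebuild a DataFrame with only the surviving columns
selector = SelectFpr(chi2, alpha=0.05).fit(X, y)
mask = selector.get_support(indices=False)
final_cols = list(compress(X.columns, mask))
X_out = pd.DataFrame(selector.transform(X), columns=final_cols)
print(X_out.columns.tolist())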
def test_select_fpr_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fpr heuristic
    """
    X, y = make_classification(
        n_samples=200, n_features=20, n_informative=3, n_redundant=2,
        n_repeated=0, n_classes=8, n_clusters_per_class=1, flip_y=0.0,
        class_sep=10, shuffle=False, random_state=0,
    )

    univariate_filter = SelectFpr(f_classif, alpha=0.0001)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_classif, mode="fpr", param=0.0001).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def select_with_fpr(train, test):
    train_data = train.drop('ID', axis=1)
    test_data = test.drop('ID', axis=1)
    train_y = train_data['TARGET']
    train_X = train_data.drop('TARGET', axis=1)

    fpr = SelectFpr(alpha=0.001)
    features = fpr.fit_transform(train_X, train_y)
    print('Fpr selected {} features.'.format(features.shape[1]))

    col_numbers = fpr.get_support()
    # all feature columns (drop the trailing TARGET column)
    columns = np.delete(train_data.columns.values, train_data.shape[1] - 1,
                        axis=0)
    features = [columns[i] for i in range(len(columns)) if col_numbers[i]]

    new_train = train[['ID'] + features + ['TARGET']]
    new_train.to_csv('train_after_fpr.csv')
    new_test = test[['ID'] + features]
    new_test.to_csv('test_after_fpr.csv')
def selectionFpr(X, y, paramlist):
    # Renamed from selectionFwe: the selector used here is SelectFpr, and
    # SelectFpr is thresholded by a significance level alpha, not by a
    # feature count k (passing k=k raises a TypeError).
    alpha = paramlist['alpha']
    fpr = SelectFpr(chi2, alpha=alpha)
    Xnew = fpr.fit_transform(X, y)
    indexarr = fpr.get_support(indices=True)
    scores_arr = fpr.scores_
    return [Xnew, indexarr, scores_arr]
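An example call for the fixed function above, on synthetic data chosen here for illustration; chi2 requires non-negative inputs, hence the random counts.

import numpy as np
from sklearn.feature_selection import SelectFpr, chi2

rng = np.random.RandomState(0)
X = rng.randint(0, 10, size=(200, 8))
y = rng.randint(0, 2, size=200)
Xnew, idx, scores = selectionFpr(X, y, {'alpha': 0.05})
print(Xnew.shape, idx, scores)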
def fit(self, X, y, sample_weight=None):
    self.label_encoder = LabelEncoder().fit(y)
    self.classes_ = self.label_encoder.classes_
    y = self.label_encoder.transform(y)
    if self.allow_missing_ids is None:
        self.allow_missing_ids = np.zeros(X.shape[1]).astype(bool)
    if self.univariate_feature_selection:
        # univariate feature selection
        feature_selector = SelectFpr(alpha=0.05).fit(
            X[:, ~self.allow_missing_ids], y)
        self.support = np.ones(X.shape[1]).astype(bool)
        self.support[~self.allow_missing_ids] = feature_selector.get_support()
        X = X[:, self.support]
        if self.bounds is not None:
            self.bounds = [
                self.bounds[ii] for ii in range(len(self.bounds))
                if self.support[ii]
            ]
    else:
        self.support = np.ones(X.shape[1]).astype(bool)

    def func(w, X, y, alpha, sw):
        # L1-penalized logistic loss; the intercept w[-1] is not penalized
        out, grad = _logistic_loss_and_grad(w, X, y, 0, sw)
        out_penalty = alpha * np.sum(np.abs(w[:-1]))
        grad_penalty = np.r_[alpha * np.sign(w[:-1]), 0]
        return out + out_penalty, grad + grad_penalty

    y2 = np.array(y)
    y2[y2 == 0] = -1
    w0 = np.r_[np.random.randn(X.shape[1]) / 10, 0.]
    if self.bounds is None:
        # the original passed bounds=self.bounds + [(None, None)] even here,
        # which raises a TypeError when self.bounds is None
        method = 'BFGS'
        bounds = None
    else:
        method = 'L-BFGS-B'
        bounds = self.bounds + [(None, None)]  # no bound on the intercept
    if sample_weight is None:
        if self.class_weight is not None:
            sample_weight = get_sample_weights(y, class_weight=self.class_weight)
        else:
            sample_weight = np.ones(len(X))
    sample_weight /= (np.mean(sample_weight) * len(X))
    self.opt_res = minimize(func,
                            w0,
                            method=method,
                            jac=True,
                            args=(X, y2, 1. / self.C, sample_weight),
                            bounds=bounds,
                            options={
                                "gtol": self.tol,
                                "maxiter": self.max_iter
                            })
    self.coef_ = np.zeros(len(self.support))
    self.coef_[self.support] = self.opt_res.x[:-1]
    self.coef_ = self.coef_.reshape(1, -1)
    self.intercept_ = self.opt_res.x[-1].reshape(1, )
    return self
def SelectFpr_selector(data, target, sf):
    selector = SelectFpr(score_func=sf)
    data_new = selector.fit_transform(data.values, target.values.ravel())
    outcome = selector.get_support(True)
    # names of the features that passed the FPR test
    new_features = [data.columns.values[ind] for ind in outcome]
    return pd.DataFrame(data_new, columns=new_features)
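A small worked example for SelectFpr_selector, assuming pandas inputs as the .values/.columns access implies; the iris toy set and f_classif are illustrative choices, not from the original.

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFpr, f_classif

iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)
target = pd.DataFrame(iris.target)
reduced = SelectFpr_selector(data, target, f_classif)
print(reduced.columns.tolist())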
def fit(self, X, y, sample_weight=None):
    self.fitted_ = False
    if self.allow_missing_ids is None:
        self.allow_missing_ids = np.zeros(X.shape[1]).astype(bool)
    Xold = np.array(X)
    if self.univariate_feature_selection:
        # univariate feature selection
        feature_selector = SelectFpr(alpha=0.05).fit(
            X[:, ~self.allow_missing_ids], y)
        self.support = np.ones(X.shape[1]).astype(bool)
        self.support[~self.allow_missing_ids] = feature_selector.get_support()
        X = X[:, self.support]
        self.allow_missing_ids = self.allow_missing_ids[self.support]
    else:
        self.support = np.ones(X.shape[1]).astype(bool)

    if sample_weight is None:
        if self.class_weight is not None:
            sample_weight = get_sample_weights(y, class_weight=self.class_weight)
        else:
            sample_weight = np.ones(len(X))
    sample_weight /= (np.mean(sample_weight) * len(X))

    # generate pairs
    X2, y2, sw2 = self._generate_pairs(X, y, sample_weight)
    sw2 = sw2 / sw2.mean()
    if self.verbose:
        print('Generated %d pairs from %d samples' % (len(X2), len(X)))

    # fit the model
    if self.estimator.bounds is not None:
        self.estimator.bounds = [
            self.estimator.bounds[ii]
            for ii in range(len(self.estimator.bounds)) if self.support[ii]
        ]
    self.estimator.fit(X2, y2, sample_weight=sw2)

    # get the mean of z for each level of y
    self.label_encoder = LabelEncoder().fit(y)
    self.classes_ = self.label_encoder.classes_
    z = self.predict_z(Xold)
    self.z_means = np.array(
        [z[y == cl].mean() for cl in self.label_encoder.classes_])

    self.coef_ = np.zeros(len(self.support))
    self.coef_[self.support] = self.estimator.coef_.flatten()
    self.coef_ = self.coef_.reshape(1, -1)
    self.intercept_ = self.estimator.intercept_
    self.fitted_ = True
    return self
def test_select_heuristics_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fpr, fdr or fwe heuristics
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ["fdr", "fpr", "fwe"]:
        X_r2 = GenericUnivariateSelect(
            f_regression, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        # np.bool was removed from NumPy; use the builtin bool dtype
        assert_array_equal(support[:5], np.ones((5,), dtype=bool))
        assert np.sum(support[5:] == 1) < 3
def test_select_fpr_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fpr heuristic
    """
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode="fpr", param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert (support[:5] == 1).all()
    assert np.sum(support[5:] == 1) < 3
print "SelectPercentile -- chi2" print X_fitted_4.scores_ print X_fitted_4.pvalues_ print X_fitted_4.get_support() X_transformed_4 = X_fitted_4.transform(X) print X_transformed_4.shape #SelectFpr --- chi2 from sklearn.feature_selection import SelectFpr from sklearn.feature_selection import chi2 X_fitted_5 = SelectFpr(chi2, alpha=2.50017968e-15).fit(X, y) print "SelectFpr --- chi2" print X_fitted_5.scores_ print X_fitted_5.pvalues_ print X_fitted_5.get_support() X_transformed_5 = X_fitted_5.transform(X) print X_transformed_5.shape #SelectFpr --- f_classif from sklearn.feature_selection import SelectFpr from sklearn.feature_selection import f_classif X_fitted_6 = SelectFpr(f_classif, alpha=1.66966919e-31).fit(X, y) print "SelectFpr --- f_classif" print X_fitted_6.scores_ print X_fitted_6.pvalues_ print X_fitted_6.get_support() X_transformed_6 = X_fitted_6.transform(X) print X_transformed_6.shape
# mean-impute missing values (sklearn.preprocessing.Imputer was removed;
# sklearn.impute.SimpleImputer is its replacement)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

# feature scaling
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_norm = mms.fit_transform(X)

# Univariate feature selection using false positive rate
from sklearn.feature_selection import SelectFpr, f_classif
X_fpr = SelectFpr(f_classif, alpha=0.05).fit(X, y)

# Get indices of selected features
X_fpr.get_support(indices=True)

# select features using false positive rate method
X_fpr = SelectFpr(f_classif, alpha=0.05).fit_transform(X, y)
print(X_fpr.shape)

# Splitting the dataset into Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_fpr, y, test_size=0.2,
                                                    random_state=0)

# fitting logistic regression to Training Set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
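The snippet above stops after constructing the classifier; a plausible continuation (an assumption, not part of the original) fits it on the FPR-selected features and scores it on the held-out split.

# continuation sketch: train and evaluate on the selected features
classifier.fit(X_train, y_train)
print(classifier.score(X_test, y_test))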
X_fitted_4 = SelectPercentile(chi2, percentile=50).fit(X, y)
print("SelectPercentile -- chi2")
print(X_fitted_4.scores_)
print(X_fitted_4.pvalues_)
print(X_fitted_4.get_support())
X_transformed_4 = X_fitted_4.transform(X)
print(X_transformed_4.shape)

# SelectFpr --- chi2
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import chi2
X_fitted_5 = SelectFpr(chi2, alpha=2.50017968e-15).fit(X, y)
print("SelectFpr --- chi2")
print(X_fitted_5.scores_)
print(X_fitted_5.pvalues_)
print(X_fitted_5.get_support())
X_transformed_5 = X_fitted_5.transform(X)
print(X_transformed_5.shape)

# SelectFpr --- f_classif
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import f_classif
X_fitted_6 = SelectFpr(f_classif, alpha=1.66966919e-31).fit(X, y)
print("SelectFpr --- f_classif")
print(X_fitted_6.scores_)
print(X_fitted_6.pvalues_)
print(X_fitted_6.get_support())
X_transformed_6 = X_fitted_6.transform(X)
print(X_transformed_6.shape)

# SelectFdr and SelectFwe are used much like the above; they differ only in
# the criterion applied when selecting features -- what actually determines
# the scores is the score function
# import data of all Count and Position features. Training and test sets altogether
dfCountfeatures = pd.read_csv('data/CountingAndPositionFeatures_TrainAndTestData.csv')
dfTrainRaw = pd.read_csv('data/train.csv')

# get only training data
TrainQueryIDs = dfTrainRaw["id"]
relevance = dfTrainRaw["relevance"]
dfCountfeatures_TrainSet = dfCountfeatures[dfCountfeatures["id"].isin(TrainQueryIDs)]

# select those features which have non-zero variance
selector = VarianceThreshold()
selector.fit_transform(dfCountfeatures_TrainSet).shape
# only one feature with zero variance - shape (74067, 262)

# select features based on p-values from univariate regression with the target (relevance)
selector2 = SelectFpr(f_regression, alpha=0.01)
selector2.fit(dfCountfeatures_TrainSet.drop("id", axis=1), relevance)
selector2.get_support(indices=True).size
# 226 features out of 262 left with p-value <= 1%

# get titles of features which were selected
selectedCountfeatures = dfCountfeatures.columns[selector2.get_support(indices=True)]

# check correlation amongst features
corrReduced = dfCountfeatures_TrainSet[selectedCountfeatures].corr()
corrReduced.iloc[:, :] = np.tril(corrReduced.values, k=-1)
corrReduced = corrReduced.stack()

# get pairs of features which are highly correlated
corrReduced[corrReduced.abs() > 0.8].size
# 578 pairs correlated more than 80% out of 25,425

# 172 features to be removed due to high correlation with other features
# (MultiIndex.labels was renamed to MultiIndex.codes in newer pandas)
len(set(corrReduced[corrReduced.abs() > 0.8].index.codes[0]))

# get feature titles to use in training the model after removing highly correlated features
indices = set(corrReduced[corrReduced.abs() > 0.8].index.codes[0])
selectedCountfeatures2 = [i for j, i in enumerate(selectedCountfeatures.tolist())
                          if j not in indices]
selectedCountfeatures2.append("id")
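A plausible next step (an assumption, not in the original): subset the full feature table to the retained, decorrelated columns before model training.

# keep only the surviving columns (plus "id", appended above)
dfCountfeatures_reduced = dfCountfeatures[selectedCountfeatures2]
print(dfCountfeatures_reduced.shape)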
print("Loading tfidf model...") model_tfidf = TfidfModel.load(FLAGS.tfidfFile) print("Converting to tfidf vectors...") comments_tfidf = model_tfidf[comments_corpus] comments_vecs = np.vstack( [sparse2full(c, len(comments_dictionary)) for c in comments_tfidf]) chi2_features = None if doTrain: # Find most descrimitive words for any of the labels print("Finding discrimitive features...") labels = np.array(data['any']) model_fpr = SelectFpr(chi2, alpha=0.025) model_fpr.fit(comments_vecs, labels) chi2_features = model_fpr.get_support(indices=True) np.save(FLAGS.chi2File, chi2_features) else: print("Loading discrimitive features data...") chi2_features = np.load(FLAGS.chi2File) print("Calculating tfidf weighted word2vec vectors...") chi2_tfidf_vecs = comments_vecs[:, chi2_features] fpr_embeddings = None if doTrain: print('Fitting FastText embedding model...') ft_model = FastText(sentences=docs, size=300, workers=8) fpr_embeddings = [ ft_model.wv[t] for t in [comments_dictionary[i] for i in chi2_features]
def feature_select(x, y):
    # FPR-based univariate selection for a regression target
    # (renamed from vt, which misleadingly suggested VarianceThreshold)
    fpr = SelectFpr(f_regression, alpha=0.05)
    samples_selected = fpr.fit_transform(x, y)
    get_index_selected = fpr.get_support(indices=True)
    return samples_selected, get_index_selected
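A quick check of feature_select on synthetic regression data (an illustrative assumption); with shuffle=False the informative columns come first, so they should dominate the alpha=0.05 FPR cut.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFpr, f_regression

X, y = make_regression(n_samples=200, n_features=10, n_informative=3,
                       noise=5, shuffle=False, random_state=0)
X_sel, idx = feature_select(X, y)
print(X_sel.shape, idx)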
from sklearn.feature_selection import RFE, SelectKBest, chi2, SelectFpr
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

df = pd.read_csv('Train_CV_Data.csv')
X_train = np.asarray(df.loc[:2000000, 'srcPort':'HTTPM4'])
Y_train = np.asarray(df.loc[:2000000, 'malicious'], dtype=np.int32)
print(np.sum(Y_train == 1))

kBest = SelectKBest(chi2, k=12)
kBest.fit(X_train, Y_train)
mask1 = kBest.get_support(indices=True)

fpr = SelectFpr(chi2, alpha=0.0001)
fpr.fit(X_train, Y_train)
mask2 = fpr.get_support(indices=True)

rf = RandomForestClassifier(n_estimators=50)
rfe = RFE(rf, n_features_to_select=12, step=1)
rfe.fit(X_train, Y_train)
mask3 = rfe.get_support(indices=True)

print('K-Best Feat          :', mask1)
print('False Positive based :', mask2)
print('RFE based            :', mask3)
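A possible follow-up (an assumption, not in the original): intersect the three index sets to see which features all three selectors agree on.

# features chosen by SelectKBest, SelectFpr, and RFE alike
common = set(mask1) & set(mask2) & set(mask3)
print('Selected by all three:', sorted(common))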