def fval(df, y, alpha, k):
    """Keep the features accepted by both a chi2 and an F-test FPR filter.

    :param df: dataframe holding the candidate features
    :param y: label
    :param alpha: p-value threshold handed to both ``SelectFpr`` filters
    :param k: number of features to select (accepted but not used here)
    :return: ``df`` restricted to the columns both filters kept
    """
    # The chi2 filter is fitted on the standardised, then min-max rescaled
    # data; the F-test filter is fitted on the frame as given.
    rescaled = MinMaxScaler().fit_transform(scale(df))
    chi2_mask = SelectFpr(chi2, alpha=alpha).fit(rescaled, y).get_support()
    f_mask = SelectFpr(f_classif, alpha=alpha).fit(df, y).get_support()

    chi2_names = [name for name, keep in zip(df.columns, chi2_mask) if keep]
    logging.info('Chi2 selected {} features {}.'.format(
        chi2_mask.sum(), chi2_names))

    f_names = [name for name, keep in zip(df.columns, f_mask) if keep]
    logging.info('F_classif selected {} features {}.'.format(
        f_mask.sum(), f_names))

    # A feature survives only when both tests accepted it.
    both_mask = chi2_mask & f_mask
    logging.info('Chi2 & F_classif selected {} features'.format(
        both_mask.sum()))

    features = [name for idx, name in enumerate(df.columns) if both_mask[idx]]
    logging.info(features)
    return df[features]
def test_boundary_case_ch2():
    """Check that every univariate selector keeps only the first feature.

    The chi2 scores and p-values are pinned first, then each selector is
    configured so that exactly one of the two features should be kept.
    """
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    expected_support = np.array([True, False])
    selectors = (
        SelectFdr(chi2, alpha=0.1),
        SelectKBest(chi2, k=1),
        SelectPercentile(chi2, percentile=50),
        SelectFpr(chi2, alpha=0.1),
        SelectFwe(chi2, alpha=0.1),
    )
    for selector in selectors:
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), expected_support)
def test_select_fpr_classif():
    """Check SelectFpr on a simple classification problem.

    SelectFpr must agree with GenericUnivariateSelect in 'fpr' mode and
    must keep exactly the first five of the twenty generated features.
    """
    alpha = 0.0001
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    fpr_filter = SelectFpr(f_classif, alpha=alpha)
    reduced_direct = fpr_filter.fit(X, y).transform(X)

    generic_filter = GenericUnivariateSelect(f_classif, mode='fpr',
                                             param=alpha)
    reduced_generic = generic_filter.fit(X, y).transform(X)
    assert_array_equal(reduced_direct, reduced_generic)

    expected = np.zeros(20)
    expected[:5] = 1
    assert_array_equal(fpr_filter.get_support(), expected)
def selectionFwe(X, y, paramlist):
    """Select features with a chi2-based ``SelectFpr`` filter.

    :param X: feature matrix passed to ``fit_transform``
    :param y: target values
    :param paramlist: mapping of options; ``'alpha'`` (optional, default
        0.05) is used as the p-value threshold
    :return: ``[Xnew, indexarr, scores_arr]`` - the reduced matrix, the
        indices of the kept columns and the per-feature chi2 scores
    """
    # SelectFpr is configured by a p-value threshold (alpha); it has no `k`
    # parameter, so the previous `SelectFpr(chi2, k=k)` raised TypeError on
    # every call.
    # NOTE(review): the function name suggests SelectFwe and the old code read
    # a feature count from paramlist['number _of_features']; neither selector
    # accepts a count - confirm which selector/threshold callers intend.
    alpha = paramlist.get('alpha', 0.05)
    fwe = SelectFpr(chi2, alpha=alpha)
    Xnew = fwe.fit_transform(X, y)
    indexarr = fwe.get_support(indices=True)
    scores_arr = fwe.scores_
    return [Xnew, indexarr, scores_arr]
def fit(self, X, y, sample_weight=None):
    """Fit the parent model and record the mean score of each class.

    :param X: 2-D feature array (indexed as ``X[:, mask]``)
    :param y: labels
    :param sample_weight: forwarded to the parent ``fit``
    :return: ``self``
    """
    n_features = X.shape[1]
    if self.allow_missing_ids is None:
        # Default: no column is exempt from the univariate test.
        self.allow_missing_ids = np.zeros(n_features, dtype=bool)

    if self.univariate_feature_selection:
        # Only the non-exempt columns are tested; exempt columns stay
        # selected.
        testable = ~self.allow_missing_ids
        fpr_filter = SelectFpr(alpha=0.05).fit(X[:, testable], y)
        support = np.ones(n_features, dtype=bool)
        support[testable] = fpr_filter.get_support()
        self.support = support
        X = X[:, support]
    else:
        self.support = np.ones(n_features, dtype=bool)

    # Fit the parent model; the whole sample is passed as one group of
    # size len(X).
    super().fit(X, y, [len(X)], sample_weight=sample_weight)

    # Mean of the parent's prediction for each level of y.
    self.label_encoder = LabelEncoder().fit(y)
    self.classes_ = self.label_encoder.classes_
    scores = super().predict(X).astype(float)
    class_means = []
    for cls in self.label_encoder.classes_:
        class_means.append(scores[y == cls].mean())
    self.z_means = np.array(class_means)
    return self
def test_clone():
    """Check that ``clone`` produces a distinct estimator with equal params.

    An estimator is built, cloned, and compared with its clone; the second
    case uses an array-valued parameter.
    """
    from sklearn.feature_selection import SelectFpr, f_classif

    original = SelectFpr(f_classif, alpha=0.1)
    duplicate = clone(original)
    assert original is not duplicate
    assert original.get_params() == duplicate.get_params()

    # Same check of identity with an ndarray as the parameter value.
    original = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
    duplicate = clone(original)
    assert original is not duplicate
def build_model(clf="log_reg",
                train_reader=sick_train_reader,
                feature_vectorizer=DictVectorizer(sparse=True),
                features=None,
                feature_selector=SelectFpr(chi2, alpha=0.05),
                file_name=None,
                load_vec=None,
                compression=None):
    """Assemble the classification pipeline and load its training vectors.

    The pipeline is vectorizer -> feature selector -> (optional compression
    step) -> classifier, where the compression step and the classifier are
    looked up by key in the module-level ``_models`` mapping.

    :return: ``(pipeline, feat_vec, labels)``; the last two come from
        ``obtain_vectors(file_name, load_vec, train_reader, features)``
    """
    # NOTE(review): the default vectorizer/selector are created once, when
    # the function is defined, so calls relying on the defaults share the
    # same estimator objects.
    #
    # Alternative selector that was tried in the pipeline:
    #   RFE(LogisticRegression(solver='lbfgs'),
    #       n_features_to_select=5000, step=0.05)
    steps = [('dict_vector', feature_vectorizer),
             ('feature_selector', feature_selector)]
    if compression:
        steps.append(('compression', _models[compression]))
    steps.append(('clf', _models[clf]))
    clf_pipe = Pipeline(steps)

    feat_vec, labels = obtain_vectors(file_name, load_vec, train_reader,
                                      features)
    return clf_pipe, feat_vec, labels
def get_ensemble_model(w2v=None):
    """Build the stacked ensemble pipeline.

    Stages: feature extraction, F-test FPR feature selection, probability
    extraction from four base classifiers, degree-2 polynomial expansion and
    a grid-searched logistic regression (grid taken from the module-level
    ``params``).

    :param w2v: word-vector mapping; loaded through ``Glove`` when falsy
    :return: the unfitted ``Pipeline``
    """
    if not w2v:
        w2v = Glove.load().get_dict()
    n_jobs = -1

    extraction = get_features(w2v)
    # False positive rate test for feature selection.
    selection = SelectFpr(f_classif)

    base_models = [
        RandomForestClassifier(n_estimators=300, max_depth=10,
                               min_samples_split=5, n_jobs=n_jobs),
        XGBClassifier(n_estimators=300, max_depth=10, n_jobs=8),
        LogisticRegression(C=0.1, solver='lbfgs', penalty='l2',
                           n_jobs=n_jobs),
        BernoulliNB(alpha=5.0),
    ]
    proba = ProbExtractor(base_models)
    polynomial = PolynomialFeatures(degree=2)
    final_stage = GridSearchCV(
        LogisticRegression(penalty='l2', random_state=42),
        param_grid=params)

    return Pipeline([
        ('feature_extraction', extraction),
        ('feature_selection', selection),
        ('proba', proba),
        ('polynomial', polynomial),
        ('logistic_regression', final_stage),
    ])
def fit(self, X, y, sample_weight=None):
    """Fit an L1-penalised logistic model with ``scipy.optimize.minimize``.

    :param X: 2-D feature array (indexed as ``X[:, mask]``)
    :param y: labels; encoded with a ``LabelEncoder`` before fitting
        (label 0 after encoding is mapped to -1, so this presumably expects
        two classes - TODO confirm)
    :param sample_weight: optional per-sample weights; when omitted they
        come from ``self.class_weight`` or default to all ones. The
        caller's array is never modified.
    :return: ``self`` with ``coef_``, ``intercept_``, ``support`` and
        ``opt_res`` set
    """
    self.label_encoder = LabelEncoder().fit(y)
    self.classes_ = self.label_encoder.classes_
    y = self.label_encoder.transform(y)

    if self.allow_missing_ids is None:
        self.allow_missing_ids = np.zeros(X.shape[1]).astype(bool)

    if self.univariate_feature_selection:
        # Univariate feature selection on the columns that are not exempt;
        # exempt columns stay selected.
        feature_selector = SelectFpr(alpha=0.05).fit(
            X[:, ~self.allow_missing_ids], y)
        self.support = np.ones(X.shape[1]).astype(bool)
        self.support[~self.allow_missing_ids] = feature_selector.get_support()
        X = X[:, self.support]
        if self.bounds is not None:
            # Keep only the bounds of the surviving columns.
            self.bounds = [
                bound for ii, bound in enumerate(self.bounds)
                if self.support[ii]
            ]
    else:
        self.support = np.ones(X.shape[1]).astype(bool)

    def func(w, X, y, alpha, sw):
        # Unpenalised logistic loss plus an L1 penalty on every weight
        # except the last entry (the intercept).
        out, grad = _logistic_loss_and_grad(w, X, y, 0, sw)
        out_penalty = alpha * np.sum(np.abs(w[:-1]))
        grad_penalty = np.r_[alpha * np.sign(w[:-1]), 0]
        return out + out_penalty, grad + grad_penalty

    y2 = np.array(y)
    y2[y2 == 0] = -1
    w0 = np.r_[np.random.randn(X.shape[1]) / 10, 0.]

    if self.bounds is None:
        method = 'BFGS'
        # Previously `self.bounds + [(None, None)]` was evaluated even here,
        # which raised TypeError (None + list) on every unbounded fit.
        bounds = None
    else:
        method = 'L-BFGS-B'
        # The intercept (last parameter) is left unbounded.
        bounds = list(self.bounds) + [(None, None)]

    if sample_weight is None:
        if self.class_weight is not None:
            sample_weight = get_sample_weights(
                y, class_weight=self.class_weight)
        else:
            sample_weight = np.ones(len(X))
    # Normalise into a new float array: an in-place `/=` would overwrite the
    # caller's weights and fails outright for lists or integer arrays.
    sample_weight = np.asarray(sample_weight, dtype=float)
    sample_weight = sample_weight / (np.mean(sample_weight) * len(X))

    self.opt_res = minimize(func,
                            w0,
                            method=method,
                            jac=True,
                            args=(X, y2, 1. / self.C, sample_weight),
                            bounds=bounds,
                            options={
                                "gtol": self.tol,
                                "maxiter": self.max_iter
                            })

    # Scatter the fitted weights back to the full feature space; columns
    # dropped by the selection keep a zero coefficient.
    self.coef_ = np.zeros(len(self.support))
    self.coef_[self.support] = self.opt_res.x[:-1]
    self.coef_ = self.coef_.reshape(1, -1)
    self.intercept_ = self.opt_res.x[-1].reshape(1, )
    return self
def SelectFpr_selector(data, target, sf):
    """Reduce ``data`` to the columns kept by ``SelectFpr(score_func=sf)``.

    :param data: DataFrame of features
    :param target: pandas object holding the labels (flattened with
        ``.values.ravel()``)
    :param sf: score function handed to ``SelectFpr``
    :return: DataFrame of the kept columns, labelled with their original
        column names
    """
    fpr_filter = SelectFpr(score_func=sf)
    reduced = fpr_filter.fit_transform(data.values, target.values.ravel())

    # Translate the kept column positions back into column names.
    column_names = data.columns.values
    kept_names = [column_names[pos] for pos in fpr_filter.get_support(True)]
    return pd.DataFrame(reduced, columns=kept_names)
def get_feature_extractor(w2v=None):
    """Build the extraction + F-test FPR selection pipeline.

    :param w2v: word-vector mapping; loaded through ``Glove`` when falsy
    :return: the unfitted two-step ``Pipeline``
    """
    if not w2v:
        w2v = Glove.load().get_dict()

    steps = [
        ("feature_extraction", get_features(w2v)),
        ('feature_selection', SelectFpr(f_classif)),
    ]
    return Pipeline(steps)
def test_verbose_output_for_select_select_fpr():
    """Check the verbose message emitted for a chi2 ``SelectFpr`` model."""
    model = SelectFpr(chi2, alpha=0.5)
    output = _capture_verbose_output_for_model(model, use_supervised_df=True)

    expected_output = ("The p-value of column 'B' (1.0000) is above the "
                       "specified alpha of 0.5000")
    assert output == expected_output
def selectFpr(args):
    """Build a scikit-learn ``SelectFpr`` selector from positional options.

    Parameters
    ----------
    args : sequence
        ``args[1]`` is the alpha value (anything ``float()`` accepts): the
        highest uncorrected p-value for features to keep.
        ``args[2]`` names the score function, either ``"chi2"`` or
        ``"f_classif"``. ``args[0]`` is not read by this function.

    Returns
    -------
    SelectFpr
        The unfitted selector.

    Raises
    ------
    ValueError
        If ``args[2]`` is not a supported score-function name. Previously
        this case fell through to ``return selector`` with ``selector``
        unbound and surfaced as an ``UnboundLocalError``.
    """
    score_name = args[2]
    if score_name == "chi2":
        score_func = chi2
    elif score_name == "f_classif":
        score_func = f_classif
    else:
        raise ValueError(
            "Unsupported score function %r; expected 'chi2' or 'f_classif'"
            % (score_name,))
    return SelectFpr(score_func, alpha=float(args[1]))
def feature_SelectFpr(x_data, y_data):
    """Rank features by their F-test score and return the top twenty.

    :param x_data: DataFrame of features (its ``columns`` supply the names)
    :param y_data: labels
    :return: DataFrame with columns ``'Specs'`` (feature name) and
        ``'Score'``, limited to the 20 largest scores
    """
    fitted = SelectFpr(f_classif, alpha=0.01).fit(x_data, y_data)

    name_frame = pd.DataFrame(x_data.columns)
    score_frame = pd.DataFrame(fitted.scores_)

    # Put names and scores side by side, then label the two columns.
    ranking = pd.concat([name_frame, score_frame], axis=1)
    ranking.columns = ['Specs', 'Score']
    return ranking.nlargest(20, 'Score')
def feature_method_selection(data, label, fsname):
    """Select features with the method named by ``fsname``.

    :param data: feature matrix
    :param label: labels (unused for ``'variance_threshold'``)
    :param fsname: one of ``'variance_threshold'``, ``'select_kbest'``,
        ``'rfe'``, ``'rfecv'``, ``'RandLasso'``, ``'linear_svc'``,
        ``'tree'``, ``'fclassif'``, ``'RandForReg'``
    :return: new_data, the selected data
    :raises ValueError: if ``fsname`` is not supported, including
        ``'pearsonr'``, which yields no selector. Previously these cases
        fell through to ``model.fit`` with ``model`` unbound.
    """
    if fsname == 'variance_threshold':
        # Drops features whose values barely vary; meant for discrete values.
        model = VarianceThreshold()  # th=1
        return model.fit_transform(data)

    if fsname == 'select_kbest':
        # Feature values must be non-negative; chi2 is for classification.
        model = SelectKBest(chi2, k=10)
    elif fsname == 'rfe':
        # Recursive feature elimination; very slow.
        svc = SVC(kernel='linear', C=1)
        model = RFE(estimator=svc, n_features_to_select=10, step=1)
    elif fsname == 'rfecv':
        # RFE with cross-validation; label must be numeric.
        # The selector used to be bound to `rfecv`, leaving `model` unset.
        # NOTE(review): StratifiedKFold(label, 1) follows the old
        # (y, n_folds) call convention - confirm against the installed
        # scikit-learn version.
        svc = SVC(kernel="linear")
        model = RFECV(estimator=svc, step=1, cv=StratifiedKFold(label, 1),
                      scoring='accuracy')
    elif fsname == 'RandLasso':
        # Randomised re-selection; original note: "cannot perform reduce
        # with flexible type".
        model = RandomizedLogisticRegression()
    elif fsname == 'linear_svc':
        model = LinearSVC()  # has no importance attribute
    elif fsname == 'tree':
        model = ExtraTreesClassifier()
    elif fsname == 'fclassif':
        # Defaults to f_classif; the larger the value, the more useful the
        # feature.
        model = SelectFpr()
    elif fsname == 'RandForReg':
        # label must be numeric.
        label = turn_label_2num(label)
        model = RandomForestRegressor()
    elif fsname == 'pearsonr':
        # A Pearson correlation is a statistic, not a fitted selector, so
        # there is nothing to call fit/transform on.
        message = ("feature selection option 'pearsonr' does not produce "
                   "a selector")
        logging.error(message)
        raise ValueError(message)
    else:
        logging.error('ERROR: feature selection option is wrong')
        raise ValueError(
            'unknown feature selection option: %r' % (fsname,))

    # NOTE(review): plain estimators such as LinearSVC only expose
    # `transform` in old scikit-learn releases - confirm the target version.
    model.fit(data, label)
    new_data = model.transform(data)  # selected important data
    return new_data
def fit(self, X, y, sample_weight=None):
    """Fit the wrapped estimator on generated pairs and record class means.

    :param X: 2-D feature array (indexed as ``X[:, mask]``)
    :param y: labels
    :param sample_weight: optional per-sample weights; when omitted they
        come from ``self.class_weight`` or default to all ones. The
        caller's array is never modified.
    :return: ``self`` with ``coef_``, ``intercept_``, ``z_means``,
        ``support`` and ``fitted_`` set
    """
    self.fitted_ = False
    if self.allow_missing_ids is None:
        self.allow_missing_ids = np.zeros(X.shape[1]).astype(bool)

    # Copy of the full-width input; predict_z is called on it at the end.
    Xold = np.array(X)

    if self.univariate_feature_selection:
        # Univariate feature selection on the columns that are not exempt;
        # exempt columns stay selected.
        feature_selector = SelectFpr(alpha=0.05).fit(
            X[:, ~self.allow_missing_ids], y)
        self.support = np.ones(X.shape[1]).astype(bool)
        self.support[~self.allow_missing_ids] = feature_selector.get_support()
        X = X[:, self.support]
        # NOTE(review): this shrinks the stored mask to the selected
        # columns, so a second fit on full-width data would see a mask of
        # the wrong length - confirm whether refitting is supported.
        self.allow_missing_ids = self.allow_missing_ids[self.support]
    else:
        self.support = np.ones(X.shape[1]).astype(bool)

    if sample_weight is None:
        if self.class_weight is not None:
            sample_weight = get_sample_weights(
                y, class_weight=self.class_weight)
        else:
            sample_weight = np.ones(len(X))
    # Normalise into a new float array: an in-place `/=` would overwrite the
    # caller's weights and fails outright for lists or integer arrays.
    sample_weight = np.asarray(sample_weight, dtype=float)
    sample_weight = sample_weight / (np.mean(sample_weight) * len(X))

    # Generate pairs and rescale their weights to mean 1.
    X2, y2, sw2 = self._generate_pairs(X, y, sample_weight)
    sw2 = sw2 / sw2.mean()
    if self.verbose:
        print('Generated %d pairs from %d samples' % (len(X2), len(X)))

    # Fit the model, keeping only the bounds of the surviving columns.
    if self.estimator.bounds is not None:
        self.estimator.bounds = [
            bound for ii, bound in enumerate(self.estimator.bounds)
            if self.support[ii]
        ]
    self.estimator.fit(X2, y2, sample_weight=sw2)

    # Mean of z for each level of y.
    self.label_encoder = LabelEncoder().fit(y)
    self.classes_ = self.label_encoder.classes_
    z = self.predict_z(Xold)
    self.z_means = np.array(
        [z[y == cl].mean() for cl in self.label_encoder.classes_])

    # Scatter the fitted weights back to the full feature space; columns
    # dropped by the selection keep a zero coefficient.
    self.coef_ = np.zeros(len(self.support))
    self.coef_[self.support] = self.estimator.coef_.flatten()
    self.coef_ = self.coef_.reshape(1, -1)
    self.intercept_ = self.estimator.intercept_
    self.fitted_ = True
    return self
def test_clone_2():
    """Check that ``clone`` does not carry over ad-hoc attributes.

    An attribute is attached to an estimator by hand; the clone must not
    have it.
    """
    from sklearn.feature_selection import SelectFpr, f_classif

    original = SelectFpr(f_classif, alpha=0.1)
    setattr(original, "own_attribute", "test")

    duplicate = clone(original)
    assert not hasattr(duplicate, "own_attribute")
def train_DT(
        feats=None,
        labels=[],
        feature_selector=SelectFpr(
            chi2, alpha=0.05),  # Use None to stop feature selection
        cv=5):  # Number of folds used in cross-validation
    """Train a decision tree on vectorised count features.

    Steps: vectorise ``feats`` with a dense ``DictVectorizer``, select
    features with RFE over ``MultinomialNB``, grid-search the tree's
    hyper-parameters, print a cross-validated macro-F1 summary, then fit
    the final tree on all the data.

    :param feats: count dictionaries accepted by ``DictVectorizer``
    :param labels: training labels
    :param feature_selector: currently ignored (see the review note below)
    :param cv: number of folds used in cross-validation
    :return: ``(model, vectorizer, feature_selector)`` - the trained tree
        and the fitted objects needed to featurize test data the same way
    """
    # Map the count dictionaries to a feature matrix:
    vectorizer = DictVectorizer(sparse=False)
    feats = vectorizer.fit_transform(feats)

    ##### FEATURE SELECTION
    # NOTE(review): the `feature_selector` argument is overwritten here, so
    # neither the SelectFpr default nor a caller-supplied value (including
    # None) has any effect - confirm whether that is intended.
    feature_selector = RFE(estimator=MultinomialNB(),
                           n_features_to_select=None,
                           step=1,
                           verbose=0)
    feat_matrix = feature_selector.fit_transform(feats, labels)

    ##### HYPER-PARAMETER SEARCH
    # Define the basic model to use for parameter search:
    searchmod = DecisionTreeClassifier()
    # Parameters to grid-search over:
    parameters = {
        'splitter': ['best', 'random'],
        'max_features': ['sqrt', 0.25, 'log2'],
        'min_samples_split': [2, 5, 10]
    }
    # Cross-validation grid search to find the best hyper-parameters:
    clf = GridSearchCV(searchmod, parameters, cv=cv, n_jobs=-1)
    clf.fit(feat_matrix, labels)
    params = clf.best_params_

    # Establish the model we want using the parameters from the search:
    mod = DecisionTreeClassifier(
        splitter=params['splitter'],
        max_features=params['max_features'],
        min_samples_split=params['min_samples_split'])

    ##### ASSESSMENT
    scores = cross_val_score(mod, feat_matrix, labels, cv=cv,
                             scoring="f1_macro")
    # Single-argument print calls produce the same text under Python 2 and
    # Python 3; the former print statements were a SyntaxError on Python 3.
    print('Best model %s' % (mod,))
    print('%s features selected out of %s total' %
          (feat_matrix.shape[1], feats.shape[1]))
    print('F1 mean: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

    # TRAIN OUR MODEL:
    mod.fit(feat_matrix, labels)

    # Return the trained model along with the objects we need to
    # featurize test data in a way that aligns with our training
    # matrix:
    return (mod, vectorizer, feature_selector)
def correlation(df, y, threshold, alpha, corr_k_pass, mode):
    """Feature selection based on correlation between features.

    Columns are visited in ranking order; a column is kept unless an earlier
    visited column is correlated with it beyond ``threshold``.

    :param df: dataframe
    :param y: label (only used when ``alpha`` is truthy)
    :param threshold: absolute correlation above which a column counts as a
        duplicate of the one being visited
    :param alpha: hyper-parameter [alpha]; when truthy, columns are ranked
        by descending ``SelectFpr`` score, otherwise dataframe order is used
    :param corr_k_pass: when truthy, maximum number of columns to return
    :param mode: ``"chi2"`` or ``"f"`` - the statistic used for ranking
    :return: dataframe of feature selected
    :raises ValueError: if ``mode`` is neither ``"chi2"`` nor ``"f"``
    """
    df_out = df.corr()

    # The old message was built with `"No mode: " % mode`, which itself
    # raised TypeError because the format string had no placeholder.
    if mode not in ("chi2", "f"):
        raise ValueError("No mode: %s" % mode)

    if alpha:
        filter_slect = chi2 if mode == "chi2" else f_classif
        # Scores are computed on standardised, then min-max rescaled data.
        x_bin = MinMaxScaler().fit_transform(scale(df))
        fpval = SelectFpr(filter_slect, alpha=alpha).fit(x_bin, y).scores_
        df_sort_fval = pd.DataFrame({
            "col": list(df.columns),
            "fval": list(fpval)
        })
        df_sort_fval = df_sort_fval.sort_values(by=['fval'], ascending=False)
        ranking_col = list(df_sort_fval['col'])
    else:
        ranking_col = list(df.columns)

    col_pass = []
    del_col = set()
    for col in ranking_col:
        if col not in del_col:
            col_pass.append(col)
        # Whether or not the column was kept, everything strongly correlated
        # with it (itself included) is marked as seen.
        col_corr = df_out[col]
        del_col.update(
            col_corr[(col_corr > threshold) | (col_corr < -threshold)].index)

    logging.info("Del col : %d" % len(del_col))
    logging.info("Passed col : %d" % len(col_pass))

    if corr_k_pass and len(col_pass) > corr_k_pass:
        col_pass = col_pass[:corr_k_pass]
    return df[col_pass]
def select_fpr(args):
    """Build a ``SelectFpr`` selector from an options mapping.

    https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFpr.html

    :param args: mapping with keys ``'alpha'`` and ``'score_function'``.
        It is updated in place: a ``None`` alpha becomes 0.05 and the names
        ``'chi2'`` / ``'f_classif'`` are replaced by the matching function;
        any other value is passed to ``SelectFpr`` unchanged.
    :return: the unfitted selector
    """
    from sklearn.feature_selection import f_classif, chi2

    if args['alpha'] is None:
        args['alpha'] = 0.05

    # Swap a recognised name for the function it stands for.
    for name, func in (('chi2', chi2), ('f_classif', f_classif)):
        if args['score_function'] == name:
            args['score_function'] = func
            break

    return SelectFpr(score_func=args['score_function'], alpha=args['alpha'])
def test_clone():
    """Check that ``clone`` creates a correct deep copy.

    An estimator is created and cloned from its current state; the clone
    must be a different object whose parameters compare equal.
    """
    from sklearn.feature_selection import SelectFpr, f_classif

    original = SelectFpr(f_classif, alpha=0.1)
    duplicate = clone(original)

    assert_true(original is not duplicate)
    original_params = original._get_params()
    duplicate_params = duplicate._get_params()
    assert_equal(original_params, duplicate_params)
def get_basic_model(model, w2v=None):
    """Wrap ``model`` in the extraction + F-test FPR selection pipeline.

    :param model: final estimator of the pipeline
    :param w2v: word-vector mapping; loaded through ``Glove`` when falsy
    :return: the unfitted three-step ``Pipeline``
    """
    if not w2v:
        w2v = Glove.load().get_dict()
    n_jobs = -1

    extraction = get_features(w2v)
    # False positive rate test for feature selection.
    selection = SelectFpr(f_classif)
    return Pipeline([
        ('feature_extraction', extraction),
        ('feature_selection', selection),
        ("model", model),
    ])
def get_fsmethod(fsmethod, n_feats, n_subjs, n_jobs=1):
    """Return a feature-selection estimator and its grid-search parameters.

    :param fsmethod: ``'stats'`` or one of ``'rfe'``, ``'rfecv'``,
        ``'univariate'``, ``'fpr'``, ``'fdr'``, ``'extratrees'``, ``'pca'``,
        ``'rpca'``, ``'lda'``
    :param n_feats: number of features; sizes the grids when below 10
    :param n_subjs: not used by this function
    :param n_jobs: forwarded to ``ExtraTreesClassifier``
    :return: ``('stats', None)`` for ``'stats'``, otherwise
        ``(estimator, param_grid)``
    :raises KeyError: if ``fsmethod`` is not a known method
    """
    if fsmethod == 'stats':
        return 'stats', None

    # Feature selection procedures
    fsmethods = {
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
        'rfe': RFE(estimator=SVC(kernel="linear"), step=0.05,
                   n_features_to_select=2),
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
        # cv=3, default; cv=StratifiedKFold(n_subjs, 3)
        'rfecv': RFECV(estimator=SVC(kernel="linear"), step=0.05,
                       loss_func=zero_one),
        # Univariate Feature selection:
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html
        'univariate': SelectPercentile(f_classif, percentile=5),
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFpr.html
        'fpr': SelectFpr(f_classif, alpha=0.05),
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFdr.html
        'fdr': SelectFdr(f_classif, alpha=0.05),
        # http://scikit-learn.org/stable/modules/feature_selection.html
        'extratrees': ExtraTreesClassifier(n_estimators=50,
                                           max_features='auto',
                                           compute_importances=True,
                                           n_jobs=n_jobs,
                                           random_state=0),
        'pca': PCA(n_components='mle'),
        'rpca': RandomizedPCA(random_state=0),
        'lda': LDA(),
    }

    # Feature selection parameter values for grid search
    max_feats = ['auto']
    if n_feats < 10:
        feats_to_sel = range(2, n_feats, 2)
        n_comps = range(1, n_feats, 2)
    else:
        feats_to_sel = range(2, 20, 4)
        n_comps = range(1, 30, 4)
    max_feats.extend(feats_to_sel)

    n_comps_pca = list(n_comps)
    n_comps_pca.extend(['mle'])

    # alpha is a p-value threshold for SelectFpr/SelectFdr. The former grid
    # [1, 3, 5, 10] was copied from the percentile grid; every value >= 1
    # accepts all features, so the search could not discriminate. The values
    # below are the same percentages expressed as probabilities.
    alpha_grid = [0.01, 0.03, 0.05, 0.1]

    fsgrid = {
        'rfe': dict(estimator_params=[dict(C=0.1), dict(C=1), dict(C=10)],
                    n_features_to_select=feats_to_sel),
        'rfecv': dict(estimator_params=[dict(C=0.1), dict(C=1),
                                        dict(C=10)]),
        'univariate': dict(percentile=[1, 3, 5, 10]),
        'fpr': dict(alpha=alpha_grid),
        'fdr': dict(alpha=alpha_grid),
        'extratrees': dict(n_estimators=[1, 3, 5, 10, 30, 50],
                           max_features=max_feats),
        'pca': dict(n_components=n_comps_pca, whiten=[True, False]),
        'rpca': dict(n_components=n_comps, iterated_power=[3, 4, 5],
                     whiten=[True, False]),
        'lda': dict(n_components=n_comps),
    }

    return fsmethods[fsmethod], fsgrid[fsmethod]
def test_select_fpr_int(self):
    """Convert a default ``SelectFpr`` fitted on int64 data to ONNX.

    The conversion must return a model, which is then handed to
    ``dump_data_and_model`` together with the input data.
    """
    X = np.array(
        [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
        dtype=np.int64)
    y = np.array([0, 1, 0, 1])

    model = SelectFpr()
    model.fit(X, y)

    # Single int64 input whose first dimension is left unspecified.
    input_spec = [("input", Int64TensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(model, "select fpr", input_spec,
                                 target_opset=TARGET_OPSET)

    self.assertTrue(model_onnx is not None)
    dump_data_and_model(X, model, model_onnx, basename="SklearnSelectFpr")
def test_select_fpr_int(self):
    """Convert a default ``SelectFpr`` fitted on integer data to ONNX.

    The conversion must return a model, which is then handed to
    ``dump_data_and_model``; the ``allow_failure`` expression is passed
    through as a string.
    """
    X = np.array([[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]])
    y = np.array([0, 1, 0, 1])

    model = SelectFpr()
    model.fit(X, y)

    input_spec = [('input', Int64TensorType([1, X.shape[1]]))]
    model_onnx = convert_sklearn(model, 'select fpr', input_spec)
    self.assertTrue(model_onnx is not None)

    failure_condition = (
        "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.4')")
    dump_data_and_model(X, model, model_onnx,
                        basename="SklearnSelectFpr",
                        allow_failure=failure_condition)
def train_decisiontree_FPR(configurationname,
                           train_data,
                           score_function,
                           undersam=False,
                           oversam=False,
                           export=False):
    """Train a decision tree on FPR-selected, optionally resampled data.

    :param configurationname: label used in the log line and in the name of
        the exported ``.dot`` file
    :param train_data: 3-tuple ``(X_train, y_train, id_to_a_train)``; the
        third item is not used here
    :param score_function: score function handed to ``SelectFpr``
    :param undersam: apply under-sampling (RepeatedEditedNearestNeighbours)
    :param oversam: apply over-sampling (SMOTE); with ``undersam`` as well,
        SMOTEENN is used instead
    :param export: not consulted - see the review note below
    :return: ``(selector, dtc)``, the fitted selector and tree
    """
    print("Training with configuration " + configurationname)
    features, labels, _ = train_data

    print("Feature Selection")
    selector = SelectFpr(score_function)
    fitted_selector = selector.fit(features, labels)
    features = selector.transform(features)
    fitted_ids = list(fitted_selector.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(labels))
    resampler = None
    if undersam and oversam:
        resampler = SMOTEENN(random_state=0)
    elif undersam:
        resampler = RepeatedEditedNearestNeighbours()
    elif oversam:
        resampler = SMOTE(random_state=42)
    if resampler is not None:
        features, labels = resampler.fit_resample(features, labels)
    print(Counter(labels))

    print("Train Classifier")
    dtc = DecisionTreeClassifier(random_state=0)
    dtc = dtc.fit(features, labels, check_input=True)

    # NOTE(review): the `if export:` guard is commented out in the original,
    # so the tree is exported on every call regardless of `export`.
    print("Exporting decision tree image...")
    dot_path = DATAP + "/temp/trees/sltree_" + configurationname + ".dot"
    export_graphviz(dtc, out_file=dot_path, filled=True)
    transform(fitted_ids)

    print("Self Accuracy: " + str(dtc.score(features, labels)))
    return selector, dtc
def select_features(data,
                    features,
                    target,
                    feature_selector='SelectKBest',
                    k=10,
                    alpha=0.05,
                    score_func='f_classif'):
    """Return the names of the features kept by a univariate selector.

    :param data: DataFrame holding both the features and the target
    :param features: list of feature column names
    :param target: name of the target column
    :param feature_selector: ``'SelectKBest'``, ``'SelectFpr'`` or
        ``'SelectFdr'``
    :param k: number of features for ``SelectKBest``
    :param alpha: p-value threshold for ``SelectFpr`` / ``SelectFdr``
    :param score_func: ``'f_classif'``, ``'f_regression'``, ``'chi2'``,
        ``'mutual_info_classif'`` or ``'mutual_info_regression'``
    :return: list of the selected feature names (also printed)
    :raises ValueError: for an unknown ``score_func`` or
        ``feature_selector`` name
    """
    X = data[features]
    y = data[target]

    # Validate both names before touching scikit-learn so that each failure
    # reports the argument that is actually wrong (an unknown selector used
    # to be reported as 'Undefined score_func').
    known_score_funcs = ('f_classif', 'f_regression', 'chi2',
                         'mutual_info_classif', 'mutual_info_regression')
    if score_func not in known_score_funcs:
        raise ValueError('Undefined score_func')
    if feature_selector not in ('SelectKBest', 'SelectFpr', 'SelectFdr'):
        raise ValueError('Undefined feature_selector')

    if score_func == 'f_classif':
        scorer = f_classif
    elif score_func == 'f_regression':
        scorer = f_regression
    elif score_func == 'chi2':
        scorer = chi2
    elif score_func == 'mutual_info_classif':
        scorer = mutual_info_classif
    else:
        scorer = mutual_info_regression

    if feature_selector == 'SelectKBest':
        selector = SelectKBest(score_func=scorer, k=k)
    elif feature_selector == 'SelectFpr':
        selector = SelectFpr(score_func=scorer, alpha=alpha)
    else:
        selector = SelectFdr(score_func=scorer, alpha=alpha)

    # Only the fitted support mask is needed; the transformed matrix was
    # previously computed and thrown away.
    selector.fit(X, y)
    best_features = [
        features[i] for i in selector.get_support(indices=True)
    ]
    print('Best features selected are: ' + str(best_features))
    return best_features
def feature_Univarselection(data, y, Alpha):
    """Zero out the columns rejected by an F-test FPR filter.

    Both frames are ordered by ``'pid'`` first. The result has the same
    shape, index and columns as the ordered ``data``; it is produced by
    ``inverse_transform`` on the reduced matrix.

    :param data: DataFrame of features, sortable by ``'pid'``
    :param y: DataFrame of labels, sortable by ``'pid'``; the label column
        is picked with the module-level ``sep``
    :param Alpha: p-value threshold for ``SelectFpr``
    :return: DataFrame of the filtered features
    """
    ordered = data.sort_values('pid')
    feature_matrix = ordered.values
    label_vector = y.sort_values('pid')[sep].values

    # Alternatives considered in the original: SelectFdr, SelectFwe,
    # SelectKBest(chi2) and SelectFromModel(Lasso).
    selector = SelectFpr(f_classif, alpha=Alpha).fit(feature_matrix,
                                                     label_vector)

    kept_only = selector.transform(feature_matrix)
    restored = selector.inverse_transform(kept_only)
    return pd.DataFrame(restored,
                        index=ordered.index,
                        columns=ordered.columns)
def get_best_estimator(x_train, y_train, x_test, y_priors=None):
    """Grid-search a selection -> scaling -> SVM pipeline.

    When ``y_priors`` is given, ``x_test`` is stacked under ``x_train`` with
    ``y_priors`` as its labels, and those extra rows are weighted by
    ``PRIOR_WEIGHT`` (the training rows by 1.0).

    :param x_train: training feature matrix
    :param y_train: training labels; no longer modified in place
    :param x_test: rows appended to the training data when priors are given
    :param y_priors: labels for ``x_test``, or ``None`` to train on
        ``x_train`` alone
    :return: the best estimator found by the grid search (grid taken from
        the module-level ``params``)
    """
    pipeline = Pipeline([('selection', SelectFpr(SELECTOR)),
                         ('scaler', StandardScaler()),
                         ('svm', svm.SVC())])

    if y_priors is None:
        clf = GridSearchCV(pipeline, params)
    else:
        # List repetition replaces the Python 2-only `xrange` loops.
        sample_weight = ([1.0] * len(y_train)
                         + [PRIOR_WEIGHT] * len(y_priors))
        # Build a new label list; `y_train.extend(...)` used to grow the
        # caller's list on every call.
        y_train = list(y_train) + list(y_priors)
        x_train = np.vstack((x_train, x_test))
        # NOTE(review): `fit_params` as a constructor argument belongs to
        # older scikit-learn releases - confirm against the installed
        # version.
        clf = GridSearchCV(pipeline, params,
                           fit_params={'svm__sample_weight': sample_weight})

    clf.fit(x_train, y_train)
    clf = clf.best_estimator_
    logging.debug(clf)
    return clf
def test_select_heuristics_regression():
    """Check the fpr, fdr and fwe heuristics on a simple regression problem.

    Each GenericUnivariateSelect mode must reproduce the SelectFpr result,
    the five informative features must be kept, and fewer than three of the
    remaining features may be selected.
    """
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)

    for mode in ['fdr', 'fpr', 'fwe']:
        X_r2 = GenericUnivariateSelect(
            f_regression, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        # The builtin `bool` replaces the `np.bool` alias, which was
        # deprecated in NumPy 1.20 and removed in 1.24.
        assert_array_equal(support[:5], np.ones((5, ), dtype=bool))
        assert_less(np.sum(support[5:] == 1), 3)