def recursive_feature_selection(info_humans, info_bots, params, scale=False):
    """Run a three-stage feature selection and return the last two selectors.

    Stages, applied in order to the matrix produced by ``get_Xy``:

    1. ``VarianceThreshold`` with threshold ``.8 * (1 - .8)``.
    2. ``SelectKBest(chi2, k=200)`` on the output of stage 1.
    3. ``RFECV`` around a ``LogisticRegression`` (5-fold stratified CV,
       ``roc_auc`` scoring) on the output of stage 2.

    :param info_humans: forwarded to ``get_Xy``.
    :param info_bots: forwarded to ``get_Xy``.
    :param params: mapping providing ``'penalty'`` and ``'C'`` for the
        logistic regression.
    :param scale: forwarded to ``get_Xy``.
    :return: ``(skb, rfecv)`` -- the fitted ``SelectKBest`` and ``RFECV``.
        The stage-1 variance filter is not returned, so callers that need to
        re-apply the whole chain must rebuild it themselves.
    """
    X, y, features, scaler = get_Xy(info_humans, info_bots, scale=scale)

    print("first feature selection by variance test")
    # Separate name: the original reused `skb` for both stage 1 and stage 2.
    variance_filter = VarianceThreshold(threshold=(.8 * (1 - .8)))
    X_new = variance_filter.fit_transform(X)

    print("second feature selection by ch2 test")
    skb = SelectKBest(chi2, k=200)
    X_new = skb.fit_transform(X_new, y)

    print("third feature selection by recursive featue elimination (RFECV)")
    clf = LogisticRegression(penalty=params['penalty'], C=params['C'])
    rfecv = RFECV(estimator=clf, step=1,
                  cv=cross_validation.StratifiedKFold(y, 5),
                  scoring='roc_auc', verbose=1)
    rfecv.fit(X_new, y)
    print("Optimal number of features : %d" % rfecv.n_features_)
    return skb, rfecv
def main():
    """Fit an RFECV-wrapped linear SVC and write predictions to ``output.csv``.

    Reads ``./data/train.csv`` and ``./data/test.csv`` through ``munge_data``,
    holds out 20% of the training rows, prints the hold-out score and
    classification report, then writes ``PassengerId,Survived`` rows for the
    test set.
    """
    train_df = munge_data('./data/train.csv', False)
    train_df = train_df.drop('PassengerId', axis=1)
    target_df = train_df['Survived']
    train_df = train_df.drop('Survived', axis=1)
    # DataFrame.sort() was removed from pandas; sort_index(axis=1) gives the
    # same column-label ordering and exists in old and new versions alike.
    train_df = train_df.sort_index(axis=1)

    test_df = munge_data('./data/test.csv')
    test_ids = test_df.PassengerId.values
    test_df = test_df.drop('PassengerId', axis=1)
    # Same ordering as the training frame so columns line up positionally.
    test_df = test_df.sort_index(axis=1)

    train_data = train_df.values
    target_data = target_df.values
    test_data = test_df.values

    clf = svm.SVC(kernel='linear')
    selector = RFECV(clf, step=1, cv=5, scoring='accuracy')

    train_data, cx_data, target_data, cx_target_data = \
        cross_validation.train_test_split(train_data, target_data,
                                          test_size=0.2)
    selector = selector.fit(train_data, target_data)

    print(selector.score(cx_data, cx_target_data))
    cx_predictions = selector.predict(cx_data)
    print(classification_report(cx_target_data, cx_predictions))

    predictions = selector.predict(test_data)
    with open('output.csv', 'w') as o:
        o.write('PassengerId,Survived\n')
        for passenger, prediction in zip(test_ids, predictions):
            o.write('{},{}\n'.format(passenger, prediction))
def test_model(model, xtrain, ytrain, feature_list, prefix):
    """Validate ``model`` on a 60/40 split of the training data.

    When the module-level ``DO_RFECV`` flag is set and the model exposes
    ``coef_`` after a fit, the model is wrapped in ``RFECV`` first.  Prints
    the hold-out score and RMSE and hands the predictions to ``debug_plots``.

    :param model: estimator with ``fit``/``predict``/``score``.
    :param xtrain: training features.
    :param ytrain: training targets.
    :param feature_list: currently unused (only the disabled debug output
        consumed it).
    :param prefix: forwarded to ``debug_plots``.
    """
    xTrain, xTest, yTrain, yTest = train_test_split(xtrain, ytrain,
                                                    test_size=0.4)
    if DO_RFECV:
        # Fit once only to find out whether the estimator exposes coef_.
        model.fit(xtrain, ytrain)
        if hasattr(model, 'coef_'):
            model = RFECV(estimator=model, verbose=0, step=1,
                          scoring=score_fn, cv=3)
    model.fit(xTrain, yTrain)
    print('score %s' % (model.score(xTest, yTest),))

    ypred = model.predict(xTest)
    # Don't allow the model to predict a negative number of orders.
    if any(ypred < 0):
        print(ypred[ypred < 0])
        ypred[ypred < 0] = 0
    print('RMSE %s' % (np.sqrt(mean_squared_error(ypred, yTest)),))

    debug_plots(model, yTest, ypred, prefix)
    return
def plotRFECV (X,y,stepSize=0.05,scoring='f1'):
    '''
    Fit RFECV around a linear SVC and plot the cross-validation score curve.

    Based on
    http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html#example-plot-rfe-with-cross-validation-py

    X, y     -- training features and labels.
    stepSize -- forwarded to RFECV as ``step``.
    scoring  -- forwarded to RFECV as ``scoring``; also shown on the y axis.

    Returns the fitted RFECV object.
    '''
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    import matplotlib.pyplot as plt

    svc = SVC(kernel="linear", class_weight='auto', cache_size=1400)
    rfecv = RFECV(estimator=svc, step=stepSize, cv=StratifiedKFold(y, 2),
                  scoring=scoring)
    rfecv.fit(X, y)
    print("Optimal number of features : %d" % rfecv.n_features_)

    # One point per entry of grid_scores_.
    plt.figure()
    plt.xlabel("Number of features selected")
    # The label used to claim "nb of correct classifications" (accuracy)
    # regardless of the metric actually requested.
    plt.ylabel("Cross validation score (%s)" % (scoring,))
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
    return rfecv
def recursiveFeatureSelectorCV(classifier_model,train_data,train_labels,test_data,number_of_features):
    """Fit RFECV on the training data and reduce both train and test sets.

    :param classifier_model: estimator handed to ``RFECV``.
    :param train_data: training features.
    :param train_labels: training targets.
    :param test_data: features reduced with the selector fitted on train.
    :param number_of_features: NOTE -- despite its name this value lands in
        RFECV's ``step`` parameter (the second positional argument), i.e. it
        controls how many features are dropped per iteration, not how many
        are kept.  It is now passed by keyword so the call keeps working on
        scikit-learn versions where ``step`` is keyword-only.
    :return: ``(transformed_train_data, transformed_test_data)``.
    """
    rfe = RFECV(classifier_model, step=number_of_features)
    transformed_train_data = rfe.fit_transform(train_data, train_labels)
    transformed_test_data = rfe.transform(test_data)
    return transformed_train_data, transformed_test_data
class RFECVSelection(SelectionModel):
    """Selection model that fits an ``RFECV`` selector at construction time."""

    name = "RFECV"

    def __init__(self, *args):
        """Initialise the base model, then fit RFECV on its arrays."""
        SelectionModel.__init__(self, *args)
        selector = RFECV(self.estimator, step=1, cv=5,
                         scoring='mean_squared_error')
        self.selector = selector
        selector.fit(self.x_array, self.y_array)
        self.support_ = selector.support_

    def print_rankings(self):
        """Print each column's name with its RFECV rank."""
        print("Rankings for: ", RFECVSelection.name)
        ranks = self.selector.ranking_
        for col_idx, col_rank in zip(self.columns, ranks):
            label = data.column_names[col_idx]
            print("{0}: {1}".format(label, col_rank))

    # number of features vs. cv scores
    def plot_num_of_feat_vs_cv_score(self):
        """Plot the cross-validation score curve stored in ``grid_scores_``."""
        scores = self.selector.grid_scores_
        xs = range(1, len(scores) + 1)
        plt.figure()
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation scores (mse)")
        plt.plot(xs, scores)
        plt.show()

    def plot_rankings(self):
        """Draw a bar chart of the RFECV rank of every feature."""
        positions = range(self.x_array.shape[1])
        tick_labels = [data.column_names[col_idx] for col_idx in self.columns]
        plt.figure()
        plt.title("Ranking of features in RFECV")
        plt.bar(positions, self.selector.ranking_, align="center", color="r")
        plt.xticks(positions, tick_labels)
        plt.show()
def find_best_features(df_train, y_train):
    """Return a correlation-based column mask and a cached RFECV support mask.

    The averaged Pearson/Spearman correlation matrix is loaded from
    ``vals_pearson.pkl`` and ``vals_spearman.pkl``.  Walking the columns in
    order, every column whose absolute correlation with a still-kept column
    exceeds 0.90 is dropped.  The RFECV result is loaded from ``rfecv.pkl``
    rather than refitted.

    NOTE(review): ``joblib.load`` unpickles arbitrary objects -- only use
    pickle files produced by a trusted run.

    :param df_train: unused here (the fit that consumed it is cached).
    :param y_train: unused here, for the same reason.
    :return: ``(res_cols, support)`` -- a list of booleans over the
        correlation matrix rows, and ``rfecv.get_support()``.
    """
    vals_pearson = joblib.load("vals_pearson.pkl")
    vals_spearman = joblib.load("vals_spearman.pkl")
    vals = (vals_pearson + vals_spearman) / 2

    res_cols = [True] * vals.shape[0]
    for i in range(vals.shape[0]):
        # A column that was already dropped must not knock out others.
        # res_cols doubles as the "dumped" set, giving an O(1) check.
        if not res_cols[i]:
            continue
        for j in range(vals.shape[1]):
            if i != j and abs(vals[i, j]) > 0.90:
                res_cols[j] = False

    # Cached selector; it was produced offline by fitting an RFECV
    # (step=10, cv=5) around a random forest regressor on the kept columns.
    rfecv = joblib.load("rfecv.pkl")
    return (res_cols, rfecv.get_support())
def benchmark_features_selection(clf,name):
    """Time an RFECV fit/predict cycle for ``clf`` and store the predictions.

    Uses the module-level ``X_train``, ``y_train``, ``X_test``, ``cursor``
    and ``testing_identifiant_produit_list``.

    :param clf: estimator to wrap in RFECV.
    :param name: label used in the printed output and passed to
        ``save_results_data``.
    :return: ``(clf_descr, train_time, test_time)``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    t0 = time()
    rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y_train, 2),
                  scoring='accuracy')
    rfecv.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    print(name+"Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    t0 = time()
    pred = rfecv.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    # RFECV fits clones, so `clf` itself is normally still unfitted here;
    # the coefficients live on the selector's final estimator.
    fitted = getattr(rfecv, 'estimator_', clf)
    if hasattr(fitted, 'coef_'):
        print("dimensionality: %d" % fitted.coef_.shape[1])
        print("density: %f" % density(fitted.coef_))

    print("Saving data to database:")
    save_results_data(cursor, name, testing_identifiant_produit_list, pred)
    print()

    clf_descr = str(clf).split('(')[0]
    return clf_descr,train_time,test_time
def select_features(clf, x_train, y_train, columns, num_folds, step=19, random_state=0):
    """
    automatic tuning of the number of features selected with cross-validation.

    :param clf: estimator
    :param x_train: training features
    :param y_train: training targets
    :param columns: column names, positionally aligned with ``x_train``
    :param num_folds: number of folds of the shuffled ``KFold``
    :param step: forwarded to ``RFECV``
    :param random_state: seed of the ``KFold`` shuffle
    :return: the fitted rfecv object
    """
    print('================= select_features ================')
    cvObj = KFold(len(y_train), n_folds=num_folds, shuffle=True,
                  random_state=random_state)
    rfecv = RFECV(estimator=clf, step=step, cv=cvObj, scoring=scorer,
                  verbose=2)
    rfecv.fit(x_train, y_train)

    print('------------ Results: ----------------')
    print('>>>> Optimal number of features : %d' % rfecv.n_features_)
    print('>>>> grid scores:')
    pprint(rfecv.grid_scores_)
    print('>>>> ranking of columns:')
    # ranking_[i] is the rank of column i, not a position.  Indexing with
    # `ranking_ - 1` therefore repeated column 0 for every selected feature.
    # A stable argsort lists the columns from best rank to worst.
    order = np.argsort(rfecv.ranking_, kind='mergesort')
    pprint(np.array(columns)[order])

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
    return rfecv
def main():
    """Load train/test files, preprocess them and run RFECV on the train set.

    Pipeline: drop the ID column, remove zero-variance features, normalise
    the test set with the training mean/std, then fit ``RFECV`` around a
    linear ``SVC`` (10-fold stratified CV, accuracy) and print the optimal
    feature count.
    """
    args = getOptions()
    print(args)

    print("train file read")
    train_x, train_y = readfile_noid(args.train, 'train')
    # The extracted IDs are not needed; `_` also avoids shadowing builtin id.
    train_x_new, _ = extractID(train_x)

    print("test file read")
    test_x, _ = readfile_noid(args.test, 'test')
    test_x_new, _ = extractID(test_x)

    # remove feature with no distinction and less important
    print("remove feature with no distinction and less important")
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)

    # normalization
    print("normalization")
    train_x_nor, mean, std = normalize(train_x_uniq)
    # Re-use the training statistics for the test set.
    test_x_nor, _, _ = normalize(test_x_uniq, mean, std)

    # feature selection
    print("feature selection")
    svc = SVC(kernel="linear")
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(train_y, 10),
                  scoring='accuracy')
    rfecv.fit(train_x_nor, train_y)
    print("Optimal number of features : %d" % rfecv.n_features_)
def run_rfecv(X, y, clf_class, **kwargs):
    """Instantiate ``clf_class(**kwargs)``, run RFECV on it and plot the result.

    :param X: training features.
    :param y: training labels (also used to build the 2-fold stratified CV).
    :param clf_class: estimator class to instantiate.
    :param kwargs: constructor arguments for ``clf_class``.
    :return: the fitted ``RFECV`` object.
    """
    clf = clf_class(**kwargs)
    rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y, 2),
                  scoring='accuracy')
    rfecv.fit(X, y)
    plot_rfcev(rfecv)
    # print() with one pre-formatted string behaves the same on Python 2 and 3.
    print("Optimal number of features : {0} for model: {1}".format(
        rfecv.n_features_, clf_class))
    return rfecv
def get_top_features(windows_data_frame, drop_only_almost_positives = False, drop_duplicates = True,
                     transformer = DEFAULT_TRANSFORMER,
                     classifier = RFECV_FEATURE_SELECTION_DEFAULT_CLASSIFIER, n_folds = 3, step = 0.05,
                     scoring = 'f1'):
    '''
    Find the top features of a windows data frame with sklearn.feature_selection.RFECV.
    @param windows_data_frame (pandas.DataFrame): A data frame of the windows' CSV.
    @param drop_only_almost_positives (boolean, default False): Forwarded to get_windows_data.
    @param drop_duplicates (boolean, default True): Forwarded to get_windows_data.
    @param transformer (default DEFAULT_TRANSFORMER): Forwarded to get_windows_data.
    @param classifier (default RFECV_FEATURE_SELECTION_DEFAULT_CLASSIFIER): The estimator given to RFECV.
    @param n_folds (int, default 3): Number of folds of the shuffled, seeded stratified k-fold used by RFECV.
    @param step (default 0.05): See sklearn.feature_selection.RFECV
    @param scoring (default 'f1'): See sklearn.feature_selection.RFECV
    @return: The features whose RFECV support mask is set, as produced by util.apply_mask.
    '''
    features, X, y = get_windows_data(
        windows_data_frame,
        drop_only_almost_positives,
        drop_duplicates,
        transformer,
    )
    folds = StratifiedKFold(y, n_folds = n_folds, shuffle = True, random_state = SEED)
    selector = RFECV(estimator = classifier, cv = folds, step = step, scoring = scoring)
    selector.fit(X, y)
    support_mask = selector.support_
    return util.apply_mask(features, support_mask)
def featureSelection(train_x, train_y):
    """Report the features chosen by both RandomizedLasso and RFECV.

    Fits a ``RandomizedLasso`` and an ``RFECV`` (balanced ``LinearSVC``,
    5-fold, accuracy) on the same data, prints the features that are ranked 1
    by RFECV and also supported by the lasso, then plots the RFECV score
    curve.  Feature names come from the module-level ``feats`` sequence.

    :param train_x: training features.
    :param train_y: training labels.
    """
    svc = LinearSVC(C=1, class_weight='balanced')

    lasso = RandomizedLasso()
    lasso.fit(train_x, train_y)

    rfecv = RFECV(estimator=svc, step=1, cv=5, scoring='accuracy')
    rfecv.fit(train_x, train_y)
    print("Optimal number of features : %d" % rfecv.n_features_)

    rankings = rfecv.ranking_
    lasso_ranks = lasso.get_support()
    # Keep only features both selectors agree on.
    shouldUseFeats = [feats[i] for i, rank in enumerate(rankings)
                      if rank == 1 and lasso_ranks[i]]
    # The leftover interactive `keyboard()` debugger hook was removed so the
    # function can run unattended.
    print('Should use ' + ', '.join(shouldUseFeats))

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
def plot_RFE(X,y):
    """Reduce ``X`` with RFECV around an L1 ``LinearSVC`` and plot the scores.

    :param X: training features.
    :param y: training labels (also used for the 2-fold stratified CV).
    :return: ``(X_RFE, rfecv)`` -- the reduced matrix and the fitted selector.
    """
    # Only the imports this function actually uses are kept; the original
    # also imported pylab twice under the same alias.
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    import matplotlib.pylab as pl

    svc = LinearSVC(penalty='l1', loss='l2', dual=False, class_weight='auto',
                    multi_class='ovr')
    rfecv = RFECV(estimator=svc, step=0.2, cv=StratifiedKFold(y, 2),
                  scoring='f1')
    X_RFE = rfecv.fit_transform(X, y)
    print("Optimal number of features in X_RFE : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    pl.figure()
    pl.xlabel("Number of features selected")
    # The curve is the f1 score requested above, not a misclassification count.
    pl.ylabel("Cross validation score (f1)")
    pl.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    pl.show()

    print ('RFE Opt.shapes features CV score:')
    CV_multi_stats(X_RFE, y, svc)
    return (X_RFE,rfecv)
def feature_selection_RFE(fn ,ax=None, sel="all", goal="Referee", verbosity=0, nf=7):
    """Return the feature names ranked 1 by RFECV.

    :param fn: forwarded to ``data_prepare``.
    :param ax: optional matplotlib axes for the score curve.  When ``None``
        no plot is drawn (the previous code raised ``AttributeError`` here).
    :param sel: forwarded to ``data_prepare``.
    :param goal: forwarded to ``data_prepare``.
    :param verbosity: values above 1 print the feature names;
        ``verbosity - 1`` is forwarded to ``data_prepare``.
    :param nf: unused.
    :return: ``names[rfecv.ranking_ == 1]``.
    """
    X, y, names = data_prepare(fn, sel=sel, goal=goal, verbosity=verbosity-1)
    if verbosity > 1:
        print ("names:", ",".join(names))

    estimator = get_clf('svm')
    scoring = 'f1'
    cv = cross_validation.StratifiedKFold(y, 2)
    # An alternative selector (kgml.rfecv.RFECVp) used to sit behind a dead
    # `if True: ... else:` branch; only the live path is kept.
    rfecv = RFECV(estimator=estimator, step=1, cv=cv, scoring=scoring)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        rfecv.fit(X, y)

    # Plot number of features VS. cross-validation scores
    if ax is not None:
        ax.set_xlabel("Number of features selected")
        ax.set_ylabel("Cross validation score ({})".format(scoring))
        ax.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

    best = names[rfecv.ranking_==1]
    return best
def recursiveFeatureElimination():
    """Run RFECV for every point of interest and plot one subplot per point.

    For each entry returned by ``getPointsOfInterest`` the data is loaded
    with ``loadData``, an ``RFECV`` around a linear ``SVR`` is fitted, and
    the score curve is drawn in its own subplot.

    NOTE(review): a regressor (SVR) is combined with ``StratifiedKFold`` and
    ``scoring='accuracy'``; confirm that ``y`` holds discrete labels.
    """
    with DB() as db:
        POIs = getPointsOfInterest()
        # The previous grid, int(sqrt(n)) x (int(sqrt(n)) + 1), could hold
        # fewer cells than there are points (n=3 gave 1x2).  Derive the
        # column count from the row count so every point gets a cell.
        numRows = max(1, int(math.sqrt(len(POIs))))
        numCols = max(1, int(math.ceil(len(POIs) / float(numRows))))

        plt.figure()
        plt.subplots_adjust(left=None, bottom=None, right=None, top=None,
                            wspace=0.5, hspace=0.5)
        fignum = 1
        for POI in POIs:
            x, y = loadData(db, POI['LAT'], POI['LONG'], generateAllFeatures)
            x, y = np.array(x), np.array(y)

            svr = SVR(kernel="linear")
            rfecv = RFECV(estimator=svr, step=1, cv=StratifiedKFold(y, 2),
                          scoring='accuracy')
            rfecv.fit(x, y)
            print("Optimal number of features : %d" % rfecv.n_features_)

            # Plot number of features VS. cross-validation scores
            plt.subplot(numRows, numCols, fignum)
            plt.title(POI['NAME'])
            plt.xlabel("Number of features selected")
            plt.ylabel("Cross validation score (nb of misclassifications)")
            plt.plot(range(1, len(rfecv.grid_scores_) + 1),
                     rfecv.grid_scores_)
            fignum += 1
        plt.show()
def plot_rfe(X,label):
    """Fit RFECV (linear SVC, 2-fold, accuracy) for ``label`` and plot scores.

    ``X`` is a frame holding the features plus the columns ``'churn'``,
    ``'appetency'`` and ``'upselling'``; those three and ``label`` are
    dropped from the predictors, and ``X[label]`` is the target.
    """
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV

    target = X[label]
    non_features = ['churn', 'appetency', 'upselling', label]
    predictors = X.drop(non_features, axis='columns')

    folds = StratifiedKFold(target, 2)
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=folds,
                     scoring='accuracy')
    selector.fit(predictors, target)
    print("Optimal number of features : %d" % selector.n_features_)

    # One point per entry of grid_scores_.
    xs = range(1, len(selector.grid_scores_) + 1)
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(xs, selector.grid_scores_)
    plt.show()
def RFE_featureSelection(X_train,Y_train):
    """Select columns with RFECV around an L1 logistic regression.

    The data is first reduced by ``randomSampling.getRandomSample`` and
    re-indexed, then RFECV (5-fold, recall scoring) is fitted and its score
    curve plotted.

    :param X_train: training features (indexable by ``.columns``).
    :param Y_train: training labels.
    :return: the columns of the sampled ``X_train`` kept by RFECV.
    """
    ## Sampling
    RSObj = randomSampling.randomSampling()
    (X_train, Y_train) = RSObj.getRandomSample(X_train, Y_train)
    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    ## Select classifier and parameters
    logistic = linear_model.LogisticRegression(
        C=10, class_weight=None, dual=False, fit_intercept=True,
        intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
        penalty='l1', random_state=None, solver='liblinear', tol=0.01,
        verbose=0, warm_start=False)

    ## Initialise and fit RFE
    rfecv = RFECV(estimator=logistic, step=1, cv=5, scoring='recall')
    rfecv.fit(X_train, Y_train)

    ## Selected Features
    print("Optimal number of features : %d" % rfecv.n_features_)

    ## Plot importance
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    selected_columns = X_train.columns[list(rfecv.support_)]
    # `.format` must be applied to the string, not to print()'s return value
    # (which is None on Python 3).
    print('\n Selectd Columns : {0}'.format(selected_columns))
    return selected_columns
def main():
    """Run RFECV on the features stored on disk and save support/ranking.

    Paths are read from ``SETTINGS.json`` (keys ``trainpos_features``,
    ``trainneg_features``, ``selection_result``).  Rows of the negative file
    get target 0, rows of the positive file target 1; the first two CSV
    fields of every row are skipped.
    """
    print("Loading paths")
    with open("SETTINGS.json") as settings_file:
        paths = json.loads(settings_file.read())

    print("Getting features for deleted papers from the disk files")
    with open(paths["trainpos_features"]) as pos_file:
        features_conf = [feature for feature in csv.reader(pos_file)]
    with open(paths["trainneg_features"]) as neg_file:
        features_deleted = [feature for feature in csv.reader(neg_file)]

    # A list comprehension instead of map(): on Python 3 map() is lazy and
    # np.array() would otherwise build an array of map objects.
    features = np.array([[float(v) for v in x[2:]]
                         for x in features_deleted + features_conf])
    target = np.array([0] * len(features_deleted) + [1] * len(features_conf))

    classifier = LinearSVC(penalty='l2', loss='l2', dual=True, tol=0.0001,
                           C=1.0, multi_class='ovr', fit_intercept=True,
                           intercept_scaling=1, class_weight=None, verbose=0,
                           random_state=None)

    print("Start feature selection")
    selector = RFECV(classifier, step=1, cv=5)
    print(features.shape)
    selector = selector.fit(features, target)

    print("Ouput feature selection results")
    print(selector.support_)
    print(selector.ranking_)

    # `with` guarantees the result file is flushed and closed.
    with open(paths["selection_result"], "w") as result_file:
        writer = csv.writer(result_file)
        writer.writerow(selector.support_.tolist())
        writer.writerow(selector.ranking_.tolist())
def select_features(estimator, X_train, y_train):
    """Rank features with RFECV (step 1, 5-fold CV) around ``estimator``.

    The caller's estimator used to be replaced unconditionally by
    ``SVR(kernel="rbf")``, which exposes no ``coef_`` and therefore cannot
    drive recursive elimination.  The given estimator is now used, with a
    linear SVR as fallback when ``None`` is passed.

    :param estimator: estimator exposing ``coef_`` or
        ``feature_importances_`` after fitting, or ``None``.
    :param X_train: training features.
    :param y_train: training targets.
    :return: ``selector.ranking_``.
    """
    log.info("Selecting best features:")
    if estimator is None:
        estimator = SVR(kernel="linear")
    selector = RFECV(estimator, step=1, cv=5)
    selector = selector.fit(X_train, y_train)
    return selector.ranking_
def optimalFeatures(train,target):
    """Fit RFECV (linear SVC, 3-fold stratified CV) and return the selector."""
    folds = StratifiedKFold(target, n_folds=3)
    selector = RFECV(SVC(kernel='linear'), cv=folds)
    selector.fit(train, target)
    print("Optimal number of features : %d" % selector.n_features_)
    return selector
def select_features(data, class_attribute):
    """Fit RFECV (LinearSVC, 10-fold, f1) on tabular ``data``.

    The first row of ``data`` is skipped.  Fields from index 3 onward of
    every remaining row are converted to float and min-max scaled; the label
    is 0 when ``row[class_attribute] == 'low'`` and 1 otherwise.

    Returns the fitted RFECV object.
    """
    raw_features = []
    for row in data[1:]:
        raw_features.append([float(v) for v in row[3:]])
    scaler = preprocessing.MinMaxScaler()
    X = scaler.fit_transform(raw_features)

    y = []
    for row in data[1:]:
        if row[class_attribute] == 'low':
            y.append(0)
        else:
            y.append(1)
    print("Loaded %d sessions" % len(y))

    folds = StratifiedKFold(y, 10)
    selector = RFECV(estimator=LinearSVC(class_weight='auto'), step=1,
                     cv=folds, scoring='f1')
    print("RFE in progress")
    return selector.fit(X, y)
def feature_selection(data_matrix, target):
    """Reduce ``data_matrix`` in two passes and return the reduced matrix.

    First the elastic-net ``SGDClassifier`` itself transforms the data via
    ``fit_transform``; then RFECV (step 0.1, 10-fold) around the same
    estimator reduces the result further.

    NOTE(review): ``fit_transform`` on a classifier is only available on
    older scikit-learn releases -- confirm the pinned version.
    """
    from sklearn.feature_selection import RFECV
    from sklearn.linear_model import SGDClassifier

    sgd = SGDClassifier(average=True, shuffle=True, penalty='elasticnet')
    # first pass: transformation by the fitted linear model
    first_pass = sgd.fit_transform(data_matrix, target)
    # second pass: recursive feature elimination
    rfe_selector = RFECV(sgd, step=0.1, cv=10)
    return rfe_selector.fit_transform(first_pass, target)
def selectBestFeaturesRFECV(samples, classifications, featureNames, classifierClass):
    """Reduce ``samples`` with RFECV and return the kept data and names.

    The estimator comes from ``classifierClass.getEstimator()``.  Samples
    are routed through a CSR matrix and densified before fitting.

    Returns ``[reduced_samples, kept_feature_names]``.
    """
    selector = RFECV(classifierClass.getEstimator())
    as_sparse = samples if sprs.issparse(samples) else sprs.csr_matrix(samples)
    reduced = selector.fit_transform(as_sparse.toarray(), classifications)

    kept_names = []
    for position, is_kept in enumerate(selector.get_support()):
        if is_kept:
            kept_names.append(featureNames[position])
    return [reduced, kept_names]
def main():
    """Select features for the loss regression and save the support mask.

    Loads ``data/x_train.npy`` and ``data/loss.npy``, keeps the rows with a
    positive loss, fits RFECV around an epsilon-insensitive ``SGDRegressor``
    on the log of the loss, and stores ``support_`` in
    ``features/reg_sel_sgd_eps.npy``.
    """
    xtrain = np.load('data/x_train.npy')
    ytrainreg = np.load('data/loss.npy')

    # Only rows with a strictly positive loss are used (the log is taken below).
    positive = ytrainreg > 0
    xtrain = xtrain[positive]
    ytrainreg = ytrainreg[positive]

    reg1 = linear_model.SGDRegressor(loss='epsilon_insensitive',
                                     random_state=0, n_iter=5)
    selector1 = RFECV(estimator=reg1, scoring='mean_squared_error',
                      verbose=10)
    selector1.fit(xtrain, np.log(ytrainreg))  # training on the log of the loss

    # Single formatted string: same output on Python 2 and 3.
    print("sel1, optimal number of features: %s" % (selector1.n_features_,))
    np.save('features/reg_sel_sgd_eps.npy', selector1.support_)
def cls_create(xs, ys):
    """Fit and return an RFECV selector around LDA (step 50, 3-fold).

    The loss handed to RFECV is ``1 - f1``, where f1 is the third value
    returned by ``rpf1``.

    NOTE(review): ``loss_func`` is an RFECV argument of old scikit-learn
    releases -- confirm the pinned version still accepts it.
    """
    def score_fn(expected, actual):
        _, _, f1_value = rpf1(expected, actual)
        return 1.0 - f1_value

    rfe_selector = RFECV(LDA(), step=50, cv=3, loss_func=score_fn)
    return rfe_selector.fit(xs, ys)
def selectFeatures (clf, X, Y):
    """Return the RFECV-selected columns of ``X`` and their indices.

    RFECV runs with step 1, 5-fold stratified CV and accuracy scoring.  The
    indices are whatever ``find(mask, True)`` yields for the support mask.
    """
    folds = StratifiedKFold(Y, 5)
    selector = RFECV(estimator=clf, step=1, cv=folds, scoring='accuracy')
    selector.fit(X, Y)
    support_mask = selector.get_support()
    indices = find(support_mask, True)
    reduced = X[:, indices]
    return reduced, indices
def main():
    """Run RFECV on ``./hotness_features_classes.csv`` and print the support.

    The first field of every row is the target, the remaining fields are the
    features; features are standardised with ``preprocessing.scale`` before
    a linear SVC is wrapped in RFECV (step 1, 10-fold).
    """
    # read in data, parse into training and target sets
    data = csv_io.read_data("./hotness_features_classes.csv")
    target = np.array([x[0] for x in data])
    train = np.array([x[1:] for x in data])
    train_scaled = preprocessing.scale(train)

    clf = SVC(kernel='linear')
    selector = RFECV(clf, step=1, cv=10)
    selector = selector.fit(train_scaled, target)
    # print() form works on both Python 2 and Python 3.
    print(selector.support_)
def recursive_fs_cv(X, y, clf):
    """Fit RFECV (step 1, 5-fold) around ``clf``, print the fit time, return it."""
    selector = RFECV(clf, step=1, cv=5)
    started = time.time()
    selector = selector.fit(X, y)
    elapsed = time.time() - started
    print ("Training Time: " + str(elapsed) + "s")
    return selector
def featureSelection(X,y):
    """Return the columns of ``X`` selected by RFECV around a random forest.

    :param X: feature frame (must expose ``.columns``).
    :param y: training labels.
    :return: ``X.columns[rfecv.get_support()]``.
    """
    class RandomForestClassifierWithCoef(RandomForestClassifier):
        """Random forest exposing ``feature_importances_`` as ``coef_``.

        Lets RFECV rank features on versions that only look at ``coef_``.
        """

        def fit(self, *args, **kwargs):
            super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
            self.coef_ = self.feature_importances_
            # Estimator convention: fit() returns the estimator.  The
            # override used to return None.
            return self

    randfor = RandomForestClassifierWithCoef(n_estimators=35)
    rfecv = RFECV(estimator=randfor, step=1, cv=5, scoring='accuracy',
                  verbose=2)
    rfecv.fit(X, y)
    return X.columns[rfecv.get_support()]
def test_rfecv():
    """Check RFECV on iris padded with six noise columns.

    Covers dense and sparse input, custom scorers, tie-breaking towards the
    fewest features, and integer / fractional ``step`` values.
    """
    rng = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, rng.normal(size=(len(iris.data), 6))]
    y = list(iris.target)  # regression test: list should be supported
    X_sparse = sparse.csr_matrix(X)
    n_columns = X.shape[1]

    def linear_rfecv(**options):
        # Every case wraps a fresh linear SVC.
        return RFECV(estimator=SVC(kernel="linear"), **options)

    def check_dense(selector):
        # All the noisy variables must be filtered out.
        assert_array_equal(selector.transform(X), iris.data)

    def check_sparse(selector):
        assert_array_equal(selector.transform(X_sparse).toarray(), iris.data)

    # Test using the score function
    rfecv = linear_rfecv(step=1)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert len(rfecv.grid_scores_) == n_columns
    assert len(rfecv.ranking_) == n_columns
    check_dense(rfecv)

    # same in sparse
    rfecv_sparse = linear_rfecv(step=1)
    rfecv_sparse.fit(X_sparse, y)
    check_sparse(rfecv_sparse)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = linear_rfecv(step=1, scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    check_dense(rfecv)

    # Test using a scorer
    scorer = get_scorer('accuracy')
    rfecv = linear_rfecv(step=1, scoring=scorer)
    rfecv.fit(X, y)
    check_dense(rfecv)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0

    rfecv = linear_rfecv(step=1, scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))
    # In the event of cross validation score ties, the expected behavior of
    # RFECV is to return the FEWEST features that maximize the CV score.
    # Because test_scorer always returns 1.0 in this example, RFECV should
    # reduce the dimensionality to a single feature (i.e. n_features_ = 1)
    assert rfecv.n_features_ == 1

    # Same as the first two tests, but with step=2
    rfecv = linear_rfecv(step=2)
    rfecv.fit(X, y)
    assert len(rfecv.grid_scores_) == 6
    assert len(rfecv.ranking_) == n_columns
    check_dense(rfecv)

    rfecv_sparse = linear_rfecv(step=2)
    rfecv_sparse.fit(X_sparse, y)
    check_sparse(rfecv_sparse)

    # Verifying that steps < 1 don't blow up.
    rfecv_sparse = linear_rfecv(step=.2)
    rfecv_sparse.fit(X_sparse, y)
    check_sparse(rfecv_sparse)
df_step5 = df_gini
df_step5.info()  # class, memory and column summary
df_step5.head()  # first 5 observations

### Step 6: Recursive Feature Elimination

### Collect the features present in both the PC and the gini tables
df_pc_gini = pd.merge(df_pc, df_gini, how="inner", on="Features")  # inner join keeps only shared features
pc_gini_features = df_pc_gini["Features"].tolist()
df_rfecv = df_step3[pc_gini_features]

### Setup RFE model
X = df_rfecv  # predictors
Y = df_step3["outcome"]  # target
RFE = LinearRegression()  # regression coefficients drive the elimination
selector = RFECV(estimator=RFE, min_features_to_select=5)  # never go below 5 features

### Fit RFE model
selected = selector.fit(X, Y)  # this will take time

### Collect features from RFE model
ar_rfe = selected.support_  # boolean mask, one entry per column of X
l_rfe = list(zip(X, ar_rfe))  # (column name, selected?) pairs
df_rfe = pd.DataFrame(l_rfe, columns=["Features", "RFE"])
# Keep the selected rows, renumber them, and retain only the name column.
df_rfe = df_rfe[df_rfe.RFE == True].reset_index().filter(["Features"])

### Rename and Verify
df_step6 = df_rfe
df_step6.info()  # class, memory and column summary
def default_pipeline(self, name, n_pca=10, n_best=10, lda_shrink=10, svm_C=10, svm_gamma=10, fdr_alpha=[0.05], fpr_alpha=[0.05]):
    """Build a default pipeline and parameter grid from keywords in `name`.

    The keywords found in ``name.lower()`` decide which steps are added.
    The result is stored on the instance as ``self.pipeline``, ``self.grid``,
    ``self.combine`` and ``self._pipename``; nothing is returned.

    Args:
        name: string
            The string for building a default pipeline (see examples below).
            Recognised substrings: 'scale', 'lda', 'svm', 'linear', 'rbf',
            'optimized', 'kernel', '_c_', 'gamma', 'fdr', 'fpr', 'pca',
            'kbest', 'rfecv'.

    Kargs:
        n_pca: integer, optional, (def: 10)
            The number of components to search (grid is 1..n_pca)

        n_best: integer, optional, (def: 10)
            Number of best features to consider using a statistical method
            (grid is 1..n_best)

        lda_shrink: integer, optional, (def: 10)
            Number of shrinkage values, evenly spaced in [0, 1], tried for
            the optimized lda

        svm_C/svm_gamma: integer, optional, (def: 10/10)
            Number of base-2 log-spaced values tried for the svm C and gamma

        fdr/fpr_alpha: list, optional, (def: [0.05])
            List of float for selecting features using a fdr or fpr

    Examples:
        >>> # Basic classifiers :
        >>> name = 'lda' # or name = 'svm_linear' for a linear SVM
        >>> # Combine a classifier with a feature selection method :
        >>> name = 'lda_fdr_fpr_kbest_pca'
        >>> # The method above will use an LDA for the features evaluation
        >>> # and will combine a FDR, FPR, k-Best and pca feature selection.
        >>> # Now we can combine with classifier optimisation :
        >>> name = 'lda_optimized_pca' # will try to optimize an LDA with a pca
        >>> name = 'svm_kernel_C_gamma_kbest' # optimize a SVM by trying
        >>> # different kernels (linear/RBF), and optimize C and gamma parameters
        >>> # combine with a k-Best features selection.
    """
    # NOTE(review): `fdr_alpha` / `fpr_alpha` are mutable list defaults. They
    # are only stored in the grid here, never mutated, but the same list
    # object is shared between calls.
    # ----------------------------------------------------------------
    # DEFINED COMBINORS
    # ----------------------------------------------------------------
    pca = PCA()
    selection = SelectKBest()
    scaler = StandardScaler()
    fdr = SelectFdr()
    fpr = SelectFpr()

    # ----------------------------------------------------------------
    # RANGE DEFINITION
    # ---------------------------------------------------------
    pca_range = np.arange(1, n_pca + 1)
    kbest_range = np.arange(1, n_best + 1)
    C_range = np.logspace(-5, 15, svm_C, base=2.)
    gamma_range = np.logspace(-15, 3, svm_gamma, base=2.)
    # Check range : fall back to a single value when a range came out empty
    if not kbest_range.size: kbest_range = [1]
    if not pca_range.size: pca_range = [1]
    if not C_range.size: C_range = [1.]
    if not gamma_range.size: gamma_range = ['auto']

    # ----------------------------------------------------------------
    # DEFINED PIPELINE ELEMENTS
    # ----------------------------------------------------------------
    pipeline = []
    grid = {}
    combine = []

    # ----------------------------------------------------------------
    # BUILD CLASSIFIER
    # ----------------------------------------------------------------
    # -> SCALE :
    if name.lower().find('scale') != -1:
        pipeline.append(("scaler", scaler))

    # -> LDA :
    # NOTE(review): `clf` is only bound when `name` contains 'lda' or 'svm';
    # any other name reaches the code below with `clf` undefined.
    # NOTE(review): `1 / self._nclass` is integer division on Python 2 unless
    # true division is enabled for this module -- confirm.
    if name.lower().find('lda') != -1:
        # Default :
        if name.lower().find('optimized') == -1:
            clf = LinearDiscriminantAnalysis(
                priors=np.array([1 / self._nclass] * self._nclass))
        # Optimized :
        elif name.lower().find('optimized') != -1:
            clf = LinearDiscriminantAnalysis(priors=np.array(
                [1 / self._nclass] * self._nclass), solver='lsqr')
            grid['clf__shrinkage'] = np.linspace(0., 1., lda_shrink)

    # -> SVM :
    elif name.lower().find('svm') != -1:
        # Linear/RBF standard kernel :
        if name.lower().find('linear') != -1:
            kwargs = {'kernel': 'linear'}
        elif name.lower().find('rbf') != -1:
            kwargs = {'kernel': 'rbf'}
        else:
            kwargs = {}
        # Optimized :
        if name.lower().find('optimized') != -1:
            # Kernel optimization :
            if name.lower().find('kernel') != -1:
                grid['clf__kernel'] = ('linear', 'rbf')
            # C optimization :
            if name.lower().find('_c_') != -1:
                grid['clf__C'] = C_range
            # Gamma optimization :
            if name.lower().find('gamma') != -1:
                grid['clf__gamma'] = gamma_range
        clf = SVC(**kwargs)

    # ----------------------------------------------------------------
    # BUILD COMBINE
    # ----------------------------------------------------------------
    # -> FDR :
    if name.lower().find('fdr') != -1:
        combine.append(("fdr", fdr))
        grid['features__fdr__alpha'] = fdr_alpha

    # -> FPR :
    if name.lower().find('fpr') != -1:
        combine.append(("fpr", fpr))
        grid['features__fpr__alpha'] = fpr_alpha

    # -> PCA :
    if name.lower().find('pca') != -1:
        combine.append(("pca", pca))
        grid['features__pca__n_components'] = pca_range

    # -> kBest :
    if name.lower().find('kbest') != -1:
        combine.append(("kBest", selection))
        grid['features__kBest__k'] = kbest_range

    # -> RFECV : wraps the classifier chosen above
    if name.lower().find('rfecv') != -1:
        rfecv = RFECV(clf)
        combine.append(("RFECV", rfecv))

    # if combine is empty, select all features :
    if not len(combine):
        combine.append(("kBest", SelectKBest(k='all')))

    self.combine = FeatureUnion(combine)

    # ----------------------------------------------------------------
    # SAVE PIPELINE
    # ----------------------------------------------------------------
    # Build ordered pipeline (combine is never empty at this point, because
    # of the fallback above) :
    if len(combine):
        pipeline.append(("features", self.combine))
    pipeline.append(("clf", clf))

    # Save pipeline :
    self.pipeline = Pipeline(pipeline)
    self.grid = grid
    self._pipename = name
def rfecv_features(X, y, rfecv_params):
    """
    Select features with recursive feature elimination and cross-validation.

    Builds an sklearn ``RFECV`` object from ``rfecv_params``, fits it on
    ``X`` and ``y`` and returns the labels of the columns flagged in the
    fitted selector's ``support_`` mask.

    Parameters
    ----------
    X : pandas dataframe
        A data set where each row is an observation and each column a
        feature. Its ``columns`` attribute is used to look up the labels.
    y : numpy array
        A numpy array containing the targets.
    rfecv_params : dict
        Keyword arguments used to initialize the RFECV sklearn class.

    Returns
    -------
    labels : list
        The labels of the columns of ``X`` selected by RFECV.
    feature_selector : RFECV
        The fitted RFECV object.

    Examples
    --------
    # Initialize estimator
    estimator = RandomForestClassifier()

    # Define RFECV parameters
    rfecv_params = {'estimator': estimator,
                    'cv': 2,
                    'step': 1,
                    'scoring': 'accuracy',
                    'verbose': 50}

    # Get rfecv feature labels and the fitted selector
    labels, feature_selector = rfecv_features(X=X, y=y,
                                              rfecv_params=rfecv_params)

    References
    ----------
    https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
    """
    # Initialize and fit the RFECV object
    feature_selector = RFECV(**rfecv_params)
    feature_selector.fit(X, y)

    # Map the boolean support mask back to the column labels of X
    feature_labels = X.columns
    labels = feature_labels[feature_selector.support_].tolist()

    return labels, feature_selector
# meanwhile, we split the traindata as well to loop so that largely reduced overfitting. from sklearn.model_selection import RepeatedStratifiedKFold from sklearn.feature_selection import RFECV from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score, r2_score from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances # divide the training dataset into splits folds, using len(splits)-1 for training, and 1 for validation. # repeat such dividing for 'repeats' times. The overall outputs fit model is splits*repeats rskf = RepeatedStratifiedKFold(n_splits=splits, n_repeats=repeats, random_state=random_seed) # rskf=StratifiedShuffleSplit(n_splits=splits, test_size=test_size, # random_state=random_seed) # select the important features for a specific estimator.create a feature selector feature_selector = RFECV(gridsearch.best_estimator_, step=steps, cv=cv, scoring=myScorer) # start loop for the model fitting for every single split. total loop number: splits*repeats counter = 0 predictions = pd.DataFrame() print( "counter,mse, mae, auc,r2, grid_search.best_score_, feature_selector.n_features_, message " ) for train_index, val_index in rskf.split(X_train, y_train): # train a model for every single split. #select train_index rows of data for training X, X_val = X_train[train_index], X_train[val_index] y, y_val = y_train[train_index], y_train[val_index] # # fit the RFE model and automatically tune the number of selected.
def predict_features(self, df_features, df_target, idx=0, **kwargs):
    """Run RFECV with a linear SVR and return its cross-validation scores.

    Parameters
    ----------
    df_features : pandas dataframe
        Feature matrix; converted to a numpy array before fitting.
    df_target : pandas dataframe
        Target frame; only its first column is used.
    idx : int
        Not used by this implementation.
    **kwargs
        Not used by this implementation.

    Returns
    -------
    The selector's mean cross-validation score for each evaluated number
    of features.
    """
    estimator = SVR(kernel='linear')
    selector = RFECV(estimator, step=1)
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
    # yields the same ndarray on old and new pandas alike.
    selector = selector.fit(df_features.values, df_target.values[:, 0])
    # ``grid_scores_`` was removed in scikit-learn 1.2 in favour of
    # ``cv_results_``; fall back to the old attribute on older releases.
    if hasattr(selector, 'cv_results_'):
        return selector.cv_results_['mean_test_score']
    return selector.grid_scores_
# ### **Recursive feature elimination with cross validation and random forest classification** # Now, we will find how many atributtes do we need for best accuracy # In[ ]: X = train_df.drop(['Survived'], axis=1) y = train_df.Survived X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # In[ ]: rf = RandomForestClassifier() rfecv = RFECV(estimator=rf, step=1, cv=5, scoring='accuracy') #5-fold cross-validation rfecv = rfecv.fit(X_train, y_train) print('Optimal number of features :', rfecv.n_features_) print('Best features :', X_train.columns[rfecv.support_]) # In[ ]: X = train_df.drop(['Survived', 'Embarked'], axis=1) # ## **Model selection** # Ok, we have many models and algorithms. First, we gonna try with all features. # In[ ]:
# Dictionary mapping the RFE score at the selected feature count to the
# index of the C value that produced it
dict_1 = {}
# Dictionary mapping that index to the fitted RFECV object
dict_2 = {}

time_prebucle = time.time()

# Iterate over the candidate values of C
for i, c in enumerate(C):
    time_temp1 = time.time()
    clf_temp = SVC(C=c, kernel=kernel, class_weight=class_weight,
                   random_state=random_state)
    rfecv_temp = RFECV(clf_temp, cv=skf, scoring=scoring)
    rfecv_temp.fit(X, y)
    # ``grid_scores_`` was removed in scikit-learn 1.2; ``cv_results_``
    # carries the same mean scores on newer releases.
    if hasattr(rfecv_temp, 'cv_results_'):
        scores_temp = rfecv_temp.cv_results_['mean_test_score']
    else:
        scores_temp = rfecv_temp.grid_scores_
    # With the default step of 1, entry k holds the score obtained with
    # k + 1 features, so the score of the selected subset sits at
    # n_features_ - 1. Indexing with n_features_ read the neighbouring
    # score and raised IndexError when every feature was kept.
    dict_1[scores_temp[rfecv_temp.n_features_ - 1]] = i
    dict_2[i] = rfecv_temp
    time_temp2 = time.time()
    print(f'Time iteration {i}: {time_temp2-time_temp1}')

time_bucle = time.time()
print(f'Time loop: {time_bucle-time_prebucle}')

# Pick the RFECV object with the highest score; on equal scores the
# later C value wins because it overwrites the dictionary entry.
maximo = max(dict_1)
indice_maximo = dict_1[maximo]
rfecv = dict_2[indice_maximo]
best_c = rfecv.estimator_.get_params()['C']
print(f'Best C: {best_c}')
def test_RFECV():
    """Smoke-test RFECV around gblinear XGBoost models.

    Fits RFECV for a regression, a binary classification and a
    multi-class classification problem; the test passes when none of the
    fits raises.
    """
    from sklearn.datasets import load_breast_cancer
    from sklearn.datasets import load_diabetes
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import RFECV

    # Regression
    # load_boston was removed in scikit-learn 1.2; load_diabetes is a
    # bundled regression dataset available on old and new releases.
    X, y = load_diabetes(return_X_y=True)
    # A regression objective and scorer need the regressor wrapper, not
    # XGBClassifier.
    bst = xgb.XGBRegressor(booster='gblinear', learning_rate=0.1,
                           n_estimators=10, n_jobs=1,
                           objective='reg:squarederror',
                           random_state=0, verbosity=0)
    rfecv = RFECV(
        estimator=bst, step=1, cv=3, scoring='neg_mean_squared_error')
    rfecv.fit(X, y)

    # Binary classification
    X, y = load_breast_cancer(return_X_y=True)
    bst = xgb.XGBClassifier(booster='gblinear', learning_rate=0.1,
                            n_estimators=10, n_jobs=1,
                            objective='binary:logistic',
                            random_state=0, verbosity=0)
    rfecv = RFECV(estimator=bst, step=1, cv=3, scoring='roc_auc')
    rfecv.fit(X, y)

    # Multi-class classification
    X, y = load_iris(return_X_y=True)
    bst = xgb.XGBClassifier(base_score=0.4, booster='gblinear',
                            learning_rate=0.1,
                            n_estimators=10, n_jobs=1,
                            objective='multi:softprob',
                            random_state=0, reg_alpha=0.001, reg_lambda=0.01,
                            scale_pos_weight=0.5, verbosity=0)
    rfecv = RFECV(estimator=bst, step=1, cv=3, scoring='neg_log_loss')
    rfecv.fit(X, y)
labels_train = [] labels_test = [] for ii in train_idx: features_train.append(features[ii]) labels_train.append(labels[ii]) for jj in test_idx: features_test.append(features[jj]) labels_test.append(labels[jj]) # Cross-validation for SVC if clf_choice == 'svc': from sklearn.cross_validation import StratifiedKFold from sklearn.feature_selection import RFECV # Recursive feature elimination with cross-validation svc_clf = RFECV(estimator=clf, step=1, cv=StratifiedKFold(labels_train, n_folds=10), scoring='accuracy') svc_clf = svc_clf.fit(features_train, labels_train) print("Optimal number of features: %d" % svc_clf.n_features_) print(svc_clf.support_) print(svc_clf.ranking_) # Cross-validation for DecisionTreeClassifier if clf_choice == 'dtc': from sklearn.cross_validation import StratifiedKFold from sklearn.grid_search import GridSearchCV param_grid = { 'criterion': ['gini', 'entropy'], 'max_features': ['auto', 'sqrt', 'log2', None], 'max_depth': [4, 5, 6, 7, 8, None], 'min_samples_split': [2, 3, 4, 5],
# Make a fake patient by randomly selecting a value from each feature fake_patient = X.apply(np.random.choice, axis=0) fake_prediction = best_classifier.predict(np.array([fake_patient.to_numpy()])) # - # ### How to account for data collection cost for each feature? # #### Try recursive feature elmination with CV # RFECV attempts to select the best combination of features by fitting models (with 5-fold CV) recursively eliminating a feature at a time. # + rfecv = RFECV(estimator=LogisticRegression( solver='liblinear', C=best_classifier.get_params()['C'], penalty=best_classifier.get_params()['penalty'], random_state=48), step=1, cv=5, scoring='f1') rfecv.fit(X, y) print('Recommended to select the following {} features:'.format( rfecv.n_features_)) print(X.columns[rfecv.support_]) best_feature_subset_classifier = LogisticRegression( solver='liblinear', C=best_classifier.get_params()['C'], penalty=best_classifier.get_params()['penalty'], random_state=48) best_feature_subset_classifier.fit(X_train, y_train)
# -*- coding: utf-8 -*-
import scipy.io
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Load the train/test splits stored in the .mat file; targets are
# flattened to 1-D.
mat = scipy.io.loadmat("./input/arcene.mat")
y_train = np.ravel(mat["y_train"])
y_test = np.ravel(mat["y_test"])
X_train = mat["X_train"]
X_test = mat["X_test"]

# Recursive feature elimination, dropping 50 features per iteration.
est = LogisticRegression(solver="lbfgs")
rfe = RFECV(estimator=est, step=50, verbose=1)
rfe = rfe.fit(X_train, y_train)

# ``grid_scores_`` was removed in scikit-learn 1.2; ``cv_results_`` holds
# the same mean scores on newer releases.
if hasattr(rfe, "cv_results_"):
    cv_scores = rfe.cv_results_["mean_test_score"]
else:
    cv_scores = rfe.grid_scores_

# NOTE(review): the hard-coded x axis assumes 10000 input features so that
# it has one tick per score -- confirm against the data.
plt.plot(range(0, 10001, 50), cv_scores)

score = accuracy_score(y_test, rfe.predict(X_test))
print('Test accuracy', score)
print("Shape X matrix: ", x.shape)
print("prop: ", y.value_counts() / y.shape[0])

#validation set: last column is the target, the rest are features
data2 = pd.read_csv("data\\val_imp.csv", header=0)
y_v = data2.iloc[:, -1]
x_v = data2.iloc[:, :-1]
print("Shape X_v matrix: ", x_v.shape)

#############################
#Feature selection
#############################

#setting up feature selection algorithm
k_fold = KFold(n_splits=10)
est = LogisticRegression()
selector = RFECV(est, cv=k_fold)
selector.fit(x, y)

#keeping selected variables and printing names for control
support_mask = selector.get_support()  # computed once, reused below
x_b = x.loc[:, support_mask]
xv_b = x_v.loc[:, support_mask]
print("Optimal number of features : %d" % selector.n_features_)
print("Support", x_b.columns)

# ``grid_scores_`` was removed in scikit-learn 1.2; ``cv_results_`` holds
# the same mean scores on newer releases.
if hasattr(selector, "cv_results_"):
    cv_scores = selector.cv_results_["mean_test_score"]
else:
    cv_scores = selector.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.savefig("plots\\featknn.pdf", bbox_inches='tight')
plt.close()

##############################
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# Load the train/test splits; targets are flattened to 1-D.
arcene = loadmat('arcene.mat')
X_train = arcene['X_train']
X_test = arcene['X_test']
y_train = np.ravel(arcene['y_train'])
y_test = np.ravel(arcene['y_test'])

# Recursive feature elimination, dropping 50 features per iteration.
model = LogisticRegression()
selector = RFECV(model, step=50, verbose=1)
selector.fit(X_train, y_train)

# Number of features kept by the selector (True entries in the mask).
count = int(np.count_nonzero(selector.support_))

# ``grid_scores_`` was removed in scikit-learn 1.2; ``cv_results_`` holds
# the same mean scores on newer releases.
if hasattr(selector, 'cv_results_'):
    cv_scores = selector.cv_results_['mean_test_score']
else:
    cv_scores = selector.grid_scores_

# NOTE(review): the hard-coded x axis assumes 10000 input features so that
# it has one tick per score -- confirm against the data.
plt.plot(range(0, 10001, 50), cv_scores)

y_pred = selector.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)
# Fit the selector held in `trans` (created before this point) and keep the
# names of the columns it retains.
kepler_X_trans = trans.fit_transform(train_X, train_y)
columns_retained_RFE = train_X.iloc[:, :].columns[
    trans.get_support()].values
print('Cols to keep:', columns_retained_RFE)

# Refit a plain linear regression on the retained columns only and report
# its score on the training and the out-of-sample (oos) data.
clf = linear_model.LinearRegression().fit(
    train_X[columns_retained_RFE], train_y)
stats.summary(clf, oos_X[columns_retained_RFE], oos_y,
              columns_retained_RFE)
print('Train R:', clf.score(train_X[columns_retained_RFE], train_y))
print('OOS R:', clf.score(oos_X[columns_retained_RFE], oos_y))

print('############ RFECV ############')

# Same procedure, this time letting RFECV (default settings) choose the
# number of features to keep.
clf = linear_model.LinearRegression()
trans = RFECV(clf)
kepler_X_trans = trans.fit_transform(train_X, train_y)
columns_retained_RFECV = train_X.iloc[:, :].columns[
    trans.get_support()].values
print('Cols to keep:', columns_retained_RFECV)

clf = linear_model.LinearRegression().fit(
    train_X[columns_retained_RFECV], train_y)
stats.summary(clf, oos_X[columns_retained_RFECV], oos_y,
              columns_retained_RFECV)
print('Train R:', clf.score(train_X[columns_retained_RFECV], train_y))
print('OOS R:', clf.score(oos_X[columns_retained_RFECV], oos_y))

# Disabled alternative: Bayesian ridge on the RFECV columns.
#clf = linear_model.BayesianRidge() #LinearRegression()
#clf.fit(train_X[columns_retained_RFECV], train_y)
#print('train R:', clf.score(train_X[columns_retained_RFECV], train_y))
# every column but the last is a feature
features = data[list(data.columns)[:-1]]
features = features.to_numpy()

#select the labels (last column)
labels = data[list(data.columns)[-1]]
labels = labels.to_numpy()

#60-40 train-test split (first 60% of the rows train, the rest test)
numTrain = int(0.6*len(features))
trainData = features[:numTrain]
trainLbl = labels[:numTrain]
testData = features[numTrain:]
testLbl = labels[numTrain:]

clf = RandomForestClassifier(n_estimators=30, max_depth=20, n_jobs=-1,
                             random_state=42)

#cv = None -> defaults to 5-fold cross validation
rfecv = RFECV(estimator=clf, step=1, cv=None, scoring='f1_weighted')

#5-fold on the whole dataset
rfecv.fit(features, labels)

# ``grid_scores_`` was removed in scikit-learn 1.2; ``cv_results_`` holds
# the same mean scores on newer releases.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (F-score)")
fig_feat = plt.gcf()
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.draw()
# Save before show(): once the window is shown and closed, the figure may
# no longer be available to render to disk.
fig_feat.savefig('%s.png'%(graph_file), bbox_inches='tight')
plt.show()

numImportantFeatures = rfecv.n_features_
print("Optimal number of features : %d" % numImportantFeatures)
def _rfecv_cv_scores(selector):
    """Return a fitted RFECV's mean CV score per evaluated feature count."""
    # ``grid_scores_`` was removed in scikit-learn 1.2; ``cv_results_``
    # holds the same mean scores on newer releases.
    if hasattr(selector, 'cv_results_'):
        return selector.cv_results_['mean_test_score']
    return selector.grid_scores_


def perform(emotion, train_tweets, y_train, task_name, estimator_dict):
    """Run the RFECV feature-ranking analysis for every estimator.

    The tweets are preprocessed and turned into features, then each
    estimator in ``estimator_dict`` is wrapped in a scaling + RFECV
    pipeline, cross-validated, fitted, and its feature-selection results
    are handed to ``Writer``.

    Parameters
    ----------
    emotion : passed through to the preprocessing, feature and writer calls.
    train_tweets : raw training tweets given to ``Preprocessor.perform``.
    y_train : training targets.
    task_name : ``'c'`` triggers the classification branch (including the
        resampler pipelines), ``'r'`` the regression branch.
    estimator_dict : mapping of estimator name to estimator.
    """
    #Select the scoring metric, depending upon the task name
    scoring = Dictionaries.scoring.get(task_name)

    # Perform the preprocessing and feature engineering tasks
    preprocess_train_df = Preprocessor.perform(train_tweets, emotion,
                                               'train', task_name)
    X_train = Feature_Transformer.perform(preprocess_train_df, emotion,
                                          'train', task_name)

    #Iterate through all the estimators
    for estimator_name, estimator in estimator_dict.items():

        #pipeline for original data
        pipeline = make_pipeline(
            MinMaxScaler(feature_range=(0, 1), copy=True),
            RFECV(estimator, step=1, cv=5, scoring=scoring, n_jobs=-1))

        scores = cross_validate(pipeline, X_train, y_train,
                                scoring=scoring, cv=5,
                                return_train_score=False)
        print(scores)

        pipeline.fit(X_train, y_train)
        print(pipeline.steps)

        #Get number of features selected, the features selected and its ranking
        selected_features = pipeline.steps[1][1].n_features_
        feature_mask = pipeline.steps[1][1].support_
        feature_rank = pipeline.steps[1][1].ranking_

        # Classification task
        if (task_name == 'c'):
            #Get F1 scores
            cv_feature_scores = _rfecv_cv_scores(pipeline.steps[1][1])  # f1

            Writer.write_class_feat_rank_anal_results_in_file(
                emotion, 'original', estimator_name, selected_features,
                feature_mask, feature_rank, cv_feature_scores)

            # Pipeline with resamplers - SMOTE, TomekLinks, SMOTETomek
            for resampler_name, resampler in Dictionaries.resampler_dict.items():

                #pipeline for resampling; RFECV is now the third step
                pipeline = make_pipeline_imb(
                    MinMaxScaler(feature_range=(0, 1), copy=True),
                    resampler,
                    RFECV(estimator, step=1, cv=5, scoring=scoring,
                          n_jobs=-1))

                # Fit the pipeline with data
                pipeline.fit(X_train, y_train)
                print(pipeline.steps)

                selected_features = pipeline.steps[2][1].n_features_
                feature_mask = pipeline.steps[2][1].support_
                feature_rank = pipeline.steps[2][1].ranking_
                cv_feature_scores = _rfecv_cv_scores(pipeline.steps[2][1])  # f1

                Writer.write_class_feat_rank_anal_results_in_file(
                    emotion, resampler_name, estimator_name,
                    selected_features, feature_mask, feature_rank,
                    cv_feature_scores)
                gc.collect()

        # Regression task
        if (task_name == 'r'):
            #Get rmse scores
            cv_feature_scores = np.sqrt(
                -_rfecv_cv_scores(pipeline.steps[1][1])
            )  # sqrt(-neg_mean_squared_error)

            Writer.write_reg_feat_rank_anal_results_in_file(
                emotion, estimator_name, selected_features, feature_mask,
                feature_rank, cv_feature_scores)
            gc.collect()
#%%
fimp.sort_values().head(10).index

#%%
# Rank every feature by eliminating down to a single one.
from sklearn.feature_selection import RFE
rfe = RFE(estimator=logreg, n_features_to_select=1, step=1)
rfe.fit(X, y)
ranking = rfe.ranking_

#%%
fimp = pd.Series(ranking, index = viz.features_)
fimp.sort_values(ascending=True).head(20).index

#%%
from sklearn.feature_selection import RFECV
rfecv = RFECV(estimator=logreg, step=1, cv=2, n_jobs=-1, scoring='accuracy')
rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

# ``grid_scores_`` was removed in scikit-learn 1.2; ``cv_results_`` holds
# the same mean scores on newer releases.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

#%%
# Show the labels of the features ranked best by RFECV.
ranking = rfecv.ranking_
fimp = pd.Series(ranking, index = features)
fimp.sort_values(ascending=True).head(rfecv.n_features_).index
from sklearn.feature_selection import RFECV from sklearn.tree import DecisionTreeClassifier from sklearn.preprocessing import StandardScaler df_train = pd.read_csv("train.csv") feats = df_train.drop("revenue", axis=1) X = feats.values #features y = df_train["revenue"].values #target # Create the RFE object and compute a cross-validated score. svc = SVC(kernel="linear") # The "accuracy" scoring is proportional to the number of correct # classifications rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(y, 2), scoring='accuracy') t = StandardScaler() X = t.fit_transform(X) y = t.fit_transform(y) count = 0 for elem in y: print(elem) count += 1 if count > 10: break count = 0 for elem in X:
# Expand the grid into every combination of hyper-parameter values; the
# names are sorted so each tuple lines up with param_names.
param_names = sorted(param_grid)
combinations = list(itertools.product(*(param_grid[name] for name in param_names)))
print("Grid contains {} hyper-parameter combinations...".format(len(combinations)))

result_recorder = []
featr_support = []
rfetr_num = []
counter = 1
for i in combinations:
    print("CV for {} set hyperparameters...".format(counter))
    tmp_hyper = dict(zip(param_names, i))
    rfr_hyper = RandomForestRegressor(**tmp_hyper,random_state = random_state)
    rfe_cv = RFECV(estimator=rfr_hyper,
                   step=1,
                   cv=5,
                   scoring="neg_root_mean_squared_error",
                   n_jobs=-1,
                   verbose=False)
    rfe_cv.fit(X_train,y_train)
    # ``grid_scores_`` was removed in scikit-learn 1.2; ``cv_results_``
    # holds the same mean scores on newer releases.
    if hasattr(rfe_cv, "cv_results_"):
        mean_scores = rfe_cv.cv_results_["mean_test_score"]
    else:
        mean_scores = rfe_cv.grid_scores_
    result_recorder.append(mean_scores) # record mean of CV
    featr_support.append(rfe_cv.support_) # record selected features of CV
    rfetr_num.append(rfe_cv.n_features_)
    counter +=1

# Scores are negated RMSE values; flip the sign to get RMSE back.
result_recorder = np.array(result_recorder) * -1
all_rmse = np.min(result_recorder,axis=0) # record smallest RMSE per feature count
rfecv_res = pd.DataFrame(np.column_stack([np.arange(1,len(all_rmse)+1),all_rmse]),
                         columns = ["Number of features", "Mean RMSE"])
rfecv_res.to_csv("rfecv_results/rfecv_term_{}.csv".format(term_type)) # for plotting purpose
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix, roc_curve, auc

for piece in range(7, 8):
    feature = brk_ext(piece)

    # Work on an explicit copy so the label rewrite below is not an
    # assignment on a slice of `feature` (SettingWithCopyWarning).
    train_data = feature.loc[feature['train_test'] == 1].copy()

    # Binarize the label: activities 0 and 1 become 0, everything else 1.
    # Order matters: the second mask is evaluated after the first rewrite.
    train_data.loc[(train_data['activities'] == 0) |
                   (train_data['activities'] == 1), 'activities'] = 0
    train_data.loc[(train_data['activities'] != 0) &
                   (train_data['activities'] != 1), 'activities'] = 1

    # Column names max1, mean1, std1, max2, ... for 6 * piece groups.
    col = [stat + str(i)
           for i in range(1, 6 * piece + 1)
           for stat in ("max", "mean", "std")]

    # presumably 'activities' is the last column -- verify against brk_ext
    train_target = train_data.iloc[:, -1]
    train_data = train_data[col]
    train_data.to_csv("./f.csv", index=False, header=True)
    train_target.to_csv("./ff.csv", index=False, header=True)

    model = LogisticRegression(max_iter=20)
    clf = RFECV(model, step=1, cv=5, n_jobs=-1)
    clf = clf.fit(train_data, train_target)

    # p is the number of columns kept by the selector
    row, p = train_data.iloc[:, clf.get_support()].shape
    # Both metrics are computed on the training data itself.
    accuracy = clf.score(train_data, train_target)
    f1 = f1_score(train_target, clf.predict(train_data))
    print("piece: ", piece, " best p :", p, "accuracy: ", accuracy,
          " F1-score :", f1)
classifier.fit(train_feature_data, train_class_data) # predict using model and test data test_predicted_data = classifier.predict(test_feature_data) # calculatre metrics print('score={}'.format( accuracy_score(test_class_data, test_predicted_data))) print(confusion_matrix(test_class_data, test_predicted_data)) print(classification_report(test_class_data, test_predicted_data)) # using RFE (Recursive Feature Estimation) print('Logistic Regression classifier after using RFECV') classifier = LogisticRegression(max_iter=10000, solver='lbfgs') rfecv = RFECV(estimator=classifier, step=1, cv=StratifiedKFold(2), scoring='accuracy') rfecv_data = rfecv.fit_transform(feature_data_processed, class_data) # Plot number of features VS. cross-validation scores plot.figure() plot.xlabel("Number of features selected") plot.ylabel("Cross validation score (nb of correct classifications)") plot.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_) plot.show() selected_columns = rfecv.get_support(indices=True) columns_rfecv = [ feature_data_processed.columns[selected] for selected in selected_columns
def RecursiveFeatureSelectionCrossValidated(model):
    """Wrap ``model`` in an unfitted RFECV selector.

    The selector uses 5-fold cross-validation and removes one feature per
    elimination step.
    """
    return RFECV(model, step=1, cv=5)
MI_selector = SelectKBest(mutual_info_classif, k=5) X_train_MI = MI_selector.fit_transform(X_train, y_train) print("MI scores: {}, MI p-values: {}".format(MI_selector.scores_, MI_selector.pvalues_)) table1_output += "MI scores: {}, MI p-values: {}".format( MI_selector.scores_, MI_selector.pvalues_) #estimator for recursive feature elimination # estimator = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', class_weight= None, max_features = None, random_state = 42,n_jobs=-1) estimator = DecisionTreeClassifier(random_state=42) #inner cv for recursive feature elimination inner_cv = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42) RFE_selector = RFECV(estimator, step=1, cv=inner_cv, n_jobs=-1) X_train_RFE = RFE_selector.fit_transform(X_train, y_train) print("RFE rankings: {}, RFE grid-scores: {}".format( RFE_selector.ranking_, RFE_selector.grid_scores_)) table1_output += "RFE rankings: {}, RFE grid-scores: {}".format( RFE_selector.ranking_, RFE_selector.grid_scores_) ############# TEST RESULTS FOR FEATURE SELECTION METHODS ############# fold_info = { model_name: { 'FULL': {score_name: 0 for score_name, score in scoring}, 'ANOVA': {score_name: 0 for score_name, score in scoring}, 'MI': {score_name: 0 for score_name, score in scoring},
print(X.shape)

# 10-fold cross-validated accuracy of an L2 LinearSVC on X.
clf = LinearSVC(penalty='l2',dual=False)
scores = cross_validation.cross_val_score(clf, X, data[1], cv=10)
print(scores)
print("L2 SVM trained on the features selected by the L1 SVM. \n Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

#L2 SVM that uses the class RFECV, which automatically selects the number of features
clf = LinearSVC(penalty='l2',dual=False)
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(data[1], 2),scoring='accuracy')
rfecv.fit(data[0], data[1])
#scores = cross_validation.cross_val_score(rfecv, data[0], data[1], cv=10)
print("Optimal number of features : %d" % rfecv.n_features_)

# NOTE(review): grid_scores_ holds one score per candidate number of
# features, so the mean/std printed below is taken across feature counts,
# not across CV folds as in the block above -- confirm this is the number
# that should be reported as "Accuracy".
scores = rfecv.grid_scores_
print(scores)
print("L2 SVM that use the class RFECV which automatically selects the number of features. \n Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Disabled plotting code, kept as a bare string literal.
'''
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
'''
multi_class='ovr')), ['<NAME0>', '<NAME2>']), (SelectFromModel( PermutationImportance( LogisticRegression(solver='liblinear', random_state=42), cv=5, random_state=42, refit=False, ), threshold=0.1, ), ['<NAME2>', '<NAME3>']), (RFE(LogisticRegression(solver='liblinear', random_state=42, multi_class='ovr'), n_features_to_select=2), ['<NAME1>', '<NAME3>']), (RFECV(LogisticRegression( solver='liblinear', random_state=42, multi_class='ovr'), cv=3), ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']), ] + _additional_test_cases) def test_transform_feature_names_iris(transformer, expected, iris_train): X, y, _, _ = iris_train transformer.fit(X, y) # Test in_names being provided res = transform_feature_names(transformer, ['<NAME0>', '<NAME1>', '<NAME2>', '<NAME3>']) assert res == expected # Test in_names being None expected_default_names = [ re.sub('<NAME([0-9]+)>', r'x\1', name) for name in expected ] assert transform_feature_names(transformer, None) == expected_default_names
def fit(self, X, Y):
    """Fit an RFECV-wrapped SVR on ``X`` and ``Y``.

    An ``sk_SVR`` is built from this object's own parameters
    (``self.get_params()``), wrapped in an RFECV with default settings,
    and fitted. The fitted selector is stored on ``self.rfe``.

    Returns
    -------
    self
        Returned, following the scikit-learn estimator convention, so
        that calls can be chained; the previous version returned None.
    """
    params = self.get_params()
    model = sk_SVR(**params)
    self.rfe = RFECV(model)
    self.rfe.fit(X, Y)
    return self
del data_test['fmri_select']

######################################################################################################################
import pandas as pd
from pandas import Series
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV


class RandomForestClassifierWithCoef(RandomForestClassifier):
    """Random forest that also exposes its importances as ``coef_``.

    After fitting, ``coef_`` is set to ``feature_importances_`` --
    presumably so that RFECV can rank features through ``coef_``.
    """

    def fit(self, *args, **kwargs):
        """Fit the forest, publish ``coef_`` and return ``self``."""
        super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
        self.coef_ = self.feature_importances_
        # Keep the estimator contract of the parent class: fit returns self.
        return self


#fitting on train
rf = RandomForestClassifierWithCoef(n_estimators=1000, min_samples_leaf=5, n_jobs=-1)
rfecv = RFECV(estimator=rf, step=1, cv=3, scoring='accuracy', verbose=30)
selector=rfecv.fit(data_train._get_numeric_data(), labels_train)

# collect only the important features (same column mask on train and test)
df_important_features_train = data_train._get_numeric_data()[data_train._get_numeric_data().columns[rfecv.get_support()]]
df_important_features_test = data_test._get_numeric_data()[data_test._get_numeric_data().columns[rfecv.get_support()]]
# selector.get_support()

# accuracy in test
from sklearn.metrics import accuracy_score
# The closing parenthesis of this call was missing.
accuracy_score(labels_test,rfecv.predict(data_test._get_numeric_data()))
number of features selected with cross-validation. """ print __doc__ from sklearn.svm import SVC from sklearn.cross_validation import StratifiedKFold from sklearn.feature_selection import RFECV from sklearn.datasets import make_classification from sklearn.metrics import zero_one # Build a classification task using 3 informative features X, y = make_classification(n_samples=1000, n_features=25, n_informative=3, n_redundant=2, n_repeated=0, n_classes=8, n_clusters_per_class=1, random_state=0) # Create the RFE object and compute a cross-validated score. svc = SVC(kernel="linear") rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(y, 2), loss_func=zero_one) rfecv.fit(X, y) print "Optimal number of features : %d" % rfecv.n_features_ # Plot number of features VS. cross-validation scores import pylab as pl pl.figure() pl.xlabel("Number of features selected") pl.ylabel("Cross validation score (nb of misclassifications)") pl.plot(xrange(1, len(rfecv.cv_scores_) + 1), rfecv.cv_scores_) pl.show()
p = precision_score(labels_test, labels_pred, average='micro') r = recall_score(labels_test, labels_pred, average='micro') if p > 0.3 and r > 0.3: return f1_score(labels_test, labels_pred, average='macro') return 0 #Recursive Feature Selection import matplotlib.pyplot as plt from sklearn.svm import SVC from sklearn.cross_validation import StratifiedKFold from sklearn.feature_selection import RFECV clf = DecisionTreeClassifier(max_depth=5) rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(labels, 50), scoring='precision') rfecv.fit(features, labels) print("Optimal number of features : %d" % rfecv.n_features_) print rfecv.support_ features = features[:, rfecv.support_] # Plot number of features VS. cross-validation scores plt.figure() plt.xlabel("Number of features selected") plt.ylabel("Cross validation score (nb of correct classifications)") plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_) plt.show() # DecisionTreeClassifier tuning t0 = time() parameters = { 'max_depth': [1, 2, 3, 4, 5, 6, 8, 9, 10],