def plotRFECV(X, y, stepSize=0.05, scoring='f1'):
    """Fit RFECV with a linear SVC and plot the cross-validation scores.

    Recursive feature elimination with the number of selected features
    tuned by cross-validation, after the scikit-learn example:
    http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html#example-plot-rfe-with-cross-validation-py

    Args:
        X: feature matrix passed to ``RFECV.fit``.
        y: targets; also used to build the 2-fold ``StratifiedKFold``.
        stepSize: forwarded to RFECV as ``step``.
        scoring: forwarded to RFECV as ``scoring``.

    Returns:
        The fitted ``RFECV`` object.
    """
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV

    svc = SVC(kernel="linear", class_weight='auto', cache_size=1400)
    rfecv = RFECV(estimator=svc, step=stepSize, cv=StratifiedKFold(y, 2),
                  scoring=scoring)
    rfecv.fit(X, y)
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    import matplotlib.pyplot as plt
    plt.figure()
    plt.xlabel("Number of features selected")
    # The label names the metric actually used instead of assuming accuracy.
    plt.ylabel("Cross validation score (%s)" % scoring)
    # NOTE(review): x positions are 1..len(grid_scores_); with a step other
    # than 1 these may not equal feature counts - confirm against RFECV docs.
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
    return rfecv
def feature_selection_RFE(fn, ax=None, sel="all", goal="Referee", verbosity=0, nf=7):
    """Select features with RFECV and optionally plot the CV score curve.

    Args:
        fn: forwarded to ``data_prepare``.
        ax: matplotlib-style axes to draw on; when ``None`` no plot is made
            (previously this raised ``AttributeError``).
        sel: forwarded to ``data_prepare``.
        goal: forwarded to ``data_prepare``.
        verbosity: values above 1 print the feature names;
            ``verbosity - 1`` is forwarded to ``data_prepare``.
        nf: not used by this function; kept for interface compatibility.

    Returns:
        The entries of ``names`` whose RFECV ranking equals 1.
    """
    X, y, names = data_prepare(fn, sel=sel, goal=goal, verbosity=verbosity - 1)
    if verbosity > 1:
        print("names: " + ",".join(names))

    estimator = get_clf('svm')
    scoring = 'f1'
    cv = cross_validation.StratifiedKFold(y, 2)
    rfecv = RFECV(estimator=estimator, step=1, cv=cv, scoring=scoring)

    # Warnings raised while fitting are deliberately silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        rfecv.fit(X, y)

    # Plot number of features VS. cross-validation scores
    if ax is not None:
        ax.set_xlabel("Number of features selected")
        ax.set_ylabel("Cross validation score ({})".format(scoring))
        ax.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

    return names[rfecv.ranking_ == 1]
class RFECVSelection(SelectionModel):
    """Selection model that fits an RFECV selector when constructed."""

    name = "RFECV"

    def __init__(self, *args):
        """Initialise the base model, then fit RFECV on its arrays."""
        SelectionModel.__init__(self, *args)
        self.selector = RFECV(self.estimator, step=1, cv=5,
                              scoring='mean_squared_error')
        self.selector.fit(self.x_array, self.y_array)
        self.support_ = self.selector.support_

    def print_rankings(self):
        """Print one ``column name: rank`` line per selected column index."""
        print("Rankings for: ", RFECVSelection.name)
        rankings = self.selector.ranking_
        for column_index, rank in zip(self.columns, rankings):
            column_name = data.column_names[column_index]
            print("{0}: {1}".format(column_name, rank))

    def plot_num_of_feat_vs_cv_score(self):
        """Plot the RFECV grid scores against their 1-based position."""
        scores = self.selector.grid_scores_
        positions = range(1, len(scores) + 1)
        plt.figure()
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation scores (mse)")
        plt.plot(positions, scores)
        plt.show()

    def plot_rankings(self):
        """Draw a bar chart of the RFECV ranking of every feature."""
        positions = range(self.x_array.shape[1])
        tick_labels = [data.column_names[i] for i in self.columns]
        plt.figure()
        plt.title("Ranking of features in RFECV")
        plt.bar(positions, self.selector.ranking_, align="center", color="r")
        plt.xticks(range(self.x_array.shape[1]), tick_labels)
        plt.show()
def RFE_featureSelection(X_train, Y_train):
    """Subsample the training data, run RFECV and return the kept columns.

    Args:
        X_train: training features; must expose ``reset_index`` and
            ``columns`` after sampling.
        Y_train: training targets; must expose ``reset_index`` after
            sampling.

    Returns:
        ``X_train.columns`` restricted to the features RFECV supports.
    """
    # Sampling
    sampler = randomSampling.randomSampling()
    X_train, Y_train = sampler.getRandomSample(X_train, Y_train)
    X_train.reset_index(drop=True, inplace=True)
    Y_train.reset_index(drop=True, inplace=True)

    # Select classifier and parameters
    logistic = linear_model.LogisticRegression(
        C=10, class_weight=None, dual=False, fit_intercept=True,
        intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
        penalty='l1', random_state=None, solver='liblinear', tol=0.01,
        verbose=0, warm_start=False)

    # Initialize and fit RFE
    rfecv = RFECV(estimator=logistic, step=1, cv=5, scoring='recall')
    rfecv.fit(X_train, Y_train)
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot importance
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    selected_columns = X_train.columns[list(rfecv.support_)]
    # Format first, then print: calling .format on print()'s return value
    # fails on Python 3 because print() returns None.
    print('\n Selectd Columns : {0}'.format(selected_columns))
    return selected_columns
def benchmark_features_selection(clf, name):
    """Time RFECV training and prediction for ``clf`` and store predictions.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``cursor``
    and ``testing_identifiant_produit_list``.

    Args:
        clf: estimator wrapped by RFECV.
        name: label prefixed to the printed feature count and passed to
            ``save_results_data``.

    Returns:
        Tuple ``(clf_descr, train_time, test_time)`` where ``clf_descr`` is
        the text of ``str(clf)`` before the first ``(``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)

    t0 = time()
    rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y_train, 2),
                  scoring='accuracy')
    rfecv.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    print(name + "Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    t0 = time()
    pred = rfecv.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    # RFECV fits a clone, so the coefficients live on rfecv.estimator_,
    # not on the clf object that was passed in.
    fitted = rfecv.estimator_
    if hasattr(fitted, 'coef_'):
        print("dimensionality: %d" % fitted.coef_.shape[1])
        print("density: %f" % density(fitted.coef_))

    print("Saving data to database:")
    save_results_data(cursor, name, testing_identifiant_produit_list, pred)
    print()

    clf_descr = str(clf).split('(')[0]
    return clf_descr, train_time, test_time
def test_model(model, xtrain, ytrain, feature_list, prefix):
    """Validate ``model`` on a 60/40 ``train_test_split`` of the data.

    When the module-level ``DO_RFECV`` flag is set, the model is first
    fitted on all the data to find out whether it exposes ``coef_``; if it
    does, it is wrapped in RFECV before the validation fit.

    Args:
        model: estimator to evaluate.
        xtrain: features.
        ytrain: targets.
        feature_list: not used by the active code.
        prefix: forwarded to ``debug_plots``.

    Returns:
        None. Prints the score and RMSE and calls ``debug_plots``.
    """
    xTrain, xTest, yTrain, yTest = train_test_split(xtrain, ytrain,
                                                    test_size=0.4)
    if DO_RFECV:
        model.fit(xtrain, ytrain)
        if hasattr(model, 'coef_'):
            model = RFECV(estimator=model, verbose=0, step=1,
                          scoring=score_fn, cv=3)

    model.fit(xTrain, yTrain)
    # Single-argument print calls produce the same text on Python 2 and 3.
    print('score %s' % model.score(xTest, yTest))

    ypred = model.predict(xTest)
    # don't allow model to predict negative number of orders
    negative = ypred < 0
    if np.any(negative):
        print(ypred[negative])
        ypred[negative] = 0

    print('RMSE %s' % np.sqrt(mean_squared_error(yTest, ypred)))
    debug_plots(model, yTest, ypred, prefix)
    return
def recursive_feature_selection(info_humans, info_bots, params, scale=False):
    """Run three feature-selection stages and return the last two selectors.

    The stages are a variance threshold, a chi-squared ``SelectKBest``
    (k=200) and RFECV around a logistic regression.

    Args:
        info_humans: forwarded to ``get_Xy``.
        info_bots: forwarded to ``get_Xy``.
        params: mapping providing ``'penalty'`` and ``'C'`` for the
            logistic regression.
        scale: forwarded to ``get_Xy``.

    Returns:
        Tuple ``(skb, rfecv)``: the fitted ``SelectKBest`` and ``RFECV``.
        The fitted ``VarianceThreshold`` of the first stage is not returned.
    """
    X, y, _, _ = get_Xy(info_humans, info_bots, scale=scale)

    print("first feature selection by variance test")
    variance_filter = VarianceThreshold(threshold=(.8 * (1 - .8)))
    X_new = variance_filter.fit_transform(X)

    print("second feature selection by ch2 test")
    skb = SelectKBest(chi2, k=200)
    X_new = skb.fit_transform(X_new, y)

    print("third feature selection by recursive featue elimination (RFECV)")
    clf = LogisticRegression(penalty=params['penalty'], C=params['C'])
    rfecv = RFECV(estimator=clf, step=1,
                  cv=cross_validation.StratifiedKFold(y, 5),
                  scoring='roc_auc', verbose=1)
    rfecv.fit(X_new, y)
    print("Optimal number of features : %d" % rfecv.n_features_)
    return skb, rfecv
def select_features(clf, x_train, y_train, columns, num_folds, step=19, random_state=0):
    """Tune the number of selected features with cross-validated RFE.

    :param clf: estimator wrapped by RFECV
    :param x_train: training features
    :param y_train: training targets
    :param columns: column names, aligned with the features of ``x_train``
    :param num_folds: number of shuffled K-fold splits
    :param step: forwarded to RFECV as ``step``
    :param random_state: seed for the K-fold shuffling
    :return: the fitted rfecv object
    """
    print('================= select_features ================')
    cvObj = KFold(len(y_train), n_folds=num_folds, shuffle=True,
                  random_state=random_state)
    # ``scorer`` is a module-level name.
    rfecv = RFECV(estimator=clf, step=step, cv=cvObj, scoring=scorer,
                  verbose=2)
    rfecv.fit(x_train, y_train)

    print('------------ Results: ----------------')
    print('>>>> Optimal number of features : %d' % rfecv.n_features_)
    print('>>>> grid scores:')
    pprint(rfecv.grid_scores_)
    print('>>>> ranking of columns:')
    # ranking_ holds ranks, not positions, so order the columns by rank
    # rather than indexing with the ranks. mergesort keeps ties stable.
    rank_order = np.argsort(rfecv.ranking_, kind='mergesort')
    pprint(np.array(columns)[rank_order])

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
    return rfecv
def get_top_features(windows_data_frame, drop_only_almost_positives = False, drop_duplicates = True, transformer = DEFAULT_TRANSFORMER, \
        classifier = RFECV_FEATURE_SELECTION_DEFAULT_CLASSIFIER, n_folds = 3, step = 0.05, scoring = 'f1'):
    '''
    Find the top features of the given windows with sklearn.feature_selection.RFECV.
    @param windows_data_frame (pandas.DataFrame): Forwarded to get_windows_data.
    @param drop_only_almost_positives (boolean, default False): Forwarded to get_windows_data.
    @param drop_duplicates (boolean, default True): Forwarded to get_windows_data.
    @param transformer (default DEFAULT_TRANSFORMER): Forwarded to get_windows_data.
    @param classifier (default RFECV_FEATURE_SELECTION_DEFAULT_CLASSIFIER): The estimator given to RFECV.
    @param n_folds (int, default 3): Number of folds of the shuffled, SEED-seeded StratifiedKFold.
    @param step (default 0.05): See sklearn.feature_selection.RFECV
    @param scoring (default 'f1'): See sklearn.feature_selection.RFECV
    @return: The result of util.apply_mask applied to the features and the RFECV support mask.
    '''
    features, X, y = get_windows_data(
        windows_data_frame,
        drop_only_almost_positives,
        drop_duplicates,
        transformer,
    )
    cv_splitter = StratifiedKFold(y, n_folds = n_folds, shuffle = True, random_state = SEED)
    selector = RFECV(estimator = classifier, cv = cv_splitter, step = step, scoring = scoring)
    selector.fit(X, y)
    support_mask = selector.support_
    return util.apply_mask(features, support_mask)
def recursiveFeatureElimination():
    """Run RFECV for every point of interest and plot one subplot each.

    Points come from ``getPointsOfInterest``; data for each is loaded with
    ``loadData`` through a ``DB`` connection.
    """
    with DB() as db:
        POIs = getPointsOfInterest()

        # Size the grid so every POI has a slot. The former
        # int(sqrt(n)) x (int(sqrt(n)) + 1) grid was too small for some
        # counts (e.g. 3 or 8), which makes plt.subplot fail.
        poi_count = len(POIs)
        numCols = max(1, int(math.ceil(math.sqrt(poi_count))))
        numRows = max(1, int(math.ceil(poi_count / float(numCols))))

        plt.figure()
        plt.subplots_adjust(left=None, bottom=None, right=None, top=None,
                            wspace=0.5, hspace=0.5)

        for fignum, POI in enumerate(POIs, 1):
            x, y = loadData(db, POI['LAT'], POI['LONG'], generateAllFeatures)
            x, y = np.array(x), np.array(y)

            # Create the RFE object and compute a cross-validated score.
            # NOTE(review): SVR is a regressor while StratifiedKFold and
            # 'accuracy' scoring are classification tools - confirm that
            # this combination is intended.
            svr = SVR(kernel="linear")
            rfecv = RFECV(estimator=svr, step=1, cv=StratifiedKFold(y, 2),
                          scoring='accuracy')
            rfecv.fit(x, y)
            print("Optimal number of features : %d" % rfecv.n_features_)

            # Plot number of features VS. cross-validation scores
            plt.subplot(numRows, numCols, fignum)
            plt.title(POI['NAME'])
            plt.xlabel("Number of features selected")
            plt.ylabel("Cross validation score (nb of misclassifications)")
            plt.plot(range(1, len(rfecv.grid_scores_) + 1),
                     rfecv.grid_scores_)

        plt.show()
def main():
    """Read train/test files, preprocess them and run RFECV on the train set.

    The steps are: drop zero-variance features, normalize with the
    training mean/std, then fit RFECV (linear SVC, 10-fold stratified CV,
    accuracy scoring) and print the optimal number of features.
    """
    args = getOptions()
    print(args)

    print("train file read")
    train_x, train_y = readfile_noid(args.train, 'train')
    # The extracted ids are not needed; avoid shadowing the builtin ``id``.
    train_x_new, _ = extractID(train_x)

    print("test file read")
    test_x, test_y = readfile_noid(args.test, 'test')
    test_x_new, _ = extractID(test_x)

    # remove feature with no distinction and less important
    print("remove feature with no distinction and less important")
    sel = VarianceThreshold()
    train_x_uniq = sel.fit_transform(train_x_new)
    test_x_uniq = sel.transform(test_x_new)

    # normalization: the test set reuses the training mean and std
    print("normalization")
    train_x_nor, mean, std = normalize(train_x_uniq)
    test_x_nor, mean, std = normalize(test_x_uniq, mean, std)

    # feature selection
    print("feature selection")
    svc = SVC(kernel="linear")
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(train_y, 10),
                  scoring='accuracy')
    rfecv.fit(train_x_nor, train_y)
    print("Optimal number of features : %d" % rfecv.n_features_)
def featureSelection(train_x, train_y):
    """Compare RandomizedLasso and RFECV selections and plot the RFECV curve.

    Feature names are read from the module-level ``feats`` sequence,
    indexed in step with the RFECV ranking.

    Args:
        train_x: training features.
        train_y: training targets.
    """
    svc = LinearSVC(C=1, class_weight='balanced')
    lasso = RandomizedLasso()
    lasso.fit(train_x, train_y)

    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=1, cv=5, scoring='accuracy')
    rfecv.fit(train_x, train_y)
    print("Optimal number of features : %d" % rfecv.n_features_)

    rankings = rfecv.ranking_
    lasso_ranks = lasso.get_support()
    lassoFeats = []
    recursiveFeats = []
    shouldUseFeats = []
    for i, rank in enumerate(rankings):
        picked_by_lasso = lasso_ranks[i]
        if picked_by_lasso:
            lassoFeats.append(feats[i])
        if rank == 1:
            recursiveFeats.append(feats[i])
            # Recommended only when both selectors agree.
            if picked_by_lasso:
                shouldUseFeats.append(feats[i])

    # NOTE(review): looks like an interactive debugging hook - confirm
    # whether it should stay.
    keyboard()
    print('Should use ' + ', '.join(shouldUseFeats))

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
def run_rfecv(X, y, clf_class, **kwargs):
    """Fit RFECV around ``clf_class(**kwargs)``, plot it and return it.

    Args:
        X: features.
        y: targets; also used to build the 2-fold ``StratifiedKFold``.
        clf_class: estimator class to instantiate.
        **kwargs: constructor arguments for ``clf_class``.

    Returns:
        The fitted ``RFECV`` object.
    """
    clf = clf_class(**kwargs)
    rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y, 2),
                  scoring='accuracy')
    rfecv.fit(X, y)
    plot_rfcev(rfecv)
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print("Optimal number of features : {0} for model: {1}".format(
        rfecv.n_features_, clf_class))
    return rfecv
def plot_rfe(X, label):
    """Run RFECV with a linear SVC on ``X`` and plot its grid scores.

    The target is the ``label`` column; the 'churn', 'appetency' and
    'upselling' columns and ``label`` itself are dropped from the features.
    """
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV

    target = X[label]
    features = X.drop(['churn', 'appetency', 'upselling', label],
                      axis='columns')

    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    splitter = StratifiedKFold(target, 2)
    selector = RFECV(estimator=SVC(kernel="linear"), step=1, cv=splitter,
                     scoring='accuracy')
    selector.fit(features, target)
    print("Optimal number of features : %d" % selector.n_features_)

    # Plot number of features VS. cross-validation scores
    scores = selector.grid_scores_
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(scores) + 1), scores)
    plt.show()
def optimalFeatures(train, target):
    """Fit RFECV (linear SVC, 3-fold stratified CV) and return it."""
    selector = RFECV(SVC(kernel='linear'),
                     cv=StratifiedKFold(target, n_folds=3))
    selector.fit(train, target)
    print("Optimal number of features : %d" % selector.n_features_)
    return selector
def featureSelection(X, y):
    """Select columns of ``X`` with RFECV around a random forest.

    Args:
        X: feature table exposing ``columns``.
        y: targets.

    Returns:
        ``X.columns`` restricted to the features RFECV supports.
    """
    class RandomForestClassifierWithCoef(RandomForestClassifier):
        """Random forest that exposes its importances as ``coef_``."""

        def fit(self, *args, **kwargs):
            super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
            self.coef_ = self.feature_importances_
            # Estimators are expected to return self from fit.
            return self

    randfor = RandomForestClassifierWithCoef(n_estimators=35)
    rfecv = RFECV(estimator=randfor, step=1, cv=5, scoring='accuracy',
                  verbose=2)
    rfecv.fit(X, y)
    return X.columns[rfecv.get_support()]
def selectFeatures(clf, X, Y):
    """Fit RFECV around ``clf`` and return the kept columns and their indices.

    Uses 5-fold stratified CV with accuracy scoring. The indices come from
    the ``find`` helper applied to the RFECV support mask.
    """
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    selector = RFECV(estimator=clf, step=1, cv=StratifiedKFold(Y, 5),
                     scoring='accuracy')
    selector.fit(X, Y)
    support_mask = selector.get_support()
    indices = find(support_mask, True)
    reduced = X[:, indices]
    return reduced, indices
def main():
    """Select features for the positive-loss rows and save the support mask.

    Loads ``data/x_train.npy`` and ``data/loss.npy``, keeps rows whose loss
    is positive, fits RFECV around an epsilon-insensitive SGD regressor on
    the log of the loss, and saves the mask to
    ``features/reg_sel_sgd_eps.npy``.
    """
    xtrain = np.load('data/x_train.npy')
    ytrainreg = np.load('data/loss.npy')
    positive = ytrainreg > 0
    xtrain = xtrain[positive]
    ytrainreg = ytrainreg[positive]

    reg1 = linear_model.SGDRegressor(loss='epsilon_insensitive',
                                     random_state=0, n_iter=5)
    selector1 = RFECV(estimator=reg1, scoring='mean_squared_error',
                      verbose=10)
    selector1.fit(xtrain, np.log(ytrainreg))  # training on the log of the loss
    # Single-argument print yields the same text on Python 2 and 3.
    print("sel1, optimal number of features: %s" % selector1.n_features_)
    np.save('features/reg_sel_sgd_eps.npy', selector1.support_)
def feature_selection_with_scikit():
    """Demonstrate several scikit-learn feature-selection techniques.

    1-VarianceThreshold is a simple baseline approach to feature selection.
    It removes all features whose variance doesn't meet some threshold.
    2-Univariate feature selection selects the best features based on
    univariate statistical tests (chi2 here; f_classif can also be used).
    3-Feature importances of an ExtraTreesClassifier.
    4-Recursive feature elimination with cross-validation (RFECV).

    Reads the module-level ``X``, ``y`` and ``attributeNames``.
    """
    p = 0.8
    selector = VarianceThreshold(threshold=(p * (1 - p)))
    c = selector.fit_transform(X)
    # The two-space gaps reproduce the output of the former print statements.
    print("Number of the attribute before:  %s" % X.shape[1])
    print("number of the attribute after: %s" % c.shape[1])

    # selecting k best attribute instead of chi2, f_classif can also be used
    skb = SelectKBest(chi2, k=10)
    skb.fit(X, y)
    selected_names = np.asarray(attributeNames)[skb.get_support()]
    print("Best attribute choosen with SelectKBest: ")
    for position, att in enumerate(selected_names, 1):
        print("%s :  %s" % (position, att))

    # using ExtraTreesClassifier
    print("Using feature importance...")
    etc = ExtraTreesClassifier()
    etc.fit(X, y)
    print(etc.feature_importances_)
    print(etc.max_features)
    print(etc.max_depth)

    print("Recursive feature selection : ")
    import sklearn.linear_model as lm
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV

    # 'accuracy' scoring and stratified folds need a classifier; a linear
    # regressor's continuous predictions cannot be scored with accuracy.
    estim = lm.LogisticRegression()
    rfecv = RFECV(estimator=estim, step=1, cv=StratifiedKFold(y, 2),
                  scoring='accuracy')
    rfecv.fit(X, y)
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
def feature_selection_RFE_draft(fn, ax=None, sel="all", goal="Linebreak", isclass=True, verbosity=0, nf=7):
    """Select features with RFECV and order them by a full RFE ranking.

    Args:
        fn: forwarded to ``data_prepare``.
        ax: matplotlib-style axes to draw on; when ``None`` no plot is made
            (previously this raised ``AttributeError``).
        sel: forwarded to ``data_prepare``.
        goal: forwarded to ``data_prepare``.
        isclass: when true use the 'svm' classifier from ``get_clf`` with
            'f1' scoring and 2-fold stratified CV; otherwise ``RidgeCV``
            with 'mean_squared_error' scoring and ``cv=3``.
        verbosity: values above 1 print the feature names;
            ``verbosity - 1`` is forwarded to ``data_prepare``.
        nf: not used by this function; kept for interface compatibility.

    Returns:
        Tuple ``(best, ranks)``: ``best`` lists the RFECV rank-1 names in
        RFE order; ``ranks`` is the sorted list of ``(rank, name)`` pairs.
    """
    X, y, names = data_prepare(fn, sel=sel, goal=goal, verbosity=verbosity - 1)
    if verbosity > 1:
        print("names: " + ",".join(names))

    if isclass:
        estimator = get_clf('svm')
        scoring = 'f1'
        cv = cross_validation.StratifiedKFold(y, 2)
    else:
        estimator = linear_model.RidgeCV()
        scoring = 'mean_squared_error'
        cv = 3

    rfecv = RFECV(estimator=estimator, step=1, cv=cv, scoring=scoring)
    # Warnings raised while fitting are deliberately silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        rfecv.fit(X, y)

    # Plot number of features VS. cross-validation scores
    if ax is not None:
        ax.set_xlabel("Number of features selected")
        ax.set_ylabel("Cross validation score ({})".format(scoring))
        ax.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

    best = names[rfecv.ranking_ == 1]

    # Eliminating down to one feature gives every feature a distinct rank.
    rfe = RFE(estimator, n_features_to_select=1)
    rfe.fit(X, y)
    ranks = sorted((round(rank, 4), name)
                   for rank, name in zip(rfe.ranking_, names))

    # reorder best using ranks
    best_set = set(best)
    best = [name for (_, name) in ranks if name in best_set]
    assert len(best) == len(best_set)
    return best, ranks
def select_optimal_features(feature_matrix, y, classifier):
    """Keep the columns of ``feature_matrix`` that RFECV ranks first.

    Args:
        feature_matrix: 2-D array of features, indexable as ``[:, cols]``.
        y: targets; also used to build the 5-fold ``StratifiedKFold``.
        classifier: estimator wrapped by RFECV.

    Returns:
        Tuple ``(selected_features, index)``: a float64 array holding the
        selected columns and the list of their column indices.
    """
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    cv = StratifiedKFold(y, 5)
    rfecv = RFECV(estimator=classifier, step=1, cv=cv, scoring="accuracy")
    print("going to select optimal features")
    rfecv.fit(feature_matrix, y)
    print("done selecting optimal features")
    print("Optimal number of features : %d" % rfecv.n_features_)

    # ranking_[i] is the rank of the i-th feature; selected features have
    # rank 1. Compare by value: ``is 1`` tested object identity.
    index = np.flatnonzero(rfecv.ranking_ == 1).tolist()
    print("index is" + str(index))

    # Copy the selected columns in one step, as float64 like before.
    selected_features = np.array(feature_matrix[:, index], dtype=np.float64)

    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    return selected_features, index
def recursiveFeatSelection():
    """Run RFECV (linear SVC, C=1024) on the svmlight training file.

    File locations come from the module-level ``svmPath``, ``trainFile``
    and ``testFile``. The test file is loaded with the training feature
    count but is not used further here.
    """
    X_train, y_train = load_svmlight_file(svmPath + "/" + trainFile)
    feature_count = X_train.shape[1]
    X_test, y_test = load_svmlight_file(svmPath + "/" + testFile,
                                        n_features=feature_count)

    classifier = svm.SVC(kernel='linear', C=1024.0)
    folds = StratifiedKFold(y_train, 2)
    selector = RFECV(estimator=classifier, step=1, cv=folds, scoring='f1')
    selector.fit(X_train, y_train)
    print("Optimal number of features : %d" % selector.n_features_)
def SelectFeatures(featuresStructuresArray, labels):
    """Return the names of the fields that RFECV keeps.

    Args:
        featuresStructuresArray: structured array whose ``dtype.names``
            are the feature names.
        labels: targets passed to ``RFECV.fit``.

    Returns:
        Array of the selected feature names.
    """
    # Keyword arguments keep the call valid where ``dual`` is keyword-only.
    estimator = LogisticRegression(penalty='l2', dual=False)
    featureNames = featuresStructuresArray.dtype.names
    featureData = castStructuredArrayToRegular(featuresStructuresArray)

    featuresSelector = RFECV(estimator, cv=8)
    featuresSelector.fit(featureData, labels)

    selectedIndices = featuresSelector.get_support()
    return np.array(featureNames)[selectedIndices]
def _decision_tree_suite(tag, data_train, data_test, target_train, target_test):
    """Run the gini/entropy tree experiments and RFECV for one dataset."""
    print("---%s---" % tag)

    # RFECV wraps the first gini tree; it is fitted after the tree runs.
    base_clf = tree.DecisionTreeClassifier(criterion="gini")
    rfecv = RFECV(base_clf, cv=10)
    _decision_tree(base_clf, data_train, data_test, target_train,
                   target_test, tag + "_gini")
    for depth in DEPTHS:
        clf = tree.DecisionTreeClassifier(criterion="gini", max_depth=depth)
        _decision_tree(clf, data_train, data_test, target_train,
                       target_test, tag + "_gini" + str(depth))

    clf = tree.DecisionTreeClassifier(criterion="entropy")
    _decision_tree(clf, data_train, data_test, target_train, target_test,
                   tag + "_entropy")
    for depth in DEPTHS:
        clf = tree.DecisionTreeClassifier(criterion="entropy",
                                          max_depth=depth)
        _decision_tree(clf, data_train, data_test, target_train,
                       target_test, tag + "_entropy" + str(depth))

    rfecv.fit(data_train, target_train)
    print(rfecv.support_)
    print(rfecv.ranking_)
    print(rfecv.score(data_test, target_test))

    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()


def decision_tree():
    """Run the decision-tree experiments on the "bc" and "v" datasets.

    Each dataset gets gini and entropy trees (unbounded, then one per
    entry of ``DEPTHS``) followed by an RFECV fit, printout and plot.
    Datasets are read from module-level ``bc_*`` and ``v_*`` variables.
    """
    _decision_tree_suite("bc", bc_data_train, bc_data_test,
                         bc_target_train, bc_target_test)
    _decision_tree_suite("v", v_data_train, v_data_test,
                         v_target_train, v_target_test)
def lr_with_fs():
    """Logistic regression with RFECV feature selection.

    Submission: lr_with_fs_0703_01.csv
    E_val:
    E_in:
    E_out:

    Scales the training data, fits (or loads from cache) an RFECV
    selector, rescales the pruned features, fits ``LogisticRegressionCV``
    and dumps the whole pipeline as a submission.
    """
    from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    import pylab as pl

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    # Reuse a cached selector when one exists; fitting is the slow step.
    pkl_path = Path.of_cache('lr_with_fs.RFECV.pkl')
    rfe = IO.fetch_cache(pkl_path)
    if rfe is None:
        rfe = RFECV(estimator=LogisticRegression(class_weight='auto'),
                    cv=StratifiedKFold(y, 5), scoring='roc_auc')
        rfe.fit(X_scaled, y)
        IO.cache(rfe, pkl_path)

    print("Optimal number of features : %d" % rfe.n_features_)

    # Plot number of features VS. cross-validation scores
    pl.figure()
    pl.xlabel("Number of features selected")
    pl.ylabel("Cross validation score (AUC)")
    pl.plot(range(1, len(rfe.grid_scores_) + 1), rfe.grid_scores_)
    # savefig takes the format from the extension; '.refcv' is not an
    # image format, so save as PNG.
    pl.savefig('lr_with_fs.rfecv.png')

    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_new, y)

    print('CV scores: %s' % clf.scores_)
    print('Ein: %f' % Util.auc_score(clf, X_new, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('rfe', rfe),
                                 ('scale_new', new_scaler),
                                 ('lr', clf)]), 'lr_with_fs_0703_01')
def get_best(self):
    """Finds the optimal number of features

    Fits RFECV (linear SVC, 2-fold stratified CV, log-loss scoring) on
    ``self.x_train`` / ``self.y_train``.

    :return: optimal number of features and ranking
    """
    # Log-loss scoring needs class probabilities, which SVC only provides
    # when probability=True.
    svc = SVC(kernel="linear", probability=True)
    rfecv = RFECV(
        estimator=svc,
        step=1,
        cv=StratifiedKFold(self.y_train, 2),
        scoring="log_loss"
    )
    rfecv.fit(self.x_train, self.y_train)
    return rfecv.n_features_, rfecv.ranking_
def feature_selection_rfecv(x, y):
    """Fit RFECV around a decision tree and plot its grid scores.

    Args:
        x: features.
        y: targets; also used to build the 2-fold ``StratifiedKFold``.
    """
    dtc = DecisionTreeClassifier()
    # The "accuracy" scoring is proportional to the number of correct classifications
    rfecv = RFECV(estimator=dtc, step=1, cv=StratifiedKFold(y, 2),
                  scoring='accuracy')
    rfecv.fit(x, y)
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print('Optimal number of features: %d' % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel('Number of features selected')
    plt.ylabel('Cross validation score (nb of correct classifications)')
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
def adjust_optimal_features_using_recursive_feature_elimination(class_name, training_set):
    """Return the numerical characteristics that RFECV ranks first.

    Args:
        class_name: key looked up in each sample's ``"classes"`` mapping
            to obtain its label.
        training_set: iterable of samples; iterated twice.

    Returns:
        List of entries of the module-level ``numerical_characteristics``
        whose RFECV ranking equals 1.
    """
    # Build real lists: both are consumed more than once (by the CV
    # splitter and by fit), which a one-shot map iterator cannot support.
    class_names = [sample["classes"][class_name] for sample in training_set]
    numerical_characteristics_training_set = [
        select_numerical_characteristics(sample) for sample in training_set
    ]

    svc = SVC(kernel="linear")
    # The "accuracy" scoring is proportional to the number of correct classifications
    # ToDo: check if class has more than one representant always
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(class_names, 2),
                  scoring='accuracy')
    rfecv.fit(numerical_characteristics_training_set, class_names)

    optimal_features_indexes = [i for i, rank in enumerate(rfecv.ranking_)
                                if rank == 1]
    print("Optimal number of features : %d" % rfecv.n_features_)
    return [numerical_characteristics[i] for i in optimal_features_indexes]
def run_feature_elimination(outdir, bdts, x, y, setup):
    """Run RFECV for each BDT, plot the result and write a ranking log.

    For the n-th estimator the log goes to
    ``<outdir>/bdt-<n>/log-feature-elimination.txt``.
    """
    logging.info("starting feature selection")
    for n, bdt in enumerate(bdts):
        # new in 18.1: n_jobs=NJOBS could be passed to RFECV here
        selector = RFECV(estimator=bdt, step=1, cv=CV, scoring='roc_auc')
        selector.fit(x, y)
        plot_feature_elimination(outdir, selector, n)

        report = [
            u'Feature selection\n=================\n\n',
            u'optimal feature count: {}\n\nranking\n-------\n'.format(
                selector.n_features_),
        ]
        for position, variable in enumerate(setup["variables"]):
            report.append(
                u'{:30}: {:>5}\n'.format(variable,
                                         selector.ranking_[position]))

        log_path = os.path.join(outdir, "bdt-{}".format(n),
                                "log-feature-elimination.txt")
        with codecs.open(log_path, "w", encoding="utf8") as fd:
            fd.write(u''.join(report))
def rfe_cross_validate(X, y):
    """Fit RFECV around a logistic regression and plot its grid scores."""
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    folds = StratifiedKFold(y, 2)
    selector = RFECV(estimator=LogisticRegression(), step=1, cv=folds,
                     scoring='accuracy')
    selector.fit(X, y)

    # plot it
    cv_scores = selector.grid_scores_
    x_positions = range(1, len(cv_scores) + 1)
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(x_positions, cv_scores)
    plt.show()
"metric": 'auc', "verbosity": -1, 'reg_alpha': 0.2899927210061127, 'reg_lambda': 0.4485237330340494, 'random_state': 53 } # In[9]: clf = lgb.LGBMClassifier(**params) #(n_splits=6, shuffle=False) 'accuracy', 'binary_logloss', 'precision', 'recall' rfe = RFECV(estimator=clf, step=10, cv=5, scoring='roc_auc', verbose=2) # In[10]: rfe.fit(train_x, train_y) # In[11]: for col in train_x.columns[rfe.ranking_ == 1]: print(col) # In[14]: most_influential = pd.DataFrame( [col for col in train_x.columns[rfe.ranking_ == 1]], columns=['features']) most_influential.to_csv('Import_feature.csv') # In[8]: most_influential = pd.read_csv('Inputs/Import_feature.csv')
# Notebook-style script fragment: evaluates earlier predictions, then runs
# RFECV around a logistic regression and plots the cross-validation scores.
score
accuracy=accuracy_score(y_test, classes)
t2=pd.DataFrame(classes)
# Bare expressions; their results are not stored.
pd.DataFrame(y_test2).describe()
pd.DataFrame(classes).describe()
# Blend of 2 models
import matplotlib.pyplot as plt
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import RFECV
# Create the RFE object and compute a cross-validated score.
# Despite its name, ``svc`` holds a logistic regression.
svc = LogisticRegression(C=0.6)
# Scoring here is 'log_loss', not accuracy.
rfecv = RFECV(estimator=svc,scoring='log_loss')
rfecv.fit(train,targets_tr)
print("Optimal number of features : %d" % rfecv.n_features_)
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
# NOTE(review): the label mentions correct classifications, but the scores
# plotted come from the 'log_loss' scorer - confirm the intended wording.
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
def go(self, all_data, cols, colsP):
    """Run several feature selectors for SalePrice and combine their picks.

    Rows with ``SalePrice > 0`` are robust-scaled and fed to: p-value backward
    elimination (``self.backwardElimination``), RFECV with Lasso, sequential
    feature selection, SelectKBest with two scorers, and XGBoost importance
    thresholds. A report is printed along the way.

    ``cols`` is used as a pandas Index (``.values``, ``.difference``) --
    presumably the feature column names; verify against the caller.

    Returns a 4-tuple: a ``DataObject`` built from the instance's
    training/testing/combined data, the list of all selected features united
    with ``colsP``, the RFECV-selected columns, and the XGBoost-selected
    column names.
    """
    has_price = all_data.SalePrice > 0
    train = all_data.loc[has_price, cols].reset_index(drop=True, inplace=False)
    y_train = all_data.SalePrice[has_price].reset_index(drop=True,
                                                        inplace=False)
    test = all_data.loc[(all_data.SalePrice == 0),
                        cols].reset_index(drop=True, inplace=False)

    scale = RobustScaler()
    df = pd.DataFrame(scale.fit_transform(train[cols]), columns=cols)

    # --- 1. backward elimination on OLS p-values ---
    ln_model = sm.OLS(y_train, df)
    result = ln_model.fit()
    print(result.summary2())
    SL = 0.051
    pv_cols, LR = self.backwardElimination(df, y_train, SL, cols.values)
    pred = LR.predict(df[pv_cols])
    y_pred = pred.apply(lambda x: 1 if x > 0.5 else 0)
    print('Fvalue: {:.6f}'.format(LR.fvalue))
    print('MSE total on the train data: {:.4f}'.format(LR.mse_total))

    # --- 2. recursive feature elimination with CV ---
    lasso_params = dict(alpha=0.0005, max_iter=161, selection='cyclic',
                        tol=0.002, random_state=101)
    ls = Lasso(**lasso_params)
    rfecv = RFECV(estimator=ls, n_jobs=-1, step=1,
                  scoring='neg_mean_squared_error', cv=5)
    rfecv.fit(df, y_train)
    select_features_rfecv = rfecv.get_support()
    RFEcv = cols[select_features_rfecv]
    print('{:d} Features Select by RFEcv:\n{:}'.format(
        rfecv.n_features_, RFEcv.values))

    # --- 3. sequential feature selection scored by R2 ---
    score = r2_score
    ls = Lasso(**lasso_params)
    sbs = SequentialFeatureSelection(ls, k_features=1, scoring=score)
    sbs.fit(df, y_train)
    print('Best Score: {:2.2%}\n'.format(max(sbs.scores_)))
    best_subset_size = len(
        list(df.columns[sbs.subsets_[np.argmax(sbs.scores_)]]))
    print('Best score with:{0:2d}.\n'.format(best_subset_size))
    # among subsets tied for the top score, take the one with the largest index
    tied = np.arange(0, len(sbs.scores_))[(sbs.scores_ == max(sbs.scores_))]
    SBS = list(df.columns[list(sbs.subsets_[max(tied)])])
    print('\nBest score with {0:2d} features:\n{1:}'.format(len(SBS), SBS))

    # --- 4. univariate selection: F-test, then mutual information ---
    skb = SelectKBest(score_func=f_regression, k=80)
    skb.fit(df, y_train)
    select_features_kbest = skb.get_support()
    kbest_FR = cols[select_features_kbest]
    scores = skb.scores_[select_features_kbest]

    skb = SelectKBest(score_func=mutual_info_regression, k=80)
    skb.fit(df, y_train)
    select_features_kbest = skb.get_support()
    kbest_MIR = cols[select_features_kbest]
    scores = skb.scores_[select_features_kbest]

    # --- 5. XGBoost importance thresholds ---
    X_train, X_test, y, y_test = train_test_split(df, y_train,
                                                  test_size=0.30,
                                                  random_state=101)
    xgb_params = dict(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
                      gamma=0, max_delta_step=0, random_state=101,
                      min_child_weight=1, missing=None, n_jobs=4,
                      scale_pos_weight=1, seed=None, silent=True, subsample=1)
    model = XGBRegressor(**xgb_params)
    model.fit(X_train, y)

    # try every distinct importance value as a cut-off, keep the lowest MSE
    thresholds = np.sort(np.unique(model.feature_importances_))
    best = 1e36
    colsbest = 31
    my_model = model
    threshold = 0
    for thresh in thresholds:
        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)

        selection_model = XGBRegressor(**xgb_params)
        selection_model.fit(select_X_train, y)

        select_X_test = selection.transform(X_test)
        y_pred = selection_model.predict(select_X_test)
        predictions = [round(value) for value in y_pred]
        r2 = r2_score(y_test, predictions)
        mse = mean_squared_error(y_test, predictions)
        print(
            "Thresh={:1.3f}, n={:d}, R2: {:2.2%} with MSE: {:.4f}".format(
                thresh, select_X_train.shape[1], r2, mse))
        if (best >= mse):
            best = mse
            colsbest = select_X_train.shape[1]
            my_model = selection_model
            threshold = thresh

    feature_importances = [
        (score, feature)
        for score, feature in zip(model.feature_importances_, cols)
    ]
    top_importances = sorted(feature_importances, reverse=True)[:colsbest]
    XGBest = pd.DataFrame(sorted(top_importances),
                          columns=['Score', 'Feature'])
    XGBestCols = XGBest.iloc[:, 1].tolist()

    # --- combine: union and intersection of all selections ---
    bcols = set(pv_cols).union(set(RFEcv)).union(set(kbest_FR)).union(
        set(kbest_MIR)).union(set(XGBestCols)).union(set(SBS))
    intersection = set(SBS).intersection(set(kbest_MIR)).intersection(
        set(RFEcv)).intersection(set(pv_cols)).intersection(
            set(kbest_FR)).intersection(set(XGBestCols))
    print(intersection, '\n')
    print('_' * 75, '\nUnion All Features Selected:')
    print('Total number of features selected:', len(bcols))
    print('\n{0:2d} features removed if use the union of selections: {1:}'.
          format(len(cols.difference(bcols)), cols.difference(bcols)))

    totalCols = list(bcols.union(set(colsP)))

    return DataObject(self.trainingData, self.testingData,
                      self.combinedData), totalCols, RFEcv, XGBestCols
# Synthetic 8-class problem: 25 features, 3 informative and 2 redundant.
X, y = make_classification(
    n_samples=1000,
    n_features=25,
    n_informative=3,
    n_redundant=2,
    n_repeated=0,
    n_classes=8,
    n_clusters_per_class=1,
    random_state=0,
)

# Recursive feature elimination around a linear SVC, scored by accuracy
# (proportional to the number of correct classifications) on 2 stratified folds.
svc = SVC(kernel="linear")
rfecv = RFECV(estimator=svc,
              step=1,
              cv=StratifiedKFold(2),
              scoring='accuracy')
rfecv.fit(X, y)

# Report the result
print("Optimal number of features : %d" % rfecv.n_features_)

# 1-based positions of the selected features
support = np.argwhere(rfecv.support_ == True) + 1
print("Optimal index of features : {}".format(support))

# Left panel: number of features vs. cross-validation score
plt.figure()
plt.subplot(1, 2, 1)
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
if full_report:
    # One record per classifier: run time and the features RFECV kept.
    dic_rfecv_alg = []
    for clf in [dtc, abc, rfc]:
        t000 = time()

        # RFECV scored by F1, dropping one feature per step
        rfecv = RFECV(
            estimator=clf,
            step=1,
            cv=StratifiedShuffleSplit(fullset_labels, 10, random_state=15),
            scoring='f1',
        )

        # Fit on the full feature set
        rfecv_clf = rfecv.fit(fullset_features, fullset_labels)
        rfecv_best = rfecv_clf.estimator_

        # full_features_list[1:] is paired with the support mask; the first
        # entry is skipped (presumably the label name -- verify)
        rfecv_features_list = []
        for (x, y) in zip(full_features_list[1:], rfecv.support_):
            if y:
                rfecv_features_list.append(x)

        dic_rfecv_alg.append({
            "algorithm": clf,
            "run_time": round(time() - t000, 2),
            "rfecv_features_list": rfecv_features_list
        })
        #### Plot number of features VS. cross-validation scores
# Default: drop 10 features per RFECV iteration. For user 'stone' the data is
# cut to the first 2000 rows and step becomes the fraction 0.1.
step = 10
if getpass.getuser() == 'stone':
    train_df = train_df[:2000]
    step = 0.1
print('数据量:', train_df.shape)

# ---------------- feature selection ------------------
xgb_estimator = XGBClassifier(objective='binary:logistic',
                              nthread=-1,
                              seed=36)
print('XGB交叉验证迭代筛选特征...')
selector = RFECV(xgb_estimator, step=step, cv=5, scoring='roc_auc')
selector.fit(train_df[predictors], train_df[target])

# Apply the fitted selector to the full feature frame
print('筛选最优特征...')
selected_features = selector.transform(all_features_df[predictors])
support = pd.Series(selector.support_, index=predictors)

# ------------- save the selection mask --------------
print("存储选择结果中:")
print('位置:../../data/data_4/selection_result_xgb.csv')
support.to_csv('../../data/data_4/selection_result_xgb.csv', encoding='gbk')

# ---------------- report elapsed time --------------------
time_spend = time.time() - time_begin
print('\n运行时间:%d 秒,约%d分钟\n' % (time_spend, time_spend // 60))
def main():
    """Train a linear SVM on the cleaned dataset and run RFECV on it.

    Loads the CSV, holds out 10% of the rows for testing, prints a
    classification report and confusion matrix for the plain SVM, then plots
    5-fold stratified cross-validated accuracy against the number of features
    kept. Finishes by printing the elapsed wall-clock time.
    """
    started = time.time()

    # load the data and drop the unnamed index column
    dataset = '/home/markg/Documents/TCD/ML/ML1819--task-107--team-11/cleanData.csv'
    data = pd.read_csv(dataset, encoding='latin-1')
    data.drop(columns=['Unnamed: 0'], inplace=True)

    # independent and dependent variables
    X = data.drop('gender_catg', axis=1)
    Y = data['gender_catg']

    # 90% training, 10% testing
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.10)

    # baseline model (the kernel could be changed here)
    classifier = SVC(C=1, gamma=0.3, kernel='linear')
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    # recursive feature selection using cross validation
    rfecv = RFECV(estimator=classifier,
                  step=1,
                  cv=StratifiedKFold(5),
                  scoring='accuracy')
    rfecv.fit(X_train, y_train)
    print("Optimal number of features : %d" % rfecv.n_features_)
    print("Feature ranking: ", rfecv.ranking_)

    # number of features vs. cross-validation score
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.title("SVM")
    plt.ylabel("Accuracy 5-Fold Cross validation Score")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    finished = time.time()
    print("Time taken:", finished - started)
def roc_auc_with_multi_labels(estimator, X, y, n_classes=3, weight='micro'):
    """Scorer: micro- or macro-averaged ROC AUC for multi-label targets.

    ROC is originally defined for binary classification; for the multi-label
    extension see https://www.jianshu.com/p/00ef5b63dfc8

    Parameters
    ----------
    estimator : fitted model whose ``predict_proba(X)`` yields one
        two-column probability array per label.
    X : samples to score.
    y : iterable of strings, each a comma-separated list of integer label
        indicators (e.g. ``"1,0,0"``).
    n_classes : number of per-label curves averaged in macro mode.
    weight : ``'micro'`` or ``'macro'``.

    Returns
    -------
    The requested average AUC.

    Raises
    ------
    ValueError
        If ``weight`` is neither ``'micro'`` nor ``'macro'``. This is checked
        before any prediction is made.
    """
    if weight not in ('macro', 'micro'):
        raise ValueError('"weight" is not in ("macro", "micro")')

    y_pred = estimator.predict_proba(X)
    # keep the positive-class probability of every label -> (samples, labels)
    y_pred_ = np.array([_[:, 1] for _ in y_pred]).T
    y_test = np.array([list(map(int, _.split(','))) for _ in y])

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(y_test.shape[1]):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred_[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # micro-average: pool every (label, sample) decision into one curve
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred_.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    if weight == 'micro':
        return roc_auc["micro"]

    # macro-average: interpolate each per-label curve on the union of all
    # false-positive rates, then average the curves
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    return roc_auc["macro"]


def plot_roc_cv(Scores):
    """Plot min/max/mean (+/- std) of RFECV scores per feature count.

    ``Scores`` is converted to a 2-D array with one row per repetition and one
    column per number of features; statistics are taken down the rows.
    """
    Scores_arr = np.array(Scores)
    n_fea = Scores_arr.shape[1]
    xs = range(1, n_fea + 1)
    col_max = Scores_arr.max(axis=0)
    col_min = Scores_arr.min(axis=0)
    col_mean = Scores_arr.mean(axis=0)
    col_std = Scores_arr.std(axis=0)

    fig, ax = plt.subplots()
    ax.fill_between(xs, col_max, col_min, color='skyblue')
    ax.plot(xs, col_max, '-*', lw=2, color='orange', label='max')
    ax.plot(xs, col_min, '-*', lw=2, color='blue', label='min')
    ax.plot(xs, col_mean, '--', lw=1, color='green', label='mean')
    ax.plot(xs, col_mean + col_std, '--', lw=1.5, color='darksalmon',
            label='mean + std')
    ax.plot(xs, col_mean - col_std, '--', lw=1.5, color='darkviolet',
            label='mean - std')
    # horizontal guides at the overall best and worst score
    ax.plot(xs, [max(col_max)] * n_fea, '--', lw=1, color='lawngreen')
    ax.plot(xs, [min(col_min)] * n_fea, '--', lw=1, color='lawngreen')
    plt.legend(loc='upper right', fontsize=7)
    plt.ylim([0, 1])
    plt.xlim([1, 9])
    plt.xlabel('feature num')
    plt.ylabel('AUC')
    # regular ticks plus the overall extremes
    yticks = [_/10 for _ in range(0, 11, 2)]
    yticks.append(round(max(col_max), 3))
    yticks.append(round(min(col_min), 3))
    yticks = sorted(yticks)
    plt.yticks(yticks, yticks)
    plt.title('Feature importance selected by RFECV \n with RF(100 tree)')
    plt.show()


if __name__ == '__main__':
    df = pd.read_csv('feature_label.csv', sep='\t')
    X = df[df.columns[:-2]]
    y = df['Transfer'].values
    N = 100
    Scores = []
    Best_fea_num = []
    Ranks = []
    t0 = time.time()
    # repeat RFECV on N differently shuffled copies of the data
    for i in range(1, N+1):
        y1 = label_binarize(y, classes=['Normal', 'I_III', 'IV'])
        X1, y1 = shuffle(X, y1, random_state=i)
        # encode each indicator row as "a,b,c" so the scorer can split it
        y1 = list(map(lambda x: '{},{},{}'.format(*x), y1))
        model = RandomForestClassifierRefcv(n_estimators=100)
        # NOTE(review): roc_auc_with_multi_labels_binary is not defined in
        # this part of the file -- confirm it exists elsewhere.
        rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(6),
                      n_jobs=-1, scoring=roc_auc_with_multi_labels_binary)
        rfecv.fit(X1, y1)
        Best_fea_num.append(rfecv.n_features_)
        Scores.append(rfecv.grid_scores_)
        Ranks.append(rfecv.ranking_)
        if i % 10 == 0:
            t1 = time.time()
            print('finish {} round test in {} sec'.format(i, t1-t0))
            t0 = t1
    plot_roc_cv(Scores)
#%%
# Univariate selection: keep the 3 features with the best chi2 score.
select_feature = SelectKBest(chi2, k=3).fit(x_train, y_train)
x_train_chi = select_feature.transform(x_train)
x_test_chi = select_feature.transform(x_test)
# %%
lr_chi_model = clf_lr.fit(x_train_chi, y_train)
# %%
# Recursive feature elimination with the default number of features to keep.
rfe = RFE(estimator=clf_lr, step=1)
rfe = rfe.fit(x_train, y_train)
x_train_rfe = rfe.transform(x_train)
x_test_rfe = rfe.transform(x_test)
# NOTE(review): fit() returns clf_lr itself, so lr_chi_model and lr_rfe_model
# are the same object and both reflect this last fit -- confirm intended.
lr_rfe_model = clf_lr.fit(x_train_rfe, y_train)
# %%
rfecv = RFECV(estimator=clf_lr, step=1, cv=5, scoring='accuracy')
rfecv = rfecv.fit(x_train, y_train)
#%%
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validated Accuracy")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_, '-o')
plt.title("Recursive Feature Elimination with Cross Validation")
# Save before show(): once the window is closed the current figure may be
# empty, so saving afterwards can write a blank image.
plt.savefig("D:\\Autumn 2019\\Research\\Feature_Extracting\\StimOn")
plt.show()
# %%
def get_new_mask_2(X, y):
    """Return the RFECV support mask for a linear regression on (X, y).

    One feature is removed per step and subsets are compared with 2-fold
    cross-validation.
    """
    selector = RFECV(LinearRegression(), step=1, cv=2).fit(X, y)
    return selector.support_
def get_new_mask(X, y, model=LinearRegression()):
    """Return the RFECV support mask for ``model`` fitted on (X, y).

    One feature is removed per step and subsets are compared with 3-fold
    cross-validation. ``model`` defaults to a linear regression.
    """
    fitted = RFECV(model, step=1, cv=3).fit(X, y)
    return fitted.support_
# Distribution of each weather feature, one figure apiece.
plt.figure()
sns.distplot(train['wind_speed'])
plt.figure()
sns.distplot(train['wind_direction'])
plt.figure()
sns.distplot(train['precipitation'])
plt.figure()
sns.distplot(train['temp'])

# Feature selection
features = [
    'wind_speed', 'wind_direction', 'temp', 'precipitation', 'hour', 'month'
]

# Scale the target by its maximum over the whole frame.
max_power = train['Power'].max()
train['Power'] = train['Power'].divide(max_power)

# Ordered split (no shuffling): first 85% of the rows become the training set.
train, test = train_test_split(train, train_size=0.85, shuffle=False)

scaler = StandardScaler()
X_train = scaler.fit_transform(train[features].values)
# The target is divided again by the maximum of the training rows.
y_train = train['Power'].divide(train['Power'].max()).values

# RFECV with LightGBM; print() with one argument runs on Python 2 and 3.
model = lgb.LGBMRegressor()
rfe = RFECV(estimator=model, cv=10, step=1)
rfe = rfe.fit(X_train, y_train)
print(rfe.support_)

# Same selection with XGBoost.
model = xgb.XGBRegressor()
rfe = RFECV(estimator=model, cv=10, step=1)
rfe = rfe.fit(X_train, y_train)
print(rfe.support_)
'NumCharacter' ]]) test_X.to_csv('test_cleaned.csv') train_X.to_csv('train_X_cleaned.csv') train_Y.to_csv('train_Y_cleaned.csv') from sklearn.feature_selection import RFECV from sklearn.linear_model import LinearRegression X_train, X_test, y_train, y_test = train_test_split(train_X, train_Y, test_size=0.2, random_state=0) model = LinearRegression() rfecv = RFECV(model, step=1, scoring='neg_mean_squared_error') rfecv.fit(X_train, y_train.values.ravel()) # Recursive feature elimination # Number of best features rfecv.n_features_ # The number of best features is 56, which means based on recursvie cross validation result , we should use every feature. rfecv.support_ rfecv.ranking_ #Reduced X_test and X_train to the selected features rfecv.transform(X_train) #Use the current model to predict X_test value ypred1 = rfecv.predict(X_test) mean_squared_error(y_test.values, ypred1) # 0.07404738031954539 #Maybe try another feature selection method
# Random forest: defaults except tree count and depth (see the readme in the
# repository for a further explanation).
forest = RandomForestRegressor(n_estimators=1000, max_depth=10)
forest.fit(X, Y)  # this will take time
rf = forest.feature_importances_

# Pair every column of X with its importance and keep the above-average ones,
# sorted from most to least important.
l_rf = list(zip(X, rf))
df_rf = pd.DataFrame(l_rf, columns=["Features", "Gini"])
df_rf = df_rf[(df_rf["Gini"] > df_rf["Gini"].mean())]
df_rf = df_rf.sort_values(by=["Gini"], ascending=False)
print(df_rf)

### Recursive Feature Elimination
# Keep only features present in both the PCA and forest tables (inner join;
# use "outer" or "left" for other options).
df_pca_rf = pd.merge(df_pca, df_rf, on="Features", how="inner")
pca_rf = df_pca_rf["Features"].tolist()
X = df_prep[pca_rf]  # predictors
Y = df_prep["quant"]  # quantitative outcome from the original data frame

# RFECV around a linear regression, never going below 5 features
# (see the readme for more info).
recursive = RFECV(estimator=LinearRegression(), min_features_to_select=5)
recursive.fit(X, Y)  # this will take time
rfe = recursive.support_  # boolean mask as a numpy array

# Table of column name vs. whether RFECV kept it; keep the selected rows.
l_rfe = list(zip(X, rfe))
df_rfe = pd.DataFrame(l_rfe, columns=["Features", "RFE"])
df_rfe = df_rfe[df_rfe.RFE == True]
print(df_rfe)

### Multiple Regression
pca_rf_rfe = df_rfe["Features"].tolist()
X = df_prep.filter(pca_rf_rfe)  # only the columns RFECV selected
Y = df_prep["quant"]
regression = LinearRegression()
regression.fit(X, Y)
coef = regression.coef_

# Table of column name vs. fitted coefficient.
l_reg = list(zip(X, coef))
df_reg = pd.DataFrame(l_reg, columns=["Features", "Coefficients"])
''' 1: Logistic Regression_v1'''
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression

# Baseline: fit on all features and predict on the same data.
mod_lr = LogisticRegression()
mod_lr.fit(X, y)
y_pred = mod_lr.predict(X)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics

confusion_matrix(y, y_pred)
metrics.accuracy_score(y, y_pred)

# Cross-validated recursive elimination, keeping at least 10 features.
rfe = RFECV(mod_lr, min_features_to_select=10, step=1, cv=5)
fit = rfe.fit(X, y)
print("Num Attribute: %d" % fit.n_features_)
print("Selected Attribute: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

# Attribute names are the columns of df from position 2 onwards; rank 1 marks
# a selected attribute.
feature_imp = pd.DataFrame({
    'Attribute': df.iloc[:, 2:].columns.tolist(),
    'Select': fit.support_,
    'Rank': fit.ranking_
}).sort_values(by='Rank', ascending=True)
pickup = feature_imp[feature_imp['Rank'] == 1]['Attribute']

# Rebuild X from the selected attributes only.
X = df.loc[:, pickup].values
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt

# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000,
                           n_features=25,
                           n_informative=3,
                           n_redundant=2,
                           n_repeated=0,
                           n_classes=8,
                           n_clusters_per_class=1,
                           random_state=0)

# Create the RFE object and compute a cross-validated score.
svc = SVC(kernel="linear")
# The "accuracy" scoring is proportional to the number of correct
# classifications.
rfecv = RFECV(estimator=svc,
              step=1,
              cv=StratifiedKFold(y, 2),
              scoring='accuracy')
rfecv.fit(X, y)

# print() with a single argument behaves the same on Python 2 and Python 3.
print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
def main():
    """Compare LDA on the UCI HAR data with and without RFECV selection.

    Loads the train/test files, obtains a feature mask (read from
    ``LDA-features-mask.out`` when that file exists, otherwise computed with
    RFECV and saved there), then trains LDA on the selected and on the full
    feature set, printing the metrics and drawing the mask plus both
    confusion matrices in one figure.
    """

    def banner(title):
        # section header: the title framed by two runs of 20 asterisks
        print('*' * 20, title, '*' * 20)

    # load the data
    banner("程序开始-读取数据")
    X_Train = commonFunc.parseFile('../UCI HAR Dataset/train/X_train.txt')
    Y_Train = commonFunc.parseFile(
        '../UCI HAR Dataset/train/y_train.txt').flatten()
    X_Test = commonFunc.parseFile('../UCI HAR Dataset/test/X_test.txt')
    Y_Test = commonFunc.parseFile(
        '../UCI HAR Dataset/test/y_test.txt').flatten()
    activityLabels = [
        'WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING',
        'STANDING', 'LAYING'
    ]
    print("数据读取完成~\n")

    # parameters
    banner("设置参数表")
    print("LDA:")
    SOLVER = 'svd'
    print("solver:{0}".format(SOLVER))
    print("RFE:")
    STEPSET = 5
    MINFEATURETOSET = 300
    CROSSVALIDATION = 20
    CPUCHANNEL = 6
    print(
        "estimate:LDA \t step:{0} \t min_feature_to_select:{1} \t CrossValidation:{2} \t CPUChannel:{3} \n"
        .format(STEPSET, MINFEATURETOSET, CROSSVALIDATION, CPUCHANNEL))

    # feature selection: reuse a saved mask when there is one
    banner("读取特征文件")
    maskSaveName = "LDA-features-mask.out"
    if (os.path.exists(maskSaveName)):
        print("存在特征文件,开始读取...")
        maskInteger = np.loadtxt(maskSaveName)
        mask = (maskInteger == 1)
        print("读取完成,准备显示...")
        print("特征选择数量: {0}".format(sum(mask == 1)))
    else:
        print("特征文件不存在~")
        print("开始特征选择...")
        start = perf_counter()
        estimator = LDA(solver=SOLVER)
        selector = RFECV(estimator,
                         step=STEPSET,
                         min_features_to_select=MINFEATURETOSET,
                         cv=CROSSVALIDATION,
                         n_jobs=CPUCHANNEL)
        selector = selector.fit(X_Train, Y_Train)
        mask = selector.get_support()
        print("特征选择完成!")
        print("用时 {0:.2f}mins".format((perf_counter() - start) / 60))
        print("特征选择数量: {0}".format(sum(mask == 1)))
        np.savetxt(maskSaveName, mask, fmt='%d')

    # top panel: the mask drawn as a one-row image
    plt.figure(figsize=(14, 14))
    plt.subplot(2, 2, (1, 2))
    plt.imshow(mask.reshape(1, -1), cmap='tab20c_r')
    plt.title("Feature Selected: {0}".format(sum(mask == 1)),
              fontsize=14,
              y=2.5)
    plt.ylim([-5, 5])
    plt.xlabel("Feature Index(Deeper Color means Selected)", fontsize=10)
    print('\n')

    # LDA on the selected features
    banner("特征选择后的数据结果")
    X_Train_selected = X_Train[:, mask]
    X_Test_selected = X_Test[:, mask]
    clf_selected = LDA(solver=SOLVER)
    clf_selected.fit(X_Train_selected, Y_Train)
    Y_predict_selected = clf_selected.predict(X_Test_selected)
    prec_selected, rec_selected, f_score_selected = commonFunc.checkAccuracy(
        Y_Test, Y_predict_selected)
    print("训练结果:")
    print("准确率:{0}\n召回率:{1}\nF1度量:{2}".format(prec_selected, rec_selected,
                                                f_score_selected))

    # confusion matrix, bottom-left panel
    plt.subplot(2, 2, 3)
    cm = commonFunc.createConfusionMatrix(Y_predict_selected, Y_Test)
    plot_confusion_matrix(cm,
                          activityLabels,
                          normalize=False,
                          title='Selected_F Confusion matrix')
    print('\n')

    # LDA on the original, unselected features
    banner("特征选择前的数据结果")
    clf = LDA(solver=SOLVER)
    clf.fit(X_Train, Y_Train)
    Y_predict = clf.predict(X_Test)
    prec, rec, f_score = commonFunc.checkAccuracy(Y_Test, Y_predict)
    print("训练结果:")
    print("准确率:{0}\n召回率:{1}\nF1度量:{2}".format(prec, rec, f_score))

    # confusion matrix, bottom-right panel
    plt.subplot(2, 2, 4)
    cm = commonFunc.createConfusionMatrix(Y_predict, Y_Test)
    plot_confusion_matrix(cm,
                          activityLabels,
                          normalize=False,
                          title='All_F Confusion matrix')
    plt.show()
# Maps the best RFE cross-validation score of each run to the index of its C
dict_1 = {}
# Maps that index to the fitted RFECV object
dict_2 = {}
time_prebucle = time.time()
# Iterate over the candidate values of C
for i, c in enumerate(C):
    time_temp1 = time.time()
    clf_temp = SVC(C=c,
                   kernel=kernel,
                   class_weight=class_weight,
                   random_state=random_state)
    rfecv_temp = RFECV(clf_temp, cv=skf, scoring=scoring)
    rfecv_temp.fit(X, y)
    # With the default step of 1 and minimum of 1 feature, grid_scores_[k] is
    # the score for k + 1 features, so the score of the selected subset sits
    # at n_features_ - 1. Indexing with n_features_ read the wrong entry and
    # raised IndexError whenever every feature was kept.
    dict_1[rfecv_temp.grid_scores_[rfecv_temp.n_features_ - 1]] = i
    dict_2[i] = rfecv_temp
    time_temp2 = time.time()
    print(f'Time iteration {i}: {time_temp2-time_temp1}')
time_bucle = time.time()
print(f'Time loop: {time_bucle-time_prebucle}')

# Pick the run with the highest score (equal scores keep the later index)
maximo = max(dict_1)
indice_maximo = dict_1[maximo]
rfecv = dict_2[indice_maximo]
best_c = rfecv.estimator_.get_params()['C']
print(f'Best C: {best_c}')
# Print the resulting number of features
import warnings
warnings.filterwarnings('ignore')
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn import preprocessing

# Predictors are columns 5..104 of the table; the target is age_at_cnb.
intersect_fnc_l = pd.read_csv(
    "/data/mialab/users/bray14/rfecv_fnc/intersect_fnc_ica_2_filtered.csv")
X = intersect_fnc_l.iloc[:, 5:105]
target = intersect_fnc_l['age_at_cnb']

mm_scaler = preprocessing.MinMaxScaler()
X_train_minmax = mm_scaler.fit_transform(X)

# Linear SVR inside RFECV, scored by negative RMSE over 10 folds.
# NOTE(review): StratifiedKFold stratifies on class labels; confirm it is
# appropriate for this numeric target.
Svr_linear = SVR(kernel='linear')
rfecv = RFECV(estimator=Svr_linear,
              step=1,
              cv=StratifiedKFold(10),
              scoring='neg_root_mean_squared_error')
rfecv.fit(X_train_minmax, target)
print('Optimal number of features: {}'.format(rfecv.n_features_))

# Table of selected column, its coefficient in the final estimator and its
# rank, sorted by coefficient (largest first) and written to CSV.
coef_rmse_rfecv = rfecv.estimator_.coef_
rfecv_rmse_featureCoeff = pd.DataFrame()
rfecv_rmse_featureCoeff['attr'] = X.columns[rfecv.support_]
rfecv_rmse_featureCoeff['coefficient'] = coef_rmse_rfecv.transpose(1, 0)
rfecv_rmse_featureCoeff['rank'] = rfecv.ranking_[rfecv.support_]
rfecv_rmse_featureCoeff = rfecv_rmse_featureCoeff.sort_values(
    by='coefficient', ascending=False)
rfecv_rmse_featureCoeff.to_csv('rfecv_rmse_featureCoeff.csv',
                               encoding='utf-8',
                               index=False,
                               na_rep='NA')

plt.figure(figsize=(16, 9))
plt.title('Recursive Feature Elimination with Cross-Validation',
          fontsize=18,
          fontweight='bold',
          pad=20)
plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
# Evaluator built around the regressor bound to reg_ledroit at this point
# (the name is rebound to a new model just below).
ledroit_Evaluator = FeaturesEvaluator.FeaturesEvaluator(
    reg_ledroit, X_train_ledroit, y_train_ledroit, X_test_ledroit,
    y_test_ledroit)

params = {
    'n_estimators': 128,
    'max_depth': 7,
    'min_samples_split': 5,
    'learning_rate': 0.01,
    'loss': 'ls'
}
reg_ledroit = ensemble.GradientBoostingRegressor(**params)

# 5-fold RFECV that never goes below 30 features.
selector = RFECV(reg_ledroit,
                 step=1,
                 cv=5,
                 verbose=3,
                 min_features_to_select=30)
selector = selector.fit(X_train_ledroit, y_train_ledroit)
selector.support_
selector.grid_scores_

# Columns that RFECV rejected.
X_to_remove = X_train_ledroit.keys()[np.logical_not(selector.support_)]

# Fit and predict with the full feature set.
reg_ledroit.fit(X_train_ledroit, y_train_ledroit)
predict_ledroit = reg_ledroit.predict(X_test_ledroit)

# Training data reduced to the selected features.
test = pd.DataFrame(selector.transform(X_train_ledroit))
def Find(df, quant, path="", title=""):
    """Select features for a quantitative outcome and export the result.

    Pipeline: drop sparse columns, median-impute and standard-scale the rest,
    shortlist features by PCA loadings, shortlist by random-forest importance,
    intersect the two lists, refine with RFECV around a linear regression, fit
    a final linear regression, and write the merged table to
    ``path + title + "_fp_v1.4_quant.csv"``.

    Parameters
    ----------
    df : pandas DataFrame holding the features and the outcome column. It is
        not modified; the outcome is read and a reduced copy is used.
    quant : name of the outcome column in ``df``.
    path, title : prefix pieces of the output CSV file name.

    Returns
    -------
    None. Intermediate tables are printed.
    """
    import pandas as pd
    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.feature_selection import RFECV
    from sklearn.impute import SimpleImputer
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import StandardScaler

    # Separate the outcome without mutating the caller's frame.
    pop = df[quant]
    df = df.drop(columns=[quant])

    # Keep columns with at least 75% non-NA values, then impute and scale.
    df = df.dropna(axis=1, thresh=0.75 * len(df))
    df = pd.DataFrame(SimpleImputer(strategy="median").fit_transform(df),
                      columns=df.columns)
    df = pd.DataFrame(StandardScaler().fit_transform(df.values),
                      columns=df.columns)
    df = df.dropna()  # should be a no-op after imputation; kept as a check

    # --- PCA shortlist ---
    degree = len(df.columns) - 1
    pca = PCA(n_components=degree)
    pca.fit(df)
    df_comp = pd.DataFrame(pca.explained_variance_)
    df_comp = df_comp[(df_comp[0] > 1)]  # components with eigenvalue above 1
    components = len(df_comp.index) - 1
    pca = PCA(n_components=components)
    pca.fit_transform(df)
    # One row per component, one column per original feature.
    df_pc = pd.DataFrame(pca.components_, columns=df.columns)
    df_pc["Variance"] = pca.explained_variance_ratio_
    df_pc = df_pc[df_pc["Variance"] > df_pc["Variance"].mean()]
    df_pc = df_pc.abs()
    df_pc = df_pc.drop(columns=["Variance"])
    # Largest absolute loading of each feature over the retained components.
    df_pca = pd.DataFrame(df_pc.max(), columns=["MaxEV"])
    df_pca = df_pca[df_pca.MaxEV > df_pca.MaxEV.mean()]
    df_pca = df_pca.reset_index()
    df_pca = df_pca.rename(columns={"index": "Features"})
    print(df_pca)

    # --- random-forest shortlist ---
    # NOTE(review): insert aligns on index labels and df now has a fresh
    # 0..n-1 index -- confirm the input frame is indexed the same way.
    df.insert(0, "quant", pop)
    X = df.drop(columns=["quant"])
    Y = df["quant"]
    forest = RandomForestRegressor(n_estimators=1000, max_depth=10)
    forest.fit(X, Y)  # this will take time
    rf = forest.feature_importances_
    l_rf = list(zip(X, rf))  # (column name, importance) pairs
    df_rf = pd.DataFrame(l_rf, columns=["Features", "Gini"])
    df_rf = df_rf[(df_rf["Gini"] > df_rf["Gini"].mean())]
    print(df_rf)

    # --- features on both shortlists, refined by RFECV ---
    df_pca_rf = pd.merge(df_pca, df_rf, on="Features", how="inner")
    pca_rf = df_pca_rf["Features"].tolist()
    X = df[pca_rf]
    Y = df["quant"]
    recursive = RFECV(estimator=LinearRegression(), min_features_to_select=5)
    recursive.fit(X, Y)  # this will take time
    rfe = recursive.support_
    l_rfe = list(zip(X, rfe))  # (column name, kept?) pairs
    df_rfe = pd.DataFrame(l_rfe, columns=["Features", "RFE"])
    df_rfe = df_rfe[df_rfe.RFE == True]
    print(df_rfe)

    # --- final multiple regression on the surviving features ---
    pca_rf_rfe = df_rfe["Features"].tolist()
    X = df.filter(pca_rf_rfe)
    Y = df["quant"]
    regression = LinearRegression()
    regression.fit(X, Y)
    coef = regression.coef_
    l_reg = list(zip(X, coef))  # (column name, coefficient) pairs
    df_reg = pd.DataFrame(l_reg, columns=["Features", "Coefficients"])
    print(df_reg)

    # Merge loadings, importances and coefficients, then export.
    df_final = pd.merge(df_pca_rf, df_reg, on="Features", how="inner")
    print(df_final)
    df_final.to_csv(path + title + "_fp_v1.4_quant.csv")
# Fit the best algorithm to the data. clf.fit(X_train_cross, Y_train_cross) predictions = clf.predict(X_test_cross) print(accuracy_score(Y_test_cross, predictions)) plot_variable_importance(X_train_cross, Y_train_cross) print(clf.score(X_train_cross, Y_train_cross), clf.score(X_test_cross, Y_test_cross)) rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(Y_train_cross, 2), scoring='accuracy') rfecv.fit(X_train_cross, Y_train_cross) # KFold def run_kfold(clf): kf = KFold(891, n_folds=10) outcomes = [] fold = 0 for train_index, test_index in kf: fold += 1 X_train_cross, X_test_cross = X_train.values[ train_index], X_train.values[test_index] Y_train_cross, Y_test_cross = Y_train.values[ train_index], Y_train.values[test_index] clf.fit(X_train_cross, Y_train_cross) predictions = clf.predict(X_test_cross)
import numpy
import pickle

# Print floats without scientific notation.
numpy.set_printoptions(suppress=True)

JOBS = 10
SEED = 0

# Column dtypes for the feature file: V1..V28 plus Amount, all float32.
types = {f'V{i}': 'float32' for i in range(1, 29)}
types['Amount'] = 'float32'

X = pandas.read_csv('./data/features.csv', header=0, dtype=types)
y = pandas.read_csv('./data/target.csv', header=0, dtype={'Class': 'int32'})

rf = RandomForestClassifier(random_state=SEED)
xgb = XGBClassifier(random_state=SEED)

# Recursive feature elimination around the XGBoost model, one feature per
# step, scored by precision over 5 shuffled stratified folds.
selector = RFECV(
    estimator=xgb,
    step=1,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED),
    n_jobs=JOBS,
    verbose=10,
    scoring='precision',
    min_features_to_select=1,
)

filename = './artifacts/rfe_precision_xgb.pkl'

# The target frame is flattened to 1-D before fitting.
rfe = selector.fit(X.to_numpy(), y.to_numpy().reshape(-1, ))

# Use a context manager so the file handle is flushed and closed even if
# pickling fails (the handle was previously opened inline and never closed).
with open(filename, 'wb') as artifact:
    pickle.dump(obj=rfe, file=artifact)
plt.show()

#--------------------------------------------------------
""" Feature Engineering Logistic Regression — Feature Selection """
#--------------------------------------------------------
from sklearn.feature_selection import RFECV

# Recursive feature elimination around a logistic regression: drop one
# feature per step and score every subset by accuracy with `strat_k_fold`.
logreg_model = LogisticRegression()
rfecv = RFECV(logreg_model, step=1, cv=strat_k_fold, scoring='accuracy')
rfecv.fit(X, y)

# Plot the cross-validation score against the number of features kept.
# NOTE(review): `grid_scores_` is no longer available in recent scikit-learn
# releases (replaced by `cv_results_`) -- confirm the installed version.
plt.figure()
plt.title('Logistic Regression CV score vs No of Features')
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(
    range(1, len(rfecv.grid_scores_) + 1),
    rfecv.grid_scores_,
)
plt.show()

# (feature name, selected?) pairs, then the names of the selected features.
feature_importance = list(zip(feature_names, rfecv.support_))
new_features = [name for name, selected in feature_importance if selected]

print(new_features)
from sklearn.model_selection import KFold

### PARSING
df0 = pd.read_csv('RAs_and_rescue.csv').drop(columns=['genotype']).set_index('names')

### SCALING RELATIVE ABUNDANCES
# Take row and column labels from the frame that is actually scaled.
# Relabelling with df0.columns[:-1] is only correct when 'group' happens to
# be the last column; otherwise the scaled features get the wrong names.
feature_df = df0.drop(columns='group')
scaler = StandardScaler()
df = pd.DataFrame(
    scaler.fit_transform(feature_df),
    index=feature_df.index,
    columns=feature_df.columns,
)
# Re-attach the class label; after this merge 'group' is the last column.
df = df.merge(df0[['group']], left_index=True, right_index=True)

### SVM-RFE
# Linear SVM with recursive feature elimination; n_splits equals the number
# of rows, i.e. every fold holds out a single row.
clf = svm.SVC(kernel='linear')
rfe = RFECV(clf, step=1, cv=KFold(n_splits=df.shape[0]), min_features_to_select=1, n_jobs=40)
rfe.fit(df.drop(columns=['group']), df['group'])

# RETURNING RESULTS
# Average of the CV scores over all evaluated feature-subset sizes.
print('\nscore= ', sum(rfe.grid_scores_)/len(rfe.grid_scores_))
print('\n')

# Map the positions of the retained features back to their column names.
cols = rfe.get_support(indices=True)
cols = [feature_df.columns[i] for i in cols]

# Coefficients of the final SVM, one per retained feature.
# NOTE(review): coef_[0] takes the first row only -- assumes a two-class problem; confirm.
coeffs = pd.DataFrame()
coeffs['svm_coeff'] = list(rfe.estimator_.coef_[0])
coeffs['names'] = cols
print(coeffs.set_index('names').sort_values(by='svm_coeff'))
print(len(coeffs),' features kept')
coeffs.to_csv('svm_output.csv')
#on very high-dimensional datasets.
data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import RFECV

# Recursive feature elimination around a logistic regression: one feature is
# removed per step and each subset is scored by precision.
lr = LogisticRegression()
rfecv = RFECV(
    estimator=lr,
    step=1,
    cv=StratifiedKFold(labels, 50),
    scoring='precision',
)
rfecv.fit(features, labels)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
# NOTE(review): the y-label mentions "correct classifications", but the
# metric requested above is precision.
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(
    range(1, len(rfecv.grid_scores_) + 1),
    rfecv.grid_scores_,
)
plt.show()

# Payment ratio of poi = salary/total_payment, gives the ratio of above values.

#popping out outliers:
# The default of 0 makes each pop a no-op when the key is already absent.
for outlier_key in ('TOTAL', 'THE TRAVEL AGENCY IN THE PARK'):
    data_dict.pop(outlier_key, 0)

# Engineered features , this could give some more insights on usage of available data:
X = dataset.data
y = dataset.target
features = dataset.feature_names


def _cv_mse(model, data, target):
    """Return the 5-fold cross-validated MSE of *model*, one value per fold.

    The scorer yields negated MSE values, so the sign is flipped back here.
    """
    negated = cross_val_score(
        model, data, target, cv=5, scoring="neg_mean_squared_error")
    return -1.0 * negated


#==============================================================================
# CV MSE before feature selection
#==============================================================================
est = LinearRegression()
score = _cv_mse(est, X, y)
print("CV MSE before feature selection: {:.2f}".format(np.mean(score)))

#==============================================================================
# CV MSE after feature selection: RFE
#==============================================================================
# Select features with cross-validated recursive elimination, then re-score
# the plain estimator on the retained columns only.
rfe = RFECV(est, cv=5, scoring="neg_mean_squared_error")
rfe.fit(X, y)
score = _cv_mse(est, X[:, rfe.support_], y)
print("CV MSE after RFE feature selection: {:.2f}".format(np.mean(score)))

#==============================================================================
# CV MSE after feature selection: Feature Importance
#==============================================================================
# Keep the columns whose random-forest importance exceeds 0.01.
rf = RandomForestRegressor(n_estimators=500, random_state=SEED)
rf.fit(X, y)
support = rf.feature_importances_ > 0.01
score = _cv_mse(est, X[:, support], y)
print("CV MSE after Feature Importance feature selection: {:.2f}".format(
    np.mean(score)))
# Load libraries
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model

# Silence warnings raised from scipy whose message starts with "internal gelsd"
warnings.filterwarnings(
    "ignore",
    message="^internal gelsd",
    module="scipy",
)

# Generate a features matrix and a target vector: 10000 samples, 100 features,
# of which 2 are informative (only two values are unpacked from the call)
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

# Create a linear regression
ols = linear_model.LinearRegression()

# Recursively eliminate features, one per step, scoring each subset by
# negative mean squared error
rfecv = RFECV(ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target)

# Reduce the matrix to the selected features (the result is not stored)
rfecv.transform(features)
def feature_selection_classifier(features, labels, estimator=None):
    """Fit a cross-validated recursive feature eliminator and return it.

    Args:
        features: Training samples passed straight to ``RFECV.fit``.
        labels: Targets passed straight to ``RFECV.fit``.
        estimator: Model whose features are eliminated. When ``None``, a
            linear-kernel ``SVC`` with ``C=0.1`` and ``gamma=1`` is used.

    Returns:
        The fitted ``RFECV`` object (one feature removed per step, scored by
        accuracy with ``StratifiedKFold(10)``, ``n_jobs=-1``).
    """
    base_model = (
        estimator
        if estimator is not None
        else SVC(kernel='linear', C=0.1, gamma=1)
    )
    selector = RFECV(
        estimator=base_model,
        step=1,
        cv=StratifiedKFold(10),
        scoring='accuracy',
        n_jobs=-1,
    )
    selector.fit(features, labels)
    return selector
plt.title('ROC')
plt.show()
# Bare names: presumably left so an interactive session echoes their values.
score
lg_auc
#%%
from utility_functions import generate_result_csv
res = {}
#svm = fit_by_label(la, fea, 10, params)
# NOTE(review): `svm` is only assigned in the commented-out line above, so it
# must already exist from an earlier cell -- confirm.
res[la] = predict_by_label(la, fea, svm)
generate_result_csv(res, 'result_d5.csv')
#%%
# Cross-validated recursive feature elimination around a linear-kernel SVC.
# NOTE(review): the positional 2 is presumably RFECV's `step`; newer
# scikit-learn releases may require it as a keyword -- confirm.
clf = SVC(kernel='linear')
selector = RFECV(clf, 2)
selector.fit(X_train, y_train)
#%%
# Keep only the columns selected above, then fit and score an RBF-kernel SVC
# using the gamma and C stored in gs.best_params_.
X_train_red = X_train[:, selector.support_]
X_test_red = X_test[:, selector.support_]
clf = SVC(kernel='rbf', gamma=gs.best_params_['gamma'], C=gs.best_params_['C'])
clf.fit(X_train_red, y_train)
clf.score(X_test_red, y_test)
#%%
#la1 = 'Dog_4'
#fea1 = ['band_power']
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
#from sklearn.linear_model import LogisticRegressionCV
#X_train, X_test, y_train, y_test = generate_fit_data(la1, fea1, 1, cv=True)
#