def df5():
    """Demonstrate RFE on breast-cancer data padded with 50 noise features.

    Fits a random-forest-driven RFE keeping 40 features, times the fit,
    scores a logistic regression on the reduced data, and plots the
    selected-feature mask.  Relies on module-level `np` and `plt`.
    """
    from sklearn.feature_selection import RFE
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    import time
    from sklearn.linear_model import LogisticRegression
    # keep 40 of the 80 columns (30 real + 50 noise)
    select = RFE(RandomForestClassifier(n_estimators=100,random_state=42), n_features_to_select=40)
    cancer = load_breast_cancer()
    rng = np.random.RandomState(42)
    # 50 pure-noise columns appended to the real features
    noise = rng.normal(size=(len((cancer.data)), 50))
    X_w_noise = np.hstack([cancer.data, noise])
    X_train, X_test, y_train, y_test = train_test_split(X_w_noise, cancer.target, random_state=0, test_size=.5)
    start_time = time.time()
    select.fit(X_train,y_train)
    print("Estimated execution time: {} seconds".format((time.time()-start_time)))
    X_train_rfe = select.transform(X_train)
    X_test_rfe = select.transform(X_test)
    score = LogisticRegression().fit(X_train_rfe,y_train).score(X_test_rfe,y_test)
    print("Score: {:.3f}".format(score))
    # visualize which columns survived elimination (dark = kept)
    mask = select.get_support()
    plt.matshow(mask.reshape(1,-1),cmap='gray_r')
    plt.xlabel("Sample index")
    plt.show()
def feature_selection(self) -> None: """ Features selection """ ####################### # FEATURE SELECTION # selector = SelectPercentile(score_func=mutual_info_classif, percentile=100) # selector = RFE(estimator=LogisticRegression(max_iter=1500), n_features_to_select=15) selector = RFE(estimator=GradientBoostingClassifier(), n_features_to_select=15) self.log.info( f"[FEATURE SELECTION] Feature selection using {type(selector).__qualname__}" ) selector.fit(self.training.X, self.training.y) self.training.X = selector.transform(self.training.X) self.log.debug( f"[FEATURE SELECTION] Feature index after {type(selector).__qualname__}: {selector.get_support(indices=True)}" ) self.test.X = selector.transform(self.test.X) self.log.debug( f"[FEATURE SELECTION] Train shape after feature selection: {self.training.X.shape} | {self.training.y.shape}" ) self.log.debug( f"[FEATURE SELECTION] Test shape after feature selection: {self.test.X.shape} | {self.test.y.shape}" )
def Feature_Selection_Recursive(k, function, model, xtrain, xtest, ytrain, ytest):
    """Select k features via RFE driven by `function`, fit `model`, return a metrics dict.

    Returns a dict with the model/selector names, feature count, train/test
    scores, R2, RMSE and mean absolute error on the test split.
    """
    # n_features_to_select is keyword-only in scikit-learn >= 1.2; the old
    # positional call RFE(function, k) raises a TypeError there.
    selector = RFE(function, n_features_to_select=k)
    selector = selector.fit(xtrain, ytrain)
    xtrain = selector.transform(xtrain)
    xtest = selector.transform(xtest)
    clf = model
    clf.fit(xtrain, ytrain)
    # predict once; the original re-ran clf.predict(xtest) for each metric
    y_pred = clf.predict(xtest)
    log_detail = {
        # e.g. "<class 'sklearn.ensemble.RandomForestRegressor'>" -> "RandomForestRegressor"
        'Model': str(clf.__class__).split('.')[-1].replace("'>", ''),
        'Select-Method': 'Recursive',
        'Select-Function': str(function.__class__).split('.')[-1].replace("'>", ''),
        'Feature-Count': k,
        'Train-S': clf.score(xtrain, ytrain),
        'Test-S': clf.score(xtest, ytest),
        'R2': r2_score(ytest, y_pred),
        'RMSE': sqrt(mean_squared_error(ytest, y_pred)),
        'AE': mean_absolute_error(ytest, y_pred),
    }
    return log_detail
def determine_num_feat_for_selection(X_train, Y_train, max_num_features, alpha):
    """Choose the RFE feature count (multiples of 5) minimizing 5-fold CV MSE of Ridge.

    Returns the feature count whose mean validation MSE is lowest.
    Uses the module-level constant RAND as the CV seed.
    """
    nums = numpy.arange(5, max_num_features + 1, 5)
    scores = []
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=RAND)
    r = linear_model.Ridge(alpha=alpha)
    for n in nums:
        fold_mses = []
        for tr_index, val_index in kf.split(X_train, Y_train):
            X_tr, X_val = X_train[tr_index], X_train[val_index]
            Y_tr, Y_val = Y_train[tr_index], Y_train[val_index]
            # n_features_to_select is keyword-only in scikit-learn >= 1.2;
            # the old positional call RFE(r, n, step=1) raises TypeError there.
            selection = RFE(r, n_features_to_select=n, step=1).fit(X_tr, Y_tr)
            X_tr = selection.transform(X_tr)
            X_val = selection.transform(X_val)
            r.fit(X_tr, Y_tr)
            Y_pred = r.predict(X_val)
            # list.append is O(1); the original numpy.append copied the array each fold
            fold_mses.append(metrics.mean_squared_error(Y_val, Y_pred))
        scores.append(numpy.mean(fold_mses))
    min_score_index = numpy.argmin(scores)
    print('Selecting', nums[min_score_index], 'features')
    return nums[min_score_index]
def feature_selection():
    """Train an L1 linear SVM, RFE-select 20 features, refit an RBF SVM.

    Reads module-level train_vec/train_label/test_vec/test_label; returns
    the RFE-reduced train and test matrices.
    """
    print("Start training...")
    svc = svm.LinearSVC(C=0.01, penalty='l1', dual=False)
    svc.fit(train_vec, train_label.ravel())
    # NOTE(review): `tree` and `model` below are fitted/built but never used
    tree = ExtraTreesClassifier()
    tree.fit(train_vec, train_label.ravel())
    print("Training Accuracy:%.4f" % svc.score(train_vec, train_label))
    print("Testing Accuracy:%.4f" % svc.score(test_vec, test_label))
    model = SelectFromModel(svc, prefit=True)
    # keep the 20 features the linear SVM ranks highest
    rfe = RFE(svc, n_features_to_select=20, )
    rfe.fit(train_vec, train_label.ravel())
    X_train = rfe.transform(train_vec)
    X_test = rfe.transform(test_vec)
    print(X_train.shape)
    # refit a non-linear classifier on the reduced feature space
    clf = svm.SVC(C=0.9, kernel='rbf', gamma=80, decision_function_shape='ovo', )
    clf.fit(X_train, train_label.ravel())
    print("After feature selection...")
    print("Training Accuracy:%.4f" % clf.score(X_train, train_label))
    print("Testing Accuracy:%.4f" % clf.score(X_test, test_label))
    return X_train, X_test
def in46():
    """RFE demo on breast-cancer data with 50 added noise features.

    Plots the selected-feature mask, then compares logistic-regression
    accuracy on the full (noisy) data vs. the RFE-reduced data.
    Relies on module-level `np` and `plt`.
    """
    from sklearn.feature_selection import RFE
    from sklearn.ensemble import RandomForestClassifier
    # keep 40 of the 80 columns (30 real + 50 noise)
    select=RFE(RandomForestClassifier(n_estimators=100,random_state=42),n_features_to_select=40)
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    cancer = load_breast_cancer()
    rng = np.random.RandomState(42)
    noise = rng.normal(size=(len(cancer.data), 50))
    # print(cancer.data.shape) (596,30)
    x_w_noise = np.hstack([cancer.data, noise])
    x_train, x_test, y_train, y_test = train_test_split(x_w_noise, cancer.target, random_state=0, test_size=0.5)
    select.fit(x_train,y_train)
    # visualize which columns survived elimination (dark = kept)
    mask = select.get_support()
    plt.matshow(mask.reshape(1, -1), cmap='gray_r')
    plt.xlabel('sample index')
    plt.show()
    x_train_rfe=select.transform(x_train)
    x_test_rfe=select.transform(x_test)
    from sklearn.linear_model import LogisticRegression
    # baseline on noisy data vs. score after feature elimination
    print(LogisticRegression().fit(x_train, y_train).score(x_test, y_test))
    print(LogisticRegression().fit(x_train_rfe, y_train).score(x_test_rfe, y_test))
def lsvm_rfe(c, n_feat, trainX, trainy, testX):
    """Reduce train/test matrices to `n_feat` columns via linear-SVC RFE.

    The eliminator is fitted on the training split only; both matrices are
    projected onto the same selected columns.
    """
    base_clf = SVC(kernel="linear", C=c)
    eliminator = RFE(estimator=base_clf, n_features_to_select=n_feat, step=1)
    eliminator.fit(trainX, trainy)
    return eliminator.transform(trainX), eliminator.transform(testX)
def Feature_Optimization_RF(X_train, y_train, X_test, y_test):
    """Sweep RFE feature counts 1..n_features with random forests.

    For each feature count, RFE-selects that many columns, fits a fresh
    forest, and records accuracy / F1 / precision / recall (micro, macro,
    weighted) on the test split.  Returns the results DataFrame.
    """
    results = pd.DataFrame(
        columns=['Number of Features', 'Accuracy Score', 'Micro F1 Score', 'Macro F1 Score',
                 'Weighted F1 Score', 'Micro Precision Score', 'Macro Precision Score',
                 'Weighted Precision Score', 'Micro Recall Score', 'Macro Recall Score',
                 'Weighted Recall Score'])
    for index in np.arange(len(X_train.columns)):
        sel = RFE(RandomForestClassifier(random_state=42, n_jobs=-1), n_features_to_select=index + 1)
        sel.fit(X_train, y_train)
        x_train_rfe = sel.transform(X_train)
        x_test_rfe = sel.transform(X_test)
        model = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
        model.fit(x_train_rfe, y_train)
        # predict once per feature count; the original re-ran model.predict()
        # for every one of the ten metrics
        y_pred = model.predict(x_test_rfe)
        results.loc[index] = [index + 1,
                              round(accuracy_score(y_test, y_pred), 4),
                              round(f1_score(y_test, y_pred, average='micro'), 4),
                              round(f1_score(y_test, y_pred, average='macro'), 4),
                              round(f1_score(y_test, y_pred, average='weighted'), 4),
                              round(precision_score(y_test, y_pred, average='micro'), 4),
                              round(precision_score(y_test, y_pred, average='macro'), 4),
                              round(precision_score(y_test, y_pred, average='weighted'), 4),
                              round(recall_score(y_test, y_pred, average='micro'), 4),
                              round(recall_score(y_test, y_pred, average='macro'), 4),
                              round(recall_score(y_test, y_pred, average='weighted'), 4)]
    return results
def test_rfe():
    """RFE sanity test: dense and sparse inputs must select the same features.

    Pads iris with 6 noise columns; RFE with n_features_to_select=4 should
    recover exactly the 4 original columns, and the sparse path must match
    the dense path.
    """
    generator = check_random_state(0)
    iris = load_iris()
    # 4 informative iris columns + 6 Gaussian-noise columns
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    X_sparse = sparse.csr_matrix(X)
    y = iris.target
    # dense model
    clf = SVC(kernel="linear")
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    X_r = rfe.transform(X)
    clf.fit(X_r, y)
    assert len(rfe.ranking_) == X.shape[1]
    # sparse model
    clf_sparse = SVC(kernel="linear")
    rfe_sparse = RFE(estimator=clf_sparse, n_features_to_select=4, step=0.1)
    rfe_sparse.fit(X_sparse, y)
    X_r_sparse = rfe_sparse.transform(X_sparse)
    # RFE should have recovered exactly the original iris columns
    assert X_r.shape == iris.data.shape
    assert_array_almost_equal(X_r[:10], iris.data[:10])
    assert_array_almost_equal(rfe.predict(X), clf.predict(iris.data))
    assert rfe.score(X, y) == clf.score(iris.data, iris.target)
    # dense and sparse selections must agree
    assert_array_almost_equal(X_r, X_r_sparse.toarray())
class LinearRegSoccerGame:
    # create classifier for home and away
    # x - features list per game
    # y - 0 1 2 == home draw away
    def __init__(self, x, y):
        # RFE keeps the 102 most predictive feature columns
        clf = LinearRegression()
        self.rfe = RFE(estimator=clf, n_features_to_select=102)
        self.rfe.fit(x, y)
        best_features = self.rfe.transform(x)
        # NOTE(review): self.clf is constructed but never fitted anywhere in
        # the visible code (fit() below just returns self), so predict calls
        # in calculate_prob_for_test_group would fail — confirm against callers.
        self.clf = LinearRegression()
        # self.clf = ElasticNet(normalize=params['normalize'], fit_intercept=params['fit_intercept'], alpha =params['alpha'],
        # selection=params['selection'], l1_ratio=params['l1_ratio'], random_state=42).fit(x,y)
        self.X = best_features
        self.Y = y

    # return the probabilities per game in test group
    # x - test group
    def take_the_more_prob(self, x):
        # NOTE(review): `bet` is computed but discarded — this method always
        # returns None.
        bet = [round(k) for k in self.clf.predict(self.rfe.transform(x))]
        return

    def calculate_prob_for_test_group(self, x):
        # Map each regression output k (nominally 0=home, 1=draw, 2=away) to a
        # (home, draw, away) probability triple; non-integer outputs are
        # converted via inverse distance to each class, normalized to sum to 1.
        home = []
        draw = []
        away = []
        for k in self.clf.predict(self.rfe.transform(x)):
            if k == 0.0:
                home.append(1)
                draw.append(0)
                away.append(0)
            elif k == 1.0:
                home.append(0)
                draw.append(1)
                away.append(0)
            elif k == 2.0:
                home.append(0)
                draw.append(0)
                away.append(1)
            else:
                h = 1 / k
                d = 1 / abs(1 - k)
                a = 1 / abs(2 - k)
                norm_ = h + d + a
                home.append(h / norm_)
                draw.append(d / norm_)
                away.append(a / norm_)
        return home, draw, away

    def predict_proba(self, x):
        return [self.calculate_prob_for_test_group(x)]

    def fit(self, x, y):
        # no-op: training happens in __init__; kept for estimator-like API
        return self

    # return in how much games the result with more probability is equal to the real result
    # 1 return parameter: number of games
    # 2 return parameter: number of the probability right
    # 3 return parameter: the two above
    """def take_the_more_prob(self, x, cost_per_game, real_results):
def RFE_Feature(x_train,y_train,x_test,y_test):
    """RFE-reduce train/test feature matrices to 3 random-forest-ranked columns.

    Returns (x_train, y_train, x_test, y_test) with the feature matrices
    reduced; labels pass through unchanged.
    """
    from sklearn.feature_selection import RFE
    from sklearn.ensemble import RandomForestClassifier
    # define model
    rfc = RandomForestClassifier(n_estimators=100)
    rfe = RFE(estimator=rfc, n_features_to_select=3)
    # fit the model
    rfe.fit(x_train, y_train)
    # transform the data
    # BUG FIX: RFE.transform takes only X and returns only X; the original
    # called rfe.transform(x_train, y_train) and unpacked two values, which
    # raises a TypeError at runtime.
    x_train = rfe.transform(x_train)
    x_test = rfe.transform(x_test)
    return x_train,y_train,x_test,y_test
def subtest(model, XL, YL, XT, YT, feature_names):
    # Python 2 code (print statements). Drops the single least-important
    # feature via RFE, prints performance before/after, names the removed
    # feature, and returns the reduced matrices plus surviving feature names.
    nfeatures = XL.shape[1]
    # NOTE(review): positional n_features_to_select — fine for the sklearn of
    # this code's era, but keyword-only (TypeError) on sklearn >= 1.2
    rfe = RFE(model, nfeatures-1)
    print "BEFORE"
    model.fit(XL, YL)
    print_performance(YT, model.predict(XT))
    print "AFTER"
    rfe.fit(XL, YL)
    print_performance(YT, rfe.predict(XT))
    # the one feature with support_ == False is the one RFE eliminated
    print "REMOVED FEATURE %s" % (feature_names[np.where(rfe.support_==False)[0][0]])
    print ""
    return rfe.transform(XL), rfe.transform(XT), feature_names[rfe.support_]
def rfe(X, y, x_test):
    """Keep the 30 Lasso-RFE-selected columns of X and x_test.

    Returns the reduced matrices as DataFrames with the surviving column
    names preserved.
    """
    selector = RFE(Lasso(), n_features_to_select=30, step=1)
    selector = selector.fit(X, y)
    # BUG FIX: the original called selector.transform(...) but discarded both
    # results and returned the untouched inputs, so no selection ever happened.
    kept_cols = X.columns[selector.get_support()]
    X_sel = selector.transform(X)
    x_test_sel = selector.transform(x_test)
    return (pd.DataFrame(X_sel, columns=kept_cols, index=X.index),
            pd.DataFrame(x_test_sel, columns=kept_cols, index=x_test.index))
# def feature_union(X,x_test=None,verbose=False):
# return
def RFE_method(X_train, X_test, y_train, y_test):
    """Project both splits onto the 2 features a random-forest RFE selects.

    The selector is fitted on the training split only; labels are not
    modified (y_test is accepted for signature symmetry but unused).
    Returns the reduced (X_train, X_test).
    """
    forest = RandomForestClassifier(n_estimators=100)
    selector = RFE(estimator=forest, n_features_to_select=2)
    selector.fit(X_train, y_train)
    # transform() reduces only the feature matrices
    reduced_train = selector.transform(X_train)
    reduced_test = selector.transform(X_test)
    return reduced_train, reduced_test
def select_features_RFE_lasso(X, y, columns, iteration):
    """Fit LassoCV and report the 8 columns with the largest |coefficients|.

    Prints the coefficient magnitudes and the chosen names; returns the
    selected column names.  Note: despite the function name, selection is by
    coefficient magnitude — the RFE fit in the original was dead code whose
    result was never used (as were `idx_third`/`threshold`), so it has been
    removed.
    """
    clf = LassoCV(max_iter=iteration).fit(X, y)
    importance = np.abs(clf.coef_)
    print("importance")
    print(importance)
    # indices of the 8 largest-magnitude coefficients
    idx_features = (-importance).argsort()[:8]
    name_features = np.array(columns)[idx_features]
    print('Selected features RFE from LassoCV : {}'.format(name_features))
    return name_features
def feature_selection(func=DecisionTreeClassifier):
    """Split the data, keep the 5 RFE-best features, and build train/test sets.

    Each returned set is a list of rows [feature_1..feature_5, label];
    also returns the names of the selected feature columns.
    """
    features, labels = load_data()
    estimator = func()
    tr_x, te_x, tr_y, te_y = train_test_split(features, labels, test_size=0.3)
    # rfecv = RFECV(estimator=clf, step=1, cv=5, scoring='accuracy')
    selector = RFE(estimator=estimator, step=1, n_features_to_select=5)
    selector = selector.fit(tr_x, tr_y)
    print('Chosen best 5 feature by rfe:', tr_x.columns[selector.support_])
    feat_name = list(tr_x.columns[selector.support_])
    reduced_train = selector.transform(tr_x)
    reduced_test = selector.transform(te_x)
    # append the label as one extra column on the right
    data_set = np.hstack((reduced_train, np.array([[label] for label in tr_y])))
    test_set = np.hstack((reduced_test, np.array([[label] for label in te_y])))
    return data_set.tolist(), test_set.tolist(), feat_name
def recursiveFeatureSelector(classifier_model,train_data,train_labels,test_data,number_of_features):
    """RFE-reduce train and test data to `number_of_features` columns.

    Fits the eliminator on the training data only and applies the same
    projection to the test data; returns (train, test) reduced matrices.
    """
    # n_features_to_select is keyword-only in scikit-learn >= 1.2; the old
    # positional call RFE(model, n) raises a TypeError there.
    rfe = RFE(classifier_model, n_features_to_select=number_of_features)
    transformed_train_data = rfe.fit_transform(train_data,train_labels)
    transformed_test_data = rfe.transform(test_data)
    return transformed_train_data,transformed_test_data
def run_once(df_feature, df_label):
    """One-hot encode, split, oversample, RFE-select, fit logistic regression.

    Reports thresholded-probability accuracy, prints the selected feature
    names, and dumps the fitted model and column list to disk.  Uses the
    module-level best_nof_feature, lg_threshold and root_folder.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe, df_label, test_size=0.3, random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)
    # build model
    lg_regression = linear_model.LogisticRegression(solver='lbfgs')
    # n_features_to_select is keyword-only in scikit-learn >= 1.2; the old
    # positional call RFE(est, n) raises a TypeError there.
    rfe = RFE(lg_regression, n_features_to_select=best_nof_feature)
    rfe_train_x = rfe.fit_transform(train_x, train_y)
    rfe_test_x = rfe.transform(test_x)
    lg_regression.fit(rfe_train_x, train_y)
    labels = df_label.unique()
    # predict probs, then threshold the positive-class probability
    test_y_predict_probs = lg_regression.predict_proba(rfe_test_x)
    test_y_predict_prob = test_y_predict_probs[:, 1]
    prob_df = pd.DataFrame(test_y_predict_prob)
    prob_df['predict'] = np.where(prob_df[0] >= lg_threshold, 1, 0)
    get_accuracy("logistic regression predict_probs", test_y, prob_df['predict'], labels)
    # print features
    cols = list(df_ohe.columns)
    temp = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = temp[temp == True].index
    save_print("Top " + str(best_nof_feature) + " features are: ")
    save_print(selected_features_rfe)
    # dump model
    joblib.dump(lg_regression, root_folder + "lg_regression.pkl")
    save_print("lg_regression Model dumped!")
    joblib.dump(selected_features_rfe, root_folder + "lg_regression_cols.pkl")
    save_print("lg_regression models columns dumped!")
def rfe(self, n):
    """Reduce self.X in place to the `n` best features per RFE with self.clf."""
    # n_features_to_select is keyword-only in scikit-learn >= 1.2; the old
    # positional call RFE(self.clf, n) raises a TypeError there.
    rfe = RFE(self.clf, n_features_to_select=n)
    logger.info("Fitting RFE to data...")
    fit = rfe.fit(self.X, self.y)
    logger.info(f"RFE support: {fit.support_}")
    logger.info(f"RFE ranking: {fit.ranking_}")
    self.X = rfe.transform(self.X)
def run_rfe(df_feature, df_label):
    """Sweep RFE feature counts with a decision tree and plot precision/recall.

    For each count 1..max_feature_try_numbers, RFE-selects features, fits a
    tree, records the positive-class precision/recall, and finally plots both
    curves.  Uses module-level max_feature_try_numbers.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe, df_label, test_size=0.3, random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)
    # build model
    nof_list = np.arange(1, (max_feature_try_numbers + 1))
    class_1_precision_list = []
    class_1_recall_list = []
    for n in range(len(nof_list)):
        save_print("********Current nof features are: " + str(nof_list[n]))
        dc_tree = DecisionTreeClassifier(criterion='entropy', min_samples_split=20, random_state=99)
        # n_features_to_select is keyword-only in scikit-learn >= 1.2; the
        # old positional call RFE(est, n) raises a TypeError there.
        rfe = RFE(dc_tree, n_features_to_select=nof_list[n])
        rfe_train_x = rfe.fit_transform(train_x, train_y)
        rfe_test_x = rfe.transform(test_x)
        dc_tree.fit(rfe_train_x, train_y)
        labels = df_label.unique()
        # predict
        test_y_predict = dc_tree.predict(rfe_test_x)
        class_1_precision, class_1_recall = get_accuracy(
            "decision tree", test_y, test_y_predict, labels)
        class_1_precision_list.append(class_1_precision)
        class_1_recall_list.append(class_1_recall)
    plot_pre_recall(nof_list, class_1_precision_list, class_1_recall_list, 'decision tree')
def Recursive_Feature_Elimination(self, X_train, X_test, y_train, y_test, x, y, file_name = 'model.sav'):
    """Find the RFE feature count maximizing linear-regression test score.

    Sweeps 1..n-1 features on the given train/test split, refits on the full
    (x, y) with the best count, pickles the final model, writes the selected
    feature names to parameters_selection.txt, and returns those names.
    """
    nof_list = np.arange(1, len(x.columns))
    high_score = 0
    nof = 0
    score_list = []
    for n in range(len(nof_list)):
        model = LinearRegression()
        # n_features_to_select is keyword-only in scikit-learn >= 1.2; the
        # old positional call RFE(model, n) raises a TypeError there.
        rfe = RFE(model, n_features_to_select=nof_list[n])
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        score_list.append(score)
        if(score>high_score):
            high_score = score
            nof = nof_list[n]
    print("Optimum number of features: %d with score: %f" % (nof, high_score))
    cols = list(x.columns)
    model = LinearRegression()
    rfe = RFE(model, n_features_to_select=nof)
    X_rfe = rfe.fit_transform(x,y)
    model.fit(X_rfe,y)
    temp = pd.Series(rfe.support_,index = cols)
    selected_features_rfe = temp[temp==True].index
    # FIX: open the pickle target in a context manager — the original
    # pickle.dump(model, open(...)) leaked the file handle
    with open(file_name, 'wb') as model_file:
        pickle.dump(model, model_file)
    with open('parameters_selection.txt', 'w') as f:
        for item in selected_features_rfe:
            f.write("%s\n" % item)
    return selected_features_rfe
class RFE_RandomForestRegPrim(primitive):
    """Pipeline primitive: RFE feature selection driven by a random-forest regressor."""

    def __init__(self, random_state=0):
        super(RFE_RandomForestRegPrim, self).__init__(name='RFE_RandomForestReg')
        # numeric id of this primitive within the framework
        self.id = 44
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Feature ranking with recursive feature elimination with Random-Forest regressor. Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through a coef_ attribute or through a feature_importances_ attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached."
        self.hyperparams_run = {'default': True}
        # default n_features_to_select (half of the features in sklearn's default)
        self.selector = RFE(RandomForestRegressor())
        self.accept_type = 'c_r'

    def can_accept(self, data):
        # accepts continuous regression data (framework convention)
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # selection only makes sense with at least 3 feature columns
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        # boolean mask of kept columns -> surviving column names
        mask = self.selector.get_support(indices=False)
        final_cols = list(compress(cols, mask))
        output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=final_cols)
        final_output = {0: output}
        return final_output
def select_useful_features(X_scaled, y, k_features=k_features):
    """Sweep RFE feature counts with logistic regression and plot AUC/accuracy/F1.

    For each count in k_features, fits RFE, cross-scores the model on the
    reduced matrix, and plots the three score curves against the module-level
    `string_array` x-axis labels.
    """
    auc_scores = np.array([])
    accuracy_scores_RFE = []
    f1_scores_RFE = []
    for i in k_features:
        model = LogisticRegression(random_state=0)
        # FIX: step was `True` (accidentally relying on bool->int coercion)
        selector = RFE(model, n_features_to_select=i, step=1, verbose=True)
        selector.fit(X_scaled, y)
        # BUG FIX: the original transformed an undefined global `X`;
        # the matrix that was fitted is X_scaled
        X_selected = selector.transform(X_scaled)
        auc_score = compute_score(model, X_selected, y)
        auc_scores = np.append(auc_scores, auc_score)
        # renamed locals: the originals shadowed sklearn's accuracy_score/f1_score
        acc_value = compute_score_accuracy(model, X_selected, y)
        accuracy_scores_RFE.append(acc_value)
        f1_value = compute_score_f1(model, X_selected, y)
        f1_scores_RFE.append(f1_value)
    scores_df = pd.DataFrame(auc_scores, index=k_features, columns=['scores'])
    max_value = scores_df[max(scores_df.values)==scores_df.values].index.values[0]
    plt.plot(string_array, auc_scores, linewidth=1, color='black', marker='o', markersize=7, label='AUC')
    plt.plot(string_array, accuracy_scores_RFE, linewidth=1, marker='o', markersize=7, label='accuracy', color='red')
    plt.plot(string_array, f1_scores_RFE, linewidth=1, marker='o', markersize=7, label='f1')
    plt.axvline(x=10, color='red', linewidth=1, linestyle='--', label='chosen with {} features'.format(11))
    plt.ylabel('scoring', fontsize=20)
    plt.xlabel('number of features selected', fontsize=20)
    plt.title('Recursive feature elimination', fontsize=30)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.grid(linewidth=0.5)
    plt.legend(fontsize=14)
def run_once(df_feature, df_label):
    """One-hot encode, split, oversample, RFE-select, fit a decision tree.

    Reports accuracy, prints the selected feature names, and dumps the
    fitted tree and column list to disk.  Uses the module-level
    best_nof_feature and root_folder.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe, df_label, test_size=0.3, random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)
    # build model
    dc_tree = DecisionTreeClassifier(criterion='entropy', min_samples_split=20, random_state=99)
    # n_features_to_select is keyword-only in scikit-learn >= 1.2; the old
    # positional call RFE(est, n) raises a TypeError there.
    rfe = RFE(dc_tree, n_features_to_select=best_nof_feature)
    rfe_train_x = rfe.fit_transform(train_x, train_y)
    rfe_test_x = rfe.transform(test_x)
    dc_tree.fit(rfe_train_x, train_y)
    labels = df_label.unique()
    # predict
    test_y_predict = dc_tree.predict(rfe_test_x)
    get_accuracy("decision tree", test_y, test_y_predict, labels)
    # print features
    cols = list(df_ohe.columns)
    temp = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = temp[temp == True].index
    save_print("Top " + str(best_nof_feature) + " features are: ")
    save_print(selected_features_rfe)
    # dump model
    joblib.dump(dc_tree, root_folder + "dc_tree.pkl")
    save_print("dc_tree Model dumped!")
    joblib.dump(selected_features_rfe, root_folder + "dc_tree_cols.pkl")
    save_print("dc_tree models columns dumped!")
def run_rfe(df_feature, df_label):
    """Sweep RFE feature counts with logistic regression, plot precision/recall.

    For each count 1..max_feature_try_numbers, RFE-selects features, fits a
    logistic regression, thresholds the positive-class probability with the
    module-level lg_threshold, records precision/recall, and plots both.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe, df_label, test_size=0.3, random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)
    # build model
    nof_list = np.arange(1, (max_feature_try_numbers + 1))
    class_1_precision_list = []
    class_1_recall_list = []
    for n in range(len(nof_list)):
        save_print("********Current nof features are: " + str(nof_list[n]))
        lg_regression = linear_model.LogisticRegression(solver='lbfgs')
        # n_features_to_select is keyword-only in scikit-learn >= 1.2; the
        # old positional call RFE(est, n) raises a TypeError there.
        rfe = RFE(lg_regression, n_features_to_select=nof_list[n])
        rfe_train_x = rfe.fit_transform(train_x, train_y)
        rfe_test_x = rfe.transform(test_x)
        lg_regression.fit(rfe_train_x, train_y)
        labels = df_label.unique()
        # predict probs, then threshold the positive-class probability
        test_y_predict_probs = lg_regression.predict_proba(rfe_test_x)
        test_y_predict_prob = test_y_predict_probs[:, 1]
        prob_df = pd.DataFrame(test_y_predict_prob)
        prob_df['predict'] = np.where(prob_df[0] >= lg_threshold, 1, 0)
        class_1_precision, class_1_recall = get_accuracy(
            "logistic regression predict_probs", test_y, prob_df['predict'], labels)
        class_1_precision_list.append(class_1_precision)
        class_1_recall_list.append(class_1_recall)
    plot_pre_recall(nof_list, class_1_precision_list, class_1_recall_list, 'logistic regression')
def RFE():
    """RFE demo: print support/ranking for 5 features, then sweep 1..12 counts.

    Uses the module-level X and y.  Note this function shadows the RFE class
    at module level; the local import below restores the class inside the body.
    """
    from sklearn.feature_selection import RFE
    # BUG FIX: LinearRegression takes no data in its constructor — the
    # original LinearRegression(X, y) silently bound X/y to constructor
    # options instead of fitting anything.
    model = LinearRegression()
    #Initializing RFE model (n_features_to_select is keyword-only in sklearn >= 1.2)
    rfe = RFE(model, n_features_to_select=5)
    #Transforming data using RFE
    X_rfe = rfe.fit_transform(X, y)
    #Fitting the data to model
    model.fit(X_rfe, y)
    print(rfe.support_)
    print(rfe.ranking_)
    #no of features
    nof_list = np.arange(1, 13)
    high_score = 0
    #Variable to store the optimum features
    nof = 0
    score_list = []
    for n in range(len(nof_list)):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)
        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=nof_list[n])
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        score_list.append(score)
        if (score > high_score):
            high_score = score
            nof = nof_list[n]
    print("Optimum number of features: %d" % nof)
    print("Score with %d features: %f" % (nof, high_score))
def RFE_nof(df, target, normalize):
    """Return the RFE feature count (1..n-1) maximizing linear-regression R^2.

    Splits df (features = all columns except `target`) 70/30 and sweeps
    feature counts, returning the best-scoring count.
    """
    y = df[target]
    # FIX: positional axis argument (df.drop(target, 1)) was removed in pandas 2.0
    X = df.drop(columns=target)
    nof_list = np.arange(1, len(X.columns))
    high_score = 0
    #Variable to store the optimum features
    nof = 0
    score_list = []
    for n in range(len(nof_list)):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
        # NOTE(review): the `normalize` parameter was removed from
        # LinearRegression in scikit-learn 1.2 — callers pinned to older
        # sklearn only; confirm before upgrading.
        model = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=normalize)
        # n_features_to_select is keyword-only in scikit-learn >= 1.2
        rfe = RFE(model, n_features_to_select=nof_list[n])
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        score_list.append(score)
        if (score > high_score):
            high_score = score
            nof = nof_list[n]
    return nof
def RFE(X, y, num_features=4, classification_tasks=True, model=None):
    """
    Implements feature selection using Recursive Feature Elimination
    :return:
    """
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression, LinearRegression

    # remember the column labels before X is reduced to a bare array
    col_names = X.columns
    if not model:
        model = LogisticRegression() if classification_tasks else LinearRegression()
    selector = RFE(model, n_features_to_select=num_features)
    fitted = selector.fit(X, y)
    features = selector.transform(X)
    # rank 1 marks a kept feature, so ascending order puts the best first
    feature_scores = pd.DataFrame(fitted.ranking_, index=col_names, columns=["scores"])
    feature_scores = feature_scores.sort_values(by="scores", ascending=True)
    print("============== Feature scores - RFE ===========")
    print("Feature Ranking:\n {}".format(feature_scores))
    print("Selected Features: {}".format(features))
    return features, feature_scores
def fitpositiveLassoCVRFE(self):
    """Fit a positive-coefficient LassoCV inside RFE and evaluate a sub-portfolio.

    Selects globe.SecuritiesPerBasket features via RFE on (self.xdata,
    self.ydata), then builds and scores a subportfolio from the fitted
    estimator and the selected column indices.
    """
    cv = TimeSeriesSplit(n_splits=3)  # 3 is the default
    lassomodel = LassoCV(n_alphas=2, alphas=np.linspace(0.01, 0.1, num=2), fit_intercept=False, precompute=True,
                         max_iter=2000, cv=cv,
                         positive=True, random_state=9999, selection='random')
    # n_features_to_select is keyword-only in scikit-learn >= 1.2; the old
    # positional call RFE(lassomodel, n) raises a TypeError there.
    rfe = RFE(lassomodel, n_features_to_select=globe.SecuritiesPerBasket)
    fit = rfe.fit(self.xdata, self.ydata)
    print(fit)
    # column indices of the features RFE kept (support_ is a boolean mask)
    indices = [i for i, included in enumerate(fit.support_) if included]
    print('Indeces selected by RFE:')
    print(indices)
    lmodel = rfe.estimator_
    X = rfe.transform(self.xdata)
    port = subportfolio.subportfolio(lmodel, X, indices, self.ydata, 'positiveLassoCVRFE', 'RFE', self.plotmodelresults,
                                     self.plt, self.catdata)
    port.evaluatemodelaccuracy(self.figurenr, self.listofsubportfolios)
def optimal_number_of_features(X_train, y_train, X_test, y_test):
    '''
    optimal_number_of_features(X_train, y_train, X_test, y_test)

    RETURNS: number_of_features

    discover the optimal number of features, n, using our scaled x and y dataframes, recursive feature elimination and linear regression (to test the performance with each number of features).
    We will use the output of this function (the number of features) as input to the next function optimal_features, which will then run recursive feature elimination to find the n best features
    Shamelessly stolen from David Espinola
    '''
    number_of_attributes = X_train.shape[1]
    number_of_features_list = np.arange(1, number_of_attributes)
    high_score = 0  #Variable to store the optimum features
    number_of_features = 0
    score_list = []
    for n in range(len(number_of_features_list)):
        model = LinearRegression()
        # n_features_to_select is keyword-only in scikit-learn >= 1.2; the
        # old positional call RFE(model, n) raises a TypeError there.
        rfe = RFE(model, n_features_to_select=number_of_features_list[n])
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        score_list.append(score)
        if (score > high_score):
            high_score = score
            number_of_features = number_of_features_list[n]
    return number_of_features
def optimal_number_of_features(X, y):
    '''discover the optimal number of features, n, using our scaled x and y dataframes, recursive feature elimination and linear regression (to test the performance with each number of features).
    We will use the output of this function (the number of features) as input to the next function optimal_features, which will then run recursive feature elimination to find the n best features
    '''
    # NOTE(review): the parameters X and y are never used — the body reads the
    # module-level X_train/X_test/y_train/y_test; confirm whether the split
    # was meant to happen here before "fixing" it.
    number_of_attributes = X_train.shape[1]
    number_of_features_list = np.arange(
        1, number_of_attributes)  # len(features_range)
    # set "high score" to be the lowest possible score
    high_score = 0
    # variables to store the feature list and number of features
    number_of_features = 0
    score_list = []
    for n in range(len(number_of_features_list)):
        model = LinearRegression()
        # n_features_to_select is keyword-only in scikit-learn >= 1.2; the
        # old positional call RFE(model, n) raises a TypeError there.
        rfe = RFE(model, n_features_to_select=number_of_features_list[n])
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        score_list.append(score)
        if (score > high_score):
            high_score = score
            number_of_features = number_of_features_list[n]
    return number_of_features
class BackwardStepwise(object):
    """Backward stepwise feature selection via scikit-learn's RFE."""

    def __init__(self, n, estimator, step=100):
        """n: number of features to keep; estimator: model supplying feature
        weights; step: number of features removed per RFE iteration."""
        assert type(
            n
        ) is int and n > 0, "Invalid parameter type or value %s (number of features)" % n
        # BUG FIX: the step-validation message previously interpolated n
        # instead of step, reporting the wrong offending value.
        assert type(
            step
        ) is int and step > 0, "Invalid parameter type or value %s (step)" % step
        self.__estimator = estimator
        self.__n = n
        self.__step = step
        self.__model = RFE(self.__estimator, n_features_to_select=self.__n, step=self.__step)

    def score_features(self, X, Y):
        """Fit the RFE model and return the per-feature ranking (1 = selected)."""
        self.__model.fit(X, Y)
        return self.__model.ranking_

    def select_features(self, X):
        """Project X onto the selected feature subset."""
        return self.__model.transform(X)

    def __str__(self):
        return '''
        Backward stepwise feature selection:
        Top features selected: %s
        Step size: %s
        Estimator: %s
        ''' % (self.__n, self.__step, self.__estimator)
class LogReg:
    """
    Initialization sets the objects model, vectorizer, labels, and corpus variables.
    Initialization also performs the initial training for the model and vectorizer using the given reviews.
    """
    # NOTE(review): the default vectorizer/model instances below are created
    # once at class-definition time and shared by every instance constructed
    # without explicit arguments — confirm this sharing is intended.
    # NOTE(review): max_df=1 (an int) means "appear in at most 1 document",
    # which is unusually strict — verify it was not meant to be 1.0.
    def __init__(
            self,
            reviews,
            vectorizer = TfidfVectorizer(stop_words = 'english', max_df = 1, ngram_range = (1, 2)),
            model = LogisticRegression()
    ):
        self.model = model
        self.vectorizer = vectorizer
        # no n_features_to_select given: RFE keeps half the features by default
        self.selector = RFE(self.model, step = 100, verbose = 100)
        corpus = []
        labels = []
        # reviews are (label, {"text": ...}) pairs
        for review in reviews:
            corpus += [review[1]["text"]]
            labels += [review[0]]
        #setting variables for the object
        self.corpus = corpus
        self.labels = labels
        self.reviews = reviews
        X = self.vectorizer.fit_transform(self.corpus)
        # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
        # (replaced by get_feature_names_out) — confirm the pinned version.
        self.feature_names = self.vectorizer.get_feature_names()
        # NOTE(review): y is assigned but never used below
        y = self.labels
        for string in self.feature_names:
            print(string.encode("ascii", 'ignore'))
        #Training the model
        X_new = self.selector.fit_transform(X, self.labels)
        self.model.fit(X_new, self.labels)

    def classify_all(self, all_test_data):
        """Vectorize + RFE-project the test reviews, predict their labels,
        print the top-weighted features per season, and return predictions."""
        test_corpus = []
        y = []
        for review in all_test_data:
            test_corpus += [review[1]['text']]
            y += [review[0]]
        #Used transform instead of fit_transform
        #for test data so number of features will match
        X = self.vectorizer.transform(test_corpus)
        X_new = self.selector.transform(X)
        results = self.model.predict(X_new)
        categories = ["spring", "summer", "fall", "winter"]
        for i, category in enumerate(categories):
            # 20 highest-coefficient features for this class
            top10 = np.argsort(self.model.coef_[i])[-20:]
            for j in top10:
                print("%s: %s" % (category, "".join(self.feature_names[j])))
        return results
def vocabulary(self, all_test_data):
    """Vectorize the test reviews, refit RFE on them, and print the 100
    RFE-selected feature columns.

    all_test_data: iterable of (label, {"text": ...}) pairs.
    """
    test_corpus = []
    y = []
    for review in all_test_data:
        test_corpus += [review[1]['text']]
        y += [review[0]]
    X = self.vectorizer.transform(test_corpus)
    # NOTE: predictions are computed but unused in the original; kept for
    # behavioral parity.
    results = self.model.predict(X)
    # n_features_to_select and step are keyword-only in scikit-learn >= 1.2;
    # the old positional call RFE(self.model, 100, 1) raises TypeError there.
    selector = RFE(self.model, n_features_to_select=100, step=1)
    sel_result = selector.fit(X, y)
    print(selector.transform(X))
def featureSelector(train_path,predict_path):
    # Python 2 code (print statement).
    # NOTE(review): the train_path/predict_path arguments are ignored — the
    # svmlight paths below are hard-coded.
    X, y = load_svmlight_file("data/comment_test")
    Xt, yt = load_svmlight_file("data/test_predict")
    estimator = SVR(kernel="linear")
    # no n_features_to_select given: RFE keeps half the features by default
    selector = RFE(estimator, step=1)
    # NOTE(review): fit() returns the selector itself, not transformed data,
    # so `newX` is the fitted RFE object (and is never used afterwards)
    newX = selector.fit(X, y)
    newtX=selector.transform(Xt)
    print newtX
    """
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    newX=sel.fit_transform(X)
    print newX
    """
#featureSelector("","")
# Python 2 script: RFE on a sparse synthetic classification problem.
# NOTE(review): sklearn.cross_validation and sklearn.metrics.zero_one were
# removed in modern scikit-learn (model_selection / zero_one_loss) — this
# only runs against a pre-0.20 install.
import numpy as np
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import RFECV, RFE
from sklearn.datasets import make_classification
from sklearn.metrics import zero_one
from scipy import sparse

# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000, n_features=5000, n_informative=3, n_redundant=2, n_repeated=0, n_classes=8, n_clusters_per_class=1, random_state=0)
X_sparse = sparse.csr_matrix(X)
print X.shape, "x", y.shape
# sparse model: eliminate 20% of the remaining features per RFE iteration
clf_sparse = SVC(kernel="linear")
rfe_sparse = RFE(estimator=clf_sparse, n_features_to_select=4, step=0.20)
rfe_sparse.fit(X_sparse, y)
X_r_sparse = rfe_sparse.transform(X_sparse)
print X_r_sparse.shape
# Scratch/experiment script. The first triple-quoted block is commented-out
# RFE code; NOTE(review): the live RFECV lines below reference `S` and `yoo`,
# which are only defined inside that quoted-out block, so running this as-is
# raises NameError — confirm which block was meant to be active.
'''
#selector = SelectKBest(f_classif, k=18)
#print(selector.get_support)
S = SVC(kernel='linear')
# create the RFE model for the svm classifier
# and select attributes
rfe = RFE(S,30)
rfe = rfe.fit(first, y)
# print summaries for the selection of attributes
print(rfe.support_)
print(rfe.ranking_)
print(rfe.n_features_)
yoo = rfe.transform(first)
#yoo = ["A3","B3","B4","B6","C2","C6","C7","C8","C11","C16c","C17","D1","D2","D3","D5","D6","D7","D8","D9","D10","D11","E4","E3","E5"]
#print(rfe.score(first,y))
'''
rfecv = RFECV(estimator=S, step=1, cv=StratifiedKFold(2))
rfecv.fit(yoo, y)
print(rfecv.grid_scores_)
'''
'''
# live decision-tree feature selection via importance threshold
clf = DecisionTreeClassifier()
clf.fit(first, y)
sfm = SelectFromModel(clf,threshold=0.022)
def featureReductionTest(useFeature,trueSet,falseSet,dim=10,state=-1):
    """Train a LinearSVC on RFE-reduced features and report train/test F1.

    Loads pickled feature matrices named "<useFeature>_<dataset>.pkl" for the
    positive (trueSet) and negative (falseSet) dataset names, balances and
    splits them, reduces to `dim` features with RFE, and prints the ranked
    selected features plus classification reports. For the "rp" feature it
    also saves a bark-band x BPM heat map of the SVC coefficients.

    Returns (train_f1, test_f1).
    `state` seeds the splits; -1 means pick a random seed.
    """
    if(state==-1):
        state = np.random.randint(10000)
    # load data and split
    X_true = []
    for dn in trueSet:
        fin = open("./learn/data/"+useFeature+"_"+dn+".pkl","rb")
        X_true.append(pickle.load(fin))
        fin.close()
    X_true = np.vstack(X_true)
    # print(X_true.shape)
    X_false = []
    for dn in falseSet:
        fin = open("./learn/data/"+useFeature+"_"+dn+".pkl","rb")
        X_false.append(pickle.load(fin))
        fin.close()
    X_false = np.vstack(X_false)
    # print(X_false.shape)
    test_size = 0.3
    X_true_train,X_true_test = train_test_split(X_true ,test_size=test_size,random_state=state)
    # Negative split is sized to match the positive split, keeping classes balanced.
    X_false_train, X_false_test = train_test_split(X_false ,train_size=len(X_true_train),test_size=len(X_true_test),random_state=state+1)
    X = np.vstack([X_true_train,X_false_train])
    X_ = np.vstack([X_true_test,X_false_test])
    Y = [1]*len(X_true_train)+[0]*len(X_false_train)
    Y_ = [1]*len(X_true_test)+[0]*len(X_false_test)
    X,Y = shuffle(X,Y)
    X_,Y_ = shuffle(X_,Y_)
    featNames = ml_feature_name.getFeatureName(useFeature)
    clf = LinearSVC(C=0.1)
    # Eliminate 10 features per round down to `dim` survivors.
    rfe = RFE(estimator =clf, n_features_to_select=dim,step=10)
    rfe.fit(X,Y)
    Xs = rfe.transform(X)
    Xs_ = rfe.transform(X_)
    clf.fit(Xs,Y)
    Yp = clf.predict(Xs)
    Yp_ = clf.predict(Xs_)
    # Trick: transforming the index vector [0..n_features) recovers the
    # original column indices of the selected features.
    # NOTE(review): passing a 1-D list to transform() only works on old
    # scikit-learn; newer versions require 2-D input -- confirm pinned version.
    supIndex = rfe.transform(list(range(len(X[0]))))[0]
    # [abs(weight), weight, original column index, feature name], sorted by |weight|.
    feats = [[abs(clf.coef_[0][i]),clf.coef_[0][i],v,featNames[v]] for i,v in enumerate(supIndex)]
    feats.sort()
    # Print strongest features first.
    print("\n".join(list(map(str,feats))[::-1]))
    print(classification_report(Y,Yp))
    print(classification_report(Y_,Yp_))
    featNames = ml_feature_name.getFeatureName(useFeature)
    arr = Xs.T[0]
    reg = list(zip(arr,Y))
    reg.sort()
    # plt.plot(list(range(len(reg))),reg)
    # plt.ylim(0,2)
    # plt.show()
    if(useFeature=="rp"):
        # Rhythm-pattern feature: render coefficient weights as a
        # bark-band (rows) x fluctuation-BPM (columns) heat map.
        fin = open("./feature/id_rhythm_barkband.txt","r")
        bark = [int(v) for v in fin.readline().split(",")]
        fin.close()
        barkName = []
        for ind in range(len(bark)):
            if(ind==0):
                barkName.append("0-"+str(bark[ind])+"Hz")
            else:
                barkName.append(str(bark[ind-1])+"-"+str(bark[ind])+"Hz")
        # 60 fluctuation bins, each ~0.17 Hz wide, labeled in BPM.
        flucName = ["{0:.0f}bpm".format(((v)+1)*0.17*60) for v in range(60)]
        barkName.reverse()
        # 60 fluctuation bins x 24 bark bands; flat index maps row-major.
        mat = np.zeros((60,24))
        for i,ind in enumerate(supIndex):
            val = clf.coef_[0][i]
            mat[ind//24,ind%24]=val
        mat =np.fliplr(mat)
        plt.yticks(range(24),barkName)
        plt.xticks(range(60),flucName,rotation="vertical")
        plt.imshow(mat.T,cmap="Greys_r")
        plt.savefig("./learn/feature/rp_rank"+str(dim)+".png")
        plt.show()
    return f1_score(Y,Yp),f1_score(Y_,Yp_)
print('Variance score Train: %.2f' % selector2.score(X,y)); print('Variance score Test: %.2f' % selector2.score(Xtest,ytest)); print('Coeff of Test: ', selector2.ranking_); print('No of Features selected by RFE = %.2f' %sum(selector2.support_)); plotfit(selector2,X,y, title = 'Training fit'); plotfit(selector2,Xtest,ytest,c='blue', title = 'Test fit'); # Forecast, # Since lagged variables are selected for our model, forecasting is done iteratively # by using the predicted values at time t as lagged variables for time t+1,t+2... # In practice however, this is not required as the true price will be known before prediction. yfcast = []; for i in list(range(len(Xfcast))): Xfcast_trim = selector2.transform(Xfcast); x = Xfcast_trim[i,:]; x = x.reshape(1,-1); ypred = predictor.predict(x); yfcast.append(ypred); k = 9;j=1; if i < len(Xfcast)-1: for l in list(range(0,lag)): #print ('l = ', l , 'i+j = ', i+j, ' k+1 = ', k+1); if i+j <= 49: Xfcast[i+j,k] = ypred; k = k+10; j = j+1; #np.savetxt('../Xfcast-loaded.csv', Xfcast, fmt="%f", delimiter = ','); # Write predictions
def svc_1():
    """
    Submission: svc_1_0620_01.csv
    E_val: 0.866856950449
    E_in: 0.855948
    E_out: 0.8546898189645258
    """
    # Pipeline: scale -> RFE(21 features via LogisticRegression) -> rescale
    # -> LinearSVC tuned by randomized search -> isotonic calibration.
    # NOTE(review): sklearn.cross_validation / sklearn.grid_search and
    # class_weight='auto' belong to pre-0.18 scikit-learn -- this function
    # will not run on modern releases without porting.
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import LinearSVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFE
    from sklearn.grid_search import RandomizedSearchCV
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.linear_model import LogisticRegression
    from scipy.stats import expon
    logger.debug('svc_1')
    # Cached training matrices produced by an earlier pipeline stage.
    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))
    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)
    # Select the 21 best features by recursive elimination; persist the
    # fitted selector so later runs and the submission pipeline can reuse it.
    rfe = RFE(estimator=LogisticRegression(class_weight='auto'), step=1, n_features_to_select=21)
    rfe.fit(X_scaled, y)
    util.dump(rfe, util.cache_path('feature_selection.RFE.21'))
    X_pruned = rfe.transform(X_scaled)
    logger.debug('Features selected.')
    # Re-standardize in the pruned feature space before fitting the SVC.
    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)
    # Randomized search over C ~ Exponential, scored by ROC AUC with
    # 5-fold stratified CV.
    svc = LinearSVC(dual=False, class_weight='auto')
    rs = RandomizedSearchCV(svc, n_iter=50, scoring='roc_auc', n_jobs=-1, cv=StratifiedKFold(y, 5), param_distributions={'C': expon()})
    rs.fit(X_new, y)
    logger.debug('Got best SVC.')
    logger.debug('Grid scores: %s', rs.grid_scores_)
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('Best params: %s', rs.best_params_)
    svc = rs.best_estimator_
    util.dump(svc, util.cache_path('new_data.SVC'))
    # Calibrate decision scores into probabilities (isotonic regression).
    isotonic = CalibratedClassifierCV(svc, cv=StratifiedKFold(y, 5), method='isotonic')
    isotonic.fit(X_new, y)
    util.dump(isotonic, util.cache_path('new_data.CalibratedClassifierCV.isotonic'))
    logger.debug('Got best isotonic CalibratedClassifier.')
    logger.debug('E_in (isotonic): %f', auc_score(isotonic, X_new, y))
    # Ship the full preprocessing + model chain as one pipeline.
    to_submission(Pipeline([('scale_raw', raw_scaler), ('rfe', rfe), ('scale_new', new_scaler), ('svc', isotonic)]), 'svc_1_0620_01')
# Feature selection on machine_1: compare tree-importance selection
# (SelectFromModel) against Recursive Feature Elimination.
pre= machine_1[['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W',]]
clf= ExtraTreesClassifier().fit(pre,machine_1['X'])
clf.feature_importances_
model= SelectFromModel(clf, prefit=True)
new=model.transform(pre)
# BUG FIX: ndarray.shape is an attribute, not a method -- the original
# `new.shape()` raised "TypeError: 'tuple' object is not callable".
new.shape
# Recursive Feature Elimination
from sklearn.feature_selection import RFE
model = ExtraTreesClassifier()
rfe = RFE(model)
rfe = rfe.fit(pre,machine_1['X'])
# summarize the selection of the attributes
machine_model= rfe.transform(pre)
print(rfe.support_)
print(rfe.ranking_)
#after comparing output of two models, its been concluded that Recursive Feature Elimination gives better results
machine_2= pd.read_csv("G:\\Datasets\\7z assignment\\Train\\machine2.csv")
machine_2.isnull().sum()
predictors_2= machine_2[['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W',]]
clf2= ExtraTreesClassifier().fit(predictors_2,machine_2['X'])
clf2.feature_importances_
model_2= SelectFromModel(clf2, prefit=True)
new_model2=model_2.transform(predictors_2)
df_machine1= pd.DataFrame(new)
# NOTE(review): constructing a Series from machine_1['X'] with a new index
# reindexes by label; if machine_1's index differs from 0..n-1 this yields
# NaNs -- confirm, or use machine_1['X'].to_numpy() instead.
df_machine1.loc[:,'X']= pd.Series(machine_1['X'],index=df_machine1.index)
def select_train_predict(X, Y, Z, feature_list, selection_method, estimator_method, n_features, selection_args, estimator_args):
    """Select features, fit an estimator per target, and predict on Z (Python 2).

    X : training matrix; Z : matrix to predict on.
    Y : iterable of target vectors -- one estimator is fitted per element.
    feature_list : array of feature names, indexed by selector support masks.
    selection_method : one of 'cluster', 'rfe', 'myrfe', 'kbest',
        '2step_kbest' or None.
    Returns (W, features): per-target predictions and selected feature names.
    """
    W = []
    features = []
    if selection_method != '2step_kbest':
        n_features = min(n_features, len(feature_list))
    # RFE needs coefficients to rank features, so force a linear kernel.
    if estimator_method == 'svm' and selection_method == 'rfe':
        estimator_args['kernel'] = 'linear'
    # ESTIMATORS maps method name -> estimator class (defined elsewhere in file).
    estimator = ESTIMATORS[estimator_method](**estimator_args)
    if selection_method == 'cluster':
        # Cluster features and keep the first member of each cluster,
        # then fall through to the plain (no-selection) fitting path.
        agglom = FeatureAgglomeration(n_clusters=n_features, affinity='cosine', linkage='average')
        clusters = agglom.fit_predict(X).tolist()
        sample = [clusters.index(i) for i in range(n_features)]
        X = X[:,sample]
        Z = Z[:,sample]
        selection_method = None
    if selection_method is None:
        for i, y in enumerate(Y):
            estimator.fit(X, y)
            w = estimator.predict(Z)
            W.append(w)
            # Progress dot every ~10% of targets.
            # NOTE(review): relies on Python 2 integer division; also
            # ZeroDivisionError when len(Y) < 10 -- confirm inputs.
            if (i+1) % (len(Y) / 10) == 0:
                print '.',
    if selection_method == 'rfe':
        selector = RFE(estimator=estimator, n_features_to_select=n_features, **selection_args)
        for i, y in enumerate(Y):
            selector = selector.fit(X, y)
            features.append(feature_list[selector.support_])
            w = selector.predict(Z)
            W.append(w)
            if (i+1) % (len(Y) / 10) == 0:
                print '.',
    if selection_method == 'myrfe':
        # MyRFE: project-local RFE variant (note: `support`, not `support_`).
        selector = MyRFE(estimator=estimator, n_features=n_features, **selection_args)
        for i, y in enumerate(Y):
            selector.fit(X, y)
            features.append(feature_list[selector.support])
            w = selector.predict(Z)
            W.append(w)
            if (i+1) % (len(Y) / 10) == 0:
                print '.',
    if selection_method == 'kbest':
        selector = SelectKBest(f_regression, k=n_features, **selection_args)
        for i, y in enumerate(Y):
            X2 = selector.fit_transform(X, y)
            Z2 = selector.transform(Z)
            features.append(feature_list[selector.get_support()])
            estimator.fit(X2, y)
            w = estimator.predict(Z2)
            W.append(w)
            if (i+1) % (len(Y) / 10) == 0:
                print '.',
    # End the progress-dot line.
    print
    return W, features
# RFE feature selection + 5-fold cross-validated predictions (Python 2).
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20
# (use sklearn.model_selection); X and y are defined earlier in the file.
from sklearn import cross_validation
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVR
model = LogisticRegression()
# Keep the 12 best features, eliminating one per round.
# NOTE(review): positional n_features_to_select is rejected by
# scikit-learn >= 1.2 -- pass it as a keyword when porting.
selector = RFE(model, 12, step=1)
selector.fit(X,y)
# summarize the selection of the features
print X.columns[selector.get_support()]
#get the only selected features from X
X_new=selector.transform(X)
X_new = pd.DataFrame(X_new,columns = [X.columns[selector.get_support()]])
# 5-folder cross validation
y_pred=cross_val_predict(model,X_new,y, cv=5)
#print precision_score(y,y_pred,average=None)
#print recall_score(y,y_pred,average=None)
#print f1_score(y,y_pred,average=None)
#print accuracy_score(y,y_pred)
#print classification_report(y,y_pred)
#######################################################
# def full_precision (estimator, X_test, y_test):
#     y_pred = estimator.predict(X_test)
#     return precision_score(y_test,y_pred, average=None)