def runFeatures_KNN(i, numberOfFeature, X_train, Y_train, X_test, Y_test):
    print("run feature method")
    print("Number of features Selected KNN : ", numberOfFeature)
    KNN = KNeighborsClassifier(n_neighbors=8, p=3)
    # cv=0 disables cross-validation; candidate subsets are scored on the training data.
    sfs1 = sfs(KNN,
               k_features=numberOfFeature,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=0,
               n_jobs=-1)
    sfs1.fit(X_train, Y_train)
    cols = sfs1.k_feature_idx_
    print('The indices of best features KNN are: ', cols, '\n')
    str1 = ','.join(str(e) for e in cols)
    X_train = sfs1.transform(X_train)
    X_test = sfs1.transform(X_test)
    filename = 'saved_models/SFS_KNN.pkl'
    pickle.dump(sfs1, open(filename, 'wb'))
    train_acc = KNN_evaluation_procedure(KNN, X_train, Y_train, X_test, Y_test)
    X_valid, Y_valid = SFS_validate.read_features(i)
    sfs1 = pickle.load(open(filename, 'rb'))
    X_valid = sfs1.transform(X_valid)
    val_acc = SFS_validate.validation_procedure_KNN(X_valid, Y_valid)
    return str1, train_acc, val_acc
def do_feature_selection(model, trainAndValidation, trainAndValidation_y,
                         minFeatures, maxFeatures, mainFeatures,
                         fixed_features, focal_class):
    # Score candidate subsets on a predefined hold-out split rather than k-fold CV.
    validation_indices = trainAndValidation[trainAndValidation.set_annotation == 'validation'].index
    validSet = PredefinedHoldoutSplit(validation_indices)
    # Binarize the target: 1 for the focal class, 0 for everything else.
    trainAndValidation_y = np.where(trainAndValidation_y == focal_class, 1, 0)
    X, y = trainAndValidation[mainFeatures], trainAndValidation_y
    results = {}
    # Note: range() is exclusive, so maxFeatures itself is never tried.
    for totFeatures in range(minFeatures, maxFeatures):
        # Build step forward feature selection
        curSFS = sfs(model,
                     k_features=totFeatures,
                     forward=True,
                     verbose=0,  # 0: no output; 1: number of features in current set;
                                 # 2: detailed logging incl. timestamp and CV scores at each step.
                     scoring=auc_scorer,  # e.g. 'roc_auc'
                     cv=validSet,
                     n_jobs=1,
                     fixed_features=fixed_features)
        curSFS = curSFS.fit(X, y)
        feat_cols = list(curSFS.k_feature_idx_)
        sel = [mainFeatures[i] for i in feat_cols]
        key = ",".join(sorted(sel))
        if key in results:
            print("error: duplicate feature subset", key)
        results[key] = curSFS.k_score_
    return results
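# A minimal, hypothetical sketch of how do_feature_selection might be invoked.
# auc_scorer is only referenced above, never defined; the definition below, the
# toy frame, and the 'set_annotation' values are assumptions for illustration.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, roc_auc_score
from mlxtend.evaluate import PredefinedHoldoutSplit
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

auc_scorer = make_scorer(roc_auc_score)  # assumed; the original only uses the name

rng = np.random.default_rng(0)
frame = pd.DataFrame(rng.normal(size=(100, 3)), columns=['f1', 'f2', 'f3'])
frame['set_annotation'] = ['train'] * 80 + ['validation'] * 20
labels = rng.choice(['pos', 'neg'], size=100)

results = do_feature_selection(LogisticRegression(), frame, labels,
                               minFeatures=1, maxFeatures=3,
                               mainFeatures=['f1', 'f2', 'f3'],
                               fixed_features=None, focal_class='pos')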
def forward_feature_selection(x_data, y_data, n_select):
    print("Applying forward feature selection to numerical data")
    print(f"cat variables before forward feature selection {x_data.select_dtypes(include='object').shape}")
    print(f"numeric variables before forward feature selection {x_data.select_dtypes(include='number').shape}")
    num_cols = x_data.select_dtypes(include='number').columns
    temp = x_data[num_cols]
    sfsf = sfs(RandomForestRegressor(n_jobs=5),
               k_features=n_select,
               forward=True,
               floating=False,
               verbose=2,
               cv=3,
               scoring='r2')
    sfsf.fit(temp, y_data)
    idx = list(sfsf.k_feature_idx_)
    cols_to_keep = num_cols[idx]
    cols_to_drop = [x for x in num_cols if x not in cols_to_keep]
    x_data.drop(labels=cols_to_drop, axis=1, inplace=True)
    print(f"cat variables after forward feature selection {x_data.select_dtypes(include='object').columns}")
    print(f"numeric variables after forward feature selection {x_data.select_dtypes(include='number').columns}")
    return x_data
def runFeatures_SVM(i, numberOfFeature, X_train, Y_train, X_test, Y_test):
    print("Number of features Selected SVM : ", numberOfFeature)
    SVC1 = SVC(kernel='linear', probability=True, random_state=0)
    sfs1 = sfs(SVC1,
               k_features=numberOfFeature,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=0,
               n_jobs=-1)
    sfs1.fit(X_train, Y_train)
    cols = sfs1.k_feature_idx_
    print('The indices of best features SVM are: ', cols, '\n')
    str1 = ','.join(str(e) for e in cols)
    X_train = sfs1.transform(X_train)
    X_test = sfs1.transform(X_test)
    filename = 'saved_models/SFS_SVM.pkl'
    pickle.dump(sfs1, open(filename, 'wb'))
    train_acc = SVM_evaluation_procedure(SVC1, X_train, Y_train, X_test, Y_test)
    X_valid, Y_valid = SFS_validate.read_features(i)
    sfs1 = pickle.load(open(filename, 'rb'))
    X_valid = sfs1.transform(X_valid)
    val_acc = SFS_validate.validation_procedure_SVM(X_valid, Y_valid)
    return str1, train_acc, val_acc
def runFeatures_LR(i, numberOfFeature, X_train, Y_train, X_test, Y_test):
    print("Number of features Selected LR: ", numberOfFeature)
    # liblinear is required for an L1 penalty; the default lbfgs solver does not support it.
    LR = LogisticRegression(penalty='l1', solver='liblinear', tol=0.1, random_state=12)
    sfs1 = sfs(LR,
               k_features=numberOfFeature,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=0,
               n_jobs=-1)
    sfs1.fit(X_train, Y_train)
    cols = sfs1.k_feature_idx_
    print('The indices of best features LR are: ', cols, '\n')
    str1 = ','.join(str(e) for e in cols)
    X_train = sfs1.transform(X_train)
    X_test = sfs1.transform(X_test)
    filename = 'saved_models/SFS_LR.pkl'
    pickle.dump(sfs1, open(filename, 'wb'))
    train_acc = LR_evaluation_procedure(LR, X_train, Y_train, X_test, Y_test)
    X_valid, Y_valid = SFS_validate.read_features(i)
    sfs1 = pickle.load(open(filename, 'rb'))
    X_valid = sfs1.transform(X_valid)
    val_acc = SFS_validate.validation_procedure_LR(X_valid, Y_valid)
    return str1, train_acc, val_acc
def runFeatures_RF(i, numberOfFeature, X_train, Y_train, X_test, Y_test):
    print("Number of features Selected RF : ", numberOfFeature)
    RF = RandomForestClassifier(n_estimators=100, random_state=1, max_features='log2')
    sfs1 = sfs(RF,
               k_features=numberOfFeature,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=0,
               n_jobs=-1)
    sfs1.fit(X_train, Y_train)
    cols = sfs1.k_feature_idx_
    print('The indices of best features RF are: ', cols, '\n')
    str1 = ','.join(str(e) for e in cols)
    X_train = sfs1.transform(X_train)
    X_test = sfs1.transform(X_test)
    filename = 'saved_models/SFS_RF.pkl'
    pickle.dump(sfs1, open(filename, 'wb'))
    train_acc = RF_evaluation_procedure(RF, X_train, Y_train, X_test, Y_test)
    X_valid, Y_valid = SFS_validate.read_features(i)
    sfs1 = pickle.load(open(filename, 'rb'))
    X_valid = sfs1.transform(X_valid)
    val_acc = SFS_validate.validation_procedure_RF(X_valid, Y_valid)
    print("val acc runFeatures_RF", val_acc)
    return str1, train_acc, val_acc
def do_sfs(x_tr, y_tr, n_features):
    # floating=True turns plain forward selection into SFFS: features can be
    # conditionally removed again after inclusion.
    sfs_kern = sfs(svm.SVC(kernel='rbf'),
                   k_features=n_features,
                   forward=True,
                   floating=True,
                   verbose=2,
                   scoring='accuracy',
                   cv=5)
    sfs_kern.fit(x_tr, y_tr)
    return sfs_kern
def feature_selection(self, X, y):
    lda = LinearDiscriminantAnalysis(solver='lsqr')
    X = self.pretreat(X)
    sfs1 = sfs(lda, k_features=self.max_steps, forward=self.forw,
               floating=self.flot, verbose=0, scoring=self.score, cv=self.cvl)
    sfs1 = sfs1.fit(X, y)
    return list(sfs1.k_feature_names_)
def selectFeatures(algorithm, X_train, y_train, numberOfFeatures, isForward):
    # A (min, max) tuple for k_features lets mlxtend return the best-scoring
    # subset of any size between 1 and numberOfFeatures.
    selector = sfs(algorithm,
                   k_features=(1, numberOfFeatures),
                   forward=isForward,
                   floating=False,
                   verbose=0,
                   scoring='accuracy',
                   cv=None,
                   n_jobs=-1)
    selector.fit(X_train.values, y_train)
    return list(selector.k_feature_idx_)
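# Hypothetical usage of selectFeatures; the iris data and the decision tree
# are assumptions chosen only to illustrate the (min, max) form of k_features.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris(as_frame=True)
best_idx = selectFeatures(DecisionTreeClassifier(random_state=0),
                          iris.data, iris.target,
                          numberOfFeatures=3, isForward=True)
print(best_idx)  # column indices of the best-scoring subset of size 1..3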
def getBestFeaturesForQDA(trainingData):
    x = trainingData.iloc[:, 0:11]
    y = trainingData.iloc[:, 11]
    bestFeatures = sfs(
        da.QuadraticDiscriminantAnalysis(),
        k_features="best",
        forward=False,
        floating=False,
        verbose=0,
        scoring='r2',
    ).fit(x, y)
    return bestFeatures.k_feature_names_, bestFeatures.k_feature_idx_
def getBestFeaturesForHigherOrderTerms(trainingData, num_features):
    x = trainingData.loc[:, trainingData.columns != 'label']
    y = trainingData.loc[:, 'label']
    bestFeatures = sfs(
        da.QuadraticDiscriminantAnalysis(),
        k_features=num_features,
        forward=True,
        floating=False,
        verbose=2,
        scoring='r2',
    ).fit(x, y)
    return bestFeatures.k_feature_names_
def getBestFeaturesForHigherOrderTerms(clf, trainingData, num_features, scoringString='r2'):
    x = trainingData.loc[:, trainingData.columns != 'label']
    y = trainingData.loc[:, 'label']
    bestFeatures = sfs(clf,
                       k_features=num_features,
                       forward=True,
                       floating=False,
                       verbose=2,
                       scoring=scoringString,
                       n_jobs=5).fit(x, y)
    return bestFeatures.k_feature_names_
def forward_step_feature_selection(x_train_1, y_train_1):
    # Build RF classifier to use in feature selection; a classifier is needed
    # to pair with the 'accuracy' scoring below.
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    # Build step forward feature selection
    sfs1 = sfs(clf,
               k_features=10,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=5)
    # Perform SFS
    sfs1 = sfs1.fit(x_train_1, y_train_1)
    return sfs1
def fwrd_selection(scaled_X, Y):
    # Build LR classifier to use in feature selection
    clf = LogisticRegression()
    sfs1 = sfs(clf,
               k_features='best',
               forward=True,
               floating=False,
               verbose=0,
               scoring='accuracy',
               cv=5)
    sfs1 = sfs1.fit(scaled_X, Y)
    feat_cols = list(sfs1.k_feature_idx_)
    fs_vars = [scaled_X.columns[i] for i in feat_cols]
    return fs_vars
def feature_selection(self, X, y):
    mlr = LinearRegression()
    X = self.pretreat(X)
    sfs1 = sfs(mlr, k_features=self.max_steps, forward=self.forw,
               floating=self.flot, verbose=0, scoring=self.score, cv=self.cvl)
    sfs1 = sfs1.fit(X, y)
    return list(sfs1.k_feature_names_)
def wrapper_forward_selection(X, y, top_feat, model):
    model_forward = sfs(model,
                        k_features=top_feat,
                        forward=True,
                        floating=False,
                        verbose=0,
                        cv=5,
                        n_jobs=-1,
                        scoring='accuracy')
    model_forward.fit(X, y)
    # subsets_ maps subset size -> metrics dict; collect the feature-name tuple
    # for every intermediate subset, ordered by size.
    res = list(map(lambda e: e['feature_names'], model_forward.subsets_.values()))
    res.sort(key=len)
    return res
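# Hypothetical usage of wrapper_forward_selection; the dataset and classifier
# are assumptions, chosen to show that the result is a list of feature-name
# tuples of increasing length (one per intermediate subset).
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

cancer = load_breast_cancer(as_frame=True)
X_df, y_sr = cancer.data.iloc[:, :5], cancer.target  # first 5 columns keep the demo fast

for names in wrapper_forward_selection(X_df, y_sr, top_feat=3,
                                       model=LogisticRegression(max_iter=1000)):
    print(len(names), names)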
def stepFeatureSelect(X, y, regressor, num_features=10, direction=False):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    X_train = pd.DataFrame(X_train, columns=list(X))
    X_test = pd.DataFrame(X_test, columns=list(X))
    # direction=True -> forward selection, False -> backward elimination
    stepF = sfs(regressor,
                k_features=num_features,
                forward=direction,
                floating=False,
                verbose=2,
                scoring='r2',
                cv=3,
                n_jobs=-1).fit(X_train, y_train)
    return FeatureSelector(stepF, X)
def select_features(model, X, y, n=10):
    """Input the number of features you want to keep."""
    candidate = []
    # Build step forward feature selection
    sfs1 = sfs(model,
               k_features=n,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=5)
    # Perform SFS
    sfs1 = sfs1.fit(X, y)
    # The index list of the important features
    feat_cols = list(sfs1.k_feature_idx_)
    for idx in feat_cols:
        candidate.append(X.columns[idx])
    return candidate
def WrapperAlgo(x_train, y_train):
    clsf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    # Build step forward feature selection
    sfs1 = sfs(
        clsf,
        k_features=18,
        forward=True,
        # The floating algorithms have an additional exclusion or inclusion step
        # to remove features once they were included (or excluded), so that a
        # larger number of feature subset combinations can be sampled.
        floating=False,
        verbose=2,
        scoring='accuracy',
        cv=5)
    # Perform SFS
    sfs1 = sfs1.fit(x_train, y_train)
    # Which features?
    feat_cols = list(sfs1.k_feature_idx_)
    return feat_cols
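# The comment in WrapperAlgo describes what floating selection adds; here is a
# minimal sketch of the same call with floating=True (SFFS). The synthetic data
# is an assumption for illustration.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

X_demo, y_demo = make_classification(n_samples=200, n_features=25, random_state=0)
sffs = sfs(RandomForestClassifier(n_estimators=100, n_jobs=-1),
           k_features=18, forward=True, floating=True,
           verbose=0, scoring='accuracy', cv=5)
sffs = sffs.fit(X_demo, y_demo)
print(list(sffs.k_feature_idx_))  # may differ from plain SFS when a swap helps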
def selectFeatures30(X, Y):
    """Select 30 features using step forward selection."""
    # Build RF classifier to use in feature selection
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    # Build step forward feature selection
    sfs1 = sfs(clf,
               k_features=30,
               forward=True,
               floating=False,
               verbose=2,
               scoring='accuracy',
               cv=5)
    # Perform SFS
    sfs1 = sfs1.fit(X, Y)
    feat_cols = list(sfs1.k_feature_idx_)
    print(feat_cols)
    return sfs1
def run_sffs(X_train, X_test, y_train, y_test, clf, normalize, k_features, cv):
    if normalize == 'yes':
        X_train, X_test = normalize_features(X_train, X_test)
    print('Starting SFFS Dimensionality Reduction ..')
    start = time.time()
    sfs1 = sfs(clf,
               k_features=k_features,
               forward=True,
               floating=True,
               verbose=2,
               scoring='accuracy',
               cv=cv,
               n_jobs=-1)
    sfs1 = sfs1.fit(X_train, y_train)
    feat_cols = list(sfs1.k_feature_idx_)
    end = time.time()
    print('\nSFFS done in', end - start, 'seconds\n')
    print('Reduced dimension : ', len(feat_cols))
    return X_train[:, feat_cols], X_test[:, feat_cols]
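# normalize_features is not defined in the snippet above; a plausible minimal
# stand-in (an assumption, not the original helper) fits the scaler on the
# training split only and applies it to both splits.
from sklearn.preprocessing import StandardScaler

def normalize_features(X_train, X_test):
    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)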
def forward_selection(cls, df, features_count=3):
    if df.name == 'train':
        qwk_scorer = make_scorer(cls.quadratic_weighted_kappa, greater_is_better=True)
        model = RandomForestClassifier(n_estimators=100, n_jobs=-1)
        X = df.drop('AdoptionSpeed', axis=1)
        y = df['AdoptionSpeed']
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.25,
                                                            random_state=42)
        y_train = y_train.ravel()
        y_test = y_test.ravel()
        # Use the features_count argument rather than a hard-coded subset size.
        sfs1 = sfs(model,
                   k_features=features_count,
                   forward=True,
                   floating=False,
                   verbose=2,
                   scoring=qwk_scorer,
                   cv=5)
        sfs1 = sfs1.fit(X_train, y_train)
        best_cols = list(sfs1.k_feature_idx_)
        return best_cols
# %%
# Train/test split (shuffle=False preserves temporal order for time-series CV)
X_train, X_test, y_train, y_test = train_test_split(df.values[:, :-1],
                                                    df.values[:, -1],
                                                    test_size=0.30,
                                                    random_state=42,
                                                    shuffle=False)
y_train = y_train.astype('int')
y_test = y_test.astype('int')

# %%
from sklearn.model_selection import TimeSeriesSplit as tscv  # assumed alias for the name used below

sfs1 = sfs(clf,
           k_features='best',
           scoring='accuracy',
           verbose=2,
           forward=True,
           cv=tscv(n_splits=5))
# Perform SFS
sfs1 = sfs1.fit(X_train, y_train)

# %%
print("Best accuracy from sfs:", sfs1.k_score_)
print("Indices selected by sfs:", sfs1.k_feature_idx_)
print("List of selected columns:", df.columns[list(sfs1.k_feature_idx_)])

# %%
# sfs lr acc
clf.fit(X_train[:, list(sfs1.k_feature_idx_)], y_train)
reg__model.summary()
predicted_values = reg__model.predict(x_test)

from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test, predicted_values))  # RMSE
np.exp(predicted_values)  # exponentiate predictions (target presumably log-transformed)

import pandas as pd
dataset2.sort_values('income', ascending=False)

# Sequential (backward, floating) selection of 2 features
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.linear_model import LinearRegression
model = sfs(LinearRegression(), k_features=2, forward=False, floating=True,
            n_jobs=-1, verbose=2, scoring='r2').fit(np.array(x_train), y_train)
model.k_feature_idx_

# Exhaustive selection: efs takes min_features/max_features and does not
# accept the forward/floating/verbose arguments of the sequential selector.
from mlxtend.feature_selection import ExhaustiveFeatureSelector as efs
model1 = efs(LinearRegression(), min_features=1, max_features=2,
             n_jobs=-1, scoring='r2').fit(np.array(x_train), y_train)
efs(LinearRegression(), min_features=1, max_features=3, n_jobs=-1, scoring='r2',
    print_progress=True, clone_estimator=True).fit(x_train, y_train)
"""**Building model with the best features and checking the R2 score for the same**""" mask = selector.support_ print(f"Best features according to RFE {X_m.columns[mask].values}") X_m1 = X_m.iloc[:,mask] # We could have used train test split or cross validation strategies # for scoring the model but in order to compare with the stats model # we will use the whole data model1 = LinearRegression().fit(X_m1,y_m) print(f"R2 Score: {model1.score(X_m1,y_m)}") """### Forward Selection""" model = LinearRegression(fit_intercept=False) sfs1 = sfs(model,k_features=20,forward=True,scoring='r2',cv=5) sfs1.fit(X_m,y_m) fig = plot_sfs(sfs1.get_metric_dict()) plt.title('Forward Selection') plt.grid() plt.show() print(sfs1.k_features, sfs1.k_feature_names_,sep="\n") index = list(sfs1.k_feature_idx_) X_m1 = X_m.iloc[:,index] model1 = LinearRegression().fit(X_m1,y_m) print(f"R2 Score: {model1.score(X_m1,y_m)}") """## Regularization 1. Lasso
# select a Series from the DataFrame
y = MFB_Data['2']
DT = MFB_Data.drop(['2'], axis=1)
X = DT[:]
# check the shape of y
y.shape

# In[21]:
# Build step forward feature selection
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
sfs1 = sfs(clf,
           k_features=56,
           forward=True,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=5)
sfs1 = sfs1.fit(X, y)  # the selector must be fitted before k_feature_idx_ is available

# In[99]:
# Which features?
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)

# In[126]:
# check the type and shape of y
print(type(y))
print(y.shape)
y = my_data[0:20000, 0].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=10)
# clf = SVC(kernel='linear')
# try multiple scoring parameters, like 'accuracy', 'neg_mean_squared_error', None
sfs1 = sfs(clf,
           k_features=10,
           forward=True,
           floating=True,
           verbose=2,
           scoring='accuracy',
           cv=3,
           n_jobs=-1)
sfs1 = sfs1.fit(X_train, y_train)
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)

# Build full model with selected features
clf.fit(X_train[:, feat_cols], y_train)
train_accuracy = clf.score(X_train[:, feat_cols], y_train)
test_accuracy = clf.score(X_test[:, feat_cols], y_test)
y_train_pred = clf.predict(X_train[:, feat_cols])
y_test_pred = clf.predict(X_test[:, feat_cols])
    return X_train, X_test

X_train, X_test = standardize(X_train, X_test)
X_train

"""<b>Inference :</b> The above table is produced by applying Standard Scaling to the train dataset, bringing all variables to a standardized format.

## Feature selection
"""

linreg = LinearRegression()
linreg_forward = sfs(estimator=linreg, k_features=100, forward=True, verbose=2, scoring='r2')
sfs_forward = linreg_forward.fit(X_train, y_train)

"""<b>Inference :</b> Building a forward feature selection. From 31 to 51 features the score is constant at 0.86, and it drops to 0.84 from 52 features onwards. This indicates that 51 significant features maximize model efficiency. Hence we rerun the model with 51 features using standard linear regression, as done below.
"""

linreg = LinearRegression()
linreg_forward = sfs(estimator=linreg,
                     k_features=51,  # 51 features, per the inference above
                     forward=True, verbose=2, scoring='r2')
def train():
    bankdata = pd.read_csv('trainingbin_.csv')
    X = bankdata.drop('class_label', axis=1)
    y = bankdata['class_label']

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    from mlxtend.feature_selection import SequentialFeatureSelector as sfs

    # Fit the transformer on the training split only and reuse it for the test split.
    scaler = QuantileTransformer(output_distribution='uniform')
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    clf = svm.SVC(kernel='linear', C=8192)
    # clf = RandomForestClassifier(n_estimators=100)

    # Fit the selector, then transform both splits with the learned subset
    # (fit_transform on the result of fit_transform was a bug).
    sfs1 = sfs(clf, k_features=10, forward=True, floating=False,
               verbose=2, scoring='accuracy')
    sfs1 = sfs1.fit(X_train, y_train)
    X_train_sel = sfs1.transform(X_train)
    X_test_sel = sfs1.transform(X_test)

    clf.fit(X_train_sel, y_train)
    y_train_pred = clf.predict(X_train_sel)

    from sklearn.metrics import accuracy_score as acc
    print('Training accuracy on selected features: %.3f' % acc(y_train, y_train_pred))
    y_test_pred = clf.predict(X_test_sel)
    print('Testing accuracy on selected features: %.3f' % acc(y_test, y_test_pred))

    from sklearn.metrics import confusion_matrix
    cnf_matrix = confusion_matrix(y_test, y_test_pred)
    print(cnf_matrix)

    # Per-class counts derived from the multi-class confusion matrix
    FP = (cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)).astype(float)
    FN = (cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)).astype(float)
    TP = np.diag(cnf_matrix).astype(float)
    TN = (cnf_matrix.sum() - (FP + FN + TP)).astype(float)

    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP / (TP + FN)
    # Specificity or true negative rate
    TNR = TN / (TN + FP)
    # Precision or positive predictive value
    PPV = TP / (TP + FP)
    # Negative predictive value
    NPV = TN / (TN + FN)
    # Fall-out or false positive rate
    FPR = FP / (FP + TN)
    # False negative rate
    FNR = FN / (TP + FN)
    # False discovery rate
    FDR = FP / (TP + FP)
    # Overall accuracy
    ACC = (TP + TN) / (TP + FP + FN + TN)

    # Macro-averages; the divisor assumes 55 classes in the label set
    print("FNR:", sum(FNR) / 55)
    print("FPR:", sum(FPR) / 55)
    print("ACC:", 100 * (sum(ACC) / 55))
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from sklearn import linear_model
from sklearn.preprocessing import scale
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

df = pd.read_csv('clean_data.csv', index_col=[0])
X = df.drop('lrfs', axis=1)
y = df['lrfs']

model = linear_model.LinearRegression()
sfs1 = sfs(model,
           k_features=(1, 16),
           forward=True,
           floating=True,
           verbose=2,
           scoring='r2')
sfs1 = sfs1.fit(X, y)

print("")
for i in sfs1.k_feature_idx_:
    print(X.columns[i])