def feature_select(data: pd.DataFrame, test_data: pd.DataFrame):
    """Keep only the features an ExtraTrees model selects, for train and test.

    An ``ExtraTreesClassifier`` is fitted on every column of ``data`` except
    ``'match'``. A prefit ``SelectFromModel`` wrapped around that classifier
    then filters the feature columns of both ``data`` and ``test_data``.

    Args:
        data: Training frame; must contain a ``'match'`` target column.
        test_data: Test frame; must contain ``'match'`` and is expected to
            carry the same feature columns as ``data``.

    Returns:
        A ``(data, test_data)`` tuple of new frames. Each one holds the
        selected feature columns followed by a ``'match'`` column, with rows
        in their original order on a fresh 0..n-1 index.
    """
    from sklearn.feature_selection import SelectFromModel

    # Baseline run on the unreduced data; `run_model` is defined elsewhere
    # in this module.
    run_model(data, ExtraTreesClassifier())

    clf = ExtraTreesClassifier()
    x = data.drop(columns=['match'])
    y = data['match']
    clf.fit(x, y)

    # prefit=True: reuse the classifier fitted above instead of refitting.
    model = SelectFromModel(clf, prefit=True)
    x_new = model.transform(x)

    # Join features and target by position, not by index label. Concatenating
    # the original `data[['match']]` would align on the caller's index and
    # produce NaN-padded, misaligned rows whenever `data` does not have a
    # default RangeIndex (e.g. after filtering or a train/test split).
    train_x = pd.DataFrame(x_new).reset_index(drop=True)
    train_y = pd.DataFrame({'match': np.array(y)})
    data = pd.concat([train_x, train_y], axis=1)
    print('--------- feature params -----------', model.get_params())

    test_data_x = test_data.drop(columns=['match'])
    test_data_y = np.array(test_data['match'])
    test_data_x_new = model.transform(test_data_x)
    test_data_y = pd.DataFrame({'match': test_data_y})
    print('--------- test data y after feature ---------', test_data_y)
    # Same positional join as for the training frame.
    test_x = pd.DataFrame(test_data_x_new).reset_index(drop=True)
    test_data = pd.concat([test_x, test_data_y], axis=1)

    return data, test_data
# Feature-selection experiments on X / Y (both defined outside this snippet).

# --- Recursive feature elimination with cross-validation ---------------------
# RFECV wrapped around a 100-tree random forest, scored by accuracy.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
m_RFERFC = RFECV(RandomForestClassifier(n_estimators=100), scoring='accuracy')
m_RFERFC.fit(X, Y) # returns model
# NOTE(review): despite the X_ prefix this stores the output of predict(),
# not a reduced feature matrix -- confirm whether transform() was intended.
X_RFERFC = m_RFERFC.predict(X)
# Result is not stored; presumably shown by an interactive session -- confirm.
m_RFERFC.score(X, Y)

# --- Model-based selection driven by LassoCV ---------------------------------
# NOTE(review): LassoCV is a regression estimator; confirm it is the intended
# choice for the kind of target held in Y.
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
m_lasso = SelectFromModel(LassoCV())
m_lasso.fit(X, Y)
# Shape of the reduced matrix (value is not stored).
m_lasso.transform(X).shape
X_lasso = m_lasso.transform(X)
# Selector parameters (value is not stored).
m_lasso.get_params()
# Support mask of the features the selector keeps.
mask = m_lasso.get_support()
print(mask)
# Draw the mask as a single-row image.
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
# Column labels of the kept features (value is not stored).
X.columns[mask]
#Using CV helps reduce selection bias due to the observations in the training set
# Disabled experiment: compare a model's score on all features against its
# score on the selected features.
#X_test_selected = modelfit.transform(X_test)
#predmodel = logisticRegression()
#predmodel.fit(X_train,Y_train)
#print('The score on all features: {:.3f}'.format(predmodel.score(X_test,Y_test)))
#score = predmodel.fit(X_train_selected, y_train).score(X_test_selected,y_test)
#print('The score on all features: {:.3f}'.format(score))

# --- Model-based selection driven by a random forest -------------------------
# The selector is only constructed here; no fit call is visible in this chunk.
from sklearn.ensemble import RandomForestClassifier
fs_SFM_RFC = SelectFromModel(RandomForestClassifier(n_estimators=100))
# Per-estimator accumulators: reduced training matrices, reduced validation
# matrices, and the parameters of the selector that produced them.
X_selected_main = []
X_val_selected_main = []
selector_params = [] # To store dicts from `get_params()` method of each selector below.
# `start` and `timer` come from earlier in the file (outside this chunk).
print("End of step 4, time taken: ", timer() - start, '\n')

# fit and select apt. features for each estimator then refit on them.
# GBC
start = timer()
print("Fitting GradientBoost Classifier:")
gbc_clf.fit(X, y)
print("Score using all features:Training ", gbc_clf.score(X, y))
print("Score using all features:Validation ", gbc_clf.score(X_val, y_val))
# prefit=True: the selector wraps the classifier fitted just above.
selector = SelectFromModel(gbc_clf, prefit=True)
selector_params.append(selector.get_params())
X_selected = selector.transform(X)
X_selected_main.append(X_selected)
X_val_selected = selector.transform(X_val)
X_val_selected_main.append(X_val_selected)
print("Shaped reduced from {} to {}, difference is {}".format(X.shape[1], X_selected.shape[1], X.shape[1] - X_selected.shape[1]))
print("Refitting using selected features.")
# NOTE(review): this refits the same estimator object that `selector` was
# built on, so both transform() calls above must stay before this line;
# confirm `selector` is not reused for GBC afterwards.
gbc_clf.fit(X_selected, y)
print("Score using selected features:Training ", gbc_clf.score(X_selected, y))
print("Score using selected features:Validation ", gbc_clf.score(X_val_selected, y_val))

# ABC
print("Fitting AdaBoost Classifier: ")
abc_clf.fit(X, y)
# NOTE(review): this loop reads `i`, `v`, `cols` and `remove`, all set up
# before this chunk -- presumably inside an enclosing loop over `cols`;
# confirm the nesting against the full file.
# Queue every later column whose values are identical to `v` for removal.
for j in range(i+1,len(cols)):
    if np.array_equal(v,train[cols[j]].values):
        remove.append(cols[j])

# Drop the queued columns from both frames, in place.
train.drop(remove, axis=1, inplace=True)
test.drop(remove, axis=1, inplace=True)

# split data into train and test
# Keep the test IDs aside before removing the column from the features.
test_id = test.ID
test = test.drop(["ID"],axis=1)

X = train.drop(["TARGET","ID"],axis=1)
y = train.TARGET.values

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1729)
print(X_train.shape, X_test.shape, test.shape)

## # Feature selection
# Fit the trees on the training split only, then reuse the fitted model
# (prefit=True) to filter all three feature matrices.
clf = ExtraTreesClassifier(random_state=1729)
selector = clf.fit(X_train, y_train)
# clf.feature_importances_
fs = SelectFromModel(selector, prefit=True)

X_train = fs.transform(X_train)
X_test = fs.transform(X_test)
test = fs.transform(test)

print(X_train.shape, X_test.shape, test.shape)
print (fs.get_params(deep=True))
# Remove the columns queued in `remove` from both frames, in place.
for frame in (train, test):
    frame.drop(columns=remove, inplace=True)

# Separate identifiers, features and target.
test_id = test["ID"]
test = test.drop(columns=["ID"])

X = train.drop(columns=["TARGET", "ID"])
y = train["TARGET"].values

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1729,
)
print(X_train.shape, X_test.shape, test.shape)

# Feature selection: fit extra-trees on the training split only, then reuse
# that fitted model (prefit=True) to filter every feature matrix.
clf = ExtraTreesClassifier(random_state=1729)
clf.fit(X_train, y_train)
selector = clf  # fit() returns the estimator itself
fs = SelectFromModel(selector, prefit=True)

X_train, X_test, test = [
    fs.transform(matrix) for matrix in (X_train, X_test, test)
]

print(X_train.shape, X_test.shape, test.shape)
print(fs.get_params(deep=True))