plt.show()

# Here we see all but 2 of the original features were selected.
X_test_l1 = select.transform(X_test)
score = LogisticRegression().fit(X_train_l1, y_train).score(X_test_l1, y_test)
print('Test set score: {:.3f}'.format(score))
# This time the score is .951, which is a better boost in performance
# than SelectPercentile.

# Recursive Feature Elimination: repeatedly fit the forest and drop the
# weakest feature until only n_features_to_select remain.
select = RFE(RandomForestClassifier(n_estimators=100, random_state=42),
             n_features_to_select=40)
select.fit(X_train, y_train)

# Plotting which features were selected by the Recursive Feature
# Elimination (RFE). This method of feature selection is more
# computationally expensive than the others since it has to iteratively
# train a model.
mask = select.get_support()
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.xlabel('Feature index')
plt.yticks(())
plt.show()

X_train_rfe = select.transform(X_train)
X_test_rfe = select.transform(X_test)
score = LogisticRegression().fit(X_train_rfe, y_train).score(X_test_rfe, y_test)
print('Test score: {:.3f}'.format(score))
# Scores .951
print('Test score: {:.3f}'.format(select.score(X_test, y_test)))
# Also scores .951; after this feature selection the linear model
# performs just as well as the random forest.
# The feature selection got better compared to the univariate and model
# based selection, but one feature was still missed. Running the above
# code takes significantly longer than the model based selection,
# because a random forest model is trained 40 times, once for each
# feature that is dropped.
X_train_rfe = select.transform(X_train)
X_test_rfe = select.transform(X_test)
score = LogisticRegression().fit(X_train_rfe, y_train).score(X_test_rfe, y_test)
print("\nTest score(the random forest model): {:.3f}".format(score))
# Result:
# Test score(the random forest model): 0.951

# We can also use the model used inside the RFE to make predictions.
# This uses only the feature set that was selected:
print("\nTest score(the model used inside the RFE): {:.3f}".format(
    select.score(X_test, y_test)))
# Result:
# Test score(the model used inside the RFE): 0.951

## 4.6 Utilizing Expert Knowledge
# Feature engineering is often an important place to use expert
# knowledge for a particular application.
print(
    "\n----------- Utilizing Expert Knowledge - citibike dataset -----------")
citibike = mglearn.datasets.load_citibike()
print("\nCiti Bike data - head():\n{}".format(citibike.head()))
# Result:
# Citi Bike data:
# starttime
# 2015-08-01 00:00:00    3
# 2015-08-01 03:00:00    0
# Score the logistic regression on the L1-selected feature subset.
X_test_l1 = select.transform(X_test)
score = LogisticRegression().fit(X_train_l1, y_train).score(X_test_l1, y_test)
print("Test score: {:.3f}".format(score))

# %%
from sklearn.feature_selection import RFE

# Recursive feature elimination driven by a random forest: drop one
# feature per fit until 40 remain.
select = RFE(RandomForestClassifier(n_estimators=100, random_state=42),
             n_features_to_select=40)
select.fit(X_train, y_train)

# Visualise the selected features:
mask = select.get_support()
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.xlabel("Sample index")
plt.yticks(())

# %%
# Score using the estimator fitted inside the RFE itself.
print("Test score: {:.3f}".format(select.score(X_test, y_test)))

# %%
citibike = mglearn.datasets.load_citibike()

# %%
print("Citi Bike data:\n{}".format(citibike.head()))

# %%
# Plot the rental counts over the whole period, one tick per day.
plt.figure(figsize=(10, 3))
xticks = pd.date_range(start=citibike.index.min(), end=citibike.index.max(),
                       freq="D")
plt.xticks(xticks, xticks.strftime("%a %m-%d"), rotation=90, ha="left")
plt.plot(citibike, linewidth=1)
plt.xlabel("Date")
plt.ylabel("Rentals")

# %%
from sklearn.feature_selection import RFE

# Recursive feature elimination with a random forest, keeping 40 features.
select = RFE(RandomForestClassifier(n_estimators=100, random_state=42),
             n_features_to_select=40)
select.fit(X_train, y_train)

mask = select.get_support()
# White = selected, black = not selected.
plt.matshow(mask.reshape(1, -1), cmap="gray")
plt.xlabel("feature num")

X_train_rfe = select.transform(X_train)
X_test_rfe = select.transform(X_test)
score = LogisticRegression().fit(X_train_rfe, y_train).score(X_test_rfe, y_test)
print("test score : {:.3f}".format(score))
# Predict with the model fitted inside the RFE.
print("test score : {:.3f}".format(select.score(X_test, y_test)))

## Utilizing expert knowledge
# Bike rental station data.
import mglearn
import pandas as pd

citibike = mglearn.datasets.load_citibike()
print("data : \n{}".format(citibike.head()))

# Time-series plot of the rental counts.
plt.figure(figsize=(10, 3))
xticks = pd.date_range(start=citibike.index.min(), end=citibike.index.max(),
                       freq="D")
week = ["mon", "tues", "wed", "thur", "fri", "sat", "sun"]