plt.show()
# Here we see all but 2 of the original features were selected

# Evaluate a logistic regression on the L1-model-selected feature subset.
# NOTE: renamed `X_test_li` -> `X_test_l1` for consistency with
# `X_train_l1` (and with the identical snippet later in the file);
# the `li` spelling was a typo.
X_test_l1 = select.transform(X_test)
score = LogisticRegression().fit(X_train_l1, y_train).score(X_test_l1, y_test)
# Fixed Python 2 `print` statement -> Python 3 print() call, matching the
# rest of the file.
print('Test set score: {:.3f}'.format(score))
# This time the score is .951, which is a better boost in performance
# than SelectPercentile

# Recursive feature elimination: repeatedly fit a random forest and drop
# the weakest-ranked feature until only 40 remain.
rf_ranker = RandomForestClassifier(n_estimators=100, random_state=42)
select = RFE(rf_ranker, n_features_to_select=40)
select.fit(X_train, y_train)

# Visualise which of the original features survived elimination
# (white = dropped, dark = kept, because of the reversed 'gray_r' map).
selected_mask = select.get_support()
plt.matshow(selected_mask.reshape(1, -1), cmap='gray_r')
plt.xlabel('Feature index')
plt.yticks(())
plt.show()
# Plotting which features were selected by the Recursive Feature Elimination (RFE)
# this method of feature selection is more computationally expensive than
# the others since it has to iteratively train a model

# Score a logistic regression on the RFE-selected 40-feature subset.
X_train_rfe = select.transform(X_train)
X_test_rfe = select.transform(X_test)
score = LogisticRegression().fit(X_train_rfe,
                                 y_train).score(X_test_rfe, y_test)
# Fixed Python 2 `print` statements -> Python 3 print() calls, consistent
# with the rest of the file.
print('Test score: {:.3f}'.format(score))
# Scores .951
print('Test score: {:.3f}'.format(select.score(X_test, y_test)))
# Also scores .951, after the feature selection the linear model
# performs just as well as the random forest
# The feature selection got better compared to the univariate and model based selection, but one feature was still missed.
# Running the above code takes significantly longer than the model based selection, because a random forest model is trained 40 times,
# once for each feature that is dropped.
# Reduce both splits to the RFE-selected features, then score a logistic
# regression on the reduced data.
X_train_rfe = select.transform(X_train)
X_test_rfe = select.transform(X_test)

lr_on_rfe = LogisticRegression().fit(X_train_rfe, y_train)
score = lr_on_rfe.score(X_test_rfe, y_test)
print("\nTest score(the random forest model): {:.3f}".format(score))
# Result:
# Test score(the random forest model): 0.951

# We can also use the model used inside the RFE to make predictions. This uses only the feature set that was selected:
print("\nTest score(the model used inside the RFE): {:.3f}".format(
    select.score(X_test, y_test)))
# Result:
# Test score(the model used inside the RFE): 0.951

## 4.6 Utilizing Expert Knowledge
# Feature engineering is often an important place to use expert knowledge for a particular application.
print(
    "\n----------- Utilizing Expert Knowledge - citibike dataset -----------")

# Load the Citi Bike rental counts (a time series indexed by rental start time).
citibike = mglearn.datasets.load_citibike()
print("\nCiti Bike data - head():\n{}".format(citibike.head()))
# Result:
# Citi Bike data:
# starttime
# 2015-08-01 00:00:00     3
# 2015-08-01 03:00:00     0
# NOTE(review): the three lines below belong to the earlier L1-based
# feature-selection example, not to the Citi Bike section — this file
# appears to be a concatenation of several tutorial snippets.
X_test_l1 = select.transform(X_test)
score = LogisticRegression().fit(X_train_l1, y_train).score(X_test_l1, y_test)
print("Test score:{:.3f}".format(score))
# %%
from sklearn.feature_selection import RFE

# Recursive feature elimination with a random forest as the ranking model,
# keeping the 40 strongest features.
select = RFE(RandomForestClassifier(n_estimators=100, random_state=42),
             n_features_to_select=40)
select.fit(X_train, y_train)
# visualise the selected features:
mask = select.get_support()

plt.matshow(mask.reshape(1, -1), cmap='gray_r')
# Fixed axis label: the mask indexes features, not samples (this matches
# the identical plot earlier in the file).
plt.xlabel("Feature index")
plt.yticks(())
# %%
# Fixed garbled output string: "Test scoreL" -> "Test score:".
print("Test score: {:.3f}".format(select.score(X_test, y_test)))
# %%
citibike = mglearn.datasets.load_citibike()
# %%
print("Citi Bike data:\n{}".format(citibike.head()))
# %%
# Plot the rental counts over time, with one x tick per day labelled
# "weekday month-day".
plt.figure(figsize=(10, 3))
first_stamp = citibike.index.min()
last_stamp = citibike.index.max()
xticks = pd.date_range(start=first_stamp, end=last_stamp, freq="D")
tick_labels = xticks.strftime("%a %m-%d")
plt.xticks(xticks, tick_labels, rotation=90, ha="left")
plt.plot(citibike, linewidth=1)
plt.xlabel("Date")
plt.ylabel("Rentals")

# %%
# Example #4 (stray snippet marker from the original source — not code)
# Recursive feature elimination using a random forest to rank features,
# keeping the 40 strongest ones.
from sklearn.feature_selection import RFE

select = RFE(RandomForestClassifier(n_estimators=100, random_state=42),
             n_features_to_select=40)

select.fit(X_train, y_train)
mask = select.get_support()
plt.matshow(mask.reshape(1, -1), cmap="gray")  # white = selected, black = not selected
plt.xlabel("feature num")

# Score a logistic regression on the RFE-reduced feature subset.
X_train_rfe = select.transform(X_train)
X_test_rfe = select.transform(X_test)
score = LogisticRegression().fit(X_train_rfe,
                                 y_train).score(X_test_rfe, y_test)
print("test score : {:.3f}".format(score))
print("test score : {:.3f}".format(select.score(X_test, y_test)))  # predict with the model used inside the RFE

## Utilizing expert knowledge
# Bike rental station (Citi Bike) data
import mglearn
import pandas as pd

citibike = mglearn.datasets.load_citibike()
print("data : \n{}".format(citibike.head()))

# Time-series plot of rental counts, one tick per day
plt.figure(figsize=(10, 3))
xticks = pd.date_range(start=citibike.index.min(),
                       end=citibike.index.max(),
                       freq="D")
# Weekday labels — presumably used for tick labels further down; the rest
# of this snippet is outside the visible chunk, so this cannot be confirmed here.
week = ["mon", "tues", "wen", "thur", "fri", "sat", "sun"]