import os

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# SBS, plot_accuracy and RandomForest.f_importance are project-local helpers
# defined elsewhere in this repository.


def main():
    # Load the red- and white-wine quality datasets (semicolon-separated CSV).
    r_wine_path = os.path.join("data", "winequality-red.csv")
    w_wine_path = os.path.join("data", "winequality-white.csv")
    df_wine_red = pd.read_csv(r_wine_path, sep=';')
    df_wine_white = pd.read_csv(w_wine_path, sep=';')

    # The first 11 columns are features; the 12th ('quality') is the target.
    X_r, y_r = df_wine_red.iloc[:, :11], df_wine_red.iloc[:, 11]
    X_w, y_w = df_wine_white.iloc[:, :11], df_wine_white.iloc[:, 11]

    # Standardize each dataset with its own scaler.
    stdsc_r = StandardScaler()
    stdsc_w = StandardScaler()
    X_r_train_std = stdsc_r.fit_transform(X_r)
    X_w_train_std = stdsc_w.fit_transform(X_w)

    # Run Sequential Backward Selection with a KNN classifier on each dataset,
    # then plot accuracy against the number of remaining features.
    knn_r = KNeighborsClassifier(n_neighbors=5)
    knn_w = KNeighborsClassifier(n_neighbors=5)
    sbs_r = SBS(knn_r, k_features=1)
    sbs_w = SBS(knn_w, k_features=1)
    sbs_r.fit(X_r_train_std, y_r)
    sbs_w.fit(X_w_train_std, y_w)
    plot_accuracy(sbs_r.subsets_, sbs_r.scores_)
    plot_accuracy(sbs_w.subsets_, sbs_w.scores_)

    # Both datasets share the same 11 feature names.
    feat_labels = df_wine_white.columns[:11]
    RandomForest.f_importance(feat_labels, X_r, y_r)
    RandomForest.f_importance(feat_labels, X_w, y_w)
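# ----------------------------------------------------------------------------
# Reference sketch: every script in this section relies on an SBS class that
# is defined elsewhere in its repository. The sketch below is an assumption,
# not any repository's actual code; it follows the classic Sequential Backward
# Selection recipe and matches the interface used throughout this section
# (constructor taking `estimator`, `k_features`, and an optional `scoring`
# function; `fit` populating `subsets_` and `scores_`).
# ----------------------------------------------------------------------------
from itertools import combinations

import numpy as np
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


class SBS:
    """Greedy feature elimination: at each step, drop the one feature whose
    removal hurts the held-out validation score the least."""

    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        self.scores_ = [self._calc_score(X_train, y_train,
                                         X_test, y_test, self.indices_)]
        while dim > self.k_features:
            scores, subsets = [], []
            # try removing each feature in turn and keep the best subset
            for p in combinations(self.indices_, r=dim - 1):
                scores.append(self._calc_score(X_train, y_train,
                                               X_test, y_test, p))
                subsets.append(p)
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            self.scores_.append(scores[best])
            dim -= 1
        self.k_score_ = self.scores_[-1]
        return self

    def transform(self, X):
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        return self.scoring(y_test, y_pred)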
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# WineData and SBS are project-local helpers defined elsewhere.


def main():
    # prepare sample data and target variable
    wine_data = WineData()
    X = wine_data.X
    y = wine_data.y

    # split sample data into training and test sets, then standardize them
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1, stratify=y)
    sc = StandardScaler().fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    # fit estimators (SBS clones and fits each estimator internally)
    estimators = [
        LogisticRegression(C=100.0, random_state=1, solver='liblinear',
                           multi_class='ovr'),
        SVC(C=1.0, kernel='linear', random_state=1),
        KNeighborsClassifier(n_neighbors=5)
    ]
    sbs_estimators = [SBS(estimator=estimator, k_features=1).fit(X_train_std, y_train)
                      for estimator in estimators]

    # plot the score at each step
    labels = ['logistic regression', 'SVM', 'KNN']
    for sbs, label in zip(sbs_estimators, labels):
        k_features = [len(subset) for subset in sbs.subsets_]
        plt.plot(k_features, sbs.scores_, marker='o', label=label)
    plt.xlabel('number of features')
    plt.ylabel('accuracy')
    plt.grid()
    plt.legend()
    plt.show()

    # show results of SBS
    print('[score summary]')
    for sbs, estimator, label in zip(sbs_estimators, estimators, labels):
        print('estimator:', label)

        # search for the smallest feature subset that achieves a perfect
        # validation score; fall back to all features if none does
        indices = sbs.subsets_[0]
        for i in reversed(range(X.shape[1])):
            if sbs.scores_[i] == 1.0:
                indices = sbs.subsets_[i]
                break
        print('minimal subset:', [wine_data.features[i] for i in indices])

        # compare the test score with all features against the minimal subset
        estimator_all = estimator.fit(X_train_std, y_train)
        score_all = estimator_all.score(X_test_std, y_test)
        estimator_min = estimator.fit(X_train_std[:, indices], y_train)
        score_min = estimator_min.score(X_test_std[:, indices], y_test)
        print('score (all features)    :', score_all)
        print('score (minimal features):', score_min)
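# ----------------------------------------------------------------------------
# Hypothetical helper sketch: the WineData class used above is not shown in
# this section. A plausible minimal version, assuming it wraps the UCI Wine
# dataset (the same data the later snippets load) and exposes `X`, `y`, and
# `features`, might look like this. The class body here is an assumption.
# ----------------------------------------------------------------------------
import pandas as pd


class WineData:
    """Hypothetical loader for the UCI Wine dataset."""

    URL = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
           'wine/wine.data')

    def __init__(self):
        df = pd.read_csv(self.URL, header=None)
        self.X = df.iloc[:, 1:].values  # 13 chemical measurements
        self.y = df.iloc[:, 0].values   # class label (1, 2, 3)
        self.features = ['Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
                         'Magnesium', 'Total phenols', 'Flavanoids',
                         'Nonflavanoid phenols', 'Proanthocyanins',
                         'Color intensity', 'Hue',
                         'OD280/OD315 of diluted wines', 'Proline']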
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# df_wine is assumed to have been loaded earlier (UCI Wine dataset).

########################################### Split the dataset ###########################################
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

############################## Feature scaling (using standardization) ##################################
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)  # reuse the training-set parameters; do not refit on the test data

############################## Implement Sequential Backward Selection using KNN ########################
knn = KNeighborsClassifier(n_neighbors=2)
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)

############################ plot the classification accuracy of the KNN classifier #####################
k_feat = [len(k) for k in sbs.subsets_]
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.1])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
plt.show()
# plt.savefig('sbm_classification_using_knn.png', dpi=300)

############################ get the features that yield the best performance ###########################
# subsets_[8] is the 5-feature subset (13 features minus 8 elimination steps)
k5 = list(sbs.subsets_[8])
print(df_wine.columns[1:][k5])
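# ----------------------------------------------------------------------------
# Follow-up sketch (not part of the original script): compare the KNN test
# accuracy on all 13 features against the 5-feature subset selected above,
# reusing the variables defined in the script.
# ----------------------------------------------------------------------------
# Baseline: KNN trained on all 13 standardized features.
knn.fit(X_train_std, y_train)
print('Test accuracy (all features):', knn.score(X_test_std, y_test))

# KNN trained only on the 5 features selected by SBS.
knn.fit(X_train_std[:, k5], y_train)
print('Test accuracy (5 features):  ', knn.score(X_test_std[:, k5], y_test))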
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# (continuation: the pd.read_csv(...) call that loads the data opens above this excerpt)
                      header=None)
df_wine.columns = [
    'Class label', 'Alcohol', 'Malic acid', 'Ash',
    'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids',
    'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue',
    'OD280/OD315 of diluted wines', 'Proline'
]

X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# fit the scaler on the training data only, then apply it to the test data
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=2)
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)

k_feat = [len(k) for k in sbs.subsets_]
plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.1])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
plt.show()
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# X, y, df, n_of_features and n_of_selected_features are defined earlier in the script.

# z-score standardization
sc = StandardScaler()
sc.fit(X)
X_std = sc.transform(X)

n_of_trials = 30  # number of trials
score_train_all = np.zeros(n_of_features)  # accumulates training scores per feature-subset size
score_test_all = np.zeros(n_of_features)   # accumulates test scores per feature-subset size

#==========================================================
# Instead of cross-validation, this program averages over repeated trials
# with different random states.
for k in range(0, n_of_trials):
    X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.3, random_state=k)

    lr = LinearRegression()
    sbs = SBS(lr, k_features=1, scoring=r2_score)
    sbs.fit(X_train, y_train)

    selected_features = list(sbs.subsets_[n_of_features - n_of_selected_features])
    print("Trial {:2d}; Best {} features: {}".format(
        k + 1, n_of_selected_features, df.feature_names[selected_features]))

    score_train = np.array([])
    score_test = np.array([])
    #======================================================
    # Task: for each feature subset found by the SBS algorithm, fit a linear
    # regression model, compute the coefficient of determination (R^2) on the
    # training and test data, and store it in score_train / score_test.
    # Hint: the feature subsets are stored in sbs.subsets_.
    for subset in sbs.subsets_:
        indices = list(subset)
        lr.fit(X_train[:, indices], y_train)
        score_train = np.append(score_train, lr.score(X_train[:, indices], y_train))
        score_test = np.append(score_test, lr.score(X_test[:, indices], y_test))
    #======================================================
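    # --------------------------------------------------------------------
    # Plausible continuation (the original is cut off here): accumulate the
    # per-trial scores, then average over trials and plot. This is an
    # assumption about the missing code, not the original script.
    # matplotlib.pyplot is assumed to be imported as plt earlier on.
    # --------------------------------------------------------------------
    score_train_all += score_train
    score_test_all += score_test

# average over trials; subsets_ shrinks from all features down to 1
score_train_all /= n_of_trials
score_test_all /= n_of_trials
k_feat = range(n_of_features, 0, -1)
plt.plot(k_feat, score_train_all, marker='o', label='train')
plt.plot(k_feat, score_test_all, marker='s', label='test')
plt.xlabel('Number of features')
plt.ylabel('$R^2$')
plt.grid()
plt.legend()
plt.show()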