""" Unsupervised dimensionality reduction via scikit-learn's implementation of Principal Component Analysis(PCA) on the wine dataset to reduce 13 dimensions of data to the top 2 and a plot of decision regions of the data """ from wine_comon_funcs import wine_initializer, plot_decision_regions from sklearn.linear_model import LogisticRegression from sklearn.decomposition import PCA import matplotlib.pyplot as plt x_train_std, y_train, x_test_std, y_test, _ = wine_initializer() pca = PCA(n_components=2) # n_components=None keeps all principal components, # which can be accessed via the explained_variance_ratio_ attribute. No dimensionality reduction is done. x_train_pca, x_test_pca = pca.fit_transform(x_train_std), pca.transform( x_test_std) # values in scikit-learn's PCA are flipped for PC2, plotted data has a 180 degree rotation/flip # Does not affect decision region prediction capability lr = LogisticRegression(multi_class='auto', solver='liblinear').fit(x_train_pca, y_train) plot_decision_regions(x_train_pca, y_train, classifier=lr) plt.xlabel('PC1') plt.ylabel('PC2') plt.legend(loc='lower left') plt.show() """Repeated with test data: very small misclassification error""" plot_decision_regions(x_test_pca, y_test, classifier=lr) plt.xlabel('PC1')
""" Using a random forest of a 1000 trees to rank the 13 features of the wine dataset """ from sklearn.ensemble import RandomForestClassifier from wine_comon_funcs import wine_initializer import numpy as np import matplotlib.pyplot as plt x_train, y_train, x_test, y_test, columns = wine_initializer('val') feat_labels = columns[1:] forest = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1).fit(x_train, y_train) # n_jobs = -1 means using all processors importances = forest.feature_importances_ indices = np.argsort(importances)[::-1] for f in range(x_train.shape[1]): print("%2d) %-*s %f" % (f + 1, 30, feat_labels[f], importances[indices[f]])) # print("{0}) {1}{2:<30.5f}".format(f + 1, feat_labels[f], importances[indices[f]])) plt.title("Feature Importances") plt.bar(range(x_train.shape[1]), importances[indices], color='lightblue', align='center') plt.xticks(range(x_train.shape[1]), feat_labels, rotation=90) plt.xlim([-1, x_train.shape[1]])
""" L1 regularization to get sparse solutions to get more well-defined decision regions """ import matplotlib.pyplot as plt import numpy as np from wine_comon_funcs import wine_initializer from sklearn.linear_model import LogisticRegression x_train_std, y_train, x_test_std, y_test, columns = wine_initializer() lr = LogisticRegression(penalty='l1', C=0.1, multi_class='auto', solver='liblinear') # penalty = L 1 lr.fit(x_train_std, y_train) """ print('Training accuracy: ',lr.score(x_train_std, y_train)) print('Test accuracy: ',lr.score(x_test_std, y_test)) print(lr.intercept_) print(lr.coef_) """ fig = plt.figure() ax = plt.subplot(111) colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'pink', 'lightgreen', 'lightblue', 'gray', 'indigo', 'orange'] weights, params = [], [] for c in np.arange(-4, 6): lr = LogisticRegression(penalty='l1', C=10.0**c, random_state=0, multi_class='auto', solver='liblinear') lr.fit(x_train_std, y_train) weights.append(lr.coef_[1])
""" Bar chart of the linear discriminants of the wine dataset to find the principal component manually """ from wine_comon_funcs import wine_initializer, wine_matrix_init import numpy as np import matplotlib.pyplot as plt mean_vectors, sc_w, sc_b = wine_matrix_init() x_train_std, y_train, x_test_std, y_test, x, y = wine_initializer('sc_mat') # print(sc_b) eigen_vals, eigen_vecs = np.linalg.eig(np.linalg.inv(sc_w).dot(sc_b)) eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i]) for i in range(len(eigen_vals))] eigen_pairs = sorted(eigen_pairs, key=lambda k: k[0], reverse=True) """ print('Eigenvalues in decreasing order:') for eigen_val in eigen_pairs: print(eigen_val[0]) """ total = sum(eigen_vals.real) dis_cr = [(i / total) for i in sorted(eigen_vals.real, reverse=True)] # discriminant cum_dis_cr = np.cumsum(dis_cr) # cumulative discriminant plt.bar(range(1, 14), dis_cr, alpha=0.5, align='center',