"""
Unsupervised dimensionality reduction via scikit-learn's  implementation of
Principal Component Analysis(PCA) on the wine dataset to reduce 13 dimensions of data to the top 2
and a plot of decision regions of the data
"""

from wine_comon_funcs import wine_initializer, plot_decision_regions
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

x_train_std, y_train, x_test_std, y_test, _ = wine_initializer()
pca = PCA(n_components=2)
# With n_components=None, PCA keeps every principal component (no dimensionality
# reduction is done); the variance explained by each component is then available
# via the explained_variance_ratio_ attribute.
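# Illustrative sketch (not needed for the rest of this script): fit a full PCA
# just to inspect how much variance each component explains.
pca_full = PCA(n_components=None).fit(x_train_std)
print('Explained variance ratios:', pca_full.explained_variance_ratio_)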

x_train_pca = pca.fit_transform(x_train_std)
x_test_pca = pca.transform(x_test_std)
# The sign of PC2 from scikit-learn's PCA can be flipped relative to a manual
# eigendecomposition, so the plotted data may appear mirrored (a 180 degree flip).
# This does not affect the decision regions or the prediction accuracy.
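# If a particular orientation is wanted for plotting, PC2 can simply be mirrored
# by hand (illustrative only; distances and predictions are unchanged):
# x_train_pca[:, 1] *= -1
# x_test_pca[:, 1] *= -1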

lr = LogisticRegression(multi_class='auto',
                        solver='liblinear').fit(x_train_pca, y_train)
plot_decision_regions(x_train_pca, y_train, classifier=lr)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend(loc='lower left')
plt.show()
"""Repeated with test data: very small misclassification error"""
plot_decision_regions(x_test_pca, y_test, classifier=lr)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend(loc='lower left')
plt.show()
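# The "very small misclassification error" claim above can be checked directly
# (lr and the PCA-transformed test split are already in scope):
print('Test accuracy:', lr.score(x_test_pca, y_test))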
"""
Using a random forest of 1,000 trees to rank the 13 features of the wine dataset
by importance
"""

from sklearn.ensemble import RandomForestClassifier
from wine_comon_funcs import wine_initializer
import numpy as np
import matplotlib.pyplot as plt

x_train, y_train, x_test, y_test, columns = wine_initializer('val')
feat_labels = columns[1:]  # drop the class-label column, keep the 13 feature names

forest = RandomForestClassifier(n_estimators=1000, random_state=0,
                                n_jobs=-1).fit(x_train, y_train)
# n_jobs=-1 uses all available CPU cores

importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(x_train.shape[1]):
    # indices is sorted by descending importance, so index feat_labels the same way
    print("%2d) %-*s %f" %
          (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
    # print("{0:2d}) {1:<30s} {2:f}".format(f + 1, feat_labels[indices[f]], importances[indices[f]]))

plt.title("Feature Importances")
plt.bar(range(x_train.shape[1]),
        importances[indices],
        color='lightblue',
        align='center')
plt.xticks(range(x_train.shape[1]),
           [feat_labels[i] for i in indices],  # reorder labels to match the sorted bars
           rotation=90)
plt.xlim([-1, x_train.shape[1]])
plt.tight_layout()
plt.show()
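# Beyond ranking, the fitted forest can also drive feature selection; a hedged
# sketch using scikit-learn's SelectFromModel (the 0.1 threshold is arbitrary here):
# from sklearn.feature_selection import SelectFromModel
# sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
# x_selected = sfm.transform(x_train)
# print('Features above the threshold:', x_selected.shape[1])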
"""
L1 regularization to obtain sparse solutions and, through them, more
well-defined decision regions
"""

import matplotlib.pyplot as plt
import numpy as np
from wine_comon_funcs import wine_initializer
from sklearn.linear_model import LogisticRegression

x_train_std, y_train, x_test_std, y_test, columns = wine_initializer()

lr = LogisticRegression(penalty='l1', C=0.1, multi_class='auto',
                        solver='liblinear')  # penalty='l1' selects L1 regularization
lr.fit(x_train_std, y_train)

"""
print('Training accuracy: ',lr.score(x_train_std, y_train))
print('Test accuracy: ',lr.score(x_test_std, y_test))
print(lr.intercept_)
print(lr.coef_)
"""

fig = plt.figure()
ax = plt.subplot(111)
colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black',
          'pink', 'lightgreen', 'lightblue', 'gray', 'indigo', 'orange']

weights, params = [], []
for c in np.arange(-4, 6):
    lr = LogisticRegression(penalty='l1', C=10.0**c, random_state=0,
                            multi_class='auto', solver='liblinear')
    lr.fit(x_train_std, y_train)
    weights.append(lr.coef_[1])
    params.append(10.0**c)  # record C so the regularization path can be plotted
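
# The script appears to end mid-way; a hedged sketch of the usual completion,
# plotting each feature's weight against C on a logarithmic axis (columns, colors,
# fig and ax are all defined above):
weights = np.array(weights)
for column, color in zip(range(weights.shape[1]), colors):
    ax.plot(params, weights[:, column], label=columns[column + 1], color=color)
plt.axhline(0, color='black', linestyle='--', linewidth=3)
plt.xscale('log')
plt.xlabel('C')
plt.ylabel('weight coefficient')
plt.legend(loc='upper left')
plt.show()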
"""
Bar chart of the linear discriminants of the wine dataset, found manually from
the within-class and between-class scatter matrices (manual LDA)
"""

from wine_comon_funcs import wine_initializer, wine_matrix_init
import numpy as np
import matplotlib.pyplot as plt

mean_vectors, sc_w, sc_b = wine_matrix_init()
x_train_std, y_train, x_test_std, y_test, x, y = wine_initializer('sc_mat')
# print(sc_b)

# Eigendecomposition of inv(S_W).dot(S_B) yields the linear discriminants
eigen_vals, eigen_vecs = np.linalg.eig(np.linalg.inv(sc_w).dot(sc_b))
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
               for i in range(len(eigen_vals))]
eigen_pairs = sorted(eigen_pairs, key=lambda k: k[0], reverse=True)
"""
print('Eigenvalues in decreasing order:')
for eigen_val in eigen_pairs:
    print(eigen_val[0])
"""

total = sum(eigen_vals.real)
dis_cr = [(i / total)
          for i in sorted(eigen_vals.real, reverse=True)]  # per-discriminant "discriminability" ratio
cum_dis_cr = np.cumsum(dis_cr)  # cumulative "discriminability" ratio

plt.bar(range(1, 14),
        dis_cr,
        alpha=0.5,
        align='center')
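
# The section appears to be cut off at the bar() call above; a hedged completion
# mirroring the usual pattern (cumulative ratio as a step plot, then axis labels):
plt.step(range(1, 14), cum_dis_cr, where='mid')
plt.ylabel('"discriminability" ratio')
plt.xlabel('Linear discriminants')
plt.show()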