from sklearn.decomposition import PCA from sklearn.datasets import load_breast_cancer from applied_machine_learning.fundamentals_of_machine_learning.adspy_shared_utilities import plot_labelled_scatter cancer = load_breast_cancer() (X_cancer, y_cancer) = load_breast_cancer(return_X_y=True) # Before applying PCA, each feature should be centered (zero mean) and with unit variance X_normalized = StandardScaler().fit(X_cancer).transform(X_cancer) pca = PCA(n_components=2).fit(X_normalized) X_pca = pca.transform(X_normalized) print(X_cancer.shape, X_pca.shape) plot_labelled_scatter(X_pca, y_cancer, ['malignant', 'benign']) plt.xlabel('First principal component') plt.ylabel('Second principal component') plt.title('Breast Cancer Dataset PCA (n_components = 2)') # Plotting the magnitude of each feature value for the first two principal components fig = plt.figure(figsize=(8, 4)) plt.imshow(pca.components_, interpolation='none', cmap='plasma') feature_names = list(cancer.feature_names) plt.gca().set_xticks(np.arange(-.5, len(feature_names))) plt.gca().set_yticks(np.arange(0.5, 2)) plt.gca().set_xticklabels(feature_names, rotation=90, ha='left', fontsize=12) plt.gca().set_yticklabels(['First PC', 'Second PC'], va='bottom', fontsize=12) plt.colorbar(orientation='horizontal', ticks=[pca.components_.min(), 0,
# K-means demo: cluster a seeded synthetic blob data set into three groups
# and draw the assignments with the shared scatter-plot helper.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

from applied_machine_learning.fundamentals_of_machine_learning.adspy_shared_utilities import plot_labelled_scatter

# Only the seed is overridden; every other make_blobs setting is the default.
X, y = make_blobs(random_state=10)

# fit() returns the estimator itself, so construction and fitting are chained.
kmeans = KMeans(n_clusters=3).fit(X)

cluster_names = ['Cluster 1', 'Cluster 2', 'Cluster 3']
plot_labelled_scatter(X, kmeans.labels_, cluster_names)
# DBSCAN demo: density-based clustering of a small seeded blob data set.
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs

from applied_machine_learning.fundamentals_of_machine_learning.adspy_shared_utilities import plot_labelled_scatter

X, y = make_blobs(n_samples=25, random_state=9)

dbscan = DBSCAN(eps=2, min_samples=2)
cls = dbscan.fit_predict(X)

membership_report = "Cluster membership values:\n{}".format(cls)
print(membership_report)

# scikit-learn's DBSCAN marks noise samples with the label -1; adding one
# moves noise to 0 so it lines up with the first legend entry ('Noise').
shifted_labels = cls + 1
legend_names = ['Noise', 'Cluster 0', 'Cluster 1', 'Cluster 2']
plot_labelled_scatter(X, shifted_labels, legend_names)
# t-SNE demo: embed the standardised fruit measurements in two dimensions
# and plot the embedding coloured by fruit type.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

from applied_machine_learning.fundamentals_of_machine_learning.adspy_shared_utilities import plot_labelled_scatter

fruits = pd.read_table('../resources/fruit_data_with_colors.txt')
feature_names_fruits = ['height', 'width', 'mass', 'color_score']
target_fruit_names = ['apple', 'mandarin', 'orange', 'lemon']

X_fruits = fruits[feature_names_fruits]
# Shift the labels to be zero-based so that each one indexes its name in
# target_fruit_names when handed to the plotting helper.
# NOTE(review): assumes fruit_label in the data file is 1-based (1..4) and
# that plot_labelled_scatter expects zero-based labels -- confirm both.
y_fruits = fruits['fruit_label'] - 1

# Each feature should be centred (zero mean) and scaled to unit variance
# before the embedding is computed.
X_fruits_normalized = StandardScaler().fit_transform(X_fruits)

# Fixed seed keeps the embedding reproducible from run to run.
tsne = TSNE(random_state=0)
X_tsne = tsne.fit_transform(X_fruits_normalized)

# Reuse the single list of class names rather than repeating the literal.
plot_labelled_scatter(X_tsne, y_fruits, target_fruit_names)
plt.xlabel('First t-SNE feature')
plt.ylabel('Second t-SNE feature')
plt.title('Fruits dataset t-SNE')
# Agglomerative clustering demo: merge a seeded blob data set into three
# clusters and plot the resulting assignment.
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs

from applied_machine_learning.fundamentals_of_machine_learning.adspy_shared_utilities import plot_labelled_scatter

X, y = make_blobs(random_state=10)

cls = AgglomerativeClustering(n_clusters=3)
# fit_predict fits the model and returns one cluster label per sample.
cls_assignment = cls.fit_predict(X)

cluster_names = ['Cluster 1', 'Cluster 2', 'Cluster 3']
plot_labelled_scatter(X, cls_assignment, cluster_names)
# K-means demo on the fruit data set: scale the four measurements with
# MinMaxScaler, fit four clusters, and plot the cluster assignment.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

from applied_machine_learning.fundamentals_of_machine_learning.adspy_shared_utilities import plot_labelled_scatter

fruits = pd.read_table('../resources/fruit_data_with_colors.txt')
feature_names_fruits = ['height', 'width', 'mass', 'color_score']
target_fruit_names = ['apple', 'mandarin', 'orange', 'lemon']

X_fruits = fruits[feature_names_fruits].values
# Labels shifted down by one; not used further in this script.
y_fruits = fruits[['fruit_label']] - 1

# fit_transform is the one-call form of fit(...).transform(...) on the same data.
X_fruits_normalized = MinMaxScaler().fit_transform(X_fruits)

# Fixed seed so the cluster numbering is reproducible; fit() returns the estimator.
kmeans = KMeans(n_clusters=4, random_state=0).fit(X_fruits_normalized)

cluster_names = ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4']
plot_labelled_scatter(X_fruits_normalized, kmeans.labels_, cluster_names)
# Hierarchical clustering demo: plot ten seeded blob samples with their
# generated labels, print the raw coordinates, then draw the Ward dendrogram.
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, ward
from sklearn.datasets import make_blobs

from applied_machine_learning.fundamentals_of_machine_learning.adspy_shared_utilities import plot_labelled_scatter

X, y = make_blobs(random_state=10, n_samples=10)

cluster_names = ['Cluster 1', 'Cluster 2', 'Cluster 3']
plot_labelled_scatter(X, y, cluster_names)
print(X)

# New figure so the dendrogram does not draw over the scatter plot.
plt.figure()
# ward() builds the linkage matrix that dendrogram() renders.
linkage_matrix = ward(X)
dendrogram(linkage_matrix)
plt.show()