# casestudy_iris_pca.py
#
# Case study: project the iris dataset onto its top-2 principal components,
# then run hierarchical clustering on the projection and report
# purity / Davies-Bouldin / silhouette for one level of the hierarchy.
import data_iris
import hierarchical
import matplotlib.pyplot as plt
import metrics
import numpy as np
import pca
import plot_data

# (1) load data
loader = data_iris.iris()
X, class_label = loader.load()

# reduce to 2 dimensions with PCA before clustering
reducer = pca.pca()
reducer.fit(X)
X2d = reducer.data_reduced_dimension(reduced_dim=2)
plot_data.plot_scatter_class(X2d, class_label, "Iris Data Projected to 2 Dimensions using PCA", "u0", "u1")

# (2) create model
clusterer = hierarchical.hierarchical()

# (3) fit model on the reduced data
clusterer.fit(X2d)
print("Time fit: {}".format(clusterer.time_fit))

# (4) results
# snapshot -3 of the saved merge history — presumably the 3-cluster level
# of the hierarchy; confirm against hierarchical.clustersave's convention.
snapshot = -3
print("Purity: {}".format(metrics.purity(clusterer.clustersave[snapshot], class_label)))
print("Davies-Bouldin: {}".format(metrics.davies_bouldin(X2d, clusterer.clustersave[snapshot])))
print("Silhouette: {}".format(metrics.silhouette(X2d, clusterer.clustersave[snapshot])))
clusterer.plot_cluster(nlevel=snapshot, title="Hierarchical Clustering for Iris Dataset reduced to 2d", xlabel="u0", ylabel="u1")
metrics.plot_cluster_distribution(clusterer.clustersave[snapshot], class_label)
plt.show()
# Case study: Gaussian mixture-model clustering of the iris dataset after
# PCA reduction to 2 dimensions; reports purity / Davies-Bouldin / silhouette.
# (1) load data
iris = data_iris.iris()
X, class_label = iris.load()
# perform pca and reduce dimension to 2
model_pca = pca.pca()
model_pca.fit(X)
R = model_pca.data_reduced_dimension(reduced_dim=2)
plot_data.plot_scatter_class(R, class_label, "Iris Data Projected to 2 Dimensions using PCA", "u0", "u1")
# (2) create model
ncluster = 3
initialization = "kmeans++"
model = gaussianmm.gaussianmm(ncluster, initialization)
# (3) fit model
max_iter = 100
tolerance = 1e-4
model.fit(R, max_iter, tolerance)
print("Time fit: {}".format(model.time_fit))
# (4) results — all metrics evaluated on R, the data the model was fitted on
level = -1
print("Purity: {}".format(metrics.purity(model.clustersave[level], class_label)))
# bug fix: Davies-Bouldin was previously computed on the full-dimensional X
# while the cluster labels came from fitting on R; the metric must use the
# same data the labels were produced from (the sibling case-study scripts
# all evaluate on the reduced data).
print("Davies-Bouldin: {}".format(metrics.davies_bouldin(R, model.clustersave[level])))
# silhouette added for consistency with the other case-study scripts
print("Silhouette: {}".format(metrics.silhouette(R, model.clustersave[level])))
model.plot_cluster(nlevel=level, title="GaussianMM Clustering for Iris Dataset reduced to 2d", xlabel="u0", ylabel="u1")
metrics.plot_cluster_distribution(model.clustersave[level], class_label)
plt.show()
# Build, fit, score, and plot one clustering model for one dataset cell of a
# comparison grid.
# NOTE(review): `model` (algorithm name string), `X` (dict of datasets keyed by
# `dataset`), `axes`, `i`, `j`, and `cm` are defined outside this chunk —
# presumably loop variables of an enclosing figure-grid script; confirm
# against the full file.
if model == "K Means":
    mod = kmeans.kmeans(ncluster=2, initialization='kmeans++')
elif model == "GaussianMM":
    mod = gaussianmm.gaussianmm(ncluster=2, initialization='kmeans++')
elif model == "DBSCAN":
    mod = dbscan.dbscan(minpts=5, epsilon=0.18)
# fit model
print("Model: {}".format(model))
# DBSCAN takes no iteration/tolerance arguments; the others take
# (data, max_iter, tolerance, verbose)
if model == "DBSCAN":
    mod.fit(X[dataset])
else:
    mod.fit(X[dataset], 100, 1e-5, False)
print("Time fit: {}".format(mod.time_fit))
# davies-bouldin and silhouette
db = metrics.davies_bouldin(X[dataset], mod.clustersave[-1])
s = metrics.silhouette(X[dataset], mod.clustersave[-1])
print("Davies-Bouldin: {}".format(db))
print("Silhouette: {}".format(s))
# map final cluster labels into [0, 1] for colormap lookup
# (the +1 shifts DBSCAN's -1 noise label to 0)
colors = (mod.clustersave[-1] + 1) / mod.ncluster
axes[i, j].scatter(X[dataset][0, :], X[dataset][1, :], color=cm.jet(colors), s=15)
axes[i, j].set_xticklabels([])
axes[i, j].set_yticklabels([])
# top row of the grid also carries the algorithm name in the title
if i == 0:
    title = model + "\ndb:{:.2f} s:{:.2f} t:{:.3f}".format(db, s, mod.time_fit)
else:
    title = "db: {:.2f} s:{:.2f} t:{:.3f}".format(db, s, mod.time_fit)
# Measure how the Davies-Bouldin computation time scales with the number of
# samples, then fit a power law to the log-log data and plot it.
# NOTE(review): `array_nsample`, `X`, and the imports (np, time, kmeans,
# metrics, plt) are defined outside this chunk.
array_time = np.zeros((np.size(array_nsample)))
# (2) generate time data
# run 5 times to smooth out the data
nrun = 5
for idx in range(np.size(array_nsample)):
    nsample = array_nsample[idx]
    # hoisted loop-invariant slice: the subset actually being clustered
    Xsub = X[:, 0:nsample]
    for _ in range(nrun):
        # (2) create model
        ncluster = 3
        model = kmeans.kmeans(ncluster, "random")
        # (3) fit model (fit time deliberately excluded from the measurement)
        model.fit(Xsub, 30, 1e-5, False)
        time_start = time.time()
        # bug fix: the metric must be computed on the same subset the labels
        # came from — the original passed the full X, whose sample count does
        # not match len(model.clustersave[-1]) whenever nsample < total.
        db = metrics.davies_bouldin(Xsub, model.clustersave[-1])
        array_time[idx] += time.time() - time_start
    # label fix: the loop varies the sample count and times the metric,
    # not the dimension or the fit
    print("Number of samples: {} Time Davies-Bouldin: {}".format(nsample, array_time[idx]))
# determine power-law exponent: slope of the degree-1 fit in log-log space
log_nsample = np.log(array_nsample)
log_time = np.log(array_time)
coeff = np.polyfit(log_nsample, log_time, 1)
p = np.poly1d(coeff)
plognsample = p(log_nsample)
print("Power: {}".format(coeff[0]))
plt.figure()
plt.plot(log_nsample, log_time, "ro", label="Data")
plt.plot(log_nsample, plognsample, "b-", label="Fit")
plt.xlabel("Log Number of Samples")
plt.ylabel("Log Time")