Пример #1
0
# casestudy_iris_pca.py

import data_iris
import hierarchical
import matplotlib.pyplot as plt
import metrics
import numpy as np
import pca
import plot_data

# (1) load the iris data set: feature matrix and true class labels
iris = data_iris.iris()
X, class_label = iris.load()
# run PCA on the full feature matrix and keep a 2-dimensional projection
model_pca = pca.pca()
model_pca.fit(X)
R = model_pca.data_reduced_dimension(reduced_dim=2)
plot_data.plot_scatter_class(
    R,
    class_label,
    "Iris Data Projected to 2 Dimensions using PCA",
    "u0",
    "u1",
)
# (2) create the hierarchical clustering model
model = hierarchical.hierarchical()
# (3) fit it on the projected data and report the recorded fit time
model.fit(R)
print("Time fit: {}".format(model.time_fit))
# (4) results
# `level` indexes model.clustersave; every metric and plot below uses the
# same saved assignment.
# NOTE(review): -3 presumably selects the stage with three clusters left
# (matching the three iris classes) -- confirm against hierarchical.py.
level = -3
cluster_assignment = model.clustersave[level]
purity = metrics.purity(cluster_assignment, class_label)
print("Purity: {}".format(purity))
davies_bouldin = metrics.davies_bouldin(R, cluster_assignment)
print("Davies-Bouldin: {}".format(davies_bouldin))
silhouette = metrics.silhouette(R, cluster_assignment)
print("Silhouette: {}".format(silhouette))
model.plot_cluster(
    nlevel=level,
    title="Hierarchical Clustering for Iris Dataset reduced to 2d",
    xlabel="u0",
    ylabel="u1",
)
metrics.plot_cluster_distribution(cluster_assignment, class_label)
plt.show()
Пример #2
0
# (1) load the iris data set: feature matrix and true class labels
iris = data_iris.iris()
X, class_label = iris.load()
# perform pca and reduce dimension to 2
model_pca = pca.pca()
model_pca.fit(X)
R = model_pca.data_reduced_dimension(reduced_dim=2)
plot_data.plot_scatter_class(R, class_label,
                             "Iris Data Projected to 2 Dimensions using PCA",
                             "u0", "u1")
# (2) create model: Gaussian mixture with 3 clusters, kmeans++ initialization
ncluster = 3
initialization = "kmeans++"
model = gaussianmm.gaussianmm(ncluster, initialization)
# (3) fit model on the 2-d projection (not on the original features)
max_iter = 100
tolerance = 1e-4
model.fit(R, max_iter, tolerance)
print("Time fit: {}".format(model.time_fit))
# (4) results
# level = -1 is the last entry saved in model.clustersave.
level = -1
cluster_assignment = model.clustersave[level]
print("Purity: {}".format(metrics.purity(cluster_assignment, class_label)))
# Score cluster quality on R, the data the model was actually fit on, so the
# number matches the "reduced to 2d" plot below (previously scored against
# the original feature matrix X).
print("Davies-Bouldin: {}".format(
    metrics.davies_bouldin(R, cluster_assignment)))
model.plot_cluster(
    nlevel=level,
    title="GaussianMM Clustering for Iris Dataset reduced to 2d",
    xlabel="u0",
    ylabel="u1")
metrics.plot_cluster_distribution(cluster_assignment, class_label)
plt.show()
     # NOTE(review): this excerpt starts mid-branch; the loops/conditions that
     # define `model`, `dataset`, `i`, `j`, `axes` and `X` are not visible
     # here, and the indentation appears to have been flattened.
     # Instantiate the clustering implementation named by `model`, using
     # fixed hyper-parameters for this branch.
     if model == "K Means":
         mod = kmeans.kmeans(ncluster=2, initialization='kmeans++')
     elif model == "GaussianMM":
         mod = gaussianmm.gaussianmm(ncluster=2,
                                     initialization='kmeans++')
     elif model == "DBSCAN":
         mod = dbscan.dbscan(minpts=5, epsilon=0.18)
 # fit model
 print("Model: {}".format(model))
 # DBSCAN's fit is called with the data only; the other models also receive
 # 100, 1e-5 and False.
 # NOTE(review): presumably max iterations, tolerance and a verbosity flag --
 # confirm against the kmeans/gaussianmm fit signatures.
 if model == "DBSCAN":
     mod.fit(X[dataset])
 else:
     mod.fit(X[dataset], 100, 1e-5, False)
 print("Time fit: {}".format(mod.time_fit))
 # davies-bouldin and silhouette
 # Both metrics are computed on the fitted data with the last saved
 # cluster assignment.
 db = metrics.davies_bouldin(X[dataset], mod.clustersave[-1])
 s = metrics.silhouette(X[dataset], mod.clustersave[-1])
 print("Davies-Bouldin: {}".format(db))
 print("Silhouette: {}".format(s))
 # Scale the cluster labels by the cluster count to get the values fed to
 # the jet colormap.
 # NOTE(review): assumes integer labels; confirm how DBSCAN noise points are
 # labelled and that mod.ncluster is non-zero for DBSCAN.
 colors = (mod.clustersave[-1] + 1) / mod.ncluster
 # Scatter row 0 of the data against row 1 in subplot (i, j).
 axes[i, j].scatter(X[dataset][0, :],
                    X[dataset][1, :],
                    color=cm.jet(colors),
                    s=15)
 # Hide tick labels on both axes of this subplot.
 axes[i, j].set_xticklabels([])
 axes[i, j].set_yticklabels([])
 # Only when i == 0 is the model name prepended to the metric summary.
 # NOTE(review): `title` is not used within this excerpt; it is presumably
 # applied to the subplot in code that follows.
 if i == 0:
     title = model + "\ndb:{:.2f} s:{:.2f} t:{:.3f}".format(
         db, s, mod.time_fit)
 else:
     title = "db: {:.2f} s:{:.2f} t:{:.3f}".format(db, s, mod.time_fit)
# Accumulated Davies-Bouldin evaluation time (seconds, summed over the
# repeated runs) for each entry of array_nsample.
array_time = np.zeros((np.size(array_nsample)))

# (2) generate time data
# run 5 times to smooth out the data
nrun = 5
ncluster = 3
for idx, nsample in enumerate(array_nsample):
    # Cluster and score the same subset: the first nsample columns of X.
    # (Scoring the full X against assignments for only nsample columns
    # mismatches data and labels and does not time the intended workload.)
    X_sample = X[:, 0:nsample]
    for _ in range(nrun):
        # create and fit the model; fitting is deliberately not timed
        model = kmeans.kmeans(ncluster, "random")
        model.fit(X_sample, 30, 1e-5, False)
        # time only the metric evaluation
        # (swap in metrics.silhouette here to time that metric instead)
        time_start = time.perf_counter()
        db = metrics.davies_bouldin(X_sample, model.clustersave[-1])
        array_time[idx] += time.perf_counter() - time_start
    print("Dimension: {}  Time Fit: {}".format(nsample, array_time[idx]))

# determine power
# Fit a straight line to log(time) versus log(sample count); the leading
# coefficient (slope) is reported as the exponent of the power law.
log_nsample, log_time = np.log(array_nsample), np.log(array_time)
coeff = np.polyfit(log_nsample, log_time, deg=1)
p = np.poly1d(coeff)
plognsample = p(log_nsample)
print("Power: {}".format(coeff[0]))
# plot the measurements (red dots) against the fitted line (blue) in
# log-log space
plt.figure()
plt.plot(log_nsample, log_time, "ro", label="Data")
plt.plot(log_nsample, plognsample, "b-", label="Fit")
plt.xlabel("Log Dimension")
plt.ylabel("Log Time")