Example #1
def validity(tokenize_topics):
    """Measure entropy for different number of topics."""
    Entropy = []
    K = [10, 20, 30, 40, 50]

    for k in K:

        topics, publish_years, lda = lda_function(k)
        tokenize_topics = [nltk.word_tokenize(topic) for topic in topics]
        elements = []
        Topics2 = list(range(len(tokenize_topics)))  # one index per LDA topic
        # assign each paper to its most probable LDA topic
        for i in range(len(papers)):
            elements.append(Topics2[max(lda[corpus[i]], key=itemgetter(1))[0]])
        # compute cluster validity:
        Entropy.append(clusterval(y, elements)[0])

    plt.figure(5)
    plt.title('Cluster validity')
    plt.plot(K, Entropy)
    plt.xlabel('Number of topics')
    plt.ylim(0, 1.1)
    plt.show()

    return Entropy
Example #2
def Evaluate(input_data, index_to_check):
    X = input_data[:, :7]

    y = np.argmax(input_data[:, 7:10], 1)
    # X = StandardScaler().fit_transform(X)

    N, M = np.shape(X)

    split_index = int(X.shape[0] * 0.5)
    print(split_index)
    X_train = X[:split_index, :]
    X_test = X[split_index:, :]
    y_test = y[split_index:]

    # Maximum number of clusters:
    K = 10

    # Allocate variables:
    Rand = np.zeros((K, ))
    Jaccard = np.zeros((K, ))
    NMI = np.zeros((K, ))

    for k in range(K):
        cls = GaussianMixture(n_components=k + 1,  # fit k + 1 components so the scores vary with the cluster count
                              covariance_type="full",
                              n_init=10).fit(X)
        Rand[k], Jaccard[k], NMI[k] = clusterval(y.ravel(), cls.predict(X))
        print(Rand[k], Jaccard[k], NMI[k])

    # Plot results:

    figure(1)
    title('Cluster validity ')
    plot(np.arange(K) + 1, Rand)
    plot(np.arange(K) + 1, Jaccard)
    plot(np.arange(K) + 1, NMI)
    ylim(-2, 1.1)
    legend(['Rand', 'Jaccard', 'NMI'], loc=4)
    show()

    print('Ran Exercise 10.1.3')
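
A hedged usage sketch (not from the original source): Evaluate expects an (N, 10) array whose first 7 columns are features and whose last 3 columns one-hot encode the class, so it can be exercised on synthetic data like this (the data layout and the call below are assumptions for illustration only):

import numpy as np

rng = np.random.default_rng(0)
features = rng.normal(size=(300, 7))              # 7 feature columns
labels = np.eye(3)[rng.integers(0, 3, size=300)]  # 3 one-hot class columns
data = np.hstack([features, labels])

Evaluate(data, index_to_check=0)  # index_to_check is not used inside the function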
Example #3
# Maximum number of clusters:
K = 10

# Allocate variables:
Entropy = np.zeros((K, 1))
Purity = np.zeros((K, 1))
Rand = np.zeros((K, 1))
Jaccard = np.zeros((K, 1))
OtherMetrics = np.zeros((K, 5))

for k in range(K):
    # run K-means clustering:
    #cls = Pycluster.kcluster(X,k+1)[0]
    centroids, cls, inertia = k_means(X, k + 1)
    # compute cluster validities:
    Entropy[k], Purity[k], Rand[k], Jaccard[k] = clusterval(y, cls)
    # compute other metrics from sklearn (in recent scikit-learn these live directly in sklearn.metrics, e.g. sklearn.metrics.completeness_score)
    OtherMetrics[k, 0] = cluster_metrics.supervised.completeness_score(
        y.A.ravel(), cls)
    OtherMetrics[k, 1] = cluster_metrics.supervised.homogeneity_score(
        y.A.ravel(), cls)
    OtherMetrics[k, 2] = cluster_metrics.supervised.mutual_info_score(
        y.A.ravel(), cls)
    OtherMetrics[k, 3] = cluster_metrics.supervised.v_measure_score(
        y.A.ravel(), cls)
    OtherMetrics[k, 4] = cluster_metrics.supervised.adjusted_rand_score(
        y.A.ravel(), cls)

# Plot results:

figure(1)
Example #4
# result as the starting point. K-means might converge faster/better than
# random, but might also cause the algorithm to get stuck in a poor local minimum

# type of covariance; you can try out 'diag' as well
# number of fits with different initializations; the best result will be kept
reps = 1
# Fit Gaussian mixture model
gmm = GaussianMixture(n_components=K,
                      covariance_type=cov_type,
                      n_init=reps,
                      tol=1e-6,
                      reg_covar=1e-6,
                      init_params=initialization_method).fit(X)
cls = gmm.predict(X)  # extract cluster labels
print(cls)
Rand, Jaccard, NMI, purity = clusterval(y, cls)
cds = gmm.means_  # extract cluster centroids (means of gaussians)
covs = gmm.covariances_  # extract cluster shapes (covariances of gaussians)
if cov_type.lower() == 'diag':
    new_covs = np.zeros([K, M, M])

    count = 0
    for elem in covs:
        new_covs[count] = np.diag(elem)  # expand the diagonal into a full M x M matrix
        count += 1

    covs = new_covs
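
The expansion loop above can be written more compactly; a minimal equivalent sketch, assuming covs holds the K diagonal vectors returned by the 'diag' covariance type:

import numpy as np

# stack K full M x M matrices built from the K diagonal vectors
covs = np.array([np.diag(c) for c in covs])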
Example #5
classNames = [name[0][0] for name in mat_data['classNames']]
N, M = X.shape
C = len(classNames)

# Maximum number of clusters:
K = 10

# Allocate variables:
Rand = np.zeros((K, ))
Jaccard = np.zeros((K, ))
NMI = np.zeros((K, ))

for k in range(K):
    # run K-means clustering:
    #cls = Pycluster.kcluster(X,k+1)[0]
    centroids, cls, inertia = k_means(X, k + 1)
    # compute cluster validities:
    Rand[k], Jaccard[k], NMI[k] = clusterval(y, cls)

# Plot results:

figure(1)
title('Cluster validity')
plot(np.arange(K) + 1, Rand)
plot(np.arange(K) + 1, Jaccard)
plot(np.arange(K) + 1, NMI)
ylim(-2, 1.1)
legend(['Rand', 'Jaccard', 'NMI'], loc=4)
show()

print('Ran Exercise 10.1.3')
Example #6
    ax.set(title='Plot of the best fitting GMM')
    plt.show()


if __name__ == '__main__':
    # Create data set
    seed = 56
    np.random.seed(seed)
    X, y = create_dataset()

    n_components_range = range(1, 3)
    cv_types = ['full']

    models = create_gmm_models(cv_types, n_components_range)
    best_gmm, score = my_cv(X, y, models, K_out=10, K_in=10, seed=seed)
    best_gmm.fit(X)
    clf = best_gmm.predict(X)
    cent = best_gmm.means_
    covars = best_gmm.covariances_

    plot_accuracy(seed)
    plot_gmms(cv_types, n_components_range, best_gmm)

    #Cluster validity
    rand, jaccard, NMI = clusterval(y, clf)
    print('''\n ----- Quality Check of GMM ----- \n
    Rand index score: {} \n
    Jaccard similarity score: {} \n
    Normalized Mutual Information score: {} \n
    '''.format(rand, jaccard, NMI))
Example #7
for k in range(1, K + 1):
    # run K-means clustering:
    #cls = Pycluster.kcluster(X,k+1)[0]
    #centroids, cls, inertia = k_means(X,k+1)
    # compute cluster validities:

    cls1 = fcluster(Z, criterion='maxclust', t=k)
    gmm = GaussianMixture(n_components=k,
                          covariance_type=cov_type,
                          n_init=reps,
                          tol=1e-6,
                          reg_covar=1e-6,
                          init_params=initialization_method).fit(X)
    cls2 = gmm.predict(X)

    # store at index k - 1 so the scores line up with the x-axis (1..K) used below
    Rand1[k - 1], Jaccard1[k - 1], NMI1[k - 1] = clusterval(Y, cls1)
    Rand2[k - 1], Jaccard2[k - 1], NMI2[k - 1] = clusterval(Y, cls2)

# Plot results:

figure(1)
title('Cluster validity for Hierarchal')
plot(np.arange(K) + 1, Rand1)
plot(np.arange(K) + 1, Jaccard1)
plot(np.arange(K) + 1, NMI1)
legend(['Rand', 'Jaccard', 'NMI'], loc=4)
show()

figure(2)
title('Cluster validity for GMM')
plot(np.arange(K) + 1, Rand2)
Example #8
max_display_levels = 10
plt.figure(2, figsize=(10, 4))
dendrogram(Z, truncate_mode='level', p=max_display_levels)

plt.show()

# Calculate validities

y = y.flatten()
for m in range(len(Methods)):
    # run hierarchical clustering:
    Z = linkage(X, method=Methods[m], metric=Metric)
    Maxclust = 2
    cls = fcluster(Z, criterion='maxclust', t=Maxclust)
    # compute cluster validities:
    Rand[m], Jaccard[m], NMI[m] = clusterval(y, cls)
        
# Plot results:

plt.figure(5)
plt.title('Cluster validity')
plt.plot(np.arange(len(Methods)) + 1, Rand)
plt.plot(np.arange(len(Methods)) + 1, Jaccard)
plt.plot(np.arange(len(Methods)) + 1, NMI)
plt.legend(['Rand', 'Jaccard', 'NMI'], loc=4)
plt.show()


# PCA

# PCA by computing SVD of X_tilde
Example #9
M = len(attributeNames)
C = len(classNames)

cov_type = 'diag'  # covariance type; you can also try 'full'
reps = 10  # number of fits with different initializations; the best result will be kept


# Set K to the value found earlier through cross-validation
K = 9

#Fit GMM to data
gmm = GaussianMixture(n_components=K, covariance_type=cov_type, n_init=reps).fit(X)
cls = gmm.predict(X)

#Compute error with respect to actual classes
rand_gmm, Jaccard_gmm, NMI_gmm = clusterval(np.asarray(y).ravel(), cls)

# Perform hierarchical/agglomerative clustering on data matrix
Method = 'complete'
Metric = 'euclidean'

Z = linkage(X, method=Method, metric=Metric)
cls = fcluster(Z, criterion='maxclust', t=K)

rand_h, Jaccard_h, NMI_h = clusterval(np.asarray(y).ravel(), cls)

print("GMM:")
print("rand: {0}".format(rand_gmm))
print("Jaccard: {0}".format(Jaccard_gmm))
print("NMI: {0}".format(NMI_gmm))
gmm = GaussianMixture(n_components=K_optimal,
                      covariance_type=covar_type,
                      n_init=reps).fit(X)
cls = gmm.predict(X)  # extract cluster labels
cds = gmm.means_  # extract cluster centroids (means of gaussians)
covs = gmm.covariances_
plt.figure(figsize=(12, 9))
clusterplot(X, clusterid=cls, centroids=cds, y=y, covars=covs)
plt.title('Gaussian Mixture Model using {} clusters'.format(K_optimal))
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()
# Evaluate GMM model
Rand_gmm, Jaccard_gmm, NMI_gmm = clusterval(y, cls)

print('###################################################')
print('#             HIERARCHICAL CLUSTERING             #')
print('###################################################')
Metric = 'euclidean'
Maxclust = K_optimal
max_display_levels = K_optimal
Methods = ['single', 'complete', 'average', 'weighted', 'median',
           'ward']  # We will try all these methods
n_methods = len(Methods)

# Allocate variables:
Rand_hier = np.zeros((n_methods, ))
Jaccard_hier = np.zeros((n_methods, ))
NMI_hier = np.zeros((n_methods, ))
# Maximum number of clusters:
K = 10

# Allocate variables:
Entropy = np.zeros((K, 1))
Purity = np.zeros((K, 1))
Rand = np.zeros((K, 1))
Jaccard = np.zeros((K, 1))
OtherMetrics = np.zeros((K, 5))

for k in range(K):
    # run K-means clustering:
    #cls = Pycluster.kcluster(X,k+1)[0]
    centroids, cls, inertia = k_means(X, k + 1)
    # compute cluster validities:
    Entropy[k], Purity[k], Rand[k], Jaccard[k] = clusterval(y, cls)
    # compute other metrics, implemented in the sklearn.metrics package
    OtherMetrics[k, 0] = cluster_metrics.supervised.completeness_score(y.A.ravel(), cls)
    OtherMetrics[k, 1] = cluster_metrics.supervised.homogeneity_score(y.A.ravel(), cls)
    OtherMetrics[k, 2] = cluster_metrics.supervised.mutual_info_score(y.A.ravel(), cls)
    OtherMetrics[k, 3] = cluster_metrics.supervised.v_measure_score(y.A.ravel(), cls)
    OtherMetrics[k, 4] = cluster_metrics.supervised.adjusted_rand_score(y.A.ravel(), cls)

        
# Plot results:

figure(1)
title('Cluster validity')
plot(np.arange(K) + 1, -Entropy)
plot(np.arange(K) + 1, Purity)
Example #12
        # extract training and test set for current CV fold
        X_train = X[train_index]
        X_test = X[test_index]

        # Fit Gaussian mixture model to X_train
        gmm = GaussianMixture(n_components=K,
                              covariance_type=covar_type,
                              n_init=reps).fit(X_train)
        # compute negative log likelihood of X_test
        CVE[t] += -gmm.score_samples(X_test).sum()

# Plot results

figure(1)
plot(KRange, CVE, '-ok')
xlabel('K')
savefig('GMM_with_y.png')
show()

Rand = np.zeros((1, ))
Jaccard = np.zeros((1, ))
NMI = np.zeros((1, ))
Rand[0], Jaccard[0], NMI[0] = clusterval(y, cls[8])

print("\n Rand: ", Rand, "Jaccard: ", Jaccard, "NMI: ", NMI)

print("Lowest score: ", np.min(CVE))
print("Highest score: ", np.max(CVE))

print("Cluster means: ", cds[8])
Example #13
import numpy as np

X = X_standard
y = X[:, 9]  # make y the chd column
X = X[:, :9]  # drop the chd column

covar_type = 'full'  # you can try out 'diag' as well
reps = 3  # number of fits with different initializations; the best result will be kept
init_procedure = 'kmeans'

gmm = GaussianMixture(n_components=8,
                      covariance_type=covar_type,
                      n_init=reps,
                      init_params=init_procedure,
                      tol=1e-6,
                      reg_covar=1e-6).fit(X)

link = linkage(X, method="ward", metric="euclidean")
# Compute clusters by thresholding the dendrogram
Maxclust = 8
cls = fcluster(link, criterion='maxclust', t=Maxclust)

b = gmm.predict(X)

Rand_hc, Jaccard_hc, NMI_hc = clusterval(cls, y)
Rand_gmm, Jaccard_gmm, NMI_gmm = clusterval(b, y)
# the exercise script 10_1_3 unpacks the output of clusterval this way,
# while help(clusterval) seems to describe a different return order.
print(Rand_hc, Jaccard_hc, NMI_hc)
print(Rand_gmm, Jaccard_gmm, NMI_gmm)
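
As the comment above notes, the number and order of values returned by clusterval differ between toolbox versions (older versions return Entropy, Purity, Rand, Jaccard as in Example #3; newer ones return Rand, Jaccard, NMI). A minimal defensive sketch, assuming only that clusterval returns a tuple and that y and cls are defined as in the example:

from toolbox_02450 import clusterval

metrics = clusterval(y, cls)   # length and order depend on the installed toolbox version
print(len(metrics), metrics)   # inspect before unpacking into named scores
help(clusterval)               # the docstring states the actual return order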
Example #14
unique, counts = np.unique(cls, return_counts=True)

# Display dendrogram
max_display_levels = 2
plt.figure(2, figsize=(10, 4))
dendrogram(Z, truncate_mode='level', p=max_display_levels)
plt.show()

# Calculate validities
for m in range(len(Methods)):
    # run hierarchical clustering:
    Z = linkage(X, method=Methods[m], metric=Metric)
    Maxclust = 2
    cls = fcluster(Z, criterion='maxclust', t=Maxclust)
    # compute cluster validities:
    Rand[m], Jaccard[m], NMI[m] = clusterval(y, cls)

# Plot results:

plt.figure(5)
plt.title('Cluster validity')
plt.plot(np.arange(len(Methods)) + 1, Rand)
plt.plot(np.arange(len(Methods)) + 1, Jaccard)
plt.plot(np.arange(len(Methods)) + 1, NMI)
plt.legend(['Rand', 'Jaccard', 'NMI'], loc=4)
plt.show()

# PCA

# PCA by computing SVD of X_tilde
U, S, V = svd(X, full_matrices=False)
Example #15
    count = 0
    for elem in covs:
        new_covs[count] = np.diag(elem)  # expand the diagonal into a full M x M matrix
        count += 1

    covs = new_covs

# Plot results:
#figure(figsize=(14,9))
#clusterplot(X, clusterid=cls, centroids=cds, y=y, covars=covs)
#show()
print(cds)

## If the number of features != 2, a subset of features must be plotted instead.
figure(figsize=(14, 9))
idx = [4, 1]  # feature indices; choose two features to use as x and y axis in the plot
clusterplot(X[:, idx], clusterid=cls, centroids=cds[:, idx], y=y, covars=covs[:, idx, :][:, :, idx])
ylabel("glucose")
xlabel("insulin")
show()

Rand, Jaccard, NMI = clusterval(y, cls)
print(Rand, Jaccard, NMI)
print(type(y))
print(y)
print(attributeNames)

print('Ran Exercise 11.1.1')
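
The snippet above plots two raw feature columns (idx = [4, 1]). A hedged alternative sketch, assuming X is a plain NumPy array and cls holds the cluster labels from the example, is to project the data onto the first two principal components and scatter-plot the assignment there (plain matplotlib, so the Gaussian covariance ellipses drawn by clusterplot are not shown):

import numpy as np
import matplotlib.pyplot as plt

Xc = X - X.mean(axis=0)                        # center the data before PCA
U, S, Vt = np.linalg.svd(Xc, full_matrices=False)
Z2 = Xc @ Vt[:2].T                             # scores on PC 1 and PC 2

plt.figure(figsize=(8, 6))
plt.scatter(Z2[:, 0], Z2[:, 1], c=cls, cmap='tab10', s=15)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.title('Clusters projected onto the first two principal components')
plt.show()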
Example #16
from matplotlib.pyplot import figure, plot, ylim, title, legend, xlabel, ylabel, show
import pickle
from toolbox_02450 import clusterval
from _load_data import *

# fetch data
gmm_f = open('gmm_data.pckl', 'rb')
gmm = pickle.load(gmm_f)
gmm_f.close()
bestK = gmm[0]
clsGMM = gmm[3]

hier_f = open('hier_data.pckl', 'rb')
hier = pickle.load(hier_f)
hier_f.close()
clsHIER = hier[0]

# Quality Evaluation
# Allocate variables:
Rand = np.zeros((2))
Jaccard = np.zeros((2))
NMI = np.zeros((2))

# compute cluster validities:
Rand[0], Jaccard[0], NMI[0] = clusterval(Y, clsGMM)
Rand[1], Jaccard[1], NMI[1] = clusterval(Y, clsHIER)

# Save data results
f = open('eval_data.pckl', 'wb')
pickle.dump([Rand, Jaccard, NMI], f)
f.close()
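
A small usage sketch (not part of the original script) showing how the saved results could be read back later, assuming eval_data.pckl was written exactly as above:

import pickle

with open('eval_data.pckl', 'rb') as f:
    Rand, Jaccard, NMI = pickle.load(f)

for name, i in [('GMM', 0), ('Hierarchical', 1)]:
    print('{}: Rand={:.3f}, Jaccard={:.3f}, NMI={:.3f}'.format(
        name, Rand[i], Jaccard[i], NMI[i]))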