def validity(tokenize_topics):
    """Measure entropy for different numbers of topics."""
    Entropy = []
    K = [10, 20, 30, 40, 50]
    for k in K:
        topics, publish_years, lda = lda_function(k)
        # note: this overwrites the function argument with the tokenization for the current k
        tokenize_topics = [nltk.word_tokenize(topic) for topic in topics]
        elements = []
        Topics2 = [index for index, i in enumerate(tokenize_topics)]
        # take the first manual topic for each paper
        for i in range(len(papers)):
            # index of the most probable LDA topic for paper i
            elements.append(Topics2[max(lda[corpus[i]], key=itemgetter(1))[0]])
        # compute cluster validity:
        Entropy.append(clusterval(y, elements)[0])
    plt.figure(5)
    plt.title('Cluster validity')
    plt.plot(K, Entropy)
    plt.xlabel('Number of topics')
    plt.ylim(0, 1.1)
    plt.show()
    return Entropy
def Evaluate(input_data, index_to_check):
    X = input_data[:, :7]
    y = np.argmax(input_data[:, 7:10], 1)
    # X = StandardScaler().fit_transform(X)
    N, M = np.shape(X)
    split_index = int(X.shape[0] * 0.5)
    print(split_index)
    X_train = X[:split_index, :]
    X_test = X[split_index:, :]
    y_test = y[split_index:]

    # Maximum number of clusters:
    K = 10

    # Allocate variables:
    Rand = np.zeros((K,))
    Jaccard = np.zeros((K,))
    NMI = np.zeros((K,))

    for k in range(K):
        # fit a GMM with k+1 components so the validity curve varies with the cluster count
        cls = GaussianMixture(n_components=k + 1, covariance_type="full",
                              n_init=10).fit(X)
        Rand[k], Jaccard[k], NMI[k] = clusterval(y.ravel(), cls.predict(X))
        print(Rand[k], Jaccard[k], NMI[k])

    # Plot results:
    figure(1)
    title('Cluster validity')
    plot(np.arange(K) + 1, Rand)
    plot(np.arange(K) + 1, Jaccard)
    plot(np.arange(K) + 1, NMI)
    ylim(-2, 1.1)
    legend(['Rand', 'Jaccard', 'NMI'], loc=4)
    show()
    print('Ran Exercise 10.1.3')
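# A minimal cross-check of the validity curve against scikit-learn's own
# supervised indices (an added sketch, not part of the exercise scripts).
# rand_score requires scikit-learn >= 0.24; sklearn ships no pair-counting
# Jaccard index for clusterings, so only Rand and NMI are compared here.
import numpy as np
from sklearn.metrics import rand_score, normalized_mutual_info_score
from sklearn.mixture import GaussianMixture


def sklearn_validity(X, y, max_K=10):
    """Rand index and NMI of GMM clusterings with 1..max_K components."""
    rand, nmi = np.zeros(max_K), np.zeros(max_K)
    for k in range(max_K):
        labels = GaussianMixture(n_components=k + 1, n_init=10).fit(X).predict(X)
        rand[k] = rand_score(y, labels)
        nmi[k] = normalized_mutual_info_score(y, labels)
    return rand, nmi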
# Maximum number of clusters:
K = 10

# Allocate variables:
Entropy = np.zeros((K, 1))
Purity = np.zeros((K, 1))
Rand = np.zeros((K, 1))
Jaccard = np.zeros((K, 1))
OtherMetrics = np.zeros((K, 5))

for k in range(K):
    # run K-means clustering:
    # cls = Pycluster.kcluster(X,k+1)[0]
    centroids, cls, inertia = k_means(X, k + 1)
    # compute cluster validities:
    Entropy[k], Purity[k], Rand[k], Jaccard[k] = clusterval(y, cls)
    # compute other metrics, implemented in the sklearn.metrics package
    # (the old cluster_metrics.supervised.* paths were removed from sklearn)
    OtherMetrics[k, 0] = cluster_metrics.completeness_score(y.A.ravel(), cls)
    OtherMetrics[k, 1] = cluster_metrics.homogeneity_score(y.A.ravel(), cls)
    OtherMetrics[k, 2] = cluster_metrics.mutual_info_score(y.A.ravel(), cls)
    OtherMetrics[k, 3] = cluster_metrics.v_measure_score(y.A.ravel(), cls)
    OtherMetrics[k, 4] = cluster_metrics.adjusted_rand_score(y.A.ravel(), cls)

# Plot results:
figure(1)
# result as the starting point. K-means might converge faster/better than
# random, but might also leave the algorithm stuck in a poor local minimum.
# type of covariance; you can try out 'diag' as well
reps = 1  # number of fits with different initializations; the best result is kept

# Fit Gaussian mixture model
gmm = GaussianMixture(n_components=K, covariance_type=cov_type, n_init=reps,
                      tol=1e-6, reg_covar=1e-6,
                      init_params=initialization_method).fit(X)
cls = gmm.predict(X)      # extract cluster labels
print(cls)
Rand, Jaccard, NMI, purity = clusterval(y, cls)
cds = gmm.means_          # extract cluster centroids (means of gaussians)
covs = gmm.covariances_   # extract cluster shapes (covariances of gaussians)
if cov_type.lower() == 'diag':
    # expand each component's variance vector into a full covariance matrix
    new_covs = np.zeros([K, M, M])
    count = 0
    for elem in covs:
        new_covs[count] = np.diag(elem)
        count += 1
    covs = new_covs
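# Side note (added sketch): under 'diag', each row of gmm.covariances_ holds one
# component's per-feature variances, so the expansion loop above has a compact
# equivalent that recomputes covs directly from the fitted model:
if cov_type.lower() == 'diag':
    covs = np.array([np.diag(c) for c in gmm.covariances_])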
classNames = [name[0][0] for name in mat_data['classNames']]
N, M = X.shape
C = len(classNames)

# Maximum number of clusters:
K = 10

# Allocate variables:
Rand = np.zeros((K,))
Jaccard = np.zeros((K,))
NMI = np.zeros((K,))

for k in range(K):
    # run K-means clustering:
    # cls = Pycluster.kcluster(X,k+1)[0]
    centroids, cls, inertia = k_means(X, k + 1)
    # compute cluster validities:
    Rand[k], Jaccard[k], NMI[k] = clusterval(y, cls)

# Plot results:
figure(1)
title('Cluster validity')
plot(np.arange(K) + 1, Rand)
plot(np.arange(K) + 1, Jaccard)
plot(np.arange(K) + 1, NMI)
ylim(-2, 1.1)
legend(['Rand', 'Jaccard', 'NMI'], loc=4)
show()

print('Ran Exercise 10.1.3')
    ax.set(title='Plot of the best fitting GMM')
    plt.show()


if __name__ == '__main__':
    # Create data set
    seed = 56
    np.random.seed(seed)
    X, y = create_dataset()

    n_components_range = range(1, 3)
    cv_types = ['full']
    models = create_gmm_models(cv_types, n_components_range)
    best_gmm, score = my_cv(X, y, models, K_out=10, K_in=10, seed=seed)
    best_gmm.fit(X)
    clf = best_gmm.predict(X)
    cent = best_gmm.means_
    covars = best_gmm.covariances_

    plot_accuracy(seed)
    plot_gmms(cv_types, n_components_range, best_gmm)

    # Cluster validity
    rand, jaccard, NMI = clusterval(y, clf)
    print('\n ----- Quality Check of GMM ----- \n'
          ' Rand index score: {} \n'
          ' Jaccard similarity score: {} \n'
          ' Normalized Mutual Information score: {} \n'.format(rand, jaccard, NMI))
for k in range(K):
    # leftover K-means alternative from the exercise template:
    # cls = Pycluster.kcluster(X,k+1)[0]
    # centroids, cls, inertia = k_means(X,k+1)

    # cluster labels for k+1 clusters from both methods:
    cls1 = fcluster(Z, criterion='maxclust', t=k + 1)
    gmm = GaussianMixture(n_components=k + 1, covariance_type=cov_type,
                          n_init=reps, tol=1e-6, reg_covar=1e-6,
                          init_params=initialization_method).fit(X)
    cls2 = gmm.predict(X)
    # compute cluster validities:
    Rand1[k], Jaccard1[k], NMI1[k] = clusterval(Y, cls1)
    Rand2[k], Jaccard2[k], NMI2[k] = clusterval(Y, cls2)

# Plot results:
figure(1)
title('Cluster validity for hierarchical clustering')
plot(np.arange(K) + 1, Rand1)
plot(np.arange(K) + 1, Jaccard1)
plot(np.arange(K) + 1, NMI1)
legend(['Rand', 'Jaccard', 'NMI'], loc=4)
show()

figure(2)
title('Cluster validity for GMM')
plot(np.arange(K) + 1, Rand2)
max_display_levels = 10
plt.figure(2, figsize=(10, 4))
dendrogram(Z, truncate_mode='level', p=max_display_levels)
plt.show()

# Calculate validities
y = y.flatten()
for m in range(len(Methods)):
    # run hierarchical clustering:
    Z = linkage(X, method=Methods[m], metric=Metric)
    Maxclust = 2
    cls = fcluster(Z, criterion='maxclust', t=Maxclust)
    # compute cluster validities:
    Rand[m], Jaccard[m], NMI[m] = clusterval(y, cls)

# Plot results:
plt.figure(5)
plt.title('Cluster validity')
plt.plot(np.arange(len(Methods)) + 1, Rand)
plt.plot(np.arange(len(Methods)) + 1, Jaccard)
plt.plot(np.arange(len(Methods)) + 1, NMI)
plt.legend(['Rand', 'Jaccard', 'NMI'], loc=4)
plt.show()

# PCA by computing SVD of X_tilde
M = len(attributeNames)
C = len(classNames)

cov_type = 'diag'  # you can try out 'full' as well
reps = 10  # number of fits with different initializations; the best result is kept

# Set K to the value found earlier through cross-validation
K = 9

# Fit GMM to data
gmm = GaussianMixture(n_components=K, covariance_type=cov_type,
                      n_init=reps).fit(X)
cls = gmm.predict(X)

# Compute error with respect to the actual classes
rand_gmm, Jaccard_gmm, NMI_gmm = clusterval(np.asarray(y).ravel(), cls)

# Perform hierarchical/agglomerative clustering on the data matrix
Method = 'complete'
Metric = 'euclidean'
Z = linkage(X, method=Method, metric=Metric)
cls = fcluster(Z, criterion='maxclust', t=K)
rand_h, Jaccard_h, NMI_h = clusterval(np.asarray(y).ravel(), cls)

print("GMM:")
print("rand: {0}".format(rand_gmm))
print("Jaccard: {0}".format(Jaccard_gmm))
print("NMI: {0}".format(NMI_gmm))
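# The matching printout for the hierarchical result presumably follows here
# (added sketch, mirroring the GMM block above):
print("Hierarchical:")
print("rand: {0}".format(rand_h))
print("Jaccard: {0}".format(Jaccard_h))
print("NMI: {0}".format(NMI_h))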
gmm = GaussianMixture(n_components=K_optimal, covariance_type=covar_type,
                      n_init=reps).fit(X)
cls = gmm.predict(X)      # extract cluster labels
cds = gmm.means_          # extract cluster centroids (means of gaussians)
covs = gmm.covariances_   # extract cluster shapes (covariances of gaussians)

plt.figure(figsize=(12, 9))
clusterplot(X, clusterid=cls, centroids=cds, y=y, covars=covs)
plt.title('Gaussian Mixture Model using {} clusters'.format(K_optimal))
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()

# Evaluate GMM model
Rand_gmm, Jaccard_gmm, NMI_gmm = clusterval(y, cls)

print('###################################################')
print('#            HIERARCHICAL CLUSTERING              #')
print('###################################################')

Metric = 'euclidean'
Maxclust = K_optimal
max_display_levels = K_optimal
Methods = ['single', 'complete', 'average', 'weighted', 'median', 'ward']  # we will try all of these linkage methods
n_methods = len(Methods)

# Allocate variables:
Rand_hier = np.zeros((n_methods,))
Jaccard_hier = np.zeros((n_methods,))
NMI_hier = np.zeros((n_methods,))
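# The allocations above are presumably filled by a loop like the one used in the
# other scripts in this collection (added sketch):
for m in range(n_methods):
    Z = linkage(X, method=Methods[m], metric=Metric)
    cls = fcluster(Z, criterion='maxclust', t=Maxclust)
    Rand_hier[m], Jaccard_hier[m], NMI_hier[m] = clusterval(y, cls)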
# Maximum number of clusters:
K = 10

# Allocate variables:
Entropy = np.zeros((K, 1))
Purity = np.zeros((K, 1))
Rand = np.zeros((K, 1))
Jaccard = np.zeros((K, 1))
OtherMetrics = np.zeros((K, 5))

for k in range(K):
    # run K-means clustering:
    # cls = Pycluster.kcluster(X,k+1)[0]
    centroids, cls, inertia = k_means(X, k + 1)
    # compute cluster validities:
    Entropy[k], Purity[k], Rand[k], Jaccard[k] = clusterval(y, cls)
    # compute other metrics from the sklearn.metrics package
    # (the old cluster_metrics.supervised.* paths were removed from sklearn)
    OtherMetrics[k, 0] = cluster_metrics.completeness_score(y.A.ravel(), cls)
    OtherMetrics[k, 1] = cluster_metrics.homogeneity_score(y.A.ravel(), cls)
    OtherMetrics[k, 2] = cluster_metrics.mutual_info_score(y.A.ravel(), cls)
    OtherMetrics[k, 3] = cluster_metrics.v_measure_score(y.A.ravel(), cls)
    OtherMetrics[k, 4] = cluster_metrics.adjusted_rand_score(y.A.ravel(), cls)

# Plot results:
figure(1)
title('Cluster validity')
plot(np.arange(K) + 1, -Entropy)
plot(np.arange(K) + 1, Purity)
    # extract training and test set for the current CV fold
    X_train = X[train_index]
    X_test = X[test_index]

    # Fit Gaussian mixture model to X_train
    gmm = GaussianMixture(n_components=K, covariance_type=covar_type,
                          n_init=reps).fit(X_train)

    # accumulate the negative log-likelihood of X_test
    CVE[t] += -gmm.score_samples(X_test).sum()

# Plot results
figure(1)
plot(KRange, CVE, '-ok')
xlabel('K')
savefig('GMM_with_y.png')
show()

Rand = np.zeros((1,))
Jaccard = np.zeros((1,))
NMI = np.zeros((1,))
Rand[0], Jaccard[0], NMI[0] = clusterval(y, cls[8])

print("\n Rand: ", Rand, "Jaccard: ", Jaccard, "NMI: ", NMI)
print("Lowest score: ", np.min(CVE))
print("Highest score: ", np.max(CVE))
print("Cluster means: ", cds[8])
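# Alternative model selection (added sketch): instead of cross-validated
# negative log-likelihood, the number of components can be compared by BIC,
# which GaussianMixture computes directly. Assumes X, KRange, covar_type and
# reps are defined as above.
BIC = [GaussianMixture(n_components=K, covariance_type=covar_type,
                       n_init=reps).fit(X).bic(X) for K in KRange]
print("K with lowest BIC:", KRange[int(np.argmin(BIC))])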
import numpy as np

X = X_standard
y = X[:, 9]   # make y the chd column
X = X[:, :9]  # drop the chd column

covar_type = 'full'  # you can try out 'diag' as well
reps = 3  # number of fits with different initializations; the best result is kept
init_procedure = 'kmeans'

gmm = GaussianMixture(n_components=8, covariance_type=covar_type, n_init=reps,
                      init_params=init_procedure, tol=1e-6,
                      reg_covar=1e-6).fit(X)

link = linkage(X, method="ward", metric="euclidean")
# Compute clusters by thresholding the dendrogram
Maxclust = 8
cls = fcluster(link, criterion='maxclust', t=Maxclust)
b = gmm.predict(X)

Rand_hc, Jaccard_hc, NMI_hc = clusterval(cls, y)
Rand_gmm, Jaccard_gmm, NMI_gmm = clusterval(b, y)
# The exercise script 10_1_3 shows this as the output of clusterval, while
# help(clusterval) seems to say something different: different versions of the
# toolbox return different tuples (cf. the four-value unpacking in the older
# scripts above), so check help(clusterval) for the installed version.
print(Rand_hc, Jaccard_hc, NMI_hc)
print(Rand_gmm, Jaccard_gmm, NMI_gmm)
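# One way to resolve the clusterval uncertainty noted above (added sketch):
# recompute the symmetric indices with scikit-learn and see which of the
# returned values they match. rand_score requires scikit-learn >= 0.24.
from sklearn.metrics import rand_score, normalized_mutual_info_score

print('sklearn Rand/NMI, hierarchical:', rand_score(y, cls),
      normalized_mutual_info_score(y, cls))
print('sklearn Rand/NMI, GMM:', rand_score(y, b),
      normalized_mutual_info_score(y, b))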
unique, counts = np.unique(cls, return_counts=True)

# Display dendrogram
max_display_levels = 2
plt.figure(2, figsize=(10, 4))
dendrogram(Z, truncate_mode='level', p=max_display_levels)
plt.show()

# Calculate validities
for m in range(len(Methods)):
    # run hierarchical clustering:
    Z = linkage(X, method=Methods[m], metric=Metric)
    Maxclust = 2
    cls = fcluster(Z, criterion='maxclust', t=Maxclust)
    # compute cluster validities:
    Rand[m], Jaccard[m], NMI[m] = clusterval(y, cls)

# Plot results:
plt.figure(5)
plt.title('Cluster validity')
plt.plot(np.arange(len(Methods)) + 1, Rand)
plt.plot(np.arange(len(Methods)) + 1, Jaccard)
plt.plot(np.arange(len(Methods)) + 1, NMI)
plt.legend(['Rand', 'Jaccard', 'NMI'], loc=4)
plt.show()

# PCA by computing SVD of X_tilde
U, S, V = svd(X, full_matrices=False)
count = 0
for elem in covs:
    new_covs[count] = np.diag(elem)
    count += 1
covs = new_covs

# Plot results:
# figure(figsize=(14,9))
# clusterplot(X, clusterid=cls, centroids=cds, y=y, covars=covs)
# show()
print(cds)

# In case the number of features != 2, a subset of features must be plotted instead.
figure(figsize=(14, 9))
idx = [4, 1]  # feature indices; choose two features to use as x and y axes in the plot
clusterplot(X[:, idx], clusterid=cls, centroids=cds[:, idx], y=y,
            covars=covs[:, idx, :][:, :, idx])
ylabel("glucose")
xlabel("insulin")
show()

Rand, Jaccard, NMI = clusterval(y, cls)
print(Rand, Jaccard, NMI)
print(type(y))
print(y)
print(attributeNames)
print('Ran Exercise 11.1.1')
import pickle

import numpy as np
from matplotlib.pyplot import (figure, plot, ylim, title, legend, xlabel,
                               ylabel, show)
from toolbox_02450 import clusterval
from _load_data import *

# fetch data
with open('gmm_data.pckl', 'rb') as gmm_f:
    gmm = pickle.load(gmm_f)
bestK = gmm[0]
clsGMM = gmm[3]

with open('hier_data.pckl', 'rb') as hier_f:
    hier = pickle.load(hier_f)
clsHIER = hier[0]

# Quality evaluation
# Allocate variables:
Rand = np.zeros((2))
Jaccard = np.zeros((2))
NMI = np.zeros((2))

# compute cluster validities:
Rand[0], Jaccard[0], NMI[0] = clusterval(Y, clsGMM)
Rand[1], Jaccard[1], NMI[1] = clusterval(Y, clsHIER)

# Save results
with open('eval_data.pckl', 'wb') as f:
    pickle.dump([Rand, Jaccard, NMI], f)