def two_blobs_clustering():
    """
    Cluster a two-blob dataset with spectral clustering and plot the result.

    Used in questions 2.1 and 2.2. Takes no arguments and returns ``None``;
    the spectral labelling is plotted next to a ``KMeans`` labelling of the
    raw points.
    """
    # Sample the data; the number of clusters is the number of distinct labels.
    points, true_labels = blobs(600, n_blobs=2, blob_var=0.15, surplus=0)
    n_clusters = np.unique(true_labels).size

    """ Choose parameters """
    neighbours = 3
    sigma_sq = 1.0  # exponential_euclidean's sigma^2
    normalization = 'unn'
    eig_positions = [1, 2, 3]  # indices of the ordered eigenvalues to pick

    # Similarity graph, then its Laplacian.
    weights = build_similarity_graph(points, var=sigma_sq, k=neighbours)
    laplacian = build_laplacian(weights, normalization)

    # Spectral labelling first, then the k-means baseline on the raw points.
    spectral_labels = spectral_clustering(
        laplacian, eig_positions, num_classes=n_clusters
    )
    kmeans_labels = KMeans(n_clusters).fit_predict(points)

    plot_clustering_result(
        points, true_labels, laplacian, spectral_labels, kmeans_labels
    )
def point_and_circle_clustering():
    """
    Compare two Laplacian variants on the point-and-circle dataset.

    Used in question 2.8. One similarity graph is built, then spectral
    clustering is run on its 'unn' Laplacian and on its 'rw' Laplacian, and
    both labellings are plotted together. Takes no arguments, returns ``None``.
    """
    # Sample the data; the number of clusters is the number of distinct labels.
    points, true_labels = point_and_circle(600)
    n_clusters = np.unique(true_labels).size

    """ Choose parameters """
    neighbours = 50
    sigma_sq = 1.0  # exponential_euclidean's sigma^2
    eig_positions = [0, 1]  # indices of the ordered eigenvalues to pick

    weights = build_similarity_graph(points, var=sigma_sq, k=neighbours)

    # Build both Laplacians before clustering either (same order as before).
    variants = ('unn', 'rw')
    laplacians = {name: build_laplacian(weights, name) for name in variants}
    labels = {
        name: spectral_clustering(
            laplacians[name], eig_positions, num_classes=n_clusters
        )
        for name in variants
    }

    # The trailing positional 1 is forwarded unchanged to the plotting helper.
    plot_clustering_result(
        points, true_labels, laplacians['unn'], labels['unn'], labels['rw'], 1
    )
def two_moons_clustering():
    """
    Cluster the two-moons dataset with spectral clustering and plot the result.

    Used in question 2.7. Takes no arguments and returns ``None``; the
    spectral labelling is plotted next to a ``KMeans`` labelling of the raw
    points.
    """
    # Sample the data; the number of clusters is the number of distinct labels.
    samples, truth = two_moons(600)
    n_clusters = len(np.unique(truth))

    """ Choose parameters """
    settings = {
        'k': 3,
        'var': 1.0,  # exponential_euclidean's sigma^2
    }
    normalization = 'unn'
    picked_eigs = [1, 2, 3]  # indices of the ordered eigenvalues to pick

    # Similarity graph -> Laplacian -> spectral labels.
    graph = build_similarity_graph(samples, var=settings['var'], k=settings['k'])
    lap = build_laplacian(graph, normalization)
    predicted = spectral_clustering(lap, picked_eigs, num_classes=n_clusters)

    # k-means on the raw points is the comparison baseline in the plot.
    baseline = KMeans(n_clusters).fit_predict(samples)
    plot_clustering_result(samples, truth, lap, predicted, baseline)
def point_and_circle_clustering():
    """
    Compare 'unn' and 'rw' Laplacians on a noisy point-and-circle dataset.

    Used in question 2.8. The similarity graph is built with ``k = 0`` and a
    fixed ``eps = 0.4``; both Laplacians are clustered with
    ``spectral_clustering_adaptive`` and the two labellings are plotted
    together. Takes no arguments and returns ``None``.
    """
    # Sample the data; the number of clusters is the number of distinct labels.
    points, true_labels = point_and_circle(600, sigma=0.2)
    n_clusters = np.unique(true_labels).size

    """ Choose parameters """
    neighbours = 0
    threshold = 0.4
    sigma_sq = 1  # exponential_euclidean's sigma^2

    weights = build_similarity_graph(
        points, var=sigma_sq, eps=threshold, k=neighbours
    )

    # Both Laplacians are built first, then clustered in the same order.
    lap_unn, lap_rw = (build_laplacian(weights, kind) for kind in ('unn', 'rw'))
    labels_unn = spectral_clustering_adaptive(lap_unn, num_classes=n_clusters)
    labels_rw = spectral_clustering_adaptive(lap_rw, num_classes=n_clusters)

    # The trailing positional 1 is forwarded unchanged to the plotting helper.
    plot_clustering_result(
        points, true_labels, lap_unn, labels_unn, labels_rw, 1
    )
def two_moons_clustering():
    """
    Cluster the two-moons dataset with adaptive spectral clustering.

    Used in question 2.7. The similarity graph is built with ``k = 0`` and a
    fixed ``eps = 0.8``, the 'rw' Laplacian is clustered with
    ``spectral_clustering_adaptive``, and the labelling is plotted next to a
    ``KMeans`` labelling of the raw points. Takes no arguments and returns
    ``None``.
    """
    # Sample the data; the number of clusters is the number of distinct labels.
    samples, truth = two_moons(600)
    n_clusters = np.unique(truth).size

    """ Choose parameters """
    neighbours, threshold = 0, 0.8
    sigma_sq = 1.0  # exponential_euclidean's sigma^2
    normalization = 'rw'

    # Similarity graph and its Laplacian.
    graph = build_similarity_graph(
        samples, var=sigma_sq, eps=threshold, k=neighbours
    )
    lap = build_laplacian(graph, normalization)

    # Adaptive spectral clustering, then the k-means baseline for the plot.
    predicted = spectral_clustering_adaptive(lap, num_classes=n_clusters)
    baseline = KMeans(n_clusters).fit_predict(samples)
    plot_clustering_result(samples, truth, lap, predicted, baseline)
def point_and_circle_clustering(eig_max=15):
    """
    Compare 'unn' and 'sym' Laplacians on the point-and-circle dataset.

    Used in question 2.8. When ``k == 0`` the graph threshold ``eps`` is
    derived from the largest distance selected by the minimum spanning tree
    of the data; otherwise the graph is built from ``k`` alone. Both
    Laplacians are clustered with the eigenvalue indices ``[0, 1]`` and the
    two labellings are plotted together.

    Parameters
    ----------
    eig_max : int, optional
        Currently unused: the eigenvalue indices are fixed to ``[0, 1]``.
        Kept in the signature so existing callers keep working.

    Returns
    -------
    None
    """
    # Generate data and compute number of clusters
    X, Y = point_and_circle(600)
    num_classes = len(np.unique(Y))

    # Choose parameters
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2
    chosen_eig_indices = [0, 1]  # indices of the ordered eigenvalues to pick

    if k == 0:
        # dists[i, j] = euclidean distance between x_i and x_j
        dists = sd.cdist(X, X, 'euclidean')
        min_tree = min_span_tree(dists)
        # Largest distance among the entries flagged by the spanning tree.
        # The mask replaces an O(n^2) Python loop that tested each entry
        # against True; asarray(..., dtype=bool) keeps that working even if
        # min_span_tree returns 0/1 values rather than booleans.
        tree_mask = np.asarray(min_tree, dtype=bool)
        distance_threshold = dists[tree_mask].max()
        # Similarity value corresponding to that distance.
        eps = np.exp(-distance_threshold ** 2.0 / (2 * var))
        W = build_similarity_graph(X, var=var, eps=eps, k=k)
    else:
        W = build_similarity_graph(X, var=var, k=k)

    # build laplacians
    L_unn = build_laplacian(W, 'unn')
    L_norm = build_laplacian(W, 'sym')

    Y_unn = spectral_clustering(L_unn, chosen_eig_indices,
                                num_classes=num_classes)
    Y_norm = spectral_clustering(L_norm, chosen_eig_indices,
                                 num_classes=num_classes)

    # The trailing positional 1 is forwarded unchanged to the plotting helper.
    plot_clustering_result(X, Y, L_unn, Y_unn, Y_norm, 1)
def two_moons_clustering(eig_max=15):
    """
    Cluster the two-moons dataset, choosing the eigenvalues automatically.

    Used in question 2.7. When ``k == 0`` the graph threshold ``eps`` is
    derived from the largest distance selected by the minimum spanning tree
    of the data; otherwise the graph is built from ``k`` alone (previously
    ``eps`` was left unbound in that case). The Laplacian's eigenvalues are
    sorted and handed to ``choose_eigenvalues`` to pick the indices used by
    ``spectral_clustering``. The labelling is plotted next to a ``KMeans``
    labelling of the raw points.

    Parameters
    ----------
    eig_max : int, optional
        Forwarded to ``choose_eigenvalues`` as its ``eig_max`` argument.

    Returns
    -------
    None
    """
    # Generate data and compute number of clusters
    X, Y = two_moons(600)
    num_classes = len(np.unique(Y))

    # Choose parameters
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'unn'

    if k == 0:
        # dists[i, j] = euclidean distance between x_i and x_j
        dists = sd.cdist(X, X, 'euclidean')
        min_tree = min_span_tree(dists)
        # Largest distance among the entries flagged by the spanning tree.
        # asarray(..., dtype=bool) keeps the mask valid even if
        # min_span_tree returns 0/1 values rather than booleans.
        tree_mask = np.asarray(min_tree, dtype=bool)
        distance_threshold = dists[tree_mask].max()
        # Similarity value corresponding to that distance.
        eps = np.exp(-distance_threshold ** 2.0 / (2 * var))
        W = build_similarity_graph(X, var=var, eps=eps, k=k)
    else:
        # No eps is needed (or defined) when k is set.
        W = build_similarity_graph(X, var=var, k=k)

    # build laplacian
    L = build_laplacian(W, laplacian_normalization)

    # Choose the eigenvalues: only the sorted spectrum is needed here, the
    # eigenvectors returned by eig are discarded.
    eigenvalues, _ = np.linalg.eig(L)
    eigenvalues = np.sort(eigenvalues)
    chosen_eig_indices = choose_eigenvalues(eigenvalues, eig_max=eig_max)

    Y_rec = spectral_clustering(L, chosen_eig_indices,
                                num_classes=num_classes)

    plot_clustering_result(X, Y, L, Y_rec,
                           KMeans(num_classes).fit_predict(X))
def two_blobs_clustering():
    """
    Cluster a small two-blob dataset and plot both the graph and the result.

    Used in questions 2.1 and 2.2. When ``k == 0`` the graph threshold
    ``eps`` is derived from the second-largest entry among the distances
    selected by the minimum spanning tree; otherwise the graph is built from
    ``k`` alone (previously ``eps`` was left unbound in that case). The
    similarity graph is plotted, then the spectral labelling is plotted next
    to a ``KMeans`` labelling of the raw points. Takes no arguments and
    returns ``None``.
    """
    # Get data and compute number of classes
    X, Y = blobs(50, n_blobs=2, blob_var=0.15, surplus=0)
    num_classes = len(np.unique(Y))

    # Choose parameters
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'unn'
    chosen_eig_indices = [0, 1, 2]  # indices of the ordered eigenvalues to pick

    if k == 0:
        # dists[i, j] = euclidean distance between x_i and x_j
        dists = sd.cdist(X, X, 'euclidean')
        min_tree = min_span_tree(dists)
        # asarray(..., dtype=bool) keeps the mask valid even if
        # min_span_tree returns 0/1 values rather than booleans.
        tree_mask = np.asarray(min_tree, dtype=bool)
        # Deliberately the second-largest selected distance ([-2]), not the
        # largest; this needs at least two flagged entries.
        distance_threshold = np.sort(dists[tree_mask])[-2]
        # Similarity value corresponding to that distance.
        eps = np.exp(-distance_threshold ** 2.0 / (2 * var))
        W = build_similarity_graph(X, var=var, eps=eps, k=k)
    else:
        # No eps is needed (or defined) when k is set.
        W = build_similarity_graph(X, var=var, k=k)

    plot_graph_matrix(X, Y, W)

    # build laplacian
    L = build_laplacian(W, laplacian_normalization)

    # run spectral clustering
    Y_rec = spectral_clustering(L, chosen_eig_indices,
                                num_classes=num_classes)

    # Plot results
    plot_clustering_result(X, Y, L, Y_rec,
                           KMeans(num_classes).fit_predict(X))
def two_blobs_clustering():
    """
    Cluster a two-blob dataset for question 2.1 or 2.2 and plot the result.

    The local ``question`` switch selects the setup:

    * '2.1' builds an epsilon-graph whose threshold comes from the largest
      minimum-spanning-tree distance (the graph has to be connected there)
      and uses eigenvalue index ``[1]``;
    * '2.2' builds a k-nn graph with ``k = 20`` and uses indices ``[0, 1]``.

    Both use the 'rw' Laplacian. Takes no arguments and returns ``None``.
    """
    question = '2.2'

    # Sample the data; the number of clusters is the number of distinct labels.
    points, true_labels = blobs(600, n_blobs=2, blob_var=0.15, surplus=0)
    n_clusters = len(np.unique(true_labels))
    n_points = points.shape[0]

    """ Choose parameters """
    sigma_sq = 1.0  # exponential_euclidean's sigma^2
    normalization = 'rw'

    if question == '2.1':
        # pairwise[i, j] = euclidean distance between x_i and x_j
        pairwise = pairwise_distances(points).reshape((n_points, n_points))
        tree = min_span_tree(pairwise)
        longest_edge = pairwise[tree].max()
        threshold = np.exp(-(longest_edge ** 2) / (2 * sigma_sq))

        eig_positions = [1]  # indices of the ordered eigenvalues to pick
        weights = build_similarity_graph(points, var=sigma_sq, eps=threshold)
    elif question == '2.2':
        eig_positions = [0, 1]
        neighbours = 20  # k for the k-nn graph
        weights = build_similarity_graph(points, var=sigma_sq, k=neighbours)

    # Both branches use the same Laplacian normalization.
    laplacian = build_laplacian(weights, normalization)

    # Spectral labelling first, then the k-means baseline for the plot.
    spectral_labels = spectral_clustering(
        laplacian, eig_positions, num_classes=n_clusters
    )
    kmeans_labels = KMeans(n_clusters).fit_predict(points)
    plot_clustering_result(
        points, true_labels, laplacian, spectral_labels, kmeans_labels
    )
def main():
    """
    Run three clusterings on a generated "blobs" dataset and plot each one.

    In order: k-means, DBSCAN (on the pairwise distance matrix), and the
    eigenvector-based method (on a Gaussian affinity built with
    ``mult=0.05``). Takes no arguments and returns ``None``.
    """
    points = generate_dataset(shape="blobs")
    distances = pairwise_distances(points)  # euclidean distance metric
    # Gaussian affinity derived from the distances.
    affinity = gaussian_kernel(distances, is_sym=True)

    # --- k-means: the second return value is not used here ---
    kmeans_clusters, _ = apply_kmeans(points)
    plot_clustering_result(
        points,
        affinity,
        kmeans_clusters,
        clustering_name="K means clustering",
    )

    # --- DBSCAN ---
    dbscan_clusters, dbscan_noise = apply_dbscan(points, distances)
    plot_clustering_result(
        points,
        affinity,
        dbscan_clusters,
        dbscan_noise,
        clustering_name="DBSCAN clustering",
    )

    # --- eigenvector-based clustering, with its own affinity matrix ---
    eigen_affinity = gaussian_kernel(distances, mult=0.05, is_sym=True)
    eigen_clusters, eigen_noise = apply_eigenvector_based(
        points, eigen_affinity
    )
    plot_clustering_result(
        points,
        eigen_affinity,
        eigen_clusters,
        eigen_noise,
        clustering_name="Eigenvector based clustering",
    )
def two_moons_clustering():
    """
    Cluster the two-moons dataset with adaptive spectral clustering.

    Used in question 2.7. With ``k == 0`` the graph threshold ``eps`` is
    computed from the largest minimum-spanning-tree distance and printed;
    otherwise the graph is built from ``k`` alone. The 'unn' Laplacian is
    clustered with ``spectral_clustering_adaptive`` and plotted next to a
    ``KMeans`` labelling of the raw points. Takes no arguments and returns
    ``None``.
    """
    # Sample the data; the number of clusters is the number of distinct labels.
    samples, truth = two_moons(600)
    n_clusters = np.unique(truth).size

    """ Choose parameters """
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2
    normalization = 'unn'
    # Not read by the adaptive call below; indices of the ordered eigenvalues
    # for the fixed-index spectral_clustering variant.
    chosen_eig_indices = [0, 1]

    # Arguments for the similarity graph; eps is only supplied when k == 0.
    graph_kwargs = {'var': var, 'k': k}
    if k == 0:
        pairwise = sd.cdist(samples, samples, "euclidean")
        tree = min_span_tree(pairwise)
        longest_edge = np.max(pairwise[tree])
        eps = np.exp(-(longest_edge ** 2.0) / (2 * var))
        print(eps)
        graph_kwargs['eps'] = eps
    graph = build_similarity_graph(samples, **graph_kwargs)

    lap = build_laplacian(graph, normalization)

    # Adaptive spectral clustering, then the k-means baseline for the plot.
    adaptive_labels = spectral_clustering_adaptive(lap, num_classes=n_clusters)
    baseline = KMeans(n_clusters).fit_predict(samples)
    plot_clustering_result(samples, truth, lap, adaptive_labels, baseline)
def two_blobs_clustering():
    """
    Cluster a two-blob dataset with spectral clustering and plot the result.

    Used in questions 2.1 and 2.2. With ``k == 0`` the graph threshold
    ``eps`` is computed from the largest minimum-spanning-tree distance;
    otherwise the graph is built from ``k`` alone. The 'unn' Laplacian is
    clustered using eigenvalue indices ``[0, 1]`` and plotted next to a
    ``KMeans`` labelling of the raw points. Takes no arguments and returns
    ``None``.
    """
    # Sample the data; the number of clusters is the number of distinct labels.
    points, true_labels = blobs(600, n_blobs=2, blob_var=0.15, surplus=0)
    n_clusters = np.unique(true_labels).size

    """ Choose parameters """
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2
    normalization = 'unn'
    eig_positions = [0, 1]  # indices of the ordered eigenvalues to pick

    # Arguments for the similarity graph; eps is only supplied when k == 0.
    graph_kwargs = {'var': var, 'k': k}
    if k == 0:
        pairwise = sd.cdist(points, points, "euclidean")
        tree = min_span_tree(pairwise)
        longest_edge = np.max(pairwise[tree])
        graph_kwargs['eps'] = np.exp(-(longest_edge ** 2.0) / (2 * var))
    weights = build_similarity_graph(points, **graph_kwargs)

    laplacian = build_laplacian(weights, normalization)

    # Spectral labelling first, then the k-means baseline for the plot.
    spectral_labels = spectral_clustering(
        laplacian, eig_positions, num_classes=n_clusters
    )
    kmeans_labels = KMeans(n_clusters).fit_predict(points)
    plot_clustering_result(
        points, true_labels, laplacian, spectral_labels, kmeans_labels
    )