def point_and_circle_clustering(eig_max=15):
    """
    Spectral clustering of the point-and-circle dataset (question 2.8).

    One similarity graph is built from 600 samples. Both the unnormalized
    ('unn') and the symmetric-normalized ('sym') Laplacian are derived from
    it, each is clustered with the eigenvectors of indices [0, 1], and the
    two labelings are passed to plot_clustering_result.

    :param eig_max: currently unused, because the eigenvector indices are
                    fixed to [0, 1]; kept so existing callers keep working.
    :return: None (the result is only plotted)
    """
    # Generate data and compute number of clusters
    X, Y = point_and_circle(600)
    num_classes = len(np.unique(Y))

    # Choose parameters
    k = 0  # k == 0 selects the epsilon graph, any other value is passed as k
    var = 1.0  # exponential_euclidean's sigma^2
    chosen_eig_indices = [0, 1]  # indices of the ordered eigenvalues to pick

    if k == 0:
        # dists[i, j] = euclidean distance between x_i and x_j
        dists = sd.cdist(X, X, 'euclidean')
        # min_tree is the (n x n) indicator matrix of the MST edges; using it
        # as a boolean mask avoids an O(n^2) Python-level double loop.
        mst_mask = np.asarray(min_span_tree(dists), dtype=bool)
        distance_threshold = dists[mst_mask].max()
        # epsilon = similarity of the longest edge of the spanning tree
        eps = np.exp(-distance_threshold**2.0 / (2 * var))
        W = build_similarity_graph(X, var=var, eps=eps, k=k)
    else:
        W = build_similarity_graph(X, var=var, k=k)

    # Both Laplacians come from the same graph so the comparison is fair
    L_unn = build_laplacian(W, 'unn')
    L_norm = build_laplacian(W, 'sym')

    Y_unn = spectral_clustering(L_unn,
                                chosen_eig_indices,
                                num_classes=num_classes)
    Y_norm = spectral_clustering(L_norm,
                                 chosen_eig_indices,
                                 num_classes=num_classes)

    # NOTE(review): the meaning of the trailing positional argument `1` is
    # defined by plot_clustering_result, which is not visible here.
    plot_clustering_result(X, Y, L_unn, Y_unn, Y_norm, 1)
def two_moons_clustering(eig_max=15):
    """
    Spectral clustering of the two-moons dataset (question 2.7).

    Builds a similarity graph on 600 samples, computes the unnormalized
    Laplacian, lets choose_eigenvalues pick the eigenvector indices from the
    sorted spectrum, and plots the spectral result next to plain k-means.

    :param eig_max: forwarded to choose_eigenvalues as `eig_max`.
    :return: None (the result is only plotted)
    """
    # Generate data and compute number of clusters
    X, Y = two_moons(600)
    num_classes = len(np.unique(Y))

    # Choose parameters
    k = 0  # k == 0 selects the epsilon graph, any other value is passed as k
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'unn'

    if k == 0:
        # dists[i, j] = euclidean distance between x_i and x_j
        dists = sd.cdist(X, X, 'euclidean')
        # min_tree is the (n x n) indicator matrix of the MST edges; using it
        # as a boolean mask avoids an O(n^2) Python-level double loop.
        mst_mask = np.asarray(min_span_tree(dists), dtype=bool)
        distance_threshold = dists[mst_mask].max()
        # epsilon = similarity of the longest edge of the spanning tree
        eps = np.exp(-distance_threshold**2.0 / (2 * var))
        W = build_similarity_graph(X, var=var, eps=eps, k=k)
    else:
        # eps only exists in the branch above, so a non-zero k needs its own
        # call; without it, changing k raised NameError.
        W = build_similarity_graph(X, var=var, k=k)

    L = build_laplacian(W, laplacian_normalization)

    # Only the ordered eigenvalues are needed to choose the indices; the
    # eigenvectors are recomputed inside spectral_clustering.
    eigenvalues = np.sort(np.linalg.eig(L)[0])
    chosen_eig_indices = choose_eigenvalues(eigenvalues, eig_max=eig_max)

    Y_rec = spectral_clustering(L,
                                chosen_eig_indices,
                                num_classes=num_classes)

    plot_clustering_result(X, Y, L, Y_rec,
                           KMeans(num_classes).fit_predict(X))
def find_the_bend():
    """
    Question 2.3: cluster four blobs and look at the Laplacian spectrum.

    Generates 600 blob samples, builds the similarity graph and its
    unnormalized Laplacian, clusters with the adaptive spectral method and
    hands the labels plus the sorted real eigenvalues to plot_the_bend.

    :return: None
    """
    # Data: 600 samples, arguments (4, 0.2) are passed positionally to blobs
    n_points = 600
    X, Y = blobs(n_points, 4, 0.2)
    num_classes = len(np.unique(Y))

    # Parameters
    k = 0
    var = 1  # exponential_euclidean's sigma^2
    # either 'unn'normalized, 'sym'metric normalization or 'rw' random-walk
    # normalization
    laplacian_normalization = 'unn'

    # Similarity graph: k-nn when k is non-zero, epsilon graph otherwise
    if k != 0:
        W = build_similarity_graph(X, var=var, k=k)
    else:
        pairwise = sd.cdist(X, X, metric="euclidean")
        mst_edges = min_span_tree(pairwise)
        longest_edge = np.max(pairwise[mst_edges])
        eps = np.exp(-longest_edge**2.0 / (2 * var))
        print(eps)
        W = build_similarity_graph(X, var=var, k=k, eps=eps)

    L = build_laplacian(W, laplacian_normalization)

    # Ordered spectrum (real parts only) for the eigenvalue plot
    eigenvalues = sorted(scipy.linalg.eig(L)[0].real)

    # Only the adaptive variant is run; the non-adaptive call
    # spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)
    # is intentionally left out here.
    Y_rec_adaptive = spectral_clustering_adaptive(L, num_classes=num_classes)

    plot_the_bend(X, Y, L, Y_rec_adaptive, eigenvalues)
def two_blobs_clustering():
    """
    Clustering of two blobs (questions 2.1 and 2.2).

    Generates 50 samples in 2 blobs, builds a similarity graph, plots it,
    then runs spectral clustering on the unnormalized Laplacian with the
    eigenvectors of indices [0, 1, 2] and plots the result next to k-means.

    :return: None (graph and clustering are only plotted)
    """
    # Get data and compute number of classes
    X, Y = blobs(50, n_blobs=2, blob_var=0.15, surplus=0)
    num_classes = len(np.unique(Y))

    # Choose parameters
    k = 0  # k == 0 selects the epsilon graph, any other value is passed as k
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'unn'
    chosen_eig_indices = [0, 1, 2]  # indices of the ordered eigenvalues to pick

    if k == 0:
        # dists[i, j] = euclidean distance between x_i and x_j
        dists = sd.cdist(X, X, 'euclidean')
        # min_tree is the (n x n) indicator matrix of the MST edges; using it
        # as a boolean mask avoids an O(n^2) Python-level double loop.
        mst_mask = np.asarray(min_span_tree(dists), dtype=bool)
        # Deliberately the SECOND largest MST edge, not the largest.
        # NOTE(review): presumably so the longest tree edge falls below the
        # threshold and the two blobs end up disconnected -- confirm.
        distance_threshold = np.sort(dists[mst_mask])[-2]
        eps = np.exp(-distance_threshold**2.0 / (2 * var))
        W = build_similarity_graph(X, var=var, eps=eps, k=k)
    else:
        # eps only exists in the branch above, so a non-zero k needs its own
        # call; without it, changing k raised NameError.
        W = build_similarity_graph(X, var=var, k=k)

    plot_graph_matrix(X, Y, W)
    L = build_laplacian(W, laplacian_normalization)

    # run spectral clustering
    Y_rec = spectral_clustering(L,
                                chosen_eig_indices,
                                num_classes=num_classes)

    # Plot results
    plot_clustering_result(X, Y, L, Y_rec,
                           KMeans(num_classes).fit_predict(X))
def two_blobs_clustering():
    """
    Questions 2.1 and 2.2: spectral clustering of two blobs.

    The local `question` switch selects the graph construction:
      * '2.1' -- epsilon graph whose threshold is derived from the longest
        edge of the minimum spanning tree; eigenvector index [1] is used.
      * '2.2' -- k-nn graph with k = 20; eigenvector indices [0, 1] are used.
    Both use the random-walk ('rw') Laplacian. The spectral labeling is
    plotted next to a plain k-means labeling.
    """
    question = '2.2'

    # Data and number of classes
    X, Y = blobs(600, n_blobs=2, blob_var=0.15, surplus=0)
    num_classes = len(np.unique(Y))
    n = X.shape[0]

    # Parameters shared by both questions
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'rw'

    if question == '2.1':
        # eigenvectors to use
        chosen_eig_indices = [1]
        # the graph has to be connected for this question, hence the
        # MST-based epsilon; pairwise[i, j] = distance between x_i and x_j
        pairwise = pairwise_distances(X).reshape((n, n))
        mst_edges = min_span_tree(pairwise)
        longest_edge = pairwise[mst_edges].max()
        eps = np.exp(-longest_edge**2 / (2 * var))
        W = build_similarity_graph(X, var=var, eps=eps)
    elif question == '2.2':
        # eigenvectors to use
        chosen_eig_indices = [0, 1]
        # number of neighbours of the k-nn graph
        k = 20
        W = build_similarity_graph(X, var=var, k=k)

    L = build_laplacian(W, laplacian_normalization)

    # Spectral clustering, then comparison plot against k-means
    Y_rec = spectral_clustering(L,
                                chosen_eig_indices,
                                num_classes=num_classes)
    kmeans_labels = KMeans(num_classes).fit_predict(X)
    plot_clustering_result(X, Y, L, Y_rec, kmeans_labels)
def how_to_choose_epsilon():
    """
    Illustrate an MST-based choice of the epsilon threshold.

    The pairwise euclidean distances define a fully connected graph. The
    threshold distance is the largest dist(x_i, x_j) over the edges (i, j) of
    that graph's minimal spanning tree, and epsilon is then
    exp(-dist(x_i, x_j)**2.0 / (2 * sigma^2)). The resulting epsilon graph is
    drawn with plot_graph_matrix.
    """
    # number of samples and the worst_case_blob option (try different values;
    # see worst_case_blob in generate_data.py for its meaning)
    num_samples = 100
    gen_pam = 2.0

    # blob data (two moons alternative: X, Y = two_moons(num_samples))
    X, Y = worst_case_blob(num_samples, gen_pam)
    n = X.shape[0]

    # var:       the exponential_euclidean's sigma2 parameter
    # pairwise:  (n x n) euclidean distances between all couples of points
    # mst_edges: (n x n) indicator matrix of the minimal spanning tree edges
    var = 1.0
    pairwise = pairwise_distances(X).reshape((n, n))
    mst_edges = min_span_tree(pairwise)

    # threshold epsilon from the max weight in the spanning tree
    longest_edge = pairwise[mst_edges].max()
    eps = np.exp(-longest_edge**2 / (2 * var))

    # W: (n x n) adjacency matrix of the epsilon graph
    W = build_similarity_graph(X, var=var, eps=eps, k=0)
    plot_graph_matrix(X, Y, W)
def two_moons_clustering():
    """
    Question 2.7: adaptive spectral clustering of the two-moons dataset.

    Builds a similarity graph on 600 samples, takes its unnormalized
    Laplacian, clusters with spectral_clustering_adaptive and plots the
    labeling next to a plain k-means labeling.
    """
    # Data and number of clusters
    X, Y = two_moons(600)
    num_classes = len(np.unique(Y))

    # Parameters
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'unn'
    # only relevant for the non-adaptive spectral_clustering variant
    chosen_eig_indices = [0, 1]

    # Similarity graph: k-nn when k is non-zero, epsilon graph otherwise
    if k != 0:
        W = build_similarity_graph(X, var=var, k=k)
    else:
        pairwise = sd.cdist(X, X, metric="euclidean")
        mst_edges = min_span_tree(pairwise)
        longest_edge = np.max(pairwise[mst_edges])
        eps = np.exp(-longest_edge**2.0 / (2 * var))
        print(eps)
        W = build_similarity_graph(X, var=var, k=k, eps=eps)

    L = build_laplacian(W, laplacian_normalization)

    # Adaptive clustering, compared against k-means on the raw points
    Y_rec_adaptive = spectral_clustering_adaptive(L, num_classes=num_classes)
    kmeans_labels = KMeans(num_classes).fit_predict(X)
    plot_clustering_result(X, Y, L, Y_rec_adaptive, kmeans_labels)
def two_blobs_clustering():
    """
    Questions 2.1 and 2.2: spectral clustering of two blobs.

    Generates 600 samples in 2 blobs, builds a similarity graph and its
    unnormalized Laplacian, clusters with the eigenvectors of indices [0, 1]
    and plots the labeling next to a plain k-means labeling.
    """
    # Data and number of classes
    X, Y = blobs(600, n_blobs=2, blob_var=0.15, surplus=0)
    num_classes = len(np.unique(Y))

    # Parameters
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'unn'
    chosen_eig_indices = [0, 1]  # indices of the ordered eigenvalues to pick

    # Similarity graph: k-nn when k is non-zero, epsilon graph otherwise
    if k != 0:
        W = build_similarity_graph(X, var=var, k=k)
    else:
        pairwise = sd.cdist(X, X, metric="euclidean")
        mst_edges = min_span_tree(pairwise)
        longest_edge = np.max(pairwise[mst_edges])
        eps = np.exp(-longest_edge**2.0 / (2 * var))
        W = build_similarity_graph(X, var=var, k=k, eps=eps)

    L = build_laplacian(W, laplacian_normalization)

    # Spectral clustering, then comparison plot against k-means
    Y_rec = spectral_clustering(L,
                                chosen_eig_indices,
                                num_classes=num_classes)
    kmeans_labels = KMeans(num_classes).fit_predict(X)
    plot_clustering_result(X, Y, L, Y_rec, kmeans_labels)
def parameter_sensitivity(eig_max=15):
    """
    Test spectral clustering sensitivity to the graph parameter (question 2.9).

    For every candidate k a fresh two-moons sample is generated, a similarity
    graph and its unnormalized Laplacian are built, the eigenvector indices
    are chosen by choose_eigenvalues, and the adjusted Rand score of the
    spectral labeling against the true labels is recorded. The scores are
    finally plotted against the candidates.

    :param eig_max: forwarded to choose_eigenvalues as `eig_max`.
    :return: None (the scores are only plotted)
    """
    # the number of samples to generate
    num_samples = 500

    # Choose parameters
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'unn'

    # Candidate parameters: the number of neighbours for the graph, where
    # k == 0 stands for the MST-derived epsilon threshold instead.
    parameter_candidate = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    parameter_performance = []

    for k in parameter_candidate:
        # Generate data (a new sample is drawn for every candidate)
        X, Y = two_moons(num_samples, 1, 0.02)
        num_classes = len(np.unique(Y))

        if k == 0:
            # dists[i, j] = euclidean distance between x_i and x_j
            dists = sd.cdist(X, X, 'euclidean')
            # min_tree is the (n x n) indicator matrix of the MST edges;
            # using it as a boolean mask avoids an O(n^2) Python-level
            # double loop on every k == 0 iteration.
            mst_mask = np.asarray(min_span_tree(dists), dtype=bool)
            distance_threshold = dists[mst_mask].max()
            eps = np.exp(-distance_threshold**2.0 / (2 * var))
            W = build_similarity_graph(X, var=var, eps=eps, k=k)
        else:
            # NOTE(review): `var` is not passed here, so the default of
            # build_similarity_graph applies -- confirm this is intended.
            W = build_similarity_graph(X, k=k)

        L = build_laplacian(W, laplacian_normalization)

        # Only the ordered eigenvalues are needed to choose the indices
        eigenvalues = np.sort(np.linalg.eig(L)[0])
        chosen_eig_indices = choose_eigenvalues(eigenvalues, eig_max=eig_max)

        Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes)
        parameter_performance.append(skm.adjusted_rand_score(Y, Y_rec))

    plt.figure()
    plt.plot(parameter_candidate, parameter_performance)
    plt.title('parameter sensitivity')
    plt.show()


#parameter_sensitivity()
def find_the_bend(eig_max=15, blob_var=0.03):
    """
    Cluster four blobs and inspect the Laplacian spectrum (question 2.3).

    Generates 600 blob samples, builds a similarity graph and its
    symmetric-normalized Laplacian, plots the sorted eigenvalues, runs both
    the non-adaptive and the adaptive spectral clustering, and hands the
    adaptive labeling to plot_the_bend.

    :param eig_max: number of leading eigenvalues to consider; it is
                    decremented by one before use to count from 0.
    :param blob_var: third positional argument passed to blobs().
    :return: None
    """
    eig_max -= 1  # to count starting from 0

    # the number of samples to generate
    num_samples = 600

    # Generate blobs and compute number of clusters
    X, Y = blobs(num_samples, 4, blob_var)
    num_classes = len(np.unique(Y))

    # Choose parameters
    k = 0  # k == 0 selects the epsilon graph, any other value is passed as k
    var = 1.0  # exponential_euclidean's sigma^2
    # either 'unn'normalized, 'sym'metric normalization or 'rw' random-walk
    # normalization
    laplacian_normalization = 'sym'

    if k == 0:
        # dists[i, j] = euclidean distance between x_i and x_j
        dists = sd.cdist(X, X, 'euclidean')
        # min_tree is the (n x n) indicator matrix of the MST edges; using it
        # as a boolean mask avoids an O(n^2) Python-level double loop.
        mst_mask = np.asarray(min_span_tree(dists), dtype=bool)
        # Deliberately the num_classes-th largest MST edge, not the largest.
        # NOTE(review): presumably so the longest inter-blob tree edges fall
        # below the threshold -- confirm against build_similarity_graph.
        distance_threshold = np.sort(dists[mst_mask])[-num_classes]
        eps = np.exp(-distance_threshold**2.0 / (2 * var))
        W = build_similarity_graph(X, var=var, eps=eps, k=k)
    else:
        # eps only exists in the branch above, so a non-zero k needs its own
        # call; without it, changing k raised NameError.
        W = build_similarity_graph(X, var=var, k=k)

    L = build_laplacian(W, laplacian_normalization)

    # Ordered eigenvalues; choose_eigenvalues picks which indices to use
    eigenvalues = np.sort(np.linalg.eig(L)[0])
    chosen_eig_indices = choose_eigenvalues(eigenvalues, eig_max=eig_max)
    # eigenvalue on the x axis, its rank on the y axis
    plt.plot(eigenvalues, list(range(len(eigenvalues))), 'r+')

    # Non-adaptive method first, adaptive one after (see handout).
    # Y_rec = (n x 1) cluster assignments [0,1,..., c-1]; it is computed but
    # only the adaptive labeling is plotted below.
    Y_rec = spectral_clustering(L,
                                chosen_eig_indices,
                                num_classes=num_classes)
    Y_rec_adaptive = spectral_clustering_adaptive(L,
                                                  num_classes=num_classes,
                                                  eig_max=eig_max)

    plot_the_bend(X, Y, L, Y_rec_adaptive, eigenvalues)
def how_to_choose_epsilon(gen_pam, k):
    """
    Build and plot a similarity graph with an MST-derived epsilon.

    Consider the distance matrix with entries dist(x_i, x_j) (the euclidean
    distance between x_i and x_j) representing a fully connected graph. The
    threshold distance is the maximum dist(x_i, x_j) over the edges (i, j) of
    the minimal spanning tree of that graph, and epsilon is
    exp(-dist(x_i, x_j)**2.0 / (2 * sigma^2)).

    :param gen_pam: option for worst_case_blob; currently unused because the
                    two-moons data is generated instead.
    :param k: forwarded to build_similarity_graph as `k`.
    :return: tuple (eps, X, Y, W) -- the threshold, the samples, their labels
             and the graph returned by build_similarity_graph.
    """
    # the number of samples to generate
    num_samples = 100

    # Data. Alternative: X, Y = worst_case_blob(num_samples, gen_pam)
    # (read worst_case_blob in generate_data.py for the meaning of gen_pam)
    X, Y = two_moons(num_samples)

    # var:      the exponential_euclidean's sigma2 parameter
    # dists:    (n x n) euclidean distances between all couples of points
    # min_tree: (n x n) indicator matrix of the minimal spanning tree edges
    var = 1.0
    dists = sd.cdist(X, X, 'euclidean')
    min_tree = min_span_tree(dists)

    # Threshold = max weight in min_tree. A boolean mask replaces the former
    # O(n^2) loop that built and sorted a list of every tree edge just to
    # read its first element.
    mst_mask = np.asarray(min_tree, dtype=bool)
    distance_threshold = dists[mst_mask].max()
    eps = np.exp(-distance_threshold**2.0 / (2 * var))

    # W: (n x n) dimensional matrix representing the adjacency matrix of the
    # graph
    W = build_similarity_graph(X, var=var, eps=eps, k=k)
    plot_graph_matrix(X, Y, W)
    return eps, X, Y, W


#if __name__ == '__main__':
#    for gp in [0,1,10,100]:
#        print(gp)
#        how_to_choose_epsilon(gp,0)
#    for k in [0,1,2,5,10]:
#        how_to_choose_epsilon(0,k)
# NOTE(review): the five statements below appear to be the tail of a function
# whose header lies outside this view; they are left untouched.
# Threshold distance = largest entry of dists selected by min_tree.
distance_threshold = np.max(dists[min_tree])
eps = np.exp(-distance_threshold**2 / (2 * var))
""" use the build_similarity_graph function to build the graph W W: (n x n) dimensional matrix representing the adjacency matrix of the graph use plot_graph_matrix to plot the graph """
W = build_similarity_graph(X, var=var, eps=eps, k=0)
plot_graph_matrix(X, Y, W)
# Script entry point: generate the datasets, then plot an epsilon graph and a
# k-nn graph built on the two-moons sample.
if __name__ == '__main__':
    n = 300
    # All four datasets are generated, but only the two-moons one is used
    blobs_data, blobs_clusters = blobs(n)
    moons_data, moons_clusters = two_moons(n)
    point_circle_data, point_circle_clusters = point_and_circle(n)
    worst_blobs_data, worst_blobs_clusters = worst_case_blob(n, 1.0)
    var = 1
    X, Y = moons_data, moons_clusters
    n_samples = X.shape[0]
    dists = pairwise_distances(X).reshape((n_samples, n_samples))
    min_tree = min_span_tree(dists)
    # MST-derived epsilon; NOTE(review): computed but not used below, where
    # the hard-coded eps=0.6 is passed instead -- confirm this is intended.
    eps = np.exp(-np.max(dists[min_tree])**2 / (2 * var))
    W_eps = build_similarity_graph(X, var=var, eps=0.6)
    W_knn = build_similarity_graph(X, k=15)
    plot_graph_matrix(X, Y, W_eps)
    plot_graph_matrix(X, Y, W_knn)