Example #1
def point_and_circle_clustering(eig_max=15):
    """
    TO BE COMPLETED.

    Used in question 2.8
    """
    # Generate data and compute number of clusters
    X, Y = point_and_circle(600)
    num_classes = len(np.unique(Y))
    """
    Choose parameters
    """
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2

    #chosen_eig_indices = [1, 2, 3]    # indices of the ordered eigenvalues to pick

    if k == 0:  # compute epsilon
        dists = sd.cdist(
            X, X, 'euclidean'
        )  # dists[i, j] = euclidean distance between x_i and x_j

        min_tree = min_span_tree(dists)

        l = []
        n1, m1 = min_tree.shape

        for i in range(n1):
            for j in range(m1):
                if min_tree[i][j]:
                    l.append(dists[i][j])

        distance_threshold = sorted(l)[-1]

        eps = np.exp(-(distance_threshold)**2.0 / (2 * var))
        W = build_similarity_graph(X, var=var, eps=eps, k=k)

    else:
        W = build_similarity_graph(X, var=var, k=k)

    # build laplacian
    L_unn = build_laplacian(W, 'unn')
    L_norm = build_laplacian(W, 'sym')

    #eigenvalues,U = np.linalg.eig(L_unn)
    #indexes = np.argsort(eigenvalues)
    #eigenvalues = eigenvalues[indexes]
    #U = U[:,indexes]
    #chosen_eig_indices = choose_eigenvalues(eigenvalues, eig_max = eig_max)
    chosen_eig_indices = [0, 1]

    Y_unn = spectral_clustering(L_unn,
                                chosen_eig_indices,
                                num_classes=num_classes)
    Y_norm = spectral_clustering(L_norm,
                                 chosen_eig_indices,
                                 num_classes=num_classes)

    plot_clustering_result(X, Y, L_unn, Y_unn, Y_norm, 1)
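The examples in this listing call several helpers from the course utilities (build_similarity_graph, build_laplacian, min_span_tree, spectral_clustering, choose_eigenvalues, ...) that are not reproduced here. For orientation, here is a minimal sketch of what spectral_clustering presumably does, assuming the standard recipe of embedding the points with the selected Laplacian eigenvectors and clustering the rows with k-means; this is not the course implementation.

import numpy as np
from sklearn.cluster import KMeans


def spectral_clustering(L, chosen_eig_indices, num_classes=2):
    # Sketch only: eigendecompose the Laplacian, keep the chosen eigenvectors
    # (ordered by increasing eigenvalue) as an embedding, run k-means on its rows.
    eigenvalues, U = np.linalg.eig(L)
    order = np.argsort(eigenvalues.real)
    embedding = U[:, order][:, chosen_eig_indices].real  # (n x d) spectral embedding
    return KMeans(n_clusters=num_classes).fit_predict(embedding)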
Example #2
def two_moons_clustering(eig_max=15):
    """
    TO BE COMPLETED.

    Used in question 2.7
    """
    # Generate data and compute number of clusters
    X, Y = two_moons(600)
    num_classes = len(np.unique(Y))
    """
    Choose parameters
    """
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2

    laplacian_normalization = 'unn'

    #    chosen_eig_indices = [0, 1, 2]    # indices of the ordered eigenvalues to pick

    if k == 0:  # compute epsilon
        dists = sd.cdist(
            X, X, 'euclidean'
        )  # dists[i, j] = euclidean distance between x_i and x_j

        min_tree = min_span_tree(dists)

        l = []
        n1, m1 = min_tree.shape

        for i in range(n1):
            for j in range(m1):
                if min_tree[i][j]:
                    l.append(dists[i][j])

        distance_threshold = sorted(l)[-1]

        eps = np.exp(-(distance_threshold)**2.0 / (2 * var))

    # build similarity graph and laplacian
    W = build_similarity_graph(X, var=var, eps=eps, k=k)
    L = build_laplacian(W, laplacian_normalization)

    # chose the eigenvalues
    eigenvalues, U = np.linalg.eig(L)
    indexes = np.argsort(eigenvalues)
    eigenvalues = eigenvalues[indexes]
    U = U[:, indexes]
    chosen_eig_indices = choose_eigenvalues(eigenvalues, eig_max=eig_max)

    Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)

    plot_clustering_result(X, Y, L, Y_rec, KMeans(num_classes).fit_predict(X))
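choose_eigenvalues is another helper that is only called, never shown, in this listing. A minimal sketch, assuming it applies the eigengap heuristic to the first eig_max ordered eigenvalues (the actual course version may differ):

import numpy as np


def choose_eigenvalues(eigenvalues, eig_max=15):
    # Assumed behaviour: keep the indices of the ordered eigenvalues up to the
    # largest gap ("eigengap") observed within the first eig_max eigenvalues.
    eigs = np.asarray(eigenvalues)[:eig_max]
    gaps = np.diff(eigs)          # gaps[i] = eigs[i + 1] - eigs[i]
    last = int(np.argmax(gaps))   # index just before the largest gap
    return list(range(last + 1))  # e.g. [0, 1, ..., last]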
Example #3
def find_the_bend():
    """
    TO BE COMPLETED

    Used in question 2.3
    :return:
    """

    # the number of samples to generate
    num_samples = 600

    # Generate blobs and compute number of clusters
    X, Y = blobs(num_samples, 4, 0.2)
    num_classes = len(np.unique(Y))
    """
    Choose parameters
    """
    k = 0
    var = 1  # exponential_euclidean's sigma^2
    laplacian_normalization = 'unn'  # either 'unn' (unnormalized), 'sym' (symmetric) or 'rw' (random-walk) normalization

    # build laplacian
    if k == 0:
        dists = sd.cdist(X, X, metric="euclidean")
        min_tree = min_span_tree(dists)
        distance_threshold = dists[min_tree].max()
        eps = np.exp(-distance_threshold**2.0 / (2 * var))
        print(eps)
        W = build_similarity_graph(X, var=var, k=k, eps=eps)
    else:
        W = build_similarity_graph(X, var=var, k=k)
    L = build_laplacian(W, laplacian_normalization)
    """
    compute first 15 eigenvalues and call choose_eigenvalues() to choose which ones to use. 
    """
    eigenvalues, vects = scipy.linalg.eig(L)
    eigenvalues = sorted(eigenvalues.real)
    #    for ind,val in enumerate(eigenvalues[:15]):
    #        plt.scatter(ind, val)
    #    plt.xlabel("index of the eigenvalue")
    #    plt.ylabel("value of the eigenvalue")
    #    chosen_eig_indices =  [0,1,2,3]  # indices of the ordered eigenvalues to pick
    """
    compute spectral clustering solution using a non-adaptive method first, and an adaptive one after (see handout) 
    Y_rec = (n x 1) cluster assignments [0,1,..., c-1]    
    """
    # run spectral clustering
    #    Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)
    Y_rec_adaptive = spectral_clustering_adaptive(L, num_classes=num_classes)

    #    plot_the_bend(X, Y, L, Y_rec, eigenvalues)
    plot_the_bend(X, Y, L, Y_rec_adaptive, eigenvalues)
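spectral_clustering_adaptive is likewise not shown. A self-contained sketch, assuming it simply combines the eigengap-based index selection with the usual k-means step on the spectral embedding:

import numpy as np
from sklearn.cluster import KMeans


def spectral_clustering_adaptive(L, num_classes=2, eig_max=15):
    # Sketch only: pick the eigenvector indices adaptively (eigengap heuristic)
    # instead of hard-coding them, then cluster the resulting embedding.
    eigenvalues, U = np.linalg.eig(L)
    order = np.argsort(eigenvalues.real)
    eigenvalues, U = eigenvalues[order].real, U[:, order].real
    gaps = np.diff(eigenvalues[:eig_max])
    chosen = list(range(int(np.argmax(gaps)) + 1))  # indices before the largest gap
    return KMeans(n_clusters=num_classes).fit_predict(U[:, chosen])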
Example #4
def two_blobs_clustering():
    """
    TO BE COMPLETED

    Clustering of two blobs. Used in questions 2.1 and 2.2
    """

    # Get data and compute number of classes
    X, Y = blobs(50, n_blobs=2, blob_var=0.15, surplus=0)
    num_classes = len(np.unique(Y))
    """
    Choose parameters
    """
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2

    laplacian_normalization = 'unn'
    chosen_eig_indices = [0, 1, 2]  # indices of the ordered eigenvalues to pick

    if k == 0:  # compute epsilon
        dists = sd.cdist(
            X, X, 'euclidean'
        )  # dists[i, j] = euclidean distance between x_i and x_j

        min_tree = min_span_tree(dists)

        l = []
        n1, m1 = min_tree.shape

        for i in range(n1):
            for j in range(m1):
                if min_tree[i][j]:
                    l.append(dists[i][j])

        #distance_threshold = sorted(l)[-1]
        distance_threshold = sorted(l)[-2]

        eps = np.exp(-(distance_threshold)**2.0 / (2 * var))
    #####

    # build laplacian
    W = build_similarity_graph(X, var=var, eps=eps, k=k)
    plot_graph_matrix(X, Y, W)

    L = build_laplacian(W, laplacian_normalization)

    # run spectral clustering
    Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)

    # Plot results
    plot_clustering_result(X, Y, L, Y_rec, KMeans(num_classes).fit_predict(X))
Example #5
def two_blobs_clustering():
    """
    TO BE COMPLETED

    Clustering of two blobs. Used in questions 2.1 and 2.2
    """

    question = '2.2'

    # Get data and compute number of classes
    X, Y = blobs(600, n_blobs=2, blob_var=0.15, surplus=0)
    num_classes = len(np.unique(Y))
    n = X.shape[0]
    """
    Choose parameters
    """
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'rw'

    if question == '2.1':
        # as the graph has to be connected in this question, we construct an epsilon-graph using an MST
        dists = pairwise_distances(X).reshape(
            (n, n))  # dists[i, j] = euclidean distance between x_i and x_j
        min_tree = min_span_tree(dists)
        distance_threshold = np.max(dists[min_tree])
        eps = np.exp(-distance_threshold**2 / (2 * var))

        # choice of eigenvectors to use
        chosen_eig_indices = [1]  # indices of the ordered eigenvalues to pick

        # build similarity graph and laplacian
        W = build_similarity_graph(X, var=var, eps=eps)
        L = build_laplacian(W, laplacian_normalization)

    elif question == '2.2':
        # choice of eigenvectors to use
        chosen_eig_indices = [0, 1]

        # choice of k for the k-nn graph
        k = 20

        # build similarity graph and laplacian
        W = build_similarity_graph(X, var=var, k=k)
        L = build_laplacian(W, laplacian_normalization)

    # run spectral clustering
    Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)

    # Plot results
    plot_clustering_result(X, Y, L, Y_rec, KMeans(num_classes).fit_predict(X))
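This version switches between an epsilon-graph (question 2.1) and a k-NN graph (question 2.2), both produced by build_similarity_graph. That helper is part of the course code base; the following is only a rough sketch of the behaviour the examples rely on, with parameter names and defaults inferred from the calls above:

import numpy as np
import scipy.spatial.distance as sd


def build_similarity_graph(X, var=1.0, eps=0.0, k=0):
    # Sketch: Gaussian similarities exp(-d^2 / (2 var)), thresholded either by
    # eps (epsilon-graph, k == 0) or by keeping each node's k nearest neighbours.
    dists = sd.cdist(X, X, 'euclidean')
    similarities = np.exp(-dists**2 / (2 * var))
    np.fill_diagonal(similarities, 0.0)               # no self-loops
    if k == 0:                                        # epsilon-graph
        return similarities * (similarities >= eps)
    W = np.zeros_like(similarities)                   # symmetrised k-NN graph
    nearest = np.argsort(-similarities, axis=1)[:, :k]
    for i, neighbours in enumerate(nearest):
        W[i, neighbours] = similarities[i, neighbours]
    return np.maximum(W, W.T)                         # keep an edge if either end picks it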
Example #6
def how_to_choose_epsilon():
    """
    TO BE COMPLETED.

    Consider the distance matrix with entries dist(x_i, x_j) (the euclidean distance between x_i and x_j)
    representing a fully connected graph.
    One way to choose the parameter epsilon to build a graph is to choose the maximum value of dist(x_i, x_j) where
    (i,j) is an edge that is present in the minimal spanning tree of the fully connected graph. Then, the threshold
    epsilon can be chosen as exp(-dist(x_i, x_j)**2.0/(2*sigma^2)).
    """
    # the number of samples to generate
    num_samples = 100

    # the option necessary for worst_case_blob, try different values
    gen_pam = 2.0  # to understand the meaning of the parameter, read worst_case_blob in generate_data.py

    # get blob data
    X, Y = worst_case_blob(num_samples, gen_pam)

    # get two moons data
    # X, Y = two_moons(num_samples)
    n = X.shape[0]
    """
     use the distance function and the min_span_tree function to build the minimal spanning tree min_tree                   
     - var: the exponential_euclidean's sigma2 parameter          
     - dists: (n x n) matrix with euclidean distance between all possible couples of points                   
     - min_tree: (n x n) indicator matrix for the edges in the minimal spanning tree                           
    """
    var = 1.0
    dists = pairwise_distances(X).reshape(
        (n, n))  # dists[i, j] = euclidean distance between x_i and x_j
    min_tree = min_span_tree(dists)
    """
    set threshold epsilon to the max weight in min_tree 
    """
    distance_threshold = np.max(dists[min_tree])
    eps = np.exp(-distance_threshold**2 / (2 * var))
    """
    use the build_similarity_graph function to build the graph W  
     W: (n x n) dimensional matrix representing                    
        the adjacency matrix of the graph
       use plot_graph_matrix to plot the graph                    
    """
    W = build_similarity_graph(X, var=var, eps=eps, k=0)
    plot_graph_matrix(X, Y, W)
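The examples only rely on min_span_tree returning an (n x n) boolean indicator of the MST edges, so that dists[min_tree] yields the selected edge lengths. A sketch of one way to obtain such a matrix with scipy (not necessarily the course implementation):

from scipy.sparse.csgraph import minimum_spanning_tree


def min_span_tree(dists):
    # Sketch: boolean indicator matrix of the edges chosen by the minimum
    # spanning tree of the fully connected graph weighted by pairwise distances.
    mst = minimum_spanning_tree(dists).toarray()
    return mst > 0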
Example #7
def two_moons_clustering():
    """
    TO BE COMPLETED.

    Used in question 2.7
    """
    # Generate data and compute number of clusters
    X, Y = two_moons(600)
    num_classes = len(np.unique(Y))
    """
    Choose parameters
    """
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2

    laplacian_normalization = 'unn'
    chosen_eig_indices = [0, 1]  # indices of the ordered eigenvalues to pick

    # build laplacian
    if k == 0:
        dists = sd.cdist(X, X, metric="euclidean")
        min_tree = min_span_tree(dists)
        distance_threshold = dists[min_tree].max()
        eps = np.exp(-distance_threshold**2.0 / (2 * var))
        print(eps)
        W = build_similarity_graph(X, var=var, k=k, eps=eps)
    else:
        W = build_similarity_graph(X, var=var, k=k)
    L = build_laplacian(W, laplacian_normalization)

    #    Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)
    #
    #    plot_clustering_result(X, Y, L, Y_rec, KMeans(num_classes).fit_predict(X))

    Y_rec_adaptive = spectral_clustering_adaptive(L, num_classes=num_classes)

    plot_clustering_result(X, Y, L, Y_rec_adaptive,
                           KMeans(num_classes).fit_predict(X))
Example #8
def two_blobs_clustering():
    """
    TO BE COMPLETED

    Clustering of two blobs. Used in questions 2.1 and 2.2
    """

    # Get data and compute number of classes
    X, Y = blobs(600, n_blobs=2, blob_var=0.15, surplus=0)
    num_classes = len(np.unique(Y))
    """
    Choose parameters
    """
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2

    laplacian_normalization = 'unn'
    chosen_eig_indices = [0, 1]  # indices of the ordered eigenvalues to pick

    # build laplacian
    if k == 0:
        dists = sd.cdist(X, X, metric="euclidean")
        min_tree = min_span_tree(dists)
        distance_threshold = dists[min_tree].max()
        eps = np.exp(-distance_threshold**2.0 / (2 * var))

        W = build_similarity_graph(X, var=var, k=k, eps=eps)
    else:
        W = build_similarity_graph(X, var=var, k=k)

    L = build_laplacian(W, laplacian_normalization)

    # run spectral clustering
    Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)

    # Plot results
    plot_clustering_result(X, Y, L, Y_rec, KMeans(num_classes).fit_predict(X))
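build_laplacian is used throughout this listing but never shown either. Based on the 'unn' / 'sym' / 'rw' options mentioned in the comments, a sketch of the three standard variants (assuming a symmetric W with no isolated vertices):

import numpy as np


def build_laplacian(W, laplacian_normalization='unn'):
    # Sketch: 'unn' -> D - W, 'sym' -> I - D^{-1/2} W D^{-1/2}, 'rw' -> I - D^{-1} W.
    degrees = W.sum(axis=1)
    if laplacian_normalization == 'unn':
        return np.diag(degrees) - W
    if laplacian_normalization == 'sym':
        inv_sqrt = np.diag(1.0 / np.sqrt(degrees))
        return np.eye(W.shape[0]) - inv_sqrt @ W @ inv_sqrt
    if laplacian_normalization == 'rw':
        return np.eye(W.shape[0]) - np.diag(1.0 / degrees) @ W
    raise ValueError('unknown normalization: %s' % laplacian_normalization)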
Example #9
def parameter_sensitivity(eig_max=15):
    """
    TO BE COMPLETED.

    A function to test spectral clustering sensitivity to parameter choice.

    Used in question 2.9
    """
    # the number of samples to generate
    num_samples = 500
    """
    Choose parameters
    """
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'unn'
    #chosen_eig_indices = [0, 1, 2]
    """
    Choose candidate parameters
    """
    parameter_candidate = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
    ]  # the number of neighbours for the graph or the epsilon threshold
    parameter_performance = []

    for k in parameter_candidate:
        # Generate data
        X, Y = two_moons(num_samples, 1, 0.02)
        num_classes = len(np.unique(Y))

        if k == 0:  # compute epsilon
            dists = sd.cdist(
                X, X, 'euclidean'
            )  # dists[i, j] = euclidean distance between x_i and x_j

            min_tree = min_span_tree(dists)

            l = []
            n1, m1 = min_tree.shape

            for i in range(n1):
                for j in range(m1):
                    if min_tree[i][j]:
                        l.append(dists[i][j])
            distance_threshold = sorted(l)[-1]
            eps = np.exp(-(distance_threshold)**2.0 / (2 * var))
            W = build_similarity_graph(X, var=var, eps=eps, k=k)
        else:
            W = build_similarity_graph(X, k=k)
        L = build_laplacian(W, laplacian_normalization)

        eigenvalues, U = np.linalg.eig(L)
        indexes = np.argsort(eigenvalues)
        eigenvalues = eigenvalues[indexes]
        U = U[:, indexes]
        chosen_eig_indices = choose_eigenvalues(eigenvalues, eig_max=eig_max)

        Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes)

        parameter_performance += [skm.adjusted_rand_score(Y, Y_rec)]

    plt.figure()
    plt.plot(parameter_candidate, parameter_performance)
    plt.title('parameter sensitivity')
    plt.show()


#parameter_sensitivity()
Example #10
def find_the_bend(eig_max=15, blob_var=0.03):
    """
    TO BE COMPLETED

    Used in question 2.3
    :return:
    """
    eig_max -= 1  # to count starting from 0
    # the number of samples to generate
    num_samples = 600

    # Generate blobs and compute number of clusters
    X, Y = blobs(num_samples, 4, blob_var)
    num_classes = len(np.unique(Y))
    """
    Choose parameters
    """
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'sym'  # either 'unn' (unnormalized), 'sym' (symmetric) or 'rw' (random-walk) normalization

    if k == 0:  # compute epsilon
        dists = sd.cdist(
            X, X, 'euclidean'
        )  # dists[i, j] = euclidean distance between x_i and x_j

        min_tree = min_span_tree(dists)

        l = []
        n1, m1 = min_tree.shape

        for i in range(n1):
            for j in range(m1):
                if min_tree[i][j]:
                    l.append(dists[i][j])

        #distance_threshold = sorted(l)[-1]
        distance_threshold = sorted(l)[-num_classes]

        eps = np.exp(-(distance_threshold)**2.0 / (2 * var))

    # build laplacian
    W = build_similarity_graph(X, var=var, eps=eps, k=k)
    L = build_laplacian(W, laplacian_normalization)
    """
    compute first 15 eigenvalues and call choose_eigenvalues() to choose which ones to use. 
    """
    eigenvalues, U = np.linalg.eig(L)
    indexes = np.argsort(eigenvalues)
    eigenvalues = eigenvalues[indexes]
    U = U[:, indexes]

    chosen_eig_indices = choose_eigenvalues(
        eigenvalues,
        eig_max=eig_max)  # indices of the ordered eigenvalues to pick

    plt.plot([i for i in range(len(eigenvalues))], eigenvalues, 'r+')
    """
    compute spectral clustering solution using a non-adaptive method first, and an adaptive one after (see handout) 
    Y_rec = (n x 1) cluster assignments [0,1,..., c-1]    
    """
    # run spectral clustering
    Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)
    Y_rec_adaptive = spectral_clustering_adaptive(L,
                                                  num_classes=num_classes,
                                                  eig_max=eig_max)

    plot_the_bend(X, Y, L, Y_rec_adaptive, eigenvalues)
def how_to_choose_epsilon(gen_pam, k):
    """
    TO BE COMPLETED.

    Consider the distance matrix with entries dist(x_i, x_j) (the euclidean distance between x_i and x_j)
    representing a fully connected graph.
    One way to choose the parameter epsilon to build a graph is to choose the maximum value of dist(x_i, x_j) where
    (i,j) is an edge that is present in the minimal spanning tree of the fully connected graph. Then, the threshold
    epsilon can be chosen as exp(-dist(x_i, x_j)**2.0/(2*sigma^2)).
    """
    # the number of samples to generate
    num_samples = 100

    # the option necessary for worst_case_blob, try different values
    #gen_pam = 10  # to understand the meaning of the parameter, read worst_case_blob in generate_data.py

    # get blob data
    # X, Y = worst_case_blob(num_samples, gen_pam)
    X, Y = two_moons(num_samples)
    """
     use the distance function and the min_span_tree function to build the minimal spanning tree min_tree                   
     - var: the exponential_euclidean's sigma2 parameter          
     - dists: (n x n) matrix with euclidean distance between all possible couples of points                   
     - min_tree: (n x n) indicator matrix for the edges in the minimal spanning tree                           
    """
    var = 1.0
    dists = sd.cdist(
        X, X,
        'euclidean')  # dists[i, j] = euclidean distance between x_i and x_j

    min_tree = min_span_tree(dists)

    l = []
    n1, m1 = min_tree.shape
    for i in range(n1):
        for j in range(m1):
            if min_tree[i][j]:
                l.append([(i, j), dists[i][j]])
    l = sorted(l, key=lambda x: x[1], reverse=True)

    #print(min_tree)
    """
    set threshold epsilon to the max weight in min_tree 
    """
    distance_threshold = l[0][1]
    eps = np.exp(-distance_threshold**2.0 / (2 * var))
    """
    use the build_similarity_graph function to build the graph W  
     W: (n x n) dimensional matrix representing                    
        the adjacency matrix of the graph
       use plot_graph_matrix to plot the graph                    
    """
    W = build_similarity_graph(X, var=var, eps=eps, k=k)
    plot_graph_matrix(X, Y, W)
    return eps, X, Y, W


#if __name__ == '__main__':
#    for gp in [0,1,10,100]:
#        print(gp)
#        how_to_choose_epsilon(gp,0)
#    for k in [0,1,2,5,10]:
#        how_to_choose_epsilon(0,k)
Example #12
    distance_threshold = np.max(dists[min_tree])
    eps = np.exp(-distance_threshold**2 / (2 * var))
    """
    use the build_similarity_graph function to build the graph W  
     W: (n x n) dimensional matrix representing                    
        the adjacency matrix of the graph
       use plot_graph_matrix to plot the graph                    
    """
    W = build_similarity_graph(X, var=var, eps=eps, k=0)
    plot_graph_matrix(X, Y, W)


if __name__ == '__main__':
    n = 300
    blobs_data, blobs_clusters = blobs(n)
    moons_data, moons_clusters = two_moons(n)
    point_circle_data, point_circle_clusters = point_and_circle(n)
    worst_blobs_data, worst_blobs_clusters = worst_case_blob(n, 1.0)

    var = 1

    X, Y = moons_data, moons_clusters
    n_samples = X.shape[0]
    dists = pairwise_distances(X).reshape((n_samples, n_samples))
    min_tree = min_span_tree(dists)
    eps = np.exp(-np.max(dists[min_tree])**2 / (2 * var))
    W_eps = build_similarity_graph(X, var=var, eps=0.6)
    W_knn = build_similarity_graph(X, k=15)

    plot_graph_matrix(X, Y, W_eps)
    plot_graph_matrix(X, Y, W_knn)