Exemplo n.º 1
0
def main():
    """Explore the LFW faces and sanity-check the clustering code.

    Part 1 scores every ordered pair of class labels 0-18 with 2-means
    ("cheat" initialisation), reports the pair with the lowest and the pair
    with the highest score, and shows one image for each label involved.
    Parts 2 and 4a exercise ``Cluster``, ``kMeans`` and ``kMedoids`` on the
    toy 2-D data set; the remaining parts only reseed the RNG.

    Output goes through the single-argument ``print(...)`` form so that the
    function parses on Python 3 while printing the same text on Python 2.
    Takes no arguments and returns nothing.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    X, y = util.get_lfw_data()
    #util.show_image(X[0])
    #util.show_image(X[1])
    #util.show_image(X[2])

    num_people = 19
    scores = np.zeros((num_people, num_people))
    for i in range(num_people):
        for j in range(num_people):
            if i != j:
                X1, y1 = util.limit_pics(X, y, [i, j], 40)
                face_points = build_face_image_points(X1, y1)
                cluster_set = kMeans(face_points, 2, "cheat", plot=False)
                scores[i, j] = cluster_set.score()

    # The diagonal (a person paired with themselves) is never scored, so it
    # is masked with a large sentinel before argmin and a small one before
    # argmax.  Assumes real scores lie well inside the int16 range -- TODO
    # confirm against ClusterSet.score().
    np.fill_diagonal(scores, np.iinfo(np.int16).max)
    similar_tuple = np.unravel_index(np.argmin(scores), scores.shape)
    # The double space reproduces what `print "...: ", value` used to emit.
    print("it did worst with:  %s" % (similar_tuple,))
    np.fill_diagonal(scores, np.iinfo(np.int16).min)
    distinct_tuple = np.unravel_index(np.argmax(scores), scores.shape)
    print("it did best with:  %s" % (distinct_tuple,))

    # Show the first retained picture of each person in both pairs, in the
    # order: worst pair first, then best pair.
    for person in similar_tuple + distinct_tuple:
        X1, y1 = util.limit_pics(X, y, [person], 40)
        util.show_image(X1[0])

    #util.show_image(np.mean(X, axis=1))

    U, mu = util.PCA(X)
    #util.plot_gallery([util.vec_to_image(U[:,i]) for i in xrange(12)])

    # for i in [1,10,50, 100, 500, 1288]:

    #     Z, ul = util.apply_PCA_from_Eig(X, U, i, mu)

    #     new_X = util.reconstruct_from_PCA(Z, ul, mu)

    #     util.plot_gallery([new_X[j] for j in xrange(12)])

    ### ========== TODO : END ========== ###

    #========================================
    # part 2

    # part b: test Cluster implementation
    # centroid: [ 1.04022358  0.62914619]

    np.random.seed(1234)
    sim_points = generate_points_2d(20)
    cluster = Cluster(sim_points)
    print('centroid: %s' % (cluster.centroid().attrs,))

    # parts c-d: test kMeans implementation using toy dataset
    np.random.seed(1234)
    sim_points = generate_points_2d(20)
    k = 3

    # test cluster using random initialization
    #kmeans_clusters = kMeans(sim_points, k, init='random', plot=True)

    # test cluster using cheat initialization
    kmeans_clusters = kMeans(sim_points, k, init='cheat', plot=True)

    ### ========== TODO : START ========== ###
    # part 3

    # part a: explore effect of lower-dimensional representations on clustering performance
    np.random.seed(1234)

    # part b: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)

    ### ========== TODO : END ========== ###

    #========================================
    # part 4a

    # test Cluster implementation
    # medoid:   [ 1.05674064  0.71183522]

    np.random.seed(1234)
    sim_points = generate_points_2d(20)
    cluster = Cluster(sim_points)
    print('medoid: %s' % (cluster.medoid().attrs,))

    # test kMedoids implementation using toy dataset
    np.random.seed(1234)
    sim_points = generate_points_2d(20)
    k = 3

    # test cluster using random initialization
    kmedoids_clusters = kMedoids(sim_points, k, init='random', plot=True)

    ### ========== TODO : START ========== ###
    # part 4

    # part b: compare k-means and k-medoids
    np.random.seed(1234)

    # part c: explore effect of lower-dimensional representations on clustering performance
    np.random.seed(1234)
Exemplo n.º 2
0
def main():
    """Explore the LFW data set and find the best/worst separated pair.

    Part 1 prints the distinct labels, shows an average face, plots the
    first twelve eigenfaces and runs PCA reconstruction for several
    dimensionalities.  Part 3c projects each pair of people (labels 0-11)
    onto the first 30 principal components, clusters the pair with
    ``kMedoids`` and reports the pairs with the highest and lowest score.
    Parts 2 and 3a/3b are disabled (commented out or wrapped in a string).

    ``print(...)``/``range`` are used instead of the Python-2-only
    ``print`` statement/``xrange`` so the function also parses on Python 3.
    Takes no arguments and returns nothing.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    # part 1a:

    X, y = get_lfw_data()
    size = (50, 37)
    # Collect the distinct labels in order of first appearance.
    y_range = []
    for y_ in y:
        if y_ not in y_range:
            y_range.append(y_)
    print(y_range)
    """
    show_image(X[0],size)
    print y[0]
    show_image(X[1], size)
    print y[1]
    show_image(X[2], size)
    print y[2]
    show_image(X[4], size)
    print y[4]
"""
    # NOTE(review): the average is taken over a hard-coded 1508 rows rather
    # than over X.shape[0]; confirm this matches the data set size.
    average = 0
    for i in range(0, 1508):
        average += X[i]
    average = average / 1508.0
    show_image(average, size)
    # part 1b:
    U, mu = PCA(X)
    plot_gallery([vec_to_image(U[:, i]) for i in range(12)])
    # part 1c:
    l_range = [1, 10, 50, 100, 500, 1288]
    for l in l_range:
        Z, Ul = apply_PCA_from_Eig(X, U, l, mu)
        X_rec = reconstruct_from_PCA(Z, Ul, mu)
        #plot_gallery([X_rec[i] for i in xrange(12)])
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
    np.random.seed(1234)
    points = generate_points_2d(20)
    #kMeans(points,3,plot = True)
    #kMedoids(points,3, plot = True)
    #kMeans(points,3,init='cheat', plot = True)
    #kMedoids(points,3, init='cheat', plot = True)

    ### ========== TODO : END ========== ###
    """
    ### ========== TODO : START ========== ###    
    # part 3a: cluster faces
    np.random.seed(1234)
    X1, y1 = util.limit_pics(X, y, [2,3,6,8], 40)
    points = build_face_image_points(X1, y1)

    plot = {}
    for pt in points:
        if pt.label not in plot:
            plot[pt.label] = []
        plot[pt.label].append(pt)
    clusters = ClusterSet()
    for l in plot:
        clusters.add(Cluster(plot[l]))
    plot_clusters(clusters, 'orig', ClusterSet.centroids)
    print "start"
    # faces kMeans cluster
    score = kMeans(points,k=4)
    max = score
    min = score
    average = score
    for i in range(9):
        score = kMeans(points, k=4)
        average = average + score
        if score>max:
            max = score
        if score<min:
            min = score
    print "KMeans:"
    print "average:"
    print average/10.0
    print "max"
    print max
    print "min"
    print min
    print "start"
    score = kMedoids(points, k=4)
    max = score
    min = score
    average = score
    for i in range(9):
        score = kMedoids(points, k=4)
        average = average + score
        if score > max:
            max = score
        if score < min:
            min = score
    print "KMedoids"
    print "average:"
    print average / 10.0
    print "max"
    print max
    print "min"
    print min


    # part 3b: explore effect of lower-dimensional representations on clustering performance
    np.random.seed(1234)
    X1, y1 = util.limit_pics(X, y, [2, 8], 40)
    U, mu = PCA(X)
    l = 1
    l_range = []
    kMeans_score = []
    kMedoids_score = []
    while l <= 41:
        l_range.append(l)
        Z, Ul = apply_PCA_from_Eig(X1, U, l, mu)
        points = build_face_image_points(Z, y1)
        kMeans_score.append(kMeans(points, 2, init='cheat'))
        kMedoids_score.append(kMedoids(points, 2, init='cheat'))
        l = l + 2
    mean_scatter = plt.scatter(l_range, kMeans_score,c='b', s=20)
    medoid_scatter = plt.scatter(l_range,kMedoids_score,c='r',s=20)
    plt.legend((mean_scatter,medoid_scatter),('kMeans', 'kMedoids'))
    plt.show()
    #X_rec = reconstruct_from_PCA(Z, Ul, mu)
"""
    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)
    best_pair = []
    poorest_pair = []
    best_score = 0
    poorest_score = 100
    l = 30
    # U and mu from part 1b are reused: PCA(X) depends only on X, so
    # recomputing it for every pair was loop-invariant work.  Assumes PCA is
    # deterministic and does not draw from NumPy's global RNG -- TODO confirm.
    for person1 in range(12):
        for person2 in range(person1 + 1, 12):
            X1, y1 = util.limit_pics(X, y, [person1, person2], 40)
            Z, Ul = apply_PCA_from_Eig(X1, U, l, mu)
            points = build_face_image_points(Z, y1)
            # kMedoids is compared directly as a number here, so this
            # variant presumably returns the score itself -- verify.
            score = kMedoids(points, 2)
            if score > best_score:
                best_score = score
                best_pair = [person1, person2]
            if score < poorest_score:
                poorest_score = score
                poorest_pair = [person1, person2]
    print(best_pair)
    print(best_score)
    plot_representative_images(X,
                               y,
                               best_pair,
                               title='The most distinguished two persons')
    print(poorest_pair)
    print(poorest_score)
    plot_representative_images(X,
                               y,
                               poorest_pair,
                               title='The most undistinguished two persons')
Exemplo n.º 3
0
def main():
    """Load the LFW data and benchmark k-medoids on four selected people.

    Runs ``kMedoids`` with k=4 ten times on the images of labels
    4, 6, 13 and 16 (limited via ``util.limit_pics`` with 40) and prints the
    summed score together with the largest and smallest score observed.
    The other assignment parts are commented out or only reseed the RNG.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    x_in, y_in = get_lfw_data()
    #print(x_in.shape)
    #print("y size:", y_in.shape)
    #x_average = np.mean(x_in, axis=0)
    #show_image(x_in[3])
    #show_image(x_average)

    #U, mu = PCA(x_in)
    #plot_gallery([vec_to_image(U[:, i]) for i in range(12)])
    #l_comp = [1, 10, 50, 100, 500, 1288]
    #for i in range(len(l_comp)):
    #x_eigen, u_eigen = apply_PCA_from_Eig(x_in, U, l_comp[i], mu)
    #x_out = reconstruct_from_PCA(x_eigen, u_eigen, mu)
    #print(l_comp[i])
    #plot_gallery([x_out[i] for i in range(12)])
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
    np.random.seed(1234)
    #points = generate_points_2d(20)
    #kMeans(points,k=3, plot=True)
    #kMedoids(points,k=3, plot=True)

    #print(tt.label)
    #tt = kMedoids(test_1, 3, init='cheat', plot=True)
    # for i in tt.members:
    #     print("--------------")
    #     print(i)
    # print(tt.score())
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 3a: cluster faces
    np.random.seed(1234)
    selected_people = [4, 6, 13, 16]
    X1, y1 = util.limit_pics(x_in, y_in, selected_people, 40)
    print(X1.shape)
    points = build_face_image_points(X1, y1)

    # Running total plus extremes over ten independent k-medoids runs.
    # The maximum starts at 0 and the minimum at +inf, as before.
    score_sum, highest, lowest = 0, 0, float('inf')
    for _ in range(10):
        run_score = kMedoids(points, 4, plot=False).score()
        score_sum += run_score
        highest = run_score if run_score > highest else highest
        lowest = run_score if run_score < lowest else lowest
    print('total med score is', score_sum)
    print('max is', highest, 'min is', lowest)

    # part 3b: explore effect of lower-dimensional representations on clustering performance
    np.random.seed(1234)

    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)
Exemplo n.º 4
0
def main():
    """Find the most and least discriminative pairs of LFW identities.

    Loads the data, computes the mean image and ``PCA(X)``, generates the
    toy points, and then clusters every ordered pair of labels 0-18 with
    ``kMedoids`` ("cheat" initialisation).  The pairs with the highest and
    lowest score are printed and plotted.  Parts 1c, 3a and 3b are kept as
    disabled string blocks.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    # 1a: show images
    X, y = get_lfw_data()
    #show_image(X[0])
    #show_image(X[100])
    #show_image(X[1000])

    mu = X.mean(0)
    #show_image(mu)

    # 1b: eigenfaces
    U = PCA(X)
    #show_image(vec_to_image(U[0][:, 0]))
    #show_image(vec_to_image(U[0][:, 1]))
    #show_image(vec_to_image(U[0][:, 2]))
    #show_image(vec_to_image(U[0][:, 3]))
    #show_image(vec_to_image(U[0][:, 4]))
    #show_image(vec_to_image(U[0][:, 5]))
    #show_image(vec_to_image(U[0][:, 6]))
    #show_image(vec_to_image(U[0][:, 7]))
    #show_image(vec_to_image(U[0][:, 8]))
    #show_image(vec_to_image(U[0][:, 9]))
    #show_image(vec_to_image(U[0][:, 10]))
    #show_image(vec_to_image(U[0][:, 11]))

    # 1c: reconstruct from PCA
    '''li = [1, 10, 50, 100, 500, 1288]
    for l in li:
        Z, Ul = apply_PCA_from_Eig(X, U[0], l, mu)
        for i in range(0, 12):
            im_name = "l{}_im{}".format(l, (i + 1))
            show_image(reconstruct_from_PCA(Z, Ul, mu)[i])
            print(im_name)
            plt.savefig("../../images/{}".format(im_name))'''
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
    np.random.seed(1234)
    pts = generate_points_2d(20)
    #print("Using kmeans and random_init")
    #kMeans(pts, 3, plot=True)
    #print("Using kmedoids and random_init")
    #kMedoids(pts, 3, plot=True)
    #print("Using kmeans and cheat_init")
    #kMeans(pts, 3, init="cheat", plot=True)
    #print("Using kmedoids and cheat_init")
    #kMedoids(pts, 3, init="cheat", plot=True)
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 3a: cluster faces
    '''np.random.seed(1234)
    X1, y1 = util.limit_pics(X, y, [4, 6, 13, 16], 40)
    points = build_face_image_points(X1, y1)

    kmeans_scores = []
    kmeans_runtime = []
    kmeds_scores = []
    kmeds_runtime = []
    for i in range(0, 10):
        start = time.time()
        kmeans_clusterset = kMeans(points, 4, plot=False)
        end = time.time()
        kmeans_runtime.append(end - start)
        kmeans_score = kmeans_clusterset.score()
        kmeans_scores.append(kmeans_score)

        start = time.time()
        kmeds_clusterset = kMedoids(points, 4, plot=False)
        end = time.time()
        kmeds_runtime.append(end - start)
        kmeds_score = kmeds_clusterset.score()
        kmeds_scores.append(kmeds_score)
    
    print("kmeans average score: {}".format(sum(kmeans_scores) / float(len(kmeans_scores))))
    print("kmeans max score: {}".format(max(kmeans_scores)))
    print("kmeans min score: {}".format(min(kmeans_scores)))
    print("kmeans average runtime: {}s".format((sum(kmeans_runtime) / float(len(kmeans_runtime)))))
    print("kmeans max runtime: {}".format(max(kmeans_runtime)))
    print("kmeans min runtime: {}".format(min(kmeans_runtime)))
    print("kmeds average score: {}".format(sum(kmeds_scores) / float(len(kmeds_scores))))
    print("kmeds max score: {}".format(max(kmeds_scores)))
    print("kmeds min score: {}".format(min(kmeds_scores)))
    print("kmeds average runtime: {}s".format((sum(kmeds_runtime) / float(len(kmeds_runtime)))))
    print("kmeds max runtime: {}".format(max(kmeds_runtime)))
    print("kmeds min runtime: {}".format(min(kmeds_runtime)))'''

    # part 3b: explore effect of lower-dimensional representations on clustering performance
    '''np.random.seed(1234)
    X2, y2 = util.limit_pics(X, y, [4, 13], 40)

    li = []
    for i in range(1, 43, 2):
        li.append(i)
    
    kmeans_face_scores = []
    kmeds_face_scores = []
    for l in li:
        print(l)
        Z, Ul = apply_PCA_from_Eig(X2, U[0], l, mu)
        points2 = build_face_image_points(Z, y2)
        kmeans_face_cset = kMeans(points2, 2, init="cheat", plot=False)
        kmeans_face_scores.append(kmeans_face_cset.score())
        kmeds_face_cset = kMedoids(points2, 2, init="cheat", plot=False)
        kmeds_face_scores.append(kmeds_face_cset.score())
    
    plt.plot(li, kmeans_face_scores, label="K-Means")
    plt.plot(li, kmeds_face_scores, label="K-Medoids")
    plt.title("Clustering Score Vs. Number of Principal Components")
    plt.xlabel("Number of Principal Components")
    plt.ylabel("Score")
    plt.legend()
    plt.show()'''

    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)

    # Track the extreme scores and the (first, second) labels that produced
    # them; strict comparisons keep the earliest pair on ties.
    best_score, best_im = float("-inf"), [0, 0]
    worst_score, worst_im = float("inf"), [0, 0]
    for first in range(0, 19):
        for second in range(0, 19):
            if first == second:
                continue  # a person is never paired with themselves
            X3, y3 = util.limit_pics(X, y, [first, second], 40)
            pair_points = build_face_image_points(X3, y3)
            pair_score = kMedoids(pair_points, 2, init="cheat",
                                  plot=False).score()
            if pair_score > best_score:
                best_score, best_im = pair_score, [first, second]
            if pair_score < worst_score:
                worst_score, worst_im = pair_score, [first, second]

    print("The Most Discriminative Images were {}, with a score of {}".format(
        best_im, best_score))
    plot_representative_images(X, y, best_im, title="Most Discriminative")
    print("The Least Discriminative Images {}, with a score of {}".format(
        worst_im, worst_score))
    plot_representative_images(X, y, worst_im, title="Least Discriminative")
Exemplo n.º 5
0
def main():
    """Run PCA exploration and the face-clustering experiments (3b, 3c).

    Part 3b reconstructs the images of labels 4 and 13 from the first
    ``l`` principal components for l = 1..41, clusters each reconstruction
    with ``kMeans`` and ``kMedoids`` ("cheat" initialisation) and plots both
    score curves (``plt.show()`` is left disabled).  Part 3c clusters every
    ordered pair of labels 0-18 with ``kMedoids`` and prints/plots the pairs
    with the highest and lowest score.

    Changes relative to the earlier version: ``np.inf`` replaces the
    ``np.Inf`` alias (removed in NumPy 2.0), the plotted x/y values come
    from ordered sequences instead of dict iteration order, and output uses
    ``print(...)`` so the function parses on Python 3 as well.
    Takes no arguments and returns nothing.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    X, y = get_lfw_data()

    #show_image(im=X[2])
    #show_image(im=X[1])
    #show_image(im=X[3])

    #average_image = np.mean(X, axis=0)
    #show_image(im=average_image)

    U, mu = PCA(X)
    #plot_gallery([vec_to_image(U[:, i]) for i in xrange(12)])

    # Selecting the dimension, l, to map all features to
    for l in [1, 10, 50, 100, 500, 1288]:
        Z, Ul = apply_PCA_from_Eig(X, U, l, mu)
        X_rec = reconstruct_from_PCA(Z, Ul, mu)
        #plot_gallery([vec_to_image(X_rec[i]) for i in xrange(12)], subtitles=["l="+str(l)+",n="+str(j) for j in xrange(12)])

    # Original 12 Images
    #plot_gallery([vec_to_image(X[i]) for i in xrange(12)])

    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 2d: cluster toy dataset
    #np.random.seed(1234)
    #points = generate_points_2d(20)
    #kMeans(points, 3, init='cheat', plot=True)
    #kMedoids(points, 3, init='cheat', plot=True)

    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 3a: cluster faces
    np.random.seed(1234)
    X1, y1 = limit_pics(X, y, [4, 6, 13, 16], 40)
    points = build_face_image_points(X1, y1)

    k_means_scores = []
    k_medoids_scores = []

    #for _ in range(10):
    #clusters = kMeans(points, 4, init='random', plot=False)
    #k_means_scores.append(clusters.score())
    #clusters = kMedoids(points, 4, init='random', plot=False)
    #k_medoids_scores.append(clusters.score())

    #print('k-means average: {}'.format(np.mean(k_means_scores)))
    #print('k-means min: {}'.format(np.min(k_means_scores)))
    #print('k-means max: {}'.format(np.max(k_means_scores)))
    #print('k-medoids average: {}'.format(np.mean(k_medoids_scores)))
    #print('k-medoids min: {}'.format(np.min(k_medoids_scores)))
    #print('k-medoids max: {}'.format(np.max(k_medoids_scores)))

    # part 3b: explore effect of lower-dimensional representations on clustering performance
    np.random.seed(1234)
    X2, y2 = util.limit_pics(X, y, [4, 13], 40)

    # Scores are appended in the same order as l_values, so the plotted
    # curves never depend on dict iteration order.
    l_values = np.arange(1, 42)
    kmeans_scores = []
    kmedoids_scores = []
    for l in l_values:
        Z2, Ul2 = apply_PCA_from_Eig(X2, U, l, mu)
        X_rec2 = reconstruct_from_PCA(Z2, Ul2, mu)
        points = build_face_image_points(X_rec2, y2)

        cluster_set1 = kMeans(points, 2, "cheat")
        cluster_set2 = kMedoids(points, 2, "cheat")

        kmeans_scores.append(cluster_set1.score())
        kmedoids_scores.append(cluster_set2.score())

    plt.plot(l_values, kmeans_scores, 'r', label='K-means')
    plt.plot(l_values, kmedoids_scores, 'b', label='K-medoids')
    plt.title('Score for kMeans and kMedoids vs. # Principal Components')
    plt.xlabel('# Principal Components')
    plt.ylabel('score')
    plt.legend()
    #plt.show()

    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)
    # Each tracker is (score, first label, second label).
    max_score = (-1, None, None)
    min_score = (np.inf, None, None)

    for i in np.arange(0, 19):
        for j in np.arange(0, 19):
            if i != j:
                X_ij, y_ij = util.limit_pics(X, y, [i, j], 40)
                points = build_face_image_points(X_ij, y_ij)
                cluster_set = kMedoids(points, 2, init='cheat')
                score = cluster_set.score()
                if score < min_score[0]:
                    min_score = (score, i, j)
                if score > max_score[0]:
                    max_score = (score, i, j)

    print(max_score)
    print(min_score)
    plot_representative_images(X,
                               y, [min_score[1], min_score[2]],
                               title='min score images')
    plot_representative_images(X,
                               y, [max_score[1], max_score[2]],
                               title='max score images')
Exemplo n.º 6
0
def main() :
    """Run every part of the face-clustering assignment end to end.

    Part 1 checks that ``PCA`` returns the data mean and runs several PCA
    reconstructions.  Part 2 scores ``kMeans``/``kMedoids`` on the toy data
    with random and "cheat" initialisation.  Part 3a times and scores ten
    runs of each algorithm on labels 4, 6, 13 and 16.  Part 3b plots the
    score against the number of principal components (1..41) for labels 4
    and 13.  Part 3c finds the pairs of labels 0-18 with the highest and
    lowest ``kMedoids`` score and plots representative images.

    Changes relative to the earlier version: the stray ``exit()`` after
    part 3a (which made parts 3b and 3c unreachable) is removed, ``np.inf``
    replaces the ``np.Inf`` alias removed in NumPy 2.0, plotted values come
    from ordered lists rather than dict iteration order, and output uses
    ``print(...)`` so the function also parses on Python 3.
    Takes no arguments and returns nothing.

    Raises:
        AssertionError: if ``PCA`` does not return the exact data mean, or
            if part 3c does not reproduce the expected pairs (4, 5) and
            (9, 16).
    """
    import time

    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    X, y = get_lfw_data()
    mean_face = np.mean(X, axis = 0)
    U, mu = PCA(X)
    # Sanity check: the mean returned by PCA must equal the column mean.
    assert(np.sum(np.abs(mean_face - mu)) == 0)
    #show_image(vec_to_image(mu)) #PART A
    num_eigenfaces_to_plot = 12
    #plot_gallery([vec_to_image(U[:,i])
    #              for i in xrange(num_eigenfaces_to_plot)]) #PART B
    for l in [1,10,50,100,500,1288]:
        Z, Ul = apply_PCA_from_Eig(X, U, l, mu)
        X_rec = reconstruct_from_PCA(Z, Ul, mu)
    #    plot_gallery([vec_to_image(X_rec[i])
    #                  for i in xrange(num_eigenfaces_to_plot)]) #PART C
    ### ========== TODO : END ========== ###



    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
    print("generating data for clustering")
    np.random.seed(1234)
    pts = generate_points_2d(20)
    cluster_set = kMeans(pts, 3, plot = False, verbose = False) # 2
    print("kmeans rand init score: {}".format(cluster_set.score()))
    another_cluster_set = kMedoids(pts, 3, plot = False, verbose = False) #2
    print("k medoids rand init score: {}".format(another_cluster_set.score()))
    km_clust_2 = kMeans(pts, 3, init = 'cheat', plot = False, verbose = False) #2
    print("k means cheat init score: {}".format(km_clust_2.score()))
    k_med_clust_2 = kMedoids(pts, 3, init='cheat', plot = False, verbose = False) #2
    print("k medoids cheat init score: {}".format(k_med_clust_2.score()))

    ### ========== TODO : END ========== ###



    ### ========== TODO : START ========== ###
    # part 3a: cluster faces
    np.random.seed(1234)
    X1, y1 = util.limit_pics(X, y, [4, 6, 13, 16], 40)
    points = build_face_image_points(X1, y1)
    kmeans_scores, kmed_scores = [], []
    kmeans_times, kmed_times = [], []
    for i in range(10):
        print("running k-means and k-medoids for the {}th time".format(i+1))
        t = time.time()
        cluster_set = kMeans(points, 4)
        kmeans_times.append(time.time()-t)
        kmeans_scores.append(cluster_set.score())
        t = time.time()
        kmed_set = kMedoids(points, 4)
        kmed_times.append(time.time()-t)
        kmed_scores.append(kmed_set.score())
    means_avg, means_max, means_min = np.mean(np.array(kmeans_scores)), max(kmeans_scores), min(kmeans_scores)
    med_avg, med_max, med_min = np.mean(np.array(kmed_scores)), max(kmed_scores), min(kmed_scores)
    kmeans_time = np.mean(np.array(kmeans_times))
    kmed_time = np.mean(np.array(kmed_times))
    print("kmeans time: {}".format(kmeans_time))
    print("kmed time: {}".format(kmed_time))
    print("K means average: {}, max: {}, min: {}".format(means_avg,
                                                         means_max, means_min))
    print("K medoids average: {}, max: {}, min: {}".format(med_avg,
                                                           med_max, med_min))
    # NOTE: a leftover exit() used to stop the program here, so the parts
    # below never ran; it has been removed.

    # part 3b: explore effect of lower-dimensional representations on clustering performance
    np.random.seed(1234)
    X2, y2 = util.limit_pics(X, y, [4, 13], 40)
    # Scores are appended in the same order as l_values so the plot does not
    # rely on dict iteration order.
    l_values = list(range(1,42))
    l_kmeans = []
    l_kmed = []
    for l in l_values:
        if l % 5 == 0:
            print("iteration: l = {}".format(l))
        Z, Ul = apply_PCA_from_Eig(X2, U, l, mu)
        X2_rec = reconstruct_from_PCA(Z, Ul, mu)
        points = build_face_image_points(X2_rec, y2)
        kmeans_clust = kMeans(points, 2, init='cheat')
        kmed_clust = kMedoids(points, 2, init='cheat')
        l_kmeans.append(kmeans_clust.score())
        l_kmed.append(kmed_clust.score())
    plt.plot(l_values, l_kmeans, 'r', label='K means')
    plt.plot(l_values, l_kmed, 'b', label='K medoids')
    plt.title('K-means and K-medoids score with respect to principal components')
    plt.xlabel('Number of principal components')
    plt.ylabel('Clustering score')
    plt.legend()
    plt.show()
    # Same (l, score) pairs the dict's items() used to print.
    print(list(zip(l_values, l_kmed)))

    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    # Each tracker is (score, first label, second label).
    max_score, min_score = (-1, None, None), (np.inf, None, None)
    np.random.seed(1234)
    for i in range(0,19):
        for j in range(0,19):
            if i != j:
                if i % 5 == 0 and j % 5 == 0:
                    print("considering groups {} and {}".format(i,j))
                X_ij, y_ij = util.limit_pics(X, y, [i,j], 40)
                points = build_face_image_points(X_ij, y_ij)
                med_clust = kMedoids(points, 2, init='cheat')
                score = med_clust.score()
                if score < min_score[0]:
                    min_score = (score, i, j)
                if score > max_score[0]:
                    max_score = (score, i, j)
    print(max_score)
    print(min_score)
    # The asserts pin the pairs the author expected for this data set and
    # seed; they fail loudly if the clustering code changes the outcome.
    assert(min_score[1] == 4 and min_score[2] == 5)
    plot_representative_images(X, y, [min_score[1], min_score[2]],
                               title = 'min score images')
    assert(max_score[1] == 9 and max_score[2] == 16)
    plot_representative_images(X, y, [max_score[1], max_score[2]],
                               title = 'max score images')
Exemplo n.º 7
0
def main() :
    """Run the LFW exploration and the three face-clustering experiments.

    Part 1 shows the mean face and the first twelve eigenfaces.  Part 3a
    scores and times ten random-initialised runs of ``kMeans`` and of
    ``kMedoids`` on labels 4, 6, 13 and 16.  Part 3b plots the clustering
    score against the number of principal components l = 1, 3, ..., 41 for
    labels 4 and 13.  Part 3c clusters every ordered pair of labels 0-18
    with ``kMedoids`` and plots the lowest- and highest-scoring pairs.

    Fixes relative to the earlier version:
      * the k-medoids average was seeded with the last k-means score;
      * ``l += 2`` inside ``for l in range(1, 42)`` had no effect, so every
        l in 1..41 was used instead of the odd values;
      * part 3c clustered the whole data set (``X, y``) instead of the
        selected pair (``X2, y2``);
      * ``plot_representative_images`` received the two labels as separate
        positional arguments (colliding with ``title=``) instead of a list,
        and the max plot was titled "min".

    Takes no arguments and returns nothing.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set

    X, y = get_lfw_data()
    mean = np.mean(X, axis=0)
    #print(mean)
    #show_image(mean)
    show_image(vec_to_image(mean))

    U, mu = PCA(X)
    plot_gallery([vec_to_image(U[:,i]) for i in range(12)])

    #plot_title = "1c-"
    #for l in [1,10,50,100,500,1288]:
     #   Z, Ul = apply_PCA_from_Eig(X, U, l, mu)
      #  X_rec = reconstruct_from_PCA(Z, Ul, mu)
       # title = plot_title + str(l)
        #plot_gallery([vec_to_image(X_rec[i]) for i in range(12)], title=title)

    ### ========== TODO : END ========== ###



    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
   # np.random.seed(1234)
   # points = generate_points_2d(20)
   # kMeans(points, 3, init='random', plot=True)

   # np.random.seed(1234)
   # points = generate_points_2d(20)
   # kMedoids(points, 3, init='random', plot=True)

   # np.random.seed(1234)
   # points = generate_points_2d(20)
   # kMeans(points, 3, init='cheat', plot=True)

   # np.random.seed(1234)
   # points = generate_points_2d(20)
   # kMedoids(points, 3, init='cheat', plot=True)
    ### ========== TODO : END ========== ###



    ### ========== TODO : START ========== ###
    # part 3a: cluster faces
    np.random.seed(1234)
    X1, y1 = util.limit_pics(X, y, [4, 6, 13, 16], 40)
    points = build_face_image_points(X1, y1)

    # k-means: ten random-initialised runs, each timed including its print.
    means_scores = []
    totalTime = 0
    for i in range(10):
        start = time.time()
        scoreMeans = kMeans(points, 4, init='random').score()
        print(scoreMeans)
        means_scores.append(scoreMeans)
        totalTime += time.time() - start
    print("min score")
    print(min(means_scores))
    print("max score")
    print(max(means_scores))
    print("average score")
    print(sum(means_scores)/10)
    print("average time")
    print(totalTime/10)

    # k-medoids: same protocol.  Collecting the scores in their own list
    # removes the old bug where the average started from a k-means score.
    medoids_scores = []
    totalTime = 0
    for i in range(10):
        start = time.time()
        scoreMedoids = kMedoids(points, 4, init='random').score()
        print(scoreMedoids)
        medoids_scores.append(scoreMedoids)
        totalTime += time.time() - start
    print("min score")
    print(min(medoids_scores))
    print("max score")
    print(max(medoids_scores))
    print("average")
    print(sum(medoids_scores)/10)
    print("average time")
    print(totalTime/10)

    # part 3b: explore effect of lower-dimensional representations on clustering performance
    np.random.seed(1234)
    scoresMeans = []
    scoresMedoids = []
    X1, y1 = util.limit_pics(X, y, [4, 13], 40)
    # Odd component counts only: 1, 3, 5, ..., 41.
    l_values = list(range(1, 42, 2))

    for l in l_values:
        Z, U1 = apply_PCA_from_Eig(X1, U, l, mu)
        X_rec = reconstruct_from_PCA(Z, U1, mu)
        points = build_face_image_points(X_rec, y1)

        scoreM = kMeans(points, 2, init='cheat').score()
        scoreM2 = kMedoids(points, 2, init='cheat').score()
        scoresMeans.append(scoreM)
        scoresMedoids.append(scoreM2)

    plt.plot(l_values, scoresMeans, 'c', label='kMeans')
    plt.plot(l_values, scoresMedoids, 'b', label='kMedoids')
    plt.xlabel('# of principal components')
    plt.ylabel('score')
    plt.legend()
    plt.show()

    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)

    minScore = float('inf')
    maxScore = float('-inf')
    iMin = -1
    iMax = -1
    jMin = -1
    jMax = -1

    for i in range(0,19):
        for j in range(0,19):
            if i != j:
                X2, y2 = util.limit_pics(X, y, [i, j], 40)
                # Cluster only the two selected people, not the full data.
                points = build_face_image_points(X2, y2)
                score = kMedoids(points,2, init='cheat').score()
                if score < minScore:
                    minScore = score
                    iMin = i
                    jMin = j
                if score > maxScore:
                    maxScore = score
                    iMax = i
                    jMax = j

    print("min:")
    print(minScore)
    print(iMin)
    print(jMin)
    # The two labels are passed together as one list argument.
    plot_representative_images(X, y, [iMin, jMin], title="min")
    print("max:")
    print(maxScore)
    print(iMax)
    print(jMax)
    plot_representative_images(X, y, [iMax, jMax], title="max")
Exemplo n.º 8
0
def main():
    """Run the LFW exploration, toy clustering and face clustering experiments.

    The steps are executed in order:
      1.  show the average face, the eigenfaces and PCA reconstructions;
      2.  cluster the toy 2-D data set with kMeans and kMedoids;
      3a. cluster four people and report score statistics over 10 random runs;
      3b. plot clustering score against the number of principal components;
      3c. search all pairs of people for the highest and lowest scoring pair.

    Takes no arguments and returns nothing; results are printed and plotted.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    X, y = util.get_lfw_data()

    # 1a: the "average face" is the mean of every pixel (column) over all images.
    util.show_image(np.mean(X, axis=0))
    ### ========== TODO : END ========== ###

    # 1b
    U, mu = util.PCA(X)
    _, num_components = U.shape
    plot_gallery([vec_to_image(U[:, i]) for i in range(12)])
    # NOTE(review): this shows one figure per column of U, which may be a lot
    # of windows -- confirm that displaying every component is intended.
    for column_index in range(num_components):
        util.show_image(util.vec_to_image(U[:, column_index]))

    # 1c
    ls = [1, 10, 50, 100, 500, 1288]
    for l in ls:
        Z, Ul = util.apply_PCA_from_Eig(X, U, l, mu)
        X_rec = util.reconstruct_from_PCA(Z, Ul, mu)
        plot_gallery(X_rec[:12])

    # test centroid
    # p1 = Point('1', 1, np.array([5, 4]))
    # p2 = Point('2', 2, np.array([9, 10]))
    # p3 = Point('3', 3, np.array([3, 9]))
    # c = Cluster([p1, p2, p3])
    # print(str(c))
    # print(str(c.centroid()))
    # end test centroid

    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
    np.random.seed(1234)
    k = 3
    pts_per_cluster = 20
    points = generate_points_2d(pts_per_cluster)
    # Return values are not needed here; the calls are made for their plots.
    kMeans(points, k, init="cheat", plot=True)
    kMedoids(points, k, init="cheat", plot=True)
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 3a: cluster faces
    np.random.seed(1234)
    k = 4
    X1, y1 = util.limit_pics(X, y, [4, 6, 13, 16], 40)
    points = build_face_image_points(X1, y1)

    # Group the points by their true label and plot that grouping.
    points_by_label = {}
    for pt in points:
        points_by_label.setdefault(pt.label, []).append(pt)
    clusters = ClusterSet()
    for label in points_by_label:
        clusters.add(Cluster(points_by_label[label]))
    plot_clusters(clusters, 'orig', ClusterSet.centroids)

    # Part 3a
    centroid_score = []
    for _ in range(10):
        k_clusters = kMeans(points, k, init="random", plot=False)
        centroid_score.append(k_clusters.score())

    centroid_mean = sum(centroid_score) / float(len(centroid_score))
    centroid_min = min(centroid_score)
    centroid_max = max(centroid_score)
    print('Centroid avg:', centroid_mean)
    print('Centroid min:', centroid_min)
    print('Centroid max:', centroid_max)

    medoid_score = []
    for _ in range(10):
        k_clusters = kMedoids(points, k, init="random", plot=False)
        medoid_score.append(k_clusters.score())

    medoid_mean = sum(medoid_score) / float(len(medoid_score))
    medoid_min = min(medoid_score)
    medoid_max = max(medoid_score)
    print('Medoid avg:', medoid_mean)
    print('Medoid min:', medoid_min)
    print('Medoid max:', medoid_max)

    # part 3b: explore effect of lower-dimensional representations on clustering performance
    np.random.seed(1234)

    U, mu = util.PCA(X)
    X1, y1 = util.limit_pics(X, y, [4, 13], 40)
    k = 2
    ls = list(range(1, 42, 2))  # odd values 1, 3, ..., 41

    centroid_score = []
    medoid_score = []

    for l in ls:
        # Cluster directly on the l-dimensional projection Z (no reconstruction).
        Z, Ul = util.apply_PCA_from_Eig(X1, U, l, mu)
        # X_rec = util.reconstruct_from_PCA(Z, Ul, mu)
        points = build_face_image_points(Z, y1)
        # plot_gallery(X_rec[:12])

        c = kMeans(points, k, init="cheat", plot=False)
        centroid_score.append(c.score())
        k_clusters = kMedoids(points, k, init="cheat")
        medoid_score.append(k_clusters.score())

    scatter = plt.scatter(ls, centroid_score, c='c', s=20)
    scatter2 = plt.scatter(ls, medoid_score, c='r', s=20)
    plt.suptitle('kMeans and kMedoids', fontsize=20)
    plt.xlabel('L', fontsize=16)
    plt.ylabel('Score', fontsize=16)
    plt.legend((scatter, scatter2),
               ('kMeans', 'kMedoids'),
               scatterpoints=1,
               loc='lower right',
               ncol=3,
               fontsize=14)
    plt.show()

    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)

    totalPeople = 19
    # Start from -inf/+inf so the first pair always initialises both records.
    best_score = float("-inf")
    worst_score = float("inf")
    best_pair = None
    worst_pair = None
    for p1 in range(totalPeople):
        for p2 in range(p1 + 1, totalPeople):
            X3, y3 = util.limit_pics(X, y, [p1, p2], 40)
            points = build_face_image_points(X3, y3)
            clusters = kAverages(points, 2, ClusterSet.medoids, init='cheat', plot=False)
            score = clusters.score()
            if score > best_score:
                best_score = score
                best_pair = (p1, p2)
            if score < worst_score:
                worst_score = score
                worst_pair = (p1, p2)

    # The highest-scoring pair is the one that clusters apart most cleanly,
    # i.e. the two faces that look least alike.
    print(best_pair)
    print(best_score)
    plot_representative_images(X, y, best_pair, title="Least Similar Face")

    # The lowest-scoring pair is the hardest to separate, i.e. the most alike.
    print(worst_pair)
    print(worst_score)
    plot_representative_images(X, y, worst_pair, title="Most Similar Face")
Exemplo n.º 9
0
def main():
    """Compare kMeans and kMedoids on four LFW identities over 10 runs.

    Loads the LFW data, computes PCA, then clusters the images of people
    4, 6, 13 and 16 ten times with each algorithm, printing the min/max/mean
    score and the mean wall-clock time of each. Takes no arguments and
    returns nothing.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    X, y = get_lfw_data()
    # show_image(X[0])
    # show_image(np.mean(X, axis=0))
    U, mu = util.PCA(X)
    # plot_gallery([vec_to_image(U[:,i]) for i in xrange(12)])
    # l_values = [1, 10, 50, 100, 500, 1288]
    # for l_value in l_values:
    #     Z, UI = apply_PCA_from_Eig(X, U, l_value, mu)
    #     X_rec = reconstruct_from_PCA(Z, UI, mu)
    #     title_text = "Reconstructed for l = %d" %l_value
    #     plot_gallery(X_rec, title =title_text)

    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
    np.random.seed(1234)
    # print 'Problem 2(d)'
    # points_list = generate_points_2d(20)

    # only do one or the other, if you do both, the results appear to be wrong...
    # cluster_set_result = kMeans(points_list, 3, plot=True)
    # cluster_set_result2 = kMedoids(points_list, 3, plot=True)

    # using cheat_init
    # cluster_set_result3 = kMeans(points_list, 3, init='cheat', plot=True)
    # cluster_set_result4 = kMedoids(points_list, 3, init='cheat', plot=True)

    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 3a: cluster faces
    np.random.seed(1234)
    X1, y1 = util.limit_pics(X, y, [4, 6, 13, 16], 40)
    points = build_face_image_points(X1, y1)

    means_scores, means_seconds = [], []
    medoids_scores, medoids_seconds = [], []

    for _ in range(10):
        # kMedoids runs before kMeans in every trial; keep that order so the
        # sequence of random draws is unchanged.
        tic = time.time()
        medoids_clusters = kMedoids(points, 4)
        toc = time.time()
        medoids_scores.append(medoids_clusters.score())
        medoids_seconds.append(toc - tic)

        tic = time.time()
        means_clusters = kMeans(points, 4)
        toc = time.time()
        means_scores.append(means_clusters.score())
        means_seconds.append(toc - tic)

    means_summary = (min(means_scores), max(means_scores), np.mean(means_scores))
    print('K-means min: %f, max: %f, avg: %f' % means_summary)
    print('K-means avg time: %f' % np.mean(means_seconds))

    medoids_summary = (min(medoids_scores), max(medoids_scores),
                       np.mean(medoids_scores))
    print('K-medoids min: %f, max: %f, avg: %f' % medoids_summary)
    print('K-medoids avg time: %f' % np.mean(medoids_seconds))
def main():
    """Find the best- and worst-clustering pairs of people in the LFW data.

    Parts 1-3b are kept below as disabled (triple-quoted) blocks; only part
    3c runs. For every ordered pair of distinct people among the 19 classes,
    the pair's images are clustered with kMedoids (k=2, 'cheat' init) and the
    pairs with the lowest and highest score are plotted. Takes no arguments
    and returns nothing.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    X, y = get_lfw_data()
    #show_image(np.mean(X, axis=0)) #axis 0 is finds the average of the column for all of the images
    U, mu = util.PCA(X)
    """
    l_values = [1,10, 50,100,500, 1288]
    image_arr = np.arange(start=0, stop=12)
    for l in l_values:
        Z, Ul = util.apply_PCA_from_Eig(X, U, l, mu)  # to lower the dimension of the images
        X_rec = reconstruct_from_PCA(Z,Ul,mu)
        title = "Reconstructed images for l = %d" % (l)
        print title
        plot_gallery(X_rec, title= title)
    """

    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
    #part 2d
    np.random.seed(1234)
    points = generate_points_2d(N=20)
    print("2d")
    #uncomment this
    """
    clusters = kMeans(points,3,'random',True)
    #end of 2d
    #part 2e
    medoid_cluster = kMedoids(points,3,'random',True)
    #end of 2e
    #part 2f, cheat initialization
    kmeans_cheat = kMeans(points,3,'cheat',True)

    kmedoid_cheat = kMedoids(points,3,'cheat', True)
    """
    ### ========== TODO : END ========== ###

    #IMPORTANT
    #Begin of 3a comment
    """
    ### ========== TODO : START ========== ###
    # part 3a: cluster faces
    np.random.seed(1234)
    X1, y1 = util.limit_pics(X, y, [4, 6, 13, 16], 40)
    points = build_face_image_points(X1, y1)
    kmean_score_list = []
    kmedoid_score_list = []

    for i in np.arange(0,10):
        kmean_cluster = kMeans(points, 4, 'random', False)
        kmean_score_list.append(kmean_cluster.score())
        kmedoid_cluster = kMedoids(points, 4, 'random', False)
        kmedoid_score_list.append(kmedoid_cluster.score())
    kmean_avg = np.mean(kmean_score_list)
    kmean_max = max(kmean_score_list)
    kmean_min = min(kmean_score_list)

    kmedoid_avg = np.mean(kmedoid_score_list)
    kmedoid_max = max(kmedoid_score_list)
    kmedoid_min = min(kmedoid_score_list)
    """
    #End of 3A comments

    #IMPORTANT
    #Begin of 3b comment
    """
    # part 3b: explore effect of lower-dimensional representations on clustering performance
    print ("3b")
    np.random.seed(1234)
    # Use PCA to get the the eigenfaces (and eigenvectors)
    U, mu = util.PCA(X)
    l_range = np.arange(1,42)
    k = 2
    X2, y2 = util.limit_pics(X, y, [4, 13], 40)
    kmean_score_dict = {}
    kmedoid_score_dict = {}
    for l in l_range:
        Z1, Ul1 = apply_PCA_from_Eig(X2,U,l, mu) #reduce the dimension
        X2_reconstructed = reconstruct_from_PCA(Z1,Ul1,mu)
        points = build_face_image_points(X2_reconstructed, y2)
        kmeans_clust = kMeans(points,k,'cheat',False)
        kmedoid_clust = kMedoids(points,k,'cheat',False)
        kmean_score_dict[l] = kmeans_clust.score()
        kmedoid_score_dict[l] = kmedoid_clust.score()
    print "3b here"
    plt.plot(list(kmean_score_dict.keys()),list(kmean_score_dict.values()), color= 'b', label='kMeans')
    plt.plot(list(kmedoid_score_dict.keys()),list(kmedoid_score_dict.values()),color= 'g', label='kMedoid')
    plt.title("kMean and kMedoid Scores vs l (Number of Principal Components)")
    plt.xlabel("l (Number of Principal Components)")
    plt.ylabel("kMean and kMedoid Scores")
    plt.legend()
    plt.show()
    #End of 3b comment
    """

    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)
    # Each record is (score, first person index, second person index).
    min_score = (np.inf, None, None)
    max_score = (-1, None, None)
    #we know there are 19 people
    print("Starting")
    for i in range(0, 19):
        for j in range(0, 19):
            if i == j:  #if on the same person
                continue
            X3, y3 = util.limit_pics(X, y, [i, j], 40)  #receive the images
            points = build_face_image_points(X3, y3)
            kmedoid_clust = kMedoids(points, 2, 'cheat', False)
            # Evaluate the score once and store the numeric value; storing the
            # bound method itself would break every later comparison.
            score = kmedoid_clust.score()
            if score < min_score[0]:
                min_score = (score, i, j)
            if score > max_score[0]:
                max_score = (score, i, j)
    #now we have the min and max clusters
    print("before the plot")
    plot_representative_images(
        X,
        y, [max_score[1], max_score[2]],
        title="Images with Maximum Cluster Score (Best Clustering)")
    plot_representative_images(
        X,
        y, [min_score[1], min_score[2]],
        title="Images with Minumum Cluster Score (Worst Clustering)")