def main (k, m="means", init_type="random"):
    """Run one clustering experiment end to end and write its results.

    Initializes ``k`` cluster centers, runs ``Kmeans.kmeans`` on the
    module-level ``train_images_flat``, saves representative images, scores
    the clusters against ``train_labels``, classifies ``test_images_flat``
    and dumps a JSON summary under ``./results/<title>/``.

    Args:
        k: Number of clusters.
        m: Center-update method forwarded to ``Kmeans.kmeans`` as ``method``.
        init_type: ``"random"`` selects ``Initialize.random_centers``; any
            other value is treated as (and renamed to) ``"kplusplus"``.

    Returns:
        None. Timing information is printed and results are written to disk.
    """
    # The clustering timer deliberately covers center initialization too.
    t_cluster_begin = timeit.default_timer()

    if init_type != "random":
        # Normalize the label so the output title reflects what actually ran.
        init_type = "kplusplus"
        initial_clusters = Initialize.kmeans_plusplus(
            k, train_images_flat, dist_fn=Distance.sumsq)
    else:
        initial_clusters = Initialize.random_centers(k)

    final_responsibilities, final_clusters = Kmeans.kmeans(
        k, train_images_flat, initial_clusters,
        distfn=Distance.sumsq, method=m)

    clustering_time = timeit.default_timer() - t_cluster_begin
    # Single-argument print: same text whether print is a statement or a
    # function.
    print("Time spent clustering : " + " " + str(clustering_time))

    # The title doubles as the output sub-directory / file-name stem.
    title = m + "_" + init_type + "_cluster" + str(k)
    File.save_images(k, train_images, final_responsibilities,
                     final_clusters, title)

    # Score the clusters found on the training set.
    final, cluster_set = Accuracy.final_accuracy(
        final_responsibilities, train_labels, train_images_flat,
        final_clusters)

    # Time only the classification of the held-out set.
    t_test_begin = timeit.default_timer()
    predictions = ClassifyClusters.classify(
        cluster_set, test_images_flat, test_labels, distfn=Distance.sumsq)
    testing_time = timeit.default_timer() - t_test_begin
    print("Time spent testing : " + " " + str(testing_time))

    results = {
        "k" : k,
        "prediction_accuracy" : predictions[1],
        "cluster_means" : cluster_set,
        "cluster_stats" : final,
        "clustering_time" : clustering_time,
        "testing_time" : testing_time,
    }

    results_path = './results/' + title + '/' + title + '_results.json'
    with open(results_path, 'w') as outfile:
        json.dump(results, outfile, cls=File.NumpyEncoder)
# Example #2
# 0
def kmeans(k, training_data, initial_clusters, distfn = Distance.sumsq,
    method = "means"):
    """Cluster ``training_data`` into ``k`` groups by iterative refinement.

    Each datapoint is assigned to a cluster via ``Distance.leastsquares``
    (presumably the index of the closest center under ``distfn`` -- confirm
    against that helper), then every center is recomputed from its members.
    The two steps repeat until no assignment changes.

    Args:
        k: Number of clusters.
        training_data: 2-D array, one datapoint per row.
        initial_clusters: Starting centers, one row per cluster. This array
            is never modified; if it leaves any cluster without members,
            fresh centers are drawn with ``Initialize.random_centers(k)``
            until every cluster is represented.
        distfn: Distance function forwarded to ``Distance.leastsquares``.
        method: How a center is recomputed from its members:
            ``"means"`` (per-column mean), ``"medoids"`` (the member closest
            to the per-column mean) or ``"medians"`` (per-column median).

    Returns:
        ``(r, means)``: ``r`` is an ``(n, k)`` one-hot array of cluster
        assignments and ``means`` holds the final centers, one row per
        cluster.

    Raises:
        ValueError: If ``method`` is not one of the supported names, or if
            ``k`` exceeds the number of datapoints.
    """
    # Validate up front so a typo fails before the costly assignment pass.
    if method not in ("means", "medoids", "medians"):
        raise ValueError("Not a valid method specification; must be "
                         "'means','medoids', or 'medians'")

    n = len(training_data)  # number of training instances

    # With more clusters than datapoints some cluster is always empty, so the
    # re-initialization loop below could never terminate.
    if k > n:
        raise ValueError("k (%s) cannot exceed the number of training "
                         "instances (%s)" % (k, n))

    rows = np.arange(n)
    all_clusters = np.arange(k)

    # Index of the chosen cluster for every datapoint, shape (n,).
    newks = np.apply_along_axis(Distance.leastsquares, 1, training_data,
        initial_clusters, distfn)

    # Re-draw the centers until every cluster owns at least one datapoint.
    while not np.array_equal(np.unique(newks), all_clusters):
        initial_clusters = Initialize.random_centers(k)
        newks = np.apply_along_axis(Distance.leastsquares, 1, training_data,
            initial_clusters, distfn)

    # One-hot encode the assignments.
    r = np.zeros((n, k))
    r[rows, newks] = 1

    # Work on a private copy so the caller's array is not overwritten, and
    # promote non-float dtypes so updated centers are not truncated.
    means = np.array(initial_clusters)
    if not np.issubdtype(means.dtype, np.floating):
        means = means.astype(float)

    i = 0  # iteration counter, used only for progress output
    while True:
        for smallk in range(k):
            ones = np.where(r[:, smallk] == 1)[0]
            if ones.size == 0:
                # A cluster that lost all members keeps its previous center;
                # averaging an empty slice would yield NaN (or make argmin
                # raise for medoids).
                continue
            members = training_data[ones, :]

            if method == "means":
                means[smallk, :] = np.mean(members, axis=0)
            elif method == "medoids":
                # Pick the actual datapoint nearest to the member mean.
                dist_to_ctr = np.sum(
                    (members - np.mean(members, axis=0)) ** 2, axis=1)
                means[smallk, :] = members[np.argmin(dist_to_ctr)]
            else:  # "medians"
                means[smallk, :] = np.median(members, axis=0)

        # Re-assign every datapoint using the updated centers.
        r_new = np.zeros((n, k))
        newks = np.apply_along_axis(Distance.leastsquares, 1, training_data,
            means, distfn)
        r_new[rows, newks] = 1

        # No assignment changed: the clustering has converged.
        if np.array_equal(r_new, r):
            return r, means
        r = r_new

        # Progress: iteration number and the member count of each cluster.
        print("%s %s" % (i, r.sum(axis=0)))
        i += 1