import timeit
import json

import Initialize
import Kmeans
import Distance
import File
import Accuracy
import ClassifyClusters


def main(k, m="means", init_type="random"):
    # Note: train_images, train_images_flat, train_labels, test_images_flat,
    # and test_labels are expected to be defined at module level (loaded
    # elsewhere in this file).

    # Start the clustering timer
    start_cluster = timeit.default_timer()

    # Initialize clusters
    if init_type == "random":
        initial_clusters = Initialize.random_centers(k)
    else:
        init_type = "kplusplus"
        initial_clusters = Initialize.kmeans_plusplus(k, train_images_flat,
                                                      dist_fn=Distance.sumsq)

    # Run the clustering algorithm
    final_responsibilities, final_clusters = Kmeans.kmeans(
        k, train_images_flat, initial_clusters,
        distfn=Distance.sumsq, method=m)

    # Find and print the clustering time
    end_cluster = timeit.default_timer()
    clustering_time = end_cluster - start_cluster
    print("Time spent clustering: ", clustering_time)

    # Save representative images to file.
    title = m + "_" + init_type + "_cluster" + str(k)
    File.save_images(k, train_images, final_responsibilities, final_clusters,
                     title)

    ###########################################################################
    #                           Calculate Accuracy                            #
    ###########################################################################

    # Calculate final accuracy for the clusters
    final, cluster_set = Accuracy.final_accuracy(final_responsibilities,
                                                 train_labels,
                                                 train_images_flat,
                                                 final_clusters)

    # Now see how well we can classify the test set
    start_cluster_test = timeit.default_timer()
    predictions = ClassifyClusters.classify(cluster_set, test_images_flat,
                                            test_labels,
                                            distfn=Distance.sumsq)
    finish_cluster_test = timeit.default_timer()

    # Find the time it took to test
    testing_time = finish_cluster_test - start_cluster_test
    print("Time spent testing: ", testing_time)

    ###########################################################################
    #                                Outputs                                  #
    ###########################################################################

    # Collect k, prediction accuracy, cluster centers/stats, and timings,
    # then dump them to a JSON results file.
    results = {"k": k,
               "prediction_accuracy": predictions[1],
               "cluster_means": cluster_set,
               "cluster_stats": final,
               "clustering_time": clustering_time,
               "testing_time": testing_time}

    with open('./results/' + title + '/' + title + '_results.json', 'w') as outfile:
        json.dump(results, outfile, cls=File.NumpyEncoder)
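# A minimal driver sketch, assuming this module is run as a script. The
# argparse flags below (--k, --method, --init) are illustrative only and not
# part of the original code; they simply show how main() might be invoked for
# a single clustering run.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Cluster the image data with k-means variants.")
    parser.add_argument("--k", type=int, default=10,
                        help="number of clusters")
    parser.add_argument("--method", default="means",
                        choices=["means", "medoids", "medians"],
                        help="center-update rule passed to Kmeans.kmeans")
    parser.add_argument("--init", default="random",
                        choices=["random", "kplusplus"],
                        help="center initialization strategy")
    args = parser.parse_args()

    main(args.k, m=args.method, init_type=args.init)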
import numpy as np

import Distance
import Initialize


def kmeans(k, training_data, initial_clusters, distfn=Distance.sumsq,
           method="means"):
    n = len(training_data)  # number of training instances
    i = 0                   # keep track of the iteration
    r = np.zeros((n, k))    # empty array to store cluster assignments

    # Find and store the k that minimizes the sum-of-squares distance for each
    # image, as a vector of shape (n,).
    newks = np.apply_along_axis(Distance.leastsquares, 1, training_data,
                                initial_clusters, distfn)

    # Check that every cluster is represented within the cluster assignments;
    # if not, re-initialize the centers.
    while not np.array_equal(np.unique(newks), np.arange(k)):
        initial_clusters = Initialize.random_centers(k)
        newks = np.apply_along_axis(Distance.leastsquares, 1, training_data,
                                    initial_clusters, distfn)

    # Create a one-hot-coded vector for each image to signify its cluster
    # assignment.
    r[range(n), newks] = 1

    # Create a "means" array to store the cluster centers as they are updated.
    # Copy so that the caller's initial_clusters is not modified in place.
    means = np.copy(initial_clusters)

    # Find new means
    while True:
        for smallk in range(k):  # iterate through clusters
            ones = np.where(r[:, smallk] == 1)[0]

            # The k-means method updates cluster centers to be the mean of
            # each corresponding pixel of the datapoints contained in that
            # cluster.
            if method == "means":
                means[smallk, :] = np.mean(training_data[list(ones), :],
                                           axis=0)

            # The k-medoids method updates cluster centers to be the closest
            # *datapoint* to the per-pixel mean of the datapoints contained
            # in that cluster.
            elif method == "medoids":
                dist_to_ctr = np.sum(
                    (training_data[list(ones), :] -
                     np.mean(training_data[list(ones), :], axis=0))**2,
                    axis=1)
                means[smallk, :] = training_data[list(ones), :][np.argmin(dist_to_ctr)]

            # The k-medians method updates cluster centers to be the median of
            # each corresponding pixel of the datapoints contained in that
            # cluster.
            elif method == "medians":
                means[smallk, :] = np.median(training_data[list(ones), :],
                                             axis=0)

            # If no valid value is chosen for method, raise an error.
            else:
                raise ValueError("Not a valid method specification; must be "
                                 "'means', 'medoids', or 'medians'")

        # Update responsibilities by minimizing the distance metric.
        r_new = np.zeros((n, k))
        # Store indices of the k's that minimize the distance metric.
        newks = np.apply_along_axis(Distance.leastsquares, 1, training_data,
                                    means, distfn)
        r_new[range(n), newks] = 1

        # If none of the responsibilities change, we've reached the optimal
        # cluster assignments.
        if np.all((r_new - r) == 0):
            print('finished')
            return r, means
        else:
            r = r_new

        # After each iteration, print the iteration number and the number of
        # images assigned to each cluster.
        print(i, r.sum(axis=0))
        i += 1
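# An illustrative usage sketch, not part of the original code. It clusters a
# small synthetic dataset, assuming the project's Distance and Initialize
# modules are importable and that Initialize.random_centers(k) returns centers
# with the same number of features (784) as the flattened images.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    fake_images = rng.rand(100, 784)      # 100 flattened 28x28 "images"
    init = Initialize.random_centers(3)   # 3 starting centers (assumed 3x784)

    resp, centers = kmeans(3, fake_images, init,
                           distfn=Distance.sumsq, method="medians")

    # resp is the (100, 3) one-hot responsibility matrix; centers holds the
    # final cluster centers.
    print(resp.shape, centers.shape)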