# Example no. 1
0
def main(k, m="means", init_type="random"):
    """Run one clustering experiment end to end and write its results to disk.

    Steps, in order: choose initial centers, run ``Kmeans.kmeans`` on the
    training set, save representative images via ``File.save_images``, score
    the clusters with ``Accuracy.final_accuracy``, classify the test set with
    ``ClassifyClusters.classify``, and dump a summary to
    ``./results/<title>/<title>_results.json``.

    Reads the module-level globals ``train_images``, ``train_images_flat``,
    ``train_labels``, ``test_images_flat`` and ``test_labels``.

    Args:
        k: Number of clusters; forwarded to the initializer, to
            ``Kmeans.kmeans`` and to ``File.save_images``.
        m: Forwarded to ``Kmeans.kmeans`` as ``method`` and used as the
            prefix of the output title.
        init_type: ``"random"`` selects ``Initialize.random_centers``; any
            other value selects ``Initialize.kmeans_plusplus`` and is
            recorded as ``"kplusplus"`` in the output title.

    Returns:
        None. The two timings are printed and the results are written as JSON.
    """
    # Starting clustering timer
    start_cluster = timeit.default_timer()

    # Initialize clusters
    if init_type == "random":
        initial_clusters = Initialize.random_centers(k)
    else:
        # Normalize every non-"random" value so the output title is stable.
        init_type = "kplusplus"
        # NOTE(review): the keyword here is ``dist_fn`` while Kmeans.kmeans and
        # ClassifyClusters.classify below take ``distfn`` -- confirm against
        # the Initialize.kmeans_plusplus signature.
        initial_clusters = Initialize.kmeans_plusplus(
            k, train_images_flat, dist_fn=Distance.sumsq)

    # Run clustering algorithm
    final_responsibilities, final_clusters = Kmeans.kmeans(
        k, train_images_flat, initial_clusters,
        distfn=Distance.sumsq, method=m)

    # Find and print clustering time
    end_cluster = timeit.default_timer()
    clustering_time = end_cluster - start_cluster
    # A single pre-formatted string keeps this valid on both Python 2 and 3
    # and emits exactly what the old ``print a, b`` statement did (hence the
    # two spaces after the colon).
    print("Time spent clustering :  %s" % clustering_time)

    # Save representative images to file.
    title = m + "_" + init_type + "_cluster" + str(k)
    File.save_images(k, train_images, final_responsibilities,
                     final_clusters, title)

    ###########################################################################
    #                           Calculate Accuracy                            #
    ###########################################################################

    # Calculate final accuracy for clusters
    final, cluster_set = Accuracy.final_accuracy(
        final_responsibilities, train_labels, train_images_flat,
        final_clusters)

    # Now see how well we can classify the dataset; only the classify call
    # itself is timed.
    start_cluster_test = timeit.default_timer()
    predictions = ClassifyClusters.classify(
        cluster_set, test_images_flat, test_labels, distfn=Distance.sumsq)
    finish_cluster_test = timeit.default_timer()

    # Find the time it took to test
    testing_time = finish_cluster_test - start_cluster_test
    print("Time spent testing :  %s" % testing_time)

    ###########################################################################
    #                                 Outputs                                 #
    ###########################################################################

    # Element 1 of the classify result is what gets stored as the
    # prediction accuracy.
    results = {
        "k": k,
        "prediction_accuracy": predictions[1],
        "cluster_means": cluster_set,
        "cluster_stats": final,
        "clustering_time": clustering_time,
        "testing_time": testing_time,
    }

    # NOTE(review): the ./results/<title>/ directory is not created here;
    # presumably File.save_images has already created it -- confirm.
    with open('./results/' + title + '/' + title + '_results.json', 'w') as outfile:
        json.dump(results, outfile, cls=File.NumpyEncoder)
test_images, test_labels = File.load_mnist("testing", path=os.getcwd())
# Flatten every image into a 1-D row so each dataset becomes one 2-D array
# (the original note gives 60,000 x 784 for the training set).
train_images_flat = np.array([np.ravel(img) for img in train_images])
test_images_flat = np.array([np.ravel(img) for img in test_images])

###############################################################################
#                               Run Scikit_learn                              #
###############################################################################
k = int(sys.argv[1])  # number of clusters (system argument)

# Train k means model
kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
kmeans_fit = kmeans.fit(train_images_flat)
# Cluster assignment of each training image, and the fitted cluster centers.
kmeans_labels = kmeans_fit.labels_
kmeans_centers = kmeans_fit.cluster_centers_

# Build the responsibilities as a one-hot matrix: row i gets a 1 in the
# column of the cluster that image i was assigned to. A single integer-array
# assignment replaces the per-row Python loop.
final_responsibilities = np.zeros((len(train_images_flat), k))
final_responsibilities[np.arange(len(train_images_flat)), kmeans_labels] = 1


# Obtain predictions for each test point.
# NOTE(review): Z is not used by anything visible here.
Z = kmeans.predict(test_images_flat)

# Determine accuracies.
# NOTE(review): the return value is discarded at this call site.
Accuracy.final_accuracy(final_responsibilities, train_labels,
    train_images_flat, kmeans_centers)