def q6():
    """
    Run k-means clustering (9 clusters, 5 iterations) on the table loaded
    from viz.DATA_111_URL and plot the resulting clusters.

    Every row of the table becomes one single-member cluster built from the
    row's first five fields.
    """
    county_rows = viz.load_data_table(viz.DATA_111_URL)
    singletons = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in county_rows
    ]
    clusters = alg_project3.kmeans_clustering(singletons, 9, 5)
    alg_clusters_matplotlib.plot_clusters(county_rows, clusters, True)
# Example #2
0
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    #data_table = load_data_table(DATA_3108_URL)
    #data_table = load_data_from_file("unifiedCancerData_290.csv")
    #data_table = load_data_from_file("unifiedCancerData_896.csv")
    #data_table = load_data_from_file("unifiedCancerData_3108.csv")
    data_table = load_data_from_file("unifiedCancerData_111.csv")
    singleton_list = gen_singleton_list(data_table)

    #print alg_project3.fast_closest_pair(singleton_list)
    #print alg_project3.slow_closest_pairs(singleton_list)

    #cluster_list = sequential_clustering(singleton_list, 15)
    #print "Displaying", len(cluster_list), "sequential clusters"

    cluster_list = alg_project3.kmeans_clustering(singleton_list, 9, 5)
    print "Displaying", len(cluster_list), "k-means clusters"
    print "compute_distortion : ", alg_project3.compute_distortion(
        cluster_list, data_table)

    cluster_list = alg_project3.hierarchical_clustering(singleton_list, 9)
    print "Displaying", len(cluster_list), "hierarchical clusters"
    print "compute_distortion : ", alg_project3.compute_distortion(
        cluster_list, data_table)
# Example #3
0
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    #data_table = load_data_table(DATA_3108_URL)
    #data_table = load_data_from_file("unifiedCancerData_290.csv")
    #data_table = load_data_from_file("unifiedCancerData_896.csv")
    #data_table = load_data_from_file("unifiedCancerData_3108.csv")
    data_table = load_data_from_file("unifiedCancerData_111.csv")
    singleton_list = gen_singleton_list(data_table)
    

    #print alg_project3.fast_closest_pair(singleton_list)
    #print alg_project3.slow_closest_pairs(singleton_list)
        
    #cluster_list = sequential_clustering(singleton_list, 15)    
    #print "Displaying", len(cluster_list), "sequential clusters"

    cluster_list = alg_project3.kmeans_clustering(singleton_list, 9, 5)
    print "Displaying", len(cluster_list), "k-means clusters"
    print "compute_distortion : ", alg_project3.compute_distortion(cluster_list, data_table)

    cluster_list = alg_project3.hierarchical_clustering(singleton_list, 9)
    print "Displaying", len(cluster_list), "hierarchical clusters"
    print "compute_distortion : ", alg_project3.compute_distortion(cluster_list, data_table)
def q6():
    """
    Plot a 9-cluster k-means clustering (5 iterations) of the table found
    at viz.DATA_111_URL.

    The clustering starts from one single-member cluster per table row.
    """
    table = viz.load_data_table(viz.DATA_111_URL)

    # one cluster per row, constructed from the row's first five fields
    start_clusters = [
        alg_cluster.Cluster(set([entry[0]]), entry[1], entry[2], entry[3],
                            entry[4])
        for entry in table
    ]

    kmeans_result = alg_project3.kmeans_clustering(start_clusters, 9, 5)
    alg_clusters_matplotlib.plot_clusters(table, kmeans_result, True)
def q10():
    """
    Plot distortion against the number of output clusters for k-means and
    hierarchical clustering, one figure per data set.

    The tables at viz.DATA_111_URL, viz.DATA_290_URL and viz.DATA_896_URL
    are processed in turn. For each, distortion is computed for 6 through
    20 output clusters:

    * k-means is run with 5 iterations from the singleton clusters for
      every cluster count;
    * hierarchical clustering works downwards from 20 clusters, each step
      continuing from the clustering returned by the previous step.
    """
    point_counts = {viz.DATA_111_URL: 111, viz.DATA_290_URL: 290, viz.DATA_896_URL: 896}
    url_list = [viz.DATA_111_URL, viz.DATA_290_URL, viz.DATA_896_URL]

    # Defined once up front: it is needed both while clustering and while
    # plotting, and does not depend on the data set.
    cluster_range = range(6, 20 + 1)

    kmeans_dict = dict()
    hierarchical_dict = dict()

    for url in url_list:
        data_table = viz.load_data_table(url)
        singleton_list = [
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
            for line in data_table
        ]

        # kmeans
        kmeans_dict[url] = list()
        for cluster_count in cluster_range:
            cluster_list = alg_project3.kmeans_clustering(singleton_list, cluster_count, 5)
            kmeans_dict[url].append(compute_distortion(cluster_list, data_table))

        # hierarchical
        # Use the list that hierarchical_clustering RETURNS instead of
        # assuming it shrank its argument in place. Feeding each result
        # into the next (smaller) cluster count keeps the incremental
        # behaviour without depending on in-place mutation.
        hierarchical_errors = list()
        cluster_list = singleton_list
        for cluster_count in reversed(cluster_range):
            cluster_list = alg_project3.hierarchical_clustering(cluster_list, cluster_count)
            hierarchical_errors.append(compute_distortion(cluster_list, data_table))
        # errors were collected for 20 down to 6; plot order is 6 up to 20
        hierarchical_errors.reverse()
        hierarchical_dict[url] = hierarchical_errors

    for url in url_list:
        plt.title('Distortion for hierarchical and k-means clustering for '+str(point_counts[url])+' points')
        plt.xlabel('Number of clusters')
        plt.ylabel('Distortion')
        line1, = plt.plot(cluster_range, kmeans_dict[url], 'g')
        line2, = plt.plot(cluster_range, hierarchical_dict[url], 'b')
        plt.legend((line1, line2), ('kmeans clustering', 'hierarchical clustering'))
        plt.show()




#q2()
#q2()
#q3()
#q5()
#q6()
#q7()
#q10()
def q7():

	data_table = viz.load_data_table(viz.DATA_111_URL)
	singleton_list = []
	for line in data_table:
		singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

	cluster_list = alg_project3.kmeans_clustering(singleton_list, 9, 5)
	error2 = compute_distortion(cluster_list, data_table)
	
	cluster_list = alg_project3.hierarchical_clustering(singleton_list, 9)
	error1 = compute_distortion(cluster_list, data_table)

	print 'hierarchical clustering',error1
	print 'kmeans clustering', error2
def q10():
    """
    Draw, for each of three data sets, the distortion of k-means and of
    hierarchical clustering as a function of the number of output clusters.

    The data sets are the tables at viz.DATA_111_URL, viz.DATA_290_URL and
    viz.DATA_896_URL. Cluster counts run from 6 through 20. k-means is run
    with 5 iterations from the singleton clusters for every count.
    Hierarchical clustering proceeds from 20 clusters downwards, every step
    starting from the clustering the previous step returned. One figure is
    shown per data set.
    """
    point_counts = {
        viz.DATA_111_URL: 111,
        viz.DATA_290_URL: 290,
        viz.DATA_896_URL: 896
    }
    url_list = [viz.DATA_111_URL, viz.DATA_290_URL, viz.DATA_896_URL]

    # Shared by the clustering loop and the plotting loop, so it is defined
    # before either of them rather than inside the first.
    cluster_range = range(6, 20 + 1)

    kmeans_dict = dict()
    hierarchical_dict = dict()

    for url in url_list:
        data_table = viz.load_data_table(url)
        singleton_list = [
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4])
            for line in data_table
        ]

        # kmeans
        kmeans_dict[url] = list()
        for cluster_count in cluster_range:
            cluster_list = alg_project3.kmeans_clustering(
                singleton_list, cluster_count, 5)
            kmeans_dict[url].append(
                compute_distortion(cluster_list, data_table))

        # hierarchical
        # The clustering that hierarchical_clustering returns is what gets
        # measured and what the next, smaller cluster count starts from.
        # This is correct whether or not the implementation modifies the
        # list it is given.
        hierarchical_errors = list()
        cluster_list = singleton_list
        for cluster_count in reversed(cluster_range):
            cluster_list = alg_project3.hierarchical_clustering(
                cluster_list, cluster_count)
            hierarchical_errors.append(
                compute_distortion(cluster_list, data_table))
        # collected for 20 down to 6; reverse to line up with cluster_range
        hierarchical_errors.reverse()
        hierarchical_dict[url] = hierarchical_errors

    for url in url_list:
        plt.title('Distortion for hierarchical and k-means clustering for ' +
                  str(point_counts[url]) + ' points')
        plt.xlabel('Number of clusters')
        plt.ylabel('Distortion')
        line1, = plt.plot(cluster_range, kmeans_dict[url], 'g')
        line2, = plt.plot(cluster_range, hierarchical_dict[url], 'b')
        plt.legend((line1, line2),
                   ('kmeans clustering', 'hierarchical clustering'))
        plt.show()
def q7():
    """
    Report the distortion of 9-cluster hierarchical and k-means
    clusterings of the table at viz.DATA_111_URL.

    k-means uses 5 iterations. Both clusterings start from one
    single-member cluster per table row.
    """
    table = viz.load_data_table(viz.DATA_111_URL)
    start_clusters = [
        alg_cluster.Cluster(set([entry[0]]), entry[1], entry[2], entry[3],
                            entry[4])
        for entry in table
    ]

    kmeans_result = alg_project3.kmeans_clustering(start_clusters, 9, 5)
    kmeans_distortion = compute_distortion(kmeans_result, table)

    hierarchical_result = alg_project3.hierarchical_clustering(
        start_clusters, 9)
    hierarchical_distortion = compute_distortion(hierarchical_result, table)

    print('hierarchical clustering', hierarchical_distortion)
    print('kmeans clustering', kmeans_distortion)
def test_kmeans():
    """
    Test for k-means clustering
    kmeans_clustering should not mutate cluster_list, but make a new copy of each test anyways
    """
    
    # load small data table
    print
    print "Testing kmeans_clustering on 24 county set"
    data_24_table = load_data_table(DATA_24_URL)
        
    kmeansdata_24 = [[15, 1, set([('34017', '36061'), ('06037',), ('06059',), ('36047',), ('36081',), ('06071', '08031'), ('36059',), ('36005',), ('55079',), ('34013', '34039'), ('06075',), ('01073',), ('06029',), ('41051', '41067'), ('11001', '24510', '51013', '51760', '51840', '54009')])], 
                     [15, 3, set([('34017', '36061'), ('06037', '06059'), ('06071',), ('36047',), ('36081',), ('08031',), ('36059',), ('36005',), ('55079',), ('34013', '34039'), ('06075',), ('01073',), ('06029',), ('41051', '41067'), ('11001', '24510', '51013', '51760', '51840', '54009')])],
                     [15, 5, set([('34017', '36061'), ('06037', '06059'), ('06071',), ('36047',), ('36081',), ('08031',), ('36059',), ('36005',), ('55079',), ('34013', '34039'), ('06075',), ('01073',), ('06029',), ('41051', '41067'), ('11001', '24510', '51013', '51760', '51840', '54009')])],
                     [10, 1, set([('34017', '36061'), ('06029', '06037', '06075'), ('11001', '24510', '34013', '34039', '51013', '51760', '51840', '54009'), ('06059',), ('36047',), ('36081',), ('06071', '08031', '41051', '41067'), ('36059',), ('36005',), ('01073', '55079')])],
                     [10, 3, set([('34013', '34017', '36061'), ('06029', '06037', '06075'), ('08031', '41051', '41067'), ('06059', '06071'), ('34039', '36047'), ('36081',), ('36059',), ('36005',), ('01073', '55079'), ('11001', '24510', '51013', '51760', '51840', '54009')])],
                     [10, 5, set([('34013', '34017', '36061'), ('06029', '06037', '06075'), ('08031', '41051', '41067'), ('06059', '06071'), ('34039', '36047'), ('36081',), ('36059',), ('36005',), ('01073', '55079'), ('11001', '24510', '51013', '51760', '51840', '54009')])],
                     [5, 1, set([('06029', '06037', '06075'), ('01073', '11001', '24510', '34013', '34017', '34039', '36047', '51013', '51760', '51840', '54009', '55079'), ('06059',), ('36005', '36059', '36061', '36081'), ('06071', '08031', '41051', '41067')])],
                     [5, 3, set([('06029', '06037', '06075'), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013'), ('08031', '41051', '41067'), ('06059', '06071'), ('01073', '51760', '51840', '54009', '55079')])],
                     [5, 5, set([('06029', '06037', '06075'), ('08031', '41051', '41067'), ('06059', '06071'), ('01073', '55079'), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])]]    
        
    suite = poc_simpletest.TestSuite()    
    
    for num_clusters, num_iterations, expected_county_tuple in kmeansdata_24:
        
        # build initial list of clusters for each test since mutation is allowed
        cluster_list = []
        for idx in range(len(data_24_table)):
            line = data_24_table[idx]
            cluster_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

        # compute student answer
        student_clustering = student.kmeans_clustering(cluster_list, num_clusters, num_iterations)
        student_county_tuple = set_of_county_tuples(student_clustering)
        
        # Prepare test
        error_message = "Testing kmeans_custering on 24 county table, num_clusters = " + str(num_clusters)
        error_message += " num_iterations = " + str(num_iterations)
        error_message += "\nStudent county tuples: " + str(student_county_tuple)
        error_message += "\nExpected county tuples: " + str(expected_county_tuple)
        suite.run_test(student_county_tuple == expected_county_tuple, True, error_message)   

        
         

    suite.report_results()
# Example #10
0
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_896_URL)
    
    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
        
    #cluster_list = sequential_clustering(singleton_list, 15)	
    #print "Displaying", len(cluster_list), "sequential clusters"
    k_dis = []
    h_dis = []
    for cluster_num in xrange(6, 21):
        cluster_list = alg_project3.hierarchical_clustering([c.copy() for c in singleton_list], cluster_num)
        print "Displaying", len(cluster_list), "hierarchical clusters"
        h_dis.append(alg_app3.compute_distortion(cluster_list, data_table))

        cluster_list = alg_project3.kmeans_clustering([c.copy() for c in singleton_list], cluster_num, 5)	
        print "Displaying", len(cluster_list), "k-means clusters"
        k_dis.append(alg_app3.compute_distortion(cluster_list, data_table))

    xvals = range(6, 21)
    plt.plot(xvals, k_dis, '-b', label='kmeans_clustering (5 iteration)')
    plt.plot(xvals, h_dis, '-r', label='hierarchical_clustering')
    plt.legend(loc='upper right')
    plt.xlabel("number of clusters")
    plt.ylabel("distortion")
    plt.title("Distortion of 2 clustering methods: 896 counties")
    plt.show()
            
    # draw the clusters using matplotlib or simplegui
    """
def test_kmeans():
    """
    Test for k-means clustering
    kmeans_clustering should not mutate cluster_list, but make a new copy of each test anyways
    """

    # load small data table
    print
    print "Testing kmeans_clustering on 24 county set"
    data_24_table = load_data_table(DATA_24_URL)

    kmeansdata_24 = [[
        15, 1,
        set([('34017', '36061'), ('06037', ), ('06059', ), ('36047', ),
             ('36081', ), ('06071', '08031'), ('36059', ), ('36005', ),
             ('55079', ), ('34013', '34039'), ('06075', ), ('01073', ),
             ('06029', ), ('41051', '41067'),
             ('11001', '24510', '51013', '51760', '51840', '54009')])
    ],
                     [
                         15, 3,
                         set([('34017', '36061'), ('06037', '06059'),
                              ('06071', ), ('36047', ), ('36081', ),
                              ('08031', ), ('36059', ), ('36005', ),
                              ('55079', ), ('34013', '34039'), ('06075', ),
                              ('01073', ), ('06029', ), ('41051', '41067'),
                              ('11001', '24510', '51013', '51760', '51840',
                               '54009')])
                     ],
                     [
                         15, 5,
                         set([('34017', '36061'), ('06037', '06059'),
                              ('06071', ), ('36047', ), ('36081', ),
                              ('08031', ), ('36059', ), ('36005', ),
                              ('55079', ), ('34013', '34039'), ('06075', ),
                              ('01073', ), ('06029', ), ('41051', '41067'),
                              ('11001', '24510', '51013', '51760', '51840',
                               '54009')])
                     ],
                     [
                         10, 1,
                         set([('34017', '36061'), ('06029', '06037', '06075'),
                              ('11001', '24510', '34013', '34039', '51013',
                               '51760', '51840', '54009'), ('06059', ),
                              ('36047', ), ('36081', ),
                              ('06071', '08031', '41051', '41067'),
                              ('36059', ), ('36005', ), ('01073', '55079')])
                     ],
                     [
                         10, 3,
                         set([('34013', '34017', '36061'),
                              ('06029', '06037', '06075'),
                              ('08031', '41051', '41067'), ('06059', '06071'),
                              ('34039', '36047'), ('36081', ), ('36059', ),
                              ('36005', ), ('01073', '55079'),
                              ('11001', '24510', '51013', '51760', '51840',
                               '54009')])
                     ],
                     [
                         10, 5,
                         set([('34013', '34017', '36061'),
                              ('06029', '06037', '06075'),
                              ('08031', '41051', '41067'), ('06059', '06071'),
                              ('34039', '36047'), ('36081', ), ('36059', ),
                              ('36005', ), ('01073', '55079'),
                              ('11001', '24510', '51013', '51760', '51840',
                               '54009')])
                     ],
                     [
                         5, 1,
                         set([('06029', '06037', '06075'),
                              ('01073', '11001', '24510', '34013', '34017',
                               '34039', '36047', '51013', '51760', '51840',
                               '54009', '55079'), ('06059', ),
                              ('36005', '36059', '36061', '36081'),
                              ('06071', '08031', '41051', '41067')])
                     ],
                     [
                         5, 3,
                         set([('06029', '06037', '06075'),
                              ('11001', '24510', '34013', '34017', '34039',
                               '36005', '36047', '36059', '36061', '36081',
                               '51013'), ('08031', '41051', '41067'),
                              ('06059', '06071'),
                              ('01073', '51760', '51840', '54009', '55079')])
                     ],
                     [
                         5, 5,
                         set([('06029', '06037', '06075'),
                              ('08031', '41051', '41067'), ('06059', '06071'),
                              ('01073', '55079'),
                              ('11001', '24510', '34013', '34017', '34039',
                               '36005', '36047', '36059', '36061', '36081',
                               '51013', '51760', '51840', '54009')])
                     ]]

    suite = poc_simpletest.TestSuite()

    for num_clusters, num_iterations, expected_county_tuple in kmeansdata_24:

        # build initial list of clusters for each test since mutation is allowed
        cluster_list = []
        for idx in range(len(data_24_table)):
            line = data_24_table[idx]
            cluster_list.append(
                alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                    line[4]))

        # compute student answer
        student_clustering = student.kmeans_clustering(cluster_list,
                                                       num_clusters,
                                                       num_iterations)
        student_county_tuple = set_of_county_tuples(student_clustering)

        # Prepare test
        error_message = "Testing kmeans_custering on 24 county table, num_clusters = " + str(
            num_clusters)
        error_message += " num_iterations = " + str(num_iterations)
        error_message += "\nStudent county tuples: " + str(
            student_county_tuple)
        error_message += "\nExpected county tuples: " + str(
            expected_county_tuple)
        suite.run_test(student_county_tuple == expected_county_tuple, True,
                       error_message)

    suite.report_results()