def test_hierarchical24(): """ Test for hierarchical clustering Note that hierarchical_clustering mutates cluster_list """ # load small data table print print "Testing hierarchical_clustering on 24 county set" data_24_table = load_data_table(DATA_24_URL) # test data of the form [size of output cluster, sets of county tuples] hierdata_24 = [[23, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36061',), ('36005',), ('36047',), ('36059',), ('36081',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])], [22, set([('11001', '51013'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36061',), ('36005',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])], [21, set([('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])], [20, set([('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34039',), ('34013', '34017'), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])], [19, set([('34013', '34017', '34039'), ('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])], [18, set([('34013', '34017', '34039'), ('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('36005', '36047', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])], [17, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('34013', '34017', '34039', '36005', '36047', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])], [16, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])], [15, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',)])], [14, set([('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013')])], [13, set([('06037', '06059'), ('01073',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013')])], [12, set([('06037', '06059'), ('01073',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013', '51840')])], [11, set([('06029', '06037', '06059'), ('01073',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013', '51840')])], [10, set([('06029', '06037', '06059'), ('01073',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '51013', '51760', '51840')])], [9, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '51013', '51760', '51840')])], [8, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840')])], [7, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('41051', '41067'), ('55079',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])], [6, set([('06029', '06037', '06059', '06071', '06075'), ('01073',), ('08031',), ('41051', '41067'), ('55079',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])], [5, set([('06029', '06037', '06059', '06071', '06075'), ('08031',), ('41051', '41067'), ('01073', '55079'), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])], [4, set([('06029', '06037', '06059', '06071', '06075'), ('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('08031',), ('41051', '41067')])], [3, set([('06029', '06037', '06059', '06071', '06075', '41051', '41067'), ('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('08031',)])], [2, set([('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('06029', '06037', '06059', '06071', '06075', '08031', '41051', '41067')])], ] suite = poc_simpletest.TestSuite() for num_clusters, expected_county_tuple in hierdata_24: # build initial list of clusters for each test since mutation is allowed cluster_list = [] for idx in range(len(data_24_table)): line = data_24_table[idx] cluster_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) # compute student answer student_clustering = student.hierarchical_clustering(cluster_list, num_clusters) student_county_tuple = set_of_county_tuples(student_clustering) # Prepare test error_message = "Testing hierarchical_clustering on 24 county table, num_clusters = " + str(num_clusters) error_message += "\nStudent county tuples: " + str(student_county_tuple) error_message += "\nExpected county tuples: " + str(expected_county_tuple) suite.run_test(student_county_tuple == expected_county_tuple, True, error_message) suite.report_results()
def calculate_distortion(): data_table = load_data_table(DATA_111_URL) singleton_list = [] for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) cluster_list_hierarchical = closest_pairs_and_clustering_algorithms.hierarchical_clustering( singleton_list, 9) cluster_list_kmeans = closest_pairs_and_clustering_algorithms.kmeans_clustering( singleton_list, 9, 5) distortion_hierarchical = closest_pairs_and_clustering_algorithms.compute_distortion( cluster_list_hierarchical, data_table) distortion_kmeans = closest_pairs_and_clustering_algorithms.compute_distortion( cluster_list_kmeans, data_table) print "distortion_hierarchical: ", distortion_hierarchical print "distortion_kmeans: ", distortion_kmeans
def compare_distortion(): distortion_hierarchical_111 = [] distortion_kmeans_111 = [] distortion_hierarchical_290 = [] distortion_kmeans_290 = [] distortion_hierarchical_896 = [] distortion_kmeans_896 = [] data_table = load_data_table(DATA_111_URL) singleton_list = [] for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) for num in xrange(6, 21): cluster_list_hierarchical = closest_pairs_and_clustering_algorithms.hierarchical_clustering( singleton_list[:], num) cluster_list_kmeans = closest_pairs_and_clustering_algorithms.kmeans_clustering( singleton_list[:], num, 5) distortion_hierarchical_111.append( closest_pairs_and_clustering_algorithms.compute_distortion( cluster_list_hierarchical, data_table)) distortion_kmeans_111.append( closest_pairs_and_clustering_algorithms.compute_distortion( cluster_list_kmeans, data_table)) print "distortion_hierarchical_111: ", distortion_hierarchical_111 print "distortion_kmeans_111: ", distortion_kmeans_111 data_table = load_data_table(DATA_290_URL) singleton_list = [] for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) for num in xrange(6, 21): cluster_list_hierarchical = closest_pairs_and_clustering_algorithms.hierarchical_clustering( singleton_list[:], num) cluster_list_kmeans = closest_pairs_and_clustering_algorithms.kmeans_clustering( singleton_list[:], num, 5) distortion_hierarchical_290.append( closest_pairs_and_clustering_algorithms.compute_distortion( cluster_list_hierarchical, data_table)) distortion_kmeans_290.append( closest_pairs_and_clustering_algorithms.compute_distortion( cluster_list_kmeans, data_table)) print "distortion_hierarchical_290: ", distortion_hierarchical_290 print "distortion_kmeans_290: ", distortion_kmeans_290 data_table = load_data_table(DATA_896_URL) singleton_list = [] for line in data_table: singleton_list.append( alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])) for num in xrange(6, 21): cluster_list_hierarchical = closest_pairs_and_clustering_algorithms.hierarchical_clustering( singleton_list[:], num) cluster_list_kmeans = closest_pairs_and_clustering_algorithms.kmeans_clustering( singleton_list[:], num, 5) distortion_hierarchical_896.append( closest_pairs_and_clustering_algorithms.compute_distortion( cluster_list_hierarchical, data_table)) distortion_kmeans_896.append( closest_pairs_and_clustering_algorithms.compute_distortion( cluster_list_kmeans, data_table)) print "distortion_hierarchical_896: ", distortion_hierarchical_896 print "distortion_kmeans_896: ", distortion_kmeans_896