Exemplo n.º 1
0
def question10(data, filename):
    """Plot hierarchical vs. k-means distortion for 6 to 20 output clusters.

    data     -- data-set identifier passed to load_data_table / load_as_list
                and interpolated into the plot title
    filename -- path the finished figure is saved to

    Saves the figure, prints a confirmation line and returns None.
    """
    cluster_counts = range(6, 21)
    table = load_data_table(data)
    hier_curve = []

    def record(clusters):
        # Callback handed to hierarchical_clustering: stores one distortion
        # value each time it is invoked.
        hier_curve.append(distortion(clusters, table))

    # NOTE(review): presumably the callback fires once for every cluster count
    # in the supplied set while merging down to 6 (largest count first), which
    # is why the collected values are reversed afterwards -- confirm against
    # hierarchical_clustering.
    hierarchical_clustering(load_as_list(data), 6, record,
                            set(cluster_counts))
    hier_curve.reverse()

    # k-means gets its own freshly loaded cluster list.
    kmeans_input = load_as_list(data)
    kmeans_curve = []
    for count in cluster_counts:
        kmeans_curve.append(
            distortion(kmeans_clustering(kmeans_input, count, 5), table))

    plt.cla()
    plt.plot(cluster_counts, hier_curve, '-r',
             label='Hierarchical clustering distortion')
    plt.plot(cluster_counts, kmeans_curve, '-b',
             label='k-means clustering distortion')
    plt.title('Clustering distortion (%s)' % data)
    plt.xlabel('Number of output clusters')
    plt.ylabel('Distortion')
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(filename)
    print('Saved plot to %s' % filename)
def clustering_distortion(data_url, cluster_method):
    """Return the distortions for 6 to 20 output clusters.

    Input: a data_url for the cancer data and the clustering function to
    use, either des.kmeans_clustering or des.hierarchical_clustering.

    Output: a list of 15 distortions ordered from 6 clusters up to 20
    clusters, or the string "Invalid cluster_method" when cluster_method
    is neither of the two supported functions.
    """
    cluster_list = des.cluster_lst(data_url)

    if cluster_method == des.kmeans_clustering:
        # One independent k-means run (5 iterations) per cluster count.
        return [
            compute_distortion(
                des.kmeans_clustering(cluster_list, num_clstr, 5), data_url)
            for num_clstr in range(6, 21)
        ]

    if cluster_method == des.hierarchical_clustering:
        # Reduce to 20 clusters once, then derive every smaller clustering
        # from that 20-cluster result.
        base_clusters = des.hierarchical_clustering(cluster_list, 20)
        descending = [compute_distortion(base_clusters, data_url)]
        for num_clstr in range(19, 5, -1):
            merged = des.hierarchical_clustering(base_clusters, num_clstr)
            descending.append(compute_distortion(merged, data_url))
        # Values were collected for 20 -> 6; callers get them as 6 -> 20.
        descending.reverse()
        return descending

    return "Invalid cluster_method"
Exemplo n.º 3
0
    def test_hierarchical_clustering2(self):
        """Asking for as many clusters as were supplied returns them as-is."""
        first = p.c.Cluster(set(["Al"]), 1.1, 10, 10, 0)
        second = p.c.Cluster(set(["DK"]), 1, 2, 10, 0.01)

        # (input clusters, requested count, expected output)
        cases = [
            ([], 0, []),
            ([first], 1, [first]),
            ([first, second], 2, [first, second]),
        ]
        for given, count, expected in cases:
            self.assertEqual(p.hierarchical_clustering(given, count),
                             expected)
Exemplo n.º 4
0
def run_example():
    """
    Compute clustering distortions on the DATA_290 table for 6 to 20
    output clusters.

    Returns [hierarchical_distortions, kmeans_distortions]; both lists are
    ordered from 6 clusters up to 20 clusters.
    """
    data_table = load_data_table_local(DATA_290)

    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]
    # Independent copies for k-means, taken before the hierarchical runs.
    kmeans_input = [cluster.copy() for cluster in singleton_list]

    # Hierarchical: each clustering is derived from the previous, larger one
    # (20 -> 6), then the results are flipped into ascending order.
    hierarchical_distortions = []
    current = singleton_list
    for num_clusters in range(20, 5, -1):
        current = alg_project3_solution.hierarchical_clustering(
            current, num_clusters)
        hierarchical_distortions.append(
            compute_distortion(current, data_table))
    hierarchical_distortions.reverse()

    # k-means: one independent 5-iteration run per cluster count.
    kmeans_distortions = [
        compute_distortion(
            alg_project3_solution.kmeans_clustering(
                kmeans_input, num_clusters, 5), data_table)
        for num_clusters in range(6, 21)
    ]

    return [hierarchical_distortions, kmeans_distortions]
Exemplo n.º 5
0
    def test_four_pairs(self):
        """Both algorithms must produce the same two groups of codes."""
        # Four clusters at (0, 0) and four at (10, 10).
        specs = [
            (set([1]), 0, 0, 13, 0.1),
            (set([2]), 0, 0, 14, 0.2),
            (set([3]), 0, 0, 14, 0.2),
            (set([4]), 0, 0, 14, 0.2),
            (set([10]), 10, 10, 14, 0.2),
            (set([11]), 10, 10, 14, 0.2),
            (set([12]), 10, 10, 14, 0.2),
            (set([13]), 10, 10, 14, 0.2),
        ]
        # NOTE(review): the same object `s` is handed to both algorithms; if
        # hierarchical_clustering mutates its argument, kmeans_clustering
        # sees the already-merged clusters -- confirm this is intended.
        s = map(lambda spec: Cluster(*spec), specs)

        h = [cluster.fips_codes() for cluster in hierarchical_clustering(s, 2)]
        k = [cluster.fips_codes() for cluster in kmeans_clustering(s, 2, 10)]

        # Compare as sorted lists of lists so the comparison is order-stable.
        h = sorted(list(codes) for codes in h)
        k = sorted(list(codes) for codes in k)

        self.assertEqual(h, k)
Exemplo n.º 6
0
    def test_one_pair(self):
        """Merging two clusters into one keeps both codes."""
        specs = [(set([5]), 0, 0, 13, 0.1),
                 (set([10]), 42, 0, 14, 0.2)]
        s = map(lambda spec: Cluster(*spec), specs)

        # Union of the codes of every returned cluster.
        merged_codes = set()
        for cluster in hierarchical_clustering(s, 1):
            merged_codes.update(cluster.fips_codes())

        self.assertEqual(set([5, 10]), merged_codes)
Exemplo n.º 7
0
def q10_legend(DATA_URL):
    """
    Plot hierarchical and k-means distortion for 20 down to 6 output
    clusters of the data table loaded from DATA_URL, with a legend.
    """
    data_table = load_data_table(DATA_URL)

    def make_singletons():
        # One single-entry cluster per row of the data table.
        return [
            alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
            for row in data_table
        ]

    singleton_list = make_singletons()
    hierarchical_cluster_list = make_singletons()

    hierarchical_curve = []
    kmeans_curve = []
    for num_clusters in range(20, 5, -1):
        # Hierarchical clustering continues from the previous, larger result.
        hierarchical_cluster_list = alg_project3_solution.hierarchical_clustering(
            hierarchical_cluster_list, num_clusters)
        hierarchical_curve.append(
            [num_clusters,
             compute_distortion(hierarchical_cluster_list, data_table)])

        # k-means always starts from the full singleton list.
        kmeans_cluster_list = alg_project3_solution.kmeans_clustering(
            singleton_list, num_clusters, 5)
        kmeans_curve.append(
            [num_clusters,
             compute_distortion(kmeans_cluster_list, data_table)])

    title = ("The distortion of output clusters uesd " +
             str(len(data_table)) + "-county data set")
    simpleplot.plot_lines(
        title, 800, 600, "the number of output clusters",
        "the distortion associated with each output clustering",
        [hierarchical_curve, kmeans_curve], True,
        ["hierarchical cluster", "kmeans cluster"])
    def test_four_pairs(self):
        """Hierarchical and k-means clustering must agree on the two groups."""
        # Four clusters at (0, 0) followed by four at (10, 10).
        specs = [
            (set([1]), 0, 0, 13, 0.1),
            (set([2]), 0, 0, 14, 0.2),
            (set([3]), 0, 0, 14, 0.2),
            (set([4]), 0, 0, 14, 0.2),
            (set([10]), 10, 10, 14, 0.2),
            (set([11]), 10, 10, 14, 0.2),
            (set([12]), 10, 10, 14, 0.2),
            (set([13]), 10, 10, 14, 0.2),
        ]
        # NOTE(review): `s` is shared by both calls below; if
        # hierarchical_clustering mutates its input, kmeans_clustering runs
        # on the merged clusters -- confirm this is intended.
        s = map(lambda spec: Cluster(*spec), specs)

        h = [cluster.fips_codes() for cluster in hierarchical_clustering(s, 2)]
        k = [cluster.fips_codes() for cluster in kmeans_clustering(s, 2, 10)]

        # Normalise to sorted lists of lists before comparing.
        h = sorted(list(codes) for codes in h)
        k = sorted(list(codes) for codes in k)

        self.assertEqual(h, k)
Exemplo n.º 9
0
def q10():
    """
    Plot the distortion of hierarchical vs. k-means clustering for 6 to 20
    output clusters on the 896-county data set.

    Reads 'unifiedCancerData_896.csv' from the working directory, prints the
    two distortion lists (hierarchical first, then k-means) and shows the
    matplotlib figure. Returns None.
    """
    # range (not xrange) behaves the same here and is valid on Python 2 and 3.
    sizes = range(6, 21)

    # Parse the CSV inside a with-block so the handle is always closed, and
    # skip blank lines (e.g. the one produced by a trailing newline) instead
    # of failing on them with an IndexError.
    data_table = []
    with open('unifiedCancerData_896.csv', 'r') as data_file:
        for line in data_file:
            if not line.strip():
                continue
            tokens = line.split(',')
            data_table.append([tokens[0], float(tokens[1]), float(tokens[2]),
                               int(tokens[3]), float(tokens[4])])

    def make_singletons():
        # One single-county cluster per table row.
        return [alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
                for row in data_table]

    t1 = []  # hierarchical distortions, one per entry of sizes
    t2 = []  # k-means distortions, one per entry of sizes
    for item in sizes:
        # Every run gets its own freshly built singleton list.
        cluster_list = alg_project3_solution.hierarchical_clustering(
            make_singletons(), item)
        t1.append(compute_distortion(cluster_list, data_table))
        cluster_list1 = alg_project3_solution.kmeans_clustering(
            make_singletons(), item, 5)
        t2.append(compute_distortion(cluster_list1, data_table))

    # Single-argument print(...) produces identical output on Python 2 and 3.
    print(t1)
    print(t2)

    plt.plot(sizes, t1, 'r-', label='hierarchical_clustering')
    plt.plot(sizes, t2, 'b-', label='kmeans_clustering')
    plt.title('CancerData_896')
    plt.xlabel('Number of output clusters')
    plt.ylabel('Distortion associated with each output clustering')
    plt.legend(loc='upper right')
    plt.show()
Exemplo n.º 10
0
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_3108_URL)
    
    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
        
    #cluster_list = sequential_clustering(singleton_list, 15)	
    #print "Displaying", len(cluster_list), "sequential clusters"

    cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 15)
    print "Displaying", len(cluster_list), "hierarchical clusters"

    #cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5)	
    #print "Displaying", len(cluster_list), "k-means clusters"

            
    # draw the clusters using matplotlib or simplegui
    if DESKTOP:
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)
        #alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)  #add cluster centers
    else:
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)   # use toggle in GUI to add cluster centers
Exemplo n.º 11
0
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_290_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))
    #cluster_list = sequential_clustering(singleton_list, 15)
    #print "Displaying", len(cluster_list), "sequential clusters"
    cluster_list = alg_project3_solution.hierarchical_clustering(
        list(singleton_list), 16)
    print "Displaying", len(cluster_list), "hierarchical clusters"
    print "Distortion", compute_distortion(cluster_list, data_table)
    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))
    cluster_list2 = alg_project3_solution.kmeans_clustering(
        list(singleton_list), 16, 5)
    print "Displaying", len(cluster_list2), "k-means clusters"
    print "Distortion", compute_distortion(cluster_list2, data_table)
Exemplo n.º 12
0
    def test_hierarchical_clustering6(self):
        """Four clusters reduced to two: 0, 1 and 3 merge, 2 stays alone."""
        originals = [
            p.c.Cluster(set(["Al"]), 1.1, 10, 10, 0),
            p.c.Cluster(set(["DK"]), 1, 2, 10, 0.01),
            p.c.Cluster(set(["SW"]), 4, 60, 10, 0.05),
            p.c.Cluster(set(["Brasil"]), 4, 2, 100000000, 3),
        ]
        # Copies are taken before clustering so the expected value is built
        # from clusters the call under test never touched.
        clone0, clone1, clone2, clone3 = [item.copy() for item in originals]

        result = p.hierarchical_clustering(list(originals), 2)
        result_str = "".join(str(res) for res in result)

        expected = [
            clone0.merge_clusters(clone1.merge_clusters(clone3)), clone2
        ]
        exp_str = "".join(str(exp) for exp in expected)

        # Clusters are compared through their string representation.
        self.assertEqual(result_str, exp_str)
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    # data_table = load_data_table(DATA_3108_URL)

    print 'in run_example'

    k_n=[]
    h_n=[]
    for x in range(6,21):
        print '------>:',x,'<-----\n'
        # kmeans
        data_table=load_data_table(DATA_111_URL)
        singleton_list=[]
        for line in data_table:
            singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
        cluster_list_k = alg_project3_solution.kmeans_clustering(singleton_list, x, 5)
        kmeans=reduce(lambda x,y:x+y,map(lambda x:x.cluster_error(data_table),cluster_list_k))
        k_n.append(kmeans)

        #hierarchical
        data_table=load_data_table(DATA_111_URL)
        singleton_list=[]
        for line in data_table:
            singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
        cluster_list_h = alg_project3_solution.hierarchical_clustering(singleton_list, x)
        hierarchical=reduce(lambda x,y:x+y,map(lambda x:x.cluster_error(data_table),cluster_list_h))
        h_n.append(hierarchical)
    print 'kmean:',k_n
    print 'hierarchical:',h_n
Exemplo n.º 14
0
def question5(filename):
    """Visualize 9 hierarchical clusters of the 111-county data set and
    print the distortion of the resulting clustering.

    filename -- passed through to visualize().
    """
    data = 'unifiedCancerData_111.csv'

    def cluster_into_nine(cluster_list):
        return hierarchical_clustering(cluster_list, 9)

    # visualize() runs before the table is loaded, as its return value is
    # the first argument to distortion().
    clusters = visualize(data, filename, cluster_into_nine)
    dist = distortion(clusters, load_data_table(data))

    message = 'Distortion in question5, hierarchical_clustering = %f (%s)'
    print(message % (dist, dist))
Exemplo n.º 15
0
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_3108_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    #cluster_list = sequential_clustering(singleton_list, 15)
    #print "Displaying", len(cluster_list), "sequential clusters"

    cluster_list = alg_project3_solution.hierarchical_clustering(
        singleton_list, 15)
    print "Displaying", len(cluster_list), "hierarchical clusters"

    #cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, 9, 5)
    #print "Displaying", len(cluster_list), "k-means clusters"

    # draw the clusters using matplotlib or simplegui
    if DESKTOP:
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)
        #alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)  #add cluster centers
    else:
        alg_clusters_simplegui.PlotClusters(
            data_table,
            cluster_list)  # use toggle in GUI to add cluster centers
def test_hierarchical24():
    """
    Test for hierarchical clustering
    Note that hierarchical_clustering mutates cluster_list
    """
    
    # load small data table
    print
    print "Testing hierarchical_clustering on 24 county set"
    data_24_table = load_data_table(DATA_24_URL)
    
    
    # test data of the form [size of output cluster, sets of county tuples]
    hierdata_24 = [[23, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36061',), ('36005',), ('36047',), ('36059',), ('36081',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [22, set([('11001', '51013'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36061',), ('36005',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [21, set([('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [20, set([('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34039',), ('34013', '34017'), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [19, set([('34013', '34017', '34039'), ('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [18, set([('34013', '34017', '34039'), ('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('36005', '36047', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [17, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('34013', '34017', '34039', '36005', '36047', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [16, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [15, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [14, set([('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013')])],
                   [13, set([('06037', '06059'), ('01073',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013')])],
                   [12, set([('06037', '06059'), ('01073',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013', '51840')])],
                   [11, set([('06029', '06037', '06059'), ('01073',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013', '51840')])],
                   [10, set([('06029', '06037', '06059'), ('01073',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '51013', '51760', '51840')])],
                   [9, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '51013', '51760', '51840')])],
                   [8, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840')])],
                   [7, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('41051', '41067'), ('55079',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [6, set([('06029', '06037', '06059', '06071', '06075'), ('01073',), ('08031',), ('41051', '41067'), ('55079',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [5, set([('06029', '06037', '06059', '06071', '06075'), ('08031',), ('41051', '41067'), ('01073', '55079'), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [4, set([('06029', '06037', '06059', '06071', '06075'), ('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('08031',), ('41051', '41067')])],
                   [3, set([('06029', '06037', '06059', '06071', '06075', '41051', '41067'), ('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('08031',)])],
                   [2, set([('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('06029', '06037', '06059', '06071', '06075', '08031', '41051', '41067')])],
                   ]

        
    suite = poc_simpletest.TestSuite()
    
    for num_clusters, expected_county_tuple in hierdata_24:
        
        # build initial list of clusters for each test since mutation is allowed
        cluster_list = []
        for idx in range(len(data_24_table)):
            line = data_24_table[idx]
            cluster_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

        # compute student answer
        student_clustering = student.hierarchical_clustering(cluster_list, num_clusters)
        student_county_tuple = set_of_county_tuples(student_clustering)
        
        # Prepare test
        error_message = "Testing hierarchical_clustering on 24 county table, num_clusters = " + str(num_clusters)
        error_message += "\nStudent county tuples: " + str(student_county_tuple)
        error_message += "\nExpected county tuples: " + str(expected_county_tuple)
        suite.run_test(student_county_tuple == expected_county_tuple, True, error_message)

    suite.report_results()
Exemplo n.º 17
0
    def test_one_pair(self):
        """Two clusters merged into one must carry both codes."""
        specs = [(set([5]), 0, 0, 13, 0.1),
                 (set([10]), 42, 0, 14, 0.2)]
        s = map(lambda spec: Cluster(*spec), specs)

        # Collect the codes of every cluster in the result.
        merged_codes = set()
        for cluster in hierarchical_clustering(s, 1):
            merged_codes.update(cluster.fips_codes())

        self.assertEqual(set([5, 10]), merged_codes)
def test_hierarchical24():
    """
    Test for hierarchical clustering
    Note that hierarchical_clustering mutates cluster_list
    """

    # load small data table
    print
    print "Testing hierarchical_clustering on 24 county set"
    data_24_table = load_data_table(DATA_24_URL)

    # test data of the form [size of output cluster, sets of county tuples]
    hierdata_24 = [[23, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36061',), ('36005',), ('36047',), ('36059',), ('36081',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [22, set([('11001', '51013'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36061',), ('36005',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [21, set([('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [20, set([('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34039',), ('34013', '34017'), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [19, set([('34013', '34017', '34039'), ('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [18, set([('34013', '34017', '34039'), ('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('36005', '36047', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [17, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('34013', '34017', '34039', '36005', '36047', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [16, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [15, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [14, set([('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013')])],
                   [13, set([('06037', '06059'), ('01073',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013')])],
                   [12, set([('06037', '06059'), ('01073',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013', '51840')])],
                   [11, set([('06029', '06037', '06059'), ('01073',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013', '51840')])],
                   [10, set([('06029', '06037', '06059'), ('01073',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '51013', '51760', '51840')])],
                   [9, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '51013', '51760', '51840')])],
                   [8, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840')])],
                   [7, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('41051', '41067'), ('55079',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [6, set([('06029', '06037', '06059', '06071', '06075'), ('01073',), ('08031',), ('41051', '41067'), ('55079',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [5, set([('06029', '06037', '06059', '06071', '06075'), ('08031',), ('41051', '41067'), ('01073', '55079'), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [4, set([('06029', '06037', '06059', '06071', '06075'), ('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('08031',), ('41051', '41067')])],
                   [3, set([('06029', '06037', '06059', '06071', '06075', '41051', '41067'), ('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('08031',)])],
                   [2, set([('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('06029', '06037', '06059', '06071', '06075', '08031', '41051', '41067')])],
                   ]

    suite = poc_simpletest.TestSuite()

    for num_clusters, expected_county_tuple in hierdata_24:

        # build initial list of clusters for each test since mutation is allowed
        cluster_list = []
        for idx in range(len(data_24_table)):
            line = data_24_table[idx]
            cluster_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

        # compute student answer
        student_clustering = student.hierarchical_clustering(cluster_list, num_clusters)
        student_county_tuple = set_of_county_tuples(student_clustering)

        # Prepare test
        error_message = "Testing hierarchical_clustering on 24 county table, num_clusters = " + str(num_clusters)
        error_message += "\nStudent county tuples: " + str(student_county_tuple)
        error_message += "\nExpected county tuples: " + str(expected_county_tuple)
        suite.run_test(student_county_tuple == expected_county_tuple, True, error_message)

    suite.report_results()
Exemplo n.º 19
0
    def distortion_h(data_table, num_clstrs):
        """Return the distortion of a hierarchical clustering of data_table
        into num_clstrs clusters (sum of cluster_error over the clusters)."""
        # One single-entry cluster per row of the table.
        singleton_list = []
        for row in data_table:
            singleton_list.append(
                alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3],
                                    row[4]))

        cluster_list = alg_project3_solution.hierarchical_clustering(
            singleton_list, num_clstrs)

        return sum(clstr.cluster_error(data_table) for clstr in cluster_list)
Exemplo n.º 20
0
def cluster_by_hierarchical(data_table_url, num_clusters):
    """Load the table at ``data_table_url`` and cluster it hierarchically.

    Every table row becomes a singleton ``alg_cluster.Cluster``; the
    singletons and ``num_clusters`` are passed to
    ``alg_project3_solution.hierarchical_clustering`` and its result is
    returned unchanged.
    """
    rows = load_data_table(data_table_url)
    singletons = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in rows
    ]
    return alg_project3_solution.hierarchical_clustering(singletons, num_clusters)
def question_seven():
    """Compare k-means and hierarchical distortion for nine clusters.

    Returns a 4-tuple: a label, the hierarchical distortion, a second label
    and the k-means distortion.
    """
    url = des.DATA_111_URL  # change url depending on desired data table
    clusters = des.cluster_lst(url)

    # k-means is run before hierarchical clustering on the same list.
    by_kmeans = des.kmeans_clustering(clusters, 9, 5)
    by_hierarchy = des.hierarchical_clustering(clusters, 9)

    kmeans_dist = compute_distortion(by_kmeans, url)
    hierarchical_dist = compute_distortion(by_hierarchy, url)

    return ("hierarchical distortion =", hierarchical_dist,
            "kmeans distortion =", kmeans_dist)
Exemplo n.º 22
0
def run_example():
    """
    Plot hierarchical vs. k-means distortion for 6 to 20 output clusters.

    Loads the DATA_896_URL table, builds one singleton cluster per row and,
    for every target cluster count, records the distortion of both
    clusterings. Both series are printed and then shown with matplotlib.
    """
    data_table = load_data_table(DATA_896_URL)

    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    cluster_counts = range(6, 21)
    hier = []
    k_means = []
    for num in cluster_counts:
        # Hierarchical clustering is handed copies, so singleton_list itself
        # is still the full set of singletons on the next iteration.
        singleton_list_copy = [item.copy() for item in singleton_list]
        cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list_copy, num)
        hier.append(compute_distortion(cluster_list, data_table))

        cluster_list = alg_project3_solution.kmeans_clustering(singleton_list, num, 5)
        k_means.append(compute_distortion(cluster_list, data_table))

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(hier)
    print(k_means)

    plt.plot(cluster_counts, hier, label="hierarchical_clustering")
    plt.plot(cluster_counts, k_means, label="kmeans_clustering")
    plt.xlabel("Number of outcome clusters")
    # Axis label and title previously read "Distrotion".
    plt.ylabel("Distortion")
    plt.title("Distortion with 896 counties")
    plt.legend()
    plt.show()
    """
def run_example():
    """
    Plot distortion against cluster count for the DATA_896_URL table.

    Computes a k-means clustering (5 iterations) and a hierarchical
    clustering for every cluster count from 6 to 20 and draws both
    distortion curves with simpleplot.
    """
    data_table = load_data_table(DATA_896_URL)

    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    # One [cluster count, distortion] point per count, ascending.
    kmeans = []
    for count in xrange(6, 21):
        clustering = alg_project3_solution.kmeans_clustering(
            singleton_list, count, 5)
        distortion = alg_project3_solution.compute_distortion(
            clustering, data_table)
        kmeans.append([count, 0.0 + distortion])

    # Counts are visited from 20 down to 6, every call receiving the same
    # singleton_list; inserting at the front leaves the points ascending.
    hierarchical = []
    for count in xrange(20, 5, -1):
        clustering = alg_project3_solution.hierarchical_clustering(
            singleton_list, count)
        distortion = alg_project3_solution.compute_distortion(
            clustering, data_table)
        hierarchical.insert(0, [count, 0.0 + distortion])

    plot_title = "Distortion of the clusterings produced by hierarchical and k-means metods on 896 county data set"
    legends = ["Hierarchical clustering", "k-means clustering with 5 iterations"]
    simpleplot.plot_lines(
        plot_title,
        800, 600, "Number of clusters n [6 .. 20]", "Distortion",
        [hierarchical, kmeans], False,
        legends)
Exemplo n.º 24
0
def question10(data, filename):
    """Plot hierarchical and k-means distortion for 6..20 clusters and save it.

    Args:
        data: data-set identifier handed to ``load_data_table`` and
            ``Cluster.load_as_list``; it is also interpolated into the plot
            title.
        filename: path passed to ``plt.savefig``.
    """
    table = load_data_table(data)
    xs = range(6, 21)
    ys_hier = []

    def dist(clusters):
        # Callback handed to hierarchical_clustering; every invocation
        # records one distortion value.
        ys_hier.append(distortion(clusters, table))

    # NOTE(review): presumably hierarchical_clustering invokes ``dist`` once
    # for each size in set(xs) while shrinking towards 6 clusters, which is
    # why the collected values are reversed afterwards -- confirm against
    # its definition.
    hierarchical_clustering(Cluster.load_as_list(data), 6, dist, set(xs))
    ys_hier.reverse()

    # k-means gets a freshly loaded list: the list given to
    # hierarchical_clustering may have been merged in place, and k-means must
    # start from the complete set of clusters.
    clusters = Cluster.load_as_list(data)
    ys_kmeans = [distortion(kmeans_clustering(clusters, x, 5), table) for x in xs]

    plt.cla()
    plt.plot(xs, ys_hier, '-r', label='Hierarchical clustering distortion')
    plt.plot(xs, ys_kmeans, '-b', label='K-means clustering distortion')
    plt.title('Clustering distortion (%s)' % data)
    plt.xlabel('Number of output clusters')
    plt.ylabel('Distortion')
    plt.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(filename)
    print('Saved plot to %s' % filename)
Exemplo n.º 25
0
def compute_hier_distortions(cluster_list):
    """ list -> list
    Cluster ``cluster_list`` hierarchically for target sizes 20 down to 6,
    feeding each result into the next round, and return the distortions
    ordered from the smallest target size to the largest.

    Distortions are measured against the ``data_table`` name from the
    enclosing scope.
    """
    collected = []
    current = cluster_list

    for target in range(20, 5, -1):
        current = sol.hierarchical_clustering(current, target)
        collected.append(sol.compute_distortion(current, data_table))

    # Values were gathered for 20, 19, ..., 6; hand them back as 6, ..., 20.
    return collected[::-1]
Exemplo n.º 26
0
    def test_hierarchical_clustering3(self):
        """Clustering three singletons into one keeps all three FIPS codes."""
        singletons = [
            p.c.Cluster(set(["Al"]), 1.1, 10, 10, 0),
            p.c.Cluster(set(["DK"]), 1, 2, 2, 0.01),
            p.c.Cluster(set(["SW"]), 4, 60, 1, 0.05),
        ]

        # Copies are taken before clustering so the expected value is built
        # from clusters the function under test never touched.
        clone0, clone1, clone2 = [cluster.copy() for cluster in singletons]

        merged = p.hierarchical_clustering(singletons, 1)
        result = set(merged[0].fips_codes())

        reference = clone0.merge_clusters(clone1).merge_clusters(clone2)
        expected = set(reference.fips_codes())

        self.assertEqual(result, expected)
Exemplo n.º 27
0
def clustering():
    title_list = ['111 counties', '290 counties', '896 counties']
    url_list = [DATA_111_URL, DATA_290_URL, DATA_896_URL]
    distortion_hierarchical = [[], [], []]
    distortion_kmeans = [[], [], []]
    num_clusters_list = range(20, 5, -1)

    for idx in range(len(url_list)):
        data_table = load_data_table(url_list[idx])
        cluster_list = []
        for line in data_table:
            cluster_list.append(
                alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                    line[4]))
        cluster_list_copy = [cluster.copy() for cluster in cluster_list]
        for num_cluster in num_clusters_list:
            cluster_list = student.hierarchical_clustering(
                cluster_list, num_cluster)
            distortion = compute_distortion(cluster_list, data_table)
            distortion_hierarchical[idx].append(distortion)
            print "Displaying", len(
                cluster_list), "hierarchical clusters, distortion:", distortion

        for num_cluster in num_clusters_list:
            cluster_list = student.kmeans_clustering(cluster_list_copy,
                                                     num_cluster, 5)
            distortion = compute_distortion(cluster_list, data_table)
            distortion_kmeans[idx].append(distortion)
            print "Displaying", len(
                cluster_list), "k-means clusters, distortion:", distortion

        plot_num = 131 + idx
        plt.subplot(plot_num)
        plt.plot(num_clusters_list,
                 distortion_hierarchical[idx],
                 "o-",
                 label="hierarchical")
        plt.plot(num_clusters_list,
                 distortion_kmeans[idx],
                 "x-",
                 label="kmeans")
        plt.legend()
        plt.ylabel('Distortion')
        plt.xlabel('Number of clusters')
        plt.grid(True)
        plt.title(title_list[idx])
    plt.show()
def compute_distortion(cluster_list, data_table, out_size):
    """Return ``(hierarchical, k-means)`` distortion for ``out_size`` clusters.

    k-means (5 iterations) is run on a shallow copy of ``cluster_list`` and
    hierarchical clustering on ``cluster_list`` itself; each distortion is
    the sum of ``cluster_error(data_table)`` over the resulting clusters.
    """
    # hierarchical_clustering mutates cluster_list, so the copy for k-means
    # is taken, and k-means is run, before it is called.
    kmeans_input = cluster_list[:]
    clusters_k = cluster_algs.kmeans_clustering(kmeans_input, out_size, 5)
    clusters_h = cluster_algs.hierarchical_clustering(cluster_list, out_size)

    distortion_h = sum(cluster.cluster_error(data_table) for cluster in clusters_h)
    distortion_k = sum(cluster.cluster_error(data_table) for cluster in clusters_k)
    return (distortion_h, distortion_k)
Exemplo n.º 29
0
def run_example_two():
    """Compute distortion for 6 to 20 clusters with both methods and plot it.

    For each cluster count a fresh list of singleton clusters is built from
    the DATA_111_URL table for hierarchical clustering and another one for
    k-means (5 iterations). The two ``{cluster count: distortion}`` dicts
    are handed to ``plot_graphs``.
    """
    data_table = load_data_table(DATA_111_URL)

    def fresh_singletons():
        # A brand-new list of singleton clusters, one per table row.
        return [
            alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
            for row in data_table
        ]

    hierarchical_points = {}
    kmeans_points = {}

    for count in range(6, 21):
        by_hierarchy = alg_project3_solution.hierarchical_clustering(
            fresh_singletons(), count)
        hierarchical_points[count] = compute_distortion(by_hierarchy, data_table)

        by_kmeans = alg_project3_solution.kmeans_clustering(
            fresh_singletons(), count, 5)
        kmeans_points[count] = compute_distortion(by_kmeans, data_table)

    plot_graphs(hierarchical_points, kmeans_points)
Exemplo n.º 30
0
def run_hierarchical():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_3108_URL)
    
    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
        
    cluster_list = alg_project3_solution.hierarchical_clustering(singleton_list, 15)
    print "Displaying", len(cluster_list), "hierarchical clusters"
    
    # draw the clusters using matplotlib or simplegui
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)
Exemplo n.º 31
0
    def test_hierarchical_clustering4(self):
        """Asking for as many clusters as were supplied returns them as given."""
        clusters = [
            p.c.Cluster(set(["Al"]), 1.1, 10, 10, 0),
            p.c.Cluster(set(["DK"]), 1, 2, 10, 0.01),
            p.c.Cluster(set(["SW"]), 4, 60, 10, 0.05),
            p.c.Cluster(set(["Brasil"]), 4, 2, 100000000, 3),
        ]

        # The function under test and the expectation each get their own list
        # holding the same four cluster objects. The copies that used to be
        # taken here were never read and have been dropped.
        result = p.hierarchical_clustering(list(clusters), 4)
        expected = list(clusters)

        self.assertEqual(result, expected)
def run_example():
    """
    Draw k-means and hierarchical distortion curves for the DATA_896_URL table.

    Both methods are evaluated for every cluster count from 6 to 20
    (k-means with 5 iterations) and the two series of
    [cluster count, distortion] points are plotted with simpleplot.
    """
    data_table = load_data_table(DATA_896_URL)

    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]

    kmeans = []
    for count in xrange(6, 21):
        clusters = alg_project3_solution.kmeans_clustering(singleton_list, count, 5)
        point = [count, 0.0 + alg_project3_solution.compute_distortion(clusters, data_table)]
        kmeans.append(point)

    # Hierarchical clustering is requested for 20 down to 6 clusters, always
    # with the same singleton_list; front insertion keeps the points in
    # ascending cluster-count order.
    hierarchical = []
    for count in xrange(20, 5, -1):
        clusters = alg_project3_solution.hierarchical_clustering(singleton_list, count)
        point = [count, 0.0 + alg_project3_solution.compute_distortion(clusters, data_table)]
        hierarchical.insert(0, point)

    simpleplot.plot_lines(
        "Distortion of the clusterings produced by hierarchical and k-means metods on 896 county data set",
        800, 600, "Number of clusters n [6 .. 20]", "Distortion",
        [hierarchical, kmeans], False,
        ["Hierarchical clustering", "k-means clustering with 5 iterations"])
Exemplo n.º 33
0
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_111_URL)
    
    singleton_list = []
    for line in data_table:
        singleton_list.append(Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
        
    #cluster_list = sequential_clustering(singleton_list, 15)    
    #print "Displaying", len(cluster_list), "sequential clusters"

    cluster_list = hierarchical_clustering(singleton_list, 9)
    cluster_error = compute_distortion(cluster_list, data_table)
    print "hierarchical clusters distortion {0}".format(cluster_error)
Exemplo n.º 34
0
def plot_distortions():
    """Plot hierarchical and k-means distortion for 6 to 20 output clusters.

    Uses the 896-county table. Every clustering run receives its own copies
    of the singleton clusters, and k-means uses 5 iterations.
    """
    # Only the 896-county table is used; the other URL locals that used to
    # be defined here were never read.
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    DATA_896_URL = DIRECTORY + "data_clustering/unifiedCancerData_896.csv"

    data_table = load_data_table(DATA_896_URL)
    singleton_list = [
        Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    def fresh_copies():
        # Independent copies, so no run sees clusters changed by another run.
        return [clu.copy() for clu in singleton_list]

    cluster_counts = range(6, 21)

    errors_h = [
        compute_distortion(
            hierarchical_clustering(fresh_copies(), num_clusters), data_table)
        for num_clusters in cluster_counts
    ]
    errors_k = [
        compute_distortion(
            kmeans_clustering(fresh_copies(), num_clusters, 5), data_table)
        for num_clusters in cluster_counts
    ]

    xlabel("number of output clusters")
    ylabel("distortion")
    plot(cluster_counts, errors_h, '-b', label="hierarchical")
    plot(cluster_counts, errors_k, '-r', label="kmeans")

    legend(loc="upper left")
    title("896 county data sets")
    show()
Exemplo n.º 35
0
def plot_distortions():
    """Plot distortion versus cluster count (6 to 20) for the 896-county data.

    Hierarchical clustering and k-means (5 iterations) are each run once per
    cluster count on fresh copies of the singleton clusters; the two
    distortion curves are drawn on one figure.
    """
    # The URL locals for the other data sets were unused and are gone.
    DIRECTORY = "http://commondatastorage.googleapis.com/codeskulptor-assets/"
    DATA_896_URL = DIRECTORY + "data_clustering/unifiedCancerData_896.csv"

    data_table = load_data_table(DATA_896_URL)
    singleton_list = [
        Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    cluster_counts = range(6, 21)

    errors_h = []
    for num_clusters in cluster_counts:
        # Copies keep singleton_list untouched for the following runs.
        copies = [clu.copy() for clu in singleton_list]
        cluster_list = hierarchical_clustering(copies, num_clusters)
        errors_h.append(compute_distortion(cluster_list, data_table))

    errors_k = []
    for num_clusters in cluster_counts:
        copies = [clu.copy() for clu in singleton_list]
        cluster_list = kmeans_clustering(copies, num_clusters, 5)
        errors_k.append(compute_distortion(cluster_list, data_table))

    xlabel("number of output clusters")
    ylabel("distortion")
    plot(cluster_counts, errors_h, '-b', label="hierarchical")
    plot(cluster_counts, errors_k, '-r', label="kmeans")

    legend(loc="upper left")
    title("896 county data sets")
    show()
Exemplo n.º 36
0
def run_distortion_graph():
    """Plot scaled distortion of both clustering methods for 6 to 20 clusters.

    Works on the DATA_896_URL table. For every cluster count a fresh list of
    singleton clusters is built for each method; distortions are divided by
    10^11 before plotting, as the y-axis label states.
    """
    data_table = load_data_table(DATA_896_URL)
    size_clusters = range(6, 21)
    scale = 100000000000.0

    def make_singletons():
        # New singleton clusters, one per table row.
        return [
            alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
            for row in data_table
        ]

    hierarchical_distortion = []
    kmeans_distortion = []
    for size in size_clusters:
        by_hierarchy = alg_project3_solution.hierarchical_clustering(
            make_singletons(), size)
        by_kmeans = alg_project3_solution.kmeans_clustering(
            make_singletons(), size, 5)

        hierarchical_distortion.append(
            compute_distortion(by_hierarchy, data_table) / scale)
        kmeans_distortion.append(
            compute_distortion(by_kmeans, data_table) / scale)

    plt.figure()
    plt.plot(size_clusters, hierarchical_distortion, '-b',
             label='Hierarchical_distortion')
    plt.plot(size_clusters, kmeans_distortion, '-g', label='Kmeans_distortion')
    plt.legend(loc='upper right')
    plt.title('Distortion for hierarchical and k-means clustering for 896 data')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion(x 10^11)')
    plt.show()
Exemplo n.º 37
0
def run_example():
    """
    Time hierarchical and k-means clustering into 15 clusters.

    Loads the DATA_3108_URL table, builds one singleton cluster per row,
    runs each clustering once (k-means with 5 iterations) and prints the
    resulting cluster counts followed by the two elapsed times
    (hierarchical first, then k-means).
    """
    data_table = load_data_table(DATA_3108_URL)

    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    # Hierarchical clustering works on copies: if it merges clusters in place,
    # k-means below would otherwise be timed on an already-reduced input.
    hierarchical_input = [cluster.copy() for cluster in singleton_list]

    start_time = time()
    cluster_list = alg_project3_solution.hierarchical_clustering(
        hierarchical_input, 15)
    hierarchical_clustering_time = time() - start_time
    # Printing happens after the clock is read so that console output is not
    # part of the measurement. The %-formatted single-argument form prints
    # the same text on Python 2 and 3.
    print("Displaying %d hierarchical clusters" % len(cluster_list))

    start_time = time()
    cluster_list = alg_project3_solution.kmeans_clustering(
        singleton_list, 15, 5)
    kmeans_clustering_time = time() - start_time
    print("Displaying %d k-means clusters" % len(cluster_list))

    print("%s %s" % (hierarchical_clustering_time, kmeans_clustering_time))
    """
Exemplo n.º 38
0
def question10_plot(date_url):
    """Plot k-means and hierarchical distortion for 6 to 20 clusters.

    ``date_url`` is passed to ``load_data_table``; the first run of digits
    found in it is used in the plot title.
    """
    data_table = load_data_table(date_url)
    singleton_list = [
        alg_cluster.Cluster(set([row[0]]), row[1], row[2], row[3], row[4])
        for row in data_table
    ]
    xvals = list(range(6, 21))
    descending = range(20, 5, -1)

    # k-means (5 iterations) is evaluated from 20 clusters down to 6 and the
    # results are then flipped to line up with xvals.
    kc_cd_yvals = [
        compute_distortion(
            alg_project3_solution.kmeans_clustering(singleton_list, count, 5),
            data_table)
        for count in descending
    ]
    kc_cd_yvals.reverse()

    # Hierarchical clustering starts from a shallow copy of the singleton
    # list and each round continues from the previous round's output.
    hc_cd_yvals = []
    current = list(singleton_list)
    for count in descending:
        current = alg_project3_solution.hierarchical_clustering(current, count)
        hc_cd_yvals.append(compute_distortion(current, data_table))
    hc_cd_yvals.reverse()

    plt.plot(xvals, kc_cd_yvals, '-b', label='kmeans_clustering_distortion')
    plt.plot(xvals, hc_cd_yvals, '-r',
             label='hierarchical_clustering_distortion')
    plt.legend(loc='upper right')
    plt.xlabel('cluster num')
    plt.ylabel('distortion')
    digits = re.search('[0-9]+', date_url).group(0)
    plt.title('DATA: ' + digits)
    plt.grid(True)
    plt.show()
def _load_sorted_clusters(path):
    """Read ``path`` into CC.Cluster objects sorted by ``horiz_center()``."""
    clusters = []
    # The with-block closes the file; the handles used to be left open.
    with open(path) as fhandle:
        for line in fhandle:
            tokens = line.split(',')
            clusters.append(
                CC.Cluster(set([tokens[0]]), float(tokens[1]), float(tokens[2]),
                           int(tokens[3]), float(tokens[4])))
    clusters.sort(key=lambda cluster: cluster.horiz_center())
    return clusters


def _print_clusters(clusters, header=None, indexed=False):
    """Print ``clusters`` one per line, then an empty line.

    ``header``, when given, is printed first; ``indexed`` prefixes every
    cluster with its position in the list.
    """
    # Single-argument print(...) gives identical output on Python 2 and 3.
    if header is not None:
        print(header)
    for index, cluster in enumerate(clusters):
        if indexed:
            print("%s %s" % (index, cluster))
        else:
            print(cluster)
    print("")


def run_suite():
    """
    Testing code for the closest-pair and clustering functions of ``student``

    Closest-pair results are checked through poc_simpletest; the outputs of
    hierarchical_clustering and kmeans_clustering are only printed for
    manual inspection. The unifiedCancerData_* files are opened by relative
    path.
    """
    loaded_header = "The following list_of_clusters was loaded:"

    # create a TestSuite (and an object)
    suite = poc_simpletest.TestSuite()

    # create a set of 3 clusters
    cluster1 = CC.Cluster([1, 1], 0, 0, 100, 0.00001)
    cluster2 = CC.Cluster([2, 2, 2], 3, 4, 200, 0.00002)
    cluster3 = CC.Cluster([3, 3, 3, 3], 6, 8, 300, 0.00003)
    list_of_clusters = [cluster1, cluster2, cluster3]

    suite.run_test(student.slow_closest_pair(list_of_clusters), (5., 0, 1),
                   "Test #1: testing slow_closest_pair on 3 clusters")
    suite.run_test(student.fast_closest_pair(list_of_clusters), (5., 0, 1),
                   "Test #2: testing fast_closest_pair on 3 clusters")

    # add a fourth cluster to the list
    cluster4 = CC.Cluster([4, 4, 4, 4, 4], 12, 16, 400, 0.00004)
    list_of_clusters.append(cluster4)

    suite.run_test(student.slow_closest_pair(list_of_clusters), (5., 0, 1),
                   "Test #3: testing slow_closest_pair on 4 clusters")
    suite.run_test(student.fast_closest_pair(list_of_clusters), (5., 0, 1),
                   "Test #4: testing fast_closest_pair on 4 clusters")

    # 4 clusters evenly spaced along a horizontal line
    list_of_clusters = [CC.Cluster(set([]), 0, 0, 1, 0),
                        CC.Cluster(set([]), 1, 0, 1, 0),
                        CC.Cluster(set([]), 2, 0, 1, 0),
                        CC.Cluster(set([]), 3, 0, 1, 0)]
    suite.run_test(student.closest_pair_strip(list_of_clusters, 1.5, 1.0), (1.0, 1, 2),
                   "Test #5: testing closest_pair_strip on 4 clusters")

    # 4 clusters unevenly spaced along a horizontal line
    list_of_clusters = [CC.Cluster(set([]), 1.0, 0.0, 1, 0),
                        CC.Cluster(set([]), 4.0, 0.0, 1, 0),
                        CC.Cluster(set([]), 5.0, 0.0, 1, 0),
                        CC.Cluster(set([]), 7.0, 0.0, 1, 0)]
    # The message used to name closest_pair_strip although
    # fast_closest_pair is the function under test.
    suite.run_test(student.fast_closest_pair(list_of_clusters), (1.0, 1, 2),
                   "Test #6: testing fast_closest_pair on 4 clusters")

    # 4 clusters in a diamond layout
    list_of_clusters = [CC.Cluster(set([]), -4.0, 0.0, 1, 0),
                        CC.Cluster(set([]), 0.0, -1.0, 1, 0),
                        CC.Cluster(set([]), 0.0, 1.0, 1, 0),
                        CC.Cluster(set([]), 4.0, 0.0, 1, 0)]
    suite.run_test(student.closest_pair_strip(list_of_clusters, 0.0, 4.1231059999999999), (2.0, 1, 2),
                   "Test #7: testing closest_pair_strip on 4 clusters")

    # the same diamond layout, rebuilt for fast_closest_pair
    list_of_clusters = [CC.Cluster(set([]), -4.0, 0.0, 1, 0),
                        CC.Cluster(set([]), 0.0, -1.0, 1, 0),
                        CC.Cluster(set([]), 0.0, 1.0, 1, 0),
                        CC.Cluster(set([]), 4.0, 0.0, 1, 0)]
    suite.run_test(student.fast_closest_pair(list_of_clusters), (2.0, 1, 2),
                   "Test #8: testing fast_closest_pair on 4 clusters")

    # small dataset containing 8 clusters
    list_of_clusters = _load_sorted_clusters("unifiedCancerData_8.txt")
    _print_clusters(list_of_clusters, loaded_header, indexed=True)

    suite.run_test(student.slow_closest_pair(list_of_clusters), (2.4479655653349655, 5, 7),
                   "Test #9: testing slow_closest_pair on 8 clusters")
    suite.run_test(student.fast_closest_pair(list_of_clusters), (2.4479655653349655, 5, 7),
                   "Test #10: testing fast_closest_pair on 8 clusters")
    # NOTE(review): the same list is handed to hierarchical_clustering and
    # then to kmeans_clustering; if hierarchical_clustering merges in place,
    # k-means no longer sees all 8 clusters -- confirm this is intended.
    _print_clusters(student.hierarchical_clustering(list_of_clusters, 5))
    _print_clusters(student.kmeans_clustering(list_of_clusters, 5, 3))

    # small dataset containing 17 clusters
    list_of_clusters = _load_sorted_clusters("unifiedCancerData_17.txt")

    suite.run_test(student.slow_closest_pair(list_of_clusters), (1.9439662413427632, 9, 10),
                   "Test #11: testing slow_closest_pair on 17 clusters")
    suite.run_test(student.fast_closest_pair(list_of_clusters), (1.9439662413427632, 9, 10),
                   "Test #12: testing fast_closest_pair on 17 clusters")

    # small dataset containing 24 clusters
    list_of_clusters = _load_sorted_clusters("unifiedCancerData_24.txt")
    _print_clusters(list_of_clusters, loaded_header, indexed=True)

    # kmeans_clustering output on the 24 clusters (header typo corrected)
    _print_clusters(student.kmeans_clustering(list_of_clusters, 10, 1),
                    "This output was created by kmeans_clustering:", indexed=True)

    # small dataset containing 39 clusters
    list_of_clusters = _load_sorted_clusters("unifiedCancerData_39.txt")

    suite.run_test(student.slow_closest_pair(list_of_clusters), (1.6612217536988727, 22, 24),
                   "Test #13: testing slow_closest_pair on 39 clusters")
    suite.run_test(student.fast_closest_pair(list_of_clusters), (1.6612217536988727, 22, 24),
                   "Test #14: testing fast_closest_pair on 39 clusters")

    # dataset containing 111 clusters
    list_of_clusters = _load_sorted_clusters("unifiedCancerData_111.csv")
    _print_clusters(list_of_clusters, loaded_header, indexed=True)

    suite.run_test(student.slow_closest_pair(list_of_clusters), (1.266216002018164, 79, 81),
                   "Test #15: testing slow_closest_pair on 111 clusters")
    suite.run_test(student.fast_closest_pair(list_of_clusters), (1.266216002018164, 79, 81),
                   "Test #16: testing fast_closest_pair on 111 clusters")
    _print_clusters(student.hierarchical_clustering(list_of_clusters, 5))

    # report number of tests and failures
    print("")
    suite.report_results()
Exemplo n.º 40
0
def question5(filename):
    """
    Visualize a 9-cluster hierarchical clustering of the 111-county data
    set and print the distortion of the resulting clustering.

    filename is forwarded unchanged to visualize(); the value visualize()
    returns is what gets scored against the freshly loaded data table.
    """
    data = 'data/unifiedCancerData_111.csv'

    def cluster_into_nine(singletons):
        # Clustering callback handed to visualize().
        return hierarchical_clustering(singletons, 9)

    clustering = visualize(data, filename, cluster_into_nine)
    table = load_data_table(data)
    dist = distortion(clustering, table)
    # The value is printed twice on purpose: once as %f and once as %s.
    print('Distortion in question5, hierarchical_clustering = %f (%s)' % (dist, dist))
Exemplo n.º 41
0
"""
Assignment 3 Question 7 Answer
"""

import alg_project3_viz as viz
import alg_project3_solution as sol
import alg_cluster

# Load the 111-county table; it is also the reference used for distortion.
data_table = viz.load_data_table(viz.DATA_111_URL)

# Build one independent data list per clustering method.
hier_data_list = sol.make_data_list(data_table)
kmeans_data_list = sol.make_data_list(data_table)

# 9 output clusters for both methods; k-means runs 5 iterations.
hier_cluster_list = sol.hierarchical_clustering(hier_data_list, 9)
kmeans_cluster_list = sol.kmeans_clustering(kmeans_data_list, 9, 5)

# Report the distortion of each clustering, hierarchical first.
for label, clusters in (("hierarchical:", hier_cluster_list),
                        ("kmeans:", kmeans_cluster_list)):
    print(label, sol.compute_distortion(clusters, data_table))


# Hierarchical: 175163886915.8305 or 1.752 x 10^11 with four significant figures
# K-means: 271254226924.20047 or 2.713 x 10^11 with four significant figures
Exemplo n.º 42
0
def hier():
    """
    Run hierarchical clustering on the module-level ``s`` down to 9 clusters.

    The result is discarded and nothing is returned.
    NOTE(review): presumably a timing target -- confirm against the caller.
    """
    target_clusters = 9
    sol.hierarchical_clustering(s, target_clusters)
Exemplo n.º 43
0
# File names of the county cancer-risk data sets; the number in each name
# is the number of counties in that file.
DATA_3108_URL = "unifiedCancerData_3108.csv"
# Fixed: the name previously carried a stray leading "d"
# ("dunifiedCancerData_896.csv"), unlike the three sibling file names.
DATA_896_URL = "unifiedCancerData_896.csv"
DATA_290_URL = "unifiedCancerData_290.csv"
DATA_111_URL = "unifiedCancerData_111.csv"

def load_data_table(data_url):
    """
    Import a table of county-based cancer risk data
    from a csv format file

    data_url is the path/URL of a header-less csv file whose five columns
    are read as fips code, horizontal position, vertical position,
    population and risk.

    Returns a list with one single-county alg_cluster.Cluster per row.
    """
    # Read the file the caller asked for. The previous version ignored
    # data_url and always loaded DATA_3108_URL.
    data = pd.read_csv(data_url, names=['fips_codes', 'horiz_pos', 'vert_pos', 'population', 'risk'])
    print("Loaded %d data points" % len(data))
    # Walk the rows of the underlying array once instead of doing five
    # label lookups per row through the removed ``.ix`` indexer.
    return [alg_cluster.Cluster(set([row[0]]), float(row[1]), float(row[2]), int(row[3]), float(row[4]))
            for row in data.values]

# Load the 3108-county data set and reduce it to 15 clusters
# with hierarchical clustering.
data_table = load_data_table(DATA_3108_URL)
cluster_list = alg_project3_solution.hierarchical_clustering(data_table, 15)

############################################################
# Code to create sequential clustering
# Create alphabetical clusters for county data

def sequential_clustering(singleton_list, num_clusters):
    """
    Take a data table and create a list of clusters
    by partitioning the table into clusters based on its ordering
    
    Note that this method may return num_clusters or num_clusters + 1
    final clusters
    """
    
    # NOTE(review): only these two initializations are visible here; the
    # partitioning logic described in the docstring is not part of this
    # snippet, so as shown the function returns None.
    cluster_list = []
    cluster_idx = 0
Exemplo n.º 44
0
def question2(filename):
    """
    Visualize a 15-cluster hierarchical clustering of the 3108-county
    data set; filename is forwarded unchanged to visualize().
    """
    def cluster_into_fifteen(singletons):
        # Clustering callback handed to visualize().
        return hierarchical_clustering(singletons, 15)

    visualize('unifiedCancerData_3108.csv', filename, cluster_into_fifteen)
Exemplo n.º 45
0
def question2(filename):
    """
    Visualize a 15-cluster hierarchical clustering of the 3108-county
    data set stored under data/; filename is forwarded to visualize().
    """
    def cluster_into_fifteen(singletons):
        # Clustering callback handed to visualize().
        return hierarchical_clustering(singletons, 15)

    visualize('data/unifiedCancerData_3108.csv', filename, cluster_into_fifteen)
    return [[tokens[0], float(tokens[1]), float(tokens[2]), int(tokens[3]), float(tokens[4])] 
            for tokens in data_tokens]


#####################################
# Code for answering question 7 of the application

# Read the input data for 290 county data and create a list of clusters
data_table = load_data_table(DATA_290_URL)

# Each clustering method gets its own freshly built singleton clusters.
# hierarchical_clustering may merge the Cluster objects it is given in
# place, so handing the same objects to kmeans_clustering afterwards
# could skew the k-means result.
singleton_list = [
    alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
    for line in data_table
]
kmeans_singleton_list = [
    alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
    for line in data_table
]

# Create the clustered lists needed for computing the distortions
hierarchical_list = sol.hierarchical_clustering(singleton_list, 16)
kmeans_list = sol.kmeans_clustering(kmeans_singleton_list, 16, 5)

# Compute the distortions. Each list is summed over its own length, so a
# difference in the number of returned clusters can neither raise an
# IndexError nor silently drop clusters from one of the totals.
num_clusters = len(kmeans_list)
hierarchical_distortion = sum(
    cluster.cluster_error(data_table) for cluster in hierarchical_list)
kmeans_distortion = sum(
    cluster.cluster_error(data_table) for cluster in kmeans_list)

# Print the results
print("")
print("=====> Results for 290 county datapoints in 16 clusters")
print(".......... Distortion for hiearchical_clustering: %s" % hierarchical_distortion)
print(".......... Distortion for kmeans_clustering:      %s" % kmeans_distortion)
Exemplo n.º 47
0
# create clusters and calculate distortion
# Only the last assignment takes effect; the earlier ones are left in as
# alternatives for switching the data set.
data_table_url = DATA_896_URL
data_table_url = DATA_290_URL
data_table_url = DATA_111_URL
data_table_url_name = "DATA_111_URL"

# Range of output cluster counts (inclusive) and k-means iteration count.
start, end = 6, 20
num_iterate = 5

# Hierarchical clustering: compute the largest clustering (`end` clusters)
# once, then shrink it one cluster at a time, reusing the previous result.
cluster_list = cluster_by_hierarchical(data_table_url, end)
hierarchical_distortion = [compute_distortion(cluster_list, data_table_url)]
for i in reversed(range(start, end)):
    cluster_list = alg_project3_solution.hierarchical_clustering(cluster_list, i)
    hierarchical_distortion.append(compute_distortion(cluster_list, data_table_url))

# Distortions were collected from `end` down to `start`; flip them so they
# are ordered by increasing cluster count.
reversed_hierarchical_distortion = list(reversed(hierarchical_distortion))
print(reversed_hierarchical_distortion)


# k-means clustering: one independent run per cluster count.
kmeans_distortion = []
for i in range(start, end + 1):
    cluster_list = cluster_by_kmeans(data_table_url, i, num_iterate)
    kmeans_distortion += [compute_distortion(cluster_list, data_table_url)]

print(kmeans_distortion)
    
    
# plot the curve
Exemplo n.º 48
0
def hier():
    """
    Run hierarchical clustering on the module-level ``s`` down to 9 clusters.

    The result is discarded and nothing is returned.
    NOTE(review): presumably a timing target -- confirm against the caller.
    """
    target_clusters = 9
    sol.hierarchical_clustering(s, target_clusters)
def compute_and_plot_distortions():
    """
    Plot the distortion of k-means clustering (5 iterations) and of
    hierarchical clustering on the 896 county data set, for every number
    of output clusters from 6 to 20 (inclusive).

    The hierarchical clusterings are computed from 20 clusters downwards:
    the clustering of size 20 is reused to produce the one of size 19 and
    so on, which avoids an unnecessary factor of 15 in the running time.

    Progress and the individual distortion values are printed; the plot is
    shown with plt.show(). Nothing is returned.
    """

    #choose data set:
    #data_table = viz.load_data_table(viz.DATA_111_URL)
    #data_table = viz.load_data_table(viz.DATA_290_URL)
    data_table = viz.load_data_table(viz.DATA_896_URL)

    def make_singleton_list():
        # One fresh single-county cluster per row of the data table.
        return [
            c.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
            for line in data_table
        ]

    num_output_clusters = list(range(6, 21))
    kmeans_distortion = []
    hierarchical_distortion = []

    print("\nComputing kmeans distortions")
    for indx in num_output_clusters:
        # This loop could be optimized so that computations are reused,
        # but that is not necessary because k-means is so fast.
        kmeans_cluster_list = p.kmeans_clustering(make_singleton_list(), indx, 5)
        distortion = compute_distortion(kmeans_cluster_list, data_table)
        kmeans_distortion.append(distortion)
        print("%s %s" % (indx, distortion))

    print("Computed kmeans distortions")
    print("")
    print("Computing hierarchical distortions")

    # Start from a fresh list of singletons. Previously the new singletons
    # were appended to the list left over from the last k-means iteration,
    # so every county entered the hierarchical clustering twice.
    hierarchical_cluster_list = make_singleton_list()

    # Walk from 20 clusters down to 6, feeding each result into the next
    # call so earlier merges are reused.
    for indx in range(20, 5, -1):
        hierarchical_cluster_list = p.hierarchical_clustering(
            hierarchical_cluster_list, indx)
        distortion = compute_distortion(hierarchical_cluster_list, data_table)
        hierarchical_distortion.append(distortion)
        print("%s %s" % (indx, distortion))

    # Collected for 20..6; flip to match num_output_clusters (6..20).
    hierarchical_distortion.reverse()

    print("Computed hierarchical distortions\n")
    print("Plotting data")

    plt.plot(num_output_clusters,
             kmeans_distortion,
             label="K-means clustering")
    plt.plot(num_output_clusters,
             hierarchical_distortion,
             label="Hierarchical clustering")

    plt.xlabel("Number of output clusters")
    plt.ylabel('Distortion')

    # draw the legend
    plt.legend()

    plt.title(
        "Comparison of distortion of two clustering methods \n Dataset: 896 counties"
    )

    # make everything visible
    plt.show()