def compute_distortion():
    '''
    Print the total distortion of the clustering returned by
    ``alp.run_example()``.

    The distortion is the sum of ``cluster_error`` over every returned
    cluster, measured against the table loaded from ``alp.DATA_111_URL``.

    :return: None (the total is printed)
    '''
    table = alp.load_data_table(alp.DATA_111_URL)
    clusters = alp.run_example()
    total = sum([cluster.cluster_error(table) for cluster in clusters])
    print(total)
def question10(URLs, min_number_clusters=6, max_number_clusters=20, steps=5,
               file_to_save="Question10.png"):
    '''
    Compare the distortion of hierarchical and k-means clustering.

    For every URL the distortion (sum of ``cluster_error`` over all
    clusters) is computed for each cluster count from
    ``min_number_clusters`` to ``max_number_clusters`` inclusive, once with
    ``hierarchical_clustering`` and once with ``kmeans_clustering``.  Both
    series are printed and drawn on the current ``plot`` figure.

    :param URLs: iterable of data-set URLs; each must contain
        ``_<digits>.csv``.  The digits are used as the county count in the
        printed messages, the legend labels and the output file name.
    :param min_number_clusters: smallest number of clusters evaluated.
    :param max_number_clusters: largest number of clusters evaluated.
    :param steps: iteration count handed to ``kmeans_clustering``.
    :param file_to_save: base name of the image to write.  The county count
        is inserted before the extension and the result always ends in
        ``.png``.  When falsy nothing is saved and the curves are left on
        the current figure.
    :raises ValueError: if a URL does not contain ``_<digits>.csv``.
    :return: None
    '''
    import os  # local import: only needed to split the output file name

    cluster_counts = list(range(min_number_clusters, max_number_clusters + 1))

    for url in URLs:
        # Validate the URL before doing any (slow) clustering.
        match = re.search(r'_(\d+)\.csv', url)
        if match is None:
            raise ValueError("cannot find '_<digits>.csv' in URL: %r" % (url,))
        counties = match.group(1)

        # The data is reloaded before every clustering run, as the original
        # code did.  NOTE(review): presumably the clustering routines modify
        # the cluster list they receive - confirm before hoisting the load.
        hierarchical_distortion_result = []
        for num in cluster_counts:
            data_table, singleton_list = alg_project3_viz.run_example(url)
            cluster_list_hierarchical = hierarchical_clustering(singleton_list, num)
            hierarchical_distortion_result.append(
                sum(cluster.cluster_error(data_table)
                    for cluster in cluster_list_hierarchical))

        kmeans_distortion_result = []
        for num in cluster_counts:
            data_table, singleton_list = alg_project3_viz.run_example(url)
            cluster_list_kmeans = kmeans_clustering(singleton_list, num, steps)
            kmeans_distortion_result.append(
                sum(cluster.cluster_error(data_table)
                    for cluster in cluster_list_kmeans))

        # Report the evaluated range rather than a leftover loop variable,
        # which always showed the maximum and was unbound for an empty range.
        print("hierarchical clustering with %d to %d clusters for %s counties %s"
              % (min_number_clusters, max_number_clusters, counties,
                 hierarchical_distortion_result))
        # The trailing " \n" keeps the blank separator line between data sets.
        print("k-means clustering with %d to %d clusters for %s counties %s"
              % (min_number_clusters, max_number_clusters, counties,
                 kmeans_distortion_result) + " \n")

        label_hierar = 'Distortion of hierarchical clustering for ' + counties
        label_kmeans = 'Distortion of k-means clustering for ' + counties
        plot.plot(cluster_counts, hierarchical_distortion_result, '-r',
                  label=label_hierar)
        plot.plot(cluster_counts, kmeans_distortion_result, '-g',
                  label=label_kmeans)
        plot.title('Hierarchical vs K-means distortion - PyCharm (IntelliJ)')
        plot.xlabel('Size of Cluster')
        plot.ylabel('Distortion')
        plot.legend(loc='upper right')
        plot.tight_layout()

        if file_to_save:
            # splitext copes with any extension length, unlike slicing [:-4].
            root, _extension = os.path.splitext(file_to_save)
            plot.savefig(root + "_" + counties + '.png')
            # Start the next data set on an empty figure once this one is
            # safely on disk.
            plot.clf()
def question6(URL, file_to_save, number_clusters=9, iterations=5, centers=False):
    '''
    Run k-means clustering on one data set and plot the resulting clusters.

    :param URL: data-set location handed to ``alg_project3_viz.run_example``.
    :param file_to_save: output file for the plot drawn without centers.
    :param number_clusters: number of clusters handed to
        ``kmeans_clustering``.
    :param iterations: iteration count handed to ``kmeans_clustering``.
    :param centers: when true, an additional plot with the centers flag set
        is written to ``<file_to_save without extension>with_centers.png``.
    :return: None
    '''
    import os  # local import: only needed to split the output file name

    data_table, singleton_list = alg_project3_viz.run_example(URL)
    cluster_list = kmeans_clustering(singleton_list, number_clusters, iterations)
    print("Displaying %d k-means clusters" % len(cluster_list))

    if centers:
        # splitext copes with any extension length, unlike slicing [:-4].
        root, _extension = os.path.splitext(file_to_save)
        centers_path = root + 'with_centers' + '.png'
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list,
                                              centers_path, True)
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list,
                                          file_to_save, False)
def question2(URL, file_to_save, number_clusters=15, centers=False):
    '''
    Run hierarchical clustering on one data set and plot the resulting
    clusters.

    :param URL: data-set location handed to ``alg_project3_viz.run_example``.
    :param file_to_save: output file for the main plot.
    :param number_clusters: number of clusters handed to
        ``hierarchical_clustering``.
    :param centers: flag forwarded to ``plot_clusters``; when true, an
        additional plot is written to
        ``<file_to_save without extension>with_centers.png``.
    :return: None
    '''
    import os  # local import: only needed to split the output file name

    data_table, singleton_list = alg_project3_viz.run_example(URL)
    cluster_list = hierarchical_clustering(singleton_list, number_clusters)
    print("Displaying %d hierarchical clusters" % len(cluster_list))

    if centers:
        # splitext copes with any extension length, unlike slicing [:-4].
        root, _extension = os.path.splitext(file_to_save)
        centers_path = root + 'with_centers' + '.png'
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list,
                                              centers_path, centers)
    # NOTE(review): this plot receives the same ``centers`` flag, so with
    # centers=True both files are drawn with it; question6 passes False for
    # its second plot - confirm which behaviour is intended.
    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list,
                                          file_to_save, centers)
def question7(URL, number_clusters=9, iterations=5):
    '''
    Print the distortion of hierarchical and of k-means clustering for one
    data set.

    The distortion is the sum of ``cluster_error`` over all clusters
    returned by the respective clustering routine.

    :param URL: data-set location; must contain ``_<digits>.csv`` (the
        digits are printed as the county count).
    :param number_clusters: number of clusters requested from both routines.
    :param iterations: iteration count handed to ``kmeans_clustering``.
    :raises ValueError: if ``URL`` does not contain ``_<digits>.csv``.
    :return: None (both distortions are printed)
    '''
    # Validate the URL before doing any (slow) clustering.
    match = re.search(r'_(\d+)\.csv', URL)
    if match is None:
        raise ValueError("cannot find '_<digits>.csv' in URL: %r" % (URL,))
    counties = match.group(1)

    data_table, singleton_list = alg_project3_viz.run_example(URL)
    cluster_list_hierarchical = hierarchical_clustering(singleton_list,
                                                        number_clusters)
    hierarchical_distortion = sum(cluster.cluster_error(data_table)
                                  for cluster in cluster_list_hierarchical)

    # Reload before the second run, as question10 does before every
    # clustering call.  NOTE(review): presumably hierarchical_clustering
    # modifies the cluster list it receives - confirm; if it does not, this
    # reload is merely redundant.
    data_table, singleton_list = alg_project3_viz.run_example(URL)
    cluster_list_kmeans = kmeans_clustering(singleton_list, number_clusters,
                                            iterations)
    kmeans_distortion = sum(cluster.cluster_error(data_table)
                            for cluster in cluster_list_kmeans)

    print("Displaying distortion of hierarchical clustering for %s counties %s"
          % (counties, hierarchical_distortion))
    print("Displaying distortion of k-means clustering for %s counties %s"
          % (counties, kmeans_distortion))
def gen_random_clusters():
    '''
    Plot the total clustering error against the number of clusters for the
    data set at ``alp.DATA_896_URL``.

    For every cluster count from 6 to 20 ``alp.run_example`` is called
    twice, with the second argument ``True`` and then ``False``.  The summed
    ``cluster_error`` of each result is drawn as the red
    'hierarchical clustering' curve and the blue 'kmeans clustering' curve
    respectively, and the figure is shown.

    :return: None
    '''
    data_table = alp.load_data_table(alp.DATA_896_URL)

    cluster_counts = []
    slow_totals = []
    fast_totals = []
    for count in range(6, 21):
        slow_clusters = alp.run_example(count, True)
        fast_clusters = alp.run_example(count, False)

        slow_total = sum([cluster.cluster_error(data_table)
                          for cluster in slow_clusters])
        fast_total = sum([cluster.cluster_error(data_table)
                          for cluster in fast_clusters])

        cluster_counts.append(count)
        slow_totals.append(slow_total)
        fast_totals.append(fast_total)

    plt.plot(cluster_counts, slow_totals, '-r', label='hierarchical clustering')
    plt.plot(cluster_counts, fast_totals, '-b', label='kmeans clustering')
    plt.legend(loc='upper right')
    plt.title("Quality - Data Set of 896")
    plt.ylabel('Total error')
    plt.xlabel('Number of clusters')
    plt.show()