def test_kmeans_when_k_is_3(datasetPath, expected1, expected2, expected3):
    """Check that k-means with k=3 recovers the three expected clusters.

    ``datasetPath`` and ``expected1`` .. ``expected3`` are passed to
    ``read.read_csv``.  k-means is run once and then restarted ten more
    times; the clustering with the lowest ``kmeans.cost_function`` value is
    kept and compared with the expected clusters.
    """
    # Fixed seed; presumably the k-means code draws from ``random``, which
    # would make the sequence of restarts reproducible.
    random.seed(1)
    dataset = read.read_csv(datasetPath)
    expected_clusters = [
        read.read_csv(path) for path in (expected1, expected2, expected3)
    ]

    clustering = kmeans.k_means(dataset=dataset, k=3)
    cost = kmeans.cost_function(clustering)
    for _ in range(10):
        new_clustering = kmeans.k_means(dataset=dataset, k=3)
        # Score the candidate itself; scoring the current best again would
        # make the comparison below always false and the restarts useless.
        new_cost = kmeans.cost_function(new_clustering)
        if new_cost < cost:
            clustering, cost = new_clustering, new_cost

    assert len(clustering) == 3
    assert clustered_all_points(clustering, dataset) is True

    clustered = [clustering[assignment] for assignment in clustering]
    # Cluster labels are arbitrary, so compare without regard to the order
    # in which clusters appear (assumes rows are mutually comparable).
    assert sorted(clustered) == sorted(expected_clusters)
def test_read(dataset, expected):
    """Check that ``read.read_csv(dataset)`` yields exactly ``expected``.

    The lengths are compared first, then every row is compared with the
    row at the same position in ``expected``.
    """
    rows = read.read_csv(dataset)
    wanted = expected

    assert len(rows) == len(wanted)
    for position, row in enumerate(rows):
        assert row == wanted[position]
def test_kmeans_when_k_is_2(datasetPath, expected1, expected2):
    """Check that k-means++ with k=2 recovers the two expected clusters.

    ``datasetPath``, ``expected1`` and ``expected2`` are passed to
    ``read.read_csv``.  ``kmeans.k_means_pp`` is run once and then restarted
    ten more times; the clustering with the lowest ``kmeans.cost_function``
    value is kept and compared with the expected clusters.
    """
    dataset = read.read_csv(datasetPath)
    expected_clusters = [read.read_csv(expected1), read.read_csv(expected2)]

    clustering = kmeans.k_means_pp(dataset=dataset, k=2)
    cost = kmeans.cost_function(clustering)
    for _ in range(10):
        new_clustering = kmeans.k_means_pp(dataset=dataset, k=2)
        # Score the candidate itself; scoring the current best again would
        # make the comparison below always false and the restarts useless.
        new_cost = kmeans.cost_function(new_clustering)
        if new_cost < cost:
            clustering, cost = new_clustering, new_cost

    assert len(clustering) == 2
    assert clustered_all_points(clustering, dataset) is True

    clustered = [clustering[assignment] for assignment in clustering]
    # ``list.sort()`` returns None, so comparing two ``.sort()`` results is
    # always ``None == None``.  ``sorted`` returns the ordered lists, giving
    # a real comparison that ignores the arbitrary order of cluster labels.
    assert sorted(clustered) == sorted(expected_clusters)
def test_kmeans_when_k_is_1(datasetPath):
    """Check that k-means with k=1 puts every point into a single cluster.

    The one resulting cluster must equal the dataset as returned by
    ``read.read_csv(datasetPath)``.
    """
    dataset = read.read_csv(datasetPath)
    clustering = kmeans.k_means(dataset=dataset, k=1)

    assert len(clustering.keys()) == 1
    assert clustered_all_points(clustering, dataset) is True

    # Collect the clusters in the order their labels appear in the mapping.
    clusters = [clustering[label] for label in clustering]
    single_cluster = dataset
    assert clusters == [single_cluster]
def test_read(dataset):
    """Check ``read.read_csv(dataset)`` against a fixed table of 49 rows.

    The lengths are compared first, then every row is compared with the
    row at the same position in the expected table.
    """
    wanted = [
        [138, 143], [93, 104], [61, 69], [179, 260], [48, 75],
        [37, 63], [29, 50], [23, 48], [30, 111], [2, 50],
        [38, 52], [46, 53], [71, 79], [25, 57], [298, 317],
        [74, 93], [50, 58], [76, 80], [381, 464], [387, 459],
        [78, 106], [60, 57], [507, 634], [50, 64], [77, 89],
        [64, 77], [40, 60], [136, 139], [243, 291], [256, 288],
        [94, 85], [36, 46], [45, 53], [67, 67], [120, 115],
        [172, 183], [66, 86], [46, 65], [121, 113], [44, 58],
        [64, 63], [56, 142], [40, 64], [116, 130], [87, 105],
        [43, 61], [43, 50], [161, 232], [36, 54],
    ]
    rows = read.read_csv(dataset)

    assert len(rows) == len(wanted)
    for position, row in enumerate(rows):
        assert row == wanted[position]
def kmeans(self, args):
    """Run the kmeans command.

    Reads the dataset named by ``args.dataset_file``, runs k-means with
    ``k = int(args.k)`` once plus 100 restarts, keeps the clustering with the
    lowest ``kmeans.cost_function`` value, and writes each cluster to its own
    CSV file named ``<stem>_k_is_<k>_<assignment>.csv``.
    """
    import csv
    import os

    k = int(args.k)
    dataset = read.read_csv(args.dataset_file)

    clustering = kmeans.k_means(dataset=dataset, k=k)
    cost = kmeans.cost_function(clustering)
    for _ in range(100):
        new_clustering = kmeans.k_means(dataset=dataset, k=k)
        # Score the candidate itself; scoring the current best again would
        # make the comparison below always false and the restarts useless.
        new_cost = kmeans.cost_function(new_clustering)
        if new_cost < cost:
            clustering, cost = new_clustering, new_cost

    # NOTE(review): the input is read from ``args.dataset_file`` while the
    # output name was built from ``args.dataset``.  Keep using ``dataset``
    # when the namespace provides it, otherwise fall back to the input path
    # instead of failing with AttributeError -- confirm against the parser.
    source_name = str(getattr(args, "dataset", args.dataset_file))
    # splitext drops only the final extension, so paths such as "./data.csv"
    # keep their stem (``split(".")[0]`` would yield an empty string).
    stem = os.path.splitext(source_name)[0]

    for assignment, points in clustering.items():
        file_name = f"{stem}_k_is_{args.k}_{assignment}.csv"
        # newline="" lets the csv writer control line endings itself.
        with open(file_name, "w", newline="") as f:
            writer = csv.writer(f)
            print("assignement ", assignment, " is: ", points)
            writer.writerows(points)
while assignments != old_assignments: new_centers = update_centers(dataset, assignments) old_assignments = assignments assignments = assign_points(dataset, new_centers) clustering = defaultdict(list) for assignment, point in zip(assignments, dataset): clustering[assignment].append(point) return clustering def k_means(dataset, k): if k not in range(1, len(dataset)+1): raise ValueError("lengths must be in [1, len(dataset)]") k_points = generate_k(dataset, k) return _do_lloyds_algo(dataset, k_points) def k_means_pp(dataset, k): if k not in range(1, len(dataset)+1): raise ValueError("lengths must be in [1, len(dataset)]") k_points = generate_k_pp(dataset, k) return _do_lloyds_algo(dataset, k_points) if __name__ =='__main__': from cs506 import read data = read.read_csv('D:/OneDrive/College Notebook/Boston University/Fall Senior Year/CS 506/CS506-Fall2020/02-library/tests/test_files/dataset_1.csv') res = (k_means(data, 4)) print(res[0])