def test_kmeans_when_k_is_2(dataset, expected1, expected2):
    """Check that k-means with k=2 reproduces the two expected clusters.

    The algorithm is run once and then restarted 10 more times; the
    clustering with the lowest ``kmeans.cost_function`` value is kept and
    compared against the expected clusters.

    Args:
        dataset: Path of the dataset file to cluster.
        expected1: Path of the file holding the expected first cluster.
        expected2: Path of the file holding the expected second cluster.
    """
    expected_clustering1 = kmeans.get_list_from_dataset_file(expected1)
    expected_clustering2 = kmeans.get_list_from_dataset_file(expected2)

    clustering = kmeans.k_means(dataset_file=dataset, k=2)
    cost = kmeans.cost_function(clustering)
    # Restarts: keep whichever run has the lowest cost.
    # NOTE(review): this only helps if k_means is non-deterministic
    # (presumably random initialisation) - confirm.
    for _ in range(10):
        new_clustering = kmeans.k_means(dataset_file=dataset, k=2)
        # Score the *new* run. The original scored `clustering` again, so
        # the comparison below could never succeed and restarts were no-ops.
        new_cost = kmeans.cost_function(new_clustering)
        if new_cost < cost:
            clustering = new_clustering
            cost = new_cost

    assert len(clustering.keys()) == 2
    assert clustered_all_points(
        clustering, kmeans.get_list_from_dataset_file(dataset)) is True

    # Clusters are compared in the iteration order of the clustering mapping.
    clustered = [clustering[assignment] for assignment in clustering]
    assert clustered == [expected_clustering1, expected_clustering2]
def test_kmeans_when_k_is_1(dataset):
    """Check that k-means with k=1 puts every point into a single cluster.

    Args:
        dataset: Path of the dataset file to cluster; its full contents are
            also the expected single cluster.
    """
    whole_dataset = kmeans.get_list_from_dataset_file(dataset)
    result = kmeans.k_means(dataset_file=dataset, k=1)

    cluster_count = len(result.keys())
    assert cluster_count == 1

    all_points = kmeans.get_list_from_dataset_file(dataset)
    assert clustered_all_points(result, all_points) is True

    # Collect the clusters in the iteration order of the result mapping.
    groups = [result[label] for label in result]
    assert groups == [whole_dataset]
def kmeans(self, args):
    """Run the kmeans command.

    Clusters ``args.dataset`` into ``int(args.k)`` groups. The algorithm is
    run once and restarted 100 more times; the clustering with the lowest
    ``kmeans.cost_function`` value is kept. Each cluster is then printed and
    written to ``<dataset without extension>_k_is_<k>_<assignment>.csv``.

    Args:
        args: Parsed command arguments providing ``dataset`` (path of the
            dataset file) and ``k`` (number of clusters, convertible by
            ``int``).
    """
    import csv
    import os

    k = int(args.k)
    # NOTE(review): `kmeans` below resolves to the module-level name
    # (presumably the imported kmeans module), not this method - confirm.
    clustering = kmeans.k_means(dataset_file=args.dataset, k=k)
    cost = kmeans.cost_function(clustering)
    for _ in range(100):
        new_clustering = kmeans.k_means(dataset_file=args.dataset, k=k)
        # Score the *new* run. The original scored `clustering` again, so
        # the comparison below could never succeed and restarts were no-ops.
        new_cost = kmeans.cost_function(new_clustering)
        if new_cost < cost:
            clustering = new_clustering
            cost = new_cost

    # splitext strips only the final extension, so dots in directory names
    # (e.g. "./data/set.csv") no longer truncate the output path.
    base_name = os.path.splitext(str(args.dataset))[0]
    for assignment in clustering:
        # str(args.k) keeps the name building working when k is not a string.
        file_name = (base_name + "_k_is_" + str(args.k) + "_"
                     + str(assignment) + ".csv")
        # newline="" is what the csv module expects for files it writes;
        # the `with` block closes the file, so no explicit close() is needed.
        with open(file_name, "w", newline="") as f:
            writer = csv.writer(f)
            print("assignement ", assignment, " is: ", clustering[assignment])
            writer.writerows(clustering[assignment])
def test_kmeans_when_k_is_3(dataset, expected1, expected2, expected3):
    """Check that k-means with k=3 reproduces the three expected clusters.

    The algorithm is run once and then restarted 3000 more times; the
    clustering with the lowest ``kmeans.cost_function`` value is kept and
    compared against the expected clusters.

    Args:
        dataset: Path of the dataset file to cluster.
        expected1: Path of the file holding the expected first cluster.
        expected2: Path of the file holding the expected second cluster.
        expected3: Path of the file holding the expected third cluster.
    """
    expected_clustering1 = kmeans.get_list_from_dataset_file(expected1)
    expected_clustering2 = kmeans.get_list_from_dataset_file(expected2)
    expected_clustering3 = kmeans.get_list_from_dataset_file(expected3)

    clustering = kmeans.k_means(dataset_file=dataset, k=3)
    cost = kmeans.cost_function(clustering)
    # Restarts: keep whichever run has the lowest cost.
    # NOTE(review): this only helps if k_means is non-deterministic
    # (presumably random initialisation) - confirm.
    for _ in range(3000):
        new_clustering = kmeans.k_means(dataset_file=dataset, k=3)
        # Score the *new* run. The original scored `clustering` again, so
        # the comparison below could never succeed and restarts were no-ops.
        new_cost = kmeans.cost_function(new_clustering)
        if new_cost < cost:
            clustering = new_clustering
            cost = new_cost

    assert len(clustering.keys()) == 3
    assert clustered_all_points(
        clustering, kmeans.get_list_from_dataset_file(dataset)) is True

    # Clusters are compared in the iteration order of the clustering mapping.
    clustered = [clustering[assignment] for assignment in clustering]
    assert clustered == [
        expected_clustering1, expected_clustering2, expected_clustering3
    ]
    #return clustered == [expected_clustering1, expected_clustering2, expected_clustering3]


# Manual invocation left over from local debugging (paths are machine-specific):
# a = "/Users/AaronLee/clustering/tests/test_files/dataset_1.csv"
# b = "/Users/AaronLee/clustering/tests/test_files/dataset_1_k_is_3_0.csv"
# c = "/Users/AaronLee/clustering/tests/test_files/dataset_1_k_is_3_1.csv"
# d = "/Users/AaronLee/clustering/tests/test_files/dataset_1_k_is_3_2.csv"
# x = test_kmeans_when_k_is_3(a,b,c,d)
#
# a1 = "/Users/AaronLee/clustering/tests/test_files/dataset_1.csv"
# b1 = "/Users/AaronLee/clustering/tests/test_files/dataset_1_k_is_2_0.csv"
# c1 = "/Users/AaronLee/clustering/tests/test_files/dataset_1_k_is_2_1.csv"
# x1 = test_kmeans_when_k_is_2(a1,b1,c1)
#
# a2 = "/Users/AaronLee/clustering/tests/test_files/dataset_1.csv"
# x2 = test_kmeans_when_k_is_1(a2)
# Fragment: line breaks were lost and the text is cut off inside the last loop.
# - generateSeparator(length) returns `length` "-" characters followed by "\n".
# - The script then builds a text report in `result`, starting from `head`:
#   for i = 1..5 it appends a 120-dash separator, a "Dataset: ..." header with
#   the centroid count i, and the sorted output of k_means(dataset, i), one
#   tab-separated coordinate row per centroid. It does this first for
#   "inputs/graph.txt" and then for "inputs/iris.data.txt".
# NOTE(review): current_milli_time, head and k_means are not defined in this
# fragment, and the body of the final `for coordinate in centroid:` is missing.
def generateSeparator(length): result = "" for i in range(length): result += "-" result += "\n" return result startTime = current_milli_time() result = head for i in range(1, 6, 1): result += generateSeparator(120) dataset = "inputs/graph.txt" result += "Dataset: " + dataset + "\t\t count centroids: " + str(i) + "\n" centroids = k_means(dataset, i) centroids.sort() for centroid in centroids: for coordinate in centroid: result += str(coordinate) + "\t" result += "\n" result += "\n" for i in range(1, 6, 1): result += generateSeparator(120) dataset = "inputs/iris.data.txt" result += "Dataset: " + dataset + "\t\t count centroids: " + str(i) + "\n" centroids = k_means(dataset, i) centroids.sort() for centroid in centroids: for coordinate in centroid:
# Fragment: line breaks were lost and the trailing vote1 definition is cut off
# inside its docstring.
# - NC is the list of cluster counts to try; list(range(3, 4)) is just [3].
# - accuracies is a zero array of shape (numOfEPS, len(NC)).
# - For each cluster count, k_means(predVec, numOfClusters, "L2", "ZerosFarAway")
#   returns `assignments`; np.where(assignments == c)[0] gives the indices
#   assigned to cluster c. Those index arrays are stored in
#   clusteringResult[numOfClusters], printed, and written to
#   "clustering_result.txt" under a "## number of clusters: N" header.
# NOTE(review): np, numOfEPS, predVec and k_means are defined outside this
# fragment. As collapsed onto one line, everything after the first "#" below is
# a comment; the original line breaks must be restored before this can run.
NC = list(range(3, 4)) #numOfModels+1)) # list of numbers of clusters accuracies = np.zeros((numOfEPS, len(NC))) clusteringResult = {} for numOfClusters in NC: clusteringResult[numOfClusters] = [] with open("clustering_result.txt", "w") as fp: for numOfClusters in NC: # clustering into c groups print("Clustering: {} clusters".format(numOfClusters)) # kmeans = KMeans(n_clusters=numOfClusters, random_state=0).fit(predVec) # for c in range(numOfClusters): # clusteringResult[numOfClusters].append(np.where(kmeans.labels_ == c)[0]) # print(np.where(kmeans.labels_ == c)[0]) assignments = k_means(predVec, numOfClusters, "L2", "ZerosFarAway") fp.write("## number of clusters: " + str(numOfClusters) + "\n") for c in range(numOfClusters): cluster = np.where(assignments == c)[0] clusteringResult[numOfClusters].append(cluster) print(cluster) fp.write("\t" + str(cluster) + "\n") fp.write("\n") def vote1(participants): ''' Input: participants: a list of opinions. Each element in the list is a numpy array, N X 2. N is the number of events. The second dimension contains (opinion/label, confidence) Output:
# Fragment: line breaks were lost and the text is cut off inside the last loop.
# - current_milli_time() returns time.time() * 1000 rounded to an int.
# - generateSeparator(length) returns `length` "-" characters followed by "\n".
# - The script records startTime, then builds a text report in `result`,
#   starting from `head`: for i = 1..5 it appends a 120-dash separator, a
#   "Dataset: ..." header with the centroid count i, and the sorted output of
#   k_means(dataset, i), one tab-separated coordinate row per centroid. It does
#   this first for "inputs/graph.txt" and then for "inputs/iris.data.txt".
# NOTE(review): time, head and k_means are not defined in this fragment, and
# the body of the final `for coordinate in centroid:` is missing.
current_milli_time = lambda: int(round(time.time() * 1000)) def generateSeparator(length): result = "" for i in range(length): result += "-" result += "\n" return result startTime = current_milli_time() result = head for i in range(1, 6, 1): result += generateSeparator(120) dataset = "inputs/graph.txt" result += "Dataset: " + dataset + "\t\t count centroids: " + str(i) + "\n" centroids = k_means(dataset, i) centroids.sort() for centroid in centroids: for coordinate in centroid: result += str(coordinate) + "\t" result+="\n" result += "\n" for i in range(1, 6, 1): result += generateSeparator(120) dataset = "inputs/iris.data.txt" result += "Dataset: " + dataset + "\t\t count centroids: " + str(i) + "\n" centroids = k_means(dataset, i) centroids.sort() for centroid in centroids: for coordinate in centroid: