def process(self):
    """!
    @brief Performs cluster analysis in line with rules of BIRCH algorithm.
    @details Centroids of the extracted features are grouped by single-link agglomerative clustering,
              then every input point is placed into the cluster of the centroid with the smallest
              squared Euclidean distance to it.

    @return (birch) Returns itself (BIRCH instance).

    @see get_clusters()

    """
    self.__insert_data()
    self.__extract_features()

    centroids = [entry.get_centroid() for entry in self.__features]

    self.__cf_clusters = agglomerative(
        centroids, self.__number_clusters, type_link.SINGLE_LINK
    ).process().get_clusters()

    # Convert "list of centroid indexes per cluster" into "cluster label per centroid".
    label_encoder = cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, self.__cf_clusters, centroids)
    cf_labels = label_encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING).get_clusters()

    self.__clusters = [[] for _ in self.__cf_clusters]
    for point_index, point in enumerate(self.__pointer_data):
        squared_distances = numpy.sum(numpy.square(numpy.subtract(centroids, point)), axis=1)
        nearest_entry = numpy.argmin(squared_distances)
        self.__clusters[cf_labels[nearest_entry]].append(point_index)

    return self
def getIndexRepresentorTwoDimensionData(self):
    """Return an index-list encoder over eight 2-D points split into two clusters of four."""
    points = [
        [5.1, 5.2], [5.2, 5.1], [5.4, 5.2], [5.1, 5.0],
        [8.1, 8.0], [8.4, 8.2], [8.3, 8.4], [8.5, 8.5],
    ]
    index_clusters = [[0, 1, 2, 3], [4, 5, 6, 7]]
    return cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, index_clusters, points)
def aggl_cluster(df, n_clusters, link, hover_text):
    """Run agglomerative clustering on ``df`` and return a scatter plot of the labelled points.

    Args:
        df: DataFrame holding the feature columns plus the ``hover_text`` column.
        n_clusters: Number of clusters requested from the agglomerative algorithm.
        link: Linkage name: ``"centroid"``, ``"single"`` or ``"complete"``;
            any other value selects average linkage.
        hover_text: Name of the column excluded from the clustered data and
            forwarded to the plotting helper.

    Returns:
        The result of ``scat2d`` when exactly two feature columns remain,
        otherwise the result of ``scat3d``.
    """
    datadf = df.loc[:, df.columns != hover_text]
    # NOTE(review): the int64 cast truncates fractional feature values - confirm this is intended.
    data_list = datadf.to_numpy(dtype="int64").tolist()

    # All branches use the ``type_link`` enum directly; ``agglomerative`` is the
    # algorithm class called below and does not expose ``type_link`` as an attribute.
    if link == "centroid":
        typelink = type_link.CENTROID_LINK
    elif link == "single":
        typelink = type_link.SINGLE_LINK
    elif link == "complete":
        typelink = type_link.COMPLETE_LINK
    else:
        typelink = type_link.AVERAGE_LINK

    aggl_instance = agglomerative(data_list, n_clusters, typelink)
    aggl_instance.process()
    clusters = aggl_instance.get_clusters()
    reps = aggl_instance.get_cluster_encoding()

    # Re-encode the clusters as one label per data point.
    encoder = cluster_encoder(reps, clusters, data_list)
    encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)
    label = np.array(encoder.get_clusters(), dtype='int32')

    data_array = np.array(data_list)
    if len(datadf.columns) == 2:
        return scat2d(data_array, label, hover_text, df)
    return scat3d(data_array, label, hover_text, df)
def getIndexRepresentorDoubleData(self):
    """Return an index-list encoder over eight float values split into two clusters of four."""
    values = [5.4562, 5.1235, 4.9235, 4.8712, 8.3451, 8.4215, 8.6535, 8.7345]
    index_clusters = [[0, 1, 2, 3], [4, 5, 6, 7]]
    return cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, index_clusters, values)
def testLabelsToIndexListAndObjectListMissedPoint(self):
    """Check that a NaN-labelled point is left out when labels are converted to index/object lists."""
    labels = [0, 0, float('NaN'), 1, 1]
    points = [[5.1, 5.2], [5.2, 5.1], [14.1, 76.0], [8.1, 8.0], [8.4, 8.2]]

    # (target encoding, clusters expected after conversion)
    conversions = (
        (type_encoding.CLUSTER_INDEX_LIST_SEPARATION,
         [[0, 1], [3, 4]]),
        (type_encoding.CLUSTER_OBJECT_LIST_SEPARATION,
         [[[5.1, 5.2], [5.2, 5.1]], [[8.1, 8.0], [8.4, 8.2]]]),
    )

    for target_encoding, expected in conversions:
        # A fresh encoder per conversion, always starting from the label representation.
        encoder = cluster_encoder(type_encoding.CLUSTER_INDEX_LABELING, labels, points)
        encoder.set_encoding(target_encoding)

        actual = encoder.get_clusters()
        self.assertEqual(len(expected), len(actual))
        self.assertEqual(expected, actual)
def templateEncoderProcedures(ccore_flag):
    """Cluster SAMPLE_SIMPLE3 with CURE and cycle the result through all three cluster encodings."""
    points = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

    algorithm = cure(points, 4, 5, 0.5, ccore=ccore_flag)
    algorithm.process()
    clusters = algorithm.get_clusters()

    encoder = cluster_encoder(algorithm.get_cluster_encoding(), clusters, points)
    for target_encoding in (
        type_encoding.CLUSTER_INDEX_LABELING,
        type_encoding.CLUSTER_OBJECT_LIST_SEPARATION,
        type_encoding.CLUSTER_INDEX_LIST_SEPARATION,
    ):
        encoder.set_encoding(target_encoding)

    assert 4 == len(clusters)
def templateEncoderProcedures(filename, initial_centers, number_clusters, ccore_flag):
    """Cluster the sample in ``filename`` with K-Means and cycle the result through all cluster encodings.

    The number of clusters produced must equal ``number_clusters``.
    """
    points = read_sample(filename)

    algorithm = kmeans(points, initial_centers, 0.025, ccore_flag)
    algorithm.process()
    clusters = algorithm.get_clusters()

    encoder = cluster_encoder(algorithm.get_cluster_encoding(), clusters, points)
    encoding_sequence = (
        type_encoding.CLUSTER_INDEX_LABELING,
        type_encoding.CLUSTER_OBJECT_LIST_SEPARATION,
        type_encoding.CLUSTER_INDEX_LIST_SEPARATION,
    )
    for target_encoding in encoding_sequence:
        encoder.set_encoding(target_encoding)

    assertion.eq(number_clusters, len(clusters))
def templateEncoderProcedures(filename, initial_centers, number_clusters, ccore_flag):
    """Run K-Means on the sample file, then switch the cluster encoder through every encoding.

    Finishes by checking that exactly ``number_clusters`` clusters were allocated.
    """
    data = read_sample(filename)

    clusterer = kmeans(data, initial_centers, 0.025, ccore_flag)
    clusterer.process()

    clusters = clusterer.get_clusters()
    source_encoding = clusterer.get_cluster_encoding()

    encoder = cluster_encoder(source_encoding, clusters, data)
    # labels -> object lists -> index lists
    encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)
    encoder.set_encoding(type_encoding.CLUSTER_OBJECT_LIST_SEPARATION)
    encoder.set_encoding(type_encoding.CLUSTER_INDEX_LIST_SEPARATION)

    assertion.eq(number_clusters, len(clusters))
def templateEncoderProcedures(ccore_flag):
    """Run CURE on SAMPLE_SIMPLE3, then switch the cluster encoder through every encoding.

    Finishes by checking that four clusters were allocated.
    """
    data = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

    clusterer = cure(data, 4, 5, 0.5, ccore=ccore_flag)
    clusterer.process()

    clusters = clusterer.get_clusters()
    source_encoding = clusterer.get_cluster_encoding()

    encoder = cluster_encoder(source_encoding, clusters, data)
    # labels -> object lists -> index lists
    encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)
    encoder.set_encoding(type_encoding.CLUSTER_OBJECT_LIST_SEPARATION)
    encoder.set_encoding(type_encoding.CLUSTER_INDEX_LIST_SEPARATION)

    assertion.eq(4, len(clusters))
def templateEncoderProcedures(sample, initial_centers, number_clusters, ccore_flag):
    """Cluster the sample file ``sample`` with K-Means and cycle the result through all cluster encodings.

    The number of clusters produced must equal ``number_clusters``.
    """
    points = read_sample(sample)

    kmeans_instance = kmeans(points, initial_centers, 0.025, ccore_flag)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()

    encoder = cluster_encoder(kmeans_instance.get_cluster_encoding(), clusters, points)
    for target_encoding in (
        type_encoding.CLUSTER_INDEX_LABELING,
        type_encoding.CLUSTER_OBJECT_LIST_SEPARATION,
        type_encoding.CLUSTER_INDEX_LIST_SEPARATION,
    ):
        encoder.set_encoding(target_encoding)

    assert number_clusters == len(clusters)
def testObjectListToLabelsMissedPoint(self):
    """Check that a point absent from every object-list cluster is labelled NaN."""
    points = [[5.1, 5.2], [5.2, 5.1], [14.1, 76.0], [8.1, 8.0], [8.4, 8.2]]
    object_clusters = [[[5.1, 5.2], [5.2, 5.1]], [[8.1, 8.0], [8.4, 8.2]]]

    encoder = cluster_encoder(type_encoding.CLUSTER_OBJECT_LIST_SEPARATION, object_clusters, points)
    encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)

    expected = [0, 0, float('NaN'), 1, 1]
    actual = encoder.get_clusters()
    self.assertEqual(len(expected), len(actual))

    # NaN never compares equal to itself, so NaN labels are checked with isnan.
    for expected_label, actual_label in zip(expected, actual):
        if math.isnan(expected_label):
            self.assertTrue(math.isnan(actual_label))
        else:
            self.assertEqual(expected_label, actual_label)
def testIndexListToLabelsMissedPoint(self):
    """Check that a point absent from every index-list cluster is labelled NaN."""
    points = [[5.1, 5.2], [5.2, 5.1], [5.4, 5.2], [5.1, 5.0], [8.1, 8.0], [8.4, 8.2], [8.3, 8.4], [8.5, 8.5]]
    # Index 7 (the last point) is deliberately left out of both clusters.
    index_clusters = [[0, 1, 2, 3], [4, 5, 6]]

    encoder = cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, index_clusters, points)
    encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)

    expected = [0, 0, 0, 0, 1, 1, 1, float('NaN')]
    actual = encoder.get_clusters()
    self.assertEqual(len(expected), len(actual))

    # NaN never compares equal to itself, so NaN labels are checked with isnan.
    for expected_label, actual_label in zip(expected, actual):
        if math.isnan(expected_label):
            self.assertTrue(math.isnan(actual_label))
        else:
            self.assertEqual(expected_label, actual_label)
def dbscan_cluster(df, eps, neighbours, hover_text):
    """Run DBSCAN on ``df`` and return a scatter plot of the labelled points.

    Every column except ``hover_text`` is clustered (cast to int64). The
    result comes from ``scat2d`` when exactly two feature columns remain and
    from ``scat3d`` otherwise.
    """
    features = df.loc[:, df.columns != hover_text]
    points = features.to_numpy(dtype="int64").tolist()

    algorithm = dbscan(points, eps, neighbours)
    algorithm.process()

    # Re-encode the clusters as one label per data point.
    encoder = cluster_encoder(algorithm.get_cluster_encoding(), algorithm.get_clusters(), points)
    encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)
    labels = np.array(encoder.get_clusters(), dtype='int32')

    point_array = np.array(points)
    plotter = scat2d if len(features.columns) == 2 else scat3d
    return plotter(point_array, labels, hover_text, df)
def kmeans_cluster(df, n_clusters, tolerance, metric, hover_text):
    """Run K-Means on ``df`` and return a scatter plot of the labelled points.

    Args:
        df: DataFrame holding the feature columns plus the ``hover_text`` column.
        n_clusters: Number of initial centers generated with k-means++.
        tolerance: Stop tolerance forwarded to ``kmeans``.
        metric: ``"manhattan"`` selects the Manhattan metric; any other value
            selects squared Euclidean distance.
        hover_text: Name of the column excluded from the clustered data and
            forwarded to the plotting helper.

    Returns:
        The result of ``scat2d`` when exactly two feature columns remain,
        otherwise the result of ``scat3d``.
    """
    datadf = df.loc[:, df.columns != hover_text]
    # NOTE(review): the int64 cast truncates fractional feature values - confirm this is intended.
    data_list = datadf.to_numpy(dtype="int64").tolist()

    if metric == "manhattan":
        distance = distance_metric(type_metric.MANHATTAN)
    else:
        distance = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    centers = kmeans_plusplus_initializer(data_list, n_clusters).initialize()

    # The metric must be passed by keyword: the fourth positional parameter of
    # ``kmeans`` is the ccore flag, so a positional metric was silently ignored.
    kmeans_instance = kmeans(data_list, centers, tolerance, metric=distance)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()
    reps = kmeans_instance.get_cluster_encoding()

    # Re-encode the clusters as one label per data point.
    encoder = cluster_encoder(reps, clusters, data_list)
    encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)
    label = np.array(encoder.get_clusters(), dtype='int32')

    data_array = np.array(data_list)
    if len(datadf.columns) == 2:
        return scat2d(data_array, label, hover_text, df)
    return scat3d(data_array, label, hover_text, df)
def getIndexRepresentor(self):
    """Return an index-list encoder over eight integer values split into two clusters of four."""
    values = [10, 11, 13, 12, 64, 65, 65, 68]
    index_clusters = [[0, 1, 2, 3], [4, 5, 6, 7]]
    return cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, index_clusters, values)
def cluster_nodes(visualisation=False):
    """Cluster every scenario data file with k-medoids and collect medoid-to-member distances.

    For each scenario file the sample is clustered for k = 2..10 using the
    Manhattan metric. The initial medoids are the sample points whose first two
    coordinates match a k-means++ initial center. The clustering with the
    highest silhouette value (as returned by ``calculate_silhouette``) is kept,
    optionally visualised, and the Manhattan distance from each medoid to every
    other member of its cluster is recorded. Timing information is printed
    along the way.

    Args:
        visualisation: When true, show a plot of the best clustering of each
            scenario; the flag is also forwarded to ``calculate_silhouette``.

    Returns:
        A list with one entry per scenario. Each entry is a list with one
        entry per cluster, which in turn is a list of
        ``[medoid_index, point_index, distance]`` lists.
    """
    scenarios = ["scenario1.data", "scenario2.data", "scenario3.data", "scenario4.data", "scenario5.data"]

    # abspath() of a bare file name resolves against the current working
    # directory, so the scenario files are looked up there.
    base_dir = os.path.dirname(os.path.abspath("kmedoids_cluster_nodes.py"))

    scenarioClustersDistanceList = []

    for scenarioIndex, scenario in enumerate(scenarios):
        total_time_start = time.perf_counter()
        total_wall_time_start = time.time()

        samplePath = base_dir + os.sep + scenario
        sample = read_sample(samplePath)

        print("\nScenario", scenarioIndex+1, "\nSample:", samplePath, "\n")

        # Use Manhattan distance
        metric = distance_metric(type_metric.MANHATTAN)

        # One tuple per k: (silhouette, k, execution time, wall time, clusters, medoids)
        silhouettes = []

        # Run clustering for each k, calculate the silhouette value each time
        # and choose the clustering with the best value afterwards.
        for k in range(2, 11):
            # Initialise the clustering algorithm with the k-means++ algorithm.
            # Initialise the random generator with a seed for reproducibility.
            random.seed(35334096)
            initial_points = kmeans_plusplus_initializer(sample, k).initialize()

            # Translate the initial center points back into indexes of the sample.
            # NOTE(review): only the first two coordinates are compared, and
            # sample.index() returns the first equal point, so duplicate points
            # in the sample yield repeated medoid indexes - confirm this is acceptable.
            initial_medoids = []
            for point in sample:
                for initial_point in initial_points:
                    if point[0] == initial_point[0] and point[1] == initial_point[1]:
                        initial_medoids.append(sample.index(point))

            # Initiate the k-medoids algorithm with the sample and the initial medoids
            kmedoids_instance = kmedoids(sample, initial_medoids, 0.001, metric=metric, ccore=True)

            # Time only the clustering itself (performance counter and wall clock).
            time_start = time.perf_counter()
            wall_time_start = time.time()
            kmedoids_instance.process()
            time_end = time.perf_counter()
            wall_time_end = time.time()

            clustering_time = time_end - time_start
            clustering_wall_time = wall_time_end - wall_time_start
            print("Execution time for clustering for k=" + str(k) + ":", clustering_time,
                  "\nWall time for clustering for k=" + str(k) + ":", clustering_wall_time)

            clusters = kmedoids_instance.get_clusters()
            medoids = kmedoids_instance.get_medoids()

            # Change the representation from index lists to one label per sample point.
            type_repr = kmedoids_instance.get_cluster_encoding()
            encoder = cluster_encoder(type_repr, clusters, sample)
            encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)
            cluster_labels = encoder.get_clusters()

            # Coordinates of the medoids (copied point by point).
            medoidPoints = [list(sample[index]) for index in medoids]

            # Calculate the silhouette value
            silhouette = calculate_silhouette(sample, cluster_labels, medoidPoints, k, scenarioIndex, visualisation)
            silhouettes.append((silhouette, k, clustering_time, clustering_wall_time, clusters, medoids))

        best_silhouette, best_k, best_time, best_wall_time, best_clusters, best_medoids = max(silhouettes, key=itemgetter(0))
        print("The best silhouette value of", best_silhouette, "was achieved with k=" + str(best_k) + "\nExecution time of best clustering: " + str(best_time) + "\nWall time of best clustering: " + str(best_wall_time))

        # Generate visualisation
        if visualisation:
            title = "K-medoids clustering - Scenario " + str(scenarioIndex+1)
            visualizer = cluster_visualizer(1, titles=[title])
            visualizer.append_clusters(best_clusters, sample, 0)
            visualizer.append_cluster(best_medoids, data=sample, marker='*', markersize=15, color="black")
            visualizer.show(visible_axis=False, visible_grid=False)

        # Post-processing
        # Calculate Manhattan distance from medoid to all points in the cluster.
        # A new metric object is created here, separate from the one handed to kmedoids.
        metric = distance_metric(type_metric.MANHATTAN)

        clusterList = []
        for index, cluster in enumerate(best_clusters):
            medoidIndex = best_medoids[index]
            medoidPoint = sample[medoidIndex]

            nodeList = []
            for currentClusterIndex in cluster:
                # Make sure not to compare the medoid to itself
                if medoidIndex != currentClusterIndex:
                    currentClusterPoint = sample[currentClusterIndex]
                    distance = metric(medoidPoint, currentClusterPoint)
                    # Record the medoid index, the index of the current point and their distance
                    nodeList.append([medoidIndex, currentClusterIndex, distance])

            clusterList.append(nodeList)

        scenarioClustersDistanceList.append(clusterList)

        total_time_end = time.perf_counter()
        total_wall_time_end = time.time()
        print("\nTotal scenario execution time:", total_time_end - total_time_start, "\nTotal scenario wall time:", total_wall_time_end - total_wall_time_start, "\n\n----")

    return scenarioClustersDistanceList


# K-medoids clustering using distance matrix
def getIndexRepresentorTwoDimensionData(self):
    """Build an index-list encoder for two clusters of four 2-D points each."""
    first_group = [[5.1, 5.2], [5.2, 5.1], [5.4, 5.2], [5.1, 5.0]]
    second_group = [[8.1, 8.0], [8.4, 8.2], [8.3, 8.4], [8.5, 8.5]]
    points = first_group + second_group

    index_clusters = [[0, 1, 2, 3], [4, 5, 6, 7]]
    return cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, index_clusters, points)
def getIndexRepresentor(self):
    """Build an index-list encoder for two clusters of four integer values each."""
    index_clusters = [
        [0, 1, 2, 3],
        [4, 5, 6, 7],
    ]
    values = [10, 11, 13, 12, 64, 65, 65, 68]
    encoder = cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, index_clusters, values)
    return encoder
def getIndexRepresentorDoubleData(self):
    """Build an index-list encoder for two clusters of four float values each."""
    index_clusters = [
        [0, 1, 2, 3],
        [4, 5, 6, 7],
    ]
    values = [5.4562, 5.1235, 4.9235, 4.8712, 8.3451, 8.4215, 8.6535, 8.7345]
    encoder = cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, index_clusters, values)
    return encoder