def templateLengthProcessData(data, start_medians, expected_cluster_length, ccore, **kwargs):
    """Run k-medians on `data` and validate the resulting allocation.

    `data` may be a path to a sample file or an already-loaded point list.
    Recognized kwargs: `tolerance` (default 0.01), `metric` (default None),
    `itermax` (default 200).  With `itermax == 0` the algorithm must produce
    no clusters and leave the initial medians untouched.
    """
    tolerance = kwargs.get('tolerance', 0.01)
    metric = kwargs.get('metric', None)
    itermax = kwargs.get('itermax', 200)

    sample = read_sample(data) if isinstance(data, str) else data

    instance = kmedians(sample, start_medians, tolerance, ccore, metric=metric, itermax=itermax)
    instance.process()

    clusters = instance.get_clusters()
    medians = instance.get_medians()

    if itermax == 0:
        # Zero iterations: nothing is allocated and the medians stay as given.
        assert clusters == []
        assert start_medians == medians
        return

    sizes = [len(cluster) for cluster in clusters]
    # Every point must be allocated exactly once, one median per cluster.
    assert sum(sizes) == len(sample)
    assert len(medians) == len(clusters)

    if expected_cluster_length is not None:
        sizes.sort()
        expected_cluster_length.sort()
        if sizes != expected_cluster_length:
            print(sizes)  # diagnostic output ahead of the failing assert
        assert sizes == expected_cluster_length
def templateClusterAllocationOneDimensionData(self):
    """Four well-separated 1-D groups must each land in their own cluster."""
    input_data = ([[random()] for _ in range(10)]
                  + [[random() + 3] for _ in range(10)]
                  + [[random() + 5] for _ in range(10)]
                  + [[random() + 8] for _ in range(10)])

    instance = kmedians(input_data, [[0.0], [3.0], [5.0], [8.0]], 0.025)
    instance.process()
    clusters = instance.get_clusters()

    # One cluster per group, ten points in each.
    assert len(clusters) == 4
    assert all(len(cluster) == 10 for cluster in clusters)
def template_clustering(start_centers, path, tolerance=0.25):
    """Cluster the sample stored at `path`, report timing and draw the result."""
    sample = read_sample(path)
    instance = kmedians(sample, start_centers, tolerance)

    # Time only the processing step, then fetch the allocation.
    (ticks, result) = timedcall(instance.process)
    clusters = instance.get_clusters()

    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")
    draw_clusters(sample, clusters)
def templateLengthProcessData(self, path_to_file, start_centers, expected_cluster_length):
    """Verify that clustering `path_to_file` yields the expected cluster sizes."""
    sample = read_sample(path_to_file)

    instance = kmedians(sample, start_centers, 0.025)
    instance.process()
    clusters = instance.get_clusters()

    # All points must be allocated, and the size multiset must match.
    sizes = sorted(len(cluster) for cluster in clusters)
    assert sum(sizes) == len(sample)
    expected_cluster_length.sort()
    assert sizes == expected_cluster_length
def templateClusterAllocationTheSameObjects(self, number_objects, number_clusters, ccore_flag = False):
    """Cluster `number_objects` identical 1-D points into `number_clusters`
    clusters and verify every object is allocated exactly once.

    Fix: `ccore_flag` was accepted but never forwarded to `kmedians`, so the
    C-core code path was silently never exercised by this template; it is now
    passed through, matching the module-level variant of this template.
    """
    value = random()
    input_data = [[value]] * number_objects
    initial_centers = [[random()] for _ in range(number_clusters)]

    kmedians_instance = kmedians(input_data, initial_centers, ccore=ccore_flag)
    kmedians_instance.process()
    clusters = kmedians_instance.get_clusters()

    object_mark = [False] * number_objects
    allocated_number_objects = 0
    for cluster in clusters:
        for index_object in cluster:
            # One object can be in only one cluster.
            assert object_mark[index_object] == False
            object_mark[index_object] = True
            allocated_number_objects += 1

    # Number of allocated objects should be the same.
    assert number_objects == allocated_number_objects
def templateClusterAllocationTheSameObjects(number_objects, number_clusters, ccore_flag):
    """Cluster duplicated identical points and check that every object
    appears in exactly one cluster."""
    value = random()
    points = [[value]] * number_objects
    centers = [[random()] for _ in range(number_clusters)]

    instance = kmedians(points, centers, ccore=ccore_flag)
    instance.process()

    seen = [False] * number_objects
    total_allocated = 0
    for cluster in instance.get_clusters():
        for index in cluster:
            # one object can be in only one cluster.
            assert seen[index] is False
            seen[index] = True
            total_allocated += 1

    # number of allocated objects should be the same.
    assert number_objects == total_allocated
def testDifferentDimensions(self):
    """Mismatched data/center dimensions must raise NameError on processing."""
    instance = kmedians([[0, 1, 5], [0, 2, 3]], [[0, 3]])
    self.assertRaises(NameError, instance.process)
def testCoreInterfaceIntInputData(self):
    """Integer-valued points must be accepted by the C-core implementation."""
    instance = kmedians([[1], [2], [3], [20], [21], [22]], [[2], [21]], ccore=True)
    instance.process()
    # Two well-separated groups -> two clusters.
    assert len(instance.get_clusters()) == 2
def process_kmedians(sample):
    """Time one k-medians run over `sample` with randomized initial centers
    and return the elapsed ticks."""
    # NOTE(review): the first coordinate offsets by `multiplier * 5` but the
    # second by `multiplier + 5` — possibly a `+`/`*` typo; confirm the
    # intended placement of the starting centers.
    centers = [[random() + (multiplier * 5), random() + (multiplier + 5)]
               for multiplier in range(NUMBER_CLUSTERS)]

    instance = kmedians(sample, centers)
    (ticks, _) = timedcall(instance.process)
    return ticks
def testDifferentDimensions(self):
    """Dimension mismatch must raise NameError in the pure-Python path."""
    instance = kmedians([[0, 1, 5], [0, 2, 3]], [[0, 3]], ccore=False)
    self.assertRaises(NameError, instance.process)
def testTotalWCESimple4(self):
    """Total within-cluster error for two separated groups must match the
    hand-computed value of 16.0.

    Fixes two defects in the original:
    - `self` was passed as the first argument to `assertNotEqual`, so the
      call compared the test case object against the WCE (with 16.0 used as
      the failure message) and could never fail.
    - `process()` was never invoked, so no WCE was ever computed.
    With medians [0, 3, 2] and [4, 6, 5] the points split into
    {[0,1,5], [0,2,3]} and {[7,8,9], [4,5,6]}; the converged medians are
    [0, 1.5, 4] and [5.5, 6.5, 7.5], giving a squared-Euclidean WCE of
    1.25 + 1.25 + 6.75 + 6.75 = 16.0.
    """
    sample = [[0, 1, 5], [7, 8, 9], [0, 2, 3], [4, 5, 6]]
    initial_medians = [[0, 3, 2], [4, 6, 5]]

    kmedians_instance = kmedians(sample, initial_medians, ccore=False)
    kmedians_instance.process()

    self.assertAlmostEqual(kmedians_instance.get_total_wce(), 16.0)
# Clean up our visual ax.set(title='Elbow Plot', xlabel='Number of Clusters', ylabel='Total distance') sns.despine(offset=5, trim=True) # Let's now run a k-medians (instead of mean distance, let's calculate median distance) # We can do this efficiently with the pyclustering library. # If you don't have it installed, install it! #python -m pip install pyclustering from pyclustering.cluster.kmedians import kmedians from pyclustering.cluster import cluster_visualizer # Create instance of K-Medians algorithm. initial_medians = [[0.0, 0.1], [2.5, 0.7], [3.5, 1.5]] kmedians_instance = kmedians(xt, initial_medians) # Run cluster analysis and obtain results. kmedians_instance.process() clusters = kmedians_instance.get_clusters() medians = kmedians_instance.get_medians() # How well did it do? # Sum of metric errors is calculated using distance between point and its center print(kmedians_instance.get_total_wce()) # Visualize clustering results. visualizer = cluster_visualizer() visualizer.append_clusters(clusters, xt) #visualizer.append_cluster(initial_medians, marker='*', markersize=10) visualizer.append_cluster(medians, marker='*', markersize=10, color='k') visualizer.show()