def testVisualize2DAnd3DClusters(self):
    """Show SAMPLE_SIMPLE1 and SAMPLE_HEPTA on canvases 0 and 1 of one visualizer.

    Each sample is appended as a single cluster with its own marker size.
    """
    # (data, marker size) per canvas, in canvas order.
    datasets = [
        (read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1), 5),
        (read_sample(FCPS_SAMPLES.SAMPLE_HEPTA), 30),
    ]

    visualizer = cluster_visualizer(2, 2)
    for canvas, (points, size) in enumerate(datasets):
        visualizer.append_clusters([points], None, canvas, markersize=size)
    visualizer.show()
def testVisualize1DClustersTwoCanvases(self):
    """Show SAMPLE_SIMPLE7 and SAMPLE_SIMPLE8 on two canvases of one visualizer."""
    samples = [
        read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE7),
        read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE8),
    ]

    # Two canvas visualization: one canvas per sample.
    visualizer = cluster_visualizer(2)
    for canvas, points in enumerate(samples):
        visualizer.append_clusters([points], None, canvas, markersize=30)
    visualizer.show()
def testVisualize3DClustersTwoCanvases(self):
    """Show SAMPLE_TETRA and SAMPLE_HEPTA on two canvases of one visualizer."""
    samples = [
        read_sample(FCPS_SAMPLES.SAMPLE_TETRA),
        read_sample(FCPS_SAMPLES.SAMPLE_HEPTA),
    ]

    # Two canvas visualization: one canvas per sample.
    visualizer = cluster_visualizer(2)
    for canvas, points in enumerate(samples):
        visualizer.append_clusters([points], None, canvas, markersize=30)
    visualizer.show()
def testVisualizeRectangeRepresentation2x2(self):
    """Show three simple samples on a `cluster_visualizer(3, 2)` layout."""
    samples = [
        read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1),
        read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE2),
        read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3),
    ]

    visualizer = cluster_visualizer(3, 2)
    for canvas, points in enumerate(samples):
        visualizer.append_clusters([points], None, canvas, markersize=5)
    visualizer.show()
def testAllocatedRequestedClustersSampleSimple03(self):
    """Run the requested-cluster-amount template on SAMPLE_SIMPLE3 for several amounts."""
    sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

    for requested_amount in (2, 5, 8, 10, 15):
        KmedoidsTestTemplates.templateAllocateRequestedClusterAmount(
            sample, requested_amount, None, False)
def templateClusterAllocation(self, path, cluster_sizes, number_clusters, iterations, maxneighbors):
    """Run `clarans` on the sample and require the expected cluster sizes.

    Up to five attempts are made; the test passes as soon as one attempt
    covers every point and yields sizes equal to `cluster_sizes`.
    Note that `cluster_sizes` is sorted in place.
    """
    succeeded = False

    # The algorithm is randomized, therefore several attempts are allowed.
    for _ in range(5):
        sample = read_sample(path)

        instance = clarans(sample, number_clusters, iterations, maxneighbors)
        instance.process()

        sizes = [len(members) for members in instance.get_clusters()]
        if sum(sizes) != len(sample):
            continue

        cluster_sizes.sort()
        sizes.sort()
        if cluster_sizes == sizes:
            succeeded = True
            break

    assert succeeded
def templateLengthProcessData(path_to_file, start_centers, expected_cluster_length, ccore, **kwargs):
    """Run `kmeans` on a sample file and validate clusters, centers and WCE.

    Optional keyword arguments: `metric` (default: EUCLIDEAN_SQUARE distance
    metric) and `itermax` (default: 200). When `itermax` is 0 the centers must
    equal `start_centers`, no clusters are expected and WCE must be 0.0.
    `expected_cluster_length`, when given, is sorted in place.
    """
    sample = read_sample(path_to_file)

    metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
    itermax = kwargs.get('itermax', 200)

    instance = kmeans(sample, start_centers, 0.001, ccore, metric=metric, itermax=itermax)
    instance.process()

    clusters = instance.get_clusters()
    centers = instance.get_centers()
    wce = instance.get_total_wce()

    if itermax == 0:
        # No iterations were allowed: nothing may have changed.
        assertion.eq(start_centers, centers)
        assertion.eq([], clusters)
        assertion.eq(0.0, wce)
        return

    sizes = [len(members) for members in clusters]
    assertion.eq(len(sample), sum(sizes))
    assertion.eq(len(clusters), len(centers))

    for center in centers:
        assertion.eq(len(sample[0]), len(center))

    if expected_cluster_length is not None:
        sizes.sort()
        expected_cluster_length.sort()
        assertion.eq(sizes, expected_cluster_length)
def templateClusteringWithAnswers(sample_path, answer_path, radius, neighbors, ccore, **kwargs):
    """Run `dbscan` and compare cluster lengths and noise with an answer file.

    Optional keyword arguments: `random_order` (shuffle the points before
    clustering, default False) and `repeat` (number of runs, default 1).
    """
    random_order = kwargs.get('random_order', False)
    repeat = kwargs.get('repeat', 1)

    for _ in range(repeat):
        sample = read_sample(sample_path)

        index_map = list(range(len(sample)))
        if random_order:
            shuffle(index_map)

        shuffled_sample = [sample[index] for index in index_map]

        instance = dbscan(shuffled_sample, radius, neighbors, ccore)
        instance.process()

        clusters = instance.get_clusters()
        noise = instance.get_noise()

        # Map indexes of the shuffled data back to the original sample (in place).
        for cluster in clusters:
            cluster[:] = [index_map[position] for position in cluster]
        noise[:] = [index_map[position] for position in noise]
        noise = sorted(noise)

        reader = answer_reader(answer_path)
        expected_noise = sorted(reader.get_noise())
        expected_length_clusters = reader.get_cluster_lengths()

        cluster_lengths = [len(cluster) for cluster in clusters]
        clustered_total = sum(cluster_lengths)

        assertion.eq(len(sample), clustered_total + len(noise))
        assertion.eq(sum(expected_length_clusters), clustered_total)
        assertion.eq(expected_length_clusters, sorted(cluster_lengths))
        assertion.eq(expected_noise, noise)
def template_clustering(path, count_clusters, chromosome_count, population_count, count_mutation_gens,
                        coeff_mutation_count=0.25, select_coeff=1.0, fps=15, animation=False):
    """Run `genetic_algorithm` on a sample file, print the run time and show the result.

    When `animation` is True the cluster allocation is additionally animated
    and saved to "clustering_animation.mp4" with `fps` frames per second.
    """
    sample = read_sample(path)

    settings = dict(
        data=sample,
        count_clusters=count_clusters,
        chromosome_count=chromosome_count,
        population_count=population_count,
        count_mutation_gens=count_mutation_gens,
        coeff_mutation_count=coeff_mutation_count,
        select_coeff=select_coeff,
        observer=ga_observer(True, True, True),
    )
    algo_instance = genetic_algorithm(**settings)

    started_at = time.time()
    algo_instance.process()
    print("Sample: ", path, "\t\tExecution time: ", time.time() - started_at, "\n")

    observer = algo_instance.get_observer()
    ga_visualizer.show_clusters(sample, observer)

    if animation is True:
        ga_visualizer.animate_cluster_allocation(sample, observer, movie_fps=fps,
                                                 save_movie="clustering_animation.mp4")
def templateClustering(self, file, radius, order, solver, initial, storage_flag, conn_weigh_flag, tolerance,
                       connection, expected_cluster_length, ccore_flag):
    """Run `syncnet` on a sample file and require the expected cluster sizes.

    Up to four attempts are made; one matching attempt is enough.
    `expected_cluster_length` is sorted in place.
    """
    succeeded = False

    # Phases may cross each other because of the random part of the network,
    # in which case the run is simply repeated.
    for _ in range(4):
        sample = read_sample(file)
        network = syncnet(sample, radius, connection, initial, conn_weigh_flag, ccore_flag)
        analyser = network.process(order, solver, storage_flag)

        sizes = [len(members) for members in analyser.allocate_clusters(tolerance)]
        if len(sizes) != len(expected_cluster_length):
            continue

        sizes.sort()
        expected_cluster_length.sort()
        if sizes == expected_cluster_length:
            # Unit-test is passed.
            succeeded = True
            break

    assert succeeded
def template_clustering(file, map_size, trust_order, sync_order=0.999, show_dyn=False, show_layer1=False,
                        show_layer2=False, show_clusters=True):
    """Run `syncsom` on a sample file, print the run time and draw the requested views.

    `map_size` supplies the two map dimensions passed to `syncsom`.
    The `show_*` flags select which visualizations are displayed.
    """
    sample = read_sample(file)
    network = syncsom(sample, map_size[0], map_size[1])

    # timedcall returns the elapsed ticks together with the (time, phase) dynamic.
    ticks, (dyn_time, dyn_phase) = timedcall(network.process, trust_order, show_dyn, sync_order)
    print("Sample: ", file, "\t\tExecution time: ", ticks, "\n")

    # Dynamic of the last layer.
    if show_dyn == True:
        draw_dynamics(dyn_time, dyn_phase, x_title="Time", y_title="Phase", y_lim=[0, 2 * 3.14])

    if show_clusters == True:
        som_clusters = network.get_som_clusters()
        draw_clusters(network.som_layer.weights, som_clusters)

    # Network layers.
    if show_layer1 == True:
        network.show_som_layer()

    if show_layer2 == True:
        network.show_sync_layer()

    if show_clusters == True:
        data_clusters = network.get_clusters()
        draw_clusters(sample, data_clusters)
def correct_ksearch(sample_path, answer_path, kmin, kmax, algorithm, ccore_flag):
    """Check that `silhouette_ksearch` finds an amount of clusters close to the answer.

    Up to ten runs are made. Each run must yield a score within [-1, 1] and
    `kmax - kmin` scores; the test passes once the found amount is within
    one of the answer's cluster count (never below 1).
    """
    attempts = 10
    succeeded = False

    sample = read_sample(sample_path)
    clusters = answer_reader(answer_path).get_clusters()

    for _ in range(attempts):
        search = silhouette_ksearch(sample, kmin, kmax, algorithm=algorithm, ccore=ccore_flag).process()
        amount = search.get_amount()
        score = search.get_score()
        scores = search.get_scores()

        assertion.le(-1.0, score)
        assertion.ge(1.0, score)
        assertion.eq(kmax - kmin, len(scores))

        upper_limit = len(clusters) + 1
        lower_limit = max(1, len(clusters) - 1)

        if lower_limit <= amount <= upper_limit:
            succeeded = True
            break

    assertion.true(succeeded)
def template_clustering(number_clusters, path, links):
    """Run `agglomerative` once per requested link type and show all results.

    A visualizer with `len(links)` canvases is created; each link type found
    in `links` gets the next canvas, a title, and a console line with its
    execution time.
    """
    sample = read_sample(path)
    visualizer = cluster_visualizer(len(links))

    # Fixed processing order; the label serves as canvas title and console tag.
    link_order = (
        (type_link.CENTROID_LINK, 'Link: Centroid'),
        (type_link.SINGLE_LINK, 'Link: Single'),
        (type_link.COMPLETE_LINK, 'Link: Complete'),
        (type_link.AVERAGE_LINK, 'Link: Average'),
    )

    canvas = 0
    for link, label in link_order:
        if link not in links:
            continue

        instance = agglomerative(sample, number_clusters, link)
        ticks, _ = timedcall(instance.process)

        visualizer.append_clusters(instance.get_clusters(), sample, canvas)
        visualizer.set_canvas_title(label, canvas)
        canvas += 1

        print("Sample: ", path, label, "\tExecution time: ", ticks, "\n")

    visualizer.show()
def templateLengthProcessData(self, file, som_map_size, avg_num_conn, eps, expected_cluster_length):
    """Run `syncsom` on a sample file and require the expected cluster sizes.

    Up to three attempts are made; one matching attempt is enough.
    `expected_cluster_length` is sorted in place.
    """
    succeeded = False

    # Phases may cross each other because of the random part of the network,
    # in which case the run is simply repeated.
    for _ in range(3):
        sample = read_sample(file)
        network = syncsom(sample, som_map_size[0], som_map_size[1])
        network.process(avg_num_conn, collect_dynamic=False, order=eps)

        sizes = [len(members) for members in network.get_clusters()]
        if len(sample) != sum(sizes):
            continue

        sizes.sort()
        expected_cluster_length.sort()
        if sizes == expected_cluster_length:
            # Unit-test is passed.
            succeeded = True
            break

    assert succeeded
def elbow_analysis(sample_file_path, kmin, kmax, **kwargs):
    """Run `elbow` on a sample file, plot the WCE curve and show a kmeans clustering.

    Optional keyword argument `initializer` (default: `kmeans_plusplus_initializer`)
    is forwarded to `elbow`. The final kmeans run always starts from
    `kmeans_plusplus_initializer` centers for the found amount of clusters.
    """
    initializer = kwargs.get('initializer', kmeans_plusplus_initializer)
    sample = read_sample(sample_file_path)

    elbow_instance = elbow(sample, kmin, kmax, initializer=initializer)
    elbow_instance.process()

    amount_clusters = elbow_instance.get_amount()
    wce = elbow_instance.get_wce()

    start_centers = kmeans_plusplus_initializer(sample, amount_clusters).initialize()
    kmeans_instance = kmeans(sample, start_centers)
    kmeans_instance.process()

    clusters = kmeans_instance.get_clusters()
    centers = kmeans_instance.get_centers()

    print("Sample '%s': Obtained amount of clusters: '%d'." % (sample_file_path, amount_clusters))

    # WCE curve with the found elbow highlighted in red.
    axis = plt.figure(1).add_subplot(111)
    axis.plot(range(kmin, kmax), wce, color='b', marker='.')
    axis.plot(amount_clusters, wce[amount_clusters - kmin], color='r', marker='.', markersize=10)
    axis.annotate("Elbow", (amount_clusters + 0.1, wce[amount_clusters - kmin] + 5))
    axis.grid(True)
    plt.ylabel("WCE")
    plt.xlabel("K")
    plt.show()

    kmeans_visualizer.show_clusters(sample, clusters, centers)
def templateLengthProcessData(data, start_medians, expected_cluster_length, ccore, **kwargs):
    """Run `kmedians` and validate the obtained clusters and medians.

    `data` is either a path (read with `read_sample`) or the sample itself.
    Optional keyword arguments: `tolerance` (0.01), `metric` (None) and
    `itermax` (200). With `itermax == 0` no clusters are expected and the
    medians must equal `start_medians`. `expected_cluster_length`, when
    given, is sorted in place.
    """
    tolerance = kwargs.get('tolerance', 0.01)
    metric = kwargs.get('metric', None)
    itermax = kwargs.get('itermax', 200)

    sample = read_sample(data) if isinstance(data, str) else data

    instance = kmedians(sample, start_medians, tolerance, ccore, metric=metric, itermax=itermax)
    instance.process()

    clusters = instance.get_clusters()
    medians = instance.get_medians()

    if itermax == 0:
        assert clusters == []
        assert start_medians == medians
        return

    sizes = [len(members) for members in clusters]
    assert len(sample) == sum(sizes)
    assert len(medians) == len(clusters)

    if expected_cluster_length is None:
        return

    sizes.sort()
    expected_cluster_length.sort()
    if sizes != expected_cluster_length:
        # Report what was actually obtained before failing.
        print(sizes)
    assert sizes == expected_cluster_length
def testVisualizeByDataOnly(self):
    """Show SAMPLE_SIMPLE1 as one cluster passed directly as data."""
    visualizer = cluster_visualizer()

    points = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1)
    visualizer.append_clusters([points])

    visualizer.show()
def templateTestAwardNeurons(file, rows, cols, time, expected_result, autostop, ccore_flag, parameters=None, **kwargs):
    """Train `som` with every connection type and check awards and captured points.

    Optional keyword argument `store_load` (default False) round-trips each
    network through pickle before training. When `parameters` is None a
    default `som_parameters()` is used.
    """
    store_load = kwargs.get('store_load', False)

    connection_types = [type_conn.func_neighbor, type_conn.grid_eight,
                        type_conn.grid_four, type_conn.honeycomb]
    sample = read_sample(file)

    if parameters is None:
        parameters = som_parameters()

    for connection in connection_types:
        network = som(rows, cols, connection, parameters, ccore=ccore_flag)
        if store_load:
            # Round-trip through pickle; the data is produced right here.
            network = pickle.loads(pickle.dumps(network))

        network.train(sample, time, autostop)

        assert network.get_winner_number() == len(expected_result)

        if sorted(network.awards) != expected_result:
            # Show the network before failing to ease diagnosis.
            network.show_network(awards=True)
            assert sorted(network.awards) == expected_result

        captured_total = sum(len(points) for points in network.capture_objects)
        assert captured_total == sum(expected_result)

        del network
def template_cluster_allocation(input_data, cluster_sizes, number_cluster, number_represent_points=5,
                                compression=0.5, ccore_flag=False, **kwargs):
    """Run `cure` and validate clusters, representors, means and cluster sizes.

    `input_data` is either a path (read with `read_sample`) or the sample.
    Optional keyword argument `numpy_usage` (default False) converts the
    sample to a numpy array first. `cluster_sizes` is sorted in place.
    """
    sample = read_sample(input_data) if isinstance(input_data, str) else input_data

    if kwargs.get('numpy_usage', False) is True:
        sample = numpy.array(sample)

    instance = cure(sample, number_cluster, number_represent_points, compression, ccore=ccore_flag)
    instance.process()

    clusters = instance.get_clusters()
    representors = instance.get_representors()
    means = instance.get_means()

    for produced in (clusters, representors, means):
        assertion.eq(len(produced), number_cluster)

    sizes = [len(members) for members in clusters]
    assertion.eq(sum(sizes), len(sample))

    cluster_sizes.sort()
    sizes.sort()
    assertion.eq(cluster_sizes, sizes)
def templateDataClustering(self, sample_path, amount_clusters, chromosome_count, population_count,
                           count_mutation_gens, coeff_mutation_count, expected_clusters_sizes):
    """Run `genetic_algorithm` on a sample file and require the expected cluster sizes.

    Up to three attempts are made; the test passes as soon as one attempt
    covers every point and (when `expected_clusters_sizes` is given) yields
    matching sizes. `expected_clusters_sizes` is sorted in place.
    """
    testing_result = False

    for _ in range(3):
        sample = read_sample(sample_path)

        # The keyword is `count_mutation_gens`; the previous spelling
        # `count_mutations_gen` did not match the name used elsewhere for this
        # class, so the requested value was not applied.
        ga_instance = genetic_algorithm(sample, amount_clusters, chromosome_count, population_count,
                                        count_mutation_gens=count_mutation_gens,
                                        coeff_mutation_count=coeff_mutation_count)
        ga_instance.process()

        clusters = ga_instance.get_clusters()
        obtained_cluster_sizes = [len(cluster) for cluster in clusters]

        if len(sample) != sum(obtained_cluster_sizes):
            continue

        if expected_clusters_sizes is not None:
            obtained_cluster_sizes.sort()
            expected_clusters_sizes.sort()
            if obtained_cluster_sizes != expected_clusters_sizes:
                continue

        testing_result = True
        break

    assert testing_result is True
def clustering(path, amount, threshold, expected, ccore, **kwargs):
    """Run `bsas` on a sample file and validate clusters and representatives.

    Optional keyword argument `metric` defaults to the EUCLIDEAN distance
    metric. `expected` holds the expected cluster lengths and is sorted in place.
    """
    metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))

    sample = read_sample(path)

    instance = bsas(sample, amount, threshold, ccore=ccore, metric=metric)
    instance.process()

    clusters = instance.get_clusters()
    representatives = instance.get_representatives()

    lengths = [len(members) for members in clusters]

    assertion.eq(len(sample), sum(lengths))
    assertion.eq(len(expected), len(clusters))
    assertion.eq(len(expected), len(representatives))
    assertion.ge(amount, len(clusters))

    # Every representative must have the dimension of the data points.
    dimension = len(sample[0])
    for representative in representatives:
        assertion.eq(dimension, len(representative))

    expected.sort()
    lengths.sort()
    assertion.eq(expected, lengths)
def template_visualize(self, path_sample, path_answer, filter=None, **kwargs):
    """Show the answer-file clusters of a sample with `cluster_visualizer_multidim`.

    `filter` and any extra keyword arguments are forwarded to `show`.
    """
    points = read_sample(path_sample)
    answer_clusters = answer_reader(path_answer).get_clusters()

    multidim_visualizer = cluster_visualizer_multidim()
    multidim_visualizer.append_clusters(answer_clusters, points)
    multidim_visualizer.show(filter, **kwargs)
def template_clustering(sample_file_path, amount_clusters, initializer, show_animation=False):
    """Run `ema` on a sample file, print a summary and show the resulting clusters.

    When `show_animation` is True an `ema_observer` is attached; the initial
    state it recorded is shown first and the allocation is animated last.
    """
    sample = read_sample(sample_file_path)

    observer = None
    if show_animation is True:
        observer = ema_observer()

    initial_means, initial_covariance = ema_initializer(sample, amount_clusters).initialize(initializer)

    ema_instance = ema(sample, amount_clusters, initial_means, initial_covariance, observer=observer)
    ema_instance.process()

    clusters = ema_instance.get_clusters()
    covariances = ema_instance.get_covariances()
    means = ema_instance.get_centers()

    cluster_length = [len(cluster) for cluster in clusters]
    print("Data '" + sample_file_path + "'")
    print("Clusters: " + str(len(clusters)) + ", Length:" + str(cluster_length))

    # `observer` is either None or an observer object, so the former
    # `observer is True` test could never succeed and this view was unreachable.
    if observer is not None:
        ema_visualizer.show_clusters(observer.get_evolution_clusters()[0], sample,
                                     observer.get_evolution_covariances()[0],
                                     observer.get_evolution_means()[0])

    ema_visualizer.show_clusters(clusters, sample, covariances, means)

    if show_animation is True:
        ema_visualizer.animate_cluster_allocation(sample, observer)
def templateLengthProcessData(input_sample, start_centers, expected_cluster_length, type_splitting, kmax, ccore):
    """Run `xmeans` and validate the obtained clusters and centers.

    `input_sample` is either a path (read with `read_sample`) or the sample.
    At most `kmax` centers may be produced. `expected_cluster_length`, when
    given, is sorted in place.
    """
    sample = read_sample(input_sample) if isinstance(input_sample, str) else input_sample

    instance = xmeans(sample, start_centers, kmax, 0.025, type_splitting, ccore)
    instance.process()

    clusters = instance.get_clusters()
    centers = instance.get_centers()

    sizes = [len(members) for members in clusters]

    assert len(sample) == sum(sizes)
    assert len(clusters) == len(centers)
    assert len(centers) <= kmax

    if expected_cluster_length is None:
        return

    assert len(centers) == len(expected_cluster_length)

    sizes.sort()
    expected_cluster_length.sort()
    assert sizes == expected_cluster_length
def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
    """Check that `elbow` finds the amount of clusters stored in the answer file.

    Optional keyword argument `initializer` defaults to
    `kmeans_plusplus_initializer`. Up to ten runs are made; the amounts of
    the failed runs are reported in the final assertion message.
    """
    # Initial centers are chosen randomly, so a failed run is repeated.
    repeat = 10
    succeeded = False

    initializer = kwargs.get('initializer', kmeans_plusplus_initializer)

    sample = read_sample(path_to_data)
    answer = answer_reader(path_to_answer)

    rejected_amounts = []

    for _ in range(repeat):
        instance = elbow(sample, kmin, kmax, ccore=ccore, initializer=initializer)
        instance.process()

        actual_elbow = instance.get_amount()
        actual_wce = instance.get_wce()

        assertion.gt(actual_elbow, kmin)
        assertion.lt(actual_elbow, kmax)
        assertion.eq(len(actual_wce), kmax - kmin)
        assertion.lt(actual_wce[-1], actual_wce[0] + 0.0000001)

        if actual_elbow == len(answer.get_clusters()):
            succeeded = True
            break

        rejected_amounts.append(actual_elbow)

    message = str(len(answer.get_clusters())) + ": " + str(rejected_amounts)
    assertion.true(succeeded, message=message)
def testShowLayersProcessing(self):
    """Process SAMPLE_SIMPLE1 with `syncsom(sample, 4, 4, 1.0)` and show both layers."""
    points = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1)

    network = syncsom(points, 4, 4, 1.0)
    network.process(collect_dynamic=False, order=0.99)

    network.show_som_layer()
    network.show_sync_layer()
def testClusteringOrderVisualizer(self):
    """Run `optics` on SAMPLE_SIMPLE4 and show the ordering diagram."""
    points = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE4)

    optics_instance = optics(points, 6.0, 3, 5)
    optics_instance.process()

    ordering = optics_instance.get_ordering()
    ordering_visualizer.show_ordering_diagram(ordering_analyser(ordering), 5)
def templateAnimateClusteringResultNoFailure(filename, initial_centers, ccore_flag):
    """Run `kmeans` with an observer attached and animate the recorded allocation."""
    points = read_sample(filename)
    recorder = kmeans_observer()

    instance = kmeans(points, initial_centers, 0.025, ccore_flag, observer=recorder)
    instance.process()

    kmeans_visualizer.animate_cluster_allocation(points, recorder)
def testVisualizeOnExistedFigure(self):
    """Show SAMPLE_SIMPLE1 on a matplotlib figure created beforehand."""
    existing_figure = plt.figure()

    points = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1)

    visualizer = cluster_visualizer()
    visualizer.append_clusters([points])
    visualizer.show(existing_figure)
def testVisualizeRectangeRepresentation3x5(self):
    """Fill all canvases of a `cluster_visualizer(15, 5)` with SAMPLE_SIMPLE1."""
    canvas_count = 15
    visualizer = cluster_visualizer(canvas_count, 5)

    for canvas in range(canvas_count):
        # The sample is read anew for every canvas.
        points = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1)
        visualizer.append_clusters([points], None, canvas, markersize=5)

    visualizer.show()
def templateClusteringResults(path, radius, neighbors, expected_length_clusters, ccore):
    """Run `dbscan` on a sample file and check cluster lengths and noise totals.

    `expected_length_clusters` is compared against the sorted obtained
    lengths as is, so it is expected in ascending order.
    """
    sample = read_sample(path)

    instance = dbscan(sample, radius, neighbors, ccore)
    instance.process()

    clusters = instance.get_clusters()
    noise = instance.get_noise()

    lengths = [len(members) for members in clusters]
    clustered_total = sum(lengths)

    assert clustered_total + len(noise) == len(sample)
    assert clustered_total == sum(expected_length_clusters)
    assert sorted(lengths) == expected_length_clusters
def template_clustering(start_centers, path, tolerance=0.25, ccore=True):
    """Run `kmeans` on a sample file, print the run time and show the result.

    The initial and the final centers are both drawn with '*' markers.
    """
    sample = read_sample(path)

    instance = kmeans(sample, start_centers, tolerance, ccore)
    ticks, _ = timedcall(instance.process)

    clusters = instance.get_clusters()
    final_centers = instance.get_centers()

    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    visualizer = cluster_visualizer()
    visualizer.append_clusters(clusters, sample)
    for center_set in (start_centers, final_centers):
        visualizer.append_cluster(center_set, marker='*', markersize=20)
    visualizer.show()
def random_state(rows, cols, connections, random_state, ccore_flag):
    """Check that two `som` networks built with the same random state train identically.

    Both networks receive the same `som_parameters` carrying `random_state`;
    the amount of training steps, weights, captured objects and awards must match.
    """
    sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1)

    params = som_parameters()
    params.random_state = random_state

    # The parameters must actually be handed to the networks: previously
    # `params` was built but never passed, so the seed was never used.
    network_1 = som(rows, cols, connections, params, ccore=ccore_flag)
    steps_1 = network_1.train(sample, 100, True)

    network_2 = som(rows, cols, connections, params, ccore=ccore_flag)
    steps_2 = network_2.train(sample, 100, True)

    assert steps_1 == steps_2
    assert network_1.weights == network_2.weights
    assert network_1.capture_objects == network_2.capture_objects
    assert network_1.awards == network_2.awards
def templateEncoderProcedures(sample, initial_centers, number_clusters, ccore_flag):
    """Run `kmeans` and switch a `cluster_encoder` through three encodings.

    `sample` is a path read with `read_sample`. Finally the amount of
    clusters must equal `number_clusters`.
    """
    points = read_sample(sample)

    instance = kmeans(points, initial_centers, 0.025, ccore_flag)
    instance.process()

    clusters = instance.get_clusters()
    encoder = cluster_encoder(instance.get_cluster_encoding(), clusters, points)

    for target_encoding in (type_encoding.CLUSTER_INDEX_LABELING,
                            type_encoding.CLUSTER_OBJECT_LIST_SEPARATION,
                            type_encoding.CLUSTER_INDEX_LIST_SEPARATION):
        encoder.set_encoding(target_encoding)

    assert number_clusters == len(clusters)
def templateLengthProcessData(path_to_file, radius, cluster_numbers, threshold, expected_cluster_length, ccore):
    """Run `rock` on a sample file and require the expected cluster sizes.

    `expected_cluster_length` is sorted in place.
    """
    sample = read_sample(path_to_file)

    instance = rock(sample, radius, cluster_numbers, threshold, ccore)
    instance.process()

    sizes = [len(members) for members in instance.get_clusters()]

    # Every point has to belong to some cluster.
    assert len(sample) == sum(sizes)

    sizes.sort()
    expected_cluster_length.sort()
    assert sizes == expected_cluster_length
def templateLengthProcessData(path_to_file, initial_medoids, expected_cluster_length, ccore_flag):
    """Run `kmedoids` on a sample file and require the expected cluster sizes.

    `expected_cluster_length` is sorted in place.
    """
    sample = read_sample(path_to_file)

    instance = kmedoids(sample, initial_medoids, 0.025, ccore_flag)
    instance.process()

    sizes = [len(members) for members in instance.get_clusters()]

    # Every point has to belong to some cluster.
    assert len(sample) == sum(sizes)

    sizes.sort()
    expected_cluster_length.sort()
    assert sizes == expected_cluster_length
def templateClusteringResultsSpecificData(data_type, path, radius, neighbors, amount_clusters,
                                          expected_length_clusters, ccore):
    """Run `optics` on points or on a distance matrix and validate the result.

    With `data_type == 'distance_matrix'` the sample is converted by
    `calculate_distance_matrix` first. When `amount_clusters` is given, the
    cluster amount extracted from the ordering is checked as well.
    """
    sample = read_sample(path)

    input_data = calculate_distance_matrix(sample) if data_type == 'distance_matrix' else sample

    instance = optics(input_data, radius, neighbors, amount_clusters, ccore, data_type=data_type)
    instance.process()

    clusters = instance.get_clusters()
    noise = instance.get_noise()
    optics_objects = instance.get_optics_objects()

    # Object indexes have to be unique.
    unique_indexes = {entry.index_object for entry in optics_objects}
    assertion.eq(len(optics_objects), len(unique_indexes))

    for entry in optics_objects:
        if entry.core_distance is not None:
            assertion.ge(entry.core_distance, 0)

        if entry.reachability_distance is not None:
            assertion.ge(entry.reachability_distance, 0)

    lengths = [len(members) for members in clusters]
    clustered_total = sum(lengths)

    assert clustered_total + len(noise) == len(sample)
    assert len(clusters) == len(expected_length_clusters)
    assert clustered_total == sum(expected_length_clusters)
    assert sorted(lengths) == sorted(expected_length_clusters)

    if amount_clusters is None:
        return

    analyser = ordering_analyser(instance.get_ordering())
    assert len(analyser) > 0

    amount_clusters, borders = analyser.extract_cluster_amount(instance.get_radius())
    assert amount_clusters == len(expected_length_clusters)
    assert len(borders) == amount_clusters - 1
def template_clustering(file, map_size, radius, sync_order=0.999, show_dyn=False, show_layer1=False,
                        show_layer2=False, show_clusters=True):
    """Run `syncsom` on a sample file, print the run time and draw the requested views.

    `map_size` supplies the two map dimensions passed to `syncsom`.
    The `show_*` flags select which visualizations are displayed.
    """
    sample = read_sample(file)
    network = syncsom(sample, map_size[0], map_size[1], radius)

    # timedcall returns the elapsed ticks together with the (time, phase) dynamic.
    ticks, (dyn_time, dyn_phase) = timedcall(network.process, show_dyn, sync_order)
    print("Sample: ", file, "\t\tExecution time: ", ticks, "\n")

    # Dynamic of the last layer.
    if show_dyn == True:
        draw_dynamics(dyn_time, dyn_phase, x_title="Time", y_title="Phase", y_lim=[0, 3.14])

    if show_clusters == True:
        som_view = cluster_visualizer()
        som_view.append_clusters(network.get_som_clusters(), network.som_layer.weights)
        som_view.show()

    # Network layers.
    if show_layer1 == True:
        network.show_som_layer()

    if show_layer2 == True:
        network.show_sync_layer()

    if show_clusters == True:
        data_view = cluster_visualizer()
        data_view.append_clusters(network.get_clusters(), sample)
        data_view.show()
def correct_scores(sample_path, answer_path, ccore_flag, **kwargs):
    """Compute `silhouette` scores for the answer clusters and check their range.

    Optional keyword argument `data_type` defaults to 'points'; with
    'distance_matrix' the sample is converted using the EUCLIDEAN_SQUARE
    metric first. Returns the scores, one per sample item, each in [-1, 1].
    """
    data_type = kwargs.get('data_type', 'points')

    sample = read_sample(sample_path)
    if data_type == 'distance_matrix':
        sample = calculate_distance_matrix(sample, distance_metric(type_metric.EUCLIDEAN_SQUARE))

    clusters = answer_reader(answer_path).get_clusters()

    evaluator = silhouette(sample, clusters, ccore=ccore_flag, data_type=data_type)
    scores = evaluator.process().get_score()

    assertion.eq(len(sample), len(scores))
    for value in scores:
        assertion.le(-1.0, value)
        assertion.ge(1.0, value)

    return scores
def exception(type, input_data, number_cluster, number_represent_points, compression, ccore_flag):
    """Check that running `cure` raises the exception class given as `type`.

    `input_data` is either a path (read with `read_sample`) or the sample.
    Returns normally when `type` is raised. Raises AssertionError when a
    different exception, or no exception at all, is raised.
    """
    try:
        if isinstance(input_data, str):
            sample = read_sample(input_data)
        else:
            sample = input_data

        cure_instance = cure(sample, number_cluster, number_represent_points, compression, ccore=ccore_flag)
        cure_instance.process()

    except type:
        return

    except Exception as ex:
        # The `type` parameter shadows the builtin here, so `type(ex)` would
        # call the expected exception class; read the class from the instance.
        raise AssertionError("Expected: '%s', Actual: '%s'" % (type, ex.__class__.__name__))

    raise AssertionError("Expected: '%s', Actual: 'None'" % type)
def templateClusterAllocation(self, path, cluster_sizes, number_clusters, branching_factor=5, max_node_entries=5,
                              initial_diameter=0.1,
                              type_measurement=measurement_type.CENTROID_EUCLIDIAN_DISTANCE,
                              entry_size_limit=200, diameter_multiplier=1.5):
    """Run `birch` on a sample file and require the expected cluster sizes.

    `cluster_sizes` may be None to skip the size comparison; otherwise it is
    sorted in place.
    """
    sample = read_sample(path)

    instance = birch(sample, number_clusters, branching_factor, max_node_entries, initial_diameter,
                     type_measurement, entry_size_limit, diameter_multiplier)
    instance.process()

    sizes = [len(members) for members in instance.get_clusters()]

    # Every point has to belong to some cluster.
    assert sum(sizes) == len(sample)

    if cluster_sizes != None:
        cluster_sizes.sort()
        sizes.sort()
        assert cluster_sizes == sizes
def find_optimal_amout_clusters(sample_path, kmin, kmax, algorithm):
    """Find the amount of clusters with `silhouette_ksearch` and show a kmeans clustering."""
    sample = read_sample(sample_path)

    search = silhouette_ksearch(sample, kmin, kmax, algorithm=algorithm).process()
    amount = search.get_amount()
    scores = search.get_scores()

    print("Sample: '%s', Scores: '%s'" % (sample_path, str(scores)))

    # Cluster the data with the found amount of clusters.
    start_centers = kmeans_plusplus_initializer(sample, amount).initialize()
    clusters = kmeans(sample, start_centers).process().get_clusters()

    visualizer = cluster_visualizer()
    visualizer.append_clusters(clusters, sample)
    visualizer.show()
def template_clustering(path, amount, threshold, **kwargs):
    """Run `mbsas` on a sample file and optionally show the result.

    Optional keyword arguments: `metric` (default: EUCLIDEAN_SQUARE distance
    metric), `ccore` (False) and `draw` (True).
    """
    metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
    ccore = kwargs.get('ccore', False)
    draw = kwargs.get('draw', True)

    sample = read_sample(path)
    print("Sample: ", path)

    instance = mbsas(sample, amount, threshold, ccore=ccore, metric=metric)
    instance.process()

    clusters = instance.get_clusters()
    representatives = instance.get_representatives()

    if draw is True:
        bsas_visualizer.show_clusters(sample, clusters, representatives)
def templateLengthProcessData(path_to_file, amount_clusters, expected_cluster_length, ccore):
    """Run `somsc` on a sample file and require the expected cluster sizes.

    `expected_cluster_length` may be None to skip the size comparison;
    otherwise it is sorted in place.
    """
    sample = read_sample(path_to_file)

    somsc_instance = somsc(sample, amount_clusters, 100, ccore)
    somsc_instance.process()

    clusters = somsc_instance.get_clusters()
    obtained_cluster_sizes = [len(cluster) for cluster in clusters]

    # Every point has to belong to some cluster.
    assert len(sample) == sum(obtained_cluster_sizes)

    if expected_cluster_length is not None:
        obtained_cluster_sizes.sort()
        expected_cluster_length.sort()

        if obtained_cluster_sizes != expected_cluster_length:
            # Report what was actually obtained before failing; the former
            # bare `print` expression printed nothing.
            print(obtained_cluster_sizes)

        assert obtained_cluster_sizes == expected_cluster_length
def templatePredict(path_to_file, initial_centers, points, expected_amount, expected_closest_clusters, ccore,
                    **kwargs):
    """Run `xmeans` on a sample file and check `predict` for the given points.

    Optional keyword argument `kmax` defaults to 20.
    """
    sample = read_sample(path_to_file)
    kmax = kwargs.get('kmax', 20)

    instance = xmeans(sample, initial_centers, kmax, 0.025,
                      splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore)
    instance.process()

    closest_clusters = instance.predict(points)

    assertion.eq(expected_amount, len(instance.get_clusters()))
    assertion.eq(len(expected_closest_clusters), len(closest_clusters))

    expected_array = numpy.array(expected_closest_clusters)
    assertion.true(numpy.array_equal(expected_array, closest_clusters))
def template_clustering(path_sample, eps, minpts):
    """Run `optics` on a sample file, draw the clusters and plot the cluster ordering."""
    sample = read_sample(path_sample)

    instance = optics(sample, eps, minpts)
    instance.process()

    clusters = instance.get_clusters()
    instance.get_noise()  # queried as before; the result is not drawn

    draw_clusters(sample, clusters, [], '.')

    ordering = instance.get_cluster_ordering()
    positions = list(range(len(ordering)))

    # Bar chart of the cluster ordering.
    plt.bar(positions, ordering)
    plt.show()
def testVisualizeClusterWithAttributes(self):
    """Show CURE clusters of SAMPLE_SIMPLE1 with representors and means attached."""
    data = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1)

    algorithm = cure(data, 2, 5, 0.5, False)
    algorithm.process()

    clusters = algorithm.get_clusters()
    representors = algorithm.get_representors()
    means = algorithm.get_means()

    visualizer = cluster_visualizer()
    visualizer.append_clusters(clusters, data)

    # Attach to every cluster on canvas 0 its representor points ('*', size 10)
    # and its mean ('o'); the mean is wrapped in a list before being passed.
    for index, _ in enumerate(clusters):
        visualizer.append_cluster_attribute(0, index, representors[index], '*', 10)
        visualizer.append_cluster_attribute(0, index, [means[index]], 'o')

    visualizer.show()
def templateDynamicLength(path, number_clusters, expected_length, initial_neighbors, increase_persent, collect_dynamic_flag, ccore_flag):
    """Verify the length of the output dynamic produced by `hsyncnet.process`.

    When `collect_dynamic_flag` is exactly `True` the analyser must hold
    `expected_length` entries, or more than one entry if `expected_length`
    is None. For any other flag value it must hold exactly one entry.
    """
    data = read_sample(path)
    network = hsyncnet(
        data,
        number_clusters,
        initial_type.EQUIPARTITION,
        initial_neighbors,
        increase_persent,
        ccore=ccore_flag,
    )

    analyser = network.process(
        order=0.995,
        solution=solve_type.FAST,
        collect_dynamic=collect_dynamic_flag,
    )

    assert len(analyser) != 0

    # Identity check on purpose: only a literal True selects this branch.
    if collect_dynamic_flag is not True:
        assert len(analyser) == 1
        return

    assert len(analyser) >= 1
    if expected_length is None:
        assert len(analyser) > 1
    else:
        assert len(analyser) == expected_length
def template_clustering(path, radius, cluster_numbers, threshold, draw=True, ccore=True):
    """Run ROCK on a sample, report the execution time and optionally draw clusters.

    `radius`, `cluster_numbers`, `threshold` and `ccore` are forwarded to
    the `rock` constructor positionally. The time returned by `timedcall`
    is printed together with the sample path.
    """
    data = read_sample(path)

    algorithm = rock(data, radius, cluster_numbers, threshold, ccore)
    elapsed, _ = timedcall(algorithm.process)

    clusters = algorithm.get_clusters()

    print("Sample: ", path, "\t\tExecution time: ", elapsed, "\n")

    # Equality with True is kept as-is: values such as 1 enable drawing,
    # other truthy values do not.
    if draw == True:
        draw_clusters(data, clusters)
def templateKmeansPlusPlusForKmedoidsClustering(self, path_sample, amount, expected_clusters_length):
    """Check K-Medoids clustering seeded with K-Means++ initial medoids.

    K-Means++ initialisation is randomised, so the check is attempted up to
    three times and succeeds as soon as one attempt passes.

    Args:
        path_sample: path handed to `read_sample` and to the K-Medoids
            template.
        amount: number of initial medoids requested from the initializer.
        expected_clusters_length: expected cluster sizes, forwarded to
            `KmedoidsTestTemplates.templateLengthProcessData`.

    Raises:
        AssertionError: if none of the attempts produced the expected
            clusters.
    """
    attempts = 3

    # The sample does not change between attempts, so read it only once.
    sample = read_sample(path_sample)

    # Start pessimistic: previously the flag was initialised to True and never
    # cleared, so the final assertion could not fail even if every attempt did.
    result_success = False
    for _ in range(attempts):
        try:
            start_medoids = kmeans_plusplus_initializer(
                sample, amount).initialize(return_index=True)
            KmedoidsTestTemplates.templateLengthProcessData(
                path_sample, start_medoids, expected_clusters_length, False)
        except AssertionError:
            # Unlucky initial medoids - try again with a new random seed set.
            continue

        result_success = True
        break

    assert result_success, (
        "K-Medoids with K-Means++ initial medoids did not produce the "
        "expected clusters in %d attempts" % attempts)
def templateLengthProcessData(self, path_to_file, start_centers, expected_cluster_length, ccore=False):
    """Run K-Medians on a sample file and verify the sizes of the clusters.

    The clusters must cover every point of the sample. When
    `expected_cluster_length` is not None, the sorted obtained sizes must
    equal the sorted expected sizes. The expected list is sorted in place.
    The obtained sizes are printed before a mismatch assertion fails.
    """
    data = read_sample(path_to_file)

    algorithm = kmedians(data, start_centers, 0.025, ccore)
    algorithm.process()

    obtained_sizes = [len(members) for members in algorithm.get_clusters()]
    assert len(data) == sum(obtained_sizes)

    if expected_cluster_length is None:
        return

    obtained_sizes.sort()
    expected_cluster_length.sort()

    if obtained_sizes != expected_cluster_length:
        print(obtained_sizes)

    assert obtained_sizes == expected_cluster_length
def templateClusterAllocation(self, path, cluster_sizes, number_clusters,
                              branching_factor = 5,
                              max_node_entries = 5,
                              initial_diameter = 0.1,
                              type_measurement = measurement_type.CENTROID_EUCLIDEAN_DISTANCE,
                              entry_size_limit = 200,
                              diameter_multiplier = 1.5):
    """Run BIRCH on a sample file and verify the sizes of the allocated clusters.

    All tuning parameters are forwarded to the `birch` constructor
    positionally, in the order they appear in this signature. The clusters
    must cover every point of the sample. When `cluster_sizes` is not None
    it is sorted in place and compared with the sorted obtained sizes.
    """
    data = read_sample(path)

    algorithm = birch(
        data,
        number_clusters,
        branching_factor,
        max_node_entries,
        initial_diameter,
        type_measurement,
        entry_size_limit,
        diameter_multiplier,
    )
    algorithm.process()

    obtained_sizes = [len(members) for members in algorithm.get_clusters()]
    self.assertEqual(sum(obtained_sizes), len(data))

    if cluster_sizes is None:
        return

    cluster_sizes.sort()
    obtained_sizes.sort()
    self.assertEqual(cluster_sizes, obtained_sizes)
def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
    """Check that the elbow method finds the expected amount of clusters.

    `path_to_answer` may be None (no expected amount), an int (the expected
    amount itself) or a path handed to `answer_reader`, whose cluster count
    becomes the expected amount. All keyword arguments are forwarded to
    `elbow`; `kstep` (default 1) is also used to compute the expected
    number of WCE values.
    """
    # Elbow method randomly chooses initial centers therefore we need to
    # repeat test if it fails.
    attempts = 15

    kstep = kwargs.get('kstep', 1)
    sample = read_sample(path_to_data)

    if path_to_answer is None:
        expected_clusters_amount = None
    elif isinstance(path_to_answer, int):
        expected_clusters_amount = path_to_answer
    else:
        expected_clusters_amount = len(answer_reader(path_to_answer).get_clusters())

    rejected_elbows = []
    testing_result = False

    for _ in range(attempts):
        elbow_instance = elbow(sample, kmin, kmax, ccore=ccore, **kwargs)
        elbow_instance.process()

        actual_elbow = elbow_instance.get_amount()
        actual_wce = elbow_instance.get_wce()

        # These checks must hold on every attempt, not only the accepted one.
        assertion.gt(actual_elbow, kmin)
        assertion.lt(actual_elbow, kmax)
        assertion.eq(len(actual_wce), math.floor((kmax - kmin) / kstep + 1))
        assertion.lt(actual_wce[-1], actual_wce[0] + 0.0000001)

        if expected_clusters_amount is None or actual_elbow == expected_clusters_amount:
            testing_result = True
            break

        # Remember the wrong answer for the failure message and retry.
        rejected_elbows.append(actual_elbow)

    message = (
        None
        if expected_clusters_amount is None
        else str(expected_clusters_amount) + ": " + str(rejected_elbows)
    )

    assertion.true(testing_result, message=message)
def clustering(path, levels, density_threshold, expected_clusters, expected_noise, ccore, **kwargs):
    """Run BANG on a sample file, verify its output and return the instance.

    Checks performed:
      * the number of clusters equals the length of the dendrogram;
      * clusters plus noise cover every point of the sample;
      * the amount of noise equals `expected_noise`;
      * when `expected_clusters` is not None, it equals the sorted list of
        obtained cluster sizes;
      * the leafs of the BANG directory cover every point of the sample.

    Recognised keyword arguments:
        amount_threshold: forwarded to `bang`; defaults to 0.
    """
    sample = read_sample(path)
    amount_threshold = kwargs.get('amount_threshold', 0)

    bang_instance = bang(
        sample,
        levels,
        ccore,
        density_threshold=density_threshold,
        amount_threshold=amount_threshold,
    )
    bang_instance.process()

    clusters = bang_instance.get_clusters()
    noise = bang_instance.get_noise()
    directory = bang_instance.get_directory()
    dendrogram = bang_instance.get_dendrogram()

    assertion.eq(len(clusters), len(dendrogram))

    cluster_sizes = sorted(len(members) for members in clusters)
    allocated_points = len(noise) + sum(cluster_sizes)

    assertion.eq(len(sample), allocated_points)
    assertion.eq(expected_noise, len(noise))

    if expected_clusters is not None:
        assertion.eq(len(expected_clusters), len(clusters))
        assertion.eq(expected_clusters, cluster_sizes)

    # Collect the distinct point indexes stored in the directory leafs.
    covered_points = {
        index_point
        for leaf in directory.get_leafs()
        for index_point in leaf.get_points()
    }

    assertion.eq(len(sample), len(covered_points))

    return bang_instance
def templateClusteringResults(self, path, radius, neighbors, amount_clusters, expected_length_clusters, ccore):
    """Run OPTICS on a sample file and verify cluster sizes and ordering.

    Clusters plus noise must cover the whole sample and the cluster sizes
    must match `expected_length_clusters` regardless of order. When
    `amount_clusters` is not None, the ordering analyser must also report
    the expected number of clusters at the radius returned by
    `get_radius()`.
    """
    # NOTE(review): `ccore` is accepted but not forwarded to `optics` -
    # confirm whether that is intentional.
    data = read_sample(path)

    algorithm = optics(data, radius, neighbors, amount_clusters)
    algorithm.process()

    clusters = algorithm.get_clusters()
    noise = algorithm.get_noise()

    cluster_lengths = [len(members) for members in clusters]
    clustered_points = sum(cluster_lengths)

    assert clustered_points + len(noise) == len(data)
    assert len(clusters) == len(expected_length_clusters)
    assert clustered_points == sum(expected_length_clusters)
    assert sorted(cluster_lengths) == sorted(expected_length_clusters)

    if amount_clusters is None:
        return

    analyser = ordering_analyser(algorithm.get_ordering())
    assert len(analyser) > 0

    detected_amount = analyser.extract_cluster_amount(algorithm.get_radius())
    assert detected_amount == len(expected_length_clusters)
def template_predict(self, path, amount, points, ccore):
    """Check that G-Means `predict` assigns each point to its nearest center.

    For every point, the Euclidean distance to the center of the predicted
    cluster must not exceed the distance to any other center.
    """
    metric = distance_metric(type_metric.EUCLIDEAN)

    data = read_sample(path)

    # `process()` is chained, so its return value is used as the instance.
    gmeans_instance = gmeans(data, amount, ccore).process()
    centers = gmeans_instance.get_centers()

    closest_clusters = gmeans_instance.predict(points)
    self.assertEqual(len(points), len(closest_clusters))

    for point, cluster_index in zip(points, closest_clusters):
        own_distance = metric(centers[cluster_index], point)

        for center_index, center in enumerate(centers):
            if center_index == cluster_index:
                continue

            self.assertLessEqual(own_distance, metric(center, point))
def templateSeachNearestNodeInTree(self, sample_path, **kwargs):
    """Check that a KD-tree finds a point at distance 0.0 only after insertion.

    For every point of the sample the tree is searched with distance 0.0
    before the insertion (nothing may be found) and after it (the node
    holding exactly this point object must be found).

    Recognised keyword arguments:
        numpy_usage: when exactly `True`, the sample is converted to a
            `numpy.array` before the points are inserted; defaults to False.

    Raises:
        AssertionError: if a node is found before insertion, or the node
            found after insertion is missing or holds another object.
    """
    numpy_usage = kwargs.get('numpy_usage', False)

    sample = read_sample(sample_path)
    if numpy_usage is True:
        sample = numpy.array(sample)

    tree = kdtree()
    for point in sample:
        # Identity comparison with None: `==` / `!=` would dispatch to the
        # node's own rich comparison, which is not what is being tested here.
        node = tree.find_nearest_dist_node(point, 0.0)
        assert node is None

        tree.insert(point, None)

        node = tree.find_nearest_dist_node(point, 0.0)
        assert node is not None
        # The node must reference the very same object that was inserted.
        assert node.data is point
def testObserver(self):
    """Check that the EM observer records one consistent entry per iteration.

    The observer length must be positive and equal to the lengths of the
    recorded cluster, covariance and mean evolutions, and to the reported
    iteration count.
    """
    data = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE2)

    initializer = ema_initializer(data, 3)
    means, variances = initializer.initialize(ema_init_type.RANDOM_INITIALIZATION)

    observer_instance = ema_observer()
    algorithm = ema(data, 3, means, variances, observer_instance)
    algorithm.process()

    observer_length = len(observer_instance)
    assert observer_length > 0

    evolution_getters = (
        observer_instance.get_evolution_clusters,
        observer_instance.get_evolution_covariances,
        observer_instance.get_evolution_means,
    )
    for get_evolution in evolution_getters:
        assert observer_length == len(get_evolution())

    assert observer_length == observer_instance.get_iterations()
def random_state_fixed(path_to_data, kmin, kmax, ccore, **kwargs):
    """Check that two elbow runs with identical arguments give identical results.

    The sample is read and processed twice per repetition; both the elbow
    amount and the WCE values of the two runs must be equal. Presumably
    `kwargs` carries a fixed `random_state` - confirm against callers.

    Recognised keyword arguments:
        repeat: number of repetitions; defaults to 1. Note that all keyword
            arguments, including `repeat`, are forwarded to `elbow`.
    """
    repeat = kwargs.get('repeat', 1)

    def run_elbow(data):
        # One full elbow run; `process()` is chained and its result is used.
        instance = elbow(data, kmin, kmax, ccore=ccore, **kwargs).process()
        return instance.get_amount(), instance.get_wce()

    for _ in range(repeat):
        sample = read_sample(path_to_data)

        elbow_1, wce_1 = run_elbow(sample)
        elbow_2, wce_2 = run_elbow(sample)

        assertion.eq(elbow_1, elbow_2)
        assertion.eq(wce_1, wce_2)
def templateClusteringResults(path, number_clusters, link, expected_length_clusters, ccore_flag=False):
    """Run agglomerative clustering on a sample file and verify cluster sizes.

    Args:
        path: path handed to `read_sample`.
        number_clusters: forwarded to `agglomerative` as the requested
            amount of clusters.
        link: forwarded to `agglomerative` as its third positional argument.
        expected_length_clusters: expected cluster sizes, in any order.
        ccore_flag: forwarded to `agglomerative` as its fourth positional
            argument.

    Raises:
        AssertionError: if the clusters do not cover every sample point or
            their sizes differ from the expected ones.
    """
    sample = read_sample(path)

    agglomerative_instance = agglomerative(sample, number_clusters, link, ccore_flag)
    agglomerative_instance.process()

    clusters = agglomerative_instance.get_clusters()
    cluster_lengths = [len(cluster) for cluster in clusters]

    assert sum(cluster_lengths) == len(sample)
    assert sum(cluster_lengths) == sum(expected_length_clusters)

    # Sort both sides so the check does not depend on the order (or the
    # sequence type) in which the caller listed the expected sizes.
    assert sorted(cluster_lengths) == sorted(expected_length_clusters)