def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_cluster_length, metric, ccore_flag, **kwargs):
    """Run k-medoids on a sample file and verify the clustering result.

    Args:
        path_to_file: path to the sample, passed to `read_sample`.
        initial_medoids: initial medoid indexes (replaced per attempt when
            'initialize_medoids' is supplied via kwargs).
        expected_cluster_length: expected cluster sizes (order-insensitive),
            or None to skip the size comparison.
        metric: distance metric; None selects squared Euclidean.
        ccore_flag: forwarded to `kmedoids` (C-core backend switch).

    Keyword Args:
        data_type: 'points' (default) or 'distance_matrix'.
        input_type: 'list' (default) or 'numpy'.
        initialize_medoids: amount of medoids for k-means++ initialization;
            enables several randomized attempts.
        itermax: maximum iterations (default 200); 0 asserts that no
            clustering is performed at all.
    """
    sample = read_sample(path_to_file)

    data_type = kwargs.get('data_type', 'points')
    input_type = kwargs.get('input_type', 'list')
    initialize_medoids = kwargs.get('initialize_medoids', None)
    itermax = kwargs.get('itermax', 200)

    if metric is None:
        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    input_data = sample
    if data_type == 'distance_matrix':
        input_data = calculate_distance_matrix(sample)

    if input_type == 'numpy':
        input_data = numpy.array(input_data)

    testing_result = False
    testing_attempts = 1
    if initialize_medoids is not None:  # in case center initializer randomization appears
        testing_attempts = 10

    # Fix: sort a copy instead of mutating the caller's expected list in place.
    expected_sizes = None
    if expected_cluster_length is not None:
        expected_sizes = sorted(expected_cluster_length)

    for _ in range(testing_attempts):
        if initialize_medoids is not None:
            initial_medoids = kmeans_plusplus_initializer(sample, initialize_medoids).initialize(return_index=True)

        kmedoids_instance = kmedoids(input_data, initial_medoids, 0.001, ccore_flag, metric=metric, data_type=data_type, itermax=itermax)
        kmedoids_instance.process()

        clusters = kmedoids_instance.get_clusters()
        medoids = kmedoids_instance.get_medoids()

        if itermax == 0:
            # With zero iterations nothing is clustered and medoids are untouched.
            assertion.eq([], clusters)
            assertion.eq(medoids, initial_medoids)
            return

        if len(clusters) != len(medoids):
            continue

        if len(set(medoids)) != len(medoids):
            continue

        obtained_cluster_sizes = [len(cluster) for cluster in clusters]
        if len(sample) != sum(obtained_cluster_sizes):
            continue

        if expected_sizes is not None:
            obtained_cluster_sizes.sort()
            if obtained_cluster_sizes != expected_sizes:
                continue

        testing_result = True

    assertion.true(testing_result)
def templateClusteringDistanceMatrix(path_to_file, radius, neighbors, expected_length_clusters, ccore):
    """Run DBSCAN over a precomputed distance matrix and check point coverage and cluster sizes."""
    points = read_sample(path_to_file)
    matrix = calculate_distance_matrix(points)

    algorithm = dbscan(matrix, radius, neighbors, ccore, data_type='distance_matrix')
    algorithm.process()

    allocated_clusters = algorithm.get_clusters()
    allocated_noise = algorithm.get_noise()
    cluster_sizes = [len(item) for item in allocated_clusters]

    # Every point is either clustered or marked as noise.
    assertion.eq(len(points), sum(cluster_sizes) + len(allocated_noise))
    # Total amount of clustered points matches the expectation.
    assertion.eq(sum(expected_length_clusters), sum(cluster_sizes))
    # Sorted actual sizes match the expected lengths.
    assertion.eq(expected_length_clusters, sorted(cluster_sizes))
def templateClusteringResultsSpecificData(data_type, path, radius, neighbors, amount_clusters, expected_length_clusters, ccore):
    """Run OPTICS on a sample and validate objects, clusters, noise and ordering.

    Args:
        data_type: 'points' or 'distance_matrix' input representation.
        path: path to the sample, passed to `read_sample`.
        radius, neighbors, amount_clusters, ccore: forwarded to `optics`.
        expected_length_clusters: expected cluster sizes.
    """
    sample = read_sample(path)

    if data_type == 'distance_matrix':
        input_data = calculate_distance_matrix(sample)
    else:
        input_data = sample

    optics_instance = optics(input_data, radius, neighbors, amount_clusters, ccore, data_type=data_type)
    optics_instance.process()

    clusters = optics_instance.get_clusters()
    noise = optics_instance.get_noise()
    optics_objects = optics_instance.get_optics_objects()

    # Every optics object must refer to a distinct point index.
    object_indexes = set([obj.index_object for obj in optics_objects])
    assertion.eq(len(optics_objects), len(object_indexes))

    # Core and reachability distances are non-negative when defined.
    for obj in optics_objects:
        if obj.core_distance is not None:
            assertion.ge(obj.core_distance, 0)
        if obj.reachability_distance is not None:
            assertion.ge(obj.reachability_distance, 0)

    # Fix: use the `assertion` helpers consistently instead of bare `assert`
    # statements (bare asserts are stripped when Python runs with -O).
    assertion.eq(len(sample), sum([len(cluster) for cluster in clusters]) + len(noise))
    assertion.eq(len(expected_length_clusters), len(clusters))
    assertion.eq(sum(expected_length_clusters), sum([len(cluster) for cluster in clusters]))
    assertion.eq(sorted(expected_length_clusters), sorted([len(cluster) for cluster in clusters]))

    if amount_clusters is not None:
        analyser = ordering_analyser(optics_instance.get_ordering())
        assertion.gt(len(analyser), 0)

        amount_clusters, borders = analyser.extract_cluster_amount(optics_instance.get_radius())
        assertion.eq(len(expected_length_clusters), amount_clusters)
        assertion.eq(amount_clusters - 1, len(borders))
def correct_scores(sample_path, answer_path, ccore_flag, **kwargs):
    """Compute silhouette scores for a clustering taken from an answer file.

    Checks that one score is produced per point and that every score lies
    within [-1, 1]; returns the scores for further inspection.
    """
    data_type = kwargs.get('data_type', 'points')

    sample = read_sample(sample_path)
    if data_type == 'distance_matrix':
        sample = calculate_distance_matrix(sample, distance_metric(type_metric.EUCLIDEAN_SQUARE))

    clusters = answer_reader(answer_path).get_clusters()

    analyser = silhouette(sample, clusters, ccore=ccore_flag, data_type=data_type)
    scores = analyser.process().get_score()

    # One score per point, each bounded by the silhouette range [-1, 1].
    assertion.eq(len(sample), len(scores))
    for value in scores:
        assertion.le(-1.0, value)
        assertion.ge(1.0, value)

    return scores
def template_clustering(start_medoids, path, tolerance=0.25, show=True, **kwargs):
    """Cluster a sample with k-medoids, print statistics and optionally visualize.

    Returns the original points together with the obtained clusters.
    """
    use_ccore = kwargs.get('ccore', True)
    data_type = kwargs.get('data_type', 'points')

    original_data = read_sample(path)

    sample = original_data
    if data_type == 'distance_matrix':
        sample = calculate_distance_matrix(sample)

    metric = distance_metric(type_metric.EUCLIDEAN_SQUARE, data=sample)

    algorithm = kmedoids(sample, start_medoids, tolerance, metric=metric, ccore=use_ccore, data_type=data_type)
    ticks, _ = timedcall(algorithm.process)

    clusters = algorithm.get_clusters()
    medoids = algorithm.get_medoids()

    print("Iterations:", algorithm.get_iterations())
    print([len(cluster) for cluster in clusters])
    print(clusters)
    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    if show is True:
        visualizer = cluster_visualizer(1)
        visualizer.append_clusters(clusters, original_data, 0)
        # Initial medoids (by index) and final medoids are both drawn as stars.
        visualizer.append_cluster([original_data[index] for index in start_medoids], marker='*', markersize=15)
        visualizer.append_cluster(medoids, data=original_data, marker='*', markersize=15)
        visualizer.show()

    return original_data, clusters
def templateClusterAllocationOneDimensionDataSpecificData(data_type, ccore_flag):
    """DBSCAN on random 1-D data: four well-separated groups of ten points each."""
    for _ in range(50):
        # Four groups of ten 1-D points with offsets 0, 3, 6 and 9.
        sample = []
        for shift in (0, 3, 6, 9):
            sample += [[random() + shift] for _ in range(10)]

        if data_type == 'distance_matrix':
            input_data = calculate_distance_matrix(sample)
        elif data_type == 'points':
            input_data = sample
        else:
            raise ValueError("Incorrect data type '%s' is specified" % data_type)

        algorithm = dbscan(input_data, 1.0, 2, ccore_flag, data_type=data_type)
        algorithm.process()

        found_clusters = algorithm.get_clusters()
        assertion.eq(4, len(found_clusters))
        for found_cluster in found_clusters:
            assertion.eq(10, len(found_cluster))
def templateClusterAllocationOneDimensionDataSpecificData(data_type, ccore_flag):
    """Repeatedly cluster random 1-D data with DBSCAN; expect 4 groups of 10 points."""
    for _ in range(50):
        # Groups centered (roughly) at 0, 3, 6 and 9 - separated well beyond radius 1.0.
        sample = [[random() + offset] for offset in (0, 3, 6, 9) for _ in range(10)]

        if data_type == 'distance_matrix':
            input_data = calculate_distance_matrix(sample)
        elif data_type == 'points':
            input_data = sample
        else:
            raise ValueError("Incorrect data type '%s' is specified" % data_type)

        dbscan_instance = dbscan(input_data, 1.0, 2, ccore_flag, data_type=data_type)
        dbscan_instance.process()

        result_clusters = dbscan_instance.get_clusters()
        assertion.eq(4, len(result_clusters))
        for single_cluster in result_clusters:
            assertion.eq(10, len(single_cluster))
def templateLengthProcessSpecificData(data_type, path_to_file, radius, min_number_neighbors, max_number_neighbors, ccore):
    """Run DBSCAN over a range of neighbor thresholds and check point coverage.

    Args:
        data_type: 'points' or 'distance_matrix' input representation.
        path_to_file: path to the sample, passed to `read_sample`.
        radius: connectivity radius for DBSCAN.
        min_number_neighbors, max_number_neighbors: half-open range of
            neighbor thresholds to test.
        ccore: forwarded to `dbscan` (C-core backend switch).

    Raises:
        ValueError: if `data_type` is not supported.
    """
    # Hoisted out of the loop: the sample and its representation do not
    # depend on the neighbor threshold.
    sample = read_sample(path_to_file)
    if data_type == 'distance_matrix':
        input_data = calculate_distance_matrix(sample)
    elif data_type == 'points':
        input_data = sample
    else:
        raise ValueError("Incorrect data type '%s' is specified" % data_type)

    for number_neighbors in range(min_number_neighbors, max_number_neighbors, 1):
        # Fix: use the loop's neighbor threshold; the original discarded the
        # loop variable and passed the constant `min_number_neighbors`,
        # repeating the exact same run on every iteration.
        dbscan_instance = dbscan(input_data, radius, number_neighbors, ccore, data_type=data_type)
        dbscan_instance.process()

        clusters = dbscan_instance.get_clusters()
        noise = dbscan_instance.get_noise()

        length = len(noise)
        length += sum([len(cluster) for cluster in clusters])

        # Every point must be allocated either to a cluster or to noise.
        assertion.eq(len(sample), length)
def templateClusteringDistanceMatrix(path_to_file, radius, neighbors, expected_length_clusters, ccore):
    """DBSCAN via distance-matrix input: verify full coverage and expected cluster sizes."""
    dataset = read_sample(path_to_file)

    instance = dbscan(calculate_distance_matrix(dataset), radius, neighbors, ccore, data_type='distance_matrix')
    instance.process()

    result_clusters = instance.get_clusters()
    result_noise = instance.get_noise()
    sizes = sorted(len(group) for group in result_clusters)

    # Clustered points plus noise account for the whole dataset.
    assertion.eq(len(dataset), sum(sizes) + len(result_noise))
    # Total clustered amount and per-cluster sizes match expectations.
    assertion.eq(sum(expected_length_clusters), sum(sizes))
    assertion.eq(expected_length_clusters, sizes)
def templateLengthProcessSpecificData(data_type, path_to_file, radius, min_number_neighbors, max_number_neighbors, ccore):
    """Check DBSCAN point coverage for each neighbor threshold in a range.

    For every threshold in [min_number_neighbors, max_number_neighbors) the
    sum of clustered points and noise must equal the sample size.

    Raises:
        ValueError: if `data_type` is not 'points' or 'distance_matrix'.
    """
    # Invariant work hoisted out of the loop - the input never changes.
    sample = read_sample(path_to_file)
    if data_type == 'distance_matrix':
        input_data = calculate_distance_matrix(sample)
    elif data_type == 'points':
        input_data = sample
    else:
        raise ValueError("Incorrect data type '%s' is specified" % data_type)

    for number_neighbors in range(min_number_neighbors, max_number_neighbors, 1):
        # Fix: the original ignored the loop variable and always passed
        # `min_number_neighbors`, so every iteration tested the same value.
        dbscan_instance = dbscan(input_data, radius, number_neighbors, ccore, data_type=data_type)
        dbscan_instance.process()

        clusters = dbscan_instance.get_clusters()
        noise = dbscan_instance.get_noise()

        length = len(noise)
        length += sum([len(cluster) for cluster in clusters])

        # Each point is either clustered or noise - nothing lost, nothing duplicated.
        assertion.eq(len(sample), length)
def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_cluster_length, metric, ccore_flag, **kwargs):
    """Run k-medoids once and assert basic invariants of the result.

    Verifies: one medoid per cluster, medoid uniqueness, full point coverage
    and that sorted cluster sizes equal the expected sizes.

    Keyword Args:
        data_type: 'points' (default) or 'distance_matrix'.
        input_type: 'list' (default) or 'numpy'.
    """
    sample = read_sample(path_to_file)

    data_type = kwargs.get('data_type', 'points')
    input_type = kwargs.get('input_type', 'list')

    if metric is None:
        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    input_data = sample
    if data_type == 'distance_matrix':
        input_data = calculate_distance_matrix(sample)

    if input_type == 'numpy':
        # NOTE(review): numpy.matrix is deprecated in NumPy; sibling templates
        # use numpy.array - migrate once matrix input is no longer under test.
        input_data = numpy.matrix(input_data)

    kmedoids_instance = kmedoids(input_data, initial_medoids, 0.025, ccore_flag, metric=metric, data_type=data_type)
    kmedoids_instance.process()

    clusters = kmedoids_instance.get_clusters()
    medoids = kmedoids_instance.get_medoids()

    assertion.eq(len(clusters), len(medoids))
    assertion.eq(len(set(medoids)), len(medoids))

    obtained_cluster_sizes = sorted(len(cluster) for cluster in clusters)
    assertion.eq(len(sample), sum(obtained_cluster_sizes))

    # Fix: compare against a sorted copy instead of sorting the caller's
    # expected list in place (in-place .sort() mutated the argument).
    assertion.eq(obtained_cluster_sizes, sorted(expected_cluster_length))
def templateClusteringResultsSpecificData(data_type, path, radius, neighbors, amount_clusters, expected_length_clusters, ccore):
    """Run OPTICS and validate its objects, cluster allocation and ordering analysis.

    Args:
        data_type: 'points' or 'distance_matrix' input representation.
        path: path to the sample, passed to `read_sample`.
        radius, neighbors, amount_clusters, ccore: forwarded to `optics`.
        expected_length_clusters: expected cluster sizes.
    """
    sample = read_sample(path)

    if data_type == 'distance_matrix':
        input_data = calculate_distance_matrix(sample)
    else:
        input_data = sample

    optics_instance = optics(input_data, radius, neighbors, amount_clusters, ccore, data_type=data_type)
    optics_instance.process()

    clusters = optics_instance.get_clusters()
    noise = optics_instance.get_noise()
    optics_objects = optics_instance.get_optics_objects()

    # Object indexes must be unique across all optics objects.
    object_indexes = set([obj.index_object for obj in optics_objects])
    assertion.eq(len(optics_objects), len(object_indexes))

    # Defined distances are never negative.
    for obj in optics_objects:
        if obj.core_distance is not None:
            assertion.ge(obj.core_distance, 0)
        if obj.reachability_distance is not None:
            assertion.ge(obj.reachability_distance, 0)

    # Fix: replaced bare `assert` statements with the `assertion` helpers used
    # elsewhere in this file (bare asserts disappear under `python -O`).
    assertion.eq(len(sample), sum([len(cluster) for cluster in clusters]) + len(noise))
    assertion.eq(len(expected_length_clusters), len(clusters))
    assertion.eq(sum(expected_length_clusters), sum([len(cluster) for cluster in clusters]))
    assertion.eq(sorted(expected_length_clusters), sorted([len(cluster) for cluster in clusters]))

    if amount_clusters is not None:
        analyser = ordering_analyser(optics_instance.get_ordering())
        assertion.gt(len(analyser), 0)

        amount_clusters, borders = analyser.extract_cluster_amount(optics_instance.get_radius())
        assertion.eq(len(expected_length_clusters), amount_clusters)
        assertion.eq(amount_clusters - 1, len(borders))
def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_cluster_length, metric, ccore_flag, **kwargs):
    """Run k-medoids (with optional randomized initialization retries) and verify the result.

    Args:
        path_to_file: path to the sample, passed to `read_sample`.
        initial_medoids: initial medoid indexes (regenerated per attempt when
            'initialize_medoids' is supplied via kwargs).
        expected_cluster_length: expected cluster sizes (order-insensitive),
            or None to skip the size comparison.
        metric: distance metric; None selects squared Euclidean.
        ccore_flag: forwarded to `kmedoids` (C-core backend switch).

    Keyword Args:
        data_type: 'points' (default) or 'distance_matrix'.
        input_type: 'list' (default) or 'numpy'.
        initialize_medoids: amount of medoids for k-means++ initialization.
        itermax: maximum iterations (default 200); 0 asserts no clustering.
    """
    sample = read_sample(path_to_file)

    data_type = kwargs.get('data_type', 'points')
    input_type = kwargs.get('input_type', 'list')
    initialize_medoids = kwargs.get('initialize_medoids', None)
    itermax = kwargs.get('itermax', 200)

    if metric is None:
        metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    input_data = sample
    if data_type == 'distance_matrix':
        input_data = calculate_distance_matrix(sample)

    if input_type == 'numpy':
        input_data = numpy.array(input_data)

    testing_result = False
    testing_attempts = 1
    if initialize_medoids is not None:  # in case center initializer randomization appears
        testing_attempts = 10

    # Fix: sort a copy instead of mutating the caller's expected list in place.
    expected_sizes = None
    if expected_cluster_length is not None:
        expected_sizes = sorted(expected_cluster_length)

    for _ in range(testing_attempts):
        if initialize_medoids is not None:
            initial_medoids = kmeans_plusplus_initializer(sample, initialize_medoids).initialize(return_index=True)

        kmedoids_instance = kmedoids(input_data, initial_medoids, 0.001, ccore_flag, metric=metric, data_type=data_type, itermax=itermax)
        kmedoids_instance.process()

        clusters = kmedoids_instance.get_clusters()
        medoids = kmedoids_instance.get_medoids()

        if itermax == 0:
            # With zero iterations nothing is clustered and medoids are untouched.
            assertion.eq([], clusters)
            assertion.eq(medoids, initial_medoids)
            return

        if len(clusters) != len(medoids):
            continue

        if len(set(medoids)) != len(medoids):
            continue

        obtained_cluster_sizes = [len(cluster) for cluster in clusters]
        if len(sample) != sum(obtained_cluster_sizes):
            continue

        if expected_sizes is not None:
            obtained_cluster_sizes.sort()
            if obtained_cluster_sizes != expected_sizes:
                continue

        testing_result = True

    assertion.true(testing_result)
def testCalculateMatrixDistanceAsNumPy(self):
    """Distance matrix for numpy 1-D points [0], [2], [4]: symmetric with zero diagonal."""
    points = numpy.array([[0], [2], [4]])
    expected = [[0.0, 2.0, 4.0], [2.0, 0.0, 2.0], [4.0, 2.0, 0.0]]
    self.assertEqual(utils.calculate_distance_matrix(points), expected)
def testCalculateMatrixDistance(self):
    """Pairwise distance matrix for 1-D points [0], [2], [4]."""
    points = [[0], [2], [4]]
    expected = [[0.0, 2.0, 4.0], [2.0, 0.0, 2.0], [4.0, 2.0, 0.0]]
    assert utils.calculate_distance_matrix(points) == expected
def clustering_with_answer(data_file, answer_file, ccore, **kwargs):
    """Cluster a sample with k-medoids and compare the result with an answer file.

    Medoids are seeded via k-means++ (their amount is taken from the answer
    file), then the result is checked for: one medoid per expected cluster,
    medoid uniqueness, full and unique point coverage, matching cluster sizes
    and exact cluster content.
    """
    data_type = kwargs.get('data_type', 'points')
    metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))

    original_data = read_sample(data_file)
    data = original_data
    if data_type == 'distance_matrix':
        data = calculate_distance_matrix(original_data, metric)

    reader = answer_reader(answer_file)
    amount_medoids = len(reader.get_clusters())

    initial_medoids = kmeans_plusplus_initializer(data, amount_medoids, **kwargs).initialize(return_index=True)

    kmedoids_instance = kmedoids(data, initial_medoids, 0.001, ccore, **kwargs)
    kmedoids_instance.process()

    clusters = kmedoids_instance.get_clusters()
    medoids = kmedoids_instance.get_medoids()
    cluster_sizes = [len(cluster) for cluster in clusters]

    expected_length_clusters = sorted(reader.get_cluster_lengths())

    # One medoid per expected cluster and full point coverage.
    assertion.eq(len(expected_length_clusters), len(medoids))
    assertion.eq(len(data), sum(cluster_sizes))
    assertion.eq(sum(expected_length_clusters), sum(cluster_sizes))

    # Medoids must be unique.
    unique_medoids = set()
    for medoid in medoids:
        assertion.false(medoid in unique_medoids, message="Medoids '%s' is not unique (actual medoids: '%s')" % (str(medoid), str(unique_medoids)))
        unique_medoids.add(medoid)

    # Each point belongs to exactly one cluster.
    unique_points = set()
    for cluster in clusters:
        for point in cluster:
            assertion.false(point in unique_points, message="Point '%s' is already assigned to one of the clusters." % str(point))
            unique_points.add(point)

    assertion.eq(expected_length_clusters, sorted(cluster_sizes))

    # Every obtained cluster must match one of the expected clusters exactly.
    expected_clusters = reader.get_clusters()
    for actual_cluster in clusters:
        cluster_found = any(actual_cluster == expected_cluster for expected_cluster in expected_clusters)
        assertion.true(cluster_found, message="Actual cluster '%s' is not found among expected." % str(actual_cluster))
def testCalculateMatrixDistance(self):
    """Distance matrix of three 1-D points must match the precomputed result."""
    # Rewritten without the non-idiomatic trailing semicolons; behavior unchanged.
    source = [[0], [2], [4]]
    result = utils.calculate_distance_matrix(source)
    assert result == [[0.0, 2.0, 4.0], [2.0, 0.0, 2.0], [4.0, 2.0, 0.0]]