def test_to_dic(self):
    """Compare Clustering.to_dic() output with a hand-written dictionary."""
    clustering = Clustering([
        Cluster(16, [16]),
        Cluster(9, [9, 10, 11, 12, 13, 14, 15]),
        Cluster(0, [0, 1, 2, 3]),
        Cluster(4, [4, 5, 6, 7, 8]),
    ])

    expected_clusters = [
        {'prototype': 9, 'elements': '9:15', 'id': 'cluster_1'},
        {'prototype': 4, 'elements': '4:8', 'id': 'cluster_3'},
        {'prototype': 0, 'elements': '0:3', 'id': 'cluster_2'},
        {'prototype': 16, 'elements': '16', 'id': 'cluster_0'},
    ]
    expected = {
        'clusters': expected_clusters,
        'total_number_of_elements': 17,
        'number_of_clusters': 4,
    }

    self.assertDictEqual(clustering.to_dic(), expected)
def test_remove_noise(self):
    """After eliminate_noise(5), two of the four clusters must remain."""
    # Cluster sizes here are 1, 5, 4 and 7.
    clustering = Clustering((
        Cluster(16, [16]),
        Cluster(4, [4, 5, 6, 7, 8]),
        Cluster(0, [0, 1, 2, 3]),
        Cluster(9, [9, 10, 11, 12, 13, 14, 15]),
    ))

    clustering.eliminate_noise(5)

    remaining = len(clustering.clusters)
    self.assertEqual(remaining, 2)
def test_cluster_mixed_cohesion_wo_prot(self):
    """Check the private between-cluster distance of SeparationCalculator."""
    distances = CondensedMatrix([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])

    partition_a = [
        Cluster(None, elements=[0, 1]),
        Cluster(None, elements=[2]),
        Cluster(None, elements=[3, 4]),
    ]
    partition_b = [
        Cluster(None, elements=[0, 2, 4]),
        Cluster(None, elements=[1, 3]),
    ]

    sep_calctor = SeparationCalculator()
    # Name-mangled private method, reached explicitly on purpose.
    between = sep_calctor._SeparationCalculator__between_cluster_distance

    # (first cluster, second cluster, expected distance)
    cases = [
        (partition_a[0], partition_a[1], 7.0),
        (partition_a[0], partition_a[2], 20.0),
        (partition_a[1], partition_a[2], 17.0),
        (partition_b[0], partition_b[1], 34.0),
    ]
    for one, other, expected in cases:
        self.assertEqual(between(one, other, distances), expected)
def test_get_all_clustered_elements(self):
    """All 17 elements spread over the clusters must be reported."""
    clustering = Clustering((
        Cluster(16, [16]),
        Cluster(4, [4, 5, 6, 7, 8]),
        Cluster(0, [0, 1, 2, 3]),
        Cluster(9, [9, 10, 11, 12, 13, 14, 15]),
    ))

    found = sorted(clustering.get_all_clustered_elements())

    self.assertItemsEqual(found, range(17))
def test_cluster_cohe_sep_wo_prot_eval(self):
    """Check SeparationCalculator.cluster_separation on two partitions."""
    distances = CondensedMatrix([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])

    partition_a = [
        Cluster(None, elements=[0, 1]),
        Cluster(None, elements=[2]),
        Cluster(None, elements=[3, 4]),
    ]
    partition_b = [
        Cluster(None, elements=[0, 2, 4]),
        Cluster(None, elements=[1, 3]),
    ]
    clustering_a = Clustering(partition_a)
    clustering_b = Clustering(partition_b)

    sep_calctor = SeparationCalculator()

    # (cluster, clustering it belongs to, expected separation)
    cases = [
        (partition_a[0], clustering_a, 27.0),
        (partition_a[1], clustering_a, 24.0),
        (partition_a[2], clustering_a, 37.0),
        (partition_b[0], clustering_b, 34.0),
        (partition_b[1], clustering_b, 34.0),
    ]
    for cluster, clustering, expected in cases:
        separation = sep_calctor.cluster_separation(cluster, clustering,
                                                    1., distances)
        self.assertEqual(separation, expected)
def test_cluster_cohesion_without_prototype(self):
    """Check CohesionCalculator.evaluate_cluster on prototype-less clusters."""
    distances = CondensedMatrix([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])

    partition_a = [
        Cluster(None, elements=[0, 1]),
        Cluster(None, elements=[2]),
        Cluster(None, elements=[3, 4]),
    ]
    partition_b = [
        Cluster(None, elements=[0, 2, 4]),
        Cluster(None, elements=[1, 3]),
    ]

    cohesion_calctor = CohesionCalculator()

    # (cluster, expected cohesion)
    cases = [
        (partition_a[0], 0.5),
        (partition_a[1], 0.),
        (partition_a[2], 5.0),
        (partition_b[0], 5.0),
        (partition_b[1], 3.0),
    ]
    for cluster, expected in cases:
        self.assertEqual(
            cohesion_calctor.evaluate_cluster(cluster, distances), expected)
def setUpClass(cls):
    """
    Build the fixtures shared by the tests of this class: the condensed
    distance matrix and two clusterings of the same six elements.
    """
    cls.matrix = CondensedMatrix(squared_CH_table1)

    cls.clusterings = [
        Clustering([Cluster(None, [0, 1, 2, 3]),
                    Cluster(None, [4, 5])]),
        Clustering([Cluster(None, [0, 1]),
                    Cluster(None, [2, 3]),
                    Cluster(None, [4, 5])]),
    ]

    # Every clustering needs its medoids computed, not only the first one
    # (the clusters are created with prototype=None).
    for clustering in cls.clusterings:
        update_medoids(clustering, cls.matrix)
def test_redefine_cluster_with_map(self):
    """Check Refiner.redefine_cluster_with_map on two index clusters."""
    initial_cluster = Cluster(None, [1, 3, 4, 7, 8])

    # Each case is (cluster passed as second argument, expected elements).
    cases = [
        # -> elements [1,3,8] of initial cluster
        (Cluster(None, [0, 1, 4]), [1, 3, 8]),
        # -> elements [4,7] of initial cluster
        (Cluster(None, [2, 3]), [4, 7]),
    ]

    for final_cluster, expected_elements in cases:
        redefined = Refiner.redefine_cluster_with_map(initial_cluster,
                                                      final_cluster)
        self.assertItemsEqual(expected_elements, redefined.all_elements)
def test_creation(self):
    """A change made to a source cluster must be visible via the Clustering."""
    # The inner list is a copy but shares clusters
    source_clusters = (
        Cluster(16, [16]),
        Cluster(4, [4, 5, 6, 7, 8]),
        Cluster(0, [0, 1, 2, 3]),
        Cluster(9, [9, 10, 11, 12, 13, 14, 15]),
    )
    clustering = Clustering(source_clusters)

    source_clusters[1].prototype = -20

    self.assertEqual(source_clusters[1].prototype,
                     clustering.clusters[1].prototype)
def test_mean_cluster_size(self):
    """Five clusters holding 20 elements must give a mean size of 4."""
    clustering = Clustering(
        [
            Cluster(0, [0, 4, 5, 7, 13]),
            Cluster(1, [1, 16, 17, 18]),
            Cluster(2, [2, 3, 8, 19]),
            Cluster(6, [6, 11, 12, 15]),
            Cluster(9, [9, 10, 14]),
        ],
        "Test Clustering")

    populator = AnalysisPopulatorMock("")
    mean_size = populator.analysis_function_mean_cluster_size(clustering)

    self.assertEqual(4, mean_size)
def test_update_medois(self):
    """update_medoids must give every cluster a prototype."""
    clusters = [
        Cluster(None, [1, 2]),
        Cluster(None, [3, 4]),
        Cluster(None, [5]),
    ]
    clustering = Clustering(clusters)
    matrix = CondensedMatrix(squared_CH_table1)

    update_medoids(clustering, matrix)

    for cluster in clusters:
        self.assertNotEqual(cluster.prototype, None)

    prototypes = [cluster.prototype for cluster in clusters]
    self.assertItemsEqual(prototypes, [1, 3, 5])
def test_get_percent_of_n_clusters(self):
    """Check the population percentages reported for the 3 bigger clusters."""
    clustering = Clustering((
        Cluster(16, [16]),
        Cluster(4, [4, 5, 6, 7, 8]),
        Cluster(0, [0, 1, 2, 3]),
        Cluster(9, [9, 10, 11, 12, 13, 14, 15]),
    ))

    percents = clustering.get_population_percent_of_n_bigger_clusters(3)

    expected_percents = [41.1764705882, 29.4117647059, 23.5294117647]
    # Index into 'percents' so that a too-short result still fails loudly.
    for position, expected in enumerate(expected_percents):
        self.assertAlmostEqual(percents[position], expected, 1)
def test_get_percent_population_of_cluster(self):
    """The population percentages of the four clusters must add up to 100."""
    clustering = Clustering((
        Cluster(16, [16]),
        Cluster(4, [4, 5, 6, 7, 8]),
        Cluster(0, [0, 1, 2, 3]),
        Cluster(9, [9, 10, 11, 12, 13, 14, 15]),
    ))

    total = sum(clustering.get_population_percent_of_cluster(index)
                for index in range(4))

    self.assertAlmostEqual(total, 100., 2)
def test_mini_evaluation(self):
    """Regression check of MeanMinimumDistanceCalculator.evaluate."""
    distances = CondensedMatrix([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
    clustering = Clustering([
        Cluster(None, elements=[0, 1, 2]),
        Cluster(None, elements=[3, 4]),
    ])

    calculator = MeanMinimumDistanceCalculator(10)
    result = calculator.evaluate(clustering, distances, 20)

    self.assertEqual(7.0, result)
def test_calculate_biased_medoid(self):
    """
    Cluster.calculate_biased_medoid must return element 4 for both sets of
    interesting elements used here.
    """
    condensed_matrix = CondensedMatrix(
        [1.0, 4.5, 7.2, 6.7, 8.5, 4.5, 3.6, 7.8, 2.2, 2.0])
    c = Cluster(None, [0, 2, 3, 4])

    # assertEqual instead of the deprecated alias assertEquals.
    for interesting_elements in ([3, 4, 0], [4, 2, 3]):
        self.assertEqual(
            4,
            c.calculate_biased_medoid(condensed_matrix,
                                      interesting_elements))
def test_regression_cohesion_eval(self):
    """Regression check of CohesionCalculator.evaluate on a 3-cluster case."""
    distances = CondensedMatrix([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
    clustering = Clustering([
        Cluster(None, elements=[0, 1]),
        Cluster(None, elements=[2]),
        Cluster(None, elements=[3, 4]),
    ])

    cohesion = CohesionCalculator().evaluate(clustering, distances)

    self.assertEqual(cohesion, 5.5)
def test_getClusterAndComplementary(self):
    """
    get_cluster_and_complementary(1, ...) must return elements 0..4 and,
    as complementary set, elements 5..19.
    """
    clustering = Clustering([
        Cluster(1, range(5)),
        Cluster(5, range(5, 10)),
        Cluster(10, range(10, 20)),
    ])

    A, Acomp = get_cluster_and_complementary(1, clustering.clusters)
    A.sort()
    Acomp.sort()

    expected_A = [0, 1, 2, 3, 4]
    expected_Acomp = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                      15, 16, 17, 18, 19]
    self.assertItemsEqual(A, expected_A)
    self.assertItemsEqual(Acomp, expected_Acomp)
def test_subsampled_mean_min_dist(self):
    """Regression check of subsampled_mean_min_dist for two clusters."""
    distances = CondensedMatrix([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
    first = Cluster(None, elements=[0, 1, 2])
    second = Cluster(None, elements=[3, 4])

    calculator = MeanMinimumDistanceCalculator(10)
    result = calculator.subsampled_mean_min_dist(first, second, 20,
                                                 distances)

    self.assertEqual((8.0, 6.0), result)
def test_get_intra_cluster_distances(self):
    """Check get_intra_cluster_distances against known values."""
    matrix = CondensedMatrix(CH_table1)

    pair_distances = get_intra_cluster_distances(Cluster(None, [4, 5]),
                                                 matrix)
    numpy.testing.assert_almost_equal(pair_distances,
                                      [2.4494897427831779],
                                      5)

    triple_distances = get_intra_cluster_distances(
        Cluster(None, [1, 3, 5]), matrix)
    numpy.testing.assert_almost_equal(
        triple_distances,
        [2.4494897427831779, 3.8729833462074170, 3.8729833462074170],
        5)

    # Second scenario: a five-element matrix with four unit distances.
    data = [0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    matrix = CondensedMatrix(data)
    all_distances = get_intra_cluster_distances(Cluster(None, range(5)),
                                                matrix)
    expected_distance = 4
    self.assertEqual(expected_distance, numpy.sum(all_distances))
def test_PCA(self):
    """
    Regression test.

    Evaluates PCAMetric on a two-cluster clustering and compares the result
    with a previously recorded value (12 decimal places).
    """
    trajectory_handler = TrajectoryHandlerStub(
        testPCAMetric.not_iterposed_coordsets, 66)
    clustering = Clustering(
        [Cluster(None, range(6)), Cluster(None, range(6, 12))],
        "a clustering")
    pcaMetric = PCAMetric(trajectory_handler)

    # assertAlmostEqual instead of the deprecated alias assertAlmostEquals.
    self.assertAlmostEqual(pcaMetric.evaluate(clustering),
                           1.427748687873, 12)
def test_number_of_clusters_needed_to_get_this_percent_of_elems(self):
    """Check number_of_clusters_to_get_percent for several percentages."""
    clustering = Clustering((
        Cluster(16, [16]),
        Cluster(4, [4, 5, 6, 7, 8]),
        Cluster(0, [0, 1, 2, 3]),
        Cluster(9, [9, 10, 11, 12, 13, 14, 15]),
    ))

    # (requested percent, expected number of clusters)
    cases = [(71, 3), (70, 2), (40, 1), (42, 2), (100, 4)]
    for percent, expected_count in cases:
        self.assertEqual(
            clustering.number_of_clusters_to_get_percent(percent),
            expected_count)
def calculate_RMSF(best_clustering, data_handler):
    """
    Calls 'superpose_and_calc_rmsf' for every cluster of a clustering and for
    one extra cluster (id "global") holding all the clustered elements.

    @param best_clustering: The clustering whose clusters are processed.
    @param data_handler: Object whose 'get_data()' result provides
    'getSelectionCoordinates'; the "name CA" selection is used.

    @return: A dictionary indexed by cluster id with the value returned by
    'superpose_and_calc_rmsf' for each cluster.
    """
    data = data_handler.get_data()
    ca_pdb_coordsets = data.getSelectionCoordinates("name CA")

    # Extra cluster with every clustered element.
    everything = Cluster(None, best_clustering.get_all_clustered_elements())
    everything.id = "global"

    rmsf_per_cluster = {}
    for current in best_clustering.clusters + [everything]:
        rmsf = superpose_and_calc_rmsf(ca_pdb_coordsets, current)
        rmsf_per_cluster[current.id] = rmsf
    return rmsf_per_cluster
def test_get_min_distances(self):
    """
    Check the pair returned by
    MeanMinimumDistanceCalculator.get_mean_and_min_distances for two
    clusters.
    """
    distances = CondensedMatrix([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
    clusters = [
        Cluster(None, elements=[0, 1, 2]),
        Cluster(None, elements=[3, 4]),
    ]
    calculator = MeanMinimumDistanceCalculator(10)

    min_dists, mean = calculator.get_mean_and_min_distances(
        clusters[0], clusters[1], distances)

    self.assertItemsEqual([3.0, 6.0, 8.0], min_dists)
    # assertAlmostEqual instead of the deprecated alias assertAlmostEquals.
    self.assertAlmostEqual(12.33, mean, 2)
def test_gen_clusters_from_grouping_list(self):
    """Clusters built from a class list must match the expected ones."""
    # numpy.random.random_integers(0,4,20)
    group_list = [4, 1, 2, 2, 4, 4, 3, 4, 2, 0,
                  0, 3, 3, 4, 0, 3, 1, 1, 1, 2]
    numclusters = 5
    true_clusters = [
        Cluster(0, [0, 4, 5, 7, 13]),
        Cluster(1, [1, 16, 17, 18]),
        Cluster(2, [2, 3, 8, 19]),
        Cluster(6, [6, 11, 12, 15]),
        Cluster(9, [9, 10, 14]),
    ]

    generated = gen_clusters_from_class_list(group_list)
    # Order by prototype so they line up with 'true_clusters'.
    sorted_clusters = sorted(generated, key=lambda c: c.prototype)

    self.assertEqual(numclusters, len(sorted_clusters))
    for expected, obtained in zip(true_clusters, sorted_clusters):
        self.assertEqual(expected, obtained)
def test_load_and_save_to_disk(self):
    """
    A clustering saved with 'save_to_disk' and read back with
    'load_from_disk' must hold the same elements.
    """
    clusters = (
        Cluster(16, [16]),
        Cluster(4, [4, 5, 6, 7, 8]),
        Cluster(0, [0, 1, 2, 3]),
        Cluster(9, [9, 10, 11, 12, 13, 14, 15]),
    )
    clustering = Clustering(clusters)
    before_saving_elements = clustering.get_all_clustered_elements()

    saved_path = os.path.join(test_data.__path__[0],
                              "saved_clustering_for_test")
    try:
        clustering.save_to_disk(saved_path)
        loaded_clustering = Clustering.load_from_disk(saved_path)
        after_saving_elements = loaded_clustering.get_all_clustered_elements()
        self.assertItemsEqual(before_saving_elements, after_saving_elements)
    finally:
        # Remove the same path that was written, regardless of the current
        # working directory and even when an assertion above fails.
        if os.path.exists(saved_path):
            os.remove(saved_path)
def test_one_clusterization_silhouette(self):
    """Check the per-element partial silhouette values of one clustering."""
    distances = CondensedMatrix([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
    clusterization = Clustering([
        Cluster(None, elements=[0, 1]),
        Cluster(None, elements=[2]),
        Cluster(None, elements=[3, 4]),
    ])

    sil_calc = SilhouetteCoefficientCalculator()
    # Name-mangled private method, reached explicitly on purpose.
    partial_silhouette = sil_calc._SilhouetteCoefficientCalculator__one_clusterization_partial_silhouette
    obtained = partial_silhouette(clusterization, distances)

    expected = [0.5, 0.80000000000000004, -0.55000000000000004,
                -0.45000000000000001, 0.7142857142857143]
    self.assertItemsEqual(obtained, expected)
def calculate_RMSF(best_clustering, data_handler):
    """
    Calls 'superpose_and_calc_rmsf' for every cluster of a clustering and for
    one extra cluster (id "global") holding all the clustered elements.

    @param best_clustering: The clustering whose clusters are processed.
    @param data_handler: Object whose 'get_data()' result provides
    'getSelectionCoordinates'; the "name CA" selection is used.

    @return: A dictionary indexed by cluster id with the value returned by
    'superpose_and_calc_rmsf' for each cluster.
    """
    data = data_handler.get_data()
    ca_pdb_coordsets = data.getSelectionCoordinates("name CA")

    # Extra cluster with every clustered element.
    everything = Cluster(None, best_clustering.get_all_clustered_elements())
    everything.id = "global"

    rmsf_per_cluster = {}
    for current in best_clustering.clusters + [everything]:
        rmsf = superpose_and_calc_rmsf(ca_pdb_coordsets, current)
        rmsf_per_cluster[current.id] = rmsf
    return rmsf_per_cluster
def test_regression_separation_eval(self):
    """Regression check of SeparationCalculator.evaluate (two call forms)."""
    distances = CondensedMatrix([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
    clustering = Clustering([
        Cluster(None, elements=[0, 1]),
        Cluster(None, elements=[2]),
        Cluster(None, elements=[3, 4]),
    ])
    sep_calctor = SeparationCalculator()

    # With an explicit third argument of [1, 1, 1].
    with_unit_list = sep_calctor.evaluate(clustering, distances, [1, 1, 1])
    self.assertEqual(with_unit_list, 27.0 + 24.0 + 37.0)

    # Without the third argument.
    with_defaults = sep_calctor.evaluate(clustering, distances)
    self.assertEqual(with_defaults,
                     (1 / 0.5) * 27.0 + (1 / 5.0) * 37.0)
def calculate_distance_stats(elements, matrix):
    """
    Gets the mean, standard deviation and maximum of the distances from the
    medoid of a set of elements to each of those elements.

    @param elements: The elements we are working with.
    @param matrix: The used condensed matrix.

    @return: A (mean, std deviation, maximum) tuple of the distances of the
    elements to their medoid.
    """
    central_element = Cluster(None, elements).calculate_medoid(matrix)

    # The medoid belongs to 'elements', so one of these distances is the 0
    # distance of the medoid to itself.
    to_center = get_distances_of_elements_to(central_element, elements, matrix)

    mean = numpy.mean(to_center)
    deviation = numpy.std(to_center)
    radius = numpy.max(to_center)
    return mean, deviation, radius
def purge_mixed_clusters_and_do_graph(mixed, pure_clusters_traj1, condensed_distance_matrix, std_devs_from_A, path):
    """
    Rebuilds the mixed clusters keeping only their trajectory 1 elements,
    adds the pure trajectory 1 clusters and hands the result to 'do_graph'.

    @param mixed: Sequence of (cluster, elements in traj 1, elements in
    traj 2) triples.
    @param pure_clusters_traj1: Clusters without elements of trajectory 2.
    @param condensed_distance_matrix: The condensed distance matrix used.
    @param std_devs_from_A: List that is extended IN PLACE with one value per
    pure cluster (the result of 'get_distance_std_dev_for_elems').
    @param path: Forwarded untouched to 'do_graph'.
    """
    common.print_and_flush( "Purging clusters...")

    # Purge all mixed clusters of elements from traj2
    purged = []
    num_elems_of_traj_2 = []
    for _, traj1_elems, traj2_elems in mixed:
        num_elems_of_traj_2.append(len(traj2_elems))
        # We rebuild the cluster with only elements of traj 1
        purged.append(Cluster(prototype=None, elements=traj1_elems))

    # we also need to have traj 1 pure clusters
    purged.extend(pure_clusters_traj1)

    # Those don't have any element of traj 2, so we put 0s in the number of
    # elements list
    num_elems_of_traj_2.extend([0] * len(pure_clusters_traj1))

    # Calculate statistics for the remaining clusters
    for pure_cluster in pure_clusters_traj1:
        medoid = pure_cluster.calculate_medoid(condensed_distance_matrix)
        std_dev = get_distance_std_dev_for_elems(pure_cluster.all_elements,
                                                 medoid,
                                                 condensed_distance_matrix)
        std_devs_from_A.append(std_dev)
    common.print_and_flush( "Done.\n")

    common.print_and_flush("Trying to draw state graph...")
    purged_clustering = Clustering(purged, sort=False)
    do_graph(purged_clustering, num_elems_of_traj_2, std_devs_from_A, path)
    common.print_and_flush("Done.\n")
def calculate_RMSF(best_clustering, trajectoryHandler, workspaceHandler, matrixHandler):
    """
    Calls 'superpose_and_calc_rmsf' for every cluster of a clustering and for
    one extra cluster (id "global") holding all the clustered elements.

    @param best_clustering: The clustering whose clusters are processed.
    @param trajectoryHandler: Provides the merged structure; a copy of the
    coordsets of its "name CA" selection is used.
    @param workspaceHandler: Not used by this function.
    @param matrixHandler: Not used by this function.

    @return: A dictionary indexed by cluster id with the value returned by
    'superpose_and_calc_rmsf' for each cluster.
    """
    ca_selection = trajectoryHandler.getMergedStructure().select("name CA")
    # Work on a copy of the selection's coordsets.
    ca_pdb_coordsets = numpy.copy(ca_selection.getCoordsets())

    # Extra cluster with every clustered element.
    everything = Cluster(None, best_clustering.get_all_clustered_elements())
    everything.id = "global"

    rmsf_per_cluster = {}
    for current in best_clustering.clusters + [everything]:
        rmsf = superpose_and_calc_rmsf(ca_pdb_coordsets, current)
        rmsf_per_cluster[current.id] = rmsf
    return rmsf_per_cluster
def evaluate(self, clustering, matrix):
    """
    Mean is approximated to medoid.

    Returns the sum of the per-cluster variances divided by the number of
    clusters times the variance of the distances of all clustered elements
    to their global medoid.

    @param clustering: The clustering to evaluate (its medoids are updated).
    @param matrix: The condensed distance matrix used.

    @return: The resulting ratio.
    """
    update_medoids(clustering, matrix)

    # One cluster with every clustered element, centered on its own medoid.
    everything = Cluster(None, clustering.get_all_clustered_elements())
    everything.prototype = everything.calculate_medoid(matrix)
    distances_to_center = get_distances_of_elements_to(everything.prototype,
                                                       everything.all_elements,
                                                       matrix)
    global_variance = numpy.var(distances_to_center)

    per_cluster_variances = [self.cluster_variance(one_cluster, matrix)
                             for one_cluster in clustering.clusters]
    total_variance = numpy.sum(per_cluster_variances)

    return total_variance / (len(clustering.clusters) * global_variance)
def calculate_mean_center_differences(decomposed_cluster, matrix):
    """
    Averages the distances between the medoids of the subclusters of a mixed
    decomposed cluster (a qualitative view of how separated those inner
    subclusters are).

    @param decomposed_cluster: A MIXED decomposed cluster, indexed by
    trajectory id.
    @param matrix: The condensed distance matrix used.

    @return: The mean of the distances between every pair of centers.
    """
    # One medoid per trajectory id.
    centers = [Cluster(None, decomposed_cluster[traj_id]).calculate_medoid(matrix)
               for traj_id in decomposed_cluster]

    # Every unordered pair of centers, visited once.
    center_distances = []
    for position, first_center in enumerate(centers):
        for second_center in centers[position + 1:]:
            center_distances.append(matrix[first_center, second_center])

    return numpy.mean(center_distances)
def from_dic(cls, clustering_dic):
    """
    Builds a Clustering from the dictionary that describes it (the reverse
    operation of 'to_dic').

    @param clustering_dic: Dictionary whose "clusters" entry holds one
    cluster dictionary per cluster.

    @return: A Clustering holding the clusters described.
    """
    described_clusters = clustering_dic["clusters"]
    return Clustering([Cluster.from_dic(description)
                       for description in described_clusters])
def test_calculate_biased_medoid_scenario(self):
    """
    For three stored cluster descriptions, the recorded prototype must be
    the medoid calculated with the matrix kept in the test data folder.
    """
    scenarios = [
        {
            "prototype": 28,
            "elements": "0:46, 49, 51, 53, 57:58, 62:67",
            "id": "cluster_0"
        },
        {
            "prototype": 54,
            "elements": "0:117, 119:135, 138:139, 141, 143, 145:146, 148:150, 153, 155:156, 167:168, 170:172, 175, 177, 190, 193, 212, 215, 234",
            "id": "cluster_0"
        },
        {
            "prototype": 1604,
            "elements": "224, 290, 312, 334, 378, 422, 444, 466, 468, 488, 504, 526, 645, 782, 799, 821, 843, 953, 1208, 1254, 1276, 1291, 1313, 1320, 1357, 1445, 1450, 1467, 1472, 1489, 1494, 1516, 1538, 1560, 1582, 1591, 1604, 1613, 1626, 1635, 1671, 1693, 1767, 1789, 1811, 1833, 1841, 1855, 1877, 1899, 1921, 1943, 1965, 2007, 2049, 2070, 2091, 2112, 2203",
            "id": "cluster_18"
        },
    ]

    matrix_path = os.path.join(test_data.__path__[0], "matrix.npy")
    matrix = CondensedMatrix(list(numpy.asfarray(numpy.load(matrix_path))))

    for description in scenarios:
        cluster = Cluster.from_dic(description)
        self.assertEqual(cluster.prototype, cluster.calculate_medoid(matrix))
def test_to_dic(self):
    """Compare Cluster.to_dic output with hand-written dictionaries."""
    # (cluster, expected dictionary) pairs.
    cases = [
        (Cluster(0, [0, 4, 5, 7, 13]),
         {'prototype': 0, 'elements': '0, 4:5, 7, 13'}),
        (Cluster(1, [1, 16, 17, 18]),
         {'prototype': 1, 'elements': '1, 16:18'}),
        (Cluster(2, [2, 3, 8, 19]),
         {'prototype': 2, 'elements': '2:3, 8, 19'}),
        (Cluster(6, [6, 11, 12, 15]),
         {'prototype': 6, 'elements': '6, 11:12, 15'}),
        (Cluster(9, [9, 10, 14]),
         {'prototype': 9, 'elements': '9:10, 14'}),
    ]

    for cluster, expected_dic in cases:
        self.assertDictEqual(Cluster.to_dic(cluster), expected_dic)
def test_from_dic(self):
    """Cluster.from_dic must expand the element strings in the given order."""
    # (cluster dictionary, expected 'all_elements') pairs. The irregular
    # spacing inside the element strings is part of the test input.
    cases = [
        ({"prototype": 400, "elements": "400:410, 0, 1 ,2,3"},
         [400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410,
          0, 1, 2, 3]),
        ({"prototype": 500, "elements": "4,500:510, 5, 6:10, 11"},
         [4, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510,
          5, 6, 7, 8, 9, 10, 11]),
    ]

    for cluster_dic, expected_elements in cases:
        built = Cluster.from_dic(cluster_dic)
        self.assertEqual(built.all_elements, expected_elements)
def test_get_size(self):
    """A cluster built with five elements must report a size of 5."""
    five_elements = [0, 4, 5, 7, 13]
    cluster = Cluster(prototype=0, elements=five_elements)

    size = cluster.get_size()

    self.assertEqual(size, 5)
def test_random_sample(self):
    """Regression check of get_random_sample(10, 123) on a 100-element cluster."""
    cluster = Cluster(None, range(100))

    sample = cluster.get_random_sample(10, 123)

    expected_sample = [45, 66, 89, 62, 67, 51, 65, 56, 22, 77]
    self.assertItemsEqual(sample, expected_sample)