def setUpClass(cls):
    """
    Builds the shared test fixture: a condensed distance matrix and two
    alternative clusterings of the same 6 elements, with their medoids
    (prototypes) pre-computed.
    """
    cls.matrix = CondensedMatrix(squared_CH_table1)
    cls.clusterings = [Clustering([Cluster(None, [0, 1, 2, 3]),
                                   Cluster(None, [4, 5])]),
                       Clustering([Cluster(None, [0, 1]),
                                   Cluster(None, [2, 3]),
                                   Cluster(None, [4, 5])])]
    update_medoids(cls.clusterings[0], cls.matrix)
    # BUGFIX: the original updated clusterings[0] twice, leaving the
    # second clustering without prototypes.
    update_medoids(cls.clusterings[1], cls.matrix)
def test_update_medoids(self):
    """
    update_medoids must assign a prototype (the medoid) to every cluster.
    (Method renamed from the original 'test_update_medois' typo; unittest
    discovery still finds it via the 'test_' prefix.)
    """
    clusters = [Cluster(None, [1, 2]),
                Cluster(None, [3, 4]),
                Cluster(None, [5])]
    clustering = Clustering(clusters)
    matrix = CondensedMatrix(squared_CH_table1)
    update_medoids(clustering, matrix)
    # Every cluster must have received a prototype ...
    for c in clusters:
        self.assertNotEqual(c.prototype, None)
    # ... and the prototypes must be the expected medoids (order-insensitive).
    self.assertItemsEqual([c.prototype for c in clusters], [1, 3, 5])
def evaluate(self, clustering, matrix):
    """
    Calculates the index.

    @param clustering: The clustering being checked.

    @param matrix: The condensed matrix containing all distances.

    @return: The calculated value for the index.
    """
    update_medoids(clustering, matrix)
    num_clusters = len(clustering.clusters)
    # x2 as we use only half of the distances
    norm_factor = 2. / (num_clusters * (num_clusters - 1))
    return norm_factor * self.exponential_list_generation(clustering, matrix).sum()
def calculate_per_cluster_stats(best_clustering, matrix, parameters, results_folder):
    """
    Writes a CSV file with per-cluster statistics: for each cluster its
    diameter and radius, plus the upper triangle of medoid-to-medoid distances.

    @param best_clustering: The clustering whose clusters are summarized.
    @param matrix: The condensed matrix containing all distances.
    @param parameters: Parameter holder; the "file" value (default
    "per_cluster_stats") is used as the CSV base name.
    @param results_folder: Folder where the CSV file is written.

    @return: The path of the written CSV file.
    """
    file_name = parameters.get_value("file", default_value="per_cluster_stats") + ".csv"
    stats_file_path = os.path.join(results_folder, file_name)
    # BUGFIX: 'with' guarantees the file is closed even if a distance
    # computation raises (the original leaked the handle on error, and its
    # bare 'finally' could hit unbound locals on unexpected exceptions).
    with open(stats_file_path, "w") as stats_file:
        clusters = best_clustering.clusters
        header_line = "," + ",".join("%s" % c.id for c in clusters) + "\n"
        stats_file.write(header_line)
        # TODO: Once clusterings and clusters become immutable their medoids will
        # always be updated; then this kind of operation will be unnecessary.
        update_medoids(best_clustering, matrix)
        for i, cluster_i in enumerate(clusters):
            try:
                diameter = max(get_intra_cluster_distances(cluster_i, matrix))
                radius = max(get_distances_of_elements_to(cluster_i.prototype,
                                                          cluster_i.all_elements,
                                                          matrix))
            except SingularClusterException:
                # A single-element cluster has no internal distances.
                diameter = 0
                radius = 0
            line = "%s(d: %.2f r: %.2f)," % (cluster_i.id, diameter, radius)
            # Pad the lower triangle so distances align with the header columns.
            line += "," * (i + 1)
            for cluster_j in clusters[i + 1:]:
                line += "%.2f," % matrix[cluster_i.prototype, cluster_j.prototype]
            stats_file.write(line[:-1] + "\n")
    return stats_file_path
def calculate_per_cluster_stats(best_clustering, matrix, parameters, results_folder):
    """
    Writes a CSV summary of the clusters: each row holds a cluster's diameter
    and radius followed by its medoid's distances to the medoids of all later
    clusters (upper triangle only).

    @param best_clustering: The clustering whose clusters are summarized.
    @param matrix: The condensed matrix containing all distances.
    @param parameters: Parameter holder; the "file" value (default
    "per_cluster_stats") gives the CSV base name.
    @param results_folder: Destination folder for the CSV file.

    @return: The path of the written CSV file.
    """
    file_name = parameters.get_value("file", default_value="per_cluster_stats") + ".csv"
    stats_file_path = os.path.join(results_folder, file_name)
    # BUGFIX: use a context manager so the file is closed on any error path
    # (the original left the handle open on exceptions, and its bare
    # 'finally' referenced possibly-unbound locals).
    with open(stats_file_path, "w") as stats_file:
        clusters = best_clustering.clusters
        stats_file.write("," + ",".join("%s" % c.id for c in clusters) + "\n")
        # TODO: Once clusterings and clusters become immutable their medoids will
        # always be updated; then this kind of operation will be unnecessary.
        update_medoids(best_clustering, matrix)
        for i, cluster_i in enumerate(clusters):
            try:
                diameter = max(get_intra_cluster_distances(cluster_i, matrix))
                radius = max(get_distances_of_elements_to(cluster_i.prototype,
                                                          cluster_i.all_elements,
                                                          matrix))
            except SingularClusterException:
                # Singleton clusters have no internal distances.
                diameter = 0
                radius = 0
            line = "%s(d: %.2f r: %.2f)," % (cluster_i.id, diameter, radius)
            line += "," * (i + 1)  # lower-triangle padding
            for cluster_j in clusters[i + 1:]:
                line += "%.2f," % matrix[cluster_i.prototype, cluster_j.prototype]
            stats_file.write(line[:-1] + "\n")
    return stats_file_path
def evaluate(self, clustering, matrix):
    """
    Calculates the index value for a clustering.

    @param clustering: The clustering being checked.

    @param matrix: The condensed matrix containing all distances.

    @return: The calculated Davies-Bouldin estimator value.
    """
    # Medoids act as the cluster prototypes.
    update_medoids(clustering, matrix)
    # Average distance of each cluster's elements to its medoid.
    avg_distances = self.calc_average_distances(clustering, matrix)
    # Actual index computation from the per-cluster averages.
    return self.db_index_calculation(avg_distances, clustering.clusters, matrix)
def evaluate(self, clustering, matrix):
    """
    Calculates the index value for a clustering.

    @param clustering: The clustering being checked.

    @param matrix: The condensed matrix containing all distances.

    @return: The calculated Davies-Bouldin estimator value.
    """
    # Refresh prototypes: medoids play that role here.
    update_medoids(clustering, matrix)
    # Per-cluster average distance to the medoid, then the index itself.
    averages = self.calc_average_distances(clustering, matrix)
    db_value = self.db_index_calculation(averages, clustering.clusters, matrix)
    return db_value
def evaluate(self, clustering, matrix):
    """
    Calculates the index value; the cluster mean is approximated by the medoid.

    @param clustering: The clustering being checked.

    @param matrix: The condensed matrix containing all distances.

    @return: The calculated index value (Cmp).
    """
    update_medoids(clustering, matrix)
    # Variance of the distances from every clustered element to the
    # medoid of the whole dataset.
    global_cluster = Cluster(None, clustering.get_all_clustered_elements())
    global_cluster.prototype = global_cluster.calculate_medoid(matrix)
    global_variance = numpy.var(get_distances_of_elements_to(global_cluster.prototype,
                                                             global_cluster.all_elements,
                                                             matrix))
    # Sum of per-cluster variances, normalized by cluster count and
    # the global variance.
    per_cluster = [self.cluster_variance(c, matrix) for c in clustering.clusters]
    return numpy.sum(per_cluster) / (len(clustering.clusters) * global_variance)
def evaluate(self, clustering, matrix):
    """
    Calculates the index value for a clustering.

    @param clustering: The clustering being checked.

    @param matrix: The condensed matrix containing all distances.

    @return: The calculated Calinski-Harabasz (VRC) index.
    """
    # Cluster prototypes (medoids here) must be updated first.
    update_medoids(clustering, matrix)
    # Global mean of the squared distances, as in the paper's expanded formula.
    D = mean(matrix.get_data() ** 2)
    k = len(clustering.clusters)
    n = matrix.row_length
    A_k = CalinskiHarabaszCalculator.A_k(clustering, D, matrix)
    # VRC from D, A_k and the (n-k)/(k-1) degrees-of-freedom factor.
    return (D + (float(n - k) / (k - 1)) * A_k) / float(D - A_k)
def evaluate(self, clustering, matrix):
    """
    Calculates the index value for a clustering.

    :param clustering: The clustering being checked.

    :param matrix: The condensed matrix containing all distances.

    :return: The calculated Calinski-Harabasz (VRC) index.
    """
    # Prototypes (medoids) have to be refreshed before evaluation.
    update_medoids(clustering, matrix)
    # We follow the paper's expanded formula: first the global mean of
    # squared distances, then the A_k term.
    sq_dist_mean = mean(matrix.get_data() ** 2)
    num_clusters = len(clustering.clusters)
    num_elements = matrix.row_length
    A_k = CalinskiHarabaszCalculator.A_k(clustering, sq_dist_mean, matrix)
    dof_factor = float(num_elements - num_clusters) / (num_clusters - 1)
    return (sq_dist_mean + dof_factor * A_k) / float(sq_dist_mean - A_k)