示例#1
0
 def setUpClass(cls):
     """
     Builds the fixtures shared by the tests of this class: the condensed
     matrix and two clusterings of the elements 0 to 5, each of them with
     its medoids already updated.
     """
     cls.matrix = CondensedMatrix(squared_CH_table1)

     cls.clusterings = [Clustering([Cluster(None, [0,1,2,3]), Cluster(None, [4,5])]),
                         Clustering([Cluster(None, [0,1]), Cluster(None, [2,3]), Cluster(None, [4,5])])]
     # Every clustering needs its medoids: the former code updated
     # clusterings[0] twice and never touched clusterings[1].
     for clustering in cls.clusterings:
         update_medoids(clustering, cls.matrix)
示例#2
0
 def test_update_medois(self):
     """
     After calling update_medoids every cluster must have a prototype, and
     the prototypes must be 1, 3 and 5 (order is not checked).
     """
     clusters = [Cluster(None, [1,2]),
                 Cluster(None, [3,4]),
                 Cluster(None, [5])]
     clustering = Clustering(clusters)
     matrix = CondensedMatrix(squared_CH_table1)

     update_medoids(clustering, matrix)

     # No cluster may be left without prototype
     prototypes = [cluster.prototype for cluster in clusters]
     for prototype in prototypes:
         self.assertNotEqual(prototype, None)

     self.assertItemsEqual(prototypes, [1,3,5])
示例#3
0
 def evaluate(self, clustering, matrix):
     """
     Calculates the index.
     @param clustering: The clustering being checked.
     @param matrix: The condensed matrix containing all distances.
     @return: The calculated value for the index.
     """
     # Prototypes are refreshed before anything else is computed
     update_medoids(clustering, matrix)

     number_of_clusters = len(clustering.clusters)
     # The factor is 2 because only half of the distances are used
     pair_factor = 2. / (number_of_clusters * (number_of_clusters - 1))

     exponential_sum = self.exponential_list_generation(clustering, matrix).sum()

     return pair_factor * exponential_sum
示例#4
0
def calculate_per_cluster_stats(best_clustering, matrix, parameters,
                                results_folder):
    """
    Writes a CSV file with per-cluster statistics and returns its path.

    The first row is an empty cell followed by the id of every cluster.
    Each of the following rows describes one cluster: a first cell with
    the form "<id>(d: <diameter> r: <radius>)", then empty cells up to and
    including the cluster's own column, and finally the matrix value for
    the prototype of this cluster and the prototype of each later cluster.
    Diameter is the maximum intra cluster distance and radius the maximum
    distance from the prototype to the cluster elements; both are 0 when
    SingularClusterException is raised while computing them.

    @param best_clustering: The clustering whose clusters are described.
    @param matrix: The distance matrix, indexed with pairs of elements.
    @param parameters: Object whose get_value("file", default_value=...)
    gives the file name without extension.
    @param results_folder: Folder where the file is created.
    @return: The path of the written file.
    """
    file_name = parameters.get_value(
        "file", default_value="per_cluster_stats") + ".csv"
    stats_file_path = os.path.join(results_folder, file_name)

    # The 'with' block closes the file even if something below raises
    with open(stats_file_path, "w") as stats_file:
        # The leading empty field is the column of the row labels
        header_fields = [""]
        header_fields.extend(
            "%s" % cluster.id for cluster in best_clustering.clusters)
        stats_file.write(",".join(header_fields) + "\n")

        # TODO: Once clusterings and clusters become inmutable its medoids will be always updated,
        # then this kind of operations will be unnecessary
        update_medoids(best_clustering, matrix)
        #----------------------------------------

        clusters = best_clustering.clusters
        for i, cluster_i in enumerate(clusters):
            try:
                intra_distances = get_intra_cluster_distances(
                    cluster_i, matrix)
                diameter = max(intra_distances)
                distances_from_proto = get_distances_of_elements_to(
                    cluster_i.prototype, cluster_i.all_elements, matrix)
                radius = max(distances_from_proto)
            except SingularClusterException:
                diameter = 0
                radius = 0

            # Built after the try (not in a 'finally') so that an unexpected
            # exception is propagated instead of formatting unset values
            fields = ["%s(d: %.2f r: %.2f)" % (cluster_i.id, diameter, radius)]

            # Columns up to (and including) the diagonal are left empty
            fields.extend([""] * (i + 1))

            for j in range(i + 1, len(clusters)):
                cluster_j = clusters[j]
                fields.append(
                    "%.2f" % matrix[cluster_i.prototype, cluster_j.prototype])

            stats_file.write(",".join(fields) + "\n")

    return stats_file_path
示例#5
0
def calculate_per_cluster_stats(best_clustering, matrix, parameters, results_folder):
    """
    Writes a CSV file with per-cluster statistics and returns its path.

    The first row is an empty cell followed by the id of every cluster.
    Each of the following rows describes one cluster: a first cell with
    the form "<id>(d: <diameter> r: <radius>)", then empty cells up to and
    including the cluster's own column, and finally the matrix value for
    the prototype of this cluster and the prototype of each later cluster.
    Diameter is the maximum intra cluster distance and radius the maximum
    distance from the prototype to the cluster elements; both are 0 when
    SingularClusterException is raised while computing them.

    @param best_clustering: The clustering whose clusters are described.
    @param matrix: The distance matrix, indexed with pairs of elements.
    @param parameters: Object whose get_value("file", default_value=...)
    gives the file name without extension.
    @param results_folder: Folder where the file is created.
    @return: The path of the written file.
    """
    file_name = parameters.get_value("file", default_value = "per_cluster_stats") + ".csv"
    stats_file_path = os.path.join(results_folder,file_name)

    # Using 'with' guarantees the file is closed if any step below fails
    with open(stats_file_path,"w") as stats_file:
        # First field is empty: it is the column holding the row labels
        header_fields = [""] + ["%s"%cluster.id for cluster in best_clustering.clusters]
        stats_file.write(",".join(header_fields) + "\n")

        # TODO: Once clusterings and clusters become inmutable its medoids will be always updated,
        # then this kind of operations will be unnecessary
        update_medoids(best_clustering, matrix)
        #----------------------------------------

        clusters = best_clustering.clusters
        for i, cluster_i in enumerate(clusters):
            try:
                intra_distances = get_intra_cluster_distances(cluster_i, matrix)
                diameter = max(intra_distances)
                distances_from_proto = get_distances_of_elements_to(cluster_i.prototype,
                                                                    cluster_i.all_elements,
                                                                    matrix)
                radius = max(distances_from_proto)
            except SingularClusterException:
                diameter = 0
                radius = 0

            # The row is formatted outside of the try statement: inside a
            # 'finally' it would also run for unexpected exceptions, where
            # diameter and radius may be unset or belong to a former cluster
            fields = ["%s(d: %.2f r: %.2f)"%(cluster_i.id, diameter, radius)]

            # Empty cells up to (and including) the diagonal
            fields.extend([""] * (i+1))

            for j in range(i+1, len(clusters)):
                cluster_j = clusters[j]
                fields.append("%.2f"%matrix[cluster_i.prototype, cluster_j.prototype])

            stats_file.write(",".join(fields) + "\n")

    return stats_file_path
示例#6
0
 def evaluate(self, clustering, matrix):
     """
     Calculates the index value for a clustering.
     @param clustering: The clustering being checked.
     @param matrix: The condensed matrix containing all distances.
     @return: The calculated Davies-Bouldin estimator value.
     """
     # Prototypes are set to the medoids before measuring anything
     update_medoids(clustering, matrix)

     # Per cluster average distances, then the index itself
     per_cluster_averages = self.calc_average_distances(clustering, matrix)
     return self.db_index_calculation(per_cluster_averages,
                                      clustering.clusters,
                                      matrix)
示例#7
0
    def evaluate(self, clustering, matrix):
        """
        Calculates the index value for a clustering.
        @param clustering: The clustering being checked.
        @param matrix: The condensed matrix containing all distances.
        @return: The calculated Davies-Bouldin estimator value.
        """
        # Prototypes are set to the medoids before measuring anything
        update_medoids(clustering, matrix)

        # Per cluster average distances, then the index itself
        per_cluster_averages = self.calc_average_distances(clustering, matrix)
        return self.db_index_calculation(per_cluster_averages,
                                         clustering.clusters,
                                         matrix)
示例#8
0
    def evaluate(self, clustering, matrix):
        """
        Calculates the index value for a clustering (mean is approximated
        to medoid).
        @param clustering: The clustering being checked.
        @param matrix: The condensed matrix containing all distances.
        @return: The sum of the cluster variances divided by the number of
        clusters times the variance of the whole set of clustered elements.
        """
        update_medoids(clustering, matrix)

        # One cluster holding every clustered element, used as reference
        whole_set = Cluster(None, clustering.get_all_clustered_elements())
        whole_set.prototype = whole_set.calculate_medoid(matrix)
        distances_to_global_medoid = get_distances_of_elements_to(whole_set.prototype,
                                                                  whole_set.all_elements,
                                                                  matrix)
        global_variance = numpy.var(distances_to_global_medoid)

        # Variances of the clusters
        sum_ci = numpy.sum([self.cluster_variance(cluster,matrix)
                            for cluster in clustering.clusters])

        return sum_ci / (len(clustering.clusters)*global_variance)
示例#9
0
    def evaluate(self, clustering, matrix):
        """
        Calculates the index value for a clustering.
        @param clustering: The clustering being checked.
        @param matrix: The condensed matrix containing all distances.
        @return: The calculated Calinski-Harabasz (VRC) index.
        """
        # Cluster prototypes (medoids here) must be updated
        update_medoids(clustering, matrix)

        # Expanded formula of the paper: it starts from the global mean of
        # the squared distances
        D = mean(matrix.get_data()**2)

        k = len(clustering.clusters)
        n = matrix.row_length

        A_k = CalinskiHarabaszCalculator.A_k(clustering, D, matrix)

        # VRC = (D + ((n - k) / (k - 1)) * A_k) / (D - A_k)
        size_ratio = float(n - k) / (k - 1)
        numerator = D + size_ratio * A_k
        denominator = float((D - A_k))
        return numerator / denominator
示例#10
0
    def evaluate(self, clustering, matrix):
        """
        Calculates the index value for a clustering.
        :param clustering: The clustering being checked.
        :param matrix: The condensed matrix containing all distances.
        :return: The calculated Calinski-Harabasz (VRC) index.
        """
        # Cluster prototypes (medoids here) must be updated
        update_medoids(clustering, matrix)

        # Expanded formula of the paper: it starts from the global mean of
        # the squared distances
        D = mean(matrix.get_data()**2)

        k = len(clustering.clusters)
        n = matrix.row_length

        A_k = CalinskiHarabaszCalculator.A_k(clustering, D, matrix)

        # VRC = (D + ((n - k) / (k - 1)) * A_k) / (D - A_k)
        size_ratio = float(n - k) / (k - 1)
        numerator = D + size_ratio * A_k
        denominator = float((D - A_k))
        return numerator / denominator