Exemplo n.º 1
0
 def test_get_intra_cluster_distances(self):
     """
     Checks get_intra_cluster_distances against precomputed values for two
     clusters over CH_table1, and against the total distance of a cluster
     spanning a small hand-written condensed matrix.
     """
     ch_matrix = CondensedMatrix(CH_table1)

     # Two elements: a single internal distance is expected.
     pair_distances = get_intra_cluster_distances(Cluster(None, [4, 5]), ch_matrix)
     numpy.testing.assert_almost_equal(pair_distances, [2.4494897427831779], 5)

     # Three elements: three internal distances are expected.
     triplet_distances = get_intra_cluster_distances(Cluster(None, [1, 3, 5]), ch_matrix)
     numpy.testing.assert_almost_equal(
         triplet_distances,
         [2.4494897427831779, 3.8729833462074170, 3.8729833462074170],
         5)

     # Condensed matrix with ten entries, four of which are 1.0; the
     # distances of the cluster holding elements 0..4 must add up to 4.
     condensed_values = [0.0, 1.0, 1.0, 1.0,
                              1.0, 0.0, 0.0,
                                   0.0, 0.0,
                                        0.0]
     small_matrix = CondensedMatrix(condensed_values)
     whole_cluster = Cluster(None, range(5))
     total_distance = numpy.sum(get_intra_cluster_distances(whole_cluster, small_matrix))
     self.assertEqual(4, total_distance)
Exemplo n.º 2
0
def calculate_per_cluster_stats(best_clustering, matrix, parameters,
                                results_folder):
    """
    Writes a CSV file with per-cluster statistics and returns its path.

    The file is named after the "file" parameter (default
    "per_cluster_stats") plus ".csv" and is created inside results_folder.

    Layout:
      - Header: one empty cell followed by the id of every cluster.
      - Row i: "<id>(<radius>)", where radius is the maximum of the
        intra-cluster distances of cluster i (0 if it has none); then empty
        cells up to and including the diagonal; then, for every cluster j > i,
        matrix[prototype_i, prototype_j] with two decimals.

    @param best_clustering: Object exposing the clusters through '.clusters'.
    @param matrix: Distance matrix, used by get_intra_cluster_distances and
    indexed with pairs of prototypes.
    @param parameters: Object with a 'get_value(key, default_value=...)' method.
    @param results_folder: Folder where the file is written.
    @return: The path of the written file.
    """
    file_name = parameters.get_value(
        "file", default_value="per_cluster_stats") + ".csv"
    stats_file_path = os.path.join(results_folder, file_name)

    # The context manager guarantees the file is closed even if one of the
    # distance helpers raises while the rows are being produced.
    with open(stats_file_path, "w") as stats_file:
        clusters = best_clustering.clusters

        # Joining with a leading empty cell yields ",id1,id2" for a populated
        # clustering and an empty line when there are no clusters.
        header_cells = [""] + ["%s" % cluster.id for cluster in clusters]
        stats_file.write(",".join(header_cells) + "\n")

        for i, cluster_i in enumerate(clusters):
            intra_distances = get_intra_cluster_distances(cluster_i, matrix)
            # Length check instead of '!= []' so that any empty sequence
            # (not only a list) is treated as "no internal distances".
            radius = max(intra_distances) if len(intra_distances) > 0 else 0.

            cells = ["%s(%.2f)" % (cluster_i.id, radius)]
            # Blank cells for the lower triangle and the diagonal.
            cells.extend([""] * (i + 1))
            # Upper triangle: distances between cluster prototypes.
            cells.extend(
                "%.2f" % matrix[cluster_i.prototype, cluster_j.prototype]
                for cluster_j in clusters[i + 1:])

            stats_file.write(",".join(cells) + "\n")

    return stats_file_path
Exemplo n.º 3
0
 def min_intracluster_distances(cls, clustering, matrix):
     """
     Calculates d_min, the minimum internal distance.

     A cluster with fewer than two elements has no internal distances, so
     taking the minimum of its (empty) distance list would raise. By
     convention such a cluster contributes a distance of 0.

     @param clustering: The clustering being checked.
     @param matrix: The condensed matrix containing all distances.
     @return: d_min's value
     """
     distances = []
     for c in clustering.clusters:
         if len(c.all_elements) < 2:
             # No pairs inside the cluster: use 0 instead of calling
             # numpy.min on an empty sequence.
             distances.append(0)
         else:
             distances.append(
                 numpy.min(get_intra_cluster_distances(c, matrix)))
     return numpy.min(distances)
Exemplo n.º 4
0
 def ch_cluster_term(cls, cluster, global_mean_distance, matrix):
     """
     Calculates one of the formula terms (ng-1)(D-d_g)

     d_g is computed as the mean of the squared intra-cluster distances. A
     cluster with a single element has no internal distances; its mean
     distance is taken as 0 so the term is a clean 0 (its weight ng-1 is 0)
     instead of depending on the mean of an empty array.

     @param cluster: The cluster to use in calculation.
     @param global_mean_distance: 'D'. Is the mean of the n*(n-1)/2 distances of all the elements.
     @param matrix: The condensed matrix containing all distances.
     @return: Calculated term.
     """
     n = len(cluster.all_elements)
     if n == 1:
         # Singleton: nothing to average.
         cluster_mean_distance = 0
     else:
         # Calculate cluster mean (squared) distance
         cluster_mean_distance = mean(
             numpy.array(get_intra_cluster_distances(cluster, matrix))**2)
     return (n - 1) * (global_mean_distance - cluster_mean_distance)
Exemplo n.º 5
0
 def WGSS(cls, clusters, matrix):
     """
     C-H description of the "Within group sum of squares".

     Computed as half the sum, over all clusters, of (n - 1) times the mean of
     the squared intra-cluster distances, n being the cluster size. Clusters
     with fewer than two elements have no internal distances and are skipped,
     so they cannot turn the whole sum into NaN (or abort it).

     @param clusters: An array with all clusters description (usually Clustering.clusters)
     @param matrix: The condensed matrix containing all distances.
     @return: The value of WGSS.
     """
     wgss = 0
     for c in clusters:
         n = len(c.all_elements)
         if n < 2:
             # No pairs inside this cluster: it contributes nothing.
             continue
         d = mean(numpy.array(get_intra_cluster_distances(c, matrix))**2)
         wgss += (n - 1) * d
     return wgss * 0.5
Exemplo n.º 6
0
 def WGSS(cls, clusters, matrix):
     """
     C-H description of the "Within group sum of squares".

     The result is half the sum, over all clusters, of (n - 1) times the mean
     of the squared intra-cluster distances, where n is the cluster size.
     Clusters with fewer than two elements hold no internal distances, so
     they are skipped instead of feeding an empty distance list to the mean.

     :param clusters: An array with all clusters description (usually Clustering.clusters)
     :param matrix: The condensed matrix containing all distances.
     :return: The value of WGSS.
     """
     wgss = 0
     for c in clusters:
         n = len(c.all_elements)
         if n < 2:
             # Nothing to add: the cluster has no internal pairs.
             continue
         d = mean(numpy.array(get_intra_cluster_distances(c, matrix))**2)
         wgss += (n - 1) * d
     return wgss * 0.5
Exemplo n.º 7
0
def calculate_per_cluster_stats(best_clustering, matrix, parameters,
                                results_folder):
    """
    Writes a CSV file with per-cluster statistics and returns its path.

    The file is named after the "file" parameter (default
    "per_cluster_stats") plus ".csv" and is created inside results_folder.

    Layout:
      - Header: one empty cell followed by the id of every cluster.
      - Row i: "<id>(d: <diameter> r: <radius>)", then empty cells up to and
        including the diagonal, then, for every cluster j > i,
        matrix[prototype_i, prototype_j] with two decimals.

    'diameter' is the maximum of the intra-cluster distances and 'radius' the
    maximum of the values returned by get_distances_of_elements_to for the
    cluster prototype and its elements. Both are 0 when a
    SingularClusterException is raised.

    Medoids are refreshed with update_medoids before the rows are written.

    @param best_clustering: Object exposing the clusters through '.clusters'.
    @param matrix: Distance matrix, handed to the distance helpers and indexed
    with pairs of prototypes.
    @param parameters: Object with a 'get_value(key, default_value=...)' method.
    @param results_folder: Folder where the file is written.
    @return: The path of the written file.
    """
    file_name = parameters.get_value(
        "file", default_value="per_cluster_stats") + ".csv"
    stats_file_path = os.path.join(results_folder, file_name)

    # The context manager guarantees the file is closed even if one of the
    # helpers raises while the contents are being produced.
    with open(stats_file_path, "w") as stats_file:
        # Joining with a leading empty cell yields ",id1,id2" for a populated
        # clustering and an empty line when there are no clusters.
        header_cells = [""] + [
            "%s" % cluster.id for cluster in best_clustering.clusters]
        stats_file.write(",".join(header_cells) + "\n")

        # TODO: Once clusterings and clusters become immutable its medoids will be always updated,
        # then this kind of operations will be unnecessary
        update_medoids(best_clustering, matrix)
        #----------------------------------------

        # Read the clusters only after the medoid update, as the original did.
        clusters = best_clustering.clusters
        for i, cluster_i in enumerate(clusters):
            try:
                diameter = max(get_intra_cluster_distances(cluster_i, matrix))
                radius = max(get_distances_of_elements_to(
                    cluster_i.prototype, cluster_i.all_elements, matrix))
            except SingularClusterException:
                diameter = 0
                radius = 0

            # Built after the try block (not in a 'finally'), so an unexpected
            # exception propagates untouched instead of tripping over unbound
            # or stale 'diameter'/'radius' values.
            cells = ["%s(d: %.2f r: %.2f)" % (cluster_i.id, diameter, radius)]
            # Blank cells for the lower triangle and the diagonal.
            cells.extend([""] * (i + 1))
            # Upper triangle: distances between cluster prototypes.
            cells.extend(
                "%.2f" % matrix[cluster_i.prototype, cluster_j.prototype]
                for cluster_j in clusters[i + 1:])

            stats_file.write(",".join(cells) + "\n")

    return stats_file_path
Exemplo n.º 8
0
 def ch_cluster_term(cls, cluster, global_mean_distance, matrix):
     """
     Computes the (ng-1)(D-d_g) term of the formula for one cluster.

     d_g is the mean of the squared intra-cluster distances; it is taken as 0
     when the cluster is reported as singular.

     :param cluster: The cluster to use in calculation.
     :param global_mean_distance: 'D'. Is the mean of the n*(n-1)/2 distances of all the elements.
     :param matrix: The condensed matrix containing all distances.
     :return: Calculated term.
     """
     cluster_size = len(cluster.all_elements)

     try:
         squared_distances = numpy.array(get_intra_cluster_distances(cluster, matrix)) ** 2
         d_g = mean(squared_distances)
     except SingularClusterException:
         # No internal distances available for this cluster.
         d_g = 0

     weight = cluster_size - 1
     return weight * (global_mean_distance - d_g)
Exemplo n.º 9
0
def calculate_per_cluster_stats(best_clustering, matrix, parameters, results_folder):
    """
    Writes a CSV file with per-cluster statistics and returns its path.

    The file name is the "file" parameter (default "per_cluster_stats") plus
    ".csv"; it is created inside results_folder.

    Layout:
      - Header: one empty cell followed by the id of every cluster.
      - Row i: "<id>(d: <diameter> r: <radius>)", then empty cells up to and
        including the diagonal, then, for every cluster j > i,
        matrix[prototype_i, prototype_j] with two decimals.

    'diameter' is the maximum of the intra-cluster distances and 'radius' the
    maximum of the values returned by get_distances_of_elements_to for the
    cluster prototype and its elements. Both are 0 when a
    SingularClusterException is raised.

    Medoids are refreshed with update_medoids before the rows are written.

    :param best_clustering: Object exposing the clusters through '.clusters'.
    :param matrix: Distance matrix, handed to the distance helpers and indexed
        with pairs of prototypes.
    :param parameters: Object with a 'get_value(key, default_value=...)' method.
    :param results_folder: Folder where the file is written.
    :return: The path of the written file.
    """
    file_name = parameters.get_value("file", default_value="per_cluster_stats") + ".csv"
    stats_file_path = os.path.join(results_folder, file_name)

    # 'with' closes the file on every exit path, including exceptions raised
    # by the helpers called below.
    with open(stats_file_path, "w") as stats_file:
        # A leading empty cell gives ",id1,id2"; with no clusters the header
        # is an empty line.
        header_cells = [""]
        header_cells.extend("%s" % cluster.id for cluster in best_clustering.clusters)
        stats_file.write(",".join(header_cells) + "\n")

        # TODO: Once clusterings and clusters become immutable its medoids will be always updated,
        # then this kind of operations will be unnecessary
        update_medoids(best_clustering, matrix)
        #----------------------------------------

        # Clusters are looked up after the medoid update, as in the original.
        clusters = best_clustering.clusters
        for i, cluster_i in enumerate(clusters):
            try:
                intra_distances = get_intra_cluster_distances(cluster_i, matrix)
                diameter = max(intra_distances)
                distances_from_proto = get_distances_of_elements_to(cluster_i.prototype,
                                                                    cluster_i.all_elements,
                                                                    matrix)
                radius = max(distances_from_proto)
            except SingularClusterException:
                diameter = 0
                radius = 0

            # The label is formatted outside the try statement: placing it in
            # a 'finally' clause made any unexpected error run it with unbound
            # or stale values, hiding the original exception.
            row_cells = ["%s(d: %.2f r: %.2f)" % (cluster_i.id, diameter, radius)]
            # Empty cells for the lower triangle plus the diagonal.
            row_cells.extend([""] * (i + 1))
            # Prototype-to-prototype distances for the upper triangle.
            row_cells.extend("%.2f" % matrix[cluster_i.prototype, cluster_j.prototype]
                             for cluster_j in clusters[i + 1:])

            stats_file.write(",".join(row_cells) + "\n")

    return stats_file_path
Exemplo n.º 10
0
 def min_intracluster_distances(cls, clustering, matrix):
     """
     Computes d_min: the smallest of the per-cluster minimum internal
     distances.
     @param clustering: The clustering being checked.
     @param matrix: The condensed matrix containing all distances.
     @return: d_min's value
     """
     def smallest_internal_distance(cluster):
         # A singular cluster has no internal distances to minimize; by
         # convention its distance is 0, which keeps the min functions
         # from failing.
         try:
             return numpy.min(get_intra_cluster_distances(cluster, matrix))
         except SingularClusterException:
             return 0

     return numpy.min([smallest_internal_distance(c) for c in clustering.clusters])
Exemplo n.º 11
0
 def min_intracluster_distances(cls, clustering, matrix):
     """
     Returns d_min, i.e. the minimum over all clusters of each cluster's
     minimum internal distance.
     @param clustering: The clustering being checked.
     @param matrix: The condensed matrix containing all distances.
     @return: d_min's value
     """
     per_cluster_minima = []
     for cluster in clustering.clusters:
         try:
             cluster_minimum = numpy.min(
                 get_intra_cluster_distances(cluster, matrix))
         except SingularClusterException:
             # Convention: a cluster with only one element has distance 0,
             # so that no min function fails.
             cluster_minimum = 0
         per_cluster_minima.append(cluster_minimum)
     return numpy.min(per_cluster_minima)