示例#1
0
 def __init__(self, data, linkage='ward', num_clusters=1):
     """
     Initialise the clusterer from a square matrix C{data}.

     @param data: matrix stored as the distance matrix. Its diagonal is
         overwritten with C{numpy.inf} IN PLACE, so the caller's object
         is modified as well.
     @param linkage: value handed to C{linkage_fn}; the result is stored
         on C{self.linkage}.
     @param num_clusters: stored on C{self._num_clusters}.
     """
     # Every row index of ``data`` starts out as its own singleton cluster.
     singleton_ids = [[row] for row in range(len(data))]
     # Presumably the infinite diagonal keeps a cluster from being picked
     # as its own nearest neighbour -- confirm against the linkage code.
     numpy.fill_diagonal(data, numpy.inf)
     self._num_clusters = num_clusters
     self._dendrogram = Dendrogram(singleton_ids)
     self._dist_matrix = data
     self.linkage = linkage_fn(linkage)
示例#2
0
 def __init__(self, data, dist_metric=euclidean_distance,
              linkage=ward_link, num_clusters=1):
     """
     Initialise the clusterer state.

     @param data: stored unchanged on C{self._dist_matrix}; one singleton
         id list is created for each of its C{len(data)} entries.
     @param dist_metric: accepted for interface compatibility; this
         constructor does not use it.
     @param linkage: stored as-is on C{self.linkage}.
     @param num_clusters: stored on C{self._num_clusters}.
     """
     self.linkage = linkage
     self._dist_matrix = data
     self._num_clusters = num_clusters
     # Each entry starts as a singleton cluster identified by its index.
     self._dendrogram = Dendrogram([[index] for index in range(len(data))])
示例#3
0
 def __init__(self, data, linkage='ward', num_clusters=1):
     """
     Set up the dendrogram, distance matrix and linkage function.

     @param data: matrix kept as C{self._dist_matrix}. NOTE: its diagonal
         is set to C{numpy.inf} in place, which also changes the object
         the caller passed in.
     @param linkage: passed through C{linkage_fn}; the returned value
         becomes C{self.linkage}.
     @param num_clusters: kept as C{self._num_clusters}.
     """
     self._num_clusters = num_clusters
     # One single-element id list per entry of ``data``.
     initial_ids = [[position] for position in range(len(data))]
     self._dendrogram = Dendrogram(initial_ids)
     numpy.fill_diagonal(data, numpy.inf)
     self._dist_matrix = data
     self.linkage = linkage_fn(linkage)
示例#4
0
 def __init__(self, data, dist_metric=euclidean_distance,
              linkage=ward_link, num_clusters=1):
     """
     Record the clustering inputs and build the initial dendrogram.

     @param data: kept unchanged as C{self._dist_matrix}.
     @param dist_metric: not used by this constructor (kept so existing
         callers that pass it keep working).
     @param linkage: kept unchanged as C{self.linkage}.
     @param num_clusters: kept as C{self._num_clusters}.
     """
     # The dendrogram starts with one singleton id list per entry.
     leaf_ids = [[leaf] for leaf in range(len(data))]
     self._dendrogram = Dendrogram(leaf_ids)
     self._num_clusters = num_clusters
     self._dist_matrix = data
     self.linkage = linkage
  def cluster(self):
    """
    Merge clusters until at most ``self.k`` remain or no merge happens.

    Every point in ``self.points`` starts as its own ``Cluster``. Each
    successful merge adds one level to the dendrogram, keyed by the
    string form of the value returned by ``merge_best_candidates()``.

    Returns the ``Dendrogram`` holding the initial level plus one level
    per successful merge.
    """
    singletons = [Cluster(point) for point in self.points]

    dendrogram = Dendrogram("Goodness")
    # The starting level is keyed with float infinity (later levels use str).
    dendrogram.add_level(float('inf'), singletons)

    measure = MergeGoodnessMeasure(self.th)
    clusters = RockClusters(singletons, self.link_matrix, measure)

    remaining = clusters.size()
    while remaining > self.k:
      goodness = clusters.merge_best_candidates()
      previous, remaining = remaining, clusters.size()
      # An unchanged cluster count means nothing was merged: stop here.
      if previous == remaining:
        break
      dendrogram.add_level(str(goodness), clusters.get_all_clusters())

    return dendrogram








# require 'jaccard_coefficient'
# require 'link_matrix'
# require 'cluster'
# require 'dendrogram'
# require 'merge_goodness_measure'
# require 'rock_clusters'

# class RockAlgorithm
#   attr_reader :similarity_measure, :points, :th, :link_matrix, :k
  
#   def initialize(points, k, th)
#     @points = points
#     @k = k
#     @th = th
#     similarity_measure = JaccardCoefficient.new
#     @link_matrix = LinkMatrix.new points, similarity_measure, th
#   end
  
#   def cluster
#     initial_clusters = []
#     points.each do |point|
#       initial_clusters.push Cluster.new(point)
#     end
    
#     dnd = Dendrogram.new 'Goodness'
#     dnd.add_level('inf', initial_clusters)
    
#     goodness_measure = MergeGoodnessMeasure.new th
    
#     all_clusters = RockClusters.new initial_clusters, link_matrix, goodness_measure
    
#     n_clusters = all_clusters.size
    
#     while n_clusters > k
#       n_clusters_before_merge = n_clusters
#       g = all_clusters.merge_best_candidates
#       n_clusters = all_clusters.size
#       # No linked clusters to merge
#       break if (n_clusters == n_clusters_before_merge)
#       dnd.add_level(g.to_s, all_clusters.get_all_clusters)
      
#       puts "Number of clusters: #{all_clusters.get_all_clusters.size}"
#     end
    
#     #all_clusters.cluster_map.each do |k, c|
#     #  puts c.get_elements.inspect
#     #end
    
#     dnd
#   end
  
# end
示例#6
0
class Clusterer(AbstractClusterer):
    """
    The Hierarchical Agglomerative Clusterer starts with each of the N vectors
    as singleton clusters. It then iteratively merges the pair of clusters
    with the smallest entry in the distance matrix, updating the matrix with
    the function LINKAGE after every merge. This continues until
    max(num_clusters, 1) clusters remain.
    """
    def __init__(self,
                 data,
                 dist_metric=euclidean_distance,
                 linkage=ward_link,
                 num_clusters=1):
        """
        @param data: stored unchanged as the distance matrix; one singleton
            cluster is created per C{len(data)} entry.
        @param dist_metric: accepted for interface compatibility; not used
            by this class.
        @param linkage: callable invoked as
            C{linkage(clusters, i, j, dendrogram)} by L{update_distmatrix}.
        @param num_clusters: number of clusters at which merging stops.
        """
        self._num_clusters = num_clusters
        vector_ids = [[i] for i in range(len(data))]
        self._dendrogram = Dendrogram(vector_ids)
        self._dist_matrix = data
        self.linkage = linkage

    def smallest_distance(self, clusters):
        """
        Return the smallest distance in the distance matrix.
        The smallest distance depends on the possible connections in
        the distance matrix. NaN entries are skipped (numpy.nanargmin).

        @param clusters: an object of the class L{DistanceMatrix} holding the
            clusters at a specific state in the clustering procedure.
        @type clusters: L{DistanceMatrix}
        @return: a tuple containing the smallest distance and the indexes of
            the clusters yielding the smallest distance.
        """
        i, j = numpy.unravel_index(numpy.nanargmin(clusters), clusters.shape)
        return clusters[i, j], i, j

    def cluster(self, verbose=0, sum_ess=False):
        """
        Cluster all clusters hierarchically until the level of
        num_clusters is obtained.

        @param verbose: how much output is produced during the clustering (0-2)
        @type verbose: C{int}
        @param sum_ess: if true, the merge distances are accumulated over the
            iterations; otherwise each merge records only its own distance.
        @type sum_ess: C{bool}

        @return: None, destructive method: the dendrogram is updated in
            place. The stored distance matrix is worked on via C{copy.copy}.
        """
        ## if sum_ess and self.linkage.__name__ != "ward_link":
        ##     raise ValueError(
        ##         "Summing for method other than Ward makes no sense...")
        clusters = copy.copy(self._dist_matrix)
        summed_ess = 0.0

        while len(clusters) > max(self._num_clusters, 1):
            if verbose >= 1:
                # Parenthesised single-argument form prints identically on
                # Python 2 and is valid syntax on Python 3.
                print('k=%s' % len(clusters))
                if verbose == 2:
                    print(clusters)

            best, i, j = self.smallest_distance(clusters)
            # In Ward (1963) ess is summed at each iteration
            # in R's hclust and Python's hcluster and some text books it is not.
            # Here it is optional...
            if sum_ess:
                summed_ess += best
            else:
                summed_ess = best
            clusters = self.update_distmatrix(i, j, clusters)
            self._dendrogram.merge(i, j)
            self._dendrogram._items[i].distance = summed_ess
            # Cluster j is dropped from the matrix after the merge into i.
            clusters = clusters.remove(j)

    def update_distmatrix(self, i, j, clusters):
        """
        Update the distance matrix using the specified linkage method so that
        it represents the correct distances to the newly formed cluster.
        """
        return self.linkage(clusters, i, j, self._dendrogram)

    def dendrogram(self):
        """Return the dendrogram object."""
        return self._dendrogram

    def num_clusters(self):
        """Return the requested number of clusters."""
        return self._num_clusters

    def __repr__(self):
        return """<Hierarchical Agglomerative Clusterer(linkage method: %r,
                  n=%d clusters>""" % (self.linkage.__name__,
                                       self._num_clusters)
示例#7
0
class Clusterer(AbstractClusterer):
    """
    The Hierarchical Agglomerative Clusterer starts with each of the N vectors
    as singleton clusters. It then iteratively merges the pair of clusters
    with the smallest entry in the distance matrix, updating the matrix with
    the function LINKAGE after every merge. This continues until
    max(num_clusters, 1) clusters remain.
    """
    def __init__(self, data, linkage='ward', num_clusters=1):
        """
        @param data: square distance matrix, stored as-is.
            NOTE(review): its diagonal is overwritten with C{numpy.inf} in
            place, so the caller's array is modified too -- confirm callers
            do not reuse it.
        @param linkage: value passed to C{linkage_fn}; the result is the
            callable used by L{update_distmatrix}.
        @param num_clusters: number of clusters at which merging stops.
        """
        self._num_clusters = num_clusters
        vector_ids = [[i] for i in range(len(data))]
        self._dendrogram = Dendrogram(vector_ids)
        numpy.fill_diagonal(data, numpy.inf)
        self._dist_matrix = data
        self.linkage = linkage_fn(linkage)

    def smallest_distance(self, clusters):
        """
        Return the smallest distance in the distance matrix.
        The smallest distance depends on the possible connections in
        the distance matrix.

        @param clusters: the distance matrix at a specific state in the
            clustering procedure (indexed as C{clusters[i, j]}).
        @return: a tuple containing the smallest distance and the indexes of
            the clusters yielding the smallest distance.
        """
        i, j = numpy.unravel_index(numpy.argmin(clusters), clusters.shape)
        return clusters[i, j], i, j

    def cluster(self, verbose=0, sum_ess=False):
        """
        Cluster all clusters hierarchically until the level of
        num_clusters is obtained.

        @param verbose: how much output is produced during the clustering (0-2)
        @type verbose: C{int}
        @param sum_ess: if true, the merge distances are accumulated over the
            iterations; otherwise each merge records only its own distance.
        @type sum_ess: C{bool}

        @return: None, destructive method: the dendrogram is updated in
            place. The stored distance matrix is worked on via C{copy.copy}.
        """
        ## if sum_ess and self.linkage.__name__ != "ward_link":
        ##     raise ValueError(
        ##         "Summing for method other than Ward makes no sense...")
        clusters = copy.copy(self._dist_matrix)
        summed_ess = 0.0

        while len(clusters) > max(self._num_clusters, 1):
            if verbose >= 1:
                # Parenthesised single-argument form prints identically on
                # Python 2 and is valid syntax on Python 3.
                print('k=%s' % len(clusters))
                if verbose == 2:
                    print(clusters)

            best, i, j = self.smallest_distance(clusters)
            # In Ward (1963) ess is summed at each iteration
            # in R's hclust and Python's hcluster and some text books it is not.
            # Here it is optional...
            if sum_ess:
                summed_ess += best
            else:
                summed_ess = best
            clusters = self.update_distmatrix(i, j, clusters)
            self._dendrogram.merge(i, j)
            self._dendrogram[i].distance = summed_ess
            # Drop row and column j: cluster j was merged into cluster i.
            indices = numpy.arange(clusters.shape[0])
            indices = indices[indices != j]
            clusters = clusters.take(indices, axis=0).take(indices, axis=1)

    def update_distmatrix(self, i, j, clusters):
        """
        Update the distance matrix using the specified linkage method so that
        it represents the correct distances to the newly formed cluster.
        """
        return self.linkage(clusters, i, j, self._dendrogram)

    def dendrogram(self):
        """Return the dendrogram object."""
        return self._dendrogram

    def num_clusters(self):
        """Return the requested number of clusters."""
        return self._num_clusters

    def __repr__(self):
        return """<Hierarchical Agglomerative Clusterer(linkage method: %r,
                  n=%d clusters>""" % (self.linkage.__name__, self._num_clusters)
示例#8
0
文件: __main__.py 项目: SMores/ng-ext
from modularity import find_best_splits

if __name__ == "__main__":
    # Finds the best clustering for each of the given ego network files.
    # Stores output to file called submission.csv
    ego_nets = read_data(argv[1])

    # Ego networks are processed in ascending order of their ``size``.
    # ``items()`` is used because ``dict.iteritems()`` does not exist on
    # Python 3; the sorted result is the same list of (uid, ego_net) pairs.
    tup_ls = sorted(ego_nets.items(), key=lambda t: t[1].size)

    # Good sets (small) to test on are 25708, and 1310
    # Change this variable to change the egonet that it starts reading from
    # start = 8338
    # index = [k for k, v in tup_ls].index(start)
    index = 0  # use the line above instead if not running from start

    # If not running from start, open "out.txt" in append mode ("a") instead.
    # The ``with`` block closes the file even when an iteration raises.
    with open("submission.csv", "w") as out:
        for uid, ego_net in tup_ls[index:]:
            print("Analyzing ego network {0}".format(uid))
            dendrogram = Dendrogram(ego_net)
            size = ego_net.size
            best_split = find_best_splits(dendrogram.levels, size)
            circles = dendrogram.convert_to_circles()[best_split]
            # Output row: "<uid>,<circle count>,<ids of circle 1>;<ids of circle 2>;..."
            circ_str = str(uid) + "," + str(len(circles)) + ","
            circ_str += ";".join(" ".join(str(fid) for fid in circle)
                                 for circle in circles)
            print(circ_str, file=out)
            print("Best split level for ego network {0} is {1}".format(uid, best_split))
    def cluster(self):
        """
        Merge clusters until at most ``self.k`` remain or no merge happens.

        Every point in ``self.points`` starts as its own ``Cluster``. Each
        successful merge adds one level to the dendrogram, keyed by the
        string form of the value returned by ``merge_best_candidates()``.

        Returns the ``Dendrogram`` holding the initial level plus one level
        per successful merge.
        """
        singletons = [Cluster(point) for point in self.points]

        dendrogram = Dendrogram("Goodness")
        # The starting level is keyed with float infinity (later levels use str).
        dendrogram.add_level(float('inf'), singletons)

        measure = MergeGoodnessMeasure(self.th)
        clusters = RockClusters(singletons, self.link_matrix, measure)

        remaining = clusters.size()
        while remaining > self.k:
            goodness = clusters.merge_best_candidates()
            previous, remaining = remaining, clusters.size()
            # An unchanged cluster count means nothing was merged: stop here.
            if previous == remaining:
                break
            dendrogram.add_level(str(goodness), clusters.get_all_clusters())

        return dendrogram


# require 'jaccard_coefficient'
# require 'link_matrix'
# require 'cluster'
# require 'dendrogram'
# require 'merge_goodness_measure'
# require 'rock_clusters'

# class RockAlgorithm
#   attr_reader :similarity_measure, :points, :th, :link_matrix, :k

#   def initialize(points, k, th)
#     @points = points
#     @k = k
#     @th = th
#     similarity_measure = JaccardCoefficient.new
#     @link_matrix = LinkMatrix.new points, similarity_measure, th
#   end

#   def cluster
#     initial_clusters = []
#     points.each do |point|
#       initial_clusters.push Cluster.new(point)
#     end

#     dnd = Dendrogram.new 'Goodness'
#     dnd.add_level('inf', initial_clusters)

#     goodness_measure = MergeGoodnessMeasure.new th

#     all_clusters = RockClusters.new initial_clusters, link_matrix, goodness_measure

#     n_clusters = all_clusters.size

#     while n_clusters > k
#       n_clusters_before_merge = n_clusters
#       g = all_clusters.merge_best_candidates
#       n_clusters = all_clusters.size
#       # No linked clusters to merge
#       break if (n_clusters == n_clusters_before_merge)
#       dnd.add_level(g.to_s, all_clusters.get_all_clusters)

#       puts "Number of clusters: #{all_clusters.get_all_clusters.size}"
#     end

#     #all_clusters.cluster_map.each do |k, c|
#     #  puts c.get_elements.inspect
#     #end

#     dnd
#   end

# end