class GAAClusterer(VectorSpaceClusterer):
    """
    Group Average Agglomerative (GAA) clusterer.

    Every input vector starts as its own singleton cluster.  On each round
    the pair of clusters with the highest C{_average_similarity} score is
    merged and the merge is recorded in a L{Dendogram}.  Merging stops once
    C{max(num_clusters, 1)} clusters remain.  The recorded merge order is
    what C{update_clusters} and C{compute_rss} later query through
    C{Dendogram.groups}.

    The scoring formula works on sums of vectors and is intended for
    cosine similarity only.

    NOTE(review): another class named C{GAAClusterer} is defined further
    down in this file; when both are present the later definition is the
    one bound to the name at import time.
    """

    def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
        """
        @param num_clusters: number of clusters at which merging stops
        @param normalise: forwarded to C{VectorSpaceClusterer.__init__}
        @param svd_dimensions: forwarded to C{VectorSpaceClusterer.__init__}
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        # Not read anywhere inside this class.
        self._groups_values = None
        # Built afresh on every call to cluster().
        self._dendogram = None
        self._num_clusters = num_clusters

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Seed the dendogram with float64 copies of C{vectors}, then hand the
        actual clustering over to C{VectorSpaceClusterer.cluster}.

        @return: whatever C{VectorSpaceClusterer.cluster} returns
        """
        # The dendogram is what remembers the merge order.
        leaves = [numpy.array(item, numpy.float64) for item in vectors]
        self._dendogram = Dendogram(leaves)
        return VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Merge clusters greedily until C{max(num_clusters, 1)} remain, then
        rebuild the centroids.

        @param vectors: list of vectors; each one starts as its own cluster
        @param trace: when true, print the indices of every merged pair
        """
        # One singleton cluster per input vector ...
        members = [[item] for item in vectors]
        # ... and, in parallel, the running sum of each cluster's vectors.
        sums = copy.copy(vectors)
        target = max(self._num_clusters, 1)

        while len(members) > target:
            # Exhaustively score every unordered pair; keep the first best.
            winner = None
            count = len(members)
            for left in range(count):
                for right in range(left + 1, count):
                    score = self._average_similarity(
                        sums[left], len(members[left]),
                        sums[right], len(members[right]))
                    if winner is None or score > winner[0]:
                        winner = (score, left, right)

            _, left, right = winner
            merged = members[left] + members[right]
            if trace:
                print('merging %d and %d' % (left, right))

            # The merged cluster takes the left slot; the right slot goes.
            members[left] = merged
            del members[right]
            sums[left] = sums[left] + sums[right]
            del sums[right]
            self._dendogram.merge(left, right)

        self.update_clusters(self._num_clusters)

    def update_clusters(self, num_clusters):
        """
        Recompute C{self._centroids} as the mean vector of each group the
        dendogram yields for C{num_clusters}, and set C{self._num_clusters}
        to the number of centroids produced.
        """
        groups = self._dendogram.groups(num_clusters)
        self._centroids = []
        for group in groups:
            assert len(group) > 0
            if self._should_normalise:
                total = self._normalise(group[0])
            else:
                total = numpy.array(group[0])
            for member in group[1:]:
                if self._should_normalise:
                    total += self._normalise(member)
                else:
                    total += member
            total /= float(len(group))
            self._centroids.append(total)
        self._num_clusters = len(self._centroids)

    def compute_rss(self, num_clusters):
        """
        Sum, over every vector in every group, the Euclidean distance from
        the vector to its group's mean.

        @return: the accumulated distance (0 when there are no groups)
        """
        total_distance = 0
        for group in self._dendogram.groups(num_clusters):
            if self._should_normalise:
                mean = self._normalise(group[0])
            else:
                mean = numpy.array(group[0])
            for member in group[1:]:
                if self._should_normalise:
                    mean += self._normalise(member)
                else:
                    mean += member
            mean = mean / float(len(group))
            for member in group:
                offset = member - mean
                total_distance = total_distance + numpy.sqrt(
                    numpy.vdot(offset, offset))
        return total_distance

    def classify_vectorspace(self, vector):
        """
        @return: index of the centroid scoring highest against C{vector}
            under C{_average_similarity}; ties go to the lowest index
        """
        winner = None
        for index in range(self._num_clusters):
            score = self._average_similarity(
                vector, 1, self._centroids[index], 1)
            if winner is None or score > winner[0]:
                winner = (score, index)
        return winner[1]

    def dendogram(self):
        """
        @return: The dendogram representing the current clustering
        @rtype: Dendogram
        """
        return self._dendogram

    def num_clusters(self):
        """
        @return: the current value of C{self._num_clusters}
        """
        return self._num_clusters

    def _average_similarity(self, v1, l1, v2, l2):
        """
        Score the union of two clusters from their vector sums (C{v1},
        C{v2}) and their sizes (C{l1}, C{l2}).

        @return: C{(s.s - n) / (n * (n - 1))} with C{s = v1 + v2} and
            C{n = l1 + l2}
        """
        combined = v1 + v2
        size = l1 + l2
        return (numpy.dot(combined, combined) - size) / (size * (size - 1))

    def __repr__(self):
        return '<GroupAverageAgglomerative Clusterer n=%d>' % self._num_clusters
class OptCentroidClusterer(VectorSpaceClusterer):
    """
    Agglomerative centroid clusterer that picks each merge from a full
    pairwise dot-product matrix of the clusters' C{normalize()}d vector
    sums.

    Progress is reported through an optional C{msg_handle} object (its
    C{dm(str)} and C{tile_yield()} methods are called), falling back to
    C{print} when no handle is given.

    NOTE(review): another class named C{OptCentroidClusterer} is defined
    further down in this file; when both are present the later definition
    is the one bound to the name at import time.
    """

    def __init__(self, vector_names = None, num_clusters=1, normalise=True,
                 svd_dimensions=None, msg_handle=None):
        """
        @param vector_names: optional names, one per vector; when truthy a
            second dendogram over the names is kept in step with the merges
        @param num_clusters: number of clusters at which merging stops
        @param normalise: forwarded to C{VectorSpaceClusterer.__init__}
        @param svd_dimensions: forwarded to C{VectorSpaceClusterer.__init__}
        @param msg_handle: optional progress sink providing C{dm(str)} and
            C{tile_yield()}
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        self._dendogram = None
        # Not read anywhere inside this class.
        self._groups_values = None
        self._names = vector_names
        self._name_dendogram = None
        # Cache for get_iteratively_reassigned_clusters(), keyed by
        # the requested number of clusters.
        self._reassigned_clusters = {}
        self.msg_handle = msg_handle

    def array_max(self, ar):
        """
        Locate the largest off-diagonal entry of a similarity matrix.

        The diagonal of C{ar} is overwritten IN PLACE with -9e9 so that a
        cluster is never paired with itself.

        @param ar: 2-D numpy array (callers in this class pass an n-by-n
            matrix); its diagonal is modified
        @return: C{[row, column]} of the first maximal entry in row-major
            order
        @rtype: list of int
        """
        numpy.fill_diagonal(ar, -9e9)
        # argmax() indexes the flattened matrix.  divmod gives an exact
        # floor division everywhere; the former round(location / ncols)
        # was only right under Python 2 integer division and rounds up to
        # the wrong row under true division.
        row, col = divmod(int(ar.argmax()), ar.shape[1])
        return [row, col]

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Seed the dendogram(s), remember the input vectors, then hand the
        actual clustering over to C{VectorSpaceClusterer.cluster}.

        @return: whatever C{VectorSpaceClusterer.cluster} returns
        """
        if self.msg_handle is not None:
            self.msg_handle.dm(str(len(vectors)))
            self.msg_handle.tile_yield()

        # The dendogram stores the merge order.
        self._dendogram = Dendogram(
            [numpy.array(vector, numpy.float64) for vector in vectors])
        if self._names:
            self._name_dendogram = Dendogram(self._names)
        self._vectors_to_cluster = vectors
        return VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Merge the most similar pair of clusters repeatedly until
        C{max(num_clusters, 1)} remain (or no valid pair is found), then
        rebuild the centroids for however many clusters are left.

        @param vectors: non-empty list of equal-length vectors
        @param trace: when true, print the indices of every merged pair
        """
        # Create a cluster for each vector.
        clusters = [[vector] for vector in vectors]
        # Shallow copy: entries are replaced, never modified in place.
        vector_sum = copy.copy(vectors)
        norm_sum = [normalize(vsum) for vsum in vector_sum]

        # One column per cluster.
        cluster_matrix = numpy.zeros([len(vectors[0]), len(vectors)])
        for column, vec in enumerate(norm_sum):
            cluster_matrix[:, column] = vec

        if self.msg_handle is not None:
            self.msg_handle.dm("initializing dot_store_matrix")
        dot_store_matrix = numpy.dot(cluster_matrix.transpose(),
                                     cluster_matrix)

        last_report = time.time()
        while len(clusters) > max(self._num_clusters, 1):
            # Find the two best candidate clusters to merge.
            i, j = self.array_max(dot_store_matrix)
            if i == j:
                print("got stuck when at " + str(len(clusters)) + " clusters")
                break

            merged = clusters[i] + clusters[j]
            if trace:
                print('merging %d and %d' % (i, j))

            # Slot i takes the merged cluster; slot j is removed from every
            # parallel structure.
            clusters[i] = merged
            del clusters[j]
            vector_sum[i] = vector_sum[i] + vector_sum[j]
            norm_sum[i] = normalize(vector_sum[i])
            cluster_matrix[:, i] = norm_sum[i]
            del vector_sum[j]
            del norm_sum[j]
            cluster_matrix = numpy.delete(cluster_matrix, j, 1)
            dot_store_matrix = numpy.dot(cluster_matrix.transpose(),
                                         cluster_matrix)

            self._dendogram.merge(i, j)
            if self._names:
                self._name_dendogram.merge(i, j)

            # Report progress at most once every 5 seconds.  The timer is
            # reset whichever sink is used; previously it was only reset
            # when a msg_handle was present, so the print fallback fired on
            # every merge once 5 seconds had elapsed.
            now = time.time()
            if now - last_report > 5:
                if self.msg_handle is not None:
                    self.msg_handle.dm(str(len(clusters)))
                else:
                    print(len(clusters))
                last_report = now

        self.update_clusters(len(clusters))

    def _sum_members(self, cluster):
        """
        Add up the members of C{cluster}, passing each through
        C{self._normalise} first when C{self._should_normalise} is set.
        A new array is built; the members themselves are left untouched.
        """
        if self._should_normalise:
            total = self._normalise(cluster[0])
        else:
            total = numpy.array(cluster[0])
        for vector in cluster[1:]:
            if self._should_normalise:
                vector = self._normalise(vector)
            total = total + vector
        return total

    def update_clusters(self, num_clusters):
        """
        Recompute C{self._centroids} from the groups the dendogram yields
        for C{num_clusters} and set C{self._num_clusters} to match.

        Each centroid is C{normalize()} applied to the sum of the group's
        members.
        """
        print("entering update clusters with num_clusters = " + str(num_clusters))
        clusters = self._dendogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            assert len(cluster) > 0
            # The sum is deliberately not divided by the cluster size;
            # presumably normalize() rescales the vector anyway, which
            # would make that division redundant -- TODO confirm.
            self._centroids.append(normalize(self._sum_members(cluster)))
        self._num_clusters = len(self._centroids)

    def compute_rss(self, num_clusters):
        """
        Sum, over every vector in every group, the Euclidean distance from
        the vector to the group's reference point.

        The reference point is the sum of the group's members, passed
        through C{self._normalise} when C{self._should_normalise} is set.

        NOTE(review): when normalisation is off the reference point is the
        raw member sum, not a mean -- confirm this is intended.

        @return: the accumulated distance (0 when there are no groups)
        """
        rss = 0
        for cluster in self._dendogram.groups(num_clusters):
            centroid = self._sum_members(cluster)
            if self._should_normalise:
                centroid = self._normalise(centroid)
            for vector in cluster:
                diff = vector - centroid
                rss = rss + numpy.sqrt(numpy.vdot(diff, diff))
        return rss

    def get_iteratively_reassigned_clusters(self, num_clusters,
                                            max_iterations=100, saveit=True):
        """
        Start from the dendogram's grouping for C{num_clusters} and move
        every vector to the centroid with which it has the largest dot
        product, recomputing centroids after each pass, until nothing moves
        or C{max_iterations} passes have run.

        Calls C{update_clusters(num_clusters)} first, so C{self._centroids}
        and C{self._num_clusters} are overwritten as a side effect.

        @param num_clusters: number of clusters to start from
        @param max_iterations: upper bound on reassignment passes
        @param saveit: when true, cache the result under C{num_clusters}
        @return: C{(centroids, cluster_sizes, clusters)}.  The same tuple is
            now returned on the first call as well as on cached calls;
            previously the first call fell off the end and returned None.
        @raise AssertionError: if a pass leaves some cluster empty
        """
        if num_clusters in self._reassigned_clusters:
            return self._reassigned_clusters[num_clusters]

        self.update_clusters(num_clusters)
        new_centroids = self._centroids
        clusters = self._dendogram.groups(num_clusters)
        # Keeps the result well defined when max_iterations is 0.
        new_clusters = clusters

        for iteration in range(max_iterations):
            number_reassigned = 0
            new_clusters = [[] for _ in range(len(self._centroids))]
            for cluster_number, cluster in enumerate(clusters):
                for vec in cluster:
                    dps = [numpy.dot(numpy.transpose(centroid), vec)
                           for centroid in new_centroids]
                    new_cluster_index = dps.index(max(dps))
                    new_clusters[new_cluster_index].append(vec)
                    if new_cluster_index != cluster_number:
                        number_reassigned += 1

            new_centroids = []
            for cluster in new_clusters:
                assert len(cluster) > 0
                new_centroids.append(normalize(self._sum_members(cluster)))

            print([len(cluster) for cluster in new_clusters])
            print("Number reassigned = %i" % number_reassigned)
            if number_reassigned == 0:
                print("Stable after %i iterations" % iteration)
                break
            clusters = new_clusters

        result = (new_centroids,
                  [len(cluster) for cluster in new_clusters],
                  clusters)
        if saveit:
            self._reassigned_clusters[num_clusters] = result
        return result

    # The methods below mirror the later OptCentroidClusterer definition in
    # this file, so that both definitions offer the same interface.

    def classify_vectorspace(self, vector):
        """
        @return: index of the centroid with the largest C{_similarity} to
            C{vector}; ties go to the lowest index
        """
        best = None
        for i in range(self._num_clusters):
            sim = self._similarity(vector, self._centroids[i])
            if best is None or sim > best[0]:
                best = (sim, i)
        return best[1]

    def dendogram(self):
        """
        @return: The dendogram representing the current clustering
        @rtype: Dendogram
        """
        return self._dendogram

    def name_dendogram(self):
        """
        @return: the dendogram over the vector names, or None when no
            names were supplied
        """
        return self._name_dendogram

    def num_clusters(self):
        """
        @return: the current value of C{self._num_clusters}
        """
        return self._num_clusters

    def _similarity(self, v1, v2):
        """
        @return: the dot product of C{v1} and C{v2}
        """
        return numpy.dot(v1, v2)

    def __repr__(self):
        return '<Opt Centroid Clusterer n=%d>' % self._num_clusters
class CentroidClusterer(VectorSpaceClusterer):
    """
    Agglomerative clusterer that repeatedly merges the two clusters whose
    C{normalize()}d vector sums have the largest dot product.

    Merges are recorded in a L{Dendogram}; when vector names were supplied
    a second dendogram over the names is kept in step.

    NOTE(review): another class named C{CentroidClusterer} is defined
    further down in this file; when both are present the later definition
    is the one bound to the name at import time.
    """

    def __init__(self, vector_names=None, num_clusters=1, normalise=True,
                 svd_dimensions=None):
        """
        @param vector_names: optional names, one per vector
        @param num_clusters: number of clusters at which merging stops
        @param normalise: forwarded to C{VectorSpaceClusterer.__init__}
        @param svd_dimensions: forwarded to C{VectorSpaceClusterer.__init__}
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._names = vector_names
        self._name_dendogram = None
        self._dendogram = None
        # Not read anywhere inside this class.
        self._groups_values = None
        self._num_clusters = num_clusters

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Seed the dendogram(s), remember the input vectors, then hand the
        actual clustering over to C{VectorSpaceClusterer.cluster}.

        @return: whatever C{VectorSpaceClusterer.cluster} returns
        """
        # The dendogram is what remembers the merge order.
        leaves = [numpy.array(item, numpy.float64) for item in vectors]
        self._dendogram = Dendogram(leaves)
        if self._names:
            self._name_dendogram = Dendogram(self._names)
        self._vectors_to_cluster = vectors
        return VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Merge clusters greedily until C{max(num_clusters, 1)} remain, then
        rebuild the centroids.

        @param vectors: list of vectors; each one starts as its own cluster
        @param trace: when true, print the indices of every merged pair
        """
        # One singleton cluster per input vector.
        members = [[item] for item in vectors]
        # copy.copy is the standard-library shallow copy.
        sums = copy.copy(vectors)
        unit_sums = [normalize(total) for total in sums]
        target = max(self._num_clusters, 1)

        while len(members) > target:
            # Exhaustively score every unordered pair; keep the first best.
            winner = None
            count = len(members)
            for left in range(count):
                for right in range(left + 1, count):
                    score = numpy.dot(unit_sums[left], unit_sums[right])
                    if winner is None or score > winner[0]:
                        winner = (score, left, right)

            _, left, right = winner
            merged = members[left] + members[right]
            if trace:
                print('merging %d and %d' % (left, right))

            # The merged cluster takes the left slot; the right slot goes.
            members[left] = merged
            del members[right]
            sums[left] = sums[left] + sums[right]
            unit_sums[left] = normalize(sums[left])
            del sums[right]
            del unit_sums[right]

            self._dendogram.merge(left, right)
            if self._names:
                self._name_dendogram.merge(left, right)

            # Progress output every time the count hits a multiple of 50.
            remaining = len(members)
            if remaining % 50 == 0:
                print(remaining)

        self.update_clusters(self._num_clusters)

    def update_clusters(self, num_clusters):
        """
        Recompute C{self._centroids} from the groups the dendogram yields
        for C{num_clusters} and set C{self._num_clusters} to match.

        Each centroid is the mean of the group's members, additionally
        passed through C{normalize()} when C{self._should_normalise} is set.
        """
        groups = self._dendogram.groups(num_clusters)
        self._centroids = []
        for group in groups:
            assert len(group) > 0
            if self._should_normalise:
                total = self._normalise(group[0])
            else:
                total = numpy.array(group[0])
            for member in group[1:]:
                if self._should_normalise:
                    total += self._normalise(member)
                else:
                    total += member
            total /= float(len(group))
            if self._should_normalise:
                total = normalize(total)
            self._centroids.append(total)
        self._num_clusters = len(self._centroids)

    def compute_rss(self, num_clusters):
        """
        Sum, over every vector in every group, the Euclidean distance from
        the vector to its group's mean.

        @return: the accumulated distance (0 when there are no groups)
        """
        total_distance = 0
        for group in self._dendogram.groups(num_clusters):
            if self._should_normalise:
                mean = self._normalise(group[0])
            else:
                mean = numpy.array(group[0])
            for member in group[1:]:
                if self._should_normalise:
                    mean += self._normalise(member)
                else:
                    mean += member
            mean /= float(len(group))
            for member in group:
                offset = member - mean
                total_distance = total_distance + numpy.sqrt(
                    numpy.vdot(offset, offset))
        return total_distance

    def classify_vectorspace(self, vector):
        """
        @return: index of the centroid with the largest C{_similarity} to
            C{vector}; ties go to the lowest index
        """
        winner = None
        for index in range(self._num_clusters):
            score = self._similarity(vector, self._centroids[index])
            if winner is None or score > winner[0]:
                winner = (score, index)
        return winner[1]

    def dendogram(self):
        """
        @return: The dendogram representing the current clustering
        @rtype: Dendogram
        """
        return self._dendogram

    def name_dendogram(self):
        """
        @return: the dendogram over the vector names, or None when no
            names were supplied
        """
        return self._name_dendogram

    def num_clusters(self):
        """
        @return: the current value of C{self._num_clusters}
        """
        return self._num_clusters

    def _similarity(self, v1, v2):
        """
        @return: the dot product of C{v1} and C{v2}
        """
        product = numpy.dot(v1, v2)
        return product

    def __repr__(self):
        return '<Centroid Clusterer n=%d>' % self._num_clusters
class GAAClusterer(VectorSpaceClusterer):
    """
    The Group Average Agglomerative clusterer starts with each of the N
    vectors as a singleton cluster.  It then iteratively merges the pair of
    clusters with the highest C{_average_similarity} score, stopping once
    C{max(num_clusters, 1)} clusters remain.  The order of merges is
    recorded in a L{Dendogram}, which C{update_clusters} and
    C{compute_rss} query through C{Dendogram.groups}.

    This clusterer uses the cosine similarity metric only, which allows the
    score to be computed from per-cluster vector sums.

    NOTE(review): an earlier class named C{GAAClusterer} appears above in
    this file; this later definition is the one bound to the name at
    import time.
    """

    def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
        """
        @param num_clusters: number of clusters at which merging stops
        @param normalise: forwarded to C{VectorSpaceClusterer.__init__}
        @param svd_dimensions: forwarded to C{VectorSpaceClusterer.__init__}
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        self._dendogram = None
        # Not read anywhere inside this class.
        self._groups_values = None

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Seed the dendogram with float64 copies of C{vectors}, then hand the
        actual clustering over to C{VectorSpaceClusterer.cluster}.

        @return: whatever C{VectorSpaceClusterer.cluster} returns
        """
        # The dendogram stores the merge order.
        self._dendogram = Dendogram(
            [numpy.array(vector, numpy.float64) for vector in vectors])
        return VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Merge clusters greedily until C{max(num_clusters, 1)} remain, then
        rebuild the centroids.

        @param vectors: list of vectors; each one starts as its own cluster
        @param trace: when true, print the indices of every merged pair
        """
        # Create a cluster for each vector.
        clusters = [[vector] for vector in vectors]
        # Running sum of each cluster's vectors (shallow copy: entries are
        # replaced, never modified in place).
        vector_sum = copy.copy(vectors)
        target = max(self._num_clusters, 1)

        while len(clusters) > target:
            # Find the two best candidate clusters to merge, based on
            # their S(union c_i, c_j).
            best = None
            for i in range(len(clusters)):
                for j in range(i + 1, len(clusters)):
                    sim = self._average_similarity(
                        vector_sum[i], len(clusters[i]),
                        vector_sum[j], len(clusters[j]))
                    if best is None or sim > best[0]:
                        best = (sim, i, j)

            # Merge them and replace in the cluster list.
            _, i, j = best
            merged = clusters[i] + clusters[j]
            if trace:
                print('merging %d and %d' % (i, j))

            clusters[i] = merged
            del clusters[j]
            vector_sum[i] = vector_sum[i] + vector_sum[j]
            del vector_sum[j]
            self._dendogram.merge(i, j)

        self.update_clusters(self._num_clusters)

    def _mean_of(self, cluster):
        """
        Return the mean of C{cluster}'s members, passing each member through
        C{self._normalise} first when C{self._should_normalise} is set.
        A new array is built; the members themselves are left untouched.
        """
        if self._should_normalise:
            total = self._normalise(cluster[0])
        else:
            total = numpy.array(cluster[0])
        for vector in cluster[1:]:
            if self._should_normalise:
                vector = self._normalise(vector)
            total = total + vector
        return total / float(len(cluster))

    def update_clusters(self, num_clusters):
        """
        Recompute C{self._centroids} as the mean vector of each group the
        dendogram yields for C{num_clusters}, and set C{self._num_clusters}
        to the number of centroids produced.
        """
        clusters = self._dendogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            assert len(cluster) > 0
            self._centroids.append(self._mean_of(cluster))
        self._num_clusters = len(self._centroids)

    def compute_rss(self, num_clusters):
        """
        Sum, over every vector in every group, the Euclidean distance from
        the vector to its group's mean.

        The mean is now used as the reference point, matching
        C{update_clusters}; previously the division by the group size was
        missing, so distances were measured against the member SUM.

        @return: the accumulated distance (0 when there are no groups)
        """
        rss = 0
        for cluster in self._dendogram.groups(num_clusters):
            centroid = self._mean_of(cluster)
            for vector in cluster:
                diff = vector - centroid
                rss = rss + numpy.sqrt(numpy.vdot(diff, diff))
        return rss

    def classify_vectorspace(self, vector):
        """
        @return: index of the centroid scoring highest against C{vector}
            under C{_average_similarity}; ties go to the lowest index
        """
        best = None
        for i in range(self._num_clusters):
            sim = self._average_similarity(vector, 1, self._centroids[i], 1)
            if best is None or sim > best[0]:
                best = (sim, i)
        return best[1]

    def dendogram(self):
        """
        @return: The dendogram representing the current clustering
        @rtype: Dendogram
        """
        return self._dendogram

    def num_clusters(self):
        """
        @return: the current value of C{self._num_clusters}
        """
        return self._num_clusters

    def _average_similarity(self, v1, l1, v2, l2):
        """
        Score the union of two clusters from their vector sums (C{v1},
        C{v2}) and their sizes (C{l1}, C{l2}).

        @return: C{(s.s - n) / (n * (n - 1))} with C{s = v1 + v2} and
            C{n = l1 + l2}
        """
        asum = v1 + v2
        length = l1 + l2
        return (numpy.dot(asum, asum) - length) / (length * (length - 1))

    def __repr__(self):
        return '<GroupAverageAgglomerative Clusterer n=%d>' % self._num_clusters
class OptCentroidClusterer(VectorSpaceClusterer):
    """
    Agglomerative centroid clusterer that picks each merge from a full
    pairwise dot-product matrix of the clusters' C{normalize()}d vector
    sums.

    Merges are recorded in a L{Dendogram}; when vector names were supplied
    a second dendogram over the names is kept in step.

    NOTE(review): an earlier class named C{OptCentroidClusterer} appears
    above in this file; this later definition is the one bound to the name
    at import time.
    """

    def __init__(self, vector_names=None, num_clusters=1, normalise=True,
                 svd_dimensions=None):
        """
        @param vector_names: optional names, one per vector
        @param num_clusters: number of clusters at which merging stops
        @param normalise: forwarded to C{VectorSpaceClusterer.__init__}
        @param svd_dimensions: forwarded to C{VectorSpaceClusterer.__init__}
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        self._dendogram = None
        # Not read anywhere inside this class.
        self._groups_values = None
        self._names = vector_names
        self._name_dendogram = None

    def array_max(self, ar):
        """
        Locate the largest off-diagonal entry of a similarity matrix.

        The diagonal of C{ar} is overwritten IN PLACE with -9e9 so that a
        cluster is never paired with itself.

        @param ar: 2-D numpy array (callers in this class pass an n-by-n
            matrix); its diagonal is modified
        @return: C{[row, column]} of the first maximal entry in row-major
            order
        @rtype: list of int
        """
        numpy.fill_diagonal(ar, -9e9)
        # argmax() indexes the flattened matrix.  divmod gives an exact
        # floor division everywhere; the former round(location / ncols)
        # was only right under Python 2 integer division and rounds up to
        # the wrong row under true division.
        row, col = divmod(int(ar.argmax()), ar.shape[1])
        return [row, col]

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Seed the dendogram(s), remember the input vectors, then hand the
        actual clustering over to C{VectorSpaceClusterer.cluster}.

        @return: whatever C{VectorSpaceClusterer.cluster} returns
        """
        # The dendogram stores the merge order.
        self._dendogram = Dendogram(
            [numpy.array(vector, numpy.float64) for vector in vectors])
        if self._names:
            self._name_dendogram = Dendogram(self._names)
        self._vectors_to_cluster = vectors
        return VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Merge the most similar pair of clusters repeatedly until
        C{max(num_clusters, 1)} remain (or no valid pair is found), then
        rebuild the centroids for however many clusters are left.

        @param vectors: non-empty list of equal-length vectors
        @param trace: when true, print the indices of every merged pair
        """
        # Create a cluster for each vector.
        clusters = [[vector] for vector in vectors]
        # Shallow copy: entries are replaced, never modified in place.
        vector_sum = copy.copy(vectors)
        norm_sum = [normalize(vsum) for vsum in vector_sum]

        # One column per cluster.
        cluster_matrix = numpy.zeros([len(vectors[0]), len(vectors)])
        for column, vec in enumerate(norm_sum):
            cluster_matrix[:, column] = vec
        dot_store_matrix = numpy.dot(cluster_matrix.transpose(),
                                     cluster_matrix)

        while len(clusters) > max(self._num_clusters, 1):
            # Find the two best candidate clusters to merge.
            i, j = self.array_max(dot_store_matrix)
            if i == j:
                print("got stuck when at " + str(len(clusters)) + " clusters")
                break

            merged = clusters[i] + clusters[j]
            if trace:
                print('merging %d and %d' % (i, j))

            # Slot i takes the merged cluster; slot j is removed from every
            # parallel structure.
            clusters[i] = merged
            del clusters[j]
            vector_sum[i] = vector_sum[i] + vector_sum[j]
            norm_sum[i] = normalize(vector_sum[i])
            cluster_matrix[:, i] = norm_sum[i]
            del vector_sum[j]
            del norm_sum[j]
            cluster_matrix = numpy.delete(cluster_matrix, j, 1)
            dot_store_matrix = numpy.dot(cluster_matrix.transpose(),
                                         cluster_matrix)

            self._dendogram.merge(i, j)
            if self._names:
                self._name_dendogram.merge(i, j)

            # Progress output every time the count hits a multiple of 50.
            if len(clusters) % 50 == 0:
                print(len(clusters))

        self.update_clusters(len(clusters))

    def _sum_members(self, cluster):
        """
        Add up the members of C{cluster}, passing each through
        C{self._normalise} first when C{self._should_normalise} is set.
        A new array is built; the members themselves are left untouched.
        """
        if self._should_normalise:
            total = self._normalise(cluster[0])
        else:
            total = numpy.array(cluster[0])
        for vector in cluster[1:]:
            if self._should_normalise:
                vector = self._normalise(vector)
            total = total + vector
        return total

    def update_clusters(self, num_clusters):
        """
        Recompute C{self._centroids} from the groups the dendogram yields
        for C{num_clusters} and set C{self._num_clusters} to match.

        Each centroid is C{normalize()} applied to the sum of the group's
        members.
        """
        print("entering update clusters with num_clusters = " + str(
            num_clusters))
        clusters = self._dendogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            assert len(cluster) > 0
            # The sum is deliberately not divided by the cluster size;
            # presumably normalize() rescales the vector anyway, which
            # would make that division redundant -- TODO confirm.
            self._centroids.append(normalize(self._sum_members(cluster)))
        self._num_clusters = len(self._centroids)

    def compute_rss(self, num_clusters):
        """
        Sum, over every vector in every group, the Euclidean distance from
        the vector to the group's reference point.

        The reference point is the sum of the group's members, passed
        through C{self._normalise} when C{self._should_normalise} is set.

        NOTE(review): when normalisation is off the reference point is the
        raw member sum, not a mean -- confirm this is intended.

        @return: the accumulated distance (0 when there are no groups)
        """
        rss = 0
        for cluster in self._dendogram.groups(num_clusters):
            centroid = self._sum_members(cluster)
            if self._should_normalise:
                centroid = self._normalise(centroid)
            for vector in cluster:
                diff = vector - centroid
                rss = rss + numpy.sqrt(numpy.vdot(diff, diff))
        return rss

    def classify_vectorspace(self, vector):
        """
        @return: index of the centroid with the largest C{_similarity} to
            C{vector}; ties go to the lowest index
        """
        best = None
        for i in range(self._num_clusters):
            sim = self._similarity(vector, self._centroids[i])
            if best is None or sim > best[0]:
                best = (sim, i)
        return best[1]

    def dendogram(self):
        """
        @return: The dendogram representing the current clustering
        @rtype: Dendogram
        """
        return self._dendogram

    def name_dendogram(self):
        """
        @return: the dendogram over the vector names, or None when no
            names were supplied
        """
        return self._name_dendogram

    def num_clusters(self):
        """
        @return: the current value of C{self._num_clusters}
        """
        return self._num_clusters

    def _similarity(self, v1, v2):
        """
        @return: the dot product of C{v1} and C{v2}
        """
        return numpy.dot(v1, v2)

    def __repr__(self):
        return '<Opt Centroid Clusterer n=%d>' % self._num_clusters
class CentroidClusterer(VectorSpaceClusterer):
    """
    Agglomerative clusterer that repeatedly merges the two clusters whose
    C{normalize()}d vector sums have the largest dot product.

    Merges are recorded in a L{Dendogram}; when vector names were supplied
    a second dendogram over the names is kept in step.

    NOTE(review): an earlier class named C{CentroidClusterer} appears above
    in this file; this later definition is the one bound to the name at
    import time.
    """

    def __init__(self, vector_names=None, num_clusters=1, normalise=True,
                 svd_dimensions=None):
        """
        @param vector_names: optional names, one per vector
        @param num_clusters: number of clusters at which merging stops
        @param normalise: forwarded to C{VectorSpaceClusterer.__init__}
        @param svd_dimensions: forwarded to C{VectorSpaceClusterer.__init__}
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        self._dendogram = None
        # Not read anywhere inside this class.
        self._groups_values = None
        self._names = vector_names
        self._name_dendogram = None

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Seed the dendogram(s), remember the input vectors, then hand the
        actual clustering over to C{VectorSpaceClusterer.cluster}.

        @return: whatever C{VectorSpaceClusterer.cluster} returns
        """
        # The dendogram stores the merge order.
        self._dendogram = Dendogram(
            [numpy.array(vector, numpy.float64) for vector in vectors])
        if self._names:
            self._name_dendogram = Dendogram(self._names)
        self._vectors_to_cluster = vectors
        return VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Merge clusters greedily until C{max(num_clusters, 1)} remain, then
        rebuild the centroids.

        @param vectors: list of vectors; each one starts as its own cluster
        @param trace: when true, print the indices of every merged pair
        """
        # Create a cluster for each vector.
        clusters = [[vector] for vector in vectors]
        # Shallow copy: entries are replaced, never modified in place.
        vector_sum = copy.copy(vectors)
        norm_sum = [normalize(vsum) for vsum in vector_sum]
        target = max(self._num_clusters, 1)

        while len(clusters) > target:
            # Find the two best candidate clusters to merge.
            best = None
            for i in range(len(clusters)):
                for j in range(i + 1, len(clusters)):
                    sim = numpy.dot(norm_sum[i], norm_sum[j])
                    if best is None or sim > best[0]:
                        best = (sim, i, j)

            # Merge them and replace in the cluster list.
            _, i, j = best
            merged = clusters[i] + clusters[j]
            if trace:
                print('merging %d and %d' % (i, j))

            clusters[i] = merged
            del clusters[j]
            vector_sum[i] = vector_sum[i] + vector_sum[j]
            norm_sum[i] = normalize(vector_sum[i])
            del vector_sum[j]
            del norm_sum[j]

            self._dendogram.merge(i, j)
            if self._names:
                self._name_dendogram.merge(i, j)

            # Progress output every time the count hits a multiple of 50.
            if len(clusters) % 50 == 0:
                print(len(clusters))

        self.update_clusters(self._num_clusters)

    def _sum_members(self, cluster):
        """
        Add up the members of C{cluster}, passing each through
        C{self._normalise} first when C{self._should_normalise} is set.
        A new array is built; the members themselves are left untouched.
        """
        if self._should_normalise:
            total = self._normalise(cluster[0])
        else:
            total = numpy.array(cluster[0])
        for vector in cluster[1:]:
            if self._should_normalise:
                vector = self._normalise(vector)
            total = total + vector
        return total

    def update_clusters(self, num_clusters):
        """
        Recompute C{self._centroids} from the groups the dendogram yields
        for C{num_clusters} and set C{self._num_clusters} to match.

        Each centroid is the sum of the group's members, passed through
        C{normalize()} when C{self._should_normalise} is set.

        NOTE(review): the sum is not divided by the group size, so with
        normalisation off the stored centroid is a sum rather than a mean.
        The earlier CentroidClusterer definition in this file does divide;
        confirm which behaviour is intended.
        """
        clusters = self._dendogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            assert len(cluster) > 0
            centroid = self._sum_members(cluster)
            if self._should_normalise:
                centroid = normalize(centroid)
            self._centroids.append(centroid)
        self._num_clusters = len(self._centroids)

    def compute_rss(self, num_clusters):
        """
        Sum, over every vector in every group, the Euclidean distance from
        the vector to its group's mean.

        The mean is now used as the reference point, as in the earlier
        CentroidClusterer definition in this file; previously the division
        by the group size was missing, so distances were measured against
        the member SUM.

        @return: the accumulated distance (0 when there are no groups)
        """
        rss = 0
        for cluster in self._dendogram.groups(num_clusters):
            centroid = self._sum_members(cluster) / float(len(cluster))
            for vector in cluster:
                diff = vector - centroid
                rss = rss + numpy.sqrt(numpy.vdot(diff, diff))
        return rss

    def classify_vectorspace(self, vector):
        """
        @return: index of the centroid with the largest C{_similarity} to
            C{vector}; ties go to the lowest index
        """
        best = None
        for i in range(self._num_clusters):
            sim = self._similarity(vector, self._centroids[i])
            if best is None or sim > best[0]:
                best = (sim, i)
        return best[1]

    def dendogram(self):
        """
        @return: The dendogram representing the current clustering
        @rtype: Dendogram
        """
        return self._dendogram

    def name_dendogram(self):
        """
        @return: the dendogram over the vector names, or None when no
            names were supplied
        """
        return self._name_dendogram

    def num_clusters(self):
        """
        @return: the current value of C{self._num_clusters}
        """
        return self._num_clusters

    def _similarity(self, v1, v2):
        """
        @return: the dot product of C{v1} and C{v2}
        """
        # A leftover debugging print ("othertest") used to fire here on
        # every comparison; it has been removed.
        return numpy.dot(v1, v2)

    def __repr__(self):
        return '<Centroid Clusterer n=%d>' % self._num_clusters