def perform_clustering(self, kwargs):
    """
    Runs the DBSCAN main loop over every element of the dataset.

    @param kwargs: Dictionary with the mandatory keys:
        - 'eps': Neighborhood radius.
        - 'minpts': Minimum number of neighbors for a core point.

    @return: A Clustering instance. Elements classified as NOISE are
    skipped and do not appear in any cluster.
    """
    eps = kwargs["eps"]
    minpts = kwargs["minpts"]
    element_classes = [PointClassType.UNCLASSIFIED] * self.number_of_elements
    next_cluster_id = PointClassType.NOISE + 1

    for element in range(self.number_of_elements):
        if element_classes[element] != PointClassType.UNCLASSIFIED:
            continue
        # Try to grow a cluster from this seed; only advance the id
        # when a cluster was actually formed.
        if self.__expand_cluster(element, next_cluster_id, eps, minpts,
                                 element_classes):
            next_cluster_id = next_cluster_id + 1

    # NOISE elements would form a single pseudo-cluster with
    # ID = PointClassType.NOISE, so they are filtered out here.
    clusters = gen_clusters_from_class_list(element_classes,
                                            skip_list=[PointClassType.NOISE])
    return Clustering(clusters,
                      details="DBSCAN (eps = " + str(eps) + " minpts = " +
                      str(minpts) + ") " + str(self.number_of_elements) +
                      " elems")
def perform_clustering(self, kwargs):
    """
    Performs the hierarchical clustering step and the cutoff-based
    cluster extraction. If a precomputed hierarchical (linkage) matrix is
    given, it is reused and only the clusters for the given cutoff are
    calculated; if the algorithm is called a second time, the last stored
    matrix is reused. The 'method' is the linkage criterion used to
    determine distances when fusing clusters (see
    scipy.cluster.hierarchy.linkage documentation).

    @param kwargs: Dictionary with these optional keys:
        - 'cutoff': Distance threshold for flat cluster extraction.
        - 'hie_mat': Precomputed linkage matrix to reuse.
        - 'method': Linkage method (default: 'complete').

    @return: A Clustering instance, or None when no cutoff was given
    (only the matrix is calculated/stored in that case).
    """
    cutoff = kwargs.get("cutoff")
    hie_mat = kwargs.get("hie_mat")
    method = kwargs.get("method", 'complete')

    # Use 'is (not) None' instead of '!= None': the linkage matrix is a
    # numpy array, and '==' / '!=' on arrays compare elementwise and
    # raise ValueError when used in a boolean context.
    if hie_mat is not None:
        self.hie_mat = hie_mat
    elif self.hie_mat is None:
        # No matrix stored yet: compute it from the condensed matrix.
        self.hie_mat = hcluster_fast.linkage(
            self.condensed_matrix.get_data(), method=method)

    algorithm_details = ("Hierarchical with " + method +
                         " method (cutoff = " + str(cutoff) + ")")

    if cutoff is not None:
        # Apply the cutoff to obtain a flat cluster assignment per element.
        group_list = hcluster.fcluster(self.hie_mat, cutoff)
        clusters = gen_clusters_from_class_list(group_list)
        return Clustering(clusters, details=algorithm_details)
    return None
def perform_clustering(self, kwargs):
    """
    Runs the hierarchical clustering: builds (or reuses) the linkage
    matrix and, if a cutoff is provided, extracts the flat clustering
    for it. Calling the algorithm again reuses the last stored matrix.
    The 'method' selects how inter-cluster distances are computed when
    fusing clusters (see scipy.cluster.hierarchy.linkage documentation).

    @param kwargs: Dictionary with these optional keys:
        - 'cutoff': Distance threshold for flat cluster extraction.
        - 'hie_mat': Precomputed linkage matrix to reuse.
        - 'method': Linkage method (default: 'complete').

    @return: A Clustering instance, or None if 'cutoff' was not given
    (in that case only the linkage matrix is computed and cached).
    """
    cutoff = kwargs.get("cutoff")
    hie_mat = kwargs.get("hie_mat")
    method = kwargs.get("method", 'complete')

    # NOTE: identity checks ('is None' / 'is not None') are required
    # here; numpy arrays overload '==' elementwise, so '!= None' would
    # raise ValueError when evaluated as a condition.
    if hie_mat is not None:
        self.hie_mat = hie_mat
    else:
        if self.hie_mat is None:
            # Matrix not cached yet: calculate it now.
            self.hie_mat = hcluster_fast.linkage(
                self.condensed_matrix.get_data(), method=method)

    algorithm_details = ("Hierarchical with " + method +
                         " method (cutoff = " + str(cutoff) + ")")

    if cutoff is None:
        return None

    # Cut the dendrogram at 'cutoff' to get one class id per element.
    group_list = hcluster.fcluster(self.hie_mat, cutoff)
    clusters = gen_clusters_from_class_list(group_list)
    return Clustering(clusters, details=algorithm_details)
def update_medoids(self):
    """
    Regenerates the medoids list once the new clusters have been
    generated from the current class list.

    @return: A new list with one medoid per cluster.
    """
    current_clusters = gen_clusters_from_class_list(self.class_list)
    return [cluster.calculate_medoid(self.condensed_matrix)
            for cluster in current_clusters]
def test_gen_clusters_from_grouping_list(self):
    """
    Checks that gen_clusters_from_class_list rebuilds the expected
    clusters from a flat per-element class assignment.
    """
    # Originally produced with numpy.random.random_integers(0, 4, 20).
    group_list = [4, 1, 2, 2, 4, 4, 3, 4, 2, 0,
                  0, 3, 3, 4, 0, 3, 1, 1, 1, 2]
    expected_clusters = [Cluster(0, [0, 4, 5, 7, 13]),
                         Cluster(1, [1, 16, 17, 18]),
                         Cluster(2, [2, 3, 8, 19]),
                         Cluster(6, [6, 11, 12, 15]),
                         Cluster(9, [9, 10, 14])]
    generated = sorted(gen_clusters_from_class_list(group_list),
                       key=lambda cluster: cluster.prototype)
    self.assertEqual(len(expected_clusters), len(generated))
    for expected, obtained in zip(expected_clusters, generated):
        self.assertEqual(expected, obtained)
def perform_clustering(self, kwargs):
    """
    Creates a clustering where the clusters have been created by random
    selection of the elements in the dataset, following a cluster size
    distribution.

    @param kwargs: Dictionary with the mandatory key 'distribution', a
    sequence of percentages (one entry per cluster).

    @return: The resulting Clustering instance.
    """
    distribution = kwargs["distribution"]
    total_nodes = self.condensed_matrix.row_length

    # One class id per distribution entry, repeated proportionally to
    # its percentage. NOTE(review): int() truncation means the classes
    # may not add up to total_nodes; the trailing slice below only
    # truncates, it never pads — presumably acceptable for a fake
    # clustering, but worth confirming.
    node_class = []
    for class_id, percentage in enumerate(distribution):
        node_class.extend([class_id] * int((percentage / 100.) * total_nodes))

    random.shuffle(node_class)
    clusters = gen_clusters_from_class_list(node_class[0:total_nodes])
    return Clustering(clusters,
                      details="Fake Distribution Random (distribution = " +
                      str(distribution) + ")")
def perform_clustering(self, kwargs):
    """
    Creates a clustering where the clusters have been created by random
    selection of the elements in the dataset. It will create a random
    number of clusters if "max_num_of_clusters" is defined, or an exact
    number of clusters if "num_clusters" is defined; with neither key,
    2 clusters are generated.

    @param kwargs: Dictionary with the optional keys 'num_clusters' or
    'max_num_of_clusters' (checked in that order).

    @return: The resulting Clustering instance.
    """
    num_of_nodes = self.condensed_matrix.row_length
    max_num_of_clusters = 0

    try:
        num_of_clusters = kwargs["num_clusters"]
    except KeyError:
        try:
            max_num_of_clusters = kwargs["max_num_of_clusters"]
            num_of_clusters = random.randint(1, max_num_of_clusters)
        except KeyError:
            num_of_clusters = 2

    # Floor division ('//') keeps the repeat count an int on both
    # Python 2 and 3 (plain '/' yields a float on Py3 and breaks the
    # list-repeat below). Catch only ZeroDivisionError — the previous
    # bare 'except' hid real errors, and a num_clusters of 0 crashed
    # before this point.
    try:
        elements_per_cluster = max(1, num_of_nodes // num_of_clusters)
    except ZeroDivisionError:
        elements_per_cluster = 1

    node_class = []
    for cluster_id in range(num_of_clusters):
        node_class.extend([cluster_id] * elements_per_cluster)

    # Pad with cluster 0 so every node has a class assignment.
    while len(node_class) < num_of_nodes:
        node_class.append(0)

    random.seed()
    random.shuffle(node_class)
    clusters = gen_clusters_from_class_list(node_class)
    return Clustering(
        clusters,
        details="Random (max_num_of_clusters = %d, num_of_clusters = %d )" %
        (max_num_of_clusters, num_of_clusters))
def perform_clustering(self, kwargs):
    """
    Does the actual k-medoids clustering.

    @param kwargs: Dictionary with this mandatory keys:
        - 'k': Number of clusters to generate.
        - 'seeding_type': One of the initial medoid selectors available
          (@see seeding_types()). If seeding type is 'GROMOS',
          'seeding_max_cutoff' must be also defined, containing the
          cutoff that the GROMOS algorithm will use. Default is
          EQUIDISTANT.

    @return: A Clustering whose clusters use their medoid as prototype.
    """
    self.k = kwargs["k"]
    self.seeding_type = kwargs.get("seeding_type", "EQUIDISTANT")
    if self.seeding_type == 'GROMOS':
        self.seeding_max_cutoff = kwargs["seeding_max_cutoff"]
    else:
        self.seeding_max_cutoff = -1.0

    # Pick the initial medoids, then iterate cluster-update /
    # medoid-update until convergence or the iteration cap.
    current_medoids = self.seeding(self.k, self.seeding_max_cutoff,
                                   self.seeding_type)
    last_medoids = []
    iteration = 0
    while (not self.convergence(current_medoids, last_medoids) and
           iteration < KMedoidsAlgorithm.MAX_ITERATIONS):
        self.cluster_update(current_medoids, self.condensed_matrix)
        # Keep a reference to the previous medoids for the convergence test.
        last_medoids = current_medoids
        current_medoids = self.update_medoids()
        iteration += 1

    details = ("K-Medoids algorithm with k =" + str(int(self.k)) +
               " and %s initial seeding" % self.seeding_to_str())
    clusters = gen_clusters_from_class_list(self.class_list)
    # Use the medoid of each cluster as its representative element.
    for cluster in clusters:
        cluster.set_prototype(cluster.calculate_medoid(self.condensed_matrix))
    return Clustering(clusters, details=details)
def perform_clustering(self, kwargs):
    """
    Main loop of the DBSCAN algorithm: every still-unclassified element
    is used as a seed to try to expand a new cluster.

    @param kwargs: Dictionary with the mandatory keys 'eps'
    (neighborhood radius) and 'minpts' (minimum neighbors of a core
    point).

    @return: A Clustering instance without the NOISE elements.
    """
    eps = kwargs["eps"]
    minpts = kwargs["minpts"]
    classes = [PointClassType.UNCLASSIFIED
               for _ in range(self.number_of_elements)]
    cluster_id = PointClassType.NOISE + 1

    for element_id in range(self.number_of_elements):
        if classes[element_id] == PointClassType.UNCLASSIFIED:
            formed_cluster = self.__expand_cluster(element_id, cluster_id,
                                                   eps, minpts, classes)
            # Only move to the next id if a cluster was actually created.
            if formed_cluster:
                cluster_id += 1

    # NOISE elements would form one cluster with ID = PointClassType.NOISE;
    # they are removed from the final clustering.
    clusters = gen_clusters_from_class_list(classes,
                                            skip_list=[PointClassType.NOISE])
    details = ("DBSCAN (eps = " + str(eps) + " minpts = " + str(minpts) +
               ") " + str(self.number_of_elements) + " elems")
    return Clustering(clusters, details=details)
def perform_clustering(self, kwargs):
    """
    Creates a clustering where the clusters have been created by random
    selection of the elements in the dataset. A random number of
    clusters is used if "max_num_of_clusters" is defined, an exact
    number if "num_clusters" is defined, and 2 clusters otherwise.

    @param kwargs: Dictionary with the optional keys 'num_clusters' or
    'max_num_of_clusters' (checked in that order).

    @return: The resulting Clustering instance.
    """
    num_of_nodes = self.condensed_matrix.row_length
    max_num_of_clusters = 0

    try:
        num_of_clusters = kwargs["num_clusters"]
    except KeyError:
        try:
            max_num_of_clusters = kwargs["max_num_of_clusters"]
            num_of_clusters = random.randint(1, max_num_of_clusters)
        except KeyError:
            num_of_clusters = 2

    # '//' guarantees an integer repeat count on Python 2 and 3 alike
    # (true division would make '[i] * elements_per_cluster' fail on
    # Py3). The narrow ZeroDivisionError handler replaces the original
    # bare 'except' and also covers num_clusters == 0, which previously
    # crashed in the first computation.
    try:
        elements_per_cluster = max(1, num_of_nodes // num_of_clusters)
    except ZeroDivisionError:
        elements_per_cluster = 1

    node_class = []
    for cluster_id in range(num_of_clusters):
        node_class.extend([cluster_id] * elements_per_cluster)

    # Assign any remaining nodes to cluster 0 before shuffling.
    while len(node_class) < num_of_nodes:
        node_class.append(0)

    random.seed()
    random.shuffle(node_class)
    clusters = gen_clusters_from_class_list(node_class)
    return Clustering(
        clusters,
        details="Random (max_num_of_clusters = %d, num_of_clusters = %d )" %
        (max_num_of_clusters, num_of_clusters))
def perform_clustering(self, kwargs): """ Does the actual clustering by doing a k-medoids clustering of the first k eigenvector rows. @param kwargs: Dictionary with this mandatory keys: - 'k': Number of clusters to generate. Must be <= than max_clusters @return: a Clustering instance with the clustered data. """ # Mandatory parameter k = int(kwargs["k"]) if k > self.max_clusters: print "[ERROR SpectralClusteringAlgorithm::perform_clustering] this algorithm was defined to generate at most %d clusters."%self.max_clusters, algorithm_details = "Spectral algorithm with k = %d and sigma squared = %.3f" %(int(k), self.sigma_sq) if self.use_k_medoids: # The row vectors we have are in R^k (so k length) eigen_distances = CondensedMatrix(pdist(self.eigenvectors[:,:k])) k_medoids_args = { "k":k, "seeding_max_cutoff":-1, "seeding_type": "RANDOM" } k_medoids_alg = KMedoidsAlgorithm(eigen_distances) clustering = k_medoids_alg.perform_clustering(k_medoids_args) clustering.details = algorithm_details return k_medoids_alg.perform_clustering(k_medoids_args) else: centroid, labels = scipy.cluster.vq.kmeans2(self.eigenvectors[:,:k], k, iter = 1000, minit = 'random') del centroid clusters = gen_clusters_from_class_list(labels) return Clustering(clusters,details = algorithm_details)
def perform_clustering(self, kwargs): """ Does the actual clustering by doing a k-medoids clustering of the first k eigenvector rows. @param kwargs: Dictionary with this mandatory keys: - 'k': Number of clusters to generate. Must be <= than max_clusters @return: a Clustering instance with the clustered data. """ # Mandatory parameter k = int(kwargs["k"]) if k > self.max_clusters: print "[ERROR SpectralClusteringAlgorithm::perform_clustering] this algorithm was defined to generate at most %d clusters." % self.max_clusters, algorithm_details = "Spectral algorithm with k = %d and sigma squared = %.3f" % ( int(k), self.sigma_sq) if self.use_k_medoids: # The row vectors we have are in R^k (so k length) eigen_distances = CondensedMatrix(pdist(self.eigenvectors[:, :k])) k_medoids_args = { "k": k, "seeding_max_cutoff": -1, "seeding_type": "RANDOM" } k_medoids_alg = KMedoidsAlgorithm(eigen_distances) clustering = k_medoids_alg.perform_clustering(k_medoids_args) clustering.details = algorithm_details return k_medoids_alg.perform_clustering(k_medoids_args) else: centroid, labels = scipy.cluster.vq.kmeans2( self.eigenvectors[:, :k], k, iter=1000, minit='random') del centroid clusters = gen_clusters_from_class_list(labels) return Clustering(clusters, details=algorithm_details)