def kmeans(task):
    """Run k-means clustering over every stored ``Sequence`` for ``task``.

    The clustering settings are read from ``task.params`` under the keys
    ``'ncluster'``, ``'npass'``, ``'method'`` and ``'distance'``.  Once the
    clustering has finished the task is marked as no longer running, its
    ``duration`` is set to the elapsed wall-clock time and it is saved; the
    cluster assignment is then handed to ``ResultService.buildResult``
    together with the sequences, in the same order as the profiles that
    were clustered.

    Arguments:
     - task: object exposing ``params`` (a mapping with the keys above),
       ``running``, ``duration`` and ``save()``.
    """
    # NOTE(review): the connection is closed before and after the long
    # computation, presumably so this worker does not keep using a stale
    # database connection - confirm against how tasks are launched.
    connection.close()
    started_at = datetime.datetime.now()

    # Load all sequences once; each one contributes its ``dna`` as a profile.
    seqOut = list(Sequence.objects.all())
    profiles = [np.array(seq.dna) for seq in seqOut]

    # Only the cluster assignment is used below; the error and the hit
    # count returned by kcluster are unpacked but ignored.
    results, _error, _nfound = kcluster(
        profiles,
        nclusters=task.params['ncluster'],
        mask=None,
        weight=None,
        transpose=0,
        npass=task.params['npass'],
        method=task.params['method'],
        dist=task.params['distance'],
        initialid=None,
    )

    connection.close()
    finished_at = datetime.datetime.now()

    # Record completion on the task before building the result objects.
    task.running = False
    task.duration = finished_at - started_at
    task.save()

    ResultService().buildResult(task, results, seqOut)
def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e',
             initialid=None):
    """Perform k-means or k-median clustering on ``self.data``.

    The weights matching the clustering direction are selected
    (``self.eweight`` when ``transpose`` is 0, ``self.gweight`` otherwise)
    and the work is delegated to the ``kcluster`` function, which also
    receives ``self.data`` and ``self.mask``.

    Arguments:
     - nclusters: the number of clusters (the 'k' in k-means).
     - transpose: 0 to cluster genes (rows); 1 to cluster microarrays
       (columns).
     - npass: how many times the k-means algorithm is run, each run
       starting from a different (random) initial condition.
     - method: how the center of a cluster is computed:
        - 'a': arithmetic mean
        - 'm': median
     - dist: the distance function:
        - 'e': Euclidean distance
        - 'b': City Block distance
        - 'c': Pearson correlation
        - 'a': absolute value of the correlation
        - 'u': uncentered correlation
        - 'x': absolute uncentered correlation
        - 's': Spearman's rank correlation
        - 'k': Kendall's tau
     - initialid: the initial clustering to start from.  When None, the
       EM algorithm is repeated npass times, each time from a different
       random initial clustering.  When given, the EM algorithm runs only
       once, starting from that clustering and without randomizing the
       order in which items are assigned to clusters (the order of the
       data matrix is used), which makes the algorithm fully
       deterministic.

    Returns a tuple (clusterid, error, nfound):
     - clusterid: array with the number of the cluster each
       gene/microarray was assigned to in the best clustering solution
       found over the npass runs.
     - error: the within-cluster sum of distances for that solution.
     - nfound: the number of times that solution was found.
    """
    # Genes are weighted per experiment and microarrays per gene.
    weight = self.eweight if transpose == 0 else self.gweight
    return kcluster(self.data, nclusters, self.mask, weight, transpose,
                    npass, method, dist, initialid)