def find_clusters(dataset, dci, C=100):
    """Choose a cluster count by penalised k-means cost, then cluster with it.

    Candidate counts are tried in increasing order, starting at k = 4. Each
    candidate is scored as ``k * C + d``, where ``d`` is the second value
    returned by ``KMeans.compute_means``. The search stops once three
    consecutive candidates have failed to beat the best score seen so far.

    Args:
        dataset: Items to cluster; forwarded unchanged to
            ``KMeans.compute_means``.
        dci: Clustering-info record forwarded to ``KMeans.compute_means``
            (the original note describes it as a DocumentClusteringInfo).
        C: Per-cluster penalty added to the score; larger values favour
            fewer clusters.

    Returns:
        Whatever ``KMeans.compute_means`` returns for the selected count
        (the search loop unpacks that result as ``(means, d)``).
    """
    k = 3
    best_k = 3
    best_cost = float("inf")
    while True:
        k += 1
        _, d = KMeans.compute_means(dataset, k, dci)
        cost = k * C + d
        # Compare against the best score so far, not merely the previous
        # candidate: a partial rebound after a bad candidate must not
        # displace a genuinely cheaper, earlier k.
        if cost < best_cost:
            best_cost = cost
            best_k = k
        # Give up after three candidates in a row without improvement.
        if k - best_k > 2:
            break
    # Pass dci here too, matching the call made inside the search loop.
    return KMeans.compute_means(dataset, best_k, dci)
def cluster_index(index_name, k=-1):
    """Cluster the documents of an index, label the clusters, update the index.

    The DocClusteringInfo record for ``index_name`` has its ``status`` set
    and saved before each stage, and is set to 'Idle' once every stage has
    completed.

    Args:
        index_name: Name used to look up the DocClusteringInfo record and
            passed to the retrieval, labelling and update helpers.
        k: Requested number of clusters. Values below 2 delegate the choice
            to ``find_clusters``; otherwise ``KMeans.compute_means`` is
            called with ``k`` directly.

    Returns:
        None.
    """
    record = DocClusteringInfo.objects.get(index_name=index_name)

    def enter_stage(stage_name):
        # Save right away so the stored status reflects the current stage.
        record.status = stage_name
        record.save()

    enter_stage('Retrieving Documents')
    documents = retrieve_dataset(index_name, record.doc_type)

    enter_stage('Clustering Data')
    # A fresh list of the values is built for each consumer, as before.
    if k < 2:
        # NOTE(review): the result of find_clusters is discarded, so the
        # automatically chosen cluster count is not captured here and
        # label_clusters below still receives the caller's k (< 2).
        # Confirm this is intended.
        find_clusters(list(documents.values()), record)
    else:
        KMeans.compute_means(list(documents.values()), k, record)

    enter_stage('Labeling Clusters')
    cluster_labels = label_clusters(
        index_name,
        record.doc_type,
        list(documents.values()),
        k,
    )

    enter_stage('Updating Index')
    update_index(index_name, record.doc_type, documents, cluster_labels)

    enter_stage('Idle')