Пример #1
0
def find_clusters(dataset, dci, C=100):  # dci is a DocumentClusteringInfo
    """Pick a cluster count by penalised cost, then cluster ``dataset`` with it.

    Candidate counts are tried in increasing order, starting at k = 4.  Each
    candidate is scored as ``k * C + d``, where ``d`` is the second value
    returned by ``KMeans.compute_means``.  The search stops as soon as three
    consecutive candidates have failed to beat the best score seen so far.

    Args:
        dataset: Items to cluster; passed unchanged to
            ``KMeans.compute_means``.
        dci: Clustering-info record forwarded to every
            ``KMeans.compute_means`` call.
        C: Penalty added to the score for each cluster.

    Returns:
        The result of ``KMeans.compute_means`` for the selected cluster count.
    """
    k = 3
    best_k = 3
    best_cost = float("inf")
    while True:
        k += 1
        _, d = KMeans.compute_means(dataset, k, dci)
        cost = k * C + d
        # Compare with the best score so far rather than the previous
        # candidate's score; otherwise a partial recovery after a bad
        # candidate would replace a cheaper, earlier k.
        if cost < best_cost:
            best_cost = cost
            best_k = k
        if k - best_k > 2:
            break

    # Re-run for the chosen k so the returned result corresponds to best_k.
    # dci is forwarded here too, matching the call inside the loop.
    return KMeans.compute_means(dataset, best_k, dci)
Пример #2
0
def cluster_index(index_name, k=-1):
    """Run the clustering pipeline for the index named ``index_name``.

    The ``DocClusteringInfo`` record for the index has its ``status`` set and
    saved before each stage: 'Retrieving Documents', 'Clustering Data',
    'Labeling Clusters', 'Updating Index', and finally 'Idle'.

    Args:
        index_name: Name used to look up the ``DocClusteringInfo`` record and
            passed to the retrieval, labeling and index-update helpers.
        k: Cluster count.  Values below 2 delegate the choice to
            ``find_clusters``; otherwise ``KMeans.compute_means`` is called
            with this k directly.
    """
    dci = DocClusteringInfo.objects.get(index_name=index_name)

    def enter_stage(status):
        # Persist the stage name before the stage's work begins.
        dci.status = status
        dci.save()

    def documents():
        # A fresh list for every consumer, as each call site builds its own.
        return list(dataset.values())

    enter_stage('Retrieving Documents')
    dataset = retrieve_dataset(index_name, dci.doc_type)

    enter_stage('Clustering Data')
    if k < 2:
        find_clusters(documents(), dci)
    else:
        KMeans.compute_means(documents(), k, dci)

    enter_stage('Labeling Clusters')
    # NOTE(review): when k < 2 the count chosen by find_clusters is not
    # captured, so label_clusters receives the caller's k unchanged --
    # confirm label_clusters copes with that.
    doc_type = dci.doc_type
    labels = label_clusters(index_name, doc_type, documents(), k)

    enter_stage('Updating Index')
    update_index(index_name, dci.doc_type, dataset, labels)

    enter_stage('Idle')