예제 #1
0
#centroids_idx = [0]
# Distance of each point to the closest cluster
#mindist = np.array([tfidfdct.cosdist(tfidf_list[0]) for tfidfdct in tfidf_list])
#for i in range(1, ncluster) :
#	max_idx = np.argmax(mindist)
#	centroids_idx.append(max_idx)
#	dist = np.array([tfidfdct.cosdist(tfidf_list[max_idx]) for tfidfdct in tfidf_list])
#	mindist = np.amin(np.vstack((mindist, dist)), 0)

#clusters = [Cluster(tfidf_list[c_idx]) for c_idx in centroids_idx]

# Initialize centroids by hierarchical clustering on a random sample.
# The sample holds 5 points per requested cluster; a sample of 5% of all
# points was the earlier alternative and is kept below for reference.
#nsample = math.ceil(0.05*npoint)
nsample = 5*ncluster
# NOTE(review): if rnd is the stdlib random module, sample() raises
# ValueError when nsample > npoint -- confirm callers guarantee enough points.
idxsample = rnd.sample(range(npoint), nsample)
# Keep only the first field of each record (presumably the document id,
# going by the variable name -- confirm against reader.read_idx).
docIdsample = [reader.read_idx(i)[0] for i in idxsample]
# Every sampled document starts out as its own single-point cluster.
clusters_s = [Cluster(doc_id, reader) for doc_id in docIdsample]
distmat = np.zeros((nsample, nsample))

print('Calculating initial clustroids')

# Put NaN on the diagonal explicitly (instead of assigning None and relying
# on implicit coercion) so the np.nanargmin search that follows skips the
# distance of a cluster to itself.
np.fill_diagonal(distmat, np.nan)
# Fill the symmetric pairwise distance matrix; each unordered pair is
# evaluated once and mirrored.
for i in range(nsample) :
	for j in range(i+1,nsample) :
		dist = cosdist(clusters_s[i].clustroid, clusters_s[j].clustroid)
		distmat[i,j] = dist
		distmat[j,i] = dist

# Number of clusters still alive; the loop below runs until it equals ncluster.
nmerged = nsample
while nmerged != ncluster :
	flatidx = np.nanargmin(distmat)
예제 #2
0
if __name__ == "__main__":
    # Demo driver: stream documents from a TF-IDF vector file into a GRGPF
    # clustering structure, then print the resulting clusters and tree.
    SAMPLE_REFRESH_EVERY = 100    # recompute the samples every 100 points
    FULL_REFRESH_EVERY = 5000     # fully recompute the representations every 5000 points
    LAST_DOC_INDEX = 1000         # cap on the number of documents that get added

    reader = TFIDF_reader("test.vectors")
    grgpf = GRGPF(
        tfidf_cosine_distance,
        reader.read_docId,
        limit_subnodes=10,
        limit_clusters_per_leaf=10,
        limit_total_clusters=1000,
        limit_total_nodes=200,
        k=10,
        sample_size=30,
        get_next_threshold=threshold_cosine(0.8, 0.99, 10),
    )

    for doc_idx in range(reader.doc_nb):
        print(doc_idx)
        # Maintenance passes run before the current document is inserted and
        # are skipped for the very first document.
        if doc_idx > 0:
            if doc_idx % SAMPLE_REFRESH_EVERY == 0:
                print("Recomputing samples")
                grgpf.recompute(True)
                print("Recomputing done")
            # NOTE(review): with LAST_DOC_INDEX = 1000 the loop stops long
            # before index 5000, so this branch never fires as configured.
            if doc_idx % FULL_REFRESH_EVERY == 0:
                print("Recomputing representations")
                grgpf.recompute(False)
                print("Recomputing done")
        # Stop once the cap is exceeded; the index is still printed above
        # before the loop exits.
        if doc_idx > LAST_DOC_INDEX:
            break
        grgpf.add_point(reader.read_idx(doc_idx))

    print(grgpf.create_cluster_list())
    print(grgpf.create_stdtree())
    print("something")