import random as rnd

import numpy as np

# Alternative seeding (farthest-first, k-means++ style), kept disabled:
#centroids_idx = [0]
# Distance of each point to the closest cluster
#mindist = np.array([tfidfdct.cosdist(tfidf_list[0]) for tfidfdct in tfidf_list])
#for i in range(1, ncluster):
#    max_idx = np.argmax(mindist)
#    centroids_idx.append(max_idx)
#    dist = np.array([tfidfdct.cosdist(tfidf_list[max_idx]) for tfidfdct in tfidf_list])
#    mindist = np.amin(np.vstack((mindist, dist)), 0)
#clusters = [Cluster(tfidf_list[c_idx]) for c_idx in centroids_idx]

# Initialize centroids by hierarchical clustering on a sample
#nsample = math.ceil(0.05*npoint)
nsample = 5 * ncluster  # five sampled points per requested cluster
idxsample = rnd.sample(range(npoint), nsample)
docIdsample = [reader.read_idx(i)[0] for i in idxsample]
# Start with one singleton cluster per sampled document
clusters_s = [Cluster(docIdsample[i], reader) for i in range(nsample)]

# Symmetric pairwise distance matrix between clustroids, with NaN on the
# diagonal so that np.nanargmin ignores self-distances
distmat = np.zeros((nsample, nsample))
print('Calculating initial clustroids')
for i in range(nsample):
    distmat[i, i] = np.nan
    for j in range(i + 1, nsample):
        dist = cosdist(clusters_s[i].clustroid, clusters_s[j].clustroid)
        distmat[i, j] = dist
        distmat[j, i] = dist

# Agglomerative merging: repeatedly fuse the two closest clusters until only
# ncluster of them remain
nmerged = nsample
while nmerged != ncluster:
    flatidx = np.nanargmin(distmat)
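    # Merge step: a minimal sketch assuming a hypothetical Cluster.merge()
    # that absorbs the other cluster and refreshes the clustroid; adapt the
    # call to the actual Cluster API.
    i, j = np.unravel_index(flatidx, distmat.shape)
    clusters_s[i].merge(clusters_s[j])  # hypothetical: fuse cluster j into i
    distmat[j, :] = np.nan              # retire row/column j
    distmat[:, j] = np.nan
    # Refresh distances from the merged clustroid to the still-active
    # clusters (those whose entry in row i is still finite)
    for k in range(nsample):
        if k != i and not np.isnan(distmat[i, k]):
            d = cosdist(clusters_s[i].clustroid, clusters_s[k].clustroid)
            distmat[i, k] = d
            distmat[k, i] = d
    nmerged -= 1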
if __name__ == "__main__":
    reader = TFIDF_reader("test.vectors")
    grgpf = GRGPF(
        tfidf_cosine_distance,
        reader.read_docId,
        limit_subnodes=10,
        limit_clusters_per_leaf=10,
        limit_total_clusters=1000,
        limit_total_nodes=200,
        k=10,
        sample_size=30,
        get_next_threshold=threshold_cosine(0.8, 0.99, 10),
    )
    for i in range(0, reader.doc_nb):
        print(i)
        if i != 0 and i % 100 == 0:
            # Recompute the cluster samples every 100 points
            print("Recomputing samples")
            grgpf.recompute(True)
            print("Recomputing done")
        if i != 0 and i % 5000 == 0:
            # Fully recompute the representations every 5000 points
            # (never reached under the 1000-document cap below)
            print("Recomputing representations")
            grgpf.recompute(False)
            print("Recomputing done")
        if i > 1000:
            # Cap the number of documents added for this test run
            break
        grgpf.add_point(reader.read_idx(i))
    print(grgpf.create_cluster_list())
    print(grgpf.create_stdtree())
    print("something")
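
# For reference, a minimal sketch of the kind of cosine distance that
# cosdist / tfidf_cosine_distance compute, assuming tf-idf vectors are
# represented as dicts mapping term -> weight. This representation and the
# helper name are illustrative assumptions, not the project's actual code.
import math

def cosine_distance_sketch(u, v):
    """1 - cos(u, v) for sparse dict vectors (hypothetical helper)."""
    dot = sum(w * v[t] for t, w in u.items() if t in v)
    norm_u = math.sqrt(sum(w * w for w in u.values()))
    norm_v = math.sqrt(sum(w * w for w in v.values()))
    if norm_u == 0.0 or norm_v == 0.0:
        return 1.0  # treat empty vectors as maximally distant
    return 1.0 - dot / (norm_u * norm_v)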