vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform((open(f).read() for f in filenames))
X = Normalizer(norm="l2", copy=False).transform(X)

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print

###############################################################################
# Now sparse MiniBatchKMeans

print "_" * 80

mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
                       chunk_size=1000, tol=0.0, n_init=1)
print "Clustering sparse data with %s" % str(mbkm)
print
t0 = time()
mbkm.fit(X)
print "done in %0.3fs" % (time() - t0)

vmeasure = metrics.v_measure_score(labels, mbkm.labels_)
ari = metrics.adjusted_rand_score(labels, mbkm.labels_)
print "Homogeneity: %0.3f" % metrics.homogeneity_score(labels, mbkm.labels_)
print "Completeness: %0.3f" % metrics.completeness_score(labels, mbkm.labels_)
print "V-measure: %0.3f" % vmeasure
print "Adjusted Rand-Index: %.3f" % ari
print
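###############################################################################
# The clustering snippet above relies on names defined outside of it:
# ``filenames``, ``labels``, ``true_k`` and the timer ``t0``, plus the imports
# for ``Vectorizer``, ``Normalizer``, ``MiniBatchKMeans``, ``time`` and
# ``metrics``.  The block below is only a hedged sketch of that setup (it
# would run before the snippet above), assuming the documents live as
# plain-text files in per-category sub-directories under a hypothetical
# ``text_corpus/`` folder.  The ``Vectorizer`` / ``k`` / ``chunk_size`` names
# follow the old scikit-learn API used in the snippet; newer releases call
# these ``TfidfVectorizer``, ``n_clusters`` and ``batch_size``.

from time import time

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import Vectorizer
from sklearn.preprocessing import Normalizer
from sklearn.cluster import MiniBatchKMeans
from sklearn import metrics

dataset = load_files("text_corpus/")   # one sub-folder per category
filenames = dataset.filenames          # paths read by the vectorizer above
labels = dataset.target                # ground-truth labels for the metrics
true_k = len(dataset.target_names)     # number of clusters to look for

t0 = time()                            # timer referenced by the first prints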
##############################################################################
# Compute clustering with KMeans
k_means = KMeans(init='k-means++', k=3)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)

##############################################################################
# Compute clustering with MiniBatchKMeans
mbk = MiniBatchKMeans(init='k-means++', k=3, chunk_size=batch_size)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0
mbk_means_labels = mbk.labels_
mbk_means_cluster_centers = mbk.cluster_centers_
mbk_means_labels_unique = np.unique(mbk_means_labels)

##############################################################################
# Plot result
fig = pl.figure()
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.
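##############################################################################
# The comment above stops where the center-pairing logic would start.  The
# code below is only a hedged sketch of one way to finish it -- pair each
# KMeans center with its closest MiniBatchKMeans center so both panels share
# colors, then draw the two panels.  It assumes X is a 2-D array of points
# (e.g. a toy blobs dataset) and that ``pl`` is matplotlib's pylab module, as
# the ``pl.figure()`` call above suggests.

import numpy as np

# For each KMeans center, find the index of the closest MiniBatchKMeans
# center (squared euclidean distance), so matching clusters share a color.
distances = ((k_means_cluster_centers[:, np.newaxis, :] -
              mbk_means_cluster_centers[np.newaxis, :, :]) ** 2).sum(axis=2)
order = distances.argmin(axis=1)

# Left panel: KMeans.
ax = fig.add_subplot(1, 2, 1)
for k, col in zip(range(3), colors):
    members = k_means_labels == k
    ax.plot(X[members, 0], X[members, 1], 'w', markerfacecolor=col, marker='.')
    ax.plot(k_means_cluster_centers[k, 0], k_means_cluster_centers[k, 1],
            'o', markerfacecolor=col, markeredgecolor='k', markersize=6)
ax.set_title('KMeans (%.2fs)' % t_batch)

# Right panel: MiniBatchKMeans, colored through the pairing computed above.
ax = fig.add_subplot(1, 2, 2)
for k, col in zip(range(3), colors):
    members = mbk_means_labels == order[k]
    center = mbk_means_cluster_centers[order[k]]
    ax.plot(X[members, 0], X[members, 1], 'w', markerfacecolor=col, marker='.')
    ax.plot(center[0], center[1],
            'o', markerfacecolor=col, markeredgecolor='k', markersize=6)
ax.set_title('MiniBatchKMeans (%.2fs)' % t_mini_batch)

pl.show()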