def action(self, tweets_list):
    """Vectorize the tweets' text and run ``birch_algo`` on the term counts.

    Each tweet's ``"text"`` value is collected into a corpus, the corpus is
    printed, converted to a term-count matrix with ``CountVectorizer``
    (French stop words plus ``'les'`` excluded) and passed to ``birch_algo``
    as a dense array.

    Args:
        tweets_list: Iterable of mappings, each with a ``"text"`` entry
            holding the tweet body as text or as UTF-8 encoded bytes.

    Raises:
        SystemExit: Always, once ``birch_algo`` has returned; this method
            terminates the program instead of returning a value.
    """
    corpus = []
    for tweet in tweets_list:
        text = tweet["text"]
        # Decode only real byte strings. On Python 2 ``bytes`` is ``str``,
        # so this also covers non-ASCII byte strings, which an
        # encode-then-decode round trip cannot handle (implicit ASCII
        # decoding); already-decoded text is passed through untouched.
        if isinstance(text, bytes):
            text = text.decode("utf-8")
        corpus.append(text)
    print(corpus)

    # French stop words, extended with 'les'.
    french_stop_words = stopwords.words('french')
    french_stop_words.append('les')
    vectorizer = CountVectorizer(stop_words=french_stop_words)
    X = vectorizer.fit_transform(corpus)

    # All the printing is grouped here.
    print("Results of Birch algorithm")
    # birch_algo receives a dense array, hence the toarray() conversion.
    birch_algo(X.toarray(), None)

    # Stop the program here. SystemExit is raised directly because the
    # ``quit`` helper is only injected by the ``site`` module and is not
    # guaranteed to exist (e.g. under ``python -S`` or in frozen apps).
    raise SystemExit
def action(self, tweets_list):
    """Vectorize the tweets' text and run ``birch_algo`` on the term counts.

    Each tweet's ``"text"`` value is collected into a corpus, the corpus is
    printed, converted to a term-count matrix with ``CountVectorizer``
    (French stop words plus ``'les'`` excluded) and passed to ``birch_algo``
    as a dense array.

    Args:
        tweets_list: Iterable of mappings, each with a ``"text"`` entry
            holding the tweet body as text or as UTF-8 encoded bytes.

    Raises:
        SystemExit: Always, once ``birch_algo`` has returned; this method
            terminates the program instead of returning a value.
    """
    corpus = []
    for tweet in tweets_list:
        text = tweet["text"]
        # Decode only real byte strings. On Python 2 ``bytes`` is ``str``,
        # so this also covers non-ASCII byte strings, which an
        # encode-then-decode round trip cannot handle (implicit ASCII
        # decoding); already-decoded text is passed through untouched.
        if isinstance(text, bytes):
            text = text.decode("utf-8")
        corpus.append(text)
    print(corpus)

    # French stop words, extended with 'les'.
    french_stop_words = stopwords.words('french')
    french_stop_words.append('les')
    vectorizer = CountVectorizer(stop_words=french_stop_words)
    X = vectorizer.fit_transform(corpus)

    # All the printing is grouped here.
    print("Results of Birch algorithm")
    # birch_algo receives a dense array, hence the toarray() conversion.
    birch_algo(X.toarray(), None)

    # Stop the program here. SystemExit is raised directly because the
    # ``quit`` helper is only injected by the ``site`` module and is not
    # guaranteed to exist (e.g. under ``python -S`` or in frozen apps).
    raise SystemExit
# Script fragment: build a grid of cluster centres, sample blobs around them,
# run birch_algo with and without a final clustering step, and start
# plotting the two results side by side.

# Turn xx and yy (defined earlier in the script) into coordinate grids, then
# flatten both grids and stack them as two columns so that each row of
# n_centres is one (x, y) grid point.
xx, yy = np.meshgrid(xx, yy)
n_centres = np.hstack((np.ravel(xx)[:, np.newaxis], np.ravel(yy)[:, np.newaxis]))

# Generate blobs to do a comparison between MiniBatchKMeans and Birch.
# 100000 samples are drawn around the grid points; random_state=0 fixes
# the seed.
X, y = make_blobs(n_samples=100000, centers=n_centres, random_state=0)

# Use all colors that matplotlib provides by default.
# The cycle is created once, so it is shared by every plotting pass below.
colors_ = cycle(colors.cnames.keys())

# One wide figure (12 x 4) with tight margins for the side-by-side subplots.
fig = plt.figure(figsize=(12, 4))
fig.subplots_adjust(left=0.04, right=0.98, bottom=0.1, top=0.9)

# Compute clustering with Birch with and without the final clustering step
# and plot. Each birch_algo call is unpacked into (labels, centroids,
# number of clusters).
labels1, centroids1, n_clusters1 = birch_algo(X, clustering=None)
labels2, centroids2, n_clusters2 = birch_algo(X, clustering=100)

# Group the two runs into tuples so they can be indexed by run number;
# final_step holds the matching descriptions in the same order.
labels = labels1, labels2
centroids = centroids1, centroids2
n_clusters = n_clusters1, n_clusters2
final_step = ['without global clustering', 'with global clustering']

# Plot the results of Birch with and without clustering.
for i in range(0, 2):
    ind = i + 1
    # With ind = i + 1 the subplot index is i + 2, so the two runs land in
    # slots 2 and 3 of the 1 x 3 grid.
    # NOTE(review): slot 1 is not filled by this loop - confirm it is
    # intentionally left for another plot.
    ax = fig.add_subplot(1, 3, ind +1)
    # Walk the centroids of run i together with their cluster index and a
    # color taken from the shared cycle.
    for this_centroids, k, col in zip(centroids[i], range(n_clusters[i]), colors_ ):
        # Select the entries of run i whose label equals k (presumably one
        # label per sample - confirm against birch_algo).
        mask = labels[i] == k