import csv
import math
import os

import numpy as np
from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion, Pipeline

# Project-local helpers assumed to be defined elsewhere in this repo:
# cst_vectorizer / cstv (stemmed vectorizer modules), vec_est
# (LinguisticVectorizer), plus the `param` dict and `preprocessor` function.


def clustering_tweets_hc(labeled_tweets, num_cluster):
    vectorizer = cst_vectorizer.StemmedTfidfVectorizer(**param)
    tweet_vec = vectorizer.fit_transform(labeled_tweets).toarray()

    n_clusters = num_cluster
    from sklearn.neighbors import kneighbors_graph
    knn_graph = kneighbors_graph(tweet_vec, 1, include_self=False)

    connectivity = knn_graph
    from sklearn.cluster import AgglomerativeClustering
    model = AgglomerativeClustering(linkage='ward',
                                    connectivity=connectivity,
                                    n_clusters=n_clusters)
    model.fit(tweet_vec)
    c = model.labels_

    # Concatenate the tweets assigned to each cluster into one document.
    clustered_tweets = []
    for i in range(0, num_cluster):
        similar_indices = (c == i).nonzero()[0]
        sent = ''
        for sid in similar_indices:
            sent = labeled_tweets[sid] + ' ' + sent
        clustered_tweets.append(sent)
    return clustered_tweets
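# A minimal usage sketch for clustering_tweets_hc. The sample tweets are
# invented for illustration, and `param` is assumed to hold a working
# StemmedTfidfVectorizer configuration. Each returned string is the
# concatenation of one cluster's tweets.
def _demo_clustering_tweets_hc():
    sample_tweets = [
        'i love this movie', 'this movie is great',
        'terrible plot and bad acting', 'the acting was awful',
    ]
    for doc in clustering_tweets_hc(sample_tweets, num_cluster=2):
        print(doc)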
def clustering_texts_using_trainingset(texts, trainingset, cluster_size):
    vectorizer = cst_vectorizer.StemmedTfidfVectorizer(**param)
    texts_vec = vectorizer.fit_transform(texts)
    training_vec = vectorizer.transform(trainingset)

    from sklearn.metrics.pairwise import pairwise_distances
    # sim_matrix[i, j] is the similarity between the ith row of texts_vec and
    # the jth row of training_vec. scikit-learn metrics that accept sparse
    # input: 'cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan'.
    sim_matrix = 1 - pairwise_distances(
        texts_vec, training_vec, metric="cosine")  # euclidean works as well

    num_texts = texts_vec.shape[0]
    # Subtract 1 because each text itself joins its cluster, so only
    # cluster_size - 1 training texts need to be selected for it.
    cluster_size = cluster_size - 1
    ind_clustered_tweets = np.zeros([num_texts, cluster_size], dtype=int)
    for i in range(0, num_texts):
        indx = np.argpartition(sim_matrix[i], -cluster_size)[-cluster_size:]
        ind_clustered_tweets[i] = indx

    trainingset = np.array(trainingset)
    clustered_texts = []
    for i in range(0, num_texts):
        ind = ind_clustered_tweets[i]
        clustered_texts.append(texts[i] + ' ' + ' '.join(trainingset[ind]))

    import pickle
    print('Texts aggregated with the training data are saved to '
          './acc_tmp/clustering_texts_with_trainingset.p')
    pickle.dump(clustered_texts,
                open("./acc_tmp/clustering_texts_with_trainingset.p", "wb"))
    return (clustered_texts, range(num_texts))
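# Worked sketch of the top-k selection used above: np.argpartition moves the
# k largest similarities to the tail of the index array without fully sorting
# the row. The numbers are invented for illustration.
def _demo_argpartition_topk():
    row = np.array([0.1, 0.9, 0.4, 0.7, 0.2])  # similarities of one text to 5 training texts
    k = 2
    top_k = np.argpartition(row, -k)[-k:]  # indices of the 2 most similar texts
    print(top_k)  # {1, 3} in arbitrary order: the tail itself is not sorted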
def build_clustered_testdata_nearest(tweets, num_clusters):
    vectorizer = cst_vectorizer.StemmedTfidfVectorizer(**param)
    tweet_vec = vectorizer.fit_transform(tweets)

    from sklearn.metrics.pairwise import pairwise_distances
    sim_matrix = 1 - pairwise_distances(tweet_vec, metric="cosine")  # euclidean works as well

    num_tweets = tweet_vec.shape[0]
    # math.ceil can overshoot: e.g. 100 tweets over 21 clusters fails because
    # the last cluster ends up empty.
    num_tweets_in_cluster = math.ceil(num_tweets / num_clusters)
    ind_clustered_tweets = np.zeros([num_clusters, num_tweets_in_cluster],
                                    dtype=int)
    j = 0
    for i in range(0, num_tweets):
        if np.any(sim_matrix[i] != -np.inf) and j < num_clusters:
            indx = np.argpartition(
                sim_matrix[i],
                -num_tweets_in_cluster)[-num_tweets_in_cluster:]
            # Store -1 for tweets already consumed by an earlier cluster.
            ind_clustered_tweets[j] = [
                ind if sim_matrix[i, ind] != -np.inf else -1 for ind in indx
            ]
            # Mask the selected tweets so they cannot be picked again.
            sim_matrix[:, indx] = -np.inf
            sim_matrix[indx, :] = -np.inf
            j += 1
        elif j >= num_clusters:
            break
        else:
            continue

    tweets = np.array(tweets)
    clustered_tweets = []
    for i in range(0, num_clusters):
        ind = ind_clustered_tweets[i]
        ind_illegal = np.where(ind == -1)[0]  # positions of the -1 markers
        if len(ind_illegal) != 0:
            ind = np.delete(ind, ind_illegal)
        clustered_tweets.append(' '.join(tweets[ind]))

    import pickle
    print('Aggregated test data is saved to '
          './acc_tmp/aggregated_test_tweets_greedy.p')
    pickle.dump(clustered_tweets,
                open("./acc_tmp/aggregated_test_tweets_greedy.p", "wb"))
    return (clustered_tweets, [
        np.where(ind_clustered_tweets == tweets_id)[0][0]
        for tweets_id in range(0, num_tweets)
    ])
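# Toy sketch of the -inf masking trick above: rows and columns of
# already-assigned tweets are set to -inf so argpartition never selects them
# again. The matrix values are invented for illustration.
def _demo_inf_masking():
    sim = np.array([[1.0, 0.8, 0.1],
                    [0.8, 1.0, 0.2],
                    [0.1, 0.2, 1.0]])
    picked = np.array([0, 1])  # tweets consumed by the first cluster
    sim[:, picked] = -np.inf
    sim[picked, :] = -np.inf
    print(np.argpartition(sim[2], -1)[-1:])  # only tweet 2 itself remains selectable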
def build_clustered_testdata(tweets, num_cluster):
    vectorizer = cst_vectorizer.StemmedTfidfVectorizer(**param)
    tweet_vec = vectorizer.fit_transform(tweets)
    km = KMeans(n_clusters=num_cluster, init='k-means++', n_init=10, verbose=1)
    km.fit(tweet_vec)

    # Concatenate the tweets that fall into each k-means cluster.
    clustered_tweets = []
    for i in range(0, num_cluster):
        similar_indices = (km.labels_ == i).nonzero()[0]
        sent = ''
        for sid in similar_indices:
            sent = tweets[sid] + ' ' + sent
        clustered_tweets.append(sent)
    return clustered_tweets, km.labels_
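# Hedged usage sketch: km.labels_ aligns index-for-index with `tweets`, so
# clustered[labels[t]] is the aggregated document that contains tweet t.
# The sample tweets are invented for illustration.
def _demo_build_clustered_testdata():
    sample = ['good film', 'great film', 'awful film', 'bad film']
    clustered, labels = build_clustered_testdata(sample, num_cluster=2)
    for t, lab in enumerate(labels):
        assert sample[t] in clustered[lab]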
def create_ngram_model(params=None):
    print('start create_ngram_model...')
    # Stemming inside the preprocessor performed poorly:
    # tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor, ngram_range=(1, 2), analyzer='word', binary=False)
    # Use the custom StemmedTfidfVectorizer instead, which embeds the stemmer.
    tfidf_ngrams = cstv.StemmedTfidfVectorizer(preprocessor=preprocessor,
                                               ngram_range=(1, 3),
                                               analyzer='word',
                                               binary=False)
    clf = MultinomialNB()
    pipeline = Pipeline(steps=[('vect', tfidf_ngrams), ('clf', clf)])
    if params:
        pipeline.set_params(**params)
    return pipeline
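# Minimal usage sketch for the n-gram pipeline (texts and labels invented):
# fit on raw strings; the vectorizer and the classifier run in sequence.
def _demo_ngram_model():
    X = ['i love it', 'great stuff', 'i hate it', 'awful stuff']
    y = [1, 1, 0, 0]
    model = create_ngram_model()
    model.fit(X, y)
    print(model.predict(['really great']))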
def clustering_tweets(labeled_tweets, num_cluster):
    vectorizer = cst_vectorizer.StemmedTfidfVectorizer(**param)
    tweet_vec = vectorizer.fit_transform(labeled_tweets)
    km = KMeans(n_clusters=num_cluster, init='k-means++', n_init=3, verbose=1)
    km.fit(tweet_vec)

    # Concatenate the tweets assigned to each cluster into one document.
    clustered_tweets = []
    for i in range(0, num_cluster):
        similar_indices = (km.labels_ == i).nonzero()[0]
        sent = ''
        for sid in similar_indices:
            sent = labeled_tweets[sid] + ' ' + sent
        clustered_tweets.append(sent)
    return clustered_tweets
def create_union_model(params=None):
    print('start create_union_model...')
    tfidf_ngrams = cstv.StemmedTfidfVectorizer(preprocessor=preprocessor,
                                               ngram_range=(1, 3),
                                               analyzer='word',
                                               binary=False)
    ling_status = vec_est.LinguisticVectorizer()
    # Combine hand-crafted linguistic features with TF-IDF n-grams.
    all_features = FeatureUnion([('ling', ling_status),
                                 ('tfidf', tfidf_ngrams)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])
    if params:
        pipeline.set_params(**params)
    return pipeline
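# Hedged sketch of the `params` hook: pipeline and union steps are addressed
# with double-underscore keys. The values here are illustrative, not tuned.
def _demo_union_model_params():
    return create_union_model(params={
        'all__tfidf__ngram_range': (1, 2),  # TF-IDF step inside the FeatureUnion
        'clf__alpha': 0.5,                  # MultinomialNB smoothing
    })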
def get_candidate_dynamic(texts, trainingset, cluster_size, file_name):
    vectorizer = cst_vectorizer.StemmedTfidfVectorizer(**param)
    texts_vec = vectorizer.fit_transform(texts)
    training_vec = vectorizer.transform(trainingset)

    from sklearn.metrics.pairwise import pairwise_distances
    # sim_matrix[i, j] is the similarity between the ith row of texts_vec and
    # the jth row of training_vec. scikit-learn metrics that accept sparse
    # input: 'cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan'.
    sim_matrix = 1 - pairwise_distances(
        texts_vec, training_vec, metric="cosine")  # euclidean works as well

    num_texts = texts_vec.shape[0]
    # Subtract 1 because each text itself joins its cluster, so only
    # cluster_size - 1 training texts need to be selected for it.
    cluster_size = cluster_size - 1
    ind_clustered_tweets = np.zeros([num_texts, cluster_size], dtype=int)
    for i in range(0, num_texts):
        indx = np.argpartition(sim_matrix[i], -cluster_size)[-cluster_size:]
        ind_clustered_tweets[i] = indx

    trainingset = np.array(trainingset)
    clustered_texts = []
    extension_content = []
    for i in range(0, num_texts):
        ind = ind_clustered_tweets[i]
        clustered_texts.append(texts[i] + ' ' + ' '.join(trainingset[ind]))
        extension_content.append(' '.join(trainingset[ind]))

    import pickle
    # Recommended values for file_name are 'neg' and 'pos'. Results are
    # persisted via pickle rather than returned.
    print('Test data aggregated with the training data is saved as *.p files '
          'in ./data/extended_test_data/')
    pickle.dump(
        clustered_texts,
        open("./data/extended_test_data/" + file_name + "_clustered_texts.p",
             "wb"))
    pickle.dump(
        extension_content,
        open("./data/extended_test_data/" + file_name + "_extantion_content.p",
             "wb"))


# Executing the functions above requires the three variables defined below.
data_dir, fname = './data', 'smilarity_cluster_testdata.csv'
posts = []
with open(os.path.join(data_dir, fname), 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t', quotechar='"')
    for post in reader:
        posts.extend(post)

# vectorizer = CountVectorizer(min_df=1, stop_words='english')
# min_df controls how the vectorizer treats infrequent terms (minimum document
# frequency): an integer drops terms appearing in fewer documents than the
# threshold; a fraction drops terms appearing in a smaller share of the corpus.
# stop_words='english' enables the built-in English stop-word list; None disables it.
# A stemming CountVectorizer also works:
# vectorizer = cst_vectorizer.StemmedCountVectorizer(min_df=1, stop_words='english')
# Here, use StemmedTfidfVectorizer; decode_error replaces charset_error from
# older scikit-learn versions.
vectorizer = cst_vectorizer.StemmedTfidfVectorizer(min_df=1,
                                                   stop_words='english',
                                                   decode_error="ignore")
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape

# K-means clustering
num_clusters = 5
km = KMeans(n_clusters=num_clusters, init='random', n_init=1, verbose=1)
km.fit(X_train)
print(km.labels_, km.labels_.shape, num_samples)
print(vectorizer.get_feature_names_out(), num_features, num_samples)

new_post = "the underlying caste system in america . it's a scathing portrayal"
new_post_vec = vectorizer.transform([new_post])
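# Hedged continuation of the script: km.predict assigns the vectorized new
# post to the nearest of the num_clusters learned centroids. Cluster ids are
# arbitrary and will vary between runs with init='random', n_init=1.
new_post_label = km.predict(new_post_vec)[0]
print('new post assigned to cluster', new_post_label)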