Example #1
def clustering_tweets_hc(labeled_tweets, num_cluster):
    vectorizer = cst_vectorizer.StemmedTfidfVectorizer(**param)
    tweet_vec = vectorizer.fit_transform(labeled_tweets).toarray()
    # print(tweet_vec)
    n_clusters = num_cluster

    from sklearn.neighbors import kneighbors_graph

    knn_graph = kneighbors_graph(tweet_vec, 1, include_self=False)
    # print(knn_graph)

    connectivity = knn_graph
    from sklearn.cluster import AgglomerativeClustering

    # Ward-linkage hierarchical clustering constrained by the 1-NN connectivity graph
    model = AgglomerativeClustering(linkage='ward',
                                    connectivity=connectivity,
                                    n_clusters=n_clusters)
    model.fit(tweet_vec)
    c = model.labels_
    # print(c,len(c))

    clustered_tweets = []
    for i in range(0, num_cluster):
        similar_indices = (c == i).nonzero()[0]
        sent = ''
        for sid in similar_indices:
            sent = labeled_tweets[sid] + ' ' + sent
        clustered_tweets.append(sent)
    return clustered_tweets
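For reference, a minimal call sketch; it assumes the module-level cst_vectorizer import and param dict that the function expects, and the toy tweet list is purely illustrative:

labeled_tweets = [
    'great game tonight', 'what a match', 'flight delayed again',
    'terrible airline service', 'loved the concert',
]
# Group the five toy tweets into two hierarchical clusters and print the
# concatenated text of each cluster.
for i, cluster_text in enumerate(clustering_tweets_hc(labeled_tweets, num_cluster=2)):
    print(i, cluster_text)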
Example #2
def clustering_texts_using_trainingset(texts, trainingset, cluster_size):
    vectorizer = cst_vectorizer.StemmedTfidfVectorizer(**param)
    texts_vec = vectorizer.fit_transform(texts)
    training_vec = vectorizer.transform(trainingset)
    from sklearn.metrics.pairwise import pairwise_distances
    # sim_matrix(i, j) is the distance between the ith array from X and the jth array from Y.
    # From scikit-learn: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, ‘manhattan’]. These metrics support sparse matrix inputs.
    sim_matrix = 1 - pairwise_distances(
        texts_vec, training_vec, metric="cosine")  # euclidean as well
    num_texts = texts_vec.shape[0]
    cluster_size = cluster_size - 1  # subtract 1 because the text itself is appended later, so only cluster_size-1 training texts need to be selected
    ind_clustered_tweets = np.zeros([num_texts, cluster_size], dtype=int)

    for i in range(0, num_texts):
        indx = np.argpartition(sim_matrix[i], -cluster_size)[-cluster_size:]
        ind_clustered_tweets[i] = indx

    trainingset = np.array(trainingset)
    clustered_texts = []
    for i in range(0, num_texts):
        ind = ind_clustered_tweets[i]
        clustered_texts.append(texts[i] + ' ' + ' '.join(trainingset[ind]))

    import pickle

    print(
        'Texts aggregated with the training data are saved to: ./acc_tmp/clustering_texts_with_trainingset.p'
    )
    pickle.dump(clustered_texts,
                open("./acc_tmp/clustering_texts_with_trainingset.p", "wb"))
    return (clustered_texts, range(num_texts))
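The per-row selection above relies on np.argpartition, which returns the indices of the k largest similarities without a full sort; a self-contained illustration:

import numpy as np

similarities = np.array([0.10, 0.80, 0.30, 0.95, 0.20])
k = 2
# Indices of the k largest values (in no particular order), as used per row above
top_k = np.argpartition(similarities, -k)[-k:]
print(top_k, similarities[top_k])  # -> indices 1 and 3, values 0.8 and 0.95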
Example #3
def build_clustered_testdata_nearest(tweets, num_clusters):
    vectorizer = cst_vectorizer.StemmedTfidfVectorizer(**param)
    tweet_vec = vectorizer.fit_transform(tweets)

    from sklearn.metrics.pairwise import pairwise_distances

    sim_matrix = 1 - pairwise_distances(tweet_vec,
                                        metric="cosine")  # euclidean as well
    num_tweets = tweet_vec.shape[0]

    num_tweets_in_cluster = math.ceil(
        num_tweets /
        num_clusters)  # e.g. 100 tweets over 21 clusters would otherwise fail: the last cluster ends up empty

    ind_clustered_tweets = np.zeros([num_clusters, num_tweets_in_cluster],
                                    dtype=int)
    j = 0
    for i in range(0, num_tweets):
        if np.any(sim_matrix[i] != -np.inf) and j < num_clusters:
            indx = np.argpartition(
                sim_matrix[i], -num_tweets_in_cluster)[-num_tweets_in_cluster:]
            ind_clustered_tweets[j] = [
                ind if sim_matrix[i, ind] != -np.inf else -1 for ind in indx
            ]
            # ind_clustered_tweets[j]=indx
            sim_matrix[:, indx] = -np.inf
            sim_matrix[indx, :] = -np.inf
            j += 1

        elif j >= num_clusters:
            break
        else:
            continue

    tweets = np.array(tweets)
    clustered_tweets = []
    for i in range(0, num_clusters):
        ind = ind_clustered_tweets[i]
        ind_illegal = np.where(ind == -1)[0]  # index of -1
        # print(ind_illegal)
        if len(ind_illegal) != 0:
            ind = np.delete(ind, ind_illegal)

        # print(ind)
        clustered_tweets.append(' '.join(tweets[ind]))
    import pickle

    print('The aggregated test data is saved to: ./acc_tmp/aggregated_test_tweets_greedy.p')
    pickle.dump(clustered_tweets,
                open("./acc_tmp/aggregated_test_tweets_greedy.p", "wb"))
    return (clustered_tweets, [
        np.where(ind_clustered_tweets == tweets_id)[0][0]
        for tweets_id in range(0, num_tweets)
    ])
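The -np.inf assignments above implement a simple greedy selection without replacement; a self-contained sketch of that masking step:

import numpy as np

sim = np.array([[1.0, 0.9, 0.1],
                [0.9, 1.0, 0.2],
                [0.1, 0.2, 1.0]])
assigned = np.array([0, 1])  # suppose tweets 0 and 1 form the first cluster
# Mask their rows and columns so later iterations can never pick them again
sim[:, assigned] = -np.inf
sim[assigned, :] = -np.inf
print(sim)  # only tweet 2 keeps finite similarities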
Example #4
def build_clustered_testdata(tweets, num_cluster):
    vectorizer = cst_vectorizer.StemmedTfidfVectorizer(**param)
    tweet_vec = vectorizer.fit_transform(tweets)
    km = KMeans(n_clusters=num_cluster, init='k-means++', n_init=10, verbose=1)
    km.fit(tweet_vec)
    clustered_tweets = []
    for i in range(0, num_cluster):
        similar_indices = (km.labels_ == i).nonzero()[0]
        sent = ''
        for sid in similar_indices:
            sent = tweets[sid] + ' ' + sent
        clustered_tweets.append(sent)
    return clustered_tweets, km.labels_
Example #5
def create_ngram_model(params=None):
    print('start create_ngram_model...')
    # Stemming inside the preprocessor did not work well:
    # tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor, ngram_range=(1, 2), analyzer='word', binary=False)
    # Use the custom StemmedTfidfVectorizer, which builds stemming in
    tfidf_ngrams = cstv.StemmedTfidfVectorizer(preprocessor=preprocessor,
                                               ngram_range=(1, 3),
                                               analyzer='word',
                                               binary=False)
    clf = MultinomialNB()
    pipeline = Pipeline(steps=[('vect', tfidf_ngrams), ('clf', clf)])
    # print(sorted(tfidf_ngrams.get_stop_words()))

    return pipeline
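A self-contained sketch of the same pipeline shape, substituting scikit-learn's stock TfidfVectorizer for the project-specific cstv.StemmedTfidfVectorizer and preprocessor; the toy data is purely illustrative:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(ngram_range=(1, 3), analyzer='word', binary=False)),
    ('clf', MultinomialNB()),
])
# Fit on a few toy labeled texts, then classify a new one
pipeline.fit(['good movie', 'great film', 'bad movie', 'awful plot'], [1, 1, 0, 0])
print(pipeline.predict(['really good film']))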
Example #6
def clustering_tweets(labeled_tweets, num_cluster):
    vectorizer = cst_vectorizer.StemmedTfidfVectorizer(**param)
    tweet_vec = vectorizer.fit_transform(labeled_tweets)
    km = KMeans(n_clusters=num_cluster,
                precompute_distances='auto',  # deprecated and removed in newer scikit-learn; drop this argument on >= 1.0
                init='k-means++',
                n_init=3,
                verbose=1)
    km.fit(tweet_vec)
    clustered_tweets = []
    for i in range(0, num_cluster):
        similar_indices = (km.labels_ == i).nonzero()[0]
        sent = ''
        for sid in similar_indices:
            sent = labeled_tweets[sid] + ' ' + sent
        clustered_tweets.append(sent)
    return clustered_tweets
Example #7
def create_union_model(params=None):
    print('start create_union_model...')
    tfidf_ngrams = cstv.StemmedTfidfVectorizer(preprocessor=preprocessor,
                                               ngram_range=(1, 3),
                                               analyzer='word',
                                               binary=False)

    ling_status = vec_est.LinguisticVectorizer()
    all_features = FeatureUnion([('ling', ling_status),
                                 ('tfidf', tfidf_ngrams)])

    clf = MultinomialNB()
    pipeline = Pipeline([("all", all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
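The params dict is forwarded to Pipeline.set_params, so keys follow scikit-learn's step__parameter convention; a hypothetical parameter dict using the step names defined above:

# Hypothetical parameter dict: keys address nested estimators by step name
params = {
    'all__tfidf__ngram_range': (1, 2),  # the 'tfidf' transformer inside the 'all' FeatureUnion
    'clf__alpha': 0.5,                  # smoothing parameter of the MultinomialNB step
}
pipeline = create_union_model(params)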
Example #8
def get_candidate_dynamic(texts, trainingset, cluster_size, file_name):
    vectorizer = cst_vectorizer.StemmedTfidfVectorizer(**param)
    texts_vec = vectorizer.fit_transform(texts)
    training_vec = vectorizer.transform(trainingset)
    from sklearn.metrics.pairwise import pairwise_distances
    # sim_matrix(i, j) is the distance between the ith array from X and the jth array from Y.
    # From scikit-learn: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, ‘manhattan’]. These metrics support sparse matrix inputs.
    sim_matrix = 1 - pairwise_distances(
        texts_vec, training_vec, metric="cosine")  # euclidean as well
    num_texts = texts_vec.shape[0]
    cluster_size = cluster_size - 1  # subtract 1 because the text itself is appended later, so only cluster_size-1 training texts need to be selected
    ind_clustered_tweets = np.zeros([num_texts, cluster_size], dtype=int)

    for i in range(0, num_texts):
        indx = np.argpartition(sim_matrix[i], -cluster_size)[-cluster_size:]
        ind_clustered_tweets[i] = indx

    trainingset = np.array(trainingset)
    clustered_texts = []
    extantion_content = []
    for i in range(0, num_texts):
        ind = ind_clustered_tweets[i]
        clustered_texts.append(texts[i] + ' ' + ' '.join(trainingset[ind]))
        extantion_content.append(' '.join(trainingset[ind]))

    import pickle
    # Recommended values for file_name are 'neg' and 'pos'
    print('Test data aggregated with the training data is saved to *.p files under ./data/extended_test_data/')
    pickle.dump(
        clustered_texts,
        open("./data/extended_test_data/" + file_name + "_clustered_texts.p",
             "wb"))
    pickle.dump(
        extantion_content,
        open("./data/extended_test_data/" + file_name + "_extantion_content.p",
             "wb"))


# Running the above function requires three kinds of variables
Example #9
data_dir, f = './data', 'smilarity_cluster_testdata.csv'
posts = []
with open(os.path.join(data_dir, f), 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t', quotechar='"')
    for post in reader:
        posts.extend(post)

#vectorizer = CountVectorizer(min_df=1, stop_words='english')  # min_df controls how CountVectorizer handles infrequent words (minimum document frequency):
# an integer drops every word that appears in fewer documents than that count; a fraction drops words whose document proportion in the whole dataset is smaller
# stop_words='english' enables the built-in English stop word list; None disables stop word removal

# Alternative vectorizer: StemmedCountVectorizer
# vectorizer=cst_vectorizer.StemmedCountVectorizer(min_df=1,stop_words='english')

# Switch to StemmedTfidfVectorizer
vectorizer = cst_vectorizer.StemmedTfidfVectorizer(min_df=1, stop_words='english', decode_error="ignore")  # decode_error replaces charset_error from older scikit-learn versions

X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape

# K-means clustering
num_clusters = 5
km = KMeans(n_clusters=num_clusters, init='random', n_init=1, verbose=1)
km.fit(X_train)
print(km.labels_,km.labels_.shape, num_samples)


print(vectorizer.get_feature_names(), num_features, num_samples)  # on scikit-learn >= 1.2, use get_feature_names_out()

new_post = "the underlying caste system in america . it's a scathing portrayal"
new_post_vec = vectorizer.transform([new_post])
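One natural continuation, not part of the original snippet, is to assign the new post to one of the learned clusters and inspect the posts that share it:

# Predict the cluster of the new post and list up to five posts from the same cluster
new_post_label = km.predict(new_post_vec)[0]
similar_indices = (km.labels_ == new_post_label).nonzero()[0]
print('new post assigned to cluster', new_post_label)
for i in similar_indices[:5]:
    print(posts[i])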