from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def main():
    target_names = ['alt.atheism',
                    'comp.graphics',
                    'rec.autos',
                    'sci.med',
                    'talk.politics.guns']
    news_train = fetch_20newsgroups(subset='train', categories=target_names)
    train_data = news_train.data
    print(len(train_data))
    print(type(train_data), type(train_data[0]), "\n")  # <class 'list'> <class 'str'>

    processed_data = [" ".join(pre_processing(data)) for data in train_data]
    stopwords = load_stopwords()

    max_df = 0.3
    min_df = 0.005

    # #### Learn Bag-of-words (BoW)
    # count_vec = CountVectorizer(stop_words='english')
    count_vec = CountVectorizer(stop_words=stopwords, max_df=max_df, min_df=min_df,
                                max_features=FEATURE_NUM)
    count_vec.fit(processed_data)
    data_bow = count_vec.transform(processed_data)
    # get_feature_names() was removed in scikit-learn 1.2;
    # use get_feature_names_out() on newer releases
    feature_names_bow = count_vec.get_feature_names()
    print(len(processed_data), data_bow.shape, type(data_bow))

    # #### Learn TF-IDF model
    # tfidf_vec = TfidfVectorizer(stop_words='english')
    tfidf_vec = TfidfVectorizer(stop_words=stopwords, max_df=max_df, min_df=min_df,
                                max_features=FEATURE_NUM)
    tfidf_vec.fit(processed_data)
    data_tfidf = tfidf_vec.transform(processed_data)
    feature_names_tfidf = tfidf_vec.get_feature_names()
    print(len(processed_data), data_tfidf.shape, type(data_tfidf))

    # #### Train LDA models for BoW
    num_topics = 5
    num_topic_word = 20
    lda_bow = LatentDirichletAllocation(n_components=num_topics, max_iter=5,
                                        learning_method='online',
                                        learning_offset=50., random_state=0)
    lda_bow.fit(data_bow)
    display_topics(lda_bow, feature_names_bow, num_topic_word, "BoW_word_cloud")
    print('\n\n\n')

    # #### Train LDA models for TF-IDF
    lda_tfidf = LatentDirichletAllocation(n_components=num_topics, max_iter=5,
                                          learning_method='online',
                                          learning_offset=50., random_state=0)
    lda_tfidf.fit(data_tfidf)
    display_topics(lda_tfidf, feature_names_tfidf, num_topic_word, "TF-IDF_word_cloud")
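# The scripts in this section call a few project helpers (pre_processing,
# load_stopwords, display_topics) and a FEATURE_NUM constant whose definitions
# are not shown here. Below is a minimal, hypothetical sketch of what they
# might look like, assuming pre_processing returns a token list and the
# stopword file holds one word per line; the real implementations (which also
# seem to render word-cloud images, given the "BoW_word_cloud" tag) may differ.
import re

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

FEATURE_NUM = 1000  # assumed vocabulary cap, not taken from the original


def pre_processing(raw):
    """Lowercase, strip non-letters, tokenize, and stem one document."""
    stemmer = PorterStemmer()
    text = re.sub(r'[^a-zA-Z]+', ' ', raw.lower())
    return [stemmer.stem(tok) for tok in word_tokenize(text) if len(tok) > 2]


def load_stopwords(path='stopwords.txt'):  # path is an assumption
    """Read one stopword per line into a list."""
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


def display_topics(model, feature_names, num_top_words, tag):
    """Print the top words of each fitted LDA topic, labelled with `tag`."""
    for topic_idx, topic in enumerate(model.components_):
        top = topic.argsort()[:-num_top_words - 1:-1]
        print("%s topic %d: %s"
              % (tag, topic_idx, " ".join(feature_names[i] for i in top)))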
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE


def main():
    target_names = ['alt.atheism',
                    'comp.graphics',
                    'rec.autos',
                    'sci.med',
                    'talk.politics.guns']
    news_train = fetch_20newsgroups(subset='train', categories=target_names,
                                    remove=('headers', 'footers', 'quotes'))
    # news_train = fetch_20newsgroups(subset='train')
    train_data = news_train.data
    print(len(train_data))
    print(type(train_data), type(train_data[0]), "\n")  # <class 'list'> <class 'str'>

    # join the token lists back into strings, as the vectorizer expects raw text
    processed_data = [" ".join(pre_processing(data)) for data in train_data]
    stopwords = load_stopwords()

    # #######################################################################
    # #### Learn TF-IDF model
    tfidf_vec = TfidfVectorizer(stop_words=stopwords)
    tfidf_vec.fit(processed_data)
    data_tfidf = tfidf_vec.transform(processed_data)
    feature_names_tfidf = tfidf_vec.get_feature_names()
    print(len(processed_data), data_tfidf.shape, type(data_tfidf))

    print('Start K-means for TF-IDF:')
    weight_tfidf = data_tfidf.toarray()
    clf_tfidf = KMeans(n_clusters=5)
    s = clf_tfidf.fit(weight_tfidf)
    print(s)

    # The 5 cluster centers
    print(clf_tfidf.cluster_centers_)

    # # The cluster each sample belongs to
    # print(clf.labels_)
    # i = 1
    # while i <= len(clf.labels_):
    #     print(i, clf.labels_[i - 1])
    #     i = i + 1

    # Used to judge whether the number of clusters fits: the smaller the
    # inertia, the tighter the clusters; pick the cluster count at the elbow.
    # Sum of distances of samples to their closest cluster center
    print("Inertia of TF-IDF", clf_tfidf.inertia_)

    # Reduce the weight matrix to 2-D with t-SNE; it separates clusters
    # better than PCA but is much slower.
    tsne = TSNE(n_components=2)
    decomposition_data = tsne.fit_transform(weight_tfidf)

    x, y = [], []
    for i in decomposition_data:
        x.append(i[0])
        y.append(i[1])

    plt.figure(figsize=(10, 10))
    plt.scatter(x, y, c=clf_tfidf.labels_, marker="x")
    plt.xticks(())
    plt.yticks(())
    # save before show(): show() hands the figure to the GUI, and a later
    # savefig() would write an empty canvas
    plt.savefig('./Chart/kMeans_sample_TFIDF.png')
    plt.show()
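# The inertia printed above is only meaningful relative to other cluster
# counts. A small sketch of the elbow method the comments hint at, sweeping k
# and plotting inertia (the k range and output path are illustrative
# assumptions):
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans


def elbow_curve(weight, k_values=range(2, 11)):
    """Fit KMeans for each k and plot inertia so the elbow can be read off."""
    inertias = [KMeans(n_clusters=k).fit(weight).inertia_ for k in k_values]
    plt.plot(list(k_values), inertias, marker='o')
    plt.xlabel('number of clusters k')
    plt.ylabel('inertia')
    plt.savefig('./Chart/kMeans_elbow_TFIDF.png')
    plt.close()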
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE


def main():
    target_names = ['alt.atheism',
                    'comp.graphics',
                    'rec.autos',
                    'sci.med',
                    'talk.politics.guns']
    news_train = fetch_20newsgroups(subset='train', categories=target_names,
                                    remove=('headers', 'footers', 'quotes'))
    # news_train = fetch_20newsgroups(subset='train')
    train_data = news_train.data
    print(len(train_data))
    print(type(train_data), type(train_data[0]), "\n")  # <class 'list'> <class 'str'>

    # join the token lists back into strings, as the vectorizer expects raw text
    processed_data = [" ".join(pre_processing(data)) for data in train_data]
    stopwords = load_stopwords()

    # #######################################################################
    # #### Learn Bag-of-words (BoW)
    count_vec = CountVectorizer(stop_words=stopwords)
    count_vec.fit(processed_data)
    data_bow = count_vec.transform(processed_data)
    feature_names_bow = count_vec.get_feature_names()
    print(len(processed_data), data_bow.shape, type(data_bow))
    # pprint(count_vec.get_feature_names())
    # pprint(count_vec.vocabulary_)

    print('Start K-means for BoW:')
    num_clusters = 5
    weight_bow = data_bow.toarray()
    clf_bow = KMeans(n_clusters=num_clusters)
    s = clf_bow.fit(weight_bow)
    print(s)

    # The 5 cluster centers
    print(clf_bow.cluster_centers_)

    # Sum of distances of samples to their closest cluster center
    print("Inertia of BoW", clf_bow.inertia_)

    # Reduce the weight matrix to 2-D with t-SNE; it separates clusters
    # better than PCA but is much slower.
    tsne = TSNE(n_components=2)
    decomposition_data = tsne.fit_transform(weight_bow)

    x, y = [], []
    for i in decomposition_data:
        x.append(i[0])
        y.append(i[1])

    plt.figure(figsize=(10, 10))
    plt.scatter(x, y, c=clf_bow.labels_, marker="x")
    plt.xticks(())
    plt.yticks(())
    # save before show(): show() hands the figure to the GUI, and a later
    # savefig() would write an empty canvas
    plt.savefig('./Chart/kMeans_sample_BoW.png')
    plt.show()
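# Because the true newsgroup labels of the training split are known, both
# clusterings can also be scored externally. A minimal sketch using standard
# scikit-learn metrics (not part of the original scripts):
from sklearn import metrics


def report_clustering(labels_true, labels_pred, tag):
    """Compare cluster assignments against the known category labels."""
    # Both scores are invariant to how KMeans numbers its clusters, so no
    # matching between cluster ids and category ids is needed.
    print(tag, "ARI:", metrics.adjusted_rand_score(labels_true, labels_pred))
    print(tag, "homogeneity:", metrics.homogeneity_score(labels_true, labels_pred))

# e.g. report_clustering(news_train.target, clf_bow.labels_, "BoW")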
from pprint import pprint

import pyLDAvis
import pyLDAvis.gensim  # pyLDAvis.gensim_models in pyLDAvis >= 3.0
from gensim import corpora
from gensim.models import LdaModel
from sklearn.datasets import fetch_20newsgroups


def main():
    target_names = [
        'alt.atheism',
        'comp.graphics',
        'rec.autos',
        'sci.med',
        'talk.politics.guns'
    ]
    remove = ('headers', 'footers', 'quotes')
    news_train = fetch_20newsgroups(subset='train', categories=target_names,
                                    remove=remove)
    train_data = news_train.data
    print(len(train_data))
    print(type(train_data), type(train_data[0]), "\n")  # <class 'list'> <class 'str'>

    # gensim works on token lists, so no " ".join() here
    processed_data = [pre_processing(data) for data in train_data]

    # Create Dictionary
    id2word = corpora.Dictionary(processed_data)
    # Create Corpus
    texts = processed_data
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    lda_model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=5,
                         random_state=100,
                         update_every=1,
                         chunksize=100,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True)
    pprint(lda_model.print_topics())

    # Visualize the topics
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(vis, 'lda.html')
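# Once trained, the gensim model can score unseen text through the same
# dictionary. A short usage sketch, assuming `id2word` and `lda_model` from
# main() are in scope (the sample sentence is illustrative):
new_doc = pre_processing("The new graphics card renders 3d images quickly")
new_bow = id2word.doc2bow(new_doc)
# get_document_topics returns (topic_id, probability) pairs for the document
pprint(lda_model.get_document_topics(new_bow))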
import gensim
from sklearn.datasets import fetch_20newsgroups


def main():
    target_names = ['alt.atheism']
    news_train = fetch_20newsgroups(subset='train', categories=target_names)
    train_data = news_train.data
    print(len(train_data))
    print(type(train_data), type(train_data[0]), "\n")  # <class 'list'> <class 'str'>

    processed_data = [pre_processing(data) for data in train_data]
    # for i in range(5):
    #     print(processed_data[i])
    #     print("\n\n")
    # print("\n\n\n\n\n")

    max_epochs = 500
    vec_size = 20
    alpha = 0.025

    # Doc2Vec expects TaggedDocument objects rather than plain token lists
    tagged_data = [gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[str(i)])
                   for i, tokens in enumerate(processed_data)]

    # dm=0 selects PV-DBOW. The original hard-coded size=20 and never used
    # max_epochs; the declared hyperparameters are wired in here (size/iter
    # were renamed vector_size/epochs in gensim 3.4+).
    model = gensim.models.Doc2Vec(tagged_data, dm=0, alpha=0.1,
                                  vector_size=vec_size, min_alpha=alpha,
                                  epochs=max_epochs)
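# A short usage sketch for the trained model: embed an unseen document and
# look up its nearest training documents (assumes `model` from main() is in
# scope; the query text is illustrative).
query_tokens = pre_processing("arguments about god and religion")
query_vec = model.infer_vector(query_tokens)
# model.dv in gensim >= 4 (model.docvecs on older releases)
print(model.dv.most_similar([query_vec], topn=5))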
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer


def main():
    # news_train = fetch_20newsgroups(subset='train')
    # print("Number of articles: " + str(len(news_train.data)))
    # print("Number of different categories: " + str(len(news_train.target_names)))
    # # Number of articles: 11314
    # # Number of different categories: 20
    # pprint(list(news_train.target_names))

    # target_names = ['alt.atheism',
    #                 'comp.graphics',
    #                 'comp.os.ms-windows.misc',
    #                 'comp.sys.ibm.pc.hardware',
    #                 'comp.sys.mac.hardware',
    #                 'comp.windows.x',
    #                 'misc.forsale',
    #                 'rec.autos',
    #                 'rec.motorcycles',
    #                 'rec.sport.baseball',
    #                 'rec.sport.hockey',
    #                 'sci.crypt',
    #                 'sci.electronics',
    #                 'sci.med',
    #                 'sci.space',
    #                 'soc.religion.christian',
    #                 'talk.politics.guns',
    #                 'talk.politics.mideast',
    #                 'talk.politics.misc',
    #                 'talk.religion.misc']
    # pie_chart = []
    # for i in range(len(target_names)):
    #     parts = [target_names[i]]
    #     part = fetch_20newsgroups(subset='train', categories=parts)
    #     # print(parts, part.filenames.shape)
    #     pie_chart.append(len(part.filenames))
    # x = pie_chart
    # plt.pie(x, labels=target_names, autopct='%.1f%%', startangle=90, counterclock=False)
    # plt.title("Pie Chart for 20 Newsgroup", fontsize='large', fontweight='bold')
    # plt.show()
    # plt.close()

    # parts = ['alt.atheism']  # 480
    # part = fetch_20newsgroups(subset='train', categories=parts)
    # # part.data is a list
    # raw = part.data[0]
    # print(raw, "--------------------------------------------------------\n\n\n\n")
    #
    # processed = pre_processing(raw)
    # print(processed)

    news_train = fetch_20newsgroups(subset='train')
    train_data = news_train.data
    print(len(train_data))
    print(type(train_data), type(train_data[0]), "\n")  # <class 'list'> <class 'str'>

    processed_data = [" ".join(pre_processing(data)) for data in train_data]
    stopwords = load_stopwords()

    # #### Learn Bag-of-words (BoW)
    max_df = 0.3
    min_df = 0.00005
    count_vec = CountVectorizer(strip_accents='unicode', stop_words=stopwords,
                                max_df=max_df, min_df=min_df)
    count_vec.fit(processed_data)
    data_bow = count_vec.transform(processed_data)
    feature_names_bow = count_vec.get_feature_names()
    print(len(processed_data), data_bow.shape, type(data_bow))

    freq_list = data_bow.toarray().sum(axis=0).tolist()
    print('Total number of features:', len(freq_list), type(freq_list))
    # Total number of features: 90517 <class 'list'>
    # first few counts: [3, 239, 75, 2, 1, 1, 1, 2, 2, 1]

    words = count_vec.vocabulary_
    index2words = dict(zip(words.values(), words.keys()))
    top_dict = {}
    for idx, freq in enumerate(freq_list):
        top_dict[index2words[idx]] = freq

    k = 20
    total = len(freq_list)  # renamed from `all`, which shadowed the builtin
    top_k_list = sorted(top_dict.items(), key=lambda l: l[1], reverse=True)[:k]
    # m1 = 100
    # m2 = 130
    # top_k_list = sorted(top_dict.items(), key=lambda l: l[1], reverse=True)[m1:m2]
    print(top_k_list)

    x = range(len(top_k_list))
    x_list = [t[1] for t in top_k_list]
    y_list = [t[0] for t in top_k_list]
    plt.bar(x, x_list)
    plt.xticks(x, y_list, rotation=45)
    plt.tick_params(labelsize=8)
    # %g keeps min_df = 0.00005 readable (the original %.3f printed it as 0.000)
    plt.title("Top %d / %d Words with max_df = %.2f and min_df = %g" % (k, total, max_df, min_df),
              fontsize='large', fontweight='bold')
    # plt.title("Top %d-%d / %d Words with max_df = %.2f and min_df = %g" % (m1, m2, total, max_df, min_df),
    #           fontsize='large', fontweight='bold')
    plt.show()
    plt.close()
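# The bar-chart title above varies with max_df and min_df. A quick sketch
# showing how these two knobs prune the vocabulary, which helps pick values
# before plotting (the grid of settings is an illustrative assumption):
from sklearn.feature_extraction.text import CountVectorizer


def vocab_sizes(docs, stopwords):
    """Print the vocabulary size for a few (max_df, min_df) combinations."""
    for max_df in (1.0, 0.5, 0.3):
        # min_df=1 keeps every term; the float values drop rare terms
        for min_df in (1, 0.00005, 0.005):
            vec = CountVectorizer(stop_words=stopwords, max_df=max_df, min_df=min_df)
            vec.fit(docs)
            print("max_df=%g min_df=%g -> %d terms"
                  % (max_df, min_df, len(vec.vocabulary_)))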