def show_word_pie():
    """Draw a pie chart of word-cluster sizes from the saved word_df.csv.

    Each slice is one word cluster; its label is the first five words of
    that cluster.
    """
    word_df = news_pandas.load_news(os.path.join(results_path, 'word_df.csv'))
    # 'wordvec' was persisted as its repr; eval() turns it back into a list.
    word_df['wordvec'] = word_df['wordvec'].map(eval)
    n_clusters = counter.get_num_of_value_no_repeat(word_df['word_label'])
    cluster_sizes = []
    cluster_labels = []
    for cluster_idx in range(n_clusters):
        members = word_df[word_df['word_label'] == cluster_idx]
        cluster_sizes.append(members.shape[0])
        cluster_labels.append(members['word'][:5].tolist())
    drawing.draw_clustering_analysis_pie(n_clusters, cluster_sizes, cluster_labels)
def show_hot_barh():
    """Draw a horizontal bar chart of hot-topic cluster sizes.

    Loads the clustered, outlier-free news saved by ``cluster_content``;
    shows a message box and returns if clustering has not been run yet.
    Each bar is one rank (hot topic); its tick label is that cluster's
    top-10 most common words plus the rank number.
    """
    try:
        df_non_outliers = news_pandas.load_news(
            os.path.join(results_path, 'news_non_outliers.csv'))
        # 'content_cut' was persisted as the repr of a token list.
        df_non_outliers['content_cut'] = df_non_outliers['content_cut'].map(eval)
    except FileNotFoundError:
        messagebox.showinfo('Message', '请先对新闻内容文本进行聚类!')
        return
    rank_num = counter.get_num_of_value_no_repeat(df_non_outliers['rank'])
    # Ranks are 1-based (assigned by modeling.label2rank in cluster_content).
    value = [
        df_non_outliers[df_non_outliers['rank'] == i].shape[0]
        for i in range(1, rank_num + 1)
    ]
    yticks1 = [
        str(
            counter.get_most_common_words(
                df_non_outliers[df_non_outliers['rank'] == i]['content_cut'],
                top_n=10)) + str(i)
        for i in range(1, rank_num + 1)
    ]
    # Bug fix: this draws a bar chart, so the title must say 条形图 (bar
    # chart), not 饼图 (pie chart).
    drawing.draw_clustering_analysis_barh(rank_num, value, yticks1,
                                          title='热点新闻分布条形图')
def show_word_cluster_result():
    """Plot word clusters in 2-D after PCA (3 comps) + t-SNE (2 comps)."""
    df = news_pandas.load_news(os.path.join(results_path, 'word_df.csv'))
    # Restore the persisted word-vector lists from their string form.
    df['wordvec'] = df['wordvec'].map(eval)
    vectors = df['wordvec'].tolist()
    labels = df['word_label'].tolist()
    reduced = modeling.feature_reduction(
        vectors, pca_n_components=3, tsne_n_components=2)
    drawing.draw_clustering_result(reduced, labels)
def show_cluster_result():
    """Plot the saved 2-D PCA+t-SNE embedding of the clustered news.

    Prompts the user (and returns) if clustering has not been run yet.
    """
    try:
        clustered = news_pandas.load_news(
            os.path.join(results_path, 'news_non_outliers.csv'))
        # 'pca_tsne' was persisted as the repr of a coordinate list.
        clustered['pca_tsne'] = clustered['pca_tsne'].map(eval)
    except FileNotFoundError:
        messagebox.showinfo('Message', '请先对新闻内容文本进行聚类!')
        return
    drawing.draw_clustering_result(clustered['pca_tsne'], clustered['label'])
def cluster_content():
    """Cluster the preprocessed news content with DBSCAN and persist results.

    Pipeline: read eps / min_samples from the GUI entries -> load the
    segmented news -> build a TF-IDF matrix -> DBSCAN (cosine metric) ->
    save all labeled news, then reduce the non-outlier rows with PCA+t-SNE
    and save them separately. Updates the global ``hot_num`` counter.
    Shows a message box and returns early on missing parameters, missing
    input file, or an all-outlier clustering result.
    """
    eps_var = Entry_Eps.get()
    min_samples_var = Entry_MinSamples.get()
    if eps_var == '' or min_samples_var == '':
        messagebox.showinfo('Message', '请输全聚类参数!')
        return
    eps_var = float(eps_var)
    min_samples_var = int(min_samples_var)
    try:
        df = news_pandas.load_news(os.path.join(temp_news_path, 'news_cut.csv'))
        # 'content_cut' is the repr of a token list; 'content_' must be str
        # for the vectorizer.
        df['content_cut'] = df['content_cut'].map(eval)
        df['content_'] = df['content_'].map(str)
    except FileNotFoundError:
        messagebox.showinfo('Message', '请先对新闻内容文本进行预处理!')
        return
    word_library_list = counter.get_word_library(df['content_cut'])
    single_frequency_words_list = counter.get_single_frequency_words(
        df['content_cut'])
    # Vocabulary cap: full vocabulary minus half of the hapax words.
    max_features = len(
        word_library_list) - len(single_frequency_words_list) // 2
    matrix = modeling.feature_extraction(df['content_'],
                                         vectorizer='TfidfVectorizer',
                                         vec_args={
                                             'max_df': 0.95,
                                             'min_df': 1,
                                             'max_features': max_features
                                         })
    dbscan = modeling.get_cluster(matrix,
                                  cluster='DBSCAN',
                                  cluster_args={
                                      'eps': eps_var,
                                      'min_samples': min_samples_var,
                                      'metric': 'cosine'
                                  })
    labels = modeling.get_labels(dbscan)
    df['label'] = labels
    ranks = modeling.label2rank(labels)
    df['rank'] = ranks
    news_pandas.save_news(df, os.path.join(results_path, 'news_label.csv'))
    # Keep dense TF-IDF rows so the non-outlier subset can be reduced below.
    df['matrix'] = matrix.toarray().tolist()
    # DBSCAN marks outliers with label -1; keep only real clusters.
    df_non_outliers = df[df['label'] != -1].copy()
    if df_non_outliers.shape[0] == 0:
        messagebox.showinfo('Message', '不能聚类出任何热点,请重新设置聚类参数!')
        return
    data_pca_tsne = modeling.feature_reduction(
        df_non_outliers['matrix'].tolist(),
        pca_n_components=3,
        tsne_n_components=2)
    df_non_outliers['pca_tsne'] = data_pca_tsne.tolist()
    # Drop the bulky dense matrix before persisting.
    del df_non_outliers['matrix']
    news_pandas.save_news(df_non_outliers,
                          os.path.join(results_path, 'news_non_outliers.csv'))
    rank_num = counter.get_num_of_value_no_repeat(df_non_outliers['rank'])
    hot_num.set(rank_num)
    messagebox.showinfo('Message', '按照新闻内容聚类完成!')
def cut_content():
    """Segment the whole news corpus into a plain-text file of tokens.

    Joins every article's content into one document and runs it through
    ``preprocessing.document2txt`` with the user dictionary. Prompts (and
    returns) if no news file has been selected yet.
    """
    try:
        news_df = news_pandas.load_news(os.path.join(news_path, 'news_df.csv'))
    except FileNotFoundError:
        messagebox.showinfo('Message', '没有选择新闻内容文本!')
        return
    document = '\n'.join(str(content) for content in news_df['content'])
    preprocessing.document2txt(
        document,
        userdict_path=os.path.join(extra_dict_path, 'self_userdict.txt'),
        text_path=os.path.join(texts_path, 'document_segment.txt'))
    messagebox.showinfo('Message', '新闻内容文本分词完成!')
def select_news():
    """Let the user pick a news CSV; save, filter, and update the counter.

    Stores the raw file as news_df.csv, the filtered rows as
    filter_news.csv, rebinds the module-level ``filter_df``, and pushes
    the filtered row count into ``sum_top_n``.
    """
    chosen = filedialog.askopenfilename(filetypes=[("csv file", "*.csv")])
    if len(chosen) == 0:
        # Dialog was cancelled.
        return
    news_df = news_pandas.load_news(chosen)
    news_pandas.save_news(news_df, os.path.join(news_path, 'news_df.csv'))
    global filter_df
    filter_df = preprocessing.data_filter(news_df)
    news_pandas.save_news(filter_df,
                          os.path.join(temp_news_path, 'filter_news.csv'))
    sum_top_n.set(filter_df.shape[0])
def load_data():
    """加载数据 — load and return the saved combined news DataFrame.

    Reads ``news_df.csv`` from ``news_path`` (written by ``select_news`` /
    ``crawler``). Removed the commented-out per-source loading code that
    previously cluttered this function.

    NOTE(review): an identical ``load_data`` is defined again later in this
    file and shadows this definition at import time — one copy should be
    deleted.
    """
    save_file_path = os.path.join(news_path, 'news_df.csv')
    news_df = news_pandas.load_news(save_file_path)
    return news_df
def crawler():
    """Crawl the requested number of articles from each news site.

    Reads per-site counts from the GUI entries (blank or non-positive
    means "skip that site"), runs the threaded crawler, merges whatever
    was fetched into news_df.csv, filters it, rebinds the module-level
    ``filter_df``, and reports the valid-article count.
    """
    def _requested(raw):
        # '' -> 0; otherwise int(raw), with non-positive values clamped to 0.
        count = 0 if raw == '' else int(raw)
        return 0 if count <= 0 else count

    sina_top_n = _requested(Entry_Sina.get())
    sohu_top_n = _requested(Entry_Sohu.get())
    xinhuanet_top_n = _requested(Entry_XinhuaNet.get())
    if sina_top_n + sohu_top_n + xinhuanet_top_n == 0:
        messagebox.showinfo('Message', '新闻数量不能全部为非正数!')
        return
    news_crawler.threaded_crawler(sina_top_n, sohu_top_n, xinhuanet_top_n)
    frames = []
    for count, csv_name in ((sina_top_n, 'sina_latest_news.csv'),
                            (sohu_top_n, 'sohu_latest_news.csv'),
                            (xinhuanet_top_n, 'xinhuanet_latest_news.csv')):
        if count > 0:
            frames.append(news_pandas.load_news(os.path.join(news_path, csv_name)))
        else:
            # Site was skipped; an empty frame keeps the concat uniform.
            frames.append(pd.DataFrame())
    news_df = pd.concat(frames, ignore_index=True)
    news_pandas.save_news(news_df, os.path.join(news_path, 'news_df.csv'))
    global filter_df
    filter_df = preprocessing.data_filter(news_df)
    news_pandas.save_news(filter_df,
                          os.path.join(temp_news_path, 'filter_news.csv'))
    news_num = filter_df.shape[0]
    sum_top_n.set(news_num)
    messagebox.showinfo('Message', '爬取即时新闻完成!共{}条有效新闻!'.format(news_num))
def show_cluster_n_words():
    """Open an editor showing all words in the user-chosen word cluster.

    The GUI entry holds a 1-based cluster number; labels in word_df are
    0-based, hence the ``- 1``. Prompts (and returns) on empty input or
    when clustering has not been run yet.
    """
    raw = Entry_Cluster_N.get()
    if raw == '':
        messagebox.showinfo('Message', '请先输入想要查看的词汇属于第几类!')
        return
    cluster_no = int(raw)
    try:
        word_df = news_pandas.load_news(os.path.join(results_path, 'word_df.csv'))
    except FileNotFoundError:
        messagebox.showinfo('Message', '请先对新闻内容文本进行聚类!')
        return
    # Restore persisted word vectors from their string form.
    word_df['wordvec'] = word_df['wordvec'].map(eval)
    selected = word_df[word_df['word_label'] == cluster_no - 1].copy()
    out_path = os.path.join(texts_path, 'cluster_i_words.txt')
    news_pandas.save_text('\n'.join(selected['word'].tolist()), out_path)
    editor(out_path)
def show_details():
    """Open a detail window for one hot-topic cluster chosen by rank.

    Reads the 1-based rank from Entry_TopHot, loads the clustered news,
    extracts that cluster into the module-level ``df_rank_i``, derives a
    headline via TextRank-style key-sentence extraction, and builds a Tk
    window with buttons for related words/titles plus in-window word
    clustering (KMeans) and per-cluster word lookup. Prompts and returns
    early on empty input or when clustering has not been run yet.
    """
    top_num = Entry_TopHot.get()
    if top_num == '':
        messagebox.showinfo('Message', '请输入想查看的热点属于第几簇!')
        return
    top_num = int(top_num)
    try:
        df_non_outliers = news_pandas.load_news(os.path.join(results_path, 'news_non_outliers.csv'))
        # 'content_cut' was persisted as the repr of a token list.
        df_non_outliers['content_cut'] = df_non_outliers['content_cut'].map(eval)
    except FileNotFoundError:
        messagebox.showinfo('Message', '请先对新闻内容文本进行聚类!')
        return
    # df_rank_i is module-level so the button callbacks (show_hot_words_details,
    # show_hot_titles, cluster_word) can read the selected cluster.
    global df_rank_i
    df_rank_i = df_non_outliers[df_non_outliers['rank'] == top_num]
    all_title = '\n'.join(df_rank_i['title_'].tolist())
    # Most representative title of the cluster, used as the headline.
    hot_title = modeling.get_key_sentences(all_title, num=1)
    detail_tk = tk.Tk()
    detail_tk.option_add("*Font", "helvetica 12 bold")
    detail_tk.geometry("720x540+323+114")
    detail_tk.title("第{}簇热点详情".format(top_num))
    Label_Title = tk.Label(detail_tk, text='''话题:''')
    Label_Title.place(relx=0.2, rely=0.1, height=26, width=66)
    # Label_HotTitle = tk.Label(detail_tk, text=hot_title, font=('SimHei', 12, 'bold'), fg='red')
    Label_HotTitle = tk.Label(detail_tk, text=hot_title, font=('helvetica', 12, 'bold'), fg='red')
    Label_HotTitle.place(relx=0.25, rely=0.15)
    Button_HotWords = tk.Button(detail_tk, text='''该处热点相关词汇''', command=show_hot_words_details)
    Button_HotWords.place(relx=0.25, rely=0.25, height=26, width=140)
    Button_HotTitles = tk.Button(detail_tk, text='''该处热点热门话题''', command=show_hot_titles)
    Button_HotTitles.place(relx=0.55, rely=0.25, height=26, width=140)
    # "热点词汇分 [N] 类聚类" — label / entry / label laid out side by side.
    Label_L_6 = tk.Label(detail_tk, text='''热点词汇分''')
    Label_L_6.place(relx=0.25, rely=0.4, height=18, width=90)
    n_to_cluster = tk.StringVar()
    Entry_N_Clusters = tk.Entry(detail_tk, textvariable=n_to_cluster)
    # n_to_cluster.set('15')
    Entry_N_Clusters.place(relx=0.37, rely=0.4, height=20, relwidth=0.07)
    Label_R_6 = tk.Label(detail_tk, text='''类聚类''')
    Label_R_6.place(relx=0.44, rely=0.4, height=18, width=50)

    def cluster_word():
        """KMeans-cluster this hot topic's top words and persist word_df.csv."""
        n_clusters = Entry_N_Clusters.get()
        if n_clusters == '':
            messagebox.showinfo('Message', '请输入词汇聚类的类别数!')
            return
        n_clusters = int(n_clusters)
        top_words_list = counter.get_most_common_words(df_rank_i['content_cut'], top_n=5000, min_frequency=1)
        # Pre-trained word2vec model supplies the word vectors.
        model = news_pandas.load_element(os.path.join(models_path, 'word2vec_model.pkl'))
        word_list, wordvec_list = modeling.get_word_and_wordvec(model, top_words_list)
        kmeans = modeling.get_cluster(wordvec_list, cluster='KMeans',
                                      cluster_args={'n_clusters': n_clusters, 'random_state': 9})
        word_label = kmeans.labels_
        word_df = pd.DataFrame()
        word_df['word'] = word_list
        word_df['wordvec'] = wordvec_list
        word_df['word_label'] = word_label
        news_pandas.save_news(word_df, os.path.join(results_path, 'word_df.csv'))
        messagebox.showinfo('Message', '词汇聚类完成!')

    Button_WordsCluster = tk.Button(detail_tk, text='''词汇聚类''', command=cluster_word)
    Button_WordsCluster.place(relx=0.55, rely=0.4, height=26, width=80)
    Button_Show_Word_Cluster_Result = tk.Button(detail_tk, text='''查看词汇聚类效果''', command=show_word_cluster_result)
    Button_Show_Word_Cluster_Result.place(relx=0.38, rely=0.51, height=26, width=140)
    Button_Word_Barh = tk.Button(detail_tk, text='''查看词汇聚类条形图''', command=show_word_barh)
    Button_Word_Barh.place(relx=0.38, rely=0.61, height=26, width=154)
    Button_Word_Pie = tk.Button(detail_tk, text='''查看词汇聚类饼图''', command=show_word_pie)
    Button_Word_Pie.place(relx=0.38, rely=0.71, height=26, width=140)
    # "第 [N] 类词汇" — lookup row for a single word cluster.
    Label_L_7 = tk.Label(detail_tk, text='''第''')
    Label_L_7.place(relx=0.3, rely=0.84, height=18, width=16)
    cluster_n = tk.StringVar()
    Entry_Cluster_N = tk.Entry(detail_tk, textvariable=cluster_n)
    # cluster_n.set('1')
    Entry_Cluster_N.place(relx=0.34, rely=0.84, height=20, relwidth=0.07)
    Label_R_7 = tk.Label(detail_tk, text='''类词汇''')
    Label_R_7.place(relx=0.42, rely=0.84, height=18, width=50)

    def show_cluster_n_words():
        """Show the words of the chosen (1-based) cluster in the editor.

        Local closure over this window's Entry_Cluster_N; shadows the
        module-level function of the same name.
        """
        n = Entry_Cluster_N.get()
        if n == '':
            messagebox.showinfo('Message', '请先输入想要查看的词汇属于第几类!')
            return
        n = int(n)
        try:
            word_df = news_pandas.load_news(os.path.join(results_path, 'word_df.csv'))
        except FileNotFoundError:
            messagebox.showinfo('Message', '请先对新闻内容文本进行聚类!')
            return
        word_df['wordvec'] = word_df['wordvec'].map(eval)
        # Entry is 1-based; word_label is 0-based.
        words_i_df = word_df[word_df['word_label'] == n - 1].copy()
        cluster_i_words = '\n'.join(words_i_df['word'].tolist())
        news_pandas.save_text(cluster_i_words, os.path.join(texts_path, 'cluster_i_words.txt'))
        filename = os.path.join(texts_path, 'cluster_i_words.txt')
        editor(filename)

    Button_Show_Cluster_N_Word = tk.Button(detail_tk, text='''查询''', command=show_cluster_n_words)
    Button_Show_Cluster_N_Word.place(relx=0.55, rely=0.84, height=26, width=50)
    detail_tk.mainloop()
def load_data():
    """加载数据 — return the news DataFrame saved as news_df.csv."""
    csv_path = os.path.join(news_path, 'news_df.csv')
    return news_pandas.load_news(csv_path)