def show_word_cluster_result():
    """Draw the previously saved word-clustering result.

    Loads ``word_df.csv`` from ``results_path``, turns the stored ``wordvec``
    strings back into Python objects, passes them through
    ``modeling.feature_reduction`` (``pca_n_components=3``,
    ``tsne_n_components=2``) and hands the reduced points together with each
    row's ``word_label`` to ``drawing.draw_clustering_result``.
    """
    csv_path = os.path.join(results_path, 'word_df.csv')
    frame = news_pandas.load_news(csv_path)

    # NOTE(review): eval() executes whatever text the CSV cell contains. This
    # is only acceptable if word_df.csv is always produced by this program;
    # consider ast.literal_eval if the file could come from elsewhere.
    frame['wordvec'] = frame['wordvec'].map(eval)

    vectors = frame['wordvec'].tolist()
    cluster_ids = frame['word_label'].tolist()

    reduced_points = modeling.feature_reduction(
        vectors,
        pca_n_components=3,
        tsne_n_components=2,
    )
    drawing.draw_clustering_result(reduced_points, cluster_ids)
def cluster_content():
    """Cluster the preprocessed news by content with DBSCAN and save the result.

    GUI callback. Reads ``eps`` and ``min_samples`` from the ``Entry_Eps`` and
    ``Entry_MinSamples`` widgets, vectorises the ``content_`` column of
    ``news_cut.csv`` (in ``temp_news_path``) with ``TfidfVectorizer`` and
    clusters the resulting matrix with DBSCAN using the cosine metric.

    Files written to ``results_path``:

    * ``news_label.csv`` -- every article with its ``label`` and ``rank``.
    * ``news_non_outliers.csv`` -- only the articles whose label is not -1,
      with an extra ``pca_tsne`` column holding the reduced coordinates.

    On success the value computed from the ``rank`` column of the non-outlier
    articles is stored in ``hot_num``. Every failure the function detects
    (missing parameters, non-numeric parameters, missing input file, no
    cluster found) is reported through ``messagebox.showinfo`` and the
    function returns early.

    Returns:
        None.
    """
    eps_text = Entry_Eps.get()
    min_samples_text = Entry_MinSamples.get()
    if eps_text == '' or min_samples_text == '':
        messagebox.showinfo('Message', '请输全聚类参数!')
        return

    # float()/int() raise ValueError on non-numeric (or whitespace-only) text.
    # Without this guard the exception would escape into the Tk callback
    # machinery and the user would get no feedback at all.
    try:
        eps_var = float(eps_text)
        min_samples_var = int(min_samples_text)
    except ValueError:
        messagebox.showinfo('Message', '聚类参数格式有误,请输入有效的数字!')
        return

    # Only the load itself is expected to raise FileNotFoundError, so keep the
    # try body down to that single call.
    try:
        df = news_pandas.load_news(os.path.join(temp_news_path, 'news_cut.csv'))
    except FileNotFoundError:
        messagebox.showinfo('Message', '请先对新闻内容文本进行预处理!')
        return

    # NOTE(review): eval() executes whatever text the CSV cell contains. This
    # is only acceptable if news_cut.csv is always produced by this program;
    # consider ast.literal_eval if the file could come from elsewhere.
    df['content_cut'] = df['content_cut'].map(eval)
    df['content_'] = df['content_'].map(str)

    word_library_list = counter.get_word_library(df['content_cut'])
    single_frequency_words_list = counter.get_single_frequency_words(
        df['content_cut'])
    # Parentheses only make the existing operator precedence explicit; the
    # value is unchanged.
    # NOTE(review): confirm that "library size minus half the single-frequency
    # words" is intended rather than "(library - single) // 2".
    max_features = (
        len(word_library_list) - (len(single_frequency_words_list) // 2)
    )

    matrix = modeling.feature_extraction(
        df['content_'],
        vectorizer='TfidfVectorizer',
        vec_args={
            'max_df': 0.95,
            'min_df': 1,
            'max_features': max_features,
        },
    )
    dbscan = modeling.get_cluster(
        matrix,
        cluster='DBSCAN',
        cluster_args={
            'eps': eps_var,
            'min_samples': min_samples_var,
            'metric': 'cosine',
        },
    )

    labels = modeling.get_labels(dbscan)
    df['label'] = labels
    df['rank'] = modeling.label2rank(labels)
    # Saved before the dense matrix column is attached, so the labelled file
    # does not contain the feature vectors.
    news_pandas.save_news(df, os.path.join(results_path, 'news_label.csv'))

    df['matrix'] = matrix.toarray().tolist()
    # Rows labelled -1 are treated as outliers and dropped from here on.
    df_non_outliers = df[df['label'] != -1].copy()
    if df_non_outliers.shape[0] == 0:
        messagebox.showinfo('Message', '不能聚类出任何热点,请重新设置聚类参数!')
        return

    data_pca_tsne = modeling.feature_reduction(
        df_non_outliers['matrix'].tolist(),
        pca_n_components=3,
        tsne_n_components=2,
    )
    df_non_outliers['pca_tsne'] = data_pca_tsne.tolist()
    # The full feature vectors are only needed for the reduction above.
    del df_non_outliers['matrix']
    news_pandas.save_news(
        df_non_outliers,
        os.path.join(results_path, 'news_non_outliers.csv'),
    )

    # Presumably the number of distinct rank values, i.e. the number of hot
    # topics found -- verify against counter.get_num_of_value_no_repeat.
    rank_num = counter.get_num_of_value_no_repeat(df_non_outliers['rank'])
    hot_num.set(rank_num)
    messagebox.showinfo('Message', '按照新闻内容聚类完成!')