def cluster_word():
    """Cluster the most common words with KMeans and save the result as CSV.

    GUI callback. Reads the requested number of clusters from the
    ``Entry_N_Clusters`` widget, takes the most common words of
    ``df_rank_i['content_cut']``, looks up their vectors in the saved
    word2vec model, clusters the vectors and writes one row per word
    (word, vector, cluster label) to ``word_df.csv`` under ``results_path``.

    Feedback is given through message boxes; nothing is returned. The
    function returns early, without clustering, when the cluster count is
    missing or is not a positive integer.
    """
    # NOTE(review): assumes the widget's get() returns a str (it is compared
    # against '' in the original code) - confirm.
    raw_value = Entry_N_Clusters.get().strip()
    if not raw_value:
        # "Please enter the number of clusters for word clustering!"
        messagebox.showinfo('Message', '请输入词汇聚类的类别数!')
        return

    # Reject non-numeric and non-positive input here instead of letting
    # int() or the clustering backend raise inside the GUI callback.
    try:
        n_clusters = int(raw_value)
    except ValueError:
        n_clusters = 0
    if n_clusters < 1:
        # "The number of clusters must be a positive integer!"
        messagebox.showinfo('Message', '词汇聚类的类别数必须为正整数!')
        return

    top_words_list = counter.get_most_common_words(
        df_rank_i['content_cut'], top_n=5000, min_frequency=1)
    model = news_pandas.load_element(
        os.path.join(models_path, 'word2vec_model.pkl'))
    word_list, wordvec_list = modeling.get_word_and_wordvec(
        model, top_words_list)

    # Fixed random_state keeps the cluster assignment reproducible.
    kmeans = modeling.get_cluster(
        wordvec_list,
        cluster='KMeans',
        cluster_args={
            'n_clusters': n_clusters,
            'random_state': 9,
        })

    word_df = pd.DataFrame()
    word_df['word'] = word_list
    word_df['wordvec'] = wordvec_list
    word_df['word_label'] = kmeans.labels_
    news_pandas.save_news(word_df, os.path.join(results_path, 'word_df.csv'))

    # "Word clustering finished!"
    messagebox.showinfo('Message', '词汇聚类完成!')
# Example #2
# 0
def wordsimilar():
    """Show the 100 words most similar to the word typed by the user.

    GUI callback. Reads a word from the ``Entry_Word`` widget, looks it up
    in the saved word2vec model, writes the ``(word, similarity)`` pairs to
    ``similar_words.txt`` under ``texts_path`` (one pair per line) and opens
    that file with ``editor``.

    Feedback is given through message boxes; nothing is returned. The
    function returns early when no word was entered or when the word is not
    in the model's vocabulary.
    """
    word = Entry_Word.get()
    if word == '':
        # "Please enter a word!"
        messagebox.showinfo('Message', '请输入词语!')
        return

    # Load the model only after the input check so an empty entry does not
    # trigger a needless model load from disk.
    model = news_pandas.load_element(
        os.path.join(models_path, 'word2vec_model.pkl'))

    try:
        model.wv.get_vector(word)
    except KeyError:
        # A missing key signals an out-of-vocabulary word; any other
        # exception is a real error and is allowed to propagate.
        # "This word is not in the vocabulary!"
        messagebox.showinfo('Message', '词库中没有此词语!')
        return

    similar_words = model.wv.most_similar(word, topn=100)
    # One "(word, score)" tuple per line, separated by ",\n" - the same
    # layout the previous str()/regex formatting produced, but without
    # mis-splitting when a word itself contains "), (".
    similar_text = ',\n'.join(str(pair) for pair in similar_words)

    filename = os.path.join(texts_path, 'similar_words.txt')
    news_pandas.save_text(similar_text, filename)
    editor(filename)