import os

import jieba

# cut_sent, load_dictionary, generate_ngram, TrieNode, load_model, save_model,
# and the globals root_name / FLAGS / stop_word / jieba_dict / l_zre are
# assumed to come from the project's own modules.

def get_fen_result(zz):
    """Segment the input text, feed its n-grams into the trie, and return the discovered words."""
    all_sen = []
    data = []
    # Split the raw text into sentences and strip whitespace characters.
    sentences = cut_sent(zz)
    for sent in sentences:
        sent = sent.replace("\n", "").replace("\t", "").replace(" ", "")
        if sent:
            all_sen.append(sent)
    # Tokenize each sentence with jieba; data is a list of token lists.
    for line in all_sen:
        data.append(list(jieba.cut(line.strip())))
    # Load the cached trie if it exists; otherwise build it from the dictionary and cache it.
    if os.path.exists(root_name):
        root = load_model(root_name)
    else:
        dict_name = FLAGS.data_path + 'dict.txt'
        word_freq = load_dictionary(dict_name)
        root = TrieNode('*', word_freq)
        save_model(root, root_name)
    # Insert every n-gram of the new text into the trie.
    for word_list in data:
        ngrams = generate_ngram(word_list, FLAGS.ngram)
        for d in ngrams:
            root.add(d)
    # Score the candidates and keep the topN new words.
    te_re, add_word = root.find_word(FLAGS.topN, stop_word, jieba_dict, l_zre)
    del root
    return te_re
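# A minimal sketch of what an n-gram generator like generate_ngram might do:
# emit every contiguous n-gram of length 1..n as a tuple of tokens. The name
# _ngram_sketch is hypothetical; the project's real helper may differ.
def _ngram_sketch(word_list, n):
    result = []
    for i in range(1, n + 1):
        result.extend(zip(*[word_list[j:] for j in range(i)]))
    return result

# e.g. _ngram_sketch(['a', 'b', 'c'], 2)
# -> [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c')]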
word_freq = load_dictionary(dict_name)
# Build the vocabulary trie.
root = TrieNode('*', word_freq)
save_model(root, root_name)

# Load the new corpus.
# filename = 'data/demo.txt'
filename = 'data/jianzhu.txt'
# data is a 2-D list: [[tokens of line 1], [tokens of line 2], ...]
data = load_data(filename, stopwords)
# Insert the new corpus into the trie.
load_data_2_root(data)

# Keep the top 5 candidates.
topN = 5
result, add_word = root.find_word(topN)
# To tune the threshold, print result and inspect the scores.
# print("\n----\n", result)
print("\n----\n", 'Found %d new words; words and scores:\n' % len(add_word))
print('#############################')
for word, score in add_word.items():
    print(word + ' ----> ', score)
print('#############################')

# Compare segmentation before and after adding the new words.
# test_sentence = '蔡英文在昨天应民进党当局的邀请,准备和陈时中一道前往世界卫生大会,和谈有关九二共识问题'
test_sentence = '在分配电箱设置插座直接给末级配电箱供电'
print('Before adding:')
print("".join([(x + '/ ') for x in jieba.cut(test_sentence, cut_all=False) if x not in stopwords]))
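# The "after" half of the comparison, sketched under the assumption that the
# discovered words are registered with jieba via its add_word API before
# re-segmenting the same sentence:
for word in add_word.keys():
    jieba.add_word(word)
print('After adding:')
print("".join([(x + '/ ') for x in jieba.cut(test_sentence, cut_all=False) if x not in stopwords]))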