示例#1
0
def get_fen_result(zz):
    all_sen=[]
    data=[]
    sentences = cut_sent(zz)
    for sent in sentences:
        sent = sent.replace("\n", "")
        sent = sent.replace("\t", "")
        sent = sent.replace(" ", "")
        if sent:
            all_sen.append(sent)

    for line in all_sen:
        word_list = [x for x in jieba.cut(line.strip())]
        data.append(word_list)
    if os.path.exists(root_name):
        root = load_model(root_name)
    else:
        dict_name = FLAGS.data_path + 'dict.txt'
        word_freq = load_dictionary(dict_name)
        root = TrieNode('*', word_freq)
        save_model(root, root_name)
    for word_list in data:
        ngrams = generate_ngram(word_list, FLAGS.ngram)
        for d in ngrams:
            root.add(d)
    te_re, add_word = root.find_word(FLAGS.topN, stop_word, jieba_dict, l_zre)
    del  root
    return te_re
示例#2
0
        word_freq = load_dictionary(dict_name)
        #建立词汇树
        root = TrieNode('*', word_freq)
        save_model(root, root_name)

    # 加载新的文章
    #filename = 'data/demo.txt'
    filename = 'data/jianzhu.txt'
    #data是二维数组,存储[[第一行list][第二行list].....]
    data = load_data(filename, stopwords)
    # 将新的文章插入到Root中
    load_data_2_root(data)

    # 定义取TOP5个
    topN = 5
    result, add_word = root.find_word(topN)
    # 如果想要调试和选择其他的阈值,可以print result来调整
    # print("\n----\n", result)
    print("\n----\n", '增加了 %d 个新词, 词语和得分分别为: \n' % len(add_word))
    print('#############################')
    for word, score in add_word.items():
        print(word + ' ---->  ', score)
    print('#############################')

    # 前后效果对比
    #test_sentence = '蔡英文在昨天应民进党当局的邀请,准备和陈时中一道前往世界卫生大会,和谈有关九二共识问题'
    test_sentence = '在分配电箱设置插座直接给末级配电箱供电'
    print('添加前:')
    print("".join([(x + '/ ') for x in jieba.cut(test_sentence, cut_all=False)
                   if x not in stopwords]))