Example #1
def get_fen_result(zz):
    all_sen = []
    data = []
    # Split the raw text into sentences and strip whitespace characters.
    sentences = cut_sent(zz)
    for sent in sentences:
        sent = sent.replace("\n", "").replace("\t", "").replace(" ", "")
        if sent:
            all_sen.append(sent)

    # Segment every sentence with jieba; data becomes a list of token lists.
    for line in all_sen:
        word_list = list(jieba.cut(line.strip()))
        data.append(word_list)

    # Load a cached trie if it exists, otherwise build it from the dictionary.
    if os.path.exists(root_name):
        root = load_model(root_name)
    else:
        dict_name = FLAGS.data_path + 'dict.txt'
        word_freq = load_dictionary(dict_name)
        root = TrieNode('*', word_freq)
        save_model(root, root_name)

    # Insert the n-grams of the new text into the trie.
    for word_list in data:
        ngrams = generate_ngram(word_list, FLAGS.ngram)
        for d in ngrams:
            root.add(d)

    # Score candidates and return the top-N results.
    te_re, add_word = root.find_word(FLAGS.topN, stop_word, jieba_dict, l_zre)
    del root
    return te_re
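Both this example and Example #6 below call generate_ngram on a token list without showing its definition. A minimal sketch of what such a helper presumably does, assuming it simply returns every contiguous token window of length 1 up to n (the name's behaviour here is an assumption, not the repository's actual code):

def generate_ngram(word_list, n):
    # Assumed behaviour: collect all contiguous sub-sequences of length 1..n.
    result = []
    for size in range(1, n + 1):
        for start in range(len(word_list) - size + 1):
            result.append(tuple(word_list[start:start + size]))
    return result

# generate_ngram(['深度', '学习', '模型'], 2)
# -> [('深度',), ('学习',), ('模型',), ('深度', '学习'), ('学习', '模型')]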
Example #2
    def run(self):
        starttime = time.time()
        rootName = self.rootDir

        # Load a cached trie if it exists, otherwise build it from the dictionary.
        if os.path.exists(rootName):
            root = loadModel(rootName)
        else:
            dictName = self.dictDir
            word_freq = loadWords(dictName)
            root = TrieNode('*', word_freq)
            saveModel(root, rootName)

        # Load the new documents
        fileName = self.demoDir
        data = self.loadData(fileName, self.stopwords)
        # Insert the new documents into the root trie
        self.loadData2Root(root, data)

        # Take the top 5 candidates
        N = 5
        result, add_word = root.wordFind(N)
        # To debug or choose a different threshold, print result and adjust
        print("\n----\n", 'Added %d new words; words and scores:\n' % len(add_word))
        print('#############################')
        for word, score in add_word.items():
            print(word + ' ---->  ', score)
        print('#############################\n')

        # Register the discovered words with jieba so segmentation keeps them intact.
        for word in add_word:
            jieba.add_word(word)

        print("Mutual information and entropy:")
        print("".join([(x + '/ ') for x in jieba.cut(self.test_text, cut_all=False) if x not in self.stopwords]))
        endtime = time.time()
        print('time cost: ' + str(round(endtime - starttime, 4)) + ' seconds.\n')
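The printout above mentions mutual information and entropy, which is the usual scoring behind this kind of new-word discovery: a candidate is kept when its internal cohesion (PMI) and its boundary freedom (left/right neighbour entropy) are both high. A minimal, self-contained sketch of those two formulas; the function names and the numbers are illustrative only and not the TrieNode implementation itself:

import math
from collections import Counter

def pmi(p_ab, p_a, p_b):
    # Pointwise mutual information of candidate "ab": log p(ab) / (p(a) * p(b)).
    return math.log(p_ab / (p_a * p_b))

def neighbour_entropy(neighbour_counts):
    # Entropy of the left (or right) neighbour distribution: -sum p_i * log p_i.
    total = sum(neighbour_counts.values())
    return -sum((c / total) * math.log(c / total) for c in neighbour_counts.values())

# Illustrative numbers only; real probabilities and counts would come from the trie.
print(pmi(0.001, 0.01, 0.02))
print(neighbour_entropy(Counter({'的': 5, '是': 3, '在': 2})))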
Example #3
def load_dic_tree(jieba_dic_path, PMI, is_save=True):
    '''
    Load the trie (dictionary tree).
    :param jieba_dic_path: path to the jieba dictionary
    :param PMI: mutual-information threshold
    :param is_save: whether to build and save the trie; loading an already-built tree saves time
    :return: the trie root
    '''
    Logger.log_DEBUG.debug('-----> Start loading the trie')
    s_time = time.time()
    if is_save:
        try:
            word_freq = data_read.Load_word_freq(jieba_dic_path)
            root = TrieNode('*', PMI, word_freq)
            joblib.dump(root, 'tree.bin')
            time_elapse = time.time() - s_time
            Logger.log_DEBUG.debug("Finished building the trie in {}s".format(time_elapse))
        except Exception as e:
            s = "Exception while building the trie in load_dic_tree: " + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)
    else:
        try:
            root = joblib.load('tree.bin')
            time_elapse = time.time() - s_time
            Logger.log_DEBUG.debug("Finished loading the trie in {}s".format(time_elapse))
        except Exception as e:
            s = "Exception while reading the trie in load_dic_tree: " + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)
    return root
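A short usage sketch to make the is_save flag concrete: the first run builds the trie from the jieba dictionary and caches it as tree.bin, and later runs can load the cache instead. The path and PMI value are placeholders:

# First run: build the trie and dump it to tree.bin.
root = load_dic_tree('data/dict.txt', PMI=6.0, is_save=True)

# Later runs: skip the (slow) build and reuse the cached tree.
root = load_dic_tree('data/dict.txt', PMI=6.0, is_save=False)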
Example #4
def create_root(rootName, dictName):
    # Reuse a cached trie if one was saved earlier; otherwise build it from the dictionary.
    if os.path.exists(rootName):
        root = loadModel(rootName)
        return root
    else:
        word_freq = loadWords(dictName)
        root = TrieNode('*', word_freq)
        saveModel(root, rootName)
        return root
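A brief usage sketch under the same assumptions as Example #2 (loadModel/saveModel/loadWords helpers, illustrative paths): the first call builds and caches the trie, and any later call with the same rootName simply reloads it.

root = create_root('data/root.pkl', 'data/dict.txt')   # builds and saves on the first run
root = create_root('data/root.pkl', 'data/dict.txt')   # now loads the cached model instead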
Example #5
    print('------> Inserted successfully')


if __name__ == "__main__":
    #root_name = basedir + "/data/root.pkl"
    root_name = basedir + "/data/jianzhu.pkl"
    stopwords = get_stopwords()
    if os.path.exists(root_name):
        root = load_model(root_name)
    else:
        # The corpus itself does not reflect single-word frequencies well, so we bring in jieba's bundled external dictionary
        dict_name = basedir + '/data/dict.txt'
        # Read the dictionary file and keep words with frequency > 2, building a dict of {word: frequency}
        word_freq = load_dictionary(dict_name)
        # Build the trie
        root = TrieNode('*', word_freq)
        save_model(root, root_name)

    # Load the new documents
    #filename = 'data/demo.txt'
    filename = 'data/jianzhu.txt'
    # data is a 2-D list: [[tokens of line 1], [tokens of line 2], ...]
    data = load_data(filename, stopwords)
    # Insert the new documents into the root trie
    load_data_2_root(data)

    # Take the top 5 candidates
    topN = 5
    result, add_word = root.find_word(topN)
    # To debug or choose a different threshold, print result and adjust
    # print("\n----\n", result)
Example #6
import jieba

# TrieNode, generate_ngram, word_freq and stopword are assumed to be provided by
# the project's own modules (they are defined elsewhere in the repository).

# Take the top 5 candidates
N = 5

# Load the dataset: one document per line, segmented with jieba, stopwords removed
data = []
with open('../data/demo.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        line = line.strip()
        line = [x for x in jieba.cut(line, cut_all=False) if x not in stopword]
        data.append(line)

print('------> Initializing the trie')
root = TrieNode('*', word_freq)

print('------> Inserting nodes')
for i in data:
    tmp = generate_ngram(i, 3)
    for d in tmp:
        root.add(d)

result, add_word = root.wordFind(N)

print('Added %d new words; words and scores:' % len(add_word))
print('#############################')
for word, score in add_word.items():
    print(word + ' ---->  ', score)
print('#############################')
Example #7
            fw2 = open(wordFreq_sorted_path, 'w', encoding='utf-8')
            for name, freq in word_freq_sorted.items():
                fw2.write(name + ": " + str(freq) + '\n')

            fw1.close()
            fw2.close()

if __name__ == "__main__":
    root_name = basedir + "/data/root.pkl"
    stopwords = get_stopwords()
    if os.path.exists(root_name):
        root = load_model(root_name)
    else:
        dict_name = basedir + '/data/dict.txt'
        word_freq = load_dictionary(dict_name)
        root = TrieNode('*', word_freq)
        save_model(root, root_name)


    # Choose and modify the paths below
    func('data/demo_bid_data.txt', 'data/add_word_bid_data.txt', 'data/wordFreq_bid_data.txt', 'data/wordFreq_sorted_bid_data.txt')
    print("finished 1st run...")

    func('data/demo_bid_data.txt', 'data/add_word_bid_data_3To5.txt', 'data/wordFreq_bid_data_3To5.txt', 'data/wordFreq_sorted_bid_data_3To5.txt')
    # The 2nd run helps discover new words composed of 3-5 component words.

    # import cProfile
    # cProfile.run("func()", filename="cpresult.out", sort="cumulative")
    #
    # import pstats
    # p = pstats.Stats("cpresult.out")