def load_user_dict(self, path_user=path_dict_user, type_user="******"):
    """Load a user dictionary file and merge it into the main word-frequency dict.

    :param path_user: str, path to the dictionary file, e.g. '/home/user.dict'
    :param type_user: str, file format: "json", "txt" or "csv"
    :return: None
    :raises RuntimeError: if path_user does not exist
    :raises EOFError: if type_user is not a supported format
    """
    if not os.path.exists(path_user):
        raise RuntimeError("your path_user is not exist!")
    if type_user == "json":
        self.dict_user = load_json(path_user)[0]  # load the json dict file
        for k, v in self.dict_user.items():
            # merge into the global words_freq dict (add to existing count)
            self.dict_words_freq[k] = self.dict_words_freq.get(k, 0) + v
    elif type_user in ("txt", "csv"):
        # "txt" uses a space separator, "csv" a comma; both share the same
        # merge logic, so handle them in one branch instead of two copies
        sep = " " if type_user == "txt" else ","
        for word_freq in txt_read(path_user):
            wf = word_freq.split(sep)  # separator distinguishes with/without freq
            word = wf[0]
            # BUG FIX: the frequency read from the file was left as a str,
            # which made `existing_int + freq` raise TypeError and broke
            # sum() below; convert to int. 132 is the fallback default freq.
            freq = int(wf[1]) if len(wf) == 2 else 132
            # merge into the global words_freq dict (add to existing count)
            self.dict_words_freq[word] = self.dict_words_freq.get(word, 0) + freq
    else:
        raise EOFError
    # refresh the cached total token count after any successful merge
    self.num_words = sum(self.dict_words_freq.values())
def evulate_file(path_file):
    """Evaluate segmentation quality against a gold-standard segmented file.

    Each line of the file is a sentence with tokens separated by spaces.
    The raw sentence is rebuilt, re-segmented with macropodus, and compared
    token-by-token against the gold segmentation.

    :param path_file: str, path to the gold file, e.g. '/train.txt'
    :return: tuple of float, (precision, recall, f1)
    """
    sents = txt_read(path_file)
    # statistics accumulators
    count_macropodus = 0  # tokens produced by macropodus
    count_real = 0        # tokens in the gold standard
    count_true = 0        # correctly segmented tokens
    for sent in sents:
        sent_sp = sent.strip()
        res_real = sent_sp.split(' ')        # gold tokens
        sentence = sent_sp.replace(' ', '')  # raw, unsegmented sentence
        res_macropodus = macropodus.cut(sentence)
        count_real += len(res_real)
        count_macropodus += len(res_macropodus)
        # credit each predicted token at most as many times as it occurs
        # in the gold list by removing matched tokens
        for cm in res_macropodus:
            if cm in res_real:
                count_true += 1
                res_real.remove(cm)
    # BUG FIX: guard against ZeroDivisionError on an empty file
    # (count_macropodus/count_real == 0) or when nothing matched
    # (precision + recall == 0)
    if not count_macropodus or not count_real:
        return 0.0, 0.0, 0.0
    precision = count_true / count_macropodus
    recall = count_true / count_real
    if precision + recall == 0:
        return precision, recall, 0.0
    f1 = (precision * recall * 2) / (precision + recall)
    return precision, recall, f1
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time     : 2019/12/21 23:11
# @author   : Mo
# @function: convert the PKU space-segmented corpus into per-line JSON
#            records with character lists and BMES sequence labels


from macropodus.preprocess.tools_common import load_json, save_json
from macropodus.preprocess.tools_common import txt_write, txt_read
import json


pku_training = txt_read("pku_training.utf8")
# BUG FIX: the output file was opened but never closed, risking lost
# buffered writes; a context manager guarantees flush + close.
with open("pku_train.json", "w", encoding="utf-8") as file:
    for pku in pku_training:
        pkus = pku.split(" ")
        # build the BMES label string, one character per label
        label_pkus = ""
        for pku_sig in pkus:
            len_pku = len(pku_sig)
            if len_pku == 1:
                label_pkus += "S"   # single-character word
            elif len_pku == 2:
                label_pkus += "BE"  # begin + end
            else:
                # begin, (len-2) middles, end
                label_pkus += "B" + "M" * (len_pku - 2) + "E"
        pku_res = {}
        pku_res["question"] = list("".join(pkus))  # characters of the raw sentence
        pku_res["label"] = list(label_pkus)        # aligned BMES labels
        p_json = json.dumps(pku_res, ensure_ascii=False)
        file.write(p_json + "\n")
if __name__ == '__main__':
    # quick functional check of the DAG-based segmenter
    sd = SegDAG()
    sd.add_word(str('知识图谱'))  # register a custom word before cutting
    # for i in range(50000):
    sd_enum = sd.cut(sentence='apple_pir大漠帝国我再也找不到了')
    print(list(sd_enum))

    # benchmark: segment every sentence of the ambiguity test set 10000
    # times and report total time plus throughput (sentences / second)
    from macropodus.preprocess.tools_common import txt_read, txt_write
    from macropodus.conf.path_config import path_root
    import time
    path_wordseg_a = path_root.replace("macropodus", "") + "/test/tet/ambiguity.txt"
    sentences = txt_read(path_wordseg_a)
    time_start = time.time()
    count = 0
    for i in range(10000):
        for sen in sentences:
            # print("原句:"+sen)
            count += 1
            res = sd.cut(sen)
            # print(list(res))
    time_end = time.time()
    print(time_end-time_start)            # total elapsed seconds
    print(count/(time_end - time_start))  # sentences per second
    # NOTE(review): this loop looks truncated here — it only prints the
    # prompt and never reads input; the rest is presumably outside this view
    while True:
        print("请输入:")
# @author : Mo # @function: tags_res = [ 'm', 'vn', 'v', 'Yg', 'Tg', 'l', 'p', 'nt', 'y', 'Rg', 'e', 'i', 'an', 'q', 'k', 'nr', 'Ag', 'n', 'vvn', 'd', 'f', 'ad', 'vd', 'z', 'Mg', 'nx', 'a', 'h', 's', 'u', 'na', 'Bg', 'j', 'w', 'Ng', 'o', 'nz', 'ns', 'b', 'Vg', 'Dg', 'r', 't', 'c' ] # ['Rg', 'nt', 'Ng', 'm', 'u', 'nx', 'an', 'na', 'b', 'd', 'c', 'vd', 'j', 'ns', 'ad', 's', 'z', 'Mg', 'vn', 'l', 't', 'f', 'v', 'vvn', 'n', 'r', 'Tg', 'Dg', 'Bg', 'i', 'nr', 'k', 'q', 'o', 'a', 'w', 'e', 'h', 'p', 'y', 'nz', 'Ag', 'Yg', 'Vg'] tags_res = [tr.upper() for tr in tags_res] from macropodus.preprocess.tools_common import txt_read tag_jiagus = txt_read("data/tag_jiagu.txt") tag_jiebas = txt_read("data/tag_jieba.txt") tgu = [] for tag_jiagu in tag_jiagus: tags = tag_jiagu.split("\u3000") tag = tags[0].strip() tgu.append(tag.upper()) tga = [] for tag_jieba in tag_jiebas: tags = tag_jieba.split("\t") tag = tags[0].strip() tga.append(tag.upper()) tgus = []