def macropodus_cut(text):
    """
    Segment a sentence into words by delegating to ``macropodus.cut``.

    :param text: input sentence
    :return: the result of ``macropodus.cut(text)``, returned unchanged
             (documented by the original author as a list)
    """
    words = macropodus.cut(text)
    return words
def evulate_file(path_file):
    """
    Compute word-segmentation metrics of ``macropodus.cut`` against a gold file.

    Every sentence returned by ``txt_read(path_file)`` is expected to hold the
    gold words separated by spaces. The spaces are removed, the raw sentence is
    re-segmented with ``macropodus.cut`` and the predicted words are compared
    with the gold words as multisets (word positions are not considered).

    :param path_file: str, like '/train.txt'
    :return: tuple ``(precision, recall, f1)`` of floats; a metric whose
             denominator is zero is reported as 0.0 instead of raising
             ``ZeroDivisionError``
    """
    # Local import keeps the block self-contained without touching file imports.
    from collections import Counter

    # Read the data.
    sents = txt_read(path_file)
    # Running totals over the whole file.
    count_macropodus = 0  # words predicted by macropodus
    count_real = 0  # gold words
    count_true = 0  # predicted words that also occur in the gold words
    for sent in sents:
        sent_sp = sent.strip()
        # Drop the empty tokens produced by consecutive spaces; otherwise they
        # would be counted as gold words and deflate recall.
        res_real = [word for word in sent_sp.split(' ') if word]
        if not res_real:
            # Blank line: nothing to segment and nothing to score.
            continue
        sentence = sent_sp.replace(' ', '')
        res_macropodus = macropodus.cut(sentence)
        # Kept from the original: echoes every segmentation to stdout.
        print(res_macropodus)
        count_real += len(res_real)
        count_macropodus += len(res_macropodus)
        # Multiset intersection: each gold word can be matched at most once,
        # the same count the original "in + remove" loop produced, in O(n).
        matched = Counter(res_macropodus) & Counter(res_real)
        count_true += sum(matched.values())
    # precision, recall, f1 -- each guarded against a zero denominator.
    precision = count_true / count_macropodus if count_macropodus else 0.0
    recall = count_true / count_real if count_real else 0.0
    if precision + recall:
        f1 = (precision * recall * 2) / (precision + recall)
    else:
        f1 = 0.0
    return precision, recall, f1
def macropodus_cut(text: str) -> List:
    """Segment Chinese text into words using macropodus.

    Args:
        text: the string to run Chinese word segmentation on,
            e.g. "大漠帝国是谁"

    Returns:
        The words yielded by ``macropodus.cut(text)`` collected into a list,
        e.g. ["大漠帝国", "是", "谁"]
    """
    segmented = macropodus.cut(text)
    return list(segmented)
macropodus.add_word(word="鲈形目") macropodus.save_add_words(word_freqs={"喜斗": 32, "护卵": 64, "护幼": 132}) macropodus.add_word(word="坑爹的平衡性基金") macropodus.save_add_words(word_freqs={"BBC": 132}) # sent = "今日头条 白嫖 东风快递 令人喷饭 勿谓言之不预也 白嫖 口区 弓虽 口丕 我酸了 祖安人 迷惑行为 5G 996 007 1118 35 120 251 nmsl nsdd wdnmd CSGO 唱跳 rap 篮球 鸡你太美 cxk 盘它 撞梗 融梗 雨女无瓜 要你寡 刺激战场 绝地求生" # sent = "狼灭 狼火 狼炎 狼焱 灵魂八问 硬核 奥力给 有内味了 awsl 影流之主 巨魔之王" # words = sent.split(" ") # word_dict = {} # for w in words: # word_dict[w] = 132 # macropodus.save_add_words(word_freqs=word_dict) print( macropodus.cut("坑爹的平衡性基金啊,坑爹呀斗鱼属,Macropodus (Lacépède, 1801),鲈形目斗鱼科的一属鱼类。" "本属鱼类通称斗鱼。因喜斗而得名。分布于亚洲东南部。中国有2种,即叉尾斗鱼,分布于长江及以南各省;" "叉尾斗鱼,分布于辽河到珠江流域。其喜栖居于小溪、河沟、池塘、稻田等缓流或静水中。" "雄鱼好斗,产卵期集草成巢,雄鱼口吐粘液泡沫,雌鱼产卵其中,卵浮性,受精卵在泡沫内孵化。雄鱼尚有护卵和护幼现象。")) sen_calculate = "23 + 13 * (25+(-9-2-5-2*3-6/3-40*4/(2-3)/5+6*3))加根号144你算得几多" sen_chi2num = "三千零七十八亿三千零十五万零三百一十二点一九九四" sen_num2chi = 1994.1994 sen_roman2int = "IX" sen_int2roman = 132 # sent1 = "PageRank算法简介" # sent2 = "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" sent1 = "香蕉的翻译" sent2 = "用英语说香蕉" summary = "四川发文取缔全部不合规p2p。字节跳动与今日头条。成都日报,成都市,李太白与杜甫"\ "PageRank算法简介。" \ "是上世纪90年代末提出的一种计算网页权重的算法! " \ "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \