def dispatch_me(str_test):
    """Segment a sentence and report the most probable sense of each word.

    The input is passed through ``hanzi_prep.split_into_sentences``, joined
    back into one string and segmented with the tokenizer selected by the
    module-level ``USE_SEGMENT``.  Every word is then handed to
    ``calc_list_pro`` together with its left and right neighbour (``None`` at
    the sentence borders) and the best entry is chosen by ``find_max_dict``.

    Args:
        str_test: the sentence to analyse.

    Returns:
        A list of ``(word, best_sense, probability)`` tuples, one for every
        word that produced a result, or ``None`` when ``USE_SEGMENT`` is not
        one of the known segmenters.
    """
    print("测试语句:%s" % (str_test))

    # Flatten the sentence pieces back into a single string.
    pieces = []
    for sentence in hanzi_prep.split_into_sentences(str_test):
        pieces.extend(sentence)
    cleaned = ''.join(pieces)

    if USE_SEGMENT == "JIEBA":
        print("==JIEBA分词==")
        segmented = ' '.join(jieba.cut(cleaned, cut_all=False))
    elif USE_SEGMENT == "ICTCLAS":
        print("==NLPIR分词==")
        segmented = ' '.join(pynlpir.segment(cleaned, pos_tagging=False))
    else:
        print("ERROR:未知分词系统!")
        return None
    print("分词结果:%s" % (repr(segmented)))

    words = segmented.split()
    last_pos = len(words) - 1
    result_collect = []
    for pos, word in enumerate(words):
        # Neighbouring words serve as context; there is none past the borders.
        head = words[pos - 1] if pos > 0 else None
        tail = words[pos + 1] if pos < last_pos else None

        ret = calc_list_pro(word, head, tail)
        if not ret:
            print("无计算结果")
            continue

        ret_pro = find_max_dict(ret)
        if ret_pro:
            print("词汇:[[%s]], 最大概率义项:%s, 概率:%f" % (word, ret_pro[0], ret_pro[1]))
            print("DEBUG:::" + repr(ret))
            result_collect.append((word, ret_pro[0], ret_pro[1]))
    return result_collect
# Artifact paths used with the ICTCLAS (NLPIR) segmenter, all derived from
# FILE_NAME by appending a suffix.
FILE_NAME_JIEBA = FILE_NAME + "_ICTCLAS"
FILE_NAME_JIEBA_CNT = FILE_NAME_JIEBA + "_CNT"
FILE_NAME_JIEBA_LM = FILE_NAME_JIEBA + "_LM"
FILE_NAME_JIEBA_PK = FILE_NAME_JIEBA + "_PK"
FILE_NAME_JIEBA_PINYIN = FILE_NAME_JIEBA + "_PINYIN"

# One-time preprocessing: rewrite FILE_NAME as one sentence per line into
# FILE_NAME_PREP.  An already existing FILE_NAME_PREP is left untouched.
i = 0
if not os.path.exists(FILE_NAME_PREP):
    with open(FILE_NAME) as fin, open(FILE_NAME_PREP, "w") as fout:
        for i, line in enumerate(fin, 1):
            if i % 1000 == 0:
                # Progress marker every 1000 input lines.
                print("C:%d" % (i))
            line_p = hanzi_prep.split_into_sentences(line)
            for line_i in line_p:
                str_i = "".join(line_i)
                fout.write(str_i + "\n")

# NOTE: a disabled follow-up step used to sit here (commented out): it read
# FILE_NAME_PREP and wrote FILE_NAME_UNIC with every Chinese character
# separated by a space.
def correct_me(str_test, enhance=True):
    """Find the most suspicious word of a sentence and try to correct it.

    The sentence is segmented with the tokenizer selected by ``USE_SEGMENT``.
    Every adjacent word pair is looked up in ``JIEBA_HZ`` (pairs that are not
    found count as 0) and the interior word whose two surrounding pairs have
    the smallest summed value is treated as the likely error.  Three
    hypotheses are then evaluated around that word:

    1. the segmentation is correct,
    2. the suspicious word should be merged with its left neighbour,
    3. the suspicious word should be merged with its right neighbour,

    and ``correct_result`` picks the winner.

    Args:
        str_test: the sentence to check.
        enhance: when true the candidates come from ``sub_correct_me_ext``,
            otherwise from ``sub_correct_me``.

    Returns:
        The corrected sentence as one string, the literal string ``'NONE'``
        when no correction was selected, or ``None`` when the segmenter is
        unknown or the sentence has fewer than three words.
    """
    print("")
    print("测试语句:%s" % (str_test))

    pieces = []
    for sentence in hanzi_prep.split_into_sentences(str_test):
        pieces.extend(sentence)
    str_i = ''.join(pieces)

    if USE_SEGMENT == "JIEBA":
        print("==JIEBA分词==")
        jieba_i = ' '.join(jieba.cut(str_i, cut_all=False))
    elif USE_SEGMENT == "ICTCLAS":
        print("==NLPIR分词==")
        jieba_i = ' '.join(pynlpir.segment(str_i, pos_tagging=False))
    else:
        # Nothing can be segmented; stop here instead of continuing with a
        # placeholder that has no ``.split()`` (that raised AttributeError).
        print("未知分词类型!")
        return None
    print("分词结果:%s" % (repr(jieba_i)))

    words = jieba_i.split()
    jieba_len = len(words)
    if jieba_len < 3:
        # The suspicious word must have a neighbour on both sides.
        print("词数太小,放弃纠错!")
        return None

    # Value of every adjacent word pair (sentence borders are not modelled).
    jieba_key = [left + right for left, right in zip(words, words[1:])]
    jieba_pro = [JIEBA_HZ.get(key) or 0 for key in jieba_key]
    print("分词表:" + repr(jieba_key))
    print("概率表:" + repr(jieba_pro))

    # Each interior word takes part in two pairs; the word with the weakest
    # combined support is the one to repair.
    pair_sums = [a + b for a, b in zip(jieba_pro, jieba_pro[1:])]
    min_index = pair_sums.index(min(pair_sums)) + 1
    print("可疑位置:[%d]->%s" % (min_index, words[min_index]))

    # min_index is never the first or last word, so the window always fits.
    to_do = [words[min_index - 1], words[min_index], words[min_index + 1]]
    g_check_a = words[min_index - 2] if min_index - 2 >= 0 else None
    g_check_e = words[min_index + 2] if min_index + 2 < jieba_len else None
    print("需要处理:" + repr(to_do))
    print("辅助检测:%s,%s" % (g_check_a, g_check_e))

    search = sub_correct_me_ext if enhance else sub_correct_me
    # Stage 1: assume the segmentation is correct.
    p_res_st1 = search(to_do[0], to_do[1], to_do[2], 1)
    # Stage 2: assume the first and second word belong together.
    p_res_st2 = search(g_check_a, to_do[0] + to_do[1], to_do[2], 2)
    # Stage 3: assume the second and third word belong together.
    p_res_st3 = search(to_do[0], to_do[1] + to_do[2], g_check_e, 3)

    cor_ret = correct_result(to_do, [p_res_st1, p_res_st2, p_res_st3], True)
    if not cor_ret:
        return 'NONE'

    stage = cor_ret['type']
    if stage == 1:
        replaced = [to_do[0], cor_ret['item'], to_do[2]]
    elif stage == 2:
        replaced = [cor_ret['item'], to_do[2]]
    elif stage == 3:
        replaced = [to_do[0], cor_ret['item']]
    else:
        return 'NONE'

    final_words = words[:min_index - 1] + replaced + words[min_index + 2:]
    return ''.join(final_words)
def _homophone_bigrams(first, second):
    """Return the entries of JIEBA_PINYIN stored under the pinyin of first+second."""
    key = (pinyin.word2pinyin_split(first, '-') + '-' +
           pinyin.word2pinyin_split(second, '-'))
    return JIEBA_PINYIN.get(key, ())


def _candidates_keep_first(first, second):
    """Map replacements for ``second`` to the JIEBA_HZ value of first+replacement."""
    cut = len(first)
    found = {}
    for item in _homophone_bigrams(first, second):
        if item[:cut] == first:
            found[item[cut:]] = JIEBA_HZ.get(item)
    return found


def _candidates_keep_second(first, second):
    """Map replacements for ``first`` to the JIEBA_HZ value of replacement+second."""
    # The replacement is assumed to be as long as the word it replaces, so the
    # split point is len(first).
    cut = len(first)
    found = {}
    for item in _homophone_bigrams(first, second):
        if item[cut:] == second:
            found[item[:cut]] = JIEBA_HZ.get(item)
    return found


def _context_scores(candidates, before, after):
    """Look up every candidate joined with a context word in JIEBA_HZ.

    Without any context word the candidates are returned unchanged; otherwise
    only candidates whose joined form exists in JIEBA_HZ are kept.
    """
    if not (before or after):
        return candidates
    found = {}
    for cand in candidates:
        key = before + cand + after
        if key in JIEBA_HZ:
            found[cand] = JIEBA_HZ[key]
    return found


def _fuse_scores(left, right):
    """Score the candidates present in both maps as a*b/(a+b).

    Returns ``(scores, best_candidate, best_score)``; the best candidate is
    ``None`` and the best score 0 when nothing scores above zero.
    """
    scores = {}
    best_item = None
    best_pro = 0
    for cand in left:
        if cand not in right:
            continue
        a = left[cand] or 0
        b = right[cand] or 0
        total = a + b
        # A missing or zero value on both sides would divide by zero.
        score = a * b / total if total else 0
        scores[cand] = score
        if score > best_pro:
            best_pro = score
            best_item = cand
    return scores, best_item, best_pro


def correct_me(str_test):
    """Find the most suspicious word of a sentence and try to correct it.

    The sentence is segmented with ``pynlpir``.  Every adjacent word pair is
    looked up in ``JIEBA_HZ`` (pairs that are not found count as 0) and the
    interior word whose two surrounding pairs have the smallest summed value
    is treated as the likely error.  Homophone replacements are then searched
    through ``JIEBA_PINYIN`` under three hypotheses:

    1. the segmentation is correct and only the middle word is wrong,
    2. the suspicious word should be merged with its left neighbour,
    3. the suspicious word should be merged with its right neighbour.

    The hypothesis with the highest score wins; on a tie the lower stage
    number is preferred.

    Args:
        str_test: the sentence to check.

    Returns:
        The corrected sentence as one string, or ``None`` when the sentence
        has fewer than three words or no hypothesis produced a candidate.
    """
    print("")
    print("==NLPIR分词==")
    print("测试语句:%s" % (str_test))

    pieces = []
    for sentence in hanzi_prep.split_into_sentences(str_test):
        pieces.extend(sentence)
    str_i = ''.join(pieces)

    jieba_i = ' '.join(pynlpir.segment(str_i, pos_tagging=False))
    print("分词结果:%s" % (repr(jieba_i)))

    words = jieba_i.split()
    jieba_len = len(words)
    if jieba_len < 3:
        # The suspicious word must have a neighbour on both sides.
        print("词数太小,放弃纠错!")
        return None

    # Value of every adjacent word pair (sentence borders are not modelled).
    jieba_key = [left + right for left, right in zip(words, words[1:])]
    jieba_pro = [JIEBA_HZ.get(key) or 0 for key in jieba_key]
    print("分词表:" + repr(jieba_key))
    print("概率表:" + repr(jieba_pro))

    # Each interior word takes part in two pairs; the word with the weakest
    # combined support is the one to repair.
    pair_sums = [a + b for a, b in zip(jieba_pro, jieba_pro[1:])]
    min_index = pair_sums.index(min(pair_sums)) + 1
    print("可疑位置:[%d]->%s" % (min_index, words[min_index]))

    # min_index is never the first or last word, so the window always fits.
    to_do = [words[min_index - 1], words[min_index], words[min_index + 1]]
    g_check_a = words[min_index - 2] if min_index - 2 >= 0 else None
    g_check_e = words[min_index + 2] if min_index + 2 < jieba_len else None
    print("需要处理:" + repr(to_do))
    print("辅助检测:%s,%s" % (g_check_a, g_check_e))

    # Stage 1: the segmentation is right; the middle word must fit both its
    # left and its right neighbour.
    p_res_stage1, max_item_1, max_pro_1 = _fuse_scores(
        _candidates_keep_first(to_do[0], to_do[1]),
        _candidates_keep_second(to_do[1], to_do[2]))
    if p_res_stage1:
        print(repr(p_res_stage1))

    # Stage 2: first and second word are merged; the merged word is replaced
    # and checked against the word before it when there is one.
    to_do_a = [to_do[0] + to_do[1], to_do[2]]
    p_res_3 = _candidates_keep_second(to_do_a[0], to_do_a[1])
    p_res_s3 = _context_scores(p_res_3, g_check_a or '', '')
    p_res_stage2, max_item_2, max_pro_2 = _fuse_scores(p_res_3, p_res_s3)

    # Stage 3: second and third word are merged; the merged word is replaced
    # and checked against the word after it when there is one.
    to_do_b = [to_do[0], to_do[1] + to_do[2]]
    p_res_4 = _candidates_keep_first(to_do_b[0], to_do_b[1])
    p_res_s4 = _context_scores(p_res_4, '', g_check_e or '')
    p_res_stage3, max_item_3, max_pro_3 = _fuse_scores(p_res_4, p_res_s4)
    if p_res_stage3:
        print(repr(p_res_stage3))

    # Report the outcome of every stage.
    if max_item_1:
        print("STAGE1:纠错结果:%s %s %s,概率%f" %
              (to_do[0], max_item_1, to_do[2], p_res_stage1[max_item_1]))
    else:
        print("STAGE1:纠错失败")
    if max_item_2:
        print("STAGE2:纠错结果:%s %s,概率%f" %
              (max_item_2, to_do_a[1], p_res_stage2[max_item_2]))
    else:
        print("STAGE2:纠错失败")
    if max_item_3:
        print("STAGE3:纠错结果:%s %s,概率%f" %
              (to_do_b[0], max_item_3, p_res_stage3[max_item_3]))
    else:
        print("STAGE3:纠错失败")

    max_pro = max([max_pro_1, max_pro_2, max_pro_3])
    if max_pro == 0:
        print('纠错失败')
        return None

    if max_pro == max_pro_1:
        replaced = [to_do[0], max_item_1, to_do[2]]
    elif max_pro == max_pro_2:
        replaced = [max_item_2, to_do_a[1]]
    else:
        replaced = [to_do_b[0], max_item_3]

    final_words = words[:min_index - 1] + replaced + words[min_index + 2:]
    print("原句: " + str_test)
    print("纠正:" + ''.join(final_words))
    return ''.join(final_words)
def correct_me(str_test, enhance=True):
    """Find the most suspicious word of a sentence and try to correct it.

    The sentence is segmented with the tokenizer selected by ``USE_SEGMENT``.
    Every adjacent word pair is looked up in ``JIEBA_HZ`` (pairs that are not
    found count as 0) and the interior word whose two surrounding pairs have
    the smallest summed value is treated as the likely error.  Three
    hypotheses are then evaluated around that word:

    1. the segmentation is correct,
    2. the suspicious word should be merged with its left neighbour,
    3. the suspicious word should be merged with its right neighbour,

    and ``correct_result`` picks the winner.

    Args:
        str_test: the sentence to check.
        enhance: when true the candidates come from ``sub_correct_me_ext``,
            otherwise from ``sub_correct_me``.

    Returns:
        The corrected sentence as one string, the literal string ``'NONE'``
        when no correction was selected, or ``None`` when the segmenter is
        unknown or the sentence has fewer than three words.
    """
    print("")
    print("测试语句:%s" % (str_test))

    pieces = []
    for sentence in hanzi_prep.split_into_sentences(str_test):
        pieces.extend(sentence)
    str_i = ''.join(pieces)

    if USE_SEGMENT == "JIEBA":
        print("==JIEBA分词==")
        jieba_i = ' '.join(jieba.cut(str_i, cut_all=False))
    elif USE_SEGMENT == "ICTCLAS":
        print("==NLPIR分词==")
        jieba_i = ' '.join(pynlpir.segment(str_i, pos_tagging=False))
    else:
        # Nothing can be segmented; stop here instead of continuing with a
        # placeholder that has no ``.split()`` (that raised AttributeError).
        print("未知分词类型!")
        return None
    print("分词结果:%s" % (repr(jieba_i)))

    words = jieba_i.split()
    jieba_len = len(words)
    if jieba_len < 3:
        # The suspicious word must have a neighbour on both sides.
        print("词数太小,放弃纠错!")
        return None

    # Value of every adjacent word pair (sentence borders are not modelled).
    jieba_key = [left + right for left, right in zip(words, words[1:])]
    jieba_pro = [JIEBA_HZ.get(key) or 0 for key in jieba_key]
    print("分词表:" + repr(jieba_key))
    print("概率表:" + repr(jieba_pro))

    # Each interior word takes part in two pairs; the word with the weakest
    # combined support is the one to repair.
    pair_sums = [a + b for a, b in zip(jieba_pro, jieba_pro[1:])]
    min_index = pair_sums.index(min(pair_sums)) + 1
    print("可疑位置:[%d]->%s" % (min_index, words[min_index]))

    # min_index is never the first or last word, so the window always fits.
    to_do = [words[min_index - 1], words[min_index], words[min_index + 1]]
    g_check_a = words[min_index - 2] if min_index - 2 >= 0 else None
    g_check_e = words[min_index + 2] if min_index + 2 < jieba_len else None
    print("需要处理:" + repr(to_do))
    print("辅助检测:%s,%s" % (g_check_a, g_check_e))

    search = sub_correct_me_ext if enhance else sub_correct_me
    # Stage 1: assume the segmentation is correct.
    p_res_st1 = search(to_do[0], to_do[1], to_do[2], 1)
    # Stage 2: assume the first and second word belong together.
    p_res_st2 = search(g_check_a, to_do[0] + to_do[1], to_do[2], 2)
    # Stage 3: assume the second and third word belong together.
    p_res_st3 = search(to_do[0], to_do[1] + to_do[2], g_check_e, 3)

    cor_ret = correct_result(to_do, [p_res_st1, p_res_st2, p_res_st3], True)
    if not cor_ret:
        return 'NONE'

    stage = cor_ret['type']
    if stage == 1:
        replaced = [to_do[0], cor_ret['item'], to_do[2]]
    elif stage == 2:
        replaced = [cor_ret['item'], to_do[2]]
    elif stage == 3:
        replaced = [to_do[0], cor_ret['item']]
    else:
        return 'NONE'

    final_words = words[:min_index - 1] + replaced + words[min_index + 2:]
    return ''.join(final_words)
def _homophone_bigrams(first, second):
    """Return the entries of JIEBA_PINYIN stored under the pinyin of first+second."""
    key = (pinyin.word2pinyin_split(first, '-') + '-' +
           pinyin.word2pinyin_split(second, '-'))
    return JIEBA_PINYIN.get(key, ())


def _candidates_keep_first(first, second):
    """Map replacements for ``second`` to the JIEBA_HZ value of first+replacement."""
    cut = len(first)
    found = {}
    for item in _homophone_bigrams(first, second):
        if item[:cut] == first:
            found[item[cut:]] = JIEBA_HZ.get(item)
    return found


def _candidates_keep_second(first, second):
    """Map replacements for ``first`` to the JIEBA_HZ value of replacement+second."""
    # The replacement is assumed to be as long as the word it replaces, so the
    # split point is len(first).
    cut = len(first)
    found = {}
    for item in _homophone_bigrams(first, second):
        if item[cut:] == second:
            found[item[:cut]] = JIEBA_HZ.get(item)
    return found


def _context_scores(candidates, before, after):
    """Look up every candidate joined with a context word in JIEBA_HZ.

    Without any context word the candidates are returned unchanged; otherwise
    only candidates whose joined form exists in JIEBA_HZ are kept.
    """
    if not (before or after):
        return candidates
    found = {}
    for cand in candidates:
        key = before + cand + after
        if key in JIEBA_HZ:
            found[cand] = JIEBA_HZ[key]
    return found


def _fuse_scores(left, right):
    """Score the candidates present in both maps as a*b/(a+b).

    Returns ``(scores, best_candidate, best_score)``; the best candidate is
    ``None`` and the best score 0 when nothing scores above zero.
    """
    scores = {}
    best_item = None
    best_pro = 0
    for cand in left:
        if cand not in right:
            continue
        a = left[cand] or 0
        b = right[cand] or 0
        total = a + b
        # A missing or zero value on both sides would divide by zero.
        score = a * b / total if total else 0
        scores[cand] = score
        if score > best_pro:
            best_pro = score
            best_item = cand
    return scores, best_item, best_pro


def correct_me(str_test):
    """Find the most suspicious word of a sentence and try to correct it.

    The sentence is segmented with ``pynlpir``.  Every adjacent word pair is
    looked up in ``JIEBA_HZ`` (pairs that are not found count as 0) and the
    interior word whose two surrounding pairs have the smallest summed value
    is treated as the likely error.  Homophone replacements are then searched
    through ``JIEBA_PINYIN`` under three hypotheses:

    1. the segmentation is correct and only the middle word is wrong,
    2. the suspicious word should be merged with its left neighbour,
    3. the suspicious word should be merged with its right neighbour.

    The hypothesis with the highest score wins; on a tie the lower stage
    number is preferred.

    Args:
        str_test: the sentence to check.

    Returns:
        The corrected sentence as one string, or ``None`` when the sentence
        has fewer than three words or no hypothesis produced a candidate.
    """
    print("")
    print("==NLPIR分词==")
    print("测试语句:%s" % (str_test))

    pieces = []
    for sentence in hanzi_prep.split_into_sentences(str_test):
        pieces.extend(sentence)
    str_i = ''.join(pieces)

    jieba_i = ' '.join(pynlpir.segment(str_i, pos_tagging=False))
    print("分词结果:%s" % (repr(jieba_i)))

    words = jieba_i.split()
    jieba_len = len(words)
    if jieba_len < 3:
        # The suspicious word must have a neighbour on both sides.
        print("词数太小,放弃纠错!")
        return None

    # Value of every adjacent word pair (sentence borders are not modelled).
    jieba_key = [left + right for left, right in zip(words, words[1:])]
    jieba_pro = [JIEBA_HZ.get(key) or 0 for key in jieba_key]
    print("分词表:" + repr(jieba_key))
    print("概率表:" + repr(jieba_pro))

    # Each interior word takes part in two pairs; the word with the weakest
    # combined support is the one to repair.
    pair_sums = [a + b for a, b in zip(jieba_pro, jieba_pro[1:])]
    min_index = pair_sums.index(min(pair_sums)) + 1
    print("可疑位置:[%d]->%s" % (min_index, words[min_index]))

    # min_index is never the first or last word, so the window always fits.
    to_do = [words[min_index - 1], words[min_index], words[min_index + 1]]
    g_check_a = words[min_index - 2] if min_index - 2 >= 0 else None
    g_check_e = words[min_index + 2] if min_index + 2 < jieba_len else None
    print("需要处理:" + repr(to_do))
    print("辅助检测:%s,%s" % (g_check_a, g_check_e))

    # Stage 1: the segmentation is right; the middle word must fit both its
    # left and its right neighbour.
    p_res_stage1, max_item_1, max_pro_1 = _fuse_scores(
        _candidates_keep_first(to_do[0], to_do[1]),
        _candidates_keep_second(to_do[1], to_do[2]))
    if p_res_stage1:
        print(repr(p_res_stage1))

    # Stage 2: first and second word are merged; the merged word is replaced
    # and checked against the word before it when there is one.
    to_do_a = [to_do[0] + to_do[1], to_do[2]]
    p_res_3 = _candidates_keep_second(to_do_a[0], to_do_a[1])
    p_res_s3 = _context_scores(p_res_3, g_check_a or '', '')
    p_res_stage2, max_item_2, max_pro_2 = _fuse_scores(p_res_3, p_res_s3)

    # Stage 3: second and third word are merged; the merged word is replaced
    # and checked against the word after it when there is one.
    to_do_b = [to_do[0], to_do[1] + to_do[2]]
    p_res_4 = _candidates_keep_first(to_do_b[0], to_do_b[1])
    p_res_s4 = _context_scores(p_res_4, '', g_check_e or '')
    p_res_stage3, max_item_3, max_pro_3 = _fuse_scores(p_res_4, p_res_s4)
    if p_res_stage3:
        print(repr(p_res_stage3))

    # Report the outcome of every stage.
    if max_item_1:
        print("STAGE1:纠错结果:%s %s %s,概率%f" %
              (to_do[0], max_item_1, to_do[2], p_res_stage1[max_item_1]))
    else:
        print("STAGE1:纠错失败")
    if max_item_2:
        print("STAGE2:纠错结果:%s %s,概率%f" %
              (max_item_2, to_do_a[1], p_res_stage2[max_item_2]))
    else:
        print("STAGE2:纠错失败")
    if max_item_3:
        print("STAGE3:纠错结果:%s %s,概率%f" %
              (to_do_b[0], max_item_3, p_res_stage3[max_item_3]))
    else:
        print("STAGE3:纠错失败")

    max_pro = max([max_pro_1, max_pro_2, max_pro_3])
    if max_pro == 0:
        print('纠错失败')
        return None

    if max_pro == max_pro_1:
        replaced = [to_do[0], max_item_1, to_do[2]]
    elif max_pro == max_pro_2:
        replaced = [max_item_2, to_do_a[1]]
    else:
        replaced = [to_do_b[0], max_item_3]

    final_words = words[:min_index - 1] + replaced + words[min_index + 2:]
    print("原句: " + str_test)
    print("纠正:" + ''.join(final_words))
    return ''.join(final_words)
# ICTCLAS branch of the segmenter selection (the head of this if-chain is not
# part of this excerpt): derives the "_ICTCLAS" artifact file names from
# FILE_NAME.  The statements that follow reset the line counter and, only when
# FILE_NAME_PREP does not exist yet, write every sentence returned by
# hanzi_prep.split_into_sentences for each line of FILE_NAME to FILE_NAME_PREP,
# one per output line, printing a "C:<n>" progress marker every 1000 input lines.
# NOTE(review): the line below has lost its line breaks, so where the elif
# body ends cannot be told from here; confirm against the original file.
elif USE_SEGMENT == "ICTCLAS": FILE_NAME_JIEBA = FILE_NAME + "_ICTCLAS" FILE_NAME_JIEBA_CNT = FILE_NAME_JIEBA + "_CNT" FILE_NAME_JIEBA_LM = FILE_NAME_JIEBA + "_LM" FILE_NAME_JIEBA_PK = FILE_NAME_JIEBA + "_PK" FILE_NAME_JIEBA_PINYIN = FILE_NAME_JIEBA + "_PINYIN" i = 0 if not os.path.exists(FILE_NAME_PREP): with open(FILE_NAME) as fin: with open(FILE_NAME_PREP, "w") as fout: for line in fin: i = i + 1 if not i % 1000: print("C:%d" % (i)) line_p = hanzi_prep.split_into_sentences(line) for line_i in line_p: str_i = ''.join(line_i) fout.write(str_i + "\n") #i = 0 #if not os.path.exists(FILE_NAME_UNIC): # with open(FILE_NAME_PREP) as fin: # with open(FILE_NAME_UNIC,"w") as fout: # for line in fin: # i = i + 1 # if not i % 1000: # print("C:%d" %(i)) # line_p = hanzi_prep.split_into_sentences_e(line) # for line_i in line_p: # # separate each Chinese character with a space