def correct(self, sentence=''):
    """Correct a sentence character by character via masked prediction.

    :param sentence: sentence text
    :return: (corrected_sentence, detail) where detail is a list of
             [wrong, right, begin_idx, end_idx, error_type] sorted by
             begin_idx
    """
    detail = []
    maybe_errors = self.detect(sentence)
    for item, begin_idx, end_idx, err_type in maybe_errors:
        # Handle each suspected error in turn.
        before_sent = sentence[:begin_idx]
        after_sent = sentence[end_idx:]
        if err_type == ErrorType.char:
            # Skip non-Chinese characters.
            if not is_chinese_string(item):
                continue
            if not self.check_vocab_has_all_token(sentence):
                continue
            # Predict the most likely character for the masked position.
            corrected_item = self.predict_mask_token(sentence, begin_idx,
                                                     end_idx)
        elif err_type == ErrorType.word:
            # Word errors are not corrected in this variant.
            corrected_item = item
        else:
            # BUGFIX: previously fell through with `corrected_item`
            # undefined (or stale from a prior iteration); skip instead.
            print('not strand error_type')
            continue
        # Apply the correction when it differs from the original.
        if corrected_item != item:
            sentence = before_sent + corrected_item + after_sent
            detail_word = [
                item, corrected_item, begin_idx, end_idx, ErrorType.char
            ]
            detail.append(detail_word)
    detail = sorted(detail, key=operator.itemgetter(2))
    return sentence, detail
def generate_bertScore_sound_shape_file(self, text, right_sentence='',
                                        id_lists=None):
    """Write bert_score / sound_score / shape_score records to the score
    file for each suspicious Chinese character.

    :param text: sentence text to analyse
    :param right_sentence: gold (correct) sentence aligned with ``text``;
        indexed with the same offsets, so it must be at least as long
    :param id_lists: identifiers forwarded to ``write2scorefile``
    :return: (corrected_text, details)
    """
    # BUGFIX: `id_lists=[]` was a shared mutable default argument.
    if id_lists is None:
        id_lists = []
    text_new = ''
    details = []
    self.check_corrector_initialized()
    # Normalise encoding: utf-8 -> unicode.
    text = convert_to_unicode(text)
    # Split the long text into short (block, start_idx) pairs.
    blocks = self.split_2_short_text(text, include_symbol=True)
    try:
        # Presumably merges each text block with the following symbol
        # block -- verify against split_2_short_text's output layout.
        blocks = [(blocks[i][0] + blocks[i + 1][0], blocks[i][1])
                  for i in range(0, len(blocks), 2)]
    except Exception:
        # Best effort: on odd-length block lists keep the original split.
        pass
    punc = '。'
    if self.is_char_error_detect:
        for blk, start_idx in blocks:
            # Prepend the previous block's trailing punctuation as
            # context, hence the `- 1` offsets below.
            blk = punc + blk
            blk_new = ''
            for idx, s in enumerate(blk):
                # Only handle Chinese characters.
                if is_chinese_string(s):
                    # Skip positions already covered by a recorded error.
                    maybe_err = [s, idx, idx + 1, ErrorType.char]
                    if not self._check_contain_details_error(maybe_err,
                                                             details):
                        # Mask the current char; blk_new is the corrected
                        # prefix (len(blk_new) == idx here).
                        sentence_lst = list(blk_new + blk[idx:])
                        sentence_lst[idx] = self.mask
                        sentence_new = ''.join(sentence_lst)
                        predicts = self.model(sentence_new)
                        top_tokens = []
                        ssc_s = self._getSSC(s)
                        for p in predicts:
                            token_id = p.get('token', 0)
                            token_score = p.get('score', 0)
                            token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                            ssc_token = self._getSSC(token_str)
                            # SSC layout: first 4 chars = sound code, the
                            # rest = shape code (assumed from the slicing
                            # -- TODO confirm).
                            soundSimi = computeSoundCodeSimilarity(ssc_s[:4], ssc_token[:4])
                            shapeSimi = computeShapeCodeSimilarity(ssc_s[4:], ssc_token[4:])
                            ssc_similarity = computeSSCSimilarity(ssc_s, ssc_token)
                            top_tokens.append({'bert_score': token_score,
                                               'token_str': token_str,
                                               'ssc_similar': ssc_similarity,
                                               'sound_similar': soundSimi,
                                               'shape_similar': shapeSimi})
                        # Only record when the model did not predict s.
                        if top_tokens and (s not in [token.get('token_str') for token in top_tokens]):
                            # NOTE(review): may IndexError if
                            # right_sentence is shorter than text.
                            self.write2scorefile(s, top_tokens,
                                                 start_idx + idx - 1,
                                                 id_lists,
                                                 right_sentence[start_idx + idx - 1])
                            # correct_item = self.ssc_correct_item(s, top_tokens)
                            correct_item = right_sentence[start_idx + idx - 1]
                            if correct_item != s:
                                details.append([s, correct_item,
                                                idx + start_idx,
                                                idx + start_idx + 1,
                                                ErrorType.char])
                                s = correct_item
                blk_new += s
            text_new += blk_new
            # Carry this block's last char as the next block's prefix.
            punc = blk_new[-1]
    details = sorted(details, key=operator.itemgetter(2))
    return text_new, details
def bert_correct_ssc_origin(self, text):
    """Correct a sentence using SSC (sound-shape code) similarity over the
    whole text (no block splitting).

    :param text: sentence text
    :return: (corrected_text, details); each detail is
             [error_word, correct_word, begin_pos, end_pos, error_type]
    """
    text_new = ''
    details = []
    self.check_corrector_initialized()
    # Normalise encoding: utf-8 -> unicode.
    text = convert_to_unicode(text)
    if self.is_char_error_detect:
        text_new = ""
        for idx, s in enumerate(text):
            # Only handle Chinese characters.
            if is_chinese_string(s):
                # Skip positions already covered by a recorded error.
                maybe_err = [s, idx, idx + 1, ErrorType.char]
                if not self._check_contain_details_error(maybe_err, details):
                    # Mask the current char; text_new holds the corrected
                    # prefix (invariant: len(text_new) == idx here).
                    sentence_lst = list(text_new + text[idx:])
                    sentence_lst[idx] = self.mask
                    sentence_new = ''.join(sentence_lst)
                    predicts = self.model(sentence_new)
                    top_tokens = []
                    ssc_s = self._getSSC(s)
                    for p in predicts:
                        token_id = p.get('token', 0)
                        token_score = p.get('score', 0)
                        token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                        ssc_token = self._getSSC(token_str)
                        # SSC layout: first 4 chars = sound code, the rest
                        # = shape code (assumed from slicing -- TODO confirm).
                        soundSimi=computeSoundCodeSimilarity(ssc_s[:4], ssc_token[:4])
                        shapeSimi=computeShapeCodeSimilarity(ssc_s[4:], ssc_token[4:])
                        ssc_similarity = computeSSCSimilarity(ssc_s, ssc_token)
                        top_tokens.append({'bert_score': token_score, 'token_str': token_str, \
                                           'ssc_similar': ssc_similarity, 'sound_similar': soundSimi, 'shape_similar': shapeSimi})
                    # Only rescore when the model did not predict s itself.
                    if top_tokens and (s not in [token.get('token_str') for token in top_tokens]):
                        # correct_item = self.ssc_correct_item(s, top_tokens)
                        correct_item = self.neural_ssc_correct_item(s, top_tokens)
                        if correct_item != s:
                            details.append([s, correct_item, idx, idx + 1, ErrorType.char])
                            s = correct_item
                    # Alternative (disabled): restrict replacements to the
                    # confusion-set candidates.
                    # candidates = self.generate_items(s)
                    # if candidates:
                    #     for token_str in top_tokens:
                    #         if token_str in candidates:
                    #             details.append([s, token_str, idx, idx + 1,ErrorType.char])
                    #             s = token_str
                    #             break
            text_new += s
    details = sorted(details, key=operator.itemgetter(2))
    return text_new, details
def bert_correct_ssc(self, text):
    """Correct a sentence using SSC (sound-shape code) similarity,
    processing the text block by block.

    :param text: sentence text
    :return: (corrected_text, details); each detail is
             [error_word, correct_word, begin_pos, end_pos, error_type]
    """
    text_new = ''
    details = []
    self.check_corrector_initialized()
    # Normalise encoding: utf-8 -> unicode.
    text = convert_to_unicode(text)
    # Split the long text into short (block, start_idx) pairs.
    blocks = self.split_2_short_text(text, include_symbol=True)
    # blocks = self.split_2_short_text(text)
    if self.is_word_error_detect:
        pass  # word-level detection is not implemented in this variant
    if self.is_char_error_detect:
        for blk, start_idx in blocks:
            blk_new = ''
            for idx, s in enumerate(blk):
                # Only handle Chinese characters.
                if is_chinese_string(s):
                    # Mask the current char; blk_new holds this block's
                    # corrected prefix (invariant: len(blk_new) == idx).
                    sentence_lst = list(blk_new + blk[idx:])
                    sentence_lst[idx] = self.mask
                    sentence_new = ''.join(sentence_lst)
                    predicts = self.model(sentence_new)
                    top_tokens = []
                    ssc_s = self._getSSC(s)
                    for p in predicts:
                        token_id = p.get('token', 0)
                        token_score = p.get('score', 0)
                        token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                        ssc_token = self._getSSC(token_str)
                        # SSC layout: first 4 chars = sound code, the rest
                        # = shape code (assumed from slicing -- TODO confirm).
                        soundSimi=computeSoundCodeSimilarity(ssc_s[:4], ssc_token[:4])
                        shapeSimi=computeShapeCodeSimilarity(ssc_s[4:], ssc_token[4:])
                        ssc_similarity = computeSSCSimilarity(ssc_s, ssc_token)
                        top_tokens.append({'bert_score': token_score, 'token_str': token_str, \
                                           'ssc_similar': ssc_similarity, 'sound_similar': soundSimi, 'shape_similar': shapeSimi})
                    # Only rescore when the model did not predict s itself.
                    if top_tokens and (s not in [token.get('token_str') for token in top_tokens]):
                        # correct_item = self.ssc_correct_item(s, top_tokens)
                        correct_item = self.neural_ssc_correct_item(s, top_tokens)
                        if correct_item != s:
                            # Offsets are in whole-text coordinates.
                            details.append([s, correct_item, idx + start_idx, idx + start_idx + 1, ErrorType.char])
                            s = correct_item
                blk_new += s
            text_new += blk_new
    details = sorted(details, key=operator.itemgetter(2))
    return text_new, details
def electra_correct(self, text):
    """Correct a sentence with the ELECTRA detector + generator pair.

    :param text: sentence text
    :return: (corrected_text, details); each detail is
             [error_word, correct_word, begin_pos, end_pos]
    """
    text_new = ''
    details = []
    # Normalise encoding: utf-8 -> unicode.
    text = convert_to_unicode(text)
    # Split the long text into short (block, offset) pairs.
    blocks = self.split_2_short_text(text, include_symbol=True)
    for block, offset in blocks:
        error_positions = self.electra_detect(block)
        chars = list(block)
        for err_idx in error_positions:
            original_char = chars[err_idx]
            if not is_chinese_string(original_char):
                continue  # only Chinese-character errors are handled
            # Mask the suspicious position and let the generator fill it.
            chars[err_idx] = self.mask
            masked_sentence = ''.join(chars)
            # fill-mask prediction for [MASK], top-5 by default
            predictions = self.g_model(masked_sentence)
            model_tokens = []
            for prediction in predictions:
                predicted_id = prediction.get('token', 0)
                model_tokens.append(
                    self.g_model.tokenizer.convert_ids_to_tokens(
                        predicted_id))
            if model_tokens and (original_char not in model_tokens):
                # All plausible replacements from the confusion sets.
                confusion_candidates = self.generate_items(original_char)
                if confusion_candidates:
                    for candidate in model_tokens:
                        if candidate in confusion_candidates:
                            details.append([
                                original_char, candidate,
                                offset + err_idx, offset + err_idx + 1
                            ])
                            chars[err_idx] = candidate
                            break
            # Restore the original char if nothing replaced the mask.
            if chars[err_idx] == self.mask:
                chars[err_idx] = original_char
        text_new += ''.join(chars)
    details = sorted(details, key=operator.itemgetter(2))
    return text_new, details
def generate_items_for_word(self, word, fraction=1):
    """Generate correction candidates for a word from pinyin confusion
    sets, ranked by word frequency.

    :param word: suspicious word
    :param fraction: keep roughly 1/fraction of the ranked candidates
    :return: candidate list sorted by frequency, most frequent first
    """
    # Whole-word candidates: same-pinyin words plus custom confusions.
    first_order = list(self._confusion_word_set(word))
    first_order.extend(self._confusion_custom_set(word))
    second_order = []
    third_order = []
    if len(word) == 2:
        # Swap the first char for a same-pinyin char.
        second_order.extend(c + word[1:]
                            for c in self._confusion_char_set(word[0]) if c)
        # Swap the last char for a same-pinyin char.
        second_order.extend(word[:-1] + c
                            for c in self._confusion_char_set(word[-1]) if c)
    if len(word) > 2:
        # Swap the first char for a same-pinyin char.
        third_order.extend(c + word[1:]
                           for c in self._confusion_char_set(word[0]) if c)
        # Swap the last char for a same-pinyin char.
        third_order.extend(word[:-1] + c
                           for c in self._confusion_char_set(word[-1]) if c)
    # Merge, deduplicate and keep only fully-Chinese candidates.
    pool = set(first_order) | set(second_order) | set(third_order)
    chinese_candidates = [cand for cand in pool if is_chinese_string(cand)]
    ranked = sorted(chinese_candidates, key=self.word_frequency,
                    reverse=True)
    return ranked[:len(chinese_candidates) // fraction + 1]
def correct(self, sentence, reverse=True):
    """Correct a sentence using the n-gram language model.

    :param sentence: sentence text
    :param reverse: when True, process detected errors from right to left
        so earlier corrections do not shift later error offsets
    :return: (corrected_sentence, detail); each detail entry is
        [wrong, right, begin_idx, end_idx, error_type] sorted by begin_idx
    """
    detail = []
    self.check_corrector_initialized()
    maybe_errors = self.detect(sentence)
    # trick: like a translation model, handle errors back to front.
    maybe_errors = sorted(maybe_errors, key=operator.itemgetter(2),
                          reverse=reverse)
    for item, begin_idx, end_idx, err_type in maybe_errors:
        # Handle each suspected error in turn.
        before_sent = sentence[:begin_idx]
        after_sent = sentence[end_idx:]
        # Skip non-Chinese fragments.
        if not is_chinese_string(item):
            continue
        if err_type == ErrorType.confusion:
            # Custom confusion dict gives the replacement directly.
            corrected_item = (self.custom_confusion[item],
                              ErrorType.confusion)
        elif err_type == ErrorType.word_char:
            # Fragmented, uncommon single char: may be extra/missing chars.
            maybe_right_items = self.generate_items_word_char(
                item, before_sent, after_sent, begin_idx, end_idx)
            corrected_item = self.lm_correct_item(item, maybe_right_items,
                                                  before_sent, after_sent)
        elif err_type == ErrorType.redundancy:
            # Redundant char: deletion is the only candidate.
            # BUGFIX: candidates must be (text, type) tuples, consistent
            # with every other branch and with the sibling correct()
            # variants; a bare [''] made corrected_item a plain string so
            # corrected_item[0]/[1] below misbehaved.
            maybe_right_items = [('', ErrorType.redundancy)]
            corrected_item = self.lm_correct_item(item, maybe_right_items,
                                                  before_sent, after_sent)
        elif err_type == ErrorType.word:
            # All plausible replacement words.
            maybe_right_items = self.generate_items(item)
            if not maybe_right_items:
                continue
            # NOTE: candidate variable renamed so it no longer shadows the
            # loop's `item`.
            maybe_right_items = [(cand, ErrorType.word)
                                 for cand in maybe_right_items]
            corrected_item = self.lm_correct_item(item, maybe_right_items,
                                                  before_sent, after_sent)
        else:
            '''err_type == ErrorType.char'''
            # All plausible replacement chars.
            maybe_right_items = self.generate_items(item)
            if not maybe_right_items:
                continue
            maybe_right_items = [(cand, ErrorType.char)
                                 for cand in maybe_right_items]
            # Pick the most likely char via the language model.
            corrected_item = self.lm_correct_item(item, maybe_right_items,
                                                  before_sent, after_sent)
        # Apply the correction when it differs from the original.
        if corrected_item[0] != item:
            sentence = before_sent + corrected_item[0] + after_sent
            detail_word = [
                item, corrected_item[0], begin_idx, end_idx,
                corrected_item[1]
            ]
            detail.append(detail_word)
    detail = sorted(detail, key=operator.itemgetter(2))
    return sentence, detail
def generate_items(self, word, fraction=1):
    """Generate correction candidates for ``word``.

    :param word: suspicious word or char
    :param fraction: keep roughly 1/fraction of the ranked candidates
    :return: candidate list sorted by word frequency, most frequent first
    """
    self.check_corrector_initialized()
    # Whole-word candidates: same-pinyin words plus custom confusions.
    first_order = list(self._confusion_word_set(word))
    first_order.extend(self._confusion_custom_set(word))
    second_order = []
    third_order = []
    length = len(word)
    if length == 1:
        # Single char: chars sharing its pinyin.
        first_order.extend(c for c in self._confusion_char_set(word[0])
                           if c)
    if length == 2:
        # Swap the first char for a same-pinyin char.
        second_order.extend(c + word[1:]
                            for c in self._confusion_char_set(word[0]) if c)
        # Swap the last char for a same-pinyin char.
        second_order.extend(word[:-1] + c
                            for c in self._confusion_char_set(word[-1]) if c)
    if length > 2:
        # Swap the second char for a same-pinyin char.
        third_order.extend(word[0] + c + word[2:]
                           for c in self._confusion_char_set(word[1]))
        # Swap the leading sub-word for a same-pinyin word.
        third_order.extend(w + word[-1]
                           for w in self._confusion_word_set(word[:-1]))
        # Swap the trailing sub-word for a same-pinyin word.
        third_order.extend(word[0] + w
                           for w in self._confusion_word_set(word[1:]))
    # Merge, deduplicate and keep only fully-Chinese candidates.
    pool = set(first_order) | set(second_order) | set(third_order)
    chinese_candidates = [cand for cand in pool if is_chinese_string(cand)]
    ranked = sorted(chinese_candidates, key=self.word_frequency,
                    reverse=True)
    return ranked[:len(chinese_candidates) // fraction + 1]
def correct(self, sentence, reverse=True):
    """Correct a sentence; extended variant with a two-pass word check
    that also returns the detector tokens and the detected error list.

    :param sentence: sentence text
    :param reverse: when True, process errors right-to-left so earlier
        edits do not shift later offsets
    :return: (sentence, detail, '/'.join(self.tokens), maybe_errors);
        maybe_errors is re-sorted ascending by position before returning.
        NOTE(review): self.tokens is presumably populated by detect() --
        confirm against the detector implementation.
    """
    detail = []
    sentences = []  # unused; kept as-is
    self.check_corrector_initialized()
    # Long-sentence splitting (disabled).
    # sentences = re.split(r";|,|。|\?\s|;\s|,\s", sentence)
    maybe_errors = self.detect(sentence)
    # trick: like a translation model, handle errors back to front.
    maybe_errors = sorted(maybe_errors, key=operator.itemgetter(2),
                          reverse=reverse)
    for cur_item, begin_idx, end_idx, err_type in maybe_errors:
        # Handle each suspected error in turn.
        before_sent = sentence[:begin_idx]
        after_sent = sentence[end_idx:]
        # Skip non-Chinese fragments.
        if not is_chinese_string(cur_item):
            continue
        if err_type == ErrorType.confusion:
            # Custom confusion dict gives the replacement directly.
            # corrected_item = self.custom_confusion[item]
            corrected_item = (self.custom_confusion[cur_item],
                              ErrorType.confusion)
        elif err_type == ErrorType.word_char:
            # Fragmented, uncommon single char: may be extra/missing chars.
            maybe_right_items = self.generate_items_word_char(
                cur_item, before_sent, after_sent, begin_idx, end_idx)
            corrected_item = self.lm_correct_item(cur_item,
                                                  maybe_right_items,
                                                  before_sent, after_sent)
        elif err_type == ErrorType.redundancy:
            # Redundant char: deletion is the only candidate.
            maybe_right_items = [('', ErrorType.redundancy)]
            corrected_item = self.lm_correct_item(cur_item,
                                                  maybe_right_items,
                                                  before_sent, after_sent)
        elif err_type == ErrorType.word:
            # All plausible replacement words.
            candidates = self.generate_items(cur_item)
            if not candidates:
                continue
            candidates = [(item, ErrorType.word) for item in candidates]
            corrected_item = self.lm_correct_item(cur_item, candidates,
                                                  before_sent, after_sent)
            # Second pass for ErrorType.word: re-check long corrected
            # words that are still out of vocabulary.
            if len(corrected_item[0]
                   ) > 2 and corrected_item[0] not in self.word_freq:
                candidates = self.generate_items_for_word(
                    corrected_item[0])
                if not candidates:
                    continue
                candidates = [(item, ErrorType.word)
                              for item in candidates]
                corrected_item = self.lm_correct_item(
                    corrected_item[0], candidates, before_sent, after_sent)
        else:
            '''err_type == ErrorType.char'''
            # All plausible replacement chars.
            candidates = self.generate_items(cur_item)
            if not candidates:
                continue
            # Pick the most likely char via masked prediction.
            corrected_item = self.predict_mask_token(
                cur_item, sentence, candidates, begin_idx, end_idx)
            corrected_item = (corrected_item, ErrorType.char)
        # output: apply the correction when it differs from the original.
        if corrected_item[0] != cur_item:
            sentence = before_sent + corrected_item[0] + after_sent
            detail_word = [
                cur_item, corrected_item[0], begin_idx, end_idx,
                corrected_item[1]
            ]
            detail.append(detail_word)
    detail = sorted(detail, key=operator.itemgetter(2))
    # Return errors to the caller in ascending position order.
    maybe_errors = sorted(maybe_errors, key=operator.itemgetter(2),
                          reverse=False)
    return sentence, detail, '/'.join(self.tokens), maybe_errors
def correct_short(self, text, start_idx=0):
    """Correct a short text with word-level LM correction followed by
    char-level SSC rescoring.

    NOTE(review): the `- 1` detail offsets and the `text_new[1:]` return
    suggest the caller prepends a single context char to ``text`` before
    calling -- confirm against the call site.

    :param text: short text (apparently with one leading context char)
    :param start_idx: offset of ``text`` within the full sentence
    :return: (corrected_text_without_first_char, details)
    """
    text_new = ''
    details = []
    self.check_corrector_initialized()
    # Normalise encoding: utf-8 -> unicode.
    text = convert_to_unicode(text)
    if self.is_word_error_detect:
        maybe_errors = self.detect(text)
        # trick: like a translation model, handle errors back to front.
        maybe_errors = sorted(maybe_errors, key=operator.itemgetter(2),
                              reverse=True)
        for cur_item, begin_idx, end_idx, err_type in maybe_errors:
            # Handle each suspected error in turn.
            before_sent = text[:begin_idx]
            after_sent = text[end_idx:]
            # Skip non-Chinese fragments.
            if not is_chinese_string(cur_item):
                continue
            if err_type == ErrorType.confusion:
                # Custom confusion dict gives the replacement directly.
                corrected_item = (self.custom_confusion[cur_item],
                                  ErrorType.confusion)
            elif err_type == ErrorType.word_char:
                # Fragmented, uncommon single char: extra/missing chars.
                maybe_right_items = self.generate_items_word_char(cur_item,
                                                                  before_sent, after_sent, begin_idx, end_idx)
                corrected_item = self.lm_correct_item(cur_item, maybe_right_items, before_sent,
                                                      after_sent)
            elif err_type == ErrorType.redundancy:
                # Redundant char: deletion is the only candidate.
                maybe_right_items = [('',ErrorType.redundancy)]
                corrected_item = self.lm_correct_item(cur_item, maybe_right_items, before_sent,
                                                      after_sent)
            # output: apply the correction when it differs.
            # NOTE(review): corrected_item is unbound for other err_types
            # -- presumably detect() only yields the three types above.
            if corrected_item[0] != cur_item:
                text = before_sent + corrected_item[0] + after_sent
                detail_word = [cur_item, corrected_item[0], start_idx+begin_idx, start_idx+end_idx, corrected_item[1]]
                details.append(detail_word)
    if self.is_char_error_detect:
        for idx, s in enumerate(text):
            # Only handle Chinese characters.
            if is_chinese_string(s):
                # Mask the current char; text_new holds the corrected
                # prefix (invariant: len(text_new) == idx here).
                sentence_lst = list(text_new + text[idx:])
                sentence_lst[idx] = self.mask
                sentence_new = ''.join(sentence_lst)
                predicts = self.model(sentence_new)
                top_tokens = []
                ssc_s = self._getSSC(s)
                for p in predicts:
                    token_id = p.get('token', 0)
                    token_score = p.get('score', 0)
                    token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                    ssc_token = self._getSSC(token_str)
                    # SSC layout: first 4 chars = sound code, the rest =
                    # shape code (assumed from slicing -- TODO confirm).
                    soundSimi=computeSoundCodeSimilarity(ssc_s[:4], ssc_token[:4])
                    shapeSimi=computeShapeCodeSimilarity(ssc_s[4:], ssc_token[4:])
                    ssc_similarity = computeSSCSimilarity(ssc_s, ssc_token)
                    top_tokens.append({'bert_score': token_score,
                                       'token_str': token_str, \
                                       'ssc_similar': ssc_similarity, 'sound_similar': soundSimi, 'shape_similar': shapeSimi})
                # Only rescore when the model did not predict s itself.
                if top_tokens and (s not in [token.get('token_str') for token in top_tokens]):
                    # correct_item = self.ssc_correct_item(s, top_tokens)
                    correct_item = self.neural_ssc_correct_item(s, top_tokens)
                    if correct_item != s:
                        # details.append([s, correct_item, idx + start_idx, idx + start_idx + 1, ErrorType.char])
                        # Offsets shifted by -1 to drop the leading
                        # context char (matches text_new[1:] below).
                        details.append([s, correct_item, idx + start_idx -1, idx + start_idx, ErrorType.char])
                        s = correct_item
            text_new += s
    details = sorted(details, key=operator.itemgetter(2))
    return text_new[1:], details
def bert_correct(self, text):
    """Correct a sentence: word-level LM correction with a two-pass word
    check, then char-level BERT masked prediction filtered by the
    confusion sets.

    :param text: sentence text
    :return: (corrected_text, details); each detail is
             [error_word, correct_word, begin_pos, end_pos, error_type]
    """
    text_new = ''
    details = []
    self.check_corrector_initialized()
    # Normalise encoding: utf-8 -> unicode.
    text = convert_to_unicode(text)
    if self.is_word_error_detect:
        maybe_errors = self.detect(text)
        # trick: like a translation model, handle errors back to front.
        maybe_errors = sorted(maybe_errors, key=operator.itemgetter(2),
                              reverse=True)
        for cur_item, begin_idx, end_idx, err_type in maybe_errors:
            # Handle each suspected error in turn.
            before_sent = text[:begin_idx]
            after_sent = text[end_idx:]
            # Skip non-Chinese fragments.
            if not is_chinese_string(cur_item):
                continue
            if err_type == ErrorType.confusion:
                # Custom confusion dict gives the replacement directly.
                corrected_item = (self.custom_confusion[cur_item],
                                  ErrorType.confusion)
            elif err_type == ErrorType.word_char:
                # Fragmented, uncommon single char: extra/missing chars.
                maybe_right_items = self.generate_items_word_char(cur_item,
                                                                  before_sent, after_sent, begin_idx, end_idx)
                corrected_item = self.lm_correct_item(cur_item, maybe_right_items, before_sent,
                                                      after_sent)
            elif err_type == ErrorType.redundancy:
                # Redundant char: deletion is the only candidate.
                maybe_right_items = [('',ErrorType.redundancy)]
                corrected_item = self.lm_correct_item(cur_item, maybe_right_items, before_sent,
                                                      after_sent)
            elif err_type == ErrorType.word:
                # All plausible replacement words.
                candidates = self.generate_items(cur_item)
                if not candidates:
                    continue
                candidates=[(item,ErrorType.word) for item in candidates]
                corrected_item = self.lm_correct_item(cur_item, candidates, before_sent,
                                                      after_sent)
                # Second pass for ErrorType.word: re-check long corrected
                # words that are still out of vocabulary.
                if len(corrected_item[0]) > 2 and corrected_item[0] not in self.word_freq:
                    candidates = self.generate_items_for_word(corrected_item[0])
                    if not candidates:
                        continue
                    candidates=[(item,ErrorType.word) for item in candidates]
                    corrected_item = self.lm_correct_item(corrected_item[0], candidates, before_sent,
                                                          after_sent)
            # output: apply the correction when it differs.
            # NOTE(review): corrected_item is unbound for other err_types
            # -- presumably detect() only yields the four types above.
            if corrected_item[0] != cur_item:
                text = before_sent + corrected_item[0] + after_sent
                detail_word = [cur_item, corrected_item[0], begin_idx, end_idx, corrected_item[1]]
                details.append(detail_word)
    if self.is_char_error_detect:
        text_new = ""
        for idx, s in enumerate(text):
            # Only handle Chinese characters.
            if is_chinese_string(s):
                # Skip positions already covered by a recorded error.
                maybe_err = [s, idx, idx + 1, ErrorType.char]
                if not self._check_contain_details_error(maybe_err, details):
                    # Mask the current char; text_new holds the corrected
                    # prefix (invariant: len(text_new) == idx here).
                    sentence_lst = list(text_new + text[idx:])
                    sentence_lst[idx] = self.mask
                    sentence_new = ''.join(sentence_lst)
                    predicts = self.model(sentence_new)
                    top_tokens = []
                    for p in predicts:
                        token_id = p.get('token', 0)
                        token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                        top_tokens.append(token_str)
                    if top_tokens and (s not in top_tokens):
                        # Keep only model predictions that also appear in
                        # the confusion-set candidates.
                        candidates = self.generate_items(s)
                        if candidates:
                            for token_str in top_tokens:
                                if token_str in candidates:
                                    details.append([s, token_str, idx, idx + 1,ErrorType.char])
                                    s = token_str
                                    break
            text_new += s
    details = sorted(details, key=operator.itemgetter(2))
    return text_new, details