def eval_sighan_2015_by_bert(sighan_path=sighan_2015_path, verbose=True, num_limit_lines=100):
    """Evaluate the BERT corrector on a SIGHAN-2015 style file and print the metrics.

    Every usable line of ``sighan_path`` holds exactly two whitespace-separated
    fields: the source sentence and the expected (target) sentence. Lines that
    start with ``#`` or that do not split into exactly two fields are skipped
    and do not count towards the limit.

    Args:
        sighan_path: Path of the UTF-8 encoded evaluation file.
        verbose: When true, print every evaluated sample with its prediction.
        num_limit_lines: Maximum number of samples to evaluate. A value
            ``<= 0`` disables the limit.

    Returns:
        None. The share of exact matches ("right_rate"), the recall over the
        samples whose source differs from the target, and the elapsed time are
        printed to stdout.
    """
    from pycorrector.bert.bert_corrector import BertCorrector

    model = BertCorrector()
    total_count = 0
    right_count = 0
    right_rate = 0.0
    recall_rate = 0.0
    recall_right_count = 0
    recall_total_count = 0
    start_time = time.time()
    with open(sighan_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Stop once exactly `num_limit_lines` samples were evaluated.
            # `<=` avoids evaluating one sample too many, and `break` avoids
            # scanning the remainder of the file for nothing.
            if 0 < num_limit_lines <= total_count:
                break
            line = line.strip()
            if line.startswith('#'):
                continue
            parts = line.split()
            if len(parts) != 2:
                continue
            src, trg = parts
            pred, pred_detail = model.bert_correct(src)

            # Only sentences that really contain an error count for recall.
            has_error = src != trg
            if has_error:
                recall_total_count += 1

            is_right = pred == trg
            if is_right:
                right_count += 1
                if has_error:
                    recall_right_count += 1

            if verbose:
                print("\nright:" if is_right else "\nwrong:")
                print(
                    f'input : {src}\ntruth : {trg}\npredict: {pred} pred_detail: {pred_detail}'
                )
            total_count += 1

    spend_time = time.time() - start_time
    # Rates keep their 0.0 default when there is nothing to divide by.
    if total_count > 0:
        right_rate = right_count / total_count
    if recall_total_count > 0:
        recall_rate = recall_right_count / recall_total_count
    print(
        'right_rate:{}, right_count:{}, total_count:{};\n'
        'recall_rate:{}, recall_right_count:{}, recall_total_count:{}, spend_time:{} s'
        .format(right_rate, right_count, total_count,
                recall_rate, recall_right_count, recall_total_count, spend_time))
def eval_corpus500_by_bert(input_eval_path=eval_data_path, output_eval_path='', verbose=True):
    """Evaluate the BERT corrector on a JSON corpus and print the metrics.

    The corpus is read with ``load_json``; every entry is accessed through
    ``.get`` for the keys ``'text'`` (input sentence), ``'correction'``
    (expected sentence) and ``'errors'`` (annotated errors, may be empty).

    Args:
        input_eval_path: Path handed to ``load_json`` to obtain the corpus.
        output_eval_path: When non-empty, the wrongly predicted entries
            (extended with ``'pred_sentence'`` and ``'pred_errors'``) are
            written there with ``save_json``.
        verbose: When true, print every evaluated sample. Both the "right"
            and the "wrong" branch honour this flag.

    Returns:
        None. The share of exact matches ("right_rate"), the recall over the
        entries that carry annotated errors, and the elapsed time are printed
        to stdout.
    """
    from pycorrector.bert.bert_corrector import BertCorrector

    model = BertCorrector()
    res = []
    corpus = load_json(input_eval_path)
    total_count = 0
    right_count = 0
    right_rate = 0.0
    recall_rate = 0.0
    recall_right_count = 0
    recall_total_count = 0
    start_time = time.time()
    for data_dict in corpus:
        text = data_dict.get('text', '')
        correction = data_dict.get('correction', '')
        errors = data_dict.get('errors', [])

        # pred_detail: list(wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = model.bert_correct(text)
        is_right = correction == pred_sentence

        # Recall: only entries with annotated errors are counted, and a hit
        # additionally requires that the model reported at least one change.
        if errors:
            recall_total_count += 1
            if pred_detail and is_right:
                recall_right_count += 1

        # Exact-match rate over all entries.
        if is_right:
            right_count += 1
            if verbose:
                print("\nright:")
                # NOTE(review): this line is labelled "truth" but prints the
                # input text; kept as-is because prediction == correction here.
                print('truth :', text, errors)
                print('predict:', pred_sentence, pred_detail)
        else:
            # Keep a private copy so the loaded corpus entry is not mutated.
            err_data_dict = copy.deepcopy(data_dict)
            err_data_dict['pred_sentence'] = pred_sentence
            err_data_dict['pred_errors'] = str(pred_detail)
            res.append(err_data_dict)
            if verbose:
                print("\nwrong:")
                print('input :', text)
                print('truth :', correction, errors)
                print('predict:', pred_sentence, pred_detail)
        total_count += 1

    spend_time = time.time() - start_time
    # Rates keep their 0.0 default when there is nothing to divide by.
    if total_count > 0:
        right_rate = right_count / total_count
    if recall_total_count > 0:
        recall_rate = recall_right_count / recall_total_count
    print('right_rate:{}, right_count:{}, total_count:{};\n'
          'recall_rate:{}, recall_right_count:{}, recall_total_count:{}, spend_time:{} s'.format(
              right_rate, right_count, total_count,
              recall_rate, recall_right_count, recall_total_count, spend_time))
    if output_eval_path:
        save_json(res, output_eval_path)
def error(data_json):
    """Correct the sentences of a JSON dump and encode every replaced span.

    ``data_json`` is a UTF-8 JSON file holding a list of documents. Each
    document is read for ``'file_type'``, ``'file_no'`` and ``'sentences'``;
    each sentence for ``'type'``, ``'paragraph_no'``, ``'sentence_no'`` and
    ``'text'``. Only body sentences ("正文") and, unless ``file_type`` is
    ``'3'``, title sentences ("标题") are sent to the corrector.

    For every ``'replace'`` opcode between the wrong and the fixed fragment of
    a correction detail, a code string is built from: file_type, file_no,
    paragraph_no, sentence_no, start and end position (each zero-padded to two
    digits), the literal ``1001,`` and the replacement text.

    Returns:
        A pair ``(error_list, error_detail)``: the code strings, and tuples of
        ``(code, corrected_sentence, new_details)`` where ``new_details``
        holds the ``find_diff`` result of the sentence's details.
    """
    with open(data_json, 'r', encoding='utf-8') as handle:
        documents = json.load(handle)

    # model mode
    corrector = BertCorrector()
    error_list = []
    error_detail = []

    for document in documents:
        file_type = document.get('file_type')
        file_no = document.get('file_no')

        for sentence in document.get('sentences'):
            kind = sentence.get('type')
            wanted = kind == "正文" or (file_type != '3' and kind == '标题')
            if not wanted:
                continue

            paragraph_no = sentence.get('paragraph_no')
            sentence_no = sentence.get('sentence_no')
            text = sentence.get('text')

            # model mode
            corrected_sent, details = corrector.bert_correct(text)

            # The same list object is shared by every tuple of this sentence
            # and keeps growing while the details are processed.
            new_details = []
            # details : ['七', '器', 1, 2]
            for detail in details:
                new_details.append(find_diff(detail))
                wrong_part = detail[0]
                fixed_part = detail[1]
                matcher = difflib.SequenceMatcher(None, wrong_part, fixed_part)
                for tag, i1, i2, j1, j2 in matcher.get_opcodes():
                    if tag != 'replace':
                        continue
                    # detail[2] is taken as the offset of the fragment inside
                    # the sentence (see the sample above) -- TODO confirm.
                    start_field = str(detail[2] + 1 + i1).zfill(2)
                    end_field = str(detail[2] + i2).zfill(2)
                    code = '%s%s%s%s%s%s1001,%s' % (
                        file_type, file_no, paragraph_no, sentence_no,
                        start_field, end_field, fixed_part[j1:j2])
                    error_list.append(code)
                    error_detail.append((code, corrected_sent, new_details))

    return error_list, error_detail
def error(data_json):
    """Correct the sentences of a JSON dump and encode every replaced span.

    ``data_json`` is a UTF-8 JSON file holding a list of documents. Each
    document is read for ``'file_type'``, ``'file_no'`` and ``'sentences'``;
    each sentence for ``'type'``, ``'paragraph_no'``, ``'sentence_no'`` and
    ``'text'``. Only body sentences ("正文") and, unless ``file_type`` is
    ``'3'``, title sentences ("标题") are processed. A title is split on
    newlines and every piece is corrected on its own; piece ``i`` is reported
    under paragraph number ``int(paragraph_no) + i``.

    For every ``'replace'`` opcode between the wrong and the fixed fragment of
    a correction detail, a code string is built from: file_type, file_no,
    paragraph field (2 digits), start position (3 digits), paragraph field
    again, end position (3 digits), the literal ``1,`` and the replacement
    text. Positions are lengths of ``seg_char`` applied to the text prefix.

    Returns:
        A pair ``(error_list, error_detail)``: the code strings, and tuples of
        ``(code, corrected_sentence, new_details)`` where ``new_details``
        holds the ``find_diff`` result of the piece's details.
    """
    with open(data_json, 'r', encoding='utf-8') as handle:
        documents = json.load(handle)

    # model mode
    corrector = BertCorrector()
    error_list = []
    error_detail = []

    for document in documents:
        file_type = document.get('file_type')
        file_no = document.get('file_no')
        first_type = None
        first_no = None
        # NOTE(review): local_part is only ever set to 0 in this function, so
        # it never shifts the positions -- confirm whether an increment is
        # missing.
        local_part = 0

        for sentence in document.get('sentences'):
            kind = sentence.get('type')
            wanted = kind == "正文" or (file_type != '3' and kind == '标题')
            if not wanted:
                continue

            paragraph_no = sentence.get('paragraph_no')
            sentence_no = sentence.get('sentence_no')  # read but not used below
            text = sentence.get('text')

            # Titles may span several lines; each line is corrected separately.
            pieces = text.split("\n") if kind == '标题' else [text]

            for i, item in enumerate(pieces):
                # model mode
                corrected_sent, details = corrector.bert_correct(item)

                # Reset the running offset whenever the (type, paragraph)
                # pair changes.
                if first_type != kind or first_no != paragraph_no:
                    local_part = 0
                    first_type = kind
                    first_no = paragraph_no

                # The same list object is shared by every tuple of this piece
                # and keeps growing while the details are processed.
                new_details = []
                # details : ['七', '器', 1, 2]
                for detail in details:
                    new_details.append(find_diff(detail))
                    wrong_part = detail[0]
                    fixed_part = detail[1]
                    matcher = difflib.SequenceMatcher(None, wrong_part, fixed_part)
                    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
                        if tag != 'replace':
                            continue
                        # Prefixes of the piece up to (and including) the
                        # first replaced character / one past the last one.
                        prefix_to_start = item[:detail[2] + i1 + 1]
                        prefix_to_end = item[:detail[2] + i2 + 1]
                        start_units = seg_char(prefix_to_start)
                        end_units = seg_char(prefix_to_end)
                        paragraph_field = str(int(paragraph_no) + i).zfill(2)
                        start_field = str(len(start_units) + local_part).zfill(3)
                        end_field = str(len(end_units) - 1 + local_part).zfill(3)
                        code = '%s%s%s%s%s%s1,%s' % (
                            file_type, file_no,
                            paragraph_field, start_field,
                            paragraph_field, end_field,
                            fixed_part[j1:j2])
                        error_list.append(code)
                        error_detail.append((code, corrected_sent, new_details))

    return error_list, error_detail
# -*- coding: utf-8 -*-
# @Author : Dapeng
# @File : test.py
# @Desc : Run BertCorrector over a few sample sentences and print the results.
# @Contact : [email protected]
# @Time : 2020/10/12 6:59 PM

from pycorrector.bert.bert_corrector import BertCorrector

# Shared corrector instance used for every sample below.
d = BertCorrector()

# Sample inputs; the last one is passed through the corrector like the others.
error_sentences = [
    '疝気医院那好 为老人让坐,疝気专科百科问答',
    '少先队员因该为老人让坐',
    '少 先 队 员 因 该 为 老人让坐',
    '机七学习是人工智能领遇最能体现智能的一个分知',
    '今天心情很好',
]

# Correct each sample and show the input, the corrected text and the details.
for sent in error_sentences:
    corrected_sent, err = d.bert_correct(sent)
    print(f"original sentence:{sent} => {corrected_sent}, err:{err}")