def build_eval_corpus(output_eval_path=eval_data_path):
    """Build a 500-sample evaluation corpus and save it as JSON.

    Samples 100 char-error, 100 word-error, 100 grammar-error and
    200 error-free sentences from the intermediate corpora, saves the
    combined corpus, then deletes the intermediate JSON files.

    :param output_eval_path: destination JSON file; defaults to the
        module-level ``eval_data_path`` (previously hard-coded).
    :return: None (writes ``output_eval_path`` to disk)
    """
    bcmi_path = os.path.join(pwd_path, '../data/cn/bcmi.txt')
    clp_path = os.path.join(pwd_path, '../data/cn/clp14_C1.pkl')
    sighan_path = os.path.join(pwd_path, '../data/cn/sighan15_A2.pkl')
    cged_path = os.path.join(pwd_path, '../data/cn/CGED/CGED16_HSK_TrainingSet.xml')

    char_error_path = os.path.join(pwd_path, './bcmi_corpus.json')
    build_bcmi_corpus(bcmi_path, char_error_path)
    char_errors = load_json(char_error_path)

    word_error_path = os.path.join(pwd_path, './sighan_corpus.json')
    build_sighan_corpus(sighan_path, word_error_path)
    word_errors = load_json(word_error_path)

    grammar_error_path = os.path.join(pwd_path, './clp_corpus.json')
    # NOTE(review): the CLP pickle is parsed with the sighan builder —
    # presumably both pickles share one format; confirm upstream.
    build_sighan_corpus(clp_path, grammar_error_path)
    grammar_errors = load_json(grammar_error_path)

    no_error_path = os.path.join(pwd_path, './noerror_corpus.json')
    build_cged_no_error_corpus(cged_path, no_error_path)
    no_errors = load_json(no_error_path)

    corpus = sample(char_errors, 100) + sample(word_errors, 100) + \
             sample(grammar_errors, 100) + sample(no_errors, 200)
    save_json(corpus, output_eval_path)
    print("save eval corpus done", output_eval_path)

    # clean up the intermediate corpus files
    for tmp_path in (char_error_path, word_error_path, grammar_error_path, no_error_path):
        os.remove(tmp_path)
def eval_corpus500_by_macbert(input_eval_path=eval_data_path, output_eval_path='', verbose=True):
    """Evaluate the MacBert corrector on the sampled evaluation corpus.

    Prints per-sentence hits/misses and a final precision/recall summary;
    mispredicted samples are optionally dumped to ``output_eval_path``.

    :param input_eval_path: JSON eval corpus to load
    :param output_eval_path: if non-empty, JSON file for wrong predictions
    :param verbose: print details for each wrong prediction
    """
    from pycorrector.macbert.macbert_corrector import MacBertCorrector
    model = MacBertCorrector()

    corpus = load_json(input_eval_path)
    res = []
    total_count = 0
    right_count, recall_right_count, recall_total_count = 0, 0, 0
    right_rate, recall_rate = 0.0, 0.0
    start_time = time.time()
    for sample_dict in corpus:
        text = sample_dict.get('text', '')
        correction = sample_dict.get('correction', '')
        errors = sample_dict.get('errors', [])
        # pred_detail: list(wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = model.macbert_correct(text)
        hit = correction == pred_sentence
        # recall is measured only on samples that actually contain errors
        if errors:
            recall_total_count += 1
            if pred_detail and hit:
                recall_right_count += 1
        # precision over the whole corpus
        if hit:
            right_count += 1
            print("\nright:")
            print('truth :', text, errors)
            print('predict:', pred_sentence, pred_detail)
        else:
            err_data_dict = copy.deepcopy(sample_dict)
            err_data_dict['pred_sentence'] = pred_sentence
            err_data_dict['pred_errors'] = str(pred_detail)
            res.append(err_data_dict)
            if verbose:
                print("\nwrong:")
                print('input :', text)
                print('truth :', correction, errors)
                print('predict:', pred_sentence, pred_detail)
        total_count += 1
    spend_time = time.time() - start_time
    if total_count > 0:
        right_rate = right_count / total_count
    if recall_total_count > 0:
        recall_rate = recall_right_count / recall_total_count
    print('right_rate:{}, right_count:{}, total_count:{};\n'
          'recall_rate:{}, recall_right_count:{}, recall_total_count:{}, spend_time:{} s'.format(
              right_rate, right_count, total_count,
              recall_rate, recall_right_count, recall_total_count, spend_time))
    if output_eval_path:
        save_json(res, output_eval_path)
def build_eval_corpus(output_eval_path=eval_data_path):
    """Generate the evaluation sample set; the sampling distribution is adjustable.

    An evaluation set has already been generated; modify this code to
    produce your own sample distribution.

    :param output_eval_path: destination JSON file
    :return: json file
    """
    bcmi_path = os.path.join(pwd_path, '../data/cn/bcmi.txt')
    clp_path = os.path.join(pwd_path, '../data/cn/clp14_C1.pkl')
    sighan_path = os.path.join(pwd_path, '../data/cn/sighan15_A2.pkl')
    cged_path = os.path.join(pwd_path, '../data/cn/CGED/CGED16_HSK_TrainingSet.xml')

    # (builder, source file, intermediate json) — built and loaded in order
    specs = [
        (build_bcmi_corpus, bcmi_path, './bcmi_corpus.json'),
        (build_sighan_corpus, sighan_path, './sighan_corpus.json'),
        (build_sighan_corpus, clp_path, './clp_corpus.json'),
        (build_cged_no_error_corpus, cged_path, './noerror_corpus.json'),
    ]
    datasets = []
    tmp_paths = []
    for builder, src_path, tmp_name in specs:
        tmp_path = os.path.join(pwd_path, tmp_name)
        builder(src_path, tmp_path)
        datasets.append(load_json(tmp_path))
        tmp_paths.append(tmp_path)
    char_errors, word_errors, grammar_errors, no_errors = datasets

    corpus = (sample(char_errors, 100) + sample(word_errors, 100)
              + sample(grammar_errors, 100) + sample(no_errors, 200))
    save_json(corpus, output_eval_path)
    print("save eval corpus done", output_eval_path)

    for tmp_path in tmp_paths:
        os.remove(tmp_path)
def eval_corpus_by_bert(input_eval_path=eval_data_path, output_eval_path=output_eval_error_path, verbose=True):
    """Evaluate the Bert corrector on the evaluation corpus.

    Prints precision/recall and the elapsed time (matching the macbert
    evaluator); mispredicted samples are saved to ``output_eval_path``
    when it is non-empty.

    :param input_eval_path: JSON eval corpus to load
    :param output_eval_path: JSON file for wrong predictions; pass '' to skip saving
    :param verbose: print details for each wrong prediction
    """
    from pycorrector.bert.bert_corrector import BertCorrector
    model = BertCorrector()
    res = []
    corpus = load_json(input_eval_path)
    total_count = 0
    right_count = 0
    right_rate = 0.0
    recall_rate = 0.0
    recall_right_count = 0
    recall_total_count = 0
    start_time = time.time()
    for data_dict in corpus:
        text = data_dict.get('text', '')
        correction = data_dict.get('correction', '')
        errors = data_dict.get('errors', [])
        # pred_detail: list(wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = model.bert_correct(text)
        # compute recall: only samples that actually contain errors
        if errors:
            recall_total_count += 1
        if errors and pred_detail and correction == pred_sentence:
            recall_right_count += 1
        # compute precision
        if correction == pred_sentence:
            right_count += 1
        else:
            err_data_dict = copy.deepcopy(data_dict)
            err_data_dict['pred_sentence'] = pred_sentence
            err_data_dict['pred_errors'] = str(pred_detail)
            res.append(err_data_dict)
            if verbose:
                print('truth:', text, errors)
                print('predict:', pred_sentence, pred_detail)
        total_count += 1
    spend_time = time.time() - start_time
    if total_count > 0:
        right_rate = right_count / total_count
    if recall_total_count > 0:
        recall_rate = recall_right_count / recall_total_count
    print('right_rate:{}, right_count:{}, total_count:{};\n'
          'recall_rate:{},recall_right_count:{},recall_total_count:{}, spend_time:{} s'.format(
              right_rate, right_count, total_count,
              recall_rate, recall_right_count, recall_total_count, spend_time))
    # guard the save (was unconditional): an empty path means "don't save",
    # consistent with eval_corpus500_by_macbert
    if output_eval_path:
        save_json(res, output_eval_path)
def eval_corpus500_by_model(correct_fn, input_eval_path=eval_data_path, verbose=True):
    """Sentence-level evaluation: a sentence that needs correction is a
    positive sample; one that needs none is a negative sample.

    Args:
        correct_fn: callable(text) -> (corrected_sentence, pred_detail)
        input_eval_path: JSON eval corpus to load
        verbose: print per-sample inputs, truths, predictions and hit/miss

    Returns:
        (acc, precision, recall, f1) as floats
    """
    corpus = load_json(input_eval_path)
    TP = 0.0
    FP = 0.0
    FN = 0.0
    TN = 0.0
    total_num = 0
    start_time = time.time()
    for data_dict in corpus:
        src = data_dict.get('text', '')
        tgt = data_dict.get('correction', '')
        errors = data_dict.get('errors', [])
        # pred_detail: list(wrong, right, begin_idx, end_idx)
        tgt_pred, pred_detail = correct_fn(src)
        if verbose:
            print()
            print('input :', src)
            print('truth :', tgt, errors)
            print('predict:', tgt_pred, pred_detail)
        # negative sample (no correction needed)
        if src == tgt:
            if tgt == tgt_pred:  # predicted negative
                TN += 1
                if verbose:
                    print('right')
            else:  # predicted positive
                FP += 1
                if verbose:
                    print('wrong')
        # positive sample (correction needed)
        else:
            if tgt == tgt_pred:  # predicted positive
                TP += 1
                if verbose:
                    print('right')
            else:  # predicted negative
                FN += 1
                if verbose:
                    print('wrong')
        total_num += 1
    spend_time = time.time() - start_time
    # guard against an empty corpus (was a ZeroDivisionError)
    acc = (TP + TN) / total_num if total_num > 0 else 0.0
    # guard on the actual denominators, not on TP alone
    precision = TP / (TP + FP) if TP + FP > 0 else 0.0
    recall = TP / (TP + FN) if TP + FN > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall != 0 else 0
    print(
        f'Sentence Level: acc:{acc:.4f}, precision:{precision:.4f}, recall:{recall:.4f}, f1:{f1:.4f}, cost time:{spend_time:.2f} s'
    )
    return acc, precision, recall, f1