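# Shared imports for the corpus builders and evaluators below.
import copy
import os
import time
from random import sample
from xml.dom import minidom

from sklearn.model_selection import train_test_split

# The helpers load_json/save_json/load_pkl, get_bcmi_corpus, and the SIGHAN
# readers read_data/proc_item/proc_test_set are assumed to come from the
# project's utility modules; they are not defined in this file.
# The path constants below are illustrative defaults; adjust them to your layout.
pwd_path = os.path.abspath(os.path.dirname(__file__))
eval_data_path = os.path.join(pwd_path, './eval_corpus.json')
output_eval_error_path = os.path.join(pwd_path, './eval_corpus_error.json')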
def build_cged_no_error_corpus(data_path, output_path, limit_size=500):
    corpus = []
    print('Parse data from %s' % data_path)
    dom_tree = minidom.parse(data_path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    count = 0
    for doc in docs:
        # The original text is parsed but not needed here: the no-error corpus
        # uses the corrected text as both input and reference.
        text = doc.getElementsByTagName('TEXT')[0].childNodes[0].data.strip()
        correction = doc.getElementsByTagName('CORRECTION')[0].childNodes[0].data.strip()
        if correction:
            count += 1
            line_dict = {
                "text": correction,
                "correction": correction,
                "errors": []
            }
            corpus.append(line_dict)
        if count >= limit_size:  # stop once limit_size samples are collected
            break
    save_json(corpus, output_path)
def build_sighan_corpus(data_path, output_path):
    corpus = []
    sighan_data = load_pkl(data_path)
    for error_sentence, error_details in sighan_data:
        if not error_details:
            continue
        ids = []
        correct_sentence = error_sentence
        for detail in error_details:
            # detail: (index, wrong_word, right_word); SIGHAN indices are 1-based
            idx, error_word, right_word = detail[0], detail[1], detail[2]
            begin_idx = idx - 1
            ids.append(begin_idx)
            # Apply each correction in turn. Note: str.replace substitutes all
            # occurrences and assumes same-length replacements, so the saved
            # character indices remain valid.
            correct_sentence = correct_sentence.replace(error_word, right_word)
        # errors: list of [wrong_char, right_char, begin_idx, end_idx]
        details = [[error_sentence[i], correct_sentence[i], i, i + 1] for i in ids]
        line_dict = {
            "text": error_sentence,
            "correction": correct_sentence,
            "errors": details
        }
        corpus.append(line_dict)
    save_json(corpus, output_path)
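# Both builders above (and build_bcmi_corpus below) emit the same JSON schema,
# one dict per sentence; the values here are illustrative:
# {
#   "text": "她是我的好朋唯",          # source sentence (possibly erroneous)
#   "correction": "她是我的好朋友",    # corrected sentence
#   "errors": [["唯", "友", 6, 7]]     # [wrong, right, begin_idx, end_idx]
# }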
def eval_corpus500_by_macbert(input_eval_path=eval_data_path, output_eval_path='', verbose=True):
    from pycorrector.macbert.macbert_corrector import MacBertCorrector
    model = MacBertCorrector()
    res = []
    corpus = load_json(input_eval_path)
    total_count = 0
    right_count = 0
    right_rate = 0.0
    recall_rate = 0.0
    recall_right_count = 0
    recall_total_count = 0
    start_time = time.time()
    for data_dict in corpus:
        text = data_dict.get('text', '')
        correction = data_dict.get('correction', '')
        errors = data_dict.get('errors', [])
        # pred_detail: list of (wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = model.macbert_correct(text)
        # compute recall
        if errors:
            recall_total_count += 1
            if pred_detail and correction == pred_sentence:
                recall_right_count += 1
        # compute precision
        if correction == pred_sentence:
            right_count += 1
            if verbose:
                print("\nright:")
                print('truth :', text, errors)
                print('predict:', pred_sentence, pred_detail)
        else:
            err_data_dict = copy.deepcopy(data_dict)
            err_data_dict['pred_sentence'] = pred_sentence
            err_data_dict['pred_errors'] = str(pred_detail)
            res.append(err_data_dict)
            if verbose:
                print("\nwrong:")
                print('input :', text)
                print('truth :', correction, errors)
                print('predict:', pred_sentence, pred_detail)
        total_count += 1
    spend_time = time.time() - start_time
    if total_count > 0:
        right_rate = right_count / total_count
    if recall_total_count > 0:
        recall_rate = recall_right_count / recall_total_count
    print('right_rate:{}, right_count:{}, total_count:{};\n'
          'recall_rate:{}, recall_right_count:{}, recall_total_count:{}, '
          'spend_time:{} s'.format(right_rate, right_count, total_count,
                                   recall_rate, recall_right_count,
                                   recall_total_count, spend_time))
    if output_eval_path:
        save_json(res, output_eval_path)
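# Minimal usage sketch (assumes the eval corpus already exists at
# eval_data_path and that pycorrector can load the MacBERT weights;
# the output path is hypothetical):
#   eval_corpus500_by_macbert(
#       output_eval_path=os.path.join(pwd_path, 'output/macbert_eval_errors.json'))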
def build_bcmi_corpus(data_path, output_path):
    corpus = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            error_sentence, correct_sentence, details = get_bcmi_corpus(line)
            if not error_sentence:
                continue
            line_dict = {
                "text": error_sentence,
                "correction": correct_sentence,
                "errors": details
            }
            corpus.append(line_dict)
    save_json(corpus, output_path)
def eval_corpus_by_bert(input_eval_path=eval_data_path, output_eval_path=output_eval_error_path, verbose=True):
    from pycorrector.bert.bert_corrector import BertCorrector
    model = BertCorrector()
    res = []
    corpus = load_json(input_eval_path)
    total_count = 0
    right_count = 0
    right_rate = 0.0
    recall_rate = 0.0
    recall_right_count = 0
    recall_total_count = 0
    for data_dict in corpus:
        text = data_dict.get('text', '')
        correction = data_dict.get('correction', '')
        errors = data_dict.get('errors', [])
        # pred_detail: list of (wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = model.bert_correct(text)
        # compute recall
        if errors:
            recall_total_count += 1
            if pred_detail and correction == pred_sentence:
                recall_right_count += 1
        # compute precision
        if correction == pred_sentence:
            right_count += 1
        else:
            err_data_dict = copy.deepcopy(data_dict)
            err_data_dict['pred_sentence'] = pred_sentence
            err_data_dict['pred_errors'] = str(pred_detail)
            res.append(err_data_dict)
            if verbose:
                print('truth:', text, errors)
                print('predict:', pred_sentence, pred_detail)
        total_count += 1
    if total_count > 0:
        right_rate = right_count / total_count
    if recall_total_count > 0:
        recall_rate = recall_right_count / recall_total_count
    print('right_rate:{}, right_count:{}, total_count:{};\n'
          'recall_rate:{}, recall_right_count:{}, recall_total_count:{}'.format(
              right_rate, right_count, total_count,
              recall_rate, recall_right_count, recall_total_count))
    save_json(res, output_eval_path)
def parse_cged_file(file_dir):
    rst = []
    seen_pairs = set()
    for fn in os.listdir(file_dir):
        if not fn.endswith('.xml'):
            continue
        path = os.path.join(file_dir, fn)
        print('Parse data from %s' % path)
        dom_tree = minidom.parse(path)
        docs = dom_tree.documentElement.getElementsByTagName('DOC')
        for doc in docs:
            doc_id = ''
            text = ''
            for node in doc.getElementsByTagName('TEXT'):
                doc_id = node.getAttribute('id')
                # The original (erroneous) text
                text = node.childNodes[0].data.strip()
            # The corrected text
            correction = doc.getElementsByTagName('CORRECTION')[0].childNodes[0].data.strip()
            wrong_ids = []
            for error in doc.getElementsByTagName('ERROR'):
                start_off = error.getAttribute('start_off')
                end_off = error.getAttribute('end_off')
                if start_off and end_off:
                    wrong_ids.extend(range(int(start_off), int(end_off) + 1))
            source = text.strip()
            target = correction.strip()
            # deduplicate identical (source, target) pairs
            if (source, target) not in seen_pairs:
                seen_pairs.add((source, target))
                rst.append({
                    'id': doc_id,
                    'original_text': source,
                    'wrong_ids': wrong_ids,
                    'correct_text': target
                })
    save_json(rst, os.path.join(pwd_path, 'output/cged.json'))
    return rst
def build_eval_corpus(output_eval_path=eval_data_path):
    """
    Build the evaluation sample set; the sampling distribution below can be adjusted.
    An evaluation set has already been generated; modify this code to produce
    your own sample distribution.
    :param output_eval_path: path of the output json file
    :return: json file
    """
    bcmi_path = os.path.join(pwd_path, '../data/cn/bcmi.txt')
    clp_path = os.path.join(pwd_path, '../data/cn/clp14_C1.pkl')
    sighan_path = os.path.join(pwd_path, '../data/cn/sighan15_A2.pkl')
    cged_path = os.path.join(pwd_path, '../data/cn/CGED/CGED16_HSK_TrainingSet.xml')

    char_error_path = os.path.join(pwd_path, './bcmi_corpus.json')
    build_bcmi_corpus(bcmi_path, char_error_path)
    char_errors = load_json(char_error_path)

    word_error_path = os.path.join(pwd_path, './sighan_corpus.json')
    build_sighan_corpus(sighan_path, word_error_path)
    word_errors = load_json(word_error_path)

    grammar_error_path = os.path.join(pwd_path, './clp_corpus.json')
    # CLP14 shares the SIGHAN pickle format, so the same builder parses it
    build_sighan_corpus(clp_path, grammar_error_path)
    grammar_errors = load_json(grammar_error_path)

    no_error_path = os.path.join(pwd_path, './noerror_corpus.json')
    build_cged_no_error_corpus(cged_path, no_error_path)
    no_errors = load_json(no_error_path)

    corpus = sample(char_errors, 100) + sample(word_errors, 100) + \
             sample(grammar_errors, 100) + sample(no_errors, 200)
    save_json(corpus, output_eval_path)
    print("save eval corpus done", output_eval_path)

    # clean up intermediate files
    os.remove(char_error_path)
    os.remove(word_error_path)
    os.remove(grammar_error_path)
    os.remove(no_error_path)
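# To rebuild the evaluation set with a different distribution, change the
# sample() sizes above and rerun, e.g. (output path is hypothetical):
#   build_eval_corpus(output_eval_path=os.path.join(pwd_path, 'output/my_eval_corpus.json'))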
def main():
    # Note: the training samples are few; use them for model testing only.
    # parse_cged_file(os.path.join(pwd_path, '../data/cn/CGED/'))
    sighan15_dir = os.path.join(pwd_path, '../data/cn/sighan_2015/')
    rst_items = []
    test_lst = proc_test_set(sighan15_dir)
    for item in read_data(sighan15_dir):
        rst_items += proc_item(item)
    # split into train and dev sets
    print('data_size:', len(rst_items))
    train_lst, dev_lst = train_test_split(rst_items, test_size=0.1, random_state=42)
    save_json(train_lst, os.path.join(pwd_path, 'output/train.json'))
    save_json(dev_lst, os.path.join(pwd_path, 'output/dev.json'))
    save_json(test_lst, os.path.join(pwd_path, 'output/test.json'))
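if __name__ == '__main__':
    # Sketch of an entry point, assuming this module is run as a script;
    # call build_eval_corpus() first if the eval set has not been generated yet.
    main()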