예제 #1
0
def eval_sighan_2015_by_bert(sighan_path=sighan_2015_path,
                             verbose=True,
                             num_limit_lines=100):
    """Evaluate ``BertCorrector`` on a SIGHAN-2015 style file and print the scores.

    Every usable line of the file holds exactly two whitespace-separated
    fields: the source sentence and the reference sentence. Lines starting
    with ``#`` and lines that do not split into exactly two fields are ignored.

    A prediction counts as "right" when it equals the reference sentence.
    Recall is computed only over samples whose source differs from the
    reference (i.e. samples that actually contain an error).

    Args:
        sighan_path: path of the UTF-8 encoded evaluation file.
        verbose: when true, print input / truth / prediction for every sample.
        num_limit_lines: maximum number of samples to evaluate. A value
            ``<= 0`` disables the limit.

    Returns:
        None. The summary (rates, counts and elapsed seconds) is printed.
    """
    from pycorrector.bert.bert_corrector import BertCorrector
    model = BertCorrector()
    total_count = 0
    right_count = 0
    recall_right_count = 0
    recall_total_count = 0
    start_time = time.time()
    with open(sighan_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Stop as soon as the requested number of samples has been
            # evaluated. The previous check (`<` + `continue`) evaluated one
            # sample too many and kept scanning the rest of the file.
            if 0 < num_limit_lines <= total_count:
                break
            line = line.strip()
            if line.startswith('#'):
                continue
            parts = line.split()
            if len(parts) != 2:
                continue
            src, trg = parts

            pred, pred_detail = model.bert_correct(src)

            has_error = src != trg
            is_right = pred == trg
            if has_error:
                recall_total_count += 1
            if is_right:
                right_count += 1
                if has_error:
                    recall_right_count += 1

            if verbose:
                print("\nright:" if is_right else "\nwrong:")
                print(
                    f'input  : {src}\ntruth  : {trg}\npredict: {pred} pred_detail: {pred_detail}'
                )
            total_count += 1

    spend_time = time.time() - start_time

    # Rates stay at 0.0 when nothing was evaluated, avoiding a division by zero.
    right_rate = right_count / total_count if total_count > 0 else 0.0
    recall_rate = (recall_right_count / recall_total_count
                   if recall_total_count > 0 else 0.0)
    print(
        'right_rate:{}, right_count:{}, total_count:{};\n'
        'recall_rate:{}, recall_right_count:{}, recall_total_count:{}, spend_time:{} s'
        .format(right_rate, right_count, total_count, recall_rate,
                recall_right_count, recall_total_count, spend_time))
예제 #2
0
def eval_corpus500_by_bert(input_eval_path=eval_data_path, output_eval_path='', verbose=True):
    """Evaluate ``BertCorrector`` on a JSON corpus and print the scores.

    The corpus is loaded with ``load_json`` and iterated as a sequence of
    dicts; the keys read from each dict are ``text`` (input sentence),
    ``correction`` (reference sentence) and ``errors`` (annotated errors).

    A prediction counts as "right" when it equals ``correction``. Recall is
    computed over samples with a non-empty ``errors`` entry and additionally
    requires the model to report at least one correction detail.

    Args:
        input_eval_path: path passed to ``load_json``.
        output_eval_path: when non-empty, the wrongly predicted samples
            (with the prediction attached) are written there via ``save_json``.
        verbose: when true, print every evaluated sample. Both the "right"
            and the "wrong" branch honor this flag.

    Returns:
        None. The summary (rates, counts and elapsed seconds) is printed.
    """
    from pycorrector.bert.bert_corrector import BertCorrector
    model = BertCorrector()
    res = []
    corpus = load_json(input_eval_path)
    total_count = 0
    right_count = 0
    recall_right_count = 0
    recall_total_count = 0
    start_time = time.time()
    for data_dict in corpus:
        text = data_dict.get('text', '')
        correction = data_dict.get('correction', '')
        errors = data_dict.get('errors', [])

        #  pred_detail: list(wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = model.bert_correct(text)
        is_right = correction == pred_sentence

        # Recall: only samples that are annotated with errors take part.
        if errors:
            recall_total_count += 1
            if pred_detail and is_right:
                recall_right_count += 1

        # Precision / sentence-level accuracy.
        if is_right:
            right_count += 1
            # Previously printed unconditionally; now silenced by verbose=False
            # like the "wrong" branch.
            if verbose:
                print("\nright:")
                print('truth  :', text, errors)
                print('predict:', pred_sentence, pred_detail)
        else:
            # Keep a copy of the sample with the prediction for later inspection.
            err_data_dict = copy.deepcopy(data_dict)
            err_data_dict['pred_sentence'] = pred_sentence
            err_data_dict['pred_errors'] = str(pred_detail)
            res.append(err_data_dict)
            if verbose:
                print("\nwrong:")
                print('input  :', text)
                print('truth  :', correction, errors)
                print('predict:', pred_sentence, pred_detail)
        total_count += 1
    spend_time = time.time() - start_time

    # Rates stay at 0.0 when nothing was evaluated, avoiding a division by zero.
    right_rate = right_count / total_count if total_count > 0 else 0.0
    recall_rate = (recall_right_count / recall_total_count
                   if recall_total_count > 0 else 0.0)
    print('right_rate:{}, right_count:{}, total_count:{};\n'
          'recall_rate:{}, recall_right_count:{}, recall_total_count:{}, spend_time:{} s'.format(
              right_rate, right_count, total_count,
              recall_rate, recall_right_count, recall_total_count,
              spend_time))
    if output_eval_path:
        save_json(res, output_eval_path)
예제 #3
0
def main(args):
    """Run one evaluation selected by ``args.data`` and ``args.model``.

    ``args.data`` picks the evaluation set (``'sighan_15'`` or ``'corpus500'``)
    and ``args.model`` picks the corrector (``'rule'``, ``'bert'``,
    ``'macbert'`` or ``'ernie'``). Any other combination does nothing.
    Model packages are imported lazily so only the selected one is loaded.

    Previously recorded results, kept for reference:

    sighan_15
        rule   : Sentence Level: acc:0.173225, precision:0.979592, recall:0.148541, f1:0.257965, cost time:230.92 s
        bert   : right_rate:0.37623762376237624, right_count:38, total_count:101;
                 recall_rate:0.3645833333333333, recall_right_count:35, recall_total_count:96, spend_time:503 s
        macbert: Sentence Level: acc:0.914885, precision:0.995199, recall:0.916446, f1:0.954200, cost time:29.47 s
        ernie  : right_rate:0.297029702970297, right_count:30, total_count:101;
                 recall_rate:0.28125, recall_right_count:27, recall_total_count:96, spend_time:655 s
    corpus500
        rule   : right_rate:0.486, right_count:243, total_count:500;
                 recall_rate:0.18, recall_right_count:54, recall_total_count:300, spend_time:78 s
        bert   : right_rate:0.586, right_count:293, total_count:500;
                 recall_rate:0.35, recall_right_count:105, recall_total_count:300, spend_time:1760 s
        macbert: Sentence Level: acc:0.724000, precision:0.912821, recall:0.595318, f1:0.720648, cost time:6.43 s
        ernie  : right_rate:0.598, right_count:299, total_count:500;
                 recall_rate:0.41333333333333333, recall_right_count:124, recall_total_count:300, spend_time:6960 s
    """
    dataset = args.data
    on_sighan = dataset == 'sighan_15'
    if not on_sighan and not dataset == 'corpus500':
        return

    # The model name is only looked at once a known data set was requested.
    model_name = args.model
    if model_name == 'rule':
        demo()
        correct_fn = pycorrector.correct
    elif model_name == 'bert':
        from pycorrector.bert.bert_corrector import BertCorrector
        correct_fn = BertCorrector().bert_correct
    elif model_name == 'macbert':
        from pycorrector.macbert.macbert_corrector import MacBertCorrector
        correct_fn = MacBertCorrector().macbert_correct
    elif model_name == 'ernie':
        from pycorrector.ernie.ernie_corrector import ErnieCorrector
        correct_fn = ErnieCorrector().ernie_correct
    else:
        return

    if on_sighan:
        eval.eval_sighan2015_by_model(correct_fn)
    else:
        eval.eval_corpus500_by_model(correct_fn)
예제 #4
0
def error(data_json):
    """Run the BERT corrector over a JSON document set and encode the replacements.

    The JSON file is a list of documents. From each document ``file_type``,
    ``file_no`` and ``sentences`` are read. Only sentences of type ``"正文"``,
    or of type ``'标题'`` when ``file_type`` is not ``'3'``, are corrected.

    For every ``replace`` opcode between a detail's original and fixed text,
    one code string is built by concatenating file type, file number,
    paragraph number, sentence number, the two-digit start and end positions,
    the literal ``1001,`` and the replacement text.

    Args:
        data_json: path of the UTF-8 encoded JSON file.

    Returns:
        ``(error_list, error_detail)``: the code strings, and a parallel list
        of ``(code, corrected_sentence, diffs)`` tuples where ``diffs`` holds
        the ``find_diff`` result of every detail of that sentence.
    """
    with open(data_json, 'r', encoding='utf-8') as fp:
        documents = json.load(fp)

    # model mode
    corrector = BertCorrector()

    error_list = []
    error_detail = []
    for document in documents:
        file_type = document.get('file_type')
        file_no = document.get('file_no')
        for sentence in document.get('sentences'):
            is_body = sentence.get('type') == "正文"
            if not is_body:
                is_title = file_type != '3' and sentence.get('type') == '标题'
                if not is_title:
                    continue

            paragraph_no = sentence.get('paragraph_no')
            sentence_no = sentence.get('sentence_no')
            text = sentence.get('text')

            # model mode
            corrected_sent, details = corrector.bert_correct(text)

            diffs = []
            # each detail looks like: ['七', '器', 1, 2]
            for detail in details:
                diffs.append(find_diff(detail))
                wrong_text = detail[0]
                fixed_text = detail[1]
                matcher = difflib.SequenceMatcher(None, wrong_text, fixed_text)
                for tag, i1, i2, j1, j2 in matcher.get_opcodes():
                    if tag != 'replace':
                        continue
                    # Positions are offset by the detail's start index;
                    # the start position is shifted by one.
                    start_pos = str(detail[2] + 1 + i1).zfill(2)
                    end_pos = str(detail[2] + i2).zfill(2)
                    code = '%s%s%s%s%s%s1001,%s' % (
                        file_type, file_no, paragraph_no, sentence_no,
                        start_pos, end_pos, fixed_text[j1:j2])
                    error_list.append(code)
                    error_detail.append((code, corrected_sent, diffs))
    return error_list, error_detail
예제 #5
0
def error(data_json):
    """Run the BERT corrector over a JSON document set and encode the replacements.

    The JSON file is a list of documents. From each document ``file_type``,
    ``file_no`` and ``sentences`` are read. Only sentences of type ``"正文"``,
    or of type ``'标题'`` when ``file_type`` is not ``'3'``, are corrected.
    A ``'标题'`` text is split on newlines and every piece is corrected on its
    own, with the piece index added to the paragraph number.

    For every ``replace`` opcode between a detail's original and fixed text,
    one code string is built by concatenating file type, file number,
    paragraph (2 digits), start position (3 digits), paragraph again,
    end position (3 digits), the literal ``1,`` and the replacement text.
    Positions are lengths of ``seg_char`` applied to prefixes of the piece
    (presumably a token/character segmentation -- confirm against ``seg_char``).

    Args:
        data_json: path of the UTF-8 encoded JSON file.

    Returns:
        ``(error_list, error_detail)``: the code strings, and a parallel list
        of ``(code, corrected_sentence, diffs)`` tuples where ``diffs`` holds
        the ``find_diff`` result of every detail of that piece.
    """
    with open(data_json, 'r', encoding='utf-8') as fp:
        documents = json.load(fp)

    # model mode
    corrector = BertCorrector()

    error_list = []
    error_detail = []
    for document in documents:
        file_type = document.get('file_type')
        file_no = document.get('file_no')
        first_type = None
        first_no = None
        # NOTE(review): local_part is only ever assigned 0 in this function.
        local_part = 0
        for sentence in document.get('sentences'):
            is_body = sentence.get('type') == "正文"
            if not is_body:
                is_title = file_type != '3' and sentence.get('type') == '标题'
                if not is_title:
                    continue

            paragraph_no = sentence.get('paragraph_no')
            sentence_no = sentence.get('sentence_no')
            text = sentence.get('text')

            # Titles may span several lines; each line is corrected separately.
            if sentence.get('type') == '标题':
                pieces = text.split("\n")
            else:
                pieces = [text]

            for piece_idx, piece in enumerate(pieces):
                # model mode
                corrected_sent, details = corrector.bert_correct(piece)

                # Reset the offset whenever the sentence type or paragraph changes.
                same_group = (first_type == sentence.get('type')
                              and first_no == paragraph_no)
                if not same_group:
                    local_part = 0
                    first_type = sentence.get('type')
                    first_no = paragraph_no

                diffs = []
                # each detail looks like: ['七', '器', 1, 2]
                for detail in details:
                    diffs.append(find_diff(detail))
                    wrong_text = detail[0]
                    fixed_text = detail[1]
                    matcher = difflib.SequenceMatcher(None, wrong_text, fixed_text)
                    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
                        if tag != 'replace':
                            continue
                        # Prefixes of the piece up to (and including) the start
                        # and end of the replaced span, measured via seg_char.
                        start_units = seg_char(piece[:detail[2] + i1 + 1])
                        end_units = seg_char(piece[:detail[2] + i2 + 1])
                        paragraph_code = str(int(paragraph_no) + piece_idx).zfill(2)
                        start_code = str(len(start_units) + local_part).zfill(3)
                        end_code = str(len(end_units) - 1 + local_part).zfill(3)
                        code = '%s%s%s%s%s%s1,%s' % (
                            file_type, file_no,
                            paragraph_code, start_code,
                            paragraph_code, end_code,
                            fixed_text[j1:j2])
                        error_list.append(code)
                        error_detail.append((code, corrected_sent, diffs))
    return error_list, error_detail
예제 #6
0
# -*- coding: utf-8 -*-
# @Author  : Dapeng
# @File    : test.py
# @Desc    : 
# @Contact : [email protected]
# @Time    : 2020/10/12 下午6:59
from pycorrector.bert.bert_corrector import BertCorrector

# Corrector instance used for every sample below.
d = BertCorrector()
# Demo inputs. The third entry is the second sentence with spaces inserted
# between characters.
error_sentences = [
    '疝気医院那好 为老人让坐,疝気专科百科问答',
    '少先队员因该为老人让坐',
    '少 先  队 员 因 该 为 老人让坐',
    '机七学习是人工智能领遇最能体现智能的一个分知',
    '今天心情很好',
]
# Correct each sample and print the input, the corrected sentence and the
# second value returned by bert_correct.
for sent in error_sentences:
    corrected_sent, err = d.bert_correct(sent)
    print("original sentence:{} => {}, err:{}".format(sent, corrected_sent, err))
예제 #7
0
        '少先队员因该为老人让坐\r\n少 先  队 员 因 该 为 老人让坐\r\n机七学习是人工智能领遇最能体现智能的一个分知\r\n到以深切的问候',
        style="margin: 0px; width: 310px; height: 187px;"),
    # form.Textbox("bax",
    #     form.notnull,
    #     form.regexp('\d+', 'Must be a digit'),
    #     form.Validator('Must be more than 5', lambda x:int(x)>5)),
    form.Textarea(name='output_sentences',
                  style="margin: 0px; width: 310px; height: 187px;"),
    # form.Checkbox('curly'),
    # form.Dropdown('french', ['mustard', 'fries', 'wine'])
)

# Form with a single textarea named 'output_sentences'.
# NOTE(review): the name reads like a typo of "RetForm" -- confirm before renaming.
RetFrom = form.Form(form.Textarea(name='output_sentences'))

from pycorrector.bert.bert_corrector import BertCorrector
# Module-level corrector instance, created once when this module is imported.
d = BertCorrector()


class index:
    def GET(self):
        form = InputForm()
        # 确保通过调用它来创建表单的副本(上面一行)
        # 否则更改将全局显示
        # return form.read()
        return render.formtest(form)

    def POST(self):
        inputform = InputForm()
        if not inputform.validates():
            return render.formtest(inputform)
        else: