Exemplos de BertCorrector.bert_correct em Python, exemplos de pycorrector.bert.bert_corrector.BertCorrector.bert_correct em Python

Exemplo n.º 1

0

Exibir arquivo

def eval_sighan_2015_by_bert(sighan_path=sighan_2015_path,
                             verbose=True,
                             num_limit_lines=100):
    """Evaluate BertCorrector on a two-column evaluation file and print the scores.

    Every usable line holds exactly two whitespace-separated fields: the
    source sentence and the reference correction. Lines starting with '#'
    and lines that do not split into exactly two fields are ignored.

    A sample counts as "right" when the prediction equals the reference.
    Recall is computed only over samples whose source differs from the
    reference (i.e. samples that actually contain an error).

    Args:
        sighan_path: path of the UTF-8 encoded evaluation file.
        verbose: when true, print each sample with its prediction.
        num_limit_lines: maximum number of samples to evaluate; a value
            <= 0 disables the limit.
    """
    from pycorrector.bert.bert_corrector import BertCorrector
    model = BertCorrector()
    total_count = 0
    right_count = 0
    right_rate = 0.0
    recall_rate = 0.0
    recall_right_count = 0
    recall_total_count = 0
    start_time = time.time()
    with open(sighan_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Stop once exactly `num_limit_lines` samples were evaluated.
            # `<=` avoids evaluating one sample too many, and `break` avoids
            # scanning the remainder of the file for nothing.
            if 0 < num_limit_lines <= total_count:
                break
            line = line.strip()
            if line.startswith('#'):
                continue
            parts = line.split()
            if len(parts) != 2:
                continue
            src, trg = parts

            pred, pred_detail = model.bert_correct(src)

            has_error = src != trg
            is_right = pred == trg
            if has_error:
                recall_total_count += 1
            if is_right:
                right_count += 1
                if has_error:
                    recall_right_count += 1

            if verbose:
                print("\nright:" if is_right else "\nwrong:")
                print(
                    f'input  : {src}\ntruth  : {trg}\npredict: {pred} pred_detail: {pred_detail}'
                )
            total_count += 1

    spend_time = time.time() - start_time

    # Rates stay at 0.0 when there is nothing to divide by.
    if total_count > 0:
        right_rate = right_count / total_count
    if recall_total_count > 0:
        recall_rate = recall_right_count / recall_total_count
    print(
        'right_rate:{}, right_count:{}, total_count:{};\n'
        'recall_rate:{}, recall_right_count:{}, recall_total_count:{}, spend_time:{} s'
        .format(right_rate, right_count, total_count, recall_rate,
                recall_right_count, recall_total_count, spend_time))

Exemplo n.º 2

0

Exibir arquivo

def eval_corpus500_by_bert(input_eval_path=eval_data_path, output_eval_path='', verbose=True):
    """Evaluate BertCorrector on a JSON corpus and print accuracy and recall.

    The corpus is read with ``load_json``; every entry is accessed through
    the keys 'text' (input sentence), 'correction' (reference sentence) and
    'errors' (annotated errors, empty when the input is already correct).

    A sample counts as "right" when the predicted sentence equals the
    reference. Recall is computed over samples that have annotated errors
    and additionally requires a non-empty prediction detail.

    Args:
        input_eval_path: path of the JSON corpus.
        output_eval_path: when non-empty, the wrongly predicted samples
            (with 'pred_sentence' and 'pred_errors' added) are written
            there with ``save_json``.
        verbose: when true, print each sample with its prediction.
    """
    from pycorrector.bert.bert_corrector import BertCorrector
    model = BertCorrector()
    res = []
    corpus = load_json(input_eval_path)
    total_count = 0
    right_count = 0
    right_rate = 0.0
    recall_rate = 0.0
    recall_right_count = 0
    recall_total_count = 0
    start_time = time.time()
    for data_dict in corpus:
        text = data_dict.get('text', '')
        correction = data_dict.get('correction', '')
        errors = data_dict.get('errors', [])

        #  pred_detail: list(wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = model.bert_correct(text)
        is_right = correction == pred_sentence

        # compute recall
        if errors:
            recall_total_count += 1
            if pred_detail and is_right:
                recall_right_count += 1

        # compute precision
        if is_right:
            right_count += 1
            # Honour `verbose` here too, like the "wrong" branch does.
            if verbose:
                print("\nright:")
                print('truth  :', text, errors)
                print('predict:', pred_sentence, pred_detail)
        else:
            # Copy so the loaded corpus entry is not modified.
            err_data_dict = copy.deepcopy(data_dict)
            err_data_dict['pred_sentence'] = pred_sentence
            err_data_dict['pred_errors'] = str(pred_detail)
            res.append(err_data_dict)
            if verbose:
                print("\nwrong:")
                print('input  :', text)
                print('truth  :', correction, errors)
                print('predict:', pred_sentence, pred_detail)
        total_count += 1
    spend_time = time.time() - start_time

    # Rates stay at 0.0 when there is nothing to divide by.
    if total_count > 0:
        right_rate = right_count / total_count
    if recall_total_count > 0:
        recall_rate = recall_right_count / recall_total_count
    print('right_rate:{}, right_count:{}, total_count:{};\n'
          'recall_rate:{}, recall_right_count:{}, recall_total_count:{}, spend_time:{} s'.format(right_rate,
                                                                                                 right_count,
                                                                                                 total_count,
                                                                                                 recall_rate,
                                                                                                 recall_right_count,
                                                                                                 recall_total_count,
                                                                                                 spend_time))
    if output_eval_path:
        save_json(res, output_eval_path)

Exemplo n.º 3

0

Exibir arquivo

def error(data_json):
    """Run BertCorrector over the sentences of a JSON file and encode its fixes.

    The file must contain a list of entries read through the keys
    'file_type', 'file_no' and 'sentences'. Each sentence is read through
    'type', 'paragraph_no', 'sentence_no' and 'text'. A sentence is checked
    when its type is "正文", or when its type is '标题' and the entry's
    file_type is not '3'.

    Args:
        data_json: path of the UTF-8 encoded JSON file.

    Returns:
        A pair ``(error_list, error_detail)``. ``error_list`` holds one code
        string per replaced span: file_type, file_no, paragraph_no,
        sentence_no, the two-digit zero-padded start and end positions, the
        literal '1001', a comma and the replacement text. ``error_detail``
        holds ``(code, corrected_sentence, new_details)`` tuples, where
        ``new_details`` collects the ``find_diff`` result of each detail.
    """
    with open(data_json, 'r', encoding='utf-8') as f:
        data_list = json.load(f)

    # model mode
    d = BertCorrector()

    error_list = []
    error_detail = []
    for data in data_list:
        file_type = data.get('file_type')
        file_no = data.get('file_no')
        # A missing or null 'sentences' entry means nothing to check.
        for sentence in data.get('sentences') or []:
            sentence_type = sentence.get('type')
            if sentence_type == "正文" or (file_type != '3' and sentence_type == '标题'):
                paragraph_no = sentence.get('paragraph_no')
                sentence_no = sentence.get('sentence_no')
                text = sentence.get('text')

                # model mode
                corrected_sent, details = d.bert_correct(text)

                new_details = []
                # details : ['七', '器', 1, 2]
                for detail in details:
                    new_details.append(find_diff(detail))
                    orign = detail[0]
                    fix = detail[1]
                    # Named `matcher` so the sentence loop variable is not
                    # rebound in the middle of its own loop.
                    matcher = difflib.SequenceMatcher(None, orign, fix)
                    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
                        if tag == 'replace':
                            # detail[2] is the begin index of the detail;
                            # the start position is emitted 1-based.
                            error_code = '%s%s%s%s%s%s1001,%s' % (file_type,
                                                                  file_no,
                                                                  paragraph_no,
                                                                  sentence_no,
                                                                  str(detail[2] + 1 + i1).zfill(2),
                                                                  str(detail[2] + i2).zfill(2),
                                                                  fix[j1:j2]
                                                                  )
                            error_list.append(error_code)
                            error_detail.append((error_code, corrected_sent, new_details))
    return error_list, error_detail

Exemplo n.º 4

0

Exibir arquivo

def error(data_json):
    """Correct the sentences of a JSON file with BertCorrector and encode the fixes.

    Entries are read through 'file_type', 'file_no' and 'sentences';
    sentences through 'type', 'paragraph_no' and 'text'. Body sentences
    ("正文") are always checked; titles ('标题') are checked unless the
    entry's file_type is '3'. A title is split on newlines and every piece
    is corrected separately, the piece index being added to the paragraph
    number in the emitted code.

    Returns:
        ``(error_list, error_detail)``: the code strings, and tuples of
        ``(code, corrected_sentence, new_details)``.
    """
    with open(data_json, 'r', encoding='utf-8') as f:
        data_list = json.load(f)

    # model mode
    corrector = BertCorrector()

    error_list = []
    error_detail = []
    for data in data_list:
        file_type = data.get('file_type')
        file_no = data.get('file_no')
        first_type = None
        first_no = None
        # NOTE(review): never advanced in the visible code, so it always
        # contributes 0 to the positions below - confirm intent.
        local_part = 0
        for s in data.get('sentences'):
            sent_type = s.get('type')
            is_title = sent_type == '标题'
            # Skip everything that is neither body text nor an eligible title.
            if sent_type != "正文" and (file_type == '3' or not is_title):
                continue

            paragraph_no = s.get('paragraph_no')
            text = s.get('text')
            pieces = text.split("\n") if is_title else [text]

            for piece_idx, piece in enumerate(pieces):
                # model mode
                corrected_sent, details = corrector.bert_correct(piece)

                # Reset the offset whenever the (type, paragraph) pair changes.
                if first_type != sent_type or first_no != paragraph_no:
                    local_part = 0
                    first_type = sent_type
                    first_no = paragraph_no

                new_details = []
                # details : ['七', '器', 1, 2]
                for detail in details:
                    new_details.append(find_diff(detail))
                    wrong_text = detail[0]
                    fix = detail[1]
                    matcher = difflib.SequenceMatcher(None, wrong_text, fix)
                    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
                        if tag != 'replace':
                            continue
                        # Positions are measured in seg_char units over the
                        # prefix of the piece up to the replaced span.
                        begin = detail[2]
                        head_to_start = piece[:begin + i1 + 1]
                        head_to_end = piece[:begin + i2 + 1]
                        start_units = seg_char(head_to_start)
                        end_units = seg_char(head_to_end)
                        fields = (
                            file_type,
                            file_no,
                            str(int(paragraph_no) + piece_idx).zfill(2),
                            str(len(start_units) + local_part).zfill(3),
                            str(int(paragraph_no) + piece_idx).zfill(2),
                            str(len(end_units) - 1 + local_part).zfill(3),
                            fix[j1:j2],
                        )
                        code = '%s%s%s%s%s%s1,%s' % fields
                        error_list.append(code)
                        error_detail.append(
                            (code, corrected_sent, new_details))
    return error_list, error_detail

Exemplo n.º 5

0

Exibir arquivo

# -*- coding: utf-8 -*-
# @Author  : Dapeng
# @File    : test.py
# @Desc    : 
# @Contact : [email protected]
# @Time    : 2020/10/12 下午6:59
from pycorrector.bert.bert_corrector import BertCorrector

# Build the corrector once and reuse it for every sample sentence.
d = BertCorrector()

# Sample inputs; the last one contains no error.
error_sentences = [
    '疝気医院那好 为老人让坐，疝気专科百科问答',
    '少先队员因该为老人让坐',
    '少 先  队 员 因 该 为 老人让坐',
    '机七学习是人工智能领遇最能体现智能的一个分知',
    '今天心情很好',
]

# One report line per sentence: input, corrected output, error details.
report_template = "original sentence:{} => {}, err:{}"
for sent in error_sentences:
    corrected_sent, err = d.bert_correct(sent)
    report_line = report_template.format(sent, corrected_sent, err)
    print(report_line)