Пример #1
0
def eval_bcmi_data(data_path, verbose=False):
    """Measure sentence-level correction accuracy on a BCMI-format file.

    Every line of ``data_path`` is stripped and handed to
    ``get_bcmi_corpus``, which yields an (error sentence, reference
    sentence) pair. Lines whose error sentence comes back empty are skipped.
    The error sentence is run through ``correct`` and the prediction counts
    as right only when it is exactly equal to the reference sentence.

    Args:
        data_path: path of the UTF-8 text file to evaluate.
        verbose: when true, print each input/prediction/reference triple and
            the final counters.

    Returns:
        A tuple ``(accuracy, right_result, wrong_result)``. ``accuracy`` is
        ``right_count / sentence_size``, or ``0.0`` when no sentence was
        evaluated. Both dicts map an error sentence to
        ``[right_sentence, pred_sentence]``.
    """
    # Start at zero: the former initial value of 1 counted a sentence that
    # was never evaluated and therefore deflated every reported accuracy.
    sentence_size = 0
    right_count = 0
    right_result = dict()
    wrong_result = dict()
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            error_sentence, right_sentence = get_bcmi_corpus(line.strip())
            if not error_sentence:
                continue
            # Only the corrected text is scored; the edit details are unused.
            pred_sentence, _ = correct(error_sentence)
            if verbose:
                print('input sentence:', error_sentence)
                print('pred sentence:', pred_sentence)
                print('right sentence:', right_sentence)
            sentence_size += 1
            outcome = [right_sentence, pred_sentence]
            if right_sentence == pred_sentence:
                right_count += 1
                right_result[error_sentence] = outcome
            else:
                wrong_result[error_sentence] = outcome
    if verbose:
        print('right count:', right_count, ';sentence size:', sentence_size)
    # Guard the division so an empty (or fully skipped) file yields 0.0.
    accuracy = right_count / sentence_size if sentence_size else 0.0
    return accuracy, right_result, wrong_result
Пример #2
0
def eval_bcmi_data(data_path, verbose=False):
    """Compute exact-match correction accuracy over a BCMI-format file.

    Each stripped line of ``data_path`` is parsed by ``get_bcmi_corpus``
    into an (error sentence, reference sentence) pair; a line is skipped
    when the error sentence is empty. ``correct`` is applied to the error
    sentence and its output is compared for equality with the reference.

    Args:
        data_path: path of the UTF-8 text file to evaluate.
        verbose: when true, print every input/prediction/reference triple
            plus the final counters.

    Returns:
        ``(accuracy, right_result, wrong_result)`` where ``accuracy`` is
        ``right_count / sentence_size`` (``0.0`` if nothing was evaluated)
        and each dict maps an error sentence to
        ``[right_sentence, pred_sentence]``.
    """
    # The counter used to start at 1, which put one phantom sentence in the
    # denominator; count only sentences that are actually evaluated.
    sentence_size = 0
    right_count = 0
    right_result = dict()
    wrong_result = dict()
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            error_sentence, right_sentence = get_bcmi_corpus(line)
            if not error_sentence:
                continue
            # The second return value (edit details) is not needed here.
            pred_sentence, _ = correct(error_sentence)
            if verbose:
                print('input sentence:', error_sentence)
                print('pred sentence:', pred_sentence)
                print('right sentence:', right_sentence)
            sentence_size += 1
            if right_sentence == pred_sentence:
                right_count += 1
                right_result[error_sentence] = [right_sentence, pred_sentence]
            else:
                wrong_result[error_sentence] = [right_sentence, pred_sentence]
    if verbose:
        print('right count:', right_count, ';sentence size:', sentence_size)
    if not sentence_size:
        # Nothing evaluated: report zero accuracy instead of dividing by 0.
        return 0.0, right_result, wrong_result
    return right_count / sentence_size, right_result, wrong_result
Пример #3
0
 def test_text4():
     """Run ``correct`` on a batch of sample sentences and print each result."""
     samples = [
         '我喜欢打监球,你呢?足球吗',
         '老师工作非常幸苦,我们要遵敬老师',
         ' 我兴高彩列地去公园游玩',
         '老师的生体不好,可她艰持给我们上课',
         '我们要宝护它们',
         '讲台上放着一只漂亮的刚笔',
         '春暖花开之时我们躯车到了海滨渡假村',
         '按照上级布署安排',
         '冬冬今天戴来了一本好看的童话书',
         '少先队员因该为老人让坐',
         '服装店里的衣服各试各样',
         '一只小鱼船浮在平净的河面上',
         '我的家乡是有明的渔米之乡',
         ' _ ,',
         '我对于宠物出租得事非常认同,因为其实很多人喜欢宠物',  # expected: 出租的事
         '有了宠物出租地方另一方面还可以题高人类对动物的了解,因为那些专业人氏可以指导我们对于动物的习惯。',  # expected: 题高 => 提高, 专业人氏 => 专业人士
         '三个凑皮匠胜过一个诸葛亮也有道理。',  # character of interest: 凑
         '还有广告业是只要桌子前面坐者工作未必产生出来好的成果。',
         '还有我要看他们的个性,如果跟同时合不来受到压力的话,无法专心地工作。',
     ]
     report = "original sentence:{} => correct sentence:{}"
     for sample in samples:
         # Whatever ``correct`` returns is printed as-is next to the input.
         print(report.format(sample, correct(sample)))
Пример #4
0
def eval_sighan(input_path, output_path, param_ec, param_gd, verbose=False):
    '''
    Correct every sentence of a tab-separated input file and write the
    predictions to an output file, logging progress on stderr.

    Input:
        input_path:  file of original sentences      form: (pid)<TAB>error_sentence
        output_path: path of predicted sentences     form: (pid)<TAB>corrected_sentence
        param_ec:    forwarded unchanged to correct() as its second argument
        param_gd:    forwarded unchanged to correct() as its third argument
        verbose:     print the error and corrected sentences during running or not
                     (when false a tqdm progress bar is shown instead)

    Whitespace-only lines are skipped. Only the first tab separates the pid
    from the sentence, so a sentence may itself contain tabs.
    '''

    sys.stderr.write(
        'sighan15_test            : start correcting sentences......\n')
    sys.stderr.write('error_sentences_path     : ' + input_path + '\n')
    sys.stderr.write('corrected_sentences_path : ' + output_path + '\n')

    # Text mode on purpose: the built-in open() rejects an encoding together
    # with 'rb', while codecs.open() treats 'r' and 'rb' alike once an
    # encoding is given. The with-block closes both files even on errors.
    with open(input_path, 'r', encoding='utf-8') as sighan_data, \
            open(output_path, 'w+', encoding='utf-8') as corr_file:
        lines = sighan_data.readlines()
        # The progress bar is only used in quiet mode so that it does not
        # interleave with the per-sentence log lines.
        for line in (lines if verbose else tqdm(lines)):
            if not line.strip():
                continue
            pid, sentence = line.split('\t', 1)
            sentence = sentence.strip()
            pred_sent, pred_detail = correct(sentence, param_ec, param_gd)

            if verbose:
                changes = ', '.join(i[0][0] + '-->' + i[0][1]
                                    for i in pred_detail if i)
                sys.stderr.write('input sentence : ' + sentence + '\n')
                sys.stderr.write('pred sentence  : ' + pred_sent + '\n')
                sys.stderr.write('predict change : ' + changes + '\n')

            corr_file.write(pid + '\t' + pred_sent + '\n')

    sys.stderr.write(
        'sighan15_test            : finishing correcting sentences\n')
Пример #5
0
def eval_sighan_corpus(pkl_path, verbose=False):
    """Run ``correct`` over a pickled corpus and count its reference edits.

    The corpus loaded by ``load_pkl`` is iterated as
    ``(error_sentence, right_detail)`` pairs, where every ``right_detail``
    entry unpacks to ``(location, wrong, right)``.

    NOTE(review): predictions are never compared with the reference edits
    here, so ``right_count`` stays 0 and both result dicts stay empty; the
    returned ratio is therefore always zero. The counter also starts at 1.
    Confirm the intended scoring before relying on this value.

    Args:
        pkl_path: path handed to ``load_pkl``.
        verbose: when true, print each sentence, its prediction and every
            reference edit.

    Returns:
        ``(right_count / total_count, right_result, wrong_result)``.
    """
    right_result, wrong_result = dict(), dict()
    right_count = 0
    total_count = 1
    for src_sentence, gold_edits in load_pkl(pkl_path):
        predicted, _details = correct(src_sentence)
        if verbose:
            print('input sentence:', src_sentence)
            print('pred sentence:', predicted)
        for (gold_loc, gold_wrong, gold_right) in gold_edits:
            total_count += 1
            if not verbose:
                continue
            print('right: {} => {} , index: {}'.format(gold_wrong, gold_right, gold_loc))
    ratio = right_count / total_count
    return ratio, right_result, wrong_result
Пример #6
0
def eval_sighan_corpus(pkl_path, verbose=False):
    """Apply ``correct`` to each corpus sentence and tally reference edits.

    ``load_pkl(pkl_path)`` is iterated as ``(error_sentence, right_detail)``
    pairs; each ``right_detail`` item unpacks to ``(index, wrong, right)``.

    NOTE(review): no comparison between prediction and reference is made,
    so ``right_count`` is never incremented and the two result dicts are
    returned empty; the ratio is always zero. ``total_count`` starts at 1
    rather than 0. Verify the intended metric with the author.

    Args:
        pkl_path: path handed to ``load_pkl``.
        verbose: when true, print sentences, predictions and reference edits.

    Returns:
        ``(right_count / total_count, right_result, wrong_result)``.
    """
    corpus = load_pkl(pkl_path)
    right_count, total_count = 0, 1
    right_result = {}
    wrong_result = {}
    edit_report = 'right: {} => {} , index: {}'
    for sentence, reference_edits in corpus:
        prediction, _unused_detail = correct(sentence)
        if verbose:
            print('input sentence:', sentence)
            print('pred sentence:', prediction)
        for (index, wrong_text, right_text) in reference_edits:
            total_count += 1
            if verbose:
                print(edit_report.format(wrong_text, right_text, index))
    return right_count / total_count, right_result, wrong_result
Пример #7
0
def eval_sighan_corpus(pkl_path, verbose=False):
    """Score ``correct`` against the reference edits of a pickled corpus.

    The corpus from ``load_pkl`` is iterated as
    ``(error_sentence, right_detail)`` pairs. Each ``right_detail`` entry
    unpacks to ``(location, wrong, right)``. Each ``pred_detail`` entry
    returned by ``correct`` is a one-element list holding a
    ``(wrong, right, begin, end)`` tuple.

    A reference edit counts as found when its replacement text contains, or
    is contained in, the replacement text of a prediction that has not been
    matched yet. Each prediction can satisfy at most one reference edit and
    each reference edit is credited at most once, so the ratio never
    exceeds 1.

    Changes from the previous revision:
      * predictions are consumed from a private copy, instead of removing
        items from ``pred_detail`` while iterating over it (which skipped
        elements);
      * the ``elif not right_detail`` branch was dropped, because inside a
        loop over ``right_detail`` it could never run;
      * the denominator starts at 0 instead of 1.

    Args:
        pkl_path: path handed to ``load_pkl``.
        verbose: when true, print each sentence, its prediction and every
            reference edit.

    Returns:
        ``(accuracy, right_result, wrong_result)``. ``accuracy`` is
        ``right_count / total_count`` (``0.0`` for a corpus without
        reference edits). ``right_result`` maps an error sentence to the
        last matched ``[right_r, pred_r]`` pair. ``wrong_result`` is never
        filled and is returned empty.
    """
    sighan_data = load_pkl(pkl_path)
    total_count = 0
    right_count = 0
    right_result = dict()
    wrong_result = dict()
    for error_sentence, right_detail in sighan_data:
        pred_sentence, pred_detail = correct(error_sentence)
        if verbose:
            print('input sentence:', error_sentence)
            print('pred sentence :', pred_sentence)
        # Private copy: matched predictions are removed from it so that one
        # prediction cannot be credited to two reference edits.
        unmatched = list(pred_detail) if pred_detail else []
        for (right_loc, right_w, right_r) in right_detail:
            total_count += 1
            for candidate in unmatched:
                [(_, pred_r, _, _)] = candidate
                if right_r in pred_r or pred_r in right_r:
                    right_count += 1
                    right_result[error_sentence] = [right_r, pred_r]
                    unmatched.remove(candidate)
                    # Stop at the first hit; the list was just mutated.
                    break

            if verbose:
                print('right: {} => {} , index: {}'.format(
                    right_w, right_r, right_loc))
    accuracy = right_count / total_count if total_count else 0.0
    return accuracy, right_result, wrong_result
Пример #8
0
# -*- coding: utf-8 -*-
#!/usr/bin/env python
#
import os
import sys
sys.path.append("../")
import re
from codecs import open
from pycorrector.corrector import correct
from pycorrector.utils.io_utils import load_pkl

from tqdm import tqdm
import pdb

# Locate the test sentences and the prediction file relative to this script.
pwd_path = os.path.abspath(os.path.dirname(__file__))
data_path = os.path.join(pwd_path, '../pycorrector/data/test/source.txt')
pred_path = os.path.join(pwd_path, '../pycorrector/data/test/prediction.txt')

# Read all source lines up front so tqdm knows the total. The with-block
# closes the handle, which the previous version left open. Mode 'r' is
# accepted with an encoding by both codecs.open and the built-in open.
with open(data_path, 'r', encoding='utf-8') as source_file:
    input_file = source_file.readlines()

# The output handle is now closed even if correct() raises part-way through.
with open(pred_path, 'w', encoding='utf-8') as output_file:
    for err_sent in tqdm(input_file):
        # Each line is passed on exactly as read and the prediction is
        # written without adding a newline.
        # NOTE(review): this presumably relies on correct() preserving the
        # trailing line terminator - confirm.
        pred_sent, pred_detail = correct(err_sent)
        output_file.write(pred_sent)
Пример #9
0
    # '第一位京第二位,他说:第二位利好。',
    # '我准备一些面包给他吃,我也从冰箱拿出来了埤酒',
    # '所以我很高心',
    # '请我座在沙发上',
    # '美食美事皆不可辜负,这场盛会你一定期待已久',
    # '点击咨询痣疮是什么原因?咨询医师痣疮原因',
    # '附睾焱的症状?要引起注意!',
    # '外阴尖锐涅疣怎样治疗?-济群解析',
    # '洛阳大华雅思 30天突破雅思7分',
    # '男人不育少靖子症如何治疗?专业男科,烟台京城医院',
    # '疝気医院那好 疝気专科百科问答',
    # '成都医院治扁平苔鲜贵吗_国家2甲医院',
    # '少先队员因该为老人让坐',
    # '服装店里的衣服各试各样',
    # '一只小鱼船浮在平净的河面上',
    # '我的家乡是有明的渔米之乡',
    # ' _ ,',
    # '我对于宠物出租得事非常认同,因为其实很多人喜欢宠物',  # 出租的事
    # '有了宠物出租地方另一方面还可以题高人类对动物的了解,因为那些专业人氏可以指导我们对于动物的习惯。',  # 题高 => 提高 专业人氏 => 专业人士
    # '三个凑皮匠胜过一个诸葛亮也有道理。',  # 凑
    # '还有广告业是只要桌子前面坐者工作未必产生出来好的成果。',
]
# Correct every sample sentence in turn and print it next to the result.
_REPORT = "original sentence:{} => correct sentence:{}"
for line in error_sentences:
    print("starting correction...")
    correct_sent = corrector.correct(line)
    print(_REPORT.format(line, correct_sent))




Пример #10
0
def reader(in_file):
    """Return the first tab-separated field of every line in ``in_file``.

    Each line is stripped of surrounding whitespace before it is split, so
    blank lines contribute an empty string. The file name and the number of
    lines read are printed once the file has been consumed.
    """
    with open(in_file, 'r', encoding='utf-8') as handle:
        texts = [raw.strip().split("\t")[0] for raw in handle]
    print("in file: %s, cout: %d" % (in_file, len(texts)))
    return texts


def saver(out_file, lines):
    """Write each item of ``lines`` to ``out_file``, one stripped item per line.

    Surrounding whitespace is removed from every item and a single newline
    is appended. The file name and the number of lines written are printed
    afterwards.
    """
    written = 0
    with open(out_file, 'w', encoding='utf-8') as sink:
        for written, entry in enumerate(lines, start=1):
            sink.write(entry.strip() + '\n')
    print("save file: %s, cout: %d" % (out_file, written))


# Correct every input sentence, echo it, and save "input<TAB>output<TAB>details".
input_lines = reader(in_file)
correct_lines = []
for line in input_lines:
    correct_sent, error_detail = corrector.correct(line)
    print("{}\t{}\t{}".format(line, correct_sent, error_detail))
    record = '\t'.join((line, correct_sent, str(error_detail)))
    correct_lines.append(record)
saver(out_file, correct_lines)
Пример #11
0
 def correct():
     """Print a sample sentence, then whatever ``correct`` returns for it.

     NOTE(review): this function is itself named ``correct``. If it lives at
     module scope it shadows the corrector it means to call, and the inner
     one-argument call fails with TypeError - confirm the enclosing scope
     or the intended name.
     """
     # Alternative sample: '机七学习是人工智能领遇最能体现智能的'
     sample = '少先队员因该为老人让坐'
     print('input sentence is:', sample)
     result = correct(sample)
     print(result)
Пример #12
0
 def test_text3():
     """Print one sample sentence (with quotes) next to what ``correct`` returns."""
     sample = '我们现今所"使用"的大部分舒学符号,你们用的什么婊点符号'
     report = "original sentence:{} => correct sentence:{}"
     print(report.format(sample, correct(sample)))
Пример #13
0
 def test_text2():
     """Print one sample sentence next to what ``correct`` returns for it."""
     sample = '杭洲是中国的八大古都之一,因风景锈丽,享有“人间天棠”的美誉!'
     outcome = correct(sample)
     message = "original sentence:{} => correct sentence:{}".format(sample, outcome)
     print(message)
Пример #14
0
 def test_text1():
     """Print one sample sentence next to what ``correct`` returns for it."""
     sample = '机七学习是人工智能领遇最能体现智能的一个分知'
     report = "original sentence:{} => correct sentence:{}"
     outcome = correct(sample)
     print(report.format(sample, outcome))
Пример #15
0
# Brief:
from pycorrector import corrector

# Sample sentences fed to the corrector below.
error_sentences = [
    '汽车新式在这条路上',
    '中国人工只能布局很不错',
    '想不想在来一次比赛',
    '你不觉的高兴吗',
    '权利的游戏第八季',
    '美食美事皆不可辜负,这场盛会你一定期待已久',
    '点击咨询痣疮是什么原因?咨询医师痣疮原因',
    '附睾焱的症状?要引起注意!',
    '外阴尖锐涅疣怎样治疗?-济群解析',
    '洛阳大华雅思 30天突破雅思7分',
    '男人不育少靖子症如何治疗?专业男科,烟台京城医院',
    '疝気医院那好 疝気专科百科问答',
    '成都医院治扁平苔鲜贵吗_国家2甲医院',
    '少先队员因该为老人让坐',
    '服装店里的衣服各试各样',
    '一只小鱼船浮在平净的河面上',
    '我的家乡是有明的渔米之乡',
    ' _ ,',
    '我对于宠物出租得事非常认同,因为其实很多人喜欢宠物',  # expected: 出租的事
    '有了宠物出租地方另一方面还可以题高人类对动物的了解,因为那些专业人氏可以指导我们对于动物的习惯。',  # expected: 题高 => 提高, 专业人氏 => 专业人士
    '三个凑皮匠胜过一个诸葛亮也有道理。',  # character of interest: 凑
    '还有广告业是只要桌子前面坐者工作未必产生出来好的成果。',
]
# Print every sample next to whatever the corrector returns for it.
_REPORT = "original sentence:{} => correct sentence:{}"
for line in error_sentences:
    correct_sent = corrector.correct(line)
    print(_REPORT.format(line, correct_sent))
Пример #16
0
    lines = list()
    cout = 0
    with open(in_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            text = line.split("\t")[0]
            lines.append(text)
            cout += 1
    print("in file: %s, cout: %d" % (in_file, cout))
    return lines


def saver(out_file, lines):
    """Save ``lines`` to ``out_file`` as UTF-8, one stripped entry per line.

    Every entry has its surrounding whitespace removed and is terminated
    with a single newline. A summary with the file name and the number of
    entries written is printed at the end.
    """
    total = 0
    with open(out_file, 'w', encoding='utf-8') as out_handle:
        for total, item in enumerate(lines, start=1):
            cleaned = item.strip()
            out_handle.write(cleaned + '\n')
    print("save file: %s, cout: %d" % (out_file, total))


# Run the corrector over each input line, echo the triple, then persist all
# "input<TAB>output<TAB>details" records in one go.
input_lines = reader(in_file)
correct_lines = []
for line in input_lines:
    correct_sent, error_detail = corrector.correct(line)
    print("{}\t{}\t{}".format(line, correct_sent, error_detail))
    correct_lines.append('\t'.join([line, correct_sent, str(error_detail)]))
saver(out_file, correct_lines)