def main(**kwargs):
    """
    Command-line entry point: correct a text file line by line.

    Every line of the input file is stripped, run through
    ``pycorrector.correct`` and written to the output file as a
    tab-separated row: the original line, the corrected sentence and,
    when ``detail`` is set, ``str()`` of the correction info.

    :param kwargs: keyword options; the keys read here are
        ``input`` (required) path of the text file to read, opened as utf-8;
        ``output`` (required) path of the text file to write, opened as utf-8;
        ``no_char`` (optional, default False) when truthy, char error
        detection is switched off before processing;
        ``detail`` (optional, default False) when truthy, the correction
        info is appended to each output row.
    :return: None
    """
    if kwargs.get('no_char', False):
        pycorrector.enable_char_error(enable=False)
        print('disable char error detect.')
    with_detail = kwargs.get('detail', False)

    # Stays 0 when the input file is empty; otherwise enumerate leaves the
    # number of processed lines in it.
    total = 0
    with open(kwargs['input'], 'r', encoding='utf-8') as src, \
            open(kwargs['output'], 'w', encoding='utf-8') as dst:
        for total, raw in enumerate(src, start=1):
            sentence = raw.strip()
            fixed, info = pycorrector.correct(sentence)
            columns = [sentence, fixed]
            if with_detail:
                columns.append(str(info))
            dst.write('\t'.join(columns) + '\n')
    print('{} lines in output'.format(total))
def test_brand():
    """Run brand-name correction examples and print the results."""
    pycorrector.enable_char_error(enable=False)
    samples = (
        '买衣服就到拼哆哆',  # intended brand name: 拼多多
        '这个特仑素牛奶喝起来还不错吧',  # intended brand name: 特仑苏
    )
    for sentence in samples:
        outcome = pycorrector.correct(sentence)
        print("original sentence:{} => correct sentence:{}".format(sentence, outcome))
def test_suyu():
    """Run common-saying (suyu) correction examples and print the results."""
    pycorrector.enable_char_error(enable=False)
    samples = (
        '这衣服买给她吧,也是肥水步流外人田',  # intended saying: 肥水不流外人田
        '这么多字让他写也是赶鸭子打架',  # intended saying: 赶鸭子上架
    )
    for sentence in samples:
        outcome = pycorrector.correct(sentence)
        print("original sentence:{} => correct sentence:{}".format(sentence, outcome))
def test_disease():
    """Run disease-name correction examples (drug names) and print the results."""
    pycorrector.enable_char_error(enable=False)
    samples = (
        '这个新药奥美砂坦脂片能治疗心绞痛,效果还可以',  # intended name: 奥美沙坦酯片
        '有个药名叫硫酸氢录吡各雷片能治疗高血压',  # intended name: 硫酸氢氯吡格雷片
    )
    for sentence in samples:
        outcome = pycorrector.correct(sentence)
        print("original sentence:{} => correct sentence:{}".format(sentence, outcome))
def test_chengyu():
    """Run idiom (chengyu) correction examples and print the results.

    Two idiom sentences are corrected with char error detection disabled.
    Then a batch of sentences is passed through ``detect`` and ``correct``
    twice: first with char error detection still disabled, then with it
    enabled, with a dashed separator line printed in between.
    """
    template = "original sentence:{} => correct sentence:{}"

    def report(sentences):
        # Print the detect result, then the correct result, for each sentence.
        for text in sentences:
            print(text, pycorrector.detect(text))
            print(text, pycorrector.correct(text))

    pycorrector.enable_char_error(enable=False)
    for sentence in (
        '这块名表带带相传',  # intended idiom: 代代相传
        '他贰话不说把牛奶喝完了',  # intended idiom: 二话不说
    ):
        print(template.format(sentence, pycorrector.correct(sentence)))

    # Annotated reference sentences: the character inside (( )) is the
    # intended replacement for the character right before it.
    # 这家伙还蛮格((恪))尽职守的。
    # 报应接中迩((而))来。
    # 人群穿((川))流不息。
    # 这个消息不径((胫))而走。
    # 眼前的场景美仑((轮))美幻简直超出了人类的想象。
    # 看着这两个人谈笑风声((生))我心理((里))不由有些忌妒。
    # 有了这一番旁证((征))博引。
    batch = [
        '这家伙还蛮格尽职守的',
        '报应接中迩来',  # intended idiom: 接踵而来
        '人群穿流不息',
        '这个消息不径而走',
        '这个消息不胫儿走',
        '眼前的场景美仑美幻简直超出了人类的想象',
        '看着这两个人谈笑风声我心理不由有些忌妒',
        '有了这一番旁证博引',
        '有了这一番旁针博引',
    ]

    # First pass: char error detection is still disabled.
    report(batch)

    # Second pass: same sentences with char error detection enabled.
    pycorrector.enable_char_error(enable=True)
    print("-" * 42)
    report(batch)
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: Demo script that corrects one sample sentence with char error
    detection disabled and prints the original next to the result.
"""
import sys

# Extend the import path with the parent of the working directory before
# importing pycorrector (presumably so a local checkout is found -- confirm).
sys.path.append("../")

import pycorrector

if __name__ == '__main__':
    pycorrector.enable_char_error(enable=False)
    error_sentence_1 = '我的喉咙发炎了要买点阿莫细林吃'
    correct_sent = pycorrector.correct(error_sentence_1)
    print("original sentence:{} => correct sentence:{}".format(error_sentence_1, correct_sent))