Пример #1
0
def test_char_correct_wrong():
    """Print pycorrector's detect and correct output for sample sentences.

    For every sentence two lines are printed: the sentence followed by the
    result of ``pycorrector.detect``, then the sentence followed by the
    result of ``pycorrector.correct``.
    """
    samples = (
        '她知难而上,沤心沥血,一心扑在舞台上',
        '还有你们看看清除哈',
        '我国人民义愤填鹰',
        '权利的游戏第八季',
        '2周岁22斤宝宝用多大的啊',
        '这个到底有多辣?',
        '所以先救挨饿的人,然后治疗病人。',
        '现在,常常会到听男女平等这个词。',
        '我的喉咙发炎了要买点阿莫细林吃',
        '做的最倒霉的一件事就帮尼哥檫脚。',
        '战士微笑著轻轻拍了拍少年的肩膀。',
        '差点拌到自己的脚。',
        '面对着熙熙嚷嚷的城市。',
        '你等我和老大商却一下。',
        '报应接中迩来。',
        '我心理不由有些忌妒。',
        '他们不需要怕他门没有钱。',
        '全球的产龄妇女总生育率只生下一半,根据调查很有可能一直到2050年产龄妇女总生育率还是减少的趋势。',
        '但现代的妇女所担任的责任已家重,除了家务以外,仍需出外工作补贴家',
        '加上父母亲自己的看法,想原封不动地、完完全全地全部传给子女们',
        '叶子的绿色与本身枝干的颜色都会变为偏较暗的颜色。',
    )
    for sentence in samples:
        detected = pycorrector.detect(sentence)
        print(sentence, detected)
        corrected = pycorrector.correct(sentence)
        print(sentence, corrected)
Пример #2
0
def correct_sentence(_sentence):
    """Print the correction and detection results for ``_sentence``.

    ``pycorrector.correct`` is called first and its two-part result is
    printed as the corrected sentence and the detail; afterwards the result
    of ``pycorrector.detect`` is printed. Nothing is returned.
    """
    outcome = pycorrector.correct(_sentence)
    fixed_text, changes = outcome
    print(">>> corrected_sent:", fixed_text)
    print(">>> detail:", changes)

    print(">>> index of errors:", pycorrector.detect(_sentence))
Пример #3
0
def test_chengyu():
    """Test idiom (chengyu) correction.

    Runs the samples once with ``enable_char_error`` switched off and once
    with it switched back on, printing the detect/correct output each time.
    """

    def _report(sentences):
        # One detect line and one correct line per sentence.
        for sentence in sentences:
            print(sentence, pycorrector.detect(sentence))
            print(sentence, pycorrector.correct(sentence))

    pycorrector.enable_char_error(enable=False)

    lead_samples = (
        '这块名表带带相传',  # intended idiom: 代代相传
        '他贰话不说把牛奶喝完了',  # intended idiom: 二话不说
    )
    for raw in lead_samples:
        fixed = pycorrector.correct(raw)
        print("original sentence:{} => correct sentence:{}".format(raw, fixed))

    # Inline notes give the intended replacement as "wrong -> right".
    idiom_samples = [
        '这家伙还蛮格尽职守的',  # 格 -> 恪
        '报应接中迩来',  # 迩 -> 而 (intended idiom: 接踵而来)
        '人群穿流不息',  # 穿 -> 川
        '这个消息不径而走',  # 径 -> 胫
        '这个消息不胫儿走',
        '眼前的场景美仑美幻简直超出了人类的想象',  # 仑 -> 轮
        '看着这两个人谈笑风声我心理不由有些忌妒',  # 声 -> 生, 理 -> 里
        '有了这一番旁证博引',  # 证 -> 征
        '有了这一番旁针博引',
    ]

    # First pass: char-level errors are still disabled.
    _report(idiom_samples)

    # Second pass: same samples with char-level errors re-enabled.
    pycorrector.enable_char_error(enable=True)
    print("-" * 42)
    _report(idiom_samples)
Пример #4
0
def text_corrector(text, confusion: bool = True, non_level="char") -> list:
    """Detect suspected wrong words in ``text`` and report where they start.

    Runs ``pycorrector.detect`` on ``text`` and keeps the reported items
    selected by ``non_level``. Each reported item is read as
    ``item[0]`` (the word) and ``item[1]`` (its start index).

    Args:
        text: Text to check.
        confusion: When true, load the custom confusion dictionary found at
            ``CONFUSION_PATH`` through
            ``pycorrector.set_custom_confusion_dict`` before detecting. This
            is done on every call. (Earlier notes mention
            ``config/ConfusionWords.txt`` -- presumably what
            ``CONFUSION_PATH`` points to; confirm against its definition.)
        non_level: The granularity to switch OFF:

            * ``"char"`` -- drop single-character hits; only words longer
              than one character are kept.
            * ``"word"`` -- drop multi-character hits; only words of exactly
              one character are kept.
            * ``None`` -- keep every hit.

            Any other value matches nothing, so an empty list is returned.

    Returns:
        A list of ``{"word": ..., "start_index": ...}`` dicts, one per kept
        item, in the order ``pycorrector.detect`` reported them. The list is
        empty when nothing is detected or nothing passes the filter.

    Example (actual hits depend on the pycorrector models and dictionaries)::

        text_corrector('少先队员因该为老人让坐')
        # -> [{'word': '因该', 'start_index': 4}]
        text_corrector('少先队员因该为老人让坐', non_level=None)
        # -> [{'word': '因该', 'start_index': 4},
        #     {'word': '坐', 'start_index': 10}]
    """
    if confusion:
        pycorrector.set_custom_confusion_dict(CONFUSION_PATH)

    report = pycorrector.detect(text)
    if not report:
        return []

    def _is_wanted(word):
        # Decide whether a detected word survives the ``non_level`` filter.
        if non_level is None:
            return True
        if non_level == "char":
            return len(word) > 1
        if non_level == "word":
            return len(word) == 1
        # Unknown level: keep nothing (matches the historical behaviour).
        return False

    return [
        {"word": item[0], "start_index": item[1]}
        for item in report
        if _is_wanted(item[0])
    ]
Пример #5
0
    '在最近的项目中,我们采用了pycorrector的九错逻辑,如下图所示',
    '针对医学数据训练出来的,基于编辑举例,可自行训练',
    '妹妹走之前还得给他再个新电脑',
    '项目做的比较急,吊唁的package不多,如果有更好的方案,求告知,谢谢啦!',
    '一只小鱼船夫在平静的河面上',
    '贸易战会不会影像中美关系呢',
    '使用预言模型计算句子或序列的合理性',
    '这就是报应,赤-裸-裸的报应啊!',
    '搜索关注【落红小说】微亅信丨公亅众丨号 回复',
    '关注微·信·公·众·号【 南北书院 】回复书号',
    '保险公司:流程出错一毛不陪',
    ' ',
    '这是人工智能的一个分知',
    '我的家乡是有名的玉米之乡',
    '老师工作非常幸苦,我们要遵敬老师',
    'nihao, 耐得住欲妄',
    '一阙词牌名',
    '我兴高彩列地去公园游玩',
    '吹唐人记忆',
    ' 耐得住欲妄',
    '母子平爱',
    '不由的感叹道',
]

# Append the extra sentences in place, then print the detect result and the
# original/corrected pair for every sentence.
error_sentences += text_list
_report = "original sentence:{} => correct sentence:{}".format
for line in error_sentences:
    print(pycorrector.detect(line))
    correct_sent = pycorrector.correct(line)
    print(_report(line, correct_sent))
Пример #6
0
def demo2():
    """Print the detect and correct output for every sentence in ``x``."""
    for sentence in x:
        detected = pycorrector.detect(sentence)
        print(sentence, detected)
        corrected = pycorrector.correct(sentence)
        print(sentence, corrected)
Пример #7
0
def demo1():
    """Print the detect and correct output for every sentence in ``text``."""
    for sentence in text:
        detected = pycorrector.detect(sentence)
        print(sentence, detected)
        corrected = pycorrector.correct(sentence)
        print(sentence, corrected)
Пример #8
0
    '附睾焱的症状?要引起注意!',
    '外阴尖锐涅疣怎样治疗?-济群解析',
    '洛阳大华雅思 30天突破雅思7分',
    '男人不育少靖子症如何治疗?专业男科,烟台京城医院',
    '疝気医院那好 疝気专科百科问答',
    '成都医院治扁平苔鲜贵吗_国家2甲医院',
    '少先队员因该为老人让坐',
    '服装店里的衣服各试各样',
    '一只小鱼船浮在平净的河面上',
    '我的家乡是有明的渔米之乡',
    ' _ ,',
    '我对于宠物出租得事非常认同,因为其实很多人喜欢宠物',  # 出租的事
    '有了宠物出租地方另一方面还可以题高人类对动物的了解,因为那些专业人氏可以指导我们对于动物的习惯。',  # 题高 => 提高 专业人氏 => 专业人士
    '三个凑皮匠胜过一个诸葛亮也有道理。',  # 凑
    '还有广告业是只要桌子前面坐者工作未必产生出来好的成果。',
]
# Benchmark: time three full passes of detect() over error_sentences.
t1 = time.time()
for i in range(3):
    for line in error_sentences:
        idx_errors = detect(line)
t2 = time.time()
print('[detect] spend time: %f s' % (t2 - t1))

# Time three full passes of correct(); t2 doubles as this phase's start, so
# the print above is included in the measured interval.
for i in range(3):
    for line in error_sentences:
        correct_sent = correct(line)
t3 = time.time()
print('[correct] spend time: %f s' % (t3 - t2))
# Timings recorded by the original author (detect first, then correct):
# spend time: 1.497331 s
# spend time: 10.858631 s
Пример #9
0
def test1():
    """Print the detect and correct output for every sentence in ``text``."""
    for sentence in text:
        detected = pycorrector.detect(sentence)
        print(sentence, detected)
        corrected = pycorrector.correct(sentence)
        print(sentence, corrected)
Пример #10
0
    '我对于宠物出租得事非常认同',
    '天地无垠大,我们的舞台无线大',
    '交通先行了怎么过去啊?',
]
# for line in error_sentences:
#     idx_errors = pycorrector.detect(line)
#     print(idx_errors)
#
#     correct_sent = pycorrector.correct(line)
#     print("original sentence:{} => correct sentence:{}".format(line, correct_sent))

# Print a separator, register the custom confusion dictionary and the custom
# word list, then show the detect result and the original/corrected pair for
# every sentence.
print('*' * 53)
pycorrector.set_custom_confusion_dict(path='./my_confusion.txt')
pycorrector.set_custom_word(path='./my_custom_word.txt')
_report = "original sentence:{} => correct sentence:{}".format
for line in error_sentences:
    idx_errors = pycorrector.detect(line)
    print(idx_errors)
    correct_sent = pycorrector.correct(line)
    print(_report(line, correct_sent))

# original sentence:买iPhone差,要多少钱 => correct sentence:('买iPhone此,要多少钱', [['差', '此', 7, 8]])
# original sentence:我想喝小明同学。 => correct sentence:('我像喝晓明同学。', [['我想', '我像', 0, 2], ['喝小明', '喝晓明', 2, 5]])
# original sentence:哪里卖苹果吧?请大叔给我让坐 => correct sentence:('哪里卖苹果吧?请大叔给我让座', [['坐', '座', 13, 14]])
# original sentence:我对于宠物出租得事非常认同 => correct sentence:('我对于宠物出租得是非常认同', [['得事', '得是', 7, 9]])
# original sentence:天地无垠大,我们的舞台无线大 => correct sentence:('天地无限贷,我们的舞台无限大', [['天地无垠', '天地无限', 0, 4], ['大', '贷', 4, 5], ['无线大', '无限大', 11, 14]])
# original sentence:交通先行了怎么过去啊? => correct sentence:('交通先行了怎么过去啊?', [])
# *****************************************************
# original sentence:买iPhone差,要多少钱 => correct sentence:('买iphoneX,要多少钱', [['iphone差', 'iphoneX', 1, 8]])
# original sentence:我想喝小明同学。 => correct sentence:('我想喝小茗同学。', [['小明同学', '小茗同学', 3, 7]])
# original sentence:哪里卖苹果吧?请大叔给我让坐 => correct sentence:('哪里卖苹果八?请大叔给我让座', [['苹果吧', '苹果八', 3, 6], ['坐', '座', 13, 14]])
Пример #11
0
def test_sent2():
    """Print each sentence in ``error_sentences1`` with its detect and correct output."""
    for sentence in error_sentences1:
        print(sentence)
        detected = pycorrector.detect(sentence)
        print(detected)
        corrected = pycorrector.correct(sentence)
        print(corrected)
Пример #12
0
# -*- coding: utf-8 -*-
# Author: XuMing <*****@*****.**>
# Brief:
import pycorrector
from pycorrector.tokenizer import segment

# Look up same-pinyin and same-stroke candidates for one character.
c = pycorrector.get_same_pinyin('长')
print('get_same_pinyin:', c)

c = pycorrector.get_same_stroke('长')
print('get_same_stroke:', c)

# Frequency lookup for a four-character word.
freq = pycorrector.word_frequency('龟龙麟凤')
print('freq:', freq)

# Segment each sentence and print its ppl_score; the second sentence is the
# one reused for detection and correction below.
for sent in ('少先队员应该为老人让座', '少先队员因该为老人让坐'):
    sent_seg = segment(sent)
    ppl = pycorrector.ppl_score(sent_seg)
    print('ppl_score:', ppl)

print(pycorrector.detect(sent))

# `sent` still holds the second sentence here.
corrected_sent, detail = pycorrector.correct(sent)
print(corrected_sent, detail)
Пример #13
0
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: 
"""

import sys
# Add the parent directory to the import path (presumably so a local
# pycorrector checkout is picked up -- confirm against the repo layout).
sys.path.append("../")

import tracemalloc
# Start tracing allocations, storing up to 10 frames per traceback.
tracemalloc.start(10)
# Baseline snapshot, deliberately taken BEFORE pycorrector is imported so the
# import itself is part of the measured difference.
time1 = tracemalloc.take_snapshot()

import pycorrector

idx_errors = pycorrector.detect('少先队员因该为老人让坐')
print(idx_errors)


# Second snapshot after import + one detect() call; diff it against the
# baseline grouped by source line and print the first three entries.
time2 = tracemalloc.take_snapshot()
stats = time2.compare_to(time1, 'lineno')
print('*'*32)
for stat in stats[:3]:
    print(stat)

# Same comparison grouped by full traceback; print the formatted traceback of
# the first three entries.
stats = time2.compare_to(time1, 'traceback')
print('*'*32)
for stat in stats[:3]:
    print(stat.traceback.format())
Пример #14
0
def demo():
    """Print what ``pycorrector.detect`` returns for one sample sentence."""
    print(pycorrector.detect('少先队员因该为老人让坐'))
def test2():
    """Print the detect and correct output for every sentence in ``x``."""
    for sentence in x:
        detected = pycorrector.detect(sentence)
        print(sentence, detected)
        corrected = pycorrector.correct(sentence)
        print(sentence, corrected)
Пример #16
0
    ngrams, total = km.read_ngrams()
    ngrams_2 = km.filter_ngrams(ngrams, total, min_pmi=[0, 1, 3, 5])

    # 新词发现
    km.word_discovery(ngrams_2)
    '''
        智能纠错
    '''
    # 加载模型
    km.model = km.load_model(km.klm_file)

    # n-grams读入
    ngrams, total = km.read_ngrams()
    ngrams_2 = km.filter_ngrams(ngrams, total, min_pmi=[0, 1, 3, 5])

    # 纠错
    import pycorrector
    from pypinyin import lazy_pinyin, Style
    sentence = '这瓶洗棉奶用着狠不错'
    idx_errors = pycorrector.detect(sentence)

    correct = []
    for ide in idx_errors:
        right_word = km.find_best_word(ide[0], ngrams_2, freqs=0)
        if right_word != ide[0]:
            correct.append([right_word] + ide)

    print('错误:', idx_errors)
    print('pycorrector的结果:', pycorrector.correct(sentence))
    print('kenlm的结果:', correct)