def test_char_correct_wrong():
    """Print pycorrector's detect and correct output for sample sentences.

    For every sample the sentence is printed twice: once next to the result
    of ``pycorrector.detect`` and once next to the result of
    ``pycorrector.correct``.
    """
    samples = [
        '她知难而上,沤心沥血,一心扑在舞台上',
        '还有你们看看清除哈',
        '我国人民义愤填鹰',
        '权利的游戏第八季',
        '2周岁22斤宝宝用多大的啊',
        '这个到底有多辣?',
        '所以先救挨饿的人,然后治疗病人。',
        '现在,常常会到听男女平等这个词。',
        '我的喉咙发炎了要买点阿莫细林吃',
        '做的最倒霉的一件事就帮尼哥檫脚。',
        '战士微笑著轻轻拍了拍少年的肩膀。',
        '差点拌到自己的脚。',
        '面对着熙熙嚷嚷的城市。',
        '你等我和老大商却一下。',
        '报应接中迩来。',
        '我心理不由有些忌妒。',
        '他们不需要怕他门没有钱。',
        '全球的产龄妇女总生育率只生下一半,根据调查很有可能一直到2050年产龄妇女总生育率还是减少的趋势。',
        '但现代的妇女所担任的责任已家重,除了家务以外,仍需出外工作补贴家',
        '加上父母亲自己的看法,想原封不动地、完完全全地全部传给子女们',
        '叶子的绿色与本身枝干的颜色都会变为偏较暗的颜色。',
    ]
    for sentence in samples:
        detected = pycorrector.detect(sentence)
        print(sentence, detected)
        corrected = pycorrector.correct(sentence)
        print(sentence, corrected)
def correct_sentence(_sentence):
    """Correct ``_sentence`` and print the outcome plus the detected errors.

    ``pycorrector.correct`` is expected to return a two-item result
    (corrected sentence, detail); both items are printed, followed by the
    result of ``pycorrector.detect`` on the same input.
    """
    outcome = pycorrector.correct(_sentence)
    fixed_text, changes = outcome
    print(">>> corrected_sent:", fixed_text)
    print(">>> detail:", changes)

    error_positions = pycorrector.detect(_sentence)
    print(">>> index of errors:", error_positions)
def test_chengyu():
    """Exercise idiom (chengyu) correction, first with
    ``enable_char_error(enable=False)`` and then with ``enable=True``."""

    def _dump(sentences):
        # Show the detect result, then the correct result, for each sentence.
        for sentence in sentences:
            print(sentence, pycorrector.detect(sentence))
            print(sentence, pycorrector.correct(sentence))

    pycorrector.enable_char_error(enable=False)

    # Intended idioms for the two warm-up sentences: 代代相传, 二话不说.
    for error_sentence_1 in ('这块名表带带相传', '他贰话不说把牛奶喝完了'):
        correct_sent = pycorrector.correct(error_sentence_1)
        print("original sentence:{} => correct sentence:{}".format(
            error_sentence_1, correct_sent))

    # Intended fixes for the idioms below (wrong form -> right form):
    #   格尽职守 -> 恪尽职守, 接中迩来 -> 接踵而来, 穿流不息 -> 川流不息,
    #   不径而走 -> 不胫而走, 美仑美幻 -> 美轮美幻,
    #   谈笑风声 -> 谈笑风生 (and 心理 -> 心里), 旁证博引 -> 旁征博引.
    x = [
        '这家伙还蛮格尽职守的',
        '报应接中迩来',
        '人群穿流不息',
        '这个消息不径而走',
        '这个消息不胫儿走',
        '眼前的场景美仑美幻简直超出了人类的想象',
        '看着这两个人谈笑风声我心理不由有些忌妒',
        '有了这一番旁证博引',
        '有了这一番旁针博引',
    ]

    _dump(x)

    # Second pass over the same sentences after enable_char_error(enable=True).
    pycorrector.enable_char_error(enable=True)
    print("-" * 42)
    _dump(x)
def text_corrector(text, confusion: bool = True, non_level="char"):
    """Detect suspicious words in ``text`` and report where they start.

    The text is passed to ``pycorrector.detect``. The first element of each
    reported item is treated as the flagged word and the second element as
    its start index. ``non_level`` selects which granularity is *excluded*
    from the result.

    Parameters:
        text: the text to check.
        confusion: when true, ``CONFUSION_PATH`` is registered through
            ``pycorrector.set_custom_confusion_dict`` before detection.
        non_level: granularity to drop from the result.
            ``"char"`` drops single-character hits (keeps words longer
            than one character); ``"word"`` drops multi-character hits
            (keeps only single characters); ``None`` keeps everything.
            Any other value yields an empty result.

    Returns:
        A list of ``{"word": ..., "start_index": ...}`` dicts in report
        order; an empty list when nothing is reported or nothing passes
        the filter.

    Example (illustrative; actual hits depend on pycorrector's models):
        text_corrector('少先队员因该为老人让坐')
        -> [{'word': '因该', 'start_index': 4}]
    """
    if confusion:
        pycorrector.set_custom_confusion_dict(CONFUSION_PATH)

    def _is_wanted(item):
        # Decide from the flagged word's length whether this hit survives
        # the requested ``non_level`` filter.
        if non_level is None:
            return True
        if non_level == "char":
            return len(item[0]) > 1
        if non_level == "word":
            return len(item[0]) == 1
        return False

    report = pycorrector.detect(text)
    return [
        {"word": item[0], "start_index": item[1]}
        for item in report
        if _is_wanted(item)
    ]
'在最近的项目中,我们采用了pycorrector的九错逻辑,如下图所示', '针对医学数据训练出来的,基于编辑举例,可自行训练', '妹妹走之前还得给他再个新电脑', '项目做的比较急,吊唁的package不多,如果有更好的方案,求告知,谢谢啦!', '一只小鱼船夫在平静的河面上', '贸易战会不会影像中美关系呢', '使用预言模型计算句子或序列的合理性', '这就是报应,赤-裸-裸的报应啊!', '搜索关注【落红小说】微亅信丨公亅众丨号 回复', '关注微·信·公·众·号【 南北书院 】回复书号', '保险公司:流程出错一毛不陪', ' ', '这是人工智能的一个分知', '我的家乡是有名的玉米之乡', '老师工作非常幸苦,我们要遵敬老师', 'nihao, 耐得住欲妄', '一阙词牌名', '我兴高彩列地去公园游玩', '吹唐人记忆', ' 耐得住欲妄', '母子平爱', '不由的感叹道', ] error_sentences.extend(text_list) for line in error_sentences: print(pycorrector.detect(line)) correct_sent = pycorrector.correct(line) print("original sentence:{} => correct sentence:{}".format( line, correct_sent))
def demo2():
    """Print detect and correct results for each sentence in ``x``.

    ``x`` is a free variable resolved outside this function.
    """
    for sentence in x:
        detected = pycorrector.detect(sentence)
        print(sentence, detected)
        corrected = pycorrector.correct(sentence)
        print(sentence, corrected)
def demo1():
    """Print detect and correct results for each sentence in ``text``.

    ``text`` is a free variable resolved outside this function.
    """
    for sentence in text:
        detected = pycorrector.detect(sentence)
        print(sentence, detected)
        corrected = pycorrector.correct(sentence)
        print(sentence, corrected)
'附睾焱的症状?要引起注意!', '外阴尖锐涅疣怎样治疗?-济群解析', '洛阳大华雅思 30天突破雅思7分', '男人不育少靖子症如何治疗?专业男科,烟台京城医院', '疝気医院那好 疝気专科百科问答', '成都医院治扁平苔鲜贵吗_国家2甲医院', '少先队员因该为老人让坐', '服装店里的衣服各试各样', '一只小鱼船浮在平净的河面上', '我的家乡是有明的渔米之乡', ' _ ,', '我对于宠物出租得事非常认同,因为其实很多人喜欢宠物', # 出租的事 '有了宠物出租地方另一方面还可以题高人类对动物的了解,因为那些专业人氏可以指导我们对于动物的习惯。', # 题高 => 提高 专业人氏 => 专业人士 '三个凑皮匠胜过一个诸葛亮也有道理。', # 凑 '还有广告业是只要桌子前面坐者工作未必产生出来好的成果。', ] t1 = time.time() for i in range(3): for line in error_sentences: idx_errors = detect(line) t2 = time.time() print('[detect] spend time: %f s' % (t2 - t1)) for i in range(3): for line in error_sentences: correct_sent = correct(line) t3 = time.time() print('[correct] spend time: %f s' % (t3 - t2)) # spend time: 1.497331 s # spend time: 10.858631 s
def test1():
    """Print detect and correct results for each sentence in ``text``.

    ``text`` is a free variable resolved outside this function.
    """
    for sentence in text:
        detected = pycorrector.detect(sentence)
        print(sentence, detected)
        corrected = pycorrector.correct(sentence)
        print(sentence, corrected)
'我对于宠物出租得事非常认同', '天地无垠大,我们的舞台无线大', '交通先行了怎么过去啊?', ] # for line in error_sentences: # idx_errors = pycorrector.detect(line) # print(idx_errors) # # correct_sent = pycorrector.correct(line) # print("original sentence:{} => correct sentence:{}".format(line, correct_sent)) print('*' * 53) pycorrector.set_custom_confusion_dict(path='./my_confusion.txt') pycorrector.set_custom_word(path='./my_custom_word.txt') for line in error_sentences: idx_errors = pycorrector.detect(line) print(idx_errors) correct_sent = pycorrector.correct(line) print("original sentence:{} => correct sentence:{}".format( line, correct_sent)) # original sentence:买iPhone差,要多少钱 => correct sentence:('买iPhone此,要多少钱', [['差', '此', 7, 8]]) # original sentence:我想喝小明同学。 => correct sentence:('我像喝晓明同学。', [['我想', '我像', 0, 2], ['喝小明', '喝晓明', 2, 5]]) # original sentence:哪里卖苹果吧?请大叔给我让坐 => correct sentence:('哪里卖苹果吧?请大叔给我让座', [['坐', '座', 13, 14]]) # original sentence:我对于宠物出租得事非常认同 => correct sentence:('我对于宠物出租得是非常认同', [['得事', '得是', 7, 9]]) # original sentence:天地无垠大,我们的舞台无线大 => correct sentence:('天地无限贷,我们的舞台无限大', [['天地无垠', '天地无限', 0, 4], ['大', '贷', 4, 5], ['无线大', '无限大', 11, 14]]) # original sentence:交通先行了怎么过去啊? => correct sentence:('交通先行了怎么过去啊?', []) # ***************************************************** # original sentence:买iPhone差,要多少钱 => correct sentence:('买iphoneX,要多少钱', [['iphone差', 'iphoneX', 1, 8]]) # original sentence:我想喝小明同学。 => correct sentence:('我想喝小茗同学。', [['小明同学', '小茗同学', 3, 7]]) # original sentence:哪里卖苹果吧?请大叔给我让坐 => correct sentence:('哪里卖苹果八?请大叔给我让座', [['苹果吧', '苹果八', 3, 6], ['坐', '座', 13, 14]])
def test_sent2():
    """Print each sentence of ``error_sentences1`` with its detect and
    correct results, one value per line.

    ``error_sentences1`` is a free variable resolved outside this function.
    """
    for sentence in error_sentences1:
        print(sentence)
        detected = pycorrector.detect(sentence)
        print(detected)
        corrected = pycorrector.correct(sentence)
        print(corrected)
# -*- coding: utf-8 -*-
# Author: XuMing <*****@*****.**>
# Brief: Walk through several pycorrector helpers on sample input.
import pycorrector
from pycorrector.tokenizer import segment

# Characters reported for the same pinyin / same stroke lookups of '长'.
c = pycorrector.get_same_pinyin('长')
print('get_same_pinyin:', c)
c = pycorrector.get_same_stroke('长')
print('get_same_stroke:', c)

# Word-frequency lookup.
freq = pycorrector.word_frequency('龟龙麟凤')
print('freq:', freq)

# ppl_score of the segmented sentence, first variant.
sent = '少先队员应该为老人让座'
ppl = pycorrector.ppl_score(segment(sent))
print('ppl_score:', ppl)

# ppl_score of the segmented sentence, second variant.
sent = '少先队员因该为老人让坐'
ppl = pycorrector.ppl_score(segment(sent))
print('ppl_score:', ppl)

# Detection and correction on the second variant.
print(pycorrector.detect(sent))
corrected_sent, detail = pycorrector.correct(sent)
print(corrected_sent, detail)
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: Report the largest memory-allocation differences between a
snapshot taken before importing pycorrector and one taken after a detect call.
"""
import sys

sys.path.append("../")

import tracemalloc

# Start tracing (10 frames per traceback) and take the baseline snapshot
# BEFORE pycorrector is imported, so the import itself is measured.
tracemalloc.start(10)
snapshot_before = tracemalloc.take_snapshot()

import pycorrector

idx_errors = pycorrector.detect('少先队员因该为老人让坐')
print(idx_errors)

snapshot_after = tracemalloc.take_snapshot()

# First pass prints each statistic as-is (grouped by line number); second
# pass prints the formatted traceback of each statistic.
for key_type, render in (
    ('lineno', lambda entry: entry),
    ('traceback', lambda entry: entry.traceback.format()),
):
    stats = snapshot_after.compare_to(snapshot_before, key_type)
    print('*' * 32)
    for stat in stats[:3]:
        print(render(stat))
def demo():
    """Run ``pycorrector.detect`` on one fixed sentence and print the result."""
    sample = '少先队员因该为老人让坐'
    print(pycorrector.detect(sample))
def test2():
    """Print detect and correct results for each sentence in ``x``.

    ``x`` is a free variable resolved outside this function.
    """
    for sentence in x:
        detected = pycorrector.detect(sentence)
        print(sentence, detected)
        corrected = pycorrector.correct(sentence)
        print(sentence, corrected)
# Read the n-grams and filter them with the given per-order min_pmi values.
# NOTE(review): `km` is created outside this excerpt — confirm its type there.
ngrams, total = km.read_ngrams()
ngrams_2 = km.filter_ngrams(ngrams, total, min_pmi=[0, 1, 3, 5])
# New-word discovery
km.word_discovery(ngrams_2)

# The bare string below is a section marker ("intelligent error correction");
# it is an expression statement with no runtime effect.
''' 智能纠错 '''
# Load the model
km.model = km.load_model(km.klm_file)
# Read in the n-grams
ngrams, total = km.read_ngrams()
ngrams_2 = km.filter_ngrams(ngrams, total, min_pmi=[0, 1, 3, 5])
# Error correction
import pycorrector
from pypinyin import lazy_pinyin, Style
sentence = '这瓶洗棉奶用着狠不错'
idx_errors = pycorrector.detect(sentence)
# Collect [replacement] + original detect item for every hit whose best
# word, as chosen by km.find_best_word, differs from the flagged text.
correct = []
for ide in idx_errors:
    # ide[0] is used as the flagged text; `[right_word] + ide` below
    # requires each detect item to be a list.
    right_word = km.find_best_word(ide[0], ngrams_2, freqs=0)
    if right_word != ide[0]:
        correct.append([right_word] + ide)
# Print the raw detections, pycorrector's own correction, and the
# replacements collected above, for side-by-side comparison.
print('错误:', idx_errors)
print('pycorrector的结果:', pycorrector.correct(sentence))
print('kenlm的结果:', correct)