def evaluateJamspell(modelFile, testText, alphabetFile, maxWords=50000):
    """Evaluate a Jamspell model on a text file with synthetic typos.

    The alphabet is loaded through ``utils.loadAlphabet``, a
    ``JamspellCorrector`` is built from ``modelFile``, and the random
    generator is seeded with 42 before typos are generated so that runs are
    repeatable.

    Args:
        modelFile: passed to ``JamspellCorrector``.
        testText: passed to ``loadText`` to obtain the reference text.
        alphabetFile: passed to ``utils.loadAlphabet``.
        maxWords: forwarded unchanged to ``evaluateCorrector``.

    Returns:
        The tuple ``(errorsRate, fixRate, broken, topNerr, topNfix)``; the
        sixth value produced by ``evaluateCorrector`` (execution time) is
        dropped.
    """
    utils.loadAlphabet(alphabetFile)
    jamspell = JamspellCorrector(modelFile)

    # Fixed seed: the typo generation below must be reproducible.
    random.seed(42)
    cleanText = loadText(testText)
    noisyText = generateTypos(cleanText)
    # Sanity check: typo generation must not change the text length.
    assert len(cleanText) == len(noisyText)

    cleanSentences = generateSentences(cleanText)
    noisySentences = generateSentences(noisyText)

    # Exactly six values are expected; the timing is intentionally unused.
    errorsRate, fixRate, broken, topNerr, topNfix, _elapsed = evaluateCorrector(
        'jamspell', jamspell, cleanSentences, noisySentences, maxWords)
    return errorsRate, fixRate, broken, topNerr, topNfix
def train(self, trainFile):
    """Accumulate 1-, 2- and 3-gram counts from a training text file.

    The file is read with ``loadText``, split with ``generateSentences`` and
    mapped through ``self.convertToIDs``. Every element of every sentence
    increments ``self.gram1`` and ``self.totalWords``; every adjacent pair
    and triple increments ``self.gram2`` / ``self.gram3`` (keyed by tuples).
    Progress is printed at most once every 4 seconds.

    Args:
        trainFile: passed to ``loadText`` to obtain the training text.
    """
    # Single-argument print(...) calls produce the same output on Python 2
    # and Python 3, unlike the print statement.
    print('[info] loading text')
    text = loadText(trainFile)
    sentences = generateSentences(text)
    sentences = self.convertToIDs(sentences)

    total = len(sentences)
    print('[info] generating N-grams %d' % total)
    lastTime = time.time()
    for i, sentence in enumerate(sentences):
        length = len(sentence)
        for w in sentence:
            self.gram1[w] += 1
            self.totalWords += 1
        for j in range(length - 1):
            self.gram2[(sentence[j], sentence[j + 1])] += 1
        for j in range(length - 2):
            self.gram3[(sentence[j], sentence[j + 1], sentence[j + 2])] += 1
        # Throttle progress output to one line every 4 seconds.
        if time.time() - lastTime >= 4.0:
            lastTime = time.time()
            print('[info] processed %.2f%%' % (100.0 * i / total))

    print('[info] finished training')
def main():
    """Command-line entry point: compare spelling correctors on one text.

    Parses the command line, builds every corrector whose model path was
    given (a ``DummyCorrector`` is always included), then either runs
    ``testMode`` on the last corrector built (``--test``) or generates typos
    for the input text and prints one row of metrics per corrector.

    Returns:
        The result of ``testMode(corrector)`` when ``--test`` is given,
        otherwise ``None``.
    """
    parser = argparse.ArgumentParser(
        description='spelling correctors evaluation')
    parser.add_argument('file', type=str,
                        help='text file to use for evaluation')
    parser.add_argument('-hs', '--hunspell', type=str,
                        help='path to hunspell model')
    parser.add_argument('-ns', '--norvig', type=str,
                        help='path to train file for Norvig spell corrector')
    parser.add_argument('-cs', '--context', type=str,
                        help='path to context spell model')
    parser.add_argument('-csp', '--context_prototype', type=str,
                        help='path to context spell prototype model')
    parser.add_argument('-jsp', '--jamspell', type=str,
                        help='path to jamspell model file')
    parser.add_argument('-t', '--test', action="store_true")
    parser.add_argument('-mx', '--max_words', type=int,
                        help='max words to evaluate')
    parser.add_argument('-a', '--alphabet', type=str, help='alphabet file')
    args = parser.parse_args()

    if args.alphabet:
        utils.loadAlphabet(args.alphabet)

    correctors = {
        'dummy': DummyCorrector(),
    }
    # Default used by --test when no model option is supplied; without it
    # `corrector` would be unbound and `testMode(corrector)` would raise
    # UnboundLocalError.
    corrector = correctors['dummy']

    maxWords = args.max_words

    print('[info] loading models')

    # Each requested model is registered for evaluation; the last one built
    # is also the corrector handed to testMode.
    if args.hunspell:
        corrector = correctors['hunspell'] = HunspellCorrector(args.hunspell)

    if args.norvig:
        corrector = correctors['norvig'] = NorvigCorrector(args.norvig)

    if args.context:
        corrector = correctors['context'] = ContextCorrector(args.context)

    if args.context_prototype:
        corrector = correctors['prototype'] = ContextPrototypeCorrector(
            args.context_prototype)

    if args.jamspell:
        corrector = correctors['jamspell'] = JamspellCorrector(args.jamspell)

    if args.test:
        return testMode(corrector)

    # Fixed seed so the generated typos are the same on every run.
    random.seed(42)
    print('[info] loading text')
    originalText = loadText(args.file)

    print('[info] generating typos')
    # Per the original author's note: randomly alters the original words and
    # returns them as a list of single words.
    erroredText = generateTypos(originalText)
    # Sanity check: typo generation must keep the text length unchanged.
    assert len(originalText) == len(erroredText)

    # Per the original author's note: splits the text into sentences,
    # dropping invalid symbols; the sentences do not include the period.
    originalSentences = generateSentences(originalText)
    erroredSentences = generateSentences(erroredText)

    assert len(originalSentences) == len(erroredSentences)

    print('[info] total words: %d' % len(originalText))
    print('[info] evaluating')

    results = {}

    for correctorName, corrector in correctors.items():
        errorsRate, fixRate, broken, topNerr, topNfix, execTime = \
            evaluateCorrector(correctorName, corrector,
                              originalSentences, erroredSentences, maxWords)
        results[correctorName] = (errorsRate, fixRate, broken,
                                  topNerr, topNfix, execTime)

    print('')
    print('[info] %12s %8s %8s %8s %8s %8s %8s' %
          ('', 'errRate', 'fixRate', 'broken', 'topNerr', 'topNfix', 'time'))
    # Rows are printed in ascending order of the whole metrics tuple, i.e.
    # primarily by errorsRate. The sort key receives each (name, metrics)
    # item, so x[1] is the metrics tuple.
    for name, metrics in sorted(results.items(), key=lambda x: x[1]):
        print('[info] %10s  %8.2f%% %8.2f%% %8.2f%% %8.2f%% %8.2f%% %8.2fs' %
              (name, 100.0 * metrics[0], 100.0 * metrics[1],
               100.0 * metrics[2], 100.0 * metrics[3], 100.0 * metrics[4],
               metrics[5]))