-
Notifications
You must be signed in to change notification settings - Fork 0
/
SpellCorrect.py
129 lines (111 loc) · 4.63 KB
/
SpellCorrect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
##
# Credit Zhou Yu, Dan Jurafsky, Peter Norvig
# Updated Kevin Jesse
# Open source code under MIT license
##
import math
from Datum import Datum
from Sentence import Sentence
from Corpus import Corpus
from UniformModel import UniformModel
from UnigramModel import UnigramModel
from BackoffModel import BackoffModel
from SmoothUnigramModel import SmoothUnigramModel
from SmoothBigramModel import SmoothBigramModel
from CustomModel import CustomModel
from EditModel import EditModel
from SpellingResult import SpellingResult
import types
import re, collections
class SpellCorrect:
    """Spelling corrector for sentences. Holds the edit model and the language model."""

    def __init__(self, lm, corpus):
        """Store the language model and build the edit model.

        lm     -- language model; must provide score(sentence).
        corpus -- corpus passed to EditModel together with the edit-count file.
        """
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Return the highest-scoring single-word correction of sentence.

        Assumes exactly one error per sentence. sentence is a list of words
        whose first and last entries are the <s> and </s> markers; those two
        positions are never edited. Every candidate offered by the edit model
        for every inner word is scored as
        languageModel.score(trial sentence) + edit score, and the best trial
        sentence is returned. On ties the earliest candidate wins. The input
        list is not modified.

        Returns [] for an empty sentence, and a copy of sentence when no
        candidate is produced.
        """
        if len(sentence) == 0:
            return []
        bestSentence = sentence[:]  # copy of sentence
        bestScore = float('-inf')
        # range (not xrange) keeps this loop valid on Python 2 and Python 3.
        for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
            prefix = sentence[:i]
            suffix = sentence[i + 1:]
            # editProbabilities yields (correction, score) entries for the word.
            for edit in self.editModel.editProbabilities(sentence[i]):
                correction = edit[0]
                editScore = edit[1]
                # Build the trial sentence once and reuse it for scoring
                # and as the result.
                trialSentence = prefix + [correction] + suffix
                trialScore = self.languageModel.score(trialSentence) + editScore
                if trialScore > bestScore:
                    bestScore = trialScore
                    bestSentence = trialSentence
        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped and count toward neither total.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        The result is a JSON array holding one array of words per sentence in
        corpus.corpus, written without spaces after separators. Words are
        escaped by the json module, so quotes and backslashes inside a word
        still produce valid JSON, and an empty sentence is emitted as [].
        Non-ASCII characters are written as \\uXXXX escapes (json.dumps
        default).
        """
        import json  # local import: the module's import block does not provide json

        correctedSentences = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        return json.dumps(correctedSentences, separators=(',', ':'))
def main():
    """Trains all of the language models and tests them on the dev data.

    Each model is built from the training corpus, wrapped in a SpellCorrect,
    evaluated on the dev corpus, and its label and result are printed.
    Change devPath if you wish to do things like test on the training data.
    """
    trainPath = 'data/tagged-train.dat'
    trainingCorpus = Corpus(trainPath)
    devPath = 'data/tagged-dev.dat'
    devCorpus = Corpus(devPath)

    # (label, model class) pairs, listed in the order they are reported.
    models = [
        ('Unigram Language Model: ', UnigramModel),
        ('Uniform Language Model: ', UniformModel),
        ('Smooth Unigram Language Model: ', SmoothUnigramModel),
        ('Smooth Bigram Language Model: ', SmoothBigramModel),
        ('Backoff Language Model: ', BackoffModel),
        ('Custom Language Model: ', CustomModel),
    ]
    for label, modelClass in models:
        # Single-argument print(...) prints the same text on Python 2 and 3.
        print(label)
        languageModel = modelClass(trainingCorpus)
        speller = SpellCorrect(languageModel, trainingCorpus)
        outcome = speller.evaluate(devCorpus)
        print(str(outcome))
# Run the evaluation only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()