class SpellCorrect:
    """Noisy-channel spelling corrector; holds a language model and an edit model."""

    def __init__(self, lm, corpus):
        """Store the language model and build the edit model.

        Args:
            lm: object providing ``score(sentence)``.
            corpus: handed to ``EditModel`` together with the edit-count file path.
        """
        self.languageModel = lm
        self.editModel = EditModel('../data/count_1edit.txt', corpus)

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped and are not counted in the total.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctSentence(self, sentence):
        """Takes a list of words, returns a corrected list of words.

        Every interior word (the first and last tokens are skipped as
        start/end markers) is tried against each alternative offered by the
        edit model; the single replacement maximising
        ``languageModel.score(trial) + log(editProbability)`` is applied.

        Args:
            sentence: list of words; it is never modified.

        Returns:
            A new list. It equals ``sentence`` when no alternative with a
            non-zero edit probability exists, and ``[]`` for empty input.
        """
        if len(sentence) == 0:
            return []

        bestIndex = None
        bestWord = None
        bestScore = float('-inf')
        trial = list(sentence)  # scratch copy so the caller's list stays untouched

        # skip start and end tokens
        for i in range(1, len(sentence) - 1):
            word = sentence[i]
            # assumes editProbabilities returns a dict of word -> probability
            for alternative, editProb in self.editModel.editProbabilities(word).items():
                # The unchanged word is not a candidate, and a zero-probability
                # edit is impossible, so it must never be selected.
                if alternative == word or editProb == 0:
                    continue
                trial[i] = alternative
                score = self.languageModel.score(trial) + math.log(editProb)
                if score >= bestScore:  # ties go to the later candidate
                    bestScore = score
                    bestIndex = i
                    bestWord = alternative
            trial[i] = word  # restore before moving to the next position

        corrected = list(sentence)
        if bestIndex is not None:
            corrected[bestIndex] = bestWord
        return corrected

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        Args:
            corpus: object whose ``corpus`` attribute is an iterable of
                sentences, each providing ``getErrorSentence()``.

        Returns:
            A compact JSON string: one array of words per sentence, wrapped
            in an outer array.
        """
        import json  # imported here so the method does not rely on a module-level import

        corrected = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        # json.dumps escapes quotes and backslashes inside words and renders an
        # empty sentence as []; compact separators keep the no-whitespace layout.
        return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)
class SpellCorrect:
    """Spelling corrector for sentences.

    Holds edit model, language model and the corpus.
    """

    def __init__(self, lm, corpus):
        """Store the language model and build the edit model.

        Args:
            lm: object providing ``score(sentence)``.
            corpus: handed to ``EditModel`` together with the edit-count file path.
        """
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable
        corrected sentence. Sentence is a list of words.

        Each interior position (the first and last tokens are ignored as
        ``<s>`` / ``</s>``) is tried with every edit proposed by the edit
        model; a candidate's score is the language-model score of the
        resulting sentence plus the edit's own score.

        Returns:
            A new list; a copy of ``sentence`` when no edit is proposed, and
            ``[]`` for empty input.
        """
        if len(sentence) == 0:
            return []

        bestSentence = sentence[:]  # fallback when the edit model proposes nothing
        bestScore = float('-inf')
        for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
            # editProbabilities yields (word, log-probability) pairs
            for candidate, editScore in self.editModel.editProbabilities(sentence[i]):
                trial = sentence[:i] + [candidate] + sentence[i + 1:]
                total = self.languageModel.score(trial) + editScore
                if total > bestScore:  # strict: the first of equal candidates wins
                    bestSentence = trial
                    bestScore = total
        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped and are not counted in the total.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        Args:
            corpus: object whose ``corpus`` attribute is an iterable of
                sentences, each providing ``getErrorSentence()``.

        Returns:
            A compact JSON string: one array of words per sentence, wrapped
            in an outer array.
        """
        import json  # imported here so the method does not rely on a module-level import

        corrected = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        # json.dumps escapes quotes and backslashes inside words and renders an
        # empty sentence as []; compact separators keep the no-whitespace layout.
        return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)
class SpellCorrect:
    """Noisy-channel spelling corrector; holds a language model and an edit model."""

    def __init__(self, lm, corpus):
        """Store the language model and build the edit model.

        Args:
            lm: object providing ``score(sentence)``.
            corpus: handed to ``EditModel`` together with the edit-count file path.
        """
        self.languageModel = lm
        self.editModel = EditModel('../data/count_1edit.txt', corpus)

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped and are not counted in the total.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctSentence(self, sentence):
        """Takes a list of words, returns a corrected list of words.

        Every interior word (the first and last tokens are skipped as
        start/end markers) is tried against each alternative offered by the
        edit model; the single replacement maximising
        ``languageModel.score(trial) + log(editProbability)`` is applied.

        Args:
            sentence: list of words; it is never modified.

        Returns:
            A new list. It equals ``sentence`` when no alternative with a
            non-zero edit probability exists, and ``[]`` for empty input.
        """
        if len(sentence) == 0:
            return []

        winner = None  # (position, replacement) of the best candidate so far
        maxscore = float('-inf')
        scratch = list(sentence)  # scratch copy so the caller's list stays untouched

        # skip start and end tokens
        for i in range(1, len(sentence) - 1):
            word = sentence[i]
            # assumes editProbabilities returns a dict of word -> probability
            editProbs = self.editModel.editProbabilities(word)
            for alternative, editProb in editProbs.items():
                if alternative == word:
                    continue  # the unchanged word is not a candidate
                if editProb == 0:
                    continue  # an impossible edit must never be selected
                scratch[i] = alternative
                score = self.languageModel.score(scratch) + math.log(editProb)
                if score >= maxscore:  # ties go to the later candidate
                    maxscore = score
                    winner = (i, alternative)
            scratch[i] = word  # restore before moving to the next position

        argmax = list(sentence)
        if winner is not None:
            position, replacement = winner
            argmax[position] = replacement
        return argmax

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        Args:
            corpus: object whose ``corpus`` attribute is an iterable of
                sentences, each providing ``getErrorSentence()``.

        Returns:
            A compact JSON string: one array of words per sentence, wrapped
            in an outer array.
        """
        import json  # imported here so the method does not rely on a module-level import

        corrected = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        # json.dumps escapes quotes and backslashes inside words and renders an
        # empty sentence as []; compact separators keep the no-whitespace layout.
        return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)
class SpellCorrect:
    """Spelling corrector for sentences.

    Holds edit model, language model and the corpus.
    """

    def __init__(self, lm, corpus):
        """Store the language model and build the edit model.

        Args:
            lm: object providing ``score(sentence)``.
            corpus: handed to ``EditModel`` together with the edit-count file path.
        """
        self.languageModel = lm
        self.editModel = EditModel('../data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable
        corrected sentence. Sentence is a list of words.

        Each interior position (the first and last tokens are ignored as
        ``<s>`` / ``</s>``) is tried with every edit proposed by the edit
        model; a candidate's score is the language-model score (prior) of the
        resulting sentence plus the edit's score (channel model).

        Returns:
            A new list; a copy of ``sentence`` when no edit is proposed, and
            ``[]`` for empty input.
        """
        if len(sentence) == 0:
            return []

        # Start from the input itself so a sentence with no candidate edits
        # (e.g. only <s> and </s>) still has a defined result.
        bestSentence = sentence[:]
        bestScore = float('-inf')
        trySentence = sentence[:]  # scratch copy, mutated one position at a time

        for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
            # editProbabilities yields (word, log-probability) pairs
            for w, p in self.editModel.editProbabilities(sentence[i]):
                trySentence[i] = w
                # Score once per candidate; p is the channel model term.
                score = self.languageModel.score(trySentence) + p
                if score > bestScore:  # strict: the first of equal candidates wins
                    bestScore = score
                    bestSentence = trySentence[:]
            trySentence[i] = sentence[i]  # restore before the next position
        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped and are not counted in the total.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        Args:
            corpus: object whose ``corpus`` attribute is an iterable of
                sentences, each providing ``getErrorSentence()``.

        Returns:
            A compact JSON string: one array of words per sentence, wrapped
            in an outer array.
        """
        import json  # imported here so the method does not rely on a module-level import

        corrected = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        # json.dumps escapes quotes and backslashes inside words and renders an
        # empty sentence as []; compact separators keep the no-whitespace layout.
        return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)
class SpellCorrect:
    """Spelling corrector for sentences.

    Holds edit model, language model and the corpus.
    """

    def __init__(self, lm, corpus):
        """Store the language model and build the edit model.

        Args:
            lm: object providing ``score(sentence)``.
            corpus: handed to ``EditModel`` together with the edit-count file path.
        """
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable
        corrected sentence. Sentence is a list of words.

        Noisy-channel scoring: with x the observed word, w a candidate
        correction and W the sentence containing w, each candidate is ranked
        by ``log P(x|w) + log P(W)``. The first term is the value supplied by
        ``editProbabilities()``, the second is ``languageModel.score(W)``.

        Returns:
            A new list; a copy of ``sentence`` when no edit is proposed, and
            ``[]`` for empty input.
        """
        if len(sentence) == 0:
            return []

        bestSentence = sentence[:]  # copy of sentence
        bestScore = float('-inf')
        for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
            trialSentence = sentence[:]  # fresh copy; only position i is varied
            # editProbabilities yields (candidate, log-probability) tuples
            for candidate, editLogProb in self.editModel.editProbabilities(sentence[i]):
                trialSentence[i] = candidate
                score = self.languageModel.score(trialSentence) + editLogProb
                if score >= bestScore:  # ties go to the later candidate
                    bestScore = score
                    bestSentence = trialSentence[:]
        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped and are not counted in the total.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        Args:
            corpus: object whose ``corpus`` attribute is an iterable of
                sentences, each providing ``getErrorSentence()``.

        Returns:
            A compact JSON string: one array of words per sentence, wrapped
            in an outer array.
        """
        import json  # imported here so the method does not rely on a module-level import

        corrected = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        # json.dumps escapes quotes and backslashes inside words and renders an
        # empty sentence as []; compact separators keep the no-whitespace layout.
        return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)
class SpellCorrect:
    """Spelling corrector for sentences.

    Holds edit model, language model and the corpus.
    """

    def __init__(self, lm, corpus):
        """Store the language model and build the edit model.

        Args:
            lm: object providing ``score(sentence)``.
            corpus: handed to ``EditModel`` together with the edit-count file path.
        """
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable
        corrected sentence. Sentence is a list of words.

        For every interior position (``<s>`` and ``</s>`` are skipped) each
        proposed correction is spliced between the unchanged prefix and
        suffix, and the candidate is scored as
        ``languageModel.score(candidate) + log P(misspelling|correction)``.

        Returns:
            A new list; a copy of ``sentence`` when no edit is proposed, and
            ``[]`` for empty input.
        """
        if len(sentence) == 0:
            return []

        bestSentence = sentence[:]  # copy of sentence
        bestScore = float('-inf')
        # skip start and end tokens
        for i in range(1, len(sentence) - 1):
            begin = list(sentence[:i])      # words before index i stay the same
            end = list(sentence[i + 1:])    # words after index i stay the same
            # list of (correction, log(P(misspelling|correction))) tuples
            for correction, logProbability in self.editModel.editProbabilities(sentence[i]):
                potentialSentence = begin + [correction] + end
                score = self.languageModel.score(potentialSentence) + logProbability
                # replace only on a strictly greater score
                if score > bestScore:
                    bestScore = score
                    bestSentence = potentialSentence
        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped and are not counted in the total.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        Args:
            corpus: object whose ``corpus`` attribute is an iterable of
                sentences, each providing ``getErrorSentence()``.

        Returns:
            A compact JSON string: one array of words per sentence, wrapped
            in an outer array.
        """
        import json  # imported here so the method does not rely on a module-level import

        corrected = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        # json.dumps escapes quotes and backslashes inside words and renders an
        # empty sentence as []; compact separators keep the no-whitespace layout.
        return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)
class SpellCorrect:
    """Spelling corrector for sentences.

    Holds edit model, language model and the corpus.
    """

    def __init__(self, lm, corpus):
        """Store the language model and build the edit model.

        Args:
            lm: object providing ``score(sentence)``.
            corpus: handed to ``EditModel`` together with the edit-count file path.
        """
        self.languageModel = lm
        self.editModel = EditModel('../data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable
        corrected sentence. Sentence is a list of words.

        Each interior position (the first and last tokens are ignored as
        ``<s>`` / ``</s>``) is tried with every edit proposed by the edit
        model. A candidate's score is the prior (language-model score of the
        edited sentence) plus the channel-model score of the edit.

        Returns:
            A new list; a copy of ``sentence`` when no edit is proposed, and
            ``[]`` for empty input.
        """
        if len(sentence) == 0:
            return []

        # Seed the result with the input so that a sentence without any
        # candidate edit (e.g. just <s> </s>) returns cleanly.
        bestSentence = sentence[:]
        bestScore = float('-inf')
        working = sentence[:]  # scratch copy, one position varied at a time

        for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
            # editProbabilities yields (word, log-probability) pairs
            for w, p in self.editModel.editProbabilities(sentence[i]):
                working[i] = w
                candidateScore = self.languageModel.score(working) + p  # prior + channel
                if candidateScore > bestScore:  # strict: first of equal candidates wins
                    bestScore = candidateScore
                    bestSentence = working[:]
            working[i] = sentence[i]  # restore before the next position
        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped and are not counted in the total.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        Args:
            corpus: object whose ``corpus`` attribute is an iterable of
                sentences, each providing ``getErrorSentence()``.

        Returns:
            A compact JSON string: one array of words per sentence, wrapped
            in an outer array.
        """
        import json  # imported here so the method does not rely on a module-level import

        corrected = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        # json.dumps escapes quotes and backslashes inside words and renders an
        # empty sentence as []; compact separators keep the no-whitespace layout.
        return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)
class SpellCorrect:
    """Spelling corrector for sentences.

    Holds edit model, language model and the corpus.
    """

    def __init__(self, lm, corpus):
        """Store the language model and build the edit model.

        Args:
            lm: object providing ``score(sentence)``.
            corpus: handed to ``EditModel`` together with the edit-count file path.
        """
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable
        corrected sentence. Sentence is a list of words.

        Each interior position (``<s>`` and ``</s>`` are ignored) is tried
        with every candidate returned by the edit model; a candidate's score
        is the language-model score of the edited sentence plus the score
        paired with that candidate.

        Returns:
            A new list; a copy of ``sentence`` when no candidate is offered,
            and ``[]`` for empty input.
        """
        if len(sentence) == 0:
            return []

        bestSentence = sentence[:]  # returned as-is when nothing scores higher
        bestScore = float('-inf')
        for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
            # each entry pairs a candidate word with its edit score
            # (presumably a log-probability, since it is added to the LM score)
            for candidate, editScore in self.editModel.editProbabilities(sentence[i]):
                tempSentence = list(sentence)  # copy of the original sentence
                tempSentence[i] = candidate    # swap in the candidate edit
                newScore = self.languageModel.score(tempSentence) + editScore
                if newScore > bestScore:  # strict: first of equal candidates wins
                    bestSentence = tempSentence
                    bestScore = newScore
        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped and are not counted in the total.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        Args:
            corpus: object whose ``corpus`` attribute is an iterable of
                sentences, each providing ``getErrorSentence()``.

        Returns:
            A compact JSON string: one array of words per sentence, wrapped
            in an outer array.
        """
        import json  # imported here so the method does not rely on a module-level import

        corrected = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        # json.dumps escapes quotes and backslashes inside words and renders an
        # empty sentence as []; compact separators keep the no-whitespace layout.
        return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)