def output(self, partId, ch_aux):
    """Compute the grader output for one assignment part.

    Parts 1-2 exercise the edit model: every line of ``ch_aux`` is stripped
    and the (editedWord, rule) pairs of its edits are returned as a JSON
    string.  Parts 3-10 pick a language model, load ``ch_aux`` into a test
    corpus, run it through SpellCorrect.correctCorpus and return that string
    with the part ID spliced in as its first element.

    Returns None, after printing a message, for any other ``partId``.
    """
    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')

    if partId in [1, 2]:
        editModel = EditModel('../data/count_1edit.txt', trainCorpus)
        return json.dumps([
            [(e.editedWord, e.rule()) for e in editModel.edits(line.strip())]
            for line in ch_aux.split("\n")
        ])

    testCorpus = HolbrookCorpus()
    testCorpus.slurpString(ch_aux)

    if partId in [3, 4]:
        lm = LaplaceUnigramLanguageModel(trainCorpus)
    elif partId in [5, 6]:
        lm = LaplaceBigramLanguageModel(trainCorpus)
    elif partId in [7, 8]:
        lm = StupidBackoffLanguageModel(trainCorpus)
    elif partId in [9, 10]:
        lm = CustomLanguageModel(trainCorpus)
    else:
        # Report the offending value itself; str() because partId is
        # formatted with %d below and is therefore not a string.
        print('Unknown partId: ' + str(partId))
        return None

    speller = SpellCorrect(lm, trainCorpus)
    corrected = speller.correctCorpus(testCorpus)
    # Put in the part ID as well: drop the leading '[' of the corrected
    # output and prepend '[["<partId>"],' in its place.
    return '[["%d"],%s' % (partId, corrected[1:])
def scan_edits(edits_file):
    """Build and serialize a smoothed edit model from ``edits_file``.

    Every (error, correct) pair read from the file contributes its distinct
    edit types (ignoring the model's ``nc`` kind) to a counter.  Each counted
    edit is then normalized as (count + 1) / (context count + number of
    distinct characters + 1), where the context is a character bigram count
    for ``dl``/``trs`` edits and the unigram count of the first character for
    ``ins``/``sub`` edits.  The normalized table and the character unigram
    and bigram counts are written out with serialize_data.
    """
    print >> sys.stderr, "Processing " + edits_file
    editmodel = EditModel('')
    edit_probs = Counter()
    edits1 = read_edit1s(edits_file)

    print >> sys.stderr, "Counting"
    for error, correct in edits1:
        count_chars(correct)
        _, found = editmodel.get_edits(correct, error)
        # Count each edit type at most once per word pair.
        edit_probs.update(
            [edit for edit in set(found) if edit[0] != editmodel.nc])
    num_char_unigrams = len(char_counts)

    print >> sys.stderr, "Normalizing"
    norm_edit_probs = {}
    for (kind, chars), count in edit_probs.items():
        if kind == editmodel.dl:
            target = (kind, chars)
            context = get_char_bigram_count(chars)
        elif kind == editmodel.ins:
            target = (kind, chars)
            context = get_char_unigram_count(chars[0])
        elif kind == editmodel.sub:
            # If this is a substitution, reverse the characters because of
            # bug in get_edits
            target = (kind, chars[::-1])
            context = get_char_unigram_count(chars[0])
        elif kind == editmodel.trs:
            target = (kind, chars)
            context = get_char_bigram_count(chars)
        else:
            continue
        norm_edit_probs[target] = (count + 1.0) / (
            context + num_char_unigrams + 1)

    print >> sys.stderr, "Writing to file - edits_model"
    serialize_data(norm_edit_probs, 'edit_model')
    serialize_data(dict(char_counts), 'char_unigram_model')
    serialize_data(dict(char_bigram_counts), 'char_bigram_model')
def scan_edits(edits_file):
    """Build and serialize a smoothed edit model from ``edits_file``.

    Every (error, correct) pair read from the file contributes its distinct
    edit types (ignoring the model's ``nc`` kind) to a counter.  Each counted
    edit is then normalized as (count + 1) / (context count + number of
    distinct characters + 1), where the context is a character bigram count
    for ``dl``/``trs`` edits and the unigram count of the first character for
    ``ins``/``sub`` edits.  The normalized table and the character unigram
    and bigram counts are written out with serialize_data.
    """
    print >> sys.stderr, "Processing " + edits_file
    editmodel = EditModel('')
    edit_probs = Counter()
    edits1 = read_edit1s(edits_file)

    print >> sys.stderr, "Counting"
    for error, correct in edits1:
        count_chars(correct)
        _, found = editmodel.get_edits(correct, error)
        # Count each edit type at most once per word pair.
        edit_probs.update(
            [edit for edit in set(found) if edit[0] != editmodel.nc])
    num_char_unigrams = len(char_counts)

    print >> sys.stderr, "Normalizing"
    norm_edit_probs = {}
    for (kind, chars), count in edit_probs.items():
        if kind == editmodel.dl:
            target = (kind, chars)
            context = get_char_bigram_count(chars)
        elif kind == editmodel.ins:
            target = (kind, chars)
            context = get_char_unigram_count(chars[0])
        elif kind == editmodel.sub:
            # If this is a substitution, reverse the characters because of
            # bug in get_edits
            target = (kind, chars[::-1])
            context = get_char_unigram_count(chars[0])
        elif kind == editmodel.trs:
            target = (kind, chars)
            context = get_char_bigram_count(chars)
        else:
            continue
        norm_edit_probs[target] = (count + 1.0) / (
            context + num_char_unigrams + 1)

    print >> sys.stderr, "Writing to file - edits_model"
    serialize_data(norm_edit_probs, 'edit_model')
    serialize_data(dict(char_counts), 'char_unigram_model')
    serialize_data(dict(char_bigram_counts), 'char_bigram_model')
def output(self, partId, ch_aux):
    """Compute the grader output for one assignment part.

    Parts 1-2 exercise the edit model: every line of ``ch_aux`` is stripped
    and the (editedWord, rule) pairs of its edits are returned as a JSON
    string.  Parts 3-10 pick a language model, load ``ch_aux`` into a test
    corpus, run it through SpellCorrect.correctCorpus and return that string
    with the part ID spliced in as its first element.

    Returns None, after printing a message, for any other ``partId``.
    """
    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')

    if partId in [1, 2]:
        editModel = EditModel('../data/count_1edit.txt', trainCorpus)
        return json.dumps([
            [(e.editedWord, e.rule()) for e in editModel.edits(line.strip())]
            for line in ch_aux.split("\n")
        ])

    testCorpus = HolbrookCorpus()
    testCorpus.slurpString(ch_aux)

    if partId in [3, 4]:
        lm = LaplaceUnigramLanguageModel(trainCorpus)
    elif partId in [5, 6]:
        lm = LaplaceBigramLanguageModel(trainCorpus)
    elif partId in [7, 8]:
        lm = StupidBackoffLanguageModel(trainCorpus)
    elif partId in [9, 10]:
        lm = CustomLanguageModel(trainCorpus)
    else:
        # Report the offending value itself; str() because partId is
        # formatted with %d below and is therefore not a string.
        print('Unknown partId: ' + str(partId))
        return None

    speller = SpellCorrect(lm, trainCorpus)
    corrected = speller.correctCorpus(testCorpus)
    # Put in the part ID as well: drop the leading '[' of the corrected
    # output and prepend '[["<partId>"],' in its place.
    return '[["%d"],%s' % (partId, corrected[1:])
class SpellCorrect:
    """Sentence spelling corrector built on a language model and an edit model."""

    def __init__(self, lm, corpus):
        """Keep ``lm`` and build the edit model from the counts file and ``corpus``."""
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Return the most probable corrected sentence, assuming one error.

        ``sentence`` is a list of words whose first and last entries (the
        <s> and </s> markers) are never edited.  Every candidate supplied by
        the edit model is scored as its log-probability plus the language
        model score of the resulting sentence; the first strictly best
        candidate wins.  An empty sentence yields []; with no candidates a
        copy of the input is returned.
        """
        if len(sentence) == 0:
            return []

        winner = sentence[:]
        winnerScore = float('-inf')
        trial = sentence[:]
        for pos in range(1, len(sentence) - 1):  # ignore <s> and </s>
            for candidate, logProb in self.editModel.editProbabilities(sentence[pos]):
                trial[pos] = candidate
                total = logProb + self.languageModel.score(trial)
                if total > winnerScore:
                    winnerScore, winner = total, trial[:]
            # Put the untouched word back before moving to the next slot.
            trial[pos] = sentence[pos]
        return winner

    def evaluate(self, corpus):
        """Run the speller over the corpus test cases and return a SpellingResult.

        Empty test sentences are skipped; every other one counts toward the
        total, and toward the correct count when ``isCorrection`` accepts the
        hypothesis.
        """
        hits = 0
        attempts = 0
        for case in corpus.generateTestCases():
            if case.isEmpty():
                continue
            guess = self.correctSentence(case.getErrorSentence())
            if case.isCorrection(guess):
                hits += 1
            attempts += 1
        return SpellingResult(hits, attempts)

    def correctCorpus(self, corpus):
        """Correct every sentence in ``corpus.corpus`` and return a JSON-style string.

        Each corrected sentence becomes a bracketed list of double-quoted
        words; the lists are comma-joined inside an outer pair of brackets.
        Words are inserted as-is, without escaping.
        """
        encoded = []
        for entry in corpus.corpus:
            fixed = self.correctSentence(entry.getErrorSentence())
            encoded.append('["%s"]' % '","'.join(fixed))
        return '[%s]' % ','.join(encoded)
class SpellCorrect:
    """Spelling corrector for sentences.

    Holds a language model (``languageModel``) and an edit model
    (``editModel``) built from 'data/count_1edit.txt' and the corpus.
    """

    def __init__(self, lm, corpus):
        """Keep ``lm`` and build the edit model from the counts file and ``corpus``."""
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Return the most probable corrected sentence as a list of words.

        ``sentence`` is a list of words; its first and last entries (the
        <s> and </s> markers) are never edited.  The unmodified sentence is
        the baseline hypothesis, scored by the language model alone.  Each
        single-word candidate from the edit model is scored as the language
        model score of the edited sentence plus the candidate's edit
        log-probability, and replaces the current best only when strictly
        better.  An empty sentence yields [].
        """
        if len(sentence) == 0:
            return []

        # Assume the initial sentence is correct (i.e. no errors).
        bestSentence = sentence[:]
        bestScore = self.languageModel.score(bestSentence)

        for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
            for candidate in self.editModel.editProbabilities(sentence[i]):
                trial = sentence[:]
                trial[i] = candidate[0]
                score = self.languageModel.score(trial) + candidate[1]
                if score > bestScore:
                    bestScore = score
                    # Keep the word list itself: callers join and compare
                    # words, so a space-joined string would be wrong here.
                    bestSentence = trial
        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped; every other one counts toward the
        total, and toward the correct count when ``isCorrection`` accepts the
        hypothesis.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        Each corrected sentence becomes a bracketed list of double-quoted
        words; the lists are comma-joined inside an outer pair of brackets.
        Words are inserted as-is, without escaping.
        """
        string_list = []  # joined with commas, bookended with []
        for sentence in corpus.corpus:
            corrected = self.correctSentence(sentence.getErrorSentence())
            string_list.append('["%s"]' % '","'.join(corrected))
        return '[%s]' % ','.join(string_list)
class SpellCorrect:
    """Sentence spelling corrector built on a language model and an edit model."""

    def __init__(self, lm, corpus):
        """Keep ``lm`` and build the edit model from the counts file and ``corpus``."""
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Return the most probable corrected sentence, assuming one error.

        ``sentence`` is a list of words whose first and last entries (the
        <s> and </s> markers) are never edited.  For each remaining word,
        every candidate edit is substituted into a copy of the sentence and
        scored as language model score plus edit log-probability; the first
        strictly best hypothesis wins.  An empty sentence yields []; with no
        candidates a copy of the input is returned.
        """
        if len(sentence) == 0:
            return []

        topScore = float('-inf')
        topSentence = sentence[:]
        for position in range(1, len(sentence) - 1):  # ignore <s> and </s>
            candidates = self.editModel.editProbabilities(sentence[position])
            for entry in candidates:
                attempt = list(sentence)
                attempt[position] = entry[0]
                attemptScore = self.languageModel.score(attempt) + entry[1]
                if attemptScore > topScore:
                    topSentence, topScore = attempt, attemptScore
        return topSentence

    def evaluate(self, corpus):
        """Run the speller over the corpus test cases and return a SpellingResult.

        Empty test sentences are skipped; every other one counts toward the
        total, and toward the correct count when ``isCorrection`` accepts the
        hypothesis.
        """
        hits = 0
        attempts = 0
        for case in corpus.generateTestCases():
            if case.isEmpty():
                continue
            guess = self.correctSentence(case.getErrorSentence())
            if case.isCorrection(guess):
                hits += 1
            attempts += 1
        return SpellingResult(hits, attempts)

    def correctCorpus(self, corpus):
        """Correct every sentence in ``corpus.corpus`` and return a JSON-style string.

        Each corrected sentence becomes a bracketed list of double-quoted
        words; the lists are comma-joined inside an outer pair of brackets.
        Words are inserted as-is, without escaping.
        """
        encoded = [
            '["%s"]' % '","'.join(self.correctSentence(entry.getErrorSentence()))
            for entry in corpus.corpus
        ]
        return '[%s]' % ','.join(encoded)
def __init__(self, lm, corpus):
    """Store the language model and build the edit model.

    ``lm`` is kept as ``self.languageModel``; ``self.editModel`` is an
    EditModel constructed from the edit-count file and ``corpus``.
    """
    edit_counts_path = '../data/count_1edit.txt'
    self.languageModel = lm
    self.editModel = EditModel(edit_counts_path, corpus)
class SpellCorrect:
    """Sentence spelling corrector built on a language model and an edit model."""

    def __init__(self, lm, corpus):
        """Keep ``lm`` and build the edit model from the counts file and ``corpus``."""
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Return the most probable corrected sentence, assuming one error.

        ``sentence`` is a list of words whose first and last entries (the
        <s> and </s> markers) are never edited.  An empty sentence yields
        []; with no candidates a copy of the input is returned.  On equal
        scores the later candidate wins.
        """
        if len(sentence) == 0:
            return []

        chosen = sentence[:]
        chosenScore = float('-inf')
        for slot in range(1, len(sentence) - 1):  # ignore <s> and </s>
            working = sentence[:]
            # Noisy channel in log space: with x the observed word and W the
            # candidate sentence, log P(x|W) comes from editProbabilities()
            # and log P(W) from the language model's score(); their sum
            # ranks the candidate.
            for edit in self.editModel.editProbabilities(working[slot]):
                working[slot] = edit[0]
                total = self.languageModel.score(working) + edit[1]
                if total < chosenScore:
                    continue
                chosenScore = total
                chosen = working[:]
        return chosen

    def evaluate(self, corpus):
        """Run the speller over the corpus test cases and return a SpellingResult.

        Empty test sentences are skipped; every other one counts toward the
        total, and toward the correct count when ``isCorrection`` accepts the
        hypothesis.
        """
        hits = 0
        attempts = 0
        for case in corpus.generateTestCases():
            if case.isEmpty():
                continue
            guess = self.correctSentence(case.getErrorSentence())
            if case.isCorrection(guess):
                hits += 1
            attempts += 1
        return SpellingResult(hits, attempts)

    def correctCorpus(self, corpus):
        """Correct every sentence in ``corpus.corpus`` and return a JSON-style string.

        Each corrected sentence becomes a bracketed list of double-quoted
        words; the lists are comma-joined inside an outer pair of brackets.
        Words are inserted as-is, without escaping.
        """
        encoded = []
        for entry in corpus.corpus:
            fixed = self.correctSentence(entry.getErrorSentence())
            encoded.append('["%s"]' % '","'.join(fixed))
        return '[%s]' % ','.join(encoded)
class SpellCorrect:
    """Sentence spelling corrector built on a language model and an edit model."""

    def __init__(self, lm, corpus):
        """Keep ``lm`` and build the edit model from the counts file and ``corpus``."""
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Return the most probable corrected sentence, assuming one error.

        ``sentence`` is a list of words whose first and last entries (the
        <s> and </s> markers) are never edited.  Each (correction,
        log-probability) pair from the edit model is substituted at its
        position and scored as language model score plus that
        log-probability; only a strictly greater score replaces the current
        best.  An empty sentence yields []; with no candidates a copy of the
        input is returned.
        """
        if len(sentence) == 0:
            return []

        bestSentence = sentence[:]
        bestScore = float('-inf')
        for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
            for correction, probability in self.editModel.editProbabilities(sentence[i]):
                # Same words everywhere except index i.
                attempt = list(sentence)
                attempt[i] = correction
                score = self.languageModel.score(attempt) + probability
                if score > bestScore:
                    bestScore, bestSentence = score, attempt
        return bestSentence

    def evaluate(self, corpus):
        """Run the speller over the corpus test cases and return a SpellingResult.

        Empty test sentences are skipped; every other one counts toward the
        total, and toward the correct count when ``isCorrection`` accepts the
        hypothesis.
        """
        hits = 0
        attempts = 0
        for case in corpus.generateTestCases():
            if case.isEmpty():
                continue
            guess = self.correctSentence(case.getErrorSentence())
            if case.isCorrection(guess):
                hits += 1
            attempts += 1
        return SpellingResult(hits, attempts)

    def correctCorpus(self, corpus):
        """Correct every sentence in ``corpus.corpus`` and return a JSON-style string.

        Each corrected sentence becomes a bracketed list of double-quoted
        words; the lists are comma-joined inside an outer pair of brackets.
        Words are inserted as-is, without escaping.
        """
        encoded = [
            '["%s"]' % '","'.join(self.correctSentence(entry.getErrorSentence()))
            for entry in corpus.corpus
        ]
        return '[%s]' % ','.join(encoded)
def __init__(self, lm, corpus):
    """Store the language model and build the edit model.

    ``lm`` is kept as ``self.lm``; ``self.editModel`` is an EditModel
    constructed from the edit-count file and ``corpus``.
    """
    edit_counts_path = "./data/count_1edit.txt"
    self.lm = lm
    self.editModel = EditModel(edit_counts_path, corpus)
class SpellCorrection:
    """Spelling corrector holding a language model and an edit model."""

    def __init__(self, lm, corpus):
        """Keep ``lm`` and build the edit model from the counts file and ``corpus``."""
        self.lm = lm
        self.editModel = EditModel("./data/count_1edit.txt", corpus)

    @timeit
    def evaluation(self, corpus):
        """Test this speller on a corpus and return a SpellResult.

        Empty test sentences are skipped; every other one counts toward the
        total, and toward the correct count when ``isCorrection`` accepts the
        hypothesis produced by correctSentence.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            # Sentence as written, possibly containing a spelling error.
            errorSentence = sentence.getErrorSentence()
            hypothesis = self.correctSentence(errorSentence)
            # Use the test data to check correctness.
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellResult(numCorrect, numTotal)

    def correctSentence(self, sentence):
        """Return a corrected copy of ``sentence`` (a list of words).

        The first and last entries are treated as start/end tokens and never
        edited.  For every other word, each alternative returned by
        ``editModel.getProbabilities`` (a mapping of alternative word to
        probability) other than the word itself is substituted and scored as
        language model score plus log of its probability.  The
        highest-scoring substitution is applied; on equal scores the later
        one wins.  Alternatives whose probability has no logarithm, or whose
        total score is minus infinity, are never selected, so the input
        words are returned unchanged when no usable alternative exists.  An
        empty sentence yields [].
        """
        if len(sentence) == 0:
            return []

        negInf = float('-inf')
        argmax_index = 0
        argmax_word = sentence[0]
        maxscore = negInf

        # Skip start and end tokens.
        for i in range(1, len(sentence) - 1):
            word = sentence[i]
            editProbs = self.editModel.getProbabilities(word)
            for alternative, editscore in editProbs.items():
                # No misspelling happened, pass.
                if alternative == word:
                    continue
                try:
                    editscore = math.log(editscore)
                except ValueError:
                    print(word)
                    print(" log-probabilities = 0, go check editModel output!")
                    # An impossible edit must not compete: -inf >= -inf
                    # would otherwise let it replace the original word.
                    continue
                sentence[i] = alternative
                # P_final = P(corrected_sentence) * P(edit), in log space.
                score = self.lm.score(sentence) + editscore
                if score == negInf:
                    continue
                if score >= maxscore:
                    maxscore = score
                    argmax_index = i
                    argmax_word = alternative
            # Restore the caller's list before moving to the next word.
            sentence[i] = word

        argmax = list(sentence)
        argmax[argmax_index] = argmax_word
        return argmax
def __init__(self, lm, corpus):
    """Store the language model and build the edit model.

    ``lm`` is kept as ``self.languageModel``; ``self.editModel`` is an
    EditModel constructed from the edit-count file and ``corpus``.
    """
    edit_counts_path = '../data/count_1edit.txt'
    self.languageModel = lm
    self.editModel = EditModel(edit_counts_path, corpus)
class SpellCorrect:
    """Spelling corrector for sentences.

    Holds a language model (``languageModel``) and an edit model
    (``editModel``) built from '../data/count_1edit.txt' and the corpus.
    """

    def __init__(self, lm, corpus):
        """Keep ``lm`` and build the edit model from the counts file and ``corpus``."""
        self.languageModel = lm
        self.editModel = EditModel('../data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Return the most probable corrected sentence, assuming one error.

        ``sentence`` is a list of words whose first and last entries (the
        <s> and </s> markers) are never edited.  Each (word, log-probability)
        pair from the edit model is substituted at its position and scored
        as language model score (prior) plus that log-probability (channel
        model); only a strictly greater score replaces the current best.  An
        empty sentence yields []; when no candidate exists a copy of the
        input is returned.
        """
        if len(sentence) == 0:
            return []

        # Start from a copy of the input so there is always a result, even
        # when the sentence has no editable words or no candidates.
        bestSentence = sentence[:]
        bestScore = float('-inf')
        trySentence = sentence[:]

        for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
            for w, p in self.editModel.editProbabilities(sentence[i]):
                trySentence[i] = w
                # Score once per candidate instead of once for the test and
                # again for the assignment.
                score = self.languageModel.score(trySentence) + p
                if score > bestScore:
                    bestScore = score
                    bestSentence = trySentence[:]
            trySentence[i] = sentence[i]
        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped; every other one counts toward the
        total, and toward the correct count when ``isCorrection`` accepts the
        hypothesis.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        Each corrected sentence becomes a bracketed list of double-quoted
        words; the lists are comma-joined inside an outer pair of brackets.
        Words are inserted as-is, without escaping.
        """
        string_list = []  # joined with commas, bookended with []
        for sentence in corpus.corpus:
            corrected = self.correctSentence(sentence.getErrorSentence())
            string_list.append('["%s"]' % '","'.join(corrected))
        return '[%s]' % ','.join(string_list)
class SpellCorrect:
    """Spelling corrector holding a language model and an edit model."""

    def __init__(self, lm, corpus):
        """Keep ``lm`` and build the edit model from the counts file and ``corpus``."""
        self.languageModel = lm
        self.editModel = EditModel('../data/count_1edit.txt', corpus)

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped; every other one counts toward the
        total, and toward the correct count when ``isCorrection`` accepts the
        hypothesis.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctSentence(self, sentence):
        """Takes a list of words, returns a corrected list of words.

        The first and last entries are treated as start/end tokens and never
        edited.  For every other word, each alternative in the mapping
        returned by ``editModel.editProbabilities`` (alternative word to
        probability) other than the word itself is substituted and scored as
        language model score plus log of its probability.  The
        highest-scoring substitution is applied; on equal scores the later
        one wins.  Alternatives with probability 0, or whose total score is
        minus infinity, are never selected, so the input words are returned
        unchanged when no usable alternative exists.  An empty sentence
        yields [].
        """
        if len(sentence) == 0:
            return []

        negInf = float('-inf')
        argmax_i = 0
        argmax_w = sentence[0]
        maxscore = negInf

        # Skip start and end tokens.
        for i in range(1, len(sentence) - 1):
            word = sentence[i]
            editProbs = self.editModel.editProbabilities(word)
            for alternative, editscore in editProbs.iteritems():
                if alternative == word:
                    continue
                if editscore == 0:
                    # An impossible edit must not compete: -inf >= -inf
                    # would otherwise let it replace the original word.
                    continue
                sentence[i] = alternative
                score = self.languageModel.score(sentence) + math.log(editscore)
                if score == negInf:
                    continue
                if score >= maxscore:
                    maxscore = score
                    argmax_i = i
                    argmax_w = alternative
            # Restore the sentence to its original state before moving on.
            sentence[i] = word

        argmax = list(sentence)  # copy it
        argmax[argmax_i] = argmax_w  # correct it
        return argmax

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        Each corrected sentence becomes a bracketed list of double-quoted
        words; the lists are comma-joined inside an outer pair of brackets.
        Words are inserted as-is, without escaping.
        """
        string_list = []  # joined with commas, bookended with []
        for sentence in corpus.corpus:
            corrected = self.correctSentence(sentence.getErrorSentence())
            string_list.append('["%s"]' % '","'.join(corrected))
        return '[%s]' % ','.join(string_list)
return result, max if __name__ == '__main__': if len(sys.argv) < 4: print "Usage: python corrector.py <dev | test> <uniform | empirical> <queries file>" exit(0) queries_file = sys.argv[3] queries, gold, google = read_query_data(queries_file) kind_of_editmodel = sys.argv[2] #Read in unigram and bigram probs print >> sys.stderr, "Loading language model" languagemodel = LanguageModel('unigram_model','bigram_model') print >> sys.stderr, "Loading edit model" editmodel = EditModel(kind_of_editmodel,languagemodel) languagemodel.init_edit_model(editmodel) print >> sys.stderr,"Loading spell correct" spell_corrector = SpellCorrect(languagemodel, editmodel) answers = [] qc = 0 for eachquery in queries: answer = spell_corrector.spell_correct_query(eachquery) print answer print >> sys.stderr, "%d" % (qc) qc+=1 answers.append(answer) #Accuracy evaluation wrong = 0 correct = 0 for i in range(len(answers)):
class SpellCorrect:
    """Sentence spelling corrector built on a language model and an edit model."""

    def __init__(self, lm, corpus):
        """Keep ``lm`` and build the edit model from the counts file and ``corpus``."""
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Return the most probable corrected sentence, assuming one error.

        ``sentence`` is a list of words whose first and last entries (the
        <s> and </s> markers) are never edited.  An empty sentence yields
        []; with no candidates a copy of the input is returned.
        """
        if len(sentence) == 0:
            return []

        def hypotheses():
            # Yield (score, sentence) for every single-word replacement.
            # The edit model supplies (correction, log-probability) pairs
            # for a word; the language model supplies the log-probability of
            # the whole sentence; the two are added.
            for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
                for candidate in self.editModel.editProbabilities(sentence[i]):
                    rewritten = sentence[:]
                    rewritten[i] = candidate[0]
                    yield (self.languageModel.score(rewritten) + candidate[1],
                           rewritten)

        bestSentence = sentence[:]
        bestScore = float('-inf')
        for probability, rewritten in hypotheses():
            # Strict comparison: the first of several equal scores is kept.
            if probability > bestScore:
                bestScore, bestSentence = probability, rewritten
        return bestSentence

    def evaluate(self, corpus):
        """Run the speller over the corpus test cases and return a SpellingResult.

        Empty test sentences are skipped; every other one counts toward the
        total, and toward the correct count when ``isCorrection`` accepts the
        hypothesis.
        """
        hits = 0
        attempts = 0
        for case in corpus.generateTestCases():
            if case.isEmpty():
                continue
            guess = self.correctSentence(case.getErrorSentence())
            if case.isCorrection(guess):
                hits += 1
            attempts += 1
        return SpellingResult(hits, attempts)

    def correctCorpus(self, corpus):
        """Correct every sentence in ``corpus.corpus`` and return a JSON-style string.

        Each corrected sentence becomes a bracketed list of double-quoted
        words; the lists are comma-joined inside an outer pair of brackets.
        Words are inserted as-is, without escaping.
        """
        encoded = []
        for entry in corpus.corpus:
            fixed = self.correctSentence(entry.getErrorSentence())
            encoded.append('["%s"]' % '","'.join(fixed))
        return '[%s]' % ','.join(encoded)
def __init__(self, lm, corpus):
    """Store the language model and build the edit model.

    ``lm`` is kept as ``self.languageModel``; ``self.editModel`` is an
    EditModel constructed from the edit-count file and ``corpus``.
    """
    edit_counts_path = 'data/count_1edit.txt'
    self.languageModel = lm
    self.editModel = EditModel(edit_counts_path, corpus)
def __init__(self, models, lm, corpus, classModel=None):
    """Store the supplied models and build the edit model.

    ``classModel`` (default None), ``models`` and ``lm`` are kept as
    ``self.classModel``, ``self.languageModels`` and
    ``self.languageModelFunction``; ``self.editModel`` is an EditModel
    constructed from the edit-count file and ``corpus``.
    """
    edit_counts_path = './data/count_1edit.txt'
    self.classModel = classModel
    self.languageModels = models
    self.languageModelFunction = lm
    self.editModel = EditModel(edit_counts_path, corpus)