Example #1
  def output(self, partId, ch_aux):
    """Uses the student code to compute the output for test cases."""
    trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')

    if partId in [1,2]:
      editModel = EditModel('../data/count_1edit.txt', trainCorpus)
      return json.dumps([[(e.editedWord, e.rule()) for e in editModel.edits(line.strip())] for line in ch_aux.split("\n")])
    else:
      testCorpus = HolbrookCorpus()
      testCorpus.slurpString(ch_aux)
      lm = None
      if partId in [3,4]:
        lm = LaplaceUnigramLanguageModel(trainCorpus)
      elif partId in [5,6]:
        lm = LaplaceBigramLanguageModel(trainCorpus)
      elif partId in [7,8]:
        lm = StupidBackoffLanguageModel(trainCorpus)
      elif partId in [9,10]:
        lm = CustomLanguageModel(trainCorpus)
      else:
        print 'Unknown partId: %d' % partId
        return None

      speller = SpellCorrect(lm, trainCorpus)
      output = speller.correctCorpus(testCorpus)
      # put in the part ID as well
      output = '[["%d"],%s' % (partId, output[1:])
      return output
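
Note: a minimal sketch (hypothetical helper, not part of the grader code) of building the same tagged output with the json module instead of the manual string splice above; it assumes only that correctCorpus() returns a JSON array string, and whitespace in the result may differ.

import json

def tag_with_part_id(part_id, corpus_json):
    """Prepend [str(part_id)] as the first element of the corrected-corpus JSON array."""
    parsed = json.loads(corpus_json)            # e.g. [["i", "said", "hello"], ...]
    return json.dumps([[str(part_id)]] + parsed)
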
Example #2
def scan_edits( edits_file ):
    
    print >> sys.stderr, "Processing "+edits_file
    editmodel = EditModel('')
    edit_probs = Counter()
    edits1 = read_edit1s(edits_file)
    print >> sys.stderr, "Counting"
    for error,correct in edits1:
        count_chars(correct)
        v, edit_types = editmodel.get_edits( correct, error )
        edit_types = set(edit_types)
        edit_types = [each for each in edit_types if each[0] != editmodel.nc]
        edit_probs.update(edit_types)
    num_char_unigrams = len(char_counts)
    print >> sys.stderr, "Normalizing"
    norm_edit_probs = {}
    for kind,str in edit_probs.keys():
        if kind == editmodel.dl:
            norm_edit_probs[(kind,str)] = (edit_probs[(kind,str)] + 1.0)/(get_char_bigram_count(str) + num_char_unigrams + 1)
        elif kind == editmodel.ins:
            norm_edit_probs[(kind,str)] = (edit_probs[(kind,str)] + 1.0)/(get_char_unigram_count(str[0]) + num_char_unigrams + 1)
        elif kind == editmodel.sub:
            # If this is a substitution, reverse the characters because of a bug in get_edits
            norm_edit_probs[(kind,str[::-1])] = (edit_probs[(kind,str)] + 1.0)/(get_char_unigram_count(str[0]) + num_char_unigrams + 1)
        elif kind == editmodel.trs:
            norm_edit_probs[(kind,str)] = (edit_probs[(kind,str)] + 1.0)/(get_char_bigram_count(str) + num_char_unigrams + 1)
    print >> sys.stderr, "Writing to file - edits_model"
    serialize_data(norm_edit_probs, 'edit_model')
    serialize_data(dict(char_counts), 'char_unigram_model')
    serialize_data(dict(char_bigram_counts), 'char_bigram_model')
Example #3
def scan_edits(edits_file):

    print >> sys.stderr, "Processing " + edits_file
    editmodel = EditModel('')
    edit_probs = Counter()
    edits1 = read_edit1s(edits_file)
    print >> sys.stderr, "Counting"
    for error, correct in edits1:
        count_chars(correct)
        v, edit_types = editmodel.get_edits(correct, error)
        edit_types = set(edit_types)
        edit_types = [each for each in edit_types if each[0] != editmodel.nc]
        edit_probs.update(edit_types)
    num_char_unigrams = len(char_counts)
    print >> sys.stderr, "Normalizing"
    norm_edit_probs = {}
    for kind, str in edit_probs.keys():
        if kind == editmodel.dl:
            norm_edit_probs[(kind, str)] = (edit_probs[(kind, str)] + 1.0) / (
                get_char_bigram_count(str) + num_char_unigrams + 1)
        elif kind == editmodel.ins:
            norm_edit_probs[(kind, str)] = (edit_probs[(kind, str)] + 1.0) / (
                get_char_unigram_count(str[0]) + num_char_unigrams + 1)
        elif kind == editmodel.sub:
            # If this is a substitution, reverse the characters because of a bug in get_edits
            norm_edit_probs[(
                kind, str[::-1])] = (edit_probs[(kind, str)] + 1.0) / (
                    get_char_unigram_count(str[0]) + num_char_unigrams + 1)
        elif kind == editmodel.trs:
            norm_edit_probs[(kind, str)] = (edit_probs[(kind, str)] + 1.0) / (
                get_char_bigram_count(str) + num_char_unigrams + 1)
    print >> sys.stderr, "Writing to file - edits_model"
    serialize_data(norm_edit_probs, 'edit_model')
    serialize_data(dict(char_counts), 'char_unigram_model')
    serialize_data(dict(char_bigram_counts), 'char_bigram_model')
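
The normalization above is add-one (Laplace) smoothing: each raw edit count is incremented by 1 and divided by the count of its character context plus the number of distinct character unigrams plus 1. A minimal worked sketch with made-up counts (illustrative names only; the real code uses char_counts, char_bigram_counts and the get_char_*_count helpers):

from collections import Counter

# Made-up counts for illustration only.
edit_counts = Counter({('ins', 'ac'): 3})    # 'c' inserted after 'a', observed 3 times
char_unigrams = Counter({'a': 50, 'b': 20, 'c': 10})
num_char_unigrams = len(char_unigrams)       # V = 3 distinct characters seen

# Mirrors the 'ins' branch above: P(ins, "ac") = (count + 1) / (count('a') + V + 1)
p_ins = (edit_counts[('ins', 'ac')] + 1.0) / (char_unigrams['a'] + num_char_unigrams + 1)
print p_ins    # (3 + 1) / (50 + 3 + 1) = 4 / 54, roughly 0.074
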
Example #4
    def output(self, partId, ch_aux):
        """Uses the student code to compute the output for test cases."""
        trainCorpus = HolbrookCorpus('../data/holbrook-tagged-train.dat')

        if partId in [1, 2]:
            editModel = EditModel('../data/count_1edit.txt', trainCorpus)
            return json.dumps([[(e.editedWord, e.rule())
                                for e in editModel.edits(line.strip())]
                               for line in ch_aux.split("\n")])
        else:
            testCorpus = HolbrookCorpus()
            testCorpus.slurpString(ch_aux)
            lm = None
            if partId in [3, 4]:
                lm = LaplaceUnigramLanguageModel(trainCorpus)
            elif partId in [5, 6]:
                lm = LaplaceBigramLanguageModel(trainCorpus)
            elif partId in [7, 8]:
                lm = StupidBackoffLanguageModel(trainCorpus)
            elif partId in [9, 10]:
                lm = CustomLanguageModel(trainCorpus)
            else:
                print 'Unknown partId: %d' % partId
                return None

            speller = SpellCorrect(lm, trainCorpus)
            output = speller.correctCorpus(testCorpus)
            # put in the part ID as well
            output = '[["%d"],%s' % (partId, output[1:])
            return output
Example #5
class SpellCorrect:
    """Spelling corrector for sentences. Holds edit model, language model and the corpus."""
    def __init__(self, lm, corpus):
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable corrected sentence.
       Sentence is a list of words."""

        if len(sentence) == 0:
            return []

        bestSentence = sentence[:]  #copy of sentence
        bestScore = float('-inf')

        for i in xrange(1, len(sentence) - 1):  #ignore <s> and </s>
            # TODO: select the maximum probability sentence here, according to the noisy channel model.
            # Tip: self.editModel.editProbabilities(word) gives edits and log-probabilities according to your edit model.
            #      You should iterate through these values instead of enumerating all edits.
            # Tip: self.languageModel.score(trialSentence) gives log-probability of a sentence
            originalWord = sentence[i]
            editProbabilities = self.editModel.editProbabilities(originalWord)
            # print(editProbabilities)
            for word, mass in editProbabilities:
                sentence[i] = word
                newScore = mass + self.languageModel.score(sentence)
                if (newScore > bestScore):
                    bestScore = newScore
                    bestSentence = sentence[:]
            sentence[i] = originalWord

        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult"""
        numCorrect = 0
        numTotal = 0
        testData = corpus.generateTestCases()
        for sentence in testData:
            if sentence.isEmpty():
                continue
            errorSentence = sentence.getErrorSentence()
            hypothesis = self.correctSentence(errorSentence)
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output."""
        string_list = []  # we will join these with commas,  bookended with []
        sentences = corpus.corpus
        for sentence in sentences:
            uncorrected = sentence.getErrorSentence()
            corrected = self.correctSentence(uncorrected)
            word_list = '["%s"]' % '","'.join(corrected)
            string_list.append(word_list)
        output = '[%s]' % ','.join(string_list)
        return output
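
The manual quoting in correctCorpus above assumes corrected words contain no double quotes or backslashes. A minimal sketch (hypothetical helper, not part of the assignment scaffold) of the same serialization via the json module; for plain alphabetic tokens the output is identical:

import json

def corpus_to_json(corrected_sentences):
    """corrected_sentences is a list of word lists, e.g. [["i", "said", "hello"]]."""
    return json.dumps(corrected_sentences, separators=(',', ':'))
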
Example #6
class SpellCorrect:
    """Spelling corrector for sentences. Holds edit model, language model and the corpus."""
    def __init__(self, lm, corpus):
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable corrected sentence.
       Sentence is a list of words."""

        #print "\n" * 10, sentence

        if len(sentence) == 0:
            return []

        # Test
        if sentence[1] != 'bob': return []

        # Initialize the list of candidates, starting from index 1 (ignore start of sentence)
        canidates = [0]
        # Dictionary to hold score for each sentence
        weights = {}

        bestSentence = sentence[:]  #copy of sentence
        #bestScore = float('-inf')

        # Assume the initial sentence is correct (i.e. no errors)
        bestScore = self.languageModel.score(bestSentence)
        key = ' '.join(word for word in bestSentence)
        weights[key] = float(bestScore)

        for i in xrange(1, len(sentence) - 1):  #ignore <s> and </s>
            # TODO: select the maximum probability sentence here, according to the noisy channel model.
            # Tip: self.editModel.editProbabilities(word) gives edits and log-probabilities according to your edit model.
            #      You should iterate through these values instead of enumerating all edits.
            # Tip: self.languageModel.score(trialSentence) gives log-probability of a sentence

            # Gather all candidate edits, including the original word with lambda = 0.95
            tmp = self.editModel.editProbabilities(sentence[i])
            #tmp.append((word, math.log(0.95)))
            canidates.append(tmp)

            for ci in canidates[i]:
                # Modify the sentence at index i with the candidate word
                bestSentence[i] = ci[0]

                print 'original = %s \n score = %s' % (
                    sentence, self.languageModel.score(sentence))
                print 'new = %s \n score = %s + %s' % (
                    bestSentence, self.languageModel.score(bestSentence),
                    ci[1])

                # Store the new sentence in the dictionary with its total probability as the value
                # Form: {newSentence : Probability}
                bestScore = self.languageModel.score(bestSentence) + ci[1]
                # Hash as string convert after
                key = ' '.join(word for word in bestSentence)
                weights[key] = bestScore
                # Reset best Sentence
                bestSentence = sentence[:]

        # Test
        if (len(sentence) < 10):
            print "\n" * 10
            for w in weights.items():
                print w
            print "\n" * 5
            # Find bestSentence according to highest weight
            bestSentence = max(weights, key=weights.get)

            print 'original = ', sentence, self.languageModel.score(sentence)
            print "best match = ", bestSentence

            for c in canidates:
                print "given canidates = ", c

        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult"""
        numCorrect = 0
        numTotal = 0
        testData = corpus.generateTestCases()
        for sentence in testData:
            if sentence.isEmpty():
                continue
            errorSentence = sentence.getErrorSentence()
            hypothesis = self.correctSentence(errorSentence)
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output."""
        string_list = []  # we will join these with commas,  bookended with []
        sentences = corpus.corpus
        for sentence in sentences:
            uncorrected = sentence.getErrorSentence()
            corrected = self.correctSentence(uncorrected)
            word_list = '["%s"]' % '","'.join(corrected)
            string_list.append(word_list)
        output = '[%s]' % ','.join(string_list)
        return output
Example #7
class SpellCorrect:
    """Spelling corrector for sentences. Holds edit model, language model and the corpus."""
    def __init__(self, lm, corpus):
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable corrected sentence.
       Sentence is a list of words."""

        if len(sentence) == 0:
            return []

        bestSentence = sentence[:]  # get a copy of sentence for comparison
        bestScore = float('-inf')  # bestScore for comparing sentence score

        for i in xrange(1, len(sentence) - 1):  # ignore <s> and </s>
            editProb = self.editModel.editProbabilities(sentence[i])  # list of candidate words

            # iterate over the returned candidate edits for the current word in the sentence
            for j in xrange(0, len(editProb)):  # loop through each candidate word
                tempSentence = list(sentence)  # make a copy of the original sentence
                tempSentence[i] = editProb[j][0]  # replace the word with this candidate edit

                # compute newScore and compare it with bestScore to find bestSentence
                newScore = self.languageModel.score(tempSentence) + editProb[j][1]
                if newScore > bestScore:
                    bestSentence = tempSentence
                    bestScore = newScore
        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult"""
        numCorrect = 0
        numTotal = 0
        testData = corpus.generateTestCases()
        for sentence in testData:
            if sentence.isEmpty():
                continue
            errorSentence = sentence.getErrorSentence()
            hypothesis = self.correctSentence(errorSentence)
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output."""
        string_list = []  # we will join these with commas,  bookended with []
        sentences = corpus.corpus
        for sentence in sentences:
            uncorrected = sentence.getErrorSentence()
            corrected = self.correctSentence(uncorrected)
            word_list = '["%s"]' % '","'.join(corrected)
            string_list.append(word_list)
        output = '[%s]' % ','.join(string_list)
        return output
Example #9
 def __init__(self, lm, corpus):
   """initializes the language model."""
   self.languageModel = lm
   self.editModel = EditModel('../data/count_1edit.txt', corpus)
Example #10
class SpellCorrect:
    """Spelling corrector for sentences. Holds edit model, language model and the corpus."""
    def __init__(self, lm, corpus):
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable corrected sentence.
       Sentence is a list of words."""

        if len(sentence) == 0:
            return []

        bestSentence = sentence[:]  #copy of sentence
        bestScore = float('-inf')

        for i in xrange(1, len(sentence) - 1):  #ignore <s> and </s>
            # COMPLETED: select the maximum probability sentence here, according to the noisy channel model.
            # Tip: self.editModel.editProbabilities(word) gives edits and log-probabilities according to your edit model.
            #      You should iterate through these values instead of enumerating all edits.
            # Tip: self.languageModel.score(trialSentence) gives log-probability of a sentence

            trialSentence = sentence[:]  # make a copy
            word = trialSentence[i]  # pick the target word to try other edits
            # iterate through all possible edits for the target word
            # iterable items (key-value tuples) are returned by editProbabilities()
            currentEditProbabilities = self.editModel.editProbabilities(word)
            for edit in currentEditProbabilities:
                ''' For each edit:
                Let x be the observed (misspelled) word, w a candidate correction,
                and W the sentence with w substituted at position i.
                Under the noisy channel model, P(w | x) is proportional to P(x | w) * P(W),
                so we rank candidates by log(P(x | w)) + log(P(W)).
                  - log(P(W)) is the score of the sentence from each language model's score() function
                  - log(P(x | w)) is the value returned by the editProbabilities() function (see EditModel.py)
                '''
                trialSentence[i] = edit[0]
                score = self.languageModel.score(trialSentence) + edit[1]
                if score >= bestScore:
                    bestScore = score
                    bestSentence = trialSentence[:]

        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult"""
        numCorrect = 0
        numTotal = 0
        testData = corpus.generateTestCases()
        for sentence in testData:
            if sentence.isEmpty():
                continue
            errorSentence = sentence.getErrorSentence()
            hypothesis = self.correctSentence(errorSentence)
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output."""
        string_list = []  # we will join these with commas,  bookended with []
        sentences = corpus.corpus
        for sentence in sentences:
            uncorrected = sentence.getErrorSentence()
            corrected = self.correctSentence(uncorrected)
            word_list = '["%s"]' % '","'.join(corrected)
            string_list.append(word_list)
        output = '[%s]' % ','.join(string_list)
        return output
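
A small numeric illustration of the comparison described in the comment block above, using made-up log-probabilities (not taken from any trained model): the winning candidate is the one whose language-model score plus edit log-probability is highest.

import math

# Made-up values for illustration only.
candidates = [('hello', math.log(0.01)), ('hallos', math.log(0.001))]  # (word, log P(error | word))
lm_score = {'hello': -8.0, 'hallos': -15.0}  # pretend sentence scores with each candidate substituted

best_word, best_edit_logp = max(candidates, key=lambda c: lm_score[c[0]] + c[1])
print best_word    # 'hello': -8.0 + log(0.01) beats -15.0 + log(0.001)
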
Example #11
class SpellCorrect:
    """Spelling corrector for sentences. Holds edit model, language model and the corpus."""
    def __init__(self, lm, corpus):
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable corrected sentence.
       Sentence is a list of words."""

        if len(sentence) == 0:
            return []

        bestSentence = sentence[:]  #copy of sentence
        potentialSentence = []
        bestScore = float('-inf')
        # skip start and end tokens
        for i in xrange(1, len(sentence) - 1):  #ignore <s> and </s>
            # TODO: select the maximum probability sentence here, according to the noisy channel model.
            # Tip: self.editModel.editProbabilities(word) gives edits and log-probabilities according to your edit model.
            #      You should iterate through these values instead of enumerating all edits.
            # Tip: self.languageModel.score(trialSentence) gives log-probability of a sentence

            begin = list(sentence[:i])      # the part of the sentence before index i stays the same
            end = list(sentence[i + 1:])    # the part of the sentence after index i stays the same
            # list of tuples (correction, log P(misspelling | correction))
            editProbs = self.editModel.editProbabilities(sentence[i])
            for elem in editProbs:
                correction, probability = elem  # unpack the tuple
                potentialSentence = list(begin + [correction] + end)  # the sentence with the proposed correction
                # log of P(x|W) * P(W): channel model plus language model
                score = self.languageModel.score(potentialSentence) + probability
                if score > bestScore:  # update only when the score is strictly greater than the best score
                    bestScore = score
                    bestSentence = potentialSentence

        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult"""
        numCorrect = 0
        numTotal = 0
        testData = corpus.generateTestCases()
        for sentence in testData:
            if sentence.isEmpty():
                continue
            errorSentence = sentence.getErrorSentence()
            hypothesis = self.correctSentence(errorSentence)
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output."""
        string_list = []  # we will join these with commas,  bookended with []
        sentences = corpus.corpus
        for sentence in sentences:
            uncorrected = sentence.getErrorSentence()
            corrected = self.correctSentence(uncorrected)
            word_list = '["%s"]' % '","'.join(corrected)
            string_list.append(word_list)
        output = '[%s]' % ','.join(string_list)
        return output
Example #12
 def __init__(self, lm, corpus):
     self.lm = lm
     self.editModel = EditModel("./data/count_1edit.txt", corpus)
Example #13
class SpellCorrection:
    """
    Holds edit model, language model, corpus, trains
    """
    
    def __init__(self, lm, corpus):
        self.lm = lm
        self.editModel = EditModel("./data/count_1edit.txt", corpus)
    
    @timeit    
    def evaluation(self, corpus):
        
        """
        Tests this speller on a corpus
        Returns a spelling result
        """
        
        numCorrect = 0
        numTotal = 0
        testData = corpus.generateTestCases()
        for sentence in testData:
            if sentence.isEmpty():
                continue
            # get any possible spell error sentence
            errorSentence = sentence.getErrorSentence()
            # use specific language model to guess highest possible corrected sentence
            hypothesis = self.correctSentence(errorSentence)
            # use test data to check correctness
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellResult(numCorrect, numTotal)
        
    def correctSentence(self, sentence):
        
        """
        Takes a list of words, possibly including errors.
        Returns a corrected list of words.
        """
        
        if len(sentence) == 0:
            return []
        argmax_index = 0
        argmax_word = sentence[0]
        maxscore = float('-inf')
        maxlm = float('-inf')
        maxedit = float('-inf')
        
        # skip start and end tokens
        for i in range(1, len(sentence)-1):
            word = sentence[i]
            # returns a dictionary {corrected word: P(corrected word | misspelled word)} for a possibly misspelled word
            editProbs = self.editModel.getProbabilities(word) 
            for alternative, editscore in editProbs.items():
                # the candidate is the original word, skip
                if alternative == word:
                    continue
                sentence[i] = alternative
                # get score of the corrected-sentence from language model
                lmscore = self.lm.score(sentence)
                try:
                    editscore = math.log(editscore)
                except ValueError:
                    editscore = float('-inf')
                    print word
                    print " log-probabilities = 0, go check editModel output!"
                # P_final=P(corrected_sentence)*P(corrected-word|misspelled-word);
                score = lmscore + editscore
                # find the highest one and store it
                if score >= maxscore:
                    maxscore = score
                    maxlm = lmscore
                    maxedit = editscore
                    argmax_index = i
                    argmax_word = alternative
            sentence[i] = word  # restore the sentence before moving on
        argmax = list(sentence)
        # apply the single best correction found for the whole sentence
        argmax[argmax_index] = argmax_word
        return argmax
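
The try/except around math.log above guards against a zero probability coming back from the edit model. A minimal equivalent helper (hypothetical name, not part of the assignment code) that maps probability 0 to -inf so such a candidate can never become the argmax:

import math

def safe_log(p):
    """Log of a probability; returns -inf for p == 0 instead of raising ValueError."""
    return math.log(p) if p > 0 else float('-inf')
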
Example #14
 def __init__(self, lm, corpus):
   self.languageModel = lm
   self.editModel = EditModel('../data/count_1edit.txt', corpus)
Example #15
class SpellCorrect:
  """Spelling corrector for sentences. Holds edit model, language model and the corpus."""

  def __init__(self, lm, corpus):
    self.languageModel = lm
    self.editModel = EditModel('../data/count_1edit.txt', corpus)

  def correctSentence(self, sentence):
    """Assuming exactly one error per sentence, returns the most probable corrected sentence.
       Sentence is a list of words."""

    if len(sentence) == 0:
      return []

    bestSentence = sentence[:]  # copy of sentence (fallback if no edit beats bestScore)
    trySentence = sentence[:]   # working copy of sentence
    bestScore = float('-inf')

    # checking original sentence score: #print self.languageModel.score(bestSentence)

    for i in xrange(1, len(sentence) - 1): #ignore <s> and </s>
      # TODO: select the maximum probability sentence here, according to the noisy channel model.
      # Tip: self.editModel.editProbabilities(word) gives edits and log-probabilities according to your edit model.
      #      You should iterate through these values instead of enumerating all edits.
      # Tip: self.languageModel.score(trialSentence) gives log-probability of a sentence
      # checking contents : # print self.editModel.editProbabilities(sentence[i])
      for w, p in self.editModel.editProbabilities(sentence[i]):
          trySentence[i] = w
          if self.languageModel.score(trySentence) + p > bestScore:   # p : channel model
              bestScore = self.languageModel.score(trySentence) + p   # self.languageModel.score(trySentence) : prior
              bestSentence = trySentence[:]
      trySentence[i] = sentence[i]


      #pass

    # if True: #bestSentence != sentence:
    #     print self.languageModel.score(sentence), ' '.join(sentence)
    #     print self.languageModel.score(bestSentence),' '.join(bestSentence)
    #     print

    return bestSentence

  def evaluate(self, corpus):  
    """Tests this speller on a corpus, returns a SpellingResult"""
    numCorrect = 0
    numTotal = 0
    testData = corpus.generateTestCases()
    for sentence in testData:
      if sentence.isEmpty():
        continue
      errorSentence = sentence.getErrorSentence()
      hypothesis = self.correctSentence(errorSentence)
      if sentence.isCorrection(hypothesis):
        numCorrect += 1
      numTotal += 1
    return SpellingResult(numCorrect, numTotal)

  def correctCorpus(self, corpus): 
    """Corrects a whole corpus, returns a JSON representation of the output."""
    string_list = [] # we will join these with commas,  bookended with []
    sentences = corpus.corpus
    for sentence in sentences:
      uncorrected = sentence.getErrorSentence()
      corrected = self.correctSentence(uncorrected)
      word_list = '["%s"]' % '","'.join(corrected)
      string_list.append(word_list)
    output = '[%s]' % ','.join(string_list)
    return output
Example #16
class SpellCorrect:
  """Holds edit model, language model, corpus. trains"""
  

  def __init__(self, lm, corpus):
    """initializes the language model."""
    self.languageModel = lm
    self.editModel = EditModel('../data/count_1edit.txt', corpus)


  def evaluate(self, corpus):  
    """Tests this speller on a corpus, returns a SpellingResult"""
    numCorrect = 0
    numTotal = 0
    testData = corpus.generateTestCases()
    for sentence in testData:
      if sentence.isEmpty():
        continue
      errorSentence = sentence.getErrorSentence()
      hypothesis = self.correctSentence(errorSentence)
      if sentence.isCorrection(hypothesis):
        numCorrect += 1
      numTotal += 1
    return SpellingResult(numCorrect, numTotal)

  def correctSentence(self, sentence):
    """Takes a list of words, returns a corrected list of words."""
    if len(sentence) == 0:
      return []
    argmax_i = 0
    argmax_w = sentence[0]
    maxscore = float('-inf')
    maxlm = float('-inf')
    maxedit = float('-inf')

    # skip start and end tokens
    for i in range(1, len(sentence) - 1):
      word = sentence[i]
      editProbs = self.editModel.editProbabilities(word)
      for alternative, editscore in editProbs.iteritems():
        if alternative == word:
          continue
        sentence[i] = alternative
        lmscore = self.languageModel.score(sentence)
        if editscore != 0:
          editscore = math.log(editscore)
        else:
          editscore = float('-inf')
        score = lmscore + editscore
        if score >= maxscore:
          maxscore = score
          maxlm = lmscore
          maxedit = editscore
          argmax_i = i
          argmax_w = alternative

      sentence[i] = word # restores sentence to original state before moving on
    argmax = list(sentence) # copy it
    argmax[argmax_i] = argmax_w # correct it
    return argmax


  def correctCorpus(self, corpus): 
    """Corrects a whole corpus, returns a JSON representation of the output."""
    string_list = [] # we will join these with commas,  bookended with []
    sentences = corpus.corpus
    for sentence in sentences:
      uncorrected = sentence.getErrorSentence()
      corrected = self.correctSentence(uncorrected) # List<String>
      word_list = '["%s"]' % '","'.join(corrected)
      string_list.append(word_list)
    output = '[%s]' % ','.join(string_list)
    return output
Example #17
            return result, max



if __name__ == '__main__':
    if len(sys.argv) < 4:
        print "Usage: python corrector.py <dev | test> <uniform | empirical> <queries file>"
        exit(0)
    queries_file = sys.argv[3]
    queries, gold, google = read_query_data(queries_file)
    kind_of_editmodel = sys.argv[2]
    #Read in unigram and bigram probs
    print >> sys.stderr, "Loading language model"
    languagemodel = LanguageModel('unigram_model','bigram_model')
    print >> sys.stderr, "Loading edit model"
    editmodel = EditModel(kind_of_editmodel,languagemodel)
    languagemodel.init_edit_model(editmodel)
    print >> sys.stderr,"Loading spell correct"
    spell_corrector = SpellCorrect(languagemodel, editmodel)
    answers = []
    qc = 0
    for eachquery in queries:
        answer = spell_corrector.spell_correct_query(eachquery)  
        print answer  
        print >> sys.stderr, "%d" % (qc)
        qc+=1
        answers.append(answer)
    #Accuracy evaluation
    wrong = 0
    correct = 0
    for i in range(len(answers)):
class SpellCorrect:
    """Spelling corrector for sentences. Holds edit model, language model and the corpus."""
    def __init__(self, lm, corpus):
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable corrected sentence.
       Sentence is a list of words."""

        if len(sentence) == 0:
            return []

        bestSentence = sentence[:]  #copy of sentence
        bestScore = float('-inf')

        for i in range(1, len(sentence) - 1):  # ignore <s> and </s>, iterate over each word
            # TODO: select the maximum probability sentence here, according to the noisy channel model.

            # Tip: self.editModel.editProbabilities(word) gives edits and log-probabilities according to your edit model.
            #      You should iterate through these values instead of enumerating all edits.
            """
      if the misspelling word is 'hallo', it gives the list of pairs (edited word, P(hallo|edited word))
      the list will be like
      [('hello', -1.5),...('hall', -2.1),...('fool', -10.5),...]
      iterate and choose the most probable misspelling 
      """
            # Tip: self.languageModel.score(trialSentence) gives log-probability of a sentence
            """
      if the original sentence is ['I', 'said', 'hallo'], it returns
      logP(W_original) = logP(I) + logP(said) + logP(hallo)
      In this case, both logP(I) and logP(said) are large, but logP(hallo) is small,
      therefore, logP(I) + logP(said) + logP(hello) + logP(hallo|hello) may be larger than logP(W_original)
      """
            # get the list of (correction, log-probability) pairs
            candidate_list = self.editModel.editProbabilities(sentence[i])
            for candidate in candidate_list:
                new_sentence = sentence[:]
                new_sentence[i] = candidate[0]  # replace the i-th word with the candidate
                # score = log P(new sentence) + log P(error | candidate)
                probability = self.languageModel.score(new_sentence) + candidate[1]
                if probability > bestScore:
                    bestScore = probability
                    bestSentence = new_sentence

        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult"""
        numCorrect = 0
        numTotal = 0
        testData = corpus.generateTestCases()
        for sentence in testData:
            if sentence.isEmpty():
                continue
            errorSentence = sentence.getErrorSentence()
            hypothesis = self.correctSentence(errorSentence)
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output."""
        string_list = []  # we will join these with commas,  bookended with []
        sentences = corpus.corpus
        for sentence in sentences:
            uncorrected = sentence.getErrorSentence()
            corrected = self.correctSentence(uncorrected)
            word_list = '["%s"]' % '","'.join(corrected)
            string_list.append(word_list)
        output = '[%s]' % ','.join(string_list)
        return output
Example #20
 def __init__(self, lm, corpus):
     self.languageModel = lm
     self.editModel = EditModel('data/count_1edit.txt', corpus)
 def __init__(self, models, lm, corpus, classModel=None):
   """initializes the language model."""
   self.classModel = classModel
   self.languageModels = models
   self.languageModelFunction = lm
   self.editModel = EditModel('./data/count_1edit.txt', corpus)