class SpellCorrect:
  """Noisy-channel spelling corrector for sentences.

  Holds a language model (``languageModel``, supplied by the caller) and an
  edit model (``editModel``, built in ``__init__`` from the edit-count file
  and the training corpus).
  """

  def __init__(self, lm, corpus):
    """Stores the language model `lm` and builds the edit model from `corpus`."""
    self.languageModel = lm
    self.editModel = EditModel('../data/count_1edit.txt', corpus)

  def evaluate(self, corpus):
    """Tests this speller on a corpus, returns a SpellingResult.

    Empty test sentences are skipped and do not count towards the total.
    """
    numCorrect = 0
    numTotal = 0
    for sentence in corpus.generateTestCases():
      if sentence.isEmpty():
        continue
      hypothesis = self.correctSentence(sentence.getErrorSentence())
      if sentence.isCorrection(hypothesis):
        numCorrect += 1
      numTotal += 1
    return SpellingResult(numCorrect, numTotal)

  def correctSentence(self, sentence):
    """Takes a list of words, returns a corrected list of words.

    Every word except the first and last token is replaced in turn by each
    alternative from ``self.editModel.editProbabilities(word)`` (a mapping of
    alternative -> probability).  A candidate is scored as
    ``languageModel.score(candidate sentence) + log(edit probability)`` and
    the single best replacement is applied to a copy of `sentence`.

    The input list is never modified.  Alternatives equal to the original
    word, and alternatives with probability 0, are not considered.  If no
    candidate is scored at all, an unchanged copy of `sentence` is returned.
    An empty `sentence` yields ``[]``.
    """
    if len(sentence) == 0:
      return []

    bestIndex = 0
    bestWord = sentence[0]
    bestScore = float('-inf')
    # Score on a private copy so the caller's list stays intact even if the
    # language model raises half-way through.
    trial = list(sentence)

    # skip start and end tokens
    for i in range(1, len(sentence) - 1):
      word = sentence[i]
      for alternative, editProb in self.editModel.editProbabilities(word).items():
        # A zero-probability edit is impossible under the channel model; it
        # must not win merely because nothing has been scored yet.
        if alternative == word or editProb == 0:
          continue
        trial[i] = alternative
        score = self.languageModel.score(trial) + math.log(editProb)
        if score >= bestScore:  # ties go to the later candidate
          bestScore = score
          bestIndex = i
          bestWord = alternative
      trial[i] = word  # restore before moving to the next position

    corrected = list(sentence)
    corrected[bestIndex] = bestWord
    return corrected

  def correctCorpus(self, corpus):
    """Corrects a whole corpus, returns a JSON representation of the output.

    The result is a JSON array holding one array of words per sentence in
    ``corpus.corpus``, serialized without whitespace between items.
    """
    # Local import: the file's import section is not part of this class.
    import json
    corrected = [self.correctSentence(sentence.getErrorSentence())
                 for sentence in corpus.corpus]
    # json.dumps escapes quotes/backslashes and renders an empty sentence as
    # [] -- hand-formatted strings got both wrong.
    return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)
示例#2
0
class SpellCorrect:
  """Spelling corrector for sentences.

  Holds a language model (``languageModel``, supplied by the caller) and an
  edit model (``editModel``, built in ``__init__`` from the edit-count file
  and the training corpus).
  """

  def __init__(self, lm, corpus):
    """Stores the language model `lm` and builds the edit model from `corpus`."""
    self.languageModel = lm
    self.editModel = EditModel('data/count_1edit.txt', corpus)

  def correctSentence(self, sentence):
    """Assuming exactly one error per sentence, returns the most probable corrected sentence.

    `sentence` is a list of words.  For every position except the first and
    last token, each ``(candidate, score)`` pair from
    ``self.editModel.editProbabilities(word)`` is substituted into a fresh
    copy of the sentence.  The candidate sentence is scored as
    ``languageModel.score(candidate sentence) + score`` and the strictly
    highest-scoring sentence is returned.  If nothing is scored, a copy of
    the input is returned; an empty input yields ``[]``.
    """
    if len(sentence) == 0:
      return []

    bestSentence = sentence[:]  # fallback when no candidate is scored
    bestScore = float('-inf')

    for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
      for candidate, editScore in self.editModel.editProbabilities(sentence[i]):
        trialSentence = sentence[:i] + [candidate] + sentence[i + 1:]
        # Noisy channel: language-model score plus edit-model score.
        trialScore = self.languageModel.score(trialSentence) + editScore
        if trialScore > bestScore:  # strict: the first of equal scores wins
          bestSentence = trialSentence
          bestScore = trialScore
    return bestSentence

  def evaluate(self, corpus):
    """Tests this speller on a corpus, returns a SpellingResult.

    Empty test sentences are skipped and do not count towards the total.
    """
    numCorrect = 0
    numTotal = 0
    for sentence in corpus.generateTestCases():
      if sentence.isEmpty():
        continue
      hypothesis = self.correctSentence(sentence.getErrorSentence())
      if sentence.isCorrection(hypothesis):
        numCorrect += 1
      numTotal += 1
    return SpellingResult(numCorrect, numTotal)

  def correctCorpus(self, corpus):
    """Corrects a whole corpus, returns a JSON representation of the output.

    The result is a JSON array holding one array of words per sentence in
    ``corpus.corpus``, serialized without whitespace between items.
    """
    # Local import: the file's import section is not part of this class.
    import json
    corrected = [self.correctSentence(sentence.getErrorSentence())
                 for sentence in corpus.corpus]
    # json.dumps escapes quotes/backslashes and renders an empty sentence as
    # [] -- hand-formatted strings got both wrong.
    return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)
示例#3
0
class SpellCorrect:
    """Noisy-channel spelling corrector for sentences.

    Holds a language model (``languageModel``, supplied by the caller) and an
    edit model (``editModel``, built in ``__init__`` from the edit-count file
    and the training corpus).
    """

    def __init__(self, lm, corpus):
        """Stores the language model `lm` and builds the edit model from `corpus`."""
        self.languageModel = lm
        self.editModel = EditModel('../data/count_1edit.txt', corpus)

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped and do not count towards the total.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctSentence(self, sentence):
        """Takes a list of words, returns a corrected list of words.

        Every word except the first and last token is replaced in turn by
        each alternative from ``self.editModel.editProbabilities(word)`` (a
        mapping of alternative -> probability).  A candidate is scored as
        ``languageModel.score(candidate sentence) + log(edit probability)``
        and the single best replacement is applied to a copy of `sentence`.

        The input list is never modified.  Alternatives equal to the original
        word, and alternatives with probability 0, are not considered.  If no
        candidate is scored at all, an unchanged copy of `sentence` is
        returned.  An empty `sentence` yields ``[]``.
        """
        if len(sentence) == 0:
            return []

        bestIndex = 0
        bestWord = sentence[0]
        bestScore = float('-inf')
        # Score on a private copy so the caller's list stays intact even if
        # the language model raises half-way through.
        trial = list(sentence)

        # skip start and end tokens
        for i in range(1, len(sentence) - 1):
            word = sentence[i]
            alternatives = self.editModel.editProbabilities(word)
            for alternative, editProb in alternatives.items():
                # A zero-probability edit is impossible under the channel
                # model; it must not win merely because nothing has been
                # scored yet.
                if alternative == word or editProb == 0:
                    continue
                trial[i] = alternative
                score = self.languageModel.score(trial) + math.log(editProb)
                if score >= bestScore:  # ties go to the later candidate
                    bestScore = score
                    bestIndex = i
                    bestWord = alternative
            trial[i] = word  # restore before moving to the next position

        corrected = list(sentence)
        corrected[bestIndex] = bestWord
        return corrected

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        The result is a JSON array holding one array of words per sentence in
        ``corpus.corpus``, serialized without whitespace between items.
        """
        # Local import: the file's import section is not part of this class.
        import json
        corrected = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        # json.dumps escapes quotes/backslashes and renders an empty sentence
        # as [] -- hand-formatted strings got both wrong.
        return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)
示例#4
0
class SpellCorrect:
    """Spelling corrector for sentences.

    Holds a language model (``languageModel``, supplied by the caller) and an
    edit model (``editModel``, built in ``__init__`` from the edit-count file
    and the training corpus).
    """

    def __init__(self, lm, corpus):
        """Stores the language model `lm` and builds the edit model from `corpus`."""
        self.languageModel = lm
        self.editModel = EditModel('../data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable corrected sentence.

        `sentence` is a list of words.  For every position except the first
        and last token, each ``(candidate, score)`` pair from
        ``self.editModel.editProbabilities(word)`` is substituted into a
        working copy of the sentence.  The candidate sentence is scored as
        ``languageModel.score(candidate sentence) + score`` (prior plus
        channel model) and the strictly highest-scoring sentence is returned.
        If nothing is scored -- e.g. the sentence holds only the boundary
        tokens -- an unchanged copy of the input is returned; an empty input
        yields ``[]``.
        """
        if len(sentence) == 0:
            return []

        # Must be bound up front: with no scored candidate the loop below
        # never assigns it, and returning it would raise UnboundLocalError.
        bestSentence = sentence[:]
        trySentence = sentence[:]  # scratch copy mutated one slot at a time
        bestScore = float('-inf')

        for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
            for candidate, channelScore in self.editModel.editProbabilities(
                    sentence[i]):
                trySentence[i] = candidate
                # Score once per candidate; the language model call is the
                # expensive part.
                score = self.languageModel.score(trySentence) + channelScore
                if score > bestScore:
                    bestScore = score
                    bestSentence = trySentence[:]
            trySentence[i] = sentence[i]  # restore before the next position

        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped and do not count towards the total.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        The result is a JSON array holding one array of words per sentence in
        ``corpus.corpus``, serialized without whitespace between items.
        """
        # Local import: the file's import section is not part of this class.
        import json
        corrected = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        # json.dumps escapes quotes/backslashes and renders an empty sentence
        # as [] -- hand-formatted strings got both wrong.
        return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)
示例#5
0
class SpellCorrect:
    """Spelling corrector for sentences.

    Holds a language model (``languageModel``, supplied by the caller) and an
    edit model (``editModel``, built in ``__init__`` from the edit-count file
    and the training corpus).
    """

    def __init__(self, lm, corpus):
        """Stores the language model `lm` and builds the edit model from `corpus`."""
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable corrected sentence.

        `sentence` is a list of words.  For every position except the first
        and last token, each ``(candidate, score)`` pair from
        ``self.editModel.editProbabilities(word)`` is substituted into a copy
        of the sentence.  With x the observed word and W the candidate
        sentence, the candidate is scored as log P(x|W) + log P(W): the
        edit-model value plus ``languageModel.score(candidate sentence)``.
        The highest-scoring sentence is returned; on equal scores the later
        candidate wins.  If nothing is scored, a copy of the input is
        returned; an empty input yields ``[]``.
        """
        if len(sentence) == 0:
            return []

        bestSentence = sentence[:]  # fallback when no candidate is scored
        bestScore = float('-inf')

        for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
            trialSentence = sentence[:]  # fresh copy for this position
            candidates = self.editModel.editProbabilities(trialSentence[i])
            for candidate, editScore in candidates:
                trialSentence[i] = candidate
                score = self.languageModel.score(trialSentence) + editScore
                if score >= bestScore:  # ties go to the later candidate
                    bestScore = score
                    bestSentence = trialSentence[:]

        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped and do not count towards the total.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        The result is a JSON array holding one array of words per sentence in
        ``corpus.corpus``, serialized without whitespace between items.
        """
        # Local import: the file's import section is not part of this class.
        import json
        corrected = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        # json.dumps escapes quotes/backslashes and renders an empty sentence
        # as [] -- hand-formatted strings got both wrong.
        return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)
示例#6
0
class SpellCorrect:
    """Spelling corrector for sentences.

    Holds a language model (``languageModel``, supplied by the caller) and an
    edit model (``editModel``, built in ``__init__`` from the edit-count file
    and the training corpus).
    """

    def __init__(self, lm, corpus):
        """Stores the language model `lm` and builds the edit model from `corpus`."""
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable corrected sentence.

        `sentence` is a list of words.  For every position except the first
        and last token, each ``(correction, score)`` pair from
        ``self.editModel.editProbabilities(word)`` replaces that word while
        the words before and after stay as they are.  The candidate sentence
        is scored as ``languageModel.score(candidate sentence) + score`` and
        the strictly highest-scoring sentence is returned.  If nothing is
        scored, a copy of the input is returned; an empty input yields
        ``[]``.
        """
        if len(sentence) == 0:
            return []

        bestSentence = sentence[:]  # fallback when no candidate is scored
        bestScore = float('-inf')

        for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
            # Everything except index i is shared by all candidates here.
            begin = list(sentence[:i])
            end = list(sentence[i + 1:])
            for correction, editScore in self.editModel.editProbabilities(
                    sentence[i]):
                potentialSentence = begin + [correction] + end
                score = self.languageModel.score(potentialSentence) + editScore
                if score > bestScore:  # strict: the first of equal scores wins
                    bestScore = score
                    bestSentence = potentialSentence

        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped and do not count towards the total.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        The result is a JSON array holding one array of words per sentence in
        ``corpus.corpus``, serialized without whitespace between items.
        """
        # Local import: the file's import section is not part of this class.
        import json
        corrected = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        # json.dumps escapes quotes/backslashes and renders an empty sentence
        # as [] -- hand-formatted strings got both wrong.
        return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)
class SpellCorrect:
  """Spelling corrector for sentences.

  Holds a language model (``languageModel``, supplied by the caller) and an
  edit model (``editModel``, built in ``__init__`` from the edit-count file
  and the training corpus).
  """

  def __init__(self, lm, corpus):
    """Stores the language model `lm` and builds the edit model from `corpus`."""
    self.languageModel = lm
    self.editModel = EditModel('../data/count_1edit.txt', corpus)

  def correctSentence(self, sentence):
    """Assuming exactly one error per sentence, returns the most probable corrected sentence.

    `sentence` is a list of words.  For every position except the first and
    last token, each ``(candidate, score)`` pair from
    ``self.editModel.editProbabilities(word)`` is substituted into a working
    copy of the sentence.  The candidate sentence is scored as
    ``languageModel.score(candidate sentence) + score`` (prior plus channel
    model) and the strictly highest-scoring sentence is returned.  If
    nothing is scored -- e.g. the sentence holds only the boundary tokens --
    an unchanged copy of the input is returned; an empty input yields ``[]``.
    """
    if len(sentence) == 0:
      return []

    # Must be bound up front: with no scored candidate the loop below never
    # assigns it, and returning it would raise UnboundLocalError.
    bestSentence = sentence[:]
    trySentence = sentence[:]  # scratch copy mutated one slot at a time
    bestScore = float('-inf')

    for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
      for candidate, channelScore in self.editModel.editProbabilities(sentence[i]):
        trySentence[i] = candidate
        # Score once per candidate; the language model call is the
        # expensive part.
        score = self.languageModel.score(trySentence) + channelScore
        if score > bestScore:
          bestScore = score
          bestSentence = trySentence[:]
      trySentence[i] = sentence[i]  # restore before the next position

    return bestSentence

  def evaluate(self, corpus):
    """Tests this speller on a corpus, returns a SpellingResult.

    Empty test sentences are skipped and do not count towards the total.
    """
    numCorrect = 0
    numTotal = 0
    for sentence in corpus.generateTestCases():
      if sentence.isEmpty():
        continue
      hypothesis = self.correctSentence(sentence.getErrorSentence())
      if sentence.isCorrection(hypothesis):
        numCorrect += 1
      numTotal += 1
    return SpellingResult(numCorrect, numTotal)

  def correctCorpus(self, corpus):
    """Corrects a whole corpus, returns a JSON representation of the output.

    The result is a JSON array holding one array of words per sentence in
    ``corpus.corpus``, serialized without whitespace between items.
    """
    # Local import: the file's import section is not part of this class.
    import json
    corrected = [self.correctSentence(sentence.getErrorSentence())
                 for sentence in corpus.corpus]
    # json.dumps escapes quotes/backslashes and renders an empty sentence as
    # [] -- hand-formatted strings got both wrong.
    return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)
示例#8
0
class SpellCorrect:
    """Spelling corrector for sentences.

    Holds a language model (``languageModel``, supplied by the caller) and an
    edit model (``editModel``, built in ``__init__`` from the edit-count file
    and the training corpus).
    """

    def __init__(self, lm, corpus):
        """Stores the language model `lm` and builds the edit model from `corpus`."""
        self.languageModel = lm
        self.editModel = EditModel('data/count_1edit.txt', corpus)

    def correctSentence(self, sentence):
        """Assuming exactly one error per sentence, returns the most probable corrected sentence.

        `sentence` is a list of words.  For every position except the first
        and last token, each ``(candidate, score)`` pair from
        ``self.editModel.editProbabilities(word)`` is substituted into a
        fresh copy of the sentence.  The candidate sentence is scored as
        ``languageModel.score(candidate sentence) + score`` and the strictly
        highest-scoring sentence is returned.  If nothing is scored, a copy
        of the input is returned; an empty input yields ``[]``.
        """
        if len(sentence) == 0:
            return []

        bestSentence = sentence[:]  # fallback when no candidate is scored
        bestScore = float('-inf')

        for i in range(1, len(sentence) - 1):  # ignore <s> and </s>
            # Iterate the candidate pairs directly instead of by index.
            for candidate, editScore in self.editModel.editProbabilities(
                    sentence[i]):
                tempSentence = list(sentence)  # copy of the original sentence
                tempSentence[i] = candidate
                newScore = self.languageModel.score(tempSentence) + editScore
                if newScore > bestScore:  # strict: first of equal scores wins
                    bestSentence = tempSentence
                    bestScore = newScore
        return bestSentence

    def evaluate(self, corpus):
        """Tests this speller on a corpus, returns a SpellingResult.

        Empty test sentences are skipped and do not count towards the total.
        """
        numCorrect = 0
        numTotal = 0
        for sentence in corpus.generateTestCases():
            if sentence.isEmpty():
                continue
            hypothesis = self.correctSentence(sentence.getErrorSentence())
            if sentence.isCorrection(hypothesis):
                numCorrect += 1
            numTotal += 1
        return SpellingResult(numCorrect, numTotal)

    def correctCorpus(self, corpus):
        """Corrects a whole corpus, returns a JSON representation of the output.

        The result is a JSON array holding one array of words per sentence in
        ``corpus.corpus``, serialized without whitespace between items.
        """
        # Local import: the file's import section is not part of this class.
        import json
        corrected = [
            self.correctSentence(sentence.getErrorSentence())
            for sentence in corpus.corpus
        ]
        # json.dumps escapes quotes/backslashes and renders an empty sentence
        # as [] -- hand-formatted strings got both wrong.
        return json.dumps(corrected, separators=(',', ':'), ensure_ascii=False)