import string

import spacy
from spacy.tokenizer import Tokenizer

# EditDistanceFinder, LanguageModel, and AddAlphaBigramModel come from the
# course-provided modules; these module names are assumptions.
from edit_distance import EditDistanceFinder
from language_model import LanguageModel, AddAlphaBigramModel


class SpellChecker():
    def __init__(self, max_distance, channel_model=None, language_model=None):
        self.nlp = spacy.load('en', pipeline=["tagger", "parser"])
        self.max_distance = max_distance
        self.channel_model = channel_model
        self.language_model = language_model

    def load_channel_model(self, fp):
        self.channel_model = EditDistanceFinder()
        self.channel_model.load(fp)

    def load_language_model(self, fp):
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        prevFocusScore = self.language_model.bigram_prob(prev_word, focus_word)
        focusNextScore = self.language_model.bigram_prob(focus_word, next_word)
        return (prevFocusScore + focusNextScore) / 2

    def unigram_score(self, word):
        return self.language_model.unigram_prob(word)

    def cm_score(self, error_word, corrected_word):
        return self.channel_model.prob(error_word, corrected_word)

    def inserts(self, word):
        '''Take a word and return a list of words that are within one
        insert of it.'''
        possibleWords = []
        # Try inserting every letter ...
        for letter in string.ascii_lowercase:
            # ... at every possible position.
            for i in range(len(word) + 1):
                testWord = word[:i] + letter + word[i:]
                # Keep the result only if it is a real word.
                if testWord in self.language_model:
                    possibleWords.append(testWord)
        return possibleWords

    def deletes(self, word):
        possibleWords = []
        # Try deleting the letter at every position.
        for i in range(len(word)):
            testWord = word[:i] + word[i + 1:]
            if testWord in self.language_model:
                possibleWords.append(testWord)
        return possibleWords

    def substitutions(self, word):
        possibleWords = []
        # Try substituting every letter ...
        for letter in string.ascii_lowercase:
            # ... at every position in the word.
            for i in range(len(word)):
                testWord = word[:i] + letter + word[i + 1:]
                if testWord in self.language_model:
                    possibleWords.append(testWord)
        return possibleWords

    def generate_candidates(self, word):
        '''Return the words that are within self.max_distance edits of
        word.'''
        candidateWords = []
        for i in range(1, self.max_distance + 1):
            if i == 1:
                candidateWords = (self.inserts(word) + self.deletes(word) +
                                  self.substitutions(word))
            else:
                newWords = []
                for currentWord in candidateWords:
                    newWords += (self.inserts(currentWord) +
                                 self.deletes(currentWord) +
                                 self.substitutions(currentWord))
                candidateWords += newWords
        # Get rid of duplicates.
        return list(set(candidateWords))

    def check_sentence(self, sentence, fallback=False):
        returnList = []
        for i in range(len(sentence)):
            prevWord = '<s>' if i == 0 else sentence[i - 1]
            nextWord = '</s>' if i == len(sentence) - 1 else sentence[i + 1]
            word = sentence[i]
            # If it's in the language model, add just that word.
            if word in self.language_model:
                returnList.append([word])
            else:
                # Score every candidate for the misspelled word.
                candidates = self.generate_candidates(word)
                candidateList = []
                if candidates == [] and fallback:
                    candidateList = [word]
                else:
                    for candidate in candidates:
                        unigramScore = self.unigram_score(candidate)
                        bigramScore = self.bigram_score(prevWord, candidate,
                                                        nextWord)
                        languageScore = 0.5 * unigramScore + 0.5 * bigramScore
                        candidateScore = (languageScore +
                                          self.cm_score(word, candidate))
                        candidateList.append([candidate, candidateScore])
                    # Sort by score, best first, then keep just the words.
                    candidateList.sort(key=lambda x: x[1], reverse=True)
                    candidateList = [x[0] for x in candidateList]
                returnList += [candidateList]
        return returnList

    def check_text(self, text, fallback=False):
        '''Take a string as input, tokenize and sentence-segment it with
        spacy, and return the results of calling check_sentence on each of
        the resulting sentence objects.'''
        tokens = self.nlp(text)
        processedSentences = []
        for sentence in tokens.sents:
            # Convert the sentence into a list of lowercase words.
            wordList = [x.lower() for x in sentence.text.split()]
            processedSentences.append(self.check_sentence(wordList, fallback))
        return processedSentences

    def autocorrect_sentence(self, sentence):
        '''Take a tokenized sentence (as a list of words), call
        check_sentence on it with fallback=True, and return a new list of
        tokens where each non-word has been replaced by its most likely
        spelling correction.'''
        corrections = self.check_sentence(sentence, fallback=True)
        return [x[0] for x in corrections]

    def autocorrect_line(self, line):
        '''Take a string as input, tokenize and segment it with spacy, and
        return the results of calling autocorrect_sentence on each of the
        resulting sentence objects.'''
        tokens = self.nlp(line)
        processedSentences = []
        for sentence in tokens.sents:
            wordList = sentence.text.split()
            if len(wordList) == 0:
                continue
            wordList = [x.lower() for x in wordList]
            processedSentences.append(self.autocorrect_sentence(wordList))
        return processedSentences

    def suggest_sentence(self, sentence, max_suggestions):
        '''Take a tokenized sentence (as a list of words), call
        check_sentence on it, and return a new list where real words are
        plain strings and non-words are lists of up to max_suggestions
        suggested spellings, ordered by the model's preference.'''
        sentenceCorrections = self.check_sentence(sentence)
        returnList = []
        for word in sentenceCorrections:
            if len(word) == 1:
                returnList += word
            else:
                returnList.append(word[:max_suggestions])
        return returnList

    def suggest_text(self, text, max_suggestions):
        '''Take a string as input, tokenize and segment it with spacy, and
        return the results of calling suggest_sentence on each of the
        resulting sentence objects.'''
        tokens = self.nlp(text)
        processedSentences = []
        for sentence in tokens.sents:
            wordList = [x.lower() for x in sentence.text.split()]
            # Strip a trailing period, if any.
            if wordList and wordList[-1].endswith('.'):
                wordList[-1] = wordList[-1][:-1]
            processedSentences.append(
                self.suggest_sentence(wordList, max_suggestions))
        return processedSentences
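# A minimal usage sketch for the implementation above; the model file
# paths below are hypothetical placeholders, not part of the original code.
checker = SpellChecker(max_distance=2)
checker.load_channel_model("channel_model.txt")    # hypothetical path
checker.load_language_model("language_model.txt")  # hypothetical path

# Real words come back as singleton lists; non-words as ranked candidates.
print(checker.check_sentence(["this", "is", "a", "tset"]))
# Autocorrect keeps the top-ranked candidate for each non-word.
print(checker.autocorrect_sentence(["this", "is", "a", "tset"]))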
class SpellChecker():
    def __init__(self, channel_model=None, language_model=None,
                 max_distance=100):
        self.nlp = spacy.load("en", pipeline=["tagger", "parser"])
        self.channel_model = channel_model
        self.language_model = language_model
        self.max_distance = max_distance

    def load_channel_model(self, fp):
        self.channel_model = EditDistanceFinder()
        self.channel_model.load(fp)

    def load_language_model(self, fp):
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        """Take three words and return the average bigram probability of
        the first and last pairs."""
        return (self.language_model.bigram_prob(prev_word, focus_word) +
                self.language_model.bigram_prob(focus_word, next_word)) / 2

    def unigram_score(self, word):
        """Take a word and return its unigram probability."""
        return self.language_model.unigram_prob(word)

    def cm_score(self, error_word, corrected_word):
        """Give the probability of a word having been transformed into a
        given erroneous form.

        params
        ------
        error_word     - the observed misspelling
        corrected_word - the proposed corrected word

        returns
        -------
        prob - the probability of the corrected word having been
               transformed into the error word
        """
        return self.channel_model.prob(error_word, corrected_word)

    def inserts(self, word):
        wordsFound = []
        wordLen = len(word)
        for v in self.language_model.vocabulary:
            if v.isalpha() and len(v) == wordLen + 1:
                if self.subseq(word, v):
                    wordsFound.append(v)
        return wordsFound

    def subseq(self, word1, word2):
        """Return True if word1 is a subsequence of word2."""
        for i in range(len(word1)):
            if word1[i] not in word2:
                return False
            index = word2.index(word1[i])
            word2 = word2[index + 1:]
        return True

    def deletes(self, word):
        wordsFound = []
        wordLen = len(word)
        for v in self.language_model.vocabulary:
            if v.isalpha() and len(v) == wordLen - 1:
                if self.subseq(v, word):
                    wordsFound.append(v)
        return wordsFound

    def substitutions(self, word):
        """Take a word as input and return a list of words (that are in
        the LanguageModel) that are within one substitution of word."""
        subList = []
        wordLen = len(word)
        for candidate in self.language_model.vocabulary:
            if candidate.isalpha() and len(candidate) == wordLen \
                    and candidate != word:
                for i in range(wordLen):
                    candidateDel = candidate[:i] + candidate[i + 1:]
                    wordDel = word[:i] + word[i + 1:]
                    if candidateDel == wordDel:
                        if candidate not in subList:
                            subList.append(candidate)
                        break
        return subList

    def transpositions(self, word):
        """Take a word as input and return a list of words in the
        LanguageModel that are within one transposition of the word."""
        wordsFound = []
        wordLen = len(word)
        # range(wordLen - 1) so the last adjacent pair is also swapped.
        for i in range(wordLen - 1):
            transp = word[:i] + word[i + 1] + word[i] + word[i + 2:]
            if transp in self.language_model:
                wordsFound.append(transp)
        return wordsFound

    def generate_candidates(self, word):
        """Return a list of words within max_distance edits of the given
        word."""
        words = {word}
        for i in range(self.max_distance):
            # Find all words within edit distance 1 of the words currently
            # in the set.
            new_words = set()
            for candidate in words:
                new_words |= (set(self.inserts(candidate)) |
                              set(self.deletes(candidate)) |
                              set(self.substitutions(candidate)) |
                              set(self.transpositions(candidate)))
            words |= new_words
        if word not in self.language_model:
            # We seeded the set with word to generate the first candidates,
            # but we don't want it in the result if it isn't actually a word.
            words.remove(word)
        return list(words)

    def check_non_words(self, sentence, fallback=False):
        words = []
        for i in range(len(sentence)):
            if sentence[i] in self.language_model:
                words.append([sentence[i]])
            else:
                candidates = self.generate_candidates(sentence[i])
                prev_word = '<s>' if i == 0 else sentence[i - 1]
                next_word = '</s>' if i == len(sentence) - 1 else sentence[i + 1]
                # Rank by a weighted mix of language-model and channel-model
                # scores.
                candidates.sort(
                    key=lambda x: 0.7 * (0.7 * self.bigram_score(prev_word, x,
                                                                 next_word) +
                                         0.3 * self.unigram_score(x)) +
                                  0.3 * self.cm_score(sentence[i], x),
                    reverse=True)
                if fallback and not candidates:
                    candidates = [sentence[i]]
                words.append(candidates)
        return words

    def check_sentence(self, sentence, fallback=False):
        # Accept either spacy tokens or plain strings, keep only the
        # lowercase letters, and drop tokens that end up empty.
        sentList = [''.join(char for char in getattr(token, 'text', token)
                            if char in string.ascii_lowercase)
                    for token in sentence]
        sentList = [token for token in sentList if token]
        return self.check_non_words(sentList, fallback)

    def check_text(self, text, fallback=False):
        """Take a string as input, tokenize and sentence-segment it with
        spacy, and return the concatenation of the results of calling
        check_sentence on each of the resulting sentence objects."""
        self.nlp.tokenizer = Tokenizer(self.nlp.vocab)
        doc = self.nlp(text.lower())
        result = []
        for sent in doc.sents:
            result.extend(self.check_sentence(sent, fallback))
        return result

    def autocorrect_sentence(self, sentence):
        """Take a tokenized sentence (as a list of words) as input, call
        check_sentence on it with fallback=True, and return a new list of
        tokens where each non-word has been replaced by its most likely
        spelling correction."""
        words = self.check_sentence(sentence, True)
        # One entry per kept token; take the top suggestion for each.
        return [w[0] for w in words]

    def autocorrect_line(self, line):
        """Take a string as input, tokenize and segment it with spacy, and
        return the concatenation of the results of calling
        autocorrect_sentence on each of the resulting sentence objects."""
        checkLines = self.check_text(line, True)
        newSentence = []
        for i in range(len(checkLines)):
            newSentence.append(checkLines[i][0])
        return ' '.join(newSentence)

    def suggest_sentence(self, sentence, max_suggestions):
        # check_sentence returns one list per kept token: singleton lists
        # of real words stay strings, non-words become suggestion lists.
        words = self.check_sentence(sentence, True)
        newSentence = []
        for w in words:
            if len(w) == 1 and w[0] in self.language_model:
                newSentence.append(w[0])
            else:
                newSentence.append(w[:max_suggestions])
        return newSentence

    def suggest_text(self, text, max_suggestions):
        self.nlp.tokenizer = Tokenizer(self.nlp.vocab)
        doc = self.nlp(text.lower())
        result = []
        for sent in doc.sents:
            result.extend(self.suggest_sentence(sent, max_suggestions))
        return result
# (Standalone __init__ fragment; its enclosing class is not present in
# the source.)
def __init__(self, channel_model=None, language_model=None, max_distance=1):
    self.channel_model = channel_model if channel_model is not None else EditDistanceFinder()
    self.language_model = language_model if language_model is not None else LanguageModel()
    self.max_distance = max_distance
    self.char_set = list(string.ascii_lowercase)
class SpellChecker(object):
    def __init__(self, max_distance, channel_model=None, language_model=None):
        self.channel_model = channel_model
        self.language_model = language_model
        self.max_distance = max_distance
        self.nlp = spacy.load("en", pipeline=["tagger", "parser"])

    def load_channel_model(self, fp):
        self.channel_model = EditDistanceFinder()
        self.channel_model.load(fp)

    def load_language_model(self, fp):
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        score = lambda x, y: self.language_model.bigram_prob(x, y)
        return (score(prev_word, focus_word) + score(focus_word, next_word)) / 2.0

    def unigram_score(self, word):
        return self.language_model.unigram_prob(word)

    def inserts(self, word):
        l = []
        # range(len(word) + 1) so we can also insert after the last letter.
        for i in range(len(word) + 1):
            for char in string.ascii_lowercase:
                l.append(word[:i] + char + word[i:])
        return [x for x in l if x in self.language_model]

    def deletes(self, word):
        l = []
        for i in range(len(word)):
            l.append(word[:i] + word[i + 1:])
        return [x for x in l if x in self.language_model]

    def substitutions(self, word):
        l = []
        for i in range(len(word)):
            for char in string.ascii_lowercase:
                l.append(word[:i] + char + word[i + 1:])
        return [x for x in l if x in self.language_model]

    def transposes(self, word):
        l = []
        for i in range(1, len(word)):
            l.append(word[:i - 1] + word[i] + word[i - 1] + word[i + 1:])
        return [x for x in l if x in self.language_model]

    def cm_score(self, error_word, corrected_word):
        return self.channel_model.align(error_word, corrected_word)[0]

    def generate_candidates(self, word):
        source = [word]
        for i in range(self.max_distance):
            nested = list(map(self._one_step, source))
            flat = [l for sublist in nested for l in sublist]
            source = list(set(flat))
        return source

    def check_sentence(self, sentence, fallback=False):
        l = []
        for i in range(len(sentence)):
            word = sentence[i]
            if (word in self.language_model or word in string.punctuation
                    or word == '\n'):
                l.append([word])
            else:
                choices = self.generate_candidates(word)
                if len(choices) == 0:
                    # With no candidates, fall back to the word itself if
                    # requested; otherwise report an empty suggestion list.
                    l.append([word] if fallback else [])
                else:
                    prev_word = '<s>' if i < 1 else sentence[i - 1]
                    next_word = ('</s>' if i + 1 == len(sentence)
                                 else sentence[i + 1])
                    rank = lambda x: self._combine_scores(
                        self.cm_score(x, word),
                        self.bigram_score(prev_word, x, next_word),
                        self.unigram_score(x))
                    l.append(list(sorted(choices, key=rank, reverse=False)))
        return l

    def _combine_scores(self, cm_score, bigram_score, unigram_score):
        return cm_score - 0.5 * (bigram_score + unigram_score)

    def _one_step(self, word, transpose=True):
        # transpose was an undefined global in the original; it is now a
        # flag that toggles transposition edits (defaulting to on).
        if transpose:
            return (self.inserts(word) + self.deletes(word) +
                    self.substitutions(word) + self.transposes(word))
        return self.inserts(word) + self.deletes(word) + self.substitutions(word)

    def autocorrect_sentence(self, sentence):
        options = self.check_sentence(sentence, fallback=True)
        return [x[0] for x in options]

    def suggest_sentence(self, sentence, max_suggestions):
        options = self.check_sentence(sentence)
        # Singleton lists are real words; everything else is a suggestion
        # list to be truncated.
        get = lambda x: x[0] if len(x) == 1 else x[:max_suggestions]
        return [get(x) for x in options]

    def check_text(self, text, fallback=False):
        func = lambda x: self.check_sentence(x, fallback)
        return self._spacy_map(text, func)

    def autocorrect_line(self, line):
        return self._spacy_map(line, self.autocorrect_sentence)

    def suggest_text(self, text, max_suggestions):
        func = lambda x: self.suggest_sentence(x, max_suggestions)
        return self._spacy_map(text, func)

    def _spacy_map(self, text, function):
        doc = self.nlp(text.lower())
        l = []
        for sentence in doc.sents:
            stringlist = [str(x) for x in sentence]
            l += function(stringlist)
        return l
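# A dependency-free sketch of the edit-distance-1 neighborhood that the
# inserts/deletes/substitutions/transposes methods above enumerate, here
# filtered against a toy vocabulary (an assumption for illustration).
def edit1_neighbors(word, vocabulary):
    candidates = set()
    for i in range(len(word) + 1):
        for c in string.ascii_lowercase:
            candidates.add(word[:i] + c + word[i:])              # inserts
    for i in range(len(word)):
        candidates.add(word[:i] + word[i + 1:])                  # deletes
        for c in string.ascii_lowercase:
            candidates.add(word[:i] + c + word[i + 1:])          # substitutions
    for i in range(len(word) - 1):
        candidates.add(word[:i] + word[i + 1] + word[i] + word[i + 2:])  # transposes
    candidates.discard(word)
    return candidates & set(vocabulary)

print(edit1_neighbors("teh", {"the", "ten", "tea", "tech"}))
# -> {'the', 'ten', 'tea', 'tech'} (set order may vary)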
class SpellChecker():
    def __init__(self, max_distance=1, channel_model=None, language_model=None):
        self.nlp = spacy.load("en", pipeline=["tagger", "parser"])
        self.channel_model = channel_model
        self.language_model = language_model
        self.max_distance = max_distance

    def load_channel_model(self, fp):
        self.channel_model = EditDistanceFinder()
        self.channel_model.load(fp)

    def load_language_model(self, fp):
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        if self.language_model:
            return (self.language_model.bigram_prob(prev_word, focus_word) +
                    self.language_model.bigram_prob(focus_word, next_word)) / 2

    def unigram_score(self, word):
        if self.language_model:
            return self.language_model.unigram_prob(word)

    def cm_score(self, error_word, corrected_word):
        if self.channel_model:
            return self.channel_model.prob(error_word, corrected_word)

    def inserts(self, word):
        words = []
        for letter in string.ascii_lowercase:
            for i in range(len(word) + 1):
                pos = word[:i] + letter + word[i:]
                if pos in self.language_model.vocabulary:
                    words.append(pos)
        return words

    def deletes(self, word):
        words = []
        for i in range(len(word)):
            pos = word[:i] + word[i + 1:]
            if pos in self.language_model.vocabulary:
                words.append(pos)
        return words

    def substitutions(self, word):
        words = []
        for letter in string.ascii_lowercase:
            for i in range(len(word)):
                pos = word[:i] + letter + word[i + 1:]
                if pos in self.language_model.vocabulary:
                    words.append(pos)
        return words

    def transpositions(self, word):
        words = []
        for i in range(1, len(word)):
            pos = word[:i - 1] + word[i] + word[i - 1] + word[i + 1:]
            if pos in self.language_model.vocabulary:
                words.append(pos)
        return words

    def generate_candidates(self, word):
        candidates = set()
        candidates.update(self.inserts(word))
        candidates.update(self.deletes(word))
        candidates.update(self.substitutions(word))
        d = self.max_distance - 1
        while d > 0:
            for w in candidates.copy():
                candidates.update(self.inserts(w))
                candidates.update(self.deletes(w))
                candidates.update(self.substitutions(w))
            d -= 1
        return list(candidates)

    def generate_candidates_optimized(self, word):
        # Relies on an optimized_finder helper that is not included in
        # this file.
        return self.optimized_finder(word, self.max_distance)

    def score(self, prev_word, focus_word, next_word, observed_word):
        # The language score blends unigram (0.2) and bigram (0.8) log
        # probabilities; the final score blends language (0.7) and
        # channel (0.3) scores.
        lang_score = (0.2 * self.unigram_score(focus_word) +
                      0.8 * self.bigram_score(prev_word, focus_word, next_word))
        return 0.7 * lang_score + 0.3 * self.cm_score(observed_word, focus_word)

    def check_sentence(self, sentence, fallback=False):
        suggestion = []
        for i in range(len(sentence)):
            observed_word = sentence[i]
            if (observed_word.lower() in self.language_model or
                    (len(observed_word) == 1 and
                     observed_word not in string.ascii_lowercase)):
                suggestion.append([observed_word])
                continue
            prev_word = '<s>' if i == 0 else sentence[i - 1]
            next_word = '</s>' if i == len(sentence) - 1 else sentence[i + 1]
            suggested = self.generate_candidates(observed_word)
            if fallback and len(suggested) == 0:
                suggested.append(observed_word)
            suggestion.append(sorted(
                suggested,
                key=lambda e: self.score(prev_word, e, next_word, observed_word),
                reverse=True))
        return suggestion

    def get_tokens(self, sentence):
        return [x.text for x in sentence]

    def check_text(self, text, fallback=False):
        doc = self.nlp(text)
        result = []
        for sentence in doc.sents:
            tokens = self.get_tokens(sentence)
            result.append(self.check_sentence(tokens, fallback))
        return result

    def autocorrect_sentence(self, sentence):
        temp = self.check_sentence(sentence, True)
        result = []
        for token in temp:
            result.append(token[0])
        return result

    def autocorrect_line(self, line):
        doc = self.nlp(line)
        result = []
        for sentence in doc.sents:
            tokens = self.get_tokens(sentence)
            result.append(self.autocorrect_sentence(tokens))
        return '\n'.join([' '.join(sentence) for sentence in result])

    def suggest_sentence(self, sentence, max_suggestions):
        temp = self.check_sentence(sentence, True)
        result = []
        for token in temp:
            result.append(token[:max_suggestions])
        return result

    def suggest_text(self, text, max_suggestions):
        doc = self.nlp(text)
        result = []
        for sentence in doc.sents:
            tokens = self.get_tokens(sentence)
            result.append(self.suggest_sentence(tokens, max_suggestions))
        return result
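import math

# A worked example (with invented probabilities) of the weighted scoring
# in score() above: the language score mixes unigram (0.2) and bigram
# (0.8) log probabilities, and the final score mixes language (0.7) and
# channel (0.3) scores.
unigram = math.log(0.001)  # P(focus_word)
bigram = math.log(0.01)    # average of P(focus|prev) and P(next|focus)
channel = math.log(0.1)    # P(error_word | focus_word)
lang_score = 0.2 * unigram + 0.8 * bigram
print(0.7 * lang_score + 0.3 * channel)  # about -4.24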
class SpellChecker():
    def __init__(self, channel_model=None, language_model=None, max_distance=1):
        self.channel_model = channel_model
        self.language_model = language_model
        self.max_distance = max_distance
        # A spacy model is added so check_text can sentence-segment,
        # matching the other implementations in this file.
        self.nlp = spacy.load("en", pipeline=["tagger", "parser"])

    def load_channel_model(self, fp):
        self.channel_model = EditDistanceFinder()
        self.channel_model.train(fp)

    def load_language_model(self, fp):
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        return (self.language_model.bigram_prob(prev_word, focus_word) +
                self.language_model.bigram_prob(focus_word, next_word)) / 2

    def unigram_score(self, word):
        return self.language_model.unigram_prob(word)

    def cm_score(self, error_word, corrected_word):
        return self.channel_model.align(error_word, corrected_word)[0]

    @staticmethod
    def isSubstring(w1, w2):
        # True if w1 is a subsequence of w2; advance past each matched
        # letter so repeated letters are not matched twice.
        for letter in w1:
            try:
                w2 = w2[w2.index(letter) + 1:]
            except ValueError:
                return False
        return True

    def inserts(self, word):
        output = []
        for w in self.language_model.vocabulary:
            if len(w) == len(word) + 1 and self.isSubstring(word, w):
                output.append(w)
        return output

    def deletes(self, word):
        output = []
        for w in self.language_model.vocabulary:
            if len(w) == len(word) - 1 and self.isSubstring(w, word):
                output.append(w)
        return output

    def substitutions(self, word):
        output = []
        for w in self.language_model.vocabulary:
            if len(w) == len(word):
                numInc = 0
                for i in range(len(w)):
                    if w[i] != word[i]:
                        numInc += 1
                if numInc == 1:
                    output.append(w)
        return output

    def generate_candidates(self, word):
        output = [word]
        for _ in range(self.max_distance):
            newOutput = []
            for w in output:
                newOutput += (self.inserts(w) + self.deletes(w) +
                              self.substitutions(w))
            output = newOutput
        return output

    def check_non_words(self, sentence, fallback=False):
        output = []
        for word in sentence:
            if word in self.language_model:
                output.append([word])
            else:
                L = self.generate_candidates(word)
                if fallback and len(L) == 0:
                    output.append([word])
                else:
                    # Higher combined score is better, so sort best-first.
                    L.sort(key=lambda w: self.language_model.unigram_prob(w) +
                                         self.channel_model.align(word, w)[0],
                           reverse=True)
                    output.append(L)
        return output

    def check_sentence(self, sentence, fallback=False):
        return self.check_non_words(sentence, fallback)

    def check_text(self, text, fallback=False):
        # (Reconstructed: the original source is truncated here. This
        # follows the pattern of the other implementations in this file.)
        output = []
        for sent in self.nlp(text).sents:
            output.extend(self.check_sentence([str(w) for w in sent], fallback))
        return output
class SpellChecker:
    def __init__(self, max_distance, channel_model=None, language_model=None):
        self.nlp = spacy.load("en", pipeline=["tagger", "parser"])
        self.channel_model = channel_model
        self.language_model = language_model
        self.max_distance = max_distance
        self.punc = '.?:;"\'!\n,/\\'

    def load_channel_model(self, fp):
        self.channel_model = EditDistanceFinder()
        self.channel_model.load(fp)

    def load_language_model(self, fp):
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        # Returns the average log prob of the bigrams (prev, focus) and
        # (focus, next).
        return 0.5 * (self.language_model.bigram_prob(prev_word, focus_word) +
                      self.language_model.bigram_prob(focus_word, next_word))

    def unigram_score(self, word):
        # Returns the log probability of this unigram.
        return self.language_model.unigram_prob(word)

    def cm_score(self, error_word, corrected_word):
        # Returns the log probability that error_word was typed when
        # corrected_word was intended.
        return self.channel_model.prob(error_word, corrected_word)

    def inserts(self, word):
        # Returns a list of words within one insert of word: we try
        # inserting each character at each position.
        l = []
        for i in range(len(word) + 1):
            for new_char in string.ascii_lowercase:
                new_word = word[0:i] + new_char + word[i:]
                if new_word in self.language_model:
                    l.append(new_word)
        return list(set(l))

    def deletes(self, word):
        # Returns a list of words within one delete of word: we try
        # deleting each character.
        l = []
        for i in range(len(word)):
            new_word = word[0:i] + word[i + 1:]
            if new_word in self.language_model:
                l.append(new_word)
        return list(set(l))

    def substitutions(self, word):
        # Returns a list of words within one substitution of word: we try
        # substituting each character with every other character.
        l = []
        for i, char in enumerate(word):
            for new_char in string.ascii_lowercase:
                new_word = word[0:i] + new_char + word[i + 1:]
                if char != new_char and new_word in self.language_model:
                    l.append(new_word)
        return l

    def transpositions(self, word):
        # Returns a list of words within one transposition of word: we try
        # transposing each pair of adjacent characters.
        l = []
        for i in range(len(word) - 1):
            new_word = word[0:i] + word[i + 1] + word[i] + word[i + 2:]
            if new_word in self.language_model:
                l.append(new_word)
        return list(set(l))

    def generate_candidates(self, word):
        # Returns a list of words within max_distance edits of the input
        # word: first the words 1 edit away, then the words 1 edit away
        # from those, and so on.
        checked_word_list = []   # words we've already expanded
        words_to_check = [word]  # words still to expand
        word_list = []           # the final candidate list
        for _ in range(self.max_distance):
            new_words_list = []  # new words found on this iteration
            for w in words_to_check:
                if w in checked_word_list:
                    continue  # we already checked this word
                checked_word_list.append(w)
                # Try insertion/deletion/substitution/transposition.
                new_words_list.extend(self.inserts(w))
                new_words_list.extend(self.deletes(w))
                new_words_list.extend(self.substitutions(w))
                new_words_list.extend(self.transpositions(w))
            # Add the new, unique words to our word list.
            words_to_check = []
            for w in new_words_list:
                if w not in word_list:
                    word_list.append(w)
                    words_to_check.append(w)
        return word_list

    def sort_candidates(self, error_word, prev_word, next_word, candidates):
        """Take as input a spelling error and a list of candidates, and
        return a sorted list of candidates where earlier candidates are
        "better" suggestions, in terms of a weighted combination of
        unigram score, bigram score, and edit distance score. Note that
        our choice depends somewhat on the context of the word."""
        score_list = []
        for candidate in candidates:
            bigram_score = self.bigram_score(prev_word, candidate, next_word)
            unigram_score = self.unigram_score(candidate)
            edit_score = self.cm_score(error_word, candidate)
            # An equally weighted linear combination of log edit score and
            # language model score.
            score = 0.5 * edit_score + 0.25 * (bigram_score + unigram_score)
            score_list.append((candidate, score))
        # Sort so that the highest score comes first.
        sorted_list = sorted(score_list, key=lambda x: -x[1])
        return [w for w, s in sorted_list]

    def check_non_words(self, sentence, fallback=False):
        """Take as input a list of words and return a list of lists of
        words. If a word is in the language model, its list contains just
        the original word; otherwise it contains spelling suggestions. If
        fallback is true, any word with no suggestions is replaced by the
        list of just the word itself."""
        l = []
        for i, word in enumerate(sentence):
            word = word.lower()  # enforce lowercase
            if word in self.language_model or word in self.punc:
                l.append([word])  # correctly spelled word or punctuation
            else:
                candidates = self.generate_candidates(word)
                prevW = sentence[i - 1] if i > 0 else "<s>"
                nextW = sentence[i + 1] if i + 1 < len(sentence) else "</s>"
                candidates = self.sort_candidates(word, prevW, nextW,
                                                  candidates)
                if candidates or not fallback:
                    l.append(candidates)  # give candidate suggestions
                else:
                    l.append([word])  # fallback case: no candidates
        return l

    def check_sentence(self, sentence, fallback=False):
        """Take as input a list of words and return a list of lists of
        words. Correctly spelled words appear in their own list; otherwise
        a list of spelling corrections is given in order of likelihood."""
        return self.check_non_words(sentence, fallback=fallback)

    def check_line(self, line, fallback=False):
        """Take as input a string, tokenize it, and return a list of lists
        of words. Correctly spelled words appear in their own list;
        otherwise a list of spelling corrections is given in order of
        likelihood."""
        doc = self.nlp(line)  # use spacy to segment sentences
        l = []
        for sent in doc.sents:
            # Generate the sentence as a list of strings.
            sentence = [str(w) for w in sent]
            l.extend(self.check_sentence(sentence, fallback=fallback))
        return l

    def autocorrect_sentence(self, sentence):
        """Take a list of tokens and return a new list of tokens where
        each non-word has been replaced by its most likely spelling
        correction."""
        l = self.check_sentence(sentence, fallback=True)
        return [w[0] for w in l]

    def autocorrect_line(self, line):
        """Take a string as input, tokenize and segment it with spacy, and
        return the concatenation of the results of calling
        autocorrect_sentence on each of the resulting sentence objects."""
        doc = self.nlp(line)
        l = []
        for sent in doc.sents:
            sentence = [str(w) for w in sent]
            l.extend(self.autocorrect_sentence(sentence))
        return l

    def suggest_sentence(self, sentence, max_suggestions):
        """Take as input a list of words and return a list of words and
        lists of words. Correctly spelled words appear on their own;
        otherwise a list of spelling suggestions is given in order of
        likelihood."""
        suggestions = self.check_sentence(sentence, fallback=True)
        l = []
        for i in range(len(sentence)):
            if sentence[i] in self.language_model or sentence[i] in self.punc:
                l.append(sentence[i])
            else:
                l.append(suggestions[i][0:max_suggestions])
        return l

    def suggest_line(self, line, max_suggestions):
        """Take a string as input, tokenize and segment it with spacy, and
        return the concatenation of the results of calling suggest_sentence
        on each of the resulting sentence objects."""
        doc = self.nlp(line)
        l = []
        for sent in doc.sents:
            sentence = [str(w) for w in sent]
            l.extend(self.suggest_sentence(sentence, max_suggestions))
        return l
class SpellChecker():
    def __init__(self, max_distance, channel_model=None, language_model=None):
        self.nlp = spacy.load("en")
        self.channel_model = channel_model
        self.language_model = language_model
        self.max_distance = max_distance

    def load_channel_model(self, fp):
        self.channel_model = EditDistanceFinder()
        self.channel_model.load(fp)

    def load_language_model(self, fp):
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        return (self.language_model.bigram_prob(prev_word, focus_word) +
                self.language_model.bigram_prob(focus_word, next_word)) / 2

    def unigram_score(self, word):
        return self.language_model.unigram_prob(word)

    def cm_score(self, error_word, corrected_word):
        return self.channel_model.prob(error_word, corrected_word)

    def isSubstring(self, w1, w2):
        # True if w1 is a subsequence of w2.
        if w1 == "":
            return True
        elif w2 == "":
            return False
        elif w1[0] == w2[0]:
            return self.isSubstring(w1[1:], w2[1:])
        else:
            return self.isSubstring(w1, w2[1:])

    def inserts(self, word):
        output = []
        for w in self.language_model.vocabulary:
            if len(w) == len(word) + 1 and self.isSubstring(word, w):
                output.append(w)
        return output

    def deletes(self, word):
        output = []
        for w in self.language_model.vocabulary:
            if len(w) == len(word) - 1 and self.isSubstring(w, word):
                output.append(w)
        return output

    def substitutions(self, word):
        output = []
        for w in self.language_model.vocabulary:
            if len(w) == len(word):
                numInc = 0
                for i in range(len(w)):
                    if w[i] != word[i]:
                        numInc += 1
                if numInc == 1:
                    output.append(w)
        return output

    def generate_candidates(self, word):
        output = []
        newWords = [word]
        for _ in range(self.max_distance):
            checkWords = []
            for w in newWords:
                if not all(x in string.ascii_lowercase for x in w):
                    continue
                checkWords.extend(self.inserts(w))
                checkWords.extend(self.deletes(w))
                checkWords.extend(self.substitutions(w))
            output.extend(checkWords)
            newWords = checkWords
        return list(set(output))

    def sortList(self, wordList, prevWord, targetWord, nextWord):
        output = []
        for word in wordList:
            bs = self.bigram_score(prevWord, word, nextWord)
            us = self.unigram_score(word)
            cm = self.cm_score(targetWord, word)
            score = 0.5 * cm + 0.25 * bs + 0.25 * us
            output.append((word, score))
        # Higher scores are better, so sort best-first.
        output.sort(key=lambda w: w[1], reverse=True)
        return [w[0] for w in output]

    def check_non_words(self, sentence, fallback=False):
        output = []
        for i in range(len(sentence)):
            if sentence[i] in self.language_model:
                output.append([sentence[i]])
            else:
                L = self.generate_candidates(sentence[i])
                if fallback and len(L) == 0:
                    output.append([sentence[i]])
                else:
                    prevWord = sentence[i - 1] if i > 0 else "<s>"
                    nextWord = (sentence[i + 1] if i + 1 < len(sentence)
                                else "</s>")
                    # sortList returns a new list, so assign the result.
                    L = self.sortList(L, prevWord, sentence[i], nextWord)
                    output.append(L)
        return output

    def check_sentence(self, sentence, fallback=False):
        return self.check_non_words(sentence, fallback)

    def check_line(self, line, fallback=False):
        output = []
        for sent in self.nlp(line).sents:
            sentence = [str(w) for w in sent]
            output.extend(self.check_sentence(sentence, fallback=fallback))
        return output

    def check_text(self, text, fallback=False):
        output = []
        for sent in self.nlp(text).sents:
            # Convert the spacy span to a list of strings before checking.
            sentence = [str(w) for w in sent]
            output.append(self.check_sentence(sentence, fallback))
        return output

    def autocorrect_sentence(self, sentence):
        suggestions = self.check_sentence(sentence, True)
        return [word[0] for word in suggestions]

    def autocorrect_line(self, line):
        output = []
        for sent in self.nlp(line).sents:
            sentence = [str(w) for w in sent]
            output.extend(self.autocorrect_sentence(sentence))
        return output

    def suggest_sentence(self, sentence, max_suggestions):
        suggestions = self.check_sentence(sentence, True)
        output = []
        for i in range(len(sentence)):
            if sentence[i] in self.language_model:
                output.append(sentence[i])
            else:
                output.append(suggestions[i][0:max_suggestions])
        return output

    def suggest_line(self, line, max_suggestions):
        output = []
        for sent in self.nlp(line).sents:
            sentence = [str(w) for w in sent]
            output.extend(self.suggest_sentence(sentence, max_suggestions))
        return output

    def suggest_text(self, text, max_suggestions):
        output = []
        for sent in self.nlp(text).sents:
            sentence = [str(w) for w in sent]
            output.extend(self.suggest_sentence(sentence, max_suggestions))
        return output
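# A standalone copy of the recursive subsequence test above, traced on a
# small example for illustration.
def is_subsequence(w1, w2):
    if w1 == "":
        return True
    if w2 == "":
        return False
    if w1[0] == w2[0]:
        return is_subsequence(w1[1:], w2[1:])
    return is_subsequence(w1, w2[1:])

print(is_subsequence("pla", "plea"))  # True: "plea" is one insert from "pla"
print(is_subsequence("alp", "plea"))  # False: order matters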
def __score_probabilities__(self, sentence):
    # (Reconstructed header and setup: the original source is truncated
    # and begins mid-method. list_of_words holds the candidate list for
    # each position; return_list holds the matching score lists.)
    list_of_words = []
    return_list = []
    current_index = 0
    for error_word in sentence:
        words = self.generate_candidates(error_word)
        list_of_words.append(words)
        scores = []
        i = 0
        while i < len(words):
            if current_index == len(sentence) - 1:
                # End of sentence; the original used "<s>" here, but the
                # end marker should be "</s>".
                bigram_score = self.bigram_score(sentence[current_index - 1],
                                                 words[i], "</s>")
            elif current_index == 0:  # start of sentence
                bigram_score = self.bigram_score("<s>", words[i],
                                                 sentence[current_index + 1])
            else:
                bigram_score = self.bigram_score(sentence[current_index - 1],
                                                 words[i],
                                                 sentence[current_index + 1])
            scores.append(self.cm_score(error_word, words[i]) +
                          0.5 * bigram_score +
                          0.5 * self.unigram_score(words[i]))
            i += 1
        return_list.append(scores)
        current_index += 1
    return list_of_words, return_list


if __name__ == "__main__":
    lm = AddAlphaBigramModel(alpha=.1)
    lm.train()
    cm = EditDistanceFinder()
    cm.train("wikipedia_misspellings.txt")
    # Keyword arguments avoid binding cm to max_distance positionally.
    s = SpellChecker(channel_model=cm, language_model=lm, max_distance=1)

    print(s.check_sentence(["how", "are", "yoo", "sir"]))
    print()
    print(s.check_sentence(["they", "did", "not", "yb", "any", "menas"]))
    print()
    print(s.autocorrect_sentence(["they", "did", "not", "yb", "any", "menas"]))
    print()
    print(s.autocorrect_sentence(["menas"]))
    print()
    print(s.__score_probabilities__(["they", "did", "not", "yb", "any", "menas"]))
    print()
    print(s.suggest_sentence(["they", "did", "not", "yb", "any", "menas"], 3))
    print()