class UnsupervisedGrammarCorrector:
    """Unsupervised grammar/spelling corrector.

    Hill-climbs over single-token edits (spelling suggestions, inflection
    variants, determiner and preposition substitutions/deletions), keeping an
    edit only when it improves a language-model score.
    """

    def __init__(self, threshold=0.96):
        """Load all resources: LM, spaCy pipeline, Hunspell, inflection dict.

        :param threshold: weighting factor for accepting candidates
            (stored on the instance; not referenced elsewhere in this class —
            presumably used by callers or scoring variants; TODO confirm).
        """
        base_dir = os.path.dirname(os.path.realpath(__file__))
        self.lm = LanguageModel()
        # Load spaCy
        self.nlp = spacy.load("en")
        # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
        # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
        self.gb = Hunspell("en_GB-large",
                           hunspell_data_dir=base_dir + '/resources/spelling/')
        # Inflection forms: http://wordlist.aspell.net/other/
        self.gb_infl = loadWordFormDict(
            base_dir + "/resources/agid-2016.01.19/infl.txt")
        # List of common determiners; "" lets a determiner be deleted outright.
        self.determiners = {"", "the", "a", "an"}
        # List of common prepositions; "" lets a preposition be deleted outright.
        self.prepositions = {
            "", "about", "at", "by", "for", "from", "in", "of", "on", "to",
            "with"
        }
        self.threshold = threshold

    def correct(self, sentence):
        """Repeatedly apply the best single-token edit until the LM score
        stops improving, then return the resulting sentence."""
        # If the line is empty, preserve the newline in output and continue
        if not sentence:
            return ""
        best = sentence
        score = self.lm.score(best)
        while True:
            new_best, new_score = self.process(best)
            # Accept the edit only if it strictly improves the LM score;
            # otherwise we have converged.
            if new_best and new_score > score:
                best = new_best
                score = new_score
            else:
                break
        return best

    def process(self, sentence: str) -> Tuple[str, float]:
        """Generate and score every single-token candidate edit of `sentence`.

        :param sentence: the sentence to improve.
        :return: ``(best_sentence, best_score)``; the original sentence and
            its own score when no candidate beats it.
        """
        # Process sent with spacy
        proc_sent = self.nlp.tokenizer(sentence)
        self.nlp.tagger(proc_sent)
        # Calculate avg token prob of the sent so far.
        orig_prob = self.lm.score(proc_sent.text)
        # Store all the candidate corrected sentences here
        candidates = []
        # Process each token.
        for tok in proc_sent:
            # SPELLCHECKING
            # Spell check: tok must be alphabetical and not a real word.
            candidate_tokens = set()
            lower_cased_token = tok.lower_
            if lower_cased_token.isalpha() and not self.gb.spell(lower_cased_token):
                candidate_tokens |= set(self.gb.suggest(lower_cased_token))
            # MORPHOLOGY
            if tok.lemma_ in self.gb_infl:
                candidate_tokens |= self.gb_infl[tok.lemma_]
            # DETERMINERS
            if lower_cased_token in self.determiners:
                candidate_tokens |= self.determiners
            # PREPOSITIONS
            if lower_cased_token in self.prepositions:
                candidate_tokens |= self.prepositions
            # Drop any suggestion Hunspell does not recognise as a real word.
            candidate_tokens = [c for c in candidate_tokens if self.gb.spell(c)]
            if candidate_tokens:
                # Mirror the original token's casing on the replacements.
                if tok.is_title:
                    candidate_tokens = [c.title() for c in candidate_tokens]
                elif tok.is_upper:
                    candidate_tokens = [c.upper() for c in candidate_tokens]
                candidates.extend(
                    self._generate_candidates(tok.i, candidate_tokens, proc_sent))
        best_prob = orig_prob
        best = sentence
        for candidate in candidates:
            # Score the candidate sentence once and keep the highest scorer.
            cand_prob = self.lm.score(candidate.text)
            if cand_prob > best_prob:
                best_prob = cand_prob
                best = candidate.text
        # Return the best sentence together with its language-model score.
        return best, best_prob

    def _generate_candidates(self, tok_id, candidate_tokens,
                             tokenized_sentence) -> List:
        """Build re-tokenized candidate sentences by substituting the token at
        ``tok_id`` with each alternative in ``candidate_tokens``.

        An empty-string alternative deletes the token entirely. Returns a
        list of spaCy Doc objects (consumed via ``.text`` by ``process``).
        """
        # Save candidates here.
        candidates = []
        prefix = tokenized_sentence[:tok_id]
        suffix = tokenized_sentence[tok_id + 1:]
        # Loop through the input alternative candidates
        for token in candidate_tokens:
            candidate = prefix.text_with_ws
            # Empty string means the original token is dropped.
            if token:
                candidate += token + " "
            candidate += suffix.text_with_ws
            candidate = self.nlp.tokenizer(candidate)
            candidates.append(candidate)
        return candidates