Example #1
import os
from typing import List, Tuple

import spacy
from hunspell import Hunspell
from spacy.tokens import Doc

# LanguageModel (a language model wrapper exposing score()) and
# loadWordFormDict are defined elsewhere in the project.

class UnsupervisedGrammarCorrector:
    def __init__(self, threshold=0.96):
        basename = os.path.dirname(os.path.realpath(__file__))
        self.lm = LanguageModel()
        # Load spaCy
        self.nlp = spacy.load("en")
        # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
        # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
        self.gb = Hunspell("en_GB-large",
                           hunspell_data_dir=basename + '/resources/spelling/')
        # Inflection forms: http://wordlist.aspell.net/other/
        self.gb_infl = loadWordFormDict(basename +
                                        "/resources/agid-2016.01.19/infl.txt")
        # Common determiners; the empty string makes deletion a candidate
        self.determiners = {"", "the", "a", "an"}
        # Common prepositions; the empty string likewise allows deletion
        self.prepositions = {
            "", "about", "at", "by", "for", "from", "in", "of", "on", "to",
            "with"
        }
        self.threshold = threshold

    def correct(self, sentence):
        # Nothing to correct in an empty sentence
        if not sentence:
            return ""
        best = sentence
        score = self.lm.score(best)

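        # Hill-climb: repeatedly accept the best single-token edit until
        # the language model score stops improving.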
        while True:
            new_best, new_score = self.process(best)
            if new_best and new_score > score:
                best = new_best
                score = new_score
            else:
                break

        return best

    def process(self, sentence: str) -> Tuple[str, float]:
        # Tokenize and POS-tag the sentence with spaCy
        proc_sent = self.nlp.tokenizer(sentence)
        self.nlp.tagger(proc_sent)
        # Calculate the average token probability of the sentence
        orig_prob = self.lm.score(proc_sent.text)
        # Store all the candidate corrected sentences here
        candidates = []
        # Process each token.
        for tok in proc_sent:
            candidate_tokens = set()
            lower_cased_token = tok.lower_

            # SPELLCHECKING
            # Spell check: tok must be alphabetical and not a real word.
            if (lower_cased_token.isalpha()
                    and not self.gb.spell(lower_cased_token)):
                candidate_tokens |= set(self.gb.suggest(lower_cased_token))
            # MORPHOLOGY
            if tok.lemma_ in self.gb_infl:
                candidate_tokens |= self.gb_infl[tok.lemma_]
            # DETERMINERS
            if lower_cased_token in self.determiners:
                candidate_tokens |= self.determiners
            # PREPOSITIONS
            if lower_cased_token in self.prepositions:
                candidate_tokens |= self.prepositions

            # Keep the empty string (a deletion) and any candidate the
            # spellchecker accepts as a real word.
            candidate_tokens = [
                c for c in candidate_tokens if not c or self.gb.spell(c)
            ]

            if candidate_tokens:
                if tok.is_title:
                    candidate_tokens = [c.title() for c in candidate_tokens]
                elif tok.is_upper:
                    candidate_tokens = [c.upper() for c in candidate_tokens]

                candidates.extend(
                    self._generate_candidates(tok.i, candidate_tokens,
                                              proc_sent))

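        # Rerank: start from the original sentence and keep the
        # highest-scoring candidate.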
        best_prob = orig_prob
        best = sentence

        for candidate in candidates:
            # Score the candidate sentence
            cand_prob = self.lm.score(candidate.text)

            # Keep the candidate if it beats the current best score
            if cand_prob > best_prob:
                best_prob = cand_prob
                best = candidate.text
        # Return the best sentence and its score; the caller keeps
        # searching for more errors while the score improves.
        return best, best_prob

    def _generate_candidates(self, tok_id, candidate_tokens,
                             tokenized_sentence) -> List[Doc]:
        # Save candidates here.
        candidates = []

        prefix = tokenized_sentence[:tok_id]
        suffix = tokenized_sentence[tok_id + 1:]
        # Build one candidate sentence per alternative token; an empty
        # token deletes the original token.
        for token in candidate_tokens:
            candidate = prefix.text_with_ws
            if token:
                candidate += token + " "
            candidate += suffix.text_with_ws
            candidate = self.nlp.tokenizer(candidate)
            candidates.append(candidate)
        return candidates
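
A minimal usage sketch. Assumptions not shown in the listing: LanguageModel.score() returns a number where higher is better, and the resources/ directory sits next to the script, as the constructor expects.

if __name__ == "__main__":
    import sys

    corrector = UnsupervisedGrammarCorrector()
    # Correct each line of stdin independently; empty lines pass through.
    for line in sys.stdin:
        print(corrector.correct(line.rstrip("\n")))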