示例#1
0
    def _correct_word(self, word):
        """
        Suggest words for given word, follow below steps:
            - check if length of word is eligible for correction from min max length allowed from config,
                else return None
            - if word belong to domain word correction (created for words which occur less in domain words and are
                replaced with most similar words, either from domain or global), return suggested word from dict
            - if word exists in global dictionary, return None
            - if word exists in domain dictionary, return None
            - get suggestions from domain symspell & phoneme
            - if not suggestions from domain, get suggestions from global symspell and phoneme
            - filter top 5 suggestions
            - also add top 3 suggestions from global data up to min edit distance of above dict
            - return final suggestions
        Args:
            word (str): word to be corrected
        Returns:
            (list): list of suggested words
        """
        if ((self.config.min_length_for_spellcorrection > len(word))
                or (len(word) > self.config.max_length_for_spellcorrection)
                or (re.search(r'\d', word))):
            return None

        phoneme = self.phoneme_model.spell_correct(word)
        if phoneme.is_correct:
            return None
        phoneme_suggestions = phoneme.suggestions

        symspell = self.symspell_model.spell_correct(word)
        if symspell.is_correct:
            return None
        symspell_suggestions = symspell.suggestions

        logger.debug(f"Symspell suggestions: {symspell_suggestions}")
        logger.debug(f"Phoneme suggestions: {phoneme_suggestions}")

        phoneme_suggestions = [(word, edit_distance, 0)
                               for (word, edit_distance) in phoneme_suggestions
                               ]
        symspell_suggestions = [(word, edit_distance, 1)
                                for (word,
                                     edit_distance) in symspell_suggestions]

        suggestions = list(phoneme_suggestions + symspell_suggestions)
        suggestions.sort(key=operator.itemgetter(1, 2))

        # filtering duplicate suggestions
        unique_suggestions = []
        for suggestion in suggestions:
            if suggestion[0] not in unique_suggestions:
                unique_suggestions.append(suggestion[0])

        suggestions = unique_suggestions

        return suggestions
示例#2
0
    def train(self, data: Union[List[str], Dict[str, int]], **kwargs):
        """
        Train all models of spellcorrection.

        If `data` is a list of texts, the texts are cleaned and lowercased, the context model is
        trained on them and word counts are obtained by whitespace tokenisation. If `data` is a
        dict, it is taken as the word-count table (keys are lowercased; counts of keys that
        become equal after lowercasing are summed) and the context model is not trained.
        Words shorter than config.min_length_for_spellcorrection are dropped, then the symspell
        and phoneme models are trained on the remaining word counts.
        Args:
            data (list|dict): list of text or dict having word and their count
            **kwargs: accepted for interface compatibility; not used by this method
        Returns:
            None
        Raises:
            ValueError: if `data` is neither a list nor a dict
        """

        logger.debug("Spello training started..")

        if not isinstance(data, (list, dict)):
            raise ValueError(
                'Argument `data` should be either List[str] or Dict[str, int]')

        words_counter = Counter()
        if isinstance(data, list):
            lower_case_texts = [get_clean_text(text).lower() for text in data]

            logger.debug("Context model training started ...")
            # Context model get trained only when list of text are given for training
            # train context model: find most probable correct word for given suggestions for each word in texts
            # based on context word
            self.context_train(lower_case_texts)

            for lower_case_text in lower_case_texts:
                words_counter.update(lower_case_text.split())
        else:
            # Accumulate instead of overwriting: "Word" and "word" map to the same key and
            # their counts must add up rather than the last one winning.
            for word, count in data.items():
                words_counter[word.lower()] += count

        # Keep only words long enough to be eligible for spell correction.
        min_length = self.config.min_length_for_spellcorrection
        words_counter = {
            word: count
            for word, count in words_counter.items()
            if len(word) >= min_length
        }

        logger.debug("Symspell training started ...")
        # train symspell model: give suggestion based on edit distance
        self.symspell_train(words_counter)

        logger.debug("Phoneme training started ...")
        # train phoneme model: give suggestion for similar sounding words
        self.phoneme_train(words_counter)

        logger.debug("Spello training completed successfully ...")
示例#3
0
    def spell_correct(self, text: str, verbose=0) -> Dict[str, Any]:
        """
        Get spell corrected text, and dict of token level suggestion.

        Every distinct whitespace-separated token of `text` is lowercased and passed to
        `_correct_word`; tokens that receive suggestions are collected and handed, together
        with the text, to `context_suggestion`, which yields the corrected text and the
        corrections that were applied.
        Args:
            text (str): text to be spell corrected; it is used as given (no cleaning is applied)
            verbose (int): logging verbosity while correcting: 0 -> CRITICAL, 1 -> ERROR,
                2 -> INFO, 3 or more -> DEBUG; negative values are treated as 0
        Returns:
            (dict): original text, spell corrected text and dict of token level corrections

        Examples:
            >>> text = 'i wnt to play kricket'
            >>> SpellCorrectionModel().spell_correct(text)
            {
                'original_text': 'i wnt to play kricket',
                'corrected_text': 'i want to play cricket',
                'correction_dict': {'wnt': 'want', 'kricket': 'cricket'}
            }
        """
        levels = [logging.CRITICAL, logging.ERROR, logging.INFO, logging.DEBUG]
        # Clamp on both sides: a negative value would otherwise index `levels` from the end
        # (e.g. -1 would silently select DEBUG).
        verbosity = max(0, min(verbose, len(levels) - 1))
        with loglevel(levels[verbosity]):
            # The text is deliberately not passed through get_clean_text here.
            clean_text = text

            # NOTE(review): suggestions are keyed by the lowercased token while `clean_text`
            # keeps its original casing -- confirm context_suggestion accounts for that.
            # dict.fromkeys keeps first-seen order and lets each repeated token be looked up once.
            lowercase_tokens = dict.fromkeys(
                token.lower() for token in clean_text.split())

            suggestions_dict = {}
            for lowercase_token in lowercase_tokens:
                token_suggestion = self._correct_word(lowercase_token)
                # None (not eligible / already correct) and [] (no candidates) are both skipped
                if token_suggestion:
                    suggestions_dict[lowercase_token] = token_suggestion

            logger.debug(
                f"Suggestions dict from phoneme and symspell are: {suggestions_dict}"
            )

            context_corrected_text, context_corrections = self.context_suggestion(
                clean_text, suggestions_dict)

            logger.debug(f"text after context model: {context_corrected_text}")

            spellcorrection_result = {
                ORIGINAL_TEXT: text,
                CORRECTED_TEXT: context_corrected_text,
                CORRECTIONS_DICT: context_corrections
            }

            logger.debug(f"Spell-correction Results {spellcorrection_result}")

        return spellcorrection_result