# Module-level imports needed by the methods in this section.
# (`get_clean_text`, `context_train`, the symspell/phoneme models and the
# ORIGINAL_TEXT / CORRECTED_TEXT / CORRECTIONS_DICT keys are defined
# elsewhere in the package.)
import logging
import operator
import re
from collections import Counter
from typing import Any, Dict, List, Union

logger = logging.getLogger(__name__)


def _correct_word(self, word):
    """
    Suggest corrections for the given word, following these steps:
        - if the word's length falls outside the min/max bounds allowed by
          the config, or the word contains a digit, return None
        - get suggestions from the phoneme model; if it marks the word as
          already correct, return None
        - get suggestions from the symspell model; if it marks the word as
          already correct, return None
        - merge both suggestion lists, sort them by edit distance (phoneme
          suggestions win ties), drop duplicates and return

    Args:
        word (str): word to be corrected
    Returns:
        (list): list of suggested words, or None if no correction is needed
    """
    if ((self.config.min_length_for_spellcorrection > len(word))
            or (len(word) > self.config.max_length_for_spellcorrection)
            or re.search(r'\d', word)):
        return None

    phoneme = self.phoneme_model.spell_correct(word)
    if phoneme.is_correct:
        return None
    phoneme_suggestions = phoneme.suggestions

    symspell = self.symspell_model.spell_correct(word)
    if symspell.is_correct:
        return None
    symspell_suggestions = symspell.suggestions

    logger.debug(f"Symspell suggestions: {symspell_suggestions}")
    logger.debug(f"Phoneme suggestions: {phoneme_suggestions}")

    # Tag each suggestion with its source (0 = phoneme, 1 = symspell) so that
    # sorting by (edit_distance, source) prefers phoneme matches on ties.
    phoneme_suggestions = [(suggested_word, edit_distance, 0)
                           for (suggested_word, edit_distance) in phoneme_suggestions]
    symspell_suggestions = [(suggested_word, edit_distance, 1)
                            for (suggested_word, edit_distance) in symspell_suggestions]
    suggestions = phoneme_suggestions + symspell_suggestions
    suggestions.sort(key=operator.itemgetter(1, 2))

    # filter duplicate suggestions, keeping the best-ranked occurrence of each word
    unique_suggestions = []
    for suggestion in suggestions:
        if suggestion[0] not in unique_suggestions:
            unique_suggestions.append(suggestion[0])
    return unique_suggestions
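
# Minimal standalone sketch of the merge-and-dedupe step in `_correct_word`
# above, using hypothetical sample suggestions (not output from a trained
# model): triples are sorted by (edit_distance, source), so phoneme matches
# (source=0) win ties, and the first occurrence of each word is kept.
def _merge_and_dedupe_example():
    phoneme_suggestions = [("cricket", 1), ("bracket", 3)]
    symspell_suggestions = [("cricket", 1), ("wicket", 2)]
    suggestions = ([(w, d, 0) for w, d in phoneme_suggestions]
                   + [(w, d, 1) for w, d in symspell_suggestions])
    suggestions.sort(key=operator.itemgetter(1, 2))
    unique_suggestions = []
    for suggested_word, _, _ in suggestions:
        if suggested_word not in unique_suggestions:
            unique_suggestions.append(suggested_word)
    return unique_suggestions  # ['cricket', 'wicket', 'bracket']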
def train(self, data: Union[List[str], Dict[str, int]], **kwargs):
    """
    Train all models of spellcorrection

    Args:
        data (list|dict): list of texts, or dict mapping each word to its count
    Returns:
        None
    """
    logger.debug("Spello training started ...")
    words_counter = {}
    if not isinstance(data, (list, dict)):
        raise ValueError('Argument `data` should be either List[str] or Dict[str, int]')
    if isinstance(data, list):
        texts = [get_clean_text(text) for text in data]
        lower_case_texts = [text.lower() for text in texts]

        logger.debug("Context model training started ...")
        # The context model is trained only when a list of texts is given:
        # it learns, for each word, the most probable correction among the
        # suggestions, based on the surrounding context words.
        self.context_train(lower_case_texts)

        tokens = " ".join(lower_case_texts).strip().split()
        words_counter = dict(Counter(tokens))
    elif isinstance(data, dict):
        words_counter = {word.lower(): count for word, count in data.items()}

    # drop words that are too short to ever be considered for correction
    words_counter = {
        word: count for word, count in words_counter.items()
        if len(word) >= self.config.min_length_for_spellcorrection
    }

    logger.debug("Symspell training started ...")
    # train symspell model: gives suggestions based on edit distance
    self.symspell_train(words_counter)

    logger.debug("Phoneme training started ...")
    # train phoneme model: gives suggestions for similar-sounding words
    self.phoneme_train(words_counter)

    logger.debug("Spello training completed successfully ...")
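
# Usage sketch for `train`, showing both accepted shapes of `data`.
# `SpellCorrectionModel` is the class these methods belong to (see the
# docstring example in `spell_correct` below); the training texts and word
# counts here are made-up sample data.
def _train_usage_example():
    model = SpellCorrectionModel()
    # 1. List[str]: also trains the context model, then derives word counts
    model.train(["i want to play cricket", "cricket is fun to watch"])
    # 2. Dict[str, int]: word -> frequency; the context model is skipped
    model.train({"cricket": 120, "want": 80, "play": 95})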
def spell_correct(self, text: str, verbose=0) -> Dict[str, Any]:
    """
    Get spell-corrected text and a dict of token-level corrections

    Args:
        text (str): text to be spell-corrected
        verbose (int): verbosity level; higher values log more detail
    Returns:
        (dict): dict containing the original text, the corrected text and
            a dict of token-level corrections
    Examples:
        >>> text = 'i wnt to play kricket'
        >>> SpellCorrectionModel().spell_correct(text)
        {
            'original_text': 'i wnt to play kricket',
            'corrected_text': 'i want to play cricket',
            'correction_dict': {'wnt': 'want', 'kricket': 'cricket'}
        }
    """
    levels = [logging.CRITICAL, logging.ERROR, logging.INFO, logging.DEBUG]
    verbosity = min(verbose, len(levels) - 1)
    with loglevel(levels[verbosity]):
        spellcorrection_result = {
            ORIGINAL_TEXT: text,
            CORRECTED_TEXT: text,
            CORRECTIONS_DICT: {}
        }
        suggestions_dict = {}
        # clean_text = get_clean_text(text)
        clean_text = text
        tokens = clean_text.split()
        for token in tokens:
            lowercase_token = token.lower()
            token_suggestion = self._correct_word(lowercase_token)
            if token_suggestion:
                suggestions_dict[lowercase_token] = token_suggestion
        logger.debug(f"Suggestions dict from phoneme and symspell are: {suggestions_dict}")

        # let the context model pick the best suggestion for each token
        context_corrected_text, context_corrections = self.context_suggestion(
            clean_text, suggestions_dict)
        logger.debug(f"text after context model: {context_corrected_text}")

        spellcorrection_result[CORRECTED_TEXT] = context_corrected_text
        spellcorrection_result[CORRECTIONS_DICT] = context_corrections
        logger.debug(f"Spell-correction Results {spellcorrection_result}")
        return spellcorrection_result
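
# `loglevel` is used in `spell_correct` above but not defined in this
# section. Below is a minimal sketch of what such a context manager could
# look like (an assumption, not the package's actual implementation): it
# temporarily overrides the module logger's level and restores the previous
# level on exit, so higher `verbose` values surface more detailed logs.
from contextlib import contextmanager

@contextmanager
def loglevel(level: int):
    previous_level = logger.level
    logger.setLevel(level)
    try:
        yield
    finally:
        logger.setLevel(previous_level)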