def test_suggestions(self):
    """Test that automatic spelling corrections respect the `topn` limit

    Every query read from the 'noisy' field of `self.samples` is corrected
    once with ``topn=1`` and once with ``topn=5``; the number of returned
    candidates must never exceed the requested limit.
    """
    spell_checker = HunSpelling(self.dic, self.aff)
    for topn in (1, 5):
        # stream_field is called again for each limit, as in a fresh pass
        for query in json_controller.stream_field(self.samples, 'noisy'):
            candidates = spell_checker.correct(query, topn=topn)
            # assertLessEqual reports both compared values on failure,
            # whereas assertTrue(a <= b) only reports "False is not true"
            self.assertLessEqual(len(candidates), topn)
class B1Correction:
    """
    Baseline 1 for automatic correction of spelling errors

    The pipeline chains three components:
    - spacy, for tokenization and named-entity detection
    - hunspell, to detect isolated non-word spelling errors and to suggest
      candidate corrections
    - a n-gram language model, to rerank the candidates
    """

    def __init__(self):
        """Create the baseline; every component starts unloaded (None)"""
        self.nlp = self.hunspell = self.ngram = None
        self.logger = logging.getLogger(__name__)

    def load_spacy(self, nlp_model, disable=None):
        """Build the spacy NLP loader and keep it in `self.nlp`"""
        pipeline = SpacyLoader(nlp_model, disable=disable)
        self.nlp = pipeline
        self.logger.info('Loaded spacy NLP model')

    def load_hunspell(self, dic_file, aff_file, extra_dic=None):
        """Build the hunspell checker and keep it in `self.hunspell`"""
        checker = HunSpelling(dic_file, aff_file, extra_dic=extra_dic)
        self.hunspell = checker
        self.logger.info('Loaded hunspell checker')

    def load_ngram(self, ngram_model, **kwargs):
        """Build the n-gram language model and keep it in `self.ngram`

        Extra keyword arguments are forwarded unchanged to `LanguageModel`.
        """
        model = LanguageModel(ngram_model, **kwargs)
        self.ngram = model
        self.logger.info('Loaded n-gram language model')

    def correct(self, query, topn=5):
        """Return at most `topn` candidate corrections for the given query"""
        # split the query; `flags` marks the tokens the spellchecker must skip
        tokens, flags = self.nlp.split_and_flag(query)
        # ask hunspell for its correction suggestions
        suggestions = self.hunspell.correct(tokens, ignore=flags)
        # let the n-gram language model decide the final ordering
        ranked = self.ngram.order_sequences(suggestions)
        # post-processing: drop the spaces surrounding punctuation marks
        cleaned = list(map(str_utils.remove_spaces_apostrophes, ranked))
        return cleaned[:topn]
def test_eval_suggestions(self):
    """Score the top-1 automatic corrections against the clean references"""
    checker = HunSpelling(self.dic, self.aff)
    predicted, expected = [], []
    # gather the top 1 candidates together with the matching gold correction
    for noisy, clean in json_controller.stream(self.samples, 'noisy', 'clean'):
        predicted.append(checker.correct(noisy, topn=1))
        expected.append(clean)
    scorer = Evaluation()
    scorer.load_from_lists(predicted, expected)
    # 1 / 300 correct suggestions ('méthode de recherche en histoire')
    self.assertEqual(scorer.performance(1), (0.33, 0.33, 0.33))
def load_hunspell(self, dic_file, aff_file, extra_dic=None):
    """Build the hunspell checker and keep it in `self.hunspell`

    `dic_file`, `aff_file` and the optional `extra_dic` are handed over
    unchanged to `HunSpelling`; an info message is logged afterwards.
    """
    checker = HunSpelling(dic_file, aff_file, extra_dic=extra_dic)
    self.hunspell = checker
    self.logger.info('Loaded hunspell checker')
def test_new_dict(self):
    """Test adding an extra dictionary to an already built spell checker"""
    spell_checker = HunSpelling(self.dic, self.aff)
    spell_checker.add_extra_dictionary(self.dic)
    # The previous assertFalse(None, spell_checker) only checked that None is
    # falsy (the checker was passed as the failure message), so it could
    # never fail; assert on the checker itself instead.
    self.assertIsNotNone(spell_checker)
def test_spellings(self):
    """Check misspelling detection before and after editing the word list"""
    checker = HunSpelling(self.dic, self.aff)
    vocabulary = ['aide', 'brut', 'pourquoi', 'calcul', 'ville']
    flagged = list(map(checker.is_misspelled, vocabulary))
    # 'pourquoi' and 'ville' are the two words expected to be reported
    self.assertEqual([False, False, True, False, True], flagged)

    # deliberately call add_words with None instead of a list of words
    checker.add_words(None)

    # register the two reported words, then check they are now accepted
    extra_words = ['pourquoi', 'ville']
    checker.add_words(extra_words)
    flagged = list(map(checker.is_misspelled, extra_words))
    self.assertEqual([False, False], flagged)

    # take the words out again (None first), then check they are reported
    checker.remove_words(None)
    checker.remove_words(extra_words)
    flagged = list(map(checker.is_misspelled, extra_words))
    self.assertEqual([True, True], flagged)
def test_extra_load(self):
    """Test building a spell checker with an extra dictionary"""
    # self.dic is passed a second time as the third (extra dictionary) argument
    spell_checker = HunSpelling(self.dic, self.aff, self.dic)
    # The previous assertFalse(None, spell_checker) only checked that None is
    # falsy (the checker was passed as the failure message), so it could
    # never fail; assert on the checker itself instead.
    self.assertIsNotNone(spell_checker)