def test_pickle_invalid(self):
    """load_pickle must reject pickles with a wrong or absent data_version."""
    pickle_path = os.path.join(self.fortests_path, "dictionary.pickle")
    is_compressed = False
    edit_distance_max = 2
    prefix_length = 7
    sym_spell = SymSpell(edit_distance_max, prefix_length)

    # First payload carries a bogus version, second omits the key entirely;
    # both must be refused by load_pickle.
    bad_payloads = (
        {"deletes": {}, "words": {}, "max_length": 0, "data_version": -1},
        {"deletes": {}, "words": {}, "max_length": 0},
    )
    for payload in bad_payloads:
        with open(pickle_path, "wb") as handle:
            pickle.dump(payload, handle)
        self.assertFalse(sym_spell.load_pickle(pickle_path, is_compressed))
        os.remove(pickle_path)
def test_pickle_bytes(self, symspell_default_load):
    """Loading pickle bytes saved from an instance with different settings
    logs a warning and overwrites the target instance's settings and data."""
    sym_spell, _ = symspell_default_load
    sym_spell_2 = SymSpell(123, 456, 789)

    # Precondition: the two instances really were built with different settings.
    assert sym_spell._count_threshold != sym_spell_2._count_threshold
    assert (
        sym_spell._max_dictionary_edit_distance
        != sym_spell_2._max_dictionary_edit_distance
    )
    assert sym_spell._prefix_length != sym_spell_2._prefix_length

    with TestCase.assertLogs("symspellpy.symspellpy.logger", level="WARNING") as cm:
        sym_spell_2.load_pickle(sym_spell.save_pickle(to_bytes=True), from_bytes=True)
    expected_warning = (
        "Loading data which was created using different ('count_threshold', "
        "'max_dictionary_edit_distance', 'prefix_length') settings. Overwriting "
        "current SymSpell instance with loaded settings ..."
    )
    assert cm.records[0].getMessage() == expected_warning

    # After loading, all data and settings must match the source instance.
    assert sym_spell.below_threshold_words == sym_spell_2.below_threshold_words
    assert sym_spell.bigrams == sym_spell_2.bigrams
    assert sym_spell.deletes == sym_spell_2.deletes
    assert sym_spell.words == sym_spell_2.words
    assert sym_spell._max_length == sym_spell_2._max_length
    assert sym_spell._count_threshold == sym_spell_2._count_threshold
    assert (
        sym_spell._max_dictionary_edit_distance
        == sym_spell_2._max_dictionary_edit_distance
    )
    assert sym_spell._prefix_length == sym_spell_2._prefix_length
def test_pickle_compressed(self):
    """A dictionary saved as a (default, compressed) pickle reloads intact."""
    pickle_path = os.path.join(self.fortests_path, "dictionary.pickle")
    edit_distance_max = 2
    prefix_length = 7

    # Build and populate the source instance, then persist it.
    source = SymSpell(edit_distance_max, prefix_length)
    source.load_dictionary(self.dictionary_path, 0, 1)
    source.save_pickle(pickle_path)

    # A fresh instance loaded from the pickle must carry identical data.
    reloaded = SymSpell(edit_distance_max, prefix_length)
    reloaded.load_pickle(pickle_path)
    self.assertEqual(source.deletes, reloaded.deletes)
    self.assertEqual(source.words, reloaded.words)
    self.assertEqual(source._max_length, reloaded._max_length)

    os.remove(pickle_path)
def test_pickle_same_settings(self, pickle_path, symspell_default_load, is_compressed):
    """Save/load round-trip onto a default-constructed SymSpell preserves
    all dictionaries and settings."""
    sym_spell, _ = symspell_default_load
    sym_spell.save_pickle(pickle_path, is_compressed)

    restored = SymSpell()
    restored.load_pickle(pickle_path, is_compressed)

    # Data structures survive the round trip unchanged.
    assert sym_spell.below_threshold_words == restored.below_threshold_words
    assert sym_spell.bigrams == restored.bigrams
    assert sym_spell.deletes == restored.deletes
    assert sym_spell.words == restored.words

    # Settings are taken from the pickle, matching the source instance.
    assert sym_spell._max_length == restored._max_length
    assert sym_spell._count_threshold == restored._count_threshold
    assert (
        sym_spell._max_dictionary_edit_distance
        == restored._max_dictionary_edit_distance
    )
    assert sym_spell._prefix_length == restored._prefix_length

    os.remove(pickle_path)
def init_sym_spell():
    """Build a SymSpell instance, preferring a cached pickled dictionary.

    On a cache miss, loads the bundled frequency dictionary, merges in the
    scowl word list (new words get the minimum observed count), and pickles
    the result so subsequent runs start fast.
    """
    from pathlib import Path
    from symspellpy import SymSpell
    from ds_tools.fs.paths import get_user_cache_dir

    sym_spell = SymSpell(max_dictionary_edit_distance=0, prefix_length=1)
    dict_path_pkl = Path(get_user_cache_dir('music_manager')).joinpath('words.pkl.gz')

    # Fast path: a previously built dictionary is already cached on disk.
    if dict_path_pkl.exists():
        log.debug(f'Loading pickled spellcheck dictionary: {dict_path_pkl}')
        sym_spell.load_pickle(dict_path_pkl)
        return sym_spell

    import lzma
    import pkg_resources

    dict_path = pkg_resources.resource_filename(
        'symspellpy', 'frequency_dictionary_en_82_765.txt')
    sym_spell.load_dictionary(dict_path, 0, 1)

    word_list_path_xz = Path(
        pkg_resources.resource_filename('music', '../../etc/scowl/words.xz')
    ).resolve()
    log.debug(f'Loading default dictionary + word list from {word_list_path_xz}')
    with lzma.open(word_list_path_xz, 'rt', encoding='utf-8') as f:
        word_list = f.read().splitlines()

    # Add any scowl words the frequency dictionary lacks, using the smallest
    # existing count so they rank below every genuine frequency entry.
    loaded = sym_spell._words
    min_count = min(loaded.values())
    add_word = sym_spell.create_dictionary_entry
    for word in word_list:
        try:
            loaded[word]
        except KeyError:
            add_word(word, min_count)

    fmt = 'Saving pickled spellcheck dictionary (this is a one-time action that may take about 15 seconds): {}'
    log.info(fmt.format(dict_path_pkl))
    sym_spell.save_pickle(dict_path_pkl)
    return sym_spell
class spellchecker:
    """Thin convenience wrapper around a :class:`SymSpell` instance.

    Builds the underlying dictionary either from a pickle (fast path) or
    from unigram/bigram frequency files, and exposes lookup helpers that
    return uniform ``{'original_term': ..., 'suggestions': ...}`` dicts.
    """

    def __init__(
        self,
        max_dictionary_edit_distance,
        prefix_length,
        unigram_freq_file,
        bigram_freq_file=None,
        pickle_file=None,
    ):
        """Create the SymSpell engine and load its dictionaries.

        Args:
            max_dictionary_edit_distance: maximum edit distance used when
                pre-computing deletes.
            prefix_length: prefix length for the delete index.
            unigram_freq_file: term/count file (term_index=0, count_index=1);
                ignored when ``pickle_file`` is given.
            bigram_freq_file: optional bigram file (term_index=0, count_index=2).
            pickle_file: optional pre-built pickle; when given, it is loaded
                instead of the frequency files.
        """
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=max_dictionary_edit_distance,
            prefix_length=prefix_length,
        )
        if pickle_file is not None:
            self.sym_spell.load_pickle(pickle_file)
        else:
            self.sym_spell.load_dictionary(
                unigram_freq_file,
                term_index=0,
                count_index=1,
                encoding="utf-8",
            )
            if bigram_freq_file:
                self.sym_spell.load_bigram_dictionary(
                    bigram_freq_file,
                    term_index=0,
                    count_index=2,
                    encoding="utf-8",
                )

    @staticmethod
    def _effective_edit_dist(max_edit_dist):
        # Shared default resolution; `is None` (not `== None`) per PEP 8.
        return DEFAULT_MAX_EDIT_DISTANCE if max_edit_dist is None else max_edit_dist

    def suggest(
        self,
        word,
        max_edit_dist=None,
        include_unknown=True,
        verbosity=Verbosity.CLOSEST,
    ):
        """Spell-check a single word.

        Returns:
            dict with the original term and SymSpell lookup suggestions.
        """
        suggestions = self.sym_spell.lookup(
            word,
            verbosity,
            max_edit_distance=self._effective_edit_dist(max_edit_dist),
            include_unknown=include_unknown,
        )
        return {
            'original_term': word,
            'suggestions': suggestions,
        }

    def suggest_compound(
        self,
        phrase,
        max_edit_dist=None,
    ):
        """Spell-check a multi-word phrase via lookup_compound.

        NOTE: lookup_compound also accepts ignore_non_words /
        split_phrase_by_space keywords; defaults are used here.

        Returns:
            dict with the original phrase and compound suggestions.
        """
        suggestions = self.sym_spell.lookup_compound(
            phrase,
            max_edit_distance=self._effective_edit_dist(max_edit_dist),
        )
        return {
            'original_term': phrase,
            'suggestions': suggestions,
        }

    def tokenize(self, phrases):
        """Split ``phrases`` into tokens using the project tokenizer."""
        return tokenize_sentence(phrases)

    # Tokenize into individual phrases and return a list of suggestions for each
    def suggest_tokenize(
        self,
        phrases,
        max_edit_dist=None,
        include_unknown=True,
        verbosity=Verbosity.CLOSEST,
    ):
        """Tokenize ``phrases`` and spell-check each token individually.

        Returns:
            list of per-token result dicts, in token order.
        """
        effective_dist = self._effective_edit_dist(max_edit_dist)
        sentence_suggestions = []
        for word in self.tokenize(phrases):
            suggestions = self.sym_spell.lookup(
                word,
                verbosity,
                max_edit_distance=effective_dist,
                include_unknown=include_unknown,
            )
            sentence_suggestions.append({
                'original_term': word,
                'suggestions': suggestions,
            })
        return sentence_suggestions