Пример #1
0
    def test_pickle_invalid(self):
        """load_pickle must reject pickles whose version stamp is wrong or absent."""
        pickle_path = os.path.join(self.fortests_path, "dictionary.pickle")
        is_compressed = False
        sym_spell = SymSpell(2, 7)

        # Two invalid payloads: first a wrong data_version, then none at all.
        bad_payloads = (
            {"deletes": {}, "words": {}, "max_length": 0, "data_version": -1},
            {"deletes": {}, "words": {}, "max_length": 0},
        )
        for payload in bad_payloads:
            with open(pickle_path, "wb") as f:
                pickle.dump(payload, f)
            self.assertFalse(sym_spell.load_pickle(pickle_path, is_compressed))
            os.remove(pickle_path)
Пример #2
0
    def test_pickle_bytes(self, symspell_default_load):
        """Round-tripping through in-memory bytes must overwrite differing settings."""
        sym_spell, _ = symspell_default_load
        sym_spell_2 = SymSpell(123, 456, 789)

        # Sanity check: the two instances start out with different settings.
        settings = ("_count_threshold", "_max_dictionary_edit_distance",
                    "_prefix_length")
        for attr in settings:
            assert getattr(sym_spell, attr) != getattr(sym_spell_2, attr)

        with TestCase.assertLogs("symspellpy.symspellpy.logger",
                                 level="WARNING") as cm:
            sym_spell_2.load_pickle(sym_spell.save_pickle(to_bytes=True),
                                    from_bytes=True)
        expected_msg = (
            "Loading data which was created using different ('count_threshold', "
            "'max_dictionary_edit_distance', 'prefix_length') settings. Overwriting "
            "current SymSpell instance with loaded settings ..."
        )
        assert cm.records[0].getMessage() == expected_msg

        # After loading, every piece of state must match the source instance.
        state = ("below_threshold_words", "bigrams", "deletes", "words",
                 "_max_length") + settings
        for attr in state:
            assert getattr(sym_spell, attr) == getattr(sym_spell_2, attr)
Пример #3
0
    def test_pickle_compressed(self):
        """Saving and reloading a default (compressed) pickle must preserve state."""
        pickle_path = os.path.join(self.fortests_path, "dictionary.pickle")
        sym_spell = SymSpell(2, 7)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)
        sym_spell.save_pickle(pickle_path)

        sym_spell_2 = SymSpell(2, 7)
        sym_spell_2.load_pickle(pickle_path)
        # The reloaded instance must carry the same dictionary state.
        for attr in ("deletes", "words", "_max_length"):
            self.assertEqual(getattr(sym_spell, attr), getattr(sym_spell_2, attr))
        os.remove(pickle_path)
Пример #4
0
    def test_pickle_same_settings(self, pickle_path, symspell_default_load,
                                  is_compressed):
        """A save/load round trip with matching settings must preserve all state."""
        sym_spell, _ = symspell_default_load
        sym_spell.save_pickle(pickle_path, is_compressed)

        sym_spell_2 = SymSpell()
        sym_spell_2.load_pickle(pickle_path, is_compressed)

        # Compare every state attribute between the original and the reload.
        for attr in ("below_threshold_words", "bigrams", "deletes", "words",
                     "_max_length", "_count_threshold",
                     "_max_dictionary_edit_distance", "_prefix_length"):
            assert getattr(sym_spell, attr) == getattr(sym_spell_2, attr)
        os.remove(pickle_path)
Пример #5
0
def init_sym_spell():
    """Build (or load from cache) a SymSpell spellchecker instance.

    On the first call the default English frequency dictionary bundled with
    symspellpy is merged with the SCOWL word list shipped with the ``music``
    package, and the result is pickled into the user cache directory.  Later
    calls only pay the (much faster) unpickle cost.

    Returns:
        SymSpell: a ready-to-use spellchecker instance.
    """
    from pathlib import Path
    from symspellpy import SymSpell
    from ds_tools.fs.paths import get_user_cache_dir

    sym_spell = SymSpell(max_dictionary_edit_distance=0, prefix_length=1)
    dict_path_pkl = Path(
        get_user_cache_dir('music_manager')).joinpath('words.pkl.gz')
    if dict_path_pkl.exists():
        log.debug(f'Loading pickled spellcheck dictionary: {dict_path_pkl}')
        sym_spell.load_pickle(dict_path_pkl)
    else:
        import lzma
        import pkg_resources

        dict_path = pkg_resources.resource_filename(
            'symspellpy', 'frequency_dictionary_en_82_765.txt')
        sym_spell.load_dictionary(dict_path, 0, 1)
        word_list_path_xz = Path(
            pkg_resources.resource_filename(
                'music', '../../etc/scowl/words.xz')).resolve()
        log.debug(
            f'Loading default dictionary + word list from {word_list_path_xz}')
        with lzma.open(word_list_path_xz, 'rt', encoding='utf-8') as f:
            word_list = f.read().splitlines()

        # Public accessor instead of the private ``_words`` attribute.
        loaded = sym_spell.words
        # ``default=1`` guards the (unexpected) empty-dictionary case.
        min_count = min(loaded.values(), default=1)
        add_word = sym_spell.create_dictionary_entry
        for word in word_list:
            # Membership test instead of a per-item try/except KeyError:
            # cheaper in the common already-present case and clearer in intent.
            if word not in loaded:
                add_word(word, min_count)

        fmt = 'Saving pickled spellcheck dictionary (this is a one-time action that may take about 15 seconds): {}'
        log.info(fmt.format(dict_path_pkl))
        sym_spell.save_pickle(dict_path_pkl)

    return sym_spell
Пример #6
0
class spellchecker:
    """Thin wrapper around SymSpell exposing single-word, compound, and
    tokenized spellcheck lookups with a shared default edit distance."""

    def __init__(
        self,
        max_dictionary_edit_distance,
        prefix_length,
        unigram_freq_file,
        bigram_freq_file=None,
        pickle_file=None,
    ):
        """Create the underlying SymSpell instance and load its dictionaries.

        Args:
            max_dictionary_edit_distance: max edit distance used when building
                the dictionary's delete index.
            prefix_length: prefix length used for the delete index.
            unigram_freq_file: path to a term/count frequency file.
            bigram_freq_file: optional path to a bigram frequency file.
            pickle_file: optional pre-built pickle; when given, it is loaded
                instead of the frequency files.
        """
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=max_dictionary_edit_distance,
            prefix_length=prefix_length,
        )

        if pickle_file is not None:
            self.sym_spell.load_pickle(pickle_file)
        else:
            self.sym_spell.load_dictionary(
                unigram_freq_file,
                term_index=0,
                count_index=1,
                encoding="utf-8",
            )

            if bigram_freq_file:
                self.sym_spell.load_bigram_dictionary(
                    bigram_freq_file,
                    term_index=0,
                    count_index=2,
                    encoding="utf-8",
                )

    def suggest(
        self,
        word,
        max_edit_dist=None,
        include_unknown=True,
        verbosity=Verbosity.CLOSEST,
    ):
        """Return spelling suggestions for a single word.

        Returns a dict with the original term and its SymSpell suggestions.
        """
        # Fall back to the module-wide default edit distance.
        # ``is None`` (identity), not ``== None`` (PEP 8).
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE

        # spellcheck
        suggestions = self.sym_spell.lookup(
            word,
            verbosity,
            max_edit_distance=max_edit_dist,
            include_unknown=include_unknown,
        )
        return {
            'original_term': word,
            'suggestions': suggestions,
        }

    def suggest_compound(
        self,
        phrase,
        max_edit_dist=None,
    ):
        """Return compound (multi-word) suggestions for a whole phrase."""
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE

        # spellcheck
        suggestions = self.sym_spell.lookup_compound(
            phrase,
            max_edit_distance=max_edit_dist,
            # ignore_non_words=False,
            # split_phrase_by_space=True,
        )
        return {
            'original_term': phrase,
            'suggestions': suggestions,
        }

    def tokenize(self, phrases):
        """Split *phrases* into individual tokens via the shared tokenizer."""
        return tokenize_sentence(phrases)

    # Tokenize into individual phrases and return a list of suggestions for each
    def suggest_tokenize(
        self,
        phrases,
        max_edit_dist=None,
        include_unknown=True,
        verbosity=Verbosity.CLOSEST,
    ):
        """Tokenize *phrases* and look up suggestions for each token.

        Returns a list of per-token dicts shaped like :meth:`suggest`'s result.
        """
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE

        words = self.tokenize(phrases)

        sentence_suggestions = []
        for word in words:
            suggestions = self.sym_spell.lookup(
                word,
                verbosity,
                max_edit_distance=max_edit_dist,
                include_unknown=include_unknown,
            )
            sentence_suggestions.append({
                'original_term': word,
                'suggestions': suggestions,
            })

        return sentence_suggestions