Exemplo n.º 1
0
 def test_load_bigram_dictionary_bad_dict(self):
     """``load_bigram_dictionary`` returns True for the ``bad_dict.txt`` fixture."""
     bad_bigram_file = os.path.join(self.fortests_path, "bad_dict.txt")
     # Arguments: maximum edit distance 2, prefix length 7.
     checker = SymSpell(2, 7)
     loaded = checker.load_bigram_dictionary(bad_bigram_file, 0, 2)
     self.assertEqual(True, loaded)
Exemplo n.º 2
0
 def test_load_dictionary_invalid_path(self):
     """``load_dictionary`` returns False when given the invalid path."""
     # Arguments: maximum edit distance 2, prefix length 7.
     checker = SymSpell(2, 7)
     loaded = checker.load_dictionary("invalid/dictionary/path.txt", 0, 1)
     self.assertEqual(False, loaded)
Exemplo n.º 3
0
def test_save_pickle_symspellcpppy(benchmark):
    """Benchmark ``SymSpellCpp.save_pickle`` on the loaded dictionary.

    The pickle is written into a throwaway temporary directory that is
    removed when the ``with`` block exits, so repeated runs leave nothing
    behind in the working directory.

    Args:
        benchmark: callable fixture that invokes the given function with the
            supplied arguments.
    """
    import tempfile  # local import: only this benchmark needs it

    sym_spell = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    sym_spell.load_dictionary(dict_path,
                              term_index=0,
                              count_index=1,
                              separator=" ")
    with tempfile.TemporaryDirectory(prefix="temp_cpppy") as tmp_dir:
        benchmark(sym_spell.save_pickle, os.path.join(tmp_dir, "temp.bin"))
    assert (sym_spell.max_length() == 28)
Exemplo n.º 4
0
 def test_load_dictionary_separator(self):
     """Loading ``separator_dict.txt`` with "$" as separator yields 5 words."""
     separator_file = os.path.join(self.fortests_path, "separator_dict.txt")
     # Arguments: maximum edit distance 2, prefix length 7.
     checker = SymSpell(2, 7)
     loaded = checker.load_dictionary(separator_file, 0, 1, "$")
     self.assertEqual(True, loaded)
     self.assertEqual(5, checker.word_count())
Exemplo n.º 5
0
    def test_create_dictionary(self):
        """Build a dictionary from a corpus file and check ``max_length()``.

        After ``create_dictionary`` runs on the ``big_modified.txt`` fixture,
        ``max_length()`` is expected to report 68.
        """
        corpus_path = os.path.join(self.fortests_path, "big_modified.txt")

        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.create_dictionary(corpus_path)
        self.assertEqual(68, sym_spell.max_length())
Exemplo n.º 6
0
def test_lookup_transfer_casing_symspellcpppy(benchmark):
    """Benchmark a TOP lookup with ``transfer_casing=True``.

    With "steam" as the only entry, looking up "StreaM" must yield "SteaM".
    """
    speller = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    speller.create_dictionary_entry("steam", 4)
    suggestions = benchmark(
        speller.lookup, "StreaM", VerbosityCpp.TOP, 2, transfer_casing=True)
    best = suggestions[0]
    assert best.term == "SteaM"
Exemplo n.º 7
0
    def test_word_segmentation_apostrophe(self):
        """Segment a run-together input that contains an apostrophe."""
        # Arguments: maximum edit distance 0, prefix length 7.
        segmenter = SymSpell(0, 7)
        segmenter.load_dictionary(self.dictionary_path, 0, 1)

        segmented = segmenter.word_segmentation("There'resomewords")
        self.assertEqual("There' re some words", segmented.corrected_string)
Exemplo n.º 8
0
    def test_words_with_shared_prefix_should_retain_counts(self):
        """Entries sharing a prefix keep their own counts in ALL lookups."""
        speller = SymSpell(1, 3)
        speller.create_dictionary_entry("pipe", 5)
        speller.create_dictionary_entry("pips", 10)

        pipe, pips = ("pipe", 5), ("pips", 10)
        # Each row: query -> (term, count) pairs in the order lookup must
        # return them.
        expectations = (
            ("pipe", (pipe, pips)),
            ("pips", (pips, pipe)),
            ("pip", (pips, pipe)),
        )
        for query, ranked in expectations:
            suggestions = speller.lookup(query, Verbosity.ALL, 1)
            self.assertEqual(2, len(suggestions))
            for position, (term, count) in enumerate(ranked):
                self.assertEqual(term, suggestions[position].term)
                self.assertEqual(count, suggestions[position].count)
Exemplo n.º 9
0
def test_lookup_compound_term_symspellcpppy(benchmark):
    """Benchmark ``lookup_compound`` on the phrase "whereis th elove"."""
    speller = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    speller.load_dictionary(
        dict_path, term_index=0, count_index=1, separator=" ")
    suggestions = benchmark(
        speller.lookup_compound, "whereis th elove", max_edit_distance=2)
    assert suggestions[0].term == "whereas the love"
Exemplo n.º 10
0
    def test_load_dictionary_encoding(self):
        """Load ``non_en_dict.txt`` and resolve a Cyrillic TOP lookup."""
        non_english_file = os.path.join(self.fortests_path, "non_en_dict.txt")
        # Arguments: maximum edit distance 2, prefix length 7.
        speller = SymSpell(2, 7)
        speller.load_dictionary(non_english_file, 0, 1)

        suggestions = speller.lookup("АБ", Verbosity.TOP, 2)
        self.assertEqual(1, len(suggestions))
        top = suggestions[0]
        self.assertEqual("АБИ", top.term)
Exemplo n.º 11
0
def test_lookup_term_symspellcpppy(benchmark):
    """Benchmark a CLOSEST lookup of the mixed-case input "mEmEbers"."""
    speller = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    speller.load_dictionary(
        dict_path, term_index=0, count_index=1, separator=" ")
    suggestions = benchmark(
        speller.lookup, "mEmEbers", VerbosityCpp.CLOSEST, max_edit_distance=2)
    assert suggestions[0].term == "members"
Exemplo n.º 12
0
def test_lookup_compund_transfer_casing_symspellcpppy(benchmark):
    """Benchmark ``lookup_compound`` with ``transfer_casing=True``."""
    speller = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    speller.load_dictionary(dict_path, 0, 1)
    misspelled = ("Whereis th elove hehaD Dated forImuch of thepast who "
                  "couqdn'tread in sixthgrade AND ins pired him")
    expected = ("Whereas the love heaD Dated for much of the past "
                "who couldn't read in sixth grade AND inspired him")
    suggestions = benchmark(
        speller.lookup_compound, misspelled, 2, transfer_casing=True)
    assert suggestions[0].term == expected
Exemplo n.º 13
0
def test_word_segmentation_symspellcpppy(benchmark):
    """Benchmark ``word_segmentation`` on a run-together sentence."""
    speller = SymSpellCpp(max_dictionary_edit_distance=2, prefix_length=7)
    speller.load_dictionary(
        dict_path, term_index=0, count_index=1, separator=" ")
    segmentation = benchmark(
        speller.word_segmentation,
        "thequickbrownfoxjumpsoverthelazydog",
        max_edit_distance=0,
        max_segmentation_word_length=5)
    expected = "the quick brown fox jumps over the lazy dog"
    assert segmentation.segmented_string == expected
Exemplo n.º 14
0
    def test_lookup_compound_transfer_casing_no_bigram(self):
        """Check ``lookup_compound`` casing transfer without a bigram file.

        Only ``load_dictionary`` is called here; no bigram dictionary is
        loaded before the compound lookup.
        """
        max_distance = 2
        speller = SymSpell(max_distance, 7)  # prefix length 7
        speller.load_dictionary(self.dictionary_path, 0, 1)

        misspelled = ("Whereis th elove hehaD Dated forImuch of thepast who "
                      "couqdn'tread in sixthgrade AND ins pired him")
        expected = ("Whereas the love heaD Dated for much of the past "
                    "who couldn't read in sixth grade AND inspired him")

        suggestions = speller.lookup_compound(
            misspelled, max_distance, transfer_casing=True)
        self.assertEqual(expected, suggestions[0].term)
Exemplo n.º 15
0
    def test_add_additional_counts_should_not_add_word_again(self):
        """Re-adding an existing word leaves ``word_count()`` at 1."""
        speller = SymSpell()
        # "hello" is inserted twice; the word count must be 1 after each call.
        for extra_count in (11, 3):
            speller.create_dictionary_entry("hello", extra_count)
            self.assertEqual(1, speller.word_count())
Exemplo n.º 16
0
 @classmethod
 def setUpClass(cls):
     """Load the shared English dictionary once for the whole test class.

     Also stores the fixture locations (``fortests_path``,
     ``dictionary_path``, ``bigram_path``) as class attributes for the tests.
     unittest invokes this hook on the class itself, so it must be a
     classmethod.
     """
     cls.fortests_path = "tests/fortests"
     cls.dictionary_path = "resources/frequency_dictionary_en_82_765.txt"
     cls.bigram_path = "resources/frequency_bigramdictionary_en_243_342.txt"
     cls.symSpell = SymSpell()
     # Reuse dictionary_path so the loaded file and the stored path cannot
     # drift apart.
     cls.symSpell.load_dictionary(cls.dictionary_path, 0, 1, " ")
Exemplo n.º 17
0
 def test_deletes(self):
     """TOP lookup of "stream" returns "steamb"; ``entry_count()`` is truthy."""
     speller = SymSpell()
     for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
         speller.create_dictionary_entry(term, count)
     suggestions = speller.lookup("stream", Verbosity.TOP, 2)
     self.assertEqual(1, len(suggestions))
     winner = suggestions[0]
     self.assertEqual("steamb", winner.term)
     self.assertEqual(6, winner.count)
     self.assertTrue(speller.entry_count())
Exemplo n.º 18
0
 def test_lookup_should_not_return_low_count_word_that_are_also_delete_word(
         self):
     """Looking up "flam" (added with count 1) at distance 0 returns nothing."""
     speller = SymSpell(2, 7, 10)
     for term, count in (("flame", 20), ("flam", 1)):
         speller.create_dictionary_entry(term, count)
     suggestions = speller.lookup("flam", Verbosity.TOP, 0)
     self.assertEqual(0, len(suggestions))
Exemplo n.º 19
0
 def test_lookup_include_unknown(self):
     """With the fourth lookup argument True, "flam" itself is returned."""
     speller = SymSpell(2, 7, 10)
     for term, count in (("flame", 20), ("flam", 1)):
         speller.create_dictionary_entry(term, count)
     suggestions = speller.lookup("flam", Verbosity.TOP, 0, True)
     self.assertEqual(1, len(suggestions))
     only = suggestions[0]
     self.assertEqual("flam", only.term)
Exemplo n.º 20
0
    def test_create_dictionary_entry_negative_count(self):
        """Counts 0 and -1 are rejected; 0 is accepted if count_threshold=0."""
        strict = SymSpell(1, 3)
        for rejected_count in (0, -1):
            outcome = strict.create_dictionary_entry("pipe", rejected_count)
            self.assertEqual(False, outcome)

        lenient = SymSpell(1, 3, count_threshold=0)
        accepted = lenient.create_dictionary_entry("pipe", 0)
        self.assertEqual(True, accepted)
Exemplo n.º 21
0
 def test_lookup_should_find_exact_match(self):
     """TOP lookup of "streama" returns the single suggestion "steama"."""
     speller = SymSpell()
     for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
         speller.create_dictionary_entry(term, count)
     suggestions = speller.lookup("streama", Verbosity.TOP, 2)
     self.assertEqual(1, len(suggestions))
     winner = suggestions[0]
     self.assertEqual("steama", winner.term)
Exemplo n.º 22
0
    def test_add_additional_counts_should_not_overflow(self):
        """Adding 11 to a count of ``sys.maxsize - 10`` gives ``sys.maxsize``."""
        speller = SymSpell()
        near_max = sys.maxsize - 10
        # Each pair: (count to add, count expected from lookup afterwards).
        for added, expected in ((near_max, near_max), (11, sys.maxsize)):
            speller.create_dictionary_entry("hello", added)
            suggestions = speller.lookup("hello", Verbosity.ALL)
            if len(suggestions) == 1:
                stored = suggestions[0].count
            else:
                stored = 0
            self.assertEqual(expected, stored)
Exemplo n.º 23
0
    def test_add_additional_counts_should_increase_count(self):
        """Re-adding a word sums its counts: 11, then 11 + 3."""
        speller = SymSpell()
        # Each pair: (count to add, count expected from lookup afterwards).
        for added, expected in ((11, 11), (3, 11 + 3)):
            speller.create_dictionary_entry("hello", added)
            suggestions = speller.lookup("hello", Verbosity.ALL)
            if len(suggestions) == 1:
                stored = suggestions[0].count
            else:
                stored = 0
            self.assertEqual(expected, stored)
Exemplo n.º 24
0
 def test_lookup_should_return_most_frequent(self):
     """TOP lookup of "stream" returns "steamb", the entry with count 6."""
     speller = SymSpell()
     for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
         speller.create_dictionary_entry(term, count)
     suggestions = speller.lookup("stream", Verbosity.TOP, 2)
     self.assertEqual(1, len(suggestions))
     winner = suggestions[0]
     self.assertEqual("steamb", winner.term)
     self.assertEqual(6, winner.count)
Exemplo n.º 25
0
 def test_lookup_should_not_return_non_word_delete(self):
     """Distance-0 lookups of "paw" and "awn" return nothing."""
     speller = SymSpell(2, 7, 10)
     speller.create_dictionary_entry("pawn", 10)
     for delete_variant in ("paw", "awn"):
         suggestions = speller.lookup(delete_variant, Verbosity.TOP, 0)
         self.assertEqual(0, len(suggestions))
Exemplo n.º 26
0
    def test_lookup_compound_no_suggestion(self):
        """``lookup_compound`` returns the input phrase unchanged here."""
        max_distance = 2
        speller = SymSpell(max_distance, 7)  # prefix length 7
        for term in ("steam", "machine"):
            speller.create_dictionary_entry(term, 1)

        unknown_phrase = "qwer erty ytui a"
        suggestions = speller.lookup_compound(unknown_phrase, max_distance)
        self.assertEqual(1, len(suggestions))
        self.assertEqual(unknown_phrase, suggestions[0].term)
Exemplo n.º 27
0
    def test_lookup_compound_only_combi(self):
        """``lookup_compound`` turns "ste am machie" into "steam machine"."""
        max_distance = 2
        speller = SymSpell(max_distance, 7)  # prefix length 7
        for term in ("steam", "machine"):
            speller.create_dictionary_entry(term, 1)

        suggestions = speller.lookup_compound("ste am machie", max_distance)
        self.assertEqual(1, len(suggestions))
        self.assertEqual("steam machine", suggestions[0].term)
Exemplo n.º 28
0
 def test_save_load(self):
     """Round-trip the shared SymSpell through save_pickle/load_pickle.

     The reloaded instance must give the same CLOSEST suggestion for "tke"
     and report the same ``max_length()`` as the instance that was saved.
     """
     import tempfile  # local import: only this test needs it

     before_save = self.symSpell.lookup("tke", Verbosity.CLOSEST)[0].term
     before_max_length = self.symSpell.max_length()
     # TemporaryDirectory removes the pickle even when a step below raises,
     # and never touches a pre-existing "temp" directory.
     with tempfile.TemporaryDirectory() as tmp_dir:
         pickle_path = os.path.join(tmp_dir, "temp.bin")
         self.symSpell.save_pickle(pickle_path)
         load_sym_spell = SymSpell()
         load_sym_spell.load_pickle(pickle_path)
         after_load = load_sym_spell.lookup("tke", Verbosity.CLOSEST)[0].term
         after_max_length = load_sym_spell.max_length()
     # unittest assertions are not stripped under ``python -O`` and report
     # both compared values on failure.
     self.assertEqual(before_save, after_load)
     self.assertEqual(before_max_length, after_max_length)
Exemplo n.º 29
0
    def test_word_segmentation_capitalize(self):
        """Segment run-together sentences that start with a capital letter."""
        # Arguments: maximum edit distance 0, prefix length 7.
        segmenter = SymSpell(0, 7)
        segmenter.load_dictionary(self.dictionary_path, 0, 1)

        # The first sentence is checked against corrected_string.
        first = segmenter.word_segmentation(
            "Thequickbrownfoxjumpsoverthelazydog")
        self.assertEqual("The quick brown fox jumps over the lazy dog",
                         first.corrected_string)

        # The remaining sentences are checked against segmented_string.
        remaining = (
            ("Itwasabrightcolddayinaprilandtheclockswerestrikingthirteen",
             "It was a bright cold day in april and the clocks "
             "were striking thirteen"),
            ("Itwasthebestoftimesitwastheworstoftimesitwastheageofwisdom"
             "itwastheageoffoolishness",
             "It was the best of times it was the worst of times "
             "it was the age of wisdom it was the age of foolishness"),
        )
        for run_together, expected in remaining:
            outcome = segmenter.word_segmentation(run_together)
            self.assertEqual(expected, outcome.segmented_string)
Exemplo n.º 30
0
 def test_empty_deletes(self):
     """A fresh SymSpell returns no suggestions and has zero entries."""
     suggestions = SymSpell(2).lookup("ab", Verbosity.CLOSEST)
     self.assertEqual(suggestions, [])
     entries = SymSpell().entry_count()
     self.assertEqual(entries, 0)