Example #1
    def test_lookup_compound(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)

        typo = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixthgrade and ins pired him")
        correction = ("where is the love he had dated for much of the past "
                      "who couldn't read in sixth grade and inspired him")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(9, results[0].distance)
        self.assertEqual(300000, results[0].count)

        typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the third quarter of last year he had learned of a "
                      "secret plan")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(9, results[0].distance)
        self.assertEqual(23121323, results[0].count)

        typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
                "of funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with plenty of fun")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(9, results[0].distance)
        self.assertEqual(3813904, results[0].count)

        typo = ("Can yu readthis messa ge despite thehorible sppelingmsitakes")
        correction = ("can you read this message despite the horrible "
                      "spelling mistakes")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(10, results[0].distance)
        self.assertEqual(6218089, results[0].count)
Example #2
    def test_lookup_compound_transfer_casing(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)
        sym_spell.load_bigram_dictionary(self.bigram_path, 0, 2)

        typo = ("Whereis th elove hehaD Dated forImuch of thepast who "
                "couqdn'tread in sixthgrade AND ins pired him")
        correction = ("Where is the love he haD Dated for much of the past "
                      "who couldn't read in sixth grade AND inspired him")

        results = sym_spell.lookup_compound(typo,
                                            edit_distance_max,
                                            transfer_casing=True)
        self.assertEqual(correction, results[0].term)
Example #3
def load_symspell():
    import pkg_resources
    from symspellpy import SymSpell, Verbosity
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
    return sym_spell
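
A quick usage sketch for the loader above; the misspelled inputs are illustrative only:

from symspellpy import Verbosity

sym_spell = load_symspell()
# single-word lookup: SuggestItem objects expose term, distance, and count
for suggestion in sym_spell.lookup("memebers", Verbosity.CLOSEST,
                                   max_edit_distance=2):
    print(suggestion.term, suggestion.distance, suggestion.count)
# multi-word lookup with compound splitting and merging
print(sym_spell.lookup_compound("whereis th elove",
                                max_edit_distance=2)[0].term)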
Example #4
    def test_lookup_compound(self):
        print('  - %s' % inspect.stack()[0][3])
        cwd = os.path.realpath(os.path.dirname(__file__))
        dictionary_path = os.path.realpath(
            os.path.join(cwd, pardir, "symspellpy",
                         "frequency_dictionary_en_82_765.txt"))

        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
        sym_spell.load_dictionary(dictionary_path, 0, 1)

        typo = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixthgrade and ins pired him")
        correction = ("where is the love he had dated for much of the past "
                      "who couldn't read in sixth grade and inspired him")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the third quarter of last year he had learned of a "
                      "secret plan")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
                "of funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with plenty of fun")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("Can yu readthis messa ge despite thehorible sppelingmsitakes")
        correction = ("can you read this message despite the horrible "
                      "spelling mistakes")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
Example #5
def initialize_models():
    spacy_nlp = spacy.load("en_core_web_sm")

    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    sym_spell_len5 = SymSpell(max_dictionary_edit_distance=3, prefix_length=5)
    # term_index is the column of the term and count_index is the column of the term frequency
    sym_spell_len5.load_dictionary(dictionary_path,
                                   term_index=0,
                                   count_index=1)

    # The length of word prefixes used for spell checking.
    sym_spell_len7 = SymSpell(max_dictionary_edit_distance=4, prefix_length=7)
    # term_index is the column of the term and count_index is the column of the term frequency
    sym_spell_len7.load_dictionary(dictionary_path,
                                   term_index=0,
                                   count_index=1)

    c2v_model = load_c2v_model("single_word_trained_model")

    return spacy_nlp, c2v_model, sym_spell_len5, sym_spell_len7
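
The two instances trade thoroughness for speed and memory: a longer prefix and a larger maximum edit distance widen the candidate search. A hedged sketch of how they might be compared (the spaCy model and load_c2v_model are assumed available, as in the snippet above):

from symspellpy import Verbosity

spacy_nlp, c2v_model, sym_spell_len5, sym_spell_len7 = initialize_models()
# the len5 instance allows up to 3 edits, the len7 instance up to 4;
# the wider search is slower but catches more distant misspellings
fast = sym_spell_len5.lookup("acommodate", Verbosity.TOP, max_edit_distance=3)
wide = sym_spell_len7.lookup("acommodate", Verbosity.TOP, max_edit_distance=4)
print(fast[0].term if fast else None, wide[0].term if wide else None)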
Example #6
def init_sym_spell():
    from pathlib import Path
    from symspellpy import SymSpell
    from ds_tools.fs.paths import get_user_cache_dir

    sym_spell = SymSpell(max_dictionary_edit_distance=0, prefix_length=1)
    dict_path_pkl = Path(
        get_user_cache_dir('music_manager')).joinpath('words.pkl.gz')
    if dict_path_pkl.exists():
        log.debug(f'Loading pickled spellcheck dictionary: {dict_path_pkl}')
        sym_spell.load_pickle(dict_path_pkl)
    else:
        import lzma
        import pkg_resources

        dict_path = pkg_resources.resource_filename(
            'symspellpy', 'frequency_dictionary_en_82_765.txt')
        sym_spell.load_dictionary(dict_path, 0, 1)
        word_list_path_xz = Path(
            pkg_resources.resource_filename(
                'music', '../../etc/scowl/words.xz')).resolve()
        log.debug(
            f'Loading default dictionary + word list from {word_list_path_xz}')
        with lzma.open(word_list_path_xz, 'rt', encoding='utf-8') as f:
            word_list = f.read().splitlines()

        loaded = sym_spell._words
        min_count = min(loaded.values())
        add_word = sym_spell.create_dictionary_entry
        for word in word_list:
            try:
                loaded[word]
            except KeyError:
                add_word(word, min_count)

        fmt = 'Saving pickled spellcheck dictionary (this is a one-time action that may take about 15 seconds): {}'
        log.info(fmt.format(dict_path_pkl))
        sym_spell.save_pickle(dict_path_pkl)

    return sym_spell
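
Because the instance above is built with max_dictionary_edit_distance=0, a distance-0 lookup acts as a plain membership test against the merged word list. A minimal sketch, assuming init_sym_spell is importable:

from symspellpy import Verbosity

sym_spell = init_sym_spell()

def is_known_word(word: str) -> bool:
    # with edit distance 0, lookup returns a hit only for exact matches
    return bool(sym_spell.lookup(word, Verbosity.TOP, max_edit_distance=0))

print(is_known_word("music"))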
Example #7
def load_name_corection(dictionary_path, bigram_path):
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    # dictionary_path = pkg_resources.resource_filename(
    #     dictionary_path)
    # bigram_path = pkg_resources.resource_filename(
    #     bigram_path)
    sym_spell.load_dictionary(dictionary_path,
                              term_index=0,
                              count_index=1,
                              encoding='utf-8')
    sym_spell.load_bigram_dictionary(bigram_path,
                                     term_index=0,
                                     count_index=2,
                                     encoding='utf-8')
    return sym_spell
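
A hedged usage sketch; the dictionary file names are taken from Example #17 and assumed to be in the working directory:

corrector = load_name_corection("freq_name_dic.txt", "freq_name_bigram.txt")
suggestions = corrector.lookup_compound("Ngyen van", max_edit_distance=2)
if suggestions:
    print(suggestions[0].term)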
Example #8
def load_cpn_corection(companies_list, debug=False):
    with open(companies_list, 'r', encoding='utf-8') as f:
        lines = f.read().lower().split('\n')
    tokenized = [line.split() for line in lines]
    bi = export_freq_bigram(tokenized)
    uni = export_freq_dic(tokenized)
    if debug:
        print(uni)
        print(bi)
    sym_spell = SymSpell(max_dictionary_edit_distance=5, prefix_length=7)
    sym_spell.load_dictionary_from_list(uni, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary_from_list(bi, term_index=0, count_index=2)
    return sym_spell
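
A hedged usage sketch; "companies.txt" is a hypothetical one-name-per-line file, and export_freq_dic/export_freq_bigram (plus the *_from_list loaders) are the helpers assumed by the snippet above:

corrector = load_cpn_corection("companies.txt")
results = corrector.lookup_compound("amzon web servces", max_edit_distance=2)
if results:
    print(results[0].term)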
Example #9
    def test_word_segmentation_with_arguments(self):
        edit_distance_max = 0
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)

        typo = "thequickbrownfoxjumpsoverthelazydog"
        correction = "the quick brown fox jumps over the lazy dog"
        result = sym_spell.word_segmentation(typo, edit_distance_max, 11)
        self.assertEqual(correction, result.corrected_string)

        typo = "itwasabrightcolddayinaprilandtheclockswerestrikingthirteen"
        correction = ("it was a bright cold day in april and the clocks "
                      "were striking thirteen")
        result = sym_spell.word_segmentation(typo, edit_distance_max, 11)
        self.assertEqual(correction, result.corrected_string)

        typo = (" itwasthebestoftimesitwastheworstoftimesitwastheageofwisdom"
                "itwastheageoffoolishness")
        correction = ("it was the best of times it was the worst of times "
                      "it was the age of wisdom it was the age of foolishness")
        result = sym_spell.word_segmentation(typo, edit_distance_max, 11)
        self.assertEqual(correction, result.corrected_string)
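
word_segmentation returns a Composition tuple; beyond corrected_string it also exposes the raw segmentation and scoring fields. A minimal sketch, assuming a sym_spell instance loaded as above:

result = sym_spell.word_segmentation("thequickbrownfox",
                                     max_edit_distance=0,
                                     max_segmentation_word_length=11)
print(result.segmented_string)  # words split, before spelling correction
print(result.corrected_string)  # words split and corrected
print(result.distance_sum)      # total edit distance of all corrections
print(result.log_prob_sum)      # sum of word log-probabilities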
Example #10
    def test_lookup_compound_ignore_non_words(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)
        sym_spell.load_bigram_dictionary(self.bigram_path, 0, 2)

        typo = ("whereis th elove 123 hehad dated forImuch of THEPAST who "
                "couqdn'tread in SIXTHgrade and ins pired him")
        correction = ("where is the love 123 he had dated for much of THEPAST "
                      "who couldn't read in sixth grade and inspired him")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = "in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the DHIRD 1 quarter of last year he had learned "
                      "of a secret plan")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("the bigjest playrs in te stroGSOmmer film slatew ith PLETY "
                "of 12 funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with PLETY of 12 fun")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("Can yu readtHIS messa ge despite thehorible 1234 "
                "sppelingmsitakes")
        correction = ("can you read this message despite the horrible 1234 "
                      "spelling mistakes")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("Can yu readtHIS messa ge despite thehorible AB1234 "
                "sppelingmsitakes")
        correction = ("can you read this message despite the horrible AB1234 "
                      "spelling mistakes")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = "PI on leave, arrange Co-I to do screening"
        correction = "PI on leave arrange co i to do screening"
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("is the officeon 1st floor oepn 24/7")
        correction = ("is the office on 1st floor open 24/7")
        results = sym_spell.lookup_compound(typo,
                                            edit_distance_max,
                                            split_phrase_by_space=True,
                                            ignore_non_words=True,
                                            ignore_any_term_with_digits=True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(2, results[0].distance)
        self.assertEqual(0, results[0].count)
Example #11
    def test_lookup_compound_replaced_words_no_bigram(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)

        typo = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixthgrade and ins pired him")
        correction = ("whereas the love head dated for much of the past who "
                      "couldn't read in sixth grade and inspired him")
        replacement_1 = {
            "whereis": "whereas",
            "th": "the",
            "elove": "love",
            "hehad": "head",
            "forimuch": "for much",
            "thepast": "the past",
            "couqdn'tread": "couldn't read",
            "sixthgrade": "sixth grade",
            "ins": "in"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(len(replacement_1), len(sym_spell.replaced_words))
        for k, v in replacement_1.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)

        typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the third quarter of last year he had learned of a "
                      "secret plan")
        replacement_2 = {
            "te": "the",
            "dhird": "third",
            "qarter": "quarter",
            "oflast": "of last",
            "jear": "year",
            "hadlearned": "had learned",
            "ofca": "of a",
            "sekretplan": "secret plan"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(
            len(replacement_1) + len(replacement_2),
            len(sym_spell.replaced_words))
        for k, v in replacement_2.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)

        typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
                "of funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with plenty of fun")
        replacement_3 = {
            "bigjest": "biggest",
            "playrs": "players",
            "strogsommer": "strong summer",
            "slatew": "slate",
            "ith": "with",
            "plety": "plenty",
            "funn": "fun"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(
            len(replacement_1) + len(replacement_2) + len(replacement_3),
            len(sym_spell.replaced_words))
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        for k, v in replacement_3.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)
Example #12
    def test_negative_count_threshold(self):
        with pytest.raises(ValueError) as excinfo:
            __ = SymSpell(1, 3, -1)
        self.assertEqual("count_threshold cannot be negative",
                         str(excinfo.value))
Example #13
    def test_negative_initial_capacity(self):
        print('  - %s' % inspect.stack()[0][3])
        with pytest.raises(ValueError) as excinfo:
            __ = SymSpell(-16, 1, 3)
        self.assertEqual("initial_capacity cannot be negative",
                         str(excinfo.value))
Example #14
import nltk
import csv
import string
from symspellpy import SymSpell, Verbosity
import pkg_resources
import pickle
from nltk.sentiment.vader import SentimentIntensityAnalyzer

with open('reviews_train.csv') as review_file:
    reader = list(csv.reader(review_file, delimiter=','))
    documents = [[row[0], row[1], row[2]] for row in reader]

result = []
spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

negs = ['not', 'no', 'didnt']
for review in documents:
    last_neg = False
    neg_index = 0
    for word in review[1].split(' '):
        if len(word) > 0:
            if review[0].lower() not in word.lower():
                word = word.translate(str.maketrans('', '',
                                                    string.punctuation))
                try:
                    # minimal completion of the truncated call: take the best
                    # suggestion's term (the assumed intent of the original)
                    word = str(
                        spell.lookup(word,
                                     Verbosity.CLOSEST,
                                     max_edit_distance=2)[0].term)
                except IndexError:
                    pass  # no suggestion found; keep the original word
Example #15
    def test_lookup_should_not_return_low_count_word(self):
        print('  - %s' % inspect.stack()[0][3])
        sym_spell = SymSpell(16, 2, 7, 10)
        sym_spell.create_dictionary_entry("pawn", 1)
        result = sym_spell.lookup("pawn", Verbosity.TOP, 0)
        self.assertEqual(0, len(result))
Example #16
    def test_delete_dictionary_entry_invalid_word(self):
        sym_spell = SymSpell()
        sym_spell.create_dictionary_entry("stea", 1)
        sym_spell.create_dictionary_entry("steama", 2)
        sym_spell.create_dictionary_entry("steem", 3)

        result = sym_spell.lookup("steama", Verbosity.TOP, 2)
        self.assertEqual(1, len(result))
        self.assertEqual("steama", result[0].term)
        self.assertEqual(len("steama"), sym_spell._max_length)

        self.assertFalse(sym_spell.delete_dictionary_entry("steamab"))
        result = sym_spell.lookup("steama", Verbosity.TOP, 2)
        self.assertEqual(1, len(result))
        self.assertEqual("steama", result[0].term)
        self.assertEqual(len("steama"), sym_spell._max_length)
Example #17
import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename("symspellpy",
                                                  "freq_name_dic.txt")
bigram_path = pkg_resources.resource_filename("symspellpy",
                                              "freq_name_bigram.txt")
# term_index is the column of the term and count_index is the
# column of the term frequency
sym_spell.load_dictionary(dictionary_path,
                          term_index=0,
                          count_index=1,
                          encoding='utf-8')
# sym_spell.load_dictionary('C:/Users/nt.anh6/PycharmProjects/aicr_vn/nlp_model/spell_checker/dict/vi_full.txt', term_index=0, count_index=1, encoding='utf-8')
sym_spell.load_bigram_dictionary(bigram_path,
                                 term_index=0,
                                 count_index=2,
                                 encoding='utf-8')

# lookup suggestions for multi-word input strings (supports compound
# splitting & merging)
input_term = "Ngyễn tành nm"
# max edit distance per lookup (per single word, not per whole input string)
# suggestions = sym_spell.lookup(input_term, Verbosity.ALL, max_edit_distance=2, include_unknown=True)
suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)
# display suggestion term, edit distance, and term frequency
for suggestion in suggestions:
    print(suggestion)
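
Note that printing a SuggestItem yields its "term, distance, count" representation; to show only the corrected text, use the term attribute:

for suggestion in suggestions:
    print(suggestion.term)  # corrected text only, without distance and count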

Example #18
    def test_lookup_should_not_return_low_count_word_that_are_also_delete_word(self):
        sym_spell = SymSpell(2, 7, 10)
        sym_spell.create_dictionary_entry("flame", 20)
        sym_spell.create_dictionary_entry("flam", 1)
        result = sym_spell.lookup("flam", Verbosity.TOP, 0)
        self.assertEqual(0, len(result))
Example #19
import operator
import os
from collections import Counter

import enchant
import numpy as np
import pkg_resources
import torch
from symspellpy import SymSpell, Verbosity
from tqdm import tqdm
from transformers import AutoTokenizer, BertForMaskedLM

# d_print is assumed to be a project-specific debug logger (not shown here).


class WordSuggester:
    """
    Suggest words when the input is misspelled
    """
    def __init__(self):
        d_print("Initializing the vocabulary set..")
        self.d = enchant.Dict("en_US")
        d_print("Initializing BERT pipeline..")

        self.tok = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.bert = BertForMaskedLM.from_pretrained("bert-base-uncased")
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        self.sym_spell_cut = SymSpell(max_dictionary_edit_distance=0,
                                      prefix_length=7)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        # term_index is the column of the term and count_index is the
        # column of the term frequency
        self.sym_spell.load_dictionary(dictionary_path,
                                       term_index=0,
                                       count_index=1)
        self.sym_spell_cut.load_dictionary(dictionary_path,
                                           term_index=0,
                                           count_index=1)

    def cross_word_validate(self, word, word_counts, min_counts=2):
        """
        A word is considered valid if it occurs at least `min_counts` times.
        """
        return word_counts[word] >= min_counts

    def is_multiword(self, word):
        suggestions = self.d.suggest(word)
        for sugg in suggestions:
            if "".join(sugg.split(" ")) == word:
                return True, sugg
            if "".join(sugg.split("-")) == word:
                return True, sugg.replace("-", " ")
        return False, ""

    def cross_sugg_validate(self, word, word_counts):
        suggestions = [
            s.term for s in self.sym_spell.lookup(
                word, Verbosity.CLOSEST, max_edit_distance=2)
        ]
        present_words = {
            word: count
            for word, count in word_counts.items() if word in suggestions
        }
        if len(present_words) == 0:
            return False, ""
        corr_word = max(present_words.items(), key=operator.itemgetter(1))[0]
        return True, corr_word

    def get_word_suggestions(self, word, word_counts):
        """
        Return the suggestions for the given word. If the word is already
        valid, return a list of length 1 containing the word itself.

        Args:
            word (str): the word to find suggestions for
            word_counts (dict): value counts of words for a given emoji (context)
        """
        # If the word appears many times in answers, we keep it
        if self.cross_word_validate(word, word_counts):
            return {"status": "present", "words": [word]}

        # If the word is part of the English vocabulary, we keep it
        if self.d.check(word):
            return {"status": "exist", "words": [word]}

        # If the suggestions associated with the word appear in the rest of
        # the answers, we keep the most common one
        cross_sugg, corr_word = self.cross_sugg_validate(word, word_counts)
        if cross_sugg:
            return {"status": "cross_suggested", "words": [corr_word]}

        # If splitting the word into several words is highly confident,
        # we split it
        result = self.sym_spell_cut.word_segmentation(word)
        log_confidence = result.log_prob_sum / len(result.corrected_string)
        if log_confidence > -1:
            return {
                "status": "disassembled1",
                "words": [result.corrected_string]
            }

        # Same approach using another library
        is_multi, corr_word = self.is_multiword(word)
        if is_multi:
            return {"status": "disassembled2", "words": [corr_word]}

        # We use the other words as a context to select among the suggestions
        suggestions = [
            sugg.term for sugg in self.sym_spell.lookup(
                word, Verbosity.CLOSEST, max_edit_distance=2)
        ]
        if len(suggestions) > 0:
            return {"status": "corrected", "words": suggestions}

        # The word is probably unknown
        return {"status": "notfound", "words": [word]}

    def get_context_suggestions(self, word_list):
        """
        Applies get_word_suggestions for every word of an emoji's vocabulary (context)

        Args:
            word_list (list of str): words to describe the emoji

        Returns:
            [list of list of str]: list of suggestions: each word receives suggestions (list of str)
        """
        word_counts = Counter(word_list)
        context_suggestions = [
            self.get_word_suggestions(word, word_counts) for word in word_list
        ]
        return context_suggestions

    def find_best_word(self, context, suggestions):
        """
        Find the most appropriate word in suggestions given the context

        Args:
            context (list of str): words defining the context
            suggestions (list of str): suggestions for the word to find

        Returns:
            [str]: the word from suggestions that best matches the context
            according to BERT's output
        """
        # We place the word of interest in the middle of the context
        n = len(context) // 2
        pre_context = " ".join(context[:n])
        post_context = " ".join(context[n:])
        sentence = f"{pre_context} {self.tok.mask_token} {post_context}"

        input_tokens = self.tok.encode(sentence)
        answer_pos = input_tokens.index(self.tok.mask_token_id)

        logits = self.bert(torch.tensor([input_tokens]))[0][0]
        logits = logits[answer_pos]
        suggestions_tokens = [
            self.tok.encode(word)[1:-1] for word in suggestions
        ]
        scores = [
            np.mean([logits[i].item() for i in tokens])
            for tokens in suggestions_tokens
        ]
        best_sugg_idx = np.argmax(scores)
        return suggestions[best_sugg_idx]

    def extract_context_suggestions(self, context_suggestions):
        """
        Extract the best word from each suggestion list in the context
        suggestions

        Args:
            context_suggestions (list of list of str): list of suggestions

        Returns:
            [list of str]: most appropriate words

        """
        # we don't need the status in the current function
        context_suggestions = [sugg["words"] for sugg in context_suggestions]
        ret_words = []
        for suggestions in context_suggestions:
            # single suggestion: the word is not ambiguous
            if len(suggestions) == 1:
                ret_words.append(suggestions[0])
            else:
                # we gather the single-suggestion words, which are
                # considered reliable
                context = [
                    word_list[0] for word_list in context_suggestions
                    if word_list != suggestions and len(word_list) == 1
                ]
                word = self.find_best_word(context, suggestions)

                ret_words.append(word)
        return ret_words

    def process_context(self, context, verbose=False):
        """
        Args:
            context (list of str): words

        Returns:
            [list of str]: corrected words
        """
        if os.environ.get("DEBUG") is not None:
            d_print("Test --> test")
            d_print("Test --> test")
            return context
        context_suggestions = self.get_context_suggestions(context)
        corr_words = self.extract_context_suggestions(context_suggestions)
        if verbose:
            for word, suggestions, corr_word in zip(context,
                                                    context_suggestions,
                                                    corr_words):
                status = suggestions["status"]
                if status == "notfound":
                    d_print(f"Nof found:  {word}")
                elif status not in ["present", "exist"] and word != corr_word:
                    d_print(f"Modified:  {word} --> {corr_word} ({status})")

        return corr_words

    def correct_prod_df(self, form_df, debug=False):
        """
        Correct misspelled words in place for a dataframe in production format
        """
        grouped_df = form_df.groupby("emoji")
        # TODO: remove the limitation
        em_indexes = [(key, val) for key, val in grouped_df.groups.items()]

        for emoji, indexes in tqdm(em_indexes):
            group = grouped_df.get_group(emoji)["word"]
            words = group.to_list()
            corr_words = self.process_context(words, verbose=True)
            form_df["word"].loc[indexes] = corr_words
Example #20
import pkg_resources
from symspellpy import SymSpell, Verbosity

# An average 5-letter word has about 3 million possible spelling errors
# within a maximum edit distance of 3
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")


# term_index is the column of the term and count_index is the
# column of the term frequency
def spell_corrector(input_term):
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    # lookup suggestions for multi-word input strings (supports compound
    # splitting & merging)
    #input_term = ('The yougn boy finaly understod the diffrence betwen paralell and perpendcular.')
    #input_term = ("whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixtgrade and ins pired him")

    # max edit distance per lookup (per single word, not per whole input string)
    suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)

    # keep only the suggested term; str(SuggestItem) would also include the
    # edit distance and count
    sent = [suggestion.term for suggestion in suggestions]

    predicted_sentence = sent[0]
    return predicted_sentence
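
With the return value fixed above, a hedged usage sketch (the expected output follows the test in Example #1):

corrected = spell_corrector(
    "whereis th elove hehad dated forImuch of thepast")
print(corrected)
# where is the love he had dated for much of the past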
Example #21
    def test_negative_count_threshold(self):
        print('  - %s' % inspect.stack()[0][3])
        with pytest.raises(ValueError) as excinfo:
            __ = SymSpell(16, 1, 3, -1)
        self.assertEqual("count_threshold cannot be negative",
                         str(excinfo.value))
Example #22
    def test_negative_max_dictionary_edit_distance(self):
        print('  - %s' % inspect.stack()[0][3])
        with pytest.raises(ValueError) as excinfo:
            __ = SymSpell(16, -1, 3)
        self.assertEqual("max_dictionary_edit_distance cannot be negative",
                         str(excinfo.value))
Example #23
    def test_create_dictionary_invalid_path(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        self.assertEqual(
            False, sym_spell.create_dictionary("invalid/dictionary/path.txt"))
Example #24
    def test_verbosity_should_control_lookup_results(self):
        sym_spell = SymSpell()
        sym_spell.create_dictionary_entry("steam", 1)
        sym_spell.create_dictionary_entry("steams", 2)
        sym_spell.create_dictionary_entry("steem", 3)

        result = sym_spell.lookup("steems", Verbosity.TOP, 2)
        self.assertEqual(1, len(result))
        result = sym_spell.lookup("steems", Verbosity.CLOSEST, 2)
        self.assertEqual(2, len(result))
        result = sym_spell.lookup("steems", Verbosity.ALL, 2)
        self.assertEqual(3, len(result))
Example #25
    def test_lookup_transfer_casing(self):
        sym_spell = SymSpell()
        sym_spell.create_dictionary_entry("steam", 4)
        result = sym_spell.lookup("Stream",
                                  Verbosity.TOP,
                                  2,
                                  transfer_casing=True)
        self.assertEqual("Steam", result[0].term)

        sym_spell = SymSpell()
        sym_spell.create_dictionary_entry("steam", 4)
        result = sym_spell.lookup("StreaM",
                                  Verbosity.TOP,
                                  2,
                                  transfer_casing=True)
        self.assertEqual("SteaM", result[0].term)

        sym_spell = SymSpell()
        sym_spell.create_dictionary_entry("steam", 4)
        result = sym_spell.lookup("STREAM",
                                  Verbosity.TOP,
                                  2,
                                  transfer_casing=True)
        self.assertEqual("STEAM", result[0].term)
Example #26
    def test_lookup_should_not_return_low_count_word(self):
        sym_spell = SymSpell(2, 7, 10)
        sym_spell.create_dictionary_entry("pawn", 1)
        result = sym_spell.lookup("pawn", Verbosity.TOP, 0)
        self.assertEqual(0, len(result))
Example #27
from nltk.corpus import words
from symspellpy import SymSpell, Verbosity

# ConfigLoader and Parser are assumed to be project-specific helpers
# (not shown here).
class SymSpellChecker:
    def __init__(self, config_loader: ConfigLoader, parser: Parser):
        self.__high_frequency_threshold = \
            config_loader.get_high_frequency_threshold()
        self.__parser = parser
        self.__sym_spell_filtered_file_path = \
            config_loader.get_sym_spell_filtered_file_path()
        self.__sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                    prefix_length=7)
        self.__english_dictionary = set(words.words())

    def __get_best_suggestion_term(self, suggestions, split_term):
        res = suggestions[0]

        for suggestion in suggestions:
            if suggestion._count > res._count:
                res = suggestion

        if (suggestions[0]._term == split_term
                and suggestions[0]._count
                >= res._count * self.__high_frequency_threshold):
            return split_term

        return res._term

    def __load_sym_spell_dictionary(self, dictionary_file):
        self.__sym_spell.load_dictionary(dictionary_file,
                                         term_index=0,
                                         count_index=1,
                                         separator=',')

    def __sym_spell_lookup(self, input_term):
        return self.__sym_spell.lookup(input_term,
                                       Verbosity.ALL,
                                       max_edit_distance=1,
                                       include_unknown=True)

    def __sym_spell_step(self, input_file, output_file, filter_file):
        updated_dictionary = {}
        file = open(filter_file, "w")
        queries_file = open(input_file, "r")
        for line in queries_file:
            (input_term, frequency) = line.split(",")
            split_terms = input_term.split(" ")
            best_suggested_query = ""
            for split_term in split_terms:
                if len(split_term) > 2 and split_term.isalpha() \
                        and split_term not in self.__english_dictionary:
                    suggestions = self.__sym_spell_lookup(split_term)
                    suggestion = self.__get_best_suggestion_term(
                        suggestions, split_term)
                else:
                    suggestion = split_term

                if len(best_suggested_query) > 0:
                    best_suggested_query += " "
                best_suggested_query += suggestion

            updated_dictionary[best_suggested_query] = updated_dictionary.get(
                best_suggested_query, 0) + int(frequency)
            if input_term != best_suggested_query:
                file.write(input_term + "," + best_suggested_query + "," +
                           frequency)

        file.close()
        queries_file.close()

        self.__parser.write_dictionary_to_file(updated_dictionary, output_file)

    def run_sym_spell(self, iterations, input_file, output_file,
                      sym_spell_dictionary_file):
        self.__load_sym_spell_dictionary(sym_spell_dictionary_file)
        file_name = self.__sym_spell_filtered_file_path
        self.__sym_spell_step(input_file, output_file, file_name + "1.csv")
        for i in range(iterations):
            self.__sym_spell_step(output_file, output_file,
                                  file_name + str(i + 2) + ".csv")
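
A heavily hedged wiring sketch: ConfigLoader and Parser are project-specific classes not shown here, and all file names below are hypothetical:

checker = SymSpellChecker(ConfigLoader(), Parser())
checker.run_sym_spell(iterations=2,
                      input_file="queries.csv",
                      output_file="corrected.csv",
                      sym_spell_dictionary_file="dictionary.csv")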
Example #28
    def test_negative_max_dictionary_edit_distance(self):
        with pytest.raises(ValueError) as excinfo:
            __ = SymSpell(-1, 3)
        self.assertEqual("max_dictionary_edit_distance cannot be negative",
                         str(excinfo.value))
Example #29
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
import re
import nltk
from nltk.tokenize import word_tokenize
from language_detector import detect_language

import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
if not sym_spell.word_count:
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

###################################
#### sentence level preprocess ####
###################################


# lowercase + base filter
# some basic normalization
def f_base(s):
    """
    :param s: string to be processed
    :return: processed string: see comments in the source code for more info
    """
    # normalization 1: xxxThis is a --> xxx. This is a (missing delimiter)
    s = re.sub(r'([a-z])([A-Z])', r'\1. \2', s)
    return s  # remaining normalization steps were truncated in this snippet
Example #30
    def test_lookup_compound_replaced_words(self):
        print('  - %s' % inspect.stack()[0][3])
        cwd = os.path.realpath(os.path.dirname(__file__))
        dictionary_path = os.path.realpath(
            os.path.join(cwd, pardir, "symspellpy",
                         "frequency_dictionary_en_82_765.txt"))

        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
        sym_spell.load_dictionary(dictionary_path, 0, 1)

        typo = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixthgrade and ins pired him")
        correction = ("where is the love he had dated for much of the past "
                      "who couldn't read in sixth grade and inspired him")
        replacement_1 = {
            "whereis": "where is",
            "th": "the",
            "elove": "love",
            "hehad": "he had",
            "forimuch": "for much",
            "thepast": "the past",
            "couqdn'tread": "couldn't read",
            "sixthgrade": "sixth grade",
            "ins": "in"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(len(replacement_1), len(sym_spell.replaced_words))
        for k, v in replacement_1.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)

        typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the third quarter of last year he had learned of a "
                      "secret plan")
        replacement_2 = {
            "te": "the",
            "dhird": "third",
            "qarter": "quarter",
            "oflast": "of last",
            "jear": "year",
            "hadlearned": "had learned",
            "ofca": "of a",
            "sekretplan": "secret plan"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(
            len(replacement_1) + len(replacement_2),
            len(sym_spell.replaced_words))
        for k, v in replacement_2.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)

        typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
                "of funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with plenty of fun")
        replacement_3 = {
            "bigjest": "biggest",
            "playrs": "players",
            "strogsommer": "strong summer",
            "slatew": "slate",
            "ith": "with",
            "plety": "plenty",
            "funn": "fun"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(
            len(replacement_1) + len(replacement_2) + len(replacement_3),
            len(sym_spell.replaced_words))
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        for k, v in replacement_3.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)