def test_words_with_shared_prefix_should_retain_counts(self):
    """Entries sharing a prefix must each keep their own frequency count."""
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell(16, 1, 3)
    sym_spell.create_dictionary_entry("pipe", 5)
    sym_spell.create_dictionary_entry("pips", 10)
    # Every query must yield both words, closest/most frequent first.
    expectations = {
        "pipe": [("pipe", 5), ("pips", 10)],
        "pips": [("pips", 10), ("pipe", 5)],
        "pip": [("pips", 10), ("pipe", 5)],
    }
    for query, expected in expectations.items():
        result = sym_spell.lookup(query, Verbosity.ALL, 1)
        self.assertEqual(len(expected), len(result))
        for (term, count), suggestion in zip(expected, result):
            self.assertEqual(term, suggestion.term)
            self.assertEqual(count, suggestion.count)
class Autocorrect:
    """Thin convenience wrapper around a SymSpell dictionary.

    Words can be seeded at construction time or added/removed later.
    """

    def __init__(self, words=None, max_edit_distance=2):
        self._symspell = SymSpell()
        self._max_edit_distance = max_edit_distance
        if words is not None:
            self.add_words(words)

    def add_word(self, word):
        # Insert a single word with a frequency of 1.
        if word is not None:
            self._symspell.create_dictionary_entry(word, 1)

    def add_words(self, words):
        # Bulk-load words via SymSpell's corpus loader.
        if words is not None:
            self._symspell.create_dictionary(words)

    def delete_word(self, word):
        if word is not None:
            self._symspell.delete_dictionary_entry(word)

    def correct(self, bad_word):
        """Return the single best correction (the input itself if unknown)."""
        suggestions = self._symspell.lookup(
            bad_word, Verbosity.TOP,
            max_edit_distance=self._max_edit_distance,
            include_unknown=True)
        return suggestions[0].term

    def predictions(self, bad_word):
        """Return all suggestions at the smallest found edit distance."""
        return self._symspell.lookup(
            bad_word, Verbosity.CLOSEST,
            max_edit_distance=self._max_edit_distance,
            include_unknown=True)
def test_words_from_list_with_shared_prefix_should_retain_counts(self):
    """Counts accumulated from a word list must survive shared prefixes."""
    print(' - %s' % inspect.stack()[0][3])
    # 5 occurrences of "pipe" followed by 10 of "pips".
    corpus = ["pipe"] * 5 + ["pips"] * 10
    sym_spell = SymSpell(16, 1, 3, words=corpus)
    expectations = {
        "pipe": [("pipe", 5), ("pips", 10)],
        "pips": [("pips", 10), ("pipe", 5)],
        "pip": [("pips", 10), ("pipe", 5)],
    }
    for query, expected in expectations.items():
        result = sym_spell.lookup(query, Verbosity.ALL, 1)
        self.assertEqual(len(expected), len(result))
        for (term, count), suggestion in zip(expected, result):
            self.assertEqual(term, suggestion.term)
            self.assertEqual(count, suggestion.count)
def test_lookup_should_not_return_non_word_delete(self):
    """Deletes of a dictionary word are not themselves words at distance 0."""
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell(16, 2, 7, 10)
    sym_spell.create_dictionary_entry("pawn", 10)
    for fragment in ("paw", "awn"):
        self.assertEqual(
            0, len(sym_spell.lookup(fragment, Verbosity.TOP, 0)))
def test_verbosity_should_control_lookup_results(self):
    """TOP/CLOSEST/ALL must return 1, 2 and 3 suggestions respectively."""
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell()
    for term, count in (("steam", 1), ("steams", 2), ("steem", 3)):
        sym_spell.create_dictionary_entry(term, count)
    for verbosity, expected_len in ((Verbosity.TOP, 1),
                                    (Verbosity.CLOSEST, 2),
                                    (Verbosity.ALL, 3)):
        result = sym_spell.lookup("steems", verbosity, 2)
        self.assertEqual(expected_len, len(result))
def test_add_additional_counts_should_increase_count(self):
    """Re-adding an existing entry accumulates its frequency count."""
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell()
    word = "hello"
    sym_spell.create_dictionary_entry(word, 11)
    result = sym_spell.lookup(word, Verbosity.TOP)
    # Exactly one suggestion is expected; any other length fails via 0.
    self.assertEqual(11, result[0].count if len(result) == 1 else 0)
    sym_spell.create_dictionary_entry(word, 3)
    result = sym_spell.lookup(word, Verbosity.TOP)
    self.assertEqual(11 + 3, result[0].count if len(result) == 1 else 0)
def test_add_additional_counts_should_not_overflow(self):
    """Counts must saturate at sys.maxsize rather than overflow."""
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell()
    word = "hello"
    sym_spell.create_dictionary_entry(word, sys.maxsize - 10)
    result = sym_spell.lookup(word, Verbosity.TOP)
    self.assertEqual(sys.maxsize - 10,
                     result[0].count if len(result) == 1 else 0)
    # Adding 11 more would exceed maxsize; the count must clamp instead.
    sym_spell.create_dictionary_entry(word, 11)
    result = sym_spell.lookup(word, Verbosity.TOP)
    self.assertEqual(sys.maxsize,
                     result[0].count if len(result) == 1 else 0)
def main():
    """Load the demo dictionary and print suggestions for one misspelling."""
    max_edit_distance_dictionary = 2  # per-dictionary precalculation
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "corpus/dictionary/dictionary.txt")
    # dictionary_path = os.path.join(os.path.dirname(__file__), "corpus/symspellpy/frequency_dictionary_en_82_765.txt")
    term_index = 0   # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    input_term = "bangeeet"  # misspelling
    # per-lookup distance must not exceed the dictionary precalculation
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    # Print each suggestion as: term, edit distance, frequency.
    for suggestion in sym_spell.lookup(input_term, suggestion_verbosity,
                                       max_edit_distance_lookup):
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
def test_lookup_should_replicate_noisy_results(self):
    """Total suggestion count over the noisy query corpus is reproducible."""
    print(' - %s' % inspect.stack()[0][3])
    cwd = path.realpath(path.dirname(__file__))
    dictionary_path = path.realpath(
        path.join(cwd, pardir, "symspellpy",
                  "frequency_dictionary_en_82_765.txt"))
    query_path = path.join(cwd, "fortests", "noisy_query_en_1000.txt")
    edit_distance_max = 2
    prefix_length = 7
    verbosity = Verbosity.CLOSEST
    sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
    sym_spell.load_dictionary(dictionary_path, 0, 1)
    # Keep the first token of every line that has at least two columns.
    with open(query_path, "r") as infile:
        test_list = [parts[0]
                     for parts in (line.rstrip().split(" ")
                                   for line in infile)
                     if len(parts) >= 2]
    result_sum = sum(
        len(sym_spell.lookup(phrase, verbosity, edit_distance_max))
        for phrase in test_list)
    self.assertEqual(4945, result_sum)
class SpellCorrector():
    """Single-word spell corrector backed by a SymSpell dictionary.

    Calling the instance (``corrector(word)``) collapses repeated letters
    and returns the closest dictionary suggestion, or the word unchanged
    when it is short, contains an apostrophe, or has no suggestion.
    """

    def __init__(self, max_edit_distance_dictionary=2, prefix_length=7):
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        # load dictionary expected next to the parent directory
        dictionary_path = os.path.join(os.path.dirname('../'),
                                       "frequency_dictionary_en_82_765.txt")
        term_index = 0   # column of the term in the dictionary text file
        count_index = 1  # column of the term frequency in the dictionary text file
        if not self.sym_spell.load_dictionary(dictionary_path, term_index,
                                              count_index):
            # BUG FIX: ``raise("Dictionary file not found")`` raised a
            # TypeError (strings are not exceptions). Raise a real
            # exception type so callers can catch it meaningfully.
            raise FileNotFoundError("Dictionary file not found")
        # manually
        # this works. about 0.003 up
        # self.corr_dict = {"awsome": "awesome"}

    def reduce_lengthening(self, text):
        """Collapse runs of 3 or more identical characters down to two."""
        pattern = re.compile(r"(.)\1{2,}")
        return pattern.sub(r"\1\1", text)

    def strip_punc(self, word):
        """Strip one trailing '-', '_', '.' or '!' from *word*."""
        return re.sub(r"[\-\_\.\!]$", "", word)

    def __call__(self, word):
        word = self.reduce_lengthening(word)
        # if word in self.corr_dict:
        #     word = self.corr_dict[word]
        if len(word) > 2 and "'" not in word:
            suggestions = self.sym_spell.lookup(word, Verbosity.CLOSEST, 2)
            if suggestions:
                return suggestions[0].term
        return word
def spelling_correction(data, column):
    """Return a DataFrame mapping each word of *data[column]* to its top
    SymSpell replacement.

    Parameters:
        data: pandas DataFrame containing the text column.
        column: name of the column whose whitespace-separated words are
            spell-checked.

    Returns a DataFrame with columns 'Original Word' and 'Replacement'.
    """
    from symspellpy.symspellpy import SymSpell, Verbosity

    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    dictionary_path = "frequency_dictionary_en_82_765.txt"
    term_index = 0   # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")

    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL

    # BUG FIX: DataFrame.append was deprecated and removed in pandas 2.0,
    # and appending one row at a time is quadratic. Collect plain dicts
    # and build the frame once at the end instead.
    rows = []
    for _, row in data.iterrows():
        text = row[column]
        # max edit distance per lookup
        # (max_edit_distance_lookup <= max_edit_distance_dictionary)
        for input_term in text.split():
            suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                           max_edit_distance_lookup)
            if suggestions:
                rows.append({'Original Word': input_term,
                             'Replacement': suggestions[0].term})
    return pd.DataFrame(rows, columns=['Original Word', 'Replacement'])
def test_lookup_should_not_return_low_count_word_that_are_also_delete_word(
        self):
    """A below-threshold word that is also a delete of another word is
    filtered out of lookup results."""
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell(16, 2, 7, 10)
    for term, count in (("flame", 20), ("flam", 1)):
        sym_spell.create_dictionary_entry(term, count)
    self.assertEqual(0, len(sym_spell.lookup("flam", Verbosity.TOP, 0)))
def test_lookup_should_find_exact_match(self):
    """TOP lookup returns the single nearest dictionary term."""
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell()
    for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
        sym_spell.create_dictionary_entry(term, count)
    result = sym_spell.lookup("streama", Verbosity.TOP, 2)
    self.assertEqual(1, len(result))
    self.assertEqual("steama", result[0].term)
def symspell_test(tokenpos_list, max_edit_distance_lookup=3,
                  initial_capacity=83000, max_edit_distance_dictionary=3,
                  prefix_length=7, term_index=0, count_index=1):
    """Spell-check a list of (word, POS) tuples with SymSpell.

    Proper nouns ('PROPN') and words shorter than 3 characters pass
    through unchanged and are also collected in ``proper_noun``.

    Key-word arguments are:
    ** max_edit_distance_lookup : (Recommended maximum = 3)
    ** term_index : term column in dictionary (0)
    ** count_index : frequency column in dictionary (1)

    Returns (suggestion_list, proper_noun) on success, an error string if
    the dictionary cannot be loaded, or 405 on a type error.
    """
    print('\n{} \nBegin \'Symspellpy\' testing \n'.format('#' * 20))
    try:
        sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                             prefix_length)
        suggestion_verbosity = Verbosity.CLOSEST
        dictionary_path = os.path.join(os.path.dirname(__file__),
                                       "frequency_dictionary_en_82_765.txt")
        if not sym_spell.load_dictionary(dictionary_path, term_index,
                                         count_index):
            print("Dictionary file not found")
            return 'Error loading dictionary file'
        suggestion_list = []
        proper_noun = []
        for (word, pos) in tokenpos_list:
            if pos == 'PROPN' or len(word) < 3:
                # keep proper nouns and very short tokens untouched
                suggestion_list.append(word)
                proper_noun.append(word)
                continue
            suggestions = sym_spell.lookup(word, suggestion_verbosity,
                                           max_edit_distance_lookup)
            # BUG FIX: indexing suggestions[0] raised IndexError whenever
            # no suggestion was found; fall back to the original word.
            if not suggestions:
                suggestion_list.append(word)
                continue
            suggestion = suggestions[0]
            # display suggestion term, term frequency, and edit distance
            print("input_term = {}, suggestion_term = {}, "
                  "suggestion_count = {}, suggestion_distance = {}".format(
                      word, suggestion.term, suggestion.count,
                      suggestion.distance))
            suggestion_list.append(suggestion.term)
        print("\n\nThe corrected sentence is : {}".format(
            ' '.join(suggestion_list)))
        print(suggestion_list)
        print(proper_noun)
        return suggestion_list, proper_noun
    except TypeError as error:
        print(f'Invalid type : {error}')
        return 405
def test_lookup_should_return_most_frequent(self):
    """Among equal-distance candidates, TOP returns the most frequent."""
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell()
    for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
        sym_spell.create_dictionary_entry(term, count)
    result = sym_spell.lookup("stream", Verbosity.TOP, 2)
    self.assertEqual(1, len(result))
    self.assertEqual("steamb", result[0].term)
    self.assertEqual(6, result[0].count)
def common_keywords(text):
    """Count keyword/filter terms matched against the spell-corrected
    query built from *text*.

    Returns the number of matched keyword/filter entries.
    """
    keyword_data = pd.read_csv('D:/ML/QNA_project/CSV_files/keywords.csv')
    filter_data = pd.read_csv('D:/ML/QNA_project/CSV_files/filters.csv')
    # text = "he lives in bangalor1"
    text = text.lower()
    tokens = text.split(' ')
    print(tokens)
    max_edit_distance_dictionary = 2
    prefix_length = 9
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "dictionary_final.txt")
    term_index = 0   # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST
    ques = ""
    for token in tokens:
        # BUG FIX: lookup() was called as lookup(term, max_edit_distance),
        # so the edit distance was silently consumed as the *verbosity*
        # argument and suggestion_verbosity was never used. Pass both
        # explicitly, and replace the bare except with an emptiness check.
        suggestions = sym_spell.lookup(token, suggestion_verbosity,
                                       max_edit_distance_lookup)
        if suggestions:
            ques = ques + suggestions[0].term + " "
        else:
            ques = ques + token + " "
    ques = ques + text
    # print(ques)
    words = []
    # NOTE(review): find(term, 0, len(term)) only matches at the very start
    # of ``ques`` (equivalent to startswith) — confirm this is intentional.
    for i in range(len(keyword_data)):
        term = keyword_data['Keywords'][i].lower()
        if ques.find(term, 0, len(term)) != -1:
            words.append(term)
    for i in range(len(filter_data)):
        term = filter_data['Filters'][i].lower()
        if ques.find(term, 0, len(term)) != -1:
            words.append(term)
    return len(words)
def main():
    """UDP service: receive a term per datagram, reply with the number of
    suggestions followed by each suggested term."""
    initial_capacity = 83000
    max_edit_distance_dictionary = 2  # per-dictionary precalculation
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0   # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
        s.bind((host, port))
        while True:
            try:
                # Network input: one single-word term per datagram.
                raw, source = s.recvfrom(1024)
                input_term = raw.decode()
                print("Test2 Input: {}".format(input_term))
                # per-lookup distance <= dictionary precalculation distance
                max_edit_distance_lookup = 2
                suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
                suggestions = sym_spell.lookup(input_term,
                                               suggestion_verbosity,
                                               max_edit_distance_lookup)
                for suggestion in suggestions:
                    print("Test2 Output: {}, {}, {}".format(
                        suggestion.term, suggestion.count,
                        suggestion.distance))
                # s.sendto(suggestions[0].term.encode(), source)
                # Reply protocol: first the count, then each term.
                s.sendto(str(len(suggestions)).encode(), source)
                for suggestion in suggestions:
                    s.sendto(suggestion.term.encode(), source)
            except Exception as e:
                print(e)
                time.sleep(1)
def main():
    """Demo: single-word lookup, compound lookup and word segmentation."""
    max_edit_distance_dictionary = 2  # per-dictionary precalculation
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0   # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # --- single-word lookup -------------------------------------------
    input_term = "pyth"  # misspelling of "members"
    # per-lookup distance must not exceed the dictionary precalculation
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    for suggestion in sym_spell.lookup(input_term, suggestion_verbosity,
                                       max_edit_distance_lookup):
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))

    # --- multi-word lookup (supports compound splitting & merging) ----
    input_term = ("whereis th elove hehad dated forImuch of thepast who "
                  "couqdn'tread in sixtgrade and ins pired him")
    # max edit distance per single word, not per whole input string
    max_edit_distance_lookup = 2
    for suggestion in sym_spell.lookup_compound(input_term,
                                                max_edit_distance_lookup):
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))

    # --- segmentation of a sentence without any spaces ----------------
    input_term = "thequuickbrownfoxjumpsoverthelazydog"
    result = sym_spell.word_segmentation(input_term)
    print("{}, {}, {}".format(result.corrected_string, result.distance_sum,
                              result.log_prob_sum))
def correction(input_term):
    """Return deduplicated SymSpell suggestions (single-word + compound)
    for *input_term*, sorted by edit distance.

    Returns None when the dictionary is missing or when every suggestion
    equals the input (no correction needed).
    """
    import collections

    sym_spell = SymSpell()
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0   # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                   max_edit_distance_lookup)
    suggestions.extend(
        sym_spell.lookup_compound(input_term, max_edit_distance_lookup))
    suggestions.sort(key=lambda sug: sug.distance)

    # Drop duplicate terms, keeping the first (lowest-distance) occurrence.
    unique = collections.OrderedDict()
    for sug in suggestions:
        unique.setdefault(sug.term, sug)
    suggestions = list(unique.values())

    # If every unique suggestion equals the input, no correction is needed.
    corrections = [sug for sug in suggestions if sug.term != input_term]
    if not corrections:
        return
    # for suggestion in suggestions:
    #     print("{}, {}, {}".format(suggestion.term, suggestion.distance,
    #                               suggestion.count))
    return suggestions
def main():
    """Spell-correct the 'Final_words' CSV column and report word counts.

    Rows 0, 51124 and 65070 are skipped (hard-coded known-bad entries);
    when SymSpell returns no suggestion the original word is kept.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 9
    data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_words_total_rd2.csv')
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0   # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    corrected = []
    print('original')
    for i in range(len(data)):
        if i in (0, 51124, 65070):  # skip known-bad rows
            continue
        input_term = data['Final_words'][i]
        suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                       max_edit_distance_lookup)
        print(i)
        # BUG FIX: the fallback branch previously appended the word WITHOUT
        # a trailing space, so the final ``s[:-1]`` strip could eat a real
        # character; it also used a bare ``except``. Collecting words in a
        # list and joining once fixes both and avoids quadratic concat.
        corrected.append(str(suggestions[0].term) if suggestions
                         else input_term)
    words = ' '.join(corrected).split(' ')
    print('After')
    print(len(words))
def extract_misspellings(s):
    """Correct misspelled words in *s* with SymSpell.

    Returns (misspelled_per_word_ratio, corrected_text). The module-level
    ``sym_spell`` instance is initialized lazily on first call.
    """
    global sym_spell
    if sym_spell is None:
        # Initialize SymSpell checker
        # maximum edit distance per dictionary precalculation
        max_edit_distance_dictionary = 2
        prefix_length = 7
        sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        # term_index is the column of the term and count_index is the
        # column of the term frequency
        if not sym_spell.load_dictionary(
                dictionary_path, term_index=0, count_index=1):
            print("Dictionary file not found")
        if not sym_spell.load_bigram_dictionary(
                bigram_path, term_index=0, count_index=2):
            print("Bigram dictionary file not found")
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    # Start correcting word by word
    article_text = s.split()
    # BUG FIX: empty input previously raised ZeroDivisionError below.
    if not article_text:
        return 0.0, s
    misspelled = 0
    for word in article_text:
        word = word.strip()
        suggestions = sym_spell.lookup(word, suggestion_verbosity,
                                       max_edit_distance_lookup)
        # Correct the text
        if len(suggestions) == 0:
            continue
        sug = suggestions[0]
        if sug.term != word:
            # BUG FIX: the word must be re.escape()d — tokens containing
            # regex metacharacters (e.g. "(a", "c++") previously raised or
            # matched the wrong text. Raw strings also avoid the invalid
            # "\s" escape warning.
            s = re.sub(r"\s+" + re.escape(word) + r"\s+",
                       " " + sug.term + " ", s)
            misspelled = misspelled + 1
    mpw = misspelled / len(article_text)
    return mpw, s
def main():
    """Demo: single-word and compound lookup against a custom dictionary."""
    initial_capacity = 83000
    max_edit_distance_dictionary = 2  # per-dictionary precalculation
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    dictionary_path = os.path.join(
        "/Users/meheresh/Documents/cm_spellchecker/spellcheck/data",
        "freqdict.txt")
    term_index = 0   # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # --- single-word lookup -------------------------------------------
    input_term = "memebers"  # misspelling of "members"
    # per-lookup distance must not exceed the dictionary precalculation
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    for suggestion in sym_spell.lookup(input_term, suggestion_verbosity,
                                       max_edit_distance_lookup):
        print("{}, {}, {}".format(suggestion.term, suggestion.count,
                                  suggestion.distance))

    # --- multi-word lookup (supports compound splitting & merging) ----
    input_term = ("whereis th elove hehad dated forImuch of thepast who "
                  "couqdn'tread in sixtgrade and ins pired him")
    # max edit distance per single word, not per whole input string
    max_edit_distance_lookup = 2
    for suggestion in sym_spell.lookup_compound(input_term,
                                                max_edit_distance_lookup):
        print("{}, {}, {}".format(suggestion.term, suggestion.count,
                                  suggestion.distance))
def main(argv):
    """Append an exercise list for a (possibly misspelled) category to the
    given markdown file.

    argv layout: [script, "<categoria>", <markdown file>].
    """
    if len(argv) == 3:
        # FIX: renamed from ``input`` — do not shadow the builtin.
        category = argv[1]
        markdown = argv[2]
    else:
        print('usage:\n python .py "<categoria>" <markdown gerado>')
        return
    initial_capacity = 83000
    max_edit_distance_dictionary = 3
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    dictionary_path = "category_count.txt"
    term_index = 0   # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # Map category -> a2oj id (third column of the dictionary file).
    # FIX: use context managers so both files are closed even if an
    # exception is raised mid-way (they previously leaked on error).
    d = defaultdict(lambda: 0)
    with open(dictionary_path, 'r') as categorys:
        for x in categorys:
            z = x.split(' ')
            d[z[0]] = z[2]
    with open(markdown, 'a') as f:
        f.write('\n## Lista de Exercicios - %s\n' % category.capitalize())
        category = category.lower()
        suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
        inputs = category.split(' ')
        # Longer words tolerate a larger lookup edit distance.
        total_avg = sum(map(len, inputs)) / len(inputs)
        max_edit_distance_lookup = 3 if total_avg > 4 else 2
        for input_term in inputs:
            for suggestion in sym_spell.lookup(input_term,
                                               suggestion_verbosity,
                                               max_edit_distance_lookup):
                f.write("* {}, https://a2oj.com/{}".format(
                    suggestion.term.capitalize(), d[suggestion.term]))
def check_spelling(content):
    """Return {misspelled_word: top_suggestion} for every token of
    *content* (tokenized with the module-level spaCy ``nlp``).

    Returns None when the frequency dictionary cannot be loaded.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    # currently unused — the bigram lookup below is disabled
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    if not sym_spell.load_dictionary(
            dictionary_path, term_index=0, count_index=1):
        print("Dictionary file not found")
        return
    # if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0,
    #                                         count_index=2):
    #     print("Bigram dictionary file not found")
    #     return
    doc = nlp(content)
    suggest = {}
    for word in doc:
        print("content", word.text)
        suggestions = sym_spell.lookup(word.text, Verbosity.TOP,
                                       max_edit_distance=2,
                                       include_unknown=False)
        for suggestion in suggestions:
            # FIX: use the public SuggestItem API (.distance / .term)
            # instead of the private ._distance / ._term attributes.
            if suggestion.distance > 0:
                suggest[word.text] = suggestion.term
    print(suggest)
    return suggest
def spellcheck(text):
    """Tokenize *text*, expand a few common contraction remnants, and
    replace every remaining word with its TOP SymSpell suggestion
    (keeping the word unchanged when no suggestion exists)."""
    max_dictionary_edit_distance = 2
    prefix_length = 7
    sym_spell = SymSpell(
        max_dictionary_edit_distance=max_dictionary_edit_distance,
        prefix_length=prefix_length)
    dictionary_path = '6._Ranking/tools_for_spellcheck/frequency_dictionary_en_82_765.txt'
    term_index = 0
    count_index = 1
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # Expansions for contraction remnants left after tokenization.
    expansions = {'ive': ' i have', 'id': ' i would',
                  'im': ' i am', 'dont': ' do not'}
    pieces = []
    newtext = expandContractions(text)
    for item in nltk.word_tokenize(newtext.lower()):
        if item in '.,:;?!-':
            pieces.append(item)          # punctuation: no leading space
        elif item == 'i':
            pieces.append(' ' + item)
        elif item in expansions:
            pieces.append(expansions[item])
        else:
            input_term = item
            max_edit_distance_lookup = 2
            suggestion_verbosity = Verbosity.TOP  # TOP, CLOSEST, ALL
            suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                           max_edit_distance_lookup)
            if not suggestions:
                # No suggestion found: keep the word as is so that
                # nothing gets silently deleted.
                pieces.append(' ' + input_term)
            else:
                for suggestion in suggestions:
                    pieces.append(' ' + str(suggestion.term))
    return ''.join(pieces)
def main():
    """Demo: single-word lookup then compound lookup of an OCR-like string."""
    max_edit_distance_dictionary = 2  # per-dictionary precalculation
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = os.path.join(
        os.path.dirname(__file__),
        "/home/raghu/Downloads/frequency_dictionary_en_82_765.txt")
    term_index = 0   # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # --- single-word lookup -------------------------------------------
    input_term = "memebers"  # misspelling of "members"
    # per-lookup distance must not exceed the dictionary precalculation
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    for suggestion in sym_spell.lookup(input_term, suggestion_verbosity,
                                       max_edit_distance_lookup):
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))

    # --- multi-word lookup (supports compound splitting & merging) ----
    input_term = ("AGUDATA OF BIRTH")
    # max edit distance per single word, not per whole input string
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(input_term,
                                            max_edit_distance_lookup)
    print(suggestions)
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
def _symspell(self, sentences):
    """
    SymSpell tool to spelling correction through Symmetric Delete spelling algorithm.

    Reference:
        Author: Wolf Garbe <*****@*****.**>
        Description: https://medium.com/@wolfgarbe/1000x-faster-spelling-correction-algorithm-2012-8701fcd87a5f
        URL: https://github.com/wolfgarbe/symspell
        Python module: symspellpy (https://github.com/mammothb/symspellpy)
    """
    symspell = SymSpell(max_dictionary_edit_distance=self.N)
    # Build the frequency dictionary from the corpus, persist it, reload it.
    symspell.create_dictionary(self.corpus_path)
    with open(self.dictionary_path, "w") as f:
        f.writelines(f"{key} {count}\n"
                     for key, count in symspell.words.items())
    symspell.load_dictionary(self.dictionary_path, term_index=0,
                             count_index=1)

    if not isinstance(sentences, list):
        sentences = [sentences]

    predicts = []
    for sentence in sentences:
        corrected = []
        for token in sentence.split():
            if token in string.punctuation:
                sugg = None  # leave punctuation untouched
            else:
                sugg = symspell.lookup(token.lower(), verbosity=0,
                                       max_edit_distance=self.N,
                                       transfer_casing=True)
            corrected.append(sugg[0].term if sugg else token)
        predicts.append(" ".join(corrected))
    return predicts
def main(argv):
    """Append an exercise list for a (possibly misspelled) category to the
    given markdown file.

    argv layout: [script, "<categoria>", <markdown file>].
    """
    if len(argv) == 3:
        # FIX: renamed from ``input`` — do not shadow the builtin.
        category = argv[1]
        markdown = argv[2]
    else:
        print('usage:\n python .py "<categoria>" <markdown gerado>')
        return
    initial_capacity = 83000
    max_edit_distance_dictionary = 3
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    dictionary_path = "category_count.txt"
    term_index = 0   # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # Map category -> a2oj id (third column of the dictionary file).
    # FIX: use context managers so both files are closed even if an
    # exception is raised mid-way (they previously leaked on error).
    d = defaultdict(lambda: 0)
    with open(dictionary_path, 'r') as categorys:
        for x in categorys:
            z = x.split(' ')
            d[z[0]] = z[2]
    with open(markdown, 'a') as f:
        f.write('\n## Lista de Exercicios - %s\n' % category.capitalize())
        category = category.lower()
        suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
        inputs = category.split(' ')
        # Longer words tolerate a larger lookup edit distance.
        total_avg = sum(map(len, inputs)) / len(inputs)
        max_edit_distance_lookup = 3 if total_avg > 4 else 2
        for input_term in inputs:
            for suggestion in sym_spell.lookup(input_term,
                                               suggestion_verbosity,
                                               max_edit_distance_lookup):
                f.write("* {}, https://a2oj.com/{}".format(
                    suggestion.term.capitalize(), d[suggestion.term]))
def process(input_string):
    """Look up *input_string* with SymSpell and return a list of
    (term, distance, count) tuples.

    Returns None when either frequency dictionary cannot be loaded.
    """
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    if not sym_spell.load_dictionary(
            dictionary_path, term_index=0, count_index=1):
        print("Dictionary file not found")
        return
    if not sym_spell.load_bigram_dictionary(
            bigram_path, term_index=0, count_index=2):
        print("Bigram dictionary file not found")
        return
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST
    suggestions = sym_spell.lookup(input_string, suggestion_verbosity,
                                   max_edit_distance_lookup)
    return [(sug.term, sug.distance, sug.count) for sug in suggestions]
def symspell_test(tokenpos_list: list, ignore_length=2,
                  max_edit_distance_lookup=2, initial_capacity=83000,
                  max_edit_distance_dictionary=2, prefix_length=7,
                  suggestion_verbosity=Verbosity.TOP) -> list:
    """
    keyword arguments are:
        suggestion_verbosity =
            TOP: Top suggestion with smallest edit distance with highest term frequency.
            CLOSEST: All suggestions of smallest edit distance found ordered by frequency.
            ALL: All suggestions within maxEditDistance.
    :return: list of suggested corrections, list of ignored words
    :return: 410 Error: Wrong input type! (Expected list of 2 element tuples)
    """
    try:
        sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                             prefix_length)
        check_symspell_dictionary(sym_spell)
        suggestion_list = []
        intact_words = []
        for (word, pos) in tokenpos_list:
            if pos == 'PROPN' or len(word) <= ignore_length:
                # proper nouns and short tokens pass through unchanged
                suggestion_list.append(word)
                intact_words.append(word)
            else:
                suggestions = sym_spell.lookup(word, suggestion_verbosity,
                                               max_edit_distance_lookup)
                # BUG FIX: the suggestion list may be empty, and the
                # previous ``list(suggestions)[0]`` raised an IndexError
                # that the except clause below did not catch. Keep the
                # original word in that case.
                suggestion_list.append(
                    suggestions[0].term if suggestions else word)
        return suggestion_list, intact_words
    except (ValueError, TypeError):
        logging.error('Invalid type! Type List of tuples expected as input.')
        return 410
def main():
    """Generate n corrupted word samples and write SymSpell's suggestions
    for each to performance_sym.txt.

    Output format per line: correct <TAB> corrupted <TAB> suggestions
    separated by spaces.
    """
    dictionary = 'Dictionary_symspell_50_clusters.txt'
    performance_sym = 'performance_sym.txt'
    n = 100
    # First column of each dictionary line is the word itself.
    word_list = []
    with open(dictionary, 'r') as f:
        for line in f:
            word_list.append(line.strip("\n").split(' ')[0])
    initial_capacity = 600
    max_edit_distance_dictionary = 3
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    term_index = 0
    count_index = 1
    dictionary_path = os.path.join(os.path.dirname(__file__), dictionary)
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    number_of_wrong_char = 2
    number_of_word = 2
    # FIX: the output file was opened without a context manager and never
    # closed (and leaked on the early return above); the unused
    # ``input_term`` variable was removed.
    with open(performance_sym, 'w') as fout:
        for _ in range(n):
            c, w = gen_word(number_of_word, number_of_wrong_char, word_list)
            max_edit_distance_lookup = 2
            suggestion_verbosity = Verbosity.CLOSEST
            # print ("correct word : " + c)
            fout.write(c + "\t" + w + "\t")
            suggestions = sym_spell.lookup(w, suggestion_verbosity,
                                           max_edit_distance_lookup)
            for suggestion in suggestions:
                fout.write(suggestion.term + " ")
            fout.write("\n")