def main():
    """Segment a sample sentence that contains no spaces and print the result.

    Builds a SymSpell instance, loads the English frequency dictionary stored
    under ``./data`` next to this file, runs word segmentation on a fixed
    sample and prints the corrected string, the summed edit distance and the
    summed log probability.

    Prints "Dictionary file not found" and returns early when the dictionary
    cannot be loaded.
    """
    # NOTE(review): this three-positional-argument constructor presumably
    # targets a symspellpy release whose first parameter is the initial
    # capacity - confirm against the installed version.
    initial_capacity = 83000
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 0
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)

    # load dictionary (once: the object and the dictionary used to be built
    # and loaded a second time with exactly the same settings)
    dictionary_path = os.path.join(
        os.path.dirname(__file__),
        "./data/frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # a sentence without any spaces
    input_term = "thequickbrownfoxjumpsoverthelazydog"
    result = sym_spell.word_segmentation(input_term)

    # display corrected string, summed edit distance and summed log probability
    print("{}, {}, {}".format(result.corrected_string, result.distance_sum,
                              result.log_prob_sum))
class SymSpellCorrection:
    """Callable wrapper that corrects a sentence with SymSpell word segmentation."""

    def __init__(self, dictionary_path, term_index=0, count_index=1,
                 max_edit_distance_dictionary=0, prefix_length=7, **args):
        """Create the SymSpell instance and load its frequency dictionary.

        Input:
            - dictionary_path: string
            - term_index: int, column of the term in the dictionary text
              file, default is 0
            - count_index: int, column of the term frequency in the
              dictionary text file, default is 1
            - max_edit_distance_dictionary: int, maximum edit distance per
              dictionary precalculation, default is 0
            - prefix_length: int, default is 7
            - **args: accepted for call-site compatibility and ignored
        """
        # Imported lazily so the module can be imported without symspellpy.
        from symspellpy.symspellpy import SymSpell

        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        loaded = self.sym_spell.load_dictionary(dictionary_path, term_index,
                                                count_index)
        if not loaded:
            # Surface the failure instead of silently continuing with an
            # empty dictionary; construction still succeeds as before.
            print("Dictionary file not found:", dictionary_path)

    def __call__(self, sentence):
        """Return the segmented/corrected form of ``sentence``.

        Input:
            - sentence: string
        Output:
            - string: the corrected sentence; the input itself when it is
              empty or when segmentation raises an error
        """
        if not sentence:
            return sentence
        try:
            return self.sym_spell.word_segmentation(sentence).corrected_string
        except Exception:
            # Best effort: report and fall back to the untouched input.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print("Error spell correction:", sentence)
            return sentence
def main():
    """Run three SymSpell demos: word lookup, compound lookup, segmentation.

    Loads the frequency dictionary stored next to this file and prints the
    results of each demo. Prints "Dictionary file not found" and returns
    early when the dictionary cannot be loaded.
    """
    edit_distance = 2  # maximum edit distance per dictionary precalculation
    prefix_len = 7
    speller = SymSpell(edit_distance, prefix_len)

    freq_file = os.path.join(os.path.dirname(__file__),
                             "frequency_dictionary_en_82_765.txt")
    # column 0 holds the term, column 1 holds the term frequency
    loaded = speller.load_dictionary(freq_file, 0, 1)
    if not loaded:
        print("Dictionary file not found")
        return

    row = "{}, {}, {}"

    # 1) suggestions for a single misspelled word
    word = "pyth"
    # per-lookup limit; must not exceed the dictionary edit distance
    lookup_distance = 2
    mode = Verbosity.CLOSEST  # alternatives: TOP, ALL
    for hit in speller.lookup(word, mode, lookup_distance):
        # term, edit distance, term frequency
        print(row.format(hit.term, hit.distance, hit.count))

    # 2) suggestions for a multi-word string (compound splitting & merging)
    phrase = ("whereis th elove hehad dated forImuch of thepast who "
              "couqdn'tread in sixtgrade and ins pired him")
    # limit applies per single word, not to the whole input string
    compound_distance = 2
    for hit in speller.lookup_compound(phrase, compound_distance):
        # term, edit distance, term frequency
        print(row.format(hit.term, hit.distance, hit.count))

    # 3) segmentation of a sentence written without any spaces
    glued = "thequuickbrownfoxjumpsoverthelazydog"
    segmented = speller.word_segmentation(glued)
    # corrected string, summed edit distance, summed log probability
    print(row.format(segmented.corrected_string, segmented.distance_sum,
                     segmented.log_prob_sum))
def main():
    """Build a dictionary from a corpus, save it, and segment sample strings.

    Steps:
      1. Create a frequency dictionary from ``training_data.txt``.
      2. Print every ``term count`` entry and save them to ``dict.txt``,
         one entry per line.
      3. Load ``dict.txt`` into a fresh SymSpell instance.
      4. Read ``missing_spaces.txt``, split it on commas and print the
         segmentation of each field.

    Prints a message and returns early when the corpus or the dictionary
    file cannot be loaded.
    """
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    if not sym_spell.create_dictionary('training_data.txt',
                                       encoding="ISO-8859-1"):
        print("Corpus file not found")
        return

    dictlist = []
    for key, count in sym_spell.words.items():
        entry = "{} {}\n".format(key, count)
        print(entry)
        dictlist.append(entry)

    # save Dictionary: one "term count" line per entry. Writing the list's
    # repr produced a file that is not a loadable dictionary; "w" replaces
    # the file so repeated runs do not accumulate stale content.
    with open("dict.txt", "w", encoding="ISO-8859-1") as text_file:
        text_file.writelines(dictlist)
    print('Saved Dic')

    # load dictionary
    dictionary_path = os.path.join(os.path.dirname('./'), "dict.txt")
    print(dictionary_path)
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    # Use a fresh instance: the first one already holds every term from the
    # corpus. NOTE(review): loading into it would presumably add the saved
    # counts on top of the existing ones - confirm against symspellpy docs.
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # Read the file back with the encoding it was written in.
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index,
                                     encoding="ISO-8859-1"):
        print("Dictionary file not found")
        return

    # comma-separated sentences without any spaces
    with open('missing_spaces.txt', 'r', encoding="utf8") as myfile:
        data = myfile.read()
    splitline = data.split(',')

    # NOTE(review): the last comma-separated field is skipped, exactly as
    # the original loop bound did (presumably a trailing comma) - confirm.
    for strval in splitline[:-1]:
        try:
            # NOTE(review): prefix_length is passed as the third positional
            # argument of word_segmentation - confirm this is the intended
            # parameter.
            result = sym_spell.word_segmentation(
                strval, max_edit_distance_dictionary, prefix_length)
            # display the corrected string
            print("{}".format(result.corrected_string))
        except Exception as exc:
            # Best effort per field; report the real failure, not an index error.
            print("Segmentation failed for {!r}: {}".format(strval, exc))
def postprocessing(text):
    """Return ``text`` lower-cased and split into words by SymSpell.

    Loads the unigram and bigram frequency dictionaries shipped with the
    ``symspellpy`` package, then runs word segmentation on the lower-cased
    input.

    Input:
        - text: string
    Output:
        - string: the corrected string, or None (after printing a message)
          when either dictionary cannot be loaded
    """
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    # load dictionaries
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    if not sym_spell.load_dictionary(dictionary_path, term_index=0,
                                     count_index=1):
        print("Dictionary file not found")
        return
    # The bigram loader must read the bigram file; it used to be handed the
    # unigram dictionary path, leaving bigram_path unused.
    if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0,
                                            count_index=2):
        print("Bigram dictionary file not found")
        return

    result = sym_spell.word_segmentation(text.lower())
    return result.corrected_string
def main():
    """Segment a sample term with a custom dictionary and print the result.

    Loads ``dictionary_final.txt`` from the directory of this file. Prints
    "Dictionary file not found" and returns early when it cannot be loaded.
    """
    edit_distance = 0  # maximum edit distance per dictionary precalculation
    prefix_len = 7
    speller = SymSpell(edit_distance, prefix_len)

    here = os.path.dirname(__file__)
    dict_file = os.path.join(here, "dictionary_final.txt")
    # column 0 holds the term, column 1 holds the term frequency
    loaded = speller.load_dictionary(dict_file, 0, 1)
    if not loaded:
        print("Dictionary file not found")
        return

    # Sample input; longer run-together strings were also tried here, e.g.
    # "thequickbrownfoxjumpsoverthelazydog", 'universitycollegesbangalore'.
    sample = "bangalore"
    segmented = speller.word_segmentation(sample)

    # the individual words of the corrected string
    words = segmented.corrected_string.split(' ')
    print(words)

    # corrected string, summed edit distance, summed log probability
    summary = "{}, {}, {}".format(segmented.corrected_string,
                                  segmented.distance_sum,
                                  segmented.log_prob_sum)
    print(summary)
import os
import re
import string
import time

import nltk
import pandas as pd
import pkg_resources
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from symspellpy.symspellpy import SymSpell

# Script: run SymSpell word segmentation over every ".txt" file in TestData
# and write the corrected lines to a file of the same name in TestData/Temp.

# Set max_dictionary_edit_distance to increase length of spelling correction
sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=4)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

tic = time.time()
directory = r'TestData'
output_directory = os.path.join(directory, "Temp")
# Make sure the output directory exists before any file is opened in it.
os.makedirs(output_directory, exist_ok=True)

for entry in os.scandir(directory):
    if entry.path.endswith(".txt"):
        # entry.name is the bare file name on every platform; splitting the
        # path on a backslash only worked with Windows separators.
        new_path = os.path.join(output_directory, entry.name)
        # Context managers close both files even if segmentation fails.
        with open(entry.path, "r") as a_file, \
                open(new_path, "w", encoding='utf-8') as copy:
            for line in a_file:
                result = sym_spell.word_segmentation(line)
                copy.write(result.corrected_string + '\n')
        toc = time.time()
        # Elapsed time is measured from the start of the whole run.
        print("Time " + new_path + ": " + str(toc - tic))