def initialize(self):
    """Create every helper object the text cleaner relies on.

    In order, this sets up the smart-contractions model, the English
    stop-word sets (full, and with the negations removed), the POS-tag
    set, the WordNet lemmatizer and the SymSpell spell checker.  A
    progress message is printed before each step.
    """
    print("Initializing Text Cleaner..")

    print("Initializing Smart Contractions Module..")
    self.cont = Contractions(self.embedding_for_smart_contraction)
    self.cont.load_models()

    print("Initializing Stopwords Module..")
    self.stop_words = set(stopwords.words('english'))
    # Second stop-word set that leaves the negation words out, so that
    # removing stop words with it keeps "no", "nor" and "not" in the text.
    self.stop_words_without_negation = self.stop_words - {'no', 'nor', 'not'}
    self.pos_tags_set_1 = {'NNP'}

    print("Initializing Wordnet Lemmatizer Module..")
    self.wnl = WordNetLemmatizer()

    print("Initializing Spellcheck Module..")
    max_edit_distance_dictionary = 2
    prefix_length = 7
    self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # Resolve the dictionary relative to the current working directory.
    # os.path.join picks the right separator on every platform (the
    # previous hard-coded "\\" only worked on Windows).
    dictionary_path = os.path.join(os.path.abspath(''),
                                   self.spell_dictonarypath)
    self.sym_spell.load_dictionary(dictionary_path, 0, 1)

    print("Initialization complete!")
def test_lookup_should_replicate_noisy_results(self):
    """Sum the suggestion counts over the noisy query file and expect 4945."""
    print(' - %s' % inspect.stack()[0][3])

    here = path.realpath(path.dirname(__file__))
    dictionary_path = path.realpath(
        path.join(here, pardir, "symspellpy",
                  "frequency_dictionary_en_82_765.txt"))
    query_path = path.join(here, "fortests", "noisy_query_en_1000.txt")

    edit_distance_max = 2
    prefix_length = 7
    verbosity = Verbosity.CLOSEST
    sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
    sym_spell.load_dictionary(dictionary_path, 0, 1)

    # Only lines with at least two space-separated fields contribute;
    # the first field of each is the phrase to look up.
    with open(query_path, "r") as infile:
        split_lines = [line.rstrip().split(" ") for line in infile]
    test_list = [fields[0] for fields in split_lines if len(fields) >= 2]

    result_sum = sum(
        len(sym_spell.lookup(phrase, verbosity, edit_distance_max))
        for phrase in test_list)
    self.assertEqual(4945, result_sum)
def spell_correction(texte):
    """Run SymSpell compound correction on ``texte`` and return the result.

    A fresh SymSpell instance is built on every call from the unigram
    file ``../ressources/fr-100k.txt`` and the bigram file bundled with
    the ``symspellpy`` package.  Every suggestion is printed.

    Returns the term of the first suggestion, ``texte`` itself when there
    is no suggestion, or ``None`` when either dictionary cannot be loaded.
    """
    sym_spell = SymSpell(2, 7)

    dictionary_path = "../ressources/fr-100k.txt"
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

    unigrams_loaded = sym_spell.load_dictionary(
        dictionary_path, term_index=0, count_index=1)
    if not unigrams_loaded:
        print("Dictionary file not found")
        return

    bigrams_loaded = sym_spell.load_bigram_dictionary(
        bigram_path, term_index=0, count_index=2)
    if not bigrams_loaded:
        print("Bigram dictionary file not found")
        return

    # The edit-distance limit applies per word, not to the whole input.
    suggestions = sym_spell.lookup_compound(texte, 2)

    # Show term, edit distance and term frequency of each suggestion.
    for candidate in suggestions:
        print("{}, {}, {}".format(candidate.term, candidate.distance,
                                  candidate.count))

    if not suggestions:
        print("error with : ", texte)
        return texte
    return suggestions[0].term
def correctly_spelled(data, max_edit_distance_lookup=None):
    """Spell-correct the tokens of ``data[2]`` and join them with spaces.

    The module-level ``sym_speller`` is created and loaded on first use;
    if its dictionary cannot be loaded an error is printed and the whole
    process exits through ``os._exit(1)``.

    A token is passed through unchanged when it is not purely alphabetic,
    when it equals ``data[0]`` or ``data[1]``, or when ``data[1]`` has at
    least three words and the token equals the first letters of those
    words joined together.  Every other token is replaced by the first
    ``lookup_compound`` suggestion.

    ``max_edit_distance_lookup`` defaults to the module-level
    ``max_edit_distance_dictionary`` when it is ``None``.
    """
    global sym_speller  # shared, lazily initialised speller

    if sym_speller is None:
        sym_speller = SymSpell(max_edit_distance_dictionary, prefix_length)
        sym_spell_dict_path = os.path.join(
            os.path.dirname(__file__), "frequency_dictionary_en_82_765.txt")
        # Columns in the dictionary file: 0 = term, 1 = frequency.
        loaded = sym_speller.load_dictionary(sym_spell_dict_path, 0, 1)
        if not loaded:
            print("ERROR! SymSpellPy dictionary not found at following path:",
                  sym_spell_dict_path)
            os._exit(1)

    if max_edit_distance_lookup is None:
        max_edit_distance_lookup = max_edit_distance_dictionary

    def _leave_untouched(token):
        # Checks run in the same order as a short-circuiting boolean chain.
        if not token.isalpha():
            return True
        if token == data[0] or token == data[1]:
            return True
        words = data[1].split()
        if len(words) >= 3:
            return "".join([w[0] for w in words]) == token
        return False

    corrected = []
    for token in tokenized(data[2]):
        if _leave_untouched(token):
            corrected.append(token)
        else:
            best = sym_speller.lookup_compound(
                token, max_edit_distance_lookup)[0]
            corrected.append(best.term)
    return " ".join(corrected)
class SymSpellCorrection:
    """Sentence-level correction callable backed by SymSpell word segmentation."""

    def __init__(self, dictionary_path, term_index=0, count_index=1,
                 max_edit_distance_dictionary=0, prefix_length=7, **args):
        """Build the SymSpell instance and load its dictionary.

        Input:
            - dictionary_path: string
            - term_index: int, column of the term in the dictionary text
              file, default is 0
            - count_index: int, column of the term frequency in the
              dictionary text file, default is 1
            - max_edit_distance_dictionary: int, maximum edit distance per
              dictionary precalculation, default is 0
            - prefix_length: int, default is 7
            - **args: accepted and ignored
        """
        # Imported lazily so the module can be imported without symspellpy.
        from symspellpy.symspellpy import SymSpell
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        self.sym_spell.load_dictionary(dictionary_path, term_index,
                                       count_index)

    def __call__(self, sentence):
        """Return the corrected form of ``sentence``.

        Input:
            - sentence: string
        Output:
            - string; the input itself when it is empty or when the
              correction fails
        """
        if len(sentence) < 1:
            return sentence
        try:
            corrected = self.sym_spell.word_segmentation(
                sentence).corrected_string
        except Exception:
            # Best effort: report and fall back to the input.  Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print("Error spell correction:", sentence)
            corrected = sentence
        return corrected
def __init__(self, lm, max_ed=4, prefix_length=7, l=1,
             channel_method_poisson=True, channel_prob_param=0.02):
    """Store the settings and wire the corrector to the given language model.

    The scoring, beam-initialisation and history-update callables, the
    ``skipstart``/``skipend`` values and the tokenizer are chosen by
    whether ``lm`` is a ``GPT2LMHeadModel`` or not.  The SymSpell
    dictionary is then filled from the model's vocabulary: every GPT-2
    sub-word with count 1, or every n-gram vocabulary word with its count
    from ``lm.counts``.
    """
    self.show_progress = False
    self.lm = lm
    self.l = l
    self.channel_method_poisson = channel_method_poisson
    self.channel_prob_param = channel_prob_param
    self.sym_spell = SymSpell(max_ed, prefix_length)

    if not isinstance(self.lm, GPT2LMHeadModel):
        # n-gram language model
        self.lm_sent_logscore = self.ngram_sent_logscore
        self.beam_init = self.beam_ngram_init
        self.skipstart = self.lm.order - 1
        self.skipend = None
        self.update_sentence_history = self.updatengramhistory
        self.tokenizer = ngramTokenizer(self.lm)
        for word in lm.vocab:
            self.sym_spell.create_dictionary_entry(
                key=word, count=self.lm.counts[word])
        return

    # GPT-2 language model
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)
    self.lm_sent_logscore = self.gpt2_sent_logscore
    self.beam_init = self.beam_GPT_init
    self.skipstart = 1
    self.skipend = -1
    self.update_sentence_history = self.updateGPT2history
    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    for subword in range(self.tokenizer.vocab_size):
        decoded = self.tokenizer.decode(subword)
        self.sym_spell.create_dictionary_entry(key=decoded, count=1)
def main():
    """Spell-correct the text of ``note.html`` and print the suggestions.

    The HTML is stripped with ``stripHTML``, the plain text is printed,
    and it is then passed to ``lookup_compound``.  Each suggestion is
    printed as "term, distance, count", followed by the elapsed whole
    seconds of the lookup.  Prints a message and returns early when the
    dictionary cannot be loaded.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 3
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index,
                                     count_index):
        print("Dictionary file not found")
        return

    # max edit distance per lookup (per single word, not per whole input)
    max_edit_distance_lookup = 3

    # Context manager so the file handle is always released.
    with open("note.html", "r") as note_file:
        noteString = note_file.read()
    noteString = stripHTML(noteString)
    print(noteString)

    tstart = datetime.now()
    suggestions = sym_spell.lookup_compound(noteString,
                                            max_edit_distance_lookup)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
    tend = datetime.now()

    elapsed = tend - tstart
    print(elapsed.seconds)
def spelling_preprocessor():
    """Build and return a word-by-word spell-correcting function.

    The dictionary is loaded once from
    ``symspellpy/symspellpy/frequency_dictionary_en_82_765.txt`` under the
    user's home directory.

    Returns:
        ``checker(s)``, wrapped with ``string_check``.  It splits ``s`` on
        whitespace, replaces each word with its first CLOSEST suggestion
        (or keeps the word when there is none) and joins the result with
        single spaces.

    Raises:
        ImportError: when the dictionary cannot be loaded.
    """
    import os
    from symspellpy.symspellpy import SymSpell, Verbosity

    max_edit_distance_dictionary = 2
    prefix_length = 7
    sc = SymSpell(max_edit_distance_dictionary, prefix_length)

    # HOME may be unset (e.g. on Windows); os.path.join(None, ...) would
    # raise TypeError, so fall back to the platform's home directory.
    home_dir = os.getenv('HOME') or os.path.expanduser('~')
    dictionary_path = os.path.join(
        home_dir,
        'symspellpy/symspellpy/frequency_dictionary_en_82_765.txt')
    term_index = 0
    count_index = 1
    if not sc.load_dictionary(dictionary_path, term_index, count_index):
        raise ImportError('Unable to load spelling dictionary')

    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST

    @string_check
    def checker(s):
        corrected_words = []
        for word in s.split():
            correction = sc.lookup(word, suggestion_verbosity,
                                   max_edit_distance_lookup)
            corrected_words.append(correction[0].term if correction else word)
        return ' '.join(corrected_words)

    return checker
def __init__(self, max_dictionary_edit_distance=2, prefix_length=7,
             dictionary_path=None):
    """Create the SymSpell instance and load its frequency dictionary.

    Args:
        max_dictionary_edit_distance: maximum edit distance precomputed
            for the dictionary.
        prefix_length: length of word prefixes used for spell checking.
        dictionary_path: path of the frequency dictionary.  When ``None``,
            ``frequency_dictionary_en_82_765.txt`` is looked for next to
            this module and, failing that, in the current working
            directory.

    Prints 'Dictionary file not found' when the dictionary cannot be
    loaded; no exception is raised.
    """
    self.max_dictionary_edit_distance = max_dictionary_edit_distance
    self.prefix_length = prefix_length

    self.sym_spell = SymSpell(
        max_dictionary_edit_distance=self.max_dictionary_edit_distance,
        prefix_length=self.prefix_length)

    if dictionary_path is None:
        default_name = "frequency_dictionary_en_82_765.txt"
        # The code used to call os.path.dirname('__file__') on a string
        # literal, which is always '' and therefore meant "current working
        # directory".  Prefer the module's own directory and keep the old
        # location as a fallback.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        dictionary_path = os.path.join(module_dir, default_name)
        if not os.path.isfile(dictionary_path):
            dictionary_path = default_name

    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not self.sym_spell.load_dictionary(dictionary_path, term_index,
                                          count_index):
        print('Dictionary file not found')
def initializeSymspell():
    """Create a SymSpell instance with unigram and bigram dictionaries.

    Both dictionaries are the files bundled with the ``symspellpy``
    package.  Two extra unigram entries ('ap' and "ain't") are added with
    a count of 500000000.  Progress messages and the first five entries
    of each dictionary are printed.

    Returns:
        ``(symspell, vocab)`` where ``vocab`` is the set of all unigram
        terms held by ``symspell``.
    """
    print("inside initializeSymspell()")
    symspell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    print("symspell created")

    resourceNames = [
        "symspellpy",
        "frequency_dictionary_en_82_765.txt",
        "frequency_bigramdictionary_en_243_342.txt",
    ]
    dictionaryPath = pkg_resources.resource_filename(resourceNames[0],
                                                     resourceNames[1])
    bigramPath = pkg_resources.resource_filename(resourceNames[0],
                                                 resourceNames[2])
    print("dictionaryPath created")

    symspell.load_dictionary(dictionaryPath, 0, 1)
    symspell.create_dictionary_entry(key='ap', count=500000000)
    symspell.create_dictionary_entry(key="ain't", count=500000000)
    print(list(islice(symspell.words.items(), 5)))
    print("symspell.load_ditionary() done")

    # Bigram lines are "word1 word2 count": the two words sit in columns
    # 0 and 1 and the count in column 2.  With count_index=1 the second
    # word was parsed as the count, so no bigram was ever loaded.
    symspell.load_bigram_dictionary(bigramPath, term_index=0, count_index=2)
    print(list(islice(symspell.bigrams.items(), 5)))
    print("symspell.load_bigram_ditionary() done")

    # Create vocab
    vocab = set(symspell.words)
    return symspell, vocab
def symspell(max_edit_distance_dictionary: int = 2,
             prefix_length: int = 7,
             term_index: int = 0,
             count_index: int = 1,
             top_k: int = 10,
             **kwargs):
    """
    Train a symspell Spell Corrector.

    Parameters
    ----------
    max_edit_distance_dictionary: int, passed to the SymSpell constructor.
    prefix_length: int, passed to the SymSpell constructor.
    term_index: int, column of the term in the dictionary file.
    count_index: int, column of the term count in the dictionary file.
    top_k: int, forwarded to ``SYMSPELL`` as ``k``.
    **kwargs: forwarded to ``check_file``.

    Returns
    -------
    result: malaya.spell.SYMSPELL class

    Raises
    ------
    Exception: when ``symspellpy`` cannot be imported.
    """
    check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'], **kwargs)
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    except ImportError as error:
        # Only a failed import means "not installed"; a bare except would
        # also swallow KeyboardInterrupt and unrelated errors.
        raise Exception(
            'symspellpy not installed. Please install it and try again.'
        ) from error

    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = PATH_NGRAM['symspell']['model']
    sym_spell.load_dictionary(dictionary_path, term_index, count_index)

    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return SYMSPELL(sym_spell, Verbosity.ALL, corpus, k=top_k)
def setup(initial_capacity=83000, prefix_length=7,
          max_edit_distance_dictionary=2,
          dict_path='/home/fa6/data/symspellpy/frequency_dictionary_en_82_765.txt'):
    """Create a SymSpell instance and load the frequency dictionary.

    Also stores ``max_edit_distance_dictionary`` in the module-level
    ``maximum_edit_distance``.

    Args:
        initial_capacity: passed to the SymSpell constructor.
        prefix_length: passed to the SymSpell constructor.
        max_edit_distance_dictionary: passed to the SymSpell constructor.
        dict_path: location of the frequency dictionary.  The default is
            the path that used to be hard-coded, so existing callers are
            unaffected.

    Returns:
        The loaded SymSpell instance, or ``None`` (after printing a
        message) when the dictionary cannot be loaded.

    Example of using the result for a single-word lookup
    (max_edit_distance_lookup <= max_edit_distance_dictionary)::

        suggestions = sym_spell.lookup("memebers", Verbosity.CLOSEST, 2)
        for suggestion in suggestions:
            print("{}, {}, {}".format(suggestion.term, suggestion.count,
                                      suggestion.distance))
    """
    global maximum_edit_distance
    maximum_edit_distance = max_edit_distance_dictionary

    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length, count_threshold=30)

    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dict_path, term_index, count_index):
        print("Dictionary file not found")
        return

    return sym_spell
def symspell(max_edit_distance_dictionary: int = 2,
             prefix_length: int = 7,
             term_index: int = 0,
             count_index: int = 1,
             top_k: int = 10,
             **kwargs):
    """
    Load a symspell Spell Corrector for Malay.

    Parameters
    ----------
    max_edit_distance_dictionary: int, passed to the SymSpell constructor.
    prefix_length: int, passed to the SymSpell constructor.
    term_index: int, column of the term in the dictionary file.
    count_index: int, column of the term count in the dictionary file.
    top_k: int, forwarded to ``Symspell`` as ``k``.
    **kwargs: forwarded to ``check_file``.

    Returns
    -------
    result: malaya.spell.Symspell class

    Raises
    ------
    ModuleNotFoundError: when ``symspellpy`` cannot be imported.
    """
    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    except ImportError as error:
        # ImportError is enough here; BaseException would also turn
        # KeyboardInterrupt/SystemExit into a misleading "not installed".
        raise ModuleNotFoundError(
            'symspellpy not installed. Please install it and try again.'
        ) from error

    path = check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'],
                      **kwargs)
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    sym_spell.load_dictionary(path['model'], term_index, count_index)

    path = check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)
    with open(path['model']) as fopen:
        corpus = json.load(fopen)
    return Symspell(sym_spell, Verbosity.ALL, corpus, k=top_k)
def create_context_speller():
    """Creates a context speller, which uses the context frequency lookup table.

    Returns a function ``check_spell(word)`` that yields ``False`` when
    any CLOSEST suggestion for ``word`` lies at edit distance 1, and
    ``True`` otherwise (including when there is no suggestion at all).

    Raises ``Exception`` when the lookup table cannot be loaded.
    """
    # Initialize Context Symspell Checker
    context_sym_spell = SymSpell(83000, 2, 7)

    lookup_path = os.path.join(os.path.dirname(__file__),
                               "./data/dict/context_dist_small.txt")
    loaded = context_sym_spell.load_dictionary(lookup_path, 0, 1)
    if not loaded:
        raise Exception("Dictionary file not found")

    def check_spell(word):
        candidates = context_sym_spell.lookup(word, Verbosity.CLOSEST, 2)
        # An empty candidate list means the word is not in the context
        # table; any() over it is False, so the result is True.
        return not any(c.distance == 1 for c in candidates)

    return check_spell
def spelling_correction(data, column):
    """List the SymSpell replacement for every word of a text column.

    Args:
        data: DataFrame whose ``column`` holds the text to check.
        column: name of that text column.

    Returns:
        A DataFrame with columns 'Original Word' and 'Replacement', one
        row per whitespace-separated word that has at least one CLOSEST
        suggestion.  Every row carries index label 0, which is what the
        earlier append-based implementation produced.  When no word has a
        suggestion, an empty DataFrame without columns is returned.

    A message is printed when the dictionary cannot be loaded; processing
    then continues with an empty dictionary.
    """
    from symspellpy.symspellpy import SymSpell, Verbosity

    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    dictionary_path = "frequency_dictionary_en_82_765.txt"
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index,
                                     count_index):
        print("Dictionary file not found")

    # max_edit_distance_lookup <= max_edit_distance_dictionary
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL

    # Collect plain lists and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    originals = []
    replacements = []
    for _, row in data.iterrows():
        for input_term in row[column].split():
            suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                           max_edit_distance_lookup)
            if suggestions:
                originals.append(input_term)
                replacements.append(suggestions[0].term)

    if not originals:
        return pd.DataFrame()
    return pd.DataFrame(
        {'Original Word': originals, 'Replacement': replacements},
        index=[0] * len(originals))
def test_words_from_list_with_shared_prefix_should_retain_counts(self):
    """Words given as a list ("pipe" x5, "pips" x10) keep separate counts."""
    print(' - %s' % inspect.stack()[0][3])

    sym_spell = SymSpell(16, 1, 3, words=["pipe"] * 5 + ["pips"] * 10)

    # query -> expected (term, count) pairs, in result order
    expectations = [
        ("pipe", [("pipe", 5), ("pips", 10)]),
        ("pips", [("pips", 10), ("pipe", 5)]),
        ("pip", [("pips", 10), ("pipe", 5)]),
    ]
    for query, expected in expectations:
        result = sym_spell.lookup(query, Verbosity.ALL, 1)
        self.assertEqual(2, len(result))
        for position, (term, count) in enumerate(expected):
            self.assertEqual(term, result[position].term)
            self.assertEqual(count, result[position].count)
def main():
    """Look up the misspelling "bangeeet" and print each CLOSEST suggestion.

    Every suggestion is printed as "term, distance, count".  Prints a
    message and returns early when the dictionary cannot be loaded.
    """
    # Dictionary precalculation: edit distance 2, prefix length 7.
    sym_spell = SymSpell(2, 7)

    here = os.path.dirname(__file__)
    dictionary_path = os.path.join(here, "corpus/dictionary/dictionary.txt")
    # Alternative dictionary:
    # dictionary_path = os.path.join(os.path.dirname(__file__), "corpus/symspellpy/frequency_dictionary_en_82_765.txt")

    # Columns in the dictionary file: 0 = term, 1 = frequency.
    loaded = sym_spell.load_dictionary(dictionary_path, 0, 1)
    if not loaded:
        print("Dictionary file not found")
        return

    misspelling = "bangeeet"
    # Lookup edit distance must not exceed the dictionary edit distance.
    lookup_distance = 2
    verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL

    for hit in sym_spell.lookup(misspelling, verbosity, lookup_distance):
        print("{}, {}, {}".format(hit.term, hit.distance, hit.count))
def correct_spelling(sentence):
    """Return the first compound-correction suggestion for ``sentence``.

    Every occurrence of "& ;" is replaced by "and" before the lookup.  A
    new SymSpell instance is built and loaded on every call.

    Returns the term of the first suggestion, "" when there is no
    suggestion, or ``None`` (after printing a message) when the
    dictionary cannot be loaded.
    """
    # Dictionary precalculation: edit distance 2, prefix length 5.
    sym_spell = SymSpell(2, 5)

    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    # Columns in the dictionary file: 0 = term, 1 = frequency.
    loaded = sym_spell.load_dictionary(dictionary_path, 0, 1)
    if not loaded:
        print("Dictionary file not found")
        return

    # str.replace leaves the string unchanged when the pattern is absent.
    sentence = sentence.replace("& ;", "and")

    candidates = sym_spell.lookup_compound(sentence, 2)
    first_term = ""
    if candidates:
        first_term = candidates[0].term
    return first_term
class SpellCorrector():
    """Single-word spell corrector backed by a SymSpell frequency dictionary."""

    def __init__(self, max_edit_distance_dictionary=2, prefix_length=7):
        """Create the SymSpell instance and load its dictionary.

        The dictionary ``frequency_dictionary_en_82_765.txt`` is read from
        the parent of the current working directory.

        Raises:
            FileNotFoundError: when the dictionary cannot be loaded.
        """
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

        # os.path.dirname('../') is '..'
        dictionary_path = os.path.join(os.path.dirname('../'),
                                       "frequency_dictionary_en_82_765.txt")
        term_index = 0  # column of the term in the dictionary text file
        count_index = 1  # column of the term frequency in the dictionary text file
        if not self.sym_spell.load_dictionary(dictionary_path, term_index,
                                              count_index):
            # Raising a plain string is itself a TypeError in Python 3;
            # raise a real exception carrying the intended message.
            raise FileNotFoundError(
                "Dictionary file not found: {}".format(dictionary_path))

    def reduce_lengthening(self, text):
        """Shorten any run of three or more identical characters to two."""
        pattern = re.compile(r"(.)\1{2,}")
        return pattern.sub(r"\1\1", text)

    def strip_punc(self, word):
        """Remove one trailing '-', '_', '.' or '!' from ``word``."""
        return re.sub(r"[\-\_\.\!]$", "", word)

    def __call__(self, word):
        """Return the corrected form of ``word``.

        Character runs are shortened first.  Words of length two or less,
        and words containing an apostrophe, are returned without a
        lookup; otherwise the first CLOSEST suggestion within edit
        distance 2 is returned, or the shortened word when there is none.
        """
        word = self.reduce_lengthening(word)
        if len(word) > 2 and "'" not in word:
            suggestions = self.sym_spell.lookup(word, Verbosity.CLOSEST, 2)
            if suggestions:
                return suggestions[0].term
        return word
def __init__(self, train=False, save=False, corpus_path=CORPUS_PATH,
             threshold=2):
    """Load (or train) the spelling resources and the SymSpell dictionary.

    Args:
        train: when true, rebuild the dictionary, word list, counter and
            language model from ``corpus_path``; otherwise load the
            pickled word list and counter and a saved language model.
        save: when equal to ``True``, call ``self.save()`` at the end.
        corpus_path: corpus used when ``train`` is true.
        threshold: words whose count is at or below this value are
            removed from ``self.words`` (best effort).

    Prints a message and returns before saving when the SymSpell
    dictionary cannot be loaded.
    """
    base_dir = os.path.dirname(__file__)

    def _load_pickle(relative_path):
        # Context manager so the file handle is closed after loading.
        with open(os.path.join(base_dir, relative_path), "rb") as handle:
            return pickle.load(handle)

    self.slang_dict = _load_pickle("pickled/_slang_words.p")
    self.slang_dict['dr'] = 'dari'
    self.slang_dict['k'] = 'ke'
    self.slang_dict['sc'] = 'sesar'

    if train:
        create_dictionary.main()
        self.words = self.__words(corpus_path)
        self.counter = self.__counter(self.words)
        self.model = model.LanguageModel(corpus_path=corpus_path)
    else:
        self.words = _load_pickle("pickled/_spell_words.p")
        self.counter = _load_pickle("pickled/_spell_counter.p")
        self.model = model.LanguageModel(load=True)

    # Best-effort pruning of rare words: as before, the first failure
    # ends the pruning silently.  Exception (not a bare except) keeps
    # KeyboardInterrupt and SystemExit from being swallowed.
    try:
        for key in self.counter:
            if self.counter[key] <= threshold:
                self.words.remove(key)
    except Exception:
        pass

    self.candidates_dict = {}

    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    self.factory = StemmerFactory()
    self.stemmer = self.factory.create_stemmer()

    dictionary_path = os.path.join(base_dir,
                                   "corpus/dictionary/dictionary.txt")
    # dictionary_path = os.path.join(os.path.dirname(__file__), "corpus/symspellpy/frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not self.sym_spell.load_dictionary(
            dictionary_path, term_index, count_index, encoding="utf-8"):
        print("Dictionary file not found")
        return

    if save == True:
        self.save()
def createSymSpell(dict='ru-100k.txt', encoding='utf-8'):
    """Return a SymSpell instance loaded from the dictionary file ``dict``.

    The instance uses edit distance 2 and prefix length 5; terms are read
    from column 0 and counts from column 1, using ``encoding``.
    """
    checker = SymSpell(max_dictionary_edit_distance=2, prefix_length=5)
    checker.load_dictionary(dict, term_index=0, count_index=1,
                            encoding=encoding)
    return checker
def getSymspellDict(direc):
    """Return a SymSpell instance loaded from the dictionary at ``direc``.

    A message is printed when loading fails; the (then empty) instance is
    returned regardless.
    """
    print("loading symspell object")
    speller = SymSpell(83000, 2, 7)
    loaded = speller.load_dictionary(direc, 0, 1)
    if not loaded:
        print("Dictionary file not found")
    return speller
def load_symspell(dict_path='symspell/frequency_dictionary_en_82_765.txt',
                  max_edit_distance_dictionary=2, prefix_length=7,
                  term_index=0, count_index=1):
    """Build a SymSpell instance and load ``dict_path`` into it.

    ``term_index`` and ``count_index`` are the dictionary-file columns of
    the term and of its count.  The result of the load is not checked.
    """
    speller = SymSpell(max_edit_distance_dictionary, prefix_length)
    speller.load_dictionary(dict_path, term_index, count_index)
    return speller
def __init__(self, progress, directory, countries_dict):
    """Store the collaborators and create an empty SymSpell instance.

    ``spell_path`` is set to ``spelling.pkl`` inside ``directory``, and
    ``spelling_update`` starts as an empty ``Counter``.
    """
    # Values handed in by the caller.
    self.progress = progress
    self.directory = directory
    self.countries_dict = countries_dict

    # Objects created here.
    self.logger = logging.getLogger(__name__)
    self.spelling_update = Counter()
    self.spell_path = os.path.join(self.directory, 'spelling.pkl')
    self.sym_spell = SymSpell()
def symspell_dict(max_edit_dist, prefix_len):
    """Build a SymSpell dictionary from the corpus file ``big.txt``.

    The corpus is read from ``DICT_DIR``.  A message is printed when
    ``create_dictionary`` reports failure; the instance is returned in
    either case.
    """
    # big.txt was downloaded from Peter Norvig's site
    corpus_file = DICT_DIR / "big.txt"
    speller = SymSpell(max_edit_dist, prefix_len)

    built = speller.create_dictionary(str(corpus_file))
    if not built:
        print("corpus file not found")
    return speller
def main():
    """Run compound correction on the word 'live' and print the suggestions.

    The dictionary ``dictionary_final.txt`` next to this module is
    loaded.  Each suggestion is printed as "term, distance, count".
    Prints a message and returns early when the dictionary cannot be
    loaded.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 9
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "dictionary_final.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index,
                                     count_index):
        print("Dictionary file not found")
        return

    # max edit distance per lookup
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2

    input_term = 'live'
    suggestions = sym_spell.lookup_compound(input_term,
                                            max_edit_distance_lookup)
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
def __init__(self, max_edit_distance_dictionary=2, prefix_length=7):
    """Create the SymSpell instance and load its dictionary.

    The dictionary ``frequency_dictionary_en_82_765.txt`` is read from
    the parent of the current working directory.

    Raises:
        FileNotFoundError: when the dictionary cannot be loaded.
    """
    self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    # os.path.dirname('../') is '..'
    dictionary_path = os.path.join(os.path.dirname('../'),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not self.sym_spell.load_dictionary(dictionary_path, term_index,
                                          count_index):
        # Raising a plain string is itself a TypeError in Python 3;
        # raise a real exception carrying the intended message.
        raise FileNotFoundError(
            "Dictionary file not found: {}".format(dictionary_path))
def symspell_test(tokenpos_list, max_edit_distance_lookup=3,
                  initial_capacity=83000, max_edit_distance_dictionary=3,
                  prefix_length=7, term_index=0, count_index=1):
    """
    This is a function that tests the SymSpell library for spell-checking
    performance.

    ``tokenpos_list`` is an iterable of ``(word, pos)`` pairs.  Words
    tagged 'PROPN' and words shorter than three characters are passed
    through unchanged and also recorded in the second returned list.
    Every other word is replaced by its first CLOSEST suggestion, or kept
    unchanged when SymSpell returns no suggestion.

    Key-word arguments are:
    ** max_edit_distance_lookup : (Recommended maximum = 3)
    ** term_index : term column in dictionary (0)
    ** count_index : frequency column in dictionary (1)

    Returns ``(suggestion_list, proper_noun)`` on success, the string
    'Error loading dictionary file' when the dictionary cannot be loaded,
    or ``405`` when a ``TypeError`` is raised while processing.
    """
    print('\n{} \nBegin \'Symspellpy\' testing \n'.format('#' * 20))
    try:
        sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                             prefix_length)
        suggestion_verbosity = Verbosity.CLOSEST
        dictionary_path = os.path.join(os.path.dirname(__file__),
                                       "frequency_dictionary_en_82_765.txt")
        if not sym_spell.load_dictionary(dictionary_path, term_index,
                                         count_index):
            print("Dictionary file not found")
            return 'Error loading dictionary file'

        suggestion_list = []
        proper_noun = []
        for (word, pos) in tokenpos_list:
            if pos == 'PROPN' or len(word) < 3:
                suggestion_list.append(word)
                proper_noun.append(word)
                continue

            suggestions = list(sym_spell.lookup(word, suggestion_verbosity,
                                                max_edit_distance_lookup))
            if not suggestions:
                # Nothing within the edit distance: keep the word as is.
                # Indexing [0] here used to raise an uncaught IndexError.
                suggestion_list.append(word)
                continue

            suggestion = suggestions[0]
            # display suggestion term, term frequency, and edit distance
            print("input_term = {}, suggestion_term = {}, "
                  "suggestion_count = {}, suggestion_distance = {}".format(
                      word, suggestion.term, suggestion.count,
                      suggestion.distance))
            suggestion_list.append(suggestion.term)

        print("\n\nThe corrected sentence is : {}".format(
            ' '.join(suggestion_list)))
        print(suggestion_list)
        print(proper_noun)
        return suggestion_list, proper_noun
    except TypeError as error:
        print(f'Invalid type : {error}')
        return 405
def test_words_with_shared_prefix_should_retain_counts(self):
    """Entries "pipe" (5) and "pips" (10) keep their own counts in lookups."""
    print(' - %s' % inspect.stack()[0][3])

    sym_spell = SymSpell(16, 1, 3)
    for term, count in (("pipe", 5), ("pips", 10)):
        sym_spell.create_dictionary_entry(term, count)

    # query -> expected (term, count) pairs, in result order
    expectations = [
        ("pipe", [("pipe", 5), ("pips", 10)]),
        ("pips", [("pips", 10), ("pipe", 5)]),
        ("pip", [("pips", 10), ("pipe", 5)]),
    ]
    for query, expected in expectations:
        result = sym_spell.lookup(query, Verbosity.ALL, 1)
        self.assertEqual(2, len(result))
        for position, (term, count) in enumerate(expected):
            self.assertEqual(term, result[position].term)
            self.assertEqual(count, result[position].count)
def main():
    """Segment a sentence written without spaces and print the result.

    Prints "corrected_string, distance_sum, log_prob_sum" for the input
    "thequickbrownfoxjumpsoverthelazydog".  Prints a message and returns
    early when the dictionary cannot be loaded.
    """
    initial_capacity = 83000
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 0
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)

    dictionary_path = os.path.join(
        os.path.dirname(__file__),
        "./data/frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index,
                                     count_index):
        print("Dictionary file not found")
        return

    # The function used to build a second identical SymSpell, reload the
    # same dictionary and segment the same string twice; once is enough.

    # a sentence without any spaces
    input_term = "thequickbrownfoxjumpsoverthelazydog"
    result = sym_spell.word_segmentation(input_term)
    print("{}, {}, {}".format(result.corrected_string, result.distance_sum,
                              result.log_prob_sum))
def main(argv):
    """Append a list of exercise links for a category to a markdown file.

    Args:
        argv: ``[program, category, markdown_path]``.  With any other
            length a usage message is printed and nothing else happens.

    Each space-separated word of the category (lower-cased) is looked up
    with SymSpell against ``category_count.txt``.  For every CLOSEST
    suggestion, a bullet with the capitalised term and an a2oj.com link
    is appended to the markdown file.  The link id is the third
    space-separated field of the matching line in ``category_count.txt``.

    Prints a message and returns early when the dictionary cannot be
    loaded.
    """
    if len(argv) != 3:
        print('usage:\n python .py "<categoria>" <markdown gerado>')
        return
    category = argv[1]
    markdown = argv[2]

    initial_capacity = 83000
    max_edit_distance_dictionary = 3
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)

    dictionary_path = "category_count.txt"
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index,
                                     count_index):
        print("Dictionary file not found")
        return

    # term -> id.  NOTE: the id is stored exactly as read; when it is the
    # last field on its line it still carries the trailing newline, and
    # that newline is what separates the bullets written below.
    ids = defaultdict(lambda: 0)
    with open(dictionary_path, 'r') as categories:
        for line in categories:
            fields = line.split(' ')
            ids[fields[0]] = fields[2]

    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL

    # Context managers so both files are closed even if a lookup raises.
    with open(markdown, 'a') as out:
        out.write('\n## Lista de Exercicios - %s\n' % category.capitalize())

        terms = category.lower().split(' ')
        # Longer words on average get a more permissive edit distance.
        total_avg = sum(map(len, terms)) / len(terms)
        max_edit_distance_lookup = 3 if total_avg > 4 else 2

        for input_term in terms:
            suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                           max_edit_distance_lookup)
            for suggestion in suggestions:
                out.write("* {}, https://a2oj.com/{}".format(
                    suggestion.term.capitalize(), ids[suggestion.term]))