class Autocorrect:
    """Convenience wrapper around a single SymSpell dictionary.

    Args:
        words: Optional value forwarded to :meth:`add_words` at construction.
        max_edit_distance: Edit distance used both to build the SymSpell
            index and for every lookup issued by this object.
    """

    def __init__(self, words=None, max_edit_distance=2):
        # The index has to be built with the same distance that lookups use:
        # SymSpell rejects a lookup distance larger than the dictionary's, so
        # a default-constructed SymSpell() breaks any max_edit_distance > 2.
        self._symspell = SymSpell(
            max_dictionary_edit_distance=max_edit_distance)
        self._max_edit_distance = max_edit_distance
        if words is not None:
            self.add_words(words)

    def add_word(self, word):
        """Register ``word`` with a count of 1; ``None`` is ignored."""
        if word is not None:
            self._symspell.create_dictionary_entry(word, 1)

    def add_words(self, words):
        """Hand ``words`` to ``SymSpell.create_dictionary``; ``None`` is ignored.

        NOTE(review): ``create_dictionary`` expects a corpus (path or text
        stream), not a list of words -- confirm what callers pass here.
        """
        if words is not None:
            self._symspell.create_dictionary(words)

    def delete_word(self, word):
        """Remove ``word`` from the dictionary; ``None`` is ignored."""
        if word is not None:
            self._symspell.delete_dictionary_entry(word)

    def correct(self, bad_word):
        """Return the term of the top suggestion for ``bad_word``.

        ``include_unknown=True`` is passed so the lookup still yields an
        entry when nothing in the dictionary is close enough.
        """
        suggestions = self._symspell.lookup(
            bad_word,
            Verbosity.TOP,
            max_edit_distance=self._max_edit_distance,
            include_unknown=True,
        )
        return suggestions[0].term

    def predictions(self, bad_word):
        """Return the ``Verbosity.CLOSEST`` suggestion list for ``bad_word``."""
        return self._symspell.lookup(
            bad_word,
            Verbosity.CLOSEST,
            max_edit_distance=self._max_edit_distance,
            include_unknown=True,
        )
def load(cls, language: str) -> "SpellCorrectGenerator":
    """Build an instance backed by the bundled dictionary and a spaCy model.

    NOTE(review): the first parameter is ``cls``, so this is presumably meant
    to be used as a classmethod -- confirm the decorator at the definition
    site.

    Args:
        language: Language code; only ``"en"`` is handled.

    Returns:
        ``cls(sym_spell, spacy_model)``.

    Raises:
        RuntimeError: if ``language`` is not supported, or if the bundled
            frequency dictionary cannot be loaded.
    """
    # Validate first so an unsupported language fails before any work is done.
    if language != "en":
        raise RuntimeError(
            f"The language {language} is currently not supported.")

    # maximum edit distance per dictionary pre-calculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    dict_path = (pathlib.Path(__file__).parent / "resources" /
                 "frequency_dictionary_en_82_765.txt")
    # The resource is a "<term> <count>" frequency list, so it must go through
    # load_dictionary; create_dictionary parses raw text and would discard the
    # stored frequencies.
    if not sym_spell.load_dictionary(str(dict_path), term_index=0,
                                     count_index=1):
        raise RuntimeError(
            f"The frequency dictionary {dict_path} could not be loaded.")

    spacy_model = spacy.load("en", disable=["parser", "ner"])
    return cls(sym_spell, spacy_model)
def symspell_dict(max_edit_dist, prefix_len):
    """Return a SymSpell object populated from ``DICT_DIR / "big.txt"``.

    A message is printed when ``create_dictionary`` reports failure; the
    SymSpell object is returned in either case.
    """
    # big.txt was downloaded from Peter Norvig's site
    corpus_path = str(DICT_DIR / "big.txt")
    speller = SymSpell(max_edit_dist, prefix_len)
    loaded = speller.create_dictionary(corpus_path)
    if not loaded:
        print("corpus file not found")
    return speller
def _symspell(self, sentences):
    """
    Correct spelling with SymSpell (Symmetric Delete spelling algorithm).

    On every call the dictionary is rebuilt from ``self.corpus_path``, dumped
    to ``self.dictionary_path`` as "<term> <count>" lines and loaded back.
    Each whitespace-separated token is then replaced by its first suggestion;
    tokens found in ``string.punctuation`` and tokens without a suggestion
    are kept unchanged.

    References:
        Description: https://medium.com/@wolfgarbe/1000x-faster-spelling-correction-algorithm-2012-8701fcd87a5f
        URL: https://github.com/wolfgarbe/symspell
        Python module: symspellpy (https://github.com/mammothb/symspellpy)

    Args:
        sentences: one sentence or a list of sentences.

    Returns:
        A list with one corrected string per input sentence.
    """
    speller = SymSpell(max_dictionary_edit_distance=self.N)
    speller.create_dictionary(self.corpus_path)

    with open(self.dictionary_path, "w") as handle:
        for term, freq in speller.words.items():
            handle.write(f"{term} {freq}\n")

    speller.load_dictionary(self.dictionary_path, term_index=0, count_index=1)

    if not isinstance(sentences, list):
        sentences = [sentences]

    def fix_token(token):
        # Punctuation is passed through without a lookup.
        if token in string.punctuation:
            return token
        candidates = speller.lookup(
            token.lower(),
            verbosity=0,
            max_edit_distance=self.N,
            transfer_casing=True,
        )
        return candidates[0].term if candidates else token

    return [
        " ".join(fix_token(token) for token in sentence.split())
        for sentence in sentences
    ]
def create_symspell(max_edit_distance, prefix_length, freq_file_path):
    """Build a SymSpell object from the corpus at ``freq_file_path``.

    Returns:
        The populated SymSpell object, or ``None`` (after printing a message)
        when ``create_dictionary`` reports failure.
    """
    speller = SymSpell(max_edit_distance, prefix_length)
    if speller.create_dictionary(freq_file_path):
        return speller
    print("Corpus file not found")
    return None
def main(corpus_path='D:/ML/QNA_project/corpus.txt'):
    """Build a SymSpell dictionary from a corpus and print "<term> <count>".

    Args:
        corpus_path: Text corpus to read. Defaults to the path that used to
            be hard-coded, so ``main()`` behaves as before.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    # create dictionary using the corpus
    if not sym_spell.create_dictionary(corpus_path):
        print("Corpus file not found")
        return

    for key, count in sym_spell.words.items():
        print(f"{key} {count}")
def main(
    corpus_path="C:/Users/tyc64/Desktop/PythonStuff/REHS/Spellchecker_and_NER/corpus.txt",
):
    """Build a SymSpell dictionary from a corpus and print "<term> <count>".

    Args:
        corpus_path: Text corpus to read. Defaults to the path that used to
            be hard-coded, so ``main()`` behaves as before.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    # create dictionary using the corpus
    if not sym_spell.create_dictionary(corpus_path):
        print("Corpus file not found")
        return

    for key, count in sym_spell.words.items():
        print(f"{key} {count}")
def main():
    """Build a dictionary from training data, save it, and segment phrases.

    Steps:
      1. Count words in ``training_data.txt`` with SymSpell.
      2. Write the "<term> <count>" lines to ``dict.txt``.
      3. Load ``dict.txt`` into a fresh SymSpell object.
      4. Read comma-separated phrases from ``missing_spaces.txt`` and print
         the word segmentation of each non-blank phrase.
    """
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    if not sym_spell.create_dictionary('training_data.txt',
                                       encoding="ISO-8859-1"):
        print("Corpus file not found")
        return

    dictlist = []
    for key, count in sym_spell.words.items():
        entry = "{} {}\n".format(key, count)
        print(entry)
        dictlist.append(entry)

    # save Dictionary
    # One entry per line, overwriting any previous dump. The earlier
    # write(str(dictlist)) in append mode produced a Python list repr that
    # load_dictionary cannot parse.
    with open("dict.txt", "w", encoding="ISO-8859-1") as text_file:
        text_file.writelines(dictlist)
    print('Saved Dic')

    # load dictionary
    dictionary_path = os.path.join(os.path.dirname('./'), "dict.txt")
    print(dictionary_path)
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    # Load into a fresh object so the saved counts are not added on top of
    # the counts already held in memory.
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index,
                                     encoding="ISO-8859-1"):
        print("Dictionary file not found")
        return

    # sentences without any spaces, separated by commas
    with open('missing_spaces.txt', 'r', encoding="utf8") as myfile:
        data = myfile.read()

    for strval in data.split(','):
        # Skip blank entries (e.g. after a trailing comma) rather than
        # unconditionally dropping the last element.
        if not strval.strip():
            continue
        try:
            # NOTE(review): the third positional argument of
            # word_segmentation is max_segmentation_word_length, so
            # prefix_length caps segmented words at 7 characters -- confirm
            # this is intended.
            result = sym_spell.word_segmentation(
                strval, max_edit_distance_dictionary, prefix_length)
        except Exception as exc:  # keep going; report the real cause
            print('segmentation failed for {!r}: {}'.format(strval, exc))
            continue
        print("{}".format(result.corrected_string))
def main():
    """Create the SymSpell dictionary file and a rare-word count file.

    Reads ``CORPUS_FILE`` twice: once through SymSpell to write
    "<term> <count>" lines to ``DICT_FILE``, and once with a plain
    whitespace split to write the words occurring 2, 3 or 4 times to
    ``WORD_COUNT_FILE`` as CSV, sorted by count.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    # create dictionary using corpus.txt
    if not sym_spell.create_dictionary(CORPUS_FILE):
        print("Corpus file not found")
        return

    # `with` guarantees the handle is closed even if a write fails.
    with open(DICT_FILE, "w+") as f:
        for key, count in sym_spell.words.items():
            f.write("{} {} \r\n".format(key, count))
    print('dictionary file created')

    # create another dictionary file using corpus.txt
    word_count = Counter()
    with open(CORPUS_FILE, 'r') as file:
        for line in file:
            word_count.update(line.split())

    df = pd.DataFrame({
        'word': list(word_count.keys()),
        'count': list(word_count.values())
    })
    # Counts are integers; isin() does not match the strings '2', '3', '4'
    # against them, which left the output file empty.
    rare = df.loc[df['count'].isin([2, 3, 4])].sort_values(by='count')
    rare.to_csv(WORD_COUNT_FILE, index=False)
    print('word count file created')
class SpellCheck:
    """Spelling dictionary built from inserted names, backed by SymSpell.

    Words are accumulated with :meth:`insert`, written to
    ``<directory>/spelling.pkl`` by :meth:`write`, and loaded back by
    :meth:`read`. :meth:`lookup`, :meth:`lookup_compound` and
    :meth:`fix_spelling` query the loaded dictionary.
    """

    def __init__(self, progress, directory, countries_dict):
        self.progress = progress
        self.logger = logging.getLogger(__name__)
        self.spelling_update = Counter()  # word -> number of insertions
        self.directory = directory
        self.spell_path = os.path.join(self.directory, 'spelling.pkl')
        self.countries_dict = countries_dict
        self.sym_spell = SymSpell()

    def insert(self, name, iso):
        """Count every word of ``name`` that is longer than two characters.

        Names containing 'gothland cemetery' and names listed in
        ``noise_words`` are skipped. ``iso`` is accepted but not used.
        """
        if 'gothland cemetery' in name or name in noise_words:
            return
        for word in name.split(' '):
            if len(word) > 2:
                self.spelling_update[word] += 1

    def write(self):
        """Build the SymSpell dictionary from the counted words and pickle it."""
        # Create blank spelling dictionary
        path = os.path.join(self.directory, 'spelling.tmp')
        with open(path, 'w') as fl:
            fl.write('the,1\n')

        success = self.sym_spell.create_dictionary(corpus=path)
        if not success:
            self.logger.error("error creating spelling dictionary")

        self.logger.info('Building Spelling Dictionary')

        # Add all inserted words to the spelling dictionary
        for key, count in self.spelling_update.items():
            self.sym_spell.create_dictionary_entry(key=key, count=count)

        self.logger.info('Writing Spelling Dictionary')
        self.sym_spell.save_pickle(self.spell_path)

    def read(self):
        """Load the pickled dictionary from ``self.spell_path`` if present.

        Failures are logged rather than raised. After a successful load the
        entry 'gothland' is removed and the dictionary size is logged.
        """
        success = False
        if os.path.exists(self.spell_path):
            self.logger.info(
                f'Loading Spelling Dictionary from {self.spell_path}')
            # SECURITY: this unpickles the file; only point spell_path at
            # files this application wrote itself.
            success = self.sym_spell.load_pickle(self.spell_path)
        else:
            self.logger.error(
                f"spelling dictionary not found: {self.spell_path}")

        if not success:
            self.logger.error(
                f"error loading spelling dictionary from {self.spell_path}")
        else:
            self.sym_spell.delete_dictionary_entry(key='gothland')

        size = len(self.sym_spell.words)
        self.logger.info(f"Spelling Dictionary contains {size} words")

    def lookup(self, input_term):
        """Return space-terminated suggestions that share the first letter.

        Terms containing '*' and terms shorter than two characters are
        returned unchanged. Otherwise at most the first four suggestions are
        considered, and those starting with the same character as
        ``input_term`` are concatenated, each followed by a single space.
        """
        if '*' in input_term or len(input_term) <= 1:
            return input_term

        suggestions = self.sym_spell.lookup(input_term,
                                            Verbosity.CLOSEST,
                                            max_edit_distance=2,
                                            include_unknown=True)
        first_letter = input_term[0]
        # Only accept results where first letter matches. Uses the public
        # `term` attribute instead of the private `_term`.
        return ''.join(item.term + ' '
                       for item in suggestions[:4]
                       if item.term[0] == first_letter)

    def lookup_compound(self, phrase):
        """Return the term of the first compound suggestion for ``phrase``."""
        suggestions = self.sym_spell.lookup_compound(phrase=phrase,
                                                     max_edit_distance=2,
                                                     ignore_non_words=False)
        for item in suggestions:
            self.logger.debug(f'{item.term}')
        return suggestions[0].term

    def fix_spelling(self, text):
        """Spell-check ``text`` unless it has digits or contains 'st '.

        Returns:
            The (possibly corrected) text with surrounding spaces stripped.
        """
        has_digits = bool(re.search(r'\d', text))
        # Text with digits is returned as is; 'st ' is skipped because the
        # spell check does not handle St properly.
        if has_digits or 'st ' in text or len(text) == 0:
            new_text = text
        else:
            new_text = self.lookup(text)

        self.logger.debug(f'Spell {text} -> {new_text}')
        return new_text.strip(' ')
class Fuzzy:
    '''
    This class defines the fuzzy joining tools and parameters for string
    approximation. The primary toolkit for operations is the Symspell module
    and its associated Python port https://github.com/mammothb/symspellpy

    The input corpus file must be formatted as a record per row; all words
    on a single line are assumed to be part of a single space-separated
    string.

    Args:
        input_corpus: path to a text corpus containing the data to which
            query strings will be searched. Default is None, in which case
            a corpus can be loaded later
        preprocesser: an instance of the fuzzypanda.preprocess.PreProcessor
            class containing the 'preprocess' method used to pre-process the
            input strings. If set to None, will instantiate the default
            pre-processor. This option can be used to create a custom
            pre-processor to pass to the get_fuzzy_columns function.
        max_edit_distance_dictionary: maximum edit distance to consider in
            SymSpell dictionary searches.
        prefix_length: length of the SymSpell dictionary prefix

    Attributes:
        corpus: path to the text corpus. If preprocessed, will point to the
            preprocessed file and unprocessed_corpus will point to the
            unprocessed file.
        unprocessed_corpus: once pre-processing has run, points to the file
            containing the unprocessed input
        sym_spell: the SymSpell object (None until created)
        index_dictionary: maps each pre-processed string back to its
            original string (None until created)
        max_edit_distance_dictionary: maximum edit distance to consider in
            SymSpell dictionary searches.
        prefix_length: length of the SymSpell dictionary prefix
    '''

    def __init__(self,
                 input_corpus: str = None,
                 preprocesser=None,
                 max_edit_distance_dictionary: int = 2,
                 prefix_length: int = 7):
        # Set flags and initial variables
        self._preprocess_flag = False
        self.unprocessed_corpus = None
        self.sym_spell = None
        self.index_dictionary = None

        # Set inputs to attributes
        self.corpus = input_corpus
        self.max_edit_distance_dictionary = max_edit_distance_dictionary
        self.prefix_length = prefix_length

        # Setup the pre-processor object
        if preprocesser is None:
            self.preprocesser = preprocess.PreProcessor()
        else:
            self.preprocesser = preprocesser

        # Check the corpus and bootstrap preprocessing and Symspell
        self.check_corpus()
        # Only pre-process when a corpus was supplied. The previous test,
        # `if preprocess:`, checked the always-truthy `preprocess` module, so
        # constructing without a corpus raised even though None is the
        # documented default.
        if self.corpus is not None:
            self.preprocess_corpus()
        if self._preprocess_flag:
            self.create_symspell_dict()
            self.create_index()

    def is_preprocessed(self):
        '''
        Returns true if the corpus has been processed, false otherwise
        '''
        return self._preprocess_flag

    def check_corpus(self):
        '''
        Verifies that the corpus file exists. An undefined corpus only logs
        a warning.

        Raises:
            FileNotFoundError: if self.corpus is set but does not exist
        '''
        logger.debug('Checking corpus file %s', self.corpus)
        if self.corpus is None:
            logger.warning('Corpus file not defined')
            return
        if not os.path.exists(self.corpus):
            logger.error('Corpus file %s not found', self.corpus)
            raise FileNotFoundError(f'Corpus file {self.corpus} not found')
        logger.debug('Corpus file found')

    def preprocess_corpus(self):
        '''
        Preprocesses the given corpus file in self.corpus. Writes the
        processed results to a 'preprocessed_[corpus name]' file in the same
        directory and changes self.corpus to point to it.

        Raises:
            FileNotFoundError: if self.corpus is undefined or missing
        '''
        # Status checking
        logger.debug('Preprocessing corpus file %s', self.corpus)
        if self._preprocess_flag:
            logger.warning('Corpus already preprocessed! Skipping')
            return
        if self.corpus is None:
            logger.error('Attempted to pre-process undefined corpus file')
            raise FileNotFoundError('self.corpus must be specified before'
                                    ' pre-processing')
        self.check_corpus()

        # Creating filenames
        corpus_directory = os.path.dirname(self.corpus)
        corpus_name = os.path.basename(self.corpus)
        processed_corpus = os.path.join(corpus_directory,
                                        'preprocessed_' + corpus_name)

        # Pre-processing the input corpus strings
        with open(self.corpus, 'r') as cf, open(processed_corpus, 'w') as pcf:
            for line in cf:
                pcf.write(self.preprocesser.preprocess(line) + '\n')

        self.unprocessed_corpus = self.corpus
        self.corpus = processed_corpus
        self._preprocess_flag = True
        logger.debug('Corpus processed to %s', self.corpus)

    def create_symspell_dict(self):
        '''
        Creates the SymSpell dictionary object for later lookup. Required to
        lookup strings
        '''
        logger.debug('Creating SymSpell dictionary')
        self.check_corpus()

        # create SymSpell object
        try:
            self.sym_spell = SymSpell(self.max_edit_distance_dictionary,
                                      self.prefix_length)
        except Exception:
            # in case an error occurs in SymSpell: log it, then re-raise
            logger.exception('Failure to create SymSpell object!')
            raise

        # Create the dictionary for SymSpell
        self.sym_spell.create_dictionary(self.corpus)

    def create_index(self):
        '''
        The SymSpell dictionary will lookup strings closest to the
        preprocessed version of the query string. To convert back to the
        original string, an index dictionary is created to map back to the
        original string match.

        This function will create the in-memory dictionary used to lookup
        the original string from the pre-processed string. The resulting
        index_dictionary will return strings such that

            index_dictionary[processed string] = unprocessed string

        When two different original strings pre-process to the same key, the
        first one read is kept and a warning is logged.

        Raises:
            FileNotFoundError: if the corpus has not been pre-processed or
                the unprocessed corpus file is missing
        '''
        logger.debug('Creating corpus index')

        # Status checking
        self.check_corpus()
        if self.index_dictionary is not None:
            logger.warning('index_dictionary already created. Overwritting.')
        if not self._preprocess_flag:
            logger.error(
                'Corpus %s not processed. Cannot create'
                ' index dictionary', self.corpus)
            raise FileNotFoundError(
                'Corpus not processed. '
                'Cannot create index dictionary', self.corpus)
        if not os.path.exists(self.unprocessed_corpus):
            # Both messages previously lacked the space around the file name.
            logger.error('Unprocessed Corpus file %s not found',
                         self.unprocessed_corpus)
            raise FileNotFoundError('Unprocessed Corpus file '
                                    f'{self.unprocessed_corpus} not found')

        # Create pre-process index as dictionary
        self.index_dictionary = {}
        with open(self.unprocessed_corpus, 'r') as ucf:
            for line in ucf:
                # Create the index entries
                original_string = line.strip()
                processed_string = self.preprocesser.preprocess(
                    original_string)

                # Warn if the same string conflicts with an existing entry
                if processed_string in self.index_dictionary:
                    conflict_string = self.index_dictionary[processed_string]
                    # Don't flag if they are the same to begin with
                    if conflict_string != original_string:
                        logger.warning(
                            'index_dictionary conflict: '
                            '%s conflicts with %s for key '
                            '%s. Keeping index_dictionary[%s] = %s',
                            original_string, conflict_string,
                            processed_string, processed_string,
                            conflict_string)
                    continue

                # if no conflict, add to index
                self.index_dictionary[processed_string] = original_string
        logger.debug('Corpus index created')

    def query(self, qstring: str):
        '''
        Queries an input string to the corpus, and retrieves the closest
        value in the corpus by edit distance.

        Args:
            qstring: string to query from the corpus

        Returns:
            (term, found_flag): Tuple of the suggested term and a flag of
                True if found in the corpus, or False if not. If not found
                within the corpus, returns the original qstring

        Raises:
            ValueError: if qstring is not a string, or if the index
                dictionary or SymSpell object has not been created
        '''
        # Status checks
        # Check qstring
        if not isinstance(qstring, str):
            msg = f'{qstring} is type {type(qstring)} not string'
            logger.error(msg)
            raise ValueError(msg)
        # Check index_dictionary
        if self.index_dictionary is None:
            msg = 'index_dictionary not created'
            logger.error(msg)
            raise ValueError(msg)
        # Check sym_spell
        if self.sym_spell is None:
            msg = 'sym_spell SymSpell object not created'
            logger.error(msg)
            raise ValueError(msg)

        # pre-process the string
        processed_string = self.preprocesser.preprocess(qstring)
        logger.debug('Querying string: \'%s\', preprocessed to \'%s\' ',
                     qstring, processed_string)

        # Look up string using Symspell; include_unknown guarantees an entry
        suggest = self.sym_spell.lookup(processed_string,
                                        Verbosity.TOP,
                                        include_unknown=True)
        found_term = suggest[0].term
        found_edit_distance = suggest[0].distance

        # Determine if string is a hit or miss and return result
        if found_edit_distance > self.max_edit_distance_dictionary:
            # indicates a failed lookup
            logger.debug('String \'%s\' not found!', qstring)
            return (qstring, False)

        # Found a term; backsolve and return found string
        # NOTE(review): raises KeyError if the SymSpell term is not a key of
        # index_dictionary -- confirm the pre-processor output always yields
        # one dictionary term per corpus line.
        backprocessed_string = self.index_dictionary[found_term]
        logger.debug('String \'%s\' found! Backprocessed \'%s\' to \'%s\'',
                     qstring, found_term, backprocessed_string)
        return (backprocessed_string, True)

    def get_fuzzy_column(self, dataframe, col_name, null_return=None):
        '''
        Given a Pandas dataframe and the name of a column, returns a new
        column with values taken from a fuzzy search of the underlying
        dictionary of names.

        Args:
            dataframe (pandas.DataFrame): Input dataframe from which column
                values will be taken
            col_name (str): string of the column name used for searching
                values
            null_return (str): string to return if value is not found in the
                underlying dictionary. If None, will return the input string
                from the old column in the new column. Default is None.

        Returns:
            fuzzy_col (pandas.Series): Output pandas series of query results

        Raises:
            LookupError: if col_name is not in dataframe
            ValueError: if null_return is not a string type
        '''
        logger.debug('Creating fuzzy column for %s', col_name)

        # Input checking
        # Check col_name
        if col_name not in dataframe.columns:
            msg = ' '.join([f'{col_name} not in dataframe columns:',
                            *dataframe.columns])
            logger.error(msg)
            raise LookupError(msg)
        # Check null_return
        if null_return is not None and not isinstance(null_return, str):
            msg = f'null_return is type {type(null_return)} not {str}'
            logger.error(msg)
            raise ValueError(msg)

        # Define a simple lookup function for serial df.apply
        def apply_query(value):
            out, found = self.query(value)
            if found or null_return is None:
                return out
            return null_return

        return dataframe[col_name].apply(apply_query)
output_dictionary_name = "response_validator_spelling_dictionary.txt"

# 1) Load up the original symspell dictionary and convert to pandas dataframe
sym_spell = SymSpell(3, 7)
sym_spell.load_dictionary(symspell_dictionary, 0, 1)
df_original = pd.DataFrame.from_dict(sym_spell.words,
                                     orient="index").reset_index()
df_original.columns = ["term", "count"]

# 2) Create a dictionary for each of the external datafiles. Append together
#    and get total counts for each term
# Terms containing punctuation or digits are dropped from the external data.
regexp = re.compile(r"[.!?\-\\+\[\]\#\$\%\^\&\*\(\)\@\d\']+")
external_frames = []
for file in external_files:
    sym_spell = SymSpell(3, 7)
    sym_spell.create_dictionary(file)
    df_temp = pd.DataFrame.from_dict(sym_spell.words,
                                     orient="index").reset_index()
    df_temp.columns = ["term", "count"]
    df_temp = df_temp[df_temp["term"].apply(lambda x: not regexp.search(x))]
    external_frames.append(df_temp)

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
if external_frames:
    df_external = pd.concat(external_frames)
else:
    df_external = pd.DataFrame(columns=["term", "count"])
df_external = df_external.groupby("term")["count"].sum().reset_index()

# 3) Adjust the counts in the original dataframe to be comparable to those in
#    the external dataframe. This avoids mangling the prior when doing
#    Bayesian spelling correction
N_external = df_external["count"].sum()
N_original = df_original["count"].sum()
# Without external counts there is nothing to rescale against; skipping also
# avoids a division by zero.
if N_external > 0:
    scale = N_original / N_external
    df_original["count"] = np.ceil(df_original["count"] /
                                   scale).astype("int64")
class SymDeletingTypoCorrecter(Module):
    """Typo corrector that runs SymSpell on flattened Hangul strings.

    Words are flattened with ``flat_hangeul`` before they are stored or
    looked up, and suggestions are re-assembled with
    ``merge_flatted_hangeul``.

    Args:
        max_edit_dist: edit distance for the SymSpell index and for lookups.
        prefix_length: SymSpell prefix length.
    """

    def __init__(self, max_edit_dist: int = 2, prefix_length: int = 10):
        self.symspell = SymSpell(max_dictionary_edit_distance=max_edit_dist,
                                 prefix_length=prefix_length)
        self.max_edit_dist = max_edit_dist

    def train(self,
              corpus_path: str,
              save_path: str,
              unigram_dict_prefix: str,
              bigram_dict_prefix: str = None,
              **kwargs):
        """Build dictionaries from ``corpus_path`` and save them as text.

        Writes ``<save_path>/<unigram_dict_prefix>.txt`` with
        "<flattened word> <count>" lines. When ``bigram_dict_prefix`` is
        given, also writes ``<save_path>/<bigram_dict_prefix>.txt`` with
        "<word1> <word2> <count>" lines for bigrams seen at least 5 times.
        """
        self.symspell.create_dictionary(corpus_path)

        # 1) Unigram dict
        # Lines are collected in a list and written in one call, instead of
        # growing one big string and writing it character by character.
        unigram_lines = [
            '{} {}\n'.format(''.join(flat_hangeul(key)), count)
            for key, count in self.symspell.words.items()
        ]
        unigram_save_path = os.path.join(save_path,
                                         unigram_dict_prefix + '.txt')
        with open(unigram_save_path, 'w', encoding='utf-8') as file:
            file.writelines(unigram_lines)
        print("Total {} Unigrams are saved!".format(len(self.symspell.words)))

        if bigram_dict_prefix:
            # 2) Bigram dict
            with open(corpus_path, 'r', encoding='utf-8') as file:
                corpus = [s.strip() for s in file]
            bi_count = self.count_bigrams(corpus, min_count=5)

            bigram_lines = []
            for key, count in bi_count.items():
                s1, s2 = key.split(' ')
                bigram_lines.append('{} {} {}\n'.format(
                    ''.join(flat_hangeul(s1)), ''.join(flat_hangeul(s2)),
                    count))

            bigram_save_path = os.path.join(save_path,
                                            bigram_dict_prefix + '.txt')
            with open(bigram_save_path, 'w', encoding='utf-8') as biFile:
                biFile.writelines(bigram_lines)
            print("Total {} bigrams are saved!".format(len(bi_count)))

    def load_model(self,
                   unigram_dict_path: str,
                   bigram_dict_path: str = None,
                   **kwargs):
        """Load the default dictionary, then the given dictionaries.

        Args:
            unigram_dict_path: "<term> <count>" dictionary file.
            bigram_dict_path: optional "<term1> <term2> <count>" file.

        Raises:
            ValueError: if a specified dictionary cannot be loaded.
        """
        here = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
        default_path = os.path.join(here, "resources", 'default_uni_dict.txt')
        try:
            self.symspell.load_dictionary(default_path,
                                          term_index=0,
                                          count_index=1)
            loaded = self.symspell.load_dictionary(unigram_dict_path,
                                                   term_index=0,
                                                   count_index=1)
        except ValueError as err:
            raise ValueError(
                "Specified unigram dictionary path not exist") from err
        # A missing file is reported through a False return value, not an
        # exception, so it has to be checked explicitly.
        if not loaded:
            raise ValueError("Specified unigram dictionary path not exist")

        if bigram_dict_path:
            try:
                # Load the bigram file itself (the unigram path was passed
                # here before); with two terms per line the count is in
                # column 2.
                loaded = self.symspell.load_bigram_dictionary(
                    bigram_dict_path, term_index=0, count_index=2)
            except ValueError as err:
                raise ValueError(
                    "Specified bigram dictionary path not exist") from err
            if not loaded:
                raise ValueError("Specified bigram dictionary path not exist")

    def infer(self, word: Text, **kwargs):
        """Return the closest dictionary word, or ``word`` if none is found."""
        suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
        suggestions = self.symspell.lookup(''.join(flat_hangeul(word)),
                                           suggestion_verbosity,
                                           self.max_edit_dist)
        if suggestions:
            word = list(suggestions[0].term)
            return merge_flatted_hangeul(word)
        return word

    @staticmethod
    def count_bigrams(corpus: list, min_count: int):
        """Count adjacent word pairs and keep those seen ``min_count``+ times.

        Args:
            corpus: sentences; items that are not strings are skipped.
            min_count: minimum number of occurrences for a bigram to be kept.

        Returns:
            dict mapping "<word1> <word2>" to its count.
        """
        counts = Counter()
        for t in tqdm(corpus):
            if not isinstance(t, str):
                continue
            tokens = t.split(' ')
            counts.update(' '.join(pair) for pair in zip(tokens, tokens[1:]))
        return {
            key: value
            for key, value in counts.items() if value >= min_count
        }