def test_lookup_compound(self):
    """lookup_compound merges split words and splits merged words into one suggestion."""
    max_ed = 2
    checker = SymSpell(83000, max_ed, 7)
    checker.load_dictionary(self.dictionary_path, 0, 1)
    cases = [
        (("whereis th elove hehad dated forImuch of thepast who "
          "couqdn'tread in sixthgrade and ins pired him"),
         ("where is the love he had dated for much of the past "
          "who couldn't read in sixth grade and inspired him")),
        ("in te dhird qarter oflast jear he hadlearned ofca sekretplan",
         ("in the third quarter of last year he had learned of a "
          "secret plan")),
        (("the bigjest playrs in te strogsommer film slatew ith plety "
          "of funn"),
         ("the biggest players in the strong summer film slate "
          "with plenty of fun")),
        ("Can yu readthis messa ge despite thehorible sppelingmsitakes",
         ("can you read this message despite the horrible "
          "spelling mistakes")),
    ]
    for typo, expected in cases:
        suggestions = checker.lookup_compound(typo, max_ed)
        self.assertEqual(1, len(suggestions))
        self.assertEqual(expected, suggestions[0].term)
def test_lookup_compound(self):
    """Check term, edit distance and frequency count of compound corrections."""
    print(' - %s' % inspect.stack()[0][3])
    cwd = os.path.realpath(os.path.dirname(__file__))
    dictionary_path = os.path.realpath(
        os.path.join(cwd, pardir, "symspellpy",
                     "frequency_dictionary_en_82_765.txt"))
    max_ed = 2
    checker = SymSpell(83000, max_ed, 7)
    checker.load_dictionary(dictionary_path, 0, 1)
    # (typo, expected correction, expected distance, expected count)
    cases = [
        (("whereis th elove hehad dated forImuch of thepast who "
          "couqdn'tread in sixthgrade and ins pired him"),
         ("where is the love he had dated for much of the past "
          "who couldn't read in sixth grade and inspired him"),
         9, 300000),
        ("in te dhird qarter oflast jear he hadlearned ofca sekretplan",
         ("in the third quarter of last year he had learned of a "
          "secret plan"),
         9, 23121323),
        (("the bigjest playrs in te strogsommer film slatew ith plety "
          "of funn"),
         ("the biggest players in the strong summer film slate "
          "with plenty of fun"),
         9, 3813904),
        ("Can yu readthis messa ge despite thehorible sppelingmsitakes",
         ("can you read this message despite the horrible "
          "spelling mistakes"),
         10, 6218089),
    ]
    for typo, expected, distance, count in cases:
        suggestions = checker.lookup_compound(typo, max_ed)
        self.assertEqual(1, len(suggestions))
        self.assertEqual(expected, suggestions[0].term)
        self.assertEqual(distance, suggestions[0].distance)
        self.assertEqual(count, suggestions[0].count)
class SegmentText():
    """Compound word splitter/corrector built on SymSpell.

    Loads a unigram and a bigram frequency dictionary either from the
    paths given, or from a local ``./symspellfre_`` directory by default.
    """

    def __init__(self, dictionary_path=None, bigram_path=None):
        # NOTE(review): the name string is "SegmenText" (missing 't') in the
        # original; kept byte-identical because callers may compare against it.
        self.name = "SegmenText"
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        # `is not None` instead of `!= None` (PEP 8: identity test for None).
        if dictionary_path is not None:
            self.dictionary_path = dictionary_path
        else:
            self.dictionary_path = os.path.join(
                "./symspellfre_", "frequency_dictionary_en_82_765.txt")
        if bigram_path is not None:
            self.bigram_path = bigram_path
        else:
            self.bigram_path = os.path.join(
                "./symspellfre_", "frequency_bigramdictionary_en_243_342.txt")
        self.sym_spell.load_dictionary(self.dictionary_path,
                                       term_index=0, count_index=1)
        self.sym_spell.load_bigram_dictionary(self.bigram_path,
                                              term_index=0, count_index=2)

    def split(self, sentence):
        """Look up compound suggestions for *sentence* (supports compound
        splitting & merging); prints each suggestion and returns the list.

        Max edit distance is per single word, not per whole input string.
        """
        suggestions = self.sym_spell.lookup_compound(sentence,
                                                     max_edit_distance=2)
        for suggestion in suggestions:
            print(suggestion)
        return suggestions
class _Project:
    """Name corrector backed by a custom SymSpell dictionary (dict_final.txt)."""

    def __init__(self):
        initial_capacity = 83000
        max_edit_distance_dictionary = 2
        prefix_length = 7
        self.sym_spell = SymSpell(initial_capacity,
                                  max_edit_distance_dictionary,
                                  prefix_length)
        dictionary_path = Path('dict_final.txt')
        term_index = 0   # column of the term in the dictionary text file
        count_index = 1  # column of the term frequency in the dictionary text file
        if not self.sym_spell.load_dictionary(dictionary_path, term_index,
                                              count_index):
            print("Dictionary file not found")
            return

    def correct_name(self, query):
        """Return the top compound correction for *query*.

        Returns None when the lookup yields no suggestions.  (The original
        returned from inside a for-loop; an empty result list would have
        fallen through with `suggestion` unbound.)
        """
        # max edit distance per lookup (per single word, not per whole input string)
        max_edit_distance_lookup = 2
        suggestions = self.sym_spell.lookup_compound(
            query, max_edit_distance_lookup)
        if suggestions:
            return suggestions[0].term
        return None
class SpellCorrect():
    """Sentence-level spell corrector; degrades gracefully when dictionaries
    are missing (``load_status`` stays False and ``correct`` returns it)."""

    def __init__(self, dictionary_path=dictionary_path__,
                 bigram_path=bigram_path__):
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        if self.is_valid_path(dictionary_path) and self.is_valid_path(
                bigram_path):
            self.sym_spell.load_dictionary(dictionary_path,
                                           term_index=0, count_index=1)
            self.sym_spell.load_bigram_dictionary(bigram_path,
                                                  term_index=0, count_index=2)
            self.load_status = True
        else:
            self.load_status = False
        self.name = "Spell Corrector"

    def is_valid_path(self, path_file):
        """Return True if *path_file* exists; log an error otherwise."""
        if not os.path.exists(path_file):
            # Fixed grammar of the message and use lazy %-style logging args.
            logging.error("The %s does not exist", path_file)
            return False
        return True

    def correct(self, sentence):
        """Return the top compound correction for *sentence*, or the load
        status (False) when the dictionaries were never loaded."""
        if self.load_status:
            # max edit distance per lookup (per single word, not per whole input string)
            suggestions = self.sym_spell.lookup_compound(
                sentence, max_edit_distance=2)
            if suggestions:
                return suggestions[0].term
        return self.load_status
def fix_spelling(directory, filename):
    """Spell-correct each line of ``directory + filename`` with
    lookup_compound and write the results to ``output/<filename>.spell``.

    Prints a message and returns early if the dictionary file is missing.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0   # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 2
    corrected_list = []
    # `with` fixes the original's leaked file handle (`for line in open(...)`).
    with open(directory + filename, 'r') as infile:
        for line in infile:
            suggestions = sym_spell.lookup_compound(line,
                                                    max_edit_distance_lookup)
            for suggestion in suggestions:
                corrected_list.append(suggestion.term)
    print(corrected_list)
    with open("output/" + filename + ".spell", 'w') as f:
        for line in corrected_list:
            f.write(line)
            f.write('\n')
def test_lookup_compound_no_suggestion(self):
    """When nothing in the dictionary matches, the typo is echoed back as-is."""
    max_ed = 2
    checker = SymSpell(max_ed, 7)
    for word in ("steam", "machine"):
        checker.create_dictionary_entry(word, 1)
    typo = "qwer erty ytui a"
    suggestions = checker.lookup_compound(typo, max_ed)
    self.assertEqual(1, len(suggestions))
    self.assertEqual(typo, suggestions[0].term)
def test_lookup_compound_only_combi(self):
    """A typo fixable only by re-combining split words is corrected."""
    max_ed = 2
    checker = SymSpell(max_ed, 7)
    for word in ("steam", "machine"):
        checker.create_dictionary_entry(word, 1)
    suggestions = checker.lookup_compound("ste am machie", max_ed)
    self.assertEqual(1, len(suggestions))
    self.assertEqual("steam machine", suggestions[0].term)
def test_lookup_compound_transfer_casing(self):
    """transfer_casing=True re-applies the typo's upper/lower casing to the fix."""
    max_ed = 2
    checker = SymSpell(max_ed, 7)
    checker.load_dictionary(self.dictionary_path, 0, 1)
    typo = ("Whereis th elove hehaD Dated forImuch of thepast who "
            "couqdn'tread in sixthgrade AND ins pired him")
    expected = ("Where is the love he haD Dated for much of the past "
                "who couldn't read in sixth grade AND inspired him")
    suggestions = checker.lookup_compound(typo, max_ed, transfer_casing=True)
    self.assertEqual(expected, suggestions[0].term)
def test_lookup_compound_ignore_non_words(self):
    """With ignore_non_words=True, numbers, acronyms and mixed-case tokens
    are passed through unchanged while real typos are still corrected."""
    print(' - %s' % inspect.stack()[0][3])
    cwd = os.path.realpath(os.path.dirname(__file__))
    dictionary_path = os.path.realpath(
        os.path.join(cwd, pardir, "symspellpy",
                     "frequency_dictionary_en_82_765.txt"))
    max_ed = 2
    checker = SymSpell(83000, max_ed, 7)
    checker.load_dictionary(dictionary_path, 0, 1)
    cases = [
        (("whereis th elove 123 hehad dated forImuch of THEPAST who "
          "couqdn'tread in SIXTHgrade and ins pired him"),
         ("where is the love 123 he had dated for much of THEPAST "
          "who couldn't read in sixth grade and inspired him")),
        ("in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan",
         ("in the DHIRD 1 quarter of last year he had learned "
          "of a secret plan")),
        (("the bigjest playrs in te stroGSOmmer film slatew ith PLETY "
          "of 12 funn"),
         ("the biggest players in the strong summer film slate "
          "with PLETY of 12 fun")),
        (("Can yu readtHIS messa ge despite thehorible 1234 "
          "sppelingmsitakes"),
         ("can you read this message despite the horrible 1234 "
          "spelling mistakes")),
        (("Can yu readtHIS messa ge despite thehorible AB1234 "
          "sppelingmsitakes"),
         ("can you read this message despite the horrible AB1234 "
          "spelling mistakes")),
        ("PI on leave, arrange Co-I to do screening",
         "PI on leave arrange co i to do screening"),
    ]
    for typo, expected in cases:
        suggestions = checker.lookup_compound(typo, max_ed, True)
        self.assertEqual(1, len(suggestions))
        self.assertEqual(expected, suggestions[0].term)
def symspell_correction(misspelled):
    """Return the most frequent spelling suggestion for *misspelled*.

    Tries a single-word lookup first, then falls back to a compound
    lookup.  Returns "" if the dictionary cannot be loaded.
    """
    # not used because it is too expensive
    from symspellpy import SymSpell, Verbosity
    sym_spell = SymSpell(83000, 2)
    dictionary_path = resdir + "frequency_dictionary_en_82_765.txt"
    if not sym_spell.load_dictionary(dictionary_path, 0, 1):
        return ""
    suggestions = sym_spell.lookup(misspelled, Verbosity.CLOSEST, 2)
    # max() is O(n) and picks the same (first) highest-count item that
    # sorted(..., reverse=True)[0] did, without sorting the whole list.
    if suggestions:
        return max(suggestions, key=lambda s: s.count).term
    return max(sym_spell.lookup_compound(misspelled, 2),
               key=lambda s: s.count).term
def spell_checker(inputTerm, path='./dictionary.txt'):
    """Spell-correct *inputTerm* word by word and return the joined result.

    ignore_non_words=True means a token that is not a known word is
    returned as-is; non-alphanumeric tokens are also passed through.
    """
    symspell = SymSpell()
    symspell.load_dictionary(path, term_index=0, count_index=1)
    maxEditDistance = 2
    correct_sent = []
    for token in inputTerm.split():
        if token.isalnum():
            suggestions = symspell.lookup_compound(token, maxEditDistance,
                                                   ignore_non_words=True)
            # Use the .term attribute directly instead of the original's
            # fragile str(suggestion).split(',')[0] parse of the repr.
            suggestion = suggestions[0].term.strip()
        else:
            suggestion = token.strip()
        correct_sent.append(suggestion)
    return " ".join(correct_sent)
def SpellCorrect(strings):
    """Return a list with the top compound correction for each row of *strings*.

    Rows that cannot be corrected (empty/invalid input) yield '\\n'.
    """
    sym_spell = SymSpell(max_dictionary_edit_distance=1, prefix_length=7)
    # term_index is the column of the term and count_index is the
    # column of the term frequency.
    # os.path.join replaces the original Windows-only '\\frequency.txt'
    # concatenation so the path also works on POSIX systems.
    sym_spell.load_dictionary(os.path.join(os.getcwd(), 'frequency.txt'),
                              term_index=0, count_index=1)
    temp = []
    for row in strings:
        try:
            suggestions = sym_spell.lookup_compound(row, max_edit_distance=1)
            # .term instead of parsing str(suggestion) on commas.
            temp.append(suggestions[0].term)
        except Exception:  # narrowed from a bare except:
            temp.append('\n')
    return temp
def test_lookup_compound_ignore_non_words(self):
    """With bigram data loaded and ignore_non_words=True, numbers, acronyms
    and mixed-case tokens pass through while typos are corrected."""
    max_ed = 2
    checker = SymSpell(max_ed, 7)
    checker.load_dictionary(self.dictionary_path, 0, 1)
    checker.load_bigram_dictionary(self.bigram_path, 0, 2)
    cases = [
        (("whereis th elove 123 hehad dated forImuch of THEPAST who "
          "couqdn'tread in SIXTHgrade and ins pired him"),
         ("where is the love 123 he had dated for much of THEPAST "
          "who couldn't read in sixth grade and inspired him")),
        ("in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan",
         ("in the DHIRD 1 quarter of last year he had learned "
          "of a secret plan")),
        (("the bigjest playrs in te stroGSOmmer film slatew ith PLETY "
          "of 12 funn"),
         ("the biggest players in the strong summer film slate "
          "with PLETY of 12 fun")),
        (("Can yu readtHIS messa ge despite thehorible 1234 "
          "sppelingmsitakes"),
         ("can you read this message despite the horrible 1234 "
          "spelling mistakes")),
        (("Can yu readtHIS messa ge despite thehorible AB1234 "
          "sppelingmsitakes"),
         ("can you read this message despite the horrible AB1234 "
          "spelling mistakes")),
        ("PI on leave, arrange Co-I to do screening",
         "PI on leave arrange co i to do screening"),
    ]
    for typo, expected in cases:
        suggestions = checker.lookup_compound(typo, max_ed, True)
        self.assertEqual(1, len(suggestions))
        self.assertEqual(expected, suggestions[0].term)
class WordCorrector(LogicAdapter):
    """ChatterBot logic adapter that offers spelling suggestions for
    single-word statements ("Do you mean ...?")."""

    def __init__(self, chatbot, **kwargs):
        super().__init__(chatbot, **kwargs)
        self.language = kwargs.get('language', languages.ENG)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        # TODO: the bundled dictionary has no numbers; dictionary
        # modification is needed.  (translated from Korean)
        self.dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        self.bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        self.sym_spell.load_dictionary(self.dictionary_path,
                                       term_index=0, count_index=1)
        self.sym_spell.load_bigram_dictionary(self.bigram_path,
                                              term_index=0, count_index=2)

    def can_process(self, statement):
        """Handle only single-word statements that yield a confident response."""
        try:
            if " " in statement.text.lower():
                return False
            response = self.process(statement)
            return response.confidence == 1
        except Exception:  # narrowed from a bare except:
            return False

    def process(self, statement, additional_response_selection_parameters=None):
        """Build a "Do you mean ..." response; empty text if already correct."""
        input_text = statement.text
        # Default "" also guards the original NameError when the lookup
        # produced no suggestions at all.
        expression = ""
        for suggestion in self.sym_spell.lookup_compound(
                input_text, max_edit_distance=2):
            corrected = suggestion.term  # attribute, not str()-repr parsing
            expression = "Do you mean \"" + corrected + "\""
            if input_text == corrected:
                expression = ""
        response = Statement(text=expression)
        # NOTE (translated from Korean): confidence used to be 0 when the
        # correction equalled the input and 1 otherwise; left at 1 for now.
        response.confidence = 1
        return response
class Spell_Checker():
    """Compound spelling corrector using symspellpy's bundled English
    unigram and bigram frequency dictionaries."""

    def __init__(self):
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        self.dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        self.bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        self.sym_spell.load_dictionary(self.dictionary_path,
                                       term_index=0, count_index=1)
        self.sym_spell.load_bigram_dictionary(self.bigram_path,
                                              term_index=0, count_index=2)

    def Correct_It(self, data):
        """Return *data* spell-corrected, preserving the input's casing."""
        suggestions = self.sym_spell.lookup_compound(
            data, max_edit_distance=2, transfer_casing=True)
        return " ".join(str(suggestion.term) for suggestion in suggestions)
import pkg_resources
from symspellpy import SymSpell

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

s = "02/05/2016"
# Normalize before lookup: drop spaces and lowercase.  The original computed
# input_term and then passed the raw `s` (dead variable); for this input both
# are identical, but the normalized term is what was clearly intended.
input_term = s.replace(" ", "").lower()
suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)
# display the top suggestion's term
print(suggestions[0].term)
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1, encoding='utf-8') # sym_spell.load_dictionary('C:/Users/nt.anh6/PycharmProjects/aicr_vn/nlp_model/spell_checker/dict/vi_full.txt', term_index=0, count_index=1, encoding='utf-8') sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2, encoding='utf-8') # lookup suggestions for multi-word input strings (supports compound # splitting & merging) input_term = "Ngyễn tành nm" # max edit distance per lookup (per single word, not per whole input string) # suggestions = sym_spell.lookup(input_term, Verbosity.ALL, max_edit_distance=2, include_unknown=True) suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2) # display suggestion term, edit distance, and term frequency for suggestion in suggestions: print(suggestion) def load_name_corection(dictionary_path, bigram_path): sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) # dictionary_path = pkg_resources.resource_filename( # dictionary_path) # bigram_path = pkg_resources.resource_filename( # bigram_path) sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1, encoding='utf-8')
class SpellCheck():
    def __init__(self, init_path=None):
        """Spelling checker: symspellpy==6.5.2.

        https://symspellpy.readthedocs.io/en/latest/examples/lookup.html#basic-usage.
        https://towardsdatascience.com/essential-text-correction-process-for-nlp-tasks-f731a025fcc3."""
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        self.set_dictionary_path(init_path)
        self.set_dictionary()

    def set_dictionary_path(self, path):
        """Use *path* if given, else fall back to the bundled English dictionary."""
        if path:
            self.path = path
        else:
            self.path = pkg_resources.resource_filename(
                "symspellpy", "frequency_dictionary_en_82_765.txt")
        return self.path

    def set_df(self):
        """Load the dictionary file into a two-column [term, freq] DataFrame."""
        # np.int was removed in NumPy 1.24 -- use the explicit np.int64.
        self.df = pd.read_csv(self.path, sep=' ', header=None,
                              dtype={0: str, 1: np.int64})
        return self.df

    def set_dict(self):
        """Build a plain {term: frequency} mapping from the DataFrame."""
        self.set_df()
        self.dictionary = {
            self.df.loc[i, 0]: self.df.loc[i, 1] for i in self.df.index
        }
        return self.dictionary

    def set_dictionary(self):
        """Load the dictionary into SymSpell and refresh the lookup mapping."""
        self.sym_spell.load_dictionary(self.path, term_index=0, count_index=1)
        self.set_dict()
        return None

    def find(self, term):
        """Return *term*'s frequency, or 'nothing found'."""
        return self.dictionary.get(term, 'nothing found')

    def append_dict(self, df_custom, cust_path='./data/cust_freq_dict_en.txt'):
        """Add custom dictionary.

        df: [term, freq]"""
        df_init = self.set_df()
        try:
            df_custom = df_custom.replace([np.inf, -np.inf, np.nan], 99)
            df_custom[1] = df_custom[1].astype(int)
            df = pd.concat([df_init, df_custom], ignore_index=True)
        except Exception as err:
            st.write('something went wrong', err)
            return -1
        # Remove duplicate terms and sort on frequency
        df.drop_duplicates(subset=[0], keep='first', inplace=True)
        df.sort_values(by=[1], ascending=False, inplace=True)
        # Save & Load after adding custom dictionary
        self.set_dictionary_path(cust_path)
        df.to_csv(self.path, sep=' ', index=None, header=None)
        self.set_dictionary()
        return None

    def __call__(self, input_term, N=8):
        """lookup suggestions for single- and multi-word input strings"""
        # Check loner words (N chars) on possible concatenation
        # https://symspellpy.readthedocs.io/en/latest/api/symspellpy.html#symspellpy.symspellpy.Verbosity
        if (len(input_term.split(' '))) == 1 or (len(input_term) < N):
            suggestions = self.sym_spell.lookup(input_term, Verbosity.TOP,
                                                max_edit_distance=2,
                                                transfer_casing=True,
                                                include_unknown=True)
        else:
            # Punctuation get's lost!
            suggestions = self.sym_spell.lookup_compound(
                input_term, max_edit_distance=2, transfer_casing=True)
        # Suggestion term, term frequency, and edit distance
        return [sug.term for sug in suggestions][0]
def test_lookup_compound_replaced_words(self):
    """replaced_words accumulates the typo->fix mapping across lookups."""
    print(' - %s' % inspect.stack()[0][3])
    cwd = os.path.realpath(os.path.dirname(__file__))
    dictionary_path = os.path.realpath(
        os.path.join(cwd, pardir, "symspellpy",
                     "frequency_dictionary_en_82_765.txt"))
    max_ed = 2
    checker = SymSpell(83000, max_ed, 7)
    checker.load_dictionary(dictionary_path, 0, 1)
    cases = [
        (("whereis th elove hehad dated forImuch of thepast who "
          "couqdn'tread in sixthgrade and ins pired him"),
         ("where is the love he had dated for much of the past "
          "who couldn't read in sixth grade and inspired him"),
         {"whereis": "where is", "th": "the", "elove": "love",
          "hehad": "he had", "forimuch": "for much", "thepast": "the past",
          "couqdn'tread": "couldn't read", "sixthgrade": "sixth grade",
          "ins": "in"}),
        ("in te dhird qarter oflast jear he hadlearned ofca sekretplan",
         ("in the third quarter of last year he had learned of a "
          "secret plan"),
         {"te": "the", "dhird": "third", "qarter": "quarter",
          "oflast": "of last", "jear": "year", "hadlearned": "had learned",
          "ofca": "of a", "sekretplan": "secret plan"}),
        (("the bigjest playrs in te strogsommer film slatew ith plety "
          "of funn"),
         ("the biggest players in the strong summer film slate "
          "with plenty of fun"),
         {"bigjest": "biggest", "playrs": "players",
          "strogsommer": "strong summer", "slatew": "slate", "ith": "with",
          "plety": "plenty", "funn": "fun"}),
    ]
    expected_total = 0
    for typo, correction, replacements in cases:
        results = checker.lookup_compound(typo, max_ed)
        expected_total += len(replacements)
        self.assertEqual(expected_total, len(checker.replaced_words))
        for k, v in replacements.items():
            self.assertEqual(v, checker.replaced_words[k].term)
    # The original asserted the result list only for the final case.
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

# Read the OCR output, skipping short lines (< 15 chars — presumably page
# noise such as headers/numbers; TODO confirm).  `with` fixes the original's
# never-closed file handle.
file_contents = ''
with open('../lycurgusOCR.txt', 'r', encoding='utf-8') as ocr_file:
    for line in ocr_file:
        if len(line) < 15:
            continue
        file_contents += line

suggestions = sym_spell.lookup_compound(file_contents, max_edit_distance=2)
for suggestion in suggestions:
    print(suggestion)
def export():
    # Transcribes video.mp4 with the Silero STT model, spell-corrects the
    # transcript with symspellpy, aligns it with aeneas, and returns the
    # resulting SRT as a download Response.  Heavy side effects throughout:
    # shells out to ffmpeg/curl/aeneas and downloads a torch.hub model.
    import os
    import torch
    import zipfile
    import torchaudio
    from glob import glob
    device = torch.device('cpu')  # gpu also works, but our models are fast enough for CPU
    model, decoder, utils = torch.hub.load('snakers4/silero-models', model='silero_stt', language='en')
    (read_batch, split_into_batches, read_audio, prepare_model_input) = utils  # see function signature for details
    # Extract the audio track and convert it to WAV for the STT model.
    os.system("ffmpeg -i 'video.mp4' -vn -acodec copy audio.aac")
    os.system("ffmpeg -i audio.aac audio.wav")
    # download a single file, any format compatible with TorchAudio (soundfile backend)
    # torch.hub.download_url_to_file('https://opus-codec.org/static/examples/samples/speech_orig.wav',
    #                                dst ='speech_orig.wav', progress=True)
    test_files = glob('audio.wav')
    batches = split_into_batches(test_files, batch_size=10)
    # NOTE(review): `input` shadows the builtin of the same name.
    input = prepare_model_input(read_batch(batches[0]))
    text = ""
    output = model(input)
    for example in output:
        pred = decoder(example.cpu())
        text = text + pred
    # Fetch the symspellpy frequency dictionaries into the working directory.
    os.system("curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_dictionary_en_82_765.txt")
    os.system("curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_bigramdictionary_en_243_342.txt")
    import pkg_resources
    from symspellpy import SymSpell, Verbosity
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
    # lookup suggestions for multi-word input strings (supports compound
    # splitting & merging)
    # input_term = ("whereis th elove hehad dated forImuch of thepast who "
    #               "couqdn'tread in sixtgrade and ins pired him")
    # max edit distance per lookup (per single word, not per whole input string)
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    # display suggestion term, edit distance, and term frequency;
    # `text` ends up holding the repr of the last suggestion.
    for suggestion in suggestions:
        print(suggestion)
        text = str(suggestion)
    # Re-wrap the corrected transcript five words per line for aeneas.
    cnt = 0
    textlines = []
    while cnt < len(text.split(" ")):
        print(text.split(" ")[cnt:cnt+5])
        line = "\n" + " ".join(text.split(" ")[cnt:cnt+5])
        textlines.append(line)
        cnt += 5
    f = open("script_cleaned.txt", "a")
    f.writelines(textlines)
    f.close()
    # Force-align audio and script into an SRT subtitle file.
    os.system("python -m aeneas.tools.execute_task \
        audio.wav \
        script_cleaned.txt \
        'task_language=eng|os_task_file_format=srt|is_text_type=plain' \
        subtitles.srt")
    with open("subtitles.srt") as f:
        srt = f.read()
    # Serve the SRT as a file download (Flask-style Response).
    return Response(
        srt,
        mimetype="text/srt",
        headers={
            "Content-disposition": "attachment; filename=subtitiles.srt"
        }
    )
class spellchecker:
    """Wrapper around SymSpell supporting single-word, compound, and
    per-token lookups, loadable from text dictionaries or a pickle."""

    def __init__(
        self,
        max_dictionary_edit_distance,
        prefix_length,
        unigram_freq_file,
        bigram_freq_file=None,
        pickle_file=None,
    ):
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=max_dictionary_edit_distance,
            prefix_length=prefix_length,
        )
        # A pickle snapshot takes precedence over the text dictionaries.
        if pickle_file is not None:
            self.sym_spell.load_pickle(pickle_file, )
        else:
            self.sym_spell.load_dictionary(
                unigram_freq_file,
                term_index=0,
                count_index=1,
                encoding="utf-8",
            )
            if bigram_freq_file:
                self.sym_spell.load_bigram_dictionary(
                    bigram_freq_file,
                    term_index=0,
                    count_index=2,
                    encoding="utf-8",
                )

    def suggest(
        self,
        word,
        max_edit_dist=None,
        include_unknown=True,
        verbosity=Verbosity.CLOSEST,
    ):
        """Single-word lookup; returns {'original_term', 'suggestions'}."""
        # `is None` instead of `== None` (PEP 8 identity test).
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE
        suggestions = self.sym_spell.lookup(
            word,
            verbosity,
            max_edit_distance=max_edit_dist,
            include_unknown=include_unknown,
        )
        return {
            'original_term': word,
            'suggestions': suggestions,
        }

    def suggest_compound(
        self,
        phrase,
        max_edit_dist=None,
    ):
        """Compound lookup (supports splitting & merging) for a phrase."""
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE
        suggestions = self.sym_spell.lookup_compound(
            phrase,
            max_edit_distance=max_edit_dist,
            # ignore_non_words=False,
            # split_phrase_by_space=True,
        )
        return {
            'original_term': phrase,
            'suggestions': suggestions,
        }

    def tokenize(self, phrases):
        """Split *phrases* into tokens via the project tokenizer."""
        return tokenize_sentence(phrases)

    # Tokenize into individual phrases and return a list of suggestions for each
    def suggest_tokenize(
        self,
        phrases,
        max_edit_dist=None,
        include_unknown=True,
        verbosity=Verbosity.CLOSEST,
    ):
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE
        words = self.tokenize(phrases)
        sentence_suggestions = []
        for word in words:
            suggestions = self.sym_spell.lookup(
                word,
                verbosity,
                max_edit_distance=max_edit_dist,
                include_unknown=include_unknown,
            )
            sentence_suggestions.append({
                'original_term': word,
                'suggestions': suggestions,
            })
        return sentence_suggestions
def test_lookup_compound_replaced_words_no_bigram(self):
    """lookup_compound without a bigram dictionary: verify that
    ``replaced_words`` accumulates every typo->correction mapping
    across successive calls on the same SymSpell instance.

    NOTE(review): the first two ``correction`` strings are assigned but
    never asserted against ``results``; only the third phrase's result
    text is checked.
    """
    edit_distance_max = 2
    prefix_length = 7
    # Only the unigram dictionary is loaded (no bigram file).
    sym_spell = SymSpell(edit_distance_max, prefix_length)
    sym_spell.load_dictionary(self.dictionary_path, 0, 1)
    # --- Phrase 1 ---
    typo = ("whereis th elove hehad dated forImuch of thepast who "
            "couqdn'tread in sixthgrade and ins pired him")
    correction = ("whereas the love head dated for much of the past who "
                  "couldn't read in sixth grade and inspired him")
    # Expected typo -> replacement pairs recorded in replaced_words.
    # Keys are lower-cased forms (e.g. "forimuch" for "forImuch").
    replacement_1 = {
        "whereis": "whereas",
        "th": "the",
        "elove": "love",
        "hehad": "head",
        "forimuch": "for much",
        "thepast": "the past",
        "couqdn'tread": "couldn't read",
        "sixthgrade": "sixth grade",
        "ins": "in"
    }
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(len(replacement_1), len(sym_spell.replaced_words))
    for k, v in replacement_1.items():
        self.assertEqual(v, sym_spell.replaced_words[k].term)
    # --- Phrase 2: replaced_words keeps growing (not reset per call) ---
    typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
    correction = ("in the third quarter of last year he had learned of a "
                  "secret plan")
    replacement_2 = {
        "te": "the",
        "dhird": "third",
        "qarter": "quarter",
        "oflast": "of last",
        "jear": "year",
        "hadlearned": "had learned",
        "ofca": "of a",
        "sekretplan": "secret plan"
    }
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    # Cumulative count: phrase 1 + phrase 2 replacements.
    self.assertEqual(
        len(replacement_1) + len(replacement_2),
        len(sym_spell.replaced_words))
    for k, v in replacement_2.items():
        self.assertEqual(v, sym_spell.replaced_words[k].term)
    # --- Phrase 3: cumulative count plus the corrected output text ---
    typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
            "of funn")
    correction = ("the biggest players in the strong summer film slate "
                  "with plenty of fun")
    replacement_3 = {
        "bigjest": "biggest",
        "playrs": "players",
        "strogsommer": "strong summer",
        "slatew": "slate",
        "ith": "with",
        "plety": "plenty",
        "funn": "fun"
    }
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(
        len(replacement_1) + len(replacement_2) + len(replacement_3),
        len(sym_spell.replaced_words))
    # lookup_compound always returns a single combined suggestion.
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    for k, v in replacement_3.items():
        self.assertEqual(v, sym_spell.replaced_words[k].term)
def test3():
    """Spell-check a block of 17th-century OCR treaty text with
    symspellpy's compound lookup and print the suggestions.

    Pre-processing before the lookup: newlines are flattened to spaces,
    words hyphenated across line breaks are re-joined, and the archaic
    long s (U+017F) is folded to a plain 's'.
    """
    sys.path.append("treatyUtil")
    import pkg_resources
    from symspellpy import SymSpell, Verbosity
    # NOTE(review): imported for parity with earlier experiments on
    # punctuation-preserving checking; not used below.
    from treatyUtil import spellcheck_keep_punctuation

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency.
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    # OCR text to correct. Reassembled from a backslash-continued literal;
    # whitespace at former continuation points normalized to single
    # spaces -- TODO confirm against the original source text.
    input_term = (
        "God, and Safety of the Chriſtian World (the Electors,\n"
        "Princes and States of the Sacred Roman Empire being\n"
        "preſent, approving and conſenting) the Articles of Peace\n"
        "and Anity, whereof the Tenour follows.\n"
        "1. That there be a Chriſtian, univerſal\n"
        "The Re-efta. and perpetual Peace, and a true and ſincere\n"
        "bliſhment of Friendſhip and Amity between his Sacred\n"
        "Peace and A. Imperial Majeſty, the Houſe of Austria,\n"
        "mity.\n"
        "and all his Allies and Adherents, and the\n"
        "Heirs and Succeffors of each of them, chiefly the King\n"
        "of Spain, and the Electors, Princes and States of the En-\n"
        "pire, of the one ſide, and her Sacred Royal Majeſty,\n"
        "and the Kingdom of Sweden, her Allies and Adherents,\n"
        "and the Heirs and Succeſſors of each of them, eſpecially\n"
        "the moſt Chriſtian King, the reſpective Electors, Princes\n"
        "and States of the Empire, of the other ſide ; and that this\n"
        "Peace be obſerv'd and cultivated ſincerely and ſeriouſly,\n"
        "ſo that each Party may procure the Benefit, Honour and\n"
        "Advantage of one another, and thereby the Fruits of this\n"
        "Peace and Amity may be ſeen to grow up and fouriſh a-\n"
        "new, by a ſure and reciprocal maintaining of a good\n"
        "and faithful Neighbourhood between the Roman Empire\n"
        "and the Kingdom of Sweden reciprocally,\n"
        "II. That there be on both ſides à perpe-\n"
        "An Amneſty\n"
        "tua) Oblivion and Amneſty of all that has\n"
        "from all Hoffi- been done Since the beginning of theſe\n"
        "lity.\n"
        "Troubles, in what Place or in what Man-\n"
    )
    # Fixed-string substitutions: str.replace is clearer (and avoids the
    # regex machinery) for these literal patterns. Order matters:
    # newlines become spaces first, so "-\n" has already become "- ".
    input_term = input_term.replace("\n", " ")   # flatten line breaks
    input_term = input_term.replace("- ", "")    # re-join hyphenated words
    input_term = input_term.replace("ſ", "s")    # long s -> modern s

    # max edit distance is per single word, not per whole input string.
    suggestions = sym_spell.lookup_compound(
        input_term,
        transfer_casing=True,
        ignore_non_words=True,
        max_edit_distance=2)
    # Display each suggestion (term, edit distance, term frequency).
    for suggestion in suggestions:
        print(suggestion)