def test_lookup_compound_ignore_non_words(self):
    """Check compound lookup when the third positional flag is ``True``.

    Loads the English frequency dictionary that lives in the sibling
    ``symspellpy`` directory, then runs every (typo, expected) pair through
    ``lookup_compound`` and asserts that exactly one suggestion comes back
    and that its term equals the expected text.
    """
    # Must stay in this function: stack()[0][3] is the running function name.
    print(' - %s' % inspect.stack()[0][3])

    here = path.realpath(path.dirname(__file__))
    dictionary_path = path.realpath(
        path.join(here, pardir, "symspellpy",
                  "frequency_dictionary_en_82_765.txt"))

    max_distance = 2
    sym_spell = SymSpell(83000, max_distance, 7)
    sym_spell.load_dictionary(dictionary_path, 0, 1)

    cases = (
        ("whereis th elove 123 hehad dated forImuch of THEPAST who "
         "couqdn'tread in SIXTHgrade and ins pired him",
         "where is the love 123 he had dated for much of THEPAST "
         "who couldn't read in sixth grade and inspired him"),
        ("in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan",
         "in the DHIRD 1 quarter of last year he had learned "
         "of a secret plan"),
        ("the bigjest playrs in te stroGSOmmer film slatew ith PLETY "
         "of 12 funn",
         "the biggest players in the strong summer film slate "
         "with PLETY of 12 fun"),
        ("Can yu readtHIS messa ge despite thehorible 1234 "
         "sppelingmsitakes",
         "can you read this message despite the horrible 1234 "
         "spelling mistakes"),
        ("PI on leave, arrange Co-I to do screening",
         "PI on leave arrange co i to do screening"),
    )

    for typo, expected in cases:
        results = sym_spell.lookup_compound(typo, max_distance, True)
        self.assertEqual(1, len(results))
        self.assertEqual(expected, results[0].term)
def correct_spelling(sentence):
    """Return the first compound-lookup suggestion for ``sentence``.

    The frequency dictionary is read from the directory containing this
    module. Every ``"& ;"`` in the input is replaced with ``"and"`` before
    the lookup.

    Returns:
        The term of the first suggestion, ``""`` when there are no
        suggestions, or ``None`` (after printing a message) when the
        dictionary cannot be loaded.
    """
    # Arguments: maximum edit distance for precalculation, prefix length.
    speller = SymSpell(2, 5)

    frequency_file = os.path.join(os.path.dirname(__file__),
                                  "frequency_dictionary_en_82_765.txt")
    # Column 0 holds the term, column 1 the term frequency.
    if not speller.load_dictionary(frequency_file, 0, 1):
        print("Dictionary file not found")
        return

    sentence = sentence.replace("& ;", "and")

    candidates = speller.lookup_compound(sentence, 2)
    return candidates[0].term if candidates else ""
def spell_correction(texte):
    """Print all compound-lookup suggestions for ``texte``, return the best.

    Returns:
        The term of the first suggestion; ``texte`` unchanged when no
        suggestion is produced; ``None`` when either dictionary fails to
        load.
    """
    speller = SymSpell(2, 7)

    unigram_file = "../ressources/fr-100k.txt"
    # NOTE(review): the unigram file name suggests French while the bigram
    # file is the English one shipped with symspellpy -- confirm intent.
    bigram_file = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

    if not speller.load_dictionary(unigram_file, term_index=0, count_index=1):
        print("Dictionary file not found")
        return
    if not speller.load_bigram_dictionary(bigram_file, term_index=0,
                                          count_index=2):
        print("Bigram dictionary file not found")
        return

    # max edit distance per lookup (per single word, not per whole string)
    candidates = speller.lookup_compound(texte, 2)

    # display suggestion term, edit distance, and term frequency
    for candidate in candidates:
        print("{}, {}, {}".format(candidate.term, candidate.distance,
                                  candidate.count))

    if candidates:
        return candidates[0].term
    print("error with : ", texte)
    return texte
def main():
    """Spell-correct the text of ``note.html`` and time the lookup.

    Reads ``note.html`` from the current working directory, strips its
    markup with ``stripHTML``, prints the stripped text, runs a compound
    lookup over it, prints every suggestion and finally prints the
    ``seconds`` component of the elapsed time.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 3
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary file
    if not sym_spell.load_dictionary(dictionary_path, term_index,
                                     count_index):
        print("Dictionary file not found")
        return

    # max edit distance per lookup (per single word, not per whole string)
    max_edit_distance_lookup = 3

    # The context manager closes the handle; the original leaked it.
    with open("note.html", "r") as note_file:
        note_string = note_file.read()
    note_string = stripHTML(note_string)
    print(note_string)

    tstart = datetime.now()
    suggestions = sym_spell.lookup_compound(note_string,
                                            max_edit_distance_lookup)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
    elapsed = datetime.now() - tstart
    # ``seconds`` is the whole-second component of the timedelta.
    print(elapsed.seconds)
def main():
    """Demo: compound lookup with casing transfer over a fixed sentence.

    Loads the bundled English frequency dictionary plus a second,
    machine-specific dictionary, then prints the input followed by the
    term of every suggestion.
    """
    speller = SymSpell(2, 7)

    bundled = os.path.join(os.path.dirname(__file__),
                           "frequency_dictionary_en_82_765.txt")
    # Column 0 holds the term, column 1 the term frequency.
    if not speller.load_dictionary(bundled, 0, 1):
        print("Dictionary file not found")
        return

    # Extra dictionary; its load result is deliberately not checked here,
    # matching the existing behaviour.
    speller.load_dictionary(
        "/home/yadi/projectDISK/Python-Projects/ML-NLP/dictionary.txt", 0, 1)

    # Adjacent literals are concatenated without a separating space.
    phrase = ("whereis th elove hehad dated forImuch of thepast who "
              "couqdn'tread in sixtgrade and ins pired him."
              "I'm workig in th e yadolah shahrary working in githib")

    # max edit distance per lookup (per single word, not per whole string)
    results = speller.lookup_compound(phrase, 1, transfer_casing=True)

    print(phrase)
    for result in results:
        print("{}".format(result.term))
def correctly_spelled(data, max_edit_distance_lookup=None):
    """Spell-correct the tokens of ``data[2]`` and return the joined text.

    The module-level ``sym_speller`` is created and its dictionary loaded
    on first use; the process exits with status 1 if the dictionary is
    missing.

    A token is corrected only when it is purely alphabetic and is none of:
    ``data[0]``, ``data[1]``, or -- when ``data[1]`` has at least three
    whitespace-separated words -- the string built from the first letter
    of each of those words. All other tokens are passed through unchanged.

    Args:
        data: indexable sample; items 0, 1 and 2 are read.
        max_edit_distance_lookup: lookup edit distance; defaults to the
            module-level ``max_edit_distance_dictionary``.

    Returns:
        The tokens joined with single spaces.
    """
    global sym_speller

    if sym_speller is None:
        sym_speller = SymSpell(max_edit_distance_dictionary, prefix_length)
        sym_spell_dict_path = os.path.join(
            os.path.dirname(__file__), "frequency_dictionary_en_82_765.txt")
        # Column 0 holds the term, column 1 the term frequency.
        if not sym_speller.load_dictionary(sym_spell_dict_path, 0, 1):
            print("ERROR! SymSpellPy dictionary not found at following path:",
                  sym_spell_dict_path)
            os._exit(1)

    if max_edit_distance_lookup is None:
        max_edit_distance_lookup = max_edit_distance_dictionary

    def _is_protected(token):
        # Evaluated lazily, in the same order as the original expression.
        if token == data[0] or token == data[1]:
            return True
        if len(data[1].split()) >= 3:
            return "".join([x[0] for x in data[1].split()]) == token
        return False

    pieces = []
    for token in tokenized(data[2]):
        if token.isalpha() and not _is_protected(token):
            best = sym_speller.lookup_compound(token,
                                               max_edit_distance_lookup)[0]
            pieces.append(best.term)
        else:
            pieces.append(token)
    return " ".join(pieces)
def main():
    """Run one compound lookup for the term ``'live'`` and print the result.

    The dictionary ``dictionary_final.txt`` is read from the directory
    containing this module. Each suggestion is printed as
    ``term, distance, count``.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 9
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "dictionary_final.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary file
    if not sym_spell.load_dictionary(dictionary_path, term_index,
                                     count_index):
        print("Dictionary file not found")
        return

    # max edit distance per lookup
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2

    input_term = 'live'
    suggestions = sym_spell.lookup_compound(input_term,
                                            max_edit_distance_lookup)
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
def symspell_checker(text):
    """Return the first compound-lookup suggestion for ``text``.

    Both dictionaries are opened by relative file name, so they are
    resolved against the current working directory. Their load results are
    not checked. When the lookup yields nothing, ``text`` is returned
    unchanged.
    """
    from symspellpy.symspellpy import SymSpell

    checker = SymSpell()
    checker.load_dictionary(r"frequency_dictionary_en_82_765.txt", 0, 1)
    checker.load_bigram_dictionary(
        r"frequency_bigramdictionary_en_243_342.txt", 0, 2)

    first = next(iter(checker.lookup_compound(text, 2)), None)
    if first is None:
        return text
    return first.term
def test_lookup_compound(self):
    """Check compound lookup against four known (typo, expected) pairs.

    Loads the English frequency dictionary from the sibling ``symspellpy``
    directory and asserts, for each pair, that exactly one suggestion is
    returned and that its term equals the expected text.
    """
    # Must stay in this function: stack()[0][3] is the running function name.
    print(' - %s' % inspect.stack()[0][3])

    here = path.realpath(path.dirname(__file__))
    dictionary_path = path.realpath(
        path.join(here, pardir, "symspellpy",
                  "frequency_dictionary_en_82_765.txt"))

    max_distance = 2
    sym_spell = SymSpell(83000, max_distance, 7)
    sym_spell.load_dictionary(dictionary_path, 0, 1)

    cases = (
        ("whereis th elove hehad dated forImuch of thepast who "
         "couqdn'tread in sixthgrade and ins pired him",
         "where is the love he had dated for much of the past "
         "who couldn't read in sixth grade and inspired him"),
        ("in te dhird qarter oflast jear he hadlearned ofca sekretplan",
         "in the third quarter of last year he had learned of a "
         "secret plan"),
        ("the bigjest playrs in te strogsommer film slatew ith plety "
         "of funn",
         "the biggest players in the strong summer film slate "
         "with plenty of fun"),
        ("Can yu readthis messa ge despite thehorible sppelingmsitakes",
         "can you read this message despite the horrible "
         "spelling mistakes"),
    )

    for typo, expected in cases:
        results = sym_spell.lookup_compound(typo, max_distance)
        self.assertEqual(1, len(results))
        self.assertEqual(expected, results[0].term)
async def quote(self, message, args):
    """Quote an earlier message in the channel, then delete the command.

    When ``args[0]`` parses as an integer it is used as a message id and
    fetched directly. Otherwise ``args[0]`` is treated as text: a SymSpell
    dictionary is built from its words, and up to 100 messages of channel
    history are scanned for the first one whose compound-lookup result
    equals that of the text.

    Sends ``"Message not found!"`` when nothing is found.

    Args:
        message: the command message; its channel and guild are used.
        args: command arguments; only ``args[0]`` is read.
    """
    msg = None
    if try_parse_int64(args[0]) is not None:
        msg_id = args[0]
        try:
            msg = await self.client.get_message(message.channel.id, msg_id)
        except Exception as exception:  # pylint: disable=W0703
            LOG.exception(exception)
    else:
        input_term = args[0]
        sym_spell = SymSpell()
        for term in input_term.split(" "):
            sym_spell.create_dictionary_entry(term, 1)
        target = sym_spell.lookup_compound(input_term, 2)[0].term

        history = message.channel.history(limit=100)
        for __ in range(100):
            try:
                candidate = await history.next()
            except NoMoreItems:
                # History exhausted: stop instead of polling again.
                break
            suggestion = sym_spell.lookup_compound(candidate.content, 2)[0]
            if suggestion.term == target:
                # Re-fetch through the client so ``msg`` has the same
                # mapping shape that the formatting code below indexes.
                msg = await self.client.get_message(message.channel.id,
                                                    candidate.id)
                break
        # ``msg`` is only ever assigned on a match, so a scan without a
        # match now correctly falls through to "Message not found!".

    if msg is not None:
        display_name = message.guild.get_member(
            int(msg["author"]["id"])).display_name
        time_str = (
            datetime.strptime(msg["timestamp"].split(".")[0],
                              "%Y-%m-%dT%H:%M:%S")
            + timedelta(hours=TZ_OFFSET)).strftime("%Y-%m-%d %I:%M %p")
        quote_msg = "```{} - {} UTC+{}\n{}```".format(
            display_name, time_str, TZ_OFFSET, msg["content"])
    else:
        quote_msg = "Message not found!"

    await self.client.send_message(message.channel.id, quote_msg)
    await message.delete()
def main():
    """Demo of single-word lookup, compound lookup and word segmentation.

    Prints, in order: the suggestions for ``"pyth"``, the suggestions for
    a misspelled sentence, and the segmentation of a sentence written
    without spaces.
    """
    speller = SymSpell(2, 7)

    freq_path = os.path.join(os.path.dirname(__file__),
                             "frequency_dictionary_en_82_765.txt")
    # Column 0 holds the term, column 1 the term frequency.
    if not speller.load_dictionary(freq_path, 0, 1):
        print("Dictionary file not found")
        return

    lookup_distance = 2

    # 1) single-word lookup
    for hit in speller.lookup("pyth", Verbosity.CLOSEST, lookup_distance):
        print("{}, {}, {}".format(hit.term, hit.distance, hit.count))

    # 2) multi-word lookup (supports compound splitting & merging); the
    #    edit distance applies per word, not to the whole string
    sentence = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixtgrade and ins pired him")
    for hit in speller.lookup_compound(sentence, lookup_distance):
        print("{}, {}, {}".format(hit.term, hit.distance, hit.count))

    # 3) a sentence without any spaces
    segmented = speller.word_segmentation(
        "thequuickbrownfoxjumpsoverthelazydog")
    print("{}, {}, {}".format(segmented.corrected_string,
                              segmented.distance_sum,
                              segmented.log_prob_sum))
def correction(input_term):
    """Collect single-word and compound suggestions for ``input_term``.

    Suggestions from both lookups are merged, ordered by edit distance and
    de-duplicated by term, keeping the first occurrence of each term.

    Returns:
        The de-duplicated suggestion list, or ``None`` when the dictionary
        cannot be loaded or when no suggestion differs from
        ``input_term`` (that is, no correction is needed).
    """
    speller = SymSpell()

    freq_path = os.path.join(os.path.dirname(__file__),
                             "frequency_dictionary_en_82_765.txt")
    # Column 0 holds the term, column 1 the term frequency.
    if not speller.load_dictionary(freq_path, 0, 1):
        print("Dictionary file not found")
        return

    lookup_distance = 2
    candidates = speller.lookup(input_term, Verbosity.CLOSEST,
                                lookup_distance)
    candidates.extend(speller.lookup_compound(input_term, lookup_distance))
    # list.sort is stable, so equal distances keep their merge order.
    candidates.sort(key=lambda item: item.distance)

    import collections
    by_term = collections.OrderedDict()
    for item in candidates:
        by_term.setdefault(item.term, item)
    suggestions = list(by_term.values())

    # Nothing to offer when every suggestion is the input itself.
    if all(item.term == input_term for item in suggestions):
        return
    return suggestions
def main():
    """Demo of single-word and compound lookup with a local dictionary.

    Each suggestion is printed as ``term, count, distance``.
    """
    # Arguments: initial capacity, max edit distance for precalculation,
    # prefix length.
    speller = SymSpell(83000, 2, 7)

    freq_path = os.path.join(
        "/Users/meheresh/Documents/cm_spellchecker/spellcheck/data",
        "freqdict.txt")
    # Column 0 holds the term, column 1 the term frequency.
    if not speller.load_dictionary(freq_path, 0, 1):
        print("Dictionary file not found")
        return

    def _show(hits):
        for hit in hits:
            print("{}, {}, {}".format(hit.term, hit.count, hit.distance))

    lookup_distance = 2

    # single-word input: "memebers" is a misspelling of "members"
    _show(speller.lookup("memebers", Verbosity.CLOSEST, lookup_distance))

    # multi-word input (supports compound splitting & merging); the edit
    # distance applies per word, not to the whole string
    sentence = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixtgrade and ins pired him")
    _show(speller.lookup_compound(sentence, lookup_distance))
def main():
    """Demo of single-word lookup and a compound lookup on a short phrase.

    Each suggestion is printed as ``term, distance, count``; the raw
    compound result list is printed as well.
    """
    speller = SymSpell(2, 7)

    # NOTE(review): the second component is an absolute path, so on POSIX
    # os.path.join discards the module directory -- confirm this is meant.
    freq_path = os.path.join(
        os.path.dirname(__file__),
        "/home/raghu/Downloads/frequency_dictionary_en_82_765.txt")
    # Column 0 holds the term, column 1 the term frequency.
    if not speller.load_dictionary(freq_path, 0, 1):
        print("Dictionary file not found")
        return

    lookup_distance = 2

    # single-word input: "memebers" is a misspelling of "members"
    word_hits = speller.lookup("memebers", Verbosity.CLOSEST,
                               lookup_distance)
    for hit in word_hits:
        print("{}, {}, {}".format(hit.term, hit.distance, hit.count))

    # multi-word input (supports compound splitting & merging)
    phrase_hits = speller.lookup_compound("AGUDATA OF BIRTH",
                                          lookup_distance)
    print(phrase_hits)
    for hit in phrase_hits:
        print("{}, {}, {}".format(hit.term, hit.distance, hit.count))
def _correct_spelling_errors(self):
    """Run a compound lookup over every tweet text and print the results.

    NOTE(review): no dictionary is loaded into the SymSpell instance in
    this method, and the suggestions are only printed -- the data frame is
    returned as it was. Confirm whether write-back was intended.

    :return: ``self._tweets_df``
    """
    sym_spell = SymSpell(
        Corpus.symspell_config["initial_capacity"],
        Corpus.symspell_config["max_edit_distance_dictionary"],
        Corpus.symspell_config["prefix_length"])
    settings = Corpus.symspell_config

    for _, row in self._tweets_df.iterrows():
        hits = sym_spell.lookup_compound(
            row.text, settings["max_edit_distance_lookup"])
        for hit in hits:
            print(" {}, {}, {}".format(hit.term, hit.count, hit.distance))

    return self._tweets_df
class SpellCorrect:
    """Thin wrapper around a SymSpell instance for sentence correction."""

    def __init__(self, max_dictionary_edit_distance=2, prefix_length=7,
                 dictionary_path=None):
        """Create the SymSpell object and load its frequency dictionary.

        Args:
            max_dictionary_edit_distance: maximum edit distance for doing
                lookups.
            prefix_length: length of word prefixes used for spell checking.
            dictionary_path: frequency dictionary file; defaults to
                ``frequency_dictionary_en_82_765.txt`` next to this module.
        """
        self.max_dictionary_edit_distance = max_dictionary_edit_distance
        self.prefix_length = prefix_length
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=self.max_dictionary_edit_distance,
            prefix_length=self.prefix_length)

        if dictionary_path is None:
            # Use the __file__ variable, not the literal string
            # '__file__': the literal has no directory part, which made
            # the default path depend on the current working directory.
            dictionary_path = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "frequency_dictionary_en_82_765.txt")

        term_index = 0  # column of the term in the dictionary text file
        count_index = 1  # column of the term frequency in the dictionary
        if not self.sym_spell.load_dictionary(dictionary_path, term_index,
                                              count_index):
            print('Dictionary file not found')

    def spelling_correct(self, input_term):
        """Return the compound-lookup correction of ``input_term``.

        The terms of all returned suggestions are concatenated without a
        separator.
        """
        # max edit distance per lookup (per single word, not per whole
        # input string)
        max_edit_distance_lookup = 2
        suggestions = self.sym_spell.lookup_compound(
            phrase=input_term, max_edit_distance=max_edit_distance_lookup)
        return "".join(suggestion.term for suggestion in suggestions)
def main():
    """Demo: compound lookup over a fixed sentence using ``alfabeto.txt``.

    The dictionary is opened by relative name, so it is resolved against
    the current working directory. Each suggestion is printed as
    ``term, count, distance``.
    """
    # Arguments: initial capacity, max edit distance for precalculation,
    # prefix length.
    speller = SymSpell(83000, 3, 7)

    # Column 0 holds the term, column 1 the term frequency.
    if not speller.load_dictionary("alfabeto.txt", 0, 1):
        print("Dictionary file not found")
        return

    sentence = "previdensia sosial é augo difisio e discitido no bra sil"
    for hit in speller.lookup_compound(sentence, 3):
        print("{}, {}, {}".format(hit.term, hit.count, hit.distance))
class SymSpell(SpellCheck):
    """Spell checker backed by a ``SymSpellPy`` model."""

    def __init__(self, dictionary_file_path='', dictionary=None, verbose=0):
        super().__init__(dictionary=dictionary, verbose=verbose)
        self.dictionary_file_path = dictionary_file_path
        # Set by load_vocab(); correction()/corrections() need it.
        self.model = None

    def load_vocab(self, corpus_file_path, max_edit_distance_dictionary=2,
                   prefix_length=5):
        """Create the model and load a ``token frequency`` file into it.

        Prints a message (and does not raise) when the file cannot be
        loaded.
        """
        self.model = SymSpellPy(
            max_dictionary_edit_distance=max_edit_distance_dictionary,
            prefix_length=prefix_length)

        term_index = 0  # column of the term in the dictionary text file
        count_index = 1  # column of the term frequency in the dictionary
        if not self.model.load_dictionary(corpus_file_path, term_index,
                                          count_index):
            print("Dictionary file not found")

    def build_vocab(self, dictionary, file_dir, file_name, verbose=0):
        """Write ``dictionary`` to disk, one ``token count`` pair per line.

        Data format: token, frequency. Example::

            edward 154
            edwards 50

        Args:
            dictionary: mapping of token to count.
            file_dir: output directory; created when missing.
            file_name: output file name.
            verbose: extra verbosity level, combined with ``self.verbose``.
        """
        if not os.path.exists(file_dir):
            os.makedirs(file_dir)

        if self.verbose > 3 or verbose > 3:
            print('Size of dictionary: %d' % len(dictionary))

        # NOTE(review): plain concatenation is kept for compatibility, so
        # file_dir must end with a path separator for the file to land
        # inside the directory.
        with open(file_dir + file_name, "w") as text_file:
            for token, count in dictionary.items():
                text_file.write(token + ' ' + str(count))
                text_file.write('\n')

    def correction(self, word, max_edit_distance_lookup=2, mode='cloest'):
        """Look up suggestions for a single word.

        Args:
            word: the word to check.
            max_edit_distance_lookup: maximum edit distance for the lookup.
            mode: ``'cloest'`` (historical spelling, still the default),
                ``'closest'``, ``'top'`` or ``'all'``.

        Returns:
            A list of dicts with keys ``word``, ``count``, ``distance``.

        Raises:
            ValueError: if ``mode`` is not one of the accepted values
                (previously this surfaced as an ``UnboundLocalError``).
        """
        verbosity_by_mode = {
            'cloest': Verbosity.CLOSEST,
            'closest': Verbosity.CLOSEST,
            'top': Verbosity.TOP,
            'all': Verbosity.ALL,
        }
        try:
            suggestion_verbosity = verbosity_by_mode[mode]
        except KeyError:
            raise ValueError(
                "mode must be one of %s, got %r"
                % (sorted(verbosity_by_mode), mode)) from None

        results = self.model.lookup(word, suggestion_verbosity,
                                    max_edit_distance_lookup)
        return [{
            'word': suggestion.term,
            'count': suggestion.count,
            'distance': suggestion.distance
        } for suggestion in results]

    def corrections(self, sentence, max_edit_distance_lookup=2):
        """Run a compound lookup over the lower-cased sentence.

        Returns:
            A list of dicts with keys ``word`` and ``distance``.
        """
        normalized_sentence = sentence.lower()
        results = self.model.lookup_compound(normalized_sentence,
                                             max_edit_distance_lookup)
        return [{
            'word': suggestion.term,
            'distance': suggestion.distance
        } for suggestion in results]
class WordSimilarity:
    """Score how similar a bag of words is to each of a set of labels."""

    def __init__(self, spell):
        """Load the SymSpell dictionaries and the spaCy model.

        NOTE(review): when either dictionary fails to load the constructor
        returns early, before ``self.nlp`` and ``self.spell`` are set.

        Args:
            spell: object exposing ``correction(word)``; used by
                ``removeNoise``.
        """
        # Arguments: max edit distance for precalculation, prefix length.
        self.sym_spell = SymSpell(2, 7)

        unigram_file = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        bigram_file = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

        if not self.sym_spell.load_dictionary(
                unigram_file, term_index=0, count_index=1):
            print("Dictionary file not found")
            return
        if not self.sym_spell.load_bigram_dictionary(
                bigram_file, term_index=0, count_index=2):
            print("Bigram dictionary file not found")
            return

        self.nlp = spacy.load(
            "shop_recognizer/semantic_detector/models/en_core_web_lg")
        self.spell = spell

    def _as_probabilities(self, labels, words, scores):
        """Replace the raw scores with softmax values (0 if no words)."""
        prob = self.softmax(labels, scores)
        for position, cls in enumerate(labels):
            scores[cls] = float(prob[position]) if len(words) else 0
        return scores

    def checkSemanticSimilarity(self, labels, words):
        """Compare each label with all cleaned words joined into one text.

        Returns:
            dict mapping each label to its softmax-normalised score, or to
            0 when ``words`` is empty.
        """
        texts = self.removeNoise2(words)
        # Every cleaned word is followed by a single space.
        joined = "".join(text + " " for text in texts)

        scores = {}
        for label in labels:
            label_doc = self.nlp(label)
            text_doc = self.nlp(joined)
            scores[label] = int(text_doc.similarity(label_doc) * 100)
        return self._as_probabilities(labels, words, scores)

    def checkSemanticSimilarity2(self, labels, words):
        """Compare each label with every cleaned word; keep the best match.

        Returns:
            dict mapping each label to its softmax-normalised score, or to
            0 when ``words`` is empty.
        """
        texts = self.removeNoise2(words)

        scores = {}
        for label in labels:
            label_doc = self.nlp(label)
            # The leading 0 is the floor: only strictly greater
            # similarities replace it.
            best = max(
                [0] + [self.nlp(text).similarity(label_doc)
                       for text in texts])
            scores[label] = int(best * 100)
        return self._as_probabilities(labels, words, scores)

    def removeNoise(self, words):
        """Drop short, numeric and stop words; spell-correct the rest.

        A corrected word is kept only when the spaCy vocabulary has a
        vector for it.
        """
        kept = []
        for word in words:
            if len(word) <= 2 or word.isdigit() is not False:
                continue
            if word in self.nlp.Defaults.stop_words:
                continue
            fixed = self.spell.correction(word)
            if self.nlp.vocab.has_vector(fixed):
                kept.append(fixed)
        return kept

    def removeNoise2(self, words):
        """Drop short and numeric words; spell-correct and de-space the rest."""
        return [
            self.correct(word).replace(" ", "")
            for word in words
            if len(word) > 2 and (word.isdigit() is False)
        ]

    def correct(self, word):
        """Return the first compound-lookup suggestion for ``word``."""
        return self.sym_spell.lookup_compound(word, 2)[0].term

    def softmax(self, classes, scores):
        """Return the softmax of ``scores`` taken in ``classes`` order."""
        values = [scores[cls] for cls in classes]
        exponentials = np.exp(values)
        return exponentials / np.sum(exponentials)
class Cleaner:
    """Cleaner object for the first type of documents."""

    def __init__(self, directory, lexique_path, dict_path):
        """Load the word lists and the spell-check dictionary.

        Args:
            directory: directory where the CSV will be stored.
            lexique_path: path of the lexique.
            dict_path: path of the dictionary.
        """
        self.directory = directory
        self.dict_path = dict_path
        self.stopwords = list(nltk.corpus.stopwords.words('french'))
        self.lexique_path = lexique_path
        # Known words: tokens found here are never spell-corrected.
        self.words = load_dictionnary(
            self.dict_path) + self.stopwords + load_lexique(self.lexique_path)
        # Cache of corrections already computed: token -> corrected token.
        self.corrected = {}
        self.max_edit_distance_dictionary = 2
        self.prefix_length = 7
        self.sym_spell = SymSpell(self.max_edit_distance_dictionary,
                                  self.prefix_length)
        self.dictionary_path = "../ressources/fr-100k.txt"
        self.sym_spell.load_dictionary(self.dictionary_path,
                                       term_index=0,
                                       count_index=1)

    def extract(self, file):
        """Extract the judgements from an HTML document.

        Args:
            file: the HTML to parse.

        Returns:
            DataFrame with columns ``page``, ``arrêt``, ``date`` and
            ``juridiction``, one row per judgement found.
        """
        soup = BeautifulSoup(file, "html.parser")
        columns = ["page", "arrêt", "date", "juridiction"]
        # Rows are collected in a list and turned into a frame once at the
        # end: DataFrame.append no longer exists in pandas >= 2.0.
        rows = []

        in_decision = False
        notes = False
        page = 0
        new_page = True
        new_decision = False
        count = 1

        for tag in soup.body:
            if count == 15:  # limit length of judgement to 15 paragraphs
                count = 0
                in_decision = False
                new_decision = False
            string = tag.get_text()
            if tag.name == "hr":  # hr means new page
                page += 1
                notes = False
                new_page = True
            if tag.name == "p" and string is not None and not new_page:
                # pattern: start of judgement
                m1 = re.match(
                    r"(^.*?(La Cour,|L(A|À|a) COUR)(?! DE)(.+)$|^J\s?U\s?G\s?E\s?M\s?E\s?N\s?T\.?\s?$|^A\s?R\s?R\s?(Ê|E)\s?T\s?\.?\s?$)",
                    string)
                # pattern: end of judgement
                m2 = re.match(r"(.*?)D(u|û|ù)(.+?)(—|–|-|–|–)(.+)", string)
                if not in_decision and m1:  # new decision
                    in_decision = True
                    text = ""
                    count = 1
                    if m1.groups()[3] is not None:
                        # keep the text that follows "la cour"
                        text = str(m1.groups()[3])
                    first_page = page
                    new_decision = True
                if in_decision and m2:  # end of judgement
                    if count < 15:
                        if new_decision:
                            text = m2.groups()[0]
                        else:
                            text += m2.groups()[0]
                        date = m2.groups()[2]
                        juridiction = m2.groups()[4]
                        rows.append({
                            'page': first_page,
                            'arrêt': text,
                            "date": date,
                            "juridiction": juridiction
                        })
                        in_decision = False
                        text = ''
                elif not notes and in_decision and not new_decision:
                    # A paragraph like "(1) ..." starts the footnotes.
                    if not re.match(r"^\(\d*\).+$", string):
                        count += 1
                        text += string + "\n"
                    else:
                        notes = True
            else:
                new_page = False

        return pd.DataFrame(rows, columns=columns)

    def save(self, df, ark, year):
        """Write ``df`` to ``<directory>/<year>/<ark>.csv`` (``;``-separated)."""
        df.to_csv(f"{self.directory}/{year}/{ark}.csv",
                  encoding="utf-8",
                  sep=";")

    def postProcess(self, df, ark, year, recceuil):
        """Clean the extracted rows and add link and id columns.

        Rows whose ``date`` contains a dash are split into date and
        juridiction (this step writes into the ``df`` passed in). Rows are
        then dropped when the date still contains a dash, is 25 characters
        or longer, or contains no digit, and when the decision text is 100
        characters or shorter. The juridiction is cut at its first
        separator.

        Returns:
            The filtered frame, indexed by the new ``id`` column.
        """
        # fix mixed date-juridiction values
        has_dash = df['date'].str.contains(r"(—|–|-)")
        for i, row in df[has_dash].iterrows():
            m = re.search(r"(.+?)(—|–|-|—)(.+)(—|–|-|—)?.*", row["date"])
            if m:
                df.at[i, "date"] = m.groups()[0]
                df.at[i, "juridiction"] = m.groups()[2]

        # if still not fixed --> drop them
        has_dash = df['date'].str.contains(r"(—|–|-)")
        df = df[has_dash == False]  # noqa: E712 (element-wise comparison)

        # drop dates that are too long
        df = df[df["date"].str.len() < 25]

        # drop dates with no number
        no_digit = df["date"].str.contains(r"^\D*$")
        df = df[no_digit == False]  # noqa: E712

        # drop decisions that are too short; copy so that the writes below
        # go to an independent frame instead of a slice of the input
        df = df[df["arrêt"].str.len() > 100].copy()

        # keep only the part of the juridiction before its first separator
        for i, row in df.iterrows():
            m = re.search(r"(.+?)(—|–|-|—|,|;).*", row["juridiction"])
            if m:
                df.at[i, "juridiction"] = m.groups()[0]

        # add link
        df["lien"] = "https://gallica.bnf.fr/ark:/12148/" + ark + "/f"
        df["lien"] = df["lien"] + df.page.map(str) + ".image"
        df["id"] = "" + str(year) + str(recceuil) + df.index.map(str)
        df.index = df.id
        return df

    def spell_check(self, df):
        """Apply ``correct`` to every value of the ``arrêt`` column."""
        df["arrêt"] = df["arrêt"].apply(self.correct)
        return df

    def correct(self, text):
        """Spell-check ``text`` token by token.

        A token is corrected only when it is alphabetic, is not a known
        word and does not start with an upper-case letter. Corrections are
        cached in ``self.corrected``.

        Returns:
            The tokens joined with single spaces (separators are not
            restored).
        """
        ntokens = []
        tokens = re.split(r'\s|,|\.|;|—|–|-|–|–|\n|:|\!|\?', text)
        for t in tokens:
            lowered = str(t).lower()
            if (lowered.isalpha() and lowered not in self.words
                    and not str(t)[0].isupper()):
                if str(t) in self.corrected:
                    nt = self.corrected[t]
                else:
                    nt = t
                    suggestion = self.sym_spell.lookup_compound(t, 2)
                    if len(suggestion) > 0:
                        nt = suggestion[0].term
                    self.corrected[t] = nt
                ntokens.append(nt)
            else:
                ntokens.append(t)
        return " ".join(ntokens)
term_index = 0 # column of the term in the dictionary text file count_index = 1 # column of the term frequency in the dictionary text file if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): #loading the dictionary print("Dictionary file not found") placeHolderList = [] #lookup suggestions for multi-word input strings #Conversion is needed. The sym spell lookup-compound takes string input_corpus = (str(Corpus)) # max edit distance per lookup (per single word, not per whole input string) max_edit_distance_lookup = 2 suggestions = sym_spell.lookup_compound(input_corpus, max_edit_distance_lookup) #Putting everything back in a list format. 'PlaceHolderList' is temporary for suggestion in suggestions: placeHolderList.append(suggestion.term) stopWords = set( stopwords.words('english')) #getting stop wards to clean up the corpus #Tokenize made everything separate in a list. Using the 'join' function will makes it a string again. words = word_tokenize(''.join(placeHolderList)) print(words) #Checking the full list cleanedCorpus = [] #This loop takes out all the stopwords. I didn't add any additional stopwords. for w in words:
class SpellCheck:
    """Spelling dictionary and corrector backed by a ``SymSpell`` instance.

    Words are tallied with :meth:`insert`, turned into a pickled SymSpell
    dictionary by :meth:`write`, loaded back by :meth:`read`, and queried
    through :meth:`lookup`, :meth:`lookup_compound` and :meth:`fix_spelling`.
    """

    def __init__(self, progress, directory, countries_dict):
        """Store the collaborators and create an empty SymSpell engine.

        Args:
            progress: stored on the instance; not used by this class itself.
            directory: folder that holds ``spelling.pkl`` / ``spelling.tmp``.
            countries_dict: stored on the instance; not used by this class
                itself.
        """
        self.progress = progress
        self.logger = logging.getLogger(__name__)
        # word -> number of times it was seen by insert()
        self.spelling_update = Counter()
        self.directory = directory
        self.spell_path = os.path.join(self.directory, 'spelling.pkl')
        self.countries_dict = countries_dict
        self.sym_spell = SymSpell()

    def insert(self, name, iso):
        """Tally every word of ``name`` that is longer than two characters.

        Names containing 'gothland cemetery' and names listed in the
        module-level ``noise_words`` are ignored.  ``iso`` is accepted for
        interface compatibility but is not used.
        """
        if 'gothland cemetery' in name or name in noise_words:
            return
        for word in name.split(' '):
            if len(word) > 2:
                self.spelling_update[word] += 1

    def write(self):
        """Build the SymSpell dictionary from the tallied words and pickle it."""
        # Create blank spelling dictionary
        path = os.path.join(self.directory, 'spelling.tmp')
        # Context manager guarantees the handle is closed even if the write fails.
        with open(path, 'w') as fl:
            fl.write('the,1\n')
        success = self.sym_spell.create_dictionary(corpus=path)
        if not success:
            self.logger.error("error creating spelling dictionary")

        self.logger.info('Building Spelling Dictionary')

        # Add every tallied word to the spelling dictionary with its count
        for key, count in self.spelling_update.items():
            self.sym_spell.create_dictionary_entry(key=key, count=count)

        self.logger.info('Writing Spelling Dictionary')
        self.sym_spell.save_pickle(self.spell_path)

    def read(self):
        """Load the pickled spelling dictionary from ``self.spell_path``.

        Failures are logged rather than raised.  On success the entry
        'gothland' is removed from the dictionary.
        """
        success = False
        if os.path.exists(self.spell_path):
            self.logger.info(
                f'Loading Spelling Dictionary from {self.spell_path}')
            # NOTE(security): this unpickles the file; only load dictionaries
            # written by a trusted process.
            success = self.sym_spell.load_pickle(self.spell_path)
        else:
            self.logger.error(
                f"spelling dictionary not found: {self.spell_path}")

        if not success:
            self.logger.error(
                f"error loading spelling dictionary from {self.spell_path}")
        else:
            self.sym_spell.delete_dictionary_entry(key='gothland')

        size = len(self.sym_spell.words)
        self.logger.info(f"Spelling Dictionary contains {size} words")

    def lookup(self, input_term):
        """Return spelling suggestions for a single term.

        Terms containing '*' and terms of length <= 1 are returned unchanged.
        Otherwise at most the first four CLOSEST suggestions are examined and
        those starting with the same letter as ``input_term`` are returned,
        each followed by a single space (so the result may be ``''`` when no
        suggestion shares the first letter).
        """
        if '*' in input_term:
            return input_term
        if len(input_term) <= 1:
            return input_term

        suggestions = self.sym_spell.lookup(input_term,
                                            Verbosity.CLOSEST,
                                            max_edit_distance=2,
                                            include_unknown=True)
        accepted = []
        for idx, item in enumerate(suggestions):
            if idx > 3:
                break
            # Only accept results where first letter matches
            if item.term[0] == input_term[0]:
                accepted.append(item.term)
        return ''.join(f'{term} ' for term in accepted)

    def lookup_compound(self, phrase):
        """Return the best compound correction for ``phrase``.

        Every suggestion is logged at debug level.  If the engine returns no
        suggestion at all, ``phrase`` is returned unchanged instead of
        raising ``IndexError``.
        """
        suggestions = self.sym_spell.lookup_compound(phrase=phrase,
                                                     max_edit_distance=2,
                                                     ignore_non_words=False)
        for item in suggestions:
            self.logger.debug(f'{item.term}')
        if not suggestions:
            return phrase
        return suggestions[0].term

    def fix_spelling(self, text):
        """Spell-check ``text`` and return it stripped of surrounding spaces.

        Text that contains a digit, or the substring 'st ', is returned
        without spell checking (only stripped).
        """
        new_text = text
        has_digits = re.search(r'\d', text) is not None
        # 'st ' is skipped because the spell check does not handle "St" properly
        if not has_digits and 'st ' not in text and len(text) > 0:
            new_text = self.lookup(text)
            self.logger.debug(f'Spell {text} -> {new_text}')
        return new_text.strip(' ')
misspelled=spell.unknown(injury_tokens) max_edit_distance_dictionary = 2 prefix_length = 7 sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt") bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt") sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2) max_edit_distance_lookup = 2 suggestion_verbosity = Verbosity.CLOSEST for word in misspelled: spell_corrected_obj=sym_spell.lookup_compound(word, max_edit_distance_lookup) if len(spell_corrected_obj) > 0: spell_correction= spell_corrected_obj[0].term injury_text=regex.sub(pattern=" "+word+" ", repl=" "+spell_correction+" ", string=injury_text) injury_tokens=tokenizer.tokenize(injury_text) misspelled=spell.unknown(injury_tokens) for word in misspelled: spell_correction = spell.correction(word) injury_text=regex.sub(pattern=" "+word+" ", repl=" "+spell_correction+" ", string=injury_text) injury_tokens=tokenizer.tokenize(injury_text) misspelled=spell.unknown(injury_tokens) for word in misspelled:
def main(img_file='/Users/emily/Desktop/basic_word2.png',
         gray_file="/Users/emily/Desktop/gray_image.png",
         dictionary_path="/Users/emily/Desktop/frequency_dictionary_en_82_765.txt"):
    """OCR an image and print SymSpell compound-correction suggestions.

    The image is read as grayscale, enlarged 3x, written to ``gray_file``,
    run through Tesseract, and the recognised text is passed to
    ``SymSpell.lookup_compound``.  Each suggestion is printed as
    ``term, distance, count``.

    Args:
        img_file: path of the image to read.
        gray_file: path the enlarged grayscale image is written to.
        dictionary_path: path of the SymSpell frequency dictionary.

    Returns:
        None.  Returns early (after printing a message) when the image or the
        dictionary cannot be loaded.
    """
    # Read the image. The flag 0 loads it as grayscale.
    img = cv2.imread(img_file, 0)
    if img is None:
        print("Could not read:", img_file)
        # Nothing to process; the OpenCV calls below would fail on None.
        return

    # NOTE(review): earlier revisions also called cv2.threshold and
    # cv2.bilateralFilter here, but their results never reached the saved
    # image, so they had no effect on the OCR input and were removed.

    # Enlarge the grayscale image; this makes the text detection much better.
    gray = cv2.resize(img, None, fx=3, fy=3, interpolation=cv2.INTER_CUBIC)

    # Save the final grayscale image so it can be handed to Tesseract.
    cv2.imwrite(gray_file, gray)

    # Extract the text from the saved image; close the file handle afterwards.
    with Image.open(gray_file) as gray_image:
        text = pytesseract.image_to_string(gray_image, lang='eng')

    # maximum edit distance per dictionary precalculation
    # NOTE(review): lookups below only use distance 2; a value of 5 presumably
    # just costs extra precalculation - confirm before lowering.
    max_edit_distance_dictionary = 5
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(text, max_edit_distance_lookup)

    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print(f"{suggestion.term}, {suggestion.distance}, {suggestion.count}")