def test_clear_caches_persistance(self):
    # clear_cache() must also wipe entries that were persisted via save_cache(),
    # so a fresh instance pointed at the same disk cache directory starts cold.
    temp_dir = tempfile.mkdtemp()
    try:
        h1 = Hunspell('test', hunspell_data_dir=DICT_DIR, disk_cache_dir=temp_dir, cache_manager='disk_hun')
        test_suggest = h1.suggest('testing')
        test_stem = h1.stem('testing')
        # Plant fabricated entries to prove lookups are served from the caches.
        h1._suggest_cache['made-up'] = test_suggest
        self.assertEqual(h1.suggest('made-up'), test_suggest)
        h1._stem_cache['made-up'] = test_stem
        self.assertEqual(h1.stem('made-up'), test_stem)
        h1.save_cache()
        h1.clear_cache()
        del h1
        # Deregister every cache so the next instance cannot reuse in-memory state.
        cacheman = get_cache_manager('disk_hun')
        cacheman.deregister_all_caches()
        self.assertEqual(len(cacheman.cache_by_name), 0)
        # A fresh instance must not see the cleared, fabricated entries.
        h2 = Hunspell('test', hunspell_data_dir=DICT_DIR, disk_cache_dir=temp_dir, cache_manager='disk_hun')
        self.assertEqual(len(h2._suggest_cache), 0)
        self.assertEqual(len(h2._stem_cache), 0)
        self.assertNotEqual(h2.suggest('made-up'), test_suggest)
        self.assertNotEqual(h2.stem('made-up'), test_stem)
    finally:
        shutil.rmtree(temp_dir)  # Nuke temp content
def spell_corrector(df, lang1, lang2):
    """Spell-correct the sentences in the 'L1' column of *df* using Hunspell.

    Args:
        df: DataFrame with an 'L1' column of sentences (and 'L2' when lang2 is set).
        lang1: Source-language tag.  NOTE(review): currently unused — confirm
            whether it should select the Hunspell dictionary.
        lang2: When not None, the 'L2' column is copied through unchanged.

    Returns:
        A pandas DataFrame with the corrected 'L1' sentences (and 'L2' if lang2
        is not None).
    """
    # Create an object of the Hunspell class
    h = Hunspell()
    print('I am spell_checker')
    # Holds the corrected sentences which would later be made into a dataframe
    corr_sent_list = {'L1': [], 'L2': []}
    for sent in df['L1']:
        # Accumulates the corrected pieces of the current sentence
        corr_sent = ''
        # Split by word boundary so punctuation/whitespace pieces are preserved
        for w in re.split(r'\b', sent):
            # Non-word pieces and correctly spelled words pass through unchanged
            if not w.isalpha() or h.spell(w):
                corr_sent += w
            else:
                suggest = h.suggest(w)
                if suggest:
                    # TODO: Parse the list and find the n-gram probability to find
                    # the best candidate. For now it just appends the first word.
                    corr_sent += suggest[0]
                else:
                    # Bug fix: Hunspell can return an empty suggestion list;
                    # keep the original word instead of crashing on suggest[0].
                    corr_sent += w
        corr_sent_list['L1'].append(corr_sent)
    # Convert the corrected sentences into a pandas dataframe to return
    if lang2 is not None:
        corr_sent_list['L2'].extend(list(df['L2']))
        return pd.DataFrame.from_dict(corr_sent_list)
    else:
        return pd.DataFrame(corr_sent_list['L1'], columns=['L1'])
class HunspellChecker(object):
    """Corrects single words and whitespace-tokenized strings via Hunspell,
    leaving stopwords and punctuation untouched."""

    def __init__(self):
        self.checker = Hunspell()
        self.stopwords = set(SW.words("english")) | set(string.punctuation)

    def correct_word(self, word):
        """Borrowed from: https://datascience.blog.wzb.eu/2016/07/13/autocorrecting-misspelled-words-in-python-using-hunspell/
        """
        # Correctly spelled words pass through unchanged.
        if self.checker.spell(word):
            return word
        suggestions = self.checker.suggest(word)
        # Take the top suggestion when one exists, otherwise keep the word.
        return suggestions[0] if suggestions else word

    def correct_string(self, text, ensure_length=False):
        """Break into words and correct each word."""
        corrected = []
        for token in text.split():
            if token in self.stopwords:
                corrected.append(token)
                continue
            fixed = self.correct_word(token)
            # ensure_length keeps one output token per input token even if a
            # suggestion contains a space.
            corrected.append(fixed.split()[0] if ensure_length else fixed)
        return " ".join(corrected)
def test_non_overlapping_caches(self):
    # Caches are per-instance: entries planted in one Hunspell instance's
    # caches must not leak into a separately constructed instance.
    test_suggest = self.h.suggest('testing')
    test_stem = self.h.stem('testing')
    self.h._suggest_cache['made-up'] = test_suggest
    self.assertEqual(self.h.suggest('made-up'), test_suggest)
    self.h._stem_cache['made-up'] = test_stem
    self.assertEqual(self.h.stem('made-up'), test_stem)
    h2 = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    self.assertNotEqual(h2.suggest('made-up'), test_suggest)
    self.assertNotEqual(h2.stem('made-up'), test_stem)
class HunspellChecker(object):
    """Minimal spelling corrector backed by the default Hunspell dictionary."""

    def __init__(self):
        self.checker = Hunspell()

    def correct(self, word):
        """Return *word* if spelled correctly, otherwise the top Hunspell
        suggestion; falls back to the original word when there are no
        suggestions."""
        # Idiomatic truth test instead of comparing `== True`.
        if self.checker.spell(word):
            return word
        res = self.checker.suggest(word)
        if res:
            return res[0]
        return word
def test_clear_caches_persistance(hunspell):
    # clear_cache() must also purge entries persisted via save_cache(), so a
    # fresh instance pointed at the same disk cache directory starts empty.
    temp_dir = tempfile.mkdtemp()
    try:
        h1 = Hunspell('test', hunspell_data_dir=DICT_DIR, disk_cache_dir=temp_dir, cache_manager='disk_hun')
        test_suggest = h1.suggest('testing')
        test_suffix = h1.suffix_suggest('testing')
        test_stem = h1.stem('testing')
        # Plant fabricated entries to prove lookups are served from the caches.
        h1._suggest_cache['made-up'] = test_suggest
        assert h1.suggest('made-up') == test_suggest
        h1._suffix_cache['made-up'] = test_suffix
        assert h1.suffix_suggest('made-up') == test_suffix
        h1._stem_cache['made-up'] = test_stem
        assert h1.stem('made-up') == test_stem
        h1.save_cache()
        h1.clear_cache()
        del h1
        # Deregister all caches so the next instance cannot reuse in-memory state.
        cacheman = get_cache_manager('disk_hun')
        cacheman.deregister_all_caches()
        assert len(cacheman.cache_by_name) == 0
        h2 = Hunspell('test', hunspell_data_dir=DICT_DIR, disk_cache_dir=temp_dir, cache_manager='disk_hun')
        assert len(h2._suggest_cache) == 0
        assert len(h2._stem_cache) == 0
        assert h2.suggest('made-up') != test_suggest
        assert h2.suffix_suggest('made-up') != test_suffix
        assert h2.stem('made-up') != test_stem
    finally:
        shutil.rmtree(temp_dir)  # Nuke temp content
def test_non_overlapping_caches(hunspell):
    """Entries planted in one instance's caches must not leak into another
    separately constructed instance."""
    test_suggest = hunspell.suggest('testing')
    test_suffix = hunspell.suffix_suggest('testing')
    test_stem = hunspell.stem('testing')
    hunspell._suggest_cache['made-up'] = test_suggest
    assert hunspell.suggest('made-up') == test_suggest
    hunspell._suffix_cache['made-up'] = test_suffix
    assert hunspell.suffix_suggest('made-up') == test_suffix
    hunspell._stem_cache['made-up'] = test_stem
    assert hunspell.stem('made-up') == test_stem
    h2 = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    assert h2.suggest('made-up') != test_suggest
    # Previously missing: the suffix cache was planted above but never checked
    # for non-overlap (the sibling cache tests check all three caches).
    assert h2.suffix_suggest('made-up') != test_suffix
    assert h2.stem('made-up') != test_stem
def test_clear_caches_non_peristance(hunspell):
    # Without save_cache(), clear_cache() alone must leave nothing behind for a
    # freshly constructed instance to pick up.
    test_suggest = hunspell.suggest('testing')
    test_suffix = hunspell.suffix_suggest('testing')
    test_stem = hunspell.stem('testing')
    # Plant fabricated entries to prove lookups are served from the caches.
    hunspell._suggest_cache['made-up'] = test_suggest
    assert hunspell.suggest('made-up') == test_suggest
    hunspell._suffix_cache['made-up'] = test_suffix
    assert hunspell.suffix_suggest('made-up') == test_suffix
    hunspell._stem_cache['made-up'] = test_stem
    assert hunspell.stem('made-up') == test_stem
    hunspell.clear_cache()
    del hunspell
    hunspell = Hunspell('test', hunspell_data_dir=DICT_DIR)
    assert hunspell.suggest('made-up') != test_suggest
    assert hunspell.suffix_suggest('made-up') != test_suffix
    assert hunspell.stem('made-up') != test_stem
def pluralize_advanced(
        singular: str,
        speller: Hunspell = None,
        ending_overrides: NounEndingMap = None) -> AdvancedPluralizationResult:
    """Pluralize *singular* and validate/repair the result with Hunspell.

    Falls back, in order, to suggestion search, dictionary ending search, and
    dictionary-plus-s search when the raw pluralization is not a correctly
    spelled word.  Presumably a Dutch speller (ensure_hunspell_nl) — confirm.
    """
    if not speller:
        speller = ensure_hunspell_nl()
    plural = __pluralize(singular, ending_overrides)
    # empty plural - just stop
    if not plural:
        return AdvancedPluralizationResult(plural, None, (), None, None, False)
    # right spelled plural
    if speller.spell(plural):
        return AdvancedPluralizationResult(plural, plural, (), None, None, True)
    # if no rightly spelled word can be found, use suggestions,
    # replacement of the endings and the Hunspell dictionary if
    # we can find something that is spelled correctly.
    # NOTE: `or` short-circuits, so the three searches run in priority order
    # and the first truthy SearchResult wins.
    suggestions = speller.suggest(plural)
    search_result: SearchResult = \
        search_by_suggestions(plural, suggestions) or \
        search_by_dictionary(speller, plural) or \
        search_by_dictionary_plus_s(speller, singular)
    if search_result:
        return AdvancedPluralizationResult(
            plural, search_result.plural, suggestions,
            search_result.switched_ending_from,
            search_result.switched_ending_to, True)
    return AdvancedPluralizationResult(plural, None, (), None, None, False)
class Stem:
    """
    The Stem module deals with various tasks, mainly through the following functions:
        - `check_spelling`: spell error detection
        - `correct_spelling`: spell error correction
        - `analyze`: morphological analysis

    Please note that only Sorani is supported in this version in this module.
    The module is based on the [Kurdish Hunspell project](https://github.com/sinaahmadi/KurdishHunspell).

    Example:
    ```python
    >>> from klpt.stem import Stem
    >>> stemmer = Stem("Sorani", "Arabic")
    >>> stemmer.check_spelling("سوتاندبووت")
    False
    >>> stemmer.correct_spelling("سوتاندبووت")
    (False, ['ستاندبووت', 'سووتاندبووت', 'سووڕاندبووت', 'ڕووتاندبووت', 'فەوتاندبووت', 'بووژاندبووت'])
    >>> stemmer.analyze("دیتبامن")
    [{'pos': 'verb', 'description': 'past_stem_transitive_active', 'base': 'دیت', 'terminal_suffix': 'بامن'}]
    ```
    """

    def __init__(self, dialect, script):
        self.dialect = dialect
        self.script = script
        # Maps Hunspell morphological flag names to the keys used in this
        # module's analysis dictionaries.
        self.hunspell_flags = {"po": "pos", "is": "description", "ts": "terminal_suffix", "ds": "formation"}
        if self.dialect == "Sorani" and self.script == "Arabic":
            self.huns = Hunspell("ckb-Arab", hunspell_data_dir=klpt.get_data("data/"))
        else:
            # Kurmanji/Latin is handled without Hunspell (see analyze());
            # anything else is rejected outright.
            if not (self.dialect == "Kurmanji" and self.script == "Latin"):
                raise Exception("Sorry, only Sorani dialect in the Arabic script is supported now. Stay tuned for other dialects and scripts!")

    # def stem(self, word):
    #     """A function for stemming a single word"""
    #     pass

    # def lemmatize(self, word):
    #     """A function for lemmatization of a single word"""
    #     pass

    def check_spelling(self, word):
        """Check spelling of a word

        Args:
            word (str): input word to be spell-checked

        Raises:
            TypeError: only string as input

        Returns:
            bool: True if the spelling is correct, False if the spelling is incorrect
        """
        if not isinstance(word, str) or not (self.dialect == "Sorani" and self.script == "Arabic"):
            raise TypeError("Not supported yet.")
        else:
            return self.huns.spell(word)

    def correct_spelling(self, word):
        """
        Correct spelling errors if the input word is incorrect.

        It returns a tuple where the first element indicates the correctness of
        the word (True if correct, False if incorrect).  A correct word yields
        (True, []); an incorrect word yields (False, [suggestions]) where the
        suggestion list may be empty when Hunspell has nothing to offer.

        Args:
            word (str): input word to be spell-checked

        Raises:
            TypeError: only string as input

        Returns:
            tuple (boolean, list)
        """
        if not isinstance(word, str) or not (self.dialect == "Sorani" and self.script == "Arabic"):
            raise TypeError("Not supported yet.")
        else:
            if self.check_spelling(word):
                return (True, [])
            return (False, list(self.huns.suggest(word)))

    def analyze(self, word_form):
        """
        Morphological analysis of a given word.

        The morphological analysis is returned as a dictionary as follows:

        - "pos": part-of-speech per [the Universal Dependency tag set](https://universaldependencies.org/u/pos/index.html).
        - "description": is flag
        - "terminal_suffix": anything except ts flag
        - "formation": set to "derivational" when the ds flag is present
          (its value then goes into "description"), otherwise inflectional.
        - "base": the ts flag.  Per [the Hunspell documentation](http://manpages.ubuntu.com/manpages/trusty/en/man4/hunspell.4.html),
          terminal suffix fields are inflectional suffix fields "removed" by
          additional (not terminal) suffixes — i.e. whatever is left after
          stripping all affixes, hence the morphological base.

        If the input cannot be analyzed morphologically, an empty list is returned.

        Sorani: see [KurdishHunspell](https://github.com/sinaahmadi/KurdishHunspell).
        Kurmanji: uses the [Apertium Kurmanji analyzer](https://github.com/apertium/apertium-kmr);
        note that `base` there refers to the lemma, while in Sorani (Hunspell)
        it refers to the morphological base.

        Args:
            word_form (str): a single word-form

        Raises:
            TypeError: only string as input

        Returns:
            (list(dict)): all possible morphological analyses according to the
            defined morphological rules
        """
        if not isinstance(word_form, str):
            raise TypeError("Only a word (str) is allowed.")
        else:
            word_analysis = list()
            if self.dialect == "Sorani" and self.script == "Arabic":
                # Given the morphological analysis of a word-form with Hunspell
                # flags, extract relevant information and return a dictionary.
                for analysis in list(self.huns.analyze(word_form)):
                    analysis_dict = dict()
                    for item in analysis.split():
                        if ":" not in item:
                            continue
                        if item.split(":")[1] == "ts":
                            # ts flag exceptionally appears after the value as
                            # value:key in the Hunspell output.
                            # Anything except the terminal_suffix is considered
                            # to be the base.
                            analysis_dict["base"] = item.split(":")[0]
                            analysis_dict[self.hunspell_flags[item.split(":")[1]]] = word_form.replace(item.split(":")[0], "")
                        elif item.split(":")[0] in self.hunspell_flags.keys():
                            # Assign key:value pairs from the Hunspell output to
                            # the dictionary; for the ds flag record derivational
                            # formation, otherwise copy the value through.
                            if item.split(":")[0] == "ds":
                                analysis_dict[self.hunspell_flags[item.split(":")[0]]] = "derivational"
                                analysis_dict[self.hunspell_flags["is"]] = item.split(":")[1]
                            else:
                                analysis_dict[self.hunspell_flags[item.split(":")[0]]] = item.split(":")[1]
                    # If there is no value assigned to the ts flag, the terminal
                    # suffix is a zero-morpheme 0.
                    if self.hunspell_flags["ts"] not in analysis_dict or analysis_dict[self.hunspell_flags["ts"]] == "":
                        analysis_dict[self.hunspell_flags["ts"]] = "0"
                    word_analysis.append(analysis_dict)
            elif self.dialect == "Kurmanji" and self.script == "Latin":
                att_analysis = Analysis("Kurmanji", "Latin").analyze(word_form)
                # Check if the word-form is analyzed or not.
                if not len(att_analysis[1]):
                    # The word-form could not be analyzed.
                    return []
                for form_analysis in list(att_analysis[-1]):
                    for analysis in form_analysis:
                        analysis_dict = dict()
                        structure = analysis[0].rsplit('@', 1)[1].split("<", 1)
                        analysis_dict["base"], analysis_dict["description"] = structure[0], structure[1].replace("><", "_").replace(">", "").strip()
                        analysis_dict["pos"] = ""
                        analysis_dict["terminal_suffix"] = ""
                        analysis_dict["formation"] = ""
                        # TODO: the description needs further information extraction
                        # in such a way that some values should be assigned to the
                        # "pos" key.
                        # analysis_dict["terminal_suffix"] = word_form.replace(analysis_dict["base"], "")
                        word_analysis.append(analysis_dict)
            return word_analysis
class Application:
    """Tkinter GUI that reads webcam frames, classifies hand signs with a stack
    of Keras models, and assembles recognized characters into words/sentences,
    offering Hunspell suggestions for the current word.

    NOTE(review): model paths use backslashes ("Models\\...") — Windows-specific;
    confirm before running elsewhere.
    """

    def __init__(self):
        self.hs = Hunspell('en_US')
        self.vs = cv2.VideoCapture(0)  # default webcam
        self.current_image = None
        self.current_image2 = None
        # Main classifier (blank + A-Z).
        self.json_file = open("Models\model_new.json", "r")
        self.model_json = self.json_file.read()
        self.json_file.close()
        self.loaded_model = model_from_json(self.model_json)
        self.loaded_model.load_weights("Models\model_new.h5")
        # Disambiguation model for the visually similar D/R/U signs.
        self.json_file_dru = open("Models\model-bw_dru.json", "r")
        self.model_json_dru = self.json_file_dru.read()
        self.json_file_dru.close()
        self.loaded_model_dru = model_from_json(self.model_json_dru)
        self.loaded_model_dru.load_weights("Models\model-bw_dru.h5")
        # Disambiguation model for T/K/D/I.
        self.json_file_tkdi = open("Models\model-bw_tkdi.json", "r")
        self.model_json_tkdi = self.json_file_tkdi.read()
        self.json_file_tkdi.close()
        self.loaded_model_tkdi = model_from_json(self.model_json_tkdi)
        self.loaded_model_tkdi.load_weights("Models\model-bw_tkdi.h5")
        # Disambiguation model for S/M/N.
        self.json_file_smn = open("Models\model-bw_smn.json", "r")
        self.model_json_smn = self.json_file_smn.read()
        self.json_file_smn.close()
        self.loaded_model_smn = model_from_json(self.model_json_smn)
        self.loaded_model_smn.load_weights("Models\model-bw_smn.h5")
        # Per-symbol frame counters used to debounce predictions.
        self.ct = {}
        self.ct['blank'] = 0
        self.blank_flag = 0
        for i in ascii_uppercase:
            self.ct[i] = 0
        print("Loaded model from disk")
        # --- GUI layout ---
        self.root = tk.Tk()
        self.root.title("Sign Language To Text Conversion")
        self.root.protocol('WM_DELETE_WINDOW', self.destructor)
        self.root.geometry("900x900")
        self.panel = tk.Label(self.root)  # full camera frame
        self.panel.place(x=100, y=10, width=580, height=580)
        self.panel2 = tk.Label(self.root)  # initialize image panel
        self.panel2.place(x=400, y=65, width=275, height=275)
        self.T = tk.Label(self.root)
        self.T.place(x=60, y=5)
        self.T.config(text="Sign Language To Text Conversion", font=("Courier", 30, "bold"))
        self.panel3 = tk.Label(self.root)  # Current Symbol
        self.panel3.place(x=500, y=540)
        self.T1 = tk.Label(self.root)
        self.T1.place(x=10, y=540)
        self.T1.config(text="Character :", font=("Courier", 30, "bold"))
        self.panel4 = tk.Label(self.root)  # Word
        self.panel4.place(x=220, y=595)
        self.T2 = tk.Label(self.root)
        self.T2.place(x=10, y=595)
        self.T2.config(text="Word :", font=("Courier", 30, "bold"))
        self.panel5 = tk.Label(self.root)  # Sentence
        self.panel5.place(x=350, y=645)
        self.T3 = tk.Label(self.root)
        self.T3.place(x=10, y=645)
        self.T3.config(text="Sentence :", font=("Courier", 30, "bold"))
        self.T4 = tk.Label(self.root)
        self.T4.place(x=250, y=690)
        self.T4.config(text="Suggestions :", fg="red", font=("Courier", 30, "bold"))
        # Buttons showing the top three Hunspell suggestions for the current word.
        self.bt1 = tk.Button(self.root, command=self.action1, height=0, width=0)
        self.bt1.place(x=26, y=745)
        self.bt2 = tk.Button(self.root, command=self.action2, height=0, width=0)
        self.bt2.place(x=325, y=745)
        self.bt3 = tk.Button(self.root, command=self.action3, height=0, width=0)
        self.bt3.place(x=625, y=745)
        self.str = ""            # accumulated sentence
        self.word = " "          # word currently being spelled
        self.current_symbol = "Empty"
        self.photo = "Empty"
        self.video_loop()

    def video_loop(self):
        # Grab a frame, show it, classify the ROI, and re-schedule itself.
        ok, frame = self.vs.read()
        if ok:
            cv2image = cv2.flip(frame, 1)
            # Region of interest: upper-right quadrant-ish box of the frame.
            x1 = int(0.5 * frame.shape[1])
            y1 = 10
            x2 = frame.shape[1] - 10
            y2 = int(0.5 * frame.shape[1])
            cv2.rectangle(frame, (x1 - 1, y1 - 1), (x2 + 1, y2 + 1), (255, 0, 0), 1)
            cv2image = cv2.cvtColor(cv2image, cv2.COLOR_BGR2RGBA)
            self.current_image = Image.fromarray(cv2image)
            imgtk = ImageTk.PhotoImage(image=self.current_image)
            self.panel.imgtk = imgtk  # keep a reference so Tk doesn't GC it
            self.panel.config(image=imgtk)
            # Threshold the ROI to a binary image for the classifiers.
            cv2image = cv2image[y1:y2, x1:x2]
            gray = cv2.cvtColor(cv2image, cv2.COLOR_BGR2GRAY)
            blur = cv2.GaussianBlur(gray, (5, 5), 2)
            th3 = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)
            ret, res = cv2.threshold(th3, 70, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
            self.predict(res)
            self.current_image2 = Image.fromarray(res)
            imgtk = ImageTk.PhotoImage(image=self.current_image2)
            self.panel2.imgtk = imgtk
            self.panel2.config(image=imgtk)
            self.panel3.config(text=self.current_symbol, font=("Courier", 30))
            self.panel4.config(text=self.word, font=("Courier", 30))
            self.panel5.config(text=self.str, font=("Courier", 30))
            # Refresh the suggestion buttons for the in-progress word.
            # NOTE(review): thresholds are > 1/2/3 but indices 0/1/2 — the last
            # suggestion is only shown when 4+ exist; confirm intended.
            predicts = self.hs.suggest(self.word)
            if (len(predicts) > 1):
                self.bt1.config(text=predicts[0], font=("Courier", 20))
            else:
                self.bt1.config(text="")
            if (len(predicts) > 2):
                self.bt2.config(text=predicts[1], font=("Courier", 20))
            else:
                self.bt2.config(text="")
            if (len(predicts) > 3):
                self.bt3.config(text=predicts[2], font=("Courier", 20))
            else:
                self.bt3.config(text="")
        self.root.after(5, self.video_loop)

    def predict(self, test_image):
        # Classify a thresholded ROI and debounce the result into self.word.
        test_image = cv2.resize(test_image, (128, 128))
        result = self.loaded_model.predict(test_image.reshape(1, 128, 128, 1))
        result_dru = self.loaded_model_dru.predict(
            test_image.reshape(1, 128, 128, 1))
        result_tkdi = self.loaded_model_tkdi.predict(
            test_image.reshape(1, 128, 128, 1))
        result_smn = self.loaded_model_smn.predict(
            test_image.reshape(1, 128, 128, 1))
        prediction = {}
        prediction['blank'] = result[0][0]
        inde = 1
        for i in ascii_uppercase:
            prediction[i] = result[0][inde]
            inde += 1
        # LAYER 1: pick the highest-scoring symbol from the main model.
        prediction = sorted(prediction.items(), key=operator.itemgetter(1), reverse=True)
        self.current_symbol = prediction[0][0]
        # LAYER 2: re-check confusable groups with the specialist models.
        if (self.current_symbol == 'D' or self.current_symbol == 'R' or self.current_symbol == 'U'):
            prediction = {}
            prediction['D'] = result_dru[0][0]
            prediction['R'] = result_dru[0][1]
            prediction['U'] = result_dru[0][2]
            prediction = sorted(prediction.items(), key=operator.itemgetter(1), reverse=True)
            self.current_symbol = prediction[0][0]
        if (self.current_symbol == 'D' or self.current_symbol == 'I' or self.current_symbol == 'K' or self.current_symbol == 'T'):
            prediction = {}
            prediction['D'] = result_tkdi[0][0]
            prediction['I'] = result_tkdi[0][1]
            prediction['K'] = result_tkdi[0][2]
            prediction['T'] = result_tkdi[0][3]
            prediction = sorted(prediction.items(), key=operator.itemgetter(1), reverse=True)
            self.current_symbol = prediction[0][0]
        if (self.current_symbol == 'M' or self.current_symbol == 'N' or self.current_symbol == 'S'):
            prediction1 = {}
            prediction1['M'] = result_smn[0][0]
            prediction1['N'] = result_smn[0][1]
            prediction1['S'] = result_smn[0][2]
            prediction1 = sorted(prediction1.items(), key=operator.itemgetter(1), reverse=True)
            if (prediction1[0][0] == 'S'):
                self.current_symbol = prediction1[0][0]
            else:
                self.current_symbol = prediction[0][0]
        # Debounce: a symbol must dominate for >60 frames with a clear margin
        # over every other letter before it is committed.
        if (self.current_symbol == 'blank'):
            for i in ascii_uppercase:
                self.ct[i] = 0
        self.ct[self.current_symbol] += 1
        if (self.ct[self.current_symbol] > 60):
            for i in ascii_uppercase:
                if i == self.current_symbol:
                    continue
                tmp = self.ct[self.current_symbol] - self.ct[i]
                if tmp < 0:
                    tmp *= -1
                if tmp <= 20:
                    # Too close to a competing letter: reset and wait longer.
                    self.ct['blank'] = 0
                    for i in ascii_uppercase:
                        self.ct[i] = 0
                    return
            self.ct['blank'] = 0
            for i in ascii_uppercase:
                self.ct[i] = 0
            if self.current_symbol == 'blank':
                # A committed blank finishes the current word.
                if self.blank_flag == 0:
                    self.blank_flag = 1
                    if len(self.str) > 0:
                        self.str += " "
                    self.str += self.word
                    self.word = ""
            else:
                if (len(self.str) > 16):
                    self.str = ""
                self.blank_flag = 0
                self.word += self.current_symbol

    def action1(self):
        # Replace the current word with Hunspell suggestion #1.
        predicts = self.hs.suggest(self.word)
        if (len(predicts) > 0):
            self.word = ""
            self.str += " "
            self.str += predicts[0]

    def action2(self):
        # Replace the current word with Hunspell suggestion #2.
        predicts = self.hs.suggest(self.word)
        if (len(predicts) > 1):
            self.word = ""
            self.str += " "
            self.str += predicts[1]

    def action3(self):
        # Replace the current word with Hunspell suggestion #3.
        predicts = self.hs.suggest(self.word)
        if (len(predicts) > 2):
            self.word = ""
            self.str += " "
            self.str += predicts[2]

    def action4(self):
        # Replace the current word with Hunspell suggestion #4 (no button bound).
        predicts = self.hs.suggest(self.word)
        if (len(predicts) > 3):
            self.word = ""
            self.str += " "
            self.str += predicts[3]

    def action5(self):
        # Replace the current word with Hunspell suggestion #5 (no button bound).
        predicts = self.hs.suggest(self.word)
        if (len(predicts) > 4):
            self.word = ""
            self.str += " "
            self.str += predicts[4]

    def destructor(self):
        # Tear down the GUI and release the camera on window close.
        print("Closing Application...")
        self.root.destroy()
        self.vs.release()
        cv2.destroyAllWindows()
from hunspell import Hunspell

# Korean dictionary loaded from the local 'ko' data directory.
h = Hunspell("ko", hunspell_data_dir='ko')

if __name__ == "__main__":
    # Spell-check two candidate words, then print suggestions for the first.
    print(h.spell("안녕하세요으"))
    print(h.spell("안녕하세"))
    print(h.suggest("안녕하세요으"))
class SpellChecker:
    """
    Class for managing spell checking using Hunspell.

    Implemented as a class, as multiple instances of a SpellChecker might be
    used to maintain different dictionaries simultaneously (for example adding
    custom words).
    """

    def __init__(self, allowed_punctuation_marks, dictionary_directory):
        """
        Constructor method. Declares and creates a new Hunspell object.
        """
        self.allowed_punctuation_marks = allowed_punctuation_marks
        self.dictionary_directory = dictionary_directory
        self.hunspell = None
        self.refresh_dict()

    def refresh_dict(self):
        """
        Create a new Hunspell object from the specified dictionary file.
        """
        self.hunspell = Hunspell('index', hunspell_data_dir=self.dictionary_directory)

    def is_punctuation_mark(self, word):
        """
        Checks if the given word corresponds to one of the allowed punctuation marks.

        :param word: a string with a single word
        :type: string
        :return: boolean indicating if the given word is an allowed punctuation mark
        :type: boolean
        """
        return bool(re.match(r'[%s]' % self.allowed_punctuation_marks, word))

    def is_correctly_spelled(self, word):
        """
        Checks if the given word is correctly spelled.

        :param word: a string with a single word
        :type: string
        :return: boolean indicating if the spelling of the word is correct
        :type: boolean
        """
        return self.hunspell.spell(word)

    def suggest(self, word):
        """
        Suggest similar and correctly spelled alternatives for the given string.
        Orders Hunspell suggestions by edit distance.

        :param word: a string with a single word
        :type: string
        :return: a list of suggestions
        :type: list<string>
        """
        suggestions = self.hunspell.suggest(word)
        return sorted(suggestions, key=lambda suggestion: edit_distance(word, suggestion))

    def fix(self, word):
        """
        Fixes the spelling of the given word.

        :param word: a string with a single word
        :type: string
        :return: the same word if correctly spelled, a punctuation mark, or if
            no suggestion is available; otherwise the top Hunspell suggestion.
        """
        if self.is_punctuation_mark(word) or self.is_correctly_spelled(word):
            return word
        suggestions = self.suggest(word)
        # Bug fix: Hunspell can return no suggestions for some misspellings;
        # fall back to the original word instead of raising IndexError.
        return suggestions[0] if suggestions else word

    def fix_text(self, text):
        """
        Fixes the spelling of a multi-worded phrase.

        :param text: the phrase string
        :type: string
        :return: the same phrase, with the spelling of each word fixed.
        """
        fixed_text = ' '.join([self.fix(word) for word in word_tokenize(text)])
        return re.sub(r' ([%s])' % self.allowed_punctuation_marks, r'\1',
                      fixed_text)  # remove spaces preceding punctuation
def test_hunspell_suggest(self):
    # Suggestions for the misspelling 'dpg' must come back in Hunspell's
    # ranked order.
    checker = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    expected = ['dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg', 'GDP']
    self.assertListEqual(checker.suggest('dpg'), expected)
    del checker
class UnsupervisedGrammarCorrector:
    """Iteratively rewrites a sentence by proposing spelling / inflection /
    determiner / preposition substitutions and keeping any candidate that a
    language model scores higher than the current best."""

    def __init__(self, threshold=0.96):
        basename = os.path.dirname(os.path.realpath(__file__))
        self.lm = LanguageModel()
        # Load spaCy
        self.nlp = spacy.load("en")
        # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
        # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
        self.gb = Hunspell("en_GB-large", hunspell_data_dir=basename + '/resources/spelling/')
        # Inflection forms: http://wordlist.aspell.net/other/
        self.gb_infl = loadWordFormDict(basename + "/resources/agid-2016.01.19/infl.txt")
        # List of common determiners
        self.determiners = {"", "the", "a", "an"}
        # List of common prepositions
        self.prepositions = {
            "", "about", "at", "by", "for", "from", "in", "of", "on", "to",
            "with"
        }
        # NOTE(review): threshold is stored but not used in the visible methods.
        self.threshold = threshold

    def correct(self, sentence):
        """Repeatedly apply process() until no higher-scoring rewrite is found."""
        # If the line is empty, preserve the newline in output and continue
        if not sentence:
            return ""
        best = sentence
        score = self.lm.score(best)
        while True:
            new_best, new_score = self.process(best)
            if new_best and new_score > score:
                best = new_best
                score = new_score
            else:
                break
        return best

    def process(self, sentence: str) -> Tuple[str, bool]:
        """Generate one round of candidate rewrites and return the best one.

        NOTE(review): the annotation says Tuple[str, bool], but the second
        value returned is best_prob (a LM score) — confirm intended contract.
        """
        # Process sent with spacy
        proc_sent = self.nlp.tokenizer(sentence)
        self.nlp.tagger(proc_sent)
        # Calculate avg token prob of the sent so far.
        orig_prob = self.lm.score(proc_sent.text)
        # Store all the candidate corrected sentences here
        candidates = []
        # Process each token.
        for tok in proc_sent:
            # SPELLCHECKING
            # Spell check: tok must be alphabetical and not a real word.
            candidate_tokens = set()
            lower_cased_token = tok.lower_
            if lower_cased_token.isalpha(
            ) and not self.gb.spell(lower_cased_token):
                candidate_tokens |= set(self.gb.suggest(lower_cased_token))
            # MORPHOLOGY: alternative inflections of the token's lemma.
            if tok.lemma_ in self.gb_infl:
                candidate_tokens |= self.gb_infl[tok.lemma_]
            # DETERMINERS: try swapping/deleting (the "" candidate) determiners.
            if lower_cased_token in self.determiners:
                candidate_tokens |= self.determiners
            # PREPOSITIONS: likewise for prepositions.
            if lower_cased_token in self.prepositions:
                candidate_tokens |= self.prepositions
            # Keep only candidates Hunspell accepts.
            candidate_tokens = [
                c for c in candidate_tokens if self.gb.spell(c)
            ]
            if candidate_tokens:
                # Preserve the original token's casing.
                if tok.is_title:
                    candidate_tokens = [c.title() for c in candidate_tokens]
                elif tok.is_upper:
                    candidate_tokens = [c.upper() for c in candidate_tokens]
                candidates.extend(
                    self._generate_candidates(tok.i, candidate_tokens,
                                              proc_sent))
        best_prob = orig_prob
        best = sentence
        for candidate in candidates:
            # Score the candidate sentence
            cand_prob = self.lm.score(candidate.text)
            # NOTE(review): debug print — scores the candidate a second time;
            # consider removing before production use.
            print(candidate.text, self.lm.score(candidate.text), cand_prob)
            # Compare cand_prob against weighted orig_prob and best_prob
            if cand_prob > best_prob:
                best_prob = cand_prob
                best = candidate.text
        # Return the best sentence and a boolean whether to search for more errors
        return best, best_prob

    def _generate_candidates(self, tok_id, candidate_tokens,
                             tokenized_sentence) -> List[str]:
        """Build full-sentence candidates by substituting each candidate token
        at position tok_id (an empty token deletes the word)."""
        # Save candidates here.
        candidates = []
        prefix = tokenized_sentence[:tok_id]
        suffix = tokenized_sentence[tok_id + 1:]
        # Loop through the input alternative candidates
        for token in candidate_tokens:
            candidate = prefix.text_with_ws
            if token:
                candidate += token + " "
            candidate += suffix.text_with_ws
            candidate = self.nlp.tokenizer(candidate)
            candidates.append(candidate)
        return candidates
def test_hunspell_suggest(self):
    # Expect Hunspell's ranked suggestion list for the misspelling 'dpg'.
    d = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    self.assertListEqual(d.suggest('dpg'), ['dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg', 'GDP'])
    del d
class HunspellTest(unittest.TestCase):
    """Full test suite for the CyHunspell `Hunspell` wrapper: spelling,
    suggestion, stemming, bulk/concurrent calls, path-encoding edge cases,
    and the in-memory / on-disk suggestion and stem caches."""

    def assertRegexpSearch(self, *args, **kwargs):
        # Bridge the Python 2/3 rename of the regex assertion helper.
        if PY3:
            self.assertRegex(*args, **kwargs)
        else:
            self.assertRegexpMatches(*args, **kwargs)

    def setUp(self):
        # Fresh checker against the small bundled 'test' dictionary.
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)

    def tearDown(self):
        # Tests may have already deleted self.h (e.g. test_create_destroy).
        try:
            del self.h
        except AttributeError:
            pass

    def assertAllIn(self, checked, expected):
        # Subset assertion with a readable failure message.
        self.assertTrue(all(x in expected for x in checked),
                        u"{} not all found in {}".format(checked, expected))

    def test_create_destroy(self):
        del self.h

    def test_missing_dict(self):
        with self.assertRaises(HunspellFilePathError):
            Hunspell('not_avail', hunspell_data_dir=DICT_DIR)

    @patch('os.path.isfile', return_value=True)
    @patch('os.access', return_value=True)
    def test_bad_path_encoding(self, *mocks):
        # A path containing a lone surrogate cannot be encoded for the C layer.
        if PY3:
            with self.assertRaises(HunspellFilePathError):
                Hunspell('not_checked', hunspell_data_dir=u'bad/\udcc3/decoding')
        else:
            # Python 2 just make an illegal string instead of raising
            with captured_c_stderr_file() as caperr:
                Hunspell('not_checked', hunspell_data_dir=u'bad/\udcc3/decoding')
            with open(caperr, 'r') as err:
                self.assertRegexpSearch(err.read(), r'error:[^\n]*bad/[^\n]*/decoding')

    @patch('hunspell.hunspell.WIN32_LONG_PATH_PREFIX', '/not/valid')
    def test_windows_utf_8_encoding_applies_prefix(self, *mocks):
        with captured_c_stderr_file() as caperr:
            with patch("os.name", 'nt'):
                # If python file existance checks used prefix, this would raise a HunspellFilePathError
                Hunspell('test', system_encoding='UTF-8')
        with open(caperr, 'r') as err:
            # But the Hunspell library lookup had the prefix applied
            self.assertRegexpSearch(err.read(), r'error:[^\n]*/not/valid[^\n]*')

    def test_spell(self):
        self.assertFalse(self.h.spell('dpg'))
        self.assertTrue(self.h.spell('dog'))

    def test_spell_utf8(self):
        self.assertTrue(self.h.spell(u'café'))
        self.assertFalse(self.h.spell(u'uncafé'))

    def test_spell_empty(self):
        self.assertTrue(self.h.spell(''))

    def test_suggest(self):
        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        suggest = self.h.suggest('dpg')
        self.assertIsInstance(suggest, tuple)
        self.assertAllIn(required, suggest)

    def test_suggest_utf8(self):
        required = (u'café', u'Cerf')
        # Both byte-ish and unicode spellings of the same typo.
        for variant in ('cefé', u'cefé'):
            suggest = self.h.suggest(variant)
            self.assertIsInstance(suggest, tuple)
            self.assertAllIn(required, suggest)

    def test_suggest_empty(self):
        self.assertEqual(self.h.suggest(''), ())

    def test_stem(self):
        self.assertEqual(self.h.stem('dog'), ('dog',))
        self.assertEqual(self.h.stem('permanently'), ('permanent',))

    def test_add(self):
        # Adding a word makes it valid and suggestable for near-misses.
        word = 'outofvocabularyword'
        self.assertEqual(self.h.spell(word), False)
        self.h.add(word)
        self.assertEqual(self.h.spell(word), True)
        typo = word + 'd'
        self.assertAllIn([word], self.h.suggest(typo))

    def test_bulk_suggest(self):
        self.h.set_concurrency(3)
        suggest = self.h.bulk_suggest(['dog', 'dpg'])
        self.assertEqual(sorted(suggest.keys()), ['dog', 'dpg'])
        self.assertIsInstance(suggest['dog'], tuple)
        self.assertAllIn(('dog',), suggest['dog'])
        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        self.assertIsInstance(suggest['dpg'], tuple)
        self.assertAllIn(required, suggest['dpg'])
        # More inputs than worker threads exercises the queueing path.
        checked = ['bjn', 'dog', 'dpg', 'dyg', 'foo', 'frg', 'opg', 'pgg', 'qre', 'twg']
        suggest = self.h.bulk_suggest(checked)
        self.assertEqual(sorted(suggest.keys()), checked)

    def test_bulk_stem(self):
        self.h.set_concurrency(3)
        self.assertDictEqual(self.h.bulk_stem(['dog', 'permanently']), {
            'permanently': ('permanent',),
            'dog': ('dog',)
        })
        self.assertDictEqual(self.h.bulk_stem(['dog', 'twigs', 'permanently', 'unrecorded']), {
            'unrecorded': ('recorded',),
            'permanently': ('permanent',),
            'twigs': ('twig',),
            'dog': ('dog',)
        })

    def test_non_overlapping_caches(self):
        # Different dictionaries must not share suggestion/stem caches.
        test_suggest = self.h.suggest('testing')
        test_stem = self.h.stem('testing')
        self.h._suggest_cache['made-up'] = test_suggest
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.h._stem_cache['made-up'] = test_stem
        self.assertEqual(self.h.stem('made-up'), test_stem)
        h2 = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
        self.assertNotEqual(h2.suggest('made-up'), test_suggest)
        self.assertNotEqual(h2.stem('made-up'), test_stem)

    def test_overlapping_caches(self):
        # Re-opening the same dictionary shares the in-process cache.
        test_suggest = self.h.suggest('testing')
        test_stem = self.h.stem('testing')
        self.h._suggest_cache['made-up'] = test_suggest
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.h._stem_cache['made-up'] = test_stem
        self.assertEqual(self.h.stem('made-up'), test_stem)
        del self.h
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.assertEqual(self.h.stem('made-up'), test_stem)

    def test_save_caches_persistance(self):
        # save_cache() writes to disk; a new instance reloads the entries.
        temp_dir = tempfile.mkdtemp()
        try:
            h1 = Hunspell('test', hunspell_data_dir=DICT_DIR,
                          disk_cache_dir=temp_dir, cache_manager='disk_hun')
            test_suggest = h1.suggest('testing')
            test_stem = h1.stem('testing')
            h1._suggest_cache['made-up'] = test_suggest
            self.assertEqual(h1.suggest('made-up'), test_suggest)
            h1._stem_cache['made-up'] = test_stem
            self.assertEqual(h1.stem('made-up'), test_stem)
            h1.save_cache()
            del h1
            cacheman = get_cache_manager('disk_hun')
            cacheman.deregister_all_caches()
            self.assertEqual(len(cacheman.cache_by_name), 0)
            h2 = Hunspell('test', hunspell_data_dir=DICT_DIR,
                          disk_cache_dir=temp_dir, cache_manager='disk_hun')
            self.assertNotEqual(len(h2._suggest_cache), 0)
            self.assertNotEqual(len(h2._stem_cache), 0)
            self.assertEqual(h2.suggest('made-up'), test_suggest)
            self.assertEqual(h2.stem('made-up'), test_stem)
        finally:
            shutil.rmtree(temp_dir)  # Nuke temp content

    def test_clear_caches_persistance(self):
        # clear_cache() after save_cache() removes the on-disk entries too.
        temp_dir = tempfile.mkdtemp()
        try:
            h1 = Hunspell('test', hunspell_data_dir=DICT_DIR,
                          disk_cache_dir=temp_dir, cache_manager='disk_hun')
            test_suggest = h1.suggest('testing')
            test_stem = h1.stem('testing')
            h1._suggest_cache['made-up'] = test_suggest
            self.assertEqual(h1.suggest('made-up'), test_suggest)
            h1._stem_cache['made-up'] = test_stem
            self.assertEqual(h1.stem('made-up'), test_stem)
            h1.save_cache()
            h1.clear_cache()
            del h1
            cacheman = get_cache_manager('disk_hun')
            cacheman.deregister_all_caches()
            self.assertEqual(len(cacheman.cache_by_name), 0)
            h2 = Hunspell('test', hunspell_data_dir=DICT_DIR,
                          disk_cache_dir=temp_dir, cache_manager='disk_hun')
            self.assertEqual(len(h2._suggest_cache), 0)
            self.assertEqual(len(h2._stem_cache), 0)
            self.assertNotEqual(h2.suggest('made-up'), test_suggest)
            self.assertNotEqual(h2.stem('made-up'), test_stem)
        finally:
            shutil.rmtree(temp_dir)  # Nuke temp content

    def test_clear_caches_non_peristance(self):
        # Without a disk cache, clear_cache() simply empties the in-memory caches.
        test_suggest = self.h.suggest('testing')
        test_stem = self.h.stem('testing')
        self.h._suggest_cache['made-up'] = test_suggest
        self.assertEqual(self.h.suggest('made-up'), test_suggest)
        self.h._stem_cache['made-up'] = test_stem
        self.assertEqual(self.h.stem('made-up'), test_stem)
        self.h.clear_cache()
        del self.h
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
        self.assertNotEqual(self.h.suggest('made-up'), test_suggest)
        self.assertNotEqual(self.h.stem('made-up'), test_stem)
class Application:
    """Tkinter GUI that converts sign-language gestures captured from the
    webcam into text.

    A cascade of four Keras CNNs is used: a 26-letter (+blank) base model,
    plus three disambiguation models for the visually similar groups
    D/R/U, T/K/D/I and S/M/N. Recognized letters accumulate into a word;
    Hunspell suggestions for the current word are shown on five buttons.

    NOTE(review): layout reconstructed from a whitespace-mangled source —
    confirm nesting of `video_loop`'s rescheduling and of the nested
    button-callback defs in `predict` against the original repository.
    """

    def __init__(self):
        self.directory = "model/"
        self.hs = Hunspell('en_US')  # suggestion engine for word completion
        self.vs = cv2.VideoCapture(0)  # default webcam
        self.current_image = None
        self.current_image2 = None
        # Base model: blank + A..Z.
        self.json_file = open(self.directory + "model.json", "r")
        self.model_json = self.json_file.read()
        self.json_file.close()
        self.loaded_model = model_from_json(self.model_json)
        self.loaded_model.load_weights(self.directory + "model.h5")
        # Disambiguation model for D/R/U.
        self.json_file_dru = open(self.directory + "model_dru.json", "r")
        self.model_json_dru = self.json_file_dru.read()
        self.json_file_dru.close()
        self.loaded_model_dru = model_from_json(self.model_json_dru)
        self.loaded_model_dru.load_weights(self.directory + "model_dru.h5")
        # Disambiguation model for T/K/D/I.
        self.json_file_tkdi = open(self.directory + "model_tkdi.json", "r")
        self.model_json_tkdi = self.json_file_tkdi.read()
        self.json_file_tkdi.close()
        self.loaded_model_tkdi = model_from_json(self.model_json_tkdi)
        self.loaded_model_tkdi.load_weights(self.directory + "model_tkdi.h5")
        # Disambiguation model for S/M/N.
        self.json_file_smn = open(self.directory + "model_smn.json", "r")
        self.model_json_smn = self.json_file_smn.read()
        self.json_file_smn.close()
        self.loaded_model_smn = model_from_json(self.model_json_smn)
        self.loaded_model_smn.load_weights(self.directory + "model_smn.h5")
        # Per-symbol frame counters used to debounce predictions.
        self.ct = {}
        self.ct['blank'] = 0
        self.blank_flag = 0
        for i in ascii_uppercase:
            self.ct[i] = 0
        print("Loaded model from disk")
        # --- UI layout ---
        self.root = tk.Tk()
        self.root.title("Sign language to Text Converter")
        self.root.protocol('WM_DELETE_WINDOW', self.destructor)
        self.root.geometry("1100x1100")
        self.canvas = tk.Canvas(width=1100, height=1100)
        self.canvas.pack(fill="both", expand=True)
        self.panel = tk.Label(self.root)  # live camera feed
        self.panel.place(x=135, y=90, width=640, height=480)
        self.panel2 = tk.Label(self.root)  # initialize image panel
        self.panel2.place(x=460, y=95, width=310, height=310)
        self.canvas.create_text(450, 50, text="Sign Language to Text",
                                fill="black", font=("courier", 30, "bold"))
        self.panel3 = tk.Label(self.root)  # Current Symbol
        self.panel3.place(x=500, y=600)
        self.canvas.create_text(155, 653, text="Character:", fill="black",
                                font=("courier", 30, "bold"))
        self.panel4 = tk.Label(self.root)  # Word
        self.panel4.place(x=220, y=680)
        self.canvas.create_text(110, 713, text="Word:", fill="black",
                                font=("courier", 30, "bold"))
        self.panel5 = tk.Label(self.root)  # Sentence
        self.panel5.place(x=350, y=740)
        self.canvas.create_text(140, 773, text="Sentence:", fill="black",
                                font=("courier", 30, "bold"))
        self.T4 = tk.Label(self.root)
        self.T4.place(x=270, y=800)
        self.T4.config(text="Suggestions", fg="red", font=("Courier", 20, "bold"))
        self.btcall = tk.Button(self.root, command=self.action_call, height=0, width=0)
        self.btcall.config(text="About", bg="black", fg="white", font=("Courier", 14))
        self.btcall.place(x=950, y=20)
        # Five suggestion buttons, wired to action1..action5.
        self.bt1 = tk.Button(self.root, bg="#DAF7A6", activebackground='white',
                             command=self.action1, height=0, width=0)
        self.bt1.place(x=25, y=890)
        self.bt2 = tk.Button(self.root, bg="#DAF7A6", activebackground='white',
                             command=self.action2, height=0, width=0)
        self.bt2.place(x=325, y=890)
        self.bt3 = tk.Button(self.root, bg="#DAF7A6", activebackground='white',
                             command=self.action3, height=0, width=0)
        self.bt3.place(x=625, y=890)
        self.bt4 = tk.Button(self.root, bg="#DAF7A6", activebackground='white',
                             command=self.action4, height=0, width=0)
        self.bt4.place(x=25, y=950)
        self.bt5 = tk.Button(self.root, bg="#DAF7A6", activebackground='white',
                             command=self.action5, height=0, width=0)
        self.bt5.place(x=325, y=950)
        # Audio / Backspace / Reset buttons; commands are attached later
        # inside predict() via nested callbacks.
        self.bt6 = tk.Button(self.root, text="Audio", bg="#DAF7A6",
                             activebackground='white', font=("Courier", 20))
        self.bt6.place(x=930, y=80)
        self.bt7 = tk.Button(self.root, text="Backspace", bg="#DAF7A6",
                             activebackground='white', font=("Courier", 20))
        self.bt7.place(x=880, y=140)
        self.bt8 = tk.Button(self.root, text="Reset", bg="#DAF7A6",
                             activebackground='white', font=("Courier", 20))
        self.bt8.place(x=930, y=200)
        # NOTE: self.str shadows the builtin name `str` as an attribute name.
        self.str = ""          # accumulated sentence
        self.word = ""         # word currently being spelled
        self.current_symbol = "Empty"
        self.photo = "Empty"
        self.video_loop()

    def video_loop(self):
        """Grab a frame, run prediction on the hand ROI, refresh the UI,
        then reschedule itself via Tk's event loop."""
        ok, frame = self.vs.read()
        if ok:
            cv2image = cv2.flip(frame, 1)  # mirror for a natural view
            # Hand region of interest: top-right quadrant-ish box.
            x1 = int(0.5 * frame.shape[1])
            y1 = 10
            x2 = frame.shape[1] - 10
            y2 = int(0.5 * frame.shape[1])
            cv2.rectangle(frame, (x1 - 1, y1 - 1), (x2 + 1, y2 + 1), (255, 0, 0), 1)
            cv2image = cv2.cvtColor(cv2image, cv2.COLOR_BGR2RGBA)
            self.current_image = Image.fromarray(cv2image)
            imgtk = ImageTk.PhotoImage(image=self.current_image)
            self.panel.imgtk = imgtk  # keep a reference so Tk doesn't GC it
            self.panel.config(image=imgtk)
            # Preprocess the ROI: grayscale -> blur -> adaptive + Otsu threshold.
            cv2image = cv2image[y1:y2, x1:x2]
            gray = cv2.cvtColor(cv2image, cv2.COLOR_BGR2GRAY)
            blur = cv2.GaussianBlur(gray, (5, 5), 2)
            th3 = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                        cv2.THRESH_BINARY_INV, 11, 2)
            ret, res = cv2.threshold(th3, 70, 255,
                                     cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
            self.predict(res)
            self.current_image2 = Image.fromarray(res)
            imgtk = ImageTk.PhotoImage(image=self.current_image2)
            self.panel2.imgtk = imgtk
            self.panel2.config(image=imgtk)
            self.panel3.config(text=self.current_symbol, font=("Courier", 35))
            self.panel4.config(text=self.word, font=("Courier", 25))
            self.panel5.config(text=self.str, font=("Courier", 25))
            # Refresh the five suggestion buttons for the current word.
            predicts = self.hs.suggest(self.word)
            if (len(predicts) > 0):
                self.bt1.config(text=predicts[0], font=("Courier", 20))
            else:
                self.bt1.config(text="")
            if (len(predicts) > 1):
                self.bt2.config(text=predicts[1], font=("Courier", 20))
            else:
                self.bt2.config(text="")
            if (len(predicts) > 2):
                self.bt3.config(text=predicts[2], font=("Courier", 20))
            else:
                self.bt3.config(text="")
            if (len(predicts) > 3):
                self.bt4.config(text=predicts[3], font=("Courier", 20))
            else:
                self.bt4.config(text="")
            if (len(predicts) > 4):
                self.bt5.config(text=predicts[4], font=("Courier", 20))
            else:
                self.bt5.config(text="")
        self.root.after(30, self.video_loop)

    def predict(self, test_image):
        """Classify the thresholded ROI and debounce the result into
        self.word / self.str using per-symbol frame counters."""
        test_image = cv2.resize(test_image, (128, 128))
        result = self.loaded_model.predict(test_image.reshape(1, 128, 128, 1))
        result_dru = self.loaded_model_dru.predict(test_image.reshape(1, 128, 128, 1))
        result_tkdi = self.loaded_model_tkdi.predict(test_image.reshape(1, 128, 128, 1))
        result_smn = self.loaded_model_smn.predict(test_image.reshape(1, 128, 128, 1))
        prediction = {}
        prediction['blank'] = result[0][0]
        inde = 1
        for i in ascii_uppercase:
            prediction[i] = result[0][inde]
            inde += 1
        # LAYER 1: pick the base model's top class.
        prediction = sorted(prediction.items(), key=operator.itemgetter(1),
                            reverse=True)
        self.current_symbol = prediction[0][0]
        # LAYER 2: refine visually similar groups with specialist models.
        if (self.current_symbol == 'D' or self.current_symbol == 'R'
                or self.current_symbol == 'U'):
            prediction = {}
            prediction['D'] = result_dru[0][0]
            prediction['R'] = result_dru[0][1]
            prediction['U'] = result_dru[0][2]
            prediction = sorted(prediction.items(), key=operator.itemgetter(1),
                                reverse=True)
            self.current_symbol = prediction[0][0]
        if (self.current_symbol == 'D' or self.current_symbol == 'I'
                or self.current_symbol == 'K' or self.current_symbol == 'T'):
            prediction = {}
            prediction['D'] = result_tkdi[0][0]
            prediction['I'] = result_tkdi[0][1]
            prediction['K'] = result_tkdi[0][2]
            prediction['T'] = result_tkdi[0][3]
            prediction = sorted(prediction.items(), key=operator.itemgetter(1),
                                reverse=True)
            self.current_symbol = prediction[0][0]
        if (self.current_symbol == 'M' or self.current_symbol == 'N'
                or self.current_symbol == 'S'):
            prediction1 = {}
            prediction1['M'] = result_smn[0][0]
            prediction1['N'] = result_smn[0][1]
            prediction1['S'] = result_smn[0][2]
            prediction1 = sorted(prediction1.items(), key=operator.itemgetter(1),
                                 reverse=True)
            if (prediction1[0][0] == 'S'):
                self.current_symbol = prediction1[0][0]
            else:
                self.current_symbol = prediction[0][0]
        # Debouncing: reset letter counters on blank, bump current symbol.
        if (self.current_symbol == 'blank'):
            for i in ascii_uppercase:
                self.ct[i] = 0
        self.ct[self.current_symbol] += 1
        if (self.ct[self.current_symbol] > 15):  # 60
            for i in ascii_uppercase:
                if i == self.current_symbol:
                    print(i)
                    continue
                tmp = self.ct[self.current_symbol] - self.ct[i]
                if tmp < 0:
                    tmp *= -1
                # If any other letter was seen almost as often, the signal is
                # ambiguous: reset counters and wait for a clearer gesture.
                if tmp <= 5:  # 20
                    self.ct['blank'] = 0
                    for i in ascii_uppercase:
                        self.ct[i] = 0
                    return
            self.ct['blank'] = 0
            for i in ascii_uppercase:
                self.ct[i] = 0
            if self.current_symbol == 'blank':
                # A stable blank commits the current word to the sentence.
                if self.blank_flag == 0:
                    self.blank_flag = 1
                    if len(self.str) > 0:
                        self.str += " "
                    self.str += self.word
                    self.word = ""
                    print(self.str)

                    def Text_to_speech():  # for audio output
                        if os.path.exists("audio.mp3"):
                            os.remove("audio.mp3")
                        Message = self.str
                        speech = gTTS(text=Message)
                        speech.save('audio.mp3')
                        playsound('audio.mp3')

                    def erase():  # for reset
                        self.str = ""

                    def Back_Space():  # for correction
                        # NOTE(review): rstrip(self.str[-1]) removes ALL
                        # trailing copies of the last character (and raises
                        # IndexError on an empty string); a single-character
                        # backspace would be self.str[:-1] — confirm intent.
                        self.str = self.str.rstrip(self.str[-1])

                    self.bt6.config(command=Text_to_speech)
                    self.bt7.config(command=Back_Space)
                    self.bt8.config(command=erase)
            else:
                # A stable letter is appended to the current word.
                if (len(self.str) > 16):
                    self.str = ""
                self.blank_flag = 0
                self.word += self.current_symbol
            print(self.str)

    def action1(self):
        # Accept suggestion #1: replaces the in-progress word.
        predicts = self.hs.suggest(self.word)
        if (len(predicts) > 0):
            self.word = ""
            self.str += " "
            self.str += predicts[0]

    def action2(self):
        # Accept suggestion #2.
        predicts = self.hs.suggest(self.word)
        if (len(predicts) > 1):
            self.word = ""
            self.str += " "
            self.str += predicts[1]

    def action3(self):
        # Accept suggestion #3.
        predicts = self.hs.suggest(self.word)
        if (len(predicts) > 2):
            self.word = ""
            self.str += " "
            self.str += predicts[2]

    def action4(self):
        # Accept suggestion #4.
        predicts = self.hs.suggest(self.word)
        if (len(predicts) > 3):
            self.word = ""
            self.str += " "
            self.str += predicts[3]

    def action5(self):
        # Accept suggestion #5.
        predicts = self.hs.suggest(self.word)
        if (len(predicts) > 4):
            self.word = ""
            self.str += " "
            self.str += predicts[4]

    def destructor(self):
        # Release the camera and all OpenCV/Tk resources on window close.
        print("Closing Application...")
        self.root.destroy()
        self.vs.release()
        cv2.destroyAllWindows()

    def destructor1(self):
        # Close only the About window.
        print("Closing Application...")
        self.root1.destroy()

    def action_call(self):
        """Open the About window with team photos and credits."""
        self.root1 = tk.Toplevel(self.root)
        self.root1.title("About")
        self.root1.protocol('WM_DELETE_WINDOW', self.destructor1)
        self.root1.geometry("900x900")
        self.tx = tk.Label(self.root1)
        self.tx.place(x=360, y=40)
        self.tx.config(text="Efforts By", font=("Courier", 20, "bold"))
        self.photo1 = tk.PhotoImage(file='Pictures/chiranjit.png')
        self.w1 = tk.Label(self.root1, image=self.photo1)
        self.w1.place(x=170, y=105)
        self.tx6 = tk.Label(self.root1)
        self.tx6.place(x=170, y=310)
        self.tx6.config(text="Chiranjit\n170130103093", font=("Courier", 15, "bold"))
        self.photo2 = tk.PhotoImage(file='Pictures/mitesh.png')
        self.w2 = tk.Label(self.root1, image=self.photo2)
        self.w2.place(x=380, y=105)
        self.tx2 = tk.Label(self.root1)
        self.tx2.place(x=380, y=310)
        self.tx2.config(text="Mitesh\n170130103115", font=("Courier", 15, "bold"))
        self.photo3 = tk.PhotoImage(file='Pictures/harshil.png')
        self.w3 = tk.Label(self.root1, image=self.photo3)
        self.w3.place(x=590, y=105)
        self.tx3 = tk.Label(self.root1)
        self.tx3.place(x=590, y=310)
        self.tx3.config(text="Harshil\n170130103092", font=("Courier", 15, "bold"))
        self.tx7 = tk.Label(self.root1)
        self.tx7.place(x=220, y=380)
        self.tx7.config(text="Under the supervision of", font=("Courier", 20, "bold"))
        self.photo6 = tk.PhotoImage(file='Pictures/sir.png')
        self.w6 = tk.Label(self.root1, image=self.photo6)
        self.w6.place(x=380, y=430)
        # NOTE: self.tx6 is reused here, replacing the earlier reference.
        self.tx6 = tk.Label(self.root1)
        self.tx6.place(x=230, y=640)
        self.tx6.config(text="Prof. Manan M. Nanavati", font=("Courier", 20, "bold"))
class CyHunspell():
    '''
    Спеллер на основе cython версии hunspell

    (Speller based on the cython version of hunspell.)

    >>> word_en = 'cookbok'
    >>> word_ru = 'поваринная'
    >>> speller_en = CyHunspell(lang="en")
    >>> speller_en.spell(word_en)
    False
    >>> speller_en.suggest(word_en)
    ('cookbook', 'copybook', 'codebook', 'Cook', 'cook')
    >>> speller_en.replace(word_en)
    'cookbook'
    >>> speller_ru = CyHunspell(lang="ru")
    >>> speller_ru.spell(word_ru)
    False
    >>> speller_ru.suggest(word_ru)
    ('поваренная',)
    >>> speller_ru.replace(word_ru)
    'поваренная'
    '''

    # Shorthand language codes mapped to full Hunspell dictionary names.
    langs = {'ru': 'ru_RU', 'en': 'en_US'}

    def __init__(
            self,
            lang='en',
            max_dist=2,
            cpu=os.cpu_count(),
            # cache_manager="hunspell",disk_cache_dir=None,
            # hunspell_data_dir=None,system_encoding=None
            spell_kwargs=None):
        """Create the underlying Hunspell dictionary.

        Args:
            lang: shorthand ('en', 'ru') or a full dictionary name.
            max_dist: maximum edit distance accepted by replace().
            cpu: worker-thread count for Hunspell's bulk operations
                 (note: the default is evaluated once at class-definition time).
            spell_kwargs: extra keyword arguments forwarded to Hunspell();
                 defaults to no extras. (Fixed: previously a mutable
                 default dict shared between all instances.)
        """
        spell_kwargs = {} if spell_kwargs is None else spell_kwargs
        self.lang = self.langs.get(lang, lang)
        self.spell_dict = Hunspell(self.lang, **spell_kwargs)
        self.max_dist = max_dist
        self.spell_dict.set_concurrency(cpu)

    def spell(self, word):
        """Return True/False for a spellcheck, or None when the word cannot
        be encoded for the underlying C library."""
        try:
            result = self.spell_dict.spell(word)
        except UnicodeEncodeError:
            result = None
        return result

    def suggest(self, word):
        """Return a tuple of suggestions; empty on encoding failure."""
        try:
            result = self.spell_dict.suggest(word)
        except UnicodeEncodeError:
            result = tuple()
        return result

    def replace(self, word, max_dist=None):
        """Return the top suggestion when *word* is misspelled and the
        suggestion is within *max_dist* edits; otherwise return *word*."""
        max_dist = max_dist if max_dist is not None else self.max_dist
        if self.spell(word):
            return word
        suggestions = self.suggest(word)
        if (suggestions and edit_distance(word, suggestions[0]) <= max_dist):
            return suggestions[0]
        return word
class HunspellTest(unittest.TestCase):
    """Reduced Hunspell wrapper test suite: spell/suggest/stem plus bulk
    (concurrent) variants against the bundled 'test' dictionary.

    NOTE(review): this file defines the class name HunspellTest more than
    once; if the definitions share a module, only the last one is collected.
    """

    def setUp(self):
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)

    def tearDown(self):
        # A test may already have deleted self.h.
        try:
            del self.h
        except AttributeError:
            pass

    def assertAllIn(self, checked, expected):
        # Subset assertion with a readable failure message.
        self.assertTrue(all(x in expected for x in checked),
                        u"{} not all found in {}".format(checked, expected))

    def test_create_destroy(self):
        del self.h

    def test_missing_dict(self):
        with self.assertRaises(IOError):
            Hunspell('not_avail', hunspell_data_dir=DICT_DIR)

    def test_spell(self):
        self.assertFalse(self.h.spell('dpg'))
        self.assertTrue(self.h.spell('dog'))

    def test_spell_utf8(self):
        self.assertTrue(self.h.spell(u'café'))
        self.assertFalse(self.h.spell(u'uncafé'))

    def test_spell_empty(self):
        self.assertTrue(self.h.spell(''))

    def test_suggest(self):
        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        suggest = self.h.suggest('dpg')
        self.assertIsInstance(suggest, tuple)
        self.assertAllIn(required, suggest)

    def test_suggest_utf8(self):
        required = (u'café', u'Cerf')
        for variant in ('cefé', u'cefé'):
            suggest = self.h.suggest(variant)
            self.assertIsInstance(suggest, tuple)
            self.assertAllIn(required, suggest)

    def test_suggest_empty(self):
        self.assertEqual(self.h.suggest(''), ())

    def test_stem(self):
        self.assertEqual(self.h.stem('dog'), ('dog', ))
        self.assertEqual(self.h.stem('permanently'), ('permanent', ))

    def test_bulk_suggest(self):
        self.h.set_concurrency(3)
        suggest = self.h.bulk_suggest(['dog', 'dpg'])
        self.assertEqual(sorted(suggest.keys()), ['dog', 'dpg'])
        self.assertIsInstance(suggest['dog'], tuple)
        self.assertAllIn(('dog', ), suggest['dog'])
        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        self.assertIsInstance(suggest['dpg'], tuple)
        self.assertAllIn(required, suggest['dpg'])
        # More inputs than worker threads exercises the queueing path.
        checked = [
            'bjn', 'dog', 'dpg', 'dyg', 'foo', 'frg', 'opg', 'pgg', 'qre', 'twg'
        ]
        suggest = self.h.bulk_suggest(checked)
        self.assertEqual(sorted(suggest.keys()), checked)

    def test_bulk_stem(self):
        self.h.set_concurrency(3)
        self.assertDictEqual(self.h.bulk_stem(['dog', 'permanently']), {
            'permanently': ('permanent', ),
            'dog': ('dog', )
        })
        self.assertDictEqual(
            self.h.bulk_stem(['dog', 'twigs', 'permanently', 'unrecorded']), {
                'unrecorded': ('recorded', ),
                'permanently': ('permanent', ),
                'twigs': ('twig', ),
                'dog': ('dog', )
            })
class HunspellTest(unittest.TestCase):
    """Reduced Hunspell wrapper test suite (duplicate of the previous
    reduced class, differing only in formatting).

    NOTE(review): re-defines HunspellTest — if these classes share a module,
    earlier definitions are shadowed and never collected by the runner.
    """

    def setUp(self):
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)

    def tearDown(self):
        # A test may already have deleted self.h.
        try:
            del self.h
        except AttributeError:
            pass

    def assertAllIn(self, checked, expected):
        # Subset assertion with a readable failure message.
        self.assertTrue(all(x in expected for x in checked),
                        u"{} not all found in {}".format(checked, expected))

    def test_create_destroy(self):
        del self.h

    def test_missing_dict(self):
        with self.assertRaises(IOError):
            Hunspell('not_avail', hunspell_data_dir=DICT_DIR)

    def test_spell(self):
        self.assertFalse(self.h.spell('dpg'))
        self.assertTrue(self.h.spell('dog'))

    def test_spell_utf8(self):
        self.assertTrue(self.h.spell(u'café'))
        self.assertFalse(self.h.spell(u'uncafé'))

    def test_spell_empty(self):
        self.assertTrue(self.h.spell(''))

    def test_suggest(self):
        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        suggest = self.h.suggest('dpg')
        self.assertIsInstance(suggest, tuple)
        self.assertAllIn(required, suggest)

    def test_suggest_utf8(self):
        required = (u'café', u'Cerf')
        for variant in ('cefé', u'cefé'):
            suggest = self.h.suggest(variant)
            self.assertIsInstance(suggest, tuple)
            self.assertAllIn(required, suggest)

    def test_suggest_empty(self):
        self.assertEqual(self.h.suggest(''), ())

    def test_stem(self):
        self.assertEqual(self.h.stem('dog'), ('dog',))
        self.assertEqual(self.h.stem('permanently'), ('permanent',))

    def test_bulk_suggest(self):
        self.h.set_concurrency(3)
        suggest = self.h.bulk_suggest(['dog', 'dpg'])
        self.assertEqual(sorted(suggest.keys()), ['dog', 'dpg'])
        self.assertIsInstance(suggest['dog'], tuple)
        self.assertAllIn(('dog',), suggest['dog'])
        required = ('dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg')
        self.assertIsInstance(suggest['dpg'], tuple)
        self.assertAllIn(required, suggest['dpg'])
        # More inputs than worker threads exercises the queueing path.
        checked = ['bjn', 'dog', 'dpg', 'dyg', 'foo', 'frg', 'opg', 'pgg', 'qre', 'twg']
        suggest = self.h.bulk_suggest(checked)
        self.assertEqual(sorted(suggest.keys()), checked)

    def test_bulk_stem(self):
        self.h.set_concurrency(3)
        self.assertDictEqual(self.h.bulk_stem(['dog', 'permanently']), {
            'permanently': ('permanent',),
            'dog': ('dog',)
        })
        self.assertDictEqual(self.h.bulk_stem(['dog', 'twigs', 'permanently', 'unrecorded']), {
            'unrecorded': ('recorded',),
            'permanently': ('permanent',),
            'twigs': ('twig',),
            'dog': ('dog',)
        })
class Stem():
    """The Stem class deals with various tasks as follows:

    - spell error detection and correction
    - morphological analysis
    - stemming

    These tasks are carried out in the `Kurdish Hunspell project
    <https://github.com/sinaahmadi/KurdishHunspell>`_.
    """

    def __init__(self, dialect, script):
        # Maps raw Hunspell morphological flags to the keys used in the
        # dictionaries returned by analyze().
        self.hunspell_flags = {
            "po": "pos",
            "is": "description",
            "ts": "terminal_suffix",
            "ds": "formation"
        }
        if dialect == "Sorani" and script == "Arabic":
            self.huns = Hunspell("ckb-Arab", hunspell_data_dir=klpt.get_data("data/"))
        else:
            raise Exception(
                "Sorry, only Sorani dialect in the Arabic script is supported now. Stay tuned for other dialects and scripts!"
            )

    # def stem(self, word):
    #     """A function for stemming a single word"""
    #     pass

    # def lemmatize(self, word):
    #     """A function for lemmatization of a single word"""
    #     pass

    def check_spelling(self, word):
        """Check spelling of a word

        Args:
            word (str): input word to be spell-checked

        Raises:
            TypeError: only string as input

        Returns:
            bool: True if the spelling is correct, False if the spelling is incorrect
        """
        if not isinstance(word, str):
            raise TypeError("Only a word (str) is allowed.")
        else:
            return self.huns.spell(word)

    def correct_spelling(self, word):
        """Correct spelling errors if the input word is incorrect

        Args:
            word (str): input word to be spell-checked

        Raises:
            TypeError: only string as input

        Returns:
            tuple (boolean, list): a tuple where the first element indicates
            the correctness of the word (True if correct, False if incorrect).
            If the input word is incorrect, suggestions are provided in a list
            as the second element of the tuple, as (False, []). If no
            suggestion is available, the list is returned empty as (True, []).
        """
        if not isinstance(word, str):
            raise TypeError("Only a word (str) is allowed.")
        else:
            if self.check_spelling(word):
                return (True, [])
            return (False, list(self.huns.suggest(word)))

    def analyze(self, word_form):
        """Morphological analysis of a given word

        More details regarding Kurdish morphological analysis can be found at
        https://github.com/sinaahmadi/KurdishHunspell

        Args:
            word_form (str): a single word-form

        Raises:
            TypeError: only string as input

        Returns:
            (list(dict)): a list of all possible morphological analyses
            according to the defined morphological rules

            The morphological analysis is returned as a dictionary as follows:

            - "pos": the part-of-speech of the word-form according to
              `the Universal Dependency tag set
              <https://universaldependencies.org/u/pos/index.html>`_
            - "description": is flag
            - "terminal_suffix": anything except ts flag
            - "formation": if ds flag is set, its value is assigned to
              description and the value of formation is set to derivational.
              Although the majority of our morphological rules cover
              inflectional forms, it is not accurate to say all of them are
              inflectional. Therefore, we only set this value to derivational
              wherever we are sure.
            - "base": `ts` flag. The definition of terminal suffix is a bit
              tricky in Hunspell. According to `the Hunspell documentation
              <http://manpages.ubuntu.com/manpages/trusty/en/man4/hunspell.4.html>`_,
              "Terminal suffix fields are inflectional suffix fields
              'removed' by additional (not terminal) suffixes". In other
              words, the ts flag in Hunspell represents whatever is left
              after stripping all affixes. Therefore, it is the morphological
              base.

            If the input cannot be analyzed morphologically, an empty list is
            returned.
        """
        if not isinstance(word_form, str):
            raise TypeError("Only a word (str) is allowed.")
        else:
            # Given the morphological analysis of a word-form with Hunspell
            # flags, extract relevant information and return a dictionary.
            word_analysis = list()
            for analysis in list(self.huns.analyze(word_form)):
                analysis_dict = dict()
                for item in analysis.split():
                    if ":" not in item:
                        continue
                    if item.split(":")[1] == "ts":
                        # ts flag exceptionally appears after the value as value:key in the Hunspell output
                        analysis_dict["base"] = item.split(":")[0]
                        # anything except the terminal_suffix is considered to be the base
                        analysis_dict[self.hunspell_flags[item.split(
                            ":")[1]]] = word_form.replace(
                                item.split(":")[0], "")
                    elif item.split(":")[0] in self.hunspell_flags.keys():
                        # assign the key:value pairs from the Hunspell string output to the dictionary output of the current function
                        # for ds flag, add derivation as the formation type, otherwise inflection
                        if item.split(":")[0] == "ds":
                            analysis_dict[self.hunspell_flags[item.split(
                                ":")[0]]] = "derivational"
                            analysis_dict[
                                self.hunspell_flags["is"]] = item.split(":")[1]
                        else:
                            analysis_dict[self.hunspell_flags[item.split(
                                ":")[0]]] = item.split(":")[1]
                # if there is no value assigned to the ts flag, the terminal suffix is a zero-morpheme 0
                if self.hunspell_flags[
                        "ts"] not in analysis_dict or analysis_dict[
                            self.hunspell_flags["ts"]] == "":
                    analysis_dict[self.hunspell_flags["ts"]] = "0"
                word_analysis.append(analysis_dict)
        return word_analysis