def addCustomDict(self, customDictPath: str):
    logger.debug('Add custom dictionary: {}'.format(customDictPath))
    dic_folder = os.path.dirname(customDictPath)
    dic_file_name = os.path.basename(customDictPath)
    dic_name = dic_file_name[:-4]
    dic_file = customDictPath
    aff_file = customDictPath[:-4] + '.aff'

    key = (dic_name, customDictPath)
    if key in self._checkers:
        logger.debug('Dictionary already added: {}'.format(customDictPath))
        return

    try:
        create_new_dic_file(dic_file)
        create_new_aff_file(aff_file)
        fix_dic_file(dic_file)
        checker = hunspell.Hunspell(dic_name,
                                    hunspell_data_dir=dic_folder,
                                    system_encoding='UTF-8')
        self._checkers[key] = checker
        self._customDicts.append((key, dic_file))
    except IOError:
        logger.error(
            "Can't create custom dictionary: {}".format(customDictPath))
def __init__(self, langlist: List[str], folders: List[str]):
    """
    langlist - list of the languages ("ru_RU", "en_US", etc.)
    """
    logger.debug('Initialize HunspellWrapper spell checker')

    # Key - language (en_US, ru_RU, etc.),
    # value - instance of the Hunspell class
    self._checkers = {}

    # Index - number of the dictionary,
    # value - tuple: (key for self._checkers, path to .dic file)
    self._customDicts = []

    dictsFinder = DictsFinder(folders)

    for lang in langlist:
        checker = None

        for path in dictsFinder.getFoldersForLang(lang):
            dic_file = os.path.join(path, lang + '.dic')
            aff_file = os.path.join(path, lang + '.aff')

            if (checker is None
                    and os.path.exists(dic_file)
                    and os.path.exists(aff_file)):
                checker = hunspell.Hunspell(lang,
                                            hunspell_data_dir=path,
                                            system_encoding='UTF-8')
                logger.debug('Add dictionary: {}'.format(dic_file))

        if checker is not None:
            self._checkers[lang] = checker
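# Usage sketch (hypothetical, not from the source): assumes the two methods
# above belong to the HunspellWrapper class named in the log messages and
# that the folder below holds en_US.dic/en_US.aff.
wrapper = HunspellWrapper(['en_US'], ['/usr/share/hunspell'])
wrapper.addCustomDict('/home/user/.config/myapp/custom.dic')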
def __init__(self, input_data):
    super().__init__(input_data=input_data)
    self.hunspell = hunspell.Hunspell()
    self.correct_words = []
    self.incorrect_words = []
    # Tuples of (Original-Word, Hunspell-Suggestion, Levenshtein-Distance)
    self.full_data = []
    self.has_done_work = False
    self.tokenizer = TweetTokenizer()
def preprocess_texts_from_given_forum(forum_id, date_from, date_to, filename):
    data_frame = get_texts_and_prepare_data_frame(date_from, date_to, forum_id)
    data_frame = delete_undesired_elements_from_texts(data_frame)
    tokens = data_frame.post.apply(lambda x: nltk.word_tokenize(x))
    hun = hunspell.Hunspell('pl')
    counter = [0]
    tokens_stemmed = tokens.apply(lambda x: correct_writing(hun, x, counter))
    tokens_stemmed = delete_stop_words(tokens_stemmed)
    data_frame.post = tokens_stemmed
    # Additional save after preprocessing is done
    data_frame.to_csv(filename, sep=';', escapechar='\\')
    return data_frame
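# correct_writing is called above but not defined in this snippet. A minimal
# sketch of what such a helper might look like, assuming it swaps misspelled
# tokens for Hunspell's first suggestion and counts corrections in counter[0]:
def correct_writing(hun, tokens, counter):
    corrected = []
    for token in tokens:
        if hun.spell(token):
            corrected.append(token)
        else:
            suggestions = hun.suggest(token)
            # Keep the original token when Hunspell has no suggestion
            corrected.append(suggestions[0] if suggestions else token)
            counter[0] += 1
    return corrected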
def __init__(self, dict_file, aff_file, add_words_file,
             tokenizer_language, dictionary_dir=None, language_long=None):
    self.logger = logging.getLogger(os.path.basename(sys.argv[0]))

    self.dict_file = dict_file
    if self.dict_file is None:
        print('*** Missing spelling-dict-file in configuration. Exiting.')
        sys.exit(1)

    self.aff_file = aff_file
    if self.aff_file is None:
        print('*** Missing spelling-aff-file in configuration. Exiting.')
        sys.exit(1)

    self.add_words_file = add_words_file

    if sys.platform == 'darwin':
        localdir = os.path.dirname(__file__)
        dict_dir = os.path.join(localdir, dictionary_dir)
        self.speller = hunspell.Hunspell(language_long,
                                         hunspell_data_dir=dict_dir)
    else:
        self.speller = hunspell.HunSpell(self.dict_file, self.aff_file)
    if self.speller is None:
        print('>>>>>> Could not create speller...')

    try:
        self.tokenizer = nltk.data.load(
            'tokenizers/punkt/{0}.pickle'.format(tokenizer_language))
    except LookupError:
        print('>>>>>> Could not load TOKENIZER language file.')
        sys.exit(1)

    if self.add_words_file is not None:
        self.train()
def check(self, fname):
    """Check file."""
    if os.path.exists(self.exclude_fname):
        patterns = [
            _make_abspath(item) for item in _read_file(self.exclude_fname)
        ]
        if any(fnmatch(fname, pattern) for pattern in patterns):
            return []
    ret = []
    if not self.native:
        # hunspell has trouble with apostrophes and other delimiters
        # out-of-the-box, so split on anything that is not a letter
        words = []
        with open(fname, "r") as fobj:
            for line in fobj:
                for word in re.split("[^a-zA-Z]", line.strip()):
                    words.append(word)
        with TmpFile(lambda x: x.write(os.linesep.join(words))) as temp_fname:
            stdout, _ = _shcmd(self.cmd + [temp_fname])
        words = sorted(set(word.strip() for word in stdout if word.strip()))
        if words:
            ldict = _grep(fname, words)
            for word, lines in [(word, ldict[word]) for word in words]:
                for lnum in lines:
                    ret.append((lnum, (word,)))
    else:
        spell_obj = hunspell.Hunspell("en_US")
        ret = []
        with open(fname, "r") as fobj:
            for num, line in enumerate(fobj):
                line = line.strip()
                for word in re.split("[^a-zA-Z]", line):
                    # Skip the empty strings the split produces
                    if (word and not spell_obj.spell(word)
                            and word not in self.whitelist):
                        ret.append((num + 1, (word,)))
    return ret
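# Hypothetical driver for the check method above; the class name,
# constructor signature, and file name are assumptions, not from the source.
checker = FileSpellChecker(native=True, whitelist={'hunspell'})
for lnum, (word,) in checker.check('README.md'):
    print('README.md:{}: unknown word {}'.format(lnum, word))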
FILE_NAME = "markov.csv" df = pd.read_csv(PATH + FILE_NAME) all_words = [] for index, row in df.iterrows(): all_words.append(row['hunspell_errors']) flat_list = "".join([item for sublist in all_words for item in sublist]).replace("][", ",").replace('''"''', '\'').replace("','", "', '") print('"' in flat_list) splitlist = flat_list[2:].split("\', \'") hs = hunspell.Hunspell() tuples = [] from tqdm import tqdm for word in tqdm(splitlist, desc="Loading Suggestions"): suggestions = hs.suggest(word) if(len(suggestions) > 0 and word != "<END>"): tuples.append([word, suggestions[0]]) print(tuples) from Levenshtein import _levenshtein def calc_levenshtein(tup): return _levenshtein.distance(tup[0], tup[1])
import hunspell

dic_any = hunspell.Hunspell("es_ANY")
dic_es = hunspell.Hunspell("es_ES", "es_ES")
dic_en = hunspell.Hunspell("en_US", "en_US")

res = dic_any.spell("análisis")


def check_spell(words):
    count = 0
    count_en = 0
    if len(words) == 0:
        # Empty input: treat as fully correct, with no English fallbacks
        return 1.0, 0
    for word in words:
        correct_es = dic_any.spell(word) or dic_es.spell(word)
        count = count + 1 if correct_es else count
        if not correct_es:
            # Fall back to the English dictionary for loanwords
            correct_en = dic_en.spell(word)
            count_en = count_en + 1 if correct_en else count_en
    return count / len(words), count_en
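# Usage sketch with made-up input: check_spell returns the share of words
# accepted by the Spanish dictionaries and the count of English fallbacks.
ratio, english_hits = check_spell(['hola', 'mundo', 'software'])
print(ratio, english_hits)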
def dictionary(self, name, path):
    self.checker = hunspell.Hunspell(name, hunspell_data_dir=path)
    return self
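# Because the builder returns self it can be chained; the class name and
# dictionary path here are hypothetical, not from the source.
speller = Speller().dictionary('en_US', '/usr/share/hunspell')
print(speller.checker.spell('hello'))  # True once the dictionary loads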
import re
import string
import xml.etree.ElementTree as ET
from re import finditer
from collections import Counter

import hunspell
import pandas, xgboost, numpy, textblob
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from nltk.tokenize.treebank import TreebankWordDetokenizer

LANGUAGE_CODE = 'es'
dictionary = hunspell.Hunspell('es_ANY', hunspell_data_dir="./")

emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE)

data_path = "./public_data_development/"
data_path_mint = "/home/nacho/DATASETS/public_data_development/"

parser_dev = ET.XMLParser(encoding='utf-8')
import os

import hunspell

curr_dir = os.getcwd()
print(curr_dir)
dir1 = curr_dir + '/dictionary'
print(dir1)

h = hunspell.Hunspell('en_US', hunspell_data_dir=dir1)
print("Word Suggestions ready")

while True:
    i = input("String?")
    list1 = h.suggest(i)
    print(list1)
    # Keep only the suggestions that extend what was typed
    list2 = [s for s in list1 if s.startswith(i)]
    print(list2)
def test_given_correct_word_when_testing_then_true(self):
    word = "konstytucja"
    res = prp.is_correct(word, hun=hunspell.Hunspell('pl'))
    self.assertTrue(res)
def __init__(self):
    ruta_recursos = BASE_DIR + "/res/hunspell-es/"
    print(ruta_recursos)
    # self.dic = hunspell.Hunspell(ruta_recursos + "es_ANY.dic",
    #                              ruta_recursos + "es_ANY.aff")
    self.dic = hunspell.Hunspell(ruta_recursos + "es_ANY")
import warnings

import hunspell
import spacy
from sklearn import preprocessing
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
from textacy import keyterms

warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
warnings.filterwarnings(action='ignore')

LANGUAGE_CODE = ['es', 'cr', 'mx', 'pe', 'uy']
CROSS_LINGUAL = [True, False]
# CROSS_LINGUAL = [True]
bTestPhase = False  # If we are doing test, concatenate train + dev; if not, use dev as test

print("Loading Hunspell dictionary")
dictionary = hunspell.Hunspell(
    'es_ANY', hunspell_data_dir="./dictionaries")  # In case you're using CyHunspell
# dictionary = hunspell.HunSpell('./Dict/es_ANY.dic', "./Dict/es_ANY.aff")  # In case you're using Hunspell

LABEL_ENCODER = preprocessing.LabelEncoder()
TERNARY_LABEL_ENCODER = preprocessing.LabelEncoder()

data_test_path = "./public_data_task1/"
data_path = "./public_data_development/"
# data_path = "../TASS2019/DATASETS/public_data/"

print("Loading Spacy Model")
lemmatizer = spacy.load(
    "es_core_news_sm")  # GLOBAL to avoid loading the model several times

print("Loading NLTK stuff")
def __init__(self):
    self.directory = 'C:/Users/vyaso/OneDrive/Documents/Sign-Language-to-Text-master/Sign-Language-to-Text-master/model/'
    dict_path = 'C:/Users/vyaso/OneDrive/Documents/Sign-Language-to-Text-master/Sign-Language-to-Text-master/hunspell-master/dicts/en_US/'
    self.hs = hunspell.Hunspell("en_US", hunspell_data_dir=dict_path)
    self.vs = cv2.VideoCapture(0 + cv2.CAP_DSHOW)
    self.current_image = None
    self.current_image2 = None

    # Load the main model and the three disambiguation models from disk
    self.json_file = open(self.directory + "model-bw.json", "r")
    self.model_json = self.json_file.read()
    self.json_file.close()
    self.loaded_model = model_from_json(self.model_json)
    self.loaded_model.load_weights(self.directory + "model-bw.h5")

    self.json_file_dru = open(self.directory + "model-bw_dru.json", "r")
    self.model_json_dru = self.json_file_dru.read()
    self.json_file_dru.close()
    self.loaded_model_dru = model_from_json(self.model_json_dru)
    self.loaded_model_dru.load_weights(self.directory + "model-bw_dru.h5")

    self.json_file_tkdi = open(self.directory + "model-bw_tkdi.json", "r")
    self.model_json_tkdi = self.json_file_tkdi.read()
    self.json_file_tkdi.close()
    self.loaded_model_tkdi = model_from_json(self.model_json_tkdi)
    self.loaded_model_tkdi.load_weights(self.directory + "model-bw_tkdi.h5")

    self.json_file_smn = open(self.directory + "model-bw_smn.json", "r")
    self.model_json_smn = self.json_file_smn.read()
    self.json_file_smn.close()
    self.loaded_model_smn = model_from_json(self.model_json_smn)
    self.loaded_model_smn.load_weights(self.directory + "model-bw_smn.h5")

    self.ct = {}
    self.ct['blank'] = 0
    self.blank_flag = 0
    for i in ascii_uppercase:
        self.ct[i] = 0
    print("Loaded model from disk")

    self.root = tk.Tk()
    self.root.title("SIGN LANGUAGE ILLUSTRATOR")
    self.root.protocol('WM_DELETE_WINDOW', self.destructor)
    self.root.attributes('-fullscreen', True)
    self.root.bind('<Escape>', lambda e: self.root.destroy())

    self.panel = tk.Label(self.root)
    self.panel.place(x=135, y=10, width=640, height=640)
    self.panel2 = tk.Label(self.root)  # initialize image panel
    self.panel2.place(x=460, y=95, width=310, height=310)

    self.T = tk.Label(self.root)
    self.T.place(x=21, y=17)
    self.T.config(text="SIGN LANGUAGE TO TEXT", font=("sans-serif", 25, "bold"))

    self.panel3 = tk.Label(self.root)  # Current symbol
    self.panel3.place(x=1100, y=90)
    self.T1 = tk.Label(self.root)
    self.T1.place(x=900, y=90)
    self.T1.config(text="Character :", font=("sans-serif", 15, "bold"))

    self.panel4 = tk.Label(self.root)  # Word
    self.panel4.place(x=1100, y=130)
    self.T2 = tk.Label(self.root)
    self.T2.place(x=900, y=130)
    self.T2.config(text="Word :", font=("sans-serif", 15, "bold"))

    self.panel5 = tk.Label(self.root)  # Sentence
    self.panel5.place(x=1100, y=170)
    self.T3 = tk.Label(self.root)
    self.T3.place(x=900, y=170)
    self.T3.config(text="Sentence :", font=("sans-serif", 15, "bold"))

    self.T4 = tk.Label(self.root)
    self.T4.place(x=900, y=220)
    self.T4.config(text="Suggestions :-", fg="blue", font=("sans-serif", 20, "bold"))

    # Five buttons that hold Hunspell suggestions for the current word
    self.bt1 = tk.Button(self.root, command=self.action1, height=0, width=0)
    self.bt1.place(x=900, y=260)
    self.bt2 = tk.Button(self.root, command=self.action2, height=0, width=0)
    self.bt2.place(x=1200, y=260)
    self.bt3 = tk.Button(self.root, command=self.action3, height=0, width=0)
    self.bt3.place(x=900, y=300)
    self.bt4 = tk.Button(self.root, command=self.action4, height=0, width=0)
    self.bt4.place(x=1200, y=300)
    self.bt5 = tk.Button(self.root, command=self.action5, height=0, width=0)
    self.bt5.place(x=1050, y=340)

    self.image1 = Image.open("C:/Users/vyaso/OneDrive/Documents/Sign-Language-to-Text-master/Sign-Language-to-Text-master/sign.jpg")
    # Image.LANCZOS replaces Image.ANTIALIAS, which was removed in Pillow 10
    self.image1 = self.image1.resize((400, 300), Image.LANCZOS)
    test = ImageTk.PhotoImage(self.image1)
    self.label1 = tk.Label(image=test)
    self.label1.image = test
    # Position the reference image
    self.label1.place(x=900, y=420)

    self.str = ""
    self.word = ""
    self.current_symbol = "Empty"
    self.photo = "Empty"
    self.video_loop()
import pickle

import gensim
import hunspell
import pandas as pd
import spacy

from util.SpellChecker import check_spell
from util.TextPreprocessor import clean_text
from util.UserStoryParser import parse_user_story

dic_es = hunspell.Hunspell("es_ES", "es_ES")
dic_en = hunspell.Hunspell("en_US", "en_US")

# Cache the word2vec model as a pickle so later runs load faster
try:
    with open('./../word2vec.pickle', 'rb') as handle:
        word2vec = pickle.load(handle)
except FileNotFoundError:
    word2vec = gensim.models.KeyedVectors.load_word2vec_format(
        './../resources/SBW-vectors-300-min5.bin', binary=True)
    with open('./../word2vec.pickle', 'wb') as handle:
        pickle.dump(word2vec, handle, protocol=pickle.HIGHEST_PROTOCOL)


def label_user_stories():
    user_stories_df = pd.read_csv("./../resources/data_aug.csv", sep="\t")
    user_stories_df["cleaned"] = user_stories_df.apply(
        lambda row: clean_text(row["description"]), axis=1)
    user_stories = [parse_user_story(us) for us in user_stories_df["cleaned"]]
    spacy_nlp = spacy.load('es')
def test_given_incorrect_word_when_testing_then_false(self):
    word = "karkoweczka"
    res = prp.is_correct(word, hun=hunspell.Hunspell('pl'))
    self.assertFalse(res)
def __init__(self, obj):
    self.root = Toplevel(obj.root)
    self.root.title("Gesture to Text and Voice")
    self.root.protocol('WM_DELETE_WINDOW', self.destructor)
    self.root.geometry('1600x1000')
    self.root.configure(bg="gray11")
    self.root.withdraw()
    splash = Splash(self)

    # RGB feed
    self.panel1 = Label(self.root)
    self.panel1.place(x=500, y=-60, width=600, height=600)

    # Binary feed
    self.panel2 = Label(self.root)
    self.panel2.place(x=800, y=95, width=250, height=250)

    # Gesture chart
    self.gesturechart = cv2.imread("images\\canny.png")
    self.panel3 = Label(self.root)
    self.panel3.place(x=50, y=0, width=400, height=500)
    self.im1 = cv2.cvtColor(self.gesturechart, cv2.COLOR_BGR2RGBA)
    self.im1 = cv2.resize(self.im1, (400, 500))
    self.im1 = Image.fromarray(self.im1)
    self.g1 = ImageTk.PhotoImage(image=self.im1)
    self.panel3.img = self.g1
    self.panel3.configure(image=self.g1)

    # Sentence
    self.label1 = Label(self.root, text="Sentence:", bg="gray1", fg="red",
                        font=("Courier", 20))
    self.label1.place(x=50, y=700)
    self.label2 = Label(self.root, bg="gray1", fg="yellow")
    self.label2.place(x=250, y=700)

    # Predicted letter
    self.label3 = Label(self.root, text="Predicted letter:", bg="gray1",
                        fg="red", font=("Courier", 20))
    self.label3.place(x=50, y=580)
    self.label4 = Label(self.root, bg="gray1", fg="cyan", font=("Courier", 20))
    self.label4.place(x=350, y=580)

    # Word formation
    self.label5 = Label(self.root, text="Current Word:", bg="gray1", fg="red",
                        font=("Courier", 20))
    self.label5.place(x=50, y=640)
    self.label6 = Label(self.root, bg="gray1", fg="cyan", font=("Courier", 20))
    self.label6.place(x=350, y=640)

    # Suggestion header
    self.label7 = Label(self.root, bg="gray1", text="Suggestions",
                        fg="spring green", font=("Courier", 30))
    self.label7.place(x=1200, y=10)

    # Enter button
    self.bt0 = Button(self.root, text="Enter", font=("Courier", 15, "bold"),
                      command=self.suggest, bg="gray9", fg="deepskyblue2")
    self.bt0.place(x=600, y=500)

    # Sign-to-speech button
    self.bt_speak = Button(self.root, text="Speak", font=("Courier", 15, "bold"),
                           command=self.gesture_to_voice, bg="gray9",
                           fg="deepskyblue2")
    self.bt_speak.place(x=800, y=500)

    # Reset button
    self.bt_reset = Button(self.root, text="Reset", font=("Courier", 15, "bold"),
                           command=self.reset, bg="gray9", fg="deepskyblue2")
    self.bt_reset.place(x=1000, y=500)

    # Twelve suggestion buttons (bt1..bt12) wired to action1..action12,
    # created in a loop instead of twelve copy-pasted blocks
    suggestion_coords = [
        (1125, 60), (1265, 110), (1405, 60),
        (1125, 160), (1265, 210), (1405, 160),
        (1125, 270), (1265, 330), (1405, 270),
        (1125, 390), (1265, 450), (1405, 390),
    ]
    for idx, (x, y) in enumerate(suggestion_coords, start=1):
        button = Button(self.root, command=getattr(self, 'action{}'.format(idx)),
                        bg="gray9", fg="pink", height=0, width=0)
        button.place(x=x, y=y)
        setattr(self, 'bt{}'.format(idx), button)

    self.bw_model = obj.bw_model
    self.dru_model = obj.dru_model
    self.tkdi_model = obj.tkdi_model
    self.vw_model = obj.vw_model
    self.aesmn_model = obj.aesmn_model

    self.directory = 'model\\'
    self.cam = cv2.VideoCapture(0)
    self.current_image = None
    self.canny_img = None
    self.image_x = 128
    self.image_y = 128

    self.ct = {}
    self.ct['blank'] = 0
    self.blank_flag = 0
    for i in ascii_uppercase:
        self.ct[i] = 0
    self.sentence = ""
    self.word = ""
    self.current_symbol = "Empty"

    # Hunspell dictionary for word suggestions
    curr_dir = os.getcwd()
    dir1 = curr_dir + '/dictionary'
    self.h = hunspell.Hunspell('en_US', hunspell_data_dir=dir1)

    # Text-to-speech engine
    self.engine = pyttsx3.init()
    self.voices = self.engine.getProperty('voices')
    self.engine.setProperty('voice', self.voices[1].id)
    self.engine.setProperty('rate', 130)

    splash.destroy()
    self.root.deiconify()
    self.videoloop()
os.mkdir(clean_tables_dir)
tables_csv = [
    f for f in os.listdir(tables_dir)
    if os.path.isfile(os.path.join(tables_dir, f))
]

# Import standard Slovak vocabulary corpus and dictionary
import hunspell

normal_SK = os.path.join(working_dir, 'Dicts\\sk_SK')
english_US = os.path.join(working_dir, 'Dicts\\en_US')
special_SK = os.path.join(working_dir, 'Dicts\\sk_SK_special')

hunspell_normal = hunspell.Hunspell(normal_SK, normal_SK)
hunspell_english = hunspell.Hunspell(english_US, english_US)
# Standard Slovak extended with words harvested from contracts in this
# sector by build_special_dictionary.py
hunspell_special = hunspell.Hunspell(normal_SK, special_SK)


# Custom spellcheck that case-folds and strips the word before lookup
def spell(word):
    word = word.casefold().strip()
    return (hunspell_normal.spell(word)
            or hunspell_english.spell(word)
            or hunspell_special.spell(word))


# Import keywords and add them to the special dictionary for spellchecking
with open('keywords.txt', 'r', encoding='utf-8') as fo:
    lines = fo.readlines()
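# Quick sanity checks for spell() above (example words only): input is
# case-folded and stripped before lookup, so padding and capitals are fine.
print(spell('  Zmluva '))
print(spell('software'))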
            new_word = new_word + char
            word = True
        else:
            if word:
                words.append(new_word)
                new_word = ''
                word = False
    return words


# Import standard Slovak and English dictionaries
normal_SK = os.path.join(os.getcwd(), 'Dicts\\sk_SK')
english_US = os.path.join(os.getcwd(), 'Dicts\\en_US')
hunspell_normal = hunspell.Hunspell(normal_SK, normal_SK)
hunspell_english = hunspell.Hunspell(english_US, english_US)


def check_normal(word):
    return hunspell_normal.spell(word) or hunspell_english.spell(word)


# Find all text contracts
find_txt = re.compile('txt')
working_dir = os.getcwd() + '\\IT_contracts_text\\'
contracts = [
    f for f in os.listdir(working_dir)
    if os.path.isfile(os.path.join(working_dir, f))
]