def __init__(self, language: str, platform_folder: str, embeddings_path: str):
    """Load word embeddings and a language-appropriate Hunspell checker.

    :param language: 'pt' or 'en'; any other value falls back to Bulgarian
    :param platform_folder: str, like datasets/Websites/FakeBrCorpus/
    :param embeddings_path: str, like embeddings/pt/model.txt
    """
    self.PLATFORM_FOLDER = platform_folder
    # A '.bin' extension means word2vec binary format; the comparison is
    # already a boolean — no need for `True if ... else False`.
    binary = embeddings_path.split('.')[-1] == 'bin'
    self.embeddings = KeyedVectors.load_word2vec_format(
        embeddings_path, binary=binary, unicode_errors='ignore')
    self.LANGUAGE = language
    self.PT_BR_dic = 'Dictionaries/pt_BR/pt_BR.dic'
    self.PT_BR_aff = 'Dictionaries/pt_BR/pt_BR.aff'
    self.EN_US_dic = 'Dictionaries/en_US/en_US.dic'
    self.EN_US_aff = 'Dictionaries/en_US/en_US.aff'
    self.BG_BG_dic = 'Dictionaries/bg_BG/bg.dic'
    self.BG_BG_aff = 'Dictionaries/bg_BG/bg.aff'
    # Pick the dictionary pair once, then build a single HunSpell instance.
    if language == 'pt':
        dic, aff = self.PT_BR_dic, self.PT_BR_aff
    elif language == 'en':
        dic, aff = self.EN_US_dic, self.EN_US_aff
    else:
        dic, aff = self.BG_BG_dic, self.BG_BG_aff
    self.spell_checker = hunspell.HunSpell(dic, aff)
def test_map_4(hpk_dic_words, hpk_dic_filepath, sb0n_REP10D_MAP5_filepath,
               sb0n_MAP5_filepath):
    '''Compare suggestions with and without the REP10D rules (D = DoubleVowel).

    Example of failure:
        ama
        maa sb0n_REP10D_MAP5_suggestions ['mā', 'ama']
        maa sb0n_MAP5_suggestions ['ama', 'maua'] - Show stopper, no 'mā'
    '''
    checker_with_rep = hunspell.HunSpell(hpk_dic_filepath,
                                         sb0n_REP10D_MAP5_filepath)
    checker_plain = hunspell.HunSpell(hpk_dic_filepath, sb0n_MAP5_filepath)
    for word in hpk_dic_words:
        # Just test non-compound words, it's easier.
        if " " in word or "-" in word:
            continue
        letters = list(word)
        shuffle(letters)
        jumbled = ''.join(letters)
        if jumbled in hpk_dic_words:
            continue
        with_rep = [s.decode() for s in checker_with_rep.suggest(jumbled)]
        plain = [s.decode() for s in checker_plain.suggest(jumbled)]
        print(word)
        print(jumbled, "sb0n_REP10D_MAP5_suggestions", with_rep)
        print(jumbled, "sb0n_MAP5_suggestions", plain)
        assert sorted(with_rep) == sorted(plain)
def __init__(self, locale, dict_dir='/usr/share/myspell/dicts/', words=None):
    """Build a HunSpell checker for *locale*, or None when no dictionary exists.

    :param locale: e.g. 'en_US', 'en-US.UTF-8' — split on non-word chars
    :param dict_dir: directory containing <locale>.dic/.aff pairs
    :param words: optional path to a file of extra words, one per line
    """
    # Positional maxsplit for re.split is deprecated (Python 3.13+).
    parts = re.split(r'\W', locale, maxsplit=2)
    lang = parts[0].lower()
    try:
        co = parts[1].upper()
    except IndexError:
        co = ''
    locale = u'_'.join([lang, co])
    # path.join instead of string concatenation: also works when dict_dir
    # lacks a trailing slash (original behaviour unchanged when it has one).
    full_dic = path.join(dict_dir, locale + '.dic')
    lang_dic = path.join(dict_dir, lang + '.dic')
    # Prefer the full locale dictionary, fall back to the bare language,
    # otherwise leave the checker unset.
    if path.isfile(full_dic):
        self.hunspell = hunspell.HunSpell(full_dic,
                                          path.join(dict_dir, locale + '.aff'))
    elif path.isfile(lang_dic):
        self.hunspell = hunspell.HunSpell(lang_dic,
                                          path.join(dict_dir, lang + '.aff'))
    else:
        self.hunspell = None
    if self.hunspell and words:
        with open(words, 'r') as fp:
            lines = [l.strip() for l in fp.readlines()]
        for line in lines:
            self.hunspell.add(line)
async def on_message(message):
    """Dispatch bot commands: !suggest, !search, !build(s), !skill(s), !github, !help."""
    # we do not want the bot to reply to itself
    if message.author == client.user:
        return

    if message.content.startswith("!suggest "):
        string = clean_content(message.content, "!suggest")
        hobj = hunspell.HunSpell("dictionaries/en_US.dic", "dictionaries/en_US.aff")
        suggestions = hobj.suggest(string)
        # BUG FIX: suggest() may return an empty list — indexing [0]
        # unconditionally raised IndexError for unsuggestable input.
        if not hobj.spell(string) and suggestions:
            await message.channel.send(
                'Did you maybe mean "' + suggestions[0] + '"?'
            )
        else:
            await message.channel.send("Seems fine to me.")

    if message.content.startswith("!search "):
        string = clean_content(message.content, "!search")
        hobj = hunspell.HunSpell("dictionaries/en_US.dic", "dictionaries/en_US.aff")
        suggestions = hobj.suggest(string)
        # Same empty-suggestion guard: fall back to the literal query.
        if not hobj.spell(string) and suggestions:
            data = Search(suggestions[0])
        else:
            data = Search(string)
        await message.channel.send("", embed=data.performSearch())

    # startswith accepts a tuple of prefixes — one call instead of an `or` chain.
    if message.content.startswith(("!build ", "!builds ")):
        await message.channel.send(**BuildResponder(message).getReply())

    if message.content.startswith(("!skill ", "!skills ")):
        await message.channel.send(**SkillResponder(message).getReply())

    if message.content.startswith("!github"):
        await message.channel.send("https://github.com/rbridge/discord-divinity-bot")

    if message.content.startswith("!help"):
        await message.channel.send(**HelpResponder(message).getReply())
def test_map_3(hpk_dic_words, hpk_dic_filepath, sb0n_MAP5_REP10_filepath,
               sb0n_MAP5_filepath):
    '''MAP5 alone should suggest the same as MAP5 plus REP10.

    This passes and takes about 40 mins on the laptop, so MAP5 by itself is
    all that is needed. When this test was first set up, MAP5_REP10 had
    'REP 20' in it, which made the file misbehave — something should verify
    that the advertised REP count matches the actual number of REP lines.
    '''
    checker_combo = hunspell.HunSpell(hpk_dic_filepath, sb0n_MAP5_REP10_filepath)
    checker_map_only = hunspell.HunSpell(hpk_dic_filepath, sb0n_MAP5_filepath)
    for word in hpk_dic_words:
        # Just test non-compound words, it's easier.
        if " " in word or "-" in word:
            continue
        letters = list(word)
        shuffle(letters)
        jumbled = ''.join(letters)
        if jumbled in hpk_dic_words:
            continue
        combo = [s.decode() for s in checker_combo.suggest(jumbled)]
        map_only = [s.decode() for s in checker_map_only.suggest(jumbled)]
        print(word)
        print(jumbled, "sb0n_MAP5_REP10_suggestions", combo)
        print(jumbled, "sb0n_MAP5_suggestions", map_only)
        assert sorted(combo) == sorted(map_only)
def test_encoding(hpk_dic_filepath, test_aff_set_only_filepath,
                  test_aff_empty_filepath):
    # With an empty .aff (presumably no SET/encoding declared — see fixture
    # names), checking a macron vowel raises ValueError.
    checker_without = hunspell.HunSpell(hpk_dic_filepath, test_aff_empty_filepath)
    with pytest.raises(ValueError):
        assert checker_without.spell("ā") == True
    # With the SET-only .aff, the same lookup works.
    checker_with = hunspell.HunSpell(hpk_dic_filepath, test_aff_set_only_filepath)
    assert checker_with.spell("ā") == True
def __init__(self, lang='en'):
    """Create a spell checker for a supported language ('en', 'es', 'nl')."""
    # Resolve the dictionary pair first; construct the checker exactly once.
    if lang == 'en':
        dic, aff = '/root/hunspell/en_US.dic', '/root/hunspell/en_US.aff'
    elif lang == 'es':
        dic, aff = '/root/hunspell/es_ANY.dic', '/root/hunspell/es_ANY.aff'
    elif lang == 'nl':
        dic, aff = '/root/hunspell/nl_NL.dic', '/root/hunspell/nl_NL.aff'
    else:
        raise ValueError('Unsupported language')
    self.hobj = hunspell.HunSpell(dic, aff)
def baseline(corpus_file_name): dic_es = hunspellES.HunSpell("es.dic", "es.aff") #Get Hunspell spanish dictionary dic_eu = hunspellEU.HunSpell('eu.dic', 'eu.aff') #Get Hunspell basque dictionary CS_Corpus = open(corpus_file_name, 'rb') CS_Reader = csv.reader(CS_Corpus, delimiter=',', quotechar='"') CS_Reader.next() #Skip first line count = 0 right = 0 total = 0 y_true = [] y_pred = [] for row in CS_Reader: row_processed = getTweetTokensTags(row) hand_tagged_tags = [] for token, tag in row_processed: hand_tagged_tags.append(tag) tokens = Process.tokenize( row[2].decode('UTF-8')) # Tweet text tokenized predicted_tags = [] for i in range(0, len(tokens)): t0 = tokens[i] if i > 0 and tokens[i - 1] not in [".", "!", "?"] and t0.istitle(): predicted_tags.append("IE") elif t0.isupper(): predicted_tags.append("IE") elif dic_es.spell(t0): predicted_tags.append("ES") elif dic_eu.spell(t0): predicted_tags.append("EUS") elif Process.isURL(t0): predicted_tags.append("URL") elif Process.isID(t0): predicted_tags.append("ID") else: predicted_tags.append("EG") y_true.append(hand_tagged_tags) y_pred.append(predicted_tags) print "" print "Sequence item accuracy score: %.5f" % seqItemAccuracyScore( y_true, y_pred) print "Sequence accuracy score: %.5f" % seqAccuracyScore(y_true, y_pred) print "Global tag accuracy score: %.5f" % globalTagAccuracyScore( y_true, y_pred)
def __init__(self, extra_words=(), **kwargs):
    """Set up the spellers; *extra_words* are added (lower-cased) to the small one."""
    super(Speller, self).__init__(**kwargs)
    aff_path = os.path.join(config.hunspell_path, 'en_US.aff.utf8')
    # The large dictionary is a lazily-created class-level singleton.
    if Speller.large_hspell is None:
        Speller.large_hspell = hunspell.HunSpell(
            os.path.join(config.hunspell_path, 'en_US.dic_large.utf8'),
            aff_path)
    # Each instance gets its own small dictionary plus caller-supplied words.
    self.hspell = hunspell.HunSpell(
        os.path.join(config.hunspell_path, 'en_US.dic_sw.utf8'),
        aff_path)
    self.extra_words = set(extra_words)
    for word in self.extra_words:
        self.hspell.add(word.lower())
def test_rep_2(hpk_dic_words, hpk_dic_filepath, sb0n_filepath,
               sb0n_REP11_filepath):
    # Adding REP11 rules should not change suggestions for in-dictionary words.
    checker_plain = hunspell.HunSpell(hpk_dic_filepath, sb0n_filepath)
    checker_rep11 = hunspell.HunSpell(hpk_dic_filepath, sb0n_REP11_filepath)
    for word in hpk_dic_words:
        # Just test non-compound words, it's easier.
        if " " in word or "-" in word:
            continue
        plain = [s.decode() for s in checker_plain.suggest(word)]
        rep11 = [s.decode() for s in checker_rep11.suggest(word)]
        print(word, "sb0n_suggestions", plain)
        print(word, "sb0n_REP11_suggestions", rep11)
        assert sorted(plain) == sorted(rep11)
def brute_rot(*args):
    """Brute-force all 25 Caesar shifts of the input and rank the candidates.

    Each candidate decryption is scored by how many of its words the en_US
    hunspell dictionary accepts; results are returned as one code-fenced
    table string, best score first.

    :param args: words of the encrypted message (joined with spaces)
    :return: single-element list containing the formatted table
    """
    # args is already a sequence of strings — no comprehension needed.
    enc_string = " ".join(args)
    dec_strings = []
    for key in range(1, 26):
        chars = []
        for c in enc_string:
            if c.isalpha():
                # Shift within the correct alphabet ('a'..'z' or 'A'..'Z').
                base = 97 if c.islower() else 65
                chars.append(chr((ord(c) + key - base) % 26 + base))
            else:
                chars.append(c)
        # join() instead of repeated string += (avoids quadratic behaviour).
        dec_strings.append(''.join(chars))

    # FIXME: sometimes it ranks non-words as words
    hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                             '/usr/share/hunspell/en_US.aff')

    # Score each candidate by its count of dictionary words.
    msg_table = []
    for i, dec_string in enumerate(dec_strings):
        curr_rank = sum(1 for word in dec_string.split() if hobj.spell(word))
        msg_table.append((curr_rank, i + 1, dec_string))
    msg_table = sorted(msg_table, key=lambda x: x[0], reverse=True)

    formatted_msg_table = "rank ┃ key ┃ decrypted_string\n" + "━━━━━╋━━━━━╋━━━━━━━━━━━━━━━━━" + "\n"
    for rank, key, dec_string in msg_table:
        formatted_msg_table += f"{rank:>4} ┃ {key:>3} ┃ {dec_string}\n"

    return ["```" + formatted_msg_table + "```"]
def spell(filenames, debug=False):
    # Spell-check extracted ontology text and echo lines containing unknown
    # tokens with the misspellings highlighted in red.
    # :param filenames: files to harvest spell-check spans from
    # :param debug: when True, dump all collected misspellings and drop into
    #               the debugger at the end
    if hunspell is None:
        raise ImportError('hunspell is not installed on your system. If you want '
                          'to run `ontutils spell` please run pipenv install --dev --skip-lock. '
                          'You will need the development libs for hunspell on your system.')

    # Harvest (filename, s, p, o) spans from every file in parallel;
    # flatten the per-file result lists into one generator.
    spell_objects = (u for r in Parallel(n_jobs=9)(delayed(get_spells)(f)
                                                   for f in filenames) for u in r)
    hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic', '/usr/share/hunspell/en_US.aff')
    #nobj = hunspell.HunSpell(os.path.expanduser('~/git/domain_wordlists/neuroscience-en.dic'), '/usr/share/hunspell/en_US.aff')  # segfaults without aff :x
    collect = set()  # every misspelled token seen, for the debug dump
    for filename, s, p, o in spell_objects:
        missed = False
        no = []  # rebuilt lines of o, with misspellings wrapped in red
        for line in o.split('\n'):
            nline = []
            for tok in line.split(' '):
                # Strip surrounding punctuation before the dictionary lookup,
                # but keep it for re-assembly.
                prefix, tok, suffix = tokstrip(tok)
                #print((prefix, tok, suffix))
                if not hobj.spell(tok):# and not nobj.spell(tok):
                    missed = True
                    collect.add(tok)
                    nline.append(prefix + tc.red(tok) + suffix)
                else:
                    nline.append(prefix + tok + suffix)
            line = ' '.join(nline)
            no.append(line)
        o = '\n'.join(no)
        if missed:
            #print(filename, s, o)
            print('>>>', o)

    if debug:
        [print(_) for _ in sorted(collect)]
        breakpoint()
def __init__(self, langlist, folders):
    """
    langlist - list of the languages ("ru_RU", "en_US", etc)
    folders - folders to search for dictionaries (passed to DictsFinder)
    """
    logger.debug('Initialize HunspellWrapper spell checker')

    # Key - language (en_US, ru_RU etc),
    # value - instance of the HunSpell class
    self._checkers = {}

    # Index - number of the dictionary,
    # value - tuple: (key for self._checkers, path to .dic file)
    self._customDicts = []

    dictsFinder = DictsFinder(folders)
    for lang in langlist:
        checker = None
        for path in dictsFinder.getFoldersForLang(lang):
            dic_file = os.path.join(path, lang + '.dic')
            aff_file = os.path.join(path, lang + '.aff')
            # BUG FIX: the old else-branch called checker.add_dic() even when
            # checker was still None (AttributeError) or when dic_file did
            # not exist. Skip folders without a dictionary file instead.
            if not os.path.exists(dic_file):
                continue
            if checker is None:
                if os.path.exists(aff_file):
                    checker = hunspell.HunSpell(dic_file, aff_file)
            else:
                # Merge additional dictionaries into the existing checker.
                checker.add_dic(dic_file)
                logger.debug('Add dictionary: {}'.format(dic_file))

        if checker is not None:
            self._checkers[lang] = checker
def __init__(self, configDictionary):
    """Configure the hunspell-backed spelling module from *configDictionary*.

    Exits the process when a mandatory config key (dict/aff file) is missing
    or the sentence tokenizer cannot be loaded.
    """
    super(HunSpelling, self).__init__(configDictionary)
    self.profile = {
        "name": "hunspelling-module",
        "class": "spelling",
        "supported-languages": ["de", "en", "tr"]
    }
    self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
    self.dict_file = utils.getKeyFromSectionInConfiguration('spelling', 'spelling-dict-file', None, configDictionary)
    if self.dict_file is None:
        print('*** Missing spelling-dict-file in configuration. Exiting.')
        sys.exit(1)
    self.aff_file = utils.getKeyFromSectionInConfiguration('spelling', 'spelling-aff-file', None, configDictionary)
    # BUG FIX: this previously re-tested self.dict_file, so a missing
    # aff-file was never detected.
    if self.aff_file is None:
        print('*** Missing spelling-aff-file in configuration. Exiting.')
        sys.exit(1)
    self.add_words_file = utils.getKeyFromSectionInConfiguration('spelling', 'training-add-words-from-file', None, configDictionary)
    self.speller = hunspell.HunSpell(self.dict_file, self.aff_file)
    if self.speller is None:
        print('>>>>>> Could not create speller...')
    tokenizer_language = utils.getKeyFromSectionInConfiguration('spelling', 'tokenizer-language', 'german', configDictionary)
    try:
        self.tokenizer = nltk.data.load('tokenizers/punkt/{0}.pickle'.format(tokenizer_language))
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit still work.
        print('>>>>>> Could not load TOKENIZER language file.')
        sys.exit(1)
    if self.add_words_file is not None:
        self.train()
def stem_with_hunspell(word):
    """Print each Hunspell stem of *word* using the Nepali dictionary/rules."""
    import hunspell

    checker = hunspell.HunSpell(helper.get_nepali_dict_path(),
                                helper.get_nepali_rules_path())
    # stem() yields bytes; decode each entry before printing.
    for stem in checker.stem(word):
        print(stem.decode())
def __init__(self):
    """Wire up keyword, semantic, spell-check and index-reader helpers."""
    index_dir = "../../indices/sample_collection_jsonl/"
    self.keyword_util = KeywordSearchUtil(index_dir)
    self.semantic_util = SemanticSearchUtil()
    # System-wide US-English hunspell dictionaries.
    self.hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                  '/usr/share/hunspell/en_US.aff')
    self.index_reader = index.IndexReader(index_dir)
def __init__(self, script_path, tmx_file):
    """Initialize object"""
    self.verbose = False
    self.tmx_file = tmx_file
    self.translations = {}
    self.script_path = script_path
    parent = os.path.pardir
    self.exceptions_path = os.path.join(script_path, parent, "exceptions")
    self.errors_path = os.path.join(script_path, parent, "errors")

    # Set up spellcheckers: load the Italian hunspell dictionary and merge
    # in the Mozilla QA specialized word list.
    dictionaries = os.path.join(self.script_path, parent, "dictionaries")
    self.spellchecker = hunspell.HunSpell(
        os.path.join(dictionaries, "it_IT.dic"),
        os.path.join(dictionaries, "it_IT.aff"),
    )
    self.spellchecker.add_dic(
        os.path.join(dictionaries, "mozilla_qa_specialized.dic"))

    # Extract strings, then run the checks.
    self.extractStrings()
    self.checkQuotes()
    self.checkSpelling()
def spellCorrect(self, origTweet=True):
    '''
    Parameters: origTweet - when True, work on a copy of the tokenized
                tweet (self.tok); otherwise work on self.sen in place
    Description: Correct spelling mistakes using the hunspell engine.
                 Tokens containing punctuation and the "**NAME**"
                 placeholder are left untouched.
    Returns: tokenized tweet (also stored in self.sen)
    '''
    if origTweet:
        line = copy.copy(self.tok)
    else:
        line = self.sen
    # pass it through a spell checker - hunspell
    hspell = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                               '/usr/share/hunspell/en_US.aff')
    for i in range(len(line)):
        # Skip tokens that contain any punctuation character.
        has_punc = any(c in line[i] for c in self.punc)
        if has_punc or line[i] == "**NAME**" or hspell.spell(line[i]):
            continue
        suggestions = hspell.suggest(line[i])
        # BUG FIX: suggest() can return an empty list; indexing [0]
        # unconditionally raised IndexError. Leave the token unchanged then.
        if suggestions:
            line[i] = suggestions[0]
    self.sen = line
    return line
def __init__(self, server_address, RequestHandlerClass, settings, bind_and_activate=True):
    """Constructor. May be extended, do not override.

    settings is expected to provide 'log', 'key', 'cert' and 'allow_ip'
    entries; the server socket is wrapped in SSL before binding.
    """
    self.log_path = settings['log']
    self.key_file = settings['key']
    self.cert_file = settings['cert']
    self.allow_ip = IPRange(settings['allow_ip'])
    self.spellchecker = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                          '/usr/share/hunspell/en_US.aff')
    # bind_and_activate=False here: binding is deferred until after the
    # socket has been wrapped in SSL below.
    SocketServer.TCPServer.__init__(self, server_address, RequestHandlerClass, False)

    # initialize SSL connection
    # NOTE(review): ssl.wrap_socket() is deprecated (removed in Python 3.12)
    # and PROTOCOL_TLSv1 is obsolete/insecure; consider migrating to
    # ssl.SSLContext.wrap_socket with a modern protocol — confirm client
    # compatibility first. (SocketServer is the Python 2 module name.)
    self.socket = ssl.wrap_socket(self.socket,
                                  keyfile=self.key_file,
                                  certfile=self.cert_file,
                                  cert_reqs=ssl.CERT_NONE,
                                  ssl_version=ssl.PROTOCOL_TLSv1,
                                  server_side=True)

    # start serving
    if bind_and_activate:
        self.server_bind()
        self.server_activate()
def search_word_in_dict(word: str, dict: str, morphology: bool = True):
    """Look up *word* in an MDX dictionary, optionally trying hunspell stems.

    NOTE(review): the parameter name `dict` shadows the builtin; renaming it
    would break keyword-argument callers, so it is only flagged here.

    :param word: the word to look up (surrounding spaces/newlines stripped)
    :param dict: path to the MDX dictionary file (passed to IndexBuilder)
    :param morphology: when True, also try the en_US hunspell stems of *word*
    :return: (matched_word, first_meaning) or (word, None) when not found
    """
    global logger
    word = word.strip(' \n')
    words = [word]
    if morphology:
        hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                 '/usr/share/hunspell/en_US.aff')
        # Only substitute stems when the word itself is well-spelled and
        # hunspell produces at least one stem (stem() returns bytes).
        if hobj.spell(word) and hobj.stem(word):
            words = [b.decode() for b in hobj.stem(word)]
            logger.debug('Get stems: {}.'.format(', '.join(words)))
    builder = IndexBuilder(dict)
    builder.check_build()
    # Return the first candidate (original word or one of its stems) that
    # has at least one meaning in the dictionary.
    for w in words:
        meanings = builder.mdx_lookup(w, ignorecase=True)
        if not meanings:
            continue
        logger.debug('Find {} meanings of word {} from dictionary {}.'.format(
            len(meanings), w, dict))
        if w != word:
            word = w
        return word, meanings[0]
    logger.debug('Cannot find word {} from dictionary {}.'.format(word, dict))
    return word, None
def processSpellcheck(doc, directory, spellCheckDict):
    # Spell-check a markdown document in place, wrapping each misspelled
    # word in ??...?? markers. NOTE(review): Python 2 code (print statements).
    # :param doc: document basename (without the .md extension)
    # :param directory: directory containing the .md file
    # :param spellCheckDict: hunspell dictionary name, e.g. 'en_US'

    # do spell check
    try:
        hobj = hunspell.HunSpell('/usr/share/hunspell/%s.dic' % spellCheckDict,
                                 '/usr/share/hunspell/%s.aff' % spellCheckDict)
    except:
        print "Error: Dictionary %r not found." % spellCheckDict
        return

    # check wiki for custom dictionary
    update_custom_dictionary(hobj)  # ugly to call it for each document :-/

    file = os.path.join(directory, doc) + ".md"
    print "Doing spell check on %r" % file

    # read md file
    lines = list()
    out_lines = list()
    with open(file, 'r') as f:
        lines = f.readlines()

    for line in lines:
        words = line.strip("\n").split(" ")
        out_words = list()
        for w in words:
            # generate a version of the word that does not contain any non-alpha characters
            w_striped = w.strip(".:;,!?'_-")
            if w_striped.isalpha():
                if not hobj.spell(w_striped):
                    print "Spelling mistake: %r" % w_striped
                    # mark mistake! we need to stick to simple ASCII chars, things like <strike> could break the latex document if they occur, e.g., in headings
                    w = w.replace(w_striped, "??%s??" % w_striped)
            out_words.append(w)
        out_lines.append(" ".join(out_words))

    # write back
    with open(file, 'w') as f:
        f.write("\n".join(out_lines))
def test_compound_suggestion(hpk_dic_words, hpk_dic_filepath,
                             baseline_aff_filepath):
    # A compound (space-separated) entry should be suggested for its
    # dash-separated misspelling.
    hobj = hunspell.HunSpell(hpk_dic_filepath, baseline_aff_filepath)
    for word in hpk_dic_words:
        if " " not in word:
            continue
        dashed = word.replace(" ", "-")
        suggestions = [s.decode() for s in hobj.suggest(dashed)]
        print(word, dashed, suggestions)
        assert word in suggestions
def __init__(self):
    """Load the NLP stack: CoreNLP client, spaCy, vocab, speller, lemmatizer."""
    self.stanfordCoreNLP = StanfordCoreNLP('http://localhost', port=9500)
    self.spacyNLP = spacy.load('en_core_web_sm')
    # Info-question vocabulary shipped next to this module.
    vocab_file = join(FILE_PATH, "infVocab.json")
    with open(vocab_file, "r") as f:
        self.infoVocab = json.load(f)
    self.spellchecker = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                          '/usr/share/hunspell/en_US.aff')
    self.lemmatizer = WordNetLemmatizer()
def init_spellchecker(corpus):
    """
    Initializes the spell checker.
    The corpus name selects the spell-checker language; corpora without a
    configured language get no checker.
    Returns the initialized spell checker, or None.
    """
    if corpus in ("conll03_en", "ontonotes"):
        return hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                 '/usr/share/hunspell/en_US.aff')
    if corpus in ("conll03_de", "germeval"):
        return hunspell.HunSpell('/usr/share/hunspell/de_DE.dic',
                                 '/usr/share/hunspell/de_DE.aff')
    return None
def __init__(self,
             dic_path="/Users/ulisteinbach/Desktop/SS18/software_projekt/softwareprojekt/autosarkasmus/rsrc/hunspell/de_DE.dic",
             aff_path="/Users/ulisteinbach/Desktop/SS18/software_projekt/softwareprojekt/autosarkasmus/rsrc/hunspell/de_DE.aff"):
    """Normalizer: restores original/formal spelling of misspelled/colloquial
    tokens and replaces twitter-specific phenomena for tagging.

    Normalization requires setup of a dictionary of unigram/bigram
    occurrences (to make the most probable correction):
    - first, run collect_bigrams() on every tweet
    - then, the normalize() function can be used
    For detailed usage, see tests/test_normalizer.

    :param dic_path: path to the German hunspell .dic file (the defaults
        keep the previously hard-coded machine-specific paths, so existing
        callers are unaffected; new callers can point elsewhere)
    :param aff_path: path to the matching hunspell .aff file
    """
    self.dictionary = hunspell.HunSpell(dic_path, aff_path)
    # add-1 smoothed frequency tables.
    self.bigrams = defaultdict(lambda: 1.0)
    self.unigrams = defaultdict(lambda: 1.0)
    self.emoji_pos = [
        u"\U0001F601", u"\U0001F602", u"\U0001F603", u"\U0001F604",
        u"\U0001F605", u"\U0001F606", u"\U0001F607", u"\U0001F608",
        u"\U0001F609", u"\U0001F60A", u"\U0001F60B", u"\U0001F60C",
        u"\U0001F60D", u"\U0001F60E", u"\U0001F60F", u"\U0001F638",
        u"\U0001F639", u"\U0001F63A", u"\U0001F63B", u"\u263a",
        u"\U0001f61d", u"\u2600", u"\U0001f44d", u"\u2665"
    ]
    self.emoji_pos += [
        u"\U0001f44c", u"\U0001f389", u"\U0001f49c", u"\U0001f499",
        u"\U0001f49b", u"\u2661", u"\U0001f497", u"\U0001f61c"
    ]
    self.emoji_pos += [u"\U0001f498"]
    self.emoji_neg = [
        u"\U0001F612", u"\U0001F61E", u"\U0001F61F", u"\U0001F620",
        u"\U0001F621", u"\U0001F622", u"\U0001F623", u"\U0001F625",
        u"\U0001F627", u"\U0001F628", u"\U0001F62D", u"\U0001F63E",
        u"\U0001F63F", u"\U0001f614", u"\U0001f44e", u"\U0001f616",
        u"\u2639", u"\U0001f494"
    ]
    self.emoji_neg += [u"\U0001f613", u"\U0001f645", u"\U0001f630"]
    # Emoji with no clear polarity.
    self.emoji = [
        u"\U0001f3c3", u"\ue00e", u"\u2614", u"\U0001f4a8", u"\U0001f4a6",
        u"\u2601", u"\U0001f4b0", u"\U0001f341", u"\U0001f631",
        u"\U0001f4a4", u"\U0001f637", u"\U0001f436"
    ]
    self.emoji += [
        u"\U0001f64f", u"\U0001f4fa", u"\u270b", u"\U0001f633",
        u"\U0001f366", u"\U0001f632", u"\U0001f44f", u"\U0001f44a",
        u"\U0001f4a2", u"\U0001f497", u"\U0001f631"
    ]
    self.emoji += [
        u"\U0001f37a", u"\U0001f37b", u"\U0001f52b", u"\U0001f378",
        u"\U0001f48a", u"\U0001f483", u"\U0001f487", u"\U0001f4aa",
        u"\U0001f41f"
    ]
    # Placeholder tags and punctuation that must never be "corrected".
    self.special_tags = [
        "%HASHTAG%", "%MENTION%", "%SMILEYPOS%", "%SMILEYNEG%", "%SMILEY%",
        "%URL%", ",", ".", "!", "?", ":", ";", "-", "+++", "–", "\"", "|"
    ]
    self.tokens = 0
    self._cache_spelling = {}
def test_break_0(hpk_dic_filepath, test_aff_break_0_only_filepath):
    # This shows the importance of setting BREAK 0 in the .aff file:
    # with BREAK 0, hyphen-joined 'new' words are marked WRONG (desired
    # behaviour) — here "awa-kai" and "kai-awa".
    hobj = hunspell.HunSpell(hpk_dic_filepath, test_aff_break_0_only_filepath)
    for known in ("awa", "kai"):
        assert hobj.spell(known) == True
    for joined in ("awa-kai", "kai-awa"):
        assert hobj.spell(joined) == False
def test_break_default(hpk_dic_filepath, test_aff_empty_filepath):
    # Counterpart of test_break_0: with the default BREAK behaviour,
    # hyphen-joined 'new' words ("awa-kai", "kai-awa") are accepted.
    hobj = hunspell.HunSpell(hpk_dic_filepath, test_aff_empty_filepath)
    for known in ("awa", "kai"):
        assert hobj.spell(known) == True
    for joined in ("awa-kai", "kai-awa"):
        assert hobj.spell(joined) == True
def hunspellTest():
    """Smoke-test the hunspell bindings against the system en_US dictionary."""
    import hunspell

    checker = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                '/usr/share/hunspell/en_US.aff')
    print(checker.spell("?"))
    checker.add("Getter")
    print(checker.spell("Getter-"))  # True
    print(checker.suggest("Private"))
def __init__(self, dic_file='/usr/share/hunspell/hu_HU.dic',
             aff_file='/usr/share/hunspell/hu_HU.aff', task='dstem',
             source_fields=None, target_fields=None):
    """
    The initialisation of the module. One can extend the list of parameters
    as needed. The mandatory fields which should be set by keywords are:
    :param task: the task to specialise the current instance
        ('spell', 'stem', 'analyze' or 'dstem')
    :param source_fields: the set of names of the input fields
    :param target_fields: the list of names of the output fields in generation order
    """
    # TODO: Heroku workaround for issue https://github.com/heroku/heroku-buildpack-apt/issues/35
    import os.path
    if not os.path.exists(dic_file):
        dic_file = '/app/.apt/usr/share/hunspell/hu_HU.dic'
        aff_file = '/app/.apt/usr/share/hunspell/hu_HU.aff'
    if not os.path.exists(dic_file):
        # TODO: Alpine Linux workaround
        dicts_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'dicts')
        dic_file = os.path.join(dicts_dir, 'hu_HU.dic')
        aff_file = os.path.join(dicts_dir, 'hu_HU.aff')

    # Specialise the class for eg. stemming or detailed output...
    available_tasks = {
        'spell': self._do_spell,
        'stem': self._do_stem,
        'analyze': self._do_analyze,
        'dstem': self._do_dstem
    }
    # Direct dict lookup replaces the old linear scan over .items().
    if task not in available_tasks:
        raise ValueError(
            'No proper task is specified. The available tasks are {0}'.
            format(' or '.join(available_tasks.keys())))
    self.process_token = available_tasks[task]

    # Field names for xtsv (the code below is mandatory for an xtsv module)
    if source_fields is None:
        source_fields = set()
    if target_fields is None:
        target_fields = []
    self.source_fields = source_fields
    self.target_fields = target_fields

    self.h = hunspell.HunSpell(dic_file, aff_file)
    self._added_words = set()
    self._removed_words = set()
    self._added_words_w_affix = {}
def hunSpellcheck(language_country, word):
    """Spell-check *word*, lazily caching one HunSpell per locale."""
    import hunspell

    checker = hunSpell_factory.get(language_country)
    if checker is None:
        # First request for this locale: build the checker and cache it.
        checker = hunspell.HunSpell('%s%s.dic' % (DICT_PATH, language_country),
                                    '%s%s.aff' % (DICT_PATH, language_country))
        hunSpell_factory[language_country] = checker
    return checker.spell(word)