def test_basic_case_(self):
    # Cross-check the CaseFilter wrapper against calling the underlying
    # RemovePluralSuffix.removeSuffix() directly on a few sample Tamil words.
    obj = RemovePluralSuffix()
    objf = CaseFilter(obj)
    # NOTE(review): 'expected' is zipped into the loop but never asserted
    # against -- only the two call paths are compared to each other.
    # Confirm whether an assertEqual(trunc_word, x) was intended.
    expected = [u"பதிவி", u"கட்டளை", u"அவர்"]
    words_list = [u"பதிவில்", u"கட்டளைகள்", u"அவர்கள்"]
    for w, x in zip(words_list, expected):
        # removeSuffix returns an indexable result; element [0] is the
        # truncated word (see the assertion below).
        rval = obj.removeSuffix(w)
        trunc_word = objf.apply(w)
        self.assertEqual(trunc_word, rval[0])
    return
def test_basic_case_(self):
    """Check that CaseFilter(RemovePluralSuffix()) matches removeSuffix()[0]."""
    plural_remover = RemovePluralSuffix()
    wrapped_filter = CaseFilter(plural_remover)
    expected = [u"பதிவி", u"கட்டளை", u"அவர்"]
    words_list = [u"பதிவில்", u"கட்டளைகள்", u"அவர்கள்"]
    for sample_word, _unused_expected in zip(words_list, expected):
        direct_result = plural_remover.removeSuffix(sample_word)
        filtered_word = wrapped_filter.apply(sample_word)
        # The filter output must agree with the first element of the
        # direct suffix-removal result.
        self.assertEqual(filtered_word, direct_result[0])
    return
def __init__(self, filename=None, lang="ta", mode="non-web"):
    """Initialize speller state; outside web mode, immediately run a session.

    With no filename an interactive prompt is started; with a filename the
    file is spell-checked right away.
    """
    object.__init__(self)
    self.lang = lang.lower()
    self.filename = filename
    self.user_dict = set()
    # Chain of suffix/prefix strippers used to normalize words before lookup.
    self.case_filter = CaseFilter(RemovePluralSuffix(), RemoveVerbSuffixTense(),
                                  RemoveCaseSuffix(), RemovePrefix())
    # English mode gets an explicit alphabet for suggestion generation.
    self.alphabets = list(string.ascii_lowercase) if self.lang == u"en" else None
    if mode == "web":
        return
    if self.filename:
        self.spellcheck(self.filename)
    else:
        self.interactive()
def __init__(self, filename=None, lang="ta", mode="non-web"):
    """Set up the speller; in non-web mode, launch the appropriate driver.

    Interactive prompt when no filename was supplied, otherwise batch
    spell-check of the named file.
    """
    object.__init__(self)
    self.lang = lang.lower()
    self.filename = filename
    self.user_dict = set()
    # Normalization pipeline applied before dictionary lookups.
    self.case_filter = CaseFilter(
        RemovePluralSuffix(),
        RemoveVerbSuffixTense(),
        RemoveCaseSuffix(),
        RemovePrefix(),
    )
    if self.lang == u"en":
        self.alphabets = [letter for letter in string.ascii_lowercase]
    else:
        self.alphabets = None
    # Web mode defers any driver to the caller.
    if mode == "web":
        return
    if not self.filename:
        self.interactive()
    else:
        self.spellcheck(self.filename)
class Speller(object):
    """Tamil (default) / English spell checker.

    Wraps lazily-built, lock-guarded class-level dictionaries and offers an
    interactive console driver, a batch file driver, and a REST-style
    (TinyMCE-format) interface.  Suggestions are generated from greedy word
    splitting, deletion/ottru filters, norvig-style edits, combinagrams,
    prefix matches and mayangoli alternates, then scored and de-duplicated.
    """

    # Class-level dictionary caches, built lazily under LoadDictionary.lock.
    TVU_dict = None
    ENL_dict = None
    # Punctuation stripped from word edges in check_word_and_suggest.
    punctuation = string.punctuation + '()[]{}'

    def __init__(self, filename=None, lang="ta", mode="non-web"):
        # In non-web mode the constructor itself drives a full session:
        # interactive prompt without a filename, batch check with one.
        object.__init__(self)
        self.lang = lang.lower()
        self.filename = filename
        self.user_dict = set()
        # Suffix/prefix strippers applied before dictionary lookup.
        self.case_filter = CaseFilter(RemovePluralSuffix(),
                                      RemoveVerbSuffixTense(),
                                      RemoveCaseSuffix(), RemovePrefix())
        if not self.in_tamil_mode():
            self.alphabets = [a for a in string.ascii_lowercase]
        else:
            self.alphabets = None
        if mode == "web":
            return
        if not self.filename:
            self.interactive()
        else:
            self.spellcheck(self.filename)

    def in_tamil_mode(self):
        # Anything other than explicit English is treated as Tamil.
        return self.lang != u"en"

    @staticmethod
    def get_dictionary():
        # Lazily build and cache the Tamil (TamilVU) dictionary, guarded by
        # a shared lock so concurrent callers build it only once.
        LoadDictionary.lock.acquire()
        if not Speller.TVU_dict:
            Speller.TVU_dict, _ = DictionaryBuilder.create(TamilVU)
        LoadDictionary.lock.release()
        return Speller.TVU_dict

    @staticmethod
    def get_english_dictionary():
        # Same lazy/locked pattern for the English (EnglishLinux) dictionary.
        LoadDictionary.lock.acquire()
        if not Speller.ENL_dict:
            Speller.ENL_dict, _ = DictionaryBuilder.create(EnglishLinux)
        LoadDictionary.lock.release()
        return Speller.ENL_dict

    def language(self):
        # Human-readable language name for messages.
        if self.in_tamil_mode():
            return "tamil"
        return "english"

    def checklang(self, word):
        # True when every letter of 'word' belongs to the active language.
        if self.in_tamil_mode():
            return tamil.utf8.all_tamil(word)
        for w in word.lower():
            if not (w in string.ascii_lowercase):
                return False
        return True

    # full-text interface driver for unittest @ Dec 10, 2017
    def noninteractive_spellcheck(self, text):
        # Spell-check whitespace-separated words of 'text' and summarize
        # pass/fail counts plus suggestions for the failing words.
        nwords = 0
        npass = 0
        nfail = 0
        fail_n_suggs = dict()
        for word in re.split('\s+', text):
            if len(word) < 1:
                continue
            nwords += 1
            result, suggs = self.REST_interface(word)
            nfail += int(not result)
            npass += int(result)
            if not result:
                fail_n_suggs[word] = suggs
        obj = {
            'total': nwords,
            'correct_words': npass,
            'wrong_words': nfail,
            'word_suggestions': fail_n_suggs
        }
        return obj

    # Ref: https://www.tinymce.com/docs/plugins/spellchecker/
    def REST_interface(self, word):
        # returns JSON data in TinyMCE format: (ok, {}) for a correct word,
        # (ok, suggestions) otherwise.
        ok, suggs = self.check_word_and_suggest(word)
        if _DEBUG:
            print("REST => %d" % ok)
            pprint.pprint(suggs)
        if ok:
            return ok, {}
        return ok, suggs

    @staticmethod
    def dice_comparison(ref_word, word):
        """ use this class method for SORTED"""
        # cmp-style comparator: 0 for an exact Dice match, +1/-1 otherwise
        # depending on whether the coefficient exceeds 0.5.
        val = Dice_coeff(ref_word, word)
        if (val == 1):
            return 0
        return (2 * (val - 0.5) > 0) and 1 or -1

    def suggestion_policy(self, word, suggs):
        # pick suggestions that are only +/- 2 letter length different
        filter_suggs = []
        tamil_length = lambda w: len(tamil.utf8.get_letters(w))
        ref_wl = tamil_length(word)
        accept_min_max = [max(ref_wl - 2, 1), ref_wl + 1]
        filter_suggs = filter(
            lambda w: tamil_length(w) >= accept_min_max[0] and len(w) <= accept_min_max[1],
            suggs)
        # sort the suggestions by Dice coefficient
        filter_suggs = set(filter_suggs)
        if len(filter_suggs) == 0:
            # guess! fall back to all suggestions, lexicographically sorted,
            # capped at roughly 10 entries.
            # NOTE(review): sorted(..., cmp=...) is Python-2 only syntax.
            filter_suggs = suggs
            filter_suggs = sorted(filter_suggs,
                                  cmp=tamil.utf8.compare_words_lexicographic)
            filter_suggs[min(10, len(filter_suggs) - 1):] = []
            return filter_suggs
        filter_suggs = sorted(filter_suggs, cmp=Speller.dice_comparison)
        return filter_suggs

    def str_suggestions(self, word):
        # Localized "suggestions for <word>" heading.
        if self.in_tamil_mode():
            return u"சொல் \"%s\" மாற்றங்கள்" % word
        return u"SUGGESTIONS for \"%s\"" % word

    def mayangoli_suggestions(self, word, letters):
        # Alternate spellings from the Mayangoli (confusable letters) pass,
        # excluding the word itself.
        alternates = Mayangoli.run(word, letters)
        alternates = filter(lambda w: w != word, alternates)
        if _DEBUG:
            for idx, w in enumerate(alternates):
                pprint.pprint(["Myangoli", idx, w])
        return copy.copy(alternates)

    def interactive(self):
        # Read-check-suggest loop on stdin; Ctrl-C / EOF exit politely.
        try:
            while (True):
                if PYTHON3:
                    word = input(u">> ")
                else:
                    word = raw_input(u">> ")
                    word = word.decode("utf-8").strip()
                word = re.sub(u"\s+", "", word)
                # skip empty words
                if len(word) < 1:
                    continue
                if not self.checklang(word):
                    print(u"EXCEPTION \"%s\" is not a %s Word" % (word, self.language()))
                    continue
                ok, suggs = self.check_word_and_suggest(word)
                suggs = self.suggestion_policy(word, suggs)
                if not ok:
                    # Print numbered suggestions, 4 per row.
                    words_per_row = 4
                    option_str = u", ".join([
                        u"(%d) %s" % (itr, wrd) +
                        ((itr > 0 and itr % words_per_row == 0) and u"\n" or u"")
                        for itr, wrd in enumerate(suggs)
                    ])
                    print(u"%s\n\t %s" % (self.str_suggestions(word), option_str))
                else:
                    print(self.in_tamil_mode() and u"சரி" or u"OK")
        except KeyboardInterrupt as ke:
            pass
        except EOFError as eof:
            pass
        finally:
            print(self.in_tamil_mode() and u"\nவணக்கம்!" or "\nBYE!")
        return

    def spellcheck(self, filename):
        # Batch driver: check each word of the file, prompting the user for
        # a replacement choice on every misspelling, then print the cleaned
        # document.
        new_document = []
        data = codecs.open(filename, u"r", u"utf-8")
        lines = data.readlines()
        for line in lines:
            words = tamil.utf8.get_words(tamil.utf8.get_letters(line))
            for word in words:
                # FIXME : handle punctuation
                #word = filter( tamil.utf8.is_tamil_unicode_predicate, word )
                ok, suggs = self.check_word_and_suggest(word)
                if PYTHON3 and not ok:
                    # suggestions may be a lazy filter object on Py3
                    suggs = list(suggs)
                if not ok:
                    option = suggs[0]
                    # take user input.
                    # FIXME: User options to include DONTREPLACE/KEEP, DELETE WORD, etc.
                    option_str = u", ".join([
                        u"(%d) %s" % (itr, wrd)
                        for itr, wrd in enumerate(suggs)
                    ])
                    if self.in_tamil_mode():
                        print(u"வரி \"%s\"" % line.strip())
                        print(
                            u"'%s' சொல்லை கொண்டு\n\t சொல்லை '%s' மாற்றிடு\n"
                            % (option_str, word))
                    else:
                        print(u"Line, \"%s\"" % line.strip())
                        print(u" Replace word %s with\n\t => %s\n" %
                              (word, option_str))
                    try:
                        if self.in_tamil_mode():
                            choice_str = "விருப்பம் [-1 புறக்கணி, 0-%d மாற்றவும்]:"
                        else:
                            choice_str = u"option [-1 ignore, 0-%d replace]: "
                        choice = input(choice_str % (len(suggs) - 1))
                        if PYTHON3:
                            choice = int(choice)
                        if choice == -1:
                            # -1 keeps the word and remembers it in the
                            # per-session user dictionary.
                            if self.in_tamil_mode():
                                print(u"வார்த்தை மாறாத இருந்தது")
                            else:
                                print(u"Not replacing word")
                            option = word
                            self.user_dict.add(word)
                        else:
                            option = suggs[choice]
                    except Exception as ie:
                        print(str(ie))
                    if self.in_tamil_mode():
                        replace_msg = u"வார்த்தை %s -> %s இதற்காக மாற்றவும்\n"
                    else:
                        replace_msg = u" replacing word %s -> %s\n"
                    print(replace_msg % (word, option))
                    new_document.append(unicode(option))
                else:
                    new_document.append(word)
            new_document.append(u"\n")
        if self.in_tamil_mode():
            print(
                u"*********** ஆவணத்தில் உள்ள பிழைகளை திருத்திய பின் *********")
        else:
            print(u"*********** cleaned up document **********")
        print(u" ".join(new_document))

    def get_lang_dictionary(self):
        # Dictionary matching the active language mode.
        if not self.in_tamil_mode():
            return Speller.get_english_dictionary()
        return Speller.get_dictionary()

    def isWord(self, word):
        # Plain old dictionary checks (language dictionary OR the session
        # user dictionary).
        LANG_dict = self.get_lang_dictionary()
        is_dict_word = LANG_dict.isWord(word)
        in_user_dict = word in self.user_dict or is_dict_word
        return in_user_dict

    def add_numeral_words(self, lexicon):
        # Seed the lexicon with Tamil numeral words (units..trillions) so
        # spelled-out numbers pass the check; no-op for English mode or when
        # already seeded (tril[0] present acts as the sentinel).
        if not self.in_tamil_mode():
            return
        units = (u'பூஜ்ஜியம்', u'ஒன்று', u'இரண்டு', u'மூன்று', u'நான்கு',
                 u'ஐந்து', u'ஆறு', u'ஏழு', u'எட்டு', u'ஒன்பது', u'பத்து')  # 0-10
        teens = (u'பதினொன்று', u' பனிரண்டு', u'பதிமூன்று', u'பதினான்கு',
                 u'பதினைந்து', u'பதினாறு', u'பதினேழு', u'பதினெட்டு',
                 u'பத்தொன்பது')  # 11-19
        tens = (u'பத்து', u'இருபது', u'முப்பது', u'நாற்பது', u'ஐம்பது',
                u'அறுபது', u'எழுபது', u'எண்பது', u'தொன்னூறு')  # 10-90
        tens_suffix = (u'இருபத்து', u'முப்பத்து', u'நாற்பத்து', u'ஐம்பத்து',
                       u'அறுபத்து', u'எழுபத்து', u'எண்பத்து',
                       u'தொன்னூத்து')  # 10+-90+
        hundreds = (u'நூறு', u'இருநூறு', u'முந்நூறு', u'நாநூறு', u'ஐநூறு',
                    u'அறுநூறு', u'எழுநூறு', u'எண்ணூறு',
                    u'தொள்ளாயிரம்')  #100 - 900
        hundreds_suffix = (u'நூற்றி', u'இருநூற்றி', u'முந்நூற்று', u'நாநூற்று',
                           u'ஐநூற்று', u'அறுநூற்று', u'எழுநூற்று',
                           u'எண்ணூற்று', u'தொள்ளாயிரத்து')  #100+ - 900+
        one_thousand_prefix = (u'ஓர்', )
        thousands = (u'ஆயிரம்', u'ஆயிரத்தி')
        one_prefix = (u'ஒரு', )
        lakh = (u'இலட்சம்', u'இலட்சத்து')
        crore = (u'கோடி', u'கோடியே')
        mil = (u'மில்லியன்', )
        bil = (u'பில்லியன்', )
        tril = (u'டிரில்லியன்', )
        if lexicon.isWord(tril[0]):
            return
        numerals = list()
        for wordset in [
                units, tens, teens, tens_suffix, hundreds, hundreds_suffix,
                one_thousand_prefix, thousands, one_prefix, lakh, crore, mil,
                bil, tril
        ]:
            numerals.extend(wordset)
        #with codecs.open("numerals.json","w","utf-8") as fp:
        #    fp.write(json.dumps(numerals))
        for word in numerals:
            lexicon.add(word)

    @staticmethod
    def scrub_ws(word):
        # Drop whitespace and bracket characters from 'word'.
        return re.sub(u'[\s{}()\[\]]+', u'', word)

    def check_word_and_suggest(self, word, errmsg=None):
        # Core routine: returns (True, word) for an accepted word, or
        # (False, suggestions) with a scored/deduplicated suggestion list.
        word = word.strip()
        # skip known punctuation at end of line
        while len(word) >= 1 and any(map(word.endswith, Speller.punctuation)):
            word = word[:-1]
        while len(word) >= 1 and any(map(word.startswith, string.whitespace)):
            word = word[1:]
        # is number then we propose a numeral (Tamil mode only)
        if self.in_tamil_mode():
            numword = word.replace(u',', u'')
            if re.match(u'[+|-]*[\d]+', numword):
                try:
                    num = float(numword)
                    posnum = num
                    if num < 0:
                        posnum = -1 * num
                    numeral_form = tamil.numeral.num2tamilstr(posnum)
                    if num < 0:
                        numeral_form = u"கழித்தல் " + numeral_form
                    return (False, [numeral_form])
                except Exception as ioe:
                    pass
            # dates are okay
            if any(
                    map(word.endswith,
                        [u"-இல்", u"-ஆம்", u"-இலிருந்து", u"-வரை"])):
                if re.search('^\d+', word):
                    return (True, [word])  #word is okay
        # check if words are transliterated
        if any(
                filter(lambda x: x in string.ascii_letters,
                       tamil.utf8.get_letters(word))):
            # letter-sequence only
            en_word = Speller.scrub_ws(word)
            EN_Lexicon = Speller.get_english_dictionary()
            if EN_Lexicon.isWord(en_word):
                return (
                    False, ['']
                )  #English word - nosub- yet until we have parallel dictionaries or translation. TBD.
            #is english letter: propose the Tamil transliteration instead
            ta = algorithm.Iterative.transliterate(
                jaffna.Transliteration.table, en_word)
            # TBD: potential for having ANN to tell if english text is pure English word
            # or a romanized Tamil word. Output of classifier can be useful here.
            return (False, [ta])
        # check if it matches Tamil numeral and has close match.
        # propose suggestions from that list.
        # TBD
        # hyphens are not okay
        if word.find(u"-") >= 0:
            return (False, [word.replace(u"-", u" ")])  #re.sub(u"^w"," ",word))
        # replace other spurious ()[] punctuations by concatenation
        #word = u"".join(filter(lambda l: not( l in Speller.punctuation), tamil.utf8.get_letters(word)))
        orig_word = u"%s" % word
        # remove digits
        word = re.sub(u'\d+', u'', word)
        letters = tamil.utf8.get_letters(word)
        TVU_dict = self.get_lang_dictionary()
        self.add_numeral_words(TVU_dict)
        # Check if this 'word' is any common kind of error; when the caller
        # passed an errmsg list, tag it.
        if Typographical.checkFormErrors(word, errmsg):
            if errmsg:
                errmsg.append("TypographicalError")
        if not self.checklang(word):
            print("Word is not in desired language!")
            return (False, [u""])
        if len(word) < 1:
            print("Word is too small")
            return (False, [u''])
        # plain old dictionary + user dictionary check
        if self.isWord(word):
            return (True, word)
        # Remove case and redo the dictionary + user check
        word_nocase = self.case_filter.apply(word)
        if (self.isWord(word_nocase)):
            return (True, word_nocase)
        else:
            word = word_nocase
        # Consider splitting the word and see if it has 2 sub-words
        # e.g. செயல்பட => செயல் + பட
        alt = tamil.wordutils.greedy_split(word, TVU_dict)
        greedy_results = list()
        if len(alt) >= 1:
            greedy_results = [u" ".join(alt), u"-".join(alt)]
            greedy_results.extend(alt)
            #return (False, greedy_results )
        # if there are no other suggestions than deletion filter, we return
        # in presence of other suggestions we can just return suggestions
        suggs = DeletionFilter.get_suggestions(letters, TVU_dict)
        if len(suggs) > 0:
            if len(greedy_results) == 0:
                return (False, suggs)
            else:
                greedy_results.extend(suggs)
        # ottru splitting for Tamil language mode
        ottru_options = []
        if self.in_tamil_mode():
            # discover words like யாரிகழ்ந்து are accepted.
            ottru = OttruSplit(word, letters)
            ottru.run(TVU_dict)
            if len(ottru.results) > 0:
                return (True, word)
            ottru_options = ottru.results
        # TODO: Noun Declension - ticket-
        # suggestions at edit distance 1
        norvig_suggests = filter(
            TVU_dict.isWord, norvig_suggestor(word, self.alphabets, 2,
                                              limit=25))
        combinagram_suggests = list(
            tamil.wordutils.combinagrams(word, TVU_dict, limit=25))
        pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))
        # FIXME: score the options
        options = greedy_results
        options.extend(ottru_options)
        options.extend(list(norvig_suggests))
        options.extend(combinagram_suggests)
        options.extend(pfx_options)
        # filter the options against a dictionary!
        options = filter(TVU_dict.isWord, options)
        if PYTHON3:
            options = list(options)
        if self.in_tamil_mode():
            options.extend(self.mayangoli_suggestions(orig_word, letters))
        # sort the options
        if not self.in_tamil_mode():
            options.sort()
        else:
            if PYTHON3:
                options = sorted(options,
                                 key=functools.cmp_to_key(
                                     tamil.utf8.compare_words_lexicographic))
            else:
                options = sorted(options,
                                 cmp=tamil.utf8.compare_words_lexicographic)
        # remove replacements with single-letter words
        WL = len(tamil.utf8.get_letters(word))
        if WL > 3:
            options = filter(lambda x: len(tamil.utf8.get_letters(x)) > 2,
                             options)
        # remove dupes in list
        # NOTE(review): relies on 'options' being sorted so equal entries
        # are adjacent -- confirm if inputs can be unsorted here.
        options2 = []
        prev = None
        for val in options:
            if val.strip() != prev:
                options2.append(val.strip())
            prev = val.strip()
        del options
        if _DEBUG:
            print("@deduplication")
            pprint.pprint(options2)
        # score by Dice or Edit-Distance coefficients
        options_score = [0.0 for i in range(len(options2))]
        for itr, sugg_word in enumerate(options2):
            #options_score[itr] = Dice_coeff( word, sugg_word )
            options_score[itr] = (len(word) - edit_distance(
                word, sugg_word)) / (1.0 * len(orig_word)) * Dice_coeff(
                    word, sugg_word) / 3.0  #dice coeff is weighted down
        options = zip(options2, options_score)
        # limit options by score
        options = sorted(options, key=operator.itemgetter(1), reverse=True)
        options = [word_pair[0] for word_pair in options]
        #L = 40 # limit to first top -L=20 only which is good enough
        #options = options[0:min(len(options),L)]
        if _DEBUG:
            pprint.pprint("@after scoring/sorting")
            pprint.pprint(options)
        # eliminate single letter options
        options = filter(lambda x: not (x in tamil.utf8.tamil_letters),
                         options)
        # Due to suggestion policy we may have words which are found in error but we dont have
        # replacements for them!
        # TBD: options should not have the 'word'!
        return (False, options)
class Speller(object):
    """Tamil (default) / English spell checker -- earlier, simpler variant.

    Lazily builds lock-guarded class-level dictionaries, and offers an
    interactive console driver, a batch file driver, and a REST-style
    (TinyMCE-format) interface.  Suggestions come from greedy splitting,
    norvig-style edits, combinagrams and prefix matches, scored by the Dice
    coefficient.
    """

    # Class-level dictionary caches, built lazily under LoadDictionary.lock.
    TVU_dict = None
    ENL_dict = None

    def __init__(self, filename=None, lang="ta", mode="non-web"):
        # In non-web mode the constructor drives a full session itself.
        object.__init__(self)
        self.lang = lang.lower()
        self.filename = filename
        self.user_dict = set()
        # Suffix/prefix strippers applied before dictionary lookup.
        self.case_filter = CaseFilter(RemovePluralSuffix(),
                                      RemoveVerbSuffixTense(),
                                      RemoveCaseSuffix(), RemovePrefix())
        if self.lang == u"en":
            self.alphabets = [a for a in string.ascii_lowercase]
        else:
            self.alphabets = None
        if mode == "web":
            return
        if not self.filename:
            self.interactive()
        else:
            self.spellcheck(self.filename)

    def in_tamil_mode(self):
        # Anything other than explicit English is treated as Tamil.
        return self.lang != u"en"

    @staticmethod
    def get_dictionary():
        # Lazily build and cache the Tamil (TamilVU) dictionary under a lock.
        LoadDictionary.lock.acquire()
        if not Speller.TVU_dict:
            Speller.TVU_dict, _ = DictionaryBuilder.create(TamilVU)
        LoadDictionary.lock.release()
        return Speller.TVU_dict

    @staticmethod
    def get_english_dictionary():
        # Same lazy/locked pattern for the English (EnglishLinux) dictionary.
        LoadDictionary.lock.acquire()
        if not Speller.ENL_dict:
            Speller.ENL_dict, _ = DictionaryBuilder.create(EnglishLinux)
        LoadDictionary.lock.release()
        return Speller.ENL_dict

    def language(self):
        # Human-readable language name for messages.
        if self.lang == "ta":
            return "tamil"
        return "english"

    def checklang(self, word):
        # True when every letter of 'word' belongs to the active language.
        if self.lang == "ta":
            return tamil.utf8.all_tamil(word)
        for w in word.lower():
            if not (w in string.ascii_lowercase):
                return False
        return True

    # Ref: https://www.tinymce.com/docs/plugins/spellchecker/
    def REST_interface(self, word):
        # returns JSON data in TinyMCE format: (ok, {}) for a correct word,
        # (ok, suggestions) otherwise.
        ok, suggs = self.check_word_and_suggest(word)
        if ok:
            return ok, {}
        return ok, suggs

    @staticmethod
    def dice_comparison(ref_word, word):
        """ use this class method for SORTED"""
        # cmp-style comparator based on the Dice coefficient.
        val = Dice_coeff(ref_word, word)
        if (val == 1):
            return 0
        return (2 * (val - 0.5) > 0) and 1 or -1

    def suggestion_policy(self, word, suggs):
        # pick suggestions that are only +/- 2 letter length different
        filter_suggs = []
        tamil_length = lambda w: len(tamil.utf8.get_letters(w))
        ref_wl = tamil_length(word)
        accept_min_max = [max(ref_wl - 2, 1), ref_wl + 1]
        filter_suggs = filter(
            lambda w: tamil_length(w) >= accept_min_max[0] and len(w) <=
            accept_min_max[1], suggs)
        # sort the suggestions by Dice coefficient
        filter_suggs = set(filter_suggs)
        if len(filter_suggs) == 0:
            # guess! fall back to all suggestions, lexicographically sorted,
            # capped at roughly 10 entries.
            # NOTE(review): sorted(..., cmp=...) is Python-2 only syntax.
            filter_suggs = sorted(
                suggs, cmp=tamil.utf8.compare_words_lexicographic)
            filter_suggs[min(10, len(filter_suggs) - 1):] = []
            return filter_suggs
        filter_suggs = sorted(filter_suggs, cmp=Speller.dice_comparison)
        return filter_suggs

    def str_suggestions(self, word):
        # Localized "suggestions for <word>" heading.
        if self.in_tamil_mode():
            return u"சொல் \"%s\" மாற்றங்கள்" % word
        return u"SUGGESTIONS for \"%s\"" % word

    def interactive(self):
        # Read-check-suggest loop on stdin; Ctrl-C / EOF exit politely.
        try:
            while (True):
                if PYTHON3:
                    word = input(u">> ")
                else:
                    word = raw_input(u">> ")
                    word = word.decode("utf-8").strip()
                word = re.sub(u"\s+", "", word)
                # skip empty words
                if len(word) < 1:
                    continue
                if not self.checklang(word):
                    print(u"EXCEPTION \"%s\" is not a %s Word" %
                          (word, self.language()))
                    continue
                ok, suggs = self.check_word_and_suggest(word)
                suggs = self.suggestion_policy(word, suggs)
                if not ok:
                    # Print numbered suggestions, 4 per row.
                    words_per_row = 4
                    option_str = u", ".join([
                        u"(%d) %s" % (itr, wrd) +
                        ((itr > 0 and itr % words_per_row == 0) and u"\n"
                         or u"") for itr, wrd in enumerate(suggs)
                    ])
                    print(u"%s\n\t %s" %
                          (self.str_suggestions(word), option_str))
                else:
                    print(self.in_tamil_mode() and u"சரி" or u"OK")
        except KeyboardInterrupt as ke:
            pass
        except EOFError as eof:
            pass
        finally:
            print(self.in_tamil_mode() and u"\nவணக்கம்!" or "\nBYE!")
        return

    def spellcheck(self, filename):
        # Batch driver: check each word of the file, prompting the user for
        # a replacement choice, then print the cleaned-up document.
        new_document = []
        data = codecs.open(filename, u"r", u"utf-8")
        lines = data.readlines()
        for line in lines:
            words = tamil.utf8.get_words(tamil.utf8.get_letters(line))
            for word in words:
                # FIXME : handle punctuation
                #word = filter( tamil.utf8.is_tamil_unicode_predicate, word )
                ok, suggs = self.check_word_and_suggest(word)
                if not ok:
                    option = suggs[0]
                    # take user input.
                    # FIXME: User optiions to include DONTREPLACE/KEEP, DELETE WORD, etc.
                    option_str = u", ".join([
                        u"(%d) %s" % (itr, wrd)
                        for itr, wrd in enumerate(suggs)
                    ])
                    print(u"In line, \"%s\"" % line.strip())
                    print(u" Replace word %s with\n\t => %s\n" %
                          (word, option_str))
                    try:
                        choice = input(u"option [-1 ignore, 0-%d replace]: " %
                                       (len(suggs) - 1))
                        if PYTHON3:
                            choice = int(choice)
                        if choice == -1:
                            # -1 keeps the word and remembers it in the
                            # per-session user dictionary.
                            print(u"Not replacing word")
                            option = word
                            self.user_dict.add(word)
                        else:
                            option = suggs[choice]
                    except Exception as ie:
                        print(str(ie))
                    print(u" replacing word %s -> %s\n" % (word, option))
                    new_document.append(unicode(option))
                else:
                    new_document.append(word)
            new_document.append(u"\n")
        print(u"*********** cleaned up document **********")
        print(u" ".join(new_document))

    def get_lang_dictionary(self):
        # Dictionary matching the active language mode.
        if self.lang == u"en":
            return Speller.get_english_dictionary()
        return Speller.get_dictionary()

    def isWord(self, word):
        # Plain old dictionary checks (language dictionary OR the session
        # user dictionary).
        LANG_dict = self.get_lang_dictionary()
        is_dict_word = LANG_dict.isWord(word)
        in_user_dict = word in self.user_dict or is_dict_word
        return in_user_dict

    def check_word_and_suggest(self, word):
        # Core routine: returns (True, word) for an accepted word, or
        # (False, suggestions) with up to 20 Dice-scored suggestions.
        word = word.strip()
        # remove punctuation
        for x in string.punctuation:
            word = word.replace(x, u"")
        # remove digits
        word = re.sub(u'\d+', u'', word)
        letters = tamil.utf8.get_letters(word)
        TVU_dict = self.get_lang_dictionary()
        if not self.checklang(word):
            return (False, [u''])
        if len(word) < 1:
            return (False, [u''])
        # plain old dictionary + user dictionary check
        if self.isWord(word):
            return (True, word)
        # Remove case and redo the dictionary + user check
        word_nocase = self.case_filter.apply(word)
        if (self.isWord(word_nocase)):
            return (True, word_nocase)
        else:
            word = word_nocase
        # Consider splitting the word and see if it has 2 sub-words
        # e.g. செயல்பட => செயல் + பட
        alt = tamil.wordutils.greedy_split(word, TVU_dict)
        if len(alt) >= 1:
            greedy_results = [u" ".join(alt), u"-".join(alt)]
            greedy_results.extend(alt)
            #return (False, results )
        else:
            greedy_results = list()
        # TODO: Noun Declension - ticket-
        # suggestions at edit distance 1
        norvig_suggests = filter(
            TVU_dict.isWord, norvig_suggestor(word, self.alphabets, 2,
                                              limit=50))
        combinagram_suggests = list(
            tamil.wordutils.combinagrams(word, TVU_dict, limit=50))
        pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))
        # FIXME: score the options
        options = greedy_results
        options.extend(list(norvig_suggests))
        options.extend(combinagram_suggests)
        options.extend(pfx_options)
        # sort the options
        if self.lang == u"en":
            options.sort()
        else:
            if PYTHON3:
                options = sorted(options,
                                 key=functools.cmp_to_key(
                                     tamil.utf8.compare_words_lexicographic))
            else:
                options = sorted(options,
                                 cmp=tamil.utf8.compare_words_lexicographic)
        # remove replacements with single-letter words
        WL = len(tamil.utf8.get_letters(word))
        if WL > 3:
            options = filter(lambda x: len(tamil.utf8.get_letters(x)) > 2,
                             options)
        # remove dupes in list
        # NOTE(review): relies on 'options' being sorted so equal entries
        # are adjacent.
        options2 = []
        prev = None
        for val in options:
            if val.strip() != prev:
                options2.append(val.strip())
            prev = val.strip()
        del options
        # score by Dice coefficients
        options_score = [0.0 for i in range(len(options2))]
        for itr, sugg_word in enumerate(options2):
            options_score[itr] = Dice_coeff(word, sugg_word)
        options = zip(options2, options_score)
        # limit options by score
        options = sorted(options, key=operator.itemgetter(1), reverse=True)
        options = [word_pair[0] for word_pair in options]
        L = 20  # limit to first top -L=20 only which is good enough
        options = options[0:min(len(options), L)]
        return (False, options)
class Speller(object):
    """Tamil (default) / English spell checker -- minimal variant.

    Same lazy, lock-guarded dictionary caching and interactive/batch/REST
    drivers as the other variants, but without Tamil-localized prompts or
    the Dice-based suggestion policy.
    """

    # Class-level dictionary caches, built lazily under LoadDictionary.lock.
    TVU_dict = None
    ENL_dict = None

    def __init__(self, filename=None, lang="ta", mode="non-web"):
        # In non-web mode the constructor drives a full session itself.
        object.__init__(self)
        self.lang = lang.lower()
        self.filename = filename
        self.user_dict = set()
        # Suffix/prefix strippers applied before dictionary lookup.
        self.case_filter = CaseFilter(RemovePluralSuffix(),
                                      RemoveVerbSuffixTense(),
                                      RemoveCaseSuffix(), RemovePrefix())
        if self.lang == u"en":
            self.alphabets = [a for a in string.ascii_lowercase]
        else:
            self.alphabets = None
        if mode != "web":
            if not self.filename:
                self.interactive()
            else:
                self.spellcheck(self.filename)
        pass

    @staticmethod
    def get_dictionary():
        # Lazily build and cache the Tamil (TamilVU) dictionary under a lock.
        LoadDictionary.lock.acquire()
        if not Speller.TVU_dict:
            Speller.TVU_dict, _ = DictionaryBuilder.create(TamilVU)
        LoadDictionary.lock.release()
        return Speller.TVU_dict

    @staticmethod
    def get_english_dictionary():
        # Same lazy/locked pattern for the English (EnglishLinux) dictionary.
        LoadDictionary.lock.acquire()
        if not Speller.ENL_dict:
            Speller.ENL_dict, _ = DictionaryBuilder.create(EnglishLinux)
        LoadDictionary.lock.release()
        return Speller.ENL_dict

    def language(self):
        # Human-readable language name for messages.
        if self.lang == "ta":
            return "tamil"
        return "english"

    def checklang(self, word):
        # True when every letter of 'word' belongs to the active language.
        if self.lang == "ta":
            return tamil.utf8.all_tamil(word)
        for w in word.lower():
            if not (w in string.ascii_lowercase):
                return False
        return True

    # Ref: https://www.tinymce.com/docs/plugins/spellchecker/
    def REST_interface(self, word):
        # returns JSON data in TinyMCE format: (ok, {}) for a correct word,
        # (ok, suggestions) otherwise.
        ok, suggs = self.check_word_and_suggest(word)
        if ok:
            return ok, {}
        return ok, suggs

    def interactive(self):
        # Read-check-suggest loop on stdin; Ctrl-C / EOF exit politely.
        try:
            while True:
                if PYTHON3:
                    word = input(u">> ")
                else:
                    word = raw_input(u">> ")
                    word = word.decode("utf-8").strip()
                word = re.sub(u"\s+", "", word)
                if not self.checklang(word):
                    print(u'EXCEPTION "%s" is not a %s Word' %
                          (word, self.language()))
                    continue
                ok, suggs = self.check_word_and_suggest(word)
                if not ok:
                    option_str = u", ".join([
                        u"(%d) %s" % (itr, wrd)
                        for itr, wrd in enumerate(suggs)
                    ])
                    print(u'SUGGESTIONS for "%s"\n\t %s' % (word, option_str))
                else:
                    print(u"OK")
        except KeyboardInterrupt as ke:
            pass
        except EOFError as eof:
            pass
        finally:
            print("\nBYE!")
        return

    def spellcheck(self, filename):
        # Batch driver: check each word of the file, prompting the user for
        # a replacement choice, then print the cleaned-up document.
        new_document = []
        data = codecs.open(filename, u"r", u"utf-8")
        lines = data.readlines()
        for line in lines:
            words = tamil.utf8.get_words(tamil.utf8.get_letters(line))
            for word in words:
                # FIXME : handle punctuation
                # word = filter( tamil.utf8.is_tamil_unicode_predicate, word )
                ok, suggs = self.check_word_and_suggest(word)
                if not ok:
                    option = suggs[0]
                    # take user input.
                    # FIXME: User optiions to include DONTREPLACE/KEEP, DELETE WORD, etc.
                    option_str = u", ".join([
                        u"(%d) %s" % (itr, wrd)
                        for itr, wrd in enumerate(suggs)
                    ])
                    print(u'In line, "%s"' % line.strip())
                    print(u" Replace word %s with\n\t => %s\n" %
                          (word, option_str))
                    try:
                        choice = input(u"option [-1 ignore, 0-%d replace]: " %
                                       (len(suggs) - 1))
                        if PYTHON3:
                            choice = int(choice)
                        if choice == -1:
                            # -1 keeps the word and remembers it in the
                            # per-session user dictionary.
                            print(u"Not replacing word")
                            option = word
                            self.user_dict.add(word)
                        else:
                            option = suggs[choice]
                    except Exception as ie:
                        print(str(ie))
                    print(u" replacing word %s -> %s\n" % (word, option))
                    new_document.append(unicode(option))
                else:
                    new_document.append(word)
            new_document.append(u"\n")
        print(u"*********** cleaned up document **********")
        print(u" ".join(new_document))

    def get_lang_dictionary(self):
        # Dictionary matching the active language mode.
        if self.lang == u"en":
            return Speller.get_english_dictionary()
        return Speller.get_dictionary()

    def isWord(self, word):
        # Plain old dictionary checks (language dictionary OR the session
        # user dictionary).
        LANG_dict = self.get_lang_dictionary()
        is_dict_word = LANG_dict.isWord(word)
        in_user_dict = word in self.user_dict or is_dict_word
        return in_user_dict

    def check_word_and_suggest(self, word):
        # Core routine: returns (True, word) for an accepted word, or
        # (False, suggestions) with up to 20 Dice-scored suggestions.
        word = word.strip()
        # remove punctuation
        for x in string.punctuation:
            word = word.replace(x, u"")
        # remove digits
        word = re.sub(u"\d+", u"", word)
        letters = tamil.utf8.get_letters(word)
        TVU_dict = self.get_lang_dictionary()
        if not self.checklang(word):
            return (False, [u""])
        if len(word) < 1:
            return (False, [u""])
        # plain old dictionary + user dictionary check
        if self.isWord(word):
            return (True, word)
        # Remove case and redo the dictionary + user check
        word_nocase = self.case_filter.apply(word)
        if self.isWord(word_nocase):
            return (True, word_nocase)
        else:
            word = word_nocase
        # Consider splitting the word and see if it has 2 sub-words
        # e.g. செயல்பட => செயல் + பட
        alt = tamil.wordutils.greedy_split(word, TVU_dict)
        if len(alt) >= 1:
            greedy_results = [u" ".join(alt), u"-".join(alt)]
            greedy_results.extend(alt)
            # return (False, results )
        else:
            greedy_results = list()
        # TODO: Noun Declension - ticket-
        # suggestions at edit distance 1
        norvig_suggests = filter(TVU_dict.isWord,
                                 norvig_suggestor(word, self.alphabets, 2,
                                                  limit=50))
        combinagram_suggests = list(
            tamil.wordutils.combinagrams(word, TVU_dict, limit=50))
        pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))
        # FIXME: score the options
        options = greedy_results
        options.extend(list(norvig_suggests))
        options.extend(combinagram_suggests)
        options.extend(pfx_options)
        # sort the options
        if self.lang == u"en":
            options.sort()
        else:
            if PYTHON3:
                options = sorted(options,
                                 key=functools.cmp_to_key(
                                     tamil.utf8.compare_words_lexicographic))
            else:
                options = sorted(options,
                                 cmp=tamil.utf8.compare_words_lexicographic)
        # remove replacements with single-letter words
        WL = len(tamil.utf8.get_letters(word))
        if WL > 3:
            options = filter(lambda x: len(tamil.utf8.get_letters(x)) > 2,
                             options)
        # remove dupes in list
        # NOTE(review): relies on 'options' being sorted so equal entries
        # are adjacent.
        options2 = []
        prev = None
        for val in options:
            if val.strip() != prev:
                options2.append(val.strip())
            prev = val.strip()
        del options
        # score by Dice coefficients
        options_score = [0.0 for i in range(len(options2))]
        for itr, sugg_word in enumerate(options2):
            options_score[itr] = Dice_coeff(word, sugg_word)
        options = zip(options2, options_score)
        # limit options by score
        options = sorted(options, key=operator.itemgetter(1), reverse=True)
        options = [word_pair[0] for word_pair in options]
        L = 20  # limit to first top -L=20 only which is good enough
        options = options[0 : min(len(options), L)]
        return (False, options)
class Speller(object):
    """Spell checker with interactive, file and REST front-ends.

    Wraps a lazily-built lexicon (Tamil TVU by default, English with
    lang="en") and generates ranked replacement suggestions from greedy
    word splits, norvig edit-distance candidates, combinagrams, prefix
    matches, ottru splits and mayangoli (confusable-letter) alternates.
    """

    TVU_dict = None  # cached Tamil (TamilVU) lexicon, shared across instances
    ENL_dict = None  # cached English (EnglishLinux) lexicon
    punctuation = string.punctuation + '()[]{}'

    def __init__(self, filename=None, lang="ta", mode="non-web"):
        """Build the case filters; unless mode == "web", immediately run the
        interactive prompt (no filename) or a full-file spellcheck."""
        object.__init__(self)
        self.lang = lang.lower()
        self.filename = filename
        self.user_dict = set()  # words the user chose to keep this session
        self.case_filter = CaseFilter(RemovePluralSuffix(), RemoveVerbSuffixTense(),
                                      RemoveCaseSuffix(), RemovePrefix())
        if not self.in_tamil_mode():
            self.alphabets = [a for a in string.ascii_lowercase]
        else:
            self.alphabets = None  # norvig_suggestor uses its own Tamil alphabet
        if mode == "web":
            return
        if not self.filename:
            self.interactive()
        else:
            self.spellcheck(self.filename)

    def in_tamil_mode(self):
        """True unless the instance was built with lang="en"."""
        return self.lang != u"en"

    @staticmethod
    def get_dictionary():
        """Lazily build and cache the Tamil lexicon; guarded by the shared lock."""
        LoadDictionary.lock.acquire()
        if not Speller.TVU_dict:
            Speller.TVU_dict, _ = DictionaryBuilder.create(TamilVU)
        LoadDictionary.lock.release()
        return Speller.TVU_dict

    @staticmethod
    def get_english_dictionary():
        """Lazily build and cache the English lexicon; guarded by the shared lock."""
        LoadDictionary.lock.acquire()
        if not Speller.ENL_dict:
            Speller.ENL_dict, _ = DictionaryBuilder.create(EnglishLinux)
        LoadDictionary.lock.release()
        return Speller.ENL_dict

    def language(self):
        """Human-readable name of the active language."""
        if self.in_tamil_mode():
            return "tamil"
        return "english"

    def checklang(self, word):
        """True when *word* is written entirely in the active language's alphabet."""
        if self.in_tamil_mode():
            return tamil.utf8.all_tamil(word)
        for w in word.lower():
            if not (w in string.ascii_lowercase):
                return False
        return True

    # full-text interface driver for unittest @ Dec 10, 2017
    def noninteractive_spellcheck(self, text):
        """Spell-check whitespace-separated *text*; return summary counters
        plus a word -> suggestions map for the failures."""
        nwords = 0
        npass = 0
        nfail = 0
        fail_n_suggs = dict()
        for word in re.split('\s+', text):
            if len(word) < 1:
                continue
            nwords += 1
            result, suggs = self.REST_interface(word)
            nfail += int(not result)
            npass += int(result)
            if not result:
                fail_n_suggs[word] = suggs
        obj = {'total': nwords,
               'correct_words': npass,
               'wrong_words': nfail,
               'word_suggestions': fail_n_suggs}
        return obj

    # Ref: https://www.tinymce.com/docs/plugins/spellchecker/
    def REST_interface(self, word):
        # returns JSON data in TinyMCE format
        ok, suggs = self.check_word_and_suggest(word)
        if _DEBUG:
            print("REST => %d" % ok)
            pprint.pprint(suggs)
        if ok:
            return ok, {}
        return ok, suggs

    @staticmethod
    def dice_comparison(ref_word, word):
        """Three-way comparator on Dice coefficient; use with sorted()."""
        val = Dice_coeff(ref_word, word)
        if (val == 1):
            return 0
        return (2 * (val - 0.5) > 0) and 1 or -1

    def suggestion_policy(self, word, suggs):
        """Keep suggestions of length close to *word*'s and order them.

        FIX: the original called sorted(..., cmp=...) unconditionally, which
        raises TypeError on Python 3; guard with functools.cmp_to_key exactly
        as check_word_and_suggest in this class already does.
        """
        tamil_length = lambda w: len(tamil.utf8.get_letters(w))
        ref_wl = tamil_length(word)
        # pick suggestions that are only +/- 2 letter length different
        # NOTE(review): lower bound counts Tamil letters but upper bound uses
        # len(w) (code points) -- looks inconsistent; behavior kept as-is.
        accept_min_max = [max(ref_wl - 2, 1), ref_wl + 1]
        filter_suggs = filter(lambda w: tamil_length(w) >= accept_min_max[0]
                              and len(w) <= accept_min_max[1], suggs)
        filter_suggs = set(filter_suggs)
        if len(filter_suggs) == 0:
            # nothing survived the length filter -- guess from the full list
            filter_suggs = suggs
            if PYTHON3:
                filter_suggs = sorted(filter_suggs,
                                      key=functools.cmp_to_key(tamil.utf8.compare_words_lexicographic))
            else:
                filter_suggs = sorted(filter_suggs, cmp=tamil.utf8.compare_words_lexicographic)
            filter_suggs[min(10, len(filter_suggs) - 1):] = []  # cap the list
            return filter_suggs
        # sort the surviving suggestions by Dice coefficient
        if PYTHON3:
            filter_suggs = sorted(filter_suggs, key=functools.cmp_to_key(Speller.dice_comparison))
        else:
            filter_suggs = sorted(filter_suggs, cmp=Speller.dice_comparison)
        return filter_suggs

    def str_suggestions(self, word):
        """Localized "suggestions for <word>" heading."""
        if self.in_tamil_mode():
            return u"சொல் \"%s\" மாற்றங்கள்" % word
        return u"SUGGESTIONS for \"%s\"" % word

    def mayangoli_suggestions(self, word, letters):
        """Alternates of *word* with confusable (மயங்கொலி) letters swapped."""
        alternates = Mayangoli.run(word, letters)
        alternates = filter(lambda w: w != word, alternates)
        if PYTHON3:
            # FIX: materialize the filter -- otherwise the _DEBUG loop below
            # would exhaust the iterator and this method would return nothing.
            alternates = list(alternates)
        if _DEBUG:
            for idx, w in enumerate(alternates):
                pprint.pprint(["Myangoli", idx, w])
        return copy.copy(alternates)

    def interactive(self):
        """Prompt loop: read a word from stdin, print OK or suggestions."""
        try:
            while (True):
                if PYTHON3:
                    word = input(u">> ")
                else:
                    word = raw_input(u">> ")
                    # Python 2 only: raw stdin bytes must be decoded
                    word = word.decode("utf-8").strip()
                word = re.sub(u"\s+", "", word)
                # skip empty words
                if len(word) < 1:
                    continue
                if not self.checklang(word):
                    print(u"EXCEPTION \"%s\" is not a %s Word" % (word, self.language()))
                    continue
                ok, suggs = self.check_word_and_suggest(word)
                suggs = self.suggestion_policy(word, suggs)
                if not ok:
                    words_per_row = 4
                    option_str = u", ".join(
                        [u"(%d) %s" % (itr, wrd) + ((itr > 0 and itr % words_per_row == 0) and u"\n" or u"")
                         for itr, wrd in enumerate(suggs)])
                    print(u"%s\n\t %s" % (self.str_suggestions(word), option_str))
                else:
                    print(self.in_tamil_mode() and u"சரி" or u"OK")
        except KeyboardInterrupt as ke:
            pass
        except EOFError as eof:
            pass
        finally:
            print(self.in_tamil_mode() and u"\nவணக்கம்!" or "\nBYE!")
        return

    def spellcheck(self, filename):
        """Interactively spell-check the UTF-8 file *filename* (bilingual UI)
        and print the corrected document."""
        new_document = []
        data = codecs.open(filename, u"r", u"utf-8")
        lines = data.readlines()
        for line in lines:
            words = tamil.utf8.get_words(tamil.utf8.get_letters(line))
            for word in words:
                # FIXME : handle punctuation
                # word = filter( tamil.utf8.is_tamil_unicode_predicate, word )
                ok, suggs = self.check_word_and_suggest(word)
                if PYTHON3 and not ok:
                    # suggestions may arrive as a lazy filter object on Py3
                    suggs = list(suggs)
                if not ok:
                    option = suggs[0]
                    # take user input.
                    # FIXME: User options to include DONTREPLACE/KEEP, DELETE WORD, etc.
                    option_str = u", ".join([u"(%d) %s" % (itr, wrd) for itr, wrd in enumerate(suggs)])
                    if self.in_tamil_mode():
                        print(u"வரி \"%s\"" % line.strip())
                        print(u"'%s' சொல்லை கொண்டு\n\t சொல்லை '%s' மாற்றிடு\n" % (option_str, word))
                    else:
                        print(u"Line, \"%s\"" % line.strip())
                        print(u" Replace word %s with\n\t => %s\n" % (word, option_str))
                    try:
                        if self.in_tamil_mode():
                            choice_str = "விருப்பம் [-1 புறக்கணி, 0-%d மாற்றவும்]:"
                        else:
                            choice_str = u"option [-1 ignore, 0-%d replace]: "
                        choice = input(choice_str % (len(suggs) - 1))
                        if PYTHON3:
                            choice = int(choice)
                        if choice == -1:
                            if self.in_tamil_mode():
                                print(u"வார்த்தை மாறாத இருந்தது")
                            else:
                                print(u"Not replacing word")
                            option = word
                            self.user_dict.add(word)
                        else:
                            option = suggs[choice]
                    except Exception as ie:
                        print(str(ie))
                    if self.in_tamil_mode():
                        replace_msg = u"வார்த்தை %s -> %s இதற்காக மாற்றவும்\n"
                    else:
                        replace_msg = u" replacing word %s -> %s\n"
                    print(replace_msg % (word, option))
                    new_document.append(unicode(option))
                else:
                    new_document.append(word)
            new_document.append(u"\n")
        if self.in_tamil_mode():
            print(u"*********** ஆவணத்தில் உள்ள பிழைகளை திருத்திய பின் *********")
        else:
            print(u"*********** cleaned up document **********")
        print(u" ".join(new_document))

    def get_lang_dictionary(self):
        """Lexicon for the active language."""
        if not self.in_tamil_mode():
            return Speller.get_english_dictionary()
        return Speller.get_dictionary()

    def isWord(self, word):
        # Plain old dictionary checks plus the per-session user dictionary.
        LANG_dict = self.get_lang_dictionary()
        is_dict_word = LANG_dict.isWord(word)
        in_user_dict = word in self.user_dict or is_dict_word
        return in_user_dict

    def add_numeral_words(self, lexicon):
        """Teach *lexicon* the Tamil numeral words (0..trillion scale parts).
        No-op in English mode or when already added (tril[0] sentinel)."""
        if not self.in_tamil_mode():
            return
        units = (u'பூஜ்ஜியம்', u'ஒன்று', u'இரண்டு', u'மூன்று', u'நான்கு', u'ஐந்து',
                 u'ஆறு', u'ஏழு', u'எட்டு', u'ஒன்பது', u'பத்து')  # 0-10
        teens = (u'பதினொன்று', u' பனிரண்டு', u'பதிமூன்று', u'பதினான்கு', u'பதினைந்து',
                 u'பதினாறு', u'பதினேழு', u'பதினெட்டு', u'பத்தொன்பது')  # 11-19
        tens = (u'பத்து', u'இருபது', u'முப்பது', u'நாற்பது', u'ஐம்பது',
                u'அறுபது', u'எழுபது', u'எண்பது', u'தொன்னூறு')  # 10-90
        tens_suffix = (u'இருபத்து', u'முப்பத்து', u'நாற்பத்து', u'ஐம்பத்து',
                       u'அறுபத்து', u'எழுபத்து', u'எண்பத்து', u'தொன்னூத்து')  # 10+-90+
        hundreds = (u'நூறு', u'இருநூறு', u'முந்நூறு', u'நாநூறு', u'ஐநூறு',
                    u'அறுநூறு', u'எழுநூறு', u'எண்ணூறு', u'தொள்ளாயிரம்')  # 100 - 900
        hundreds_suffix = (u'நூற்றி', u'இருநூற்றி', u'முந்நூற்று', u'நாநூற்று', u'ஐநூற்று',
                           u'அறுநூற்று', u'எழுநூற்று', u'எண்ணூற்று', u'தொள்ளாயிரத்து')  # 100+ - 900+
        one_thousand_prefix = (u'ஓர்',)
        thousands = (u'ஆயிரம்', u'ஆயிரத்தி')
        one_prefix = (u'ஒரு',)
        lakh = (u'இலட்சம்', u'இலட்சத்தி')
        crore = (u'கோடி', u'கோடியே')
        mil = (u'மில்லியன்',)
        bil = (u'பில்லியன்',)
        tril = (u'டிரில்லியன்',)
        if lexicon.isWord(tril[0]):
            return  # already added in a previous call
        numerals = list()
        for wordset in [units, tens, teens, tens_suffix, hundreds, hundreds_suffix,
                        one_thousand_prefix, thousands, one_prefix, lakh, crore, mil, bil, tril]:
            numerals.extend(wordset)
        # with codecs.open("numerals.json","w","utf-8") as fp:
        #     fp.write(json.dumps(numerals))
        for word in numerals:
            lexicon.add(word)

    @staticmethod
    def scrub_ws(word):
        """Strip whitespace and bracket characters from *word*."""
        return re.sub(u'[\s{}()\[\]]+', u'', word)

    def check_word_and_suggest(self, word, errmsg=None):
        """Return (True, word) when *word* is acceptable, otherwise
        (False, suggestions). *errmsg*, when a list, collects error tags."""
        word = word.strip()
        # skip known punctuation at end of line
        while len(word) >= 1 and any(map(word.endswith, Speller.punctuation)):
            word = word[:-1]
        while len(word) >= 1 and any(map(word.startswith, string.whitespace)):
            word = word[1:]
        # is number then we propose a numeral
        if self.in_tamil_mode():
            numword = word.replace(u',', u'')
            if re.match(u'[+|-]*[\d]+', numword):
                try:
                    num = float(numword)
                    posnum = num
                    if num < 0:
                        posnum = -1 * num
                    numeral_form = tamil.numeral.num2tamilstr(posnum)
                    if num < 0:
                        numeral_form = u"கழித்தல் " + numeral_form
                    return (False, [numeral_form])
                except Exception as ioe:
                    pass
        # dates are okay
        if any(map(word.endswith, [u"-இல்", u"-ஆம்", u"-இலிருந்து", u"-வரை"])):
            if re.search('^\d+', word):
                return (True, [word])  # word is okay
        # check if words are transliterated
        if any(filter(lambda x: x in string.ascii_letters, tamil.utf8.get_letters(word))):
            # letter-sequence only
            en_word = Speller.scrub_ws(word)
            EN_Lexicon = Speller.get_english_dictionary()
            if EN_Lexicon.isWord(en_word):
                # English word - nosub- yet until we have parallel dictionaries or translation. TBD.
                return (False, [''])
            # is english letter
            ta = algorithm.Iterative.transliterate(jaffna.Transliteration.table, en_word)
            # TBD: potential for having ANN to tell if english text is pure English word
            # or a romanized Tamil word. Output of classifier can be useful here.
            return (False, [ta])
        # check if it matches Tamil numeral and has close match.
        # propose suggestions from that list. # TBD
        # hyphens are not okay
        if word.find(u"-") >= 0:
            return (False, [word.replace(u"-", u" ")])  # re.sub(u"^w"," ",word))
        # replace other spurious ()[] punctuations by concatenation
        # word = u"".join(filter(lambda l: not( l in Speller.punctuation), tamil.utf8.get_letters(word)))
        orig_word = u"%s" % word
        # remove digits
        word = re.sub(u'\d+', u'', word)
        letters = tamil.utf8.get_letters(word)
        TVU_dict = self.get_lang_dictionary()
        self.add_numeral_words(TVU_dict)
        # Check if this 'word' is any common kind of error
        if Typographical.checkFormErrors(word, errmsg):
            if errmsg:
                errmsg.append("TypographicalError")
        if not self.checklang(word):
            print("Word is not in desired language!")
            return (False, [u""])
        if len(word) < 1:
            print("Word is too small")
            return (False, [u''])
        # plain old dictionary + user dictionary check
        if self.isWord(word):
            return (True, word)
        # Remove case and redo the dictionary + user check
        word_nocase = self.case_filter.apply(word)
        if (self.isWord(word_nocase)):
            return (True, word_nocase)
        else:
            word = word_nocase
        # Consider splitting the word and see if it has 2 sub-words
        # e.g. செயல்பட => செயல் + பட
        alt = tamil.wordutils.greedy_split(word, TVU_dict)
        greedy_results = list()
        if len(alt) >= 1:
            greedy_results = [u" ".join(alt), u"-".join(alt)]
            greedy_results.extend(alt)
            # return (False, greedy_results )
        # if there are no other suggestions than deletion filter, we return
        # in presence of other suggestions we can just return suggestions
        suggs = DeletionFilter.get_suggestions(letters, TVU_dict)
        if len(suggs) > 0:
            if len(greedy_results) == 0:
                return (False, suggs)
            else:
                greedy_results.extend(suggs)
        # ottru splitting for Tamil language mode
        ottru_options = []
        if self.in_tamil_mode():
            # discover words like யாரிகழ்ந்து are accepted.
            ottru = OttruSplit(word, letters)
            ottru.run(TVU_dict)
            if len(ottru.results) > 0:
                return (True, word)
            ottru_options = ottru.results
        # TODO: Noun Declension - ticket-
        # suggestions at edit distance 1
        norvig_suggests = filter(TVU_dict.isWord, norvig_suggestor(word, self.alphabets, 2, limit=25))
        combinagram_suggests = list(tamil.wordutils.combinagrams(word, TVU_dict, limit=25))
        pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))
        # FIXME: score the options
        options = greedy_results
        options.extend(ottru_options)
        options.extend(list(norvig_suggests))
        options.extend(combinagram_suggests)
        options.extend(pfx_options)
        # filter the options against a dictionary!
        options = filter(TVU_dict.isWord, options)
        if PYTHON3:
            options = list(options)
        if self.in_tamil_mode():
            options.extend(self.mayangoli_suggestions(orig_word, letters))
        # sort the options
        if not self.in_tamil_mode():
            options.sort()
        else:
            if PYTHON3:
                options = sorted(options, key=functools.cmp_to_key(tamil.utf8.compare_words_lexicographic))
            else:
                options = sorted(options, cmp=tamil.utf8.compare_words_lexicographic)
        # remove replacements with single-letter words
        WL = len(tamil.utf8.get_letters(word))
        if WL > 3:
            options = filter(lambda x: len(tamil.utf8.get_letters(x)) > 2, options)
        # remove dupes in the (sorted) list
        options2 = []
        prev = None
        for val in options:
            if val.strip() != prev:
                options2.append(val.strip())
                prev = val.strip()
        del options
        if _DEBUG:
            print("@deduplication")
            pprint.pprint(options2)
        # score by Dice or Edit-Distance coefficients
        options_score = [0.0 for i in range(len(options2))]
        for itr, sugg_word in enumerate(options2):
            # options_score[itr] = Dice_coeff( word, sugg_word )
            # dice coeff is weighted down
            options_score[itr] = (len(word) - edit_distance(word, sugg_word)) / (1.0 * len(orig_word)) \
                * Dice_coeff(word, sugg_word) / 3.0
        options = zip(options2, options_score)
        # limit options by score
        options = sorted(options, key=operator.itemgetter(1), reverse=True)
        options = [word_pair[0] for word_pair in options]
        # L = 40 # limit to first top -L=20 only which is good enough
        # options = options[0:min(len(options),L)]
        if _DEBUG:
            pprint.pprint("@after scoring/sorting")
            pprint.pprint(options)
        # eliminate single letter options
        options = filter(lambda x: not (x in tamil.utf8.tamil_letters), options)
        # Due to suggestion policy we may have words which are found in error but we dont have
        # replacements for them!
        # TBD: options should not have the 'word'!
        return (False, options)
class Speller(object):
    """Spell checker: dictionary lookup plus ranked suggestions built from
    greedy splits, norvig edit-distance candidates, combinagrams and prefix
    matches. Lexicons are cached at class level, guarded by a shared lock."""

    TVU_dict = None  # cached Tamil (TamilVU) lexicon
    ENL_dict = None  # cached English (EnglishLinux) lexicon

    def __init__(self, filename=None, lang="ta", mode="non-web"):
        """Build the case filters; unless mode == "web", immediately run the
        interactive prompt (no filename) or a full-file spellcheck."""
        object.__init__(self)
        # FIX: normalize case so lang="TA"/"EN" behaves like "ta"/"en";
        # every comparison below is against the lowercase form.
        self.lang = lang.lower()
        self.filename = filename
        self.user_dict = set()  # words the user chose to keep this session
        self.case_filter = CaseFilter(RemovePluralSuffix(), RemoveVerbSuffixTense(),
                                      RemoveCaseSuffix(), RemovePrefix())
        if self.lang == u"en":
            self.alphabets = [a for a in string.ascii_lowercase]
        else:
            self.alphabets = None
        if mode == "web":
            return
        if not self.filename:
            self.interactive()
        else:
            self.spellcheck(self.filename)

    @staticmethod
    def get_dictionary():
        """Lazily build and cache the Tamil lexicon; guarded by the shared lock."""
        LoadDictionary.lock.acquire()
        if not Speller.TVU_dict:
            Speller.TVU_dict, _ = DictionaryBuilder.create(TamilVU)
        LoadDictionary.lock.release()
        return Speller.TVU_dict

    @staticmethod
    def get_english_dictionary():
        """Lazily build and cache the English lexicon; guarded by the shared lock."""
        LoadDictionary.lock.acquire()
        if not Speller.ENL_dict:
            Speller.ENL_dict, _ = DictionaryBuilder.create(EnglishLinux)
        LoadDictionary.lock.release()
        return Speller.ENL_dict

    def language(self):
        """Human-readable name of the active language."""
        if self.lang == "ta":
            return "tamil"
        return "english"

    def checklang(self, word):
        """True when *word* is written entirely in the active language's alphabet."""
        if self.lang == "ta":
            return tamil.utf8.all_tamil(word)
        return all([w in string.ascii_lowercase for w in word.lower()])

    # Ref: https://www.tinymce.com/docs/plugins/spellchecker/
    def REST_interface(self, word):
        # returns JSON data in TinyMCE format
        ok, suggs = self.check_word_and_suggest(word)
        if ok:
            return ok, ""
        return ok, json.dumps({word: suggs})

    def interactive(self):
        """Prompt loop: read a word from stdin, print OK or suggestions."""
        try:
            while (True):
                if PYTHON3:
                    word = input(u">> ")
                else:
                    word = raw_input(u">> ")
                    # Python 2 only: raw stdin bytes must be decoded
                    word = word.decode("utf-8").strip()
                word = re.sub(u"\s+", "", word)
                if len(word) < 1:
                    # FIX: skip empty input instead of spell-checking ""
                    continue
                if not self.checklang(word):
                    print(u"EXCEPTION \"%s\" is not a %s Word" % (word, self.language()))
                    continue
                ok, suggs = self.check_word_and_suggest(word)
                if not ok:
                    option_str = u", ".join([u"(%d) %s" % (itr, wrd)
                                             for itr, wrd in enumerate(suggs)])
                    print(u"SUGGESTIONS for \"%s\"\n\t %s" % (word, option_str))
                else:
                    print(u"OK")
        except KeyboardInterrupt as ke:
            pass
        except EOFError as eof:
            pass
        finally:
            print("\nBYE!")
        return

    def spellcheck(self, filename):
        """Interactively spell-check the UTF-8 file *filename*, asking the
        user to pick a replacement for each rejected word, then print the
        cleaned-up document."""
        new_document = []
        data = codecs.open(filename, u"r", u"utf-8")
        lines = data.readlines()
        for line in lines:
            words = tamil.utf8.get_words(tamil.utf8.get_letters(line))
            for word in words:
                # FIXME : handle punctuation
                # word = filter( tamil.utf8.is_tamil_unicode_predicate, word )
                ok, suggs = self.check_word_and_suggest(word)
                if not ok:
                    option = suggs[0]
                    # take user input.
                    # FIXME: User options to include DONTREPLACE/KEEP, DELETE WORD, etc.
                    option_str = u", ".join([u"(%d) %s" % (itr, wrd)
                                             for itr, wrd in enumerate(suggs)])
                    print(u"In line, \"%s\"" % line.strip())
                    print(u" Replace word %s with\n\t => %s\n" % (word, option_str))
                    try:
                        choice = input(u"option [-1 ignore, 0-%d replace]: " % (len(suggs) - 1))
                        if PYTHON3:
                            choice = int(choice)
                        if choice == -1:
                            print(u"Not replacing word")
                            option = word
                            self.user_dict.add(word)
                        else:
                            option = suggs[choice]
                    except Exception as ie:
                        print(str(ie))
                    print(u" replacing word %s -> %s\n" % (word, option))
                    new_document.append(unicode(option))
                else:
                    new_document.append(word)
            new_document.append(u"\n")
        print(u"*********** cleaned up document **********")
        print(u" ".join(new_document))

    def get_lang_dictionary(self):
        """Lexicon for the active language."""
        if self.lang == u"en":
            return Speller.get_english_dictionary()
        return Speller.get_dictionary()

    def isWord(self, word):
        # Plain old dictionary checks plus the per-session user dictionary.
        LANG_dict = self.get_lang_dictionary()
        is_dict_word = LANG_dict.isWord(word)
        in_user_dict = word in self.user_dict or is_dict_word
        return in_user_dict

    def check_word_and_suggest(self, word):
        """Return (True, word) when *word* is acceptable, otherwise
        (False, suggestions) with de-duplicated, sorted candidates."""
        word = word.strip()
        letters = tamil.utf8.get_letters(word)
        TVU_dict = self.get_lang_dictionary()
        # plain old dictionary + user dictionary check
        if self.isWord(word):
            return (True, word)
        # Remove case and redo the dictionary + user check
        word_nocase = self.case_filter.apply(word)
        if (self.isWord(word_nocase)):
            return (True, word_nocase)
        else:
            word = word_nocase
        # Consider splitting the word and see if it has 2 sub-words
        # e.g. செயல்பட => செயல் + பட
        alt = tamil.wordutils.greedy_split(word, TVU_dict)
        if len(alt) >= 1:
            results = [u" ".join(alt)]
            results.extend(alt)
            return (False, results)
        # TODO: Noun Declension - ticket-
        # suggestions at edit distance 1
        norvig_suggests = filter(TVU_dict.isWord,
                                 norvig_suggestor(word, self.alphabets, 1, limit=50))
        combinagram_suggests = list(tamil.wordutils.combinagrams(word, TVU_dict, limit=50))
        pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))
        # FIXME: score the options
        options = list(norvig_suggests)
        options.extend(combinagram_suggests)
        options.extend(pfx_options)
        # sort the options
        if self.lang == u"en":
            options.sort()
        else:
            if PYTHON3:
                options = sorted(options,
                                 key=functools.cmp_to_key(tamil.utf8.compare_words_lexicographic))
            else:
                options = sorted(options, cmp=tamil.utf8.compare_words_lexicographic)
        # remove adjacent dupes from the sorted list
        options2 = []
        prev = None
        for val in options:
            if val.strip() != prev:
                options2.append(val.strip())
                prev = val.strip()
        # FIX: return the de-duplicated list; the original returned the raw
        # `options`, which left the dedup loop above as dead code (the other
        # Speller variants in this file all hand back the deduped list).
        return (False, options2)