def dice_comparison(ref_word, word):
    """Three-way comparison of ``word`` against ``ref_word`` by Dice coefficient.

    Meant to be used as a comparison callback when sorting (per the
    original note on this function).

    :param ref_word: reference word passed first to ``Dice_coeff``.
    :param word: candidate word passed second to ``Dice_coeff``.
    :return: ``0`` when the coefficient equals 1, ``1`` when it is greater
        than 0.5, and ``-1`` otherwise.
    """
    val = Dice_coeff(ref_word, word)
    if val == 1:
        return 0
    # The original spelled this ``(2 * (val - 0.5) > 0) and 1 or -1``; that is
    # simply ``val > 0.5``, and a conditional expression avoids the and/or
    # trick, which silently breaks if the "true" value is ever falsy.
    return 1 if val > 0.5 else -1
def check_word_and_suggest(self, word, errmsg=None):
    """Spell-check one word and propose replacements when it is rejected.

    The word is stripped of surrounding whitespace and trailing punctuation,
    passed through the Tamil-mode shortcuts (numbers, dates, romanized
    text), looked up in the dictionary, and finally compared against
    candidates gathered from several generators. Candidates are ranked by a
    combined edit-distance / Dice score, best first.

    NOTE(review): the nesting of the date and transliteration checks under
    the Tamil-mode branch was reconstructed from flattened source; confirm
    against the upstream file.

    :param word: the token to check.
    :param errmsg: optional list handed to
        ``Typographical.checkFormErrors``; when that check reports a problem
        and the list is non-empty, ``"TypographicalError"`` is appended.
    :return: a ``(ok, suggestions)`` tuple. Rejections are
        ``(False, <list of suggestions>)``. Acceptances are
        ``(True, [word])`` for date-like tokens and ``(True, word)`` (a
        plain string, not a list) for dictionary and ottru-split hits.
    """
    word = word.strip()

    # Skip known punctuation at the end of the word.
    while len(word) >= 1 and any(map(word.endswith, Speller.punctuation)):
        word = word[:-1]
    # NOTE(review): after strip() above there is no leading whitespace left,
    # so this loop looks like a no-op -- kept for parity with the original.
    while len(word) >= 1 and any(map(word.startswith, string.whitespace)):
        word = word[1:]

    if self.in_tamil_mode():
        # A number is rejected in favour of its Tamil numeral form.
        numword = word.replace(u',', u'')
        if re.match(u'[+|-]*[\\d]+', numword):
            try:
                num = float(numword)
                posnum = num
                if num < 0:
                    posnum = -1 * num
                numeral_form = tamil.numeral.num2tamilstr(posnum)
                if num < 0:
                    numeral_form = u"கழித்தல் " + numeral_form
                return (False, [numeral_form])
            except Exception:
                # Deliberate best effort: the regex only anchors at the
                # start, so the token may not be a plain number after all
                # (or the numeral conversion may fail). Fall through to the
                # regular checks in that case.
                pass

        # Dates are okay: a leading number followed by one of these suffixes.
        if any(map(word.endswith,
                   [u"-இல்", u"-ஆம்", u"-இலிருந்து", u"-வரை"])):
            if re.search('^\\d+', word):
                return (True, [word])  # word is okay

        # Check if the word is written with ASCII letters (transliterated).
        if any(filter(lambda x: x in string.ascii_letters,
                      tamil.utf8.get_letters(word))):
            # letter-sequence only
            en_word = Speller.scrub_ws(word)
            EN_Lexicon = Speller.get_english_dictionary()
            if EN_Lexicon.isWord(en_word):
                # English word: no substitute yet, until parallel
                # dictionaries or translation are available. TBD.
                return (False, [''])

            # Otherwise treat it as romanized Tamil and transliterate it.
            ta = algorithm.Iterative.transliterate(
                jaffna.Transliteration.table, en_word)
            # TBD: a classifier could tell a pure English word from a
            # romanized Tamil word; its output would be useful here.
            return (False, [ta])

        # TBD: check if it matches a Tamil numeral and has a close match,
        # then propose suggestions from that list.

    # Hyphens are not okay: suggest the space-separated form.
    if word.find(u"-") >= 0:
        return (False, [word.replace(u"-", u" ")])

    # Keep the pre-digit-removal form; it is used for the mayangoli
    # suggestions and for normalising the score below.
    orig_word = u"%s" % word

    # remove digits
    word = re.sub(u'\\d+', u'', word)
    letters = tamil.utf8.get_letters(word)
    TVU_dict = self.get_lang_dictionary()
    self.add_numeral_words(TVU_dict)

    # Check if this 'word' is any common kind of typographical error.
    if Typographical.checkFormErrors(word, errmsg):
        # NOTE(review): an empty list is falsy, so the marker is only added
        # when errmsg already holds something -- confirm that
        # ``errmsg is not None`` was not intended.
        if errmsg:
            errmsg.append("TypographicalError")

    if not self.checklang(word):
        print("Word is not in desired language!")
        return (False, [u""])

    if len(word) < 1:
        print("Word is too small")
        return (False, [u''])

    # plain old dictionary + user dictionary check
    if self.isWord(word):
        return (True, word)

    # Remove case and redo the dictionary + user check.
    word_nocase = self.case_filter.apply(word)
    if self.isWord(word_nocase):
        return (True, word_nocase)
    word = word_nocase

    # Consider splitting the word and see if it has 2 sub-words
    # e.g. செயல்பட => செயல் + பட
    alt = tamil.wordutils.greedy_split(word, TVU_dict)
    greedy_results = list()
    if len(alt) >= 1:
        greedy_results = [u" ".join(alt), u"-".join(alt)]
        greedy_results.extend(alt)

    # If the deletion filter is the only source of suggestions so far,
    # return them directly; otherwise pool them with the greedy results.
    suggs = DeletionFilter.get_suggestions(letters, TVU_dict)
    if len(suggs) > 0:
        if len(greedy_results) == 0:
            return (False, suggs)
        else:
            greedy_results.extend(suggs)

    # Ottru splitting for Tamil language mode, so that words like
    # யாரிகழ்ந்து are accepted.
    ottru_options = []
    if self.in_tamil_mode():
        ottru = OttruSplit(word, letters)
        ottru.run(TVU_dict)
        if len(ottru.results) > 0:
            return (True, word)
        # Only reached with no results, so this is empty here.
        ottru_options = ottru.results

    # TODO: Noun Declension - ticket-

    # Candidate generators: edit-distance neighbours, combinagrams, and
    # dictionary words sharing all but the last letter as a prefix.
    norvig_suggests = filter(
        TVU_dict.isWord,
        norvig_suggestor(word, self.alphabets, 2, limit=25))
    combinagram_suggests = list(
        tamil.wordutils.combinagrams(word, TVU_dict, limit=25))
    pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))

    options = greedy_results
    options.extend(ottru_options)
    options.extend(list(norvig_suggests))
    options.extend(combinagram_suggests)
    options.extend(pfx_options)

    # Filter the options against the dictionary. A list comprehension gives
    # a real list on both Python 2 and 3 (``filter`` is lazy on Python 3).
    options = [opt for opt in options if TVU_dict.isWord(opt)]

    if self.in_tamil_mode():
        options.extend(self.mayangoli_suggestions(orig_word, letters))

    # Sort so that duplicates become adjacent for the de-duplication below.
    if not self.in_tamil_mode():
        options.sort()
    else:
        if PYTHON3:
            options = sorted(options, key=functools.cmp_to_key(
                tamil.utf8.compare_words_lexicographic))
        else:
            options = sorted(options,
                             cmp=tamil.utf8.compare_words_lexicographic)

    # For words longer than 3 letters, drop suggestions of 2 letters or less.
    WL = len(tamil.utf8.get_letters(word))
    if WL > 3:
        options = [opt for opt in options
                   if len(tamil.utf8.get_letters(opt)) > 2]

    # Remove adjacent duplicates (comparison is on the stripped form).
    options2 = []
    prev = None
    for val in options:
        stripped = val.strip()
        if stripped != prev:
            options2.append(stripped)
        prev = stripped
    if _DEBUG:
        print("@deduplication")
        pprint.pprint(options2)

    # Score by edit distance scaled by the Dice coefficient; the Dice term
    # is weighted down by a factor of 3.
    options_score = [
        (len(word) - edit_distance(word, sugg_word)) /
        (1.0 * len(orig_word)) * Dice_coeff(word, sugg_word) / 3.0
        for sugg_word in options2
    ]

    # Order the options by score, best first.
    ranked = sorted(zip(options2, options_score),
                    key=operator.itemgetter(1), reverse=True)
    options = [word_pair[0] for word_pair in ranked]
    if _DEBUG:
        pprint.pprint("@after scoring/sorting")
        pprint.pprint(options)

    # Eliminate single-letter options. Built as a list so the caller gets a
    # reusable sequence on Python 3 instead of a one-shot ``filter`` object.
    options = [opt for opt in options
               if opt not in tamil.utf8.tamil_letters]

    # Due to suggestion policy we may have words which are found in error
    # but we don't have replacements for them!
    # TBD: options should not have the 'word'!
    return (False, options)
def check_word_and_suggest(self, word):
    """Check ``word`` against the dictionary and rank replacement candidates.

    ASCII punctuation and digits are removed before the lookup. A word found
    in the dictionary (as given, or after the case filter) is accepted;
    otherwise up to 20 suggestions ordered by Dice coefficient are returned.

    :param word: the token to check.
    :return: ``(True, word)`` on a dictionary hit (``word`` is the cleaned
        or case-filtered string), else ``(False, suggestions)`` where
        ``suggestions`` is a list (``[u'']`` when the token is in the wrong
        language or empty after cleaning).
    """
    word = word.strip()
    # Drop every ASCII punctuation character, then every run of digits.
    for mark in string.punctuation:
        word = word.replace(mark, u"")
    word = re.sub(u'\d+', u'', word)

    letters = tamil.utf8.get_letters(word)
    TVU_dict = self.get_lang_dictionary()

    # Nothing useful to suggest for foreign or empty tokens.
    if not self.checklang(word):
        return (False, [u''])
    if len(word) < 1:
        return (False, [u''])

    # Dictionary + user dictionary lookup, first as-is, then case-filtered.
    if self.isWord(word):
        return (True, word)
    folded = self.case_filter.apply(word)
    if self.isWord(folded):
        return (True, folded)
    word = folded

    # Try splitting the word into dictionary sub-words,
    # e.g. செயல்பட => செயல் + பட
    pieces = tamil.wordutils.greedy_split(word, TVU_dict)
    if len(pieces) < 1:
        candidates = list()
    else:
        candidates = [u" ".join(pieces), u"-".join(pieces)]
        candidates.extend(pieces)

    # TODO: Noun Declension - ticket-

    # Other candidate sources: edit-distance neighbours, combinagrams and
    # dictionary words that share all but the last letter as a prefix.
    norvig_suggests = filter(
        TVU_dict.isWord,
        norvig_suggestor(word, self.alphabets, 2, limit=50))
    combinagram_suggests = list(
        tamil.wordutils.combinagrams(word, TVU_dict, limit=50))
    pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))

    # FIXME: score the options
    candidates.extend(norvig_suggests)
    candidates.extend(combinagram_suggests)
    candidates.extend(pfx_options)

    # Sort so that duplicates end up next to each other.
    if self.lang == u"en":
        candidates = sorted(candidates)
    elif PYTHON3:
        candidates = sorted(
            candidates,
            key=functools.cmp_to_key(tamil.utf8.compare_words_lexicographic))
    else:
        candidates = sorted(
            candidates, cmp=tamil.utf8.compare_words_lexicographic)

    # For words longer than 3 letters, drop suggestions of 2 letters or less.
    if len(tamil.utf8.get_letters(word)) > 3:
        candidates = filter(
            lambda x: len(tamil.utf8.get_letters(x)) > 2, candidates)

    # Collapse adjacent duplicates, comparing the stripped forms.
    unique = []
    for entry in candidates:
        cleaned = entry.strip()
        if not unique or unique[-1] != cleaned:
            unique.append(cleaned)

    # Rank by Dice coefficient, best first, and keep the top 20.
    scores = [Dice_coeff(word, suggestion) for suggestion in unique]
    ranked = sorted(zip(unique, scores),
                    key=operator.itemgetter(1), reverse=True)
    top = [pair[0] for pair in ranked][:20]
    return (False, top)