예제 #1
0
 def test_basic_case_(self):
     """A CaseFilter wrapping a plural remover must agree with the remover.

     Each sample word is stripped twice: directly through
     ``RemovePluralSuffix.removeSuffix`` and through a ``CaseFilter`` built
     around the same remover. The two outcomes are compared.
     """
     plural_remover = RemovePluralSuffix()
     case_filter = CaseFilter(plural_remover)
     expected = [u"பதிவி", u"கட்டளை", u"அவர்"]
     words_list = [u"பதிவில்", u"கட்டளைகள்", u"அவர்கள்"]
     # The expected stems are only paired with the inputs here; the
     # assertion compares the two stripping paths against each other.
     for sample, _stem in zip(words_list, expected):
         direct_result = plural_remover.removeSuffix(sample)
         filtered = case_filter.apply(sample)
         self.assertEqual(filtered, direct_result[0])
 def test_basic_case_(self):
     """Compare CaseFilter output with the first item of removeSuffix().

     For every sample word, the word produced by ``CaseFilter.apply`` must
     equal element 0 of what ``RemovePluralSuffix.removeSuffix`` returns.
     """
     remover = RemovePluralSuffix()
     wrapped = CaseFilter(remover)
     expected = [u"பதிவி", u"கட்டளை", u"அவர்"]
     words_list = [u"பதிவில்", u"கட்டளைகள்", u"அவர்கள்"]
     for pair in zip(words_list, expected):
         # pair[1] (the expected stem) is not used by the assertion.
         candidate = pair[0]
         via_remover = remover.removeSuffix(candidate)
         via_filter = wrapped.apply(candidate)
         self.assertEqual(via_filter, via_remover[0])
     return None
예제 #3
0
 def __init__(self, filename=None, lang="ta", mode="non-web"):
     """Set up the speller and, unless in web mode, start checking.

     :param filename: path of a file to spell-check; when falsy the
         interactive prompt is started instead.
     :param lang: language code; it is lower-cased and ``"en"`` selects
         the English alphabet for edit suggestions.
     :param mode: ``"web"`` only initialises state; any other value runs
         ``spellcheck`` or ``interactive`` from the constructor.
     """
     object.__init__(self)
     self.lang = lang.lower()
     self.filename = filename
     self.user_dict = set()
     self.case_filter = CaseFilter(RemovePluralSuffix(), RemoveVerbSuffixTense(), RemoveCaseSuffix(), RemovePrefix())
     # Only English gets an explicit alphabet; other languages pass None.
     self.alphabets = list(string.ascii_lowercase) if self.lang == u"en" else None

     if mode != "web":
         if self.filename:
             self.spellcheck(self.filename)
         else:
             self.interactive()
예제 #4
0
    def __init__(self, filename=None, lang="ta", mode="non-web"):
        """Initialise speller state; run a check unless ``mode`` is "web".

        :param filename: file to spell-check, or a falsy value to start the
            interactive prompt.
        :param lang: language code, stored lower-cased.
        :param mode: ``"web"`` skips the check that otherwise runs from the
            constructor.
        """
        object.__init__(self)
        self.lang = lang.lower()
        self.filename = filename
        self.user_dict = set()
        self.case_filter = CaseFilter(RemovePluralSuffix(), RemoveVerbSuffixTense(), RemoveCaseSuffix(), RemovePrefix())
        self.alphabets = None
        if self.lang == u"en":
            self.alphabets = list(string.ascii_lowercase)

        # Web callers drive the speller themselves.
        if mode == "web":
            return
        if self.filename:
            self.spellcheck(self.filename)
        else:
            self.interactive()
예제 #5
0
파일: spell.py 프로젝트: vmmlog/open-tamil
class Speller(object):
    TVU_dict = None
    ENL_dict = None
    punctuation = string.punctuation + '()[]{}'

    def __init__(self, filename=None, lang="ta", mode="non-web"):
        """Initialise speller state; run a check unless ``mode`` is "web".

        :param filename: file to spell-check, or a falsy value to start the
            interactive prompt.
        :param lang: language code, stored lower-cased; ``"en"`` selects
            English, anything else is handled as Tamil.
        :param mode: ``"web"`` skips the check that otherwise runs from the
            constructor.
        """
        object.__init__(self)
        self.lang = lang.lower()
        self.filename = filename
        self.user_dict = set()
        suffix_strippers = (RemovePluralSuffix(), RemoveVerbSuffixTense(),
                            RemoveCaseSuffix(), RemovePrefix())
        self.case_filter = CaseFilter(*suffix_strippers)
        # An explicit alphabet is supplied only outside Tamil mode.
        if self.in_tamil_mode():
            self.alphabets = None
        else:
            self.alphabets = list(string.ascii_lowercase)

        if mode != "web":
            if self.filename:
                self.spellcheck(self.filename)
            else:
                self.interactive()

    def in_tamil_mode(self):
        return self.lang != u"en"

    @staticmethod
    def get_dictionary():
        """Return the shared Tamil dictionary, building it on first use.

        The dictionary is cached on ``Speller.TVU_dict``. Construction is
        guarded by ``LoadDictionary.lock``, which is released even when the
        build raises, so a failed load cannot leave the lock held.
        """
        LoadDictionary.lock.acquire()
        try:
            if not Speller.TVU_dict:
                Speller.TVU_dict, _ = DictionaryBuilder.create(TamilVU)
        finally:
            LoadDictionary.lock.release()
        return Speller.TVU_dict

    @staticmethod
    def get_english_dictionary():
        """Return the shared English dictionary, building it on first use.

        The dictionary is cached on ``Speller.ENL_dict``. Construction is
        guarded by ``LoadDictionary.lock``, which is released even when the
        build raises, so a failed load cannot leave the lock held.
        """
        LoadDictionary.lock.acquire()
        try:
            if not Speller.ENL_dict:
                Speller.ENL_dict, _ = DictionaryBuilder.create(EnglishLinux)
        finally:
            LoadDictionary.lock.release()
        return Speller.ENL_dict

    def language(self):
        if self.in_tamil_mode():
            return "tamil"
        return "english"

    def checklang(self, word):
        if self.in_tamil_mode():
            return tamil.utf8.all_tamil(word)
        for w in word.lower():
            if not (w in string.ascii_lowercase):
                return False
        return True

    # full-text interface driver for unittest @ Dec 10, 2017
    def noninteractive_spellcheck(self, text):
        nwords = 0
        npass = 0
        nfail = 0
        fail_n_suggs = dict()
        for word in re.split('\s+', text):
            if len(word) < 1:
                continue
            nwords += 1
            result, suggs = self.REST_interface(word)
            nfail += int(not result)
            npass += int(result)
            if not result:
                fail_n_suggs[word] = suggs
        obj = {
            'total': nwords,
            'correct_words': npass,
            'wrong_words': nfail,
            'word_suggestions': fail_n_suggs
        }
        return obj

    # Ref: https://www.tinymce.com/docs/plugins/spellchecker/
    def REST_interface(self, word):
        """Check one word for the web (TinyMCE spellchecker) front end.

        :returns: ``(ok, suggestions)``; an empty dict replaces the
            suggestions when the word is accepted.
        """
        ok, suggs = self.check_word_and_suggest(word)
        if _DEBUG:
            print("REST => %d" % ok)
            pprint.pprint(suggs)
        payload = {} if ok else suggs
        return ok, payload

    @staticmethod
    def dice_comparison(ref_word, word):
        """Three-way comparator built on the Dice coefficient of two words.

        :returns: 0 when the coefficient equals 1, 1 when it is above 0.5,
            and -1 otherwise.
        """
        val = Dice_coeff(ref_word, word)
        if val == 1:
            return 0
        if 2 * (val - 0.5) > 0:
            return 1
        return -1

    def suggestion_policy(self, word, suggs):
        """Narrow and order suggestions for ``word``.

        Suggestions whose Tamil letter count lies between
        ``max(len - 2, 1)`` and ``len + 1`` (``len`` being the letter count
        of ``word``) are kept, de-duplicated, and sorted by Dice coefficient
        against ``word``, best match first. When nothing survives the length
        filter, the first ten suggestions in lexicographic order are returned
        instead.

        :param word: the word being corrected.
        :param suggs: iterable of candidate replacements.
        :returns: list of suggestions.
        """
        # Materialise first: the caller may hand over a one-shot iterator,
        # and the fallback path below needs to walk the candidates again.
        suggs = list(suggs)
        tamil_length = lambda w: len(tamil.utf8.get_letters(w))
        ref_wl = tamil_length(word)
        min_len, max_len = max(ref_wl - 2, 1), ref_wl + 1
        # Both bounds are measured in letters, not code points.
        filter_suggs = set(
            w for w in suggs if min_len <= tamil_length(w) <= max_len)
        if len(filter_suggs) == 0:
            # guess!
            if PYTHON3:
                fallback = sorted(suggs,
                                  key=functools.cmp_to_key(
                                      tamil.utf8.compare_words_lexicographic))
            else:
                fallback = sorted(suggs,
                                  cmp=tamil.utf8.compare_words_lexicographic)
            return fallback[:10]
        # Rank by similarity to the reference word; sorted(cmp=...) is not
        # available on Python 3, so a key function is used.
        return sorted(filter_suggs,
                      key=lambda w: Dice_coeff(word, w),
                      reverse=True)

    def str_suggestions(self, word):
        if self.in_tamil_mode():
            return u"சொல் \"%s\" மாற்றங்கள்" % word
        return u"SUGGESTIONS for \"%s\"" % word

    def mayangoli_suggestions(self, word, letters):
        """Return the alternates produced by ``Mayangoli.run``, minus ``word``.

        The result is always a fresh list. Building it eagerly keeps the
        debug dump below from exhausting a lazy ``filter`` object on
        Python 3 before it is returned.

        :param word: the word being corrected.
        :param letters: the letters of ``word``, passed to ``Mayangoli.run``.
        """
        alternates = [w for w in Mayangoli.run(word, letters) if w != word]
        if _DEBUG:
            for idx, w in enumerate(alternates):
                pprint.pprint(["Myangoli", idx, w])
        return alternates

    def interactive(self):
        """Run a read-check-print loop on standard input.

        Each line is stripped of all whitespace and checked as one word.
        Accepted words print an OK message; rejected words print numbered
        suggestions. The loop ends on Ctrl-C or end of input, after which a
        farewell is printed.
        """
        words_per_row = 4
        try:
            while True:
                if PYTHON3:
                    word = input(u">> ")
                else:
                    word = raw_input(u">> ")
                    word = word.decode("utf-8").strip()
                word = re.sub(u"\s+", "", word)

                # skip empty words
                if len(word) < 1:
                    continue

                if not self.checklang(word):
                    print(u"EXCEPTION \"%s\" is not a %s Word" %
                          (word, self.language()))
                    continue
                ok, suggs = self.check_word_and_suggest(word)
                if ok:
                    print(u"சரி" if self.in_tamil_mode() else u"OK")
                    continue
                # Only rejected words carry a suggestion list worth ranking;
                # for accepted words the second element is the word itself.
                suggs = self.suggestion_policy(word, suggs)
                option_str = u", ".join([
                    u"(%d) %s" % (itr, wrd) +
                    (u"\n" if (itr > 0 and itr % words_per_row == 0) else u"")
                    for itr, wrd in enumerate(suggs)
                ])
                print(u"%s\n\t %s" % (self.str_suggestions(word), option_str))
        except (KeyboardInterrupt, EOFError):
            # Ctrl-C / end of input end the session quietly.
            pass
        finally:
            print(u"\nவணக்கம்!" if self.in_tamil_mode() else "\nBYE!")
        return

    def spellcheck(self, filename):
        """Interactively spell-check the UTF-8 text file ``filename``.

        For every rejected word the suggestions are listed and the user is
        asked for a choice: ``-1`` keeps the word and adds it to the user
        dictionary, any other number picks that suggestion. If the answer
        cannot be used, the first suggestion is applied (or the word is kept
        when there are no suggestions). The corrected document is printed at
        the end.

        :param filename: path of the file to check.
        """
        new_document = []
        # Read everything up front so the file is closed before prompting.
        with codecs.open(filename, u"r", u"utf-8") as data:
            lines = data.readlines()
        for line in lines:
            words = tamil.utf8.get_words(tamil.utf8.get_letters(line))
            for word in words:
                # FIXME : handle punctuation
                ok, suggs = self.check_word_and_suggest(word)
                if ok:
                    new_document.append(word)
                    continue

                # Suggestions may arrive as a lazy iterable; index them safely.
                suggs = list(suggs)
                # Default replacement; fall back to the word itself when
                # there is nothing to suggest.
                option = suggs[0] if suggs else word
                # take user input.
                # FIXME: User options to include DONTREPLACE/KEEP, DELETE WORD, etc.
                option_str = u", ".join([
                    u"(%d) %s" % (itr, wrd) for itr, wrd in enumerate(suggs)
                ])
                if self.in_tamil_mode():
                    print(u"வரி \"%s\"" % line.strip())
                    print(u"'%s' சொல்லை கொண்டு\n\t சொல்லை '%s' மாற்றிடு\n" %
                          (option_str, word))
                else:
                    print(u"Line, \"%s\"" % line.strip())
                    print(u" Replace word %s with\n\t => %s\n" %
                          (word, option_str))
                try:
                    if self.in_tamil_mode():
                        choice_str = "விருப்பம் [-1 புறக்கணி, 0-%d மாற்றவும்]:"
                    else:
                        choice_str = u"option [-1 ignore, 0-%d replace]: "
                    prompt = choice_str % (len(suggs) - 1)
                    # Read text and convert explicitly: Python 2's input()
                    # would evaluate whatever the user typed.
                    if PYTHON3:
                        raw_choice = input(prompt)
                    else:
                        raw_choice = raw_input(prompt)
                    choice = int(raw_choice)
                    if choice == -1:
                        if self.in_tamil_mode():
                            print(u"வார்த்தை மாறாத இருந்தது")
                        else:
                            print(u"Not replacing word")

                        option = word
                        self.user_dict.add(word)
                    else:
                        option = suggs[choice]
                except Exception as ie:
                    # Best effort: report the problem and keep the default.
                    print(str(ie))
                if self.in_tamil_mode():
                    replace_msg = u"வார்த்தை %s -> %s இதற்காக மாற்றவும்\n"
                else:
                    replace_msg = u" replacing word %s -> %s\n"
                print(replace_msg % (word, option))
                # u"%s" % x yields text on both Python 2 and 3.
                new_document.append(u"%s" % option)
            new_document.append(u"\n")
        if self.in_tamil_mode():
            print(
                u"*********** ஆவணத்தில் உள்ள பிழைகளை திருத்திய பின் *********")
        else:
            print(u"*********** cleaned up document **********")
        print(u" ".join(new_document))

    def get_lang_dictionary(self):
        """Return the shared dictionary that matches the active language."""
        if self.in_tamil_mode():
            return Speller.get_dictionary()
        return Speller.get_english_dictionary()

    def isWord(self, word):
        # Plain old dictionary checks
        LANG_dict = self.get_lang_dictionary()
        is_dict_word = LANG_dict.isWord(word)
        in_user_dict = word in self.user_dict or is_dict_word
        return in_user_dict

    def add_numeral_words(self, lexicon):
        if not self.in_tamil_mode():
            return

        units = (u'பூஜ்ஜியம்', u'ஒன்று', u'இரண்டு', u'மூன்று', u'நான்கு',
                 u'ஐந்து', u'ஆறு', u'ஏழு', u'எட்டு', u'ஒன்பது', u'பத்து'
                 )  # 0-10
        teens = (u'பதினொன்று', u' பனிரண்டு', u'பதிமூன்று', u'பதினான்கு',
                 u'பதினைந்து', u'பதினாறு', u'பதினேழு', u'பதினெட்டு',
                 u'பத்தொன்பது')  # 11-19
        tens = (u'பத்து', u'இருபது', u'முப்பது', u'நாற்பது', u'ஐம்பது',
                u'அறுபது', u'எழுபது', u'எண்பது', u'தொன்னூறு')  # 10-90
        tens_suffix = (u'இருபத்து', u'முப்பத்து', u'நாற்பத்து', u'ஐம்பத்து',
                       u'அறுபத்து', u'எழுபத்து', u'எண்பத்து', u'தொன்னூத்து'
                       )  # 10+-90+
        hundreds = (u'நூறு', u'இருநூறு', u'முந்நூறு', u'நாநூறு', u'ஐநூறு',
                    u'அறுநூறு', u'எழுநூறு', u'எண்ணூறு', u'தொள்ளாயிரம்'
                    )  #100 - 900
        hundreds_suffix = (u'நூற்றி', u'இருநூற்றி', u'முந்நூற்று', u'நாநூற்று',
                           u'ஐநூற்று', u'அறுநூற்று', u'எழுநூற்று',
                           u'எண்ணூற்று', u'தொள்ளாயிரத்து')  #100+ - 900+
        one_thousand_prefix = (u'ஓர்', )
        thousands = (u'ஆயிரம்', u'ஆயிரத்தி')

        one_prefix = (u'ஒரு', )
        lakh = (u'இலட்சம்', u'இலட்சத்து')
        crore = (u'கோடி', u'கோடியே')

        mil = (u'மில்லியன்', )
        bil = (u'பில்லியன்', )
        tril = (u'டிரில்லியன்', )

        if lexicon.isWord(tril[0]):
            return

        numerals = list()
        for wordset in [
                units, tens, teens, tens_suffix, hundreds, hundreds_suffix,
                one_thousand_prefix, thousands, one_prefix, lakh, crore, mil,
                bil, tril
        ]:
            numerals.extend(wordset)
        #with codecs.open("numerals.json","w","utf-8") as fp:
        #    fp.write(json.dumps(numerals))
        for word in numerals:
            lexicon.add(word)

    @staticmethod
    def scrub_ws(word):
        return re.sub(u'[\s{}()\[\]]+', u'', word)

    def check_word_and_suggest(self, word, errmsg=None):
        """Check one word and, when it is rejected, propose replacements.

        :param word: the token to check; surrounding whitespace and trailing
            punctuation are removed first.
        :param errmsg: optional list handed to
            ``Typographical.checkFormErrors``; "TypographicalError" is
            appended when a form error is reported and the list is non-empty.
        :returns: ``(ok, result)``. For accepted words ``result`` is the
            accepted word (a one-item list for dates). For rejected words it
            is a list of suggestions, which may be empty or ``[u'']``.
        """
        word = word.strip()
        # skip known punctuation at end of line
        while len(word) >= 1 and any(map(word.endswith, Speller.punctuation)):
            word = word[:-1]
        while len(word) >= 1 and any(map(word.startswith, string.whitespace)):
            word = word[1:]

        # is number then we propose a numeral
        if self.in_tamil_mode():
            numword = word.replace(u',', u'')
            if re.match(u'[+|-]*[\d]+', numword):
                try:
                    num = float(numword)
                    posnum = num
                    if num < 0:
                        posnum = -1 * num
                    numeral_form = tamil.numeral.num2tamilstr(posnum)
                    if num < 0:
                        numeral_form = u"கழித்தல் " + numeral_form
                    return (False, [numeral_form])
                except Exception as ioe:
                    # Not a plain number after all; continue with the
                    # remaining checks.
                    pass

            # dates are okay
            if any(
                    map(word.endswith,
                        [u"-இல்", u"-ஆம்", u"-இலிருந்து", u"-வரை"])):
                if re.search('^\d+', word):
                    return (True, [word])  #word is okay

            # check if words are transliterated
            if any(
                    filter(lambda x: x in string.ascii_letters,
                           tamil.utf8.get_letters(word))):
                # letter-sequence only
                en_word = Speller.scrub_ws(word)
                EN_Lexicon = Speller.get_english_dictionary()
                if EN_Lexicon.isWord(en_word):
                    return (
                        False, ['']
                    )  #English word - nosub- yet until we have parallel dictionaries or translation. TBD.

                #is english letter
                ta = algorithm.Iterative.transliterate(
                    jaffna.Transliteration.table, en_word)
                # TBD: potential for having ANN to tell if english text is pure English word
                # or a romanized Tamil word. Output of classifier can be useful here.
                return (False, [ta])

            # check if it matches Tamil numeral and has close match.
            # propose suggestions from that list.
            # TBD

        # hyphens are not okay
        if word.find(u"-") >= 0:
            return (False, [word.replace(u"-",
                                         u" ")])  #re.sub(u"^w"," ",word))
        # replace other spurious ()[] punctuations by concatenation
        orig_word = u"%s" % word

        # remove digits
        word = re.sub(u'\d+', u'', word)
        letters = tamil.utf8.get_letters(word)
        TVU_dict = self.get_lang_dictionary()
        self.add_numeral_words(TVU_dict)

        # Check if this 'word' is any common kind of error
        if Typographical.checkFormErrors(word, errmsg):
            # NOTE(review): an empty errmsg list is falsy, so nothing is
            # appended to it here -- confirm whether that is intended.
            if errmsg: errmsg.append("TypographicalError")

        if not self.checklang(word):
            print("Word is not in desired language!")
            return (False, [u""])

        if len(word) < 1:
            print("Word is too small")
            return (False, [u''])

        # plain old dictionary + user dictionary check
        if self.isWord(word):
            return (True, word)

        # Remove case and redo the dictionary + user check
        word_nocase = self.case_filter.apply(word)
        if (self.isWord(word_nocase)):
            return (True, word_nocase)
        else:
            word = word_nocase

        # Consider splitting the word and see if it has 2 sub-words
        # e.g. செயல்பட => செயல் + பட
        alt = tamil.wordutils.greedy_split(word, TVU_dict)
        greedy_results = list()
        if len(alt) >= 1:
            greedy_results = [u" ".join(alt), u"-".join(alt)]
            greedy_results.extend(alt)

        # if there are no other suggestions than deletion filter, we return
        # in presence of other suggestions we can just return suggestions
        suggs = DeletionFilter.get_suggestions(letters, TVU_dict)
        if len(suggs) > 0:
            if len(greedy_results) == 0:
                return (False, suggs)
            else:
                greedy_results.extend(suggs)

        # ottru splitting for Tamil language mode
        ottru_options = []
        if self.in_tamil_mode():
            # discover words like யாரிகழ்ந்து are accepted.
            ottru = OttruSplit(word, letters)
            ottru.run(TVU_dict)
            if len(ottru.results) > 0:
                return (True, word)
            # Reached only with an empty result list.
            ottru_options = ottru.results

        # TODO: Noun Declension - ticket-

        # suggestions at edit distance 1
        norvig_suggests = filter(
            TVU_dict.isWord, norvig_suggestor(word,
                                              self.alphabets,
                                              2,
                                              limit=25))
        combinagram_suggests = list(
            tamil.wordutils.combinagrams(word, TVU_dict, limit=25))
        pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))

        # FIXME: score  the options
        options = greedy_results
        options.extend(ottru_options)
        options.extend(list(norvig_suggests))
        options.extend(combinagram_suggests)
        options.extend(pfx_options)

        # filter the options against a dictionary!
        options = filter(TVU_dict.isWord, options)
        if PYTHON3:
            options = list(options)

        if self.in_tamil_mode():
            options.extend(self.mayangoli_suggestions(orig_word, letters))

        # sort the options
        if not self.in_tamil_mode():
            options.sort()
        else:
            if PYTHON3:
                options = sorted(options,
                                 key=functools.cmp_to_key(
                                     tamil.utf8.compare_words_lexicographic))
            else:
                options = sorted(options,
                                 cmp=tamil.utf8.compare_words_lexicographic)

        # remove replacements with single-letter words
        WL = len(tamil.utf8.get_letters(word))
        if WL > 3:
            options = [
                x for x in options if len(tamil.utf8.get_letters(x)) > 2
            ]

        # remove dupes in list (adjacent ones; the list is sorted)
        options2 = []
        prev = None
        for val in options:
            if val.strip() != prev:
                options2.append(val.strip())
            prev = val.strip()
        del options
        if _DEBUG:
            print("@deduplication")
            pprint.pprint(options2)

        # score by Dice or Edit-Distance coefficients
        options_score = [0.0 for i in range(len(options2))]
        for itr, sugg_word in enumerate(options2):
            options_score[itr] = (len(word) - edit_distance(
                word, sugg_word)) / (1.0 * len(orig_word)) * Dice_coeff(
                    word, sugg_word) / 3.0  #dice coeff is weighted down
        options = zip(options2, options_score)

        # limit options by score
        options = sorted(options, key=operator.itemgetter(1), reverse=True)
        options = [word_pair[0] for word_pair in options]
        if _DEBUG:
            pprint.pprint("@after scoring/sorting")
            pprint.pprint(options)

        # eliminate single letter options; build a real list so callers can
        # index, measure and serialize it on Python 3 as well
        options = [x for x in options if not (x in tamil.utf8.tamil_letters)]

        # Due to suggestion policy we may have words which are found in error but we dont have
        # replacements for them!

        # TBD: options should not have the 'word'!
        return (False, options)
예제 #6
0
class Speller(object):
    TVU_dict = None
    ENL_dict = None
    def __init__(self, filename=None, lang="ta", mode="non-web"):
        """Initialise speller state; run a check unless ``mode`` is "web".

        :param filename: file to spell-check, or a falsy value to start the
            interactive prompt.
        :param lang: language code, stored lower-cased.
        :param mode: ``"web"`` skips the check that otherwise runs from the
            constructor.
        """
        object.__init__(self)
        self.lang = lang.lower()
        self.filename = filename
        self.user_dict = set()
        self.case_filter = CaseFilter(RemovePluralSuffix(), RemoveVerbSuffixTense(), RemoveCaseSuffix(), RemovePrefix())
        # Only English gets an explicit alphabet.
        self.alphabets = list(string.ascii_lowercase) if self.lang == u"en" else None

        if mode != "web":
            if self.filename:
                self.spellcheck(self.filename)
            else:
                self.interactive()

    def in_tamil_mode(self):
        return self.lang != u"en"
    
    @staticmethod
    def get_dictionary():
        """Return the shared Tamil dictionary, building it on first use.

        The dictionary is cached on ``Speller.TVU_dict``. Construction is
        guarded by ``LoadDictionary.lock``, which is released even when the
        build raises, so a failed load cannot leave the lock held.
        """
        LoadDictionary.lock.acquire()
        try:
            if not Speller.TVU_dict:
                Speller.TVU_dict, _ = DictionaryBuilder.create(TamilVU)
        finally:
            LoadDictionary.lock.release()
        return Speller.TVU_dict
    
    @staticmethod
    def get_english_dictionary():
        """Return the shared English dictionary, building it on first use.

        The dictionary is cached on ``Speller.ENL_dict``. Construction is
        guarded by ``LoadDictionary.lock``, which is released even when the
        build raises, so a failed load cannot leave the lock held.
        """
        LoadDictionary.lock.acquire()
        try:
            if not Speller.ENL_dict:
                Speller.ENL_dict, _ = DictionaryBuilder.create(EnglishLinux)
        finally:
            LoadDictionary.lock.release()
        return Speller.ENL_dict
    
    def language(self):
        if self.lang == "ta":
            return "tamil"
        return "english"
        
    def checklang(self,word):
        if self.lang == "ta":
            return tamil.utf8.all_tamil(word)
        for w in word.lower():
            if not ( w in string.ascii_lowercase ):
                return False
        return True
    
    # Ref: https://www.tinymce.com/docs/plugins/spellchecker/
    def REST_interface(self,word):
        # returns JSON data in TinyMCE format
        ok,suggs = self.check_word_and_suggest( word )
        if ok:
            return ok, {}
        return ok, suggs
        
    @staticmethod
    def dice_comparison(ref_word, word):
        """Three-way comparator built on the Dice coefficient of two words.

        :returns: 0 when the coefficient equals 1, 1 when it is above 0.5,
            and -1 otherwise.
        """
        val = Dice_coeff(ref_word, word)
        if val == 1:
            return 0
        if 2 * (val - 0.5) > 0:
            return 1
        return -1
        
    def suggestion_policy(self, word, suggs):
        """Narrow and order suggestions for ``word``.

        Suggestions whose Tamil letter count lies between
        ``max(len - 2, 1)`` and ``len + 1`` (``len`` being the letter count
        of ``word``) are kept, de-duplicated, and sorted by Dice coefficient
        against ``word``, best match first. When nothing survives the length
        filter, the first ten suggestions in lexicographic order are returned
        instead.

        :param word: the word being corrected.
        :param suggs: iterable of candidate replacements.
        :returns: list of suggestions.
        """
        # Materialise first: the fallback path walks the candidates again.
        suggs = list(suggs)
        tamil_length = lambda w: len(tamil.utf8.get_letters(w))
        ref_wl = tamil_length(word)
        min_len, max_len = max(ref_wl - 2, 1), ref_wl + 1
        # Both bounds are measured in letters, not code points.
        filter_suggs = set(
            w for w in suggs if min_len <= tamil_length(w) <= max_len)
        if len(filter_suggs) == 0:
            # guess!
            if PYTHON3:
                fallback = sorted(suggs,
                                  key=functools.cmp_to_key(
                                      tamil.utf8.compare_words_lexicographic))
            else:
                fallback = sorted(suggs,
                                  cmp=tamil.utf8.compare_words_lexicographic)
            return fallback[:10]
        # Rank by similarity to the reference word; sorted(cmp=...) is not
        # available on Python 3, so a key function is used.
        return sorted(filter_suggs,
                      key=lambda w: Dice_coeff(word, w),
                      reverse=True)
    
    def str_suggestions(self,word):
        if self.in_tamil_mode():
            return u"சொல் \"%s\" மாற்றங்கள்"%word
        return u"SUGGESTIONS for \"%s\""%word
    
    def interactive(self):
        """Run a read-check-print loop on standard input.

        Each line is stripped of all whitespace and checked as one word.
        Accepted words print an OK message; rejected words print numbered
        suggestions. The loop ends on Ctrl-C or end of input, after which a
        farewell is printed.
        """
        words_per_row = 4
        try:
            while True:
                if PYTHON3:
                    word = input(u">> ")
                else:
                    word = raw_input(u">> ")
                    word = word.decode("utf-8").strip()
                word = re.sub(u"\s+", "", word)

                # skip empty words
                if len(word) < 1:
                    continue

                if not self.checklang(word):
                    print(u"EXCEPTION \"%s\" is not a %s Word" % (word, self.language()))
                    continue
                ok, suggs = self.check_word_and_suggest(word)
                if ok:
                    print(u"சரி" if self.in_tamil_mode() else u"OK")
                    continue
                # Only rejected words carry a suggestion list worth ranking;
                # for accepted words the second element is the word itself.
                suggs = self.suggestion_policy(word, suggs)
                option_str = u", ".join([
                    u"(%d) %s" % (itr, wrd) +
                    (u"\n" if (itr > 0 and itr % words_per_row == 0) else u"")
                    for itr, wrd in enumerate(suggs)
                ])
                print(u"%s\n\t %s" % (self.str_suggestions(word), option_str))
        except (KeyboardInterrupt, EOFError):
            # Ctrl-C / end of input end the session quietly.
            pass
        finally:
            print(u"\nவணக்கம்!" if self.in_tamil_mode() else "\nBYE!")
        return
    
    def spellcheck(self, filename):
        """Interactively spell-check the UTF-8 text file ``filename``.

        For every rejected word the suggestions are listed and the user is
        asked for a choice: ``-1`` keeps the word and adds it to the user
        dictionary, any other number picks that suggestion. If the answer
        cannot be used, the first suggestion is applied (or the word is kept
        when there are no suggestions). The corrected document is printed at
        the end.

        :param filename: path of the file to check.
        """
        new_document = []
        # Read everything up front so the file is closed before prompting.
        with codecs.open(filename, u"r", u"utf-8") as data:
            lines = data.readlines()
        for line in lines:
            words = tamil.utf8.get_words(tamil.utf8.get_letters(line))
            for word in words:
                # FIXME : handle punctuation
                ok, suggs = self.check_word_and_suggest(word)
                if ok:
                    new_document.append(word)
                    continue

                suggs = list(suggs)
                # Default replacement; fall back to the word itself when
                # there is nothing to suggest.
                option = suggs[0] if suggs else word
                # take user input.
                # FIXME: User options to include DONTREPLACE/KEEP, DELETE WORD, etc.
                option_str = u", ".join([u"(%d) %s" % (itr, wrd) for itr, wrd in enumerate(suggs)])
                print(u"In line, \"%s\"" % line.strip())
                print(u" Replace word %s with\n\t => %s\n" % (word, option_str))
                try:
                    prompt = u"option [-1 ignore, 0-%d replace]: " % (len(suggs) - 1)
                    # Read text and convert explicitly: Python 2's input()
                    # would evaluate whatever the user typed.
                    if PYTHON3:
                        raw_choice = input(prompt)
                    else:
                        raw_choice = raw_input(prompt)
                    choice = int(raw_choice)
                    if choice == -1:
                        print(u"Not replacing word")
                        option = word
                        self.user_dict.add(word)
                    else:
                        option = suggs[choice]
                except Exception as ie:
                    # Best effort: report the problem and keep the default.
                    print(str(ie))
                print(u" replacing word %s -> %s\n" % (word, option))
                # u"%s" % x yields text on both Python 2 and 3.
                new_document.append(u"%s" % option)
            new_document.append(u"\n")
        print(u"*********** cleaned up document **********")
        print(u" ".join(new_document))
        
    def get_lang_dictionary(self):
        """Return the English dictionary for "en", the Tamil one otherwise."""
        wants_english = (self.lang == u"en")
        if not wants_english:
            return Speller.get_dictionary()
        return Speller.get_english_dictionary()
     
    def isWord(self, word):
        # Plain old dictionary checks
        LANG_dict = self.get_lang_dictionary()
        is_dict_word = LANG_dict.isWord(word)
        
        in_user_dict = word in self.user_dict or is_dict_word
        return in_user_dict
        
    def check_word_and_suggest(self, word):
        """Check one word and, when it is rejected, propose replacements.

        Punctuation and digits are removed before checking.

        :returns: ``(True, accepted_word)`` when the word, or its form after
            the case filter, is known; otherwise ``(False, suggestions)``
            with at most 20 suggestions ordered by Dice coefficient.
        """
        word = word.strip()
        # remove punctuation
        for punct in string.punctuation:
            word = word.replace(punct, u"")
        # remove digits
        word = re.sub(u'\d+', u'', word)
        letters = tamil.utf8.get_letters(word)
        TVU_dict = self.get_lang_dictionary()

        # Wrong language or nothing left to check: no usable suggestion.
        if not self.checklang(word) or len(word) < 1:
            return (False, [u''])

        # plain old dictionary + user dictionary check
        if self.isWord(word):
            return (True, word)

        # Remove case and redo the dictionary + user check
        word_nocase = self.case_filter.apply(word)
        if self.isWord(word_nocase):
            return (True, word_nocase)
        word = word_nocase

        # Consider splitting the word and see if it has 2 sub-words
        # e.g. செயல்பட => செயல் + பட
        alt = tamil.wordutils.greedy_split(word, TVU_dict)
        options = list()
        if len(alt) >= 1:
            options = [u" ".join(alt), u"-".join(alt)]
            options.extend(alt)

        # TODO: Noun Declension - ticket-

        # suggestions at edit distance 1
        norvig_suggests = filter(TVU_dict.isWord, norvig_suggestor(word, self.alphabets, 2, limit=50))
        combinagram_suggests = list(tamil.wordutils.combinagrams(word, TVU_dict, limit=50))
        pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))

        # FIXME: score  the options
        for extra in (list(norvig_suggests), combinagram_suggests, pfx_options):
            options.extend(extra)

        # sort the options
        if self.lang == u"en":
            options.sort()
        elif PYTHON3:
            options = sorted(options, key=functools.cmp_to_key(tamil.utf8.compare_words_lexicographic))
        else:
            options = sorted(options, cmp=tamil.utf8.compare_words_lexicographic)

        # remove replacements with single-letter words
        WL = len(tamil.utf8.get_letters(word))
        if WL > 3:
            options = [x for x in options if len(tamil.utf8.get_letters(x)) > 2]

        # remove dupes in list (adjacent ones; the list is sorted)
        options2 = []
        prev = None
        for val in options:
            stripped = val.strip()
            if stripped != prev:
                options2.append(stripped)
            prev = stripped
        del options

        # score by Dice coefficients
        options_score = [Dice_coeff(word, sugg_word) for sugg_word in options2]

        # limit options by score
        ranked = sorted(zip(options2, options_score), key=operator.itemgetter(1), reverse=True)
        options = [word_pair[0] for word_pair in ranked]

        # limit to first top -L=20 only which is good enough
        L = 20
        return (False, options[:L])
예제 #7
0
class Speller(object):
    TVU_dict = None
    ENL_dict = None

    def __init__(self, filename=None, lang="ta", mode="non-web"):
        """Configure the speller and, unless in web mode, start checking at once.

        :param filename: path of a document to spell check; when falsy the
            interactive prompt is started instead.
        :param lang: language code, stored lower-cased. ``"en"`` selects the
            ASCII lower-case alphabet; any other value leaves it unset.
        :param mode: ``"web"`` only builds the object; any other value runs
            :meth:`spellcheck` or :meth:`interactive` from the constructor.
        """
        object.__init__(self)
        self.lang = lang.lower()
        self.filename = filename
        self.user_dict = set()
        self.case_filter = CaseFilter(
            RemovePluralSuffix(),
            RemoveVerbSuffixTense(),
            RemoveCaseSuffix(),
            RemovePrefix(),
        )
        self.alphabets = list(string.ascii_lowercase) if self.lang == u"en" else None

        # web callers drive the object themselves
        if mode == "web":
            return
        if self.filename:
            self.spellcheck(self.filename)
        else:
            self.interactive()

    @staticmethod
    def get_dictionary():
        """Return the shared dictionary built from ``TamilVU``.

        The dictionary is created on first use and cached on
        ``Speller.TVU_dict``; creation is guarded by ``LoadDictionary.lock``.
        """
        LoadDictionary.lock.acquire()
        try:
            if not Speller.TVU_dict:
                Speller.TVU_dict, _ = DictionaryBuilder.create(TamilVU)
        finally:
            # release even when building the dictionary raises, otherwise
            # every later caller would block on the lock forever
            LoadDictionary.lock.release()
        return Speller.TVU_dict

    @staticmethod
    def get_english_dictionary():
        """Return the shared dictionary built from ``EnglishLinux``.

        The dictionary is created on first use and cached on
        ``Speller.ENL_dict``; creation is guarded by ``LoadDictionary.lock``.
        """
        LoadDictionary.lock.acquire()
        try:
            if not Speller.ENL_dict:
                Speller.ENL_dict, _ = DictionaryBuilder.create(EnglishLinux)
        finally:
            # release even when building the dictionary raises, otherwise
            # every later caller would block on the lock forever
            LoadDictionary.lock.release()
        return Speller.ENL_dict

    def language(self):
        if self.lang == "ta":
            return "tamil"
        return "english"

    def checklang(self, word):
        if self.lang == "ta":
            return tamil.utf8.all_tamil(word)
        for w in word.lower():
            if not (w in string.ascii_lowercase):
                return False
        return True

    # Ref: https://www.tinymce.com/docs/plugins/spellchecker/
    def REST_interface(self, word):
        # returns JSON data in TinyMCE format
        ok, suggs = self.check_word_and_suggest(word)
        if ok:
            return ok, {}
        return ok, suggs

    def interactive(self):
        """Read words from standard input and report on each until EOF or Ctrl-C.

        Accepted words print ``OK``; rejected ones print their numbered
        suggestions. ``BYE!`` is printed when the loop ends.
        """
        try:
            while True:
                if PYTHON3:
                    word = input(u">> ")
                else:
                    word = raw_input(u">> ")
                    word = word.decode("utf-8").strip()
                # raw string: "\s" is not a valid escape in an ordinary literal
                word = re.sub(r"\s+", u"", word)
                if not word:
                    # blank input: prompt again rather than "checking" nothing
                    continue
                if not self.checklang(word):
                    print(u'EXCEPTION "%s" is not a %s Word' % (word, self.language()))
                    continue
                ok, suggs = self.check_word_and_suggest(word)
                if ok:
                    print(u"OK")
                    continue
                option_str = u", ".join(u"(%d) %s" % (itr, wrd) for itr, wrd in enumerate(suggs))
                print(u'SUGGESTIONS for "%s"\n\t %s' % (word, option_str))
        except (KeyboardInterrupt, EOFError):
            # both simply end the session
            pass
        finally:
            print("\nBYE!")
        return

    def spellcheck(self, filename):
        """Interactively spell check the UTF-8 text file ``filename``.

        Every rejected word is shown with numbered suggestions. The user
        enters the number of the replacement, or -1 to keep the word (which
        also adds it to the user dictionary). An unusable answer is reported
        and the first suggestion is used. The resulting text is printed once
        the whole file has been processed.
        """
        new_document = []
        # the context manager closes the file even when reading fails
        with codecs.open(filename, u"r", u"utf-8") as data:
            lines = data.readlines()
        for line in lines:
            words = tamil.utf8.get_words(tamil.utf8.get_letters(line))
            for word in words:
                # FIXME : handle punctuation
                ok, suggs = self.check_word_and_suggest(word)
                if ok:
                    new_document.append(word)
                    continue
                suggs = list(suggs)
                if not suggs:
                    # nothing to offer: keep the word instead of failing on suggs[0]
                    new_document.append(word)
                    continue
                option = suggs[0]
                # take user input.
                # FIXME: User options to include DONTREPLACE/KEEP, DELETE WORD, etc.
                option_str = u", ".join(u"(%d) %s" % (itr, wrd) for itr, wrd in enumerate(suggs))
                print(u'In line, "%s"' % line.strip())
                print(u" Replace word %s with\n\t => %s\n" % (word, option_str))
                prompt = u"option [-1 ignore, 0-%d replace]: " % (len(suggs) - 1)
                try:
                    # read text and convert explicitly; Python 2's input()
                    # would eval() whatever the user typed
                    reply = input(prompt) if PYTHON3 else raw_input(prompt)
                    choice = int(reply)
                    if choice == -1:
                        print(u"Not replacing word")
                        option = word
                        self.user_dict.add(word)
                    elif choice < 0:
                        # a negative index would silently count from the end
                        print(u"option %d is out of range" % choice)
                    else:
                        option = suggs[choice]
                except (ValueError, IndexError, EOFError) as ie:
                    print(str(ie))
                print(u" replacing word %s -> %s\n" % (word, option))
                # u"%s" formatting works on Python 2 and 3, unlike unicode()
                new_document.append(u"%s" % option)
            new_document.append(u"\n")
        print(u"*********** cleaned up document **********")
        print(u" ".join(new_document))

    def get_lang_dictionary(self):
        if self.lang == u"en":
            return Speller.get_english_dictionary()
        return Speller.get_dictionary()

    def isWord(self, word):
        # Plain old dictionary checks
        LANG_dict = self.get_lang_dictionary()
        is_dict_word = LANG_dict.isWord(word)

        in_user_dict = word in self.user_dict or is_dict_word
        return in_user_dict

    def check_word_and_suggest(self, word):
        """Check ``word`` and, when it is not accepted, propose replacements.

        Returns ``(True, accepted_word)`` when the word, or the form produced
        by the case filter, is in the language or user dictionary. Otherwise
        returns ``(False, suggestions)`` with at most 20 candidates ordered
        by their ``Dice_coeff`` score against the word. ``(False, [u""])`` is
        returned for a word that is empty or fails :meth:`checklang`.
        """
        word = word.strip()
        # drop ASCII punctuation, then digits
        for mark in string.punctuation:
            word = word.replace(mark, u"")
        word = re.sub(u"\d+", u"", word)
        letters = tamil.utf8.get_letters(word)
        TVU_dict = self.get_lang_dictionary()

        if not self.checklang(word) or len(word) < 1:
            return (False, [u""])

        # plain dictionary + user dictionary lookup
        if self.isWord(word):
            return (True, word)

        # run the case filter and look the result up again
        stem = self.case_filter.apply(word)
        if self.isWord(stem):
            return (True, stem)
        word = stem

        # the word may split into sub-words,
        # e.g. செயல்பட => செயல் + பட
        parts = tamil.wordutils.greedy_split(word, TVU_dict)
        candidates = []
        if len(parts) >= 1:
            candidates.append(u" ".join(parts))
            candidates.append(u"-".join(parts))
            candidates.extend(parts)

        # TODO: Noun Declension - ticket-

        # further candidates: norvig_suggestor output that is in the
        # dictionary, combinagrams, and dictionary words starting with all
        # but the last letter of the original input
        norvig_suggests = filter(TVU_dict.isWord, norvig_suggestor(word, self.alphabets, 2, limit=50))
        combinagram_suggests = list(tamil.wordutils.combinagrams(word, TVU_dict, limit=50))
        pfx_options = TVU_dict.getWordsStartingWith(u"".join(letters[:-1]))

        # FIXME: score  the options
        candidates.extend(list(norvig_suggests))
        candidates.extend(combinagram_suggests)
        candidates.extend(pfx_options)

        # order the candidates so that duplicates end up next to each other
        if self.lang == u"en":
            candidates.sort()
        elif PYTHON3:
            candidates = sorted(candidates, key=functools.cmp_to_key(tamil.utf8.compare_words_lexicographic))
        else:
            candidates = sorted(candidates, cmp=tamil.utf8.compare_words_lexicographic)

        # for words longer than 3 letters keep only candidates longer than 2
        if len(tamil.utf8.get_letters(word)) > 3:
            candidates = [cand for cand in candidates if len(tamil.utf8.get_letters(cand)) > 2]

        # collapse runs of equal (stripped) candidates
        unique = []
        last_seen = None
        for cand in candidates:
            cand = cand.strip()
            if cand != last_seen:
                unique.append(cand)
            last_seen = cand

        # rank by Dice coefficient, best first; the sort is stable
        scored = [(cand, Dice_coeff(word, cand)) for cand in unique]
        scored.sort(key=operator.itemgetter(1), reverse=True)
        ranked = [cand for cand, _ in scored]

        # the top 20 are good enough
        return (False, ranked[:20])
예제 #8
0
파일: spell.py 프로젝트: msathia/Ezhil-Lang
class Speller(object):
    TVU_dict = None
    ENL_dict = None
    punctuation = string.punctuation+'()[]{}'
    def __init__(self,filename=None,lang="ta",mode="non-web"):
        """Configure the speller and, unless in web mode, start checking at once.

        :param filename: path of a document to spell check; when falsy the
            interactive prompt is started instead.
        :param lang: language code, stored lower-cased; ``"en"`` means
            English, every other value is treated as Tamil.
        :param mode: ``"web"`` only builds the object; any other value runs
            :meth:`spellcheck` or :meth:`interactive` from the constructor.
        """
        object.__init__(self)
        self.lang = lang.lower()
        self.filename = filename
        self.user_dict = set()
        self.case_filter = CaseFilter(
            RemovePluralSuffix(),
            RemoveVerbSuffixTense(),
            RemoveCaseSuffix(),
            RemovePrefix(),
        )
        # the alphabet is only set outside Tamil mode
        self.alphabets = None if self.in_tamil_mode() else list(string.ascii_lowercase)

        if mode != "web":
            if self.filename:
                self.spellcheck(self.filename)
            else:
                self.interactive()

    def in_tamil_mode(self):
        return self.lang != u"en"

    @staticmethod
    def get_dictionary():
        """Return the shared dictionary built from ``TamilVU``.

        The dictionary is created on first use and cached on
        ``Speller.TVU_dict``; creation is guarded by ``LoadDictionary.lock``.
        """
        LoadDictionary.lock.acquire()
        try:
            if not Speller.TVU_dict:
                Speller.TVU_dict,_ = DictionaryBuilder.create(TamilVU)
        finally:
            # release even when building the dictionary raises, otherwise
            # every later caller would block on the lock forever
            LoadDictionary.lock.release()
        return Speller.TVU_dict

    @staticmethod
    def get_english_dictionary():
        """Return the shared dictionary built from ``EnglishLinux``.

        The dictionary is created on first use and cached on
        ``Speller.ENL_dict``; creation is guarded by ``LoadDictionary.lock``.
        """
        LoadDictionary.lock.acquire()
        try:
            if not Speller.ENL_dict:
                Speller.ENL_dict,_ = DictionaryBuilder.create(EnglishLinux)
        finally:
            # release even when building the dictionary raises, otherwise
            # every later caller would block on the lock forever
            LoadDictionary.lock.release()
        return Speller.ENL_dict

    def language(self):
        if self.in_tamil_mode():
            return "tamil"
        return "english"

    def checklang(self,word):
        if self.in_tamil_mode():
            return tamil.utf8.all_tamil(word)
        for w in word.lower():
            if not ( w in string.ascii_lowercase ):
                return False
        return True

    # full-text interface driver for unittest @ Dec 10, 2017
    def noninteractive_spellcheck(self,text):
        nwords = 0
        npass = 0
        nfail = 0
        fail_n_suggs = dict()
        for word in re.split('\s+',text):
            if len(word) < 1:
                continue
            nwords += 1
            result,suggs = self.REST_interface(word)
            nfail += int(not result)
            npass += int(result)
            if not result:
                fail_n_suggs[word] = suggs
        obj = {'total':nwords,
        'correct_words':npass,
        'wrong_words':nfail,
        'word_suggestions':fail_n_suggs}
        return obj

    # Ref: https://www.tinymce.com/docs/plugins/spellchecker/
    def REST_interface(self,word):
        """Check ``word`` and return ``(ok, payload)``.

        The payload is an empty dict for an accepted word and the
        suggestions from :meth:`check_word_and_suggest` otherwise. With
        ``_DEBUG`` set the raw result is printed first.
        """
        ok,suggs = self.check_word_and_suggest( word )
        if _DEBUG:
            print("REST => %d"%ok)
            pprint.pprint(suggs)
        payload = {} if ok else suggs
        return ok, payload

    @staticmethod
    def dice_comparison(ref_word,word):
        """Comparison function for sorting, based on ``Dice_coeff(ref_word, word)``.

        Returns 0 when the coefficient equals 1, 1 when it is above 0.5 and
        -1 otherwise.
        """
        score = Dice_coeff(ref_word,word)
        if score == 1:
            return 0
        if 2*(score - 0.5) > 0:
            return 1
        return -1

    def suggestion_policy(self,word,suggs):
        """Trim and order the suggestions for ``word``.

        Keeps the suggestions whose Tamil letter count lies between
        ``max(n - 2, 1)`` and ``n + 1``, where ``n`` is the letter count of
        ``word``, removes duplicates and sorts them with
        :meth:`dice_comparison`. When no suggestion passes the length filter
        the first ten of all suggestions, in the order given by
        ``tamil.utf8.compare_words_lexicographic``, are returned.

        :param word: the word the suggestions were made for.
        :param suggs: iterable of suggested words.
        :return: list of suggestions.
        """
        def tamil_length(w):
            return len(tamil.utf8.get_letters(w))

        # the suggestions may be a one-shot iterator (filter() on Python 3);
        # materialize once so the fallback below can still see them
        suggs = list(suggs)
        ref_wl = tamil_length(word)
        shortest, longest = max(ref_wl-2,1), ref_wl+1
        # both bounds are measured in Tamil letters, not in code points
        filter_suggs = set(w for w in suggs if shortest <= tamil_length(w) <= longest)
        if len(filter_suggs) == 0:
            # guess!
            fallback = sorted(suggs,key=functools.cmp_to_key(tamil.utf8.compare_words_lexicographic))
            # keep at most ten; a short list is returned whole
            return fallback[:10]
        # cmp_to_key works on Python 2.7 and 3, unlike sorted(cmp=...)
        # NOTE(review): dice_comparison compares two suggestions with each
        # other rather than with ``word`` - confirm this ordering is intended
        return sorted(filter_suggs,key=functools.cmp_to_key(Speller.dice_comparison))

    def str_suggestions(self,word):
        if self.in_tamil_mode():
            return u"சொல் \"%s\" மாற்றங்கள்"%word
        return u"SUGGESTIONS for \"%s\""%word

    def mayangoli_suggestions(self,word,letters):
        """Return the alternates ``Mayangoli.run`` yields for ``word``, without ``word`` itself.

        :param word: the word being checked.
        :param letters: the letters passed through to ``Mayangoli.run``.
        :return: a new list of alternate words.
        """
        # build a real list: a lazy filter() would be used up by the debug
        # loop below on Python 3, leaving the caller with nothing
        alternates = [w for w in Mayangoli.run(word,letters) if w != word]
        if _DEBUG:
            for idx,w in enumerate(alternates):
                pprint.pprint(["Myangoli",idx,w])
        return alternates

    def interactive(self):
        """Read words from standard input and report on each until EOF or Ctrl-C.

        Accepted words are acknowledged; rejected ones are listed with the
        suggestions selected by :meth:`suggestion_policy`. Messages are in
        Tamil in Tamil mode and in English otherwise.
        """
        try:
            while True:
                if PYTHON3:
                    word = input(u">> ")
                else:
                    word = raw_input(u">> ")
                    word = word.decode("utf-8").strip()
                # raw string: "\s" is not a valid escape in an ordinary literal
                word = re.sub(r"\s+",u"",word)

                # skip empty words
                if len(word) < 1:
                    continue

                if not self.checklang(word):
                    print(u"EXCEPTION \"%s\" is not a %s Word"%(word,self.language()))
                    continue
                ok,suggs = self.check_word_and_suggest( word )
                if ok:
                    print(u"சரி" if self.in_tamil_mode() else u"OK")
                    continue
                # only a rejected word carries suggestions worth ranking
                suggs = self.suggestion_policy(word,suggs)
                words_per_row = 4
                # a line break follows every words_per_row-th entry after the first
                option_str = u", ".join(
                    u"(%d) %s"%(itr,wrd) + (u"\n" if (itr > 0 and itr%words_per_row == 0) else u"")
                    for itr,wrd in enumerate(suggs) )
                print(u"%s\n\t %s"%(self.str_suggestions(word),option_str))
        except (KeyboardInterrupt, EOFError):
            # both simply end the session
            pass
        finally:
            print(u"\nவணக்கம்!" if self.in_tamil_mode() else "\nBYE!")
        return

    def spellcheck(self,filename):
        """Interactively spell check the UTF-8 text file ``filename``.

        Every rejected word is shown with numbered suggestions. The user
        enters the number of the replacement, or -1 to keep the word (which
        also adds it to the user dictionary). An unusable answer is reported
        and the first suggestion is used. The resulting text is printed once
        the whole file has been processed. Messages are in Tamil in Tamil
        mode and in English otherwise.
        """
        tamil_mode = self.in_tamil_mode()
        new_document = []
        # the context manager closes the file even when reading fails
        with codecs.open(filename,u"r",u"utf-8") as data:
            lines = data.readlines()
        for line in lines:
            words = tamil.utf8.get_words( tamil.utf8.get_letters(line) )
            for word in words:
                # FIXME : handle punctuation
                ok,suggs = self.check_word_and_suggest( word )
                if ok:
                    new_document.append( word )
                    continue
                # the suggestions may be a lazy iterator; indexing needs a list
                suggs = list(suggs)
                if not suggs:
                    # nothing to offer: keep the word instead of failing on suggs[0]
                    new_document.append( word )
                    continue
                option = suggs[0]
                # take user input.
                # FIXME: User options to include DONTREPLACE/KEEP, DELETE WORD, etc.
                option_str = u", ".join( u"(%d) %s"%(itr,wrd) for itr,wrd in enumerate(suggs) )
                if tamil_mode:
                    print(u"வரி \"%s\""%line.strip())
                    print(u"'%s' சொல்லை கொண்டு\n\t சொல்லை '%s' மாற்றிடு\n"%(option_str,word))
                else:
                    print(u"Line, \"%s\""%line.strip())
                    print(u" Replace word %s with\n\t => %s\n"%(word, option_str))
                try:
                    if tamil_mode:
                        choice_str="விருப்பம் [-1 புறக்கணி, 0-%d மாற்றவும்]:"
                    else:
                        choice_str=u"option [-1 ignore, 0-%d replace]: "
                    prompt = choice_str%(len(suggs)-1)
                    # read text and convert explicitly; Python 2's input()
                    # would eval() whatever the user typed
                    reply = input(prompt) if PYTHON3 else raw_input(prompt)
                    choice = int(reply)
                    if choice == -1:
                        if tamil_mode:
                            print(u"வார்த்தை மாறாத இருந்தது")
                        else:
                            print(u"Not replacing word")
                        option = word
                        self.user_dict.add(word)
                    elif choice < 0:
                        # a negative index would silently count from the end
                        print(u"option %d is out of range"%choice)
                    else:
                        option = suggs[choice]
                except (ValueError, IndexError, EOFError) as ie:
                    print (str(ie))
                if tamil_mode:
                    replace_msg=u"வார்த்தை %s -> %s இதற்காக மாற்றவும்\n"
                else:
                    replace_msg = u" replacing word %s -> %s\n"
                print(replace_msg%(word,option))
                # u"%s" formatting works on Python 2 and 3, unlike unicode()
                new_document.append( u"%s"%option )
            new_document.append(u"\n")
        if tamil_mode:
            print(u"*********** ஆவணத்தில் உள்ள பிழைகளை திருத்திய பின் *********")
        else:
            print(u"*********** cleaned up document **********")
        print(u" ".join(new_document))

    def get_lang_dictionary(self):
        if not self.in_tamil_mode():
            return Speller.get_english_dictionary()
        return Speller.get_dictionary()

    def isWord(self, word):
        # Plain old dictionary checks
        LANG_dict = self.get_lang_dictionary()
        is_dict_word = LANG_dict.isWord(word)
        in_user_dict = word in self.user_dict or is_dict_word
        return in_user_dict

    def add_numeral_words(self,lexicon):
        if not self.in_tamil_mode():
            return

        units = (u'பூஜ்ஜியம்', u'ஒன்று', u'இரண்டு', u'மூன்று', u'நான்கு', u'ஐந்து', u'ஆறு', u'ஏழு', u'எட்டு', u'ஒன்பது', u'பத்து') # 0-10
        teens = (u'பதினொன்று', u' பனிரண்டு', u'பதிமூன்று', u'பதினான்கு', u'பதினைந்து',u'பதினாறு', u'பதினேழு', u'பதினெட்டு', u'பத்தொன்பது') # 11-19
        tens = (u'பத்து', u'இருபது', u'முப்பது', u'நாற்பது', u'ஐம்பது',u'அறுபது', u'எழுபது', u'எண்பது', u'தொன்னூறு') # 10-90
        tens_suffix = (u'இருபத்து', u'முப்பத்து', u'நாற்பத்து', u'ஐம்பத்து', u'அறுபத்து', u'எழுபத்து', u'எண்பத்து', u'தொன்னூத்து') # 10+-90+
        hundreds = ( u'நூறு', u'இருநூறு', u'முந்நூறு', u'நாநூறு',u'ஐநூறு', u'அறுநூறு', u'எழுநூறு', u'எண்ணூறு', u'தொள்ளாயிரம்') #100 - 900
        hundreds_suffix = (u'நூற்றி', u'இருநூற்றி', u'முந்நூற்று', u'நாநூற்று', u'ஐநூற்று', u'அறுநூற்று', u'எழுநூற்று', u'எண்ணூற்று',u'தொள்ளாயிரத்து') #100+ - 900+
        one_thousand_prefix = (u'ஓர்',)
        thousands = (u'ஆயிரம்',u'ஆயிரத்தி')

        one_prefix = (u'ஒரு',)
        lakh = (u'இலட்சம்',u'இலட்சத்தி')
        crore = (u'கோடி',u'கோடியே')

        mil = (u'மில்லியன்',)
        bil = (u'பில்லியன்',)
        tril = (u'டிரில்லியன்',)

        if lexicon.isWord(tril[0]):
            return

        numerals = list()
        for wordset in [units,tens,teens,tens_suffix,hundreds,hundreds_suffix,one_thousand_prefix,thousands,one_prefix,lakh,crore,mil,bil,tril]:
            numerals.extend(wordset)
        #with codecs.open("numerals.json","w","utf-8") as fp:
        #    fp.write(json.dumps(numerals))
        for word in numerals:
            lexicon.add(word)

    @staticmethod
    def scrub_ws(word):
        return re.sub(u'[\s{}()\[\]]+',u'',word)

    def check_word_and_suggest( self,word, errmsg = None ):
        """Check ``word`` and, when it is rejected, propose replacements.

        :param word: the token to check; surrounding whitespace and trailing
            punctuation are ignored.
        :param errmsg: optional list; ``"TypographicalError"`` is appended to
            it when ``Typographical.checkFormErrors`` flags the word.
        :return: ``(True, accepted)`` for an accepted word, where
            ``accepted`` is the word (or a one-element list for date-like
            tokens); otherwise ``(False, suggestions)`` with ``suggestions``
            a list, which may be ``[u""]`` when nothing can be proposed.
        """
        word = word.strip()
        # skip known punctuation at end of line
        word = word.rstrip(Speller.punctuation)
        word = word.lstrip(string.whitespace)

        # is number then we propose a numeral
        if self.in_tamil_mode():
            numword = word.replace(u',',u'')
            if re.match(r'[+|-]*[\d]+',numword):
                try:
                    num = float(numword)
                    posnum = num
                    if num < 0:
                        posnum = -1*num
                    numeral_form = tamil.numeral.num2tamilstr(posnum)
                    if num < 0:
                        numeral_form = u"கழித்தல் "+numeral_form
                    return (False,[numeral_form])
                except Exception:
                    # not convertible after all (the regex only anchors the
                    # start); fall through to the regular checks
                    pass

            # dates are okay
            if word.endswith((u"-இல்",u"-ஆம்",u"-இலிருந்து", u"-வரை")):
                if re.search(r'^\d+',word):
                    return (True,[word]) #word is okay

            # check if words are transliterated
            if any(filter(lambda x: x in string.ascii_letters,tamil.utf8.get_letters(word))):
                # letter-sequence only
                en_word = Speller.scrub_ws(word)
                EN_Lexicon = Speller.get_english_dictionary()
                if EN_Lexicon.isWord(en_word):
                    return (False,['']) #English word - nosub- yet until we have parallel dictionaries or translation. TBD.

                #is english letter
                ta = algorithm.Iterative.transliterate(jaffna.Transliteration.table,en_word)
                # TBD: potential for having ANN to tell if english text is pure English word
                # or a romanized Tamil word. Output of classifier can be useful here.
                return (False,[ta])

            # check if it matches Tamil numeral and has close match.
            # propose suggestions from that list.
            # TBD

        # hyphens are not okay
        if u"-" in word:
            return (False,[word.replace(u"-",u" ")])
        orig_word = u"%s"%word

        # remove digits
        word = re.sub(r'\d+',u'',word)
        letters = tamil.utf8.get_letters(word)
        TVU_dict = self.get_lang_dictionary()
        self.add_numeral_words(TVU_dict)

        # Check if this 'word' is any common kind of error
        if Typographical.checkFormErrors(word,errmsg):
            # "is not None": an empty list supplied by the caller must still
            # receive the message
            if errmsg is not None:
                errmsg.append("TypographicalError")

        if not self.checklang(word):
            print("Word is not in desired language!")
            return (False,[u""])

        if len(word) < 1:
            print("Word is too small")
            return (False,[u''])

        # plain old dictionary + user dictionary check
        if self.isWord(word):
            return (True,word)

        # Remove case and redo the dictionary + user check
        word_nocase = self.case_filter.apply( word )
        if self.isWord( word_nocase ):
            return (True,word_nocase)
        word = word_nocase

        # Consider splitting the word and see if it has 2 sub-words
        # e.g. செயல்பட => செயல் + பட
        alt = tamil.wordutils.greedy_split(word,TVU_dict)
        greedy_results = list()
        if len(alt) >= 1:
            greedy_results = [u" ".join(alt),u"-".join(alt)]
            greedy_results.extend(alt)

        # if there are no other suggestions than deletion filter, we return
        # in presence of other suggestions we can just return suggestions
        suggs = DeletionFilter.get_suggestions(letters,TVU_dict)
        if len(suggs) > 0:
            if len(greedy_results) == 0:
                return (False,suggs)
            greedy_results.extend(suggs)

        # ottru splitting for Tamil language mode
        ottru_options = []
        if self.in_tamil_mode():
            # discover words like யாரிகழ்ந்து are accepted.
            ottru = OttruSplit(word,letters)
            ottru.run(TVU_dict)
            if len(ottru.results) > 0:
                return (True,word)
            # NOTE(review): results are empty whenever this line is reached
            ottru_options = ottru.results

        # TODO: Noun Declension - ticket-

        # further candidates: norvig_suggestor output that is in the
        # dictionary, combinagrams, and dictionary words starting with all
        # but the last letter
        norvig_suggests = filter( TVU_dict.isWord, norvig_suggestor( word, self.alphabets, 2,limit=25))
        combinagram_suggests = list(tamil.wordutils.combinagrams(word,TVU_dict,limit=25))
        pfx_options = TVU_dict.getWordsStartingWith( u"".join( letters[:-1] ) )

        # FIXME: score  the options
        options = greedy_results
        options.extend( ottru_options )
        options.extend( list(norvig_suggests) )
        options.extend( combinagram_suggests )
        options.extend( pfx_options )

        # filter the options against a dictionary!
        options = [opt for opt in options if TVU_dict.isWord(opt)]

        if self.in_tamil_mode():
            options.extend( self.mayangoli_suggestions(orig_word,letters) )

        # sort the options so that duplicates end up next to each other
        if not self.in_tamil_mode():
            options.sort()
        elif PYTHON3:
            options = sorted( options, key=functools.cmp_to_key(tamil.utf8.compare_words_lexicographic) )
        else:
            options = sorted( options, cmp=tamil.utf8.compare_words_lexicographic )

        # for words longer than 3 letters keep only options longer than 2
        WL = len(tamil.utf8.get_letters(word))
        if WL > 3:
            options = [opt for opt in options if len(tamil.utf8.get_letters(opt)) > 2]

        # remove dupes in list
        options2 = []
        prev = None
        for val in options:
            val = val.strip()
            if val != prev:
                options2.append(val)
            prev = val
        del options
        if _DEBUG:
            print("@deduplication")
            pprint.pprint(options2)

        # score by Dice and Edit-Distance coefficients; dice coeff is weighted down
        options_score = [
            (len(word)-edit_distance(word,sugg_word))/(1.0*len(orig_word))*Dice_coeff( word, sugg_word )/3.0
            for sugg_word in options2 ]
        options = zip( options2, options_score)

        # order options by score, best first
        options = sorted(options,key=operator.itemgetter(1),reverse=True)
        options = [word_pair[0] for word_pair in options]
        if _DEBUG:
            pprint.pprint("@after scoring/sorting")
            pprint.pprint(options)

        # eliminate single letter options; build a list so that callers get
        # an indexable, reusable result on Python 3 as well
        options = [opt for opt in options if opt not in tamil.utf8.tamil_letters]

        # Due to suggestion policy we may have words which are found in error but we dont have
        # replacements for them!

        # TBD: options should not have the 'word'!
        return (False, options )
예제 #9
0
class Speller(object):
    """Interactive / batch spell checker for Tamil ("ta") or English ("en").

    Unless constructed with ``mode="web"``, the constructor immediately
    starts working: it enters the interactive prompt when no filename is
    given, otherwise it spell-checks the given file.
    """

    # Dictionaries are built lazily on first use and cached on the class,
    # so every Speller instance shares them.
    TVU_dict = None
    ENL_dict = None

    def __init__(self, filename=None, lang="ta", mode="non-web"):
        """Set up the speller and, unless in web mode, start checking.

        :param filename: file to spell-check; a false value selects the
            interactive prompt instead.
        :param lang: ``"en"`` for English; any other value is handled as
            Tamil by the rest of the class.
        :param mode: ``"web"`` only initialises the object and returns.
        """
        object.__init__(self)
        self.lang = lang
        self.filename = filename
        self.user_dict = set()
        self.case_filter = CaseFilter(RemovePluralSuffix(),
                                      RemoveVerbSuffixTense(),
                                      RemoveCaseSuffix(), RemovePrefix())
        if self.lang == u"en":
            self.alphabets = list(string.ascii_lowercase)
        else:
            self.alphabets = None

        if mode == "web":
            return

        if not self.filename:
            self.interactive()
        else:
            self.spellcheck(self.filename)

    @staticmethod
    def get_dictionary():
        """Return the shared Tamil dictionary, building it on first use."""
        LoadDictionary.lock.acquire()
        try:
            if not Speller.TVU_dict:
                Speller.TVU_dict, _ = DictionaryBuilder.create(TamilVU)
        finally:
            # Release even when the build raises, otherwise every later
            # caller would block on the lock forever.
            LoadDictionary.lock.release()
        return Speller.TVU_dict

    @staticmethod
    def get_english_dictionary():
        """Return the shared English dictionary, building it on first use."""
        LoadDictionary.lock.acquire()
        try:
            if not Speller.ENL_dict:
                Speller.ENL_dict, _ = DictionaryBuilder.create(EnglishLinux)
        finally:
            # Release even when the build raises (see get_dictionary).
            LoadDictionary.lock.release()
        return Speller.ENL_dict

    def language(self):
        """Return ``"tamil"`` when lang is ``"ta"``, else ``"english"``."""
        if self.lang == "ta":
            return "tamil"
        return "english"

    def checklang(self, word):
        """Tell whether *word* is written in the speller's language.

        In Tamil mode this defers to ``tamil.utf8.all_tamil``; otherwise
        every character of the lower-cased word must be an ASCII letter.
        """
        if self.lang == "ta":
            return tamil.utf8.all_tamil(word)
        return all([w in string.ascii_lowercase for w in word.lower()])

    # Ref: https://www.tinymce.com/docs/plugins/spellchecker/
    def REST_interface(self, word):
        """Check *word* and return ``(ok, payload)``.

        The payload is an empty string when the word is accepted, otherwise
        a JSON object mapping the word to its suggestions (TinyMCE format).
        """
        ok, suggs = self.check_word_and_suggest(word)
        if ok:
            return ok, ""
        return ok, json.dumps({word: suggs})

    def interactive(self):
        """Prompt for words in a loop and print a verdict for each.

        The loop ends on Ctrl-C or end-of-input; "BYE!" is always printed.
        """
        try:
            while True:
                if PYTHON3:
                    word = input(u">> ")
                else:
                    word = raw_input(u">> ")
                    word = word.decode("utf-8").strip()
                # Raw string: "\s" in a plain literal is an invalid escape.
                word = re.sub(r"\s+", "", word)
                if not self.checklang(word):
                    print(u"EXCEPTION \"%s\" is not a %s Word" %
                          (word, self.language()))
                    continue
                ok, suggs = self.check_word_and_suggest(word)
                if not ok:
                    option_str = u", ".join([
                        u"(%d) %s" % (itr, wrd)
                        for itr, wrd in enumerate(suggs)
                    ])
                    print(u"SUGGESTIONS for \"%s\"\n\t %s" %
                          (word, option_str))
                else:
                    print(u"OK")
        except (KeyboardInterrupt, EOFError):
            # Both are the normal ways for the user to leave the prompt.
            pass
        finally:
            print("\nBYE!")
        return

    def spellcheck(self, filename):
        """Spell-check a UTF-8 file, asking the user about each bad word.

        For every rejected word the suggestions are listed and the user
        picks a replacement index, or -1 to keep the word (which also adds
        it to the user dictionary). The resulting document is printed.
        """
        new_document = []
        # Context manager so the file handle is closed once it is read.
        with codecs.open(filename, u"r", u"utf-8") as data:
            lines = data.readlines()
        for line in lines:
            words = tamil.utf8.get_words(tamil.utf8.get_letters(line))
            for word in words:
                # FIXME : handle punctuation
                #word = filter( tamil.utf8.is_tamil_unicode_predicate, word )
                ok, suggs = self.check_word_and_suggest(word)
                if ok:
                    new_document.append(word)
                    continue
                if not suggs:
                    # A word can be rejected without any replacement being
                    # found; keep it instead of failing on suggs[0].
                    new_document.append(word)
                    continue
                # Default to the first suggestion if the user's answer
                # cannot be used.
                option = suggs[0]
                # take user input.
                # FIXME: User options to include DONTREPLACE/KEEP, DELETE WORD, etc.
                option_str = u", ".join([
                    u"(%d) %s" % (itr, wrd)
                    for itr, wrd in enumerate(suggs)
                ])
                print(u"In line, \"%s\"" % line.strip())
                print(u" Replace word %s with\n\t => %s\n" %
                      (word, option_str))
                try:
                    choice = input(u"option [-1 ignore, 0-%d replace]: " %
                                   (len(suggs) - 1))
                    if PYTHON3:
                        choice = int(choice)
                    if choice == -1:
                        print(u"Not replacing word")
                        option = word
                        self.user_dict.add(word)
                    else:
                        option = suggs[choice]
                except Exception as ie:
                    # Deliberately broad: any unusable answer falls back to
                    # the default option chosen above.
                    print(str(ie))
                print(u" replacing word %s -> %s\n" % (word, option))
                # u"%s" % x coerces to text on Python 2 and 3 alike; the
                # `unicode` builtin does not exist on Python 3.
                new_document.append(u"%s" % option)
            new_document.append(u"\n")
        print(u"*********** cleaned up document **********")
        print(u" ".join(new_document))

    def get_lang_dictionary(self):
        """Return the dictionary matching ``self.lang`` (English or Tamil)."""
        if self.lang == u"en":
            return Speller.get_english_dictionary()
        return Speller.get_dictionary()

    def isWord(self, word):
        """Return True if *word* is in the language or user dictionary."""
        # Plain old dictionary checks
        lang_dict = self.get_lang_dictionary()
        is_dict_word = lang_dict.isWord(word)
        return word in self.user_dict or is_dict_word

    def check_word_and_suggest(self, word):
        """Check *word* and propose replacements when it is rejected.

        :returns: ``(True, accepted_word)`` when the word, or the word with
            its case/affixes filtered off, is known; otherwise
            ``(False, suggestions)`` where suggestions is a list of strings.
        """
        word = word.strip()
        letters = tamil.utf8.get_letters(word)
        lang_dict = self.get_lang_dictionary()
        # plain old dictionary + user dictionary check
        if self.isWord(word):
            return (True, word)

        # Remove case and redo the dictionary + user check
        word_nocase = self.case_filter.apply(word)
        if self.isWord(word_nocase):
            return (True, word_nocase)
        word = word_nocase

        # Consider splitting the word and see if it has 2 sub-words
        # e.g. செயல்பட => செயல் + பட
        alt = tamil.wordutils.greedy_split(word, lang_dict)
        if len(alt) >= 1:
            results = [u" ".join(alt)]
            results.extend(alt)
            return (False, results)

        # TODO: Noun Declension - ticket-

        # suggestions at edit distance 1
        norvig_suggests = filter(
            lang_dict.isWord, norvig_suggestor(word,
                                               self.alphabets,
                                               1,
                                               limit=50))
        combinagram_suggests = list(
            tamil.wordutils.combinagrams(word, lang_dict, limit=50))
        # Prefix is taken from the letters of the word as originally given
        # (before case filtering), minus its last letter.
        pfx_options = lang_dict.getWordsStartingWith(u"".join(letters[:-1]))

        # FIXME: score  the options
        options = list(norvig_suggests)
        options.extend(combinagram_suggests)
        options.extend(pfx_options)

        # sort the options
        if self.lang == u"en":
            options.sort()
        else:
            if PYTHON3:
                options = sorted(options,
                                 key=functools.cmp_to_key(
                                     tamil.utf8.compare_words_lexicographic))
            else:
                options = sorted(options,
                                 cmp=tamil.utf8.compare_words_lexicographic)

        # remove dupes in list: after sorting, equal entries are adjacent,
        # so comparing against the previous entry is enough.
        unique_options = []
        prev = None
        for val in options:
            val = val.strip()
            if val != prev:
                unique_options.append(val)
            prev = val

        # Return the de-duplicated list, not the raw sorted one.
        return (False, unique_options)