def removeSuffix(self, word):
    """Strip the longest matching suffix from *word*.

    The word is reversed letter by letter and compared against
    ``self.reversed_suffixes``. When a suffix is removed and
    ``self.replace_suffixes`` maps it to a non-empty replacement, that
    replacement is appended to the stem.

    Returns a ``(word, removed)`` tuple; ``removed`` is True only when a
    suffix was stripped.
    """
    if not self.possible_suffixes:
        # build the suffix tables lazily, on first use only
        self.setSuffixes()
        self.prepareSuffixes()
    letters = utf8.get_letters(word)
    reversed_word = u"".join(letters[::-1])
    best = ""
    for candidate in self.reversed_suffixes:
        # "<=" means a later entry of equal length replaces an earlier one
        if reversed_word.startswith(candidate) and len(best) <= len(candidate):
            best = candidate
    if len(best) == 0:
        return word, False
    # pop one letter off the end of the word per letter of the matched suffix
    tail = [letters.pop() for _ in range(len(utf8.get_letters(best)))]
    stem = u"".join(letters)
    tail.reverse()
    # rule to replace suffix
    replacement = self.replace_suffixes.get(u"".join(tail), None)
    if replacement:
        stem = stem + replacement
    return stem, True
예제 #2
0
 def test_reverse_words( self ):
     """Unit test for reversing a Tamil string letter by letter."""
     # print() with a single argument prints the same on Python 2 and 3,
     # whereas the former print statements were a syntax error on Python 3.
     print(utf8.get_letters(u"இந்த"))
     print(u"".join(utf8.get_letters(u"இந்த")))
     for word in u"இந்த (C) tamil முத்தையா அண்ணாமலை 2013 இந்த ஒரு எழில் தமிழ் நிரலாக்க மொழி உதாரணம்".split():
         rword = utf8.reverse_word(word)
         print(u"%s %s" % (word, rword))
         # the first letter of the reversed word is the last letter of the original
         self.assertEqual( utf8.get_letters(rword)[0], utf8.get_letters(word)[-1] )
예제 #3
0
 def test_istamil( self ):
     """Check utf8.istamil on pure-Tamil words and on text mixed with spaces and digits."""
     zz = u"முத்தையா அண்ணாமலை எந்த ஒரு தெரிந்த அல்லது தெரியாத எழுத்துருவாகவிருந்தாலும் அதனை மேல்தட்டில் உள்ளிட்டு கீழே உள்ள முடியும்"
     for z in zz.split(u" "):
         print("********** t/f ********")
         letters = utf8.get_letters(z)
         for letter in letters:
             print("%s => %s"%(letter, utf8.istamil(letter)))
         # one check per word is enough; it used to be repeated for every letter
         assert all(map(utf8.istamil, letters))
     
     z = u"முத்தையா அண்ணாமலை"
     assert( any( map( utf8.istamil, utf8.get_letters( z ) ) ) )
     
     correct = [True, True, True, True, False, True, True, True, True, True, False, False, False, False, False]
     # list() is required: on Python 3 map() returns an iterator, which never
     # compares equal to a list, so the bare comparison could not pass.
     assert( list(map(utf8.istamil,utf8.get_letters(u"முத்தையா அண்ணாமலை 2013"))) == correct )
 def test_entity(self):
     """A WordEntity exposes its word, letters and row/col, and reports isWord() as true."""
     text = u"nuthin"
     entity = WordEntity(text, row=5, col=6)
     self.assertEqual(entity.word, text)
     self.assertEqual(entity.letters, utf8.get_letters(u"nuthin"))
     position = (entity.row, entity.col)
     self.assertEqual(position, (5, 6))
     self.assertTrue(entity.isWord())
예제 #5
0
 def test_classifier(self):
     """classify_letter labels each letter of a mixed English/digit/Tamil string."""
     expected = (['english'] * 3
                 + ['digit'] * 4
                 + ['kuril', 'nedil', 'uyirmei', 'vallinam', 'uyirmei'])
     observed = [utf8.classify_letter(letter)
                 for letter in utf8.get_letters(u"abc1230அஆரெட்டை")]
     self.assertEqual(observed, expected)
def norvig_suggestor(word,alphabets=None,nedits=1,limit=float("inf")):
    """Generate spelling candidates within `nedits` edits of `word` (Norvig style).

    word      -- a string, or an already-split list of letters.
    alphabets -- letters used for replacements and insertions; falls back to
                 the module-level `tamil_letters` when empty or None.
    nedits    -- edit distance to explore; values above 1 recurse on every
                 candidate of the previous distance.
    limit     -- soft cap on the candidates gathered by the recursive search
                 (it is not applied to a single-edit call).

    Returns a set of candidate strings.
    """
    if not alphabets:
        alphabets = tamil_letters
    if isinstance(word, list):
        wordL = word
    else:
        wordL = list(get_letters(word))
    # recursive method for edit distance > 1
    if nedits > 1:
        result = []
        for nAlternate in norvig_suggestor(wordL,alphabets,nedits-1,limit-len(result)):
            if len(result) > limit:
                break
            result.extend( norvig_suggestor(nAlternate,alphabets,1,limit-len(result)) )
        return set(result)

    # Split at every letter boundary and keep both halves as *letter lists*.
    # The previous code used wordL[:idx-1] (dropping a letter, and wrapping
    # around for idx == 0) and then sliced the joined strings, which cuts
    # multi-code-point Tamil letters in the middle.
    ta_splits     = [ (wordL[:idx], wordL[idx:]) for idx in range(len(wordL) + 1) ]
    join = u"".join
    ta_deletes    = [join(a + b[1:]) for a, b in ta_splits if b]
    ta_transposes = [join(a + [b[1], b[0]] + b[2:]) for a, b in ta_splits if len(b)>1]
    ta_replaces   = [join(a + [c] + b[1:]) for a, b in ta_splits if b for c in alphabets]
    ta_inserts    = [join(a + [c] + b) for a, b in ta_splits for c in alphabets]
    # The former "ta_replaces2" list only compensated for the off-by-one split
    # (replacing the first letter); ta_replaces now covers that case.
    # TODO: add a normalizing pass word words in vowel+consonant forms to eliminate dangling ligatures
    return set(ta_deletes + ta_transposes + ta_replaces + ta_inserts )
예제 #7
0
 def test_letter_extract_from_code_pts(self):
     """get_letters splits the sample phrase, spaces included, into 16 letters."""
     letters = utf8.get_letters(u"கூவிளம் என்பது என்ன சீர்")
     # 4 + 4 + 3 + 2 Tamil letters plus 3 spaces; the other tests of this
     # same phrase also expect 16, so the former 15 was out of step.
     assert( len(letters) == 16 )
     for pos,letter in  enumerate(letters):
         print(u"%d %s"%(pos,letter))
     assert( letters[-1] == (u"ர்") )
예제 #8
0
 def test_letter_extract_with_ascii(self):
     """get_letters keeps ASCII characters as single letters inside Tamil text."""
     letters = utf8.get_letters(u"கூவிளம் is என்பது also என்ன a சீர்")
     # a print() call with one pre-formatted argument works on Python 2 and 3
     print("len ==== > %d" % len(letters))
     # 13 Tamil letters + 7 ASCII letters + 6 spaces; the newer copy of this
     # test also expects 26, so the former 25 was out of step.
     assert(len(letters) == 26 )
     for pos,letter in  enumerate(letters):
         print(u"%d %s"%(pos,letter))
     assert( letters[-4] == u"a" )
 def test_words_to_letters(self):
     """Each space-separated word splits into the expected number of letters."""
     sentence = u"இந்தக் குளிர்ல டெய்லி தலைக்கு குளிக்கற நல்லவங்க இருக்கறதாலதான் கோவை இப்படி சூப்பரா இருக்காம்"
     expected_lengths = [4,4,3,4,5,6,9,2,4,4,5]
     for position, token in enumerate(sentence.split(' ')):
         letter_count = len(get_letters(token))
         print('w# ', position, letter_count)
         self.assertEqual(expected_lengths[position], letter_count)
예제 #10
0
 def getWordCount(self,word):
     """Return the count stored for *word*; raise Exception when the word is not in the trie."""
     found, node = self.isWord(word, ret_ref_trie=True)
     if found:
         # the count is kept under the word's last letter on the returned node
         final_letter = utf8.get_letters(word)[-1]
         return node.count[final_letter]
     raise Exception(u"Word does not exist in Trie")
 def get(word):
     """Build a Feature describing the letters of *word*.

     Spaces are removed first. The per-class letter counters and the vowel
     counter are divided by the number of letters; `first` and `last` score
     the class of the first and last letter.

     NOTE(review): assumes a word with at least one letter and a fresh
     Feature whose counters start at zero -- confirm against Feature.
     """
     word = word.strip()
     word = word.replace(u' ',u'')
     letters = utf8.get_letters(word)
     F = Feature()
     F.nletters = len(letters)*1.0
     F.unigscore = unigram_score(letters)
     F.bigscore = max(bigram_scores(letters))
     for l in letters:
         try:
             rtl = reverse_transliterate(l)
             # startswith() with a tuple replaces a list comprehension whose
             # variable was also named `l`; on Python 2 that variable leaked
             # and overwrote the letter classified below.
             if rtl.startswith(('a','e','i','o','u')):
                 F.vowels += 1.0
         except Exception as ioe:
             # best effort: a letter that cannot be transliterated is simply
             # not counted as a vowel
             pass
         
         kind = utf8.classify_letter(l)
         if kind == 'kuril':
             F.kurils += 1
         elif kind == 'nedil':
             F.nedils += 1
         elif kind == 'ayudham':
             F.ayudhams += 1
         elif kind == 'vallinam':
             F.vallinams += 1
         elif kind == 'mellinam':
             F.mellinams += 1
         elif kind == 'idayinam':
             F.idayinams += 1
         elif kind == 'tamil_or_grantham':
             F.granthams += 1
         # 'english', 'digit' and any other kind are not counted
     
     # normalise every counter exactly once (vallinams used to be divided twice)
     F.kurils /= F.nletters
     F.nedils /= F.nletters
     F.ayudhams /= F.nletters
     F.vallinams /= F.nletters
     F.mellinams /= F.nletters
     F.idayinams /= F.nletters
     F.granthams /= F.nletters
     F.vowels /= F.nletters
     
     if letters[0] in utf8.uyir_letters:
         F.first += 1.0
     if letters[0] in utf8.mei_letters:
         F.first += F.first + 0.25
     if letters[0] in utf8.uyirmei_letters:
         F.first += F.first + 0.05
     
     if letters[-1] in utf8.uyir_letters:
         F.last += 1.0
     if letters[-1] in utf8.mei_letters:
         F.last += F.last + 0.25
     if letters[-1] in utf8.uyirmei_letters:
         F.last += F.last + 0.05
     
     return F
예제 #12
0
def anagram(request,word):
    """Return the anagrams of *word* found in the TamilVU dictionary as a UTF-8 JSON response."""
    # The unused AllTrueDictionary and letter-count locals were dropped.
    # NOTE(review): the dictionary is rebuilt on every request; consider
    # caching it if DictionaryBuilder.create is expensive.
    TVU, _ = DictionaryBuilder.create(TamilVU)
    actual = list(wordutils.anagrams(word, TVU))
    json_string = json.dumps(actual, ensure_ascii=False)
    # creating a Response object to set the content type and the encoding
    return HttpResponse(json_string, content_type="application/json; charset=utf-8")
예제 #13
0
 def getAllWordsPrefix(self,prefix):
     """Placeholder: always raises, because the prefix search below is not implemented correctly."""
     raise Exception("NOT IMPLEMENTED RIGHT")
     # ---- unreachable: sketch of the intended implementation ----
     matches = []
     is_word, ref_trie, ref_word_limits = self.isWord(prefix, ret_ref_trie=True)
     if is_word:
         matches.append(prefix)
     self.getAllWordsHelper(ref_trie, ref_word_limits, utf8.get_letters(prefix), matches)
     return matches
예제 #14
0
 def test_tamil_only_words(self):
     """get_tamil_words returns the words of the sample sentence minus the two English ones."""
     s = u"உடனே உடனே seventh heaven எழுதினால் செய்திப் பத்திரிகை போஆகிவிடும் அசோகமித்திரன் நேர்காணல்"
     expected = s.replace(u"seventh heaven ",u"").split(u" ")
     found = utf8.get_tamil_words(utf8.get_letters(s))
     if LINUX:
         for row in (expected, found):
             print(u"|".join(row))
     self.assertEqual(found, expected)
예제 #15
0
 def test_words(self):
     """get_words with tamil_only=False returns every space-separated word of the sentence."""
     _str = u"உடனே random elevator jazz உடனே எழுதினால் செய்திப் பத்திரிகை போஆகிவிடும் அசோகமித்திரன் நேர்காணல்"
     expected = _str.split(u" ")
     found = utf8.get_words(utf8.get_letters(_str), tamil_only=False)
     if LINUX:
         for row in (expected, found):
             print(u"|".join(row))
     self.assertEqual(found, expected)
예제 #16
0
def xkcd():
    """Run RemoveCaseSuffix.removeSuffix on three sample words and assert the expected stems."""
    stemmer = RemoveCaseSuffix()
    samples = [
        (u"பதிவிற்க்கு", u"பதிவிற்"),
        (u"கட்டளைகளை", u"கட்டளைக"),
        (u"அவர்கள்", u"அவர்"),
    ]
    for source, wanted in samples:
        outcome = stemmer.removeSuffix(source)
        # element 1 is the "suffix was removed" flag, element 0 the stem
        assert outcome[1]
        print(utf8.get_letters(source), '->', outcome[1])
        assert outcome[0] == wanted
예제 #17
0
def keech(request,k1):
    """Return a JSON object mapping each word's position in *k1* to its letter count."""
    lengths = {
        position: len(get_letters(token))
        for position, token in enumerate(k1.split(' '))
    }
    payload = json.dumps(lengths, ensure_ascii=False)
    # the response object carries the content type and the encoding
    return HttpResponse(payload, content_type="application/json; charset=utf-8")
예제 #18
0
 def getAllWordsPrefix(self,prefix):
     """Return the words gathered by getAllWordsHelper for *prefix*, led by the prefix itself when it is a word."""
     found, node = self.isWord(prefix, ret_ref_trie=True)
     letters = utf8.get_letters(prefix)
     # step into the child under the prefix's last letter; stay on the
     # current node when no such child exists
     start = node.alphabets.get(letters[-1], node)
     matches = [prefix] if found else []
     self.getAllWordsHelper(start, letters, all_words=matches)
     return matches
예제 #19
0
 def test_letter_extract_yield_with_ascii(self):
     """get_letters_iterable yields as many letters as get_letters on mixed Tamil/ASCII text."""
     ta_str = u"கூவிளம் is என்பது also என்ன a சீர்"
     letters = list(utf8.get_letters_iterable(ta_str))
     act_letters = utf8.get_letters(ta_str)
     print( "len ==== > " , len(letters),"get_letters CALL = ",len(act_letters) )
     assert len(letters) == len(act_letters)
     if LINUX:
         for pos, letter in enumerate(letters):
             print( u"%d %s"%(pos,letter) )
     self.assertEqual( letters[-4], u"a" )
예제 #20
0
    def test_tamil_only_words(self):
        """get_tamil_words returns the words of the sample sentence minus the two English ones."""
        string = u"உடனே உடனே seventh heaven எழுதினால் செய்திப் பத்திரிகை போஆகிவிடும் அசோகமித்திரன் நேர்காணல்"
        words = string.replace(u"seventh heaven ",u"").split(u" ")

        letters = utf8.get_letters( string )
        outWords = utf8.get_tamil_words( letters )

        # print() calls replace the Python 2 print statements, which do not
        # parse on Python 3; with one argument the output is unchanged
        print(u"|".join(words))
        print(u"|".join(outWords))

        assert( outWords == words )
예제 #21
0
    def test_words(self):
        """get_words returns every space-separated word of the mixed Tamil/English sentence."""
        string = u"உடனே random elevator jazz உடனே எழுதினால் செய்திப் பத்திரிகை போஆகிவிடும் அசோகமித்திரன் நேர்காணல்"
        words = string.split(u" ")

        letters = utf8.get_letters( string )
        outWords = utf8.get_words( letters )

        # print() calls replace the Python 2 print statements, which do not
        # parse on Python 3; with one argument the output is unchanged
        print(u"|".join(words))
        print(u"|".join(outWords))

        assert( outWords == words )
def xkcd():
    """Check RemovePluralSuffix directly and through CaseFilter on four sample words."""
    stemmer = RemovePluralSuffix()
    case_filter = CaseFilter(stemmer)
    samples = [
        (u"பதிவில்", u"பதிவி"),
        (u"கட்டளைகள்", u"கட்டளை"),
        (u"அவர்கள்", u"அவர்"),
        (u"பள்ளிகள்", u"பள்ளி"),
    ]
    for source, wanted in samples:
        outcome = stemmer.removeSuffix(source)
        filtered = case_filter.apply(source)
        # the filter must agree with the stem returned by removeSuffix
        assert filtered == outcome[0]
        assert outcome[1]
        print(utf8.get_letters(source), "->", outcome[1])
        assert outcome[0] == wanted
예제 #23
0
 def test_letter_extract_yield(self):
     """get_letters_iterable yields 16 letters for the sample phrase, matching get_letters."""
     ta_str = u"கூவிளம் என்பது என்ன சீர்"
     act_letters = utf8.get_letters(ta_str)
     letters = list(utf8.get_letters_iterable(ta_str))
     print( "len ==== > " , len(letters) )
     assert len(letters) == 16
     print( "len ==== > " , len(letters),"get_letters CALL = ",len(act_letters) )
     assert len(letters) == len(act_letters)
     if LINUX:
         for pos, letter in enumerate(letters):
             print(u"%d %s"%(pos,letter))
     # the last letter yielded
     assert letters[-1] == (u"ர்")
 def test_all_valid(self):
     """Every whitespace-separated token of the Project Madurai sample must pass BadIME.apply."""
     DEBUG = False
     with codecs.open("data/project_madurai_utf8.txt","r","utf-8") as f:
         # keep only lines longer than two characters
         data = [line for line in f.readlines() if len(line) > 2]
     obj = BadIME()
     # "\\s+" is the same pattern as before, written without the invalid
     # "\s" string escape that newer Python versions warn about
     splitter = re.compile(u"\\s+")
     for idx,line in enumerate(data):
         for col,word in enumerate( splitter.split(line) ):
             if DEBUG:
                 print(idx,col)
                 print(utf8.get_letters(word))
             self.assertEqual(obj.apply(word),(True,None))
예제 #25
0
def test_ngram(request,ng):
    """Count the letter bigrams of *ng* (whitespace removed, lower-cased) and return them as JSON.

    A bigram is recorded only when both letters are alphabetic or both are
    Tamil according to utf8.is_tamil_unicode.
    """
    obj = DTrie()
    prev_letter = u''
    # per-line processor - remove spaces; the raw string keeps "\s" a regex
    # class instead of an invalid string escape
    compact = u"".join(re.split(r'\s+',ng)).lower()
    for char in get_letters(compact):
        if (prev_letter.isalpha() and char.isalpha()) or ( utf8.is_tamil_unicode(prev_letter) and utf8.is_tamil_unicode(char)):
            bigram = u"".join([prev_letter,char])
            obj.add(bigram)
        # update previous
        prev_letter = char
    actual = obj.getAllWordsAndCount()
    json_string = json.dumps(actual,ensure_ascii = False)
    # creating a Response object to set the content type and the encoding
    response = HttpResponse(json_string,content_type="application/json; charset=utf-8" )
    return response
예제 #26
0
 def isWordAndTrie(self,word):
     """Walk the trie along the letters of *word*.

     Returns ``(flag, node)``: the is_word flag read for the last letter
     visited and the node that flag was read from (None for an empty word).
     """
     node = self.trie
     flag = False
     parent = None
     for letter in utf8.get_letters(word):
         flag = node.is_word.get(letter, False)
         parent = node
         node = node.alphabets.get(letter, None)
         if not node:
             # no deeper branch: stop at the last node reached
             break
     return flag, parent
예제 #27
0
 def isWord(self,word):
     """Return True when *word* is marked as a word in the trie, otherwise False."""
     letters = utf8.get_letters(word)
     if not letters:
         # an empty word is never stored; without this guard `idx` below
         # stayed unbound and the final lookup crashed
         return False
     last = len(letters) - 1
     ref_trie = self.trie
     ref_word_limits = self.word_limits
     for itr,letter in enumerate(letters):
         idx = self.getidx( letter )
         if itr == last:
             break
         if not ref_trie[idx][1]:
             return False #this branch of Trie did not exist
         ref_trie = ref_trie[idx][1]
         ref_word_limits = ref_word_limits[idx][1]
     # word_limits mirrors the trie and holds the end-of-word flags
     return ref_word_limits[idx][0]
예제 #28
0
def map_to_braille(tamil_string):
    """Translate *tamil_string* letter by letter into a list of entries looked up in `table`."""
    cells = []
    for letter in get_letters(tamil_string):
        if letter in grantha_mei_letters:
            # consonant with pulli: its agaram form followed by the pulli symbol
            base = grantha_agaram_letters[grantha_mei_letters.index(letter)]
            cells.append(table[base])
            cells.append(table[pulli_symbols[0]])
            continue
        if letter in uyir_letters or letter == ayudha_letter:
            cells.append(table[letter])
            continue
        # anything else is split into its consonant and vowel parts
        mei, uyir = splitMeiUyir(letter)
        base = grantha_agaram_letters[grantha_mei_letters.index(mei)]
        cells.append(table[base])
        if uyir != 'அ':
            cells.append(table[uyir])
    return cells
예제 #29
0
def main():
    """Histogram the total maaththirai of kurals 1..1330, fit a Gaussian to it and plot both."""
    eq = Counter()
    eqd = {}
    kural = Thirukkural()
    for number in range(1, 1331):
        text = kural.get_kural_no(number).ta
        kural_words = get_tamil_words(get_letters(text))
        mathirai = sum(total_maaththirai(word) for word in kural_words)
        # eqd lists the kural numbers per total, eq counts them
        eqd.setdefault(mathirai, []).append(number)
        eq[mathirai] += 1
    eq_sorted = OrderedDict(sorted(eq.items()))
    pprint(eq_sorted)
    pprint(eq_sorted.values())
    pprint(eqd)
    print("total = ", sum(eq.values()))
    plt.scatter(eq_sorted.keys(), eq_sorted.values())
    label_font = {"fontname": "Catamaran"}
    plt.ylabel(u"குறட்பாக்கள் எண்ணிக்கை", label_font)
    plt.xlabel(u"மாத்திரை அளவு", {"fontname": "Catamaran"})

    # initial guess for the fitting coefficients (A, mu and sigma)
    initial_guess = [75.0, 20.0, 5.0]
    coeff, var_matrix = curve_fit(
        gauss,
        list(eq_sorted.keys()),
        list(eq_sorted.values()),
        p0=initial_guess,
    )
    fitted_mean, fitted_std = coeff[1], coeff[2]

    # evaluate the fitted curve at the observed totals
    hist_fit = gauss(list(eq_sorted.keys()), *coeff)
    plt.plot(
        eq_sorted.keys(),
        hist_fit,
        label="Gaussian Fitted data (mean=%g, std=%g)" % (fitted_mean, fitted_std),
    )
    plt.title(
        r"குறள் மாத்திரை வரிசை (Gauss \mu=%g, \sigma=%g)" % (fitted_mean, fitted_std),
        {"fontname": "Catamaran"},
    )

    # report the fitted mean and standard deviation
    print("Fitted mean = ", fitted_mean)
    print("Fitted standard deviation = ", fitted_std)

    plt.show()
예제 #30
0
    def isWord(self, word, ret_ref_trie=False):
        """See if *word* is present in the current Trie.

        Returns True or False. With ``ret_ref_trie`` set, a walk that
        reaches the last letter returns the tuple
        ``(is_word, ref_trie, ref_word_limits)`` for the level holding it.

        NOTE(review): when an intermediate branch is missing, a bare False
        is returned even with ``ret_ref_trie`` set; callers that unpack the
        result should be checked.
        """
        letters = utf8.get_letters(word)
        ref_trie = self.trie
        ref_word_limits = self.word_limits
        if not letters:
            # an empty word is never stored; without this guard `idx` below
            # stayed unbound and the final lookup crashed
            if ret_ref_trie:
                return False, ref_trie, ref_word_limits
            return False
        last = len(letters) - 1
        for itr, letter in enumerate(letters):
            idx = self.getidx(letter)
            if itr == last:
                break
            if not ref_trie[idx][1]:
                return False  #this branch of Trie did not exist
            ref_trie = ref_trie[idx][1]
            ref_word_limits = ref_word_limits[idx][1]

        if ret_ref_trie:
            return ref_word_limits[idx][0], ref_trie, ref_word_limits
        return ref_word_limits[idx][0]
예제 #31
0
 def test_istamil( self ):
     """Check utf8.istamil on pure-Tamil words and on text mixed with spaces and digits."""
     zz = u"முத்தையா அண்ணாமலை எந்த ஒரு தெரிந்த அல்லது தெரியாத எழுத்துருவாகவிருந்தாலும் அதனை மேல்தட்டில் உள்ளிட்டு கீழே உள்ள முடியும்"
     for z in zz.split(u" "):
         print("********** t/f ********")
         letters = utf8.get_letters(z)
         flags = [utf8.istamil(letter) for letter in letters]
         for flag, letter in zip(flags, letters):
             if LINUX:
                 print(u"%s => %s"%(letter,flag))
             assert all(flags)
     
     z = u"முத்தையா அண்ணாமலை"
     assert any(utf8.istamil(letter) for letter in utf8.get_letters(z))
     
     correct = [True, True, True, True, False, True, True, True, True, True, False, False, False, False, False]
     mixed = u"முத்தையா அண்ணாமலை 2013"
     print ( list(map(utf8.istamil,utf8.get_letters(mixed))) )
     print ( correct )
     assert list(map(utf8.istamil,utf8.get_letters(mixed))) == correct
예제 #32
0
 def add(self, word):
     """Insert *word* into the trie, creating missing nodes, and increment its count."""
     letters = utf8.get_letters(word)
     assert len(letters) >= 1
     node = self.trie
     for letter in letters:
         parent = node
         child = parent.alphabets.get(letter, None)
         if not child:
             # first time this letter is seen at this level
             parent.alphabets[letter] = Node()
             parent.is_word[letter] = False
             parent.count[letter] = 0
             child = parent.alphabets[letter]
         node = child
     # the flag and count live on the node that holds the last letter
     parent.is_word[letter] = True
     parent.count[letter] += 1
예제 #33
0
 def add(self,word):
     """Insert *word* into the trie, creating missing nodes, and increment its count."""
     letters = utf8.get_letters(word)
     assert len(letters) >= 1
     current = self.trie
     for letter in letters:
         holder = current
         existing = holder.alphabets.get(letter, None)
         if existing:
             current = existing
             continue
         # first time this letter is seen at this level
         holder.alphabets[letter] = Node()
         holder.is_word[letter] = False
         holder.count[letter] = 0
         current = holder.alphabets[letter]
     # the flag and count live on the node that holds the last letter
     holder.is_word[letter] = True
     holder.count[letter] += 1
예제 #34
0
 def add(self,word):
     """Insert *word* into the trie.

     Letters for which ``self.getidx`` raises are skipped (best effort).
     The end-of-word flag is set in ``self.word_limits`` for the last
     letter that could be indexed; a word with no such letter is ignored.
     """
     # Resolve the indices first. Previously a rejected *last* letter left
     # `idx` pointing at the previous letter one level too deep (marking a
     # wrong word), and a word with no indexable letter left it unbound.
     indices = []
     for letter in utf8.get_letters(word):
         try:
             indices.append(self.getidx( letter ))
         except Exception:
             continue
     if not indices:
         return
     # trie data structure is built here
     ref_trie = self.trie
     ref_word_limits = self.word_limits
     for idx in indices[:-1]:
         ref_trie[idx][0] = True
         if not ref_trie[idx][1]:
             ref_trie[idx][1] = Trie.mk_empty_trie(self.L)
             ref_word_limits[idx][1] = Trie.mk_empty_trie(self.L)
         ref_trie = ref_trie[idx][1]
         ref_word_limits = ref_word_limits[idx][1]
     idx = indices[-1]
     ref_trie[idx][0] = True
     ref_word_limits[idx][0] = True
예제 #35
0
 def add(self, word):
     """Insert *word* into the trie.

     Letters for which ``self.getidx`` raises are skipped (best effort).
     The end-of-word flag is set in ``self.word_limits`` for the last
     letter that could be indexed; a word with no such letter is ignored.
     """
     # Resolve the indices first. Previously a rejected *last* letter left
     # `idx` pointing at the previous letter one level too deep (marking a
     # wrong word), and a word with no indexable letter left it unbound.
     indices = []
     for letter in utf8.get_letters(word):
         try:
             indices.append(self.getidx(letter))
         except Exception:
             continue
     if not indices:
         return
     # trie data structure is built here
     ref_trie = self.trie
     ref_word_limits = self.word_limits
     for idx in indices[:-1]:
         ref_trie[idx][0] = True
         if not ref_trie[idx][1]:
             ref_trie[idx][1] = Trie.mk_empty_trie(self.L)
             ref_word_limits[idx][1] = Trie.mk_empty_trie(self.L)
         ref_trie = ref_trie[idx][1]
         ref_word_limits = ref_word_limits[idx][1]
     idx = indices[-1]
     ref_trie[idx][0] = True
     ref_word_limits[idx][0] = True
예제 #36
0
    def isWordAndTrie(self, word, prefix=False):
        """Walk the trie along the letters of *word*.

        With ``prefix`` false, returns ``(flag, node)``: the is_word flag
        read for the last letter visited and the node it was read from.
        With ``prefix`` true, returns a bool: True only when every letter
        was consumed and a child node exists under the last one.

        NOTE: in prefix mode an empty word leaves ``idx`` unbound.
        """
        node = self.trie
        letters = utf8.get_letters(word)
        flag = False
        parent = None
        for idx, letter in enumerate(letters):
            flag = node.is_word.get(letter, False)
            parent = node
            node = node.alphabets.get(letter, None)
            if not node:
                break

        if not prefix:
            return flag, parent

        # prefix mode: the walk must have reached the last letter and still
        # have a node below it
        stopped_early = idx < (len(letters) - 1)
        return not (stopped_early or not node)
 def setUp(self):
     """Create the accept-everything dictionary, the TamilVU dictionary and the sample word."""
     accept_all = lambda x: True
     self.AllTrueDictionary = wordutils.DictionaryWithPredicate(accept_all)
     built = DictionaryBuilder.create(TamilVU)
     self.TVU, self.TVU_size = built
     self.word = u"சவால்"
     sample_letters = utf8.get_letters(self.word)
     self.length = len(sample_letters)
def fun(e):
    """Return the number of letters utf8.get_letters finds in *e*."""
    letters = utf8.get_letters(e)
    return len(letters)
예제 #39
0
 def test_tamil_only_words(self):
     """get_tamil_words returns the words of the sample sentence minus the two English ones."""
     s = u"உடனே உடனே seventh heaven எழுதினால் செய்திப் பத்திரிகை போஆகிவிடும் அசோகமித்திரன் நேர்காணல்"
     expected = s.replace(u"seventh heaven ", u"").split(u" ")
     found = utf8.get_tamil_words(utf8.get_letters(s))
     self.assertEqual(found, expected)
예제 #40
0
 def next_tamil_letter(self):
     """Yield, in order, every letter of the file ``self.filename`` that utf8.istamil accepts.

     The file is read completely and closed before the first letter is
     yielded; ``self.handle`` keeps referring to the (now closed) file.
     """
     with codecs.open(self.filename, 'r', 'utf-8') as handle:
         self.handle = handle
         contents = handle.read()
     for letter in utf8.get_letters(contents):
         if (utf8.istamil(letter)):
             yield letter
     # Falling off the end finishes the generator. The former explicit
     # `raise StopIteration` is turned into RuntimeError by PEP 479.
예제 #41
0
def கடையெழுத்து(சொல்):
    """Return the last letter of the given word."""
    letters = tamilutf8.get_letters(சொல்)
    return letters[-1]
예제 #42
0
 def test_word_length(self):
     """The sample word splits into five letters."""
     letter_count = len(utf8.get_letters(u"மென்பொருள்"))
     self.assertEqual(5, letter_count)
예제 #43
0
 def test_letter_extract_from_code_pts(self):
     """get_letters splits the sample phrase into 16 letters, the last being 'ர்'."""
     phrase = u"கூவிளம் என்பது என்ன சீர்"
     letters = utf8.get_letters(phrase)
     count, final = len(letters), letters[-1]
     assert count == 16
     assert final == (u"ர்")
예제 #44
0
 def count_letter(self):
     """Add the number of letters in ``self.line`` to ``self.letter_toll``."""
     letters_in_line = tamil.get_letters(self.line)
     self.letter_toll += len(letters_in_line)
예제 #45
0
 def removePrefix(self,word):
     """Remove a prefix by running removeSuffix on the letter-reversed word.

     Returns ``[word, removed]`` with the stem reversed back.
     """
     mirrored = u"".join(reversed(utf8.get_letters(word)))
     stem, removed = self.removeSuffix(mirrored)
     return [utf8.reverse_word(stem), removed]
예제 #46
0
파일: ksa.py 프로젝트: subburajs/open-tamil
#!/bin/env python3
from codecs import open
from tamil import utf8
import re

with open("kuttistory.txt", "r", "utf-8") as fp:
    data = fp.readlines()


class Stats:
    """Plain holder for the running word counters."""

    # NOTE(review): `__fields__` has no special meaning to Python; it only
    # names the attributes assigned below.
    __fields__ = ("total_words", "tamil_words")


stats = Stats()
stats.total_words = 0.0
stats.tamil_words = 0.0

for line in data:
    # Raw string: "\s" is a regex class, not a string escape. Empty tokens
    # are dropped so that a blank line no longer counts as one word.
    all_words = [w for w in re.split(r"\s+", line.strip()) if w]
    ta_words = list(utf8.get_tamil_words(utf8.get_letters(line)))
    print((all_words, len(ta_words)))
    stats.tamil_words += len(ta_words)
    stats.total_words += len(all_words)

if stats.total_words:
    # tamil fraction
    taf = float(stats.tamil_words) / stats.total_words
    print(("English = {0}%, Tamil = {1}%".format(100.0 * (1 - taf),
                                                 100.0 * (taf))))
else:
    # an empty input would otherwise divide by zero
    print("No words found")
예제 #47
0
def get_letters(word):
    """Return *word* itself when it is already a list, otherwise split it with utf8.get_letters."""
    return word if isinstance(word, list) else utf8.get_letters(word)
예제 #48
0
파일: dom.py 프로젝트: vmmlog/open-tamil
 def __init__(self, word, flagged=False, **kwargs):
     """Store the word, its flag and its letters; remaining keyword arguments go to the parent class."""
     super(Entity, self).__init__(**kwargs)
     self.flagged, self.word = flagged, word
     # letters as split by utf8.get_letters
     self.letters = utf8.get_letters(word)
예제 #49
0
def முதலெழுத்து(சொல்):
    """Return the first letter of the given word."""
    letters = tamilutf8.get_letters(சொல்)
    return letters[0]
예제 #50
0
 def test_letter_extract_with_ascii(self):
     """get_letters yields 26 letters for the mixed phrase, with 'a' fourth from the end."""
     phrase = u"கூவிளம் is என்பது also என்ன a சீர்"
     letters = utf8.get_letters(phrase)
     count, fourth_from_end = len(letters), letters[-4]
     assert count == 26
     assert fourth_from_end == u"a"
예제 #51
0
 def test_shamikshu(self):
     """Every letter of the sample word satisfies both istamil and istamil_alnum."""
     word = u"க்ஷமிக்ஷூ"
     for predicate in (utf8.istamil, utf8.istamil_alnum):
         letters = utf8.get_letters(word)
         self.assertTrue(all(predicate(letter) for letter in letters))
 def setUp(self):
     """Create the accept-everything dictionary, the TamilVU dictionary and the sample word."""
     accept_all = lambda x: True
     self.AllTrueDictionary = wordutils.DictionaryWithPredicate(accept_all)
     built = DictionaryBuilder.create(TamilVU)
     self.TVU, self.TVU_size = built
     self.word = u"சவால்"
     sample_letters = utf8.get_letters(self.word)
     self.length = len(sample_letters)
예제 #53
0
 def test_odd_case(self):
     """Malformed inputs are split by get_letters into the groupings listed here."""
     # truly mal-formed inputs get mangled by get-letters
     cases = [
         (u"ஆாள்", [u"ஆா", u"ள்"]),
         (u"ஆள்்ஆ", [u"ஆ", u"ள்்", u"ஆ"]),
     ]
     for not_a_word, expected in cases:
         self.assertEqual(utf8.get_letters(not_a_word), expected)
예제 #54
0
 def get_letters_impl(self, word):
     """Split *word* into letters: one per character when ``self.is_english`` is set, otherwise via utf8.get_letters."""
     # A conditional expression replaces the `cond and a or b` trick, which
     # fell through to utf8.get_letters whenever the English branch produced
     # an empty (falsy) list.
     return list(word) if self.is_english else utf8.get_letters(word)
예제 #55
0
 def test_word_no2_length(self):
     """A regex-like string mixing ASCII punctuation and Tamil code points splits into six letters."""
     letter_count = len(utf8.get_letters(u'[\u0baa-\u0baa\u0bcc]+'))
     self.assertEqual(6, letter_count)
예제 #56
0
def எழுத்தாக்கு(சொல்):
    """Return the letters of the given word as split by tamilutf8.get_letters."""
    letters = tamilutf8.get_letters(சொல்)
    return letters
예제 #57
0
 def test_get_letters2(self):
     """Mixed English/Tamil text splits into 27 letters, with 'தெ' at index 13."""
     phrase = u"hello world  தெரிந்த அல்லது தெரியாத"
     letters = utf8.get_letters(phrase)
     count = len(letters)
     assert count == 27
     fourteenth = letters[13]
     self.assertTrue(fourteenth == u"தெ")
예제 #58
0
def tamil2eng(text):
    """Map every letter of *text* through tam2eng_map, leaving unmapped letters unchanged."""
    pieces = []
    for letter in utf8.get_letters(text):
        pieces.append(tam2eng_map.get(letter, letter))
    return ''.join(pieces)
예제 #59
0
        a = a.replace(',', '', 10000000000)
        a = a.replace('?', '', 10000000000)
        a = a.replace('ஏற்றப்படுகின்றது', '', 10000000000)
        if (utf8.all_tamil(a)):
            if a not in new:
                new.append(a)
                fil.write(a)
                fil.write('\n')
            #f.write(a)
            #f.write("\n")

        else:
            a = ''

    cnt = count()
    final = sorted(new,
                   key=lambda w: (len(utf8.get_letters(w)), next(cnt)),
                   reverse=True)[:10]
    print(final)
    URL = href['href']
    for fa in final:
        f.write(fa)
        f.write('\t\t')
        f.write(str(len(utf8.get_letters(fa))))
        f.write('\n')
    fil2.write('\n')
    fil2.write(href['href'])
    f.write('\n')
    c = c + 1
f.close()
winsound.Beep(2000, 1000)
예제 #60
0
def joinWords(word_a, word_b):
    """Join two words using the conjunction rules in `all_rules`.

    Both words are stripped of surrounding whitespace. The rule set is looked
    up by the vowel (uyir) part of the last letter of *word_a*. When no rule
    applies, or either word is empty, the words are returned separated by a
    single space.

    Returns the joined string.
    """
    word_a = word_a.strip()
    word_b = word_b.strip()
    # get readable letters of first word
    first_word_letters = get_letters(word_a)

    if not first_word_letters or not word_b:
        # nothing to inspect on one side, so no rule can apply; this used to
        # fail with IndexError on the [-1] / [0] lookups below
        return word_a + " " + word_b

    if first_word_letters[-1] in mei_letters:
        # first word last char is mei letter. so just return as it is.
        # todo : apply special conditions also
        rval = word_a + " " + word_b
        return rval

    # get mei & uyir characters of first word's last char
    first_word_last_chars = splitMeiUyir(first_word_letters[-1])
    if len(first_word_last_chars) == 2:
        first_word_last_mei_char, first_word_last_uyir_char = first_word_last_chars
    else:
        first_word_last_mei_char, first_word_last_uyir_char = (
            first_word_last_chars[0],
            first_word_last_chars[0],
        )

    # get rule sub dictionary from all dictionary by passing
    rule = all_rules.get(first_word_last_uyir_char, None)

    # `rule` can be None when no rule set exists for this vowel; the
    # same-word branch used to call rule.get() without checking.
    if rule and word_a == word_b:
        # both input words are same
        same_word_rule = rule.get("same_words", [])
        # an absent "same_words" entry yields [], which must not be indexed
        if same_word_rule and word_a in same_word_rule[0]:
            # get conjuction char
            jn = same_word_rule[1]
            # insert conjuction char between input words
            rval = first_word_letters[0] + jn + word_b
            return rval
        elif len(first_word_letters) == 3:
            # both words are same but length is 3.
            disappear_lastchar = rule.get("same_word_disappear_lastchar", [])
            if disappear_lastchar:
                disappear_lastchar = disappear_lastchar[0]
                if first_word_last_uyir_char == disappear_lastchar:
                    first_word_first_char = first_word_letters[0]
                    # get uyir char of second word's first char
                    first_word_first_uyir_char = splitMeiUyir(
                        first_word_first_char)[-1]
                    # get conjuction char by joining first word's last mei char and second word's first uyir char
                    jn = joinMeiUyir(first_word_last_mei_char,
                                     first_word_first_uyir_char)
                    # get first word till pre-last char
                    first_word = u"".join(first_word_letters[:-1])
                    # get second word from second char till end
                    second_word = u"".join(first_word_letters[1:])
                    # join all first, conjuction, second word
                    rval = first_word + jn + second_word
                    return rval

    if rule:
        if word_a in rule.get("first_solo_words", []):
            # todo : need to find tune this first solo word check like using startswith, endswith, etc
            rval = word_a + " " + word_b
            return rval

        for diff_jn in rule.get("diff_jn_words", []):
            if word_a in diff_jn[0]:
                for last in diff_jn[1]:
                    if word_b.startswith(last):
                        # apply different conjuction char rule
                        rval = word_a + diff_jn[2] + word_b
                        return rval

    # get readable letters of second word
    second_word_letters = get_letters(word_b)
    # get second word's from second char to till end
    second_word_after_first_char = u"".join(second_word_letters[1:])
    # get mei & uyir characters of second word's first char
    second_word_first_chars = splitMeiUyir(second_word_letters[0])
    if len(second_word_first_chars) == 2:
        (
            second_word_first_mei_char,
            second_word_first_uyir_char,
        ) = second_word_first_chars
    else:
        second_word_first_mei_char, second_word_first_uyir_char = (
            second_word_first_chars[0],
            second_word_first_chars[0],
        )

    if rule:
        if second_word_first_mei_char in rule.get("secondword_first_chars",
                                                  []):
            # apply major conjuction rule
            return word_a + second_word_first_mei_char + " " + word_b

        firstword_double_special_secondword = rule.get(
            "firstword_double_special_secondword", None)
        if firstword_double_special_secondword:
            if len(first_word_letters) == 4:
                # check either first word has repeated two times
                if (first_word_letters[:2] == first_word_letters[2:]
                    ):  # first word repeat two times within it
                    # get root second word by removing prefix
                    sec_word = (second_word_first_uyir_char +
                                second_word_after_first_char)
                    if sec_word in firstword_double_special_secondword[0]:
                        # get conjuction char by joining  special conjuction and  second root word
                        jn = joinMeiUyir(
                            firstword_double_special_secondword[1],
                            second_word_first_uyir_char,
                        )
                        # join all
                        return word_a + jn + second_word_after_first_char

        special_secondword_first_chars = rule.get(
            "special_secondword_first_chars", None)
        if special_secondword_first_chars:
            if second_word_first_uyir_char in special_secondword_first_chars[
                    0]:
                # get special conjuction char
                jn = special_secondword_first_chars[1]
                # join special conjuction char with second word's first uyir char
                second_word_first_schar = joinMeiUyir(
                    jn, second_word_first_uyir_char)
                # complete second word with prefix of conjuction
                second_word = second_word_first_schar + second_word_after_first_char
                # join all
                return word_a + second_word

    # if all above rules not applicable, then just return as it is !
    return word_a + " " + word_b