def get(word):
    """Build a Feature describing the letter make-up of ``word``.

    The word is stripped, its spaces are removed, and it is split into
    letters with ``utf8.get_letters``.  The returned ``Feature`` carries:

    * ``nletters`` -- number of letters, as a float.
    * ``unigscore`` / ``bigscore`` -- ``unigram_score(letters)`` and the
      maximum of ``bigram_scores(letters)``.
    * ``vowels`` -- fraction of letters whose reverse transliteration
      starts with one of ``a e i o u``.
    * ``kurils``, ``nedils``, ``ayudhams``, ``vallinams``, ``mellinams``,
      ``idayinams``, ``granthams`` -- fraction of letters of each kind
      reported by ``utf8.classify_letter``.
    * ``first`` / ``last`` -- bumped according to whether the first / last
      letter is in ``utf8.uyir_letters``, ``utf8.mei_letters`` or
      ``utf8.uyirmei_letters``.

    The word must contain at least one letter: the letter count is used as
    a divisor and the first and last letters are indexed directly.

    NOTE(review): assumes ``Feature()`` initialises every counter to a
    number (presumably 0) -- confirm against the ``Feature`` class.
    """
    word = word.strip().replace(u' ', u'')
    letters = utf8.get_letters(word)

    F = Feature()
    F.nletters = float(len(letters))
    F.unigscore = unigram_score(letters)
    F.bigscore = max(bigram_scores(letters))

    vowel_prefixes = ('a', 'e', 'i', 'o', 'u')
    for letter in letters:
        try:
            if reverse_transliterate(letter).startswith(vowel_prefixes):
                F.vowels += 1.0
        except Exception:
            # Best effort: a letter that cannot be reverse-transliterated
            # simply does not count as a vowel.
            pass

        # The loop variable is deliberately not reused in the vowel check
        # above, so the letter classified here is always the current one.
        kind = utf8.classify_letter(letter)
        if kind == 'kuril':
            F.kurils += 1
        elif kind == 'nedil':
            F.nedils += 1
        elif kind == 'ayudham':
            F.ayudhams += 1
        elif kind == 'vallinam':
            F.vallinams += 1
        elif kind == 'mellinam':
            F.mellinams += 1
        elif kind == 'idayinam':
            F.idayinams += 1
        elif kind == 'tamil_or_grantham':
            F.granthams += 1
        # Any other kind (e.g. 'english', 'digit') is not counted.

    # Turn raw counts into per-letter ratios; each counter is divided
    # exactly once.
    F.kurils /= F.nletters
    F.nedils /= F.nletters
    F.ayudhams /= F.nletters
    F.vallinams /= F.nletters
    F.mellinams /= F.nletters
    F.idayinams /= F.nletters
    F.granthams /= F.nletters
    F.vowels /= F.nletters

    # NOTE(review): the ``x += x + c`` form doubles any value already
    # stored before adding ``c``; kept as written because the starting
    # value of ``first`` / ``last`` is not visible here.
    first_letter = letters[0]
    if first_letter in utf8.uyir_letters:
        F.first += 1.0
    if first_letter in utf8.mei_letters:
        F.first += F.first + 0.25
    if first_letter in utf8.uyirmei_letters:
        F.first += F.first + 0.05

    last_letter = letters[-1]
    if last_letter in utf8.uyir_letters:
        F.last += 1.0
    if last_letter in utf8.mei_letters:
        F.last += F.last + 0.25
    if last_letter in utf8.uyirmei_letters:
        F.last += F.last + 0.05

    return F
Пример #2
0
    def get(word):
        """Build a Feature describing the letter make-up of ``word``.

        The word is stripped, its spaces are removed, and it is split into
        letters with ``utf8.get_letters``.  The returned ``Feature`` carries:

        * ``nletters`` -- number of letters, as a float.
        * ``unigscore`` / ``bigscore`` -- ``unigram_score(letters)`` and the
          maximum of ``bigram_scores(letters)``.
        * ``vowels`` -- fraction of letters whose reverse transliteration
          starts with one of ``a e i o u``.
        * ``kurils``, ``nedils``, ``ayudhams``, ``vallinams``,
          ``mellinams``, ``idayinams``, ``granthams`` -- fraction of letters
          of each kind reported by ``utf8.classify_letter``.
        * ``first`` / ``last`` -- bumped according to whether the first /
          last letter is in ``utf8.uyir_letters``, ``utf8.mei_letters`` or
          ``utf8.uyirmei_letters``.

        The word must contain at least one letter: the letter count is used
        as a divisor and the first and last letters are indexed directly.

        NOTE(review): assumes ``Feature()`` initialises every counter to a
        number (presumably 0) -- confirm against the ``Feature`` class.
        """
        word = word.strip().replace(' ', '')
        letters = utf8.get_letters(word)

        F = Feature()
        F.nletters = float(len(letters))
        F.unigscore = unigram_score(letters)
        F.bigscore = max(bigram_scores(letters))

        vowel_prefixes = ('a', 'e', 'i', 'o', 'u')
        for letter in letters:
            try:
                if reverse_transliterate(letter).startswith(vowel_prefixes):
                    F.vowels += 1.0
            except Exception:
                # Best effort: a letter that cannot be reverse-transliterated
                # simply does not count as a vowel.
                pass

            # The loop variable is deliberately not reused in the vowel
            # check above, so the letter classified here is always the
            # current one.
            kind = utf8.classify_letter(letter)
            if kind == 'kuril':
                F.kurils += 1
            elif kind == 'nedil':
                F.nedils += 1
            elif kind == 'ayudham':
                F.ayudhams += 1
            elif kind == 'vallinam':
                F.vallinams += 1
            elif kind == 'mellinam':
                F.mellinams += 1
            elif kind == 'idayinam':
                F.idayinams += 1
            elif kind == 'tamil_or_grantham':
                F.granthams += 1
            # Any other kind (e.g. 'english', 'digit') is not counted.

        # Turn raw counts into per-letter ratios; each counter is divided
        # exactly once.
        F.kurils /= F.nletters
        F.nedils /= F.nletters
        F.ayudhams /= F.nletters
        F.vallinams /= F.nletters
        F.mellinams /= F.nletters
        F.idayinams /= F.nletters
        F.granthams /= F.nletters
        F.vowels /= F.nletters

        # NOTE(review): the ``x += x + c`` form doubles any value already
        # stored before adding ``c``; kept as written because the starting
        # value of ``first`` / ``last`` is not visible here.
        first_letter = letters[0]
        if first_letter in utf8.uyir_letters:
            F.first += 1.0
        if first_letter in utf8.mei_letters:
            F.first += F.first + 0.25
        if first_letter in utf8.uyirmei_letters:
            F.first += F.first + 0.05

        last_letter = letters[-1]
        if last_letter in utf8.uyir_letters:
            F.last += 1.0
        if last_letter in utf8.mei_letters:
            F.last += F.last + 0.25
        if last_letter in utf8.uyirmei_letters:
            F.last += F.last + 0.05

        return F
 def test_unigram_bigram_scoring(self):
     """Compare unigram and best-bigram scores of sample words to references.

     Every whitespace-separated word of the sample sentence is split into
     letters; its ``unigram_score`` and the maximum of its
     ``bigram_scores`` must match the stored reference values to two
     decimal places.
     """
     sample_words = u"டைட்டானிக் படத்தில் ஜேக் மற்றும் ரோஸ் தன் காதலை வெளிப்படுத்தும் இரு தவளைகள்".split()
     expected_unigram = [
         -10.699, -8.196, -2.6212, -7.18, -8.013,
         -2.930, -5.40, -14.88, -3.95, -9.17,
     ]
     expected_bigram = [
         -16.297, -12.947, -4.0, -10.32, -4.0,
         -2.747, -6.599, -21.1337, -3.10847, -13.83320,
     ]

     # Score every word first; the assertions only start once all scores
     # have been computed.
     scored = [
         (unigram_score(word_letters), max(bigram_scores(word_letters)))
         for word_letters in (utf8.get_letters(w) for w in sample_words)
     ]

     for (uni, best_bi), want_uni, want_bi in zip(
             scored, expected_unigram, expected_bigram):
         self.assertAlmostEqual(uni, want_uni, places=2)
         self.assertAlmostEqual(best_bi, want_bi, places=2)
Пример #4
0
 def test_unigram_bigram_scoring(self):
     """Compare unigram and best-bigram scores of sample words to references.

     Every whitespace-separated word of the sample sentence is split into
     letters; its ``unigram_score`` and the maximum of its
     ``bigram_scores`` must match the stored reference values to two
     decimal places.
     """
     sample_words = u"டைட்டானிக் படத்தில் ஜேக் மற்றும் ரோஸ் தன் காதலை வெளிப்படுத்தும் இரு தவளைகள்".split()
     expected_unigram = [
         -10.699, -8.196, -2.6212, -7.18, -8.013,
         -2.930, -5.40, -14.88, -3.95, -9.17,
     ]
     expected_bigram = [
         -16.297, -12.947, -4.0, -10.32, -4.0,
         -2.747, -6.599, -21.1337, -3.10847, -13.83320,
     ]

     # Score every word first; the assertions only start once all scores
     # have been computed.
     scored = [
         (unigram_score(word_letters), max(bigram_scores(word_letters)))
         for word_letters in (utf8.get_letters(w) for w in sample_words)
     ]

     for (uni, best_bi), want_uni, want_bi in zip(
             scored, expected_unigram, expected_bigram):
         self.assertAlmostEqual(uni, want_uni, places=2)
         self.assertAlmostEqual(best_bi, want_bi, places=2)