def get(word):
    """Build a ``Feature`` describing the letter make-up of ``word``.

    Leading/trailing whitespace and embedded spaces are removed, the word is
    split with ``utf8.get_letters`` and a new ``Feature`` is filled in:

    * ``nletters``  - number of letters, stored as a float.
    * ``unigscore`` - ``unigram_score(letters)``.
    * ``bigscore``  - largest value returned by ``bigram_scores(letters)``.
    * ``vowels``    - letters whose reverse transliteration starts with one
      of ``a e i o u``, divided by ``nletters``.
    * ``kurils``, ``nedils``, ``ayudhams``, ``vallinams``, ``mellinams``,
      ``idayinams``, ``granthams`` - per-kind counts reported by
      ``utf8.classify_letter``, each divided by ``nletters``.
    * ``first`` / ``last`` - adjusted according to whether the first / last
      letter is in ``utf8.uyir_letters``, ``utf8.mei_letters`` or
      ``utf8.uyirmei_letters``.

    :param word: the word to analyse.
    :return: the populated ``Feature`` instance.

    NOTE(review): assumes ``word`` yields at least one letter; the counts are
    divided by the letter count and ``letters[0]`` / ``letters[-1]`` are
    indexed unconditionally.
    """
    word = word.strip()
    word = word.replace(' ', '')
    letters = utf8.get_letters(word)

    F = Feature()
    F.nletters = len(letters) * 1.0
    F.unigscore = unigram_score(letters)
    F.bigscore = max(bigram_scores(letters))

    vowel_prefixes = ('a', 'e', 'i', 'o', 'u')
    for letter in letters:
        # Vowel detection is best-effort: a letter that cannot be reverse
        # transliterated is simply not counted as a vowel.
        try:
            rtl = reverse_transliterate(letter)
            # A tuple of prefixes avoids an inner loop variable that could
            # clobber ``letter`` (list comprehensions leak their variable
            # under Python 2).
            if rtl.startswith(vowel_prefixes):
                F.vowels += 1.0
        except Exception:
            pass

        kind = utf8.classify_letter(letter)
        if kind == 'kuril':
            F.kurils += 1
        elif kind == 'nedil':
            F.nedils += 1
        elif kind == 'ayudham':
            F.ayudhams += 1
        elif kind == 'vallinam':
            F.vallinams += 1
        elif kind == 'mellinam':
            F.mellinams += 1
        elif kind == 'idayinam':
            F.idayinams += 1
        elif kind == 'tamil_or_grantham':
            F.granthams += 1
        # Any other kind (e.g. 'english', 'digit') is not counted.

    # Turn raw counts into ratios. Every counter is divided exactly once;
    # dividing ``vallinams`` twice would scale it by 1/nletters**2.
    F.kurils /= F.nletters
    F.nedils /= F.nletters
    F.ayudhams /= F.nletters
    F.vallinams /= F.nletters
    F.mellinams /= F.nletters
    F.idayinams /= F.nletters
    F.granthams /= F.nletters
    F.vowels /= F.nletters

    # NOTE(review): ``x += x + c`` doubles any existing value before adding
    # the offset; kept exactly as written - confirm this is intended.
    first_letter = letters[0]
    if first_letter in utf8.uyir_letters:
        F.first += 1.0
    if first_letter in utf8.mei_letters:
        F.first += F.first + 0.25
    if first_letter in utf8.uyirmei_letters:
        F.first += F.first + 0.05

    last_letter = letters[-1]
    if last_letter in utf8.uyir_letters:
        F.last += 1.0
    if last_letter in utf8.mei_letters:
        F.last += F.last + 0.25
    if last_letter in utf8.uyirmei_letters:
        F.last += F.last + 0.05

    return F
def get(word):
    """Build a ``Feature`` describing the letter make-up of ``word``.

    Leading/trailing whitespace and embedded spaces are removed, the word is
    split with ``utf8.get_letters`` and a new ``Feature`` is filled in:

    * ``nletters``  - number of letters, stored as a float.
    * ``unigscore`` - ``unigram_score(letters)``.
    * ``bigscore``  - largest value returned by ``bigram_scores(letters)``.
    * ``vowels``    - letters whose reverse transliteration starts with one
      of ``a e i o u``, divided by ``nletters``.
    * ``kurils``, ``nedils``, ``ayudhams``, ``vallinams``, ``mellinams``,
      ``idayinams``, ``granthams`` - per-kind counts reported by
      ``utf8.classify_letter``, each divided by ``nletters``.
    * ``first`` / ``last`` - adjusted according to whether the first / last
      letter is in ``utf8.uyir_letters``, ``utf8.mei_letters`` or
      ``utf8.uyirmei_letters``.

    :param word: the word to analyse.
    :return: the populated ``Feature`` instance.

    NOTE(review): assumes ``word`` yields at least one letter; the counts are
    divided by the letter count and ``letters[0]`` / ``letters[-1]`` are
    indexed unconditionally.
    """
    word = word.strip()
    word = word.replace(u' ', u'')
    letters = utf8.get_letters(word)

    F = Feature()
    F.nletters = len(letters) * 1.0
    F.unigscore = unigram_score(letters)
    F.bigscore = max(bigram_scores(letters))

    vowel_prefixes = ('a', 'e', 'i', 'o', 'u')
    for letter in letters:
        # Vowel detection is best-effort: a letter that cannot be reverse
        # transliterated is simply not counted as a vowel.
        try:
            rtl = reverse_transliterate(letter)
            # A tuple of prefixes avoids an inner loop variable that could
            # clobber ``letter`` (list comprehensions leak their variable
            # under Python 2).
            if rtl.startswith(vowel_prefixes):
                F.vowels += 1.0
        except Exception:
            pass

        kind = utf8.classify_letter(letter)
        if kind == 'kuril':
            F.kurils += 1
        elif kind == 'nedil':
            F.nedils += 1
        elif kind == 'ayudham':
            F.ayudhams += 1
        elif kind == 'vallinam':
            F.vallinams += 1
        elif kind == 'mellinam':
            F.mellinams += 1
        elif kind == 'idayinam':
            F.idayinams += 1
        elif kind == 'tamil_or_grantham':
            F.granthams += 1
        # Any other kind (e.g. 'english', 'digit') is not counted.

    # Turn raw counts into ratios. Every counter is divided exactly once;
    # dividing ``vallinams`` twice would scale it by 1/nletters**2.
    F.kurils /= F.nletters
    F.nedils /= F.nletters
    F.ayudhams /= F.nletters
    F.vallinams /= F.nletters
    F.mellinams /= F.nletters
    F.idayinams /= F.nletters
    F.granthams /= F.nletters
    F.vowels /= F.nletters

    # NOTE(review): ``x += x + c`` doubles any existing value before adding
    # the offset; kept exactly as written - confirm this is intended.
    first_letter = letters[0]
    if first_letter in utf8.uyir_letters:
        F.first += 1.0
    if first_letter in utf8.mei_letters:
        F.first += F.first + 0.25
    if first_letter in utf8.uyirmei_letters:
        F.first += F.first + 0.05

    last_letter = letters[-1]
    if last_letter in utf8.uyir_letters:
        F.last += 1.0
    if last_letter in utf8.mei_letters:
        F.last += F.last + 0.25
    if last_letter in utf8.uyirmei_letters:
        F.last += F.last + 0.05

    return F
def test_unigram_bigram_scoring(self):
    """Check unigram and best-bigram scores of ten sample words.

    Every word is split with ``utf8.get_letters``; its ``unigram_score`` and
    the maximum of its ``bigram_scores`` are compared against stored
    reference values to two decimal places.
    """
    expected_unigram = [-10.699, -8.196, -2.6212, -7.18, -8.013,
                        -2.930, -5.40, -14.88, -3.95, -9.17]
    expected_bigram = [-16.297, -12.947, -4.0, -10.32, -4.0,
                       -2.747, -6.599, -21.1337, -3.10847, -13.83320]
    sentence = u"டைட்டானிக் படத்தில் ஜேக் மற்றும் ரோஸ் தன் காதலை வெளிப்படுத்தும் இரு தவளைகள்"

    # Score every word before asserting anything, so a scoring error
    # surfaces ahead of any comparison failure.
    scored = []
    for letters in (utf8.get_letters(w) for w in sentence.split()):
        scored.append((unigram_score(letters), max(bigram_scores(letters))))

    for (uni, big), want_uni, want_big in zip(scored, expected_unigram, expected_bigram):
        self.assertAlmostEqual(uni, want_uni, places=2)
        self.assertAlmostEqual(big, want_big, places=2)
def test_unigram_bigram_scoring(self):
    """Check unigram and best-bigram scores of ten sample words.

    Every word is split with ``utf8.get_letters``; its ``unigram_score`` and
    the maximum of its ``bigram_scores`` are compared against stored
    reference values to two decimal places.
    """
    expected_unigram = [-10.699, -8.196, -2.6212, -7.18, -8.013,
                        -2.930, -5.40, -14.88, -3.95, -9.17]
    expected_bigram = [-16.297, -12.947, -4.0, -10.32, -4.0,
                       -2.747, -6.599, -21.1337, -3.10847, -13.83320]
    sentence = u"டைட்டானிக் படத்தில் ஜேக் மற்றும் ரோஸ் தன் காதலை வெளிப்படுத்தும் இரு தவளைகள்"

    # Score every word before asserting anything, so a scoring error
    # surfaces ahead of any comparison failure.
    scored = []
    for letters in (utf8.get_letters(w) for w in sentence.split()):
        scored.append((unigram_score(letters), max(bigram_scores(letters))))

    for (uni, big), want_uni, want_big in zip(scored, expected_unigram, expected_bigram):
        self.assertAlmostEqual(uni, want_uni, places=2)
        self.assertAlmostEqual(big, want_big, places=2)
prefer = அகராதி.starts_with(prefix) if letter in tamil.utf8.agaram_letters: alternate2 = prefix + mei_letter if அகராதி.starts_with(alternate2) or prefer: mei_letter = letter + tamil.utf8.pulli_symbols[0] result2 = pulligal_branch_bound(alternate2, letters[1:]) result.extend(result2) alternate1 = prefix + letter if அகராதி.starts_with(alternate1) or prefer: result1 = pulligal_branch_bound(alternate1, letters[1:]) result.extend(result1) return result # sort in descending order result_tpl = [("".join(sol), (unigram_score(sol))) for sol in pulligal_helper("", chol)] result_tpl = sorted(result_tpl, key=operator.itemgetter(1), reverse=True) pprint(result_tpl) """ ['கணனன', 'கணனன்', 'கணன்ன', 'கணன்ன்', 'கண்னன', 'கண்னன்', 'கண்ன்ன', 'கண்ன்ன்', 'க்ணனன', 'க்ணனன்', 'க்ணன்ன',