def test_seperate(self):
    """Check that separate() and joint() round-trip in both directions."""
    bare_letters = u"العربية"
    harakat = u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f'
    vocalized = u"اَلْعَرَبَيَةُ"

    # vocalized word -> (letters, marks) -> vocalized word
    got_letters, got_marks = ar.separate(vocalized)
    rebuilt = ar.joint(got_letters, got_marks)
    self.assertEqual(rebuilt, vocalized)

    # (letters, marks) -> vocalized word -> (letters, marks)
    round_trip = ar.separate(ar.joint(bare_letters, harakat))
    self.assertEqual(round_trip, (bare_letters, harakat))
def vocalize_foreign(word):
    """
    Vocalize a foreign name written in arabic.

    One mark is produced per character of word, then the marks are
    joined onto the word with araby.joint.

    @param word: given word
    @type word: unicode
    @return: the vocalized word
    @rtype: unicode
    """
    # Letters that impose a short vowel on the letter preceding them,
    # mapped to that short vowel.
    haraka_before = {
        araby.ALEF: araby.FATHA,
        araby.ALEF_MAKSURA: araby.FATHA,
        araby.TEH_MARBUTA: araby.FATHA,
        araby.WAW: araby.DAMMA,
        araby.WAW_HAMZA: araby.DAMMA,
        araby.YEH: araby.KASRA,
        araby.YEH_HAMZA: araby.KASRA,
    }
    marks = []
    previous = ""
    for c in word:
        # --------- fix the haraka of the previous letter
        # Skipped for the first letter (nothing to fix yet) and when the
        # previous letter is a plain ALEF.
        if previous and previous != araby.ALEF and c in haraka_before:
            marks[-1] = haraka_before[c]
        # --------- add the haraka of the current letter
        # Plain equality: the original "c in (X)" was a substring test on
        # a string, because (X) without a trailing comma is not a tuple.
        if c == araby.ALEF_HAMZA_BELOW:
            marks.append(araby.KASRA)
        elif previous in (araby.ALEF_HAMZA_BELOW, araby.ALEF_HAMZA_ABOVE):
            marks.append(araby.SUKUN)
        else:
            marks.append(araby.NOT_DEF_HARAKA)
        previous = c
    return araby.joint(word, u"".join(marks))
def decode_tashkeel(word, marks, method="ascii"):
    """
    Decode marks from a decimal/ascii string and join them onto word.

    @param word: undiacritized arabic word
    @type word: unicode
    @param marks: encoded marks
    @type marks: unicode/integer
    @param method: encoding used for marks; "decimal" selects the
        D2T_TRANS table, any other value (including the default
        "ascii") selects A2T_TRANS
    @type method: str
    @return: diacritized word
    @rtype: unicode
    """
    if not isinstance(marks, str):
        marks = str(marks)
    # zeros can be removed in int code, then we must add them to left
    marks = marks.rjust(len(word), str("0"))
    # "ascii" and unknown methods share the same translation table.
    table = D2T_TRANS if method == "decimal" else A2T_TRANS
    return ar.joint(word, translate(marks, table))
def test_joint(self):
    """Check that joint() merges bare letters and marks into a vocalized word."""
    bare_letters = u"العربية"
    harakat = u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f'
    expected = u"اَلْعَرَبَيَةُ"
    vocalized = ar.joint(bare_letters, harakat)
    self.assertEqual(vocalized, expected)
def verify_tashkeel(word):
    """
    Verify tashkeel on a vocalized word.

    The word is split with ar.separate and rebuilt with ar.joint; the
    result is True when the rebuilt word equals the input.
    """
    bare, harakat = ar.separate(word)
    rebuilt = ar.joint(bare, harakat)
    return rebuilt == word
def test_join(self):
    """Check Araby.joint(letters, marks) against a known vocalized word."""
    bare_letters = u"العربية"
    harakat = u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f'
    expected = u'اَلْعَرَبَيَةُ'
    vocalized = Araby.joint(bare_letters, harakat)
    assert vocalized == expected
# For every pattern length, print: the length, the number of patterns of
# that length, the average number of vocalized forms per pattern, and "pw".
for ln in statTable:
    patterns = statTable[ln]
    partialPatternCount = 0
    for patternkey in patterns:
        partialPatternCount += len(patterns[patternkey])
    average = partialPatternCount / len(patterns) if patterns else 0
    print("\t".join([str(ln), str(len(patterns)), str(average), "pw"]))

# test vocalize a word
text = u"يأكل الولد التفاح بالعشاء "
words = araby.tokenize(text)
for word in words:
    patternKey = harakatpattern.extractPattern(word)
    ln = len(patternKey)
    # Guard clause: nothing to vocalize when the pattern is unknown.
    if ln not in statTable or patternKey not in statTable[ln]:
        print(patternKey.encode("utf8") + " pattern non found")
        continue
    candidates = statTable[ln][patternKey]
    print(u"\t".join(candidates.keys()).encode("utf8"))
    for vocalizedPattern in candidates.keys():
        # Shadda positions are separated out and joined onto the word
        # first, then the remaining harakat are joined on top.
        letters, harakat, ShaddaPlaces = araby.separate(vocalizedPattern, True)
        newWord_nm = araby.joint(word, ShaddaPlaces)
        vocWord = araby.joint(newWord_nm, harakat)
        columns = [word, patternKey, vocalizedPattern, harakat, vocWord]
        print(u"\t".join(columns).encode("utf8"))
average = partialPatternCount / len(statTable[ln].keys()) else: average = 0 print "\t".join( [str(ln), str(len(statTable[ln].keys())), str(average), 'pw']) # test vocalize a word text = u"يأكل الولد التفاح بالعشاء " words = araby.tokenize(text) for word in words: patternKey = harakatpattern.extractPattern(word) ln = len(patternKey) if statTable.has_key(ln) and statTable[ln].has_key(patternKey): print u"\t".join(statTable[ln][patternKey].keys()).encode('utf8') for vocalizedPattern in statTable[ln][patternKey].keys(): # vocalizedPattern2=araby.stripShadda(vocalizedPattern) # letters,harakat = araby.separate(vocalizedPattern2) # vocalizedForm =araby.joint(word,harakat) letters, harakat, ShaddaPlaces = araby.separate( vocalizedPattern, True) newWord_nm = araby.joint(word, ShaddaPlaces) vocWord = araby.joint(newWord_nm, harakat) print u"\t".join( [word, patternKey, vocalizedPattern, harakat, vocWord]).encode('utf8') else: print patternKey.encode('utf8'), "pattern non found" # print wordCount/patternCount;