Exemplo n.º 1
0
 def test_seperate(self):
     """Check that separate() and joint() undo each other."""
     bare_letters = u"العربية"
     harakat = u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f'
     vocalized = u"اَلْعَرَبَيَةُ"

     # Splitting a vocalized word and joining the parts gives the word back.
     split_letters, split_marks = ar.separate(vocalized)
     self.assertEqual(ar.joint(split_letters, split_marks), vocalized)

     # Joining letters with marks and splitting again gives both parts back.
     rejoined = ar.joint(bare_letters, harakat)
     self.assertEqual(ar.separate(rejoined), (bare_letters, harakat))
Exemplo n.º 2
0
 def test_seperate(self):
     """Round-trip a word through separate() and joint() in both directions."""
     plain = u"العربية"
     diacritics = u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f'
     full_word = u"اَلْعَرَبَيَةُ"

     # separate -> joint must reproduce the vocalized word.
     parts = ar.separate(full_word)
     got_letters, got_marks = parts
     self.assertEqual(ar.joint(got_letters, got_marks), full_word)

     # joint -> separate must reproduce the (letters, marks) pair.
     expected_pair = (plain, diacritics)
     self.assertEqual(ar.separate(ar.joint(plain, diacritics)), expected_pair)
Exemplo n.º 3
0
def vocalize_foreign(word):
    """
    Vocalize a foreign name written in Arabic.

    Exactly one mark is built per character of ``word``:

    - a character followed by ALEF, ALEF_MAKSURA or TEH_MARBUTA gets FATHA,
      followed by WAW or WAW_HAMZA gets DAMMA, followed by YEH or YEH_HAMZA
      gets KASRA -- unless that character is itself ALEF, in which case its
      mark is left untouched;
    - ALEF_HAMZA_BELOW initially gets KASRA;
    - a character right after ALEF_HAMZA_BELOW / ALEF_HAMZA_ABOVE initially
      gets SUKUN;
    - any other character initially gets NOT_DEF_HARAKA.

    The marks are then attached to the word with ``araby.joint``.

    @param word: given word
    @type  word:  unicode
    @return: the vocalized word
    @rtype: unicode
    """
    marks = []
    previous = ""
    for c in word:
        # --------- correct the haraka of the previous letter
        # `previous` is only non-empty after one mark has been appended,
        # so marks[-1] always exists here.
        if previous and previous != araby.ALEF:
            if c in (araby.ALEF, araby.ALEF_MAKSURA, araby.TEH_MARBUTA):
                marks[-1] = araby.FATHA
            elif c in (araby.WAW, araby.WAW_HAMZA):
                marks[-1] = araby.DAMMA
            elif c in (araby.YEH, araby.YEH_HAMZA):
                marks[-1] = araby.KASRA
        # --------- add the haraka of the current letter
        # Plain equality: the former `c in (X)` was a substring test on a
        # parenthesised string, not a tuple membership test.
        if c == araby.ALEF_HAMZA_BELOW:
            marks.append(araby.KASRA)
        elif previous in (araby.ALEF_HAMZA_BELOW, araby.ALEF_HAMZA_ABOVE):
            marks.append(araby.SUKUN)
        else:
            marks.append(araby.NOT_DEF_HARAKA)
        previous = c
    return araby.joint(word, u"".join(marks))
          
Exemplo n.º 4
0
def vocalize_foreign(word):
    """
    Vocalize a foreign name written in Arabic.

    One mark is produced for every character of ``word`` and the result is
    attached to the word with ``araby.joint``.  Rules applied per character:

    - default mark is NOT_DEF_HARAKA; ALEF_HAMZA_BELOW gets KASRA; a
      character directly after ALEF_HAMZA_BELOW or ALEF_HAMZA_ABOVE gets
      SUKUN;
    - when the next character is ALEF / ALEF_MAKSURA / TEH_MARBUTA the mark
      is replaced by FATHA, for WAW / WAW_HAMZA by DAMMA, for
      YEH / YEH_HAMZA by KASRA; the mark of an ALEF is never replaced.

    @param word: given word
    @type  word:  unicode
    @return: the vocalized word
    @rtype: unicode
    """
    marks = []
    previous = ""
    for c in word:
        # --------- correct the haraka of the previous letter
        # A mark was appended for every earlier character, so marks[-1]
        # is the mark of `previous`.
        if previous and previous != araby.ALEF:
            if c in (araby.ALEF, araby.ALEF_MAKSURA, araby.TEH_MARBUTA):
                marks[-1] = araby.FATHA
            elif c in (araby.WAW, araby.WAW_HAMZA):
                marks[-1] = araby.DAMMA
            elif c in (araby.YEH, araby.YEH_HAMZA):
                marks[-1] = araby.KASRA
        # --------- add the haraka of the current letter
        # Compare with `==`; `c in (X)` without a trailing comma tests
        # substring containment in a string rather than tuple membership.
        if c == araby.ALEF_HAMZA_BELOW:
            marks.append(araby.KASRA)
        elif previous in (araby.ALEF_HAMZA_BELOW, araby.ALEF_HAMZA_ABOVE):
            marks.append(araby.SUKUN)
        else:
            marks.append(araby.NOT_DEF_HARAKA)
        previous = c
    return araby.joint(word, u"".join(marks))
Exemplo n.º 5
0
def decode_tashkeel(word, marks, method="ascii"):
    """
    Decode marks from a decimal/ascii string and join them onto word.

    @param word: undiacritized arabic word
    @type word: unicode
    @param marks: encoded marks
    @type marks: unicode/integer
    @param method: "decimal" translates with D2T_TRANS; any other value
        (including the default "ascii") translates with A2T_TRANS
    @type method: str
    @return:  diacritized word
    @rtype: unicode
    """
    if not isinstance(marks, str):
        marks = str(marks)
    # Leading zeros are lost when the code is stored as an integer, so pad
    # on the left up to the word length.
    # NOTE(review): str("0") is kept as-is; presumably it forces a native
    # str fill character under Python 2 unicode_literals -- confirm.
    marks = marks.rjust(len(word), str("0"))
    # Unknown methods fall back to the ascii table, as before.
    table = D2T_TRANS if method == "decimal" else A2T_TRANS
    return ar.joint(word, translate(marks, table))
Exemplo n.º 6
0
 def test_joint(self):
     """Check joint() against a known vocalized word."""
     bare_letters = u"العربية"
     harakat = u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f'
     vocalized = ar.joint(bare_letters, harakat)
     self.assertEqual(vocalized, u"اَلْعَرَبَيَةُ")
Exemplo n.º 7
0
def verify_tashkeel(word):
    """Return True when word survives a separate()/joint() round trip."""
    # Split the word into its two parts, then rebuild it from them.
    bare, harakat = ar.separate(word)
    rebuilt = ar.joint(bare, harakat)
    if rebuilt == word:
        return True
    return False
Exemplo n.º 8
0
    def test_join(self):
        """Check Araby.joint(letters, marks) against a known vocalized word."""
        harakat = u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f'
        vocalized = Araby.joint(u"العربية", harakat)
        assert vocalized == u'اَلْعَرَبَيَةُ'
Exemplo n.º 9
0
 def test_joint(self):
     """Joining the bare letters with the marks yields the vocalized word."""
     expected = u"اَلْعَرَبَيَةُ"
     plain = u"العربية"
     diacritics = u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f'
     self.assertEqual(ar.joint(plain, diacritics), expected)
Exemplo n.º 10
0
    # For every top-level key `ln` of statTable, report how many pattern keys
    # it holds and the average number of entries stored under each pattern.
    for ln in statTable.keys():
        # total number of entries over all patterns stored under `ln`
        partialPatternCount = 0
        for patternkey in statTable[ln].keys():
            partialPatternCount += len(statTable[ln][patternkey].keys())
        # guard against dividing by zero when `ln` holds no pattern at all
        if statTable[ln].keys():
            # NOTE(review): the print statement below implies Python 2, where
            # `/` on two ints is integer division -- confirm that is intended.
            average = partialPatternCount / len(statTable[ln].keys())
        else:
            average = 0
        # one tab-separated row: key, pattern count, average, literal "pw"
        print "\t".join([str(ln), str(len(statTable[ln].keys())), str(average), "pw"])
# Test: vocalize the words of a sample sentence using the patterns in statTable.
text = u"يأكل الولد التفاح بالعشاء "
words = araby.tokenize(text)
for word in words:
    # statTable is indexed first by the length of the pattern key, then by
    # the pattern key itself.
    patternKey = harakatpattern.extractPattern(word)
    ln = len(patternKey)
    if statTable.has_key(ln) and statTable[ln].has_key(patternKey):
        # list every vocalized pattern recorded for this pattern key
        print u"\t".join(statTable[ln][patternKey].keys()).encode("utf8")
        for vocalizedPattern in statTable[ln][patternKey].keys():
            # vocalizedPattern2=araby.stripShadda(vocalizedPattern)
            # letters,harakat = araby.separate(vocalizedPattern2)
            # vocalizedForm =araby.joint(word,harakat)

            # With True as second argument, separate() is unpacked into three
            # values here; the third one is joined onto the word first and
            # the harakat are joined onto that result.
            letters, harakat, ShaddaPlaces = araby.separate(vocalizedPattern, True)
            newWord_nm = araby.joint(word, ShaddaPlaces)
            vocWord = araby.joint(newWord_nm, harakat)
            # tab-separated row: word, pattern key, pattern, harakat, result
            print u"\t".join([word, patternKey, vocalizedPattern, harakat, vocWord]).encode("utf8")
    else:
        # no entry in statTable for this pattern key
        print patternKey.encode("utf8"), "pattern non found"

        # print wordCount/patternCount;
            average = partialPatternCount / len(statTable[ln].keys())
        else:
            average = 0
        print "\t".join(
            [str(ln),
             str(len(statTable[ln].keys())),
             str(average), 'pw'])
# Test: vocalize the words of a sample sentence using the patterns in statTable.
text = u"يأكل الولد التفاح بالعشاء "
words = araby.tokenize(text)
for word in words:
    # statTable is indexed first by the length of the pattern key, then by
    # the pattern key itself.
    patternKey = harakatpattern.extractPattern(word)
    ln = len(patternKey)
    if statTable.has_key(ln) and statTable[ln].has_key(patternKey):
        # list every vocalized pattern recorded for this pattern key
        print u"\t".join(statTable[ln][patternKey].keys()).encode('utf8')
        for vocalizedPattern in statTable[ln][patternKey].keys():
            # vocalizedPattern2=araby.stripShadda(vocalizedPattern)
            # letters,harakat = araby.separate(vocalizedPattern2)
            # vocalizedForm =araby.joint(word,harakat)

            # With True as second argument, separate() is unpacked into three
            # values here; the third one is joined onto the word first and
            # the harakat are joined onto that result.
            letters, harakat, ShaddaPlaces = araby.separate(
                vocalizedPattern, True)
            newWord_nm = araby.joint(word, ShaddaPlaces)
            vocWord = araby.joint(newWord_nm, harakat)
            # tab-separated row: word, pattern key, pattern, harakat, result
            print u"\t".join(
                [word, patternKey, vocalizedPattern, harakat,
                 vocWord]).encode('utf8')
    else:
        # no entry in statTable for this pattern key
        print patternKey.encode('utf8'), "pattern non found"

    # print wordCount/patternCount;