Пример #1
0
def vocalize(noun, proclitic, prefix, suffix, enclitic):
    """
    Join the noun and its affixes, and get the vocalized form.

    @param noun: noun found in dictionary.
    @type noun: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param prefix: second level prefix.
    @type prefix: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: vocalized word.
    @rtype: unicode.
    """
    # the vocalized affix forms come from the affix tag tables
    enclitic_voc = snconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    proclitic_voc = snconst.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]

    # strip the last mark if it is a tanwin or a haraka
    if noun[-1:] in araby.HARAKAT:
        noun = noun[:-1]

    # Complete the dictionary vocalization: every alef / alef maksura gets a
    # fatha in front of it, which covers harakat missing before ALEF.
    noun = noun.replace(araby.ALEF, araby.FATHA + araby.ALEF)
    noun = noun.replace(araby.ALEF_MAKSURA, araby.FATHA + araby.ALEF_MAKSURA)
    # Plain u"" literals on purpose: the Python 2-only ur"" prefix is a
    # SyntaxError on Python 3, and these patterns contain no backslashes,
    # so the regexes are unchanged.
    # collapse any run of fatha (possibly created above) into a single one
    noun = re.sub(u"(%s)+" % araby.FATHA, araby.FATHA, noun)
    # remove the initial fatha if alef is the first letter
    noun = re.sub(u"^(%s)+" % araby.FATHA, "", noun)

    # add a shadda if the first letter is a sun letter and the proclitic
    # ends with the AL definition article
    if proclitic.endswith(araby.ALEF + araby.LAM) and araby.is_sun(noun[0]):
        noun = u''.join([noun[0], araby.SHADDA, noun[1:]])
        # strip the sukun from the lam
        if proclitic_voc.endswith(araby.SUKUN):
            proclitic_voc = proclitic_voc[:-1]

    # adapt the noun to the suffix, then to the enclitic
    noun = get_word_variant(noun, suffix)
    noun = get_word_variant(noun, enclitic)
    # the suffix itself is passed in unvocalized and adapted here
    suffix_voc = get_suffix_variant(noun, suffix, enclitic)
    return ''.join([proclitic_voc, prefix, noun, suffix_voc, enclitic_voc])
Пример #2
0
def vocalize(noun, proclitic, prefix, suffix, enclitic):
    """
    Assemble the vocalized word from a noun and its four affixes.

    @param noun: noun found in dictionary.
    @type noun: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param prefix: second level prefix.
    @type prefix: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: vocalized word.
    @rtype: unicode.
    """
    # vocalized affix forms, read from the affix tag tables
    voc_enclitic = snconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    voc_proclitic = snconst.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]

    stem = noun
    # a trailing haraka or tanwin is dropped
    if stem[-1:] in araby.HARAKAT:
        stem = stem[:-1]

    # every alef and alef maksura is given a preceding fatha ...
    for long_vowel in (araby.ALEF, araby.ALEF_MAKSURA):
        stem = stem.replace(long_vowel, araby.FATHA + long_vowel)
    # ... then runs of fatha are collapsed to one ...
    stem = re.sub(u"(%s)+" % araby.FATHA, araby.FATHA, stem)
    # ... and a leading fatha (alef as first letter) is removed
    stem = re.sub(u"^(%s)+" % araby.FATHA, "", stem)

    # a sun letter after the AL article takes a shadda,
    # and the lam of the article loses its sukun
    has_article = proclitic.endswith(araby.ALEF + araby.LAM)
    if has_article and araby.is_sun(stem[0]):
        stem = u''.join((stem[0], araby.SHADDA, stem[1:]))
        if voc_proclitic.endswith(araby.SUKUN):
            voc_proclitic = voc_proclitic[:-1]

    # adapt the stem to the suffix first, then to the enclitic
    for affix in (suffix, enclitic):
        stem = get_word_variant(stem, affix)
    voc_suffix = get_suffix_variant(stem, suffix, enclitic)

    pieces = [voc_proclitic, prefix, stem, voc_suffix, voc_enclitic]
    return ''.join(pieces)
Пример #3
0
    def test_is_letter(self):
        """Check that every character-class predicate accepts its own group.

        Each predicate of the Araby module is called on every member of the
        constant group it is paired with below, and must return a true value.
        """
        # single-character predicates
        self.assertTrue(Araby.is_sukun(Araby.SUKUN))
        self.assertTrue(Araby.is_shadda(Araby.SHADDA))
        self.assertTrue(Araby.is_tatweel(Araby.TATWEEL))

        # (group of characters, predicate expected to accept each of them)
        groups = (
            (Araby.TANWIN, Araby.is_tanwin),
            (Araby.TASHKEEL, Araby.is_tashkeel),
            (Araby.HARAKAT, Araby.is_haraka),
            (Araby.SHORTHARAKAT, Araby.is_shortharaka),
            (Araby.LIGUATURES, Araby.is_ligature),
            (Araby.HAMZAT, Araby.is_hamza),
            (Araby.ALEFAT, Araby.is_alef),
            (Araby.YEHLIKE, Araby.is_yehlike),
            (Araby.WAWLIKE, Araby.is_wawlike),
            # The predicate is really called here: asserting on the bare
            # function object is always true and checks nothing.
            (Araby.TEHLIKE, Araby.is_teh),
            (Araby.SMALL, Araby.is_small),
            (Araby.WEAK, Araby.is_weak),
            (Araby.MOON, Araby.is_moon),
            (Araby.SUN, Araby.is_sun),
        )
        for chars, predicate in groups:
            for char in chars:
                self.assertTrue(
                    predicate(char),
                    "%s rejected %r" % (predicate.__name__, char))
Пример #4
0
def vocalize(noun, proclitic, prefix, suffix, enclitic):
    """
    Assemble the vocalized word from a noun and its four affixes.

    @param noun: noun found in dictionary.
    @type noun: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param prefix: second level prefix.
    @type prefix: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: vocalized word.
    @rtype: unicode.
    """
    # vocalized affix forms, read from the affix tag tables
    voc_enclitic = snconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    voc_proclitic = snconst.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]

    stem = noun
    # a trailing haraka or tanwin is dropped
    if stem[-1:] in araby.HARAKAT:
        stem = stem[:-1]

    # a sun letter after the AL article takes a shadda,
    # and the lam of the article loses its sukun
    has_article = proclitic.endswith(araby.ALEF + araby.LAM)
    if has_article and araby.is_sun(stem[0]):
        stem = u''.join((stem[0], araby.SHADDA, stem[1:]))
        if voc_proclitic.endswith(araby.SUKUN):
            voc_proclitic = voc_proclitic[:-1]

    # adapt the stem to the suffix first, then to the enclitic
    for affix in (suffix, enclitic):
        stem = get_word_variant(stem, affix)
    # the suffix is used as given (unvocalized) and adapted here
    voc_suffix = get_suffix_variant(stem, suffix, enclitic)

    pieces = [voc_proclitic, prefix, stem, voc_suffix, voc_enclitic]
    return ''.join(pieces)
Пример #5
0
    # Print, on a single output line, the name of every character class that
    # the araby predicates report for c, followed by araby.order(c).
    # NOTE(review): the statement that binds c (presumably a loop over
    # characters) starts above this excerpt -- confirm against the full file.
    if araby.is_shadda(c): print("shadda", end=" ")
    if araby.is_tatweel(c): print("tatweel", end=" ")
    if araby.is_tashkeel(c): print("tashkeel", end=" ")
    if araby.is_tanwin(c): print("tanwin", end=" ")
    if araby.is_shortharaka(c): print("short haraka", end=" ")
    # NOTE(review): is_ligature is tested twice, so a ligature is reported
    # twice (" ligature" then "ligature"); one of the two lines looks redundant.
    if araby.is_ligature(c): print(" ligature", end=" ")
    if araby.is_ligature(c): print('ligature', end=" ")
    if araby.is_hamza(c): print('hamza', end=" ")
    if araby.is_alef(c): print('alef', end=" ")
    if araby.is_yehlike(c): print('yeh', end=" ")
    if araby.is_wawlike(c): print('waw', end=" ")
    if araby.is_teh(c): print('teh', end=" ")
    if araby.is_small(c): print('small', end=" ")
    if araby.is_weak(c): print('weak', end=" ")
    if araby.is_moon(c): print('moon', end=" ")
    if araby.is_sun(c): print('sun', end=" ")
    print(araby.order(c), end=" ")
    # terminate the output line of this character
    print()
# NOTE(review): this initial value of word is replaced by the loop variable of
# the for loop below before being read in this excerpt.
word = u"الْعَرَيِيّةُ"
# Sample inputs: vocalized and unvocalized Arabic words, a two-word text and
# a Latin-script string.
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
# NOTE(review): word1 is not used anywhere in this excerpt.
word1 = u""
# For every sample, print the word, a tab, and the vocalization checks that
# succeed for it.
for word in word_list:
    print(word, '\t', end=" ")
    if araby.is_vocalized(word): print(' is vocalized', end=" ")
    if araby.is_vocalizedtext(word): print(' is vocalized text', end=" ")
Пример #6
0
    # Print the name of every character class that the araby predicates
    # report for c, followed by araby.order(c).
    # NOTE(review): the statement that binds c (presumably a loop over
    # characters) starts above this excerpt -- confirm against the full file.
    if araby.is_shadda(c): print ("shadda")
    if araby.is_tatweel(c): print ("tatweel")
    if araby.is_tashkeel(c): print ("tashkeel")
    if araby.is_tanwin(c): print ("tanwin")
    # NOTE(review): from here on the calls keep a trailing comma. That is the
    # Python 2 "no newline" idiom of the print statement; with the Python 3
    # print function the comma only builds a discarded tuple and each call
    # still ends its line. Confirm which interpreter this script targets.
    if araby.is_shortharaka(c): print ("short haraka"),
    # NOTE(review): is_ligature is tested twice, so a ligature is reported
    # twice; one of the two lines looks redundant.
    if araby.is_ligature(c):print (" ligature"),
    if araby.is_ligature(c):print ('ligature'),
    if araby.is_hamza(c):    print ('hamza'),
    if araby.is_alef(c): print ('alef'),
    if araby.is_yehlike(c):  print ('yeh'),
    if araby.is_wawlike(c):  print ('waw'),
    if araby.is_teh(c):  print ('teh'),
    if araby.is_small(c):    print ('small'),
    if araby.is_weak(c): print ('weak'),
    if araby.is_moon(c): print ('moon'),
    if araby.is_sun(c):print ('sun'),
    print (araby.order(c)),
    print ();
# NOTE(review): this initial value of word is replaced by the loop variable of
# the for loop below before being read in this excerpt.
word=u"الْعَرَيِيّةُ"
# Sample inputs: vocalized and unvocalized Arabic words and texts, and a
# Latin-script string.
word_list=[
u"الْعَرَيِيّةُ",
u"العربية",
u"الْعَرَيِيّةُ الفصحى",
u"غير مشكول",
"Taha",
u"سئل لأنه يؤم الإمام"
]
# NOTE(review): word1 is not used anywhere in this excerpt.
word1=u""
# For every sample, print the word and report whether araby considers it
# vocalized.
for word in word_list:
    print (word)
    if araby.is_vocalized(word): print (' is vocalized')
Пример #7
0
def vocalize(noun, proclitic, suffix, enclitic):
    """
    Join the noun and its affixes, and get the vocalized forms.

    @param noun: noun found in dictionary.
    @type noun: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: a tuple (vocalized word, vocalized word built with the
        non-I3rab variants of the suffix and enclitic, segments joined
        by '-' with the tashkeel stripped).
    @rtype: tuple of unicode.
    """
    # Plain u"" literals are used for the patterns: the Python 2-only ur""
    # prefix is a SyntaxError on Python 3, and none of these patterns
    # contains a backslash, so the regexes are unchanged.
    fatha_run = u"(%s)+" % ar.FATHA
    # A short haraka written after an alef maksura. The class lists the
    # three marks directly: "|" inside [...] is a literal character, not an
    # alternation, so "[a|b|c]" would also match a stray "|".
    maksura_haraka = u"%s[%s%s%s]" % (ar.ALEF_MAKSURA, ar.DAMMA,
                                      ar.FATHA, ar.KASRA)

    # a proclitic has only one vocalization in Arabic
    proclitic_voc = snconst.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]
    # the enclitic vocalization can vary according to the suffix
    enclitic_voc = snconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    enclitic_voc, encl_voc_non_inflect = get_enclitic_variant(
        enclitic_voc, suffix)

    suffix_voc = suffix

    # strip the last mark if it is a tanwin or a haraka
    if ar.is_haraka(noun[-1:]):
        noun = noun[:-1]
    # convert fathatan into a single fatha, for the cases where the tanwin
    # is not at the end, e.g. محتوًى
    noun = noun.replace(ar.FATHATAN, ar.FATHA)

    # add a shadda if the first letter is a sun letter and the proclitic
    # carries the AL definition tag
    if (u'تعريف' in snconst.COMP_PREFIX_LIST_TAGS[proclitic]["tags"]
            and ar.is_sun(noun[0])):
        noun = u''.join([noun[0], ar.SHADDA, noun[1:]])
        # strip the sukun from the lam
        if proclitic_voc.endswith(ar.SUKUN):
            proclitic_voc = proclitic_voc[:-1]

    # Complete the dictionary vocalization: every alef / alef maksura gets a
    # fatha in front of it, which covers harakat missing before ALEF.
    noun = noun.replace(ar.ALEF, ar.FATHA + ar.ALEF)
    noun = noun.replace(ar.ALEF_MAKSURA, ar.FATHA + ar.ALEF_MAKSURA)
    noun = re.sub(fatha_run, ar.FATHA, noun)
    # remove the initial fatha if alef is the first letter
    noun = re.sub(u"^(%s)+" % ar.FATHA, "", noun)

    # remember, before the noun is adapted, whether it ends with KASRA + YEH
    mankous = noun.endswith(ar.KASRA + ar.YEH)

    # generate the word variant for words ending with special letters such
    # as teh marbuta, alef maksura or hamza; the variant depends on the
    # suffix harakat, for example مدرسة+ي = مدرست+ي
    noun = get_word_variant(noun, suffix, enclitic)

    # generate the suffix variant: if the suffix is teh marbuta, alef
    # maksura or hamza, the variant depends on the enclitic harakat,
    # for example مدرس+ة+ي = مدرس+ت+ي
    suffix_voc, suffix_non_irab_mark = get_suffix_variants(
        noun, suffix_voc, enclitic, mankous)

    # semi-vocalized form: the vocalized word without the I3rab mark
    word_non_irab_mark = ''.join([
        proclitic_voc, noun, suffix_non_irab_mark, encl_voc_non_inflect
    ])
    # adjust the semi-vocalized form
    word_non_irab_mark = re.sub(fatha_run, ar.FATHA, word_non_irab_mark)
    word_non_irab_mark = re.sub(
        u"(%s%s%s)+" % (ar.FATHA, ar.ALEF_MAKSURA, ar.KASRATAN),
        ar.FATHATAN + ar.ALEF_MAKSURA, word_non_irab_mark)
    word_non_irab_mark = re.sub(
        u"%s%s%s" % (ar.FATHA, ar.ALEF_MAKSURA, ar.KASRA),
        ar.FATHA + ar.ALEF_MAKSURA, word_non_irab_mark)
    word_non_irab_mark = re.sub(maksura_haraka, ar.ALEF_MAKSURA,
                                word_non_irab_mark)

    # fully vocalized form
    word_vocalized = ''.join([proclitic_voc, noun, suffix_voc, enclitic_voc])
    # segmented form, used for spelling purposes
    segmented = '-'.join([proclitic_voc, noun, suffix_voc, enclitic_voc])
    segmented = ar.strip_tashkeel(segmented)

    word_vocalized = re.sub(fatha_run, ar.FATHA, word_vocalized)
    # FATHA + ALEF_MAKSURA followed by any tanwin becomes
    # FATHATAN + ALEF_MAKSURA
    for tanwin in (ar.KASRATAN, ar.DAMMATAN, ar.FATHATAN):
        word_vocalized = re.sub(
            u"%s%s%s" % (ar.FATHA, ar.ALEF_MAKSURA, tanwin),
            ar.FATHATAN + ar.ALEF_MAKSURA, word_vocalized)
    word_vocalized = re.sub(
        u"%s%s%s" % (ar.FATHA, ar.ALEF_MAKSURA, ar.KASRA),
        ar.FATHA + ar.ALEF_MAKSURA, word_vocalized)
    word_vocalized = re.sub(maksura_haraka, ar.ALEF_MAKSURA, word_vocalized)
    return word_vocalized, word_non_irab_mark, segmented
Пример #8
0
    # Print, on a single output line, the name of every character class that
    # the araby predicates report for c, followed by araby.order(c).
    # This excerpt uses Python 2 print statements; a trailing comma keeps the
    # output on the same line.
    # NOTE(review): the statement that binds c (presumably a loop over
    # characters) starts above this excerpt -- confirm against the full file.
    if araby.is_shadda(c): print "shadda",
    if araby.is_tatweel(c): print "tatweel",
    if araby.is_tashkeel(c): print "tashkeel",
    if araby.is_tanwin(c): print "tanwin",
    if araby.is_shortharaka(c): print "short haraka",
    # NOTE(review): is_ligature is tested twice, so a ligature is reported
    # twice; one of the two lines looks redundant.
    if araby.is_ligature(c):print " ligature",
    if araby.is_ligature(c):print 'ligature',
    if araby.is_hamza(c):    print 'hamza',
    if araby.is_alef(c): print 'alef',
    if araby.is_yehlike(c):  print 'yeh',
    if araby.is_wawlike(c):  print 'waw',
    if araby.is_teh(c):  print 'teh',
    if araby.is_small(c):    print 'small',
    if araby.is_weak(c): print 'weak',
    if araby.is_moon(c): print 'moon',
    if araby.is_sun(c):print 'sun',
    print araby.order(c),
    # terminate the output line of this character
    print;
# NOTE(review): this initial value of word is replaced by the loop variable of
# the for loop below before being read in this excerpt.
word=u"الْعَرَيِيّةُ"
# Sample inputs: vocalized and unvocalized Arabic words, a two-word text and
# a Latin-script string.
word_list=[
u"الْعَرَيِيّةُ",
u"العربية",
u"الْعَرَيِيّةُ الفصحى",
u"غير مشكول",
"Taha",
]
# NOTE(review): word1 is not used anywhere in this excerpt.
word1=u""
# For every sample, print the UTF-8 encoded word, a tab, and whether araby
# considers it vocalized.
for word in word_list:
    print word.encode('utf8'),'\t',
    if araby.is_vocalized(word): print ' is vocalized',
##    if araby.isArabicstring(word): print ' iisArabicstring',
Пример #9
0
def vocalize(noun, proclitic, suffix, enclitic):
    """
    Join the noun and its affixes, and get the vocalized forms.

    @param noun: noun found in dictionary.
    @type noun: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: a tuple (vocalized word, vocalized word built with the
        non-I3rab variants of the suffix and enclitic).
    @rtype: tuple of unicode.
    """
    # Plain u"" literals are used for the patterns: the Python 2-only ur""
    # prefix is a SyntaxError on Python 3, and none of these patterns
    # contains a backslash, so the regexes are unchanged.
    fatha_run = u"(%s)+" % araby.FATHA
    # A short haraka written after an alef maksura. The class lists the
    # three marks directly: "|" inside [...] is a literal character, not an
    # alternation, so "[a|b|c]" would also match a stray "|".
    maksura_haraka = u"%s[%s%s%s]" % (araby.ALEF_MAKSURA, araby.DAMMA,
                                      araby.FATHA, araby.KASRA)

    # a proclitic has only one vocalization in Arabic
    proclitic_voc = snconst.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]
    # the enclitic vocalization can vary according to the suffix
    enclitic_voc = snconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    enclitic_voc, enclitic_voc_non_inflected = get_enclitic_variant(
        enclitic_voc, suffix)

    suffix_voc = suffix

    # strip the last mark if it is a tanwin or a haraka
    if araby.is_haraka(noun[-1:]):
        noun = noun[:-1]
    # convert fathatan into a single fatha, for the cases where the tanwin
    # is not at the end, e.g. محتوًى
    noun = noun.replace(araby.FATHATAN, araby.FATHA)

    # add a shadda if the first letter is a sun letter and the proclitic
    # carries the AL definition tag
    if (u'تعريف' in snconst.COMP_PREFIX_LIST_TAGS[proclitic]["tags"]
            and araby.is_sun(noun[0])):
        noun = u''.join([noun[0], araby.SHADDA, noun[1:]])
        # strip the sukun from the lam
        if proclitic_voc.endswith(araby.SUKUN):
            proclitic_voc = proclitic_voc[:-1]

    # generate the word variant for words ending with special letters such
    # as teh marbuta, alef maksura or hamza; the variant depends on the
    # suffix harakat, for example مدرسة+ي = مدرست+ي
    noun = get_word_variant(noun, suffix + enclitic)

    # generate the suffix variant: if the suffix is teh marbuta, alef
    # maksura or hamza, the variant depends on the enclitic harakat,
    # for example مدرس+ة+ي = مدرس+ت+ي
    suffix_voc, suffix_non_irab_mark = get_suffix_variants(
        noun, suffix_voc, enclitic)

    # Complete the dictionary vocalization: every alef / alef maksura gets a
    # fatha in front of it, which covers harakat missing before ALEF.
    noun = noun.replace(araby.ALEF, araby.FATHA + araby.ALEF)
    noun = noun.replace(araby.ALEF_MAKSURA, araby.FATHA + araby.ALEF_MAKSURA)
    noun = re.sub(fatha_run, araby.FATHA, noun)
    # remove the initial fatha if alef is the first letter
    noun = re.sub(u"^(%s)+" % araby.FATHA, "", noun)

    # semi-vocalized form: the vocalized word without the I3rab mark
    word_non_irab_mark = ''.join([
        proclitic_voc, noun, suffix_non_irab_mark, enclitic_voc_non_inflected
    ])
    # adjust the semi-vocalized form
    word_non_irab_mark = re.sub(fatha_run, araby.FATHA, word_non_irab_mark)
    word_non_irab_mark = re.sub(
        u"(%s%s%s)+" % (araby.FATHA, araby.ALEF_MAKSURA, araby.KASRATAN),
        araby.FATHATAN + araby.ALEF_MAKSURA, word_non_irab_mark)
    word_non_irab_mark = re.sub(
        u"%s%s%s" % (araby.FATHA, araby.ALEF_MAKSURA, araby.KASRA),
        araby.FATHA + araby.ALEF_MAKSURA, word_non_irab_mark)
    word_non_irab_mark = re.sub(maksura_haraka, araby.ALEF_MAKSURA,
                                word_non_irab_mark)

    # fully vocalized form
    word_vocalized = ''.join([proclitic_voc, noun, suffix_voc, enclitic_voc])
    word_vocalized = re.sub(fatha_run, araby.FATHA, word_vocalized)
    # FATHA + ALEF_MAKSURA followed by any tanwin becomes
    # FATHATAN + ALEF_MAKSURA
    for tanwin in (araby.KASRATAN, araby.DAMMATAN, araby.FATHATAN):
        word_vocalized = re.sub(
            u"%s%s%s" % (araby.FATHA, araby.ALEF_MAKSURA, tanwin),
            araby.FATHATAN + araby.ALEF_MAKSURA, word_vocalized)
    word_vocalized = re.sub(
        u"%s%s%s" % (araby.FATHA, araby.ALEF_MAKSURA, araby.KASRA),
        araby.FATHA + araby.ALEF_MAKSURA, word_vocalized)
    word_vocalized = re.sub(maksura_haraka, araby.ALEF_MAKSURA,
                            word_vocalized)
    return word_vocalized, word_non_irab_mark
Пример #10
0
    # Print, on a single output line, the name of every character class that
    # the araby predicates report for c, followed by araby.order(c).
    # This excerpt uses Python 2 print statements; a trailing comma keeps the
    # output on the same line.
    # NOTE(review): the statement that binds c (presumably a loop over
    # characters) starts above this excerpt -- confirm against the full file.
    if araby.is_shadda(c): print "shadda",
    if araby.is_tatweel(c): print "tatweel",
    if araby.is_tashkeel(c): print "tashkeel",
    if araby.is_tanwin(c): print "tanwin",
    if araby.is_shortharaka(c): print "short haraka",
    # NOTE(review): is_ligature is tested twice, so a ligature is reported
    # twice; one of the two lines looks redundant.
    if araby.is_ligature(c): print " ligature",
    if araby.is_ligature(c): print 'ligature',
    if araby.is_hamza(c): print 'hamza',
    if araby.is_alef(c): print 'alef',
    if araby.is_yehlike(c): print 'yeh',
    if araby.is_wawlike(c): print 'waw',
    if araby.is_teh(c): print 'teh',
    if araby.is_small(c): print 'small',
    if araby.is_weak(c): print 'weak',
    if araby.is_moon(c): print 'moon',
    if araby.is_sun(c): print 'sun',
    print araby.order(c),
    # terminate the output line of this character
    print
# NOTE(review): this initial value of word is replaced by the loop variable of
# the for loop below before being read in this excerpt.
word = u"الْعَرَيِيّةُ"
# Sample inputs: vocalized and unvocalized Arabic words, a two-word text and
# a Latin-script string.
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
# NOTE(review): word1 is not used anywhere in this excerpt.
word1 = u""
# For every sample, print the UTF-8 encoded word, a tab, and whether araby
# considers it vocalized.
for word in word_list:
    print word.encode('utf8'), '\t',
    if araby.is_vocalized(word): print ' is vocalized',
    ##    if araby.isArabicstring(word): print ' iisArabicstring',
Пример #11
0
def vocalize(stop, proclitic, suffix, enclitic):
    """
    Assemble the vocalized forms of a stop word and its affixes.

    @param stop: stop word found in dictionary.
    @type stop: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: a tuple (vocalized word, vocalized word built with the
        non-I3rab variant of the suffix).
    @rtype: tuple of unicode.
    """
    # enclitic and proclitic each have a single vocalization,
    # read from the affix tag tables
    voc_enclitic = ssconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    proclitic_entry = ssconst.COMP_PREFIX_LIST_TAGS[proclitic]
    voc_proclitic = proclitic_entry["vocalized"][0]

    word = stop
    # a trailing haraka or tanwin is dropped only when a suffix is attached
    if suffix and araby.is_haraka(word[-1:]):
        word = word[:-1]
    # a tanwin that is not at the end (e.g. محتوًى) becomes a plain fatha
    word = word.replace(araby.FATHATAN, araby.FATHA)

    # a sun letter after a proclitic tagged as definite takes a shadda,
    # and the lam of the article loses its sukun
    is_definite = u'تعريف' in proclitic_entry["tags"]
    if is_definite and araby.is_sun(word[0]):
        word = u''.join((word[0], araby.SHADDA, word[1:]))
        if voc_proclitic.endswith(araby.SUKUN):
            voc_proclitic = voc_proclitic[:-1]

    # words ending with teh marbuta, alef maksura or hamza change with
    # what follows them, for example مدرسة+ي = مدرست+ي
    word = get_word_variant(word, suffix+enclitic)

    # the suffix changes with the enclitic in the same way,
    # for example مدرس+ة+ي = مدرس+ت+ي
    full_suffix, bare_suffix = get_suffix_variants(word, suffix, enclitic)

    # same assembly twice: once with the full suffix, once with the
    # suffix variant that lacks the I3rab mark
    word_vocalized, word_non_irab_mark = (
        ''.join([voc_proclitic, word, ending, voc_enclitic])
        for ending in (full_suffix, bare_suffix)
    )
    return word_vocalized, word_non_irab_mark