예제 #1
0
def extract_poem():
    """Tokenize every line of ``short story.txt`` into one flat token list.

    The file is read as UTF-8.  Each line is passed through
    ``araby.strip_tashkeel`` and then ``araby.tokenize``; the tokens of all
    lines are concatenated in file order.

    NOTE(review): the previous revision also sketched a postings list
    (token -> line numbers, commented out) and a per-token tatweel strip
    whose result was assigned to the loop variable and never stored.  Neither
    influenced the returned value, so both were removed as dead code (this
    assumes ``araby.is_tatweel`` / ``araby.strip_tatweel`` have no side
    effects).  Tokens are returned exactly as ``araby.tokenize`` yields them.

    Returns:
        list -- tokens of the whole file, in reading order.

    Raises:
        OSError -- if ``short story.txt`` cannot be opened.
    """
    tokens = []
    with open("short story.txt", encoding='utf-8') as ofile:
        for line in ofile:
            tokens.extend(araby.tokenize(araby.strip_tashkeel(line)))
    return tokens
예제 #2
0
    def test_is_letter(self):
        """Check that every ``Araby.is_*`` predicate accepts its own characters.

        The three single-character constants are checked directly; every
        character collection is then paired with the predicate that must
        return a truthy value for each of its members.

        The TEHLIKE check used to assert the function object ``Araby.is_teh``
        itself (always truthy) instead of calling it, so it could never
        fail; it now calls ``Araby.is_teh(teh)`` like the other checks.
        """
        # Single-character constants.
        self.assertTrue(Araby.is_sukun(Araby.SUKUN))
        self.assertTrue(Araby.is_shadda(Araby.SHADDA))
        self.assertTrue(Araby.is_tatweel(Araby.TATWEEL))

        # (collection of characters, predicate that must accept each member),
        # in the same order the checks were originally written.
        cases = (
            (Araby.TANWIN, Araby.is_tanwin),
            (Araby.TASHKEEL, Araby.is_tashkeel),
            (Araby.HARAKAT, Araby.is_haraka),
            (Araby.SHORTHARAKAT, Araby.is_shortharaka),
            (Araby.LIGUATURES, Araby.is_ligature),
            (Araby.HAMZAT, Araby.is_hamza),
            (Araby.ALEFAT, Araby.is_alef),
            (Araby.YEHLIKE, Araby.is_yehlike),
            (Araby.WAWLIKE, Araby.is_wawlike),
            (Araby.TEHLIKE, Araby.is_teh),
            (Araby.SMALL, Araby.is_small),
            (Araby.WEAK, Araby.is_weak),
            (Araby.MOON, Araby.is_moon),
            (Araby.SUN, Araby.is_sun),
        )
        for chars, predicate in cases:
            for archar in chars:
                # The message names the failing predicate and character,
                # since all checks now share a single assert line.
                self.assertTrue(
                    predicate(archar),
                    "{}({!r}) is not true".format(
                        getattr(predicate, "__name__", predicate), archar))
예제 #3
0
def soundex(sentence):
    '''Clean up the words of *sentence* for an Arabic Soundex-style code.

    The target algorithm is:

    - blanks and spaces are deleted,
    - long vowels are deleted,
    - if two adjacent letters are identical, only one of the two is kept,
    - to each letter of a word are associated two numbers:

        1. the code of the letter's main category, an integer N stored on
           two bits, with N in {0, 1, 2};
        2. the code of the letter's sub-category, an integer n stored on
           four bits, with n in {0, ..., 10}.

      Given a word w = w1...wn, let w' = w'1...w'm be w without blanks and
      long vowels; then f(w) = f(w') = N1 n1 ... Nm nm = X, and the phonetic
      code X can be used as a hash key for classifying and indexing.

    Only the clean-up the code already attempted is implemented: tatweel is
    stripped from every token, and the long vowels alef, yeh and waw are
    dropped everywhere except in first position.  Collapsing doubled letters
    and computing the (N, n) codes are still to be done.

    Fixes over the previous revision: text tokens are no longer encoded to
    bytes before being inspected (which made every character test fail),
    every long vowel after the first letter is removed (deleting while
    advancing an index used to skip the following character), and the
    cleaned words are kept instead of being discarded.

    Arguments:
        sentence -- text to split with ``araby.tokenize``.

    Returns:
        list -- the cleaned words, in order; the same list is printed.
        (Previously nothing was returned.)
    '''
    long_vowels = "\u0627\u064a\u0648"  # alef, yeh, waw
    cleaned = []
    for word in araby.tokenize(sentence):
        # Work on text only; decode the odd byte-string token once, up front.
        if isinstance(word, bytes):
            word = word.decode('utf-8')
        word = araby.strip_tatweel(word)
        # The first letter is always kept, even when it is a long vowel.
        word = word[:1] + ''.join(
            letter for letter in word[1:] if letter not in long_vowels)
        cleaned.append(word)
    print(cleaned)
    return cleaned
예제 #4
0
    unicode_literals,
    division,
)
import sys
sys.path.append("../")
#~ import  pyarabic.harf as harf
#from   pyarabic import harf
import pyarabic.araby as araby

# For every character yielded by araby.arabicrange(), print on one line:
# the character, araby.name(c), the label of each araby predicate that
# accepts it, and araby.order(c).
#
# The predicates are listed once, in output order.  The ligature predicate
# used to be tested twice in a row (printing " ligature" then "ligature");
# it is now reported a single time.
checks = (
    (araby.is_sukun, "sukun"),
    (araby.is_haraka, "haraka"),
    (araby.is_shadda, "shadda"),
    (araby.is_tatweel, "tatweel"),
    (araby.is_tashkeel, "tashkeel"),
    (araby.is_tanwin, "tanwin"),
    (araby.is_shortharaka, "short haraka"),
    (araby.is_ligature, "ligature"),
    (araby.is_hamza, "hamza"),
    (araby.is_alef, "alef"),
    (araby.is_yehlike, "yeh"),
    (araby.is_wawlike, "waw"),
    (araby.is_teh, "teh"),
    (araby.is_small, "small"),
    (araby.is_weak, "weak"),
    (araby.is_moon, "moon"),
    (araby.is_sun, "sun"),
)

for c in araby.arabicrange():
    print(c, '\t', araby.name(c), end=" ")
    print('\t', end=" ")
    for is_member, label in checks:
        if is_member(c):
            print(label, end=" ")
    # NOTE(review): nothing visible here ends the line; confirm that a
    # newline is printed after this statement in the full script.
    print(araby.order(c), end=" ")
예제 #5
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
sys.path.append('../')
from  pyarabic import araby


# For every character yielded by araby.arabicrange(), print (one item per
# line): the character with araby.name(c), a tab, the label of each araby
# predicate that accepts it, araby.order(c), and a blank separator line.
#
# Changes over the previous revision:
# - the ligature predicate was tested twice in a row (printing " ligature"
#   then "ligature"); it is now reported once;
# - the trailing commas after print(...) and the stray ';' were Python 2
#   leftovers that only built throwaway tuples with the print function, so
#   they were dropped (the printed output is unaffected by that).
checks = (
    (araby.is_sukun, "sukun"),
    (araby.is_haraka, "haraka"),
    (araby.is_shadda, "shadda"),
    (araby.is_tatweel, "tatweel"),
    (araby.is_tashkeel, "tashkeel"),
    (araby.is_tanwin, "tanwin"),
    (araby.is_shortharaka, "short haraka"),
    (araby.is_ligature, "ligature"),
    (araby.is_hamza, "hamza"),
    (araby.is_alef, "alef"),
    (araby.is_yehlike, "yeh"),
    (araby.is_wawlike, "waw"),
    (araby.is_teh, "teh"),
    (araby.is_small, "small"),
    (araby.is_weak, "weak"),
    (araby.is_moon, "moon"),
    (araby.is_sun, "sun"),
)

for c in araby.arabicrange():
    print(c, '\t', araby.name(c))
    print('\t')
    for is_member, label in checks:
        if is_member(c):
            print(label)
    print(araby.order(c))
    print()
# Sample vocalized word; its use is outside the visible part of the script.
word = u"الْعَرَيِيّةُ"
예제 #6
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
sys.path.append("../")
#~ import  pyarabic.harf as harf
#from   pyarabic import harf
import  pyarabic.araby as araby

for c in araby.arabicrange():
    print c.encode('utf8'),'\t', araby.name(c).encode('utf8'),
    print '\t',
    if araby.is_sukun(c): print "sukun",
    if araby.is_haraka(c): print "haraka",
    if araby.is_shadda(c): print "shadda",
    if araby.is_tatweel(c): print "tatweel",
    if araby.is_tashkeel(c): print "tashkeel",
    if araby.is_tanwin(c): print "tanwin",
    if araby.is_shortharaka(c): print "short haraka",
    if araby.is_ligature(c):print " ligature",
    if araby.is_ligature(c):print 'ligature',
    if araby.is_hamza(c):    print 'hamza',
    if araby.is_alef(c): print 'alef',
    if araby.is_yehlike(c):  print 'yeh',
    if araby.is_wawlike(c):  print 'waw',
    if araby.is_teh(c):  print 'teh',
    if araby.is_small(c):    print 'small',
    if araby.is_weak(c): print 'weak',
    if araby.is_moon(c): print 'moon',
    if araby.is_sun(c):print 'sun',
    print araby.order(c),
    print;
예제 #7
0
                   for key, group in groupby(aa5irHarf)]
# NOTE(review): freqOfAa5irHarf, aa5irHarf and f are defined above this
# excerpt; aa5irHarf is presumably the collected "last letter" values
# (the name transliterates Arabic "last letter") -- confirm against the
# code that builds it.
print(freqOfAa5irHarf)
import collections
# Tally how many times each value occurs in aa5irHarf.
counter = collections.Counter(aa5irHarf)
print(counter)
# e.g. Counter({1: 4, 2: 4, 3: 2, 5: 2, 4: 1})
print(counter.values())
# e.g. [4, 4, 2, 1, 2]
print(counter.keys())
# e.g. [1, 2, 3, 4, 5]
print(counter.most_common(3))
# e.g. [(1, 4), (2, 4), (3, 2)]
print(counter.most_common(1))
# Keep the single most frequent (value, count) pair for later use.
kkey = counter.most_common(1)
# TODO: write this to a file or save it somewhere,
# and generalize it to all poems of each poet.

# The qafiya (rhyme): take the last sukun (unvowelled letter), then look for
# the sukun before it, together with the vowelled letter that precedes that
# penultimate sukun.
print('********** Al Qafiya ************')
for line in f:
    # Drop tatweel before splitting the line.
    line1 = araby.strip_tatweel(line)
    # araby.separate returns two values, unpacked here as letters and hrkat.
    letters, hrkat = araby.separate(line1)
    #print(letters.encode('utf8'))
    for m in hrkat:
        # TODO: this still needs adjustments.
        if not araby.is_tatweel(m):
            print(araby.name(m))
            print(''.join(m))

# Most common words: to be computed over all of the poet's poems.
예제 #8
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
sys.path.append("../")
#~ import  pyarabic.harf as harf
#from   pyarabic import harf
import pyarabic.araby as araby

for c in araby.arabicrange():
    print c.encode('utf8'), '\t', araby.name(c).encode('utf8'),
    print '\t',
    if araby.is_sukun(c): print "sukun",
    if araby.is_haraka(c): print "haraka",
    if araby.is_shadda(c): print "shadda",
    if araby.is_tatweel(c): print "tatweel",
    if araby.is_tashkeel(c): print "tashkeel",
    if araby.is_tanwin(c): print "tanwin",
    if araby.is_shortharaka(c): print "short haraka",
    if araby.is_ligature(c): print " ligature",
    if araby.is_ligature(c): print 'ligature',
    if araby.is_hamza(c): print 'hamza',
    if araby.is_alef(c): print 'alef',
    if araby.is_yehlike(c): print 'yeh',
    if araby.is_wawlike(c): print 'waw',
    if araby.is_teh(c): print 'teh',
    if araby.is_small(c): print 'small',
    if araby.is_weak(c): print 'weak',
    if araby.is_moon(c): print 'moon',
    if araby.is_sun(c): print 'sun',
    print araby.order(c),
    print