def extract_poem():
    """Tokenize the Arabic text stored in ``short story.txt``.

    Each line of the file is passed through ``araby.strip_tashkeel`` and
    then split with ``araby.tokenize``; the tokens of all lines are
    concatenated in the order they appear in the file.

    Tatweel is not stripped from the returned tokens and no postings list
    is built: the previous per-word loop only rebound a local variable, so
    it had no effect on the result and has been removed.

    Returns:
        list -- the tokens of the whole file, in reading order
    """
    tokens = []
    with open("short story.txt", encoding='utf-8') as ofile:
        for line in ofile:
            tokens.extend(araby.tokenize(araby.strip_tashkeel(line)))
    return tokens
def test_is_letter(self):
    """Check that each ``Araby.is_*`` predicate accepts every member of its own constant group."""
    # Single-character constants.
    self.assertTrue(Araby.is_sukun(Araby.SUKUN))
    self.assertTrue(Araby.is_shadda(Araby.SHADDA))
    self.assertTrue(Araby.is_tatweel(Araby.TATWEEL))

    # (group of characters, predicate expected to accept each of them)
    grouped_checks = (
        (Araby.TANWIN, Araby.is_tanwin),
        (Araby.TASHKEEL, Araby.is_tashkeel),
        (Araby.HARAKAT, Araby.is_haraka),
        (Araby.SHORTHARAKAT, Araby.is_shortharaka),
        (Araby.LIGUATURES, Araby.is_ligature),
        (Araby.HAMZAT, Araby.is_hamza),
        (Araby.ALEFAT, Araby.is_alef),
        (Araby.YEHLIKE, Araby.is_yehlike),
        (Araby.WAWLIKE, Araby.is_wawlike),
        # The predicate must actually be called with the character:
        # asserting on the bare function object is always true.
        (Araby.TEHLIKE, Araby.is_teh),
        (Araby.SMALL, Araby.is_small),
        (Araby.WEAK, Araby.is_weak),
        (Araby.MOON, Araby.is_moon),
        (Araby.SUN, Araby.is_sun),
    )
    for members, predicate in grouped_checks:
        for archar in members:
            self.assertTrue(predicate(archar))
def soundex(sentence):
    '''Clean the words of *sentence* as the first stage of an Arabic
    Soundex-style phonetic code.

    The target algorithm is:

    - blanks and spaces are deleted,
    - long vowels are deleted,
    - if two adjacent letters are identical, only one of the two is kept,
    - to each letter of the word are associated two numbers:
        1. the first one is the code of the letter's main category,
           an integer N of two bits, with N in {0, 1, 2};
        2. the second one is the code of the letter's sub-category,
           an integer n of four bits, with n in {0, ..., 10}.

    Thus, given a word w = w1 ... wn, let w' = w'1 ... w'm be w without
    blanks and long vowels; then f(w) = f(w') = N1 n1 ... Nm nm = X.
    The generated phonetic code X can be used as a hash key for
    classifying and indexing purposes.

    Only the cleaning stage is implemented here: tatweel is removed and
    the long vowels alef, yeh and waw are dropped everywhere except in
    the first position of a word. Collapsing of identical adjacent
    letters and the numeric coding are not implemented.

    Arguments:
        sentence -- text to split into words with ``araby.tokenize``

    Returns:
        list -- the cleaned words, in order (the list is also printed)
    '''
    long_vowels = "\u0627\u064a\u0648"  # alef, yeh, waw
    cleaned_words = []
    for word in araby.tokenize(sentence):
        # Work on text, never on UTF-8 bytes: iterating bytes yields ints,
        # which can never compare equal to an Arabic character.
        if isinstance(word, bytes):
            word = word.decode('utf-8')
        word = araby.strip_tatweel(word)
        # The first letter is kept even when it is a long vowel.
        word = word[:1] + ''.join(
            letter for letter in word[1:] if letter not in long_vowels)
        cleaned_words.append(word)
    print(cleaned_words)
    return cleaned_words
unicode_literals, division, )
import sys
# Add the parent directory to the import path before importing pyarabic
# (presumably so an in-tree copy of the package is picked up - TODO confirm).
sys.path.append("../")
#~ import pyarabic.harf as harf
#from pyarabic import harf
import pyarabic.araby as araby

# For every character yielded by araby.arabicrange(), print the character,
# its name, a label for each araby.is_* test it passes, and its araby.order().
# NOTE(review): every print below uses end=" " and nothing emits a newline,
# so the output of all characters runs together on one line - confirm whether
# a closing print() was lost.
for c in araby.arabicrange():
    print(c, '\t', araby.name(c), end=" ")
    print('\t', end=" ")
    if araby.is_sukun(c):
        print("sukun", end=" ")
    if araby.is_haraka(c):
        print("haraka", end=" ")
    if araby.is_shadda(c):
        print("shadda", end=" ")
    if araby.is_tatweel(c):
        print("tatweel", end=" ")
    if araby.is_tashkeel(c):
        print("tashkeel", end=" ")
    if araby.is_tanwin(c):
        print("tanwin", end=" ")
    if araby.is_shortharaka(c):
        print("short haraka", end=" ")
    # NOTE(review): is_ligature is tested twice in a row, so a ligature gets
    # its label printed twice - looks like a copy/paste duplicate.
    if araby.is_ligature(c):
        print(" ligature", end=" ")
    if araby.is_ligature(c):
        print('ligature', end=" ")
    if araby.is_hamza(c):
        print('hamza', end=" ")
    if araby.is_alef(c):
        print('alef', end=" ")
    if araby.is_yehlike(c):
        print('yeh', end=" ")
    if araby.is_wawlike(c):
        print('waw', end=" ")
    if araby.is_teh(c):
        print('teh', end=" ")
    if araby.is_small(c):
        print('small', end=" ")
    if araby.is_weak(c):
        print('weak', end=" ")
    if araby.is_moon(c):
        print('moon', end=" ")
    if araby.is_sun(c):
        print('sun', end=" ")
    print(araby.order(c), end=" ")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Print one row per character of araby.arabicrange(): the character, its
# name, a label for every araby.is_* test it passes, and its araby.order().
import sys
# The parent directory is added to the import path before pyarabic is imported.
sys.path.append('../')
from pyarabic import araby

# (test, label) pairs, in the order the labels appear in a row.
# Each test is listed once so that no label is printed twice.
CHARACTER_TESTS = (
    (araby.is_sukun, "sukun"),
    (araby.is_haraka, "haraka"),
    (araby.is_shadda, "shadda"),
    (araby.is_tatweel, "tatweel"),
    (araby.is_tashkeel, "tashkeel"),
    (araby.is_tanwin, "tanwin"),
    (araby.is_shortharaka, "short haraka"),
    (araby.is_ligature, 'ligature'),
    (araby.is_hamza, 'hamza'),
    (araby.is_alef, 'alef'),
    (araby.is_yehlike, 'yeh'),
    (araby.is_wawlike, 'waw'),
    (araby.is_teh, 'teh'),
    (araby.is_small, 'small'),
    (araby.is_weak, 'weak'),
    (araby.is_moon, 'moon'),
    (araby.is_sun, 'sun'),
)

for c in araby.arabicrange():
    row = [c, '\t', araby.name(c), '\t']
    row.extend(label for passes, label in CHARACTER_TESTS if passes(c))
    row.append(araby.order(c))
    # A single print keeps all fields of a character on one line,
    # separated by spaces and terminated by a newline.
    print(*row)

word = u"الْعَرَيِيّةُ"
#!/usr/bin/env python # -*- coding: utf-8 -*- import sys sys.path.append("../") #~ import pyarabic.harf as harf #from pyarabic import harf import pyarabic.araby as araby for c in araby.arabicrange(): print c.encode('utf8'),'\t', araby.name(c).encode('utf8'), print '\t', if araby.is_sukun(c): print "sukun", if araby.is_haraka(c): print "haraka", if araby.is_shadda(c): print "shadda", if araby.is_tatweel(c): print "tatweel", if araby.is_tashkeel(c): print "tashkeel", if araby.is_tanwin(c): print "tanwin", if araby.is_shortharaka(c): print "short haraka", if araby.is_ligature(c):print " ligature", if araby.is_ligature(c):print 'ligature', if araby.is_hamza(c): print 'hamza', if araby.is_alef(c): print 'alef', if araby.is_yehlike(c): print 'yeh', if araby.is_wawlike(c): print 'waw', if araby.is_teh(c): print 'teh', if araby.is_small(c): print 'small', if araby.is_weak(c): print 'weak', if araby.is_moon(c): print 'moon', if araby.is_sun(c):print 'sun', print araby.order(c), print;
for key, group in groupby(aa5irHarf)]
print(freqOfAa5irHarf)
import collections
# Count how many times each value occurs in aa5irHarf
# (presumably the last letter of each verse - TODO confirm against the code
# that builds it, which is not shown here).
counter = collections.Counter(aa5irHarf)
# The trailing comments below only illustrate the shape of each output.
print(counter)
# e.g. Counter({1: 4, 2: 4, 3: 2, 5: 2, 4: 1})
print(counter.values())
# e.g. [4, 4, 2, 1, 2]
print(counter.keys())
# e.g. [1, 2, 3, 4, 5]
print(counter.most_common(3))
# e.g. [(1, 4), (2, 4), (3, 2)]
print(counter.most_common(1))
# most_common(1) gives a list holding the single most frequent (value, count) pair.
kkey = counter.most_common(1)
# TODO: write this result to a file or store it somewhere,
# and generalize the computation to all poems of each poet.
# The qafiya (rhyme): take the last sukun, then look for the sukun before it,
# together with the vowelled letter that precedes that second-to-last sukun.
print('********** Al Qafiya ************')
# NOTE(review): f is not defined in this part of the file; it is assumed to be
# an iterable of text lines opened earlier - confirm.
for line in f:
    line1 = araby.strip_tatweel(line)
    # araby.separate returns two values; the second one is iterated below.
    letters, hrkat = araby.separate(line1)
    #print(letters.encode('utf8'))
    for m in hrkat:
        # TODO: this part still needs modifications.
        if not araby.is_tatweel(m):
            print(araby.name(m))
            print(''.join(m))
# Most common words: to be computed over all of the poet's poems.
#!/usr/bin/env python # -*- coding: utf-8 -*- import sys sys.path.append("../") #~ import pyarabic.harf as harf #from pyarabic import harf import pyarabic.araby as araby for c in araby.arabicrange(): print c.encode('utf8'), '\t', araby.name(c).encode('utf8'), print '\t', if araby.is_sukun(c): print "sukun", if araby.is_haraka(c): print "haraka", if araby.is_shadda(c): print "shadda", if araby.is_tatweel(c): print "tatweel", if araby.is_tashkeel(c): print "tashkeel", if araby.is_tanwin(c): print "tanwin", if araby.is_shortharaka(c): print "short haraka", if araby.is_ligature(c): print " ligature", if araby.is_ligature(c): print 'ligature', if araby.is_hamza(c): print 'hamza', if araby.is_alef(c): print 'alef', if araby.is_yehlike(c): print 'yeh', if araby.is_wawlike(c): print 'waw', if araby.is_teh(c): print 'teh', if araby.is_small(c): print 'small', if araby.is_weak(c): print 'weak', if araby.is_moon(c): print 'moon', if araby.is_sun(c): print 'sun', print araby.order(c), print