예제 #1
0
 def test_seperate(self):
     """separate() and joint() must be exact inverses of each other."""
     base_letters = u"العربية"
     base_marks = u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f'
     vocalized = u"اَلْعَرَبَيَةُ"

     # Splitting a vocalized word and re-joining must reproduce it.
     split_letters, split_marks = ar.separate(vocalized)
     self.assertEqual(ar.joint(split_letters, split_marks), vocalized)
     # Joining a letters/marks pair and splitting must round-trip too.
     self.assertEqual(ar.separate(ar.joint(base_letters, base_marks)),
                      (base_letters, base_marks))
예제 #2
0
 def test_seperate(self):
     """Round-trip check: joint() undoes separate() and vice versa."""
     plain = u"العربية"
     tashkeel = u'\u064e\u0652\u064e\u064e\u064e\u064e\u064f'
     full_word = u"اَلْعَرَبَيَةُ"

     # Split a fully vocalized word, then rebuild it.
     got_letters, got_marks = ar.separate(full_word)
     rebuilt = ar.joint(got_letters, got_marks)
     self.assertEqual(rebuilt, full_word)
     # Build a word from a letters/marks pair, then split it again.
     joined = ar.joint(plain, tashkeel)
     self.assertEqual(ar.separate(joined), (plain, tashkeel))
def convert_to_two_hot(word):
    """
    Build the concatenated (letters, diacritics) one-hot encoding of a word.

    A word consisting of a single diacritic character gets an all-zero
    letter vector (shape (1, 37) -- presumably the letter-alphabet size;
    TODO confirm against convert_word_to_one_hot) paired with the one-hot
    encoding of the diacritic itself.  Otherwise the word is split with
    separate() and each half is one-hot encoded independently.

    @param word: arabic word, possibly carrying diacritics
    @return: numpy array -- letter one-hots and diacritic one-hots
        concatenated along axis 1
    """
    if len(word) == 1 and word[0] in diacritics:
        # A lone diacritic has no letter component.
        letters_hot_vector = np.zeros((1, 37))
        diacritics_hot_vector = convert_word_to_one_hot(word, is_letter=False)
    else:
        without_diacritics, only_diacritics = separate(word)
        # separate() marks undiacritized positions with tatweel ('ـ');
        # map those to sukun so every position has a real diacritic.
        only_diacritics = only_diacritics.replace("ـ", "ْ")

        letters_hot_vector = convert_word_to_one_hot(without_diacritics,
                                                     is_letter=True)
        diacritics_hot_vector = convert_word_to_one_hot(only_diacritics,
                                                        is_letter=False)

    return np.concatenate([letters_hot_vector, diacritics_hot_vector], axis=1)
예제 #4
0
def encode_tashkeel(word, method="ascii"):
    """
    Encode word marks into a decimal or ascii string so the tashkeel can
    be stored separately from the letters (e.g. as an integer).

    Example:
        >>> import pyarabic.trans
        >>> word1 = u"هَارِبًا"
        >>> pyarabic.trans.encode_tashkeel(word1)
        ('هاربا', 'a0iA0')
        >>> pyarabic.trans.encode_tashkeel(word1, "decimal")
        ('هاربا', 40610)
        >>> letters = u"هاربا"
        >>> encoded_marks = u"a0iA0"
        >>> pyarabic.trans.decode_tashkeel(letters, encoded_marks)
        'هَارِبًا'
        >>> letters = u"هاربا"
        >>> encoded_marks = 40610
        >>> pyarabic.trans.decode_tashkeel(letters, encoded_marks, "decimal")
        'هَارِبًا'

    @param word: diacritized arabic word
    @type word: unicode
    @param method: "decimal" or "ascii"; any other value falls back to ascii
    @type method: str
    @return: (letters, encoded marks); if the decimal conversion fails,
        (original word, "") instead
    @rtype: tuple of (unicode, str or int)
    """
    letters, marks = ar.separate(word)

    if method == "decimal":
        transed = translate(marks, T2D_TRANS)
        # The decimal form is meant to be stored as an integer; if the
        # translated marks are not a valid number, signal failure the
        # same way as before: return the untouched word and "".
        try:
            transed = int(transed)
        except (TypeError, ValueError):
            return word, ""
    else:
        # "ascii" and any unrecognized method both use the ascii table,
        # matching the original if/elif/else behavior.
        transed = translate(marks, T2A_TRANS)
    return letters, transed
예제 #5
0
    def prepare_dataset(self):
        """
        Read self.file line by line and build a dict of training instances.

        Each instance is a tuple of three LongTensors for one line:
        (letter ids, diacritic ids, undiacritized word ids).

        @return: dict mapping a running counter (0, 1, ...) to instances
        """
        data = {}
        counter = 0
        with open(self.file, encoding='utf8') as file:
            for line in file:
                letter_ids = []
                diacritic_ids = []
                word_ids = []
                # Split the line into bare letters and their diacritics;
                # per usage elsewhere in this file, pyarabic pads
                # undiacritized positions with tatweel ('ـ').
                letters, diacritics = araby.separate(line)
                # Drop the trailing element from every sequence --
                # presumably the newline / end-of-line token.
                letters = letters[0:-1]
                words = araby.tokenize(line)[0:-1]
                diacritics = diacritics[0:-1]
                for letter in letters:
                    # Skip newline and the right-to-left mark (U+200F).
                    if (letter == '\n') or (letter == '\u200f'):
                        continue

                    letter_ids.append(self.letter_to_id[letter])

                # NOTE(review): skipped '\n'/'\u200f' letters still get a
                # diacritic id appended below, so letter_ids and
                # diacritic_ids can differ in length if those characters
                # ever occur mid-line -- confirm against the corpus.
                for index, diacritic in enumerate(diacritics):
                    if letters[index] == " ":
                        diacritic_ids.append(self.diacritic_to_id['space'])
                    else:
                        diacritic_ids.append(self.diacritic_to_id[diacritic])

                for word in words:
                    # Words are indexed by their undiacritized form.
                    word_ids.append(
                        self.word_to_id[araby.strip_tashkeel(word)])

                instance = (torch.tensor(letter_ids,
                                         dtype=torch.long,
                                         requires_grad=False),
                            torch.tensor(diacritic_ids,
                                         dtype=torch.long,
                                         requires_grad=False),
                            torch.tensor(word_ids,
                                         dtype=torch.long,
                                         requires_grad=False))
                data[counter] = instance
                counter += 1
        return data
예제 #6
0
def encode_tashkeel(word, method="ascii"):
    """
    Encode word marks into a decimal or ascii string so the tashkeel can
    be saved separately from the letters (e.g. as an integer).

    @param word: diacritized arabic word
    @type word: unicode
    @param method: "decimal" or "ascii"; any other value falls back to ascii
    @type method: str
    @return: (letters, encoded marks); if the decimal conversion fails,
        (original word, "") instead
    @rtype: tuple of (unicode, str or int)
    """
    letters, marks = ar.separate(word)

    if method == "decimal":
        transed = translate(marks, T2D_TRANS)
        # The decimal form is meant to be stored as an integer; if the
        # translated marks are not a valid number, signal failure the
        # same way as before: return the untouched word and "".
        try:
            transed = int(transed)
        except (TypeError, ValueError):
            return word, ""
    else:
        # "ascii" and any unrecognized method both use the ascii table,
        # matching the original if/elif/else behavior.
        transed = translate(marks, T2A_TRANS)
    return letters, transed
예제 #7
0
def verify_tashkeel(word):
    """Return True when separating and re-joining *word* is lossless."""
    segments = ar.separate(word)
    rebuilt = ar.joint(*segments)
    return rebuilt == word
예제 #8
0
    # Report, per pattern length, how many entries the nested tables hold
    # on average (presumably: vocalized forms per unvocalized pattern --
    # TODO confirm statTable's exact schema).  Python 2 code: uses the
    # print statement.
    for ln in statTable.keys():
        partialPatternCount = 0
        for patternkey in statTable[ln].keys():
            # Sum the sizes of the innermost key sets.
            partialPatternCount += len(statTable[ln][patternkey].keys())
        if statTable[ln].keys():
            average = partialPatternCount / len(statTable[ln].keys())
        else:
            # Avoid division by zero for empty length buckets.
            average = 0
        # Tab-separated columns: length, pattern count, average, tag.
        print "\t".join([str(ln), str(len(statTable[ln].keys())), str(average), "pw"])
# test vocalize a word
# For each word of the sample sentence, look up its harakat pattern in
# statTable and print every known vocalization.  Python 2 code: print
# statement and dict.has_key().
text = u"يأكل الولد التفاح بالعشاء "
words = araby.tokenize(text)
for word in words:
    # The pattern key abstracts the word into its diacritic skeleton.
    patternKey = harakatpattern.extractPattern(word)
    ln = len(patternKey)
    if statTable.has_key(ln) and statTable[ln].has_key(patternKey):
        print u"\t".join(statTable[ln][patternKey].keys()).encode("utf8")
        for vocalizedPattern in statTable[ln][patternKey].keys():
            # vocalizedPattern2=araby.stripShadda(vocalizedPattern)
            # letters,harakat = araby.separate(vocalizedPattern2)
            # vocalizedForm =araby.joint(word,harakat)

            # separate(..., True) returns a third value -- presumably the
            # shadda positions (per the variable name; confirm against
            # pyarabic).  Apply shadda first, then the harakat.
            letters, harakat, ShaddaPlaces = araby.separate(vocalizedPattern, True)
            newWord_nm = araby.joint(word, ShaddaPlaces)
            vocWord = araby.joint(newWord_nm, harakat)
            print u"\t".join([word, patternKey, vocalizedPattern, harakat, vocWord]).encode("utf8")
    else:
        print patternKey.encode("utf8"), "pattern non found"

        # print wordCount/patternCount;
예제 #9
0
                   for key, group in groupby(aa5irHarf)]
print(freqOfAa5irHarf)
import collections
# Frequency of each last letter (aa5ir harf) via collections.Counter.
counter = collections.Counter(aa5irHarf)
print(counter)
# Counter({1: 4, 2: 4, 3: 2, 5: 2, 4: 1})
print(counter.values())
# [4, 4, 2, 1, 2]
print(counter.keys())
# [1, 2, 3, 4, 5]
print(counter.most_common(3))
# [(1, 4), (2, 4), (3, 2)]
print(counter.most_common(1))
kkey = counter.most_common(1)
# We should write this to a file or otherwise save it,
# and also generalize it to all poems of each poet.

# Qafiya (rhyme): the last vowelless letter, traced back to the previous
# vowelless letter, together with the voweled letter before the
# next-to-last vowelless one.
print('********** Al Qafiya ************')
for line in f:
    # Drop tatweel so it does not pollute the harakat stream.
    line1 = araby.strip_tatweel(line)
    letters, hrkat = araby.separate(line1)
    #print(letters.encode('utf8'))
    for m in hrkat:
        # TODO: adjustments still needed here.
        if not araby.is_tatweel(m):
            print(araby.name(m))
            print(''.join(m))

# Most common words: to be computed over all poems of each poet.
            average = partialPatternCount / len(statTable[ln].keys())
        else:
            average = 0
        print "\t".join(
            [str(ln),
             str(len(statTable[ln].keys())),
             str(average), 'pw'])
# test vocalize a word
# Variant of the vocalization lookup above: for each word of the sample
# sentence, find its harakat pattern in statTable and print every known
# vocalization.  Python 2 code: print statement and dict.has_key().
text = u"يأكل الولد التفاح بالعشاء "
words = araby.tokenize(text)
for word in words:
    # The pattern key abstracts the word into its diacritic skeleton.
    patternKey = harakatpattern.extractPattern(word)
    ln = len(patternKey)
    if statTable.has_key(ln) and statTable[ln].has_key(patternKey):
        print u"\t".join(statTable[ln][patternKey].keys()).encode('utf8')
        for vocalizedPattern in statTable[ln][patternKey].keys():
            # vocalizedPattern2=araby.stripShadda(vocalizedPattern)
            # letters,harakat = araby.separate(vocalizedPattern2)
            # vocalizedForm =araby.joint(word,harakat)

            # separate(..., True) returns a third value -- presumably the
            # shadda positions (per the variable name; confirm against
            # pyarabic).  Apply shadda first, then the harakat.
            letters, harakat, ShaddaPlaces = araby.separate(
                vocalizedPattern, True)
            newWord_nm = araby.joint(word, ShaddaPlaces)
            vocWord = araby.joint(newWord_nm, harakat)
            print u"\t".join(
                [word, patternKey, vocalizedPattern, harakat,
                 vocWord]).encode('utf8')
    else:
        print patternKey.encode('utf8'), "pattern non found"

    # print wordCount/patternCount;
예제 #11
0
def get_tashkeel_binary(ayah):
    """
    Map an ayah (verse text) to a binary tashkeel pattern.

    Each character becomes '0' when it carries sukun or no diacritic and
    '1' when it carries a haraka, shadda or tanwin.  A space with no
    diacritic stays a space, so tokens keep their boundaries in the
    output pattern.

    Args:
        ayah (str): the text to analyse.  NOTE(review): the original
            docstring also claimed list input, but ``ayah.strip()`` only
            works on str -- confirm whether list support is required.

    Returns:
        tuple: (pattern string of '0'/'1'/' ', list of the raw marks)
    """
    # 0 => sukun or no mark; 1 => haraka, shadda or tanwin.
    mark_to_bit = {
        'ْ': 0,
        '': 0,
        'ُ': 1,
        'َ': 1,
        'ِ': 1,
        'ّ': 1,
        'ٌ': 1,
        'ً': 1,
        'ٍ': 1
    }
    no_mark = ''
    pattern_bits = []   # one 0/1 per character of the stripped text
    marks_list = []     # the raw mark (or '') per character

    text = ''.join(ayah.strip())
    pattern_with_spaces = ''

    # Normalize before separating: drop tatweel and expand alef madda.
    if tatweel in text:
        text = strip_tatweel(text)
    if alef_mad in text:
        text = unpack_alef_mad(text)

    # Split the letters from their diacritics.
    letters_only, marks = separate(text)

    for mark in marks:
        # pyarabic's separate() uses tatweel ('ـ') as the placeholder for
        # an undiacritized letter; treat it as "no mark".
        if mark != 'ـ':
            marks_list.append(mark)
            pattern_bits.append(mark_to_bit[mark])
        else:
            marks_list.append(no_mark)
            pattern_bits.append(mark_to_bit[no_mark])

    # Render the bit list as a string; an undiacritized space stays a
    # space so the per-token structure survives in the pattern.
    # (letters_only and pattern_bits have equal length since separate()
    # returns parallel sequences.)
    for ch, bit in zip(letters_only, pattern_bits):
        if ch == ' ' and bit == 0:
            pattern_with_spaces += ' '
        else:
            pattern_with_spaces += str(bit)
    return pattern_with_spaces, marks_list
예제 #12
0
    def prepare_dataset(self):
        """
        Read self.file line by line and build a dict of training instances.

        This variant folds shadda into the preceding letter's diacritic
        id, keeps a parallel "no-shadda" diacritic sequence, and emits a
        0/1 shaddah flag per letter.  Each instance is a tuple of six
        LongTensors: (letter ids, diacritic ids, diacritic ids without
        shadda, shaddah flags, diacritized word ids, undiacritized word
        ids).

        @return: dict mapping a running counter (0, 1, ...) to instances
        """
        data = {}
        counter = 0
        with open(self.file, encoding='utf-8') as file:
            for line in file:
                letter_ids = []
                diacritic_ids = []
                word_ids_nodiacs = []
                word_ids_diacs = []
                # Equal-length letter/diacritic strings; per usage
                # elsewhere in this file, undiacritized positions are
                # padded with tatweel ('ـ').
                letters, diacritics = araby.separate(line)
                # Drop the trailing element from every sequence --
                # presumably the newline / end-of-line token.
                letters = letters[0:-1]
                words = araby.tokenize(line)[0:-1]
                diacritics = diacritics[0:-1]
                diacritic_ids_nosh = []
                index = 0
                shaddahs = []
                for letter in letters:
                    # Skip newline and the right-to-left mark (U+200F).
                    # NOTE(review): 'continue' also skips 'index += 1'
                    # below, which would desync 'diacritics' if these
                    # characters ever occur mid-line -- confirm.
                    if (letter == '\n') or (letter == '\u200f'):
                        continue
                    # Shadda appears in the *letters* stream here; merge
                    # it into the previous letter's diacritic id.
                    if (letter == 'ّ'):
                        if diacritics[index] != 'ـ':
                            # Shadda combined with a following haraka.
                            diacritic_ids[-1] = self.diacritic_to_id[
                                letter + diacritics[index]]

                        else:
                            # Bare shadda with no accompanying haraka.
                            diacritic_ids[-1] = self.diacritic_to_id[letter]

                        # The no-shadda sequence keeps only the haraka.
                        diacritic_ids_nosh[-1] = self.diacritic_to_id_nosh[
                            diacritics[index]]

                    else:
                        letter_ids.append(self.letter_to_id[letter])
                        if letter == " ":
                            diacritic_ids.append(self.diacritic_to_id['space'])
                            diacritic_ids_nosh.append(
                                self.diacritic_to_id_nosh['space'])

                        else:
                            diacritic_ids.append(
                                self.diacritic_to_id[diacritics[index]])
                            diacritic_ids_nosh.append(
                                self.diacritic_to_id_nosh[diacritics[index]])

                    index += 1

                # Per-letter binary flag: 1 if its diacritic id maps back
                # to a diacritic string containing shadda, else 0.
                for diacritic_id in diacritic_ids:
                    if 'ّ' in self.id_to_diacritic[diacritic_id]:
                        shaddahs.append(1)
                    else:
                        shaddahs.append(0)

                for word in words:
                    # Index each word both with and without diacritics.
                    word_ids_diacs.append(self.word_to_id_diacs[word])
                    word_ids_nodiacs.append(
                        self.word_to_id_nodiacs[araby.strip_tashkeel(word)])

                instance = (torch.tensor(letter_ids,
                                         dtype=torch.long,
                                         requires_grad=False),
                            torch.tensor(diacritic_ids,
                                         dtype=torch.long,
                                         requires_grad=False),
                            torch.tensor(diacritic_ids_nosh,
                                         dtype=torch.long,
                                         requires_grad=False),
                            torch.tensor(shaddahs,
                                         dtype=torch.long,
                                         requires_grad=False),
                            torch.tensor(word_ids_diacs,
                                         dtype=torch.long,
                                         requires_grad=False),
                            torch.tensor(word_ids_nodiacs,
                                         dtype=torch.long,
                                         requires_grad=False))
                data[counter] = instance
                counter += 1
        return data