Пример #1
0
def attach_racine(combos):
    """Keep only the combinations whose extracted root exists in the roots table.

    For each combination, the unvoweled base is aligned position by position
    with the unvoweled pattern; only the base letters that sit on a
    FEH / AIN / LAM slot of the pattern are kept, and the result is looked up
    in ``Racine``.

    Args:
        combos: iterable of dicts with the keys 'Base', 'Préfixe', 'Suffixe',
            'pattern' (an object exposing ``unvoweled_form``), 'type' and
            'stype'. The original docstring names ``attach_sheme()`` as the
            producer of these dicts.

    Returns:
        A list of dicts carrying the six input keys plus 'root' (the extracted
        root), without duplicates, in first-seen order.
    """
    root_slots = (araby.FEH, araby.AIN, araby.LAM)
    good_comb = []
    for comb in combos:
        base_letters = araby.strip_diacritics(comb['Base'])
        pattern_letters = comb['pattern'].unvoweled_form
        pattern_len = len(pattern_letters)
        # Select letters by their ORIGINAL position. Deleting from the string
        # while walking the pattern indices shifts every later letter by one
        # and ends up dropping the wrong characters as soon as the pattern has
        # more than one non-root letter. Letters past the end of the pattern
        # are kept, as before.
        word_racine = ''.join(
            letter for pos, letter in enumerate(base_letters)
            if pos >= pattern_len or pattern_letters[pos] in root_slots)

        if not Racine.objects.filter(unvoweled_form=word_racine).exists():
            continue

        candidate = {
            'Base': comb['Base'],
            'Préfixe': comb['Préfixe'],
            'Suffixe': comb['Suffixe'],
            'pattern': comb['pattern'],
            'type': comb['type'],
            'stype': comb['stype'],
            'root': word_racine,
        }
        if candidate not in good_comb:
            good_comb.append(candidate)
    return good_comb
Пример #2
0
def outfile() -> str:
    '''
    Return the cleaned poem corpus, using ``arabic_poems.txt`` as a cache.

    If the cache file can be read, its content is returned as is. Otherwise
    the poems are scraped and cleaned, joined with single spaces, stripped of
    diacritics, written to the cache file and returned.

    Returns
    -------
    str
        Cleaned corpus of all poems. Poems joined as one text, and diacritics
        removed. Additionally, the clean text is written to a file to prevent
        having to scrape data over and over.

    '''
    cache_path = 'arabic_poems.txt'

    try:
        # Fast path: the corpus was already scraped on a previous run.
        with open(cache_path, encoding="utf-8") as cached:
            return cached.read()
    except (OSError, UnicodeDecodeError):
        # Cache missing or unreadable: rebuild it. Only I/O and decoding
        # failures are caught so that KeyboardInterrupt / SystemExit are no
        # longer swallowed.
        content = clean_poems(scrape_poetry_trans())

        data = ' '.join(content)
        data = araby.strip_diacritics(data)
        with open(cache_path, "w", encoding="utf-8") as cache:
            cache.write(data)

    return data
Пример #3
0
def mot_except(word):
    """Return the exceptional words from the database that match *word*.

    Candidates are the ``ExceptionalWord`` rows whose ``unvoweled_form`` equals
    *word* without diacritics; a candidate is kept when its voweled form is
    accepted by ``araby.vocalizedlike`` against *word*.

    Args:
        word: the (possibly vocalized) word to look up.

    Returns:
        A list of matching ``ExceptionalWord`` objects (empty if none).
    """
    unvoweled = araby.strip_diacritics(word)
    # Compare against the stored voweled string, not the model instance,
    # as mot_outil() and nom_propre() do for their own tables.
    return [
        me for me in ExceptionalWord.objects.filter(unvoweled_form=unvoweled)
        if araby.vocalizedlike(word, me.voweled_form)
    ]
Пример #4
0
def halat_al_irab(word):
    """Guess the grammatical case/mood of *word* from its ending.

    Args:
        word: the word to inspect, ideally carrying its final diacritic.

    Returns:
        'raf3', 'nasb', 'jarr' or 'jazm' as a string; the tuple
        ``('nasb', 'jarr')`` when the unvoweled word ends like a dual in
        yeh-noon; ``None`` when the word is empty or its last character is not
        one of ``araby.HARAKAT``.
    """
    # An empty word has no ending to inspect (word[-1] would raise).
    if not word:
        return None

    # The dual and what follows its rule: nominative is marked with alif,
    # accusative and genitive with ya'.
    # Maybe adding the dual affixes and testing for alif or ya' would be better...
    unvoweled_ending = araby.strip_diacritics(word)[-2:]
    if unvoweled_ending == 'ان':
        return 'raf3'
    if unvoweled_ending == 'ين':
        return 'nasb', 'jarr'

    last_char = word[-1]
    if last_char not in araby.HARAKAT:
        return None

    case_by_mark = {
        araby.DAMMA: 'raf3',
        araby.DAMMATAN: 'raf3',
        araby.FATHA: 'nasb',
        araby.FATHATAN: 'nasb',
        araby.KASRA: 'jarr',
        araby.KASRATAN: 'jarr',
        araby.SUKUN: 'jazm',
    }
    return case_by_mark.get(last_char)
Пример #5
0
def mot_outil(word):
    """Check *word* against the tool words stored in the database.

    Every segmentation returned by ``decoupage(word)`` is tried: the tool
    words sharing the segmentation's unvoweled base are fetched, and those
    whose voweled form is accepted by ``araby.vocalizedlike`` are reported.

    Returns:
        A list of dicts with the keys 'tw_object' (the matching ``ToolWord``),
        'Préfixe' and 'Suffixe' (taken from the segmentation).
    """
    matches = []
    for segment in decoupage(word):
        base = segment['Base']
        candidates = ToolWord.objects.filter(
            unvoweled_form=araby.strip_diacritics(base))
        matches.extend(
            {
                'tw_object': tool_word,
                'Préfixe': segment['Préfixe'],
                'Suffixe': segment['Suffixe'],
            }
            for tool_word in candidates
            if araby.vocalizedlike(base, tool_word.voweled_form))
    return matches
Пример #6
0
def nom_propre(word):
    """Check *word* against the proper nouns stored in the database.

    Each segmentation from ``decoupage(word)`` is matched against the
    ``ProperNoun`` rows with the same unvoweled base, keeping those whose
    voweled form is accepted by ``araby.vocalizedlike``.

    Returns:
        A list of dicts with the keys 'pn_object' (the matching
        ``ProperNoun``), 'Base', 'Préfixe' and 'Suffixe'.
    """
    found = []
    for segment in decoupage(word):
        base = segment['Base']
        same_skeleton = ProperNoun.objects.filter(
            unvoweled_form=araby.strip_diacritics(base))
        for noun in same_skeleton:
            if not araby.vocalizedlike(base, noun.voweled_form):
                continue
            found.append({
                'pn_object': noun,
                'Base': segment['Base'],
                'Préfixe': segment['Préfixe'],
                'Suffixe': segment['Suffixe'],
            })
    return found
def normalizeArabicText(sentence):
    """Normalize an Arabic sentence through a fixed sequence of substitutions.

    Steps, applied in this exact order:
      1. letter unification and punctuation removal (first table below);
      2. ``araby.strip_diacritics``;
      3. a second round of substitutions (second table below).

    Returns the normalized string.
    """
    # (old, new) pairs applied before diacritics are stripped.
    before_strip = (
        ('ة', 'ه'),
        ('أ', 'ا'),
        ('ى', 'ي'),
        ('إ', 'ا'),
        ('،', ''),
        (',', ''),
        ('؟', ''),
        ('-', ''),
        ('ـ', ''),
        ('(', ''),
        (')', ''),
        (':', ''),
    )
    # (old, new) pairs applied after diacritics are stripped.
    after_strip = (
        ('"', ''),
        ('!', ''),
        ('تُ', 'ت'),
        ('إ', 'ا'),
    )

    for old, new in before_strip:
        sentence = sentence.replace(old, new)

    sentence = araby.strip_diacritics(sentence)

    for old, new in after_strip:
        sentence = sentence.replace(old, new)

    return sentence
Пример #8
0
def decoupage(word):
    """Split *word* into every accepted (prefix, base, suffix) combination.

    Prefixes and suffixes are read from the ``Prefixe`` / ``Suffixe`` tables.
    An affix is a candidate when the unvoweled word starts (resp. ends) with
    its unvoweled form and, if *word* carries diacritics, when the matching
    slice of *word* is accepted by ``araby.vocalizedlike`` against the affix's
    voweled form. The empty prefix and the empty suffix are always candidates.

    When both a prefix and a suffix are present, the pair is rejected if the
    remaining unvoweled base has 2 letters or fewer or more than 9, or if the
    affix classes are incompatible (see the class test in the body). Pairs
    with an empty prefix or an empty suffix skip this validation.

    Args:
        word: the word to segment, with or without diacritics.

    Returns:
        A list of dicts with the keys 'Base' (the remaining part of *word*,
        diacritics kept), 'Préfixe' and 'Suffixe' (a model instance, or ``""``
        when absent). 'Base' can be the empty string when *word* consists of
        a prefix only.
    """
    word_unvocalized = araby.strip_diacritics(word)
    # Loop-invariant: whether the vocalization of the affixes must be checked.
    word_is_vocalized = araby.is_vocalized(word)
    prefixes, suffixes = [""], [""]
    combinaisons_possibles = []

    for p in Prefixe.objects.all():
        if not word_unvocalized.startswith(p.unvoweled_form):
            continue
        if (not word_is_vocalized
                or araby.vocalizedlike(word[:len(p.voweled_form)],
                                       p.voweled_form)):
            prefixes.append(p)

    for s in Suffixe.objects.all():
        if not word_unvocalized.endswith(s.unvoweled_form):
            continue
        if (not word_is_vocalized
                or araby.vocalizedlike(word[-len(s.voweled_form):],
                                       s.voweled_form)):
            suffixes.append(s)

    for pr in prefixes:
        for sf in suffixes:
            # Validation criteria (only when both affixes are present).
            if pr != "" and sf != "":
                base_length = (len(word_unvocalized)
                               - len(pr.unvoweled_form)
                               - len(sf.unvoweled_form))
                if base_length <= 2 or base_length > 9:
                    continue
                if ((pr.classe[0] == 'N' and sf.classe[0] == 'V')
                        or (pr.classe[0] == 'V' and sf.classe[0] == 'N')
                        or (pr.classe in ['N1', 'N2', 'N3', 'N5'])):
                    continue
            # Reaching this point: the prefix is compatible with the suffix
            # and the size of the base is acceptable.
            base = word

            # Remove the prefix from the base, keeping the base's tashkeel.
            if pr:
                for char in pr.unvoweled_form:
                    # Skip the diacritics sitting before the next prefix
                    # letter; the letter is always found because the
                    # unvoweled word starts with the prefix.
                    while char != base[0]:
                        base = base[1:]
                    base = base[1:]
                # Drop the diacritics attached to the last prefix letter.
                # `base` is empty here when the word is exactly the prefix,
                # so guard the index access instead of raising IndexError.
                while base and araby.is_tashkeel(base[0]):
                    base = base[1:]

            # Remove the suffix from the base, keeping the base's tashkeel:
            # cut at the last occurrence of each suffix letter, right to left.
            if sf:
                for char in reversed(sf.unvoweled_form):
                    base = base[:base.rindex(char)]

            combinaisons_possibles.append({
                'Base': base,
                'Préfixe': pr,
                'Suffixe': sf
            })

    return combinaisons_possibles
Пример #9
0
    def handle(self, *args, **options):
        """Populate the lexical tables from the resource files under ``Res/``.

        Loaded in this order, one row created per entry: tool words, proper
        nouns and exceptional words (XML), prefixes and suffixes (XML), roots
        (``Res/racines.txt``) and patterns (``Res/translated_patterns.txt``).
        A success message is written to ``self.stdout`` after each table and
        once more at the end.

        All paths are relative to the current working directory.
        ``*args`` and ``**options`` are accepted but not used.
        """
        # Loading the specific words:

        mots_outils_doc = minidom.parse('Res/toolwords.xml')
        noms_propres_doc = minidom.parse('Res/propernouns.xml')
        mots_except_doc = minidom.parse('Res/exceptionalwords.xml')

        mots_outils_items = mots_outils_doc.getElementsByTagName('toolword')
        noms_propres_items = noms_propres_doc.getElementsByTagName(
            'propernoun')
        mots_except_items = mots_except_doc.getElementsByTagName(
            'exceptionalword')

        for item in mots_outils_items:
            attrs = item.attributes
            voweled = attrs['voweledform'].value
            # Tool words carry no unvoweled form in the XML: derive it.
            ToolWord.objects.create(
                priority=attrs['priority'].value,
                prefixe_class=attrs['prefixeclass'].value,
                suffixe_class=attrs['suffixeclass'].value,
                ttype=attrs['type'].value,
                voweled_form=voweled,
                unvoweled_form=araby.strip_diacritics(voweled))
        self.stdout.write(self.style.SUCCESS('ToolWords Populated !'))

        for item in noms_propres_items:
            attrs = item.attributes
            ProperNoun.objects.create(
                voweled_form=attrs['voweledform'].value,
                unvoweled_form=attrs['unvoweledform'].value,
                ptype=attrs['type'].value)
        self.stdout.write(self.style.SUCCESS('ProperNouns Populated !'))

        for item in mots_except_items:
            attrs = item.attributes
            ExceptionalWord.objects.create(
                stem=attrs['stem'].value,
                prefixe=attrs['prefix'].value,
                suffixe=attrs['suffix'].value,
                etype=attrs['type'].value,
                voweled_form=attrs['voweledform'].value,
                unvoweled_form=attrs['unvoweledform'].value)
        self.stdout.write(self.style.SUCCESS('ExceptionalWords Populated !'))
        # --------------------------------------------------------------------------------------------

        pref_doc = minidom.parse('Res/prefixes.xml')
        suff_doc = minidom.parse('Res/suffixes.xml')

        pref_items = pref_doc.getElementsByTagName('prefixe')
        suff_items = suff_doc.getElementsByTagName('suffixe')

        # Loading the clitics and infixes:

        for item in pref_items:
            attrs = item.attributes
            Prefixe.objects.create(
                classe=attrs['classe'].value,
                description=attrs['desc'].value,
                voweled_form=attrs['voweledform'].value,
                unvoweled_form=attrs['unvoweledform'].value)
        self.stdout.write(self.style.SUCCESS('Prefixes Populated !'))

        for item in suff_items:
            attrs = item.attributes
            Suffixe.objects.create(
                classe=attrs['classe'].value,
                description=attrs['desc'].value,
                voweled_form=attrs['voweledform'].value,
                unvoweled_form=attrs['unvoweledform'].value)
        self.stdout.write(self.style.SUCCESS('Suffixes Populated !'))

        # Loading the roots:
        # each line is "<key>:<whitespace-separated roots>"; a key that
        # appears again replaces the roots recorded for it earlier.
        racines = {}
        with open('Res/racines.txt', 'r', encoding="UTF-8") as racines_file:
            for root_line in racines_file:
                hrf, msdr = root_line.split(':')
                racines[hrf] = msdr.split()

        for roots_of_key in racines.values():
            for root in roots_of_key:
                Racine.objects.create(unvoweled_form=root)
        self.stdout.write(self.style.SUCCESS('Roots Populated !'))
        # --------------------------------------------------------------------------------------------

        # Loading the patterns:
        # each line holds six tab-separated fields.
        with open('Res/translated_patterns.txt', 'r',
                  encoding='UTF-8') as patterns_file:
            pattern_lines = patterns_file.readlines()

        for line in pattern_lines:
            (pattern_field, type_field, ntype_field, vtype_field,
             broken_plural_field, comment_field) = line.split('\t')
            # The fixed offsets presumably skip a "name=" label in front of
            # each value (e.g. 8 characters for "pattern=") — TODO confirm
            # against the resource file.
            pattern = pattern_field[8:]
            Pattern.objects.create(
                voweled_form=pattern,
                unvoweled_form=araby.strip_diacritics(pattern),
                ptype=type_field[5:],
                ntype=ntype_field[6:],
                vtype=vtype_field[6:],
                broken_plural=broken_plural_field[15:] == 'yes',
                comment=comment_field[8:].replace('\n', ''))
        self.stdout.write(self.style.SUCCESS('Patterns Populated !'))

        # Instantiating a set of rules (disabled):

        # RP = RulePack.objects.create(name="AutoINIT")
        # try:
        #     State.objects.get(rule_pack=RP, label="START")
        # except State.DoesNotExists:
        #     State.objects.create(rule_pack=RP, label="START", is_start=True, is_end=False)

        self.stdout.write(self.style.SUCCESS('Successfully populated data !'))