def attach_racine(combos):
    """Filter the compositions produced by attach_sheme(): keep only those
    whose base, once the pattern's non-root letters are removed, matches a
    root present in the Racine table.

    Parameters
    ----------
    combos : list of dict
        Each dict carries 'Base', 'Préfixe', 'Suffixe', 'pattern'
        (a Pattern model instance), 'type' and 'stype'.

    Returns
    -------
    list of dict
        The accepted compositions, each extended with 'root' (the cleaned
        root string). Word-type derivation from the scheme is still TODO,
        as in the original French docstring.
    """
    good_comb = []
    root_letters = (araby.FEH, araby.AIN, araby.LAM)
    for comb in combos:
        stripped_base = araby.strip_diacritics(comb['Base'])
        stripped_sheme = comb['pattern'].unvoweled_form
        # Keep only the characters aligned with the root slots (ف ع ل) of
        # the pattern. BUGFIX: the previous version deleted characters by
        # sheme index from a string that shrank after each deletion, so
        # every deletion after the first removed the wrong character.
        # Assumes base and pattern have equal unvoweled length — TODO
        # confirm against attach_sheme().
        word_racine = ''.join(
            ch for ch, pat in zip(stripped_base, stripped_sheme)
            if pat in root_letters)
        if Racine.objects.filter(unvoweled_form=word_racine).exists():
            # Build the candidate once instead of spelling the literal
            # twice for the membership test and the append.
            candidate = {
                'Base': comb['Base'],
                'Préfixe': comb['Préfixe'],
                'Suffixe': comb['Suffixe'],
                'pattern': comb['pattern'],
                'type': comb['type'],
                'stype': comb['stype'],
                'root': word_racine
            }
            if candidate not in good_comb:
                good_comb.append(candidate)
    return good_comb
def outfile() -> str:
    '''
    Returns
    -------
    str
        Cleaned corpus of all poems. Poems joined as one text, and
        diacritics removed. Additionally, the clean text is written to a
        file to prevent having to scrape data over and over.
    '''
    # Fast path: reuse the cached corpus file if it is readable.
    # BUGFIX: was a bare `except:` and unclosed file handles; now only
    # file-system errors (missing file, permissions, ...) trigger a
    # re-scrape, and all handles are closed via `with`.
    try:
        with open('arabic_poems.txt', encoding="utf-8") as cached:
            return cached.read()
    except OSError:
        content = clean_poems(scrape_poetry_trans())
        data = araby.strip_diacritics(' '.join(content))
        # Cache the cleaned corpus for subsequent calls.
        with open("arabic_poems.txt", "w+", encoding="utf-8") as out:
            out.write(data)
        return data
def mot_except(word):
    """Return the ExceptionalWord entries that *word* matches in the DB.

    A candidate matches when its unvoweled form equals the diacritics-
    stripped input and its voweled form is vocalized-compatible with the
    input.
    """
    combs = []
    for me in ExceptionalWord.objects.filter(
            unvoweled_form=araby.strip_diacritics(word)):
        # BUGFIX: compare against the voweled form string, not the model
        # instance itself — consistent with mot_outil()/nom_propre(),
        # which pass `.voweled_form` to vocalizedlike().
        if araby.vocalizedlike(word, me.voweled_form):
            combs.append(me)
    return combs
def halat_al_irab(word):
    """Guess the grammatical case (i'rab state) of *word*.

    Returns 'raf3', 'nasb', 'jarr' or 'jazm' from the final short vowel,
    the tuple ('nasb', 'jarr') for an ambiguous dual ending, or None when
    nothing can be decided. NOTE(review): the tuple return for the dual is
    kept from the original — callers must handle both shapes.
    """
    # Guard: the original raised IndexError on an empty string.
    if not word:
        return None
    # المثنى وما يلحق به : يرفع بالألف ، وينصب ويجر بالياء
    # (The dual and what follows it: raf3 with alif, nasb/jarr with ya'.)
    # TODO: adding affixes for dual and testing with alif or ya' may be better.
    unvoweled = araby.strip_diacritics(word)  # hoisted: was computed twice
    if unvoweled[-2:] == 'ان':
        return 'raf3'
    elif unvoweled[-2:] == 'ين':
        return 'nasb', 'jarr'
    if word[-1] in araby.HARAKAT:
        if word[-1] in [araby.DAMMA, araby.DAMMATAN]:
            return 'raf3'
        elif word[-1] in [araby.FATHA, araby.FATHATAN]:
            return 'nasb'
        elif word[-1] in [araby.KASRA, araby.KASRATAN]:
            return 'jarr'
        elif word[-1] == araby.SUKUN:
            return 'jazm'
        # Falls through to implicit None for any other haraka (e.g. shadda).
    else:
        return None
def mot_outil(word):
    """Detect whether the input word is a tool word according to the DB.

    The word is first segmented by decoupage(); each candidate base is
    looked up among the ToolWord entries sharing its unvoweled form, and
    kept when the vocalization is compatible.
    """
    results = []
    for decomposition in decoupage(word):
        base = decomposition['Base']
        candidates = ToolWord.objects.filter(
            unvoweled_form=araby.strip_diacritics(base))
        for tool_word in candidates:
            if araby.vocalizedlike(base, tool_word.voweled_form):
                results.append({
                    'tw_object': tool_word,
                    'Préfixe': decomposition['Préfixe'],
                    'Suffixe': decomposition['Suffixe'],
                })
    return results
def nom_propre(word):
    """Detect whether the input word is a proper noun according to the DB.

    The word is first segmented by decoupage(); each candidate base is
    looked up among the ProperNoun entries sharing its unvoweled form, and
    kept when the vocalization is compatible.
    """
    results = []
    for decomposition in decoupage(word):
        base = decomposition['Base']
        candidates = ProperNoun.objects.filter(
            unvoweled_form=araby.strip_diacritics(base))
        for proper_noun in candidates:
            if araby.vocalizedlike(base, proper_noun.voweled_form):
                results.append({
                    'pn_object': proper_noun,
                    'Base': base,
                    'Préfixe': decomposition['Préfixe'],
                    'Suffixe': decomposition['Suffixe'],
                })
    return results
def normalizeArabicText(sentence):
    """Normalize an Arabic sentence for comparison/processing.

    Maps common letter variants to a canonical form (ة→ه, أ/إ→ا, ى→ي),
    removes punctuation and the tatweel, and strips diacritics.
    """
    # Single C-level pass instead of the original chain of 14 .replace()
    # calls. Dead statements removed: 'إ' was replaced twice, and the
    # 'تُ'→'ت' replacement ran after strip_diacritics had already removed
    # the damma, so it could never match.
    table = str.maketrans({
        'ة': 'ه', 'أ': 'ا', 'ى': 'ي', 'إ': 'ا',      # letter normalization
        '،': None, ',': None, '؟': None, '-': None,  # punctuation removal
        'ـ': None, '(': None, ')': None, ':': None,
        '"': None, '!': None,
    })
    return araby.strip_diacritics(sentence.translate(table))
def decoupage(word):
    """Segment the input word into (prefix, stem, suffixes).

    Returns a list of dicts ('Base', 'Préfixe', 'Suffixe') covering every
    syntactically acceptable combination, filtered by prefix/suffix class
    compatibility and by the resulting stem length.
    """
    word_unvocalized = araby.strip_diacritics(word)
    # "" acts as the no-prefix / no-suffix sentinel so the product loop
    # below also yields bare-stem combinations.
    prefixes, suffixes = [""], [""]
    combinaisons_possibles = []
    # Collect every DB prefix matching the start of the word; when the
    # input carries diacritics, the voweled forms must also be compatible.
    for p in Prefixe.objects.all():
        if word_unvocalized.startswith(p.unvoweled_form):
            # print("p:"+p.unvoweled_form)
            if araby.is_vocalized(word):
                if araby.vocalizedlike(word[:len(p.voweled_form)], p.voweled_form):
                    prefixes.append(p)
            else:
                prefixes.append(p)
    # Same collection for suffixes, anchored at the end of the word.
    for s in Suffixe.objects.all():
        if word_unvocalized.endswith(s.unvoweled_form):
            if araby.is_vocalized(word):
                if araby.vocalizedlike(word[-len(s.voweled_form):], s.voweled_form):
                    suffixes.append(s)
            else:
                suffixes.append(s)
    for pr in prefixes:
        for sf in suffixes:
            # Validation criteria — only applied when both a prefix and a
            # suffix are present (the "" sentinels skip this block).
            if pr != "" and sf != "":
                # Remaining stem must be 3..9 characters long.
                if (len(word_unvocalized) - len(pr.unvoweled_form) - len(sf.unvoweled_form)) <= 2 or \
                   (len(word_unvocalized) - len(pr.unvoweled_form) - len(sf.unvoweled_form)) > 9:
                    continue
                # Reject nominal prefix with verbal suffix (and vice
                # versa), plus the N1/N2/N3/N5 prefix classes outright.
                if ((pr.classe[0] == 'N' and sf.classe[0] == 'V') or
                        (pr.classe[0] == 'V' and sf.classe[0] == 'N') or
                        (pr.classe in ['N1', 'N2', 'N3', 'N5'])):
                    continue
            # Reaching here: prefix is compatible with suffix and the stem
            # size is acceptable.
            base = word
            # Strip the prefix from the base while KEEPING the tashkil:
            # for each prefix letter, skip forward over diacritics until
            # that letter is found, then drop it; finally drop any
            # trailing diacritics left at the head.
            if pr:
                for char in pr.unvoweled_form:
                    while char != base[0]:
                        base = base[1:]
                    base = base[1:]
                while araby.is_tashkeel(base[0]):
                    base = base[1:]
            # Strip the suffix from the base while keeping the tashkil:
            # cut at the RIGHTMOST occurrence of each suffix letter,
            # walking the suffix backwards.
            if sf:
                r_sf = [c for c in sf.unvoweled_form]
                r_sf.reverse()
                for char in r_sf:
                    base = base[:base.rindex(char)]
            combinaisons_possibles.append({
                'Base': base,
                'Préfixe': pr,
                'Suffixe': sf
            })
    return combinaisons_possibles
def handle(self, *args, **options): # Chargement des mots spécifiques : mots_outils_doc = minidom.parse('Res/toolwords.xml') noms_propres_doc = minidom.parse('Res/propernouns.xml') mots_except_doc = minidom.parse('Res/exceptionalwords.xml') mots_outils_items = mots_outils_doc.getElementsByTagName('toolword') noms_propres_items = noms_propres_doc.getElementsByTagName( 'propernoun') mots_except_items = mots_except_doc.getElementsByTagName( 'exceptionalword') for item in mots_outils_items: ToolWord.objects.create( priority=item.attributes['priority'].value, prefixe_class=item.attributes['prefixeclass'].value, suffixe_class=item.attributes['suffixeclass'].value, ttype=item.attributes['type'].value, voweled_form=item.attributes['voweledform'].value, unvoweled_form=araby.strip_diacritics( item.attributes['voweledform'].value)) self.stdout.write(self.style.SUCCESS('ToolWords Populated !')) for item in noms_propres_items: ProperNoun.objects.create( voweled_form=item.attributes['voweledform'].value, unvoweled_form=item.attributes['unvoweledform'].value, ptype=item.attributes['type'].value) self.stdout.write(self.style.SUCCESS('ProperNouns Populated !')) for item in mots_except_items: ExceptionalWord.objects.create( stem=item.attributes['stem'].value, prefixe=item.attributes['prefix'].value, suffixe=item.attributes['suffix'].value, etype=item.attributes['type'].value, voweled_form=item.attributes['voweledform'].value, unvoweled_form=item.attributes['unvoweledform'].value) self.stdout.write(self.style.SUCCESS('ExceptionalWords Populated !')) # -------------------------------------------------------------------------------------------- pref_doc = minidom.parse('Res/prefixes.xml') suff_doc = minidom.parse('Res/suffixes.xml') pref_items = pref_doc.getElementsByTagName('prefixe') suff_items = suff_doc.getElementsByTagName('suffixe') # Chargement des clitiques et infixes : for item in pref_items: Prefixe.objects.create( classe=item.attributes['classe'].value, 
description=item.attributes['desc'].value, voweled_form=item.attributes['voweledform'].value, unvoweled_form=item.attributes['unvoweledform'].value) self.stdout.write(self.style.SUCCESS('Prefixes Populated !')) for item in suff_items: Suffixe.objects.create( classe=item.attributes['classe'].value, description=item.attributes['desc'].value, voweled_form=item.attributes['voweledform'].value, unvoweled_form=item.attributes['unvoweledform'].value) self.stdout.write(self.style.SUCCESS('Suffixes Populated !')) # Chargement des racines : racines = {} lines = open('Res/racines.txt', 'r', encoding="UTF-8").readlines() for l in lines: hrf, msdr = l.split(':') racines[hrf] = msdr.split() for r in racines.values(): for rr in r: Racine.objects.create(unvoweled_form=rr) self.stdout.write(self.style.SUCCESS('Roots Populated !')) # -------------------------------------------------------------------------------------------- # Chargement des shémes : for line in open('Res/translated_patterns.txt', 'r', encoding='UTF-8').readlines(): pattern, type, nType, vType, isBrokenPlural, comment = line.split( '\t') pattern = pattern[8:] type = type[5:] nType = nType[6:] vType = vType[6:] isBrokenPlural = isBrokenPlural[15:] if isBrokenPlural == 'yes': isBrokenPlural = True else: isBrokenPlural = False comment = comment[8:].replace('\n', '') Pattern.objects.create( voweled_form=pattern, unvoweled_form=araby.strip_diacritics(pattern), ptype=type, ntype=nType, vtype=vType, broken_plural=isBrokenPlural, comment=comment) self.stdout.write(self.style.SUCCESS('Patterns Populated !')) # Instanciation d'un ensemble de régles : # RP = RulePack.objects.create(name="AutoINIT") # try: # State.objects.get(rule_pack=RP, label="START") # except State.DoesNotExists: # State.objects.create(rule_pack=RP, label="START", is_start=True, is_end=False) self.stdout.write(self.style.SUCCESS('Successfully populated data !'))