def make_suggestions(self, ):
    """
    Build the suggestion markup for this rule according to its category.

    Reads self.category, self.suggestions and self.pattern.
    @return: the concatenated suggestion markup (empty when nothing
        is produced).
    @rtype: unicode.
    """
    suggestions = ""
    if self.category == u"صفة":
        # the second pattern token is matched and replaced by the second
        # token of every suggestion
        pieces = ["""<match no="1"/> """]
        for sug in self.suggestions:
            sug_words = [araby.strip_lastharaka(s)
                         for s in araby.tokenize(sug)]
            bare_pattern = [araby.strip_tashkeel(t) for t in self.pattern]
            match = bare_pattern[1] if len(bare_pattern) >= 2 else "TODO"
            suggest = sug_words[1] if len(sug_words) >= 2 else sug
            pieces.append(
                u""" <suggestion><match no="2" regexp_match="%s" regexp_replace="%s"/></suggestion>\n""" % (
                    match, suggest))
        suggestions = "".join(pieces)
    elif self.category in (u"كلمة واحدة", u"فعل"):
        # NOTE(review): match/suggest are computed from the first tokens
        # but nothing is appended, so this branch returns an empty
        # string -- confirm whether a suggestion line is missing here.
        for sug in self.suggestions:
            sug_words = [araby.strip_lastharaka(s)
                         for s in araby.tokenize(sug)]
            bare_pattern = [araby.strip_tashkeel(t) for t in self.pattern]
            match = bare_pattern[0] if len(bare_pattern) >= 1 else "TODO"
            suggest = sug_words[0] if len(sug_words) >= 1 else sug
    elif self.category in (u"متعدي بحرف", u"متعدي إلى مفعولين"):
        verb = self.pattern[0]
        # three candidate suggestions, each emitted inside an XML comment
        suggestions = "".join([
            u"<!-- Verb Intransitive to transitive\n",
            u"""<suggestion><match no="1" postag="(V.*a.;..)(-)" postag_replace="$1H">%s</match><match no="2" regexp_match="ل" regexp_replace=""/></suggestion>""" % (
                verb),
            u"\n-->\n",
            u"<!-- Verb Intransitive to transitive, when the preposition letter is attached to a noun\n",
            u"""<suggestion><match no="1" postag="(V.*a.;..)(-)" postag_replace="$1H">%s</match> <match no="2" regexp_match="^ب" regexp_replace=""/></suggestion>""" % (
                verb),
            u"\n-->\n",
            u"<!-- Verb Transitive to intransitive\n",
            u""" <suggestion><match no="1" postag="(V-1.*)(.)" postag_replace="$1-" postag_regexp="yes">%s</match> في<match no="1" regexp_match="(.*)((&verb_encletics;)$)|(.*)((&verb_encletics;)?$)" regexp_replace="$2"/></suggestion>""" % (
                verb),
            u"\n-->\n",
        ])
    else:
        # any other category: one plain suggestion element per entry
        suggestions = "".join(
            [u" <suggestion>%s</suggestion>\n" % sug
             for sug in self.suggestions])
    return suggestions
def vocalize(verb, proclitic, enclitic):
    """
    Glue a verb to its proclitic and enclitic and return the result.

    The vocalized affixes are the first "vocalized" entries found in
    svconst.COMP_PREFIX_LIST_TAGS / svconst.COMP_SUFFIX_LIST_TAGS.
    @param verb: verb found in dictionary.
    @type verb: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: (vocalized word, word with the verb's last haraka stripped).
    @rtype: (unicode, unicode).
    """
    # the enclitic variant is chosen from the verb as it was passed in
    suffix_form = get_enclitic_variant(
        verb, svconst.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0])
    prefix_form = svconst.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]
    if enclitic:
        # separating alef case: drop the alef that follows a final waw
        if verb.endswith(araby.WAW + araby.ALEF):
            verb = verb[:-1]
        # a final alef maksura becomes a plain alef before an enclitic
        if verb.endswith(araby.ALEF_MAKSURA):
            verb = verb[:-1] + araby.ALEF
    full_form = ''.join((prefix_form, verb, suffix_form))
    semi_form = ''.join(
        (prefix_form, araby.strip_lastharaka(verb), suffix_form))
    return (full_form, semi_form)
def get_word_variant(word, suffix):
    """
    Return the form of the word to which the suffix can be attached.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    bare_suffix = araby.strip_tashkeel(suffix)
    # the final haraka of the word is only dropped when a suffix follows
    stem = araby.strip_lastharaka(word) if suffix else word
    if bare_suffix == u"":
        # suffix carries no letters: nothing else to adapt
        return stem
    if stem.endswith(araby.ALEF_MAKSURA):
        return stem[:-1] + araby.YEH
    if stem.endswith(araby.HAMZA):
        # the hamza seat follows the first mark of the suffix
        if suffix.startswith(araby.DAMMA):
            return stem[:-1] + araby.WAW_HAMZA
        if suffix.startswith(araby.KASRA):
            return stem[:-1] + araby.YEH_HAMZA
    return stem
def vocalize(verb, proclitic, enclitic):
    """
    Glue a verb to its proclitic and enclitic and return the result.

    The vocalized affixes are the first "vocalized" entries found in
    SVC.COMP_PREFIX_LIST_TAGS / SVC.COMP_SUFFIX_LIST_TAGS.
    @param verb: verb found in dictionary.
    @type verb: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: (vocalized word, word with the verb's last haraka stripped).
    @rtype: (unicode, unicode).
    """
    raw_enclitic = SVC.COMP_SUFFIX_LIST_TAGS[enclitic]["vocalized"][0]
    # the enclitic variant is chosen from the verb as it was passed in
    tail = get_enclitic_variant(verb, raw_enclitic)
    head = SVC.COMP_PREFIX_LIST_TAGS[proclitic]["vocalized"][0]

    if enclitic:
        # separating alef case: drop the alef that follows a final waw
        if verb.endswith(ar.WAW + ar.ALEF):
            verb = verb[:-1]
        # a final alef maksura becomes a plain alef before an enclitic
        if verb.endswith(ar.ALEF_MAKSURA):
            verb = verb[:-1] + ar.ALEF

    return (
        ''.join((head, verb, tail)),
        ''.join((head, ar.strip_lastharaka(verb), tail)),
    )
def get_word_variant(word, suffix):
    """
    Return the form of the word to which the suffix can be attached.

    The last haraka of the word is always stripped; the final letter is
    then adapted (teh marbuta, alef maksura, hamza) depending on the
    suffix.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    bare_suffix = araby.strip_tashkeel(suffix)
    stem = araby.strip_lastharaka(word)
    has_letters = bare_suffix != u""

    if stem.endswith(araby.TEH_MARBUTA):
        # suffixes before which the teh marbuta is removed altogether
        dropping = (araby.ALEF + araby.TEH,
                    araby.YEH + araby.TEH_MARBUTA,
                    araby.YEH,
                    araby.YEH + araby.ALEF + araby.TEH)
        if bare_suffix in dropping:
            return stem[:-1]
        if has_letters:
            # otherwise it is opened into a plain teh
            return stem[:-1] + araby.TEH
        return stem

    if has_letters and stem.endswith(araby.ALEF_MAKSURA):
        return stem[:-1] + araby.YEH

    if has_letters and stem.endswith(araby.HAMZA):
        # the hamza seat follows the first mark of the suffix
        if suffix.startswith(araby.DAMMA):
            return stem[:-1] + araby.WAW_HAMZA
        if suffix.startswith(araby.KASRA):
            return stem[:-1] + araby.YEH_HAMZA
        return stem

    # yeh + hamza (with or without sukun) followed by fathatan
    if (stem.endswith((araby.YEH + araby.HAMZA,
                       araby.YEH + araby.SUKUN + araby.HAMZA))
            and suffix.startswith(araby.FATHATAN)):
        return stem[:-1] + araby.YEH_HAMZA
    return stem
def get_suffix_variants(word, suffix, enclitic):
    """
    Return the suffix form to attach to the word, with and without its
    last haraka.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: (suffix variant, suffix variant without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    bare_enclitic = araby.strip_tashkeel(enclitic)
    weak_ending = word[-1:] in (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF)
    # a haraka-only suffix is dropped after a weak final letter when no
    # enclitic follows
    if not bare_enclitic and weak_ending and araby.is_haraka(suffix):
        newsuffix = u""
    else:
        newsuffix = suffix
    # the tag lookup uses the suffix as given: the adapted form may not
    # exist in the table
    tags = ssconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
    if u'متحرك' in tags:
        return newsuffix, araby.strip_lastharaka(newsuffix)
    return newsuffix, newsuffix
def get_suffix_variants(word, suffix, enclitic):
    """
    Return the suffix form to attach to the word, with and without its
    last haraka.

    A teh marbuta inside the suffix is turned into teh when an enclitic
    follows.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: (suffix variant, suffix variant without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    bare_enclitic = araby.strip_tashkeel(enclitic)
    if araby.TEH_MARBUTA in suffix and len(bare_enclitic) > 0:
        newsuffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
    elif (not bare_enclitic
          and word[-1:] in (araby.YEH, araby.ALEF)
          and araby.is_haraka(suffix)):
        # haraka-only suffix after yeh/alef with no enclitic: drop it
        newsuffix = u""
    else:
        newsuffix = suffix
    # the tag lookup uses the suffix as given: the adapted form may not
    # exist in the table
    tags = snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
    if u'متحرك' in tags:
        return newsuffix, araby.strip_lastharaka(newsuffix)
    return newsuffix, newsuffix
def get_word_variant(word, suffix, encletic):
    """
    Return the form of the word to which suffix and enclitic can be
    attached.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first level).
    @type suffix: unicode.
    @param encletic: enclitic (second level).
    @type encletic: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    suffix_nm = araby.strip_tashkeel(suffix)
    encletic_nm = araby.strip_tashkeel(encletic)
    # the word's own final haraka never survives
    stem = araby.strip_lastharaka(word)

    if stem.endswith(araby.TEH_MARBUTA):
        # feminine noun in teh marbuta: removed before these suffixes
        removing = (araby.ALEF + araby.TEH,
                    araby.YEH + araby.TEH_MARBUTA,
                    araby.YEH,
                    araby.YEH + araby.ALEF + araby.TEH)
        if suffix_nm in removing:
            return stem[:-1]
        # otherwise opened into teh before anything that is attached,
        # e.g. مدرسة + ين = مدرستين
        if suffix_nm + encletic_nm != u"":
            return stem[:-1] + araby.TEH
        return stem

    if stem.endswith(araby.ALEF_MAKSURA):
        # a suffix with letters turns alef maksura into yeh,
        # e.g. مستوى + ان = مستويان
        if suffix_nm != u"":
            return stem[:-1] + araby.YEH
        # haraka-only suffix plus an attached pronoun: it becomes alef
        if encletic_nm != u"":
            return stem[:-1] + araby.ALEF
        return stem

    if stem.endswith(araby.KASRA + araby.YEH):
        # noun ending in yeh preceded by kasra: with no pronoun and a
        # haraka-only suffix the last two characters are removed
        if not (encletic_nm or suffix_nm):
            return stem[:-2]
        return stem

    if stem.endswith(araby.HAMZA) and suffix_nm != u"":
        # final hamza: its seat follows the first mark of the suffix
        if suffix.startswith(araby.DAMMA):
            return stem[:-1] + araby.WAW_HAMZA
        if suffix.startswith(araby.KASRA):
            return stem[:-1] + araby.YEH_HAMZA
        return stem

    ends_yeh_hamza = (stem.endswith(araby.YEH + araby.HAMZA)
                      or stem.endswith(
                          araby.YEH + araby.SUKUN + araby.HAMZA))
    if ends_yeh_hamza and suffix.startswith(araby.FATHATAN):
        return stem[:-1] + araby.YEH_HAMZA
    return stem
def get_word_variant(word, suffix, encletic):
    """
    Adapt the end of the word to the suffix and enclitic that follow it.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first level).
    @type suffix: unicode.
    @param encletic: enclitic (second level).
    @type encletic: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    bare_suffix = araby.strip_tashkeel(suffix)
    bare_enclitic = araby.strip_tashkeel(encletic)
    anything_attached = (bare_suffix + bare_enclitic) != u""
    # the word's own final haraka is always removed first
    base = araby.strip_lastharaka(word)
    result = base

    if base.endswith(araby.TEH_MARBUTA):
        # feminine noun in teh marbuta: removed before these suffixes,
        # otherwise opened into teh (مدرسة + ين = مدرستين)
        if bare_suffix in (araby.ALEF + araby.TEH,
                           araby.YEH + araby.TEH_MARBUTA,
                           araby.YEH,
                           araby.YEH + araby.ALEF + araby.TEH):
            result = base[:-1]
        elif anything_attached:
            result = base[:-1] + araby.TEH
    elif base.endswith(araby.ALEF_MAKSURA):
        # letters in the suffix: alef maksura -> yeh (مستوى + ان = مستويان)
        # haraka-only suffix with an attached pronoun: alef maksura -> alef
        if bare_suffix != u"":
            result = base[:-1] + araby.YEH
        elif bare_enclitic != u"":
            result = base[:-1] + araby.ALEF
    elif base.endswith(araby.KASRA + araby.YEH):
        # noun ending in yeh preceded by kasra: with no pronoun and a
        # haraka-only suffix the last two characters are removed
        if not bare_enclitic and not bare_suffix:
            result = base[:-2]
    elif base.endswith(araby.HAMZA) and bare_suffix != u"":
        # final hamza: its seat follows the first mark of the suffix
        if suffix.startswith(araby.DAMMA):
            result = base[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            result = base[:-1] + araby.YEH_HAMZA
    elif (base.endswith((araby.YEH + araby.HAMZA,
                         araby.YEH + araby.SUKUN + araby.HAMZA))
          and suffix.startswith(araby.FATHATAN)):
        result = base[:-1] + araby.YEH_HAMZA
    return result
def get_word_variant(word, suffix):
    """
    Return the form of the word to which the suffix can be attached.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    bare_suffix = araby.strip_tashkeel(suffix)
    stem = word
    if bare_suffix:
        if stem.endswith(araby.ALEF_MAKSURA):
            # alef maksura becomes yeh, as in إلى => إليك; the literal
            # below is special-cased and takes a plain alef instead
            if stem == u"سِوَى":
                ending = araby.ALEF
            else:
                ending = araby.YEH + araby.SUKUN
            stem = stem[:-1] + ending
        elif stem.endswith(araby.HAMZA):
            # the hamza seat depends on the first mark of the suffix
            if suffix.startswith(araby.DAMMA):
                stem = stem[:-1] + araby.WAW_HAMZA
            elif suffix.startswith(araby.KASRA):
                stem = stem[:-1] + araby.YEH_HAMZA

    # a suffix opening with a haraka replaces the word's final haraka
    if suffix and suffix[0] in araby.HARAKAT:
        stem = araby.strip_lastharaka(stem)

    # assimilation of noon and yeh, as in فيّ، إليّ، عنّا، منّا:
    # both tests look at the word as it was passed in
    noon_merge = (suffix.startswith(araby.NOON)
                  and word.endswith(araby.NOON + araby.SUKUN))
    yeh_merge = (suffix.startswith(araby.KASRA + araby.YEH)
                 and word.endswith(araby.YEH + araby.SUKUN))
    if noon_merge or yeh_merge:
        stem = araby.strip_lastharaka(stem)
    return stem
def get_word_variant(word, suffix):
    """
    Adapt the end of the word to the suffix that follows it.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix (first or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    has_letters = bool(araby.strip_tashkeel(suffix))
    result = word

    if has_letters and result.endswith(araby.ALEF_MAKSURA):
        # alef maksura turns into yeh, as in إلى => إليك; the literal
        # below is special-cased and takes a plain alef instead
        replacement = (araby.ALEF if result == u"سِوَى"
                       else araby.YEH + araby.SUKUN)
        result = result[:-1] + replacement
    elif has_letters and result.endswith(araby.HAMZA):
        # the hamza seat depends on the first mark of the suffix
        if suffix.startswith(araby.DAMMA):
            result = result[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            result = result[:-1] + araby.YEH_HAMZA

    # a suffix opening with a haraka replaces the word's final haraka
    if suffix and suffix[0] in araby.HARAKAT:
        result = araby.strip_lastharaka(result)

    # assimilation of noon and yeh, as in فيّ، إليّ، عنّا، منّا:
    # the tests look at the word as it was passed in
    if ((suffix.startswith(araby.NOON)
         and word.endswith(araby.NOON + araby.SUKUN))
            or (suffix.startswith(araby.KASRA + araby.YEH)
                and word.endswith(araby.YEH + araby.SUKUN))):
        result = araby.strip_lastharaka(result)
    return result
def test_strip(self):
    """Check each Araby strip_* helper against a known word."""
    vocalized = u"الْعَرَبِيّةُ"
    # strip_harakat(text): the expected value still carries the shadda
    assert Araby.strip_harakat(vocalized) == u'العربيّة'
    # strip_lastharaka(text): only the final mark is gone
    assert Araby.strip_lastharaka(vocalized) == u'الْعَرَبِيّة'
    # strip_tashkeel(text): every mark is gone
    assert Araby.strip_tashkeel(vocalized) == u'العربية'

    # strip_tatweel(text)
    stretched = u"العـــــربية"
    assert Araby.strip_tatweel(stretched) == u'العربية'

    # strip_shadda(text)
    doubled = u"الشّمسيّة"
    assert Araby.strip_shadda(doubled) == u'الشمسية'
def get_suffix_variants(word, suffix, enclitic, mankous=False):
    """
    Return the suffix form to attach to the word, with and without its
    last haraka.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix, used as given (not stripped
        from its marks here).
    @type enclitic: unicode.
    @param mankous: if the noun is mankous (ends with Yeh).
    @type mankous: boolean.
    @return: (suffix variant, suffix variant without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    newsuffix = suffix
    if ar.TEH_MARBUTA in suffix and enclitic:
        # teh marbuta is opened into teh when an enclitic follows
        newsuffix = re.sub(ar.TEH_MARBUTA, ar.TEH, suffix)
    elif not enclitic and ar.is_haraka(suffix):
        if word[-1:] in (ar.YEH, ar.ALEF):
            # haraka-only suffix after yeh/alef: nothing is written
            newsuffix = u""
        elif mankous:
            # mankous noun: the suffix mark is replaced by kasratan
            newsuffix = ar.KASRATAN

    # the tag lookup uses the suffix as given: the adapted form may not
    # exist in the table
    tags = SNC.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
    if u'متحرك' in tags:
        return newsuffix, ar.strip_lastharaka(newsuffix)
    return newsuffix, newsuffix
def compare(self, baseline, vocalized_output):
    """
    Compare a baseline line with its automatically vocalized result and
    update the evaluation counters held on self.

    @param baseline: the reference vocalized line.
    @type baseline: unicode.
    @param vocalized_output: either a list of dicts (the "chosen" and
        "semi" keys are read) or a vocalized string.
    @type vocalized_output: list or str.

    The process exits through sys.exit() when vocalized_output is
    neither a list nor a string.
    """
    # kept as in the original for anything reading these from myconsole
    myconsole.lineCorrect = 0
    myconsole.lineWLMIncorrect = 0
    # The per-line counters are incremented through self below, so they
    # must be reset on self as well; otherwise they keep accumulating
    # from one call to the next.
    self.lineCorrect = 0
    self.lineWLMIncorrect = 0

    inputlist = araby.tokenize(baseline)
    if isinstance(vocalized_output, list):
        outputlist = [x.get("chosen", '') for x in vocalized_output]
        outputlistsemi = [x.get("semi", '') for x in vocalized_output]
    elif isinstance(vocalized_output, str):
        outputlist = araby.tokenize(vocalized_output)
        outputlistsemi = [araby.strip_lastharaka(x) for x in outputlist]
    else:
        print("Incompatible vocalized output, must be list or string",
              type(vocalized_output), vocalized_output)
        sys.exit()

    self.total += len(inputlist)
    self.lineTotal = len(inputlist)
    if len(inputlist) != len(outputlist):
        # token counts differ: report both lines, score nothing
        print("lists haven't the same length")
        print(len(inputlist), len(outputlist))
        print(u"# ".join(inputlist).encode('utf8'))
        print(u"# ".join(outputlist).encode('utf8'))
    else:
        for inword, outword, outsemiword in zip(inputlist, outputlist,
                                                outputlistsemi):
            simi = araby.vocalized_similarity(inword, outword)
            if simi < 0:
                # a negative similarity is added, negated, to the
                # letter error count
                self.LettersError += -simi
                self.incorrect += 1
                # evaluation without last haraka
                simi2 = araby.vocalized_similarity(inword, outsemiword)
                if simi2 < 0:
                    self.WLMIncorrect += 1
                    self.lineWLMIncorrect += 1
            else:
                self.correct += 1
                self.lineCorrect += 1
    self.counter += 1
def preprocess(sentences, stopwords, isStopword = False):
    """
    Normalize and tokenize a sequence of Arabic sentences.

    Each sentence goes through, in this order: strip_harakat,
    strip_tashkeel, strip_lastharaka, strip_tatweel, strip_shadda,
    normalize_ligature, normalize_hamza and clean_str. The leading run
    matched by the regular expression below is then tokenized, the
    stop words are removed (unless isStopword is true) and '\\n' tokens
    are dropped.

    @param sentences: iterable of sentences.
    @param stopwords: stop words handed to remove_stopwords.
    @param isStopword: when true, stop words are not removed.
    @type isStopword: boolean.
    @return: one list of tokens per sentence that could be processed;
        sentences with no match, or whose tokenization fails, are
        skipped.
    @rtype: list of lists.
    """
    output = []
    for sentence in sentences:
        text = araby.strip_harakat(sentence)
        text = araby.strip_tashkeel(text)
        text = araby.strip_lastharaka(text)
        text = araby.strip_tatweel(text)
        text = araby.strip_shadda(text)
        text = araby.normalize_ligature(text)
        text = araby.normalize_hamza(text)
        text = clean_str(text)
        try:
            # NOTE(review): in this raw string the doubled backslashes
            # make the class exclude the literal characters
            # \ n s p { L a t i } rather than newline / whitespace /
            # Latin letters; kept as is because changing it would
            # change which text is kept -- confirm the intent.
            found = re.match(r'[^\\n\\s\\p{Latin}]+', text)
            if found is None:
                # nothing usable at the start of this sentence
                continue
            tokens = araby.tokenize(found.group())
            if not isStopword:
                tokens = remove_stopwords(stopwords, tokens)
        except Exception:
            # best effort: a sentence that cannot be processed is
            # skipped, but KeyboardInterrupt / SystemExit still
            # propagate
            continue
        output.append([t for t in tokens if t != '\n'])
    return output
def vocalize(self,verb, proclitic, enclitic):
    """
    Join the verb and its affixes, for every vocalized form of the
    proclitic and of the enclitic.

    @param verb: verb found in dictionary.
    @type verb: unicode.
    @param proclitic: first level prefix.
    @type proclitic: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: list of (vocalized word, semivocalized word) pairs; empty
        when either affix has no "vocalized" entry.
    @rtype: list of (unicode, unicode).
    """
    if enclitic:
        # the verb ending is adapted only when an enclitic follows;
        # the checks run in sequence on the updated verb
        # separating alef case
        if verb.endswith(ar.WAW + ar.ALEF):
            verb = verb[:-1]
        # same with a sukun on the waw, as in مشَوْا
        if verb.endswith(ar.WAW + ar.SUKUN + ar.ALEF):
            verb = verb[:-1]
        if verb.endswith(ar.ALEF_MAKSURA):
            verb = verb[:-1] + ar.ALEF
        # teh + damma + meem ending: damma + waw is placed after the
        # meem (replacing its sukun when there is one)
        if verb.endswith(ar.TEH + ar.DAMMA + ar.MEEM + ar.SUKUN):
            verb = verb[:-1] + ar.DAMMA + ar.WAW
        if verb.endswith(ar.TEH + ar.DAMMA + ar.MEEM):
            verb = verb + ar.DAMMA + ar.WAW

    forms = []
    prefix_forms = SVC.COMP_PREFIX_LIST_TAGS.get(
        proclitic, {}).get("vocalized", '')
    for prefix_form in prefix_forms:
        for raw_suffix in SVC.COMP_SUFFIX_LIST_TAGS.get(
                enclitic, {}).get("vocalized", ''):
            suffix_form = self.get_enclitic_variant(verb, raw_suffix)
            full_form = ''.join((prefix_form, verb, suffix_form))
            semi_form = ''.join(
                (prefix_form, ar.strip_lastharaka(verb), suffix_form))
            forms.append((full_form, semi_form))
    return forms
def get_suffix_variants(word, suffix, enclitic, mankous = False):
    """
    Return the suffix form to attach to the word, with and without its
    last haraka.

    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @param mankous: if the noun is mankous (ends with Yeh).
    @type mankous: boolean.
    @return: (suffix variant, suffix variant without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    bare_enclitic = araby.strip_tashkeel(enclitic)
    adapted = suffix
    if araby.TEH_MARBUTA in suffix and bare_enclitic:
        # teh marbuta is opened into teh when an enclitic follows
        adapted = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
    elif not bare_enclitic and araby.is_haraka(suffix):
        if word[-1:] in (araby.YEH, araby.ALEF):
            # haraka-only suffix after yeh/alef: nothing is written
            adapted = u""
        elif mankous:
            # mankous noun: the suffix mark is replaced by kasratan
            adapted = araby.KASRATAN

    # the tag lookup uses the suffix as given: the adapted form may not
    # exist in the table
    if u'متحرك' in snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
        without_irab = araby.strip_lastharaka(adapted)
    else:
        without_irab = adapted
    return adapted, without_irab