def getStemVariants(self, stem, prefix, suffix):
    """ Generate the Noun stem variants according to the affixes.
    For example مدرستي=>مدرست+ي => مدرسة +ي.
    Return a list of possible cases.
    @param stem: the input stem.
    @type stem: unicode.
    @param prefix: prefixe.
    @type prefix: unicode.
    @param suffix: suffixe.
    @type suffix: unicode.
    @return: list of stem variants.
    @rtype: list of unicode.
    """
    # start from the bare stem; corrected forms are added as variants
    variants = set([stem])
    # affixes are compared without diacritics
    prefix = araby.stripTashkeel(prefix)
    suffix = araby.stripTashkeel(suffix)
    # these suffixes may hide a final Teh Marbuta (مدرست+ي => مدرسة)
    if suffix in (araby.ALEF + araby.TEH,
                  araby.YEH + araby.TEH_MARBUTA,
                  araby.YEH,
                  araby.YEH + araby.ALEF + araby.TEH):
        variants.add(stem + araby.TEH_MARBUTA)
    # a bare stem or a dual/plural suffix may hide a final Yeh
    if suffix in ("", araby.YEH + araby.NOON, araby.WAW + araby.NOON):
        variants.add(stem + araby.YEH)
    # a stem ending with Yeh may originally end with Alef Maksura
    if stem.endswith(araby.YEH):
        variants.add(stem[:-1] + araby.ALEF_MAKSURA)
    # to be validated
    return variants
def check_normalized(self, word_vocalised, resulted_data):
    """ If the entered word is like the found word in the dictionary,
    filter normalized cases: the analyzer returns vocalized-like words;
    if the word is ذئب, the normalized form is ذءب, which can give
    from dictionary ذئب and ذؤب; this function filters the normalized
    results against the given word, and gives ذئب.
    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the founded resulat from dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    filtred_data = []
    # compare bare letters only (diacritics stripped)
    inputword = araby.stripTashkeel(word_vocalised)
    for item in resulted_data:
        # only items carrying a 'vocalized' attribute can be compared
        if "vocalized" not in item.__dict__:
            continue
        outputword = araby.stripTashkeel(item.__dict__["vocalized"])
        if outputword == inputword:
            filtred_data.append(item)
    return filtred_data
def check_normalized(self, word_vocalised, resulted_data): """ If the entred word is like the found word in dictionary, to treat some normalized cases, the analyzer return the vocalized like words; ُIf the word is ذئب, the normalized form is ذءب, which can give from dictionary ذئبـ ذؤب. this function filter normalized resulted word according the given word, and give ذئب. @param word_vocalised: the input word. @type word_vocalised: unicode. @param resulted_data: the founded resulat from dictionary. @type resulted_data: list of dict. @return: list of dictionaries of analyzed words with tags. @rtype: list. """ #print word_vocalised.encode('utf8'); filtred_data = [] inputword = araby.stripTashkeel(word_vocalised) for item in resulted_data: if 'vocalized' in item.__dict__: #.has_key('vocalized') : #~ if 'vocalized' in item : #~ outputword = araby.stripTashkeel(item['vocalized']) outputword = araby.stripTashkeel(item.__dict__['vocalized']) #print u'\t'.join([inputword, outputword]).encode('utf8'); if inputword == outputword: #item['tags']+=':a'; filtred_data.append(item) return filtred_data
def getStemVariants(self,stem,prefix,suffix): """ Generate the Noun stem variants according to the affixes. For example مدرستي=>مدرست+ي => مدرسة +ي. Return a list of possible cases. @param stem: the input stem. @type stem: unicode. @param prefix: prefixe. @type prefix: unicode. @param suffix: suffixe. @type suffix: unicode. @return: list of stem variants. @rtype: list of unicode. """ #some cases must have some correction #determinate the prefix and suffix types # create a list, the first item is the verb without changes prefix_possible_noun_list= set([stem]) # Prefix prefix=araby.stripTashkeel(prefix); suffix=araby.stripTashkeel(suffix); possible_noun_list=prefix_possible_noun_list; if suffix in (araby.ALEF+araby.TEH, araby.YEH+araby.TEH_MARBUTA,araby.YEH, araby.YEH+araby.ALEF+araby.TEH): possible_noun=stem+araby.TEH_MARBUTA; possible_noun_list.add(possible_noun) if suffix=="" or suffix==araby.YEH+araby.NOON or suffix==araby.WAW+araby.NOON: possible_noun=stem+araby.YEH; possible_noun_list.add(possible_noun) if stem.endswith(araby.YEH): possible_noun=stem[:-1]+araby.ALEF_MAKSURA; possible_noun_list.add(possible_noun) #to be validated validated_list=possible_noun_list; return validated_list
def isPossibleCollocation(self, list2, context="", lenght=2):
    """ Guess if the given list is a possible collocation.
    This is used to collect unknown collocations from user input;
    returns a rule code, 100 by default.
    @param list2: word list, 2 or more words.
    @type list2: list of unicode.
    @param context: the previous word, used as a clue.
    @type context: unicode.
    @param lenght: minimum number of words in the collocation.
    @type lenght: integer.
    @return: the rule of found collocation, 100 default.
    @rtype: integer.
    """
    if len(list2) < lenght:
        return 0
    else:
        itemV1 = list2[0]
        itemV2 = list2[1]
        # compare bare letters only (diacritics stripped)
        item1 = araby.stripTashkeel(itemV1)
        item2 = araby.stripTashkeel(itemV2)
        # non-word tokens (punctuation, symbols) cannot collocate
        if not collocation_const.token_pat.search(
                item1) or not collocation_const.token_pat.search(item2):
            return -1
        elif item1 in collocation_const.ADDITIONAL_WORDS:
            return 10
        elif item1 in collocation_const.NAMED_PRIOR:
            return 15
        elif (item2 not in collocation_const.SPECIAL_DEFINED):
            # both words start with the definite article ال
            if item2.startswith(u'ال') and item1.startswith(
                    u'ال'):
                return 20
            # feminine-ending word followed by a defined word
            elif item1.endswith(u'ة') and item2.startswith(u'ال'):
                return 30
            # words starting with lam of jar + definite article need no
            # rule: they are always majrour (genitive)
            elif item1.endswith(u'ة') and item2.endswith(u'ة'):
                return 40
            # known noun context followed by a defined word
            elif context != u"" and context in collocation_const.tab_noun_context and item2.startswith(
                    u'ال'):
                return 50
            # feminine plural followed by a defined word
            elif item1.endswith(u'ات') and item2.startswith(u'ال'):
                return 60
        return 100
def isPossibleCollocation(self, list2, context="", lenght=2):
    """ Guess if the given list is a possible collocation.
    This is used to collect unknown collocations from user input;
    returns a rule code, 100 by default.
    @param list2: word list, 2 or more words.
    @type list2: list of unicode.
    @param context: the previous word, used as a clue.
    @type context: unicode.
    @param lenght: minimum number of words in the collocation.
    @type lenght: integer.
    @return: the rule of found collocation, 100 default.
    @rtype: integer.
    """
    if len(list2) < lenght:
        return 0
    # compare bare letters only (diacritics stripped)
    item1 = araby.stripTashkeel(list2[0])
    item2 = araby.stripTashkeel(list2[1])
    # non-word tokens (punctuation, symbols) cannot collocate
    if (not collocation_const.token_pat.search(item1)
            or not collocation_const.token_pat.search(item2)):
        return -1
    if item1 in collocation_const.ADDITIONAL_WORDS:
        return 10
    if item1 in collocation_const.NAMED_PRIOR:
        return 15
    if item2 not in collocation_const.SPECIAL_DEFINED:
        # both words start with the definite article ال
        if item2.startswith(u"ال") and item1.startswith(u"ال"):
            return 20
        # feminine-ending word followed by a defined word
        if item1.endswith(u"ة") and item2.startswith(u"ال"):
            return 30
        # two feminine-ending words
        if item1.endswith(u"ة") and item2.endswith(u"ة"):
            return 40
        # known noun context followed by a defined word
        if (context != u"" and context in collocation_const.tab_noun_context
                and item2.startswith(u"ال")):
            return 50
        # feminine plural followed by a defined word
        if item1.endswith(u"ات") and item2.startswith(u"ال"):
            return 60
    return 100
def generate_possible_conjug(self, infinitive_verb, unstemed_verb, affix,
                             future_type=araby.FATHA, externPrefix="-",
                             externSuffix="-", transitive=True):
    """ Generate the conjugation cases of the given verb that match
    the unstemmed form with the given affix.
    @param infinitive_verb: verb in its infinitive (dictionary) form.
    @type infinitive_verb: unicode.
    @param unstemed_verb: the unvocalized word to match.
    @type unstemed_verb: unicode.
    @param affix: the conjugation affix (tashkeel already stripped).
    @type affix: unicode.
    @param future_type: haraka of the present-tense middle root letter.
    @param externPrefix: procletic attached before the verb.
    @param externSuffix: encletic attached after the verb.
    @param transitive: whether the verb is transitive.
    @return: list of dicts {'verb', 'tense', 'pronoun', 'vocalized',
    'unvocalized'} for every matching conjugation.
    @rtype: list of dict.
    """
    if infinitive_verb == "" or unstemed_verb == "" or affix == "":
        # empty list instead of set(): keeps the return type consistent
        # with the non-empty case
        return []
    list_correct_conj = []
    future_type = libqutrub.ar_verb.get_future_type_entree(future_type)
    vb = libqutrub.classverb.verbclass(infinitive_verb, transitive,
                                       future_type)
    # the Alef is not part of the prefix: it is only used to avoid
    # starting with a sukun-ed letter; imperative conjugation restores it
    if affix.startswith(araby.ALEF):
        affix = affix[1:]
    # dict.has_key() was removed in Python 3; use the 'in' operator.
    # (the unused pre-computation of the 'tenses' list was dropped)
    if affix in stem_verb_const.Table_affix:
        for tense, pronoun in stem_verb_const.Table_affix[affix]:
            if self.is_compatible_proaffix_tense(externPrefix, externSuffix,
                                                 tense, pronoun, transitive):
                conj_vocalized = vb.conjugateTenseForPronoun(tense, pronoun)
                # strip all marks and shadda before comparing
                conj_nm = araby.stripTashkeel(conj_vocalized)
                if conj_nm == unstemed_verb:
                    list_correct_conj.append({
                        'verb': infinitive_verb,
                        'tense': tense,
                        'pronoun': pronoun,
                        'vocalized': conj_vocalized,
                        'unvocalized': conj_nm,
                    })
    return list_correct_conj
def getSuffixVariant(self, word, suffix, enclitic):
    """ Get the suffix variant to be joined to the word.
    For example: word = مدرس, suffix=ة, encletic=ي.
    The suffix is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: variant of suffixes (vocalized suffix and vocalized
    suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    enclitic_nm=araby.stripTashkeel(enclitic)
    newSuffix =suffix; #default value
    # a Teh Marbuta in the suffix opens into Teh before an enclitic
    if suffix.find(araby.TEH_MARBUTA)>=0 and len (enclitic_nm)>0:
        newSuffix=re.sub(araby.TEH_MARBUTA, araby.TEH, suffix);
    # with no enclitic, a lone haraka suffix vanishes after a weak
    # final letter (Alef Maksura, Yeh, Alef)
    elif not enclitic_nm and word[-1:] in (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF) and araby.isHaraka(suffix):
        newSuffix=u"";
    # generate the suffix without I'rab short mark;
    # the lookup uses the GIVEN suffix because the new suffix may have
    # been changed and be absent from the table
    # NOTE(review): raises KeyError if 'suffix' is not in
    # CONJ_SUFFIX_LIST_TAGS — presumably callers only pass table keys;
    # confirm against call sites.
    if u'متحرك' in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
        suffixNonIrabMark =araby.stripLastHaraka(newSuffix);
    else:
        suffixNonIrabMark = newSuffix
    return newSuffix, suffixNonIrabMark ;
def getWordVariant(self, word, suffix):
    """ Get the word variant to be joined to the suffix.
    For example: word = مدرسة, suffix=ي. The word is converted to مدرست.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix ( firts or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    word_stem=word;
    # compare the suffix without diacritics
    suffix_nm=araby.stripTashkeel(suffix)
    # if the word ends with a haraka, strip it first
    word_stem=araby.stripLastHaraka(word_stem);
    # a final Teh Marbuta is dropped before these suffixes
    if word_stem.endswith(araby.TEH_MARBUTA) and suffix_nm in (araby.ALEF+araby.TEH, araby.YEH+araby.TEH_MARBUTA, araby.YEH, araby.YEH+araby.ALEF+araby.TEH):
        word_stem=word_stem[:-1];
    # otherwise a final Teh Marbuta opens into Teh before any suffix
    elif word_stem.endswith(araby.TEH_MARBUTA) and suffix_nm!=u"":
        word_stem=word_stem[:-1]+araby.TEH;
    # a final Alef Maksura becomes Yeh before a suffix
    elif word_stem.endswith(araby.ALEF_MAKSURA) and suffix_nm!=u"":
        word_stem = word_stem[:-1]+araby.YEH;
    # a final Hamza takes its seat from the first haraka of the suffix
    elif word_stem.endswith(araby.HAMZA) and suffix_nm!=u"":
        if suffix.startswith(araby.DAMMA):
            word_stem = word_stem[:-1] + araby.WAW_HAMZA;
        elif suffix.startswith(araby.KASRA):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA;
    return word_stem;
def getSuffixVariant(self, word, suffix, enclitic):
    """ Get the suffix variant to be joined to the word.
    For example: word = مدرس, suffix=ة, encletic=ي.
    The suffix is converted to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: variant of suffixes (vocalized suffix and vocalized
    suffix without I'rab short mark).
    @rtype: (unicode, unicode)
    """
    enclitic_nm = araby.stripTashkeel(enclitic)
    # default: the suffix is kept unchanged
    newSuffix = suffix
    if araby.TEH_MARBUTA in suffix and enclitic_nm:
        # a Teh Marbuta in the suffix opens into Teh before an enclitic
        newSuffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
    elif (not enclitic_nm and araby.isHaraka(suffix)
          and word[-1:] in (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF)):
        # a lone haraka suffix vanishes after a weak final letter
        newSuffix = u""
    # generate the suffix without I'rab short mark; the lookup uses the
    # GIVEN suffix because the new one may be absent from the table
    if u'متحرك' in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']:
        suffixNonIrabMark = araby.stripLastHaraka(newSuffix)
    else:
        suffixNonIrabMark = newSuffix
    return newSuffix, suffixNonIrabMark
def Comparetashkeel(text):
    """ Compare a correctly vocalized text with the output of the
    automatic vocalizer.
    @param text: correctly vocalized input text.
    @type text: unicode.
    @return: [vocalized_text, correct rate, incorrect rate, total words].
    @rtype: list.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    # the entered text is assumed to be vocalized correctly
    correct_text = text
    text = araby.stripTashkeel(text)
    vocalizer = ArabicVocalizer.TashkeelClass()
    vocalized_text = vocalizer.tashkeel(text)
    # compare vocalized text with the correct text
    text1 = correct_text
    text2 = vocalized_text
    # remove collocation symbols
    text2 = text2.replace("'", "")
    text2 = text2.replace("~", "")
    list1 = vocalizer.analyzer.tokenize(text1)
    list2 = vocalizer.analyzer.tokenize(text2)
    print(u":".join(list1).encode('utf8'))
    print(u":".join(list2).encode('utf8'))
    correct = 0
    incorrect = 0
    total = len(list1)
    # guard: an empty input made the rate computation divide by zero
    if total == 0:
        return [vocalized_text, "correct:0.00%", "incorrect:0.00%", 0]
    if len(list1) != len(list2):
        print("lists haven't the same length")
    else:
        for i in range(total):
            if araby.vocalizedlike(list1[i], list2[i]):
                correct += 1
            else:
                incorrect += 1
    result = [vocalized_text,
              "correct:%0.2f%%" % round(correct * 100.00 / total, 2),
              "incorrect:%0.2f%%" % round(incorrect * 100.00 / total, 2),
              total]
    return result
def generate_possible_conjug(self, infinitive_verb, unstemed_verb, affix,
                             future_type=araby.FATHA, externPrefix="-",
                             externSuffix="-", transitive=True):
    """ Generate the conjugation cases of the given verb that match
    the unstemmed form with the given affix.
    @param infinitive_verb: verb in its infinitive (dictionary) form.
    @type infinitive_verb: unicode.
    @param unstemed_verb: the unvocalized word to match.
    @type unstemed_verb: unicode.
    @param affix: the conjugation affix (tashkeel already stripped).
    @type affix: unicode.
    @param future_type: haraka of the present-tense middle root letter.
    @param externPrefix: procletic attached before the verb.
    @param externSuffix: encletic attached after the verb.
    @param transitive: whether the verb is transitive.
    @return: list of dicts {'verb', 'tense', 'pronoun', 'vocalized',
    'unvocalized'} for every matching conjugation.
    @rtype: list of dict.
    """
    if infinitive_verb == "" or unstemed_verb == "" or affix == "":
        # empty list instead of set(): keeps the return type consistent
        # with the non-empty case
        return []
    list_correct_conj = []
    future_type = ar_verb.get_future_type_entree(future_type)
    vb = classverb.verbclass(infinitive_verb, transitive, future_type)
    # the Alef is not part of the prefix: it is only used to avoid
    # starting with a sukun-ed letter; imperative conjugation restores it
    if affix.startswith(araby.ALEF):
        affix = affix[1:]
    # the unused pre-computation of the 'tenses' list was dropped;
    # .get keeps the unknown-affix behavior (no iterations)
    for tense, pronoun in stem_verb_const.Table_affix.get(affix, []):
        if self.is_compatible_proaffix_tense(externPrefix, externSuffix,
                                             tense, pronoun, transitive):
            conj_vocalized = vb.conjugateTenseForPronoun(tense, pronoun)
            # strip all marks and shadda before comparing
            conj_nm = araby.stripTashkeel(conj_vocalized)
            if conj_nm == unstemed_verb:
                list_correct_conj.append({
                    'verb': infinitive_verb,
                    'tense': tense,
                    'pronoun': pronoun,
                    'vocalized': conj_vocalized,
                    'unvocalized': conj_nm,
                })
    return list_correct_conj
def getWordVariant(self, word, suffix):
    """ Get the word variant to be joined to the suffix.
    For example: word = مدرسة, suffix=ي. The word is converted to مدرست.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: suffix ( firts or second level).
    @type suffix: unicode.
    @return: variant of word.
    @rtype: unicode.
    """
    word_stem = word
    # compare the suffix without diacritics
    suffix_nm = araby.stripTashkeel(suffix)
    # drop a trailing haraka before examining the final letter
    if word_stem[-1:] in araby.HARAKAT:
        word_stem = word_stem[:-1]
    last = word_stem[-1:]
    if last == araby.TEH_MARBUTA:
        if suffix_nm in (araby.ALEF + araby.TEH,
                         araby.YEH + araby.TEH_MARBUTA,
                         araby.YEH,
                         araby.YEH + araby.ALEF + araby.TEH):
            # a final Teh Marbuta is dropped before these suffixes
            word_stem = word_stem[:-1]
        elif suffix_nm != u"":
            # otherwise it opens into Teh before any suffix
            word_stem = word_stem[:-1] + araby.TEH
    elif last == araby.ALEF_MAKSURA and suffix_nm != u"":
        # a final Alef Maksura becomes Yeh before a suffix
        word_stem = word_stem[:-1] + araby.YEH
    elif last == araby.HAMZA and suffix_nm != u"":
        # a final Hamza takes its seat from the suffix's first haraka
        if suffix.startswith(araby.DAMMA):
            word_stem = word_stem[:-1] + araby.WAW_HAMZA
        elif suffix.startswith(araby.KASRA):
            word_stem = word_stem[:-1] + araby.YEH_HAMZA
    return word_stem
def check_word(self,word, guessedTag=""):
    """ Analyze one word morphologically: collect all possible analyses
    as punctuation, stop word, verb, noun, or unknown.
    @param word: the input word.
    @type word: unicode.
    @param guessedTag: guessed part-of-speech tag used to restrict
    the verb/noun analysis (optional).
    @type guessedTag: unicode.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    word=araby.stripTatweel(word);
    word_vocalised=word;
    word_nm=araby.stripTashkeel(word);
    resulted_text=u"";
    resulted_data=[];
    # if word is a pounctuation
    resulted_data+=self.check_word_as_pounct(word_nm);
    # Done: if the word is a stop word we have some problems,
    # the stop word can also be another normal word (verb or noun),
    # we must consider it in future works
    # if word is stopword allow stop words analysis
    resulted_data+=self.check_word_as_stopword(word_nm);
    # if word is verb
    # problem: some excluded words can be considered verbs or nouns
    if self.tagger.hasVerbTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
        resulted_data+=self.check_word_as_verb(word_nm);
    # if word is noun
    if self.tagger.hasNounTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
        resulted_data+=self.check_word_as_noun(word_nm);
    if len(resulted_data)==0:
        # check the word as unknown
        resulted_data+=self.check_word_as_unknown(word_nm);
    # check if the word is normalized and solutions are equivalent
    resulted_data = self.check_normalized(word_vocalised, resulted_data)
    # check if the word is shadda like
    resulted_data = self.check_shadda(word_vocalised, resulted_data)
    # check if the word is vocalized like results
    if self.partial_vocalization_support:
        resulted_data=self.check_partial_vocalized(word_vocalised, resulted_data);
    # add word frequency information in tags
    resulted_data = self.addWordFrequency(resulted_data);
    # fall back to a single 'unknown' case when nothing matched
    if len(resulted_data)==0:
        resulted_data.append(wordCase.wordCase({
            'word':word,
            'affix': ('' , '', '', ''),
            'stem':'',
            'original':word,
            'vocalized':word,
            'tags':u'',
            'type':'unknown',
            'root':'',
            'template':'',
            'freq':self.wordfreq.getFreq(word, 'unknown'),
            'syntax':'',
        }) );
    return resulted_data;
def setVocalized(self, newvocalized):
    """ Set the vocalized word and refresh the cached unvocalized form.
    @param newvocalized: the new given vocalized.
    @type newvocalized: unicode string
    """
    self.vocalized = newvocalized
    # keep the stripped form in sync with the vocalized one
    self.unvocalized = araby.stripTashkeel(newvocalized)
def setVocalized(self,newvocalized):
    """ Set the vocalized word and refresh the cached unvocalized form.
    @param newvocalized: the new given vocalized.
    @type newvocalized: unicode string
    """
    self.vocalized = newvocalized;
    # keep the stripped form in sync with the vocalized one
    self.unvocalized = araby.stripTashkeel(newvocalized);
def vocalizeNamed(wordlist, synTags=""):
    """ Vocalize a named-entity word list (proper-name chains with بن).
    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param synTags: tags about the clause
    @type synTags: unicode
    @return: the vocalized wordlist.
    @rtype: unicode
    """
    newlist = []
    # the clause tags decide the case (i'rab) of the chain;
    # unused locals 'prefix' and 'next' (shadowing the builtin) removed
    tags = synTags
    bin_count = 0
    for i, word in enumerate(wordlist):
        # keep the original word with its possible harakat if they exist
        word_nm = araby.stripTashkeel(word)
        # the first word fixes the case of the whole chain
        if i == 0 and word_nm:
            # word to get majrour tag
            if word_nm in (u'أبي', u'بنو', u'آل', u'ابن'):
                tags += u"مجرور"
            elif word_nm in (u'أبو', ):
                tags += u"مرفوع"
            elif word_nm in (u'أبا', ):
                tags += u"منصوب"
        # select vocalization
        if word_nm == u'بن':
            bin_count += 1
            # only the first بن takes the case of the clause
            if bin_count == 1:
                if u'مجرور' in tags:
                    voc = u'بْنِ'
                elif u'مرفوع' in tags:
                    voc = u'بْنُ'
                elif u'منصوب' in tags:
                    voc = u'بْنَ'
                else:
                    voc = u'بْن'
            else:
                # subsequent بن are always majrour (genitive)
                voc = u'بْنِ'
        # Todo: vocalize other name words
        else:
            voc = word
        newlist.append(voc)
    return newlist
def add(self, word, suggestList):
    """ Associate a suggestion list with the unvocalized form of a word.
    Non-list or empty suggestions and empty words are ignored.
    @param word: the input word.
    @type word: unicode.
    @param suggestList: suggestions for the word.
    @type suggestList: list.
    """
    # isinstance is the idiomatic (and subclass-aware) type test,
    # instead of comparing type(...).__name__ to a string
    if word != u"" and suggestList and isinstance(suggestList, list):
        # NB: merging with previously stored suggestions was dropped
        # because it is too costly for frequent words
        self.dict[araby.stripTashkeel(word)] = suggestList
def add(self, word, suggestList):
    """ Associate a suggestion list with the unvocalized form of a word.
    Non-list or empty suggestions and empty words are ignored.
    @param word: the input word.
    @type word: unicode.
    @param suggestList: suggestions for the word.
    @type suggestList: list.
    """
    # isinstance replaces the fragile type(...).__name__ string compare
    if word != u"" and suggestList and isinstance(suggestList, list):
        # NB: merging with previously stored suggestions was dropped
        # because it is too costly for frequent words
        self.dict[araby.stripTashkeel(word)] = suggestList
def vocalizeNamed(wordlist, synTags=""):
    """ Vocalize a named-entity word list (proper-name chains with بن).
    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param synTags: tags about the clause
    @type synTags: unicode
    @return: the vocalized wordlist.
    @rtype: unicode
    """
    newlist=[];
    prefix=u"";  # NOTE(review): unused local
    next=u"";    # NOTE(review): unused local, shadows the builtin
    # detect tags: the clause tags decide the case (i'rab) of the chain
    tags= synTags;
    bin_count=0;
    for i in range(len(wordlist)):
        # save the original word with possible harakat if they exist
        word=wordlist[i];
        word_nm=araby.stripTashkeel(word);
        # the first word fixes the case of the whole chain
        if i==0 and word_nm:
            # word to get majrour tag
            if word_nm in (u'أبي', u'بنو', u'آل', u'ابن',):
                tags +=u"مجرور";
            elif word_nm in (u'أبو', ):
                tags +=u"مرفوع";
            elif word_nm in (u'أبا', ):
                tags +=u"منصوب";
        # select vocalization
        if word_nm==u'بن':
            bin_count+=1;
            # only the first بن takes the case of the clause
            if bin_count==1:
                if u'مجرور' in tags:
                    voc=u'بْنِ'
                elif u'مرفوع' in tags:
                    voc=u'بْنُ'
                elif u'منصوب' in tags:
                    voc=u'بْنَ'
                else:
                    voc=u'بْن'
            else:
                # subsequent بن are always majrour (genitive)
                voc=u'بْنِ'
        # Todo: vocalize other name words
        else:
            voc=word;
        newlist.append(voc);
    return newlist;
def getUnvOriginal(self, ):
    """ Get the unvocalized original form of the input word
    @return: the given unvocalized original.
    @rtype: unicode string
    """
    # compute lazily from the original form and cache on first access
    if not self.unvoriginal:
        if not self.original:
            return u""
        self.unvoriginal = araby.stripTashkeel(self.original)
    return self.unvoriginal
def getUnvocalized(self,):
    """ Get the unvocalized form of the input word
    @return: the given unvocalized.
    @rtype: unicode string
    """
    # return the cached form when available
    if self.unvocalized:
        return self.unvocalized;
    else:
        # compute lazily from the vocalized form and cache it
        if self.vocalized:
            self.unvocalized=araby.stripTashkeel(self.vocalized);
        else :
            return u"";
    return self.unvocalized;
def check_normalized(self, word_vocalised, resulted_data):
    """ If the entered word is like the found word in the dictionary,
    filter normalized cases: the analyzer returns vocalized-like words;
    if the word is ذئب, the normalized form is ذءب, which can give
    from dictionary ذئب and ذؤب; this function filters the normalized
    results against the given word, and gives ذئب.
    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the founded resulat from dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    filtred_data = []
    # compare bare letters only (diacritics stripped)
    inputword = araby.stripTashkeel(word_vocalised)
    for item in resulted_data:
        # the None default avoids an AttributeError on items without a
        # 'vocalized' attribute (sibling variants of this filter test
        # membership in item.__dict__ before reading)
        vocalized = getattr(item, 'vocalized', None)
        if vocalized:
            outputword = araby.stripTashkeel(vocalized)
            if inputword == outputword:
                filtred_data.append(item)
    return filtred_data
def getUnvOriginal(self,):
    """ Get the unvocalized original form of the input word
    @return: the given unvocalized original.
    @rtype: unicode string
    """
    # return the cached form when available
    if self.unvoriginal:
        return self.unvoriginal;
    else :
        # compute lazily from the original form and cache it
        if self.original:
            self.unvoriginal = araby.stripTashkeel(self.original);
        else:
            return u"";
    return self.unvoriginal;
def getUnvocalized(self, ):
    """ Get the unvocalized form of the input word
    @return: the given unvocalized.
    @rtype: unicode string
    """
    # compute lazily from the vocalized form and cache on first access
    if not self.unvocalized:
        if not self.vocalized:
            return u""
        self.unvocalized = araby.stripTashkeel(self.vocalized)
    return self.unvocalized
def check_normalized(self, word_vocalised, resulted_data):
    """ If the entered word is like the found word in the dictionary,
    filter normalized cases: the analyzer returns vocalized-like words;
    if the word is ذئب, the normalized form is ذءب, which can give
    from dictionary ذئب and ذؤب; this function filters the normalized
    results against the given word, and gives ذئب.
    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the founded resulat from dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    filtred_data = []
    # compare bare letters only (diacritics stripped)
    inputword = araby.stripTashkeel(word_vocalised)
    for item in resulted_data:
        # the None default avoids an AttributeError on items without a
        # 'vocalized' attribute
        vocalized = getattr(item, 'vocalized', None)
        if vocalized:
            outputword = araby.stripTashkeel(vocalized)
            if inputword == outputword:
                filtred_data.append(item)
    return filtred_data
def create_index_broken_plural(self):
    """Deprecated: create index from the broken_plural dictionary
    to accelerate the search in the dictionary for broken_plural.
    Maps each hamza-normalized unvocalized noun to the list of its
    vocalized forms.
    """
    for vocnoun in BrokenPluralTable.keys():
        unvnoun = araby.stripTashkeel(vocnoun)
        normnoun = normalize_hamza(unvnoun)
        # dict.has_key() was removed in Python 3; setdefault covers
        # both the first-seen and the already-indexed cases
        self.BROKENPLURAL_DICTIONARY_INDEX.setdefault(
            normnoun, []).append(vocnoun)
def create_index_broken_plural(self):
    """Deprecated: create index from the broken_plural dictionary
    to accelerate the search in the dictionary for broken_plural.
    Maps each hamza-normalized unvocalized noun to the list of its
    vocalized forms.
    """
    for vocnoun in BrokenPluralTable.keys():
        unvnoun = araby.stripTashkeel(vocnoun)
        normnoun = normalize_hamza(unvnoun)
        # dict.has_key() was removed in Python 3; use the 'in' operator
        if normnoun in self.BROKENPLURAL_DICTIONARY_INDEX:
            self.BROKENPLURAL_DICTIONARY_INDEX[normnoun].append(vocnoun)
        else:
            self.BROKENPLURAL_DICTIONARY_INDEX[normnoun] = [vocnoun]
def getPreviousTag(word):
    """Get the case mark implied by the given (previous) word.
    @param word: given word
    @type word: unicode
    @return: word tag (منصوب / مجرور / مرفوع or empty)
    @rtype: unicode
    """
    # compare without diacritics; the unused 'tags' accumulator
    # was removed — the function only maps the word to one case mark
    word = araby.stripTashkeel(word)
    if word in named_const.NOUN_NASEB_LIST:
        return u'منصوب'
    elif word in named_const.JAR_LIST:
        return u'مجرور'
    elif word in named_const.RAFE3_LIST:
        return u'مرفوع'
    else:
        return u''
def treatLine(line, action):
    """ Treat one line at once with the given action.
    @param line: the input line.
    @param action: either "extract" or "reduce".
    """
    global globalFreq
    if action == "extract":
        words = araby.tokenize(line)
        for word in words:
            extract(word)
    elif action == "reduce":
        line = line.strip(' ')
        fields = line.split(' ')
        # expected format: "<freq> <word>"
        if len(fields) >= 2:
            freq = fields[0]
            word = fields[1]
            word_nm = araby.stripTashkeel(word)
            # dict.has_key() was removed in Python 3; use 'in' instead
            if word_nm in WordsTab:
                # the word has multiple vocalizations: mark it ambiguous
                WordsTab[word_nm] = False
            else:
                WordsTab[word_nm] = {'f': freq, 'v': word}
            globalFreq += stringToInt(freq)
def getSuffixVariant(self,word, suffix,enclitic):
    """ Get the suffix variant to be joined to the word.
    For example: word = مدرس, suffix=ة, encletic=ي.
    The suffix is convert to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: variant of suffix.
    @rtype: unicode.
    """
    enclitic_nm=araby.stripTashkeel(enclitic)
    # a Teh Marbuta in the suffix opens into Teh before an enclitic
    if suffix.find(araby.TEH_MARBUTA)>=0 and len (enclitic_nm)>0:
        suffix=re.sub(araby.TEH_MARBUTA,araby.TEH,suffix);
    # with no enclitic, a lone haraka suffix vanishes after a weak
    # final letter (Alef Maksura, Yeh, Alef)
    if enclitic_nm==u"" and word[-1:] in (araby.ALEF_MAKSURA, araby.YEH,araby.ALEF) and suffix in araby.HARAKAT :
        suffix=u"";
    return suffix;
def getSuffixVariant(self, word, suffix, enclitic):
    """ Get the suffix variant to be joined to the word.
    For example: word = مدرس, suffix=ة, encletic=ي.
    The suffix is convert to Teh.
    @param word: word found in dictionary.
    @type word: unicode.
    @param suffix: second level suffix.
    @type suffix: unicode.
    @param enclitic: first level suffix.
    @type enclitic: unicode.
    @return: variant of suffix.
    @rtype: unicode.
    """
    enclitic_nm = araby.stripTashkeel(enclitic)
    # a Teh Marbuta in the suffix opens into Teh before an enclitic
    if araby.TEH_MARBUTA in suffix and enclitic_nm:
        suffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
    # with no enclitic, a lone haraka suffix vanishes after a weak
    # final letter (Alef Maksura, Yeh, Alef)
    if (enclitic_nm == u"" and suffix in araby.HARAKAT
            and word[-1:] in (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF)):
        suffix = u""
    return suffix
def generateSuggest(self, word):
    """ Generate word suggestion
    @param word: input text.
    @type word: unicode.
    @return: generated suggestion.
    rtype: list of words.
    """
    suggestions = [word, araby.stripTashkeel(word)]
    # every single-edit candidate is kept (the acceptance test on
    # candidates is deliberately disabled)
    for candidate in self.edits1(word):
        suggestions.append(candidate)
    # common letter-confusion replacements
    for old, new in spellcheck_const.TabReplacment:
        sug = word.replace(old, new)
        if sug != word:
            # evaluate the generated suggestion before keeping it
            if self.accepted(sug):
                suggestions.append(sug)
    # deduplicate
    return list(set(suggestions))
def verbStamp(self, word):
    """ generate a stamp for a verb;
    the verb stamp differs from the word stamp by hamza normalization;
    it removes all letters which can change form in the word:
    ALEF, YEH, WAW, ALEF_MAKSURA, SHADDA
    @param word: the input verb.
    @type word: unicode.
    @return: stamped word
    @rtype: unicode.
    """
    # vowels are stripped by the stamp function
    stamped = araby.normalizeHamza(araby.stripTashkeel(word))
    # drop a leading hamza
    if stamped.startswith(araby.HAMZA):
        stamped = stamped[1:]
    # drop the last letter when it is doubled
    if stamped[-1:] == stamped[-2:-1]:
        stamped = stamped[:-1]
    return self.VerbSTAMP_pat.sub('', stamped)
def verbStamp(self, word):
    """ generate a stamp for a verb;
    the verb stamp differs from the word stamp by hamza normalization;
    it removes all letters which can change form in the word:
    ALEF, YEH, WAW, ALEF_MAKSURA, SHADDA
    @param word: the input verb.
    @type word: unicode.
    @return: stamped word
    @rtype: unicode.
    """
    word=araby.stripTashkeel(word);
    # the vowels are striped in stamp function
    word=araby.normalizeHamza(word);
    if word.startswith(araby.HAMZA):
        # strip the first hamza
        word=word[1:];
    # strip the last letter if it is doubled
    if word[-1:]== word[-2:-1]:
        word=word[:-1];
    return self.VerbSTAMP_pat.sub('', word)
def treatLine(line, action):
    """ Treat one line at once with the given action.
    @param line: the input line.
    @param action: either "extract" or "reduce".
    """
    global globalFreq
    if action == "extract":
        for word in araby.tokenize(line):
            extract(word)
    elif action == "reduce":
        # expected format: "<freq> <word>"
        fields = line.strip(' ').split(' ')
        if len(fields) >= 2:
            freq = fields[0]
            word = fields[1]
            word_nm = araby.stripTashkeel(word)
            # dict.has_key() was removed in Python 3; use 'in' instead
            if word_nm in WordsTab:
                # the word has multiple vocalizations: mark it ambiguous
                WordsTab[word_nm] = False
            else:
                WordsTab[word_nm] = {'f': freq, 'v': word}
            globalFreq += stringToInt(freq)
def segment(self, word):
    """ generate a list of all posibble segmentation positions
    (left, right) of the treated word by the stemmer.
    Example:
        >>> ArListem=ArabicLightStemmer();
        >>> word=u'فتضربين'
        >>> print ArListem.segment(word);
        set(([(1, 5), (2, 5), (0, 7)])
    @return: List of segmentation
    @rtype: set of tuple of integer.
    """
    self.word = word
    self.unvocalized = araby.stripTashkeel(word)
    # normalize Alef Madda into Hamza + Alef before affix lookup
    word = re.sub("[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF,
                  word)
    # all candidate prefix ends and suffix starts
    lefts = self.lookup_prefixes(word)
    rights = self.lookup_suffixes(word)
    # furthest prefix end / nearest suffix start, -1 when none found
    self.left = max(lefts) if lefts else -1
    self.right = min(rights) if rights else -1
    # the whole word is always a valid segmentation
    self.segment_list = set([(0, len(word))])
    # keep only segments leaving a stem of at least two letters
    for start in lefts:
        for end in rights:
            if end >= start + 2:
                self.segment_list.add((start, end))
    return self.segment_list
def Comparetashkeel(text):
    """ Compare a correctly vocalized text with the output of the
    automatic vocalizer.
    @param text: correctly vocalized input text.
    @type text: unicode.
    @return: [vocalized_text, correct rate, incorrect rate, total words].
    @rtype: list.
    """
    import tashkeel.tashkeel as ArabicVocalizer
    # the entered text is assumed to be vocalized correctly
    correct_text = text
    text = araby.stripTashkeel(text)
    vocalizer = ArabicVocalizer.TashkeelClass()
    vocalized_text = vocalizer.tashkeel(text)
    # compare vocalized text with the correct text
    text1 = correct_text
    text2 = vocalized_text
    # remove collocation symbols
    text2 = text2.replace("'", "")
    text2 = text2.replace("~", "")
    list1 = vocalizer.analyzer.tokenize(text1)
    list2 = vocalizer.analyzer.tokenize(text2)
    print(u":".join(list1).encode('utf8'))
    print(u":".join(list2).encode('utf8'))
    correct = 0
    incorrect = 0
    total = len(list1)
    # guard: an empty input made the rate computation divide by zero
    if total == 0:
        return [vocalized_text, "correct:0.00%", "incorrect:0.00%", 0]
    if len(list1) != len(list2):
        print("lists haven't the same length")
    else:
        for i in range(total):
            if araby.vocalizedlike(list1[i], list2[i]):
                correct += 1
            else:
                incorrect += 1
    result = [
        vocalized_text,
        "correct:%0.2f%%" % round(correct * 100.00 / total, 2),
        "incorrect:%0.2f%%" % round(incorrect * 100.00 / total, 2),
        total
    ]
    return result
def segment(self,word):
    """
    Generate the set of all possible segmentation positions (left, right)
    of the treated word, as found by the stemmer.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'فتضربين'
        >>> print ArListem.segment(word)
        set([(1, 5), (2, 5), (0, 7)])

    @return: set of segmentation positions.
    @rtype: set of tuple of integer.
    """
    self.word=word;
    self.unvocalized=araby.stripTashkeel(word);
    # word, harakat=araby.separate(word);
    # normalize ALEF MADDA into HAMZA + ALEF before affix lookup
    word=re.sub("[%s]"%(araby.ALEF_MADDA),araby.HAMZA+araby.ALEF,word)
    # word=re.sub("[^%s%s]"%(self.prefix_letters,self.suffix_letters),self.joker,word);
    # get all left positions of prefixes
    lefts=self.lookup_prefixes(word);
    # get all right positions of suffixes
    rights=self.lookup_suffixes(word);
    # greatest prefix end / smallest suffix start, or -1 when none found
    if lefts:
        self.left=max(lefts)
    else:
        self.left = -1
    if rights:
        self.right=min(rights)
    else:
        self.right = -1;
    ln=len(word)
    # the whole word (no affixes) is always a candidate segmentation
    self.segment_list=set([(0,ln)]);
    # print lefts, rights
    # keep only (left, right) pairs that leave a stem of length >= 2
    for i in lefts:
        for j in rights:
            if j>=i+2:
                self.segment_list.add((i,j));
    return self.segment_list;
def DoAction(text, action, options=None):
    """
    Dispatch *text* to the treatment function named by *action*.

    @param text: the input text.
    @type text: unicode.
    @param action: the name of the action to run (e.g. "TashkeelText").
    @type action: str.
    @param options: optional extra parameters (e.g. 'lastmark').
    @type options: dict or None.
    @return: the treated text; the input text unchanged for "DoNothing",
        "Contibute" and unknown actions.
    @rtype: unicode.
    """
    # avoid the shared mutable default-argument pitfall (was options={})
    if options is None:
        options = {}
    if action == "DoNothing":
        return text
    elif action == "TashkeelText":
        lastmark = options.get('lastmark', "0")
        return tashkeelText(text, lastmark)
    elif action == "Tashkeel2":
        lastmark = options.get('lastmark', "0")
        return tashkeel2(text, lastmark)
    elif action == "SpellCheck":
        return spellcheck(text)
    elif action == "CompareTashkeel":
        return Comparetashkeel(text)
    elif action == "ReduceTashkeel":
        return reducedTashkeelText(text)
    # was a bare `if`, breaking the elif chain; harmless only because every
    # earlier branch returns — fixed for consistency
    elif action == "Contibute":
        return text
    elif action == "StripHarakat":
        return araby.stripTashkeel(text)
    elif action == "CsvToData":
        return csv_to_python_table(text)
    elif action == "Romanize":
        return romanize(text)
    elif action == "NumberToLetters":
        return numberToLetters(text)
    elif action == "LightStemmer":
        lastmark = options.get('lastmark', "0")
        return fullStemmer(text, lastmark)
    elif action == "Tokenize":
        return token_text(text)
    elif action == "Poetry":
        return justify_poetry(text)
    elif action == "Unshape":
        import pyarabic.unshape
        return pyarabic.unshape.unshaping_text(text)
    elif action == "Affixate":
        return affixate(text)
    elif action == "Normalize":
        return normalize(text)
    elif action == "Wordtag":
        return wordtag(text)
    elif action == "Inverse":
        return inverse(text)
    elif action == "Itemize":
        return itemize(text)
    elif action == "Tabulize":
        return tabulize(text)
    elif action == "Tabbing":
        return tabbing(text)
    elif action == "Language":
        return segmentLanguage(text)
    elif action == "RandomText":
        return randomText()
    elif action == "showCollocations":
        return showCollocations(text)
    elif action == "extractNamed":
        return extractNamed(text)
    elif action == "extractNumbered":
        return extractNumbered(text)
    else:
        # unknown action: return the input unchanged
        return text
def guess_stem(self,word):
    """
    Detect affixed letters based on phonetic root composition.
    In Arabic, some letter sequences cannot be adjacent inside a root;
    when such a sequence appears at a word boundary, the boundary letter
    is assumed to be an affix.

    @param word: the word.
    @type word: unicode.
    @return: word with a '-' inserted at the guessed stemming position.
    @rtype: unicode.
    """
    # certain letter sequences are forbidden inside an Arabic root but are
    # possible across an affix/stem boundary; the lookup tables
    # (prefixes_letters, prefixes_forbiden, bisuffixes_letters,
    # bisuffixes_forbiden, suffixes_letters, suffixes_forbiden)
    # live in wordtag_const.
    # strip harakat and shadda first
    word=araby.stripTashkeel(word);
    # ---- one-letter prefix ----
    word_guess=word;
    if len(word)>=2:
        c1=word[0];
        c2=word[1];
        if c1 in wordtag_const.prefixes_letters and ( c2 in wordtag_const.prefixes_forbiden.get(c1,'')):
            word_guess=u"%s-%s"%(c1,word[1:])
            # try a second prefix letter after the first split
            if len(word_guess)>=4:
                c1=word_guess[2];
                c2=word_guess[3];
                if c1 in wordtag_const.prefixes_letters and ( c2 in wordtag_const.prefixes_forbiden[c1]):
                    # NOTE(review): this rebuilds the guess from word_guess[2:],
                    # dropping the first "c-" marker and repeating c1 — looks
                    # suspicious; confirm intended behavior before changing.
                    word_guess=u"%s-%s"%(c1,word_guess[2:])
    # ---- two-letter suffixes (e.g. كم، كن، هم، هن) ----
    word=word_guess;
    if len(word)>=3:
        bc_last=word[-2:];
        bc_blast=word[-3:-2]
        if bc_last in wordtag_const.bisuffixes_letters:
            if bc_blast in wordtag_const.bisuffixes_forbiden[bc_last]:
                word_guess=u"%s-%s"%(word[:-2],bc_last)
    # ---- one-letter suffixes (e.g. ك، ت، ه) ----
    word=word_guess;
    c_last=word[-1:];
    c_blast=word[-2:-1]
    if c_last in wordtag_const.suffixes_letters:
        if c_blast in wordtag_const.suffixes_forbiden[c_last]:
            word_guess=u"%s-%s"%(word[:-1],c_last)
    return word_guess;
def check(self, word):
    """
    Check whether *word* (harakat-stripped) is absent from the wordlist.

    @param word: the word to look up.
    @type word: unicode.
    @return: True when the word is NOT in the dictionary, else False.
    @rtype: bool.
    """
    key = araby.stripTashkeel(word)
    # `not in` replaces the deprecated dict.has_key() + redundant if/else
    return key not in self.dict
def detectNamedPosition(wordlist):
    """
    Detect named-entity phrases in a token list and return the position of
    each phrase as (start, end) token indices.

    @param wordlist: list of word tokens.
    @type wordlist: unicode list.
    @return: list of (start, end) phrase positions.
    @rtype: list of tuple.

    >>> detectNamedPosition(u"قال خالد بن رافع حدثني أحمد بن عنبر عن خاله".split(u' '))
    ((1,3), (6,8))
    """
    wordlist  # no-op leftover; the caller already supplies a token list
    positions = [];
    # startNamed < 0 means "not currently inside a named phrase"
    startNamed =-1;
    endNamed =False;
    for i in range(len(wordlist)):
        word=wordlist[i];
        # harakat-free next token (empty at the end of the list)
        if i+1<len(wordlist):
            next=araby.stripTashkeel(wordlist[i+1]);
        else:
            next=u''
        # harakat-free previous token; drop a leading conjunction/preposition
        # letter (و ف ل ب ك) when not already inside a phrase
        if i-1>=0:
            previous=araby.stripTashkeel(wordlist[i-1]);
            if previous and startNamed<0 and previous[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
                previous=previous[1:];
        else:
            previous = u''
        # keep the original word; key is its harakat-free form
        word_nm=araby.stripTashkeel(word);
        key=word_nm;
        # the first word of a phrase may carry a procletic letter
        if word_nm and startNamed<0 and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
            key=word_nm[1:];
        if startNamed<0 and key in (u'ابن', ):
            startNamed=i;
            endNamed=i
        elif key in (u'ابن', u'بن',u'أبو',u'أبا', u'أبي', u'عبد' , u'عبيد' , u'بنو', u'بني', u'بنت'):
            # kinship markers: start or extend the current phrase
            if startNamed<0:
                startNamed=i;
            endNamed=i
        elif previous in (u'بن', u'ابن', u'أبو',u'أبا', u'أبي', u'عبد', u'عبيد', u'بنو', u'بني', u'بنت'):
            # previous token was a kinship marker: phrase starts one token back
            if startNamed<0:
                startNamed=i-1;
            endNamed=i
        elif next in (u'بن', u'بنت',):
            # u'أبو', u'أبي', u'ابا',) :#or word in (u'الدين',):
            if startNamed<0:
                startNamed=i;
            endNamed=i
        # if the word is a proper noun
        elif startNamed<0 and isProperNoun(key):
            startNamed=i;
            endNamed=i
        else:
            # current token does not belong to a phrase
            if startNamed>=0:
                # close the previous named phrase
                if word_nm.startswith(u'ال') and word_nm.endswith(u'ي'):
                    # add family name (kunya) to the phrase
                    endNamed=i
                positions.append((startNamed, endNamed));
                startNamed=-1;
    # close a phrase still open at the end of the list
    if startNamed>=0:
        positions.append((startNamed, endNamed));
    return positions
def transformToStars(self, word):
    """
    Transform all non-affixation letters into a star (the joker, '*' by
    default), marking letters assumed to belong to the original root, and
    compute the stemming positions.

    Example:
        >>> ArListem = ArabicLightStemmer()
        >>> word = u'أفتضاربانني'
        >>> starword, left, right = ArListem.transformToStars(word)
        (أفت*ا**انني, 3, 6)

    @param word: the input word.
    @type word: unicode.
    @return: (starword, left, right):
        - starword: word with original letters converted into stars;
        - left: the greatest possible left stemming position;
        - right: the greatest possible right stemming position.
    @rtype: tuple.
    """
    self.word = word
    word = araby.stripTashkeel(word)
    # word, harakat=araby.separate(word);
    self.unvocalized = word
    # normalize ALEF MADDA into HAMZA + ALEF
    word = re.sub("[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF, word)
    # star every letter that can be neither a prefix nor a suffix letter
    word = re.sub("[^%s%s]" % (self.prefix_letters, self.suffix_letters),
                  self.joker, word)
    ln = len(word)
    left = word.find(self.joker)
    right = word.rfind(self.joker)
    if left >= 0:
        # clamp affix spans to the configured maximal lengths
        left = min(left, self.max_prefix_length - 1)
        right = max(right + 1, len(word) - self.max_suffix_length)
        prefix = word[:left]
        stem = word[left:right]
        suffix = word[right:]
        prefix = re.sub("[^%s]" % self.prefix_letters, self.joker, prefix)
        # avoid null infixes
        if (self.infix_letters != u""):
            stem = re.sub("[^%s]" % self.infix_letters, self.joker, stem)
        suffix = re.sub("[^%s]" % self.suffix_letters, self.joker, suffix)
        word = prefix + stem + suffix
    left = word.find(self.joker)
    right = word.rfind(self.joker)
    # prefix_list=self.PREFIX_LIST;
    # suffix_list=self.SUFFIX_LIST;
    if left < 0:
        # no joker at all: fall back to the maximal prefix length
        left = min(self.max_prefix_length, len(word) - 2)
    if left >= 0:
        prefix = word[:left]
        # shrink the prefix until it is a known one
        while prefix != "" and prefix not in self.prefix_list:
            prefix = prefix[:-1]
        if right < 0:
            right = max(len(prefix), len(word) - self.max_suffix_length)
        suffix = word[right:]
        # shrink the suffix until it is a known one
        while suffix != "" and suffix not in self.suffix_list:
            suffix = suffix[1:]
        left = len(prefix)
        right = len(word) - len(suffix)
        stem = word[left:right]
        # convert stem into stars.
        # a stem must start with alef, or end with alef;
        # any other infix letter is not an infix at the stem border.
        # substitute all non-infix letters
        if self.infix_letters != "":
            stem = re.sub("[^%s]" % self.infix_letters, self.joker, stem)
        # substitute TEH in infixes: the TEH must be in the first or second
        # place, all others are converted
        # stem=stem[:2]+re.sub(TEH,self.joker,stem[2:])
        word = prefix + stem + suffix
    # store result
    self.left = left
    self.right = right
    self.starword = word
    # extract the root from the starred form
    self.extract_root()
    # return starword, left, right position of stem
    return (word, left, right)
import tashkeel if __name__ == '__main__': filename, disableSyntax, disableSemantic, disableStat, ignore, limit, compare =grabargs() #filename="samples/randomtext.txt" try: myfile=open(filename) except: print " Can't Open the given File ", filename; counter=1; if not limit : limit= 100000000 nolimit = False; correct=0; total=0; line=(myfile.readline()).decode('utf8'); while line and (nolimit or counter<=limit): unvocline= araby.stripTashkeel(line); vocalized=pyarabic.number.preTashkeelNumber(araby.tokenize(unvocline)); vocalized=u' '.join(vocalized); if vocalized!=unvocline: total+=1; sim = araby.vocalizedSimilarity(vocalized, araby.stripShadda( line)); if sim>=0: correct+=1; # for res in result: if sim<0: print u"\t".join([str(sim),str(counter),str(len(vocalized)),str(len(line)),vocalized, line]).encode('utf8'); #get the next line line=(myfile.readline()).decode('utf8'); counter+=1; print correct, total, round(correct*100.00/total,2)
if __name__ == '__main__':
    #import number as ArabicNumberToLetters
    # sample texts containing chains of Arabic proper-name patterns
    # (X بن Y, أبو X, عبد X, ...), used to exercise named-entity detection
    texts=[
        u"وجد عبد الله بن عمر دينارا",
        u"جاء خالد بن الوليد وقاتل مسيلمة بن حذام الكذاب في موقعة الحديقة",
        u'روى أحمد بن عقيل الشامي عن أبي طلحة المغربي أنّ عقابا بن مسعود بن أبي سعاد قال',
        u""" 6 :* حَديثُ عَمٍّ: فَرَجُ سَقْفِ بَيْتِي وَأَنَا بِمَكَّةٍ ، فَنَزَلَ جِبْرِيلُ ، فَفَرَجُ صَدْرِي ، ثُمَّ غَسَلَهُ مِنْ مَاءِ زَمْزَمَ ، ثُمَّ جَاءَ بِطَسْتِ مَمْلُوءِ حِكْمَةِ وَإيمَانَا فَأُفْرِغُهَا فِي صَدْرِي ، ثُمَّ أَطُبِّقَهُ قَالَ عَبْدُ اللهِ بْن أَحَمْدٌ: حَدِّثِنَّي مُحَمَّدَ بْن عَبَّادٍ الْمَكِّيُّ ، ثِنَا أَبُو ضَمْرَةٌ ، عَنْ يُونِسٍ ، عَنِ الزَّهْرِيِ ، عَنْ أُنْسٍ: كَانَ أَبِي يُحَدِّثُ بِمَا هُنَا وَحَدِّثِنَّي مُحَمَّدَ بْن إسحاق بْن مُحَمَّدِ المسيبي ، ثِنَا أَنَسُ بْن عياض ، عَنْ يُونُسُ بْن يَزِيدُ ، قَالٌ: قَالَ اِبْنُ شِهَابٍ: قَالَ أَنَسُ بْن مَالِكٍ: كَانَ أَبِي بْن كَعْبِ يَحْدُثُ ، فَذُكِرَ حَديثُ الْإِسْراءِ بِطُولِهِ ، وَفِيه: قَالَ الزُّهْرِيُّ: وَأَخْبَرَنِي اِبْنُ حَزْمٍ ، أَنَّ اِبْنَ عَبَّاسٍ ، وَأَبَا حَبَّةُ الْأَنْصارِيِ يَقُولَانِّ: قَالَ رَسُولُ اللهِ ، صَلَّى اللهُ عَلَيه وَسَلَّمُ: ثَمَّ عَرَجِ بِي حَتَّى ظَهَرْتِ لِمُسْتَوى أَسْمَعُ صَرِيفَ الْأَقْلاَمِ وَفِيه قَالَ الزُّهْرِيُّ: قَالَ اِبْنُ حَزْمٍ ، وَأَنَسُ بْن مَالِكٍ: قَالَ رَسُولُ اللهِ صَلَّى اللهُ عَلَيه وَسَلَّمُ: فَرَضَ اللَّهُ عَلَى أمتي خَمْسِينَ صَلاَةٌ ، فَرَجَعْتِ بِذَلِكَ حَتَّى أَمْرِ عَلَى مُوسى الْحَديثِ ، تَفْرُدُ بِهِ .( 1 / 6) 2 71.16% 83.07% 92 54 154 319 69.85% 81.62% 28: حَديثُ كَمْ حَمُ: فِي هَذِهِ الْآيَةَ :{ وَإِذْ أَخَذَ رَبُّكَ مِنْ بُنِّيِّ آدَمِ مِنْ ظُهورِهُمْ ذَرِّيَّتِهُمْ } الْآيَةُ ، قَالٌ: جَمْعُهُمْ لَهُ يَوْمَئِذٍ جَمِيعًا فَجَعَلَهُمْ أَرَواحًا ثَمَّ صُورِهُمْ وَاِسْتَنْطَقُهُمْ الْحَديثِ ، وَفِيه قَوْلُ آدَمِ: رُبَّ لَوْ سُوِّيتِ بَيْنَ عِبَادِكَ ، قَالٌ: إِنَِّي أَحُبَّ أَنْ أَشْكَرَ ، وَفِيه ذِكْرُ عِيسَى اِبْنُ مَرْيَمٍ ، وَقَوْلُ أَبِي بْن كَعْبٍ: إِنَّ الرَّوْحَ دُخِلَ مِنْ فِي مَرْيَمِ كَمْ فِي تَفْسِيرِ الْأَعْرَافِ: أَنَا أَبُو جَعْفَرٍ مُحَمَّدُ بْن عَلِيٍّ 
الشَّيْبانِيُّ ، أَنَا أَحُمِدَ بْن حازِمٍ ، ثِنَا عَبِيدَ اللهِ بْن مُوسى ، ثِنَا أَبُو جَعْفَرٌ ، عَنِ الرَّبِيعُ بْن أُنْسٍ ، عَنْ أَبِي الْعَالِيَةَ ، عَنْ أَبِي بِطُولِهِ وَرَوَاهُ عَبْدُ اللهِ بْن أَحَمْدَ فِي زِيادَاتِهِ: حَدِّثِنَّي مُحَمَّدَ بْن يَعْقُوبِ الرَّبالِيِ ، ثِنَا الْمُعْتَمِرُ بْن سَلِيمَانِ ، سَمِعْتِ أَبِي يُحَدِّثُ عَنِ الرَّبِيعِ ، بِهِ. 3 72.39% 85.31% 156 83 242 565 73.98% 88.21% 44 :* حَديثُ حُبِّ حَمُ عَمٌّ: قَالَ لِي جِبْرِيلُ :{ قُلْ أَعُوذُ بِرَبِّ الْفَلْقِ } فَقِلْتِهَا الْحَديثَ حُبٌّ: فِي الْعَشْرَيْنِ مِنَ الثَّالِثِ: أَنَا عِمْرَانُ بْن مُوسى ، ثِنَا هُدْبَةُ بْن خَالِدٍ ، ثِنَا حَمَّادُ بْن سلمةٍ ، عَنْ عَاصِمٍ ، عَنْ زِرٍّ: قُلْتِ لِأَبِي بْن كَعْبٍ: إِنَّ اِبْنَ مَسْعُودِ لَا يَكْتُبْ فِي مُصْحَفِهِ المعوذتين فَقَالَ أَبِي: قَالَ لِي رَسُولُ اللهِ: قَالَ لِي جِبْرِيلُ فَذَكَرَهُ رَوَاهُ أَحْمَدُ: عَنْ أَبِي بِكَرِّ بْن عَيّاشٍ ، عَنْ عَاصِمِ بِلَفْظٍ: قُلْتِ لِأَبِي: إِنَّ عَبْدَ اللهِ يَقُولُ فِي المعوذتين فَقَالَ أَبِي: سَأَلَنَا عَنْهُمَا رَسُولُ اللهِ ، فَقَالٌ: قَيَّلَ لِي: قَلَّ وَأَنَا أَقُولُ كَمَا قَالَ وَعَنْ وكيع ، وَعَبْدُ الرَّحْمَنِ بْن مَهْدِي كِلَاهُمَا ، عَنْ سُفْيانٍ ، وَعَنْ مُحَمَّدِ بْن جَعْفَرٍ ، عَنْ شُعْبَةِ وَعَنْ عَفّانٍ ، عَنْ حَمَّادُ بْن سلمةٍ ، وَأَبِي عَوانَةٌ ، فَرَقَهُمَا ، كلَهُمْ عَنْ عَاصِمِ وَعَنْ سُفْيانِ بْن عيينة ، عَنْ عَبْدَةُ بْن أَبِي لُبَابَةٌ ، وَعَاصِمُ وَعَنْ عَبْدِ الرَّحْمَنِ بْن مَهْدِيٍّ ، عَنْ سُفْيانٍ ، عَنِ الزُّبَيْرِ بْن عِدِّيِ ، عَنْ أَبِي رَزينٌ ، ثلاثتهم عَنْ زِرِّ وَقَالَ عَبْدُ اللهِ: حَدِّثِنَّي مُحَمَّدَ بْن الحسين بْن إشكاب ، ثِنَا مُحَمَّدَ بْن أَبِي عُبَيْدَةُ بْن مِعْنَ ، ثِنَا أَبِي ، عَنِ الْأعْمَشِ ، عَنْ أَبِي إسْحَاقُ ، عَنْ عَبْدِ الرَّحْمَنِ بْن يَزِيدُ ، قَالٌ: كَانَ عَبْدُ اللهِ يَحُكُّ المعوذتين مِنْ مَصَاحِفِهِ وَيَقُولُ: إِنَّهُمَا لَيْسَتَا مِنْ كِتَابِ اللهِ قَالِ الْأعْمَشِ: وَثَنَا عَاصِمُ ، عَنْ زِرِّ فَذكرِ نَحْوَ الْأَوَّلِ .( 1 / 16) 4 74.60% 85.77% 207 116 321 815 79.60% 86.80% 54 :* حَديثُ كَمْ حَمُ عَمٌّ: إِذَا كَانَ يَوْمُ 
الْقِيَامَةِ كِنْتِ إمَامَ النَّبِيِّينَ وَخَطِيبُهُمْ وَصَاحِبُ شَفَاعَتِهُمْ ، غَيْرَ فَخْرُ كَمْ فِي الْإيمَانِ: ثِنَا الْحُسَيْنُ بْن الْحُسْنِ الطَّوْسِيِ ، ثِنَا أَبُو حاتِمٍ الرّازِيُّ ، ثِنَا عَبْدَ اللهِ بْن جَعْفَرٍ الرَّقِّيُّ ، ثِنَا عَبِيدَ اللهِ بْن عَمْروِ وَعَنْ مُحَمَّدِ بْن صَالِحِ بْن هَانِئٍ ، ثِنَا السَّرِيُّ بْن خَزِيمَةٍ ، ثِنَا أَبُو حُذَيْفَةُ النَّهْدِيِ ، ثِنَا زُهَيْرُ بْن مُحَمَّدٍ ، كِلَاهُمَا عَنْ عَبْدِ اللهِ بْن مُحَمَّدِ بْن عَقِيلٍ ، عَنِ الطفيل بْن أَبِي بْن كَعْبٍ ، عَنْ أَبِيه ، بِهِ وَقَالٌ: صَحِيحُ الْإِسْنادِ وَلَمْ يُخْرِجَاهُ لِتَفَرُّدِ اِبْنِ عَقِيلِ بِهِ لَمَّا نَسْبِ إِلَيه مِنْ سُوءِ الْحِفْظِ ، وَهُوَ عِنْدَ أئِمَّتُنَا مِنَ الْمُتَقَدِّمِينَ ثِقَةُ مَأْمُونِ وَفِي الْفَضَائِلِ: أَنَا الْقَطِيعِيُّ ، ثِنَا عَبْدَ اللهِ بْن أَحَمْدٌ ، حَدَّثَنِي أُبَيُّ ، ثِنَا عَبْدَ الرَّحْمَنِ ، وَهُوَ اِبْنُ مَهْدِيٍّ ، ثِنَا زُهَيْرُ بْن مُحَمَّدٍ ، عَنْ عَبْدِ اللهِ بْن مُحَمَّدٍ ، بِهِ وَرَوَاهُ الْإمَامُ أَحْمَدُ: عَنْ أَبِي عَامِرٌ ، عَنْ زُهَيْرٍ ، يَعْنِي: اِبْنُ مُحَمَّدٍ ، عَنْ عَبْدِ اللهِ بْن مُحَمَّدٍ ، بِهِ وَعَنْ زَكَرِيّا بْن عِدِّيِ ، وَأَحْمَدُ بْن عَبْدِ الْمَلِكِ الْحَرَّانِيِ ، كِلَاهُمَا عَنْ عَبِيدِ اللهِ بْن عَمْروٍ ، بِهِ وَعَنْ أَبِي أَحْمَدَ الزُّبَيْرِيُّ ، عَنْ شَرِيكِ ، عَنْ عَبْدِ اللهِ بْن مُحَمَّدٍ ، بِهِ وَرَوَاهُ اِبْنُهُ عَبْدُ اللهِ فِي زِيادَاتِهِ: حَدَّثَنِي عُبَيْدُ اللَّهِ الْقَوَارِيرِيُّ ، ثِنَا مُحَمَّدَ بْن عَبْدِ اللهِ بْن الزُّبَيْرِ ، ثِنَا شَرِيكُ ، بِهِ وَقَالَ أيضا: ثِنَا هَاشِمُ بْن الْحارِثِ ، ثِنَا عَبِيدَ اللهِ بْن عَمْروٍ ، بِهِ وَحَدِّثِنَّي ( 1 / 24) 5 75.54% 85.94% 228 131 354 932 82.05% 87.18% 56 :* حَديثُ كَمْ حَمُ: بَيَّنَا نَحْنُ فِي صَلاَةِ الظَّهيرَةِ وَالنَّاسَ فِي الصُّفُوفِ فَرَأَيْنَاهُ يَتَنَاوَلُ شِيئَا الْحَديثَ كَمْ فِي الْأَهْوَالِ: أَنَا عَبْدُ الرَّحْمَنِ بْن حَمْدانٍ ، ثِنَا هِلاَلُ بْن الْعَلاءِ ، ثِنَا أَبِي ، ثِنَا عَبِيدَ اللهِ بْن عَمْروٍ ، عَنْ عَبْدِ اللهِ بْن مُحَمَّدِ بْن عَقِيلٍ ، عَنِ الطفيل بْن أَبِي بْن كَعْبٍ ، عَنْ أَبِيه ، وَقَالٌ: صَحِيحُ 
الْإِسْنادِ رَوَاهُ أَحْمَدُ بِطُولِهِ: عَنْ أَحُمِدَ بْن عَبْدِ الْمَلِكِ بْن واقد الْحَرَّانِيِ ، عَنْ عَبِيدِ اللهِ بْن عَمْروٍ ، بِهِ قُلْتُ: رواه زَكَرِيّا بْن عِدِّيِ ، عَنْ عَبِيدِ اللهِ بْن عَمْروٍ ، فَقَالٌ: عَنْ عَبْدِ اللهِ بْن مُحَمَّدِ بْن عَقِيلٍ ، عَنْ جَابِرِ وَأَخْرَجَهُ أَحْمَدُ ، أيضا: عَنْ زَكَرِيّا. 6 75.46% 86.02% 265 151 403 1080 75.00% 86.49% 68 :* عَبْدُ اللهِ بْن رباحٍ ، عَنْ أَبِي حَديثُ كَمْ م حَمُ عَمٌّ: قَالَ لِي رَسُولُ اللهِ ، صَلَّى اللهُ عَلَيه وَسَلَّمُ: أَيُّ آيَةِ فِي كِتَابِ اللهِ أُعْظِمُ ؟ قَالٌ: قُلْتِ :{ اللهُ لَا إلَهُ إلّا هُوَ الْحَيُّ الْقَيُّومَ } قَالٌ: فَضَرْبُ صَدْرِي وَقَالٌ: لِيَهِنُكَ الْعِلْمَ أَبَا الْمُنْذِرَ كَمْ فِي الْمَعْرِفَةِ: ثِنَا أَبُو عَبْدُ اللَّهِ الْحافِظُ ، ثِنَا إبراهيم بْن عَبْدِ اللهِ ، ثِنَا يَزِيدُ بْن هارُونٍ ، أَنَا الْجَرِيرِيِ ، عَنْ أَبِي السَّلِيلَ ، عَنْ عَبْدِ اللهِ بْن رباحٍ ، عَنْه ، بِهَذَا قُلْتُ: هُوَ فِي مُسْلِمٍ ، فَلَا يُسْتَدْرَكُ وَرَوْاهُ الْإمَامَ أَحْمَدُ: ثِنَا عَبْدَ الرَّزَّاقِ ، أَنَا سُفْيانٌ ، عَنْ سَعِيدُ الْجَرِيرِيِ ، بِهِ وَرَوَاهُ اِبْنُهُ عَبْدُ اللهِ ، فِي زِيادَاتِهِ: حَدَّثَنِي عُبَيْدُ اللَّهِ الْقَوَارِيرِيُّ ، ثِنَا جَعْفَرُ بْن سَلِيمَانِ ، ثِنَا الْجُرَيْرِيُّ ، عَنْ بَعْضُ أَصْحَابِهِ ، عَنْ عَبْدِ اللهِ بْن رباحٍ ، بِهِ. """,
        u"قال مُحَمَّدُ بْنُ خَالِدُ بْنُ إسماعيلفي حديثه",
        u"ِنْصَرَفْنَا إِلَى أَنَسُ بْنُ مَالِكَ الْحَديثِ"
    ];
    for text in texts:
        # locate named-entity phrases as (start, end) token positions
        positions = detectNamedPosition(text.split(' '));
        print(positions);
        # result=extractNamed(text);
        # print u"\t".join(result).encode('utf8');
        # result= extractNamedWithinContext(text);
        # strip harakat then pre-vocalize tokens around detected named entities
        text=araby.stripTashkeel(text);
        result= preTashkeelNamed(araby.tokenize(text));
        print(u' '.join(result).encode('utf8'));
        # result=detectNamed(text);
        # print u"\t".join(result).encode('utf8');
def test():
    """
    Command-line driver: read input lines (from a file or an inline text),
    vocalize them with TashkeelClass unless harakat-stripping is requested,
    and optionally compare the result against the original vocalized lines,
    printing per-line and cumulative WER/LER statistics.
    """
    filename, text, stripTashkeel, reducedTashkeel, disableSyntax, disableSemantic, disableStat, ignore, limit, compare =grabargs()
    #filename="samples/randomtext.txt"
    if not text and not filename:
        usage()
        sys.exit(0)
    if not text:
        try:
            myfile=open(filename)
        except:
            print " Can't Open the given File ", filename;
            sys.exit();
    else:
        lines = text.split('\n');
    # all things are well, import library
    import core.adaat
    import pyarabic.araby as araby
    counter=1;
    if not limit :
        limit= 100000000
    # configure the vocalizer unless we only strip harakat
    if not stripTashkeel:
        vocalizer=ArabicVocalizer.TashkeelClass();
        if ignore :
            vocalizer.disableLastMark();
        if disableSemantic:
            vocalizer.disableSemanticAnalysis();
        if disableSyntax:
            vocalizer.disableSyntaxicAnalysis();
        if disableStat:
            vocalizer.disableStatTashkeel();
        #vocalizer.disableShowCollocationMark();
        #print "show delimiter", vocalizer.collo.showDelimiter;
    #nolimit = True;
    nolimit = False;
    # first line: from the file, or from the inline text
    if not text:
        line=(myfile.readline()).decode('utf8');
    else:
        if len(lines)>0:
            line= lines[0];
    correct=0;
    incorrect=0;
    total=0;
    totLetters =0;
    LettersError =0
    WLMIncorrect =0;
    if compare:
        # display the header for the per-line statistics columns
        print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct"
    while line and (nolimit or counter<=limit):
        # lines starting with '#' are treated as comments and skipped
        if not line.startswith('#'):
            # lineIncorrect = 0;
            lineCorrect = 0;
            lineWLMIncorrect =0;
            if stripTashkeel:
                result = araby.stripTashkeel(line);
            else:
                # vocalize line by line; keep the original for comparison
                if compare:
                    vocalizedLine = line;
                    line = araby.stripTashkeel(line)
                result=vocalizer.tashkeel(line);
                # compare result line and vocalized line
                if compare:
                    list1=vocalizer.analyzer.tokenize(vocalizedLine);
                    list2=vocalizer.analyzer.tokenize(result);
                    #print u":".join(list1).encode('utf8');
                    #print u":".join(list2).encode('utf8');
                    total+=len(list1);
                    # NOTE(review): lineTotal is only bound on this path;
                    # with compare+stripTashkeel it would be unbound below.
                    lineTotal = len(list1);
                    if len(list1)!=len(list2):
                        print "lists haven't the same length";
                    else:
                        for i in range(len(list1)):
                            simi = araby.vocalizedSimilarity(list1[i],list2[i]);
                            if simi<0:
                                # negative similarity counts letter errors
                                LettersError+= -simi;
                                incorrect +=1;
                                # lineIncorrect += 1;
                                # evaluation without the last haraka
                                simi2 = araby.vocalizedSimilarity(araby.stripLastHaraka(list1[i]),araby.stripLastHaraka(list2[i]));
                                if simi2<0:
                                    WLMIncorrect +=1;
                                    lineWLMIncorrect+=1;
                            else:
                                correct+=1;
                                lineCorrect += 1;
            if reducedTashkeel:
                result= araby.reduceTashkeel(result)
            # print result.encode('utf8');
            counter+=1;
            # display cumulative and per-line statistics for every line
            if compare:
                print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t"%(
                    counter-1,#id
                    round(correct*100.00/total,2),#fully Correct
                    round((total-WLMIncorrect)*100.00/total,2),#Strip Correct
                    incorrect,#fully WER
                    WLMIncorrect,#Strip WER
                    LettersError,#LER
                    total,#Total
                    ),
                if lineTotal:
                    print "%0.2f%%\t"%round(lineCorrect*100.00/lineTotal,2),#line Fully correct
                    print "%0.2f%%\t"%round((lineTotal-lineWLMIncorrect)*100.00/lineTotal,2),#line Strip correct
            print result.encode('utf8');
        # get the next line: from the file, or from the inline text
        if not text:
            line=(myfile.readline()).decode('utf8');
        else:
            if counter<len(lines):
                line= lines[counter];
            else:
                line =None;
def check_word(self, word, guessedTag=""):
    """
    Analyze one word morphologically, trying it in turn as punctuation,
    stop word, verb, noun, then unknown, and filter the analyses against
    the input's vocalization.

    @param word: the input word.
    @type word: unicode.
    @param guessedTag: optional tag guessed by the context tagger; used
        to decide whether the verb and/or noun passes run.
    @type guessedTag: unicode.
    @return: list of stemmedWord objects of analyzed words with tags.
    @rtype: list.
    """
    word = araby.stripTatweel(word)
    word_vocalised = word
    word_nm = araby.stripTashkeel(word)
    resulted_text = u""
    resulted_data = []
    # if word is a pounctuation
    resulted_data += self.check_word_as_pounct(word_nm)
    # Done: if the word is a stop word we have some problems,
    # the stop word can also be another normal word (verb or noun),
    # we must consider it in future works
    # if word is stopword allow stop words analysis
    resulted_data += self.check_word_as_stopword(word_nm)
    # if word is verb
    # problem: some excluded words can be either verbs or nouns
    if self.tagger.hasVerbTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
        resulted_data += self.check_word_as_verb(word_nm)
    # if word is noun
    if self.tagger.hasNounTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
        resulted_data += self.check_word_as_noun(word_nm)
    if len(resulted_data) == 0:
        # check the word as unkonwn
        resulted_data += self.check_word_as_unknown(word_nm)
    # check if the word is nomralized and solutions are equivalent
    resulted_data = self.check_normalized(word_vocalised, resulted_data)
    # check if the word is shadda like
    resulted_data = self.check_shadda(word_vocalised, resulted_data)
    # check if the word is vocalized like results
    if self.partial_vocalization_support:
        resulted_data = self.check_partial_vocalized(word_vocalised, resulted_data)
    # add word frequency information in tags
    resulted_data = self.addWordFrequency(resulted_data)
    if len(resulted_data) == 0:
        # fall back to a single 'unknown' analysis so callers always get data
        resulted_data.append(
            stemmedword.stemmedWord(
                {
                    "word": word,
                    "procletic": "",
                    "encletic": "",
                    "prefix": "",
                    "suffix": "",
                    "stem": "",
                    "original": word,
                    "vocalized": word,
                    "tags": u"",
                    "type": "unknown",
                    "root": "",
                    "template": "",
                    "freq": self.wordfreq.getFreq(word, "unknown"),
                    "syntax": "",
                }
            )
        )
    return resulted_data
def check(self, word):
    """
    Check whether *word* (harakat-stripped) is absent from the wordlist.

    @param word: the word to look up.
    @type word: unicode.
    @return: True when the word is NOT in the dictionary, else False.
    @rtype: bool.
    """
    key = araby.stripTashkeel(word)
    # `not in` replaces the deprecated dict.has_key() + redundant if/else
    return key not in self.dict
def suggest(self, word):
    """
    Return the stored suggestions for *word* (harakat-stripped).

    @param word: the misspelled word.
    @type word: unicode.
    @return: list of suggested replacements; empty list when unknown.
    @rtype: list.
    """
    key = araby.stripTashkeel(word)
    # dict.get replaces the deprecated has_key() plus a second lookup
    return self.dict.get(key, [])
filename, disableSyntax, disableSemantic, disableStat, ignore, limit, compare = grabargs( ) #filename="samples/randomtext.txt" try: myfile = open(filename) except: print " Can't Open the given File ", filename counter = 1 if not limit: limit = 100000000 nolimit = False correct = 0 total = 0 line = (myfile.readline()).decode('utf8') while line and (nolimit or counter <= limit): unvocline = araby.stripTashkeel(line) # named=core.named.extractNamed(unvocline); # for n in named: # print u"\t".join([str(counter),n]).encode('utf8'); named = core.named.extractNamedWithinContext(line) # print named for n in named: #display context (previous, named, next) print u"\t".join([str(counter), u'\t'.join(n)]).encode('utf8') #get the next line line = (myfile.readline()).decode('utf8') counter += 1 #print correct, total, round(correct*100.00/total,2)
def guess_stem(self, word):
    """
    Detect affixed letters based on phonetic root composition.
    In Arabic, some letter sequences cannot be adjacent inside a root;
    when such a sequence appears at a word boundary, the boundary letter
    is assumed to be an affix.

    @param word: the word.
    @type word: unicode.
    @return: word with a '-' inserted at the guessed stemming position.
    @rtype: unicode.
    """
    # certain letter sequences are forbidden inside an Arabic root but are
    # possible across an affix/stem boundary; the lookup tables
    # (prefixes_letters, prefixes_forbiden, bisuffixes_letters,
    # bisuffixes_forbiden, suffixes_letters, suffixes_forbiden)
    # live in wordtag_const.
    # strip harakat and shadda first
    word = araby.stripTashkeel(word)
    # ---- one-letter prefix ----
    word_guess = word
    if len(word) >= 2:
        c1 = word[0]
        c2 = word[1]
        if c1 in wordtag_const.prefixes_letters and (
                c2 in wordtag_const.prefixes_forbiden.get(c1, '')):
            word_guess = u"%s-%s" % (c1, word[1:])
            # try a second prefix letter after the first split
            if len(word_guess) >= 4:
                c1 = word_guess[2]
                c2 = word_guess[3]
                if c1 in wordtag_const.prefixes_letters and (
                        c2 in wordtag_const.prefixes_forbiden[c1]):
                    # NOTE(review): this rebuilds the guess from word_guess[2:],
                    # dropping the first "c-" marker and repeating c1 — looks
                    # suspicious; confirm intended behavior before changing.
                    word_guess = u"%s-%s" % (c1, word_guess[2:])
    # ---- two-letter suffixes (e.g. كم، كن، هم، هن) ----
    word = word_guess
    if len(word) >= 3:
        bc_last = word[-2:]
        bc_blast = word[-3:-2]
        if bc_last in wordtag_const.bisuffixes_letters:
            if bc_blast in wordtag_const.bisuffixes_forbiden[bc_last]:
                word_guess = u"%s-%s" % (word[:-2], bc_last)
    # ---- one-letter suffixes (e.g. ك، ت، ه) ----
    word = word_guess
    c_last = word[-1:]
    c_blast = word[-2:-1]
    if c_last in wordtag_const.suffixes_letters:
        if c_blast in wordtag_const.suffixes_forbiden[c_last]:
            word_guess = u"%s-%s" % (word[:-1], c_last)
    return word_guess
def validateTags(self, noun_tuple, affix_tags, procletic, encletic, suffix):
    """
    Test whether a dictionary word is compatible with the given affix tags.

    @param noun_tuple: the input word attributes given from dictionary.
    @type noun_tuple: dict.
    @param affix_tags: a list of tags given by affixes.
    @type affix_tags: list.
    @param procletic: first-level prefix, vocalized.
    @type procletic: unicode.
    @param encletic: first-level suffix, vocalized.
    @type encletic: unicode.
    @return: whether the tags are compatible.
    @rtype: Boolean.
    """
    procletic = araby.stripTashkeel(procletic)
    encletic = araby.stripTashkeel(encletic)
    suffix = araby.stripTashkeel(suffix)
    # each affix tag requires the matching noun feature to be set
    required_features = (
        (u'مؤنث', 'feminable'),
        (u'جمع مؤنث سالم', 'feminin_plural'),
        (u'جمع مذكر سالم', 'masculin_plural'),
        (u'مثنى', 'dualable'),
        (u'منسوب', 'relative'),
    )
    for tag, feature in required_features:
        if tag in affix_tags and not noun_tuple[feature]:
            return False
    # diptote nouns (mamnou3 sarf) take no tanween
    if u'تنوين' in affix_tags and noun_tuple['mamnou3_sarf']:
        return False
    # annexation to attached pronouns must be allowed by the noun entry
    pronoun_features = {
        u"هم": 'hm_suffix',
        u"ه": 'ha_suffix',
        u"ك": 'k_suffix',
    }
    pronoun_feature = pronoun_features.get(encletic)
    if pronoun_feature is not None and noun_tuple[pronoun_feature] == 'N':
        return False
    # comparability case (prefix ending with كال)
    if procletic.endswith(u"كال") and noun_tuple['kal_prefix'] == 'N':
        return False
    # annexed masculine sound plural (e.g. لاعبو، رياضيو)
    if suffix == araby.WAW and not noun_tuple['w_suffix']:
        return False
    # other noun_tuple features (annex, broken_plural, mankous, ...)
    # are not checked yet
    return True
def validateTags(self, noun_tuple, affix_tags, procletic, encletic_nm, suffix_nm):
    """
    Test if the given word from dictionary is compatible with affixes tags.
    Unlike the vocalized variant, the encletic and suffix arguments are
    used as given (no tashkeel stripping is applied to them).
    @param noun_tuple: the input word attributes given from dictionary.
    @type noun_tuple: dict.
    @param affix_tags: a list of tags given by affixes.
    @type affix_tags: list.
    @param procletic: first level prefix vocalized.
    @type procletic: unicode.
    @param encletic_nm: first level suffix.
    @type encletic_nm: unicode.
    @param suffix_nm: second level suffix.
    @type suffix_nm: unicode.
    @return: True if the tags are compatible.
    @rtype: Boolean.
    """
    procletic = araby.stripTashkeel(procletic)
    # encletic = araby.stripTashkeel(encletic);
    # suffix = araby.stripTashkeel(suffix);
    encletic = encletic_nm
    suffix = suffix_nm
    # Reject when the affix carries a tag the dictionary entry cannot take.
    feature_by_tag = (
        (u'مؤنث', 'feminable'),
        (u'جمع مؤنث سالم', 'feminin_plural'),
        (u'جمع مذكر سالم', 'masculin_plural'),
        (u'مثنى', 'dualable'),
    )
    for tag, feature in feature_by_tag:
        if tag in affix_tags and not noun_tuple[feature]:
            return False
    # A diptote (mamnou3 sarf) noun cannot take tanween.
    if u'تنوين' in affix_tags and noun_tuple['mamnou3_sarf']:
        return False
    if u'منسوب' in affix_tags and not noun_tuple['relative']:
        return False
    # تدقيق الغضافة إلى الضمائر المتصلة
    # (attached-pronoun compatibility: 'N' marks a forbidden combination)
    for pronoun, feature in ((u"هم", 'hm_suffix'),
                             (u"ه", 'ha_suffix'),
                             (u"ك", 'k_suffix')):
        if encletic == pronoun and noun_tuple[feature] == 'N':
            return False
    # حالة قابلية التشبيه (likeness prefix "كال")
    if procletic.endswith(u"كال") and noun_tuple['kal_prefix'] == 'N':
        return False
    # حالة المضاف إلى ما بعده في حالة جمع المذكر السالم
    # مثل لاعبو، رياضيو
    if suffix == araby.WAW and not noun_tuple['w_suffix']:
        return False
    # التاء المربوطة لا تتصل بجمع التكسير
    # if suffix==araby.TEH_MARBUTA and noun_tuple['broken_plural']:
    # return False;
    # elif u'مضاف' in affix_tags and not noun_tuple['annex']:
    # return False;
    #todo
    # u'mankous':8,
    # u'feminable':9, *
    # u'number':10,
    # u'dualable':11, *
    # u'masculin_plural':12, *
    # u'feminin_plural':13, *
    # u'broken_plural':14,
    # u'mamnou3_sarf':15,
    # u'relative':16,
    # u'w_suffix':17,
    # u'hm_suffix':18, *
    # u'kal_prefix':19, *
    # u'ha_suffix':20, *
    # u'k_suffix':21, *
    # u'annex':22,
    return True
def suggest(self, word):
    """
    Return the stored suggestions for the given word.

    The lookup key is the word stripped of tashkeel (diacritics),
    so vocalized and unvocalized forms of the same word share one entry.
    @param word: the input word.
    @type word: unicode.
    @return: the list of suggestions, or an empty list when the word
        is not in the suggestion dictionary.
    @rtype: list.
    """
    key = araby.stripTashkeel(word)
    # dict.get() replaces the Python-2-only dict.has_key() (removed in
    # Python 3) and does a single lookup instead of a test plus an access.
    return self.dict.get(key, [])