Пример #1
0
    def getStemVariants(self, stem, prefix, suffix):
        """
        Build the candidate noun stems implied by the attached affixes.

        For example مدرستي => مدرست + ي => مدرسة + ي.

        @param stem: the input stem.
        @type stem: unicode.
        @param prefix: the prefix; its diacritics are stripped but no rule
            below reads it.
        @type prefix: unicode.
        @param suffix: the suffix; it is compared after stripping diacritics.
        @type suffix: unicode.
        @return: the stem itself plus every variant produced by the rules.
        @rtype: set of unicode.
        """
        # the unchanged stem is always one of the candidates
        variants = set([stem])
        prefix = araby.stripTashkeel(prefix)
        bare_suffix = araby.stripTashkeel(suffix)

        # suffixes for which "stem + Teh Marbuta" is also a candidate
        teh_marbuta_suffixes = (
            araby.ALEF + araby.TEH,
            araby.YEH + araby.TEH_MARBUTA,
            araby.YEH,
            araby.YEH + araby.ALEF + araby.TEH,
        )
        if bare_suffix in teh_marbuta_suffixes:
            variants.add(stem + araby.TEH_MARBUTA)

        # suffixes (or no suffix at all) for which "stem + Yeh" is a candidate
        if bare_suffix in ("", araby.YEH + araby.NOON,
                           araby.WAW + araby.NOON):
            variants.add(stem + araby.YEH)

        # a final Yeh may stand for an Alef Maksura
        if stem.endswith(araby.YEH):
            variants.add(stem[:-1] + araby.ALEF_MAKSURA)

        return variants
Пример #2
0
    def check_normalized(self, word_vocalised, resulted_data):
        """
        Keep only the analyses whose letters match the input word.

        The dictionary lookup works on a normalized form, so one input can
        bring back several spellings: ذئب is normalized to ذءب, which may
        return both ذئب and ذؤب. Each result is compared with the input
        after stripping diacritics from both, and only equal ones are kept.

        @param word_vocalised: the input word.
        @type word_vocalised: unicode.
        @param resulted_data: the results found in the dictionary; each one
            is an object whose attributes are read through ``__dict__``.
        @type resulted_data: list.
        @return: the results matching the input word, in their original order.
        @rtype: list.
        """
        target = araby.stripTashkeel(word_vocalised)
        kept = []
        for candidate in resulted_data:
            attributes = candidate.__dict__
            # results without a vocalized form cannot be compared: drop them
            if "vocalized" not in attributes:
                continue
            if target == araby.stripTashkeel(attributes["vocalized"]):
                kept.append(candidate)
        return kept
Пример #3
0
    def check_normalized(self, word_vocalised, resulted_data):
        """
        Filter dictionary results down to those spelled like the input word.

        Lookups are done on a normalized form (ذئب becomes ذءب), so the
        dictionary may answer with several spellings (ذئب, ذؤب). A result is
        kept when its 'vocalized' attribute, stripped of diacritics, equals
        the input word stripped of diacritics.

        @param word_vocalised: the input word.
        @type word_vocalised: unicode.
        @param resulted_data: the results found in the dictionary.
        @type resulted_data: list of objects exposing ``__dict__``.
        @return: the matching results, order preserved.
        @rtype: list.
        """
        wanted = araby.stripTashkeel(word_vocalised)
        return [
            entry for entry in resulted_data
            if 'vocalized' in entry.__dict__
            and wanted == araby.stripTashkeel(entry.__dict__['vocalized'])
        ]
Пример #4
0
	def getStemVariants(self, stem, prefix, suffix):
		"""
		Generate the noun stem variants according to the affixes.
		For example مدرستي=>مدرست+ي => مدرسة +ي.
		@param stem: the input stem.
		@type stem: unicode.
		@param prefix: the prefix (stripped of diacritics, otherwise unused).
		@type prefix: unicode.
		@param suffix: the suffix (compared without diacritics).
		@type suffix: unicode.
		@return: the stem and all its generated variants.
		@rtype: set of unicode.
		"""
		# start with the stem as it is
		variants = set([stem])
		prefix = araby.stripTashkeel(prefix)
		suffix = araby.stripTashkeel(suffix)
		# collect the endings to append to the stem
		extra_endings = []
		if suffix in (
				araby.ALEF + araby.TEH,
				araby.YEH + araby.TEH_MARBUTA,
				araby.YEH,
				araby.YEH + araby.ALEF + araby.TEH):
			extra_endings.append(araby.TEH_MARBUTA)
		if suffix == "" or suffix == araby.YEH + araby.NOON or suffix == araby.WAW + araby.NOON:
			extra_endings.append(araby.YEH)
		for ending in extra_endings:
			variants.add(stem + ending)
		# a final Yeh can also be read as an Alef Maksura
		if stem.endswith(araby.YEH):
			variants.add(stem[:-1] + araby.ALEF_MAKSURA)
		return variants
    def isPossibleCollocation(self, list2, context="", lenght=2):
        """
        Guess if the first two words of the given list may form a collocation.

        This is used to collect unknown collocations from user input.

        @param list2: list of words; only the first two are examined.
        @type list2: list of unicode.
        @param context: the context word, looked up in
            collocation_const.tab_noun_context.
        @type context: unicode.
        @param lenght: minimum number of words in the collocation.
        @type lenght: integer.
        @return: the code of the matching rule: 0 when the list is too short,
            -1 when one of the two words does not match the token pattern,
            10, 15, 20, 30, 40, 50 or 60 for the specific rules below,
            100 by default.
        @rtype: integer.
        """
        # The rules always read list2[0] and list2[1]: require two words even
        # when the caller asks for a smaller minimum, instead of raising
        # IndexError on a one-word list.
        if len(list2) < max(lenght, 2):
            return 0

        item1 = araby.stripTashkeel(list2[0])
        item2 = araby.stripTashkeel(list2[1])
        token_pat = collocation_const.token_pat
        if not token_pat.search(item1) or not token_pat.search(item2):
            return -1
        if item1 in collocation_const.ADDITIONAL_WORDS:
            return 10
        if item1 in collocation_const.NAMED_PRIOR:
            return 15
        if item2 not in collocation_const.SPECIAL_DEFINED:
            second_is_defined = item2.startswith(u'ال')
            if second_is_defined and item1.startswith(u'ال'):
                return 20
            if item1.endswith(u'ة') and second_is_defined:
                return 30
            # Words starting with the preposition Lam plus the definite
            # article need no rule of their own (original author's note:
            # they are always genitive).
            if item1.endswith(u'ة') and item2.endswith(u'ة'):
                return 40
            if (context != u""
                    and context in collocation_const.tab_noun_context
                    and second_is_defined):
                return 50
            if item1.endswith(u'ات') and second_is_defined:
                return 60
        return 100
Пример #6
0
    def isPossibleCollocation(self, list2, context="", lenght=2):
        """
        Rate the first two words of ``list2`` as a possible collocation.

        This is used to collect unknown collocations from user input.

        @param list2: list of words; only the first two are examined.
        @type list2: list of unicode.
        @param context: the context word, looked up in
            collocation_const.tab_noun_context.
        @type context: unicode.
        @param lenght: minimum number of words in the collocation.
        @type lenght: integer.
        @return: 0 when the list is too short, -1 when a word does not match
            the token pattern, otherwise the code of the first matching rule
            (10, 15, 20, 30, 40, 50, 60) or 100 when no rule matches.
        @rtype: integer.
        """
        # Two words are read unconditionally below, so a minimum smaller than
        # two must not let a shorter list through (it used to raise
        # IndexError).
        minimum = lenght if lenght > 2 else 2
        if len(list2) < minimum:
            return 0

        first = araby.stripTashkeel(list2[0])
        second = araby.stripTashkeel(list2[1])
        is_token = collocation_const.token_pat.search
        if not is_token(first) or not is_token(second):
            return -1
        elif first in collocation_const.ADDITIONAL_WORDS:
            return 10
        elif first in collocation_const.NAMED_PRIOR:
            return 15
        elif second not in collocation_const.SPECIAL_DEFINED:
            if second.startswith(u"ال") and first.startswith(u"ال"):
                return 20
            elif first.endswith(u"ة") and second.startswith(u"ال"):
                return 30
            # No dedicated rule for words starting with the preposition Lam
            # plus the definite article (original author's note: they are
            # always genitive).
            elif first.endswith(u"ة") and second.endswith(u"ة"):
                return 40
            elif context != u"" and context in collocation_const.tab_noun_context and second.startswith(u"ال"):
                return 50
            elif first.endswith(u"ات") and second.startswith(u"ال"):
                return 60
        return 100
Пример #7
0
	def generate_possible_conjug(self, infinitive_verb, unstemed_verb , affix, future_type=araby.FATHA, externPrefix="-", externSuffix="-", transitive=True):
		"""
		Find the conjugations of a verb that produce a given unvocalized form.

		The verb is conjugated for every (tense, pronoun) pair listed for
		``affix`` in stem_verb_const.Table_affix that is compatible with the
		external affixes; a conjugation is kept when, stripped of its
		diacritics, it equals ``unstemed_verb``.
		@param infinitive_verb: the verb to conjugate.
		@type infinitive_verb: unicode.
		@param unstemed_verb: the unvocalized form to be matched.
		@type unstemed_verb: unicode.
		@param affix: key of stem_verb_const.Table_affix; a leading Alef is
		ignored.
		@type affix: unicode.
		@param future_type: the future mark, converted by
		get_future_type_entree before use.
		@param externPrefix: external prefix, used for the compatibility test.
		@param externSuffix: external suffix, used for the compatibility test.
		@param transitive: passed to the verb class and the compatibility test.
		@type transitive: boolean.
		@return: list of dicts with the keys 'verb', 'tense', 'pronoun',
		'vocalized' and 'unvocalized'; an empty set when one of the first
		three arguments is empty.
		@rtype: list of dict.
		"""
		if infinitive_verb=="" or unstemed_verb=="" or affix=="":
			# NOTE(review): an empty set, not a list, is returned here; kept
			# as it is so existing callers see the same value.
			return set()
		future_type = libqutrub.ar_verb.get_future_type_entree(future_type)
		vb = libqutrub.classverb.verbclass(infinitive_verb, transitive, future_type)
		# The Alef is not part of the prefix: it only prevents the word from
		# starting with a vowelless letter, and the imperative conjugation
		# generates it.
		if affix.startswith(araby.ALEF):
			affix = affix[1:]

		list_correct_conj = []
		# `in` replaces dict.has_key(), which does not exist in Python 3
		if affix not in stem_verb_const.Table_affix:
			return list_correct_conj
		for pair in stem_verb_const.Table_affix[affix]:
			tense = pair[0]
			pronoun = pair[1]
			if not self.is_compatible_proaffix_tense(externPrefix, externSuffix, tense, pronoun, transitive):
				continue
			conj_vocalized = vb.conjugateTenseForPronoun(tense, pronoun)
			# strip all marks and shadda
			conj_nm = araby.stripTashkeel(conj_vocalized)
			if conj_nm == unstemed_verb:
				list_correct_conj.append({'verb':infinitive_verb, 'tense':tense, 'pronoun':pronoun, 'vocalized':conj_vocalized, 'unvocalized':conj_nm})
		return list_correct_conj
Пример #8
0
	def getSuffixVariant(self, word, suffix, enclitic):
		"""
		Adapt the suffix to the word and to the enclitic that follows it.
		For example: word = مدرس, suffix=ة, enclitic=ي. The Teh Marbuta of the
		suffix is converted to Teh.
		@param word: word found in dictionary.
		@type word: unicode.
		@param suffix: second level suffix; must be a key of
		stem_noun_const.CONJ_SUFFIX_LIST_TAGS.
		@type suffix: unicode.
		@param enclitic: first level suffix.
		@type enclitic: unicode.
		@return: the adapted suffix, and the adapted suffix without its last
		haraka when the given suffix is tagged u'متحرك'.
		@rtype: (unicode, unicode)
		"""
		bare_enclitic = araby.stripTashkeel(enclitic)
		if araby.TEH_MARBUTA in suffix and len(bare_enclitic) > 0:
			# an enclitic follows: the Teh Marbuta is written as Teh
			adapted = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
		elif not bare_enclitic and word[-1:] in (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF) and araby.isHaraka(suffix):
			# a haraka-only suffix is dropped after these final letters
			adapted = u""
		else:
			adapted = suffix
		# The tags are looked up with the given suffix: the adapted one may
		# not exist in the table.
		suffix_tags = stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix]['tags']
		if u'متحرك' in suffix_tags:
			without_irab_mark = araby.stripLastHaraka(adapted)
		else:
			without_irab_mark = adapted
		return adapted, without_irab_mark
Пример #9
0
	def getWordVariant(self, word, suffix):
		"""
		Adapt the end of the word to the suffix to be joined to it.
		For example: word = مدرسة, suffix=ي. The word is converted to مدرست.
		@param word: word found in dictionary.
		@type word: unicode.
		@param suffix: suffix (first or second level).
		@type suffix: unicode.
		@return: variant of word, without its last haraka.
		@rtype: unicode.
		"""
		bare_suffix = araby.stripTashkeel(suffix)
		stem = araby.stripLastHaraka(word)
		ends_with_teh_marbuta = stem.endswith(araby.TEH_MARBUTA)

		# before these suffixes the Teh Marbuta is simply removed
		if ends_with_teh_marbuta and bare_suffix in (
				araby.ALEF + araby.TEH,
				araby.YEH + araby.TEH_MARBUTA,
				araby.YEH,
				araby.YEH + araby.ALEF + araby.TEH):
			return stem[:-1]
		# every remaining rule needs a non-empty suffix
		if not (bare_suffix != u""):
			return stem
		if ends_with_teh_marbuta:
			return stem[:-1] + araby.TEH
		if stem.endswith(araby.ALEF_MAKSURA):
			return stem[:-1] + araby.YEH
		if stem.endswith(araby.HAMZA):
			# the seat of the Hamza follows the first mark of the suffix
			if suffix.startswith(araby.DAMMA):
				return stem[:-1] + araby.WAW_HAMZA
			if suffix.startswith(araby.KASRA):
				return stem[:-1] + araby.YEH_HAMZA
		return stem
Пример #10
0
    def getSuffixVariant(self, word, suffix, enclitic):
        """
        Get the suffix variant to be joined to the word.

        For example: word = مدرس, suffix=ة, enclitic=ي. The Teh Marbuta of
        the suffix is converted to Teh.

        @param word: word found in dictionary.
        @type word: unicode.
        @param suffix: second level suffix; it must be a key of
            stem_noun_const.CONJ_SUFFIX_LIST_TAGS.
        @type suffix: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @return: (vocalized suffix, vocalized suffix without I'rab short
            mark).
        @rtype: (unicode, unicode)
        """
        enclitic_letters = araby.stripTashkeel(enclitic)
        # by default the suffix is joined unchanged
        variant = suffix
        if araby.TEH_MARBUTA in suffix and len(enclitic_letters) > 0:
            variant = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
        elif not enclitic_letters:
            weak_endings = (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF)
            if word[-1:] in weak_endings and araby.isHaraka(suffix):
                variant = u""
        # The lookup uses the given suffix, because the changed one may be
        # missing from the table.
        is_vocalized = (
            u'متحرك' in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix]['tags'])
        variant_no_irab = (
            araby.stripLastHaraka(variant) if is_vocalized else variant)
        return variant, variant_no_irab
Пример #11
0
def Comparetashkeel(text):
	"""
	Compare the automatic vocalization of a text with its given vocalization.

	The given text is taken as the correct vocalization. It is stripped of
	its diacritics, vocalized again by the vocalizer, and both versions are
	tokenized and compared token by token with araby.vocalizedlike. The two
	token lists are printed for inspection.
	@param text: a correctly vocalized text.
	@type text: unicode.
	@return: [vocalized text, "correct:N%", "incorrect:N%", number of tokens
	in the given text]; both rates are 0.00% when the token lists differ in
	length or when the text has no token.
	@rtype: list.
	"""
	import tashkeel.tashkeel as ArabicVocalizer
	# the entered text is vocalized correctly
	correct_text = text
	text = araby.stripTashkeel(text)
	vocalizer = ArabicVocalizer.TashkeelClass()
	vocalized_text = vocalizer.tashkeel(text)

	# remove the collocation symbols before comparing
	cleaned_text = vocalized_text.replace("'", "").replace("~", "")

	list1 = vocalizer.analyzer.tokenize(correct_text)
	list2 = vocalizer.analyzer.tokenize(cleaned_text)
	# parenthesised single-argument print is valid on Python 2 and 3
	print(u":".join(list1).encode('utf8'))
	print(u":".join(list2).encode('utf8'))
	correct = 0
	incorrect = 0
	total = len(list1)
	if total != len(list2):
		print("lists haven't the same length")
	else:
		for expected, produced in zip(list1, list2):
			if araby.vocalizedlike(expected, produced):
				correct += 1
			else:
				incorrect += 1

	# a text without tokens would otherwise raise ZeroDivisionError
	if total:
		correct_rate = round(correct * 100.00 / total, 2)
		incorrect_rate = round(incorrect * 100.00 / total, 2)
	else:
		correct_rate = incorrect_rate = 0.0
	return [vocalized_text, "correct:%0.2f%%" % correct_rate, "incorrect:%0.2f%%" % incorrect_rate, total]
Пример #12
0
	def generate_possible_conjug(self, infinitive_verb, unstemed_verb , affix, future_type=araby.FATHA, externPrefix="-", externSuffix="-", transitive=True):
		"""
		Find the conjugations of a verb that produce a given unvocalized form.

		The verb is conjugated for every (tense, pronoun) pair listed for
		``affix`` in stem_verb_const.Table_affix that is compatible with the
		external affixes; a conjugation is kept when, stripped of its
		diacritics, it equals ``unstemed_verb``.
		@param infinitive_verb: the verb to conjugate.
		@type infinitive_verb: unicode.
		@param unstemed_verb: the unvocalized form to be matched.
		@type unstemed_verb: unicode.
		@param affix: key of stem_verb_const.Table_affix; a leading Alef is
		ignored.
		@type affix: unicode.
		@param future_type: the future mark, converted by
		get_future_type_entree before use.
		@param externPrefix: external prefix, used for the compatibility test.
		@param externSuffix: external suffix, used for the compatibility test.
		@param transitive: passed to the verb class and the compatibility test.
		@type transitive: boolean.
		@return: list of dicts with the keys 'verb', 'tense', 'pronoun',
		'vocalized' and 'unvocalized'; an empty set when one of the first
		three arguments is empty.
		@rtype: list of dict.
		"""
		if infinitive_verb=="" or unstemed_verb=="" or affix=="":
			# NOTE(review): an empty set, not a list, is returned here; kept
			# as it is so existing callers see the same value.
			return set()
		future_type = ar_verb.get_future_type_entree(future_type)
		vb = classverb.verbclass(infinitive_verb, transitive, future_type)
		# The Alef is not part of the prefix: it only prevents the word from
		# starting with a vowelless letter, and the imperative conjugation
		# generates it.
		if affix.startswith(araby.ALEF):
			affix = affix[1:]

		list_correct_conj = []
		# one membership test is enough; the former list of tenses was built
		# but never read
		if affix in stem_verb_const.Table_affix:
			for pair in stem_verb_const.Table_affix[affix]:
				tense = pair[0]
				pronoun = pair[1]
				if self.is_compatible_proaffix_tense(externPrefix, externSuffix, tense, pronoun, transitive):
					conj_vocalized = vb.conjugateTenseForPronoun(tense, pronoun)
					# strip all marks and shadda
					conj_nm = araby.stripTashkeel(conj_vocalized)
					if conj_nm == unstemed_verb:
						list_correct_conj.append({'verb':infinitive_verb, 'tense':tense, 'pronoun':pronoun, 'vocalized':conj_vocalized, 'unvocalized':conj_nm})
		return list_correct_conj
Пример #13
0
    def getWordVariant(self, word, suffix):
        """
        Get the word variant to be joined to the suffix.

        For example: word = مدرسة, suffix=ي. The word is converted to مدرست.

        @param word: word found in dictionary.
        @type word: unicode.
        @param suffix: suffix (first or second level).
        @type suffix: unicode.
        @return: variant of word.
        @rtype: unicode.
        """
        stem = word
        bare_suffix = araby.stripTashkeel(suffix)
        # drop a haraka written at the end of the word
        if stem[-1:] in araby.HARAKAT:
            stem = stem[:-1]

        removing_suffixes = (
            araby.ALEF + araby.TEH,
            araby.YEH + araby.TEH_MARBUTA,
            araby.YEH,
            araby.YEH + araby.ALEF + araby.TEH,
        )
        if stem.endswith(araby.TEH_MARBUTA) and bare_suffix in removing_suffixes:
            # the Teh Marbuta disappears before these suffixes
            stem = stem[:-1]
        elif bare_suffix != u"":
            # the other changes apply before any non-empty suffix
            if stem.endswith(araby.TEH_MARBUTA):
                stem = stem[:-1] + araby.TEH
            elif stem.endswith(araby.ALEF_MAKSURA):
                stem = stem[:-1] + araby.YEH
            elif stem.endswith(araby.HAMZA):
                # the seat of the Hamza follows the first mark of the suffix
                if suffix.startswith(araby.DAMMA):
                    stem = stem[:-1] + araby.WAW_HAMZA
                elif suffix.startswith(araby.KASRA):
                    stem = stem[:-1] + araby.YEH_HAMZA
        return stem
Пример #14
0
	def check_word(self, word, guessedTag=""):
		"""
		Analyze one word morphologically.

		The word is checked as punctuation and as a stop word, then as a verb
		and as a noun when the guessed tag allows it, and as an unknown word
		when nothing was found so far. The analyses then go through
		check_normalized, check_shadda, optionally check_partial_vocalized,
		and addWordFrequency.
		@param word: the input word.
		@type word: unicode.
		@param guessedTag: the tag guessed for the word, given to the tagger
		to decide whether the verb and noun analyses are run.
		@type guessedTag: unicode.
		@return: list of analyzed cases; when no analysis is left, a single
		wordCase of type 'unknown' built from the word itself.
		@rtype: list.
		"""
		word = araby.stripTatweel(word)
		word_vocalised = word
		word_nm = araby.stripTashkeel(word)
		analyses = []
		# punctuation
		analyses += self.check_word_as_pounct(word_nm)
		# A stop word can also be a normal word (verb or noun), so the stop
		# word analysis does not end the search.
		analyses += self.check_word_as_stopword(word_nm)

		# verb analysis
		# Known problem: some stop words are also considered verbs or nouns.
		if self.tagger.hasVerbTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
			analyses += self.check_word_as_verb(word_nm)
		# noun analysis
		if self.tagger.hasNounTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
			analyses += self.check_word_as_noun(word_nm)
		# nothing found: analyze the word as unknown
		if len(analyses) == 0:
			analyses += self.check_word_as_unknown(word_nm)

		# keep the analyses equivalent to the given word once normalized
		analyses = self.check_normalized(word_vocalised, analyses)
		# check the analyses against the shadda of the given word
		analyses = self.check_shadda(word_vocalised, analyses)
		# check the analyses against a partial vocalization of the word
		if self.partial_vocalization_support:
			analyses = self.check_partial_vocalized(word_vocalised, analyses)
		# add word frequency information
		analyses = self.addWordFrequency(analyses)

		if len(analyses) == 0:
			# every analysis was filtered out: answer with an unknown case
			unknown_case = {
				'word': word,
				'affix': ('', '', '', ''),
				'stem': '',
				'original': word,
				'vocalized': word,
				'tags': u'',
				'type': 'unknown',
				'root': '',
				'template': '',
				'freq': self.wordfreq.getFreq(word, 'unknown'),
				'syntax': '',
			}
			analyses.append(wordCase.wordCase(unknown_case))
		return analyses
Пример #15
0
	def check_word(self,word, guessedTag=""):
		"""
		Analyze one word morphologically.

		The word is checked as punctuation and as a stop word, then as a verb
		and as a noun when the guessed tag allows it, and as an unknown word
		when nothing was found so far. The analyses then go through
		check_normalized, check_shadda, optionally check_partial_vocalized,
		and addWordFrequency.
		@param word: the input word.
		@type word: unicode.
		@param guessedTag: the tag guessed for the word, given to the tagger
		to decide whether the verb and noun analyses are run.
		@type guessedTag: unicode.
		@return: list of analyzed cases; when no analysis is left, a single
		wordCase of type 'unknown' built from the word itself.
		@rtype: list.
		"""	
		word=araby.stripTatweel(word);
		# keep the word with its diacritics for the checks done after analysis
		word_vocalised=word;
		word_nm=araby.stripTashkeel(word);
		# NOTE(review): resulted_text is never read in this method
		resulted_text=u"";
		resulted_data=[];
		# if word is a punctuation
		resulted_data+=self.check_word_as_pounct(word_nm);
		# A stop word can also be another normal word (verb or noun),
		# so the stop word analysis does not end the search.
		resulted_data+=self.check_word_as_stopword(word_nm);

		# if word is verb
		# Known problem: some stop words are also considered verbs or nouns.
		if  self.tagger.hasVerbTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
			resulted_data+=self.check_word_as_verb(word_nm);
		# if word is noun
		if self.tagger.hasNounTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):			
			resulted_data+=self.check_word_as_noun(word_nm);
		if len(resulted_data)==0:
			# nothing found: check the word as unknown
			resulted_data+=self.check_word_as_unknown(word_nm);
		# keep the results equivalent to the given word once normalized
		resulted_data = self.check_normalized(word_vocalised, resulted_data)
		# check if the word is shadda like
		resulted_data = self.check_shadda(word_vocalised, resulted_data)

		# check if the word is vocalized like results
		if self.partial_vocalization_support:
			resulted_data=self.check_partial_vocalized(word_vocalised, resulted_data);
		# add word frequency information in tags
		resulted_data = self.addWordFrequency(resulted_data);

		# every analysis was filtered out: answer with a single unknown case
		if len(resulted_data)==0:
			resulted_data.append(wordCase.wordCase({
			'word':word,  
			'affix': ('' , '', '', ''),       
			'stem':'',
			'original':word,
			'vocalized':word,
			'tags':u'',
			'type':'unknown',
			'root':'',
			'template':'',
			'freq':self.wordfreq.getFreq(word, 'unknown'),
			'syntax':'',
			})
			);
		return resulted_data;
Пример #16
0
    def setVocalized(self, newvocalized):
        """
        Set the vocalized word and recompute its unvocalized form.

        @param newvocalized: the new given vocalized.
        @type newvocalized: unicode string
        """
        self.vocalized = newvocalized
        # keep the stored unvocalized form in step with the new vocalized word
        self.unvocalized = araby.stripTashkeel(newvocalized)
Пример #17
0
	def setVocalized(self, newvocalized):
		"""
		Store a new vocalized word together with its unvocalized form.
		@param newvocalized: the new given vocalized.
		@type newvocalized: unicode string
		"""
		self.vocalized = newvocalized
		# the unvocalized form is derived from the word just stored
		without_marks = araby.stripTashkeel(newvocalized)
		self.unvocalized = without_marks
Пример #18
0
def vocalizeNamed(wordlist, synTags=""):
    """ Vocalize the words of a named entity.

    Only the word بن is vocalized; every other word is returned as given.
    The first بن takes its final mark from the tags, the following ones
    always take a kasra.

    @param wordlist: words to vocalize
    @type wordlist: unicode list
    @param synTags: tags about the clause; a tag may be appended according
        to the first word of the list
    @type synTags: unicode
    @return: the vocalized wordlist.
    @rtype: unicode list
    """
    # tag implied by the first word of the name
    first_word_tags = {
        u'أبي': u"مجرور",
        u'بنو': u"مجرور",
        u'آل': u"مجرور",
        u'ابن': u"مجرور",
        u'أبو': u"مرفوع",
        u'أبا': u"منصوب",
    }
    # vocalization of the first بن, tested in this order
    first_bin_forms = (
        (u'مجرور', u'بْنِ'),
        (u'مرفوع', u'بْنُ'),
        (u'منصوب', u'بْنَ'),
    )
    tags = synTags
    vocalized_words = []
    bin_seen = 0
    for position, original_word in enumerate(wordlist):
        # the original word keeps its possible harakat
        bare_word = araby.stripTashkeel(original_word)
        if position == 0 and bare_word and bare_word in first_word_tags:
            tags += first_word_tags[bare_word]

        if bare_word != u'بن':
            # Todo: vocalize names
            vocalized_words.append(original_word)
            continue

        bin_seen += 1
        if bin_seen > 1:
            # every بن after the first one takes a kasra
            voc = u'بْنِ'
        else:
            voc = u'بْن'
            for tag, form in first_bin_forms:
                if tag in tags:
                    voc = form
                    break
        vocalized_words.append(voc)
    return vocalized_words
Пример #19
0
	def add(self, word, suggestList):
		"""
		Store the suggestions of a word under its unvocalized form.
		Nothing is stored when the word is empty, or when the suggestions
		are empty or are not exactly a list. Earlier suggestions of the same
		word are replaced.
		@param word: the word the suggestions belong to.
		@type word: unicode.
		@param suggestList: the suggestions.
		@type suggestList: list.
		"""
		accepted = word != u"" and suggestList != [] and type(suggestList).__name__ == 'list'
		if not accepted:
			return
		# ToDo: merge the different suggestions of one word into one list;
		# NB: this would eat time when the word is frequent.
		key = araby.stripTashkeel(word)
		self.dict[key] = suggestList
Пример #20
0
 def add(self, word, suggestList):
     """
     Remember the suggestions of a word, keyed by its unvocalized form.

     The call is ignored when the word is empty, or when the suggestions
     are empty or are not exactly a list. Suggestions already stored for
     the word are overwritten.

     @param word: the word the suggestions belong to.
     @type word: unicode.
     @param suggestList: the suggestions.
     @type suggestList: list.
     """
     if word != u"" and suggestList != []:
         if type(suggestList).__name__ == 'list':
             # ToDo: merge the different suggestions of one word into one
             # list; NB: this would eat time when the word is frequent.
             self.dict[araby.stripTashkeel(word)] = suggestList
Пример #21
0
def vocalizeNamed(wordlist, synTags=""):
	""" Vocalize the words of a named entity.
	Only the word بن is vocalized; the other words are returned as given.
	@param wordlist: words to vocalize
	@type wordlist: unicode list
	@param synTags: tags about the clause; a tag may be appended according
	to the first word of the list
	@type synTags: unicode
	@return: the vocalized wordlist.
	@rtype: unicode list
	"""
	tags = synTags
	result = []
	bin_count = 0
	for index, word in enumerate(wordlist):
		# the original word keeps its possible harakat
		word_nm = araby.stripTashkeel(word)
		# the first word of the name decides the case tag
		if index == 0 and word_nm:
			if word_nm in (u'أبي', u'بنو', u'آل', u'ابن'):
				tags += u"مجرور"
			elif word_nm == u'أبو':
				tags += u"مرفوع"
			elif word_nm == u'أبا':
				tags += u"منصوب"

		if word_nm == u'بن':
			bin_count += 1
			if bin_count != 1:
				# every بن after the first one takes a kasra
				voc = u'بْنِ'
			elif u'مجرور' in tags:
				voc = u'بْنِ'
			elif u'مرفوع' in tags:
				voc = u'بْنُ'
			elif u'منصوب' in tags:
				voc = u'بْنَ'
			else:
				voc = u'بْن'
		else:
			# Todo: vocalize names
			voc = word
		result.append(voc)
	return result
Пример #22
0
    def getUnvOriginal(self):
        """
        Get the unvocalized original form of the input word.

        The value is computed from the original form on first use and
        stored for the next calls.

        @return: the unvocalized original, or an empty string when neither
            it nor the original form is set.
        @rtype: unicode string
        """
        if not self.unvoriginal:
            if not self.original:
                return u""
            # compute once, then reuse the stored value
            self.unvoriginal = araby.stripTashkeel(self.original)
        return self.unvoriginal
	def getUnvocalized(self):
		"""
		Get the unvocalized form of the input word.
		The value is computed from the vocalized form on first use and
		stored for the next calls.
		@return: the unvocalized form, or an empty string when neither it
		nor the vocalized form is set.
		@rtype: unicode string
		"""
		if not self.unvocalized:
			if not self.vocalized:
				return u""
			# compute once, then reuse the stored value
			self.unvocalized = araby.stripTashkeel(self.vocalized)
		return self.unvocalized
Пример #24
0
	def check_normalized(self, word_vocalised, resulted_data):
		"""
		Keep only the analyses whose letters match the input word.

		The dictionary lookup works on a normalized form, so one input can
		bring back several spellings: ذئب is normalized to ذءب, which may
		return both ذئب and ذؤب. Each result is compared with the input after
		stripping diacritics from both, and only equal ones are kept.
		@param word_vocalised: the input word.
		@type word_vocalised: unicode.
		@param resulted_data: the results found in the dictionary; objects
		with a 'vocalized' attribute.
		@type resulted_data: list.
		@return: the results matching the input word, order preserved.
		@rtype: list.
		"""
		inputword = araby.stripTashkeel(word_vocalised)
		filtred_data = []
		for item in resulted_data:
			# A result without the attribute is skipped like one with an
			# empty value, instead of raising AttributeError.
			vocalized = getattr(item, 'vocalized', None)
			if not vocalized:
				continue
			if inputword == araby.stripTashkeel(vocalized):
				filtred_data.append(item)
		return filtred_data
	def getUnvOriginal(self):
		"""
		Get the unvocalized original form of the input word.
		It is derived from the original form the first time it is needed,
		then kept.
		@return: the unvocalized original; an empty string when there is no
		original form to derive it from.
		@rtype: unicode string
		"""
		if self.unvoriginal:
			return self.unvoriginal
		if not self.original:
			return u""
		# derive and keep the value for the next calls
		self.unvoriginal = araby.stripTashkeel(self.original)
		return self.unvoriginal
Пример #26
0
    def getUnvocalized(self):
        """
        Get the unvocalized form of the input word.

        It is derived from the vocalized form the first time it is needed,
        then kept.

        @return: the unvocalized form; an empty string when there is no
            vocalized form to derive it from.
        @rtype: unicode string
        """
        if self.unvocalized:
            return self.unvocalized
        if not self.vocalized:
            return u""
        # derive and keep the value for the next calls
        self.unvocalized = araby.stripTashkeel(self.vocalized)
        return self.unvocalized
Пример #27
0
    def check_normalized(self, word_vocalised, resulted_data):
        """
        Filter dictionary results down to those spelled like the input word.

        Lookups are done on a normalized form (ذئب becomes ذءب), so the
        dictionary may answer with several spellings (ذئب, ذؤب). A result is
        kept when its 'vocalized' attribute, stripped of diacritics, equals
        the input word stripped of diacritics.

        @param word_vocalised: the input word.
        @type word_vocalised: unicode.
        @param resulted_data: the results found in the dictionary; objects
            with a 'vocalized' attribute.
        @type resulted_data: list.
        @return: the matching results, order preserved.
        @rtype: list.
        """
        inputword = araby.stripTashkeel(word_vocalised)
        filtred_data = []
        for item in resulted_data:
            # The default makes a result lacking the attribute behave like
            # one with an empty value: it is skipped, not an AttributeError.
            vocalized = getattr(item, 'vocalized', None)
            if vocalized and inputword == araby.stripTashkeel(vocalized):
                filtred_data.append(item)
        return filtred_data
Пример #28
0
    def create_index_broken_plural(self):
        """Deprecated: create an index of the broken_plural dictionary
        to accelerate the search in the dictionary for broken_plural.

        Every key of BrokenPluralTable is stripped of its diacritics and
        passed to normalize_hamza; the vocalized key is then appended to the
        list stored under that normalized form in
        self.BROKENPLURAL_DICTIONARY_INDEX.
        """
        index = self.BROKENPLURAL_DICTIONARY_INDEX
        for vocnoun in BrokenPluralTable.keys():
            unvnoun = araby.stripTashkeel(vocnoun)
            normnoun = normalize_hamza(unvnoun)
            # `in` replaces dict.has_key(), which does not exist in Python 3
            if normnoun in index:
                index[normnoun].append(vocnoun)
            else:
                index[normnoun] = [vocnoun]
Пример #29
0
    def create_index_broken_plural(self):
        """Deprecated: create an index of the broken_plural dictionary
        to accelerate the search in the dictionary for broken_plural.

        The index maps the normalized, unvocalized form of each key of
        BrokenPluralTable to the list of vocalized keys sharing that form;
        it is stored in self.BROKENPLURAL_DICTIONARY_INDEX.
        """
        for vocnoun in BrokenPluralTable.keys():
            normnoun = normalize_hamza(araby.stripTashkeel(vocnoun))
            # The membership test uses `in`: dict.has_key() does not exist
            # in Python 3.
            if normnoun not in self.BROKENPLURAL_DICTIONARY_INDEX:
                self.BROKENPLURAL_DICTIONARY_INDEX[normnoun] = []
            self.BROKENPLURAL_DICTIONARY_INDEX[normnoun].append(vocnoun)
Пример #30
0
def getPreviousTag(word):
    """Return the case tag associated with a given word.

    The word is stripped of its tashkeel, then looked up in three lists of
    named_const, in this order: NOUN_NASEB_LIST, JAR_LIST, RAFE3_LIST.
    @param word: given word
    @type word: unicode
    @return: u'منصوب', u'مجرور' or u'مرفوع' for the first list that contains
        the word; an empty string when no list contains it.
    @rtype: unicode
    """
    word = araby.stripTashkeel(word)
    if word in named_const.NOUN_NASEB_LIST:
        return u'منصوب'
    if word in named_const.JAR_LIST:
        return u'مجرور'
    if word in named_const.RAFE3_LIST:
        return u'مرفوع'
    return u''
Пример #31
0
def treatLine(line, action):
    """Treat one line at once, according to the requested action.

    - "extract": the line is tokenized with araby.tokenize and every token is
      passed to extract().
    - "reduce": the line is split on single spaces; the first field is read as
      a frequency and the second as a word. The word is stored in WordsTab
      under its unvocalized form and the global counter globalFreq is
      increased by stringToInt(frequency). Lines with fewer than two fields
      are ignored.
    Any other action does nothing.
    @param line: the line to treat.
    @param action: "extract" or "reduce".
    """
    global globalFreq
    if action == "extract":
        for word in araby.tokenize(line):
            extract(word)
    elif action == "reduce":
        fields = line.strip(' ').split(' ')
        if len(fields) >= 2:
            freq, word = fields[0], fields[1]
            word_nm = araby.stripTashkeel(word)
            # "in" replaces dict.has_key(), which no longer exists in Python 3
            if word_nm in WordsTab:
                # the unvocalized form was already seen: the word has
                # multiple vocalizations, the entry is replaced by False
                WordsTab[word_nm] = False
            else:
                WordsTab[word_nm] = {'f': freq, 'v': word}
            globalFreq += stringToInt(freq)
Пример #32
0
	def getSuffixVariant(self, word, suffix, enclitic):
		"""
		Adapt the suffix to the word and the enclitic it is joined to.
		For example: word = مدرس, suffix=ة, encletic=ي. The suffix is converted to Teh.
		@param word: word found in dictionary.
		@type word: unicode.
		@param suffix: second level suffix.
		@type suffix: unicode.
		@param enclitic: first level suffix.
		@type enclitic: unicode.
		@return: variant of suffix.
		@rtype: unicode.
		"""
		has_enclitic = len(araby.stripTashkeel(enclitic)) > 0
		if has_enclitic:
			# an enclitic follows: every TEH_MARBUTA of the suffix becomes TEH
			if araby.TEH_MARBUTA in suffix:
				suffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
		else:
			# no enclitic: a suffix found in araby.HARAKAT is dropped after a
			# word ending with ALEF_MAKSURA, YEH or ALEF
			weak_endings = (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF)
			if word[-1:] in weak_endings and suffix in araby.HARAKAT:
				suffix = u""
		return suffix
Пример #33
0
    def getSuffixVariant(self, word, suffix, enclitic):
        """
        Adapt the suffix to the word and the enclitic it is joined to.
        For example: word = مدرس, suffix=ة, encletic=ي. The suffix is converted to Teh.
        @param word: word found in dictionary.
        @type word: unicode.
        @param suffix: second level suffix.
        @type suffix: unicode.
        @param enclitic: first level suffix.
        @type enclitic: unicode.
        @return: variant of suffix.
        @rtype: unicode.
        """
        has_enclitic = len(araby.stripTashkeel(enclitic)) > 0
        if has_enclitic:
            # an enclitic follows: every TEH_MARBUTA of the suffix becomes TEH
            if araby.TEH_MARBUTA in suffix:
                suffix = re.sub(araby.TEH_MARBUTA, araby.TEH, suffix)
        else:
            # no enclitic: a suffix found in araby.HARAKAT is dropped after a
            # word ending with ALEF_MAKSURA, YEH or ALEF
            weak_endings = (araby.ALEF_MAKSURA, araby.YEH, araby.ALEF)
            if word[-1:] in weak_endings and suffix in araby.HARAKAT:
                suffix = u""
        return suffix
Пример #34
0
	def generateSuggest(self, word):
		"""
		Generate spelling suggestions for a word.
		The result holds the word itself, its unvocalized form, every
		candidate returned by self.edits1(word) (kept without filtering), and
		every variant built with the replacement pairs of
		spellcheck_const.TabReplacment that self.accepted() approves.
		@param word: input text.
		@type word: unicode.
		@return: generated suggestions, without duplicates, in no particular order.
		@rtype: list of words.
		"""
		wordlist = [word, araby.stripTashkeel(word)]
		# the edits1 candidates are not checked with self.accepted(); the
		# former "if True:" guard around them was always taken
		wordlist.extend(self.edits1(word))
		# common letter error replacement
		for tup in spellcheck_const.TabReplacment:
			sug = word.replace(tup[0], tup[1])
			# evaluate the generated suggestion only when it differs from the word
			if sug != word and self.accepted(sug):
				wordlist.append(sug)
		return list(set(wordlist))
Пример #35
0
	def generateSuggest(self, word):
		"""
		Generate spelling suggestions for a word.
		The result holds the word itself, its unvocalized form, every
		candidate returned by self.edits1(word) (kept without filtering), and
		every variant built with the replacement pairs of
		spellcheck_const.TabReplacment that self.accepted() approves.
		@param word: input text.
		@type word: unicode.
		@return: generated suggestions, without duplicates, in no particular order.
		@rtype: list of words.
		"""
		wordlist = [word, araby.stripTashkeel(word)]
		# the edits1 candidates are not checked with self.accepted(); the
		# former "if True:" guard around them was always taken
		wordlist.extend(self.edits1(word))
		# common letter error replacement
		for tup in spellcheck_const.TabReplacment:
			sug = word.replace(tup[0], tup[1])
			# evaluate the generated suggestion only when it differs from the word
			if sug != word and self.accepted(sug):
				wordlist.append(sug)
		return list(set(wordlist))
Пример #36
0
    def verbStamp(self, word):
        """
        Generate a stamp for a verb.
        The verb stamp differs from the word stamp by the hamza normalization.
        Steps: strip the tashkeel, normalize the hamza, drop a leading HAMZA,
        drop the last letter when it repeats the one before it, then delete
        everything matched by self.VerbSTAMP_pat (according to the original
        documentation: ALEF, YEH, WAW, ALEF_MAKSURA and SHADDA, the letters
        which can change form in the word).
        @param word: the verb.
        @return: stamped word
        """
        stamp = araby.normalizeHamza(araby.stripTashkeel(word))
        # strip the first hamza
        if stamp.startswith(araby.HAMZA):
            stamp = stamp[1:]
        # strip the last letter if it is doubled
        last_letter = stamp[-1:]
        before_last = stamp[-2:-1]
        if last_letter == before_last:
            stamp = stamp[:-1]
        return self.VerbSTAMP_pat.sub('', stamp)
Пример #37
0
	def verbStamp(self, word):
		"""
		Generate a stamp for a verb.
		The verb stamp differs from the word stamp by the hamza normalization.
		Steps: strip the tashkeel, normalize the hamza, drop a leading HAMZA,
		drop the last letter when it repeats the one before it, then delete
		everything matched by self.VerbSTAMP_pat (according to the original
		documentation: ALEF, YEH, WAW, ALEF_MAKSURA and SHADDA, the letters
		which can change form in the word).
		@param word: the verb.
		@return: stamped word
		"""
		stamp = araby.normalizeHamza(araby.stripTashkeel(word))
		# strip the first hamza
		if stamp.startswith(araby.HAMZA):
			stamp = stamp[1:]
		# strip the last letter if it is doubled
		last_letter = stamp[-1:]
		before_last = stamp[-2:-1]
		if last_letter == before_last:
			stamp = stamp[:-1]
		return self.VerbSTAMP_pat.sub('', stamp)
def treatLine(line, action):
    """Treat one line at once, according to the requested action.

    - "extract": the line is tokenized with araby.tokenize and every token is
      passed to extract().
    - "reduce": the line is split on single spaces; the first field is read as
      a frequency and the second as a word. The word is stored in WordsTab
      under its unvocalized form and the global counter globalFreq is
      increased by stringToInt(frequency). Lines with fewer than two fields
      are ignored.
    Any other action does nothing.
    @param line: the line to treat.
    @param action: "extract" or "reduce".
    """
    global globalFreq
    if action == "extract":
        for word in araby.tokenize(line):
            extract(word)
    elif action == "reduce":
        fields = line.strip(' ').split(' ')
        if len(fields) >= 2:
            freq, word = fields[0], fields[1]
            word_nm = araby.stripTashkeel(word)
            # "in" replaces dict.has_key(), which no longer exists in Python 3
            if word_nm in WordsTab:
                # the unvocalized form was already seen: the word has
                # multiple vocalizations, the entry is replaced by False
                WordsTab[word_nm] = False
            else:
                WordsTab[word_nm] = {'f': freq, 'v': word}
            globalFreq += stringToInt(freq)
Пример #39
0
    def segment(self, word):
        """ Generate the set of all possible segmentation positions (left, right) of the word treated by the stemmer.
        Example:
            >>> ArListem=ArabicLightStemmer();
            >>> word=u'فتضربين'
            >>> print ArListem.segment(word);
            set([(1, 5), (2, 5), (0, 7)])

        Side effects: sets self.word, self.unvocalized, self.left, self.right
        and self.segment_list.

        @param word: the word to segment.
        @type word: unicode
        @return: List of segmentation
        @rtype: set of tuple of integer.
        """
        self.word = word
        self.unvocalized = araby.stripTashkeel(word)
        # ALEF_MADDA is expanded into HAMZA + ALEF; note that the affix lookups
        # below receive this word, not the unvocalized form stored above
        word = re.sub("[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF,
                      word)

        # positions where a prefix may end / where a suffix may start
        prefix_ends = self.lookup_prefixes(word)
        suffix_starts = self.lookup_suffixes(word)
        self.left = max(prefix_ends) if prefix_ends else -1
        self.right = min(suffix_starts) if suffix_starts else -1

        # the whole word is always a candidate; any other pair is kept only
        # when it leaves at least two characters between left and right
        self.segment_list = set([(0, len(word))])
        self.segment_list.update(
            (start, end)
            for start in prefix_ends
            for end in suffix_starts
            if end >= start + 2)
        return self.segment_list
Пример #40
0
def Comparetashkeel(text):
    """Vocalize a text automatically and compare the result with the input.

    The given text is taken as the correctly vocalized reference. Its
    tashkeel is stripped, the bare text is vocalized again with
    TashkeelClass.tashkeel, and both versions are tokenized and compared
    token by token with araby.vocalizedlike. Both token lists are printed.
    When the two token lists do not have the same length a message is printed
    and no token is counted (both rates are then 0).
    @param text: correctly vocalized text.
    @type text: unicode
    @return: [vocalized text, "correct:NN.NN%", "incorrect:NN.NN%", number of
        reference tokens]. Both rates are 0.00% when the reference has no token.
    @rtype: list
    """
    import tashkeel.tashkeel as ArabicVocalizer
    # the entered text is vocalized correctly: it is the reference
    correct_text = text
    vocalizer = ArabicVocalizer.TashkeelClass()
    vocalized_text = vocalizer.tashkeel(araby.stripTashkeel(text))

    # remove the collocation symbols before comparing
    compared_text = vocalized_text.replace("'", "").replace("~", "")

    list1 = vocalizer.analyzer.tokenize(correct_text)
    list2 = vocalizer.analyzer.tokenize(compared_text)
    # single-argument print(...) behaves the same on Python 2 and Python 3
    print(u":".join(list1).encode('utf8'))
    print(u":".join(list2).encode('utf8'))

    correct = 0
    incorrect = 0
    total = len(list1)
    if total != len(list2):
        print("lists haven't the same length")
    else:
        for expected, produced in zip(list1, list2):
            if araby.vocalizedlike(expected, produced):
                correct += 1
            else:
                incorrect += 1

    def percent(count):
        # an empty reference used to raise ZeroDivisionError here
        if not total:
            return 0.0
        return round(count * 100.00 / total, 2)

    return [
        vocalized_text,
        "correct:%0.2f%%" % percent(correct),
        "incorrect:%0.2f%%" % percent(incorrect),
        total,
    ]
Пример #41
0
	def segment(self, word):
		""" Generate the set of all possible segmentation positions (left, right) of the word treated by the stemmer.
		Example:
			>>> ArListem=ArabicLightStemmer();
			>>> word=u'فتضربين'
			>>> print ArListem.segment(word);
			set([(1, 5), (2, 5), (0, 7)])

		Side effects: sets self.word, self.unvocalized, self.left, self.right
		and self.segment_list.

		@param word: the word to segment.
		@type word: unicode
		@return: List of segmentation
		@rtype: set of tuple of integer.
		"""
		self.word = word
		self.unvocalized = araby.stripTashkeel(word)
		# ALEF_MADDA is expanded into HAMZA + ALEF; note that the affix lookups
		# below receive this word, not the unvocalized form stored above
		word = re.sub("[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF, word)

		# positions where a prefix may end / where a suffix may start
		prefix_ends = self.lookup_prefixes(word)
		suffix_starts = self.lookup_suffixes(word)
		self.left = max(prefix_ends) if prefix_ends else -1
		self.right = min(suffix_starts) if suffix_starts else -1

		# the whole word is always a candidate; any other pair is kept only
		# when it leaves at least two characters between left and right
		self.segment_list = set([(0, len(word))])
		self.segment_list.update(
			(start, end)
			for start in prefix_ends
			for end in suffix_starts
			if end >= start + 2)
		return self.segment_list
Пример #42
0
def DoAction(text, action, options=None):
    """Dispatch a text to the treatment named by ``action``.

    @param text: the input text.
    @param action: name of the treatment (e.g. "TashkeelText", "SpellCheck",
        "StripHarakat"). An unknown name, "DoNothing" and "Contibute" return
        the text unchanged.
    @param options: optional settings; only the 'lastmark' key is read here
        (default "0"), by the "TashkeelText", "Tashkeel2" and "LightStemmer"
        actions.
    @type options: dict or None
    @return: whatever the selected treatment returns.
    """
    # None instead of a shared mutable {} default
    if options is None:
        options = {}

    if action == "DoNothing":
        return text
    elif action == "TashkeelText":
        lastmark = options.get('lastmark', "0")
        return tashkeelText(text, lastmark)
    elif action == "Tashkeel2":
        lastmark = options.get('lastmark', "0")
        return tashkeel2(text, lastmark)
    elif action == "SpellCheck":
        return spellcheck(text)
    elif action == "CompareTashkeel":
        return Comparetashkeel(text)
    elif action == "ReduceTashkeel":
        return reducedTashkeelText(text)
    # NOTE: "Contibute" (sic) is the literal action name compared here; the
    # spelling is kept because callers send this exact string
    elif action == "Contibute":
        return text
    elif action == "StripHarakat":
        return araby.stripTashkeel(text)
    elif action == "CsvToData":
        return csv_to_python_table(text)
    elif action == "Romanize":
        return romanize(text)
    elif action == "NumberToLetters":
        return numberToLetters(text)
    elif action == "LightStemmer":
        lastmark = options.get('lastmark', "0")
        return fullStemmer(text, lastmark)
    elif action == "Tokenize":
        return token_text(text)
    elif action == "Poetry":
        return justify_poetry(text)
    elif action == "Unshape":
        # imported only when this action is requested
        import pyarabic.unshape
        return pyarabic.unshape.unshaping_text(text)
    elif action == "Affixate":
        return affixate(text)
    elif action == "Normalize":
        return normalize(text)
    elif action == "Wordtag":
        return wordtag(text)
    elif action == "Inverse":
        return inverse(text)
    elif action == "Itemize":
        return itemize(text)
    elif action == "Tabulize":
        return tabulize(text)
    elif action == "Tabbing":
        return tabbing(text)
    elif action == "Language":
        return segmentLanguage(text)
    elif action == "RandomText":
        return randomText()
    elif action == "showCollocations":
        return showCollocations(text)
    elif action == "extractNamed":
        return extractNamed(text)
    elif action == "extractNumbered":
        return extractNumbered(text)
    else:
        return text
Пример #43
0
    def guess_stem(self, word):
        """
        Detect affixed letters based on phonetic root composition.
        In Arabic language, there are some letters which can't be adjacent in a root.
        Such a sequence can still appear through affixation, so when a
        possible affix letter is next to a letter it cannot precede/follow in
        a root, the letter is guessed to be an affix and a '-' is inserted at
        the boundary.

        The letter tables are read from wordtag_const: prefixes_letters /
        prefixes_forbiden, bisuffixes_letters / bisuffixes_forbiden and
        suffixes_letters / suffixes_forbiden.

        @param word: the word.
        @type word: unicode.
        @return: the unvocalized word with a '-' at every guessed affix
            boundary (unchanged when nothing is guessed).
        @rtype: unicode
        """
        # we strip harakat and shadda
        word = araby.stripTashkeel(word)
        word_guess = word

        # treat one-letter prefixes (at most two of them)
        prefix_letters = wordtag_const.prefixes_letters
        prefix_forbidden = wordtag_const.prefixes_forbiden
        if len(word) >= 2:
            c1 = word[0]
            c2 = word[1]
            if c1 in prefix_letters and c2 in prefix_forbidden.get(c1, ()):
                word_guess = u"%s-%s" % (c1, word[1:])
                if len(word_guess) >= 4:
                    # word_guess is now "p-rest": look at the two letters
                    # that follow the first prefix
                    c1 = word_guess[2]
                    c2 = word_guess[3]
                    # .get: a prefix letter without an entry in the forbidden
                    # table means "no forbidden follower", not a KeyError
                    if c1 in prefix_letters and c2 in prefix_forbidden.get(c1, ()):
                        # keep the first prefix and mark the second boundary
                        # ("p-q-rest"). The previous expression rebuilt the
                        # word as c1 + '-' + word_guess[2:], which lost the
                        # first prefix and repeated the second letter.
                        # NOTE(review): confirm "p-q-rest" is the format the
                        # callers expect for a double prefix.
                        word_guess = u"%s-%s" % (word_guess[:3], word_guess[3:])

        # treat two-letter suffixes
        word = word_guess
        if len(word) >= 3:
            bc_last = word[-2:]
            bc_blast = word[-3:-2]
            if bc_last in wordtag_const.bisuffixes_letters:
                if bc_blast in wordtag_const.bisuffixes_forbiden.get(bc_last, ()):
                    word_guess = u"%s-%s" % (word[:-2], bc_last)

        # treat one-letter suffixes
        word = word_guess
        c_last = word[-1:]
        c_blast = word[-2:-1]
        if c_last in wordtag_const.suffixes_letters:
            if c_blast in wordtag_const.suffixes_forbiden.get(c_last, ()):
                word_guess = u"%s-%s" % (word[:-1], c_last)

        return word_guess
Пример #44
0
 def check(self, word):
     """
     Tell whether a word is absent from self.dict.
     The lookup key is the word stripped of its tashkeel.
     @param word: the word to check.
     @type word: unicode
     @return: False when the unvocalized word is a key of self.dict, True otherwise.
     @rtype: bool
     """
     # "not in" replaces dict.has_key(), which no longer exists in Python 3
     return araby.stripTashkeel(word) not in self.dict
Пример #45
0
def detectNamedPosition(wordlist):
    """
    Detect named entities in a list of words and return the positions of each phrase.

    A phrase is opened or extended by a word that is a name link (ابن, بن,
    أبو, عبد ...), that follows such a link, that precedes بن / بنت, or that
    isProperNoun() accepts. Any other word closes the current phrase; it is
    still included when it starts with ال and ends with ي (family name).
    Words are compared without their tashkeel.

    @param wordlist: wordlist
    @type wordlist: unicode list
    @return: list of phrase positions [(start, end), (start2, end2), ...];
        start and end are indexes in wordlist, both included.
    @rtype: list of tuple
    Example from the original documentation (the result depends on isProperNoun):
    >>> detectNamedPosition(u"قال خالد بن رافع  حدثني أحمد بن عنبر عن خاله".split(u' '))
    [(1, 3), (6, 8)]
    """
    # one-letter particles that may be attached to the first word of a phrase
    attached_prefixes = (u'و', u'ف', u'ل', u'ب', u'ك')
    # words that link the parts of a name
    name_links = (u'ابن', u'بن', u'أبو', u'أبا', u'أبي', u'عبد', u'عبيد',
                  u'بنو', u'بني', u'بنت')
    positions = []
    startNamed = -1   # -1 while no phrase is open
    endNamed = -1
    last_index = len(wordlist) - 1
    for i, word in enumerate(wordlist):
        # "following" rather than "next", which would shadow the builtin
        if i < last_index:
            following = araby.stripTashkeel(wordlist[i + 1])
        else:
            following = u''
        if i > 0:
            previous = araby.stripTashkeel(wordlist[i - 1])
            if previous and startNamed < 0 and previous[0] in attached_prefixes:
                previous = previous[1:]
        else:
            previous = u''
        # the lookup key is the unvocalized word; only the first word of a
        # phrase can carry an attached prefix
        word_nm = araby.stripTashkeel(word)
        key = word_nm
        if word_nm and startNamed < 0 and word_nm[0] in attached_prefixes:
            key = word_nm[1:]

        if key in name_links:
            if startNamed < 0:
                startNamed = i
            endNamed = i
        elif previous in name_links:
            # the link itself is the previous word: the phrase starts there
            if startNamed < 0:
                startNamed = i - 1
            endNamed = i
        elif following in (u'بن', u'بنت'):
            if startNamed < 0:
                startNamed = i
            endNamed = i
        elif startNamed < 0 and isProperNoun(key):
            startNamed = i
            endNamed = i
        else:
            if startNamed >= 0:
                # there is an open phrase: close it
                if word_nm.startswith(u'ال') and word_nm.endswith(u'ي'):
                    # add the family name
                    endNamed = i
                positions.append((startNamed, endNamed))
            startNamed = -1
    # add the phrase still open at the end of the list
    if startNamed >= 0:
        positions.append((startNamed, endNamed))
    return positions
Пример #46
0
    def transformToStars(self, word):
        """
        Transform all non affixation letters into a star.
        The star is a joker (self.joker, by default '*' according to the
        original documentation) which indicates that the corresponding letter
        is an original one. This function is used by the stemmer to identify
        original letters; it returns the starred form and the stemming
        positions (left, right).
        Example:
            >>> ArListem=ArabicLightStemmer();
            >>> word=u'أفتضاربانني'
            >>> starword,left, right=ArListem.transformToStars(word);
            (أفت*ا**انني, 3, 6)

        Side effects: sets self.word, self.unvocalized, self.left, self.right
        and self.starword, then calls self.extract_root().

        @param word: the input word.
        @type word: unicode
        @return: (starword,left, right):
            - starword : all original letters converted into a star
            - left : length of the prefix that was kept.
            - right : position where the kept suffix starts.
        @rtype: tuple.
        """
        self.word = word
        word = araby.stripTashkeel(word)
        # word, harakat=araby.separate(word);
        self.unvocalized = word
        # ALEF_MADDA is expanded into HAMZA + ALEF
        word = re.sub("[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF,
                      word)
        # first pass: every character that is neither a prefix letter nor a
        # suffix letter becomes a joker
        word = re.sub("[^%s%s]" % (self.prefix_letters, self.suffix_letters),
                      self.joker, word)
        # NOTE(review): ln is not used in the rest of this method
        ln = len(word)
        # first and last joker positions (-1 when the word has no joker)
        left = word.find(self.joker)
        right = word.rfind(self.joker)
        if left >= 0:
            # bound the prefix zone by max_prefix_length - 1 and make the
            # suffix zone start after the last joker, keeping it within the
            # last max_suffix_length characters
            left = min(left, self.max_prefix_length - 1)
            right = max(right + 1,
                        len(word) - self.max_suffix_length)
            prefix = word[:left]
            stem = word[left:right]
            suffix = word[right:]
            # in each zone, only the letters allowed for that zone survive
            prefix = re.sub("[^%s]" % self.prefix_letters, self.joker, prefix)
            # avoid null infixes
            if (self.infix_letters != u""):
                stem = re.sub("[^%s]" % self.infix_letters, self.joker, stem)
            suffix = re.sub("[^%s]" % self.suffix_letters, self.joker, suffix)
            word = prefix + stem + suffix

        # second pass: joker positions are recomputed on the refined word
        left = word.find(self.joker)
        right = word.rfind(self.joker)
        # prefix_list=self.PREFIX_LIST;
        # suffix_list=self.SUFFIX_LIST;

        if left < 0:
            # no joker at all: fall back to max_prefix_length, but leave at
            # least two characters after the prefix
            left = min(self.max_prefix_length,
                       len(word) - 2)
        # when left is still negative here the block below is skipped and the
        # negative left/right values are stored and returned as they are
        if left >= 0:
            prefix = word[:left]
            # shorten the candidate from its end until it is a known prefix
            # (or empty)
            while prefix != "" and prefix not in self.prefix_list:
                prefix = prefix[:-1]
            if right < 0:
                right = max(len(prefix), len(word) - self.max_suffix_length)
            suffix = word[right:]

            # shorten the candidate from its start until it is a known suffix
            # (or empty)
            while suffix != "" and suffix not in self.suffix_list:
                suffix = suffix[1:]
            # final stemming positions, derived from the affixes that were kept
            left = len(prefix)
            right = len(word) - len(suffix)
            stem = word[left:right]
            # convert stem into  stars.
            # a stem must start with alef, or end with alef.
            # any other infix letter isn't an infix at the border of the stem.
            # substitute all non infix letters
            if self.infix_letters != "":
                stem = re.sub("[^%s]" % self.infix_letters, self.joker, stem)

            # substitute teh in infixes: the teh must be in the first or second place, all others are converted
            #
            # stem=stem[:2]+re.sub(TEH,self.joker,stem[2:])
            word = prefix + stem + suffix
        # store result
        self.left = left
        self.right = right
        self.starword = word
        self.extract_root()
        # return starword, left, right position of stem
        return (word, left, right)
Пример #47
0
import tashkeel
if __name__ == '__main__':
    # Evaluation script: every line of the given file is stripped of its
    # tashkeel and passed to pyarabic.number.preTashkeelNumber. Lines changed
    # by that step are counted in "total"; those whose result has a
    # non-negative araby.vocalizedSimilarity with the original line are
    # counted in "correct", the others are printed.
    filename, disableSyntax, disableSemantic, disableStat, ignore, limit, compare = grabargs()
    #filename="samples/randomtext.txt"
    try:
        myfile = open(filename)
    except (IOError, TypeError):
        # stop here: continuing without a file only led to a NameError
        print(" Can't Open the given File  %s" % filename)
        raise SystemExit(1)

    if not limit:
        limit = 100000000
    correct = 0
    total = 0
    # the with block closes the file, which was never closed before
    with myfile:
        for counter, rawline in enumerate(myfile, 1):
            if counter > limit:
                break
            line = rawline.decode('utf8')
            unvocline = araby.stripTashkeel(line)
            vocalized = pyarabic.number.preTashkeelNumber(araby.tokenize(unvocline))
            vocalized = u' '.join(vocalized)
            if vocalized != unvocline:
                total += 1
                sim = araby.vocalizedSimilarity(vocalized, araby.stripShadda(line))
                if sim >= 0:
                    correct += 1
                elif sim < 0:
                    print(u"\t".join([str(sim), str(counter), str(len(vocalized)),
                                      str(len(line)), vocalized, line]).encode('utf8'))
    # no changed line at all used to raise ZeroDivisionError here
    if total:
        percent = round(correct * 100.00 / total, 2)
    else:
        percent = 0.0
    print("%s %s %s" % (correct, total, percent))
Пример #48
0
if __name__ == '__main__':
	#import number as ArabicNumberToLetters
	texts=[
	u"وجد عبد الله بن عمر دينارا",
	
	u"جاء  خالد بن الوليد وقاتل مسيلمة بن حذام الكذاب في موقعة الحديقة", 
	u'روى أحمد بن عقيل الشامي عن أبي طلحة المغربي أنّ عقابا بن مسعود بن أبي سعاد قال',
	u"""
6 :* حَديثُ عَمٍّ: فَرَجُ سَقْفِ بَيْتِي وَأَنَا بِمَكَّةٍ ، فَنَزَلَ جِبْرِيلُ ، فَفَرَجُ صَدْرِي ، ثُمَّ غَسَلَهُ مِنْ مَاءِ زَمْزَمَ ، ثُمَّ جَاءَ بِطَسْتِ مَمْلُوءِ حِكْمَةِ وَإيمَانَا فَأُفْرِغُهَا فِي صَدْرِي ، ثُمَّ أَطُبِّقَهُ قَالَ عَبْدُ اللهِ بْن أَحَمْدٌ: حَدِّثِنَّي مُحَمَّدَ بْن عَبَّادٍ الْمَكِّيُّ ، ثِنَا أَبُو ضَمْرَةٌ ، عَنْ يُونِسٍ ، عَنِ الزَّهْرِيِ ، عَنْ أُنْسٍ: كَانَ أَبِي يُحَدِّثُ بِمَا هُنَا وَحَدِّثِنَّي مُحَمَّدَ بْن إسحاق بْن مُحَمَّدِ المسيبي ، ثِنَا أَنَسُ بْن عياض ، عَنْ يُونُسُ بْن يَزِيدُ ، قَالٌ: قَالَ اِبْنُ شِهَابٍ: قَالَ أَنَسُ بْن مَالِكٍ: كَانَ أَبِي بْن كَعْبِ يَحْدُثُ ، فَذُكِرَ حَديثُ الْإِسْراءِ بِطُولِهِ ، وَفِيه: قَالَ الزُّهْرِيُّ: وَأَخْبَرَنِي اِبْنُ حَزْمٍ ، أَنَّ اِبْنَ عَبَّاسٍ ، وَأَبَا حَبَّةُ الْأَنْصارِيِ يَقُولَانِّ: قَالَ رَسُولُ اللهِ ، صَلَّى اللهُ عَلَيه وَسَلَّمُ: ثَمَّ عَرَجِ بِي حَتَّى ظَهَرْتِ لِمُسْتَوى أَسْمَعُ صَرِيفَ الْأَقْلاَمِ وَفِيه قَالَ الزُّهْرِيُّ: قَالَ اِبْنُ حَزْمٍ ، وَأَنَسُ بْن مَالِكٍ: قَالَ رَسُولُ اللهِ صَلَّى اللهُ عَلَيه وَسَلَّمُ: فَرَضَ اللَّهُ عَلَى أمتي خَمْسِينَ صَلاَةٌ ، فَرَجَعْتِ بِذَلِكَ حَتَّى أَمْرِ عَلَى مُوسى الْحَديثِ ، تَفْرُدُ بِهِ .( 1 / 6)
2	71.16%	83.07%	92	54	154	319	69.85%	81.62%	 28: حَديثُ كَمْ حَمُ: فِي هَذِهِ الْآيَةَ :{ وَإِذْ أَخَذَ رَبُّكَ مِنْ بُنِّيِّ آدَمِ مِنْ ظُهورِهُمْ ذَرِّيَّتِهُمْ } الْآيَةُ ، قَالٌ: جَمْعُهُمْ لَهُ يَوْمَئِذٍ جَمِيعًا فَجَعَلَهُمْ أَرَواحًا ثَمَّ صُورِهُمْ وَاِسْتَنْطَقُهُمْ الْحَديثِ ، وَفِيه قَوْلُ آدَمِ: رُبَّ لَوْ سُوِّيتِ بَيْنَ عِبَادِكَ ، قَالٌ: إِنَِّي أَحُبَّ أَنْ أَشْكَرَ ، وَفِيه ذِكْرُ عِيسَى اِبْنُ مَرْيَمٍ ، وَقَوْلُ أَبِي بْن كَعْبٍ: إِنَّ الرَّوْحَ دُخِلَ مِنْ فِي مَرْيَمِ كَمْ فِي تَفْسِيرِ الْأَعْرَافِ: أَنَا أَبُو جَعْفَرٍ مُحَمَّدُ بْن عَلِيٍّ الشَّيْبانِيُّ ، أَنَا أَحُمِدَ بْن حازِمٍ ، ثِنَا عَبِيدَ اللهِ بْن مُوسى ، ثِنَا أَبُو جَعْفَرٌ ، عَنِ الرَّبِيعُ بْن أُنْسٍ ، عَنْ أَبِي الْعَالِيَةَ ، عَنْ أَبِي بِطُولِهِ وَرَوَاهُ عَبْدُ اللهِ بْن أَحَمْدَ فِي زِيادَاتِهِ: حَدِّثِنَّي مُحَمَّدَ بْن يَعْقُوبِ الرَّبالِيِ ، ثِنَا الْمُعْتَمِرُ بْن سَلِيمَانِ ، سَمِعْتِ أَبِي يُحَدِّثُ عَنِ الرَّبِيعِ ، بِهِ.
3	72.39%	85.31%	156	83	242	565	73.98%	88.21%	 44 :* حَديثُ حُبِّ حَمُ عَمٌّ: قَالَ لِي جِبْرِيلُ :{ قُلْ أَعُوذُ بِرَبِّ الْفَلْقِ } فَقِلْتِهَا الْحَديثَ حُبٌّ: فِي الْعَشْرَيْنِ مِنَ الثَّالِثِ: أَنَا عِمْرَانُ بْن مُوسى ، ثِنَا هُدْبَةُ بْن خَالِدٍ ، ثِنَا حَمَّادُ بْن سلمةٍ ، عَنْ عَاصِمٍ ، عَنْ زِرٍّ: قُلْتِ لِأَبِي بْن كَعْبٍ: إِنَّ اِبْنَ مَسْعُودِ لَا يَكْتُبْ فِي مُصْحَفِهِ المعوذتين فَقَالَ أَبِي: قَالَ لِي رَسُولُ اللهِ: قَالَ لِي جِبْرِيلُ فَذَكَرَهُ رَوَاهُ أَحْمَدُ: عَنْ أَبِي بِكَرِّ بْن عَيّاشٍ ، عَنْ عَاصِمِ بِلَفْظٍ: قُلْتِ لِأَبِي: إِنَّ عَبْدَ اللهِ يَقُولُ فِي المعوذتين فَقَالَ أَبِي: سَأَلَنَا عَنْهُمَا رَسُولُ اللهِ ، فَقَالٌ: قَيَّلَ لِي: قَلَّ وَأَنَا أَقُولُ كَمَا قَالَ وَعَنْ وكيع ، وَعَبْدُ الرَّحْمَنِ بْن مَهْدِي كِلَاهُمَا ، عَنْ سُفْيانٍ ، وَعَنْ مُحَمَّدِ بْن جَعْفَرٍ ، عَنْ شُعْبَةِ وَعَنْ عَفّانٍ ، عَنْ حَمَّادُ بْن سلمةٍ ، وَأَبِي عَوانَةٌ ، فَرَقَهُمَا ، كلَهُمْ عَنْ عَاصِمِ وَعَنْ سُفْيانِ بْن عيينة ، عَنْ عَبْدَةُ بْن أَبِي لُبَابَةٌ ، وَعَاصِمُ وَعَنْ عَبْدِ الرَّحْمَنِ بْن مَهْدِيٍّ ، عَنْ سُفْيانٍ ، عَنِ الزُّبَيْرِ بْن عِدِّيِ ، عَنْ أَبِي رَزينٌ ، ثلاثتهم عَنْ زِرِّ وَقَالَ عَبْدُ اللهِ: حَدِّثِنَّي مُحَمَّدَ بْن الحسين بْن إشكاب ، ثِنَا مُحَمَّدَ بْن أَبِي عُبَيْدَةُ بْن مِعْنَ ، ثِنَا أَبِي ، عَنِ الْأعْمَشِ ، عَنْ أَبِي إسْحَاقُ ، عَنْ عَبْدِ الرَّحْمَنِ بْن يَزِيدُ ، قَالٌ: كَانَ عَبْدُ اللهِ يَحُكُّ المعوذتين مِنْ مَصَاحِفِهِ وَيَقُولُ: إِنَّهُمَا لَيْسَتَا مِنْ كِتَابِ اللهِ قَالِ الْأعْمَشِ: وَثَنَا عَاصِمُ ، عَنْ زِرِّ فَذكرِ نَحْوَ الْأَوَّلِ .( 1 / 16)
4	74.60%	85.77%	207	116	321	815	79.60%	86.80%	 54 :* حَديثُ كَمْ حَمُ عَمٌّ: إِذَا كَانَ يَوْمُ الْقِيَامَةِ كِنْتِ إمَامَ النَّبِيِّينَ وَخَطِيبُهُمْ وَصَاحِبُ شَفَاعَتِهُمْ ، غَيْرَ فَخْرُ كَمْ فِي الْإيمَانِ: ثِنَا الْحُسَيْنُ بْن الْحُسْنِ الطَّوْسِيِ ، ثِنَا أَبُو حاتِمٍ الرّازِيُّ ، ثِنَا عَبْدَ اللهِ بْن جَعْفَرٍ الرَّقِّيُّ ، ثِنَا عَبِيدَ اللهِ بْن عَمْروِ وَعَنْ مُحَمَّدِ بْن صَالِحِ بْن هَانِئٍ ، ثِنَا السَّرِيُّ بْن خَزِيمَةٍ ، ثِنَا أَبُو حُذَيْفَةُ النَّهْدِيِ ، ثِنَا زُهَيْرُ بْن مُحَمَّدٍ ، كِلَاهُمَا عَنْ عَبْدِ اللهِ بْن مُحَمَّدِ بْن عَقِيلٍ ، عَنِ الطفيل بْن أَبِي بْن كَعْبٍ ، عَنْ أَبِيه ، بِهِ وَقَالٌ: صَحِيحُ الْإِسْنادِ وَلَمْ يُخْرِجَاهُ لِتَفَرُّدِ اِبْنِ عَقِيلِ بِهِ لَمَّا نَسْبِ إِلَيه مِنْ سُوءِ الْحِفْظِ ، وَهُوَ عِنْدَ أئِمَّتُنَا مِنَ الْمُتَقَدِّمِينَ ثِقَةُ مَأْمُونِ وَفِي الْفَضَائِلِ: أَنَا الْقَطِيعِيُّ ، ثِنَا عَبْدَ اللهِ بْن أَحَمْدٌ ، حَدَّثَنِي أُبَيُّ ، ثِنَا عَبْدَ الرَّحْمَنِ ، وَهُوَ اِبْنُ مَهْدِيٍّ ، ثِنَا زُهَيْرُ بْن مُحَمَّدٍ ، عَنْ عَبْدِ اللهِ بْن مُحَمَّدٍ ، بِهِ وَرَوَاهُ الْإمَامُ أَحْمَدُ: عَنْ أَبِي عَامِرٌ ، عَنْ زُهَيْرٍ ، يَعْنِي: اِبْنُ مُحَمَّدٍ ، عَنْ عَبْدِ اللهِ بْن مُحَمَّدٍ ، بِهِ وَعَنْ زَكَرِيّا بْن عِدِّيِ ، وَأَحْمَدُ بْن عَبْدِ الْمَلِكِ الْحَرَّانِيِ ، كِلَاهُمَا عَنْ عَبِيدِ اللهِ بْن عَمْروٍ ، بِهِ وَعَنْ أَبِي أَحْمَدَ الزُّبَيْرِيُّ ، عَنْ شَرِيكِ ، عَنْ عَبْدِ اللهِ بْن مُحَمَّدٍ ، بِهِ وَرَوَاهُ اِبْنُهُ عَبْدُ اللهِ فِي زِيادَاتِهِ: حَدَّثَنِي عُبَيْدُ اللَّهِ الْقَوَارِيرِيُّ ، ثِنَا مُحَمَّدَ بْن عَبْدِ اللهِ بْن الزُّبَيْرِ ، ثِنَا شَرِيكُ ، بِهِ وَقَالَ أيضا: ثِنَا هَاشِمُ بْن الْحارِثِ ، ثِنَا عَبِيدَ اللهِ بْن عَمْروٍ ، بِهِ وَحَدِّثِنَّي ( 1 / 24)
5	75.54%	85.94%	228	131	354	932	82.05%	87.18%	 56 :* حَديثُ كَمْ حَمُ: بَيَّنَا نَحْنُ فِي صَلاَةِ الظَّهيرَةِ وَالنَّاسَ فِي الصُّفُوفِ فَرَأَيْنَاهُ يَتَنَاوَلُ شِيئَا الْحَديثَ كَمْ فِي الْأَهْوَالِ: أَنَا عَبْدُ الرَّحْمَنِ بْن حَمْدانٍ ، ثِنَا هِلاَلُ بْن الْعَلاءِ ، ثِنَا أَبِي ، ثِنَا عَبِيدَ اللهِ بْن عَمْروٍ ، عَنْ عَبْدِ اللهِ بْن مُحَمَّدِ بْن عَقِيلٍ ، عَنِ الطفيل بْن أَبِي بْن كَعْبٍ ، عَنْ أَبِيه ، وَقَالٌ: صَحِيحُ الْإِسْنادِ رَوَاهُ أَحْمَدُ بِطُولِهِ: عَنْ أَحُمِدَ بْن عَبْدِ الْمَلِكِ بْن واقد الْحَرَّانِيِ ، عَنْ عَبِيدِ اللهِ بْن عَمْروٍ ، بِهِ قُلْتُ: رواه زَكَرِيّا بْن عِدِّيِ ، عَنْ عَبِيدِ اللهِ بْن عَمْروٍ ، فَقَالٌ: عَنْ عَبْدِ اللهِ بْن مُحَمَّدِ بْن عَقِيلٍ ، عَنْ جَابِرِ وَأَخْرَجَهُ أَحْمَدُ ، أيضا: عَنْ زَكَرِيّا.
6	75.46%	86.02%	265	151	403	1080	75.00%	86.49%	 68 :* عَبْدُ اللهِ بْن رباحٍ ، عَنْ أَبِي حَديثُ كَمْ م حَمُ عَمٌّ: قَالَ لِي رَسُولُ اللهِ ، صَلَّى اللهُ عَلَيه وَسَلَّمُ: أَيُّ آيَةِ فِي كِتَابِ اللهِ أُعْظِمُ ؟ قَالٌ: قُلْتِ :{ اللهُ لَا إلَهُ إلّا هُوَ الْحَيُّ الْقَيُّومَ } قَالٌ: فَضَرْبُ صَدْرِي وَقَالٌ: لِيَهِنُكَ الْعِلْمَ أَبَا الْمُنْذِرَ كَمْ فِي الْمَعْرِفَةِ: ثِنَا أَبُو عَبْدُ اللَّهِ الْحافِظُ ، ثِنَا إبراهيم بْن عَبْدِ اللهِ ، ثِنَا يَزِيدُ بْن هارُونٍ ، أَنَا الْجَرِيرِيِ ، عَنْ أَبِي السَّلِيلَ ، عَنْ عَبْدِ اللهِ بْن رباحٍ ، عَنْه ، بِهَذَا قُلْتُ: هُوَ فِي مُسْلِمٍ ، فَلَا يُسْتَدْرَكُ وَرَوْاهُ الْإمَامَ أَحْمَدُ: ثِنَا عَبْدَ الرَّزَّاقِ ، أَنَا سُفْيانٌ ، عَنْ سَعِيدُ الْجَرِيرِيِ ، بِهِ وَرَوَاهُ اِبْنُهُ عَبْدُ اللهِ ، فِي زِيادَاتِهِ: حَدَّثَنِي عُبَيْدُ اللَّهِ الْقَوَارِيرِيُّ ، ثِنَا جَعْفَرُ بْن سَلِيمَانِ ، ثِنَا الْجُرَيْرِيُّ ، عَنْ بَعْضُ أَصْحَابِهِ ، عَنْ عَبْدِ اللهِ بْن رباحٍ ، بِهِ.	
	""",
	u"قال مُحَمَّدُ بْنُ خَالِدُ بْنُ إسماعيلفي حديثه",
	u"ِنْصَرَفْنَا إِلَى أَنَسُ بْنُ مَالِكَ الْحَديثِ"
	];
	# for every sample: print the positions found by detectNamedPosition on
	# the space-split text, then the result of preTashkeelNamed on the tokens
	# of the unvocalized text
	for text in texts:
		positions = detectNamedPosition(text.split(' '))
		print(positions)
		text = araby.stripTashkeel(text)
		result = preTashkeelNamed(araby.tokenize(text))
		print(u' '.join(result).encode('utf8'))

Пример #49
0
def test():
	filename, text,  stripTashkeel, reducedTashkeel, disableSyntax, disableSemantic, disableStat, ignore, limit, compare =grabargs()
	#filename="samples/randomtext.txt"	
	if not text and not filename:
		usage()
		sys.exit(0)
		
	if not text:
		try:
			myfile=open(filename)
		except:
			print " Can't Open the given File ", filename;
			sys.exit();
	else:
		lines = text.split('\n');
	# all things are well, import library
	import core.adaat 
	import pyarabic.araby as araby

	counter=1;
	if not limit : 
		limit=	100000000
	if not stripTashkeel: 
		vocalizer=ArabicVocalizer.TashkeelClass();
		if ignore : 
			vocalizer.disableLastMark();
		if disableSemantic:
			vocalizer.disableSemanticAnalysis();
		if disableSyntax:
			vocalizer.disableSyntaxicAnalysis();
		if disableStat:
			vocalizer.disableStatTashkeel();

	#vocalizer.disableShowCollocationMark();
	#print "show delimiter", vocalizer.collo.showDelimiter;
	#nolimit = True;
	nolimit = False;
	if not text:
		line=(myfile.readline()).decode('utf8');
	else:
		if len(lines)>0:
			line= lines[0];
	correct=0;
	incorrect=0;
	total=0;
	totLetters =0;
	LettersError =0
	WLMIncorrect =0;
	if compare:
		#dispaly stats for the current line
		print "id\tfully Correct\tStrip Correct\tfully WER\tStrip WER\tLER\tTotal\tline Fully correct\tline Strip correct"
		
		# print "Full\tPartial\tFull correct \tfull incorrect\tpartial correct\tpartial incorrect\tWER\tLER\tTotal"
	
	while line and (nolimit or counter<=limit):
		if not line.startswith('#'):
			# lineIncorrect = 0;
			lineCorrect   = 0;
			lineWLMIncorrect =0;
			if stripTashkeel:
				result = araby.stripTashkeel(line);
			else:	#vocalize line by line
				if compare:
					vocalizedLine = line;
					line = araby.stripTashkeel(line)
				result=vocalizer.tashkeel(line);
				#compare resultLine and vocalizedLine
				if compare:
					list1=vocalizer.analyzer.tokenize(vocalizedLine);
					list2=vocalizer.analyzer.tokenize(result);
					#print u":".join(list1).encode('utf8');
					#print u":".join(list2).encode('utf8');
					total+=len(list1);
					lineTotal = len(list1);
					if len(list1)!=len(list2):
						print "lists haven't the same length";
					else:
						for i in range(len(list1)):
							simi = araby.vocalizedSimilarity(list1[i],list2[i]);
							if simi<0:
								LettersError+= -simi;
								incorrect   +=1;
								# lineIncorrect += 1;
								# evaluation without last haraka
								simi2 = araby.vocalizedSimilarity(araby.stripLastHaraka(list1[i]),araby.stripLastHaraka(list2[i]));
								if simi2<0: 
									WLMIncorrect    +=1;
									lineWLMIncorrect+=1;								

							else:
								correct+=1;
								lineCorrect += 1;
					
			#compare resultLine and vocalizedLine
			if reducedTashkeel:
				result= araby.reduceTashkeel(result)
			# print result.encode('utf8');
			counter+=1;

			#display stat for every line
			if compare:
				print "%d\t%0.2f%%\t%0.2f%%\t%d\t%d\t%d\t%d\t"%(
						counter-1,#id
						round(correct*100.00/total,2),#fully Correct
						round((total-WLMIncorrect)*100.00/total,2),#Strip Correct
						incorrect,#fully WER
						WLMIncorrect,#Strip WER
						LettersError,#LER
						total,#Total
						),
				if lineTotal:
					print "%0.2f%%\t"%round(lineCorrect*100.00/lineTotal,2),#line Fully correct
					print "%0.2f%%\t"%round((lineTotal-lineWLMIncorrect)*100.00/lineTotal,2),#line Strip correct
						
			print result.encode('utf8');
		#get the next line
		if not text:
			line=(myfile.readline()).decode('utf8');
		else:
			if counter<len(lines):
				line= lines[counter];
			else:
				line =None;
Пример #50
0
    def check_word(self, word, guessedTag=""):
        """
        Analyze one word morphologically.

        The tatweel is removed from the word, then its unvocalized form is
        checked as punctuation and as stop word, and, depending on the
        guessed tag, as verb and as noun. If nothing is found the word is
        checked as unknown. The collected cases are then filtered against
        the input word and completed with the word frequency.

        @param word: the input word.
        @type word: unicode.
        @param guessedTag: tag guessed for the word, used to select the verb
            and noun analyses.
        @return: list of analyzed word cases; when no case is left, a single
            placeholder case of type "unknown".
        @rtype: list.
        """
        vocalized = araby.stripTatweel(word)
        unvocalized = araby.stripTashkeel(vocalized)

        analyses = []
        # punctuation and stop words are always tried: a stop word can also
        # be a normal word (verb or noun)
        analyses.extend(self.check_word_as_pounct(unvocalized))
        analyses.extend(self.check_word_as_stopword(unvocalized))

        # verb and noun analyses depend on the guessed tag; a stop word tag
        # enables both of them
        if self.tagger.hasVerbTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
            analyses.extend(self.check_word_as_verb(unvocalized))
        if self.tagger.hasNounTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
            analyses.extend(self.check_word_as_noun(unvocalized))

        # nothing found so far: check the word as unknown
        if not analyses:
            analyses.extend(self.check_word_as_unknown(unvocalized))

        # filter the cases against the word as it was given (normalized
        # forms, then shadda)
        analyses = self.check_normalized(vocalized, analyses)
        analyses = self.check_shadda(vocalized, analyses)
        # keep only the cases compatible with the given vocalization
        if self.partial_vocalization_support:
            analyses = self.check_partial_vocalized(vocalized, analyses)
        # add word frequency information in tags
        analyses = self.addWordFrequency(analyses)

        if len(analyses) == 0:
            # every case was rejected: return the word itself as unknown
            unknown_freq = self.wordfreq.getFreq(vocalized, "unknown")
            placeholder = {
                "word": vocalized,
                "procletic": "",
                "encletic": "",
                "prefix": "",
                "suffix": "",
                "stem": "",
                "original": vocalized,
                "vocalized": vocalized,
                "tags": u"",
                "type": "unknown",
                "root": "",
                "template": "",
                "freq": unknown_freq,
                "syntax": "",
            }
            analyses.append(stemmedword.stemmedWord(placeholder))
        return analyses
Пример #51
0
	def check(self, word):
		"""
		Tell whether a word is absent from the dictionary.

		The lookup key is the word without its diacritics.
		@param word: the word to look up.
		@type word: unicode.
		@return: False if the unvocalized word is a key of self.dict,
			True otherwise.
		@rtype: Boolean.
		"""
		key = araby.stripTashkeel(word)
		# `in` replaces the deprecated dict.has_key(), removed in Python 3
		return key not in self.dict
Пример #52
0
	def suggest(self, word):
		"""
		Return the dictionary entry stored for a word.

		The lookup key is the word without its diacritics.
		@param word: the word to look up.
		@type word: unicode.
		@return: the value stored in self.dict for the unvocalized word,
			or an empty list when the word is not a key.
		"""
		key = araby.stripTashkeel(word)
		# `in` replaces the deprecated dict.has_key(), removed in Python 3
		if key in self.dict:
			return self.dict[key]
		return []
Пример #53
0
    filename, disableSyntax, disableSemantic, disableStat, ignore, limit, compare = grabargs(
    )
    #filename="samples/randomtext.txt"
    try:
        myfile = open(filename)
    except:
        print " Can't Open the given File ", filename

    counter = 1
    if not limit:
        limit = 100000000
    nolimit = False
    correct = 0
    total = 0
    line = (myfile.readline()).decode('utf8')
    while line and (nolimit or counter <= limit):
        unvocline = araby.stripTashkeel(line)
        # named=core.named.extractNamed(unvocline);
        # for n in named:
        # print u"\t".join([str(counter),n]).encode('utf8');

        named = core.named.extractNamedWithinContext(line)
        # print named
        for n in named:
            #display context (previous, named, next)
            print u"\t".join([str(counter), u'\t'.join(n)]).encode('utf8')

        #get the next line
        line = (myfile.readline()).decode('utf8')
        counter += 1
    #print correct, total, round(correct*100.00/total,2)
Пример #54
0
    def guess_stem(self, word):
        """
        Guess the affixed letters of a word from its letter sequences.

        In Arabic, some letters can't be adjacent in a root, but such a
        sequence can appear where an affix is attached to a stem. When a
        possible affix letter is next to a letter listed as forbidden for
        it, the affix is separated from the rest of the word by a '-'.
        The tables of affix letters and of forbidden neighbours are read
        from wordtag_const.

        The checks are done in this order: one prefix letter, a second
        prefix letter, a two letters suffix, then a one letter suffix.

        @param word: the word.
        @type word: unicode.
        @return: the word without diacritics, with a '-' at each guessed
            affix position; the unvocalized word itself if nothing is guessed.
        @rtype: unicode
        """
        # we strip harakat and shadda
        word = araby.stripTashkeel(word)
        prefixes_letters = wordtag_const.prefixes_letters
        prefixes_forbiden = wordtag_const.prefixes_forbiden

        word_guess = word
        # treat one prefix letter, then a second one
        if len(word) >= 2:
            c1 = word[0]
            c2 = word[1]
            if c1 in prefixes_letters and c2 in prefixes_forbiden.get(c1, ()):
                word_guess = u"%s-%s" % (c1, word[1:])
                # word_guess is now "p-rest": the next candidate prefix is
                # at index 2 and its neighbour at index 3
                if len(word_guess) >= 4:
                    c1 = word_guess[2]
                    c2 = word_guess[3]
                    # .get() as above: a prefix letter can have no entry
                    # in the table of forbidden letters
                    if c1 in prefixes_letters and (
                            c2 in prefixes_forbiden.get(c1, ())):
                        # keep the first prefix and its mark, and add a
                        # mark after the second prefix: "p-q-rest"
                        word_guess = u"%s-%s" % (word_guess[:3],
                                                 word_guess[3:])

        # treat two suffix letters
        word = word_guess
        if len(word) >= 3:
            bc_last = word[-2:]
            bc_blast = word[-3:-2]
            if bc_last in wordtag_const.bisuffixes_letters:
                if bc_blast in wordtag_const.bisuffixes_forbiden.get(
                        bc_last, ()):
                    word_guess = u"%s-%s" % (word[:-2], bc_last)

        # treat one suffix letter
        word = word_guess
        # slices are used instead of indexes to accept very short words
        c_last = word[-1:]
        c_blast = word[-2:-1]
        if c_last in wordtag_const.suffixes_letters:
            if c_blast in wordtag_const.suffixes_forbiden.get(c_last, ()):
                word_guess = u"%s-%s" % (word[:-1], c_last)

        return word_guess
Пример #55
0
    def validateTags(self, noun_tuple, affix_tags, procletic, encletic,
                     suffix):
        """
        Test if the given word from dictionary is compatible with the
        affixes attached to it.

        The diacritics of the procletic, the encletic and the suffix are
        stripped before the tests.
        @param noun_tuple: the input word attributes given from dictionary.
        @type noun_tuple: dict.
        @param affix_tags: a list of tags given by affixes.
        @type affix_tags: list.
        @param procletic: first level prefix vocalized.
        @type procletic: unicode.
        @param encletic: first level suffix vocalized.
        @type encletic: unicode.
        @param suffix: second level suffix vocalized.
        @type suffix: unicode.
        @return: if the tags are compatible.
        @rtype: Boolean.
        """
        procletic = araby.stripTashkeel(procletic)
        encletic = araby.stripTashkeel(encletic)
        suffix = araby.stripTashkeel(suffix)

        # (affix tag, noun attribute, truth value the attribute must have
        # when the tag is present), tested in this order
        tag_rules = (
            (u'مؤنث', 'feminable', True),
            (u'جمع مؤنث سالم', 'feminin_plural', True),
            (u'جمع مذكر سالم', 'masculin_plural', True),
            (u'مثنى', 'dualable', True),
            # the tanwin is refused by the nouns flagged mamnou3_sarf
            (u'تنوين', 'mamnou3_sarf', False),
            (u'منسوب', 'relative', True),
        )
        for tag, attribute, expected in tag_rules:
            if tag in affix_tags and bool(noun_tuple[attribute]) is not expected:
                return False

        # attached pronouns: refused when the noun attribute is 'N'
        pronoun_rules = (
            (u"هم", 'hm_suffix'),
            (u"ه", 'ha_suffix'),
            (u"ك", 'k_suffix'),
        )
        for pronoun, attribute in pronoun_rules:
            if encletic == pronoun and noun_tuple[attribute] == 'N':
                return False

        # procletic ending with "kal": refused when the attribute is 'N'
        if procletic.endswith(u"كال") and noun_tuple['kal_prefix'] == 'N':
            return False
        # waw suffix (annexed masculine sound plural): the noun must allow it
        if suffix == araby.WAW and not noun_tuple['w_suffix']:
            return False
        # TODO: the attributes mankous, number, broken_plural and annex
        # are not checked yet
        return True
Пример #56
0
	def validateTags(self, noun_tuple, affix_tags, procletic, encletic_nm , suffix_nm):
		"""
		Test if the given word from dictionary is compatible with the
		affixes attached to it.

		Only the procletic has its diacritics stripped here; the encletic
		and the suffix are compared as they are given.
		@param noun_tuple: the input word attributes given from dictionary.
		@type noun_tuple: dict.
		@param affix_tags: a list of tags given by affixes.
		@type affix_tags: list.
		@param procletic: first level prefix vocalized.
		@type procletic: unicode.
		@param encletic_nm: first level suffix, used without stripping.
		@type encletic_nm: unicode.
		@param suffix_nm: second level suffix, used without stripping.
		@type suffix_nm: unicode.
		@return: if the tags are compatible.
		@rtype: Boolean.
		"""
		procletic = araby.stripTashkeel(procletic)

		# (affix tag, noun attribute, truth value the attribute must have
		# when the tag is present), tested in this order
		tag_rules = (
			(u'مؤنث', 'feminable', True),
			(u'جمع مؤنث سالم', 'feminin_plural', True),
			(u'جمع مذكر سالم', 'masculin_plural', True),
			(u'مثنى', 'dualable', True),
			# the tanwin is refused by the nouns flagged mamnou3_sarf
			(u'تنوين', 'mamnou3_sarf', False),
			(u'منسوب', 'relative', True),
		)
		for tag, attribute, expected in tag_rules:
			if tag in affix_tags and bool(noun_tuple[attribute]) is not expected:
				return False

		# attached pronouns: refused when the noun attribute is 'N'
		pronoun_rules = (
			(u"هم", 'hm_suffix'),
			(u"ه", 'ha_suffix'),
			(u"ك", 'k_suffix'),
		)
		for pronoun, attribute in pronoun_rules:
			if encletic_nm == pronoun and noun_tuple[attribute] == 'N':
				return False

		# procletic ending with "kal": refused when the attribute is 'N'
		if procletic.endswith(u"كال") and noun_tuple['kal_prefix'] == 'N':
			return False
		# waw suffix (annexed masculine sound plural): the noun must allow it
		if suffix_nm == araby.WAW and not noun_tuple['w_suffix']:
			return False
		# TODO: the attributes mankous, number, broken_plural and annex
		# are not checked yet (a teh marbuta test for broken plurals is
		# planned)
		return True
Пример #57
0
 def suggest(self, word):
     """
     Return the dictionary entry stored for a word.

     The lookup key is the word without its diacritics.
     @param word: the word to look up.
     @type word: unicode.
     @return: the value stored in self.dict for the unvocalized word,
         or an empty list when the word is not a key.
     """
     key = araby.stripTashkeel(word)
     # `in` replaces the deprecated dict.has_key(), removed in Python 3
     if key in self.dict:
         return self.dict[key]
     return []