예제 #1
0
	def stemming_stopword(self, word):
		"""
		Look up the given word in the stopword dictionary and build a
		word case for every matching entry.
		@param word: the input word.
		@type word: unicode.
		@return: list of dictionaries of analyzed words with tags.
		@rtype: list.
		"""
		# one analyzed case per stopword entry found in the dictionary;
		# the lookup can also return the tashkeel (vocalized form)
		detailed_result = []
		for sw_tuple in self.swDictionary.lookup(word):
			case = wordCase.wordCase({
				'word': word,
				'affix': (sw_tuple['procletic'], '', '', sw_tuple['encletic']),
				'stem': sw_tuple['stem'],
				'original': sw_tuple['original'],
				'vocalized': sw_tuple['vocalized'],
				'tags': sw_tuple['tags'],
				'type': sw_tuple['type'],
				'freq': 'freqstopword',
				'originaltags': sw_tuple['tags'],
				'syntax': '',
			})
			detailed_result.append(case)
		return detailed_result
예제 #2
0
	def check_word(self,word, guessedTag=""):
		"""
		Analyze one word morphologically as verbs
		@param word: the input word.
		@type word: unicode.
		@return: list of dictionaries of analyzed words with tags.
		@rtype: list.
		"""	
		word=araby.stripTatweel(word);
		word_vocalised=word;
		word_nm=araby.stripTashkeel(word);
		resulted_text=u"";
		resulted_data=[];
		# if word is a pounctuation
		resulted_data+=self.check_word_as_pounct(word_nm);
		# Done: if the word is a stop word we have  some problems,
		# the stop word can also be another normal word (verb or noun),
		# we must consider it in future works
		# if word is stopword allow stop words analysis
		resulted_data+=self.check_word_as_stopword(word_nm);

		#if word is verb
		# مشكلة بعض الكلمات المستبعدة تعتبر أفعلا أو اسماء
		if  self.tagger.hasVerbTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):
			resulted_data+=self.check_word_as_verb(word_nm);
			#print "is verb", rabti,len(resulted_data);
		#if word is noun
		if self.tagger.hasNounTag(guessedTag) or self.tagger.isStopWordTag(guessedTag):			
			resulted_data+=self.check_word_as_noun(word_nm);
		if len(resulted_data)==0:
			#check the word as unkonwn
			resulted_data+=self.check_word_as_unknown(word_nm);
			#check if the word is nomralized and solution are equivalent
		resulted_data = self.check_normalized(word_vocalised, resulted_data)
		#check if the word is shadda like
		resulted_data = self.check_shadda(word_vocalised, resulted_data)

		#check if the word is vocalized like results			
		if self.partial_vocalization_support:
			resulted_data=self.check_partial_vocalized(word_vocalised, resulted_data);
		# add word frequency information in tags
		resulted_data = self.addWordFrequency(resulted_data);

		if len(resulted_data)==0:
			resulted_data.append(wordCase.wordCase({
			'word':word,  
			'affix': ('' , '', '', ''),       
			'stem':'',
			'original':word,
			'vocalized':word,
			'tags':u'',
			'type':'unknown',
			'root':'',
			'template':'',
			'freq':self.wordfreq.getFreq(word, 'unknown'),
			'syntax':'',
			})
			);
		return resulted_data;
예제 #3
0
    def check_word_as_pounct(self, word):
        """
        Classify the word as a number or a punctuation sign.
        @param word: the input word.
        @type word: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = []
        # ToDo : fix it to isdigit, by moatz saad
        if word.isnumeric():
            number_case = {
                'word': word,
                'affix': ('', '', '', ''),
                'stem': '',
                'original': word,
                'vocalized': word,
                'tags': self.get_number_tags(word),
                'type': 'NUMBER',
                'freq': 0,
                'syntax': '',
            }
            detailed_result.append(wordCase.wordCase(number_case))
        if word in stem_pounct_const.POUNCTUATION:
            pounct_case = {
                'word': word,
                'affix': ('', '', '', ''),
                'stem': '',
                'original': word,
                'vocalized': word,
                'tags': stem_pounct_const.POUNCTUATION[word]['tags'],
                'type': 'POUNCT',
                'freq': 0,
                'syntax': '',
            }
            detailed_result.append(wordCase.wordCase(pounct_case))
        return detailed_result
예제 #4
0
파일: analex.py 프로젝트: ATouhou/mishkal
    def check_word_as_pounct(self, word):
        """
        Detect whether the word is a number or a punctuation mark and
        build the corresponding word cases.
        @param word: the input word.
        @type word: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = []
        # fields shared by the NUMBER and POUNCT cases; only the
        # 'tags' and 'type' entries differ between the two
        base = {
            "word": word,
            "affix": ("", "", "", ""),
            "stem": "",
            "original": word,
            "vocalized": word,
            "freq": 0,
            "syntax": "",
        }
        # ToDo : fix it to isdigit, by moatz saad
        if word.isnumeric():
            number_entry = dict(base, tags=self.get_number_tags(word), type="NUMBER")
            detailed_result.append(wordCase.wordCase(number_entry))
        if word in stem_pounct_const.POUNCTUATION:
            pounct_entry = dict(
                base,
                tags=stem_pounct_const.POUNCTUATION[word]["tags"],
                type="POUNCT",
            )
            detailed_result.append(wordCase.wordCase(pounct_entry))
        return detailed_result
예제 #5
0
파일: analex.py 프로젝트: abom/mishkal
	def check_word_as_pounct(self,word):
		"""
		Check whether the word is a number or a punctuation sign.
		@param word: the input word.
		@type word: unicode.
		@return: list of dictionaries of analyzed words with tags.
		@rtype: list.
		"""
		detailed_result = []
		# collect (tags, type) pairs first, then build one word case
		# per pair; number is tested before punctuation, as before
		candidates = []
		# ToDo : fix it to isdigit, by moatz saad
		if word.isnumeric():
			candidates.append((self.get_number_tags(word), 'NUMBER'))
		if word in stem_pounct_const.POUNCTUATION:
			candidates.append((stem_pounct_const.POUNCTUATION[word]['tags'], 'POUNCT'))
		for word_tags, word_type in candidates:
			detailed_result.append(wordCase.wordCase({
				'word': word,
				'affix': ('', '', '', ''),
				'stem': '',
				'original': word,
				'vocalized': word,
				'tags': word_tags,
				'type': word_type,
				'freq': 0,
				'syntax': '',
			}))
		return detailed_result
예제 #6
0
    def stemming_stopword(self, word):
        """
        Look up the word in the stopword dictionary and build a word
        case for every entry found.
        @param word: the input word.
        @type word: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        # one analyzed case per matching stopword entry; the lookup
        # may also provide the vocalized (tashkeel) form
        detailed_result = []
        for sw_tuple in self.swDictionary.lookup(word):
            entry = {
                "word": word,
                "affix": (sw_tuple["procletic"], "", "", sw_tuple["encletic"]),
                "stem": sw_tuple["stem"],
                "original": sw_tuple["original"],
                "vocalized": sw_tuple["vocalized"],
                "tags": sw_tuple["tags"],
                "type": sw_tuple["type"],
                "freq": "freqstopword",
                "originaltags": sw_tuple["tags"],
                "syntax": "",
            }
            detailed_result.append(wordCase.wordCase(entry))
        return detailed_result
예제 #7
0
    def steming_second_level(self, noun, noun2, procletic, encletic):
        """
		Analyze word morphologically by stemming the conjugation affixes.
		@param noun: the input noun.
		@type noun: unicode.
		@param noun2: the noun stemmed from syntaxic affixes.
		@type noun2: unicode.
		@param procletic: the syntaxic prefix extracted in the first stage.
		@type procletic: unicode.
		@param encletic: the syntaxic suffix extracted in the first stage.
		@type encletic: unicode.
		@return: list of dictionaries of analyzed words with tags.
		@rtype: list.
		"""
        detailed_result = []
        # stage 1: segment the conjugated noun into prefix/stem/suffix spans
        list_seg_conj = self.conjStemmer.segment(noun2)
        # verify affix compatibility
        list_seg_conj = self.verify_affix(
            noun2, list_seg_conj, stem_noun_const.NOMINAL_CONJUGATION_AFFIX)

        # stage 2: expand each segmentation with every vocalized form
        # of its (unvocalized) suffix, keeping only compatible ones
        list_seg_conj_voc = []
        for seg_conj in list_seg_conj:
            prefix_conj = noun2[:seg_conj[0]]
            stem_conj = noun2[seg_conj[0]:seg_conj[1]]
            suffix_conj = noun2[seg_conj[1]:]
            # NOTE(review): affix_conj computed here is unused; it is
            # recomputed in the loop below — left as-is
            affix_conj = prefix_conj + '-' + suffix_conj
            # get all vocalized form of suffixes
            for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[
                    suffix_conj]['vocalized']:
                seg_conj_voc = {
                    'prefix': '',
                    'suffix': vocalized_suffix,
                    'stem': stem_conj
                }
                # verify compatibility between procletics and affix
                if (self.is_compatible_proaffix_affix(procletic, encletic,
                                                      vocalized_suffix)):
                    # verify the existing of a noun stamp in the dictionary
                    # if self.NOUN_DICTIONARY_STAMP.has_key(stamp):
                    # list_seg_conj2.append(seg_conj)
                    list_seg_conj_voc.append(seg_conj_voc)
        list_seg_conj = list_seg_conj_voc
        # stage 3: for each surviving segmentation, look the candidate
        # stems up in the noun dictionary and emit analyzed word cases
        for seg_conj in list_seg_conj:
            prefix_conj = seg_conj['prefix']
            stem_conj = seg_conj['stem']
            suffix_conj = seg_conj['suffix']
            # whether the suffix carries a plural/dual tag
            has_plural_suffix = (
                (u"جمع"
                 in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags'])
                or
                (u"مثنى"
                 in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags'])
            )
            #print "has_plural", has_plural_suffix;
            affix_conj = '-'.join([prefix_conj, suffix_conj])
            # normalize hamza before guessing the different origins
            stem_conj = tashaphyne.normalize.normalize_hamza(stem_conj)
            if self.debug:
                print "*\t", "-".join(
                    [str(len(stem_conj)), prefix_conj, stem_conj,
                     suffix_conj]).encode("utf8")
            # generate possible stems
            # add stripped letters to the stem to constitute possible noun list
            possible_noun_list = self.getStemVariants(stem_conj, prefix_conj,
                                                      suffix_conj)
            if self.debug:
                print "\tpossible original nouns:  ", "\t".join(
                    possible_noun_list).encode('utf8')
            # search the noun in the dictionary
            # we can return the tashkeel
            infnoun_form_list = []
            for infnoun in possible_noun_list:
                # get the noun and get all its forms from the dict
                # if the noun has plural suffix, don't look up in broken plural dictionary
                infnoun_foundL = self.nounDictionary.lookup(
                    infnoun, 'unknown')
                #infnoun_found=self.find_nouns_in_dictionary(infnoun,has_plural_suffix);
                ##							listsingle=self.find_broken_plural(infnoun);
                ##							print ' *****','-'.join(listsingle).encode('utf8')
                if len(infnoun_foundL) > 0:
                    if self.debug: print "\t in dict", infnoun.encode('utf8')
                else:
                    if self.debug:
                        print infnoun.encode('utf8'), "not found in dictionary"
                infnoun_form_list += infnoun_foundL
            for noun_tuple in infnoun_form_list:
                # noun_tuple=self.nounDictionary.getEntryById(id);
                infnoun = noun_tuple['vocalized']
                # NOTE(review): originalTags stays empty here, and
                # 'original' below is never used — infnoun is stored instead
                originalTags = ()
                original = noun_tuple['vocalized']
                wordtype = noun_tuple['word_type']
                detailed_result.append(
                    wordCase.wordCase({
                        'word':
                        noun,
                        'affix':
                        (procletic, prefix_conj, suffix_conj, encletic),
                        #~ 'procletic':procletic,
                        #~ 'encletic':encletic,
                        #~ 'prefix':prefix_conj,
                        #~ 'suffix':suffix_conj,
                        'stem':
                        stem_conj,
                        'original':
                        infnoun,  #original,
                        'vocalized':
                        self.vocalize(infnoun, procletic, prefix_conj,
                                      suffix_conj, encletic),
                        'tags':
                        u':'.join(stem_noun_const.
                                  COMP_PREFIX_LIST_TAGS[procletic]['tags'] +
                                  stem_noun_const.
                                  COMP_SUFFIX_LIST_TAGS[encletic]['tags'] +
                                  stem_noun_const.
                                  CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']),
                        'type':
                        u':'.join(['Noun', wordtype]),  #'Noun',
                        #~ 'root':'',
                        #~ 'template':'',
                        'freq':
                        noun_tuple[
                            'freq'],  #self.wordfreq.getFreq(infnoun,'noun'),
                        'originaltags':
                        u':'.join(originalTags),
                        'syntax':
                        '',
                    }))
        return detailed_result
예제 #8
0
	def stemming_verb(self, verb):
		"""
		Analyze the given word morphologically as a verb: segment the
		proclitics/enclitics, then the conjugation affixes, look up the
		verb stamp in the dictionary and generate matching conjugations.
		@param verb: the input word.
		@type verb: unicode.
		@return: list of dictionaries of analyzed words with tags.
		@rtype: list.
		"""
		list_found = [];
		display_conj_result=False;
		detailed_result = [];
		verb			= verb.strip();
		verb_list		= [verb];
		# a leading alef madda may stand for hamza+alef combinations:
		# add both rewritings as alternative forms
		if verb.startswith(araby.ALEF_MADDA):
			verb_list.append(araby.ALEF_HAMZA_ABOVE + araby.ALEF_HAMZA_ABOVE+verb[1:])
			verb_list.append(araby.HAMZA+araby.ALEF+verb[1:])

		for verb in verb_list:

			# level one: strip proclitics and enclitics
			list_seg_comp=self.compStemmer.segment(verb);
			for seg in list_seg_comp:
				procletic=verb[:seg[0]];
				stem=verb[seg[0]:seg[1]]
				encletic=verb[seg[1]:]
				secondsuffix=u'';
				# case of a doubly transitive verb (two object pronouns):
				# split the enclitic into first and second object suffixes
				if stem_verb_const.TableDoubleTransitiveSuffix.has_key(encletic ):
					firstsuffix=stem_verb_const.TableDoubleTransitiveSuffix[encletic]['first'];
					secondsuffix=stem_verb_const.TableDoubleTransitiveSuffix[encletic]['second'];
					encletic=firstsuffix;


				affix = u'-'.join([procletic, encletic])
				#if self.debug: print "\t", "-".join([procletic, stem, encletic]).encode("utf8") ;
				# adjusting verbs variant
				list_stem=[stem];
				# an enclitic implies the verb is transitive; add spelling
				# variants of the stem caused by attaching the suffix
				if encletic:  #!="":
					transitive=True;
					if stem.endswith(araby.TEH + araby.MEEM + araby.WAW):
						list_stem.append(stem[:-1]);
					elif stem.endswith(araby.WAW):
						list_stem.append(stem+ araby.ALEF);
					elif stem.endswith( araby.ALEF):
						list_stem.append(stem[:-1]+ araby.ALEF_MAKSURA);

				else: transitive=False;
				if verb.startswith(araby.ALEF_MADDA):
					# the word starts with alef madda: add hamza variants
					list_stem.append(araby.ALEF_HAMZA_ABOVE + araby.ALEF_HAMZA_ABOVE+verb[1:])
					list_stem.append(araby.HAMZA+ araby.ALEF+verb[1:])

				# level two: stem the reduced verb (conjugation affixes)
				result=[];
				for verb2 in list_stem:
					#segment the conjugated verb
					list_seg_conj=self.conjStemmer.segment(verb2);

					# verify affix compatibility
					list_seg_conj = self.verify_affix(verb2, list_seg_conj, stem_verb_const.VERBAL_CONJUGATION_AFFIX);
					# verify procletics and enclitics
					# verify length of stem (at most 6 letters)
					list_seg_conj2=[];
					for seg_conj in list_seg_conj:
						if (seg_conj[1] - seg_conj[0])<=6 :
							prefix_conj  = verb2[:seg_conj[0]];
							stem_conj    = verb2[seg_conj[0]:seg_conj[1]]
							suffix_conj  = verb2[seg_conj[1]:]
							affix_conj   = prefix_conj+'-'+suffix_conj;


						# verify compatibility between procletics and affix
							if (self.is_compatible_proaffix_affix(procletic, encletic, affix_conj)):
								# verify the existing of a verb stamp in the dictionary
								if self.verbDictionary.existsAsStamp(stem_conj):
									list_seg_conj2.append(seg_conj)

					list_seg_conj     = list_seg_conj2;
					list_correct_conj = [];

					for seg_conj in list_seg_conj:
						prefix_conj = verb2[:seg_conj[0]];
						stem_conj   = verb2[seg_conj[0]:seg_conj[1]]
						suffix_conj = verb2[seg_conj[1]:]
						affix_conj  = '-'.join([prefix_conj, suffix_conj])


						# search the verb in the dictionary by stamp
						# if the verb exists in dictionary,
						# the transitivity is considered
						# if it is trilateral, return its forms and Tashkeel
						# if not, return forms without tashkeel, because the conjugator can vocalize it;
						# we can return the tashkeel if we don't need the conjugation step
						infverb_dict=self.getInfinitiveVerbByStem(stem_conj, transitive);

						infverb_dict = self.verifyInfinitiveVerbs(stem_conj, infverb_dict);


						for item in infverb_dict:
							# the haraka form is given from the dict
							inf_verb     = item['verb'];
							haraka       = item['haraka'];
							transtag     =  item['transitive'] #=='y'or not item['transitive']);
							transitive    =  (item['transitive']=='y'or not item['transitive']);

							originalTags = transtag;
							# dict tag is used to mention word dictionary tags: the original word tags like transitive attribute
							unstemed_verb= verb2;

							# conjugation step

							# ToDo, conjugate the verb with affix,
							# if exists one verb which matches, return it:
							# conjugate the verb with its affixes; if the
							# conjugation agrees with the resulting word,
							# the result is reported
							onelist_correct_conj = [];
							onelist_correct_conj = self.generate_possible_conjug(inf_verb, unstemed_verb, affix_conj, haraka, procletic, encletic, transitive);

							if len(onelist_correct_conj)>0:
								list_correct_conj+=onelist_correct_conj;
					# if 	not list_correct_conj :		print "No Verb Found ";
					# build one word case per correct conjugation; note
					# prefix_conj/suffix_conj keep their last loop values
					for conj in list_correct_conj:
						result.append(conj['verb'])

						detailed_result.append(wordCase.wordCase({
						'word':verb, 
						'affix': ( procletic, prefix_conj, suffix_conj, encletic),						
						#~ 'procletic':procletic, 
						#~ 'encletic':encletic, 
						#~ 'prefix':prefix_conj, 
						#~ 'suffix':suffix_conj, 
						'stem':stem_conj, 
						'original':conj['verb'], 
						'vocalized':self.vocalize(conj['vocalized'], procletic, encletic), 
						'tags':u':'.join((conj['tense'], conj['pronoun'])+stem_verb_const.COMP_PREFIX_LIST_TAGS[procletic]['tags']+stem_verb_const.COMP_SUFFIX_LIST_TAGS[encletic]['tags']), 
						'type':'Verb', 
						#~ 'root':'', 
						#~ 'template':'', 
						'freq':'freqverb', 
						'originaltags':originalTags, 
						'syntax':'', 
						}));

	##				result+=detect_arabic_verb(verb2, transitive, prefix_conj, suffix_conj, debug);
				list_found+=result;

		# NOTE(review): list_found is deduplicated but never returned or
		# stored — only detailed_result is used by callers
		list_found=set(list_found);
		return detailed_result
예제 #9
0
	def steming_second_level(self,noun,noun2,procletic,encletic):
		"""
		Analyze word morphologically by stemming the conjugation affixes.
		@param noun: the input noun.
		@type noun: unicode.
		@param noun2: the noun stemmed from syntaxic affixes.
		@type noun2: unicode.
		@param procletic: the syntaxic prefix extracted in the first stage.
		@type procletic: unicode.
		@param encletic: the syntaxic suffix extracted in the first stage.
		@type encletic: unicode.
		@return: list of dictionaries of analyzed words with tags.
		@rtype: list.
		"""	
		detailed_result=[];
		# stage 1: segment the conjugated noun into prefix/stem/suffix spans
		list_seg_conj=self.conjStemmer.segment(noun2);
		# verify affix compatibility
		list_seg_conj=self.verify_affix(noun2,list_seg_conj,stem_noun_const.NOMINAL_CONJUGATION_AFFIX);

		# stage 2: expand each segmentation with every vocalized form
		# of its suffix, keeping only compatible combinations
		list_seg_conj_voc=[];
		for seg_conj in list_seg_conj:
			prefix_conj=noun2[:seg_conj[0]];
			stem_conj=noun2[seg_conj[0]:seg_conj[1]]
			suffix_conj=noun2[seg_conj[1]:]
			# NOTE(review): this affix_conj is unused; it is recomputed
			# in the loop below — left as-is
			affix_conj=prefix_conj+'-'+suffix_conj;
			# get all vocalized form of suffixes
			for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['vocalized']:
				seg_conj_voc={'prefix':'','suffix':vocalized_suffix,'stem':stem_conj}
				# verify compatibility between procletics and affix
				if (self.is_compatible_proaffix_affix(procletic, encletic, vocalized_suffix)):
				# verify the existing of a noun stamp in the dictionary
				# if self.NOUN_DICTIONARY_STAMP.has_key(stamp):
					# list_seg_conj2.append(seg_conj)
					list_seg_conj_voc.append(seg_conj_voc)
		list_seg_conj=list_seg_conj_voc;
		# stage 3: look candidate stems up in the noun dictionary and
		# emit one analyzed word case per dictionary form
		for seg_conj in list_seg_conj:
			prefix_conj=seg_conj['prefix'];
			stem_conj=seg_conj['stem']
			suffix_conj=seg_conj['suffix']
			# whether the suffix carries a plural or dual tag
			has_plural_suffix=((u"جمع" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']) or( u"مثنى" in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']))
			#print "has_plural", has_plural_suffix;
			affix_conj='-'.join([prefix_conj,suffix_conj])
			# normalize hamza before guessing the different origins
			stem_conj=tashaphyne.normalize.normalize_hamza(stem_conj)
			if self.debug:
				print "*\t", "-".join([str(len(stem_conj)),prefix_conj,stem_conj,suffix_conj]).encode("utf8") ;
			# generate possible stems
			# add stripped letters to the stem to constitute possible noun list
			possible_noun_list=self.getStemVariants(stem_conj,prefix_conj,suffix_conj);
			if self.debug:
				print "\tpossible original nouns:  ","\t".join(possible_noun_list).encode('utf8');
			# search the noun in the dictionary
			# we can return the tashkeel
			infnoun_form_list=[];
			for infnoun in possible_noun_list:
				# get the noun and get all its forms from the dict
				# if the noun has plural suffix, don't look up in broken plural dictionary
				infnoun_foundL=self.nounDictionary.lookup(infnoun,'unknown');
				#infnoun_found=self.find_nouns_in_dictionary(infnoun,has_plural_suffix);
##							listsingle=self.find_broken_plural(infnoun);
##							print ' *****','-'.join(listsingle).encode('utf8')
				if len(infnoun_foundL)>0:
					if self.debug: print "\t in dict",infnoun.encode('utf8');
				else:
					if self.debug: print infnoun.encode('utf8'),"not found in dictionary"
				infnoun_form_list+=infnoun_foundL;
			for noun_tuple in infnoun_form_list:
				# noun_tuple=self.nounDictionary.getEntryById(id);
				infnoun=noun_tuple['vocalized'];
				# NOTE(review): originalTags stays empty and 'original'
				# is never used below — infnoun is stored instead
				originalTags=()
				original=noun_tuple['vocalized'];
				wordtype=noun_tuple['word_type'];
				detailed_result.append(wordCase.wordCase({
				'word':noun,
				'affix': ( procletic,
								 prefix_conj,
								suffix_conj,
								 encletic),	
				#~ 'procletic':procletic,
				#~ 'encletic':encletic,
				#~ 'prefix':prefix_conj,
				#~ 'suffix':suffix_conj,
				'stem':stem_conj,
				'original':infnoun,#original,
				'vocalized':self.vocalize(infnoun,procletic,prefix_conj,suffix_conj,encletic),
				'tags':u':'.join(stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags']+stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic]['tags']+stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']),
				'type':u':'.join(['Noun',wordtype]),#'Noun',
				#~ 'root':'',
				#~ 'template':'',
				'freq':noun_tuple['freq'],#self.wordfreq.getFreq(infnoun,'noun'),
				'originaltags':u':'.join(originalTags),
				'syntax':'',
				}));
		return detailed_result;
예제 #10
0
	def steming_second_level(self, noun, noun2, procletic, encletic, encletic_nm):
		"""
		Analyze word morphologically by stemming the conjugation affixes.
		@param noun: the input noun.
		@type noun: unicode.
		@param noun2: the noun stemmed from syntaxic affixes.
		@type noun2: unicode.
		@param procletic: the syntaxic prefix extracted in the first stage.
		@type procletic: unicode.
		@param encletic: the syntaxic suffix extracted in the first stage.
		@type encletic: unicode.
		@param encletic_nm: the syntaxic suffix extracted in the first stage (not vocalized).
		@type encletic_nm: unicode.		
		@return: list of dictionaries of analyzed words with tags.
		@rtype: list.
		"""	
		detailed_result=[];
		# segment the conjugated noun into prefix/stem/suffix spans
		list_seg_conj = self.conjStemmer.segment(noun2);
		# verify affix compatibility
		list_seg_conj = self.verify_affix(noun2, list_seg_conj, stem_noun_const.NOMINAL_CONJUGATION_AFFIX);
		# add vocalized forms of suffixes
		# and create the real affixes from the word
		list_seg_conj_voc=[];
		for seg_conj in list_seg_conj:
			stem_conj   = noun2[seg_conj[0]:seg_conj[1]]
			suffix_conj_nm = noun2[seg_conj[1]:]

			# normalize hamza before guessing the different origins
			stem_conj = araby.normalizeHamza(stem_conj)

			# generate possible stems
			# add stripped letters to the stem to constitute possible noun list
			possible_noun_list=self.getStemVariants(stem_conj, suffix_conj_nm);

			# search the noun in the dictionary
			# we can return the tashkeel
			infnoun_form_list=[];
			for infnoun in set(possible_noun_list):
				# get the noun and get all its forms from the dict
				# if the noun has plural suffix, don't look up in broken plural dictionary
				# dictionary lookups are memoized in self.CacheDictSearch
				if not self.CacheDictSearch.has_key(infnoun):
					infnoun_foundL = self.nounDictionary.lookup(infnoun);
					self.CacheDictSearch[infnoun]  = self.createDictWord(infnoun_foundL);
				else: 
					infnoun_foundL = self.CacheDictSearch[infnoun]  ;					
				infnoun_form_list.extend(infnoun_foundL);
			#print "len loooked up noun in dictionnary ",len(infnoun_form_list), len(set(infnoun_form_list));
			for noun_tuple in infnoun_form_list:
				# noun_tuple=self.nounDictionary.getEntryById(id);
				infnoun = noun_tuple['vocalized'];
				# affixes tags contains prefixes and suffixes tags
				affix_tags  =  stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
								+stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic_nm]['tags'] \
								+stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj_nm]['tags']
				# test if the given word from dictionary accepts those tags
				# given by affixes: check compatibility of the affixes with
				# the noun's features, e.g. whether it accepts the feminine marker
	
				if self.validateTags(noun_tuple, affix_tags, procletic, encletic_nm, suffix_conj_nm):
					## get all vocalized form of suffixes
					for vocalized_encletic in stem_noun_const.COMP_SUFFIX_LIST_TAGS[encletic_nm]['vocalized']:
						for vocalized_suffix in stem_noun_const.CONJ_SUFFIX_LIST_TAGS[suffix_conj_nm]['vocalized']:
							## verify compatibility between procletics and affix
							if (self.is_compatible_proaffix_affix(noun_tuple, procletic, vocalized_encletic, vocalized_suffix)):
								vocalized, semiVocalized = self.vocalize(infnoun, procletic,  vocalized_suffix, vocalized_encletic);
								vocalized_affix_tags  =  stem_noun_const.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
												+stem_noun_const.COMP_SUFFIX_LIST_TAGS[vocalized_encletic]['tags'] \
												+stem_noun_const.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']								
								
								# add some tags from dictionary entry such as
								# mamnou3 min sarf (diptote) and broken plural
								originalTags=[];
								if noun_tuple['mamnou3_sarf']==u"ممنوع من الصرف":
									originalTags.append(u"ممنوع من الصرف");
								if noun_tuple['number']==u"جمع تكسير":
									originalTags.append(u"جمع تكسير");						
									# affix_tags+=(, );
								detailed_result.append(wordCase.wordCase({
								'word':noun, 
								#~ 'affix': analex_const.AffixTuple((procletic=procletic, encletic=vocalized_encletic, prefix='', suffix=vocalized_suffix)),								
								'affix': (procletic,  '', vocalized_suffix, vocalized_encletic),								
								#~ 'procletic': , 
								#~ 'encletic':  , 
								#~ 'prefix':    '', 
								#~ 'suffix':    vocalized_suffix, 
								'stem':      stem_conj, 
								'original':  infnoun, #original, 
								'vocalized': vocalized, 
								'semivocalized':semiVocalized,
								'tags':      u':'.join(vocalized_affix_tags), 
								'type':      u':'.join(['Noun', noun_tuple['wordtype']]), #'Noun', 
								#~ 'root':      '', 
								#~ 'template':  '', 
								'freq':'freqnoun', # to note the frequency type 
								'originaltags':u':'.join(originalTags), 
								'syntax':'', 
								}));
		return detailed_result;
예제 #11
0
    def stemming_verb(self, verb):
        list_found = []
        display_conj_result = False
        detailed_result = []
        verb = verb.strip()
        verb_list = [verb]
        if verb.startswith(araby.ALEF_MADDA):
            verb_list.append(araby.ALEF_HAMZA_ABOVE + araby.ALEF_HAMZA_ABOVE +
                             verb[1:])
            verb_list.append(araby.HAMZA + araby.ALEF + verb[1:])

        for verb in verb_list:

            list_seg_comp = self.compStemmer.segment(verb)
            for seg in list_seg_comp:
                procletic = verb[:seg[0]]
                # Split the conjugated verb at the segmentation points:
                # procletic was stripped earlier; seg gives the stem span.
                stem = verb[seg[0]:seg[1]]
                encletic = verb[seg[1]:]
                secondsuffix = u''
                # Case of a doubly-transitive verb (takes two objects):
                # the compound enclitic pronoun is split into a first and a
                # second object suffix; only the first is kept as enclitic.
                # NOTE(review): dict.has_key() is Python 2 only; for Python 3
                # this must become "encletic in TableDoubleTransitiveSuffix".
                if stem_verb_const.TableDoubleTransitiveSuffix.has_key(
                        encletic):
                    firstsuffix = stem_verb_const.TableDoubleTransitiveSuffix[
                        encletic]['first']
                    secondsuffix = stem_verb_const.TableDoubleTransitiveSuffix[
                        encletic]['second']
                    encletic = firstsuffix
                    # NOTE(review): secondsuffix is never read again in this
                    # span — presumably intended for the result record; verify.

                # NOTE(review): affix is built but apparently unused below
                # (affix_conj is rebuilt per segmentation); confirm.
                affix = u'-'.join([procletic, encletic])
                #if self.debug: print "\t", "-".join([procletic, stem, encletic]).encode("utf8") ;
                # Adjust verb orthographic variants: the written stem can
                # differ from the dictionary form when an enclitic follows.
                list_stem = [stem]
                if encletic:  #!="":
                    # An attached object pronoun implies the verb is transitive.
                    transitive = True
                    if stem.endswith(araby.TEH + araby.MEEM + araby.WAW):
                        # drop the final WAW of the TUMW ending variant
                        list_stem.append(stem[:-1])
                    elif stem.endswith(araby.WAW):
                        # restore the silent ALEF after final WAW (e.g. plural)
                        list_stem.append(stem + araby.ALEF)
                    elif stem.endswith(araby.ALEF):
                        # final ALEF may stand for an original ALEF MAKSURA
                        list_stem.append(stem[:-1] + araby.ALEF_MAKSURA)

                else:
                    transitive = False
                if verb.startswith(araby.ALEF_MADDA):
                    # Word begins with ALEF MADDA: it may stand for two
                    # HAMZA-ALEFs or for HAMZA + ALEF; try both expansions.
                    list_stem.append(araby.ALEF_HAMZA_ABOVE +
                                     araby.ALEF_HAMZA_ABOVE + verb[1:])
                    list_stem.append(araby.HAMZA + araby.ALEF + verb[1:])

        # stem reduced verb : level two
                result = []
                for verb2 in list_stem:
                    # Segment the conjugated verb into (prefix, stem, suffix)
                    # candidate spans.
                    list_seg_conj = self.conjStemmer.segment(verb2)

                    # Keep only segmentations whose affixes are valid verbal
                    # conjugation affixes.
                    list_seg_conj = self.verify_affix(
                        verb2, list_seg_conj,
                        stem_verb_const.VERBAL_CONJUGATION_AFFIX)
                    # Verify procletics and enclitics, and the stem length
                    # (Arabic verb stems never exceed 6 letters).
                    list_seg_conj2 = []
                    for seg_conj in list_seg_conj:
                        if (seg_conj[1] - seg_conj[0]) <= 6:
                            prefix_conj = verb2[:seg_conj[0]]
                            stem_conj = verb2[seg_conj[0]:seg_conj[1]]
                            suffix_conj = verb2[seg_conj[1]:]
                            affix_conj = prefix_conj + '-' + suffix_conj

                            # Verify compatibility between procletics and affix
                            if (self.is_compatible_proaffix_affix(
                                    procletic, encletic, affix_conj)):
                                # Verify that a verb stamp (consonant skeleton)
                                # for this stem exists in the dictionary.
                                if self.verbDictionary.existsAsStamp(
                                        stem_conj):
                                    list_seg_conj2.append(seg_conj)

                    list_seg_conj = list_seg_conj2
                    list_correct_conj = []

                    for seg_conj in list_seg_conj:
                        prefix_conj = verb2[:seg_conj[0]]
                        stem_conj = verb2[seg_conj[0]:seg_conj[1]]
                        suffix_conj = verb2[seg_conj[1]:]
                        affix_conj = '-'.join([prefix_conj, suffix_conj])

                        # Search the verb in the dictionary by stamp.
                        # If the verb exists in the dictionary,
                        # the transitivity is considered.
                        # If it is trilateral, return its forms with Tashkeel;
                        # if not, return forms without tashkeel, because the conjugator can vocalize it.
                        # We can return the tashkeel if we don't need the conjugation step.
                        infverb_dict = self.getInfinitiveVerbByStem(
                            stem_conj, transitive)

                        infverb_dict = self.verifyInfinitiveVerbs(
                            stem_conj, infverb_dict)

                        for item in infverb_dict:
                            # The haraka (vowel of the middle radical) comes
                            # from the dictionary entry.
                            inf_verb = item['verb']
                            haraka = item['haraka']
                            transtag = item[
                                'transitive']  #=='y'or not item['transitive']);
                            # 'y' marks transitive; an empty/falsy value also
                            # counts as transitive here (unmarked entries).
                            transitive = (item['transitive'] == 'y'
                                          or not item['transitive'])

                            originalTags = transtag
                            # dict tag is used to mention word dictionary tags: the original word tags like transitive attribute
                            unstemed_verb = verb2

                            # Conjugation step:
                            # conjugate the infinitive verb with the affixes;
                            # if one conjugated form matches the input word,
                            # it is returned as a result.
                            onelist_correct_conj = []
                            onelist_correct_conj = self.generate_possible_conjug(
                                inf_verb, unstemed_verb, affix_conj, haraka,
                                procletic, encletic, transitive)

                            if len(onelist_correct_conj) > 0:
                                list_correct_conj += onelist_correct_conj
                    # if 	not list_correct_conj :		print "No Verb Found ";
                    # Build one detailed analysis record per matched conjugation.
                    # NOTE(review): prefix_conj/suffix_conj/stem_conj here hold
                    # the values from the LAST iteration of the seg_conj loop
                    # above, not necessarily those that produced conj — confirm
                    # this is intentional.
                    for conj in list_correct_conj:
                        result.append(conj['verb'])

                        detailed_result.append(
                            wordCase.wordCase({
                                'word':
                                verb,
                                'affix': (procletic, prefix_conj, suffix_conj,
                                          encletic),
                                #~ 'procletic':procletic,
                                #~ 'encletic':encletic,
                                #~ 'prefix':prefix_conj,
                                #~ 'suffix':suffix_conj,
                                'stem':
                                stem_conj,
                                'original':
                                conj['verb'],
                                'vocalized':
                                self.vocalize(conj['vocalized'], procletic,
                                              encletic),
                                'tags':
                                u':'.join(
                                    (conj['tense'], conj['pronoun']) +
                                    stem_verb_const.
                                    COMP_PREFIX_LIST_TAGS[procletic]['tags'] +
                                    stem_verb_const.
                                    COMP_SUFFIX_LIST_TAGS[encletic]['tags']),
                                'type':
                                'Verb',
                                #~ 'root':'',
                                #~ 'template':'',
                                'freq':
                                'freqverb',
                                'originaltags':
                                originalTags,
                                'syntax':
                                '',
                            }))

    ##				result+=detect_arabic_verb(verb2, transitive, prefix_conj, suffix_conj, debug);
                list_found += result

        # Deduplicate the surface forms found (the set itself is discarded;
        # only detailed_result is returned to the caller).
        list_found = set(list_found)
        return detailed_result