Example #1
    def check_word_as_pounct(self, word):
        """
        Check if the word is punctuation.
        @param word: the input word.
        @type word: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = []
        if not word:
            return detailed_result
        # ToDo : fix it to isdigit, by moatz saad
        if word.isnumeric():
            detailed_result.append(
                wordcase.WordCase({
                    'word': word,
                    'affix': ('', '', '', ''),
                    'stem': '',
                    'original': word,
                    'vocalized': word,
                    'tags': u"عدد",
                    'type': 'NUMBER',
                    'freq': 0,
                    'syntax': '',
                    'root': '',
                }))
        # test if all chars in word are punctuation
        for char in word:
            # if one char is not punctuation, break
            if char not in stem_pounct_const.POUNCTUATION:
                break
        else:
            # if all chars are punctuation, the word takes the tags of its first char
            detailed_result.append(
                wordcase.WordCase({
                    'word': word,
                    'affix': ('', '', '', ''),
                    'stem': '',
                    'original': word,
                    'vocalized': word,
                    'tags': stem_pounct_const.POUNCTUATION[word[0]]['tags'],
                    'type': 'POUNCT',
                    'freq': 0,
                    'syntax': '',
                    'root': '',
                }))

        return detailed_result
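
The method hinges on Python's for/else: the else clause runs only when the loop ends without a break, i.e. when every character was found in the punctuation table. A minimal standalone sketch of that idiom, with a made-up two-entry stand-in for stem_pounct_const.POUNCTUATION:

    # stand-in for stem_pounct_const.POUNCTUATION (assumed shape)
    POUNCTUATION = {u'!': {'tags': u'تعجب'}, u'?': {'tags': u'استفهام'}}

    def is_all_punct(word):
        for char in word:
            if char not in POUNCTUATION:
                break   # one non-punctuation char disqualifies the word
        else:
            return True  # loop finished without break: all chars matched
        return False

    print(is_all_punct(u'!?'))  # True
    print(is_all_punct(u'a!'))  # False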
Example #2
    def stemming_noun(self, noun):
        """
        Analyze a word morphologically as a noun.
        @param noun: the input noun.
        @type noun: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = []
        detailed_result.append(wordcase.WordCase({
            'word': noun,
            'affix': "",
            'stem': noun,
            'original': noun,
            'type': u'unknown',
        }))

        return detailed_result
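
This variant is a stub: it wraps the input unanalyzed as a single 'unknown' entry. A plain-dict sketch of the same fallback shape, without the wordcase dependency:

    def stemming_noun_stub(noun):
        # one fallback entry, mirroring the keys used above
        return [{
            'word': noun,
            'affix': '',
            'stem': noun,
            'original': noun,
            'type': u'unknown',
        }]

    print(stemming_noun_stub(u'كتاب'))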
Example #3
    def steming_second_level(self, noun, noun2, procletic, encletic):
        """
        Analyze a word morphologically by stemming the conjugation affixes.
        @param noun: the input noun.
        @type noun: unicode.
        @param noun2: the noun stemmed from syntactic affixes.
        @type noun2: unicode.
        @param procletic: the syntactic prefix extracted in the first stage.
        @type procletic: unicode.
        @param encletic: the syntactic suffix extracted in the first stage.
        @type encletic: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = []
        # segment the conjugated noun
        list_seg_conj = self.conj_stemmer.segment(noun2)
        # verify affix compatibility
        list_seg_conj = verify_affix(noun2, list_seg_conj,
                                     snconst.NOMINAL_CONJUGATION_AFFIX)

        # add vocalized forms of suffixes
        list_seg_conj_voc = []
        for seg_conj in list_seg_conj:
            prefix_conj = noun2[:seg_conj[0]]
            stem_conj = noun2[seg_conj[0]:seg_conj[1]]
            suffix_conj = noun2[seg_conj[1]:]
            #~affix_conj = prefix_conj+'-'+suffix_conj
            # get all vocalized form of suffixes
            for vocalized_suffix in \
            snconst.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['vocalized']:
                seg_conj_voc = {
                    'prefix': '',
                    'suffix': vocalized_suffix,
                    'stem': stem_conj
                }
                # verify compatibility between proclitics and the affix
                if (is_compatible_proaffix_affix(procletic, encletic,
                                                 vocalized_suffix)):
                    # verify the existing of a noun stamp in the dictionary
                    # if self.NOUN_DICTIONARY_STAMP.has_key(stamp):
                    # list_seg_conj2.append(seg_conj)
                    list_seg_conj_voc.append(seg_conj_voc)
        list_seg_conj = list_seg_conj_voc
        for seg_conj in list_seg_conj:
            prefix_conj = seg_conj['prefix']
            stem_conj = seg_conj['stem']
            suffix_conj = seg_conj['suffix']
            #~has_plural_suffix = ((u"جمع" in \
            #~snconst.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']) or\
            #~( u"مثنى" in snconst.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']))
            #print "has_plural", has_plural_suffix
            #~affix_conj = '-'.join([prefix_conj, suffix_conj])
            # normalize hamza before guessing different origins
            stem_conj = tashaphyne.normalize.normalize_hamza(stem_conj)
            # generate possible stems
            # add stripped letters to the stem to constitute possible noun list
            possible_noun_list = get_stem_variants(stem_conj, prefix_conj,
                                                   suffix_conj)
            # search the noun in the dictionary
            # we can return the tashkeel
            infnoun_form_list = []
            for infnoun in possible_noun_list:
                # get the noun and get all its forms from the dict
                # if the noun has plural suffix, don't look up
                # in broken plural dictionary
                infnoun_foundlist = self.noun_dictionary.lookup(
                    infnoun, 'unknown')
                infnoun_form_list += infnoun_foundlist
            for noun_tuple in infnoun_form_list:
                # noun_tuple = self.noun_dictionary.getEntryById(id)
                infnoun = noun_tuple['vocalized']
                original_tags = ()
                #~original = noun_tuple['vocalized']
                wordtype = noun_tuple['word_type']
                vocalized = vocalize(infnoun, procletic, prefix_conj,
                                     suffix_conj, encletic)
                #print "v", vocalized.encode('utf8')
                detailed_result.append(wordcase.WordCase({
                    'word': noun,
                    'affix': (procletic, prefix_conj, suffix_conj, encletic),
                    'stem': stem_conj,
                    'original': infnoun,
                    'vocalized': vocalized,
                    'semivocalized': vocalized,
                    'tags': u':'.join(
                        snconst.COMP_PREFIX_LIST_TAGS[procletic]['tags']
                        + snconst.COMP_SUFFIX_LIST_TAGS[encletic]['tags']
                        + snconst.CONJ_SUFFIX_LIST_TAGS[suffix_conj]['tags']),
                    'type': u':'.join(['Noun', wordtype]),
                    'freq': noun_tuple['freq'],
                    'originaltags': u':'.join(original_tags),
                    'syntax': '',
                }))

        return detailed_result
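
Each entry returned by conj_stemmer.segment() is a pair of indices; the word is sliced into prefix, stem, and suffix around them. A toy illustration with made-up segment boundaries:

    noun2 = u'كتابها'
    seg_conj = (0, 4)  # hypothetical (start, stop) segment
    prefix_conj = noun2[:seg_conj[0]]            # u''
    stem_conj = noun2[seg_conj[0]:seg_conj[1]]   # u'كتاب'
    suffix_conj = noun2[seg_conj[1]:]            # u'ها'
    print(prefix_conj, stem_conj, suffix_conj)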
Example #4
    def steming_second_level(self, stop, stop2, procletic, encletic_nm):
        """
        Analyze a word morphologically by stemming the conjugation affixes.
        @param stop: the input stop word.
        @type stop: unicode.
        @param stop2: the stop word stemmed from syntactic affixes.
        @type stop2: unicode.
        @param procletic: the syntactic prefix extracted in the first stage.
        @type procletic: unicode.
        @param encletic_nm: the syntactic suffix extracted in the
        first stage (not vocalized).
        @type encletic_nm: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        detailed_result = []
        # segment the conjugated stop word
        list_seg_conj = self.conj_stemmer.segment(stop2)
        # verify affix compatibility
        list_seg_conj = self.verify_affix(stop2, list_seg_conj,
                                     ssconst.STOPWORDS_CONJUGATION_AFFIX)
        # add vocalized forms of suffixes
        # and create the real affixes from the word
        #~list_seg_conj_voc = []
        for seg_conj in list_seg_conj:
            stem_conj = stop2[seg_conj[0]:seg_conj[1]]
            suffix_conj_nm = stop2[seg_conj[1]:]

            # normalize hamza before guessing different origins
            #~stem_conj = araby.normalize_hamza(stem_conj)

            # generate possible stems
            # add stripped letters to the stem to constitute possible stop list
            possible_stop_list = self.get_stem_variants(stem_conj, suffix_conj_nm)

            # search the stop in the dictionary
            # we can return the tashkeel
            infstop_form_list = []
            for infstop in set(possible_stop_list):
                # get the stop and get all its forms from the dict
                # if the stop has plural suffix, don't look up in
                #broken plural dictionary
                if infstop not in self.cache_dict_search:
                    infstop_foundlist = self.stop_dictionary.lookup(infstop)
                    self.cache_dict_search[infstop] = self.create_dict_word(
                        infstop_foundlist)
                else:
                    infstop_foundlist = self.cache_dict_search[infstop]
                infstop_form_list.extend(infstop_foundlist)
            for stop_tuple in infstop_form_list:
                # stop_tuple = self.stop_dictionary.getEntryById(id)
                original = stop_tuple['vocalized']

                # test if the given word from the dictionary accepts the
                # tags given by the affixes,
                # i.e. check affix compatibility with the word's features,
                # e.g. whether the noun accepts the feminine marker.
                #~if validate_tags(stop_tuple, affix_tags, procletic, encletic_nm, suffix_conj_nm):
                for vocalized_encletic in ssconst.COMP_SUFFIX_LIST_TAGS[
                        encletic_nm]['vocalized']:
                    for vocalized_suffix in ssconst.CONJ_SUFFIX_LIST_TAGS[
                            suffix_conj_nm]['vocalized']:
                        # affixes tags contains prefixes and suffixes tags
                        affix_tags = ssconst.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
                                  +ssconst.COMP_SUFFIX_LIST_TAGS[vocalized_encletic]['tags'] \
                                  +ssconst.CONJ_SUFFIX_LIST_TAGS[vocalized_suffix]['tags']
                        # verify compatibility between proclitics and affixes
                        valid = self.validate_tags(stop_tuple, affix_tags, procletic, encletic_nm)
                        compatible = self.is_compatible_proaffix_affix(
                            stop_tuple, procletic, vocalized_encletic,
                            vocalized_suffix)
                        if valid and compatible:
                            vocalized, semi_vocalized = self.vocalize(
                                original, procletic, vocalized_suffix,
                                vocalized_encletic)
                            vocalized = self.ajust_vocalization(vocalized)
                            #ToDo:
                            # if the stop word is inflected or not
                            is_inflected = u"مبني" if stop_tuple[
                                'is_inflected'] == 0 else u"معرب"
                            #add some tags from dictionary entry as
                            # use action and object_type
                            original_tags = u":".join([
                                stop_tuple['word_type'],
                                stop_tuple['word_class'],
                                is_inflected,
                                stop_tuple['action'],
                            ])
                            #~print "STOP_TUPEL[action]:", stop_tuple['action'].encode("utf8")
                            # generate word case
                            detailed_result.append(
                                wordcase.WordCase({
                                    'word': stop,
                                    'affix': (procletic, '', vocalized_suffix,
                                              vocalized_encletic),
                                    'stem': stem_conj,
                                    'original': original,
                                    'vocalized': vocalized,
                                    'semivocalized': semi_vocalized,
                                    'tags': u':'.join(affix_tags),
                                    'type': u':'.join(
                                        ['STOPWORD',
                                         stop_tuple['word_type']]),
                                    'freq': 'freqstopword',  # frequency type
                                    'originaltags': original_tags,
                                    "action": stop_tuple['action'],
                                    "object_type": stop_tuple['object_type'],
                                    "need": stop_tuple['need'],
                                    'syntax': '',
                                }))
        return detailed_result
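
Dictionary lookups are memoized in cache_dict_search so repeated candidate stems hit the dictionary only once. A reduced sketch of that pattern, with a fake lookup standing in for stop_dictionary.lookup():

    cache_dict_search = {}

    def lookup(stem):
        # stand-in for self.stop_dictionary.lookup()
        return [stem + u'-entry']

    def cached_lookup(stem):
        if stem not in cache_dict_search:
            cache_dict_search[stem] = lookup(stem)
        return cache_dict_search[stem]

    print(cached_lookup(u'في'))
    print(cached_lookup(u'في'))  # second call is served from the cache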
Example #5
    def stemming_noun(self, noun_in):
        """
        Analyze a word morphologically as a noun.
        @param noun_in: the input noun.
        @type noun_in: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """
        self.set_error_code('')
        if not noun_in:
            self.set_error_code('Empty word')
            return None
        debug = self.debug
        #~list_found = []
        detailed_result = []
        noun_list = [
            noun_in,
        ] + self.get_noun_variants(noun_in)
        word_segmented_list = []
        for noun in noun_list:
            list_seg_comp = self.comp_stemmer.segment(noun)
            # filter
            list_seg_comp = self.verify_affix(noun, list_seg_comp,
                                              SNC.COMP_NOUN_AFFIXES)
            # handle enclitics with multiple vocalizations
            for seg in list_seg_comp:
                proclitic_nm = noun[:seg[0]]
                stem = noun[seg[0]:seg[1]]
                enclitic_nm = noun[seg[1]:]
                # adjust noun variants
                list_stem = [
                    stem,
                ] + self.get_input_stem_variants(stem, enclitic_nm)

                # stem reduced noun : level two
                for stem in list_stem:
                    word_seg = {
                        'noun': noun,
                        'stem_comp': stem,
                        'pro': proclitic_nm,
                        'enc': enclitic_nm,
                    }
                    word_segmented_list.append(word_seg)
        if not word_segmented_list:
            self.set_error_code(" First level segmentation error")

        # level two

        tmp_list = []
        if debug: print("after first level")
        if debug:
            #~ print(repr(word_segmented_list).replace(
            #~ '},', '},\n').decode("unicode-escape"))
            print(arepr(noun_in))
            print(print_table(word_segmented_list))

        for word_seg in word_segmented_list:

            #~ detailed_result.extend(
            #~ self.steming_second_level(word_seg['noun'], word_seg['stem_comp'],
            #~ word_seg['pro'], word_seg['enc']))
            #~ detailed_result_one = []
            # segment the conjugated noun
            list_seg_conj = self.conj_stemmer.segment(word_seg['stem_comp'])
            # verify affix compatibility
            # filter
            list_seg_conj = self.verify_affix(word_seg['stem_comp'],
                                              list_seg_conj,
                                              SNC.NOMINAL_CONJUGATION_AFFIX)
            # add vocalized forms of suffixes
            # and create the real affixes from the word
            for seg_conj in list_seg_conj:
                stem_conj = word_seg['stem_comp'][:seg_conj[1]]
                suffix = word_seg['stem_comp'][seg_conj[1]:]
                stem_conj = ar.normalize_hamza(stem_conj)
                stem_conj_list = self.get_stem_variants(stem_conj, suffix)

                # generate possible stems
                # add stripped letters to the stem to constitute possible noun list
                for stem in stem_conj_list:
                    word_seg_l2 = word_seg.copy()
                    # normalize hamza before guessing different origins
                    word_seg_l2['stem_conj'] = stem
                    word_seg_l2['suffix'] = suffix
                    #affixes tags contains prefixes and suffixes tags
                    word_seg_l2['affix_tags'] = list(
                        set(SNC.COMP_PREFIX_LIST_TAGS[word_seg_l2['pro']]
                            ['tags'] + SNC.COMP_SUFFIX_LIST_TAGS[
                                word_seg_l2['enc']]['tags'] +
                            SNC.CONJ_SUFFIX_LIST_TAGS[
                                word_seg_l2['suffix']]['tags']))
                    tmp_list.append(word_seg_l2)

        if debug: print("after second level")
        if debug:
            print(arepr(noun_in))
            print(print_table(tmp_list))
        # lookup in dictionary
        if not tmp_list:
            self.set_error_code(" Second level segmentation error")
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # search the noun in the dictionary
            # we can return the tashkeel
            inf_noun = word_seg['stem_conj']
            # get the noun and get all its forms from the dict
            # if the noun has plural suffix, don't look up in
            #broken plural dictionary
            if inf_noun in self.cache_dict_search:
                infnoun_foundlist = self.cache_dict_search[inf_noun]
            else:
                infnoun_foundlist = self.lookup_dict(inf_noun)
                self.cache_dict_search[inf_noun] = infnoun_foundlist

            for noun_tuple in infnoun_foundlist:
                word_seg_l3 = word_seg.copy()
                word_seg_l3["original"] = noun_tuple['vocalized']
                word_seg_l3["noun_tuple"] = dict(noun_tuple)
                tmp_list.append(word_seg_l3)

        if debug: print("after lookup dict")
        if debug:
            print(arepr(noun_in))
            noun_tuples = [item['noun_tuple'] for item in tmp_list]
            print(print_table(noun_tuples))
        # test compatibility of noun_tuple with affixes and proclitics
        # and generate vocalized affixes and suffixes
        if not tmp_list:
            self.set_error_code("Not exists in dictionary")
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # test if the given word from the dictionary accepts the
            # tags given by the affixes,
            # i.e. check affix compatibility with the noun's features,
            # e.g. whether the noun accepts the feminine marker.
            if self.validate_tags(word_seg['noun_tuple'],
                                  word_seg['affix_tags'], word_seg['pro'],
                                  word_seg['enc'], word_seg['suffix']):
                ## get all vocalized form of suffixes
                for pro_voc in SNC.COMP_PREFIX_LIST_TAGS[
                        word_seg['pro']]['vocalized']:
                    for enc_voc in SNC.COMP_SUFFIX_LIST_TAGS[
                            word_seg['enc']]['vocalized']:
                        for suf_voc in SNC.CONJ_SUFFIX_LIST_TAGS[
                                word_seg['suffix']]['vocalized']:
                            ## verify compatibility between proclitics and affix
                            if self.__check_clitic_affix(
                                    word_seg['noun_tuple'], pro_voc, enc_voc,
                                    suf_voc):
                                # get affix tags
                                affix_tags_voc = SNC.COMP_PREFIX_LIST_TAGS[pro_voc]['tags']\
                                  +SNC.COMP_SUFFIX_LIST_TAGS[enc_voc]['tags']\
                                  +SNC.CONJ_SUFFIX_LIST_TAGS[suf_voc]['tags']
                                word_seg_l4 = word_seg.copy()
                                word_seg_l4['suf_voc'] = suf_voc
                                word_seg_l4['enc_voc'] = enc_voc
                                word_seg_l4['affix_tags'] = affix_tags_voc
                                tmp_list.append(word_seg_l4)

        if debug: print("after check compatibility")
        if debug:
            print(arepr(noun_in))
            noun_tuples = [item['noun_tuple'] for item in tmp_list]
            print(print_table(noun_tuples))
        # Generate results
        if not tmp_list:
            self.set_error_code("Affixes not compatible")

        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # get vocalized and semi-vocalized (without inflection) forms
            #~ vocalized, semi_vocalized, _ = self.vocalize(
            voca_tuple_list = self.vocalize(
                word_seg['noun_tuple']['vocalized'], word_seg['pro'],
                word_seg['suf_voc'], word_seg['enc_voc'])
            for vocalized, semi_vocalized, _ in voca_tuple_list:
                #add some tags from dictionary entry as
                #mamnou3 min sarf and broken plural
                original_tags = []
                if word_seg['noun_tuple']['mankous'] == u"Tk":
                    original_tags.append(u"منقوص")
                # if there are several cases, e.g. feminine plural in the mansoub and majrour cases
                if 'cases' in SNC.CONJ_SUFFIX_LIST_TAGS[word_seg['suf_voc']]:
                    list_cases = SNC.CONJ_SUFFIX_LIST_TAGS[
                        word_seg['suf_voc']]['cases']
                else:
                    list_cases = ('', )
                for case in list_cases:
                    voc_affix_case = word_seg['affix_tags'] + (case, )
                    # filter empty
                    voc_affix_case = [vac for vac in voc_affix_case if vac]
                    detailed_result.append(
                        wordcase.WordCase({
                            'word': noun_in,
                            'affix': (word_seg['pro'], '',
                                      word_seg['suf_voc'],
                                      word_seg['enc_voc']),
                            'stem': word_seg['stem_conj'],
                            'root': ar.normalize_hamza(
                                word_seg['noun_tuple'].get('root', '')),
                            'original': word_seg['noun_tuple']['vocalized'],
                            'vocalized': vocalized,
                            'semivocalized': semi_vocalized,
                            'tags': u':'.join(voc_affix_case),
                            'type': u':'.join(
                                ['Noun',
                                 word_seg['noun_tuple']['wordtype']]),
                            'number': word_seg['noun_tuple']['number'],
                            'gender': word_seg['noun_tuple']['gender'],
                            'freq': 'freqnoun',  # frequency type
                            'originaltags': u':'.join(original_tags),
                            'syntax': '',
                        }))
        if not detailed_result:
            self.set_error_code("Forms are not generated")

        if debug: print("after generate result")
        if debug: print(len(detailed_result))
        #~ if debug: print repr(detailed_result).replace('},','},\n').decode("unicode-escape")
        return detailed_result
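
The method is organized as a staged pipeline: each level reads word_segmented_list, copies and enriches the surviving candidate dicts into tmp_list, then swaps the lists. A schematic of one stage, with made-up data and a stand-in filter:

    word_segmented_list = [{'stem_conj': u'كتاب'}, {'stem_conj': u''}]
    tmp_list = []
    for word_seg in word_segmented_list:
        if word_seg['stem_conj']:             # stage filter (stand-in)
            word_seg_l2 = word_seg.copy()     # copy rather than mutate
            word_seg_l2['tags'] = [u'اسم']
            tmp_list.append(word_seg_l2)
    word_segmented_list = tmp_list            # the next stage starts here
    print(word_segmented_list)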
Example #6
    def stemming_verb(self, verb_in):
        """
        Stemming verb
        @param verb_in: given verb
        @type verb_in: unicode
        @return: stemmed words.
        @rtype: list.
        """
        #~ list_found = []
        detailed_result = []
        verb_list = [
            verb_in,
        ] + self.get_verb_variants(verb_in)

        #list of segmented words
        word_segmented_list = []
        for verb in verb_list:

            list_seg_comp = self.comp_stemmer.segment(verb)
            for seg in list_seg_comp:
                proclitic = verb[:seg[0]]
                stem = verb[seg[0]:seg[1]]
                enclitic = verb[seg[1]:]
                #~ print "stem_verb affix 93", "-".join([proclitic, stem, enclitic]).encode('utf8')
                #~secondsuffix = u''
                # case of a doubly transitive verb (two objects)
                if enclitic in SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX:
                    firstsuffix = \
                    SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX[enclitic]['first']
                    enclitic = firstsuffix

                list_stem = [stem] + self.get_in_stem_variants(stem, enclitic)
                #if enclitic, then transitive is ok
                transitive_comp = bool(enclitic)
                for stm in list_stem:
                    word_seg = {
                        "verb": verb,
                        "pro": proclitic,
                        "enc": enclitic,
                        'stem_comp': stm,
                        'trans_comp': transitive_comp,
                    }
                    word_segmented_list.append(word_seg)

        # second level for segmented word
        tmp_list = []
        #~ print 'first level', verb_in, len(word_segmented_list)
        for word_seg in word_segmented_list:
            verb2 = word_seg['stem_comp']
            # stem reduced verb : level two
            #segment the conjugated verb
            list_seg_conj = self.conj_stemmer.segment(verb2)

            # verify affix compatibility
            list_seg_conj = self.verify_affix(verb2, list_seg_conj,
                                              SVC.VERBAL_CONJUGATION_AFFIX)
            # verify proclitics and enclitics
            # verify length of stem
            for seg_conj in list_seg_conj:
                if (seg_conj[1] - seg_conj[0]) <= 6:

                    #word seg in level 2
                    word_seg_l2 = word_seg.copy()
                    word_seg_l2["prefix"] = verb2[:seg_conj[0]]
                    word_seg_l2["stem_conj"] = verb2[seg_conj[0]:seg_conj[1]]
                    word_seg_l2["suffix"] = verb2[seg_conj[1]:]
                    tmp_list.append(word_seg_l2)

        # verify compatibility between proclitics and affixes
        word_segmented_list = tmp_list
        #~ print 'compatibility', verb_in, len(tmp_list)
        tmp_list = []
        for word_seg in word_segmented_list:
            # verify compatibility between proclitics and affixes
            proclitic = word_seg['pro']
            enclitic = word_seg['enc']
            affix_conj = u"-".join([word_seg['prefix'], word_seg['suffix']])
            if self.__check_clitic_affix(proclitic, enclitic, affix_conj):
                tmp_list.append(word_seg.copy())

        #~ print 'stamp', verb_in, len(tmp_list)
        # verify existence of candidate verb by stamp
        word_segmented_list = tmp_list
        #~ tmp_list = []
        #~ for word_seg in word_segmented_list:
        #~ # verify existance of condidate verb by stamp
        #~ if self.verb_dictionary.exists_as_stamp(word_seg['stem_conj']):
        #~ tmp_list.append(word_seg.copy())

        #print 'infinitive', verb_in, len(tmp_list)
        #~ # get infinitive of condidate verbs
        #~ word_segmented_list = tmp_list
        #~ tmp_list = []
        #~ for word_seg in word_segmented_list:
        #~ # get infinitive of condidate verb by stamp

        #~ # search the verb in the dictionary by stamp
        #~ # if the verb exists in dictionary,
        #~ # The transitivity is consedered
        #~ # if is trilateral return its forms and Tashkeel
        #~ # if not return forms without tashkeel,
        #~ #because the conjugator can vocalized it,
        #~ # we can return the tashkeel if we don't need the
        #~ #conjugation step
        #~ infverb_dict = self.__get_infinitive_verb_by_stem(
        #~ word_seg['stem_conj'], word_seg['trans_comp'])
        #print "list possible verbs", len(infverb_dict)
        #for item in infverb_dict:
        #print item['verb']
        #~ # filter verbs
        #~ infverb_dict = self.__verify_infinitive_verbs(
        #~ word_seg['stem_conj'], infverb_dict)

        #~ for item in infverb_dict:
        #~ #The haraka from is given from the dict
        #~ word_seg_l3 = word_seg.copy()
        #~ word_seg_l3['inf'] = item['verb']
        #~ word_seg_l3['haraka'] = item['haraka']
        #~ word_seg_l3['root'] = item.get('root','')
        #~ word_seg_l3['transitive'] = bool(item['transitive'] in ('y',
        #~ 1))
        #~ tmp_list.append(word_seg_l3)
        #~ # conjugation step

        #~ print repr(tmp_list).replace('},','},\n').decode("unicode-escape")
        #~ print 'conj', verb_in, len(tmp_list)
        # get conjugation for every infinitive verb
        #~ word_segmented_list = tmp_list
        #~ tmp_list = []
        #~ for word_seg in word_segmented_list:
        #~ # ToDo, conjugate the verb with affix,
        #~ # if exists one verb which match, return it
        #~ # conjugate the verb with the affixes
        #~ # if the conjugation agrees with the resulting word
        #~ # the result is reported
        #~ one_correct_conj = self.__generate_possible_conjug(
        #~ word_seg['inf'], word_seg['stem_comp'],
        #~ word_seg['prefix'] + '-' + word_seg['suffix'],
        #~ word_seg['haraka'], word_seg['pro'], word_seg['enc'],
        #~ word_seg['transitive'])

        ##~ print "len correct_conj", len(one_correct_conj)
        #~ for conj in one_correct_conj:
        #~ word_seg_l4 = word_seg.copy()
        #~ word_seg_l4['conj'] = conj.copy()
        #~ tmp_list.append(word_seg_l4)

        #~ print 'result', verb_in, len(tmp_list)
        # generate all resulted data
        #~ word_segmented_list = tmp_list

        # filter invalid verb stems, like the TEH MARBUTA case
        word_segmented_list = [
            x for x in word_segmented_list
            if self.is_valid_verb_stem(x['stem_conj'])
        ]

        # add root and lemma
        for word_seg in word_segmented_list:
            # choose a root for the candidate's conjugation stem
            word_seg['root'] = self.choose_wazn_root(word_seg['stem_conj'])
        # remove empty roots
        tmp_list = [x for x in word_segmented_list if x['root']]
        # if tmp_list is empty, no candidate has a root;
        # such candidates should be removed, but they are
        # kept temporarily for testing
        if tmp_list:
            word_segmented_list = tmp_list

        # create result

        for word_seg in word_segmented_list:
            #~ conj = word_seg['conj']
            #~ vocalized, semivocalized = self.vocalize(
            #~ conj['vocalized'], word_seg['pro'], word_seg['enc'])
            tag_type = 'Verb'
            #~ original_tags = "y" if conj['transitive'] else "n"
            stem = word_seg['stem_conj']
            detailed_result.append(
                wordcase.WordCase({
                    'word': word_seg['verb'],
                    'affix': (word_seg['pro'], word_seg['prefix'],
                              word_seg['suffix'], word_seg['enc']),
                    'stem': stem,
                    #~ 'root': "VTODO",
                    'root': self.choose_wazn_root(stem),
                    "original": "VTODO",
                    #~ 'root': ar.normalize_hamza(word_seg.get('root', '')),
                    #~ 'original': conj['verb'],
                    #~ 'vocalized': vocalized,
                    #~ 'semivocalized': semivocalized,
                    #~ 'tags': u':'.join((conj['tense'], conj['pronoun']) +
                    #~     SVC.COMP_PREFIX_LIST_TAGS[proclitic]['tags'] +
                    #~     SVC.COMP_SUFFIX_LIST_TAGS[enclitic]['tags']),
                    'type': tag_type,
                    #~ 'number': conj['pronoun_tags'].get('number', ''),
                    #~ 'gender': conj['pronoun_tags'].get('gender', ''),
                    #~ 'person': conj['pronoun_tags'].get('person', ''),
                    #~ 'tense2': conj['tense_tags'].get('tense', ''),
                    #~ 'voice': conj['tense_tags'].get('voice', ''),
                    #~ 'mood': conj['tense_tags'].get('mood', ''),
                    #~ 'confirmed': conj['tense_tags'].get('confirmed', ''),
                    #~ 'transitive': conj['transitive'],
                    #~ 'tense': conj['tense'],
                    #~ 'pronoun': conj['pronoun'],
                    #~ 'freq': 'freqverb',
                    #~ 'originaltags': original_tags,
                    #~ 'syntax': '',
                }))

        return detailed_result
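
Note the root-filtering step keeps the unfiltered candidates whenever no root was found, so analysis still proceeds. The pattern in isolation:

    word_segmented_list = [{'root': u''}, {'root': u''}]
    tmp_list = [x for x in word_segmented_list if x['root']]
    if tmp_list:  # narrow the list only when at least one root survived
        word_segmented_list = tmp_list
    print(len(word_segmented_list))  # 2: falls back to the unfiltered list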
Example #7
    def check_word(self, word, guessedtag=""):
        """
        Analyze one word morphologically (as punctuation, stop word, verb, or noun).
        @param word: the input word.
        @type word: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """

        word = araby.strip_tatweel(word)
        word_vocalised = word
        word_nm = araby.strip_tashkeel(word)
        # get analysed details from cache if used
        if self.allow_cache_use and self.cache.is_already_checked(word_nm):
            #~ print (u"'%s'"%word).encode('utf8'), 'found'
            resulted_data = self.cache.get_checked(word_nm)
        else:
            resulted_data = []
            # if the word is punctuation
            resulted_data += self.check_word_as_pounct(word_nm)
            # Done: if the word is a stop word we have some problems:
            # a stop word can also be a normal word (verb or noun),
            # so it must be considered in future work
            # if word is stopword allow stop words analysis
            if araby.is_arabicword(word_nm):
                resulted_data += self.check_word_as_stopword(word_nm)

                # if the word is a verb
                # issue: some excluded (stop) words are also verbs or nouns
                #~if  self.tagger.has_verb_tag(guessedtag) or \
                #~self.tagger.is_stopword_tag(guessedtag):
                #~resulted_data += self.check_word_as_verb(word_nm)
                resulted_data += self.check_word_as_verb(word_nm)
                #print "is verb", rabti, len(resulted_data)
                #if word is noun
                #~if self.tagger.has_noun_tag(guessedtag) or \
                #~self.tagger.is_stopword_tag(guessedtag):
                #~resulted_data += self.check_word_as_noun(word_nm)
                resulted_data += self.check_word_as_noun(word_nm)
            if len(resulted_data) == 0:
                #print (u"1 _unknown %s-%s"%(word, word_nm)).encode('utf8')
                # check the word as unknown
                resulted_data += self.check_word_as_unknown(word_nm)
                # check if the word is normalized and solutions are equivalent
            resulted_data = self.check_normalized(word_vocalised,
                                                  resulted_data)
            #check if the word is shadda like

            resulted_data = self.check_shadda(word_vocalised, resulted_data,
                                              self.fully_vocalized_input)

            # add word frequency information in tags
            resulted_data = self.add_word_frequency(resulted_data)

            # add the stemmed words details into Cache
            data_list_to_serialize = [w.__dict__ for w in resulted_data]
            if self.allow_cache_use:
                self.cache.add_checked(word_nm, data_list_to_serialize)

        # keep only results consistent with the input's partial vocalization
        if self.partial_vocalization_support:
            resulted_data = self.check_partial_vocalized(
                word_vocalised, resulted_data)

        if len(resulted_data) == 0:
            error_code = self.get_error_code()
            resulted_data.append(
                wordcase.WordCase({
                    'word': word,
                    'affix': ('', '', '', ''),
                    'stem': word,
                    'original': word,
                    'vocalized': word,
                    'semivocalized': word,
                    'tags': u'%s' % error_code,
                    'type': 'unknown',
                    'root': '',
                    'template': '',
                    'freq': self.wordfreq.get_freq(word, 'unknown'),
                    'syntax': '',
                }))
        return resulted_data
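
check_word reads like the public entry point of qalsadi's Analex class. A hedged usage sketch, assuming that class name and that WordCase exposes its keys as attributes (the cache code above reads w.__dict__):

    import qalsadi.analex  # assumed host module for this method

    analyzer = qalsadi.analex.Analex()
    for case in analyzer.check_word(u'فتضحكوا'):
        print(case.word, case.type)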
Example #8
    def stemming_verb(self, verb_in):
        """
        Stemming verb
        @param verb_in: given verb
        @type verb_in: unicode
        @return: stemmed words.
        @rtype: list.
        """
        if not verb_in:
            return None
        #~ list_found = []
        detailed_result = []
        verb_list = [
            verb_in,
        ] + self.get_verb_variants(verb_in)
        debug = self.debug
        #list of segmented words
        word_segmented_list = []
        for verb in verb_list:

            list_seg_comp = self.comp_stemmer.segment(verb)
            for seg in list_seg_comp:
                proclitic = verb[:seg[0]]
                stem = verb[seg[0]:seg[1]]
                enclitic = verb[seg[1]:]
                #~ print "stem_verb affix 93", "-".join([proclitic, stem, enclitic]).encode('utf8')
                #~secondsuffix = u''
                # case of a doubly transitive verb (two objects)
                if enclitic in SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX:
                    firstsuffix = \
                    SVC.TABLE_DOUBLE_TRANSITIVE_SUFFIX[enclitic]['first']
                    enclitic = firstsuffix

                list_stem = [stem] + self.get_in_stem_variants(stem, enclitic)
                #if enclitic, then transitive is ok
                transitive_comp = bool(enclitic)
                for stm in list_stem:
                    word_seg = {
                        "verb": verb,
                        "pro": proclitic,
                        "enc": enclitic,
                        'stem_comp': stm,
                        'trans_comp': transitive_comp,
                    }
                    word_segmented_list.append(word_seg)
        if debug: print("after first level")
        if debug:
            #~ print(repr(word_segmented_list).replace(
            #~ '},', '},\n').decode("unicode-escape"))
            print(arepr(verb_in))
            print(print_table(word_segmented_list))
        # second level for segmented word
        tmp_list = []
        #~ print 'first level', verb_in, len(word_segmented_list)
        for word_seg in word_segmented_list:
            verb2 = word_seg['stem_comp']
            # stem reduced verb : level two
            #segment the conjugated verb
            list_seg_conj = self.conj_stemmer.segment(verb2)

            # verify affix compatibility
            list_seg_conj = self.verify_affix(verb2, list_seg_conj,
                                              SVC.VERBAL_CONJUGATION_AFFIX)
            # verify proclitics and enclitics
            # verify length of stem
            for seg_conj in list_seg_conj:
                if (seg_conj[1] - seg_conj[0]) <= 6:

                    #word seg in level 2
                    word_seg_l2 = word_seg.copy()
                    word_seg_l2["prefix"] = verb2[:seg_conj[0]]
                    word_seg_l2["stem_conj"] = verb2[seg_conj[0]:seg_conj[1]]
                    word_seg_l2["suffix"] = verb2[seg_conj[1]:]
                    tmp_list.append(word_seg_l2)

        # verify compatibility between proclitics and affixes
        word_segmented_list = tmp_list
        #~ print 'compatibility', verb_in, len(tmp_list)
        tmp_list = []
        for word_seg in word_segmented_list:
            # verify compatibility between proclitics and affixes
            proclitic = word_seg['pro']
            enclitic = word_seg['enc']
            affix_conj = u"-".join([word_seg['prefix'], word_seg['suffix']])
            if self.__check_clitic_affix(proclitic, enclitic, affix_conj):
                tmp_list.append(word_seg.copy())

        #~ print 'stamp', verb_in, len(tmp_list)
        # verify existence of candidate verb by stamp
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # verify existence of candidate verb by stamp
            if self.exists_as_stamp(word_seg['stem_conj']):
                tmp_list.append(word_seg.copy())

        if debug: print("after second level")
        if debug:
            print(arepr(verb_in))
            print(print_table(tmp_list))
        #~ print 'infinitive', verb_in, len(tmp_list)
        # get infinitives of candidate verbs
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # get infinitive of candidate verb by stamp

            # search the verb in the dictionary by stamp.
            # if the verb exists in the dictionary,
            # its transitivity is considered:
            # if it is trilateral, return its forms and tashkeel;
            # if not, return forms without tashkeel,
            # because the conjugator can vocalize it.
            # we could return the tashkeel here if we did not need the
            # conjugation step
            infverb_dict = self.__get_infinitive_verb_by_stem(
                word_seg['stem_conj'], word_seg['trans_comp'])
            if debug: print("infinitive candidat verbs")
            if debug:
                print(arepr(verb_in))
                print(print_table(infverb_dict))
            #~ print "list possible verbs", len(infverb_dict)
            #~ for item in infverb_dict:
            #~ print item['verb']
            # filter verbs
            infverb_dict = self.__verify_infinitive_verbs(
                word_seg['stem_conj'], infverb_dict)

            if debug: print("valid infinitive candidat verbs")
            if debug:
                print(arepr(verb_in))
                print(print_table(infverb_dict))
            for item in infverb_dict:
                # the haraka is given by the dict
                word_seg_l3 = word_seg.copy()
                word_seg_l3['inf'] = item['verb']
                word_seg_l3['haraka'] = item['haraka']
                word_seg_l3['root'] = item.get('root', '')
                word_seg_l3['transitive'] = bool(item['transitive'] in ('y',
                                                                        1))
                tmp_list.append(word_seg_l3)
                # conjugation step
        if debug: print("after lookup dict")
        if debug:
            print(arepr(verb_in))
            print(print_table(tmp_list))
        #~ print repr(tmp_list).replace('},','},\n').decode("unicode-escape")
        #~ print 'conj', verb_in, len(tmp_list)
        # get conjugation for every infinitive verb
        word_segmented_list = tmp_list
        tmp_list = []
        for word_seg in word_segmented_list:
            # ToDo: conjugate the verb with the affixes;
            # if one conjugated verb matches, return it,
            # i.e. if the conjugation agrees with the resulting word,
            # the result is reported
            one_correct_conj = self.__generate_possible_conjug(
                word_seg['inf'], word_seg['stem_comp'],
                word_seg['prefix'] + '-' + word_seg['suffix'],
                word_seg['haraka'], word_seg['pro'], word_seg['enc'],
                word_seg['transitive'])

            #~ print "len correct_conj", len(one_correct_conj)
            for conj in one_correct_conj:
                word_seg_l4 = word_seg.copy()
                word_seg_l4['conj'] = conj.copy()
                tmp_list.append(word_seg_l4)
        if debug: print("after generating conjugation")
        if debug:
            print(arepr(verb_in))
            conjs = [item['conj'] for item in tmp_list]
            print(print_table(conjs))
        #~ print 'result', verb_in, len(tmp_list)
        # generate all resulted data
        word_segmented_list = tmp_list

        #~ tmp_list = []
        for word_seg in word_segmented_list:
            conj = word_seg['conj']
            #~ vocalized, semivocalized = self.vocalize(
            vocal_tuple_list = self.vocalize(conj['vocalized'],
                                             word_seg['pro'], word_seg['enc'])
            tag_type = 'Verb'
            original_tags = "y" if conj['transitive'] else "n"
            for vocalized, semivocalized in vocal_tuple_list:
                # prepare tags from the conjugation and this candidate's clitics
                tags = self.prepare_tags(conj, word_seg['pro'],
                                         word_seg['enc'])

                detailed_result.append(
                    wordcase.WordCase({
                        'word': word_seg['verb'],
                        'affix': (word_seg['pro'], word_seg['prefix'],
                                  word_seg['suffix'], word_seg['enc']),
                        'stem': word_seg['stem_conj'],
                        'root': ar.normalize_hamza(word_seg.get('root', '')),
                        'original': conj['verb'],
                        'vocalized': vocalized,
                        'semivocalized': semivocalized,
                        'tags': tags,
                        'type': tag_type,
                        'number': conj['pronoun_tags'].get('number', ''),
                        'gender': conj['pronoun_tags'].get('gender', ''),
                        'person': conj['pronoun_tags'].get('person', ''),
                        'tense2': conj['tense_tags'].get('tense', ''),
                        'voice': conj['tense_tags'].get('voice', ''),
                        'mood': conj['tense_tags'].get('mood', ''),
                        'confirmed': conj['tense_tags'].get('confirmed', ''),
                        'transitive': conj['transitive'],
                        'tense': conj['tense'],
                        'pronoun': conj['pronoun'],
                        'freq': 'freqverb',
                        'originaltags': original_tags,
                        'syntax': '',
                    }))

        return detailed_result
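
The final loop expands every (vocalized, semivocalized) pair returned by vocalize() into its own result record. A reduced sketch with a stand-in vocalize():

    def vocalize(conj_voc, pro, enc):
        # stand-in: the real method derives pairs from clitic tables
        full = pro + conj_voc + enc
        return [(full, full)]

    results = []
    for vocalized, semivocalized in vocalize(u'ضَحِكُوا', u'فَ', u''):
        results.append({'vocalized': vocalized,
                        'semivocalized': semivocalized})
    print(results)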