def getstem(self, word):
    """Return the best stem for *word*.

    Stop words are delegated to ``stop_stem``.  Otherwise ALEF_MADDA is
    normalized to HAMZA + ALEF and the word is stemmed with the noun
    stemmer, the verb stemmer, or the generic stemmer, depending on the
    part of speech detected for the original (un-normalized) word.
    """
    word_in = word
    if is_stop(word):
        return stop_stem(word)
    # Normalize ALEF_MADDA into HAMZA + ALEF before stemming.
    word = re.sub(u"[%s]" % (araby.ALEF_MADDA),
                  araby.HAMZA + araby.ALEF, word)
    if self.is_noun(word_in) and not self.is_verb(word_in):
        self.noun_stemmer.light_stem(word)
        self.noun_stemmer.segment(word)
        affixation_list = self.noun_stemmer.get_affix_list()
        # keep only valid affix combinations; materialize the filter so
        # the result is a reusable list on Python 3
        affixation_list = list(filter(self.verify_affix, affixation_list))
        # BUGFIX: the noun branch previously requested a "verb" stem and
        # the verb branch a "noun" stem — the labels were swapped.
        return self.choose_stem(affixation_list, "noun")
    elif self.is_verb(word_in) and not self.is_noun(word_in):
        self.verb_stemmer.light_stem(word)
        self.verb_stemmer.segment(word)
        affixation_list = self.verb_stemmer.get_affix_list()
        affixation_list = list(filter(self.verb_verify_affix,
                                      affixation_list))
        return self.choose_stem(affixation_list, "verb")
    else:
        # ambiguous or unknown part of speech: use the generic stemmer
        self.light_stem(word)
        self.segment(word)
        affixation_list = self.get_affix_list()
        # filter valid affixes
        affixation_list = list(filter(self.noun_verify_affix,
                                      affixation_list))
        chosen = self.choose_stem(affixation_list)
        #~ return self.normalize(chosen)
        return chosen
 def getroot(self, word):
     """Return the root of *word*; stop words go through ``stop_root``."""
     if is_stop(word):
         return stop_root(word)
     self.light_stem(word)
     return self.get_root()
    def getroot(self, word):
        """Return the root of *word*.

        Stop words are handled by ``stop_root``.  Otherwise the root is
        taken from the noun stemmer, the verb stemmer, or the generic
        stemmer, in that order of preference.
        """
        if is_stop(word):
            return stop_root(word)
        # normalize ALEF_MADDA into HAMZA + ALEF
        word = re.sub(u"[%s]" % (araby.ALEF_MADDA),
                      araby.HAMZA + araby.ALEF, word)
        # default: the generic light stemmer
        self.light_stem(word)
        self.segment(word)
        affixes = self.get_affix_list()
        chosen_root = self.get_root()

        if self.is_noun(word):
            self.noun_stemmer.light_stem(word)
            self.noun_stemmer.segment(word)
            affixes = self.noun_stemmer.get_affix_list()
            chosen_root = self.noun_stemmer.get_root()
        elif self.is_verb(word):
            self.verb_stemmer.light_stem(word)
            self.verb_stemmer.segment(word)
            affixes = self.verb_stemmer.get_affix_list()
            chosen_root = self.verb_stemmer.get_root()

        # NOTE(review): `affixes` is collected but the dictionary-based
        # root selection is currently disabled, so the default root wins.
        #~ affixes = filter(self.verify_affix, affixes)
        #~ root_result = self.rootdict.choose_root(affixes)
        #~ if root_result:
        #~ return root_result
        #~ else:
        #~ return chosen_root
        return chosen_root
 def getstem(self, word):
     """Light-stem *word*; stop words are handled by ``stop_stem``."""
     if is_stop(word):
         return stop_stem(word)
     # normalize ALEF_MADDA into HAMZA + ALEF before stemming
     normalized = re.sub(u"[%s]" % (araby.ALEF_MADDA),
                         araby.HAMZA + araby.ALEF, word)
     return self.light_stem(normalized)
    def getstem(self, word):
        """Return the light stem of *word* (stop words via ``stop_stem``)."""
        return stop_stem(word) if is_stop(word) else self.light_stem(word)
 def getstem(self, word):
     """Stem *word*, dispatching on its detected part of speech."""
     if is_stop(word):
         return stop_stem(word)
     if self.is_verb(word):
         return self.verb_stemmer.light_stem(word)
     if self.is_noun(word):
         return self.noun_stemmer.light_stem(word)
     # neither a recognized verb nor noun: fall back to the generic stemmer
     return self.light_stem(word)
Example #7
0
def test_rooter3(dataframe_result):
    """Evaluate root-extraction accuracy of the rhyzome rooter.

    Iterates over the (word, root) pairs of *dataframe_result*, extracts
    a root for each word, and prints the percentage of words whose
    extracted root appears in the expected ``;``-separated root list.
    """
    from pyarabic.arabrepr import arepr
    # test with tashaphyne
    asl = abstractstemmer.customStemmer_roots_rhyzome()
    # debug in rhyzome rooter
    asl.rootdict.rhyzome_rooter.debug = True
    df = dataframe_result
    # avoid null roots

    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        root_list = root.split(';')
        if not is_stop(word):
            word = re.sub(u"[%s]" % (araby.ALEF_MADDA),
                          araby.HAMZA + araby.ALEF, word)
            asl.light_stem(word)
            default_root = asl.get_root()
            starword = asl.get_starword()
            asl.segment(word)
            affixa_list = asl.get_affix_list()
            # filter valid affixes; BUGFIX: materialize as a list — a bare
            # `filter` object would be exhausted after the first pass on
            # Python 3, leaving nothing for arepr() and choose_root().
            affixa_list = list(filter(asl.verify_affix, affixa_list))
            #~ root_result = rootslib.choose_root(affixation_list)
            stems = [d['stem'] for d in affixa_list]
            roots = [d['root'] for d in affixa_list]
            print((u"**********%s*********" % word).encode('utf8'))
            print((u"Start Word : %s" % starword).encode('utf8'))
            # BUGFIX: encode the whole message at once; on Python 3 the
            # original `str + bytes` concatenation raises TypeError.
            print((u"Stems: " + u' '.join(stems)).encode('utf8'))
            print((u"Default roots: [%s] a %s" %
                   (default_root, u' '.join(roots))).encode('utf8'))
            print(arepr(affixa_list))

            root_result = asl.rootdict.choose_root(word,
                                                   affixa_list,
                                                   debug=True)
        else:
            root_result = stop_root(word)
            roots = []
            stems = []
            starword = ""  # BUGFIX: was misspelled `startword`
            default_root = ""
            affixa_list = []
        if root_result in root_list:
            cpt += 1
        print((u" ".join([
            u"Test root", root, u"found root", root_result,
            str(root_result in root_list)
        ])).encode('utf8'))

    print("***** Percent %.2f%% [%d/%d]" % (cpt * 100.0 / total, cpt, total))
    def getstem(self, word):
        """Return the stem chosen by the lemmatizer for *word*.

        Stop words are delegated to ``stop_stem``.
        """
        if is_stop(word):
            return stop_stem(word)
        # normalize ALEF_MADDA into HAMZA + ALEF before stemming
        word = re.sub(u"[%s]" % (araby.ALEF_MADDA),
                      araby.HAMZA + araby.ALEF, word)
        self.light_stem(word)
        self.segment(word)
        # keep only the valid affix combinations
        candidates = filter(self.verify_affix, self.get_affix_list())
        return self.lemmatizer.choose_stem(candidates)
    def getroot(self, word):
        """Return the root of *word* chosen from the affixation matrix,
        falling back to the light-stem root when the matrix yields none.
        Stop words are delegated to ``stop_root``.
        """
        if is_stop(word):
            return stop_root(word)
        # normalize ALEF_MADDA into HAMZA + ALEF
        word = re.sub(u"[%s]" % (araby.ALEF_MADDA),
                      araby.HAMZA + araby.ALEF, word)
        self.light_stem(word)
        self.segment(word)
        # keep only the valid affix combinations
        candidates = filter(self.verify_affix, self.get_affix_list())
        matrix_root = self.rootdict.choose_root_matrix(
            word, candidates, self.debug_root)
        if matrix_root:
            return matrix_root
        return self.get_root()
    def getroot(self, word):
        """Return a root for *word* derived from its stem.

        Stop words go through ``stop_root``; otherwise the stemmed form
        is offered to the root dictionary, and the stem itself is the
        fallback answer.
        """
        if is_stop(word):
            return stop_root(word)
        # normalize ALEF_MADDA into HAMZA + ALEF
        word = re.sub(u"[%s]" % (araby.ALEF_MADDA),
                      araby.HAMZA + araby.ALEF, word)
        stem = self.stemWord(word)
        # a single pseudo-segmentation: the stem doubles as the root guess
        candidates = [
            {
                'prefix': '',
                'suffix': '',
                'root': stem,
                'stem': stem,
            },
        ]
        return self.rootdict.choose_root(word, candidates) or stem