def getstem(self, word):
    """Return the stem of ``word`` using a stemmer picked by word class.

    Stop words are resolved through ``stop_stem``. For any other word the
    ALEF MADDA letter is rewritten as HAMZA + ALEF, then the word is
    light-stemmed and segmented, its affix list is filtered, and
    ``self.choose_stem`` picks the result.
    """
    if is_stop(word):
        return stop_stem(word)

    # Word-class tests are made on the raw input, stemming on the
    # normalized form.
    raw_word = word
    word = re.sub(u"[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF, word)

    # Each case selects: the stemmer to run, the affix validator, and the
    # extra positional arguments handed to choose_stem.
    # NOTE(review): the noun-only case passes "verb" and the verb-only case
    # passes "noun"; kept as found -- confirm against choose_stem.
    if self.is_noun(raw_word) and not self.is_verb(raw_word):
        engine = self.noun_stemmer
        is_valid = self.verify_affix
        extra_args = ("verb",)
    elif self.is_verb(raw_word) and not self.is_noun(raw_word):
        engine = self.verb_stemmer
        is_valid = self.verb_verify_affix
        extra_args = ("noun",)
    else:
        # Ambiguous or unknown word class: use this object's own stemmer.
        engine = self
        is_valid = self.noun_verify_affix
        extra_args = ()

    engine.light_stem(word)
    engine.segment(word)
    # Keep only the affix combinations accepted by the validator.
    candidates = filter(is_valid, engine.get_affix_list())
    return self.choose_stem(candidates, *extra_args)
def getroot(self, word):
    """Return the root of ``word``.

    Stop words are resolved through ``stop_root``; any other word is
    light-stemmed first and the root is then read back with ``get_root``.
    """
    if is_stop(word):
        return stop_root(word)

    # light_stem is called for its effect on this object; the root is
    # fetched afterwards.
    self.light_stem(word)
    return self.get_root()
def getroot(self, word):
    """Return the root of ``word`` from the stemmer matching its word class.

    Stop words are resolved through ``stop_root``. Otherwise ALEF MADDA is
    rewritten as HAMZA + ALEF and the word is always run through this
    object's own stemmer first; when the word is recognised as a noun
    (checked first) or as a verb, the matching specialised stemmer is run
    as well and its root is returned instead.
    """
    if is_stop(word):
        return stop_root(word)

    word = re.sub(u"[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF, word)

    # Default pass, always executed.
    self.light_stem(word)
    self.segment(word)
    # Affix list is not used here; the call is kept in case it updates
    # state -- NOTE(review): confirm whether it can be dropped.
    self.get_affix_list()
    fallback_root = self.get_root()

    if self.is_noun(word):
        engine = self.noun_stemmer
    elif self.is_verb(word):
        engine = self.verb_stemmer
    else:
        return fallback_root

    # Specialised pass, same sequence of calls as the default one.
    engine.light_stem(word)
    engine.segment(word)
    engine.get_affix_list()
    return engine.get_root()
def getstem(self, word):
    """Return the light stem of ``word``.

    Stop words are resolved through ``stop_stem``. Any other word has
    ALEF MADDA rewritten as HAMZA + ALEF and is then passed to
    ``light_stem``, whose return value is the result.
    """
    if is_stop(word):
        return stop_stem(word)

    normalized = re.sub(
        u"[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF, word)
    return self.light_stem(normalized)
def getstem(self, word):
    """Return the light stem of ``word``.

    Stop words go through ``stop_stem``; every other word is handed
    unchanged to ``light_stem``.
    """
    return stop_stem(word) if is_stop(word) else self.light_stem(word)
def getstem(self, word):
    """Return the light stem of ``word`` from the stemmer for its word class.

    Stop words go through ``stop_stem``. Otherwise the verb stemmer is used
    when ``is_verb`` accepts the word, else the noun stemmer when
    ``is_noun`` accepts it, else this object's own ``light_stem``.
    """
    if is_stop(word):
        return stop_stem(word)

    # The verb test comes first, so a word accepted by both tests is
    # stemmed as a verb.
    if self.is_verb(word):
        stemmer = self.verb_stemmer
    elif self.is_noun(word):
        stemmer = self.noun_stemmer
    else:
        # Neither a known verb nor a known noun.
        stemmer = self
    return stemmer.light_stem(word)
def test_rooter3(dataframe_result):
    """Run the rhyzome root chooser over a word/root table and print accuracy.

    For each row, the word is stemmed with ``customStemmer_roots_rhyzome``,
    its valid affix segmentations are printed, and the root chosen by
    ``rootdict.choose_root`` is compared with the expected roots. Stop words
    are resolved through ``stop_root`` instead. A final line reports the
    share of rows whose chosen root is among the expected ones.

    Args:
        dataframe_result: table exposing ``index`` plus ``"word"`` and
            ``"root"`` columns; each ``root`` value is a ``;``-separated
            string of accepted roots.
    """
    from pyarabic.arabrepr import arepr

    # test with tashaphyne
    asl = abstractstemmer.customStemmer_roots_rhyzome()
    # debug in rhyzome rooter
    asl.rootdict.rhyzome_rooter.debug = True

    df = dataframe_result
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        root_list = root.split(';')
        if not is_stop(word):
            word = re.sub(u"[%s]" % (araby.ALEF_MADDA),
                          araby.HAMZA + araby.ALEF, word)
            asl.light_stem(word)
            default_root = asl.get_root()
            starword = asl.get_starword()
            asl.segment(word)
            # Keep only valid affixes. The result is materialized as a list
            # because it is traversed several times below; on Python 3 a
            # bare filter() is a one-shot iterator and every pass after the
            # first would see nothing.
            affixa_list = list(filter(asl.verify_affix, asl.get_affix_list()))
            stems = [d['stem'] for d in affixa_list]
            roots = [d['root'] for d in affixa_list]
            print((u"**********%s*********" % word).encode('utf8'))
            print((u"Start Word : %s" % starword).encode('utf8'))
            # Join as text first and encode once: adding a str prefix to
            # already-encoded bytes raises TypeError on Python 3.
            print((u"Stems: " + u' '.join(stems)).encode('utf8'))
            print((u"Dafault roots: [%s] a %s"
                   % (default_root, u' '.join(roots))).encode('utf8'))
            print(arepr(affixa_list))
            root_result = asl.rootdict.choose_root(
                word, affixa_list, debug=True)
        else:
            root_result = stop_root(word)

        if root_result in root_list:
            cpt += 1
        print((u" ".join([
            u"Test root", root,
            u"found root", root_result,
            str(root_result in root_list),
        ])).encode('utf8'))

    # An empty table would otherwise divide by zero.
    percent = cpt * 100.0 / total if total else 0.0
    print("***** Percent %.2f%% [%d/%d]" % (percent, cpt, total))
def getstem(self, word):
    """Return the stem of ``word`` as chosen by the lemmatizer.

    Stop words go through ``stop_stem``. Otherwise ALEF MADDA is rewritten
    as HAMZA + ALEF, the word is light-stemmed and segmented, and the affix
    list filtered by ``verify_affix`` is handed to
    ``self.lemmatizer.choose_stem``.
    """
    if is_stop(word):
        return stop_stem(word)

    word = re.sub(u"[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF, word)
    self.light_stem(word)
    self.segment(word)
    # Keep only the affix combinations accepted by verify_affix.
    candidates = filter(self.verify_affix, self.get_affix_list())
    return self.lemmatizer.choose_stem(candidates)
def getroot(self, word):
    """Return the root of ``word`` as chosen by the root dictionary.

    Stop words go through ``stop_root``. Otherwise ALEF MADDA is rewritten
    as HAMZA + ALEF, the word is light-stemmed and segmented, and the affix
    list filtered by ``verify_affix`` is handed to
    ``self.rootdict.choose_root_matrix``. When that yields a falsy result
    the root from ``get_root`` is returned instead.
    """
    if is_stop(word):
        return stop_root(word)

    word = re.sub(u"[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF, word)
    self.light_stem(word)
    self.segment(word)
    # Keep only the affix combinations accepted by verify_affix.
    candidates = filter(self.verify_affix, self.get_affix_list())
    picked = self.rootdict.choose_root_matrix(
        word, candidates, self.debug_root)
    # get_root is consulted only when the dictionary gave nothing usable.
    return picked or self.get_root()
def getroot(self, word):
    """Return the root of ``word`` derived from ``stemWord``.

    Stop words go through ``stop_root``. Otherwise ALEF MADDA is rewritten
    as HAMZA + ALEF and the word is stemmed with ``stemWord``; the stem is
    wrapped as a single affix-free candidate and offered to
    ``self.rootdict.choose_root``. When that yields a falsy result the stem
    itself is returned.
    """
    if is_stop(word):
        return stop_root(word)

    word = re.sub(u"[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF, word)
    stem = self.stemWord(word)
    # One candidate only: no prefix or suffix, with the stem standing in
    # for both the stem and the root.
    sole_candidate = {'prefix': '', 'suffix': '', 'root': stem, 'stem': stem}
    picked = self.rootdict.choose_root(word, [sole_candidate])
    return picked or stem