def __init__(self):
    """Build the verb and noun light stemmers used for tagging.

    Both stemmers share the same suffix letters, infix letters and
    maximum affix lengths; they differ only in the accepted prefix
    letters and the prefix list, so the shared setup is factored into
    a local helper instead of being written out twice.
    """
    self.word = u""

    def _configure(stemmer, prefix_letters, prefix_list):
        # Shared configuration applied to both Tashaphyne stemmers.
        stemmer.set_max_prefix_length(4)
        stemmer.set_max_suffix_length(6)
        stemmer.set_prefix_letters(prefix_letters)
        stemmer.set_suffix_letters(u"امتةكنهوي")
        stemmer.set_prefix_list(prefix_list)
        stemmer.infix_letters = u"اتويدط"

    # prepare the verb stemmer
    self.verbstemmer = tashaphyne.ArabicLightStemmer()
    _configure(self.verbstemmer, u"أسفلونيتا",
               affix_const.VERBAL_PREFIX_LIST)

    # prepare the noun stemmer; it accepts extra prefix letters
    # (م / ك / ب) compared with the verb stemmer
    self.nounstemmer = tashaphyne.ArabicLightStemmer()
    _configure(self.nounstemmer, u"مأسفلونيتاكب",
               affix_const.NOMINAL_PREFIXES_LIST)

    # cache of previously analyzed words to speed up tagging
    # (attribute name kept as `Cache` for backward compatibility)
    self.Cache = {}
def __init__(self):
    """Build verb/noun light stemmers relying on Tashaphyne defaults.

    Only the prefix lists are customized here; affix letters and the
    maximum affix lengths keep the library's default values.
    (Commented-out dead configuration code removed.)
    """
    self.word = u""
    # prepare the verb stemmer
    self.verbstemmer = tashaphyne.ArabicLightStemmer()
    self.verbstemmer.set_prefix_list(affix_const.VERBAL_PREFIX_LIST)
    # prepare the noun stemmer
    self.nounstemmer = tashaphyne.ArabicLightStemmer()
    self.nounstemmer.set_prefix_list(affix_const.NOMINAL_PREFIXES_LIST)
    # a cache to speed up the tagging process
    self.cache = {}
def __init__(self):
    """Build the verb and noun light stemmers used for tagging.

    The two stemmers share suffix/infix letters and maximum affix
    lengths and differ only in prefix letters and prefix list, so the
    shared setup is factored into a local helper (removes the
    copy-pasted configuration of the original).
    """
    self.word = u""

    def _configure(stemmer, prefix_letters, prefix_list):
        # Shared configuration applied to both Tashaphyne stemmers.
        stemmer.set_max_prefix_length(4)
        stemmer.set_max_suffix_length(6)
        stemmer.set_prefix_letters(prefix_letters)
        stemmer.set_suffix_letters(u"امتةكنهوي")
        stemmer.set_prefix_list(prefix_list)
        stemmer.infix_letters = u"اتويدط"

    # prepare the verb stemmer
    self.verbstemmer = tashaphyne.ArabicLightStemmer()
    _configure(self.verbstemmer, u"أسفلونيتا",
               affix_const.VERBAL_PREFIX_LIST)

    # prepare the noun stemmer; it accepts extra prefix letters
    # (م / ك / ب) compared with the verb stemmer
    self.nounstemmer = tashaphyne.ArabicLightStemmer()
    _configure(self.nounstemmer, u"مأسفلونيتاكب",
               affix_const.NOMINAL_PREFIXES_LIST)

    # a cache to speed up the tagging process
    self.Cache = {}
    # prepare verb pattern

def __del__(self):
    """Delete instance and clear cache."""
    self.Cache = {}
def light_stemmer(text):
    """Light-stem *text* with Tashaphyne and collect every segmentation.

    Returns a list of dicts, one per possible affix decomposition of
    each token, with the keys: word, prefix, stem, suffix, root and a
    '-' placeholder for type.
    """
    stemmer = tashaphyne.ArabicLightStemmer()
    decompositions = []
    for token in stemmer.tokenize(text):
        # segment() is called for its side effect of computing the
        # affix list; its return value is not needed here.
        stemmer.segment(token)
        for affix in stemmer.get_affix_list():
            decompositions.append({
                'word': token,
                'prefix': affix['prefix'],
                'stem': affix['stem'],
                'suffix': affix['suffix'],
                'root': affix['root'],
                'type': '-',
            })
    return decompositions
def lightStemmer(text):
    """Light-stem *text* using Tashaphyne.

    Returns one dict per affix decomposition of each token, holding the
    word and its prefix/stem/suffix/root ('type' is a '-' placeholder).
    Removed: an unused local that captured segment()'s return value and
    a commented-out debug print.
    """
    result = []
    als = tashaphyne.ArabicLightStemmer()
    word_list = als.tokenize(text)
    for word in word_list:
        # segment() is called only for its side effect of preparing
        # the affix list.
        als.segment(word)
        affix_list = als.get_affix_list()
        for affix in affix_list:
            result.append({
                'word': word,
                'prefix': affix['prefix'],
                'stem': affix['stem'],
                'suffix': affix['suffix'],
                'root': affix['root'],
                'type': '-',
            })
    return result
def normalize(text):
    """Return *text* normalized by a Tashaphyne light stemmer."""
    return tashaphyne.ArabicLightStemmer().normalize(text)
def token_text(text):
    """Split *text* into word tokens using Tashaphyne."""
    return tashaphyne.ArabicLightStemmer().tokenize(text)
def token_text(text):
    """Tokenize *text* into a list of words via Tashaphyne."""
    stemmer = tashaphyne.ArabicLightStemmer()
    return stemmer.tokenize(text)