Пример #1
0
    def __init__(self):
        """
        Create the verb and noun light stemmers and an empty cache.

        Both stemmers are ``tashaphyne.ArabicLightStemmer`` instances.  They
        share the same suffix letters, infix letters and maximum affix
        lengths; they differ in their prefix letters and prefix list.
        """
        self.word = u""

        # Verb stemmer: max prefix length 4, max suffix length 6.
        verb_stemmer = tashaphyne.ArabicLightStemmer()
        self.verbstemmer = verb_stemmer
        verb_stemmer.set_max_prefix_length(4)
        verb_stemmer.set_max_suffix_length(6)
        verb_stemmer.set_prefix_letters(u"أسفلونيتا")
        verb_stemmer.set_suffix_letters(u"امتةكنهوي")
        verb_stemmer.set_prefix_list(affix_const.VERBAL_PREFIX_LIST)
        verb_stemmer.infix_letters = u"اتويدط"

        # Noun stemmer: same limits, wider set of prefix letters.
        noun_stemmer = tashaphyne.ArabicLightStemmer()
        self.nounstemmer = noun_stemmer
        noun_stemmer.set_max_prefix_length(4)
        noun_stemmer.set_max_suffix_length(6)
        noun_stemmer.set_prefix_letters(u"مأسفلونيتاكب")
        noun_stemmer.set_suffix_letters(u"امتةكنهوي")
        noun_stemmer.set_prefix_list(affix_const.NOMINAL_PREFIXES_LIST)
        noun_stemmer.infix_letters = u"اتويدط"

        # Starts empty; presumably filled by other methods of the class.
        self.Cache = {}
Пример #2
0
 def __init__(self):
     """
     Create the verb and noun light stemmers and an empty cache.

     Only the prefix list of each ``tashaphyne.ArabicLightStemmer`` is
     overridden here; no affix letters or length limits are set, so the
     stemmers keep whatever the library configures by default.
     """
     self.word = u""

     # Verb stemmer, restricted to the verbal prefix list.
     verb_stemmer = tashaphyne.ArabicLightStemmer()
     self.verbstemmer = verb_stemmer
     verb_stemmer.set_prefix_list(affix_const.VERBAL_PREFIX_LIST)

     # Noun stemmer, restricted to the nominal prefix list.
     noun_stemmer = tashaphyne.ArabicLightStemmer()
     self.nounstemmer = noun_stemmer
     noun_stemmer.set_prefix_list(affix_const.NOMINAL_PREFIXES_LIST)

     # A cache to speed up the tagging process.
     self.cache = {}
Пример #3
0
    def __init__(self, ):
        """
        Create the verb and noun light stemmers and an empty cache.

        Both stemmers are ``tashaphyne.ArabicLightStemmer`` instances.  They
        share the same suffix letters, infix letters and maximum affix
        lengths; they differ in their prefix letters and prefix list.
        """
        self.word = u""

        # prepare the verb stemmer
        self.verbstemmer = tashaphyne.ArabicLightStemmer()
        verb_prefix = u"أسفلونيتا"
        verb_infix = u"اتويدط"
        verb_suffix = u"امتةكنهوي"
        verb_max_prefix = 4
        verb_max_suffix = 6
        self.verbstemmer.set_max_prefix_length(verb_max_prefix)
        self.verbstemmer.set_max_suffix_length(verb_max_suffix)
        self.verbstemmer.set_prefix_letters(verb_prefix)
        self.verbstemmer.set_suffix_letters(verb_suffix)
        self.verbstemmer.set_prefix_list(affix_const.VERBAL_PREFIX_LIST)
        self.verbstemmer.infix_letters = verb_infix

        # prepare the noun stemmer
        self.nounstemmer = tashaphyne.ArabicLightStemmer()
        noun_prefix = u"مأسفلونيتاكب"
        noun_infix = u"اتويدط"
        noun_suffix = u"امتةكنهوي"
        noun_max_prefix = 4
        noun_max_suffix = 6
        self.nounstemmer.set_max_prefix_length(noun_max_prefix)
        self.nounstemmer.set_max_suffix_length(noun_max_suffix)
        self.nounstemmer.set_prefix_letters(noun_prefix)
        self.nounstemmer.set_suffix_letters(noun_suffix)
        self.nounstemmer.set_prefix_list(affix_const.NOMINAL_PREFIXES_LIST)
        self.nounstemmer.infix_letters = noun_infix

        # a cache to speed up the tagging process
        self.Cache = {}

    def __del__(self):
        """
        Delete instance and clear cache.

        This was previously defined as a local function inside ``__init__``
        and therefore never ran; it is now a real method of the class.
        """
        self.Cache = {}
Пример #4
0
def light_stemmer(text):
    """
    Light stemming using Tashaphyne.

    The text is tokenized, each token is segmented, and one dict is
    produced per entry of the stemmer's affix list for that token.

    Returns a list of dicts with the keys 'word', 'prefix', 'stem',
    'suffix', 'root' and 'type'; 'type' is always '-'.
    """
    stemmer = tashaphyne.ArabicLightStemmer()
    segments = []
    for token in stemmer.tokenize(text):
        # segment() must run before get_affix_list() is read for this token.
        stemmer.segment(token)
        segments.extend(
            {
                'word': token,
                'prefix': affix['prefix'],
                'stem': affix['stem'],
                'suffix': affix['suffix'],
                'root': affix['root'],
                'type': '-',
            }
            for affix in stemmer.get_affix_list()
        )
    return segments
Пример #5
0
def lightStemmer(text):
    """
    Segment every word of *text* with Tashaphyne's light stemmer.

    Returns a list with one dict per affix-list entry of each word, holding
    the keys 'word', 'prefix', 'stem', 'suffix', 'root' and 'type'
    ('type' is always '-').
    """
    stemmer = tashaphyne.ArabicLightStemmer()
    entries = []
    for token in stemmer.tokenize(text):
        # The return value is not needed; the affix list is read right after.
        stemmer.segment(token)
        entries += [
            {
                'word': token,
                'prefix': affix['prefix'],
                'stem': affix['stem'],
                'suffix': affix['suffix'],
                'root': affix['root'],
                'type': '-',
            }
            for affix in stemmer.get_affix_list()
        ]
    return entries
Пример #6
0
def normalize(text):
    """
    Return *text* normalized by a fresh Tashaphyne ``ArabicLightStemmer``.
    """
    return tashaphyne.ArabicLightStemmer().normalize(text)
Пример #7
0
def token_text(text):
    """
    Split *text* into words with a fresh Tashaphyne ``ArabicLightStemmer``.
    """
    return tashaphyne.ArabicLightStemmer().tokenize(text)
Пример #8
0
def token_text(text):
    """
    Tokenize *text* with Tashaphyne and return the stemmer's result.
    """
    tokenizer = tashaphyne.ArabicLightStemmer()
    tokens = tokenizer.tokenize(text)
    return tokens