Exemplo n.º 1
0
def lightStemAr(word_list):
    """Light-stem every Arabic word in *word_list* and join them with spaces.

    Each word first has its diacritics removed. A word that is then found in
    the stemmer's stop-word collection is kept in that form; any other word
    additionally has its prefixes, suffixes and connective waw stripped and
    its initial hamza normalised. No root extraction is performed.

    Args:
        word_list: iterable of word strings to process.

    Returns:
        One string holding all processed words, in input order, separated by
        a single space.
    """
    stemmer = ISRIStemmer()

    def _light(token):
        # Strip the diacritics that represent Arabic short vowels.
        token = stemmer.norm(token, num=1)
        if token in stemmer.stop_words:
            # Stop words are passed through without affix stripping.
            return token
        token = stemmer.pre32(token)  # length-three, then length-two prefixes
        token = stemmer.suf32(token)  # length-three, then length-two suffixes
        token = stemmer.waw(token)    # connective waw before a word starting with waw
        return stemmer.norm(token, num=2)  # initial hamza -> bare alif

    return ' '.join(_light(token) for token in word_list)
def lightStemAr(word_list):
    """Return the light stems of the words in *word_list* as one string.

    Processing per word:
      1. remove diacritics (``norm`` with ``num=1``);
      2. if the result is not one of the stemmer's stop words, strip
         prefixes, suffixes and a connective waw, then normalise the
         initial hamza (``norm`` with ``num=2``).

    Stop words are kept in the output (after step 1 only).

    Args:
        word_list: iterable of word strings.

    Returns:
        The processed words joined by single spaces, in input order.
    """
    stemmer = ISRIStemmer()
    stemmed = []
    for raw in word_list:
        token = stemmer.norm(raw, num=1)
        is_stop_word = token in stemmer.stop_words
        if not is_stop_word:
            # Affix-stripping steps, applied strictly in this order:
            # prefixes (3 then 2 letters), suffixes (3 then 2 letters),
            # then the connective waw.
            for strip_step in (stemmer.pre32, stemmer.suf32, stemmer.waw):
                token = strip_step(token)
            # Normalise an initial hamza to a bare alif.
            token = stemmer.norm(token, num=2)
        stemmed.append(token)
    return ' '.join(stemmed)
Exemplo n.º 3
0
def light_stem(text):
    """Light-stem the words in *text*, dropping stop words, and join the rest.

    ``text`` is iterated directly, so it is expected to already be an
    iterable of word tokens (a plain string would be walked character by
    character). Words that match the stemmer's stop-word collection after
    diacritic removal are omitted from the result entirely.

    Args:
        text: iterable of word strings.

    Returns:
        The light stems of the non-stop-word tokens, in input order,
        separated by single spaces.
    """
    stemmer = ISRIStemmer()
    kept = []
    for raw in text:
        token = stemmer.norm(raw, num=1)
        if token in stemmer.stop_words:
            # Stop words are excluded from the output, not just left unstemmed.
            continue
        without_prefix = stemmer.pre32(token)
        without_suffix = stemmer.suf32(without_prefix)
        without_waw = stemmer.waw(without_suffix)
        kept.append(stemmer.norm(without_waw, num=2))
    return ' '.join(kept)
Exemplo n.º 4
0
def light_stem(text):
    """Light-stem a whitespace-separated Arabic string.

    The input is split on whitespace. Every token has its diacritics removed;
    tokens that are not stop words additionally have prefixes, suffixes and a
    connective waw stripped and their initial hamza normalised. Stop words
    are kept in the output (diacritics removed only).

    Only the affix-stripping part of the stemmer is used here. Its
    root-extraction steps (``pro_w4``, ``pro_w53``, ``pro_w54``, ``end_w5``,
    ``pro_w6``, ``pro_w64``, ``end_w6``) and the short-affix steps (``suf1``,
    ``pre1``) are deliberately not applied.

    Args:
        text: string of words separated by whitespace.

    Returns:
        The processed tokens joined by single spaces, in input order.
    """
    tokens = text.split()
    stemmer = ISRIStemmer()

    def _light(token):
        # Remove the diacritics that represent Arabic short vowels.
        token = stemmer.norm(token, num=1)
        if token in stemmer.stop_words:
            # Stop words skip affix stripping but stay in the output.
            return token
        token = stemmer.pre32(token)  # length-three, then length-two prefixes
        token = stemmer.suf32(token)  # length-three, then length-two suffixes
        token = stemmer.waw(token)    # connective waw before a word starting with waw
        return stemmer.norm(token, num=2)  # initial hamza -> bare alif

    return ' '.join([_light(token) for token in tokens])