class BasicStemmer(Stemmer):
    """Arabic stemmer built on NLTK's ISRIStemmer, backed by a stop-word index.

    NOTE(review): `Stemmer`, `ISRIStemmer`, `ArabicStopWordsIndex`, `nltk`
    and `re` are assumed to be provided by the surrounding module.
    """

    def __init__(self):
        self.stemmer = ISRIStemmer()
        # Index is built against this stemmer instance.
        self.stopWordsIndex = ArabicStopWordsIndex(self)
        self.stopWordsIndex.buildIndex()

    def getStems(self, tokens, flag=False):
        """Return the prefix-stripped (pre32) stem of each token.

        `flag` is accepted for interface compatibility; it is unused here.
        """
        rootList = []
        for token in tokens:
            root = self.stemmer.pre32(token)
            rootList.append(root)
            print(token, " : ", root)  # debug trace kept from original
        return rootList

    def stem(self, word):
        """Strip 3/2-letter prefixes, then apply level-3 normalization."""
        root = self.stemmer.pre32(word)
        root = self.stemmer.norm(root, 3)
        return root

    def loadStemsDictionnary(self, filePath="dictStems.txt"):
        """Parse a tab-separated stem dictionary into {root: [[stem, tag, gloss], ...]}.

        Lines starting with ';' are comments.  Lines that do not have exactly
        four tab-separated fields are skipped.

        The file is expected to be windows-1256 (Arabic) encoded —
        TODO confirm against the actual dictionary file.
        """
        dictionary = nltk.defaultdict(list)
        # 'with' guarantees the handle is closed (the original leaked it),
        # and iterating the file avoids loading it wholesale via readlines().
        with open(filePath, "r", encoding="windows-1256") as f:
            for line in f:
                if re.match("^;.*", line):
                    continue  # comment line
                parts = line.split('\t')
                # BUG FIX: the original `break` aborted the whole load at the
                # first malformed line, silently truncating the dictionary.
                # Skip only the malformed line instead.
                if len(parts) != 4:
                    continue
                rootStem, stem, tag, enGloss = parts
                # Glosses are ';'-separated in the file; store them
                # space-joined as a single string.
                dictionary[rootStem].append(
                    [stem, tag, ' '.join(enGloss.split(';'))])
        return dictionary

    def verify(self, word):
        """Return True iff *word* is present in the stop-word index.

        The original returned True or (implicitly) None; callers relying on
        truthiness are unaffected by the explicit False.
        """
        return bool(self.stopWordsIndex.access(word))

    def setStopWordsIndex(self, index: ArabicStopWordsIndex):
        """Replace the stop-word index and (re)build it."""
        self.stopWordsIndex = index
        self.stopWordsIndex.buildIndex()
def lightStemAr(word_list):
    """Light-stem a list of Arabic words and return them space-joined.

    Per word: strip diacritics; if the word is not a stop word, remove
    3/2-letter prefixes and suffixes, drop a connective waw, and normalize
    an initial hamza to bare alif.  Stop words pass through unstemmed.
    """
    arstemmer = ISRIStemmer()
    stemmed = []
    for w in word_list:
        # Remove diacritics (Arabic short vowels) first so stop-word
        # lookup sees the bare form.
        w = arstemmer.norm(w, num=1)
        if w not in arstemmer.stop_words:
            w = arstemmer.pre32(w)        # length-3 then length-2 prefixes
            w = arstemmer.suf32(w)        # length-3 then length-2 suffixes
            w = arstemmer.waw(w)          # connective waw
            w = arstemmer.norm(w, num=2)  # initial hamza -> bare alif
        stemmed.append(w)
    return ' '.join(stemmed)
def light_stem(text):
    """Light-stem Arabic *text* and return the stemmed words space-joined.

    Accepts either a raw string (tokenized on whitespace) or an already
    tokenized iterable of words.

    BUG FIX: the original did `words = text`, so a string argument was
    iterated character-by-character, stemming single letters.  The sibling
    implementation of this function splits on whitespace; this version does
    the same while remaining backward-compatible with callers that already
    pass a word list.
    """
    words = text.split() if isinstance(text, str) else text
    result = list()
    stemmer = ISRIStemmer()
    for word in words:
        word = stemmer.norm(word, num=1)      # strip diacritics
        if word not in stemmer.stop_words:    # stop words pass through
            word = stemmer.pre32(word)        # 3/2-letter prefixes
            word = stemmer.suf32(word)        # 3/2-letter suffixes
            word = stemmer.waw(word)          # connective waw
            word = stemmer.norm(word, num=2)  # initial hamza -> bare alif
        result.append(word)
    return ' '.join(result)
def light_stem(text):
    """Whitespace-tokenize *text*, light-stem each non-stop word, and
    return the words re-joined with single spaces.

    Pipeline per word: diacritic removal, then (for non-stop words)
    3/2-letter prefix and suffix stripping, connective-waw removal, and
    normalization of an initial hamza to bare alif.
    """
    stemmer = ISRIStemmer()
    output = []
    for token in text.split():
        # Normalize away short-vowel diacritics before anything else.
        token = stemmer.norm(token, num=1)
        if token not in stemmer.stop_words:
            # Affix stripping: longest (3-letter) before 2-letter forms.
            token = stemmer.pre32(token)
            token = stemmer.suf32(token)
            token = stemmer.waw(token)
            token = stemmer.norm(token, num=2)
        output.append(token)
    return ' '.join(output)
break elif len(current_word) <= 3 and ed == 1: suggestions.append((ed, current_word, output_word)) elif len(current_word) > 3 and ed <= 2: suggestions.append((ed, current_word, output_word)) else: continue if len(suggestions) > 0: for suggest in suggestions: lemmas_cw = [] lemmas_cw.append(suggest[1]) lemmas_cw.append(st.suf1(suggest[1])) lemmas_cw.append(st.suf32(suggest[1])) lemmas_cw.append(st.pre1(suggest[1])) lemmas_cw.append(st.pre32(suggest[1])) lemmas_ow = [] lemmas_ow.append(suggest[2]) lemmas_ow.append(st.suf1(suggest[2])) lemmas_ow.append(st.suf32(suggest[2])) lemmas_ow.append(st.pre1(suggest[2])) lemmas_ow.append(st.pre32(suggest[2])) if correct != 1 and len(suggest[1]) > 7: for l in lemmas_cw: if l in lemmas_ow: correct = 2 print("I got the lemma; it seems correct -->", current_word, "~", suggest[2])
# --- Bigram phrasing --------------------------------------------------------
# Freeze the trained Phrases model (`bigrams`, built earlier — not visible in
# this chunk) into a lighter Phraser for fast transformation.
bigrams_model = gensim.models.phrases.Phraser(bigrams)
print(bigrams_model)
# Apply the phraser to every stop-word-filtered document, merging frequent
# word pairs into single bigram tokens.
bigram_data = [bigrams_model[doc] for doc in no_stopwords_data]
print(bigram_data[0:2])  # quick sanity check of the first two documents
############# (5)lemmatizing the data #############
# produces a list of lists of the data lemmatized ... the lemmatizer does not work well when lemmatizing suffixes
# NOTE(review): this is light stemming with ISRI, not true lemmatization.
stemmer = ISRIStemmer()
lemmatized_data = []
for items in bigram_data:
    lemmas = []
    for token in items:
        token = stemmer.pre32(
            token)  # removes the three-letter and two-letter prefixes
        token = stemmer.suf32(
            token)  # removes the three-letter and two-letter suffixes
        token = stemmer.norm(token, num=1)  # removes diacritics
        lemmas.append(token)
    lemmatized_data.append(lemmas)
print(lemmatized_data[0:2])  # sanity check of the stemmed output
############# (5) Preparing the data using gensim for the Model #############
# the preprocess using gensim involves buidling the dictionary, the corpus and the bigrams
# the data is (data_after_lemmatization) and it is a list of lists
# The Dictionary
# Map each unique token to an integer id for downstream BoW corpus building.
dictionary = corpora.Dictionary(lemmatized_data)
# the corpus