def get(self, text):
    # Tokenize the request text with NLTK, light-stem every token with
    # tashaphyne, and return the collected stems.
    # requires: import nltk; from tashaphyne.stemming import ArabicLightStemmer
    ArListem = ArabicLightStemmer()
    list_Stemming = []
    tokens = nltk.word_tokenize(text)
    for word in tokens:
        ArListem.light_stem(word)
        list_Stemming.append(ArListem.get_stem())
    return {"Stemming": list_Stemming}
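# --- usage sketch (illustrative, not from the source) -----------------------
# A standalone version of the same tokenize-then-stem loop, so it can be tried
# outside the class; assumes NLTK's punkt data is installed
# (nltk.download('punkt')) and the sample sentence is made up.
import nltk
from tashaphyne.stemming import ArabicLightStemmer

def stem_text(text):
    stemmer = ArabicLightStemmer()
    stems = []
    for token in nltk.word_tokenize(text):
        stemmer.light_stem(token)
        stems.append(stemmer.get_stem())
    return {"Stemming": stems}

print(stem_text(u"يكتب الطلاب الدروس في المدرسة"))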
def Lemmatisation(self):
    # For every pre-processed word, record its "prefix + stem + suffix"
    # segmentation and keep the bare stem as the lemma candidate.
    # requires: import json, naftawayh.wordtag;
    #           from tashaphyne.stemming import ArabicLightStemmer
    tagger = naftawayh.wordtag.WordTagger()
    ws = self.Pretraitement()
    ArListem = ArabicLightStemmer()
    words_root = []
    words_all = {}
    words_all['words'] = []
    for w in ws:
        # if not tagger.is_noun(w):
        ArListem.light_stem(w)
        ww = (ArListem.get_prefix() + " + " + ArListem.get_stem()
              + " + " + ArListem.get_suffix())
        words_all['words'].append(ww)
        words_root.append(ArListem.get_stem())
    self.aff(words_all)
    # serialized copy of the segmentation (computed but not returned)
    result = json.dumps(words_all, ensure_ascii=False, indent=4).encode('utf-8')
    return words_root
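# --- segmentation sketch (illustrative, not from the source) ----------------
# Shows how the "prefix + stem + suffix" string above is assembled for a
# single word; the sample word is an assumption chosen for illustration.
from tashaphyne.stemming import ArabicLightStemmer

stemmer = ArabicLightStemmer()
stemmer.light_stem(u'فسيكتبون')
print(stemmer.get_prefix() + " + " + stemmer.get_stem()
      + " + " + stemmer.get_suffix())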
def Light_Stem_word(self, body):
    # Light-stem each whitespace-separated word and rebuild the text from
    # the stems.
    ArListem = ArabicLightStemmer()
    words = body.split(u" ")
    word_stem = list()
    for w in words:
        ArListem.light_stem(w)
        word_stem.append(ArListem.get_stem())
        # print(ArListem.get_stem())  # stem
        # print(ArListem.get_root())  # extract root
    body = " ".join(word_stem)
    return body
def segmenteur_phrases(self):
    # Sentence segmenter: split self.text on punctuation, then split further
    # on Arabic discourse markers (stop_words1, two-word pairs in
    # stop_words2) and on the conjunctions و / ثم / ف when they introduce
    # a verb.
    # Ported to Python 3: self.text is assumed to already be a str (the
    # original decoded it with unicode()).
    # requires: import io, re, naftawayh.wordtag;
    #           from tashaphyne.stemming import ArabicLightStemmer
    tagger = naftawayh.wordtag.WordTagger()
    ArListem = ArabicLightStemmer()
    stop_words1 = [
        u"كما", u"أيضا", u"كذالك", u"مثلا", u"وكما", u"شبيه", u"نضير",
        u"ماعدا", u"باستثناء", u"إلا", u"بسبب", u"لأن", u"لكي",
        u"والنتيجة", u"والخلاصة", u"أولا", u"ثانيا", u"يليه", u"لذالك",
        u"إذا", u"نستنتج", u"أم", u"أي", u"فقد", u"لكن", u"بينما",
        u"فإذا", u"إذا", u"حيث", u"بسبب", u"لذالك", u"لما", u"حينما",
        u"وذلك", u"حيث"
    ]
    stop_words2 = [[u"بالإضافة", u"إلى"], [u"ومن", u"ذالك"],
                   [u"من", u"هنا"], [u"ونخلص", u"إلى"],
                   [u"وفي", u"البداية"], [u"إلى", u"جانب"],
                   [u"علاوة", u"على"], [u"غير", u"أنه"]]

    # return the x-th element of every pair in stop_words2
    def prem_ele(u, x):
        h = []
        for d in u:
            h.append(d[x])
        return h

    # strip the Arabic comma and return the first non-empty chunk
    def ele_sign(s):
        lt = re.split(u'،', s)
        for u in lt:
            if u != '':
                return u

    # '[' and ']' must be escaped inside the character class
    liste1 = [ch for ch in re.split(r"[.!؟:()\[\]\n]+", self.text)
              if ch != '']
    liste3 = []
    i = 0
    while i < len(liste1):
        liste2 = [ch for ch in re.split(r"[ ]+", liste1[i]) if ch != '']
        k = 0
        s = ''
        while k < len(liste2):
            # "و" followed by a verb starts a new sentence
            # (k + 1 guards keep the lookahead from running past the end)
            if ele_sign(liste2[k]) == u'و' and k + 1 < len(liste2):
                ArListem.light_stem(ele_sign(liste2[k + 1]))
                stem = ArListem.get_stem()
                if tagger.is_verb(stem) and not tagger.is_noun(stem):
                    if s != '':
                        liste3.append(s)
                        s = ''
                else:
                    s += liste2[k]
                    s += ' '
            elif ele_sign(liste2[k]) in stop_words1:
                liste3.append(s)
                s = ''
            # "ثم" followed by a verb starts a new sentence
            elif ele_sign(liste2[k]) == u'ثم' and k + 1 < len(liste2):
                ArListem.light_stem(ele_sign(liste2[k + 1]))
                stem = ArListem.get_stem()
                if tagger.is_verb(stem) and not tagger.is_noun(stem):
                    if s != '':
                        liste3.append(s)
                        s = ''
                else:
                    s += liste2[k]
                    s += ' '
            # a word prefixed with "ف" whose remainder stems to a verb
            elif len(liste2[k]) > 1 and ele_sign(liste2[k][0]) == u'ف':
                ArListem.light_stem(ele_sign(liste2[k][1:]))
                stem = ArListem.get_stem()
                if tagger.is_verb(stem) and not tagger.is_noun(stem):
                    liste3.append(s)
                    s = ''
                else:
                    s += liste2[k]
                    s += ' '
            # two-word discourse markers
            elif (ele_sign(liste2[k]) in prem_ele(stop_words2, 0)
                  and k + 1 < len(liste2)):
                if ele_sign(liste2[k + 1]) in prem_ele(stop_words2, 1):
                    liste3.append(s)
                    s = ''
                    k += 1
                else:
                    s += liste2[k]
                    s += ' '
            else:
                s += liste2[k]
                s += ' '
            k += 1
        if len(s) != 0:
            liste3.append(s)
            s = ''
        i += 1
    liste3 = [ch for ch in liste3 if ch != '']
    with io.open('output.txt', 'a', encoding="utf-8") as file:
        file.write(u"\n\nthere are " + str(len(liste3)) + u" sentences\n")
        file.write(u"the list of sentences: \n\n ")
        file.write(u" [ ")
        for ch in liste3:
            file.write(u" ' " + ch + u" ' \n\n")
        file.write(u" ] ")
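# --- regex sanity check (illustrative, not from the source) -----------------
# Shows what the punctuation-splitting step in segmenteur_phrases produces on
# a short made-up text, before the marker-based rules split it further.
import re

sample = u"ذهب الولد إلى المدرسة. وفي البداية كان متعبا! ثم عاد إلى البيت؟"
print([ch for ch in re.split(r"[.!؟:()\[\]\n]+", sample) if ch != ''])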
# coding=utf8
'''
Created on 15 June 2019

@author: KHALID-RAMI
'''
import pyarabic.arabrepr
from tashaphyne.stemming import ArabicLightStemmer

arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr

# Walk through the segmentation accessors of ArabicLightStemmer on one word.
ArListem = ArabicLightStemmer()
word = u'قال'
stem = ArListem.light_stem(word)
print(ArListem.get_stem())         # light stem
print(ArListem.get_root())         # extracted root
print(ArListem.get_left())         # left segmentation position
print(ArListem.get_prefix(2))      # prefix at the given index
print(ArListem.get_right())        # right segmentation position
print(ArListem.get_unvocalized())  # word stripped of diacritics
        tweet = tweet + st.stem(a) + " "  # tail of the preceding (ISRI) stemming loop
    data1.append(tweet.strip())
# print(data1[:10])

# tashaphyne
data2 = []
import pyarabic.arabrepr
arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr
from tashaphyne.stemming import ArabicLightStemmer

ArListem = ArabicLightStemmer()
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        ArListem.light_stem(a)
        tweet = tweet + ArListem.get_stem() + " "
        # tweet = tweet + ArListem.get_root() + " "  # root instead of stem
    data2.append(tweet.strip())
# print(data2[:10])

# create a dataframe using texts and labels
trainDF = pandas.DataFrame()
trainDF['tweet'] = data2
trainDF['class'] = labels

# split the dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['tweet'], trainDF['class'], test_size=0.2)
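# --- stem vs. root (illustrative, not from the source) ----------------------
# The commented-out line above swaps get_stem() for get_root(); this sketch
# contrasts the two on one made-up token so the design choice is visible.
from tashaphyne.stemming import ArabicLightStemmer

stemmer = ArabicLightStemmer()
stemmer.light_stem(u'المدرسة')
print(stemmer.get_stem())  # light stem (affixes stripped)
print(stemmer.get_root())  # extracted root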