# -*- coding: utf-8 -*-
import io
import re

import naftawayh.wordtag
from tashaphyne.stemming import ArabicLightStemmer


def segmenteur_phrases(self):
    """Split self.text (UTF-8 bytes) into sentences, using punctuation,
    Arabic connector words and verb detection as boundaries."""
    tagger = naftawayh.wordtag.WordTagger()
    ArListem = ArabicLightStemmer()
    # single-word connectors that mark a sentence boundary
    stop_words1 = [
        u"كما", u"أيضا", u"كذالك", u"مثلا", u"وكما", u"شبيه", u"نضير",
        u"ماعدا", u"باستثناء", u"إلا", u"بسبب", u"لأن", u"لكي",
        u"والنتيجة", u"والخلاصة", u"أولا", u"ثانيا", u"يليه", u"لذالك",
        u"إذا", u"نستنتج", u"أم", u"أي", u"فقد", u"لكن", u"بينما",
        u"فإذا", u"حيث", u"لما", u"حينما", u"وذلك",
    ]
    # two-word connectors; the pair together marks a boundary
    stop_words2 = [
        [u"بالإضافة", u"إلى"], [u"ومن", u"ذالك"], [u"من", u"هنا"],
        [u"ونخلص", u"إلى"], [u"وفي", u"البداية"], [u"إلى", u"جانب"],
        [u"علاوة", u"على"], [u"غير", u"أنه"],
    ]

    # helper: collect the x-th element of every pair in stop_words2
    def prem_ele(pairs, x):
        return [d[x] for d in pairs]

    # helper: strip the Arabic comma from a token and return the first
    # non-empty part (None if nothing is left)
    def ele_sign(s):
        for part in re.split(u'،', s):
            if part != '':
                return part

    # split on sentence-final punctuation; brackets are escaped inside the
    # character class (the original pattern left them unescaped)
    liste1 = [ch for ch in
              re.split(r"[.!؟:()\[\]\n]+", unicode(self.text, "utf-8"))
              if ch != '']
    liste3 = []
    for chunk in liste1:
        liste2 = [ch for ch in re.split(r"[ ]+", chunk) if ch != '']
        k = 0
        s = ''
        while k < len(liste2):
            token = ele_sign(liste2[k])
            # guard the look-ahead so the last token cannot raise IndexError
            nxt = ele_sign(liste2[k + 1]) if k + 1 < len(liste2) else None
            if token in (u'و', u'ثم'):
                # conjunction followed by a verb starts a new sentence
                stem = ArListem.light_stem(nxt) if nxt else None
                if stem and tagger.is_verb(stem) and not tagger.is_noun(stem):
                    if s != '':
                        liste3.append(s)
                        s = ''
                else:
                    s += liste2[k] + ' '
            elif token in stop_words1:
                liste3.append(s)
                s = ''
            elif ele_sign(liste2[k][0]) == u'ف':
                # prefix 'ف' followed by a verb also starts a new sentence
                rest = ele_sign(liste2[k][1:])
                if rest:
                    ArListem.light_stem(rest)
                    stem = ArListem.get_stem()
                else:
                    stem = None
                if stem and tagger.is_verb(stem) and not tagger.is_noun(stem):
                    liste3.append(s)
                    s = ''
                else:
                    s += liste2[k] + ' '
            elif token in prem_ele(stop_words2, 0):
                if nxt in prem_ele(stop_words2, 1):
                    liste3.append(s)
                    s = ''
                    k += 1
                else:
                    s += liste2[k] + ' '
            else:
                s += liste2[k] + ' '
            k += 1
        if s != '':
            liste3.append(s)
    liste3 = [ch for ch in liste3 if ch != '']
    with io.open('output.txt', 'a', encoding="utf-8") as file:
        file.write(u"\n\nthere are %d sentences\n" % len(liste3))
        file.write(u"the list of sentences: \n\n ")
        file.write(u" [ ")
        for ch in liste3:
            file.write(u" ' " + ch + u" ' \n\n")
        file.write(u" ] ")
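# A minimal harness for the segmenter above. The class that owns
# segmenteur_phrases is not shown in this snippet, so _Doc is a
# hypothetical stub that only provides the expected self.text bytes:
class _Doc(object):
    segmenteur_phrases = segmenteur_phrases

    def __init__(self, raw_bytes):
        self.text = raw_bytes  # UTF-8 encoded Arabic text

# _Doc(open('input.txt').read()).segmenteur_phrases()  # appends to output.txt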
def test1(args):
    # is_root, extend_root, normalize_root and valid_starstem are assumed
    # to be defined in the enclosing module (roots-library helpers)
    word = u"لعلهم"
    print(is_root(word))
    word = u"علم"
    print(is_root(word))
    # test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    asl = ArabicLightStemmer()
    words = [u'أفتضاربانني', u'بأبأ', u'يريدون', u'يستطعن', u'كتاب',
             u"بالميدان", u"بالأسيهم"]
    ext = extend_root(u"رم")
    print("extended")
    print(repr(ext).decode('unicode-escape').encode('utf8'))
    for word in words:
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        for seg in seg_list:
            left, right = seg
            starstem_list.append(asl.get_starstem(left, right))
        print("star stems")
        print(u"\t".join(starstem_list).encode('utf8'))
        filtered_starstem_list = filter(valid_starstem, starstem_list)
        print("filtered star stems")
        print(u"\t".join(filtered_starstem_list).encode('utf8'))
        for st in starstem_list:
            print(st, u"\t".join(valid_starstem(st)).encode('utf8'))
        affixation_list = asl.get_affix_list()
        stems = [d['stem'] for d in affixation_list]
        print("Candidate stems %s" % u'\t'.join(stems))
        for st in stems:
            print(st, u"\t".join(valid_starstem(st)).encode('utf8'))
        print(repr(affixation_list).replace('},', '},\n')
              .decode('unicode-escape').encode('utf8'))
        print("reduce")
        #~ affixation_list = filter(verify_affix, affixation_list)
        print(repr(affixation_list).replace('},', '},\n')
              .decode('unicode-escape').encode('utf8'))
        roots = [normalize_root(d['root']) for d in affixation_list]
        print("Candidates %s" % u'\t'.join(roots))
        # keep only unique accepted roots
        accepted = set(filter(is_root, roots))
        print("accepted %s" % u'\t'.join(accepted))
        if not accepted:
            # no root accepted: try to extend the candidates
            extended_roots = []
            for x in roots:
                extended_roots.extend(extend_root(x))
            print("Candidates extended %s" % u'\t'.join(extended_roots))
            accepted = set(filter(is_root, extended_roots))
            print("accepted level2 %s" % u'\t'.join(accepted))
        print('root %s' % asl.get_root())
    #~ print repr(STAMP_DICT).replace('},','},\n').decode('unicode-escape').encode('utf8')
    return 0
# ISRI root stemmer (NLTK)
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import word_tokenize
import pandas

# `texts`, `labels` and `data1` are assumed to be defined earlier in the script
st = ISRIStemmer()
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        tweet = tweet + st.stem(a) + " "
    data1.append(tweet.strip())
#print(data1[:10])

# tashaphyne light stemmer / root extractor
data2 = []
import pyarabic.arabrepr
arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr  # note: shadows the builtin repr
from tashaphyne.stemming import ArabicLightStemmer
ArListem = ArabicLightStemmer()
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        stem = ArListem.light_stem(a)
        #tweet = tweet + ArListem.get_stem() + " "
        tweet = tweet + ArListem.get_root() + " "
    data2.append(tweet.strip())
#print(data2[:10])

# create a dataframe using texts and labels
trainDF = pandas.DataFrame()
trainDF['tweet'] = texts
trainDF['class'] = labels

# split the dataset into training and validation datasets
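# A minimal sketch of the split announced by the comment above, assuming
# scikit-learn is available (model_selection.train_test_split is its
# standard helper; the test_size and random_state values are illustrative):
from sklearn.model_selection import train_test_split

train_x, valid_x, train_y, valid_y = train_test_split(
    trainDF['tweet'], trainDF['class'], test_size=0.2, random_state=42)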
from tashaphyne.stemming import ArabicLightStemmer
from sklearn.preprocessing import LabelEncoder
import pickle
import pandas as pd
import numpy as np
from keras.preprocessing import sequence
from keras.models import load_model

ArListem = ArabicLightStemmer()
classes = ['Calture', 'Diverse', 'Economy', 'Politics', 'Sport']


def text_to_tfidf(input_text):
    """Light-stem the input text and vectorize it with the saved TF-IDF model."""
    tfidf_model = pickle.load(open('model/tfidf_model.sav', 'rb'))
    temp_text = []
    for word in input_text.split(" "):
        stem = ArListem.light_stem(word)
        stem = ArListem.get_stem()
        temp_text.append(stem)
    temp_text = pd.DataFrame([" ".join(temp_text)])
    tfidf_text = tfidf_model.transform(temp_text[0])
    return tfidf_text


def classification_nb(input_text):
    """Classify the text with the saved naive Bayes model; return the label name."""
    nb_model = pickle.load(open('model/naive_bayes_model.sav', 'rb'))
    tfidf_text = text_to_tfidf(input_text)
    label = nb_model.predict(tfidf_text)[0]
    return classes[label]
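# A minimal usage sketch (the input string is hypothetical; the pickled
# models under model/ are assumed to exist on disk):
if __name__ == '__main__':
    sample = u"فاز الفريق في المباراة النهائية"  # hypothetical sports headline
    print(classification_nb(sample))  # prints one of the names in `classes`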
# coding=utf8
'''
Created on 15 June 2019

@author: KHALID-RAMI
'''
import pyarabic.arabrepr
from tashaphyne.stemming import ArabicLightStemmer

arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr  # note: shadows the builtin repr
ArListem = ArabicLightStemmer()
word = u'قال'
stem = ArListem.light_stem(word)
print(ArListem.get_stem())         # stem after affix stripping
print(ArListem.get_root())         # extracted root
print(ArListem.get_left())         # left stemming boundary
print(ArListem.get_prefix(2))
print(ArListem.get_right())        # right stemming boundary
print(ArListem.get_unvocalized())  # word stripped of diacritics
def test_rooter_matrix(dataframe_result):
    """Evaluate the matrix-based root chooser against a (word, root) dataframe."""
    from pyarabic.arabrepr import arepr
    # test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict()
    df = dataframe_result
    total = len(df)  # number of rows (df.size would count cells, not words)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        # the gold column may list several valid roots separated by ';'
        root_list = root.split(';')
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))
        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        affixa_list = asl.get_affix_list()
        # candidate stems and default roots
        stems = [d['stem'] for d in affixa_list]
        print("Stems: " + u' '.join(stems).encode('utf8'))
        roots = [d['root'] for d in affixa_list]
        print((u"Default roots: [%s] a %s" % (asl.get_root(),
                                              u' '.join(roots))).encode('utf8'))
        #~ root_result = rooter.choose_wazn_root(affixa_list, debug=True)
        root_result = rooter.choose_root_matrix(word, affixa_list, debug=True)
        print((u" ".join([u"Test root", root, u"found root",
                          root_result, str(root_result in root_list)])).encode('utf8'))
        if root_result in root_list:
            cpt += 1
    print("***** Percent %.2f%% [%d/%d]" % (cpt * 100.0 / total, cpt, total))
def _stem_light(word):
    """Return the Tashaphyne light stem of an Arabic word."""
    from tashaphyne.stemming import ArabicLightStemmer
    stemmer = ArabicLightStemmer()
    return stemmer.light_stem(word)
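# A minimal usage sketch; light_stem() returns the stem string
# (the same value exposed by the stemmer's get_stem()):
if __name__ == '__main__':
    print(_stem_light(u'يكتبون').encode('utf8'))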
import codecs, nltk, re, random
from stanfordcorenlp import StanfordCoreNLP
from tashaphyne.stemming import ArabicLightStemmer

ArListem = ArabicLightStemmer()


# each `sentence` below is a record parsed from finalData.txt; the numeric
# indices follow that file's column layout
def generate_featureS(sentence):
    depgender = sentence[3][2]
    return {"depgender": depgender}


def generate_featureJ(sentence):
    depgender = sentence[3][2]
    depn = sentence[3][1]
    depsuf = sentence[3][4]
    return {"depgender": depgender, "depnum": depn, "depsuf": depsuf[2:]}


def generate_featureO(sentence):
    n = sentence[4]
    gender = sentence[5]
    return {"num": n, "gender": gender}


def generate_featureR(sentence):
    n = sentence[4]
    gender = sentence[5]
    return {"num": n, "gender": gender}


corp = codecs.open('finalData.txt', 'r', 'utf-8')
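# A hedged sketch of how such feature dicts are typically consumed with
# NLTK's NaiveBayesClassifier (labelled_sentences is hypothetical: pairs
# of parsed records and gold labels drawn from finalData.txt):
# train_set = [(generate_featureO(sent), label) for sent, label in labelled_sentences]
# classifier = nltk.NaiveBayesClassifier.train(train_set)
# print(classifier.classify(generate_featureO(new_sentence)))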
from tinydb import TinyDB, Query
from tashaphyne.stemming import ArabicLightStemmer
import os

ArListem = ArabicLightStemmer()
dir_path = os.path.dirname(os.path.realpath(__file__))
sw = tuple(open('/home/ahmd/Downloads/python/stopword.txt').read().split())
text = open('/home/ahmd/Downloads/code/out.txt')

# count root frequencies over the first million tokens
count = {}
xx = 1
for line in text:
    if xx == 1000000:
        break
    for word in line.split():
        print(xx)
        xx += 1
        if xx == 1000000:
            break
        if word not in sw:
            ArListem.light_stem(word)
            root = ArListem.get_root()
            count[root] = count.get(root, 0) + 1

# store the bag of roots in TinyDB
lis = [{'name': key, 'value': value} for key, value in count.items()]
db = TinyDB('/home/ahmd/Downloads/code/bows.json')
db.purge_tables()  # TinyDB 3.x API (renamed drop_tables() in 4.x)
db.insert_multiple(lis)