def segmenteur_phrases(self):
        tagger = naftawayh.wordtag.WordTagger()
        ArListem = ArabicLightStemmer()

        stop_words1 = [
            u"كما", u"أيضا", u"كذالك", u"مثلا", u"وكما", u"شبيه", u"نضير",
            u"ماعدا", u"باستثناء", u"إلا", u"بسبب", u"لأن", u"لكي",
            u"والنتيجة", u"والخلاصة", u"أولا", u"ثانيا", u"يليه", u"لذالك",
            u"إذا", u"نستنتج", u"أم", u"أي", u"فقد", u"لكن", u"بينما", u"فإذا",
            u"إذا", u"حيث", u"بسبب", u"لذالك", u"لما", u"حينما", u"وذلك",
            u"حيث"
        ]
        stop_words2 = [[u"بالإضافة", u"إلى"], [u"ومن", u"ذالك"],
                       [u"من", u"هنا"], [u"ونخلص", u"إلى"],
                       [u"وفي", u"البداية"], [u"إلى", u"جانب"],
                       [u"علاوة", u"على"], [u"غير", u"أنه"]]

        # helper: return the element at index x of each pair in stop_words2
        def prem_ele(u, x):
            h = []
            for d in u:
                h.append(d[x])
            return h

        # helper: strip the Arabic comma "،" from a token and return its first
        # non-empty part
        def ele_sign(s):
            for part in re.split(u'،', s):
                if part != '':
                    return part

        liste1 = [
            ch
            for ch in re.split(r"[.!؟:()\[\]\n]+", unicode(self.text, "utf-8"))
            if ch != ''
        ]

        liste3 = []

        i = 0
        while i < len(liste1):
            liste2 = [ch for ch in re.split(r"[ ]+", liste1[i]) if ch != '']

            k = 0
            s = ''
            while k < len(liste2):
                # close the current sentence when "و" introduces a verb
                if ele_sign(liste2[k]) == u'و' and k + 1 < len(liste2):
                    stem = ArListem.light_stem(ele_sign(liste2[k + 1]))
                    if tagger.is_verb(stem) == True and tagger.is_noun(
                            stem) == False:
                        if s != '':
                            liste3.append(s)
                            s = ''
                    else:
                        s += liste2[k]
                        s += ' '
                elif ele_sign(liste2[k]) in stop_words1:
                    liste3.append(s)
                    s = ''
                # same test for "ثم": close the current sentence when it introduces a verb
                elif ele_sign(liste2[k]) == u'ثم' and k + 1 < len(liste2):
                    stem = ArListem.light_stem(ele_sign(liste2[k + 1]))
                    if tagger.is_verb(stem) == True and tagger.is_noun(
                            stem) == False:
                        if s != '':
                            liste3.append(s)
                            s = ''
                    else:
                        s += liste2[k]
                        s += ' '
                elif ele_sign(liste2[k][0]) == u'ف':
                    stem = ArListem.light_stem(ele_sign(liste2[k][1::]))
                    if tagger.is_verb(
                            ArListem.get_stem()) == True and tagger.is_noun(
                                ArListem.get_stem()) == False:
                        liste3.append(s)
                        s = ''
                    else:
                        s += liste2[k]
                        s += ' '
                elif (ele_sign(liste2[k]) in prem_ele(stop_words2, 0)
                      and k + 1 < len(liste2)):
                    if ele_sign(liste2[k + 1]) in prem_ele(stop_words2, 1):
                        liste3.append(s)
                        s = ''
                        k += 1
                    else:
                        s += liste2[k]
                        s += ' '
                else:
                    s += liste2[k]
                    s += ' '
                k += 1
            if len(s) != 0:
                liste3.append(s)
                s = ''
            i += 1

        liste3 = [ch for ch in liste3 if ch != '']

        with io.open('output.txt', 'a', encoding="utf-8") as file:
            file.write(
                unicode("\n\n" + "there are " + str(len(liste3)) + " sentences\n",
                        "utf-8"))
            file.write(unicode("the list of sentences: \n\n ", "utf-8"))
            file.write(unicode(" [ "))
            for ch in liste3:
                file.write(" ' " + ch + " ' \n\n")
            file.write(unicode(" ] "))
Example No. 2
def test1(args):
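    # NOTE: is_root, extend_root, valid_starstem and normalize_root are assumed
    # to be defined elsewhere in the surrounding module; they are not tashaphyne APIs.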
    word = u"لعلهم"
    print(is_root(word))
    word = u"علم"
    print(is_root(word))
    
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    asl = ArabicLightStemmer()        
    words = [u'أفتضاربانني',
    u'بأبأ',
    u'يريدون',
    u'يستطعن',
    u'كتاب',
    u"بالميدان",
    u"بالأسيهم",
    
    ]
    ext = extend_root(u"رم")
    print ("extende")
    print(repr(ext).decode('unicode-escape').encode('utf8'))

    for word in words:
        print(u"**********%s*********"%word)
        asl.light_stem(word)
        asl.segment(word)
        print(asl.get_segment_list())  
        seg_list = asl.get_segment_list()  
        starstem_list =[]
        for seg in seg_list:
            left, right = seg
            starstem_list.append(asl.get_starstem(left, right))
        print("star stems")
        
        print (u"\t".join(starstem_list)).encode('utf8')
        filtered_starstem_list =filter(valid_starstem, starstem_list)
        print("filtred star stem")
        print (u"\t".join(filtered_starstem_list)).encode('utf8')
        for st in starstem_list:
            print(st, u"\t".join(valid_starstem(st)).encode('utf8'))
        affixation_list= asl.get_affix_list()
        stems = [d['stem'] for d in affixation_list]
        print ("Candidats stems%s"%u'\t'.join(stems))
        for st in stems:
            print( st, u"\t".join(valid_starstem(st)).encode('utf8') )       
        
        print( repr(affixation_list).replace('},','},\n').decode('unicode-escape').encode('utf8'))
        print("reduce")
        #~ affixation_list = filter(verify_affix, affixation_list)
        print(repr(affixation_list).replace('},','},\n').decode('unicode-escape').encode('utf8'))

        roots = [normalize_root(d['root']) for d in affixation_list]
        print ("Candidats %s"%u'\t'.join(roots))
        # get uniq root
        accepted = set(filter(is_root, roots))
        print ("accepted %s"%u'\t'.join(accepted))
        if not accepted:
            # try to extend roots
            
            extended_roots = []
            for x in roots:
                extended_roots.extend(extend_root(x))
            print ("Candidats extended %s"%u'\t'.join(extended_roots))
            accepted = set(filter(is_root, extended_roots ))
            print ("accepted level2 %s"%u'\t'.join(accepted))            
        print('root %s'%asl.get_root())
    #~ print repr(STAMP_DICT).replace('},','},\n').decode('unicode-escape').encode('utf8')
    return 0
Example No. 3
# Assumed context for this snippet: `texts` and `labels` are prepared earlier;
# the imports below are the ones the code appears to rely on.
from nltk import word_tokenize
from nltk.stem.isri import ISRIStemmer
import pandas

data1 = []
st = ISRIStemmer()
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        tweet = tweet + st.stem(a) + " "
    data1.append(tweet.strip())

#print(data1[:10])
#tashfeen
data2 = []
import pyarabic.arabrepr
arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr
from tashaphyne.stemming import ArabicLightStemmer
ArListem = ArabicLightStemmer()
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        stem = ArListem.light_stem(a)
        #tweet = tweet + ArListem.get_stem()+ " "
        tweet = tweet + ArListem.get_root() + " "
    data2.append(tweet.strip())
#print(data2[:10])

# create a dataframe using texts and lables
trainDF = pandas.DataFrame()
trainDF['tweet'] = texts
trainDF['class'] = labels

# load the pickled TF-IDF vectorizer and Naive Bayes model and classify new text
from tashaphyne.stemming import ArabicLightStemmer
from sklearn.preprocessing import LabelEncoder
import pickle
import pandas as pd
import numpy as np
from keras.preprocessing import sequence
from keras.models import load_model
ArListem = ArabicLightStemmer()

classes = ['Culture', 'Diverse', 'Economy', 'Politics', 'Sport']


def text_to_tfidf(input_text):
    tfidf_model = pickle.load(open('model/tfidf_model.sav', 'rb'))
    temp_text = []
    for word in input_text.split(" "):
        stem = ArListem.light_stem(word)
        stem = ArListem.get_stem()
        temp_text.append(stem)
    temp_text = pd.DataFrame([" ".join(temp_text)])
    tfidf_text = tfidf_model.transform(temp_text[0])
    return tfidf_text


def classification_nb(input_text):
    nb_model = pickle.load(open('model/naive_bayes_model.sav', 'rb'))
    tfidf_text = text_to_tfidf(input_text)
    label = nb_model.predict(tfidf_text)[0]
    return classes[label]
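A short usage sketch of the two helpers above; it assumes the pickled models under model/ exist and that the Naive Bayes model predicts integer labels indexing `classes`. The sample string is only a placeholder.

if __name__ == '__main__':
    sample_text = u'نص قصير للتجربة'  # placeholder Arabic input
    print(classification_nb(sample_text))  # prints one of the names in `classes`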

Example No. 5
# coding=utf8
'''
Created on 15 June 2019

@author: KHALID-RAMI
'''
import pyarabic.arabrepr
from tashaphyne.stemming import ArabicLightStemmer
arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr
ArListem = ArabicLightStemmer()
word = u'قال'
stem = ArListem.light_stem(word)
print(ArListem.get_stem())         # light stem (affixes stripped)
print(ArListem.get_root())         # extracted root
print(ArListem.get_left())
print(ArListem.get_prefix(2))
print(ArListem.get_right())
print(ArListem.get_unvocalized())  # unvocalized form (diacritics removed)
Example No. 6
def test_rooter_matrix(dataframe_result):
    """
    """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer() 
    rooter = rootslibclass.rootDict()       
    df = dataframe_result
    total = len(df)  # number of (word, root) rows tested
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        print((u"**********%s*********"%word).encode('utf8'))
        asl.light_stem(word)
        root_list = root.split(';')        
        print((u"Start Word : %s"%asl.get_starword()).encode('utf8'))        
        
        asl.segment(word)
        print(asl.get_segment_list()  )
        seg_list = asl.get_segment_list()  
        starstem_list =[]
        affixa_list = asl.get_affix_list()
        # stems prints 
        stems = [ d['stem'] for d in affixa_list]
        print("Stems: "+u' '.join(stems).encode('utf8'))        
        roots = [ d['root'] for d in affixa_list]
        print((u"Dafault roots: [%s] a %s"%(asl.get_root(),u' '.join(roots))).encode('utf8'))        
        #~ root_result = rooter.choose_wazn_root(affixa_list, debug=True)
        root_result = rooter.choose_root_matrix(word, affixa_list, debug=True)
        #~ print(u"Test root",root_result.encode('utf8'), u"found root",root_result.encode('utf8'), root_result == root)
        #~ print((u" ".join([u"Test root", root, u"found root",
        #~ root_result, str(root_result == root)])).encode('utf8'))
        #~ if root_result == root:
            #~ cpt += 1
        print((u" ".join([u"Test root", root, u"found root",
        root_result, str(root_result in root_list)])).encode('utf8'))
        if root_result in  root_list:
            cpt += 1            
    #~ print("***** Percent %.2f%%"%(cpt*100/total)) 
    print("***** Percent %.2f%% [%d/%d]"%(cpt*100.0/total, cpt, total))
Example No. 7
def _stem_light(word):

    from tashaphyne.stemming import ArabicLightStemmer

    stemmer = ArabicLightStemmer()
    return stemmer.light_stem(word)
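A tiny usage sketch of the helper above; note that the other examples on this page read results through get_stem()/get_root() after calling light_stem(), while this helper returns light_stem()'s value directly.

print(_stem_light(u'المدرسة'))  # illustrative word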
Example No. 8
import codecs, nltk, re, random
from stanfordcorenlp import StanfordCoreNLP
from tashaphyne.stemming import ArabicLightStemmer
ArListem = ArabicLightStemmer()


def generate_featureS(sentence):
    depgender = sentence[3][2]
    return {"depdender": depgender}


def generate_featureJ(sentence):
    depgender = sentence[3][2]
    depn = sentence[3][1]
    depsuf = sentence[3][4]
    return {"depgender": depgender, "depnum": depn, "depsuf": depsuf[2:]}


def generate_featureO(sentence):
    n = sentence[4]
    gender = sentence[5]
    return {"num": n, "gender": gender}


def generate_featureR(sentence):
    n = sentence[4]
    gender = sentence[5]
    return {"num": n, "gender": gender}


corp = codecs.open('finalData.txt', 'r', 'utf-8')
Example No. 9
from tinydb import TinyDB, Query
from tashaphyne.stemming import ArabicLightStemmer
import os

ArListem = ArabicLightStemmer()

dir_path = os.path.dirname(os.path.realpath(__file__))
sw = tuple(open('/home/ahmd/Downloads/python/stopword.txt').read().split())
text = open('/home/ahmd/Downloads/code/out.txt')
count = {}
xx = 1
for line in text:
    if xx == 1000000:
        break
    for word in line.split():
        print(xx)
        xx += 1
        if xx == 1000000:
            break
        if word not in sw:
            ArListem.light_stem(word)
            count[ArListem.get_root()] = count.get(ArListem.get_root(), 0) + 1

lis = []
for key, value in count.items():
    lis.append({'name': key, 'value': value})
db = TinyDB('/home/ahmd/Downloads/code/bows.json')
db.purge_tables()  # renamed to drop_tables() in TinyDB >= 4
db.insert_multiple(lis)
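A follow-up sketch, assuming the same bows.json path, showing how the stored root counts could be read back with a TinyDB query; the threshold is arbitrary.

from tinydb import TinyDB, Query

db = TinyDB('/home/ahmd/Downloads/code/bows.json')
Root = Query()
frequent = db.search(Root.value >= 100)  # roots seen at least 100 times
for entry in sorted(frequent, key=lambda e: e['value'], reverse=True):
    print(entry['name'], entry['value'])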