def Get_root_word(self, body):
    # Stem every whitespace-separated word and replace it with its extracted root
    ArListem = ArabicLightStemmer()
    words = body.split(u" ")
    word_stem = list()
    for w in words:
        ArListem.light_stem(w)
        word_stem.append(ArListem.get_root())
    body = " ".join(word_stem)
    return body
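
The same logic as a standalone function, for quick experimentation outside the enclosing class (which is not shown in the original):

from tashaphyne.stemming import ArabicLightStemmer

def get_root_word(body):
    stemmer = ArabicLightStemmer()
    roots = []
    for w in body.split(u" "):
        stemmer.light_stem(w)
        roots.append(stemmer.get_root())
    return " ".join(roots)

print(get_root_word(u"يكتبون الكتب"))  # prints the space-joined roots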
Example #2
    def search_engine(search_id):
        print("Input query: ", search_id)
        hasilQuery = preprocessing_query(search_id)
        print("Preprocessing query: ", hasilQuery[-1])
        query = query_translation(hasilQuery)
        print("Query translation: ", query)

        # Stem the translated query and extract its root
        ArListem = ArabicLightStemmer()
        stem = ArListem.light_stem(query)
        hasil = ArListem.get_root()
        print("Stem: ", hasil)

        # 'request' is assumed to be the Django request of the enclosing view
        exquery = request.POST.get('exquery', None)
        print(exquery)

        # Query expansion: enrich each token with its most similar words
        if exquery == 'Iya':  # 'Iya' is Indonesian for 'Yes'
            print("Using query expansion")
            token = wordpunct_tokenize(hasil)
            query = []
            for word in token:
                # top similar words from the fastText model
                pq = PredictorConfig.modelFT.wv.most_similar(word)
                print(pq)
                words = [pq[i][0] for i in range(4)]  # four most similar words
                words.append(word)
                print(words)
                query.append(' '.join(words))
            # join the expanded tokens into a single query string after the loop
            hasil = ' '.join(query)
            print("Query Expansion: ", hasil)

        # Vectorize the query and rank all documents by cosine similarity
        query_vec = PredictorConfig.tfidf_vectorizer.transform([hasil])
        print(query_vec)

        results = cosine_similarity(PredictorConfig.tfidf_matrix, query_vec).reshape((-1,))

        # Return the ten best-matching Kitabs records (database ids are 1-based)
        list_object = []
        list_id = results.argsort()[-10:][::-1]
        list_id = [x + 1 for x in list_id]
        for x in list_id:
            list_object.append(Kitabs.objects.filter(id=x))

        return list_object
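
The ranking step above can be tried in isolation; this sketch substitutes a toy corpus for PredictorConfig's fitted TF-IDF objects:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# toy corpus standing in for the fitted vectorizer and document matrix
docs = [u"كتاب العلم", u"باب الصلاة", u"كتاب الصلاة"]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(docs)

# vectorize the query, score every document, and take the top matches
query_vec = vectorizer.transform([u"كتاب"])
scores = cosine_similarity(tfidf_matrix, query_vec).reshape((-1,))
top_ids = scores.argsort()[-2:][::-1]
print(top_ids, scores[top_ids])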
Example #3
def test_rooter(dataframe_result):
    """Evaluate the rhyzome rooter against a dataframe of words and expected roots."""
    import re
    import pyarabic.araby as araby
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict(algos=['rhyzome'])
    # debug in rhyzome rooter
    rooter.rhyzome_rooter.debug = True
    df = dataframe_result
    # count rows, not cells, so the percentage is per word
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        # several acceptable roots may be given, separated by ';'
        root_list = root.split(';')
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))

        # normalize ALEF MADDA into HAMZA + ALEF before segmentation
        word = re.sub(u"[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF,
                      word)

        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        affixa_list = asl.get_affix_list()
        # stems prints
        stems = [d['stem'] for d in affixa_list]
        print("Stems: " + u' '.join(stems).encode('utf8'))
        roots = [d['root'] for d in affixa_list]
        print((u"Default roots: [%s] a %s" %
               (asl.get_root(), u' '.join(roots))).encode('utf8'))
        root_result = rooter.choose_root(word, affixa_list, debug=True)
        print((u" ".join([
            u"Test root", root, u"found root", root_result,
            str(root_result in root_list)
        ])).encode('utf8'))
        if root_result in root_list:
            cpt += 1
    print("***** Percent %.2f%% [%d/%d]" % (cpt * 100.0 / total, cpt, total))
Example #4
def stemming_process(word):
    # Initialize the Arabic light stemmer
    ArListem = ArabicLightStemmer()

    # stem_not is assumed to be a predefined set of words to leave unstemmed
    if word in stem_not:
        wordRoot = word
    elif len(word) <= 3:
        # words of three letters or fewer are already root-like
        wordRoot = word
    else:
        # Stemming word
        stem = ArListem.light_stem(word)
        # Extract root
        wordRoot = ArListem.get_root()
    return wordRoot
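
A quick sketch of calling it; stem_not is referenced but never defined in the snippet, so the exception set below is a placeholder assumption:

from tashaphyne.stemming import ArabicLightStemmer

# hypothetical exception set; the original's stem_not is not shown
stem_not = {u"الله"}

print(stemming_process(u"يكتبون"))  # long enough, so the root is extracted
print(stemming_process(u"قال"))     # three letters or fewer: returned as-is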
Example #5
    def getStemmedText(self, text):
        stemmedText = []
        if self.lang == 1:
            # French: use NLTK's Snowball stemmer
            stemmer = nltk.stem.snowball.FrenchStemmer()
            stemmedText = [
                stemmer.stem(word) for word in text if word.isalpha()
            ]
        else:
            # Arabic: use Tashaphyne's light stemmer and keep the extracted root
            from tashaphyne.stemming import ArabicLightStemmer
            ArListem = ArabicLightStemmer()
            for word in text:
                if word.isalpha():
                    stem = ArListem.light_stem(word)
                    root = ArListem.get_root()
                    stemmedText.append(root)
        return stemmedText
Example #6
def test_rooter_matrix(dataframe_result):
    """Evaluate choose_root_matrix against a dataframe of words and expected roots."""
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict()
    df = dataframe_result
    # df.size counts cells, not rows; use the index length for an accurate total
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        # several acceptable roots may be given, separated by ';'
        root_list = root.split(';')
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))

        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        affixa_list = asl.get_affix_list()
        # stems prints
        stems = [d['stem'] for d in affixa_list]
        print("Stems: " + u' '.join(stems).encode('utf8'))
        roots = [d['root'] for d in affixa_list]
        print((u"Default roots: [%s] a %s" %
               (asl.get_root(), u' '.join(roots))).encode('utf8'))
        root_result = rooter.choose_root_matrix(word, affixa_list, debug=True)
        print((u" ".join([
            u"Test root", root, u"found root", root_result,
            str(root_result in root_list)
        ])).encode('utf8'))
        if root_result in root_list:
            cpt += 1
    print("***** Percent %.2f%% [%d/%d]" % (cpt * 100.0 / total, cpt, total))
Example #7
def test1(args):
    word = u"لعلهم"
    print(is_root(word))
    word = u"علم"
    print(is_root(word))

    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    asl = ArabicLightStemmer()
    words = [
        u'أفتضاربانني',
        u'بأبأ',
        u'يريدون',
        u'يستطعن',
        u'كتاب',
        u"بالميدان",
        u"بالأسيهم",
    ]
    ext = extend_root(u"رم")
    print("extended")
    print(repr(ext).decode('unicode-escape').encode('utf8'))

    for word in words:
        print(u"**********%s*********" % word)
        asl.light_stem(word)
        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        # build a star-stem for every segmentation
        starstem_list = []
        for seg in seg_list:
            left, right = seg
            starstem_list.append(asl.get_starstem(left, right))
        print("star stems")
        print((u"\t".join(starstem_list)).encode('utf8'))
        filtered_starstem_list = filter(valid_starstem, starstem_list)
        print("filtered star stems")
        print((u"\t".join(filtered_starstem_list)).encode('utf8'))
        for st in starstem_list:
            print(st, u"\t".join(valid_starstem(st)).encode('utf8'))
        affixation_list = asl.get_affix_list()
        stems = [d['stem'] for d in affixation_list]
        print("Candidate stems: %s" % u'\t'.join(stems))
        for st in stems:
            print(st, u"\t".join(valid_starstem(st)).encode('utf8'))

        print(
            repr(affixation_list).replace(
                '},', '},\n').decode('unicode-escape').encode('utf8'))
        print("reduce")
        #~ affixation_list = filter(verify_affix, affixation_list)
        print(
            repr(affixation_list).replace(
                '},', '},\n').decode('unicode-escape').encode('utf8'))

        roots = [normalize_root(d['root']) for d in affixation_list]
        print("Candidates: %s" % u'\t'.join(roots))
        # keep only unique candidates that are attested roots
        accepted = set(filter(is_root, roots))
        print("accepted %s" % u'\t'.join(accepted))
        if not accepted:
            # no candidate matched: try to extend the candidate roots
            extended_roots = []
            for x in roots:
                extended_roots.extend(extend_root(x))
            print("Extended candidates: %s" % u'\t'.join(extended_roots))
            accepted = set(filter(is_root, extended_roots))
            print("accepted level2 %s" % u'\t'.join(accepted))
        print('root %s' % asl.get_root())
    return 0
Example #8
from tinydb import TinyDB, where
from tashaphyne.stemming import ArabicLightStemmer

ArListem = ArabicLightStemmer()
db = TinyDB('/json.json')
while True:
    x = input('Input to search or "q" to quit:\n>>> ')
    if x == 'q':
        break
    ArListem.light_stem(x)
    x = ArListem.get_root()
    data = db.search(where('name').matches('.*%s.*' % x))
    for line in data:
        print(line['name'] + ': ', end='')
        print(line['value'])
        print()
    if not data:
        print('No results found')
Example #9
    data1.append(tweet.strip())

#print(data1[:10])
# tashaphyne: replace each token with its extracted root
data2 = []
import pyarabic.arabrepr
arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr
from tashaphyne.stemming import ArabicLightStemmer
ArListem = ArabicLightStemmer()
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        stem = ArListem.light_stem(a)
        #tweet = tweet + ArListem.get_stem()+ " "
        tweet = tweet + ArListem.get_root() + " "
    data2.append(tweet.strip())
#print(data2[:10])

# create a dataframe using texts and labels
trainDF = pandas.DataFrame()
trainDF['tweet'] = texts
trainDF['class'] = labels

# split the dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['tweet'], trainDF['class'], test_size=0.2)

# create a count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['tweet'])
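
A plausible next step, not part of the original snippet: transform both splits with the fitted vectorizer before feeding them to a classifier.

# transform the training and validation tweets into count vectors
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)
print(xtrain_count.shape, xvalid_count.shape)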
Example #10
# coding=utf8
'''
Created on 15 June 2019

@author: KHALID-RAMI
'''
import pyarabic.arabrepr
from tashaphyne.stemming import ArabicLightStemmer
arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr
ArListem = ArabicLightStemmer()
word = u'قال'
stem = ArListem.light_stem(word)
print(ArListem.get_stem())         # stem of the word
print(ArListem.get_root())         # extracted root
print(ArListem.get_left())
print(ArListem.get_prefix(2))
print(ArListem.get_right())
print(ArListem.get_unvocalized())  # word without vocalization marks
Example #11
from tinydb import TinyDB
from tashaphyne.stemming import ArabicLightStemmer
import os

ArListem = ArabicLightStemmer()

dir_path = os.path.dirname(os.path.realpath(__file__))
sw = tuple(open('/home/ahmd/Downloads/python/stopword.txt').read().split())
text = open('/home/ahmd/Downloads/code/out.txt')

# count root frequencies over the first million words, skipping stopwords
count = {}
xx = 1
for line in text:
    if xx == 1000000:
        break
    for word in line.split():
        print(xx)
        xx += 1
        if xx == 1000000:
            break
        if word not in sw:
            ArListem.light_stem(word)
            count[ArListem.get_root()] = count.get(ArListem.get_root(), 0) + 1

# store the bag of roots in TinyDB as {name, value} records
lis = []
for key, value in count.items():
    lis.append({'name': key, 'value': value})
db = TinyDB('/home/ahmd/Downloads/code/bows.json')
db.purge_tables()  # clear any existing data before inserting
db.insert_multiple(lis)