def test2():
    """Smoke-test root extraction via Tashaphyne's light stemmer.

    For each (word, expected_root) pair: light-stem and segment the word,
    then ask choose_root() for the best root and print whether it matches
    the expectation.  Returns 0.

    NOTE(review): another function named ``test2`` exists later in this
    file; the later definition shadows this one at import time.
    """
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    asl = ArabicLightStemmer()
    words = [
        (u'أفتضاربانني', u'ضرب'),
        (u'بأبأ', u'بءبء'),
        (u'يريدون', u'ريد'),
        (u'يستطعن', u'ريد'),
        (u'كتاب', u'كتب'),
        (u"بالميدان", u'ميد'),
        (u"بالأسيهم", u'سهم'),
        (u"آخرين", u'ءخر'),
        (u"بالأخرة", u'ءخر'),
    ]
    for word, root in words:
        print(u"**********%s*********" % word)
        asl.light_stem(word)
        asl.segment(word)
        # FIX: get_segment_list() was called three times (one print plus a
        # duplicated, unused assignment); once is enough.
        print(asl.get_segment_list())
        affixa_list = asl.get_affix_list()
        #~ root_result = choose_root(affixa_list, debug=True)
        root_result = choose_root(word, affixa_list, debug=True)
        print(root_result, root_result == root)
    return 0
def test2():
    """Compare rootDict.choose_root() against a custom stemmer's roots.

    For each (word, expected_root) pair the word is normalized (ALEF MADDA
    -> HAMZA + ALEF), light-stemmed, segmented, and the chosen root is
    printed next to the custom stemmer's root and the match flag.
    Returns 0.

    FIX: the body used Python-2 print statements (``print expr``), which is
    a SyntaxError under Python 3 while the sibling test functions use the
    print() function; converted to print() calls for consistency.

    NOTE(review): this redefines ``test2`` declared earlier in the file.
    """
    #test with tashaphyne
    #~ rootslib.create_stamped_roots()
    #~ rootslib.create_virtual_roots()
    from tashaphyne.stemming import ArabicLightStemmer
    asl = ArabicLightStemmer()
    asl_custom = abstractstemmer.customStemmer_roots()
    words = [
        (u'أفتضاربانني', u'ضرب'), (u'بأبأ', u'بءبء'),
        (u'يريدون', u'ريد'), (u'يستطعن', u'طوع'),
        (u'يستطيعون', u'طوع'), (u'الصيام', u'صوم'),
        (u'يخاف', u'خوف'), (u'كتاب', u'كتب'),
        (u"بالميدان", u'ميد'), (u"بالأسيهم", u'سهم'),
        (u"آخرين", u'ءخر'), (u"بالآخرة", u'ءخر'),
        (u"لارتاب", u'ريب'), (u"وسائل", u'وسل'),
        (u"وصائل", u'وصل'), (u"أخاه", u'ءخو'),
        (u"أخوه", u'ءخو'), (u"أخاهم", u'ءخو'),
        (u"أخانا", u'ءخو'), (u"بإذن", u'ءذن'),
        (u"للأبرار", u"برر"), (u'واتبعوا', u'تبع'),
        (u'والكاظمين', u'كظم'), (u'عد', u'عود'),
    ]
    # load root dictionary with features
    rootdict = rootslibclass.rootDict()
    for word, root in words:
        print(u"**********%s*********" % word)
        # normalize ALEF MADDA into HAMZA + ALEF before stemming
        word = re.sub(u"[%s]" % (araby.ALEF_MADDA), araby.HAMZA + araby.ALEF, word)
        asl.light_stem(word)
        asl.segment(word)
        print(asl.get_segment_list())
        affixa_list = asl.get_affix_list()
        #~ root_result = rootslib.choose_root(affixa_list, debug=True)
        root_result = rootdict.choose_root(affixa_list, debug=True)
        print(root_result, asl_custom.getroot(word), root_result == root)
    return 0
def stemming(pos_tag):
    """Light-stem the word of each (token, 'word/TAG') pair.

    :param pos_tag: iterable of 2-item sequences whose second item is a
        '/'-separated "word/TAG" string; only index 0 (the word) is stemmed.
    :return: list of stem strings, one per input pair.

    FIX: the original ``if p[-1] in adjective_tags`` test appended exactly
    the same value on both branches, so the POS filter had no effect; the
    branches are merged (behavior unchanged).
    """
    Ar_Listem = ArabicLightStemmer()
    # kept for the intended (currently inactive) adjective/noun filtering
    adjective_tags = ['JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS']
    stemmed_text = []
    for word in pos_tag:
        p = word[1].split('/')
        stemmed_text.append(str(Ar_Listem.light_stem(p[0])))
    # print("Text tokens after lemmatization of adjectives and nouns: \n")
    return stemmed_text
def test_rooter(dataframe_result):
    """Evaluate the rhyzome root extractor over a dataframe.

    :param dataframe_result: dataframe with "word" and "root" columns;
        "root" may list several acceptable roots separated by ';'.
    Prints per-word debug traces and a final accuracy percentage.
    """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict(algos=['rhyzome'])
    # debug in rhyzome rooter
    rooter.rhyzome_rooter.debug = True
    df = dataframe_result
    # avoid null roots: count rows, not cells (df.size = rows * columns)
    total = len(df.index)
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        root_list = root.split(';')
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))
        # normalize ALEF MADDA to HAMZA + ALEF before segmentation
        word = re.sub(u"[%s]" % (araby.ALEF_MADDA),
                      araby.HAMZA + araby.ALEF, word)
        asl.segment(word)
        print(asl.get_segment_list())
        affixa_list = asl.get_affix_list()
        stems = [d['stem'] for d in affixa_list]
        # FIX: encode the whole message; "Stems: " + bytes raised a
        # TypeError (str + bytes concatenation) on Python 3.
        print((u"Stems: " + u' '.join(stems)).encode('utf8'))
        roots = [d['root'] for d in affixa_list]
        print((u"Dafault roots: [%s] a %s" %
               (asl.get_root(), u' '.join(roots))).encode('utf8'))
        #~ root_result = rooter.choose_wazn_root(affixa_list, debug=True)
        root_result = rooter.choose_root(word, affixa_list, debug=True)
        print((u" ".join([u"Test root", root, u"found root",
                          root_result,
                          str(root_result in root_list)])).encode('utf8'))
        if root_result in root_list:
            cpt += 1
    print("***** Percent %.2f%% [%d/%d]" % (cpt * 100.0 / total, cpt, total))
def test_matrix(dataframe_result):
    """Evaluate matrix_root() extraction over a dataframe of (word, root).

    For every word, candidate roots are produced by matching each stem
    against a fixed template, filtered to valid dictionary roots, and the
    most frequent survivor is compared with the expected root.
    """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict()
    rooter.debug = True
    #test with tashaphyne
    df = dataframe_result
    total = df.size
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))
        asl.segment(word)
        print(asl.get_segment_list())
        affixa_list = asl.get_affix_list()
        stems = [d['stem'] for d in affixa_list]
        roots = []
        for stem in stems:
            temp_list = rooter.matrix_root(stem, u'توطيدا')
            roots.extend([d['root'] for d in temp_list])
        print((u"Candidats " + u"\t".join(roots)).encode('utf8'))
        # lookup only one time by root in dictionary
        set_roots = [x for x in set(roots) if rooter.is_root(x)]
        # remove invalid roots and keep repetition
        roots = [x for x in roots if x in set_roots]
        root_result = most_common(roots)
        print((u"Accepted " + u"\t".join(roots)).encode('utf8'))
        print((u"root " + root_result).encode('utf8'))
        print((u" ".join([u"Test root", root, u"found root",
                          root_result,
                          str(root_result == root)])).encode('utf8'))
        if root_result == root:
            cpt += 1
    # FIX: use 100.0 so the ratio is not truncated to 0 by integer
    # division under Python 2 (consistent with the sibling test functions).
    print("***** Percent %.2f%%" % (cpt * 100.0 / total))
def get(self, text):
    """Tokenize *text* with nltk and return {"Stemming": [stem, ...]}.

    Each token is run through Tashaphyne's light stemmer and its stem
    (affixes stripped) is collected in input order.
    """
    stemmer = ArabicLightStemmer()
    stems = []
    for token in nltk.word_tokenize(text):
        stemmer.light_stem(token)  # prime the stemmer's segmentation state
        stems.append(stemmer.get_stem())
    return {"Stemming": stems}
def Get_root_word(self, body):
    """Replace every space-separated token of *body* with its Arabic root.

    Tokens are split on a single space, each is light-stemmed, and the
    extracted roots are re-joined with spaces.
    """
    stemmer = ArabicLightStemmer()
    roots = []
    for token in body.split(u" "):
        stemmer.light_stem(token)  # sets internal state for get_root()
        roots.append(stemmer.get_root())
    return " ".join(roots)
def one_string_Lemmatizing(sentence, language):
    '''
    Argument: String of words
    return: list of words with Lemmatizing

    NOTE(review): `language` is accepted but not consulted here; every
    input is light-stemmed with Tashaphyne regardless — confirm intent.
    '''
    tokens = one_string_tokenization(sentence)
    stemmer = ArabicLightStemmer()
    return [stemmer.light_stem(token) for token in tokens]
def test_rooter_matrix(dataframe_result):
    """Evaluate rootDict.choose_root_matrix() accuracy over a dataframe.

    :param dataframe_result: dataframe with "word" and "root" columns;
        "root" may list several acceptable roots separated by ';'.
    Prints per-word debug traces and a final accuracy percentage.
    """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    import rootslibclass
    asl = ArabicLightStemmer()
    rooter = rootslibclass.rootDict()
    df = dataframe_result
    total = df.size
    cpt = 0
    for word, root in zip(df["word"], df["root"]):
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        root_list = root.split(';')
        print((u"Start Word : %s" % asl.get_starword()).encode('utf8'))
        asl.segment(word)
        print(asl.get_segment_list())
        affixa_list = asl.get_affix_list()
        stems = [d['stem'] for d in affixa_list]
        # FIX: encode the whole message; "Stems: " + bytes raised a
        # TypeError (str + bytes concatenation) on Python 3.
        print((u"Stems: " + u' '.join(stems)).encode('utf8'))
        roots = [d['root'] for d in affixa_list]
        print((u"Dafault roots: [%s] a %s" %
               (asl.get_root(), u' '.join(roots))).encode('utf8'))
        #~ root_result = rooter.choose_wazn_root(affixa_list, debug=True)
        root_result = rooter.choose_root_matrix(word, affixa_list, debug=True)
        print((u" ".join([u"Test root", root, u"found root",
                          root_result,
                          str(root_result in root_list)])).encode('utf8'))
        if root_result in root_list:
            cpt += 1
    print("***** Percent %.2f%% [%d/%d]" % (cpt * 100.0 / total, cpt, total))
def prediction(clean_post,
               model_file='/19_classes_7869.h5',
               w2idx_dict_file='/1367_roots_w2idx.npy',
               max_rec=3):
    """Classify a cleaned post into one (or two) of 19 Arabic labels.

    Builds a bag-of-roots feature vector (each root capped at *max_rec*
    occurrences, unknown roots bucketed under '<unk>') and feeds it to a
    pre-trained Keras model loaded from *model_file*.

    :return: a single label string, or a list of the top-2 labels when the
        top prediction is class 3 and the runner-up is not class 8.

    FIX: the root-index dict was loaded into ``word2index`` but every use
    referenced ``root_w2idx``, raising NameError at runtime; the loaded
    dict is now bound to the name actually used.
    """
    labels = [
        'تبرع بالدم', 'توظيف', 'دعوات', 'خدمات ولاد العم',
        'احتياجات طبية', 'أدوية', 'مفقودات أشخاص وأشياء', 'ملابس',
        'الرفق بالحيوان', 'قصص ولاد العم', 'استفسارات عن أي موضوع',
        'استشارات طبية', 'أعطال طرق', 'طلبات مساعدة',
        'احتياجات منزلية', 'مساعدة كبار السن', 'مساعدات تعليمية',
        'توصيل', 'كتب'
    ]
    base_dir = os.path.dirname(os.path.realpath(__file__))
    root_w2idx = np.load(base_dir + w2idx_dict_file,
                         allow_pickle=True).item()
    vocab_size = len(root_w2idx)
    model = load_model(base_dir + model_file)
    features = np.zeros((1, vocab_size))
    ArListem = ArabicLightStemmer()
    for word in clean_post.split():
        root_flag = 0
        ArListem.light_stem(word)
        roots = [dic['root'] for dic in ArListem.get_affix_list()]
        # count the first known candidate root, capped at max_rec
        for root in roots:
            if (root in root_w2idx.keys()
                    and features[0, root_w2idx[root]] < max_rec):
                features[0, root_w2idx[root]] += 1
                root_flag = 1
                break
        # no known root for this word: bucket it under <unk>
        if (not root_flag and features[0, root_w2idx['<unk>']] < max_rec):
            features[0, root_w2idx['<unk>']] += 1
    prediction = model.predict(features)[0].argsort()[-2:][::-1]
    if (prediction[0] == 3 and prediction[1] != 8):
        prediction = [labels[i] for i in prediction]
    else:
        prediction = labels[prediction[0]]
    return prediction
def search_engine(search_id):
    """Return the top-10 Kitabs querysets ranked by TF-IDF cosine similarity.

    The query is preprocessed, translated, reduced to an Arabic root, and
    — when the request carries exquery == 'Iya' — each token is expanded
    with its 4 nearest fastText neighbours before ranking.
    """
    print("Input query: ", search_id)
    preprocessed = preprocessing_query(search_id)
    print("Preprocessing query: ", preprocessed[-1])
    translated = query_translation(preprocessed)
    print("Query translation: ", translated)

    stemmer = ArabicLightStemmer()
    stemmer.light_stem(translated)
    hasil = stemmer.get_root()
    print("Stem: ", hasil)

    exquery = request.POST.get('exquery', None)
    print(exquery)
    # Query Expansion
    if exquery == 'Iya':
        print("Pakai Ekspansi Query")
        expanded_parts = []
        for word in wordpunct_tokenize(hasil):
            neighbours = PredictorConfig.modelFT.wv.most_similar(word)
            print(neighbours)
            # keep the 4 closest neighbours plus the word itself
            words = [neighbours[i][0] for i in range(4)]
            words.append(word)
            print(words)
            expanded_parts.append(' '.join(words))
        queries = [' '.join(expanded_parts)]
        print("Query Expansion: ", queries)
        hasil = queries[0]

    query_vec = PredictorConfig.tfidf_vectorizer.transform([hasil])
    print(query_vec)
    results = cosine_similarity(PredictorConfig.tfidf_matrix,
                                query_vec).reshape((-1,))
    # top-10 similarity indices, shifted by one to match DB primary keys
    top_ids = [idx + 1 for idx in results.argsort()[-10:][::-1]]
    return [Kitabs.objects.filter(id=doc_id) for doc_id in top_ids]
def one_string_Lemmatizing(sentence, language):
    '''
    Argument: String of words
    return: list of words with Lemmatizing
    '''
    tokens = one_string_tokenization(sentence)
    if language == 'English':
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(token) for token in tokens]
    if language == 'Arabic':
        stemmer = ArabicLightStemmer()
        return [stemmer.light_stem(token) for token in tokens]
    # unknown language: hand back the raw token list untouched
    return tokens
def stem(string):
    """Return the Tashaphyne light stem of every whitespace-separated word."""
    stemmer = ArabicLightStemmer()
    return [stemmer.light_stem(token) for token in string.split()]
def getStemmedText(self, text):
    """Stem the alphabetic tokens of *text*.

    self.lang == 1 selects nltk's Snowball French stemmer; any other value
    extracts Arabic roots with Tashaphyne.  Non-alphabetic tokens are
    dropped in both paths.
    """
    if self.lang == 1:
        french = nltk.stem.snowball.FrenchStemmer()
        return [french.stem(token) for token in text if token.isalpha()]

    from tashaphyne.stemming import ArabicLightStemmer
    arabic = ArabicLightStemmer()
    roots = []
    for token in text:
        if token.isalpha():
            arabic.light_stem(token)
            roots.append(arabic.get_root())
    return roots
def stemmingـprocess(word):
    """Return the Arabic root of *word*, leaving exempt/short words as-is.

    Words listed in the module-level ``stem_not`` collection and words of
    length <= 3 are returned unchanged; otherwise Tashaphyne extracts the
    root.

    FIX: merged the two identical "return the word unchanged" branches and
    removed the unused ArabicRepr setup, which also shadowed the builtin
    ``repr``.
    """
    ArListem = ArabicLightStemmer()
    if word in stem_not or len(word) <= 3:
        return word
    # Stemming word
    ArListem.light_stem(word)
    # Extract root
    return ArListem.get_root()
def Lemmatisation(self):
    """Segment each pretreated word into "prefix + stem + suffix".

    Builds {'words': ['pre + stem + suf', ...]} for display via
    ``self.aff()`` and returns the list of bare stems.

    FIX: removed the unused ``naftawayh`` WordTagger instance (only
    referenced by a commented-out line) and the dead ``result`` local
    whose ``json.dumps`` output was never used.
    """
    ws = self.Pretraitement()
    ArListem = ArabicLightStemmer()
    words_root = []
    words_all = {'words': []}
    for w in ws:
        #if not tagger.is_noun(w):
        ArListem.light_stem(w)
        ww = ArListem.get_prefix() + " + " + ArListem.get_stem(
        ) + " + " + ArListem.get_suffix()
        words_all['words'].append(ww)
        words_root.append(ArListem.get_stem())
    self.aff(words_all)
    return words_root
def _stem_light(word):
    """Return Tashaphyne's light stem of *word* (affixes stripped)."""
    from tashaphyne.stemming import ArabicLightStemmer
    return ArabicLightStemmer().light_stem(word)
#tag words for l in corps: ps=nlp.pos_tag(l) if ps[0][0]==u'\ufeff': #ZERO WIDTH NO-BREAK SPACE ps=ps[1:] dp=nlp.dependency_parse(l) dp2=[] if len(dp)==len(ps): i = dp[0][2] for ind,w in enumerate(dp): if ind+1==i: dp2.append(w) dp2.append(("NONE",i,i)) else: dp2.append(w) else: dp2=dp dp2 = dp2[1:] for ind,w in enumerate(ps) : stem = ArListem.light_stem(w[0]) pre = ArListem.get_prefix() suf = ArListem.get_suffix() ls.append(w[0]+"|"+w[1]+"|"+dp2[ind][0]+"|"+str(dp2[ind][1]-1)+"|"+func([w[0],w[1]],classifier)+"p="+pre+"|s="+suf+"\n") ls.append(". PUNC\n") corpw.writelines(ls) corp.close() corpw.close()
''' Created on 15 juin 2019 @author: KHALID-RAMI ''' # coding=utf8 import pyarabic.arabrepr from tashaphyne.stemming import ArabicLightStemmer arepr = pyarabic.arabrepr.ArabicRepr() repr = arepr.repr ArListem = ArabicLightStemmer() word = u'قال' stem = ArListem.light_stem(word) print(ArListem.get_stem()) print(ArListem.get_root()) print(ArListem.get_left()) print(ArListem.get_prefix(2)) print(ArListem.get_right()) print(ArListem.get_unvocalized())
    # (continuation of a loop begun above this view: `st`, `tweet`, `data1`
    # and `texts` are defined earlier in the file)
    for a in word_tokenize(tx):
        tweet = tweet + st.stem(a) + " "
    data1.append(tweet.strip())
#print(data1[:10])

#tashfeen
# Second variant: rebuild each tweet from Tashaphyne roots instead.
data2 = []
import pyarabic.arabrepr
arepr = pyarabic.arabrepr.ArabicRepr()
repr = arepr.repr  # NOTE(review): shadows the builtin repr
from tashaphyne.stemming import ArabicLightStemmer
ArListem = ArabicLightStemmer()
for tx in texts:
    tweet = ""
    for a in word_tokenize(tx):
        stem = ArListem.light_stem(a)
        #tweet = tweet + ArListem.get_stem()+ " "
        tweet = tweet + ArListem.get_root() + " "
    data2.append(tweet.strip())
#print(data2[:10])

# create a dataframe using texts and lables
trainDF = pandas.DataFrame()
trainDF['tweet'] = texts
trainDF['class'] = labels

# split the dataset into training and validation datasets
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['tweet'], trainDF['class'], test_size=0.2)

# create a count vectorizer object
def segmenteur_phrases(self):
    """Split self.text into clause-like phrases on Arabic discourse markers.

    Phrases are cut on punctuation, on single-word connectors
    (stop_words1), on two-word connectors (stop_words2), and before
    conjunctions (waw / thumma / fa-prefix) when the following token stems
    to a verb.  Results are both returned-by-side-effect into output.txt
    and appended to liste3.  Written for Python 2 (uses ``unicode``).
    """
    tagger = naftawayh.wordtag.WordTagger()
    ArListem = ArabicLightStemmer()
    # single-word discourse connectors that end the current phrase
    stop_words1 = [
        u"كما", u"أيضا", u"كذالك", u"مثلا", u"وكما", u"شبيه", u"نضير",
        u"ماعدا", u"باستثناء", u"إلا", u"بسبب", u"لأن", u"لكي",
        u"والنتيجة", u"والخلاصة", u"أولا", u"ثانيا", u"يليه", u"لذالك",
        u"إذا", u"نستنتج", u"أم", u"أي", u"فقد", u"لكن", u"بينما",
        u"فإذا", u"إذا", u"حيث", u"بسبب", u"لذالك", u"لما", u"حينما",
        u"وذلك", u"حيث"
    ]
    # two-word discourse connectors (matched as consecutive tokens)
    stop_words2 = [[u"بالإضافة", u"إلى"], [u"ومن", u"ذالك"],
                   [u"من", u"هنا"], [u"ونخلص", u"إلى"],
                   [u"وفي", u"البداية"], [u"إلى", u"جانب"],
                   [u"علاوة", u"على"], [u"غير", u"أنه"]]

    # helper: return the x-th element of every pair in stop_words2
    def prem_ele(u, x):
        h = []
        for d in u:
            h.append(d[x])
        return h

    # helper: strip the Arabic comma from a token, returning its first
    # non-empty fragment (None when everything is empty)
    def ele_sign(s):
        if re.split(u'،', s):
            lt = re.split(u'،', s)
            if len(lt) > 0:
                for u in lt:
                    if u != '':
                        return u

    # coarse split on sentence punctuation, dropping empty chunks
    liste1 = [
        ch for ch in re.split(r"[.!؟:()[]\n]+", unicode(self.text, "utf-8"))
        if ch != ''
    ]
    liste3 = []
    i = 0
    while i < len(liste1):
        # tokenize the current chunk on runs of spaces
        liste2 = [ch for ch in re.split(r"[ ]+", liste1[i]) if ch != '']
        k = 0
        s = ''
        while k < len(liste2):
            if ele_sign(liste2[k]) == u'و':
                # waw conjunction: cut only if the next token is a verb
                # NOTE(review): reads liste2[k + 1] without a bounds check —
                # a trailing connector raises IndexError
                stem = ArListem.light_stem(ele_sign(liste2[k + 1]))
                if tagger.is_verb(stem) == True and tagger.is_noun(
                        stem) == False:
                    if s != '':
                        liste3.append(s)
                        s = ''
                else:
                    s += liste2[k]
                    s += ' '
            elif ele_sign(liste2[k]) in stop_words1:
                liste3.append(s)
                s = ''
            elif ele_sign(liste2[k]) == u'ثم':
                # thumma conjunction: same verb test as waw
                stem = ArListem.light_stem(ele_sign(liste2[k + 1]))
                if tagger.is_verb(stem) == True and tagger.is_noun(
                        stem) == False:
                    if s != '':
                        liste3.append(s)
                        s = ''
                else:
                    s += liste2[k]
                    s += ' '
            elif ele_sign(liste2[k][0]) == u'ف':
                # fa- prefix: test the remainder of the same token
                stem = ArListem.light_stem(ele_sign(liste2[k][1::]))
                if tagger.is_verb(
                        ArListem.get_stem()) == True and tagger.is_noun(
                            ArListem.get_stem()) == False:
                    liste3.append(s)
                    s = ''
                else:
                    s += liste2[k]
                    s += ' '
            elif ele_sign(liste2[k]) in prem_ele(stop_words2, 0):
                # possible two-word connector: confirm with next token
                if ele_sign(liste2[k + 1]) in prem_ele(stop_words2, 1):
                    liste3.append(s)
                    s = ''
                    k += 1  # also consume the connector's second word
                else:
                    s += liste2[k]
                    s += ' '
            else:
                s += liste2[k]
                s += ' '
            k += 1
        # flush whatever remains of the current chunk
        if len(s) != 0:
            liste3.append(s)
            s = ''
        i += 1
    liste3 = [ch for ch in liste3 if ch != '']
    # append a human-readable report to output.txt
    with io.open('output.txt', 'a', encoding="utf-8") as file:
        file.write(
            unicode("\n\n" + "il y a " + str(len(liste3)) + " phrases\n",
                    "utf-8"))
        file.write(unicode("la liste des phrases : \n\n ", "utf-8"))
        file.write(unicode(" [ "))
        for ch in liste3:
            file.write(" ' " + ch + " ' \n\n")
        file.write(unicode(" ] "))
def test3():
    """Regression-test rootDict.choose_root() on hollow/defective verbs.

    Iterates a table of (word, expected_root) pairs dominated by قول and
    وعد derivations, prints the chosen root for each, then exercises
    extend_root() and dumps statistics about the stamped/virtual root
    dictionaries.  Returns 0.

    NOTE(review): the final statistics use ``.iteritems()``, which exists
    only on Python 2 — confirm the intended interpreter before running.
    """
    from pyarabic.arabrepr import arepr
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    asl = ArabicLightStemmer()
    rooter = rootDict()
    # (word, expected_root) table; the #~ entries are kept as an inventory
    # of previously exercised cases.
    words = [
        (u'أفتضاربانني', u'ضرب'),
        #~ (u'بأبأ',u'بءبء'),
        #~ (u'يسعى',u'سعى'),
        #~ (u'يريدون',u'ريد'),
        #~ (u'يستطعن', u'ريد'),
        #~ (u'كتاب',u'كتب'),
        #~ (u"بالميدان",u'ميد'),
        #~ (u"بالأسيهم",u'سهم'),
        #~ (u"آخرين",u'ءخر'),
        #~ (u"بالأخرة",u'ءخر'),
        #~ ('ويرمي',u'رمي'),
        #~ (u'ويرمي',u'رمي'),
        #~ (u'يرمون',u'رمي'),
        #~ (u'راميات',u'رمي'),
        #~ (u'وترمون',u'رمي'),
        #~ (u'ويرمين',u'رمي'),
        #~ (u'وترميان',u'رمي'),
        #~ (u'ورامون',u'رمي'),
        #~ (u'وليرميان',u'رمي'),
        #~ (u'لترميان',u'رمي'),
        #~ (u'لترمين',u'رمي'),
        #~ (u'رامي',u'رمي'),
        #~ (u'ورامي',u'رمي'),
        #~ (u'رماية',u'رمي'),
        #~ (u'رمايه',u'رمي'),
        #~ (u'الراميات',u'رمي'),
        #~ (u'المرميات',u'رمي'),
        #~ (u'المتراميات',u'رمي'),
        #~ (u'مترامية',u'رمي'),
        #~ (u'مترامي',u'رمي'),
        #~ (u'الرامون',u'رمي'),
        #~ (u'والراميات',u'رمي'),
        #~ (u'وسيقولون',u'قول'),
        #~ (u'وسيقال',u'قول'),
        #~ (u'وسيقيلوهم',u'قول'),
        #~ (u'وتقال',u'قول'),
        #~ (u'وتقولوا',u'قول'),
        #~ (u'وتقول',u'قول'),
        #~ (u'ومقاول',u'قول'),
        #~ (u'وقالوا',u'قول'),
        #~ (u'ومقال',u'قول'),
        (u'وتقل', u'قول'), (u'وتقلن', u'قول'),
        (u'وليقل', u'قول'), (u'ولتقلنا', u'قول'),
        (u'لتقل', u'قول'), (u'تقل', u'قول'),
        (u'ونقل', u'قول'), (u'ولنقل', u'قول'),
        (u'فتقل', u'قول'), (u'ستقل', u'قول'),
        (u'ستقلن', u'قول'), (u'وستقلن', u'قول'),
        (u'فستقل', u'قول'), (u'وقالوا', u'قول'),
        (u'قالوا', u'قول'), (u'وقالا', u'قول'),
        (u'قالا', u'قول'), (u'وقالت', u'قول'),
        (u'قالت', u'قول'), (u'ويقال', u'قول'),
        (u'يقال', u'قول'), (u'وسيقال', u'قول'),
        (u'سيقال', u'قول'), (u'ويقلن', u'قول'),
        (u'يقلن', u'قول'), (u'ويقلنا', u'قول'),
        (u'يقلنا', u'قول'), (u'وتقال', u'قول'),
        (u'تقال', u'قول'), (u'وقال', u'قول'),
        (u'قال', u'قول'), (u'وسأقول', u'قول'),
        (u'سأقول', u'قول'), (u'وقائل', u'قول'),
        (u'قائل', u'قول'), (u'وقائلان', u'قول'),
        (u'قائلان', u'قول'), (u'وقائلون', u'قول'),
        (u'قائلون', u'قول'), (u'وقائلا', u'قول'),
        (u'قائلا', u'قول'), (u'ومقال', u'قول'),
        (u'مقال', u'قول'), (u'وقائلتان', u'قول'),
        (u'قائلتان', u'قول'), (u'يعد', u'وعد'),
        (u'تعد', u'عدد'), (u'نعدهم', u'عدد'),
        (u'وتعدهم', u'وعد'), (u'تعدهم', u'وعد'),
        (u'وستعدهم', u'وعد'), (u'ستعدهم', u'وعد'),
        (u'وتعدهما', u'وعد'), (u'تعدهما', u'وعد'),
        (u'ويعدهم', u'وعد'), (u'يعدهم', u'وعد'),
        (u'ويعدهما', u'وعد'), (u'يعدهما', u'وعد'),
        (u'وسيعدهم', u'وعد'), (u'سيعدهم', u'وعد'),
        (u'وسيعدهما', u'وعد'), (u'سيعدهما', u'وعد'),
        (u'ولنعدهم', u'وعد'), (u'لنعدهم', u'وعد'),
        (u'ولنعدهما', u'وعد'), (u'لنعدهما', u'وعد'),
        (u'ولتعدهم', u'وعد'), (u'لتعدهم', u'وعد'),
        (u'ولتعدهما', u'وعد'), (u'لتعدهما', u'وعد'),
        (u'ولتعدها', u'وعد'), (u'لتعدها', u'وعد'),
        (u'وستعدها', u'وعد'), (u'ستعدها', u'وعد'),
        (u'ووعدها', u'وعد'), (u'وعدها', u'وعد'),
        (u'ووعدهم', u'وعد'), (u'وعدهم', u'وعد'),
        (u'ووعدهما', u'وعد'), (u'وعدهما', u'وعد'),
        (u'وتعد', u'وعد'), (u'تعد', u'وعد'),
        (u'وتعدني', u'وعد'), (u'تعدني', u'وعد'),
        (u'وتعدنا', u'وعد'), (u'تعدنا', u'وعد'),
        (u'وتعده', u'وعد'), (u'تعده', u'وعد'),
        (u'وواعدناهم', u'وعد'), (u'واعدناهم', u'وعد'),
        (u'ووعدناهم', u'وعد'), (u'وعدناهم', u'وعد'),
        (u'وتعدوهم', u'وعد'), (u'تعدوهم', u'وعد'),
        (u'يعتاد', u'عود'), (u'أحست', u'حسس'),
        (u'يحسون', u'حسس'), (u'ثقة', u'وثق'),
        (u'ثقات', u'وثق'), (u'بثقات', u'وثق'),
        (u'صفات', u'وصف'), (u'صلاته', u'وصل'),
    ]
    for word, root in words:
        print((u"**********%s*********" % word).encode('utf8'))
        asl.light_stem(word)
        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        affixa_list = asl.get_affix_list()
        stems = [d['stem'] for d in affixa_list]
        print(u' '.join(stems).encode('utf8'))
        #~ root_result = rooter.choose_wazn_root(affixa_list, debug=True)
        root_result = rooter.choose_root(word, affixa_list, debug=True)
        # NOTE(review): the "found root" column prints root_result twice;
        # the expected root is probably meant for the first column.
        print("Test root", root_result.encode('utf8'), "found root",
              root_result.encode('utf8'), root_result == root)
    # test root_extension
    roots = [
        u"قل",
        u"دع",
    ]
    for rt in roots:
        extended = rooter.extend_root(rt)
        print(u"\t".join([rt, u";".join(extended)]).encode('utf8'))
    # dictionary statistics: stamped vs. virtual roots against the
    # reference ROOTS set
    print('stamped roots', len(rooter.STAMP_DICT))
    print('stamped roots diff new',
          len(diff(rooter.STAMP_DICT, roots_const.ROOTS)))
    print('stamped roots removed',
          len(diff(roots_const.ROOTS, rooter.STAMP_DICT)))
    print('stamped roots max length',
          max((len(v), k, v) for k, v in rooter.STAMP_DICT.iteritems()))
    print('virtual roots', len(rooter.VIRTUAL_DICT))
    print('virtual roots diff',
          len(diff(rooter.VIRTUAL_DICT, roots_const.ROOTS)))
    print('virtual roots removed ',
          len(diff(roots_const.ROOTS, rooter.VIRTUAL_DICT)))
    print('virtual roots max length',
          max((len(v), k, v) for k, v in rooter.VIRTUAL_DICT.iteritems()))
    print('all roots', len(roots_const.ROOTS))
    return 0
# Interactive TinyDB lookup: reduce the user's input to its Arabic root
# and list every record whose 'name' contains that root.
from tinydb import TinyDB, where
from tashaphyne.stemming import ArabicLightStemmer
import re

ArListem = ArabicLightStemmer()
db = TinyDB('/json.json')

while True:
    x = input('Input to search or "q" to quit:\n>>> ')
    if x == 'q':
        break
    ArListem.light_stem(x)
    x = ArListem.get_root()
    # FIX: escape the root before interpolating it into the regex, so
    # user-derived text cannot inject regex metacharacters.
    data = db.search(where('name').matches('.*%s.*' % re.escape(x)))
    for line in data:
        print(line['name'] + ': ', end='')
        print(line['value'])
    print()
    if not data:
        print('Not found result')
def test1(args):
    """Exploratory test of star-stem filtering and root extension.

    For a handful of Arabic words: segment with Tashaphyne, build the
    candidate "star stems", filter them with valid_starstem(), normalize
    the candidate roots, and fall back to extend_root() when none is a
    dictionary root.  Returns 0.

    NOTE(review): the ``print(...).encode('utf8')`` statements are a
    Python-2 idiom (``print expr.encode(...)``); under Python 3 they call
    .encode on print's None return — confirm the intended interpreter.
    """
    word = u"لعلهم"
    print(is_root(word))
    word = u"علم"
    print(is_root(word))
    #test with tashaphyne
    from tashaphyne.stemming import ArabicLightStemmer
    asl = ArabicLightStemmer()
    words = [
        u'أفتضاربانني',
        u'بأبأ',
        u'يريدون',
        u'يستطعن',
        u'كتاب',
        u"بالميدان",
        u"بالأسيهم",
    ]
    ext = extend_root(u"رم")
    print("extende")
    print(repr(ext).decode('unicode-escape').encode('utf8'))
    for word in words:
        print(u"**********%s*********" % word)
        asl.light_stem(word)
        asl.segment(word)
        print(asl.get_segment_list())
        seg_list = asl.get_segment_list()
        starstem_list = []
        # one star-stem per (left, right) segmentation boundary
        for seg in seg_list:
            left, right = seg
            starstem_list.append(asl.get_starstem(left, right))
        print("star stems")
        print(u"\t".join(starstem_list)).encode('utf8')
        filtered_starstem_list = filter(valid_starstem, starstem_list)
        print("filtred star stem")
        print(u"\t".join(filtered_starstem_list)).encode('utf8')
        for st in starstem_list:
            print(st, u"\t".join(valid_starstem(st)).encode('utf8'))
        affixation_list = asl.get_affix_list()
        stems = [d['stem'] for d in affixation_list]
        print("Candidats stems%s" % u'\t'.join(stems))
        for st in stems:
            print(st, u"\t".join(valid_starstem(st)).encode('utf8'))
        print(
            repr(affixation_list).replace(
                '},', '},\n').decode('unicode-escape').encode('utf8'))
        print("reduce")
        #~ affixation_list = filter(verify_affix, affixation_list)
        print(
            repr(affixation_list).replace(
                '},', '},\n').decode('unicode-escape').encode('utf8'))
        roots = [normalize_root(d['root']) for d in affixation_list]
        print("Candidats %s" % u'\t'.join(roots))
        # get uniq root
        accepted = set(filter(is_root, roots))
        print("accepted %s" % u'\t'.join(accepted))
        if not accepted:
            # try to extend roots
            extended_roots = []
            for x in roots:
                extended_roots.extend(extend_root(x))
            print("Candidats extended %s" % u'\t'.join(extended_roots))
            accepted = set(filter(is_root, extended_roots))
            print("accepted level2 %s" % u'\t'.join(accepted))
        print('root %s' % asl.get_root())
    #~ print repr(STAMP_DICT).replace('},','},\n').decode('unicode-escape').encode('utf8')
    return 0