def similar_word(model, sentence, word, named_entities, central_words):
    # Find and print the terms most similar to a word.
    # Relies on project helpers: RepresentsInt, stopwords_classified,
    # number, clean_str, seg and set_list.
    val_sim_central_words = 0.7
    val_sim = 0.7
    if RepresentsInt(word):
        an = number.ArNumbers()
    elif stopwords_classified.this_is_stop_word(word):
        return ''  # remove stop words
    elif word in named_entities:
        return word  # Option 6 --> keep named entities unchanged
    elif word in central_words:
        val_sim = val_sim_central_words
    try:
        word_compose = word
        origin_word = word
        print(origin_word)
        word_compose = word_compose.split()
        if len(word_compose) > 1:
            return word
        word = clean_str(word)
        most_similar = model.wv.most_similar(araby.normalize_ligature(word))
        lst = [x for x, y in most_similar if y > val_sim]
        lst = [''.join(c for c in x if c not in string.punctuation) for x in lst]
        if len(lst) == 0:
            return origin_word
        lst = [seg.stem(x) for x in lst]  # Option 10
        lst.insert(0, word)  # inserting the source word is mandatory
        lst = set_list(lst)
        if len(lst) == 1:
            return origin_word
        return '(' + (u' | '.join(lst)) + ')'  # Option 8 --> expand with similar words using word2vec
    except Exception:
        return origin_word
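# A minimal usage sketch for similar_word, assuming a gensim Word2Vec model
# trained on normalized Arabic text and the helper functions referenced above
# (clean_str, set_list, seg, ...) in scope. The model path and the two sets
# below are hypothetical placeholders, not part of the original project.
from gensim.models import Word2Vec

model = Word2Vec.load("arabic_w2v.model")  # hypothetical model file
named_entities = {u"مصر", u"القاهرة"}
central_words = {u"اقتصاد"}
# Expands a word into a disjunction of similar terms, e.g. "(سيارة | عربة)",
# or returns it unchanged when no neighbor clears the 0.7 threshold.
print(similar_word(model, u"", u"سيارة", named_entities, central_words))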
def preprocess(sentence):
    sentence = araby.strip_tatweel(sentence)
    sentence = sentence.replace(araby.SMALL_ALEF + araby.ALEF_MAKSURA, araby.ALEF_MAKSURA)
    sentence = sentence.replace(araby.ALEF_MAKSURA + araby.SMALL_ALEF, araby.ALEF_MAKSURA)
    sentence = re.sub(ALEFAT_PATTERN, araby.ALEF, sentence)
    sentence = araby.normalize_ligature(sentence)
    sentence = araby.normalize_teh(sentence)
    sentence = araby.strip_tashkeel(sentence)
    sentence = re.sub(r'[^\d\w]', r' ', sentence)
    sentence = re.sub(r'( ){2,}', r'\1', sentence)
    return sentence
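# A quick check of preprocess() above; a sketch assuming ALEFAT_PATTERN is the
# compiled alef-variants pattern exported by pyarabic.araby.
import re
import pyarabic.araby as araby
from pyarabic.araby import ALEFAT_PATTERN

# Tatweel, diacritics and alef variants are stripped/unified; only word
# characters separated by single spaces remain.
print(preprocess(u"الْعَرَبِيَّـــةُ الفُصْحَى!"))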
def create_arabic_node(self, cluster_name, label, **kwargs):
    """
    Checks that label is an Arabic string, removes tatweel and
    normalizes ligatures. Adds unvocalized_label.
    """
    label = araby.normalize_ligature(araby.strip_tatweel(label))
    label = label.replace(araby.SMALL_ALEF, "")
    if not araby.is_arabicstring(label):
        raise RuntimeError("'%s' is not an Arabic string" % label)
    if "unvocalized_label" not in kwargs:
        kwargs["unvocalized_label"] = araby.strip_tashkeel(label)
    return self.create_node(cluster_name, label, **kwargs)
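# Hypothetical sketch of a host class for create_arabic_node: it validates and
# normalizes the label, then delegates to a generic create_node. The
# ArabicGraph class is an illustrative stand-in, not the original codebase.
import pyarabic.araby as araby

class ArabicGraph:
    def create_node(self, cluster_name, label, **kwargs):
        return {"cluster": cluster_name, "label": label, **kwargs}

    create_arabic_node = create_arabic_node  # bind the function above as a method

node = ArabicGraph().create_arabic_node("roots", u"كَتَبَ")
print(node["unvocalized_label"])  # the label with tashkeel stripped: كتب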
def clean_str(text):  # Option 1 --> normalizing
    search = [u"أ", u"إ", u"آ", u"ة", u"_", u"-", u"/", u".", u"،", u" و ", u" يا ", u'"',
              u"ـ", u"'", u"ى", u"\\", u'\n', u'\t', u'"', u'?', u'؟', u'!']
    replace = [u"ا", u"ا", u"ا", u"ه", u" ", u" ", u"", u"", u"", u" و", u" يا", u' " ',
               u"", u"", u"ي", u"", u' ', u' ', u' ', u' ', u' ', u' ! ']
    text = araby.normalize_ligature(text)
    text = unicodedata.normalize('NFKD', text)
    text = araby.strip_tashkeel(text)  # remove tashkeel
    p_longation = re.compile(r'(.)\1+')  # squeeze elongation (repeated letters)
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    text = text.replace(u'وو', u'و')
    text = text.replace(u'يي', u'ي')
    text = text.replace(u'اا', u'ا')
    for i in range(0, len(search)):
        text = text.replace(unicodedata.normalize('NFKD', search[i]),
                            unicodedata.normalize('NFKD', replace[i]))
    text = text.replace(u'ئ', u'ئ')  # unify visually identical encodings of yeh-with-hamza
    # trim
    text = text.strip()
    return text
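# Sanity check for clean_str: letter runs are squeezed, doubled waw/yeh/alef
# collapse, and hamza-seated alefs become bare alef.
import re
import unicodedata
import pyarabic.araby as araby

print(clean_str(u"أهلاااا وسهلاااا"))  # expected: اهلا وسهلا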
def preprocess(sentences, stopwords, isStopword=False):
    """
    Takes an array of complete Arabic sentences and performs the
    following operations on each of them:
      1.) strips harakat
      2.) strips tashkeel
      3.) strips the last haraka
      4.) strips tatweel
      5.) strips shadda
      6.) normalizes lam-alef ligatures
      7.) normalizes hamza
      8.) tokenizes

    Returns a 2D matrix where each row holds the normalized tokens
    of one sentence.
    """
    output = []
    for sentence in sentences:
        text = araby.strip_harakat(sentence)
        text = araby.strip_tashkeel(text)
        text = araby.strip_lastharaka(text)
        text = araby.strip_tatweel(text)
        text = araby.strip_shadda(text)
        text = araby.normalize_ligature(text)
        text = araby.normalize_hamza(text)
        text = clean_str(text)
        try:
            # Keep the leading run of characters up to the first Latin
            # letter or newline (drops trailing Latin-script content).
            text = re.match(r'[^\nA-Za-z]+', text).group()
            tokens = araby.tokenize(text)
            if not isStopword:
                tokens = remove_stopwords(stopwords, tokens)
            tokens = [t for t in tokens if t != '\n']
            output.append(tokens)
        except AttributeError:
            # re.match returned None: nothing usable in this sentence.
            pass
    return output
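# Minimal sketch of running the batch preprocess() above. remove_stopwords is
# a hypothetical stand-in for the project's own helper; any filter with this
# signature will do.
def remove_stopwords(stopwords, tokens):
    return [t for t in tokens if t not in stopwords]

stopwords = [u"في", u"من"]
sentences = [u"الْعَرَبِيَّةُ الفُصْحَى في العالم"]
print(preprocess(sentences, stopwords))  # one row of normalized tokens per sentence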
u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1 = u"" for word in word_list: print(word, '\t', end=" ") if araby.is_vocalized(word): print(' is vocalized', end=" ") if araby.is_vocalizedtext(word): print(' is vocalized text', end=" ") if araby.is_arabicword(word): print(' is valid word', end=" ") else: print("invalid arabic word", end=" ") print(' strip harakat', araby.strip_harakat(word), end=" ") print(' strip tashkeel', araby.strip_tashkeel(word), end=" ") print(' strip tatweel', araby.strip_tatweel(word), end=" ") print(' normalize ligature ', araby.normalize_ligature(word), end=" ") if araby.vocalizedlike(word, word1): print("vocalized_like", end=" ") print() word1 = word if araby.vocalizedlike(u"العربية", u"العرَبية"): print("vocalized_like", end=" ") word = u"الْعَرَيِيّةُ" word_list = [ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1 = u"" for word in word_list:
import pyarabic.araby as araby

word = u"الْعَرَيِيّةُ"
for c in word:
    if araby.is_weak(c):
        print('weak', end=" ")
    if araby.is_moon(c):
        print('moon', end=" ")
    if araby.is_sun(c):
        print('sun', end=" ")
    print(araby.order(c), end=" ")
    print()
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
    u"سئل لأنه يؤم الإمام"
]
word1 = u""
for word in word_list:
    print(word)
    if araby.is_vocalized(word):
        print(' is vocalized')
    if araby.is_vocalizedtext(word):
        print(' is vocalized text')
    if araby.is_arabicword(word):
        print(' is valid word')
    else:
        print("invalid arabic word")
    print(' strip harakat', araby.strip_harakat(word))
    print(' strip tashkeel', araby.strip_tashkeel(word))
    print(' strip tatweel', araby.strip_tatweel(word))
    print(' normalize ligature ', araby.normalize_ligature(word))
    print(' normalize hamza', araby.normalize_hamza(word))
    if araby.vocalizedlike(word, word1):
        print("vocalized_like")
    word1 = word
if araby.vocalizedlike(u"العربية", u"العرَبية"):
    print("vocalized_like")
u"غير مشكول", "Taha", ] word1=u"" for word in word_list: print word.encode('utf8'),'\t', if araby.is_vocalized(word): print ' is vocalized', ## if araby.isArabicstring(word): print ' iisArabicstring', ## else:print ' invalid arabicstring', if araby.is_vocalizedtext(word): print ' is vocalized text', if araby.is_arabicword(word): print ' is valid word', else: print "invalid arabic word", print ' strip harakat', araby.strip_harakat(word).encode('utf8'), print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'), print ' strip tatweel',araby.strip_tatweel(word).encode('utf8'), print ' normalize ligature ', araby.normalize_ligature(word).encode('utf8'), if araby.vocalizedlike(word, word1): print "vocalized_like", print; word1=word; if araby.vocalizedlike(u"العربية",u"العرَبية"): print "vocalized_like", word=u"الْعَرَيِيّةُ" word_list=[ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1=u"" for word in word_list: print word.encode('utf8'),'\t',
import numpy as np
import pickle as pkl
from pyarabic.araby import normalize_hamza, normalize_ligature

trn_ids = np.load('data/wiki/ar/tmp/val_ids.npy')
it = pkl.load(open('data/wiki/ar/tmp/itos.pkl', 'rb'))
t = 'وإن لم يريد أن يفعل بي الفصحى'
print(normalize_ligature(t))
def run_diac(gomla, dialect):
    sos = 'بدايةجملة' if dialect == 'ca' else 'بداية'
    eos = 'نهايةجملة' if dialect == 'ca' else 'نهاية'
    token_list_7 = LastNTokens(7, sos)
    fname = randint(0, 100000)
    with codecs.open(f'diacritizer/userdata/{dialect}/{fname}.fmt',
                     mode='w', encoding='utf-8') as infile:
        gomla = strip_tatweel(araby.normalize_ligature(gomla))
        gomla_list = araby.tokenize(gomla.replace('_', '-'),
                                    morphs=araby.strip_tashkeel)
        # Write each token inside its sliding 7-token context window.
        for token in gomla_list:
            t = ' '.join(token)
            token_list_7.add_tokens_list(t, 0)
            infile.write(token_list_7.get_n_tokens() + '\n')
        # Flush the window with six end-of-sentence markers.
        for _ in range(6):
            token_list_7.add_tokens_list(eos, 0)
            infile.write(token_list_7.get_n_tokens() + '\n')
    if dialect == 'ca':
        ca_runner.infer(
            f"diacritizer/userdata/ca/{fname}.fmt",
            predictions_file=f"diacritizer/userdata/ca/{fname}.rlt",
            checkpoint_path=None,
            log_time=False)
    elif dialect == 'msa':
        msa_runner.infer(
            f"diacritizer/userdata/msa/{fname}.fmt",
            predictions_file=f"diacritizer/userdata/msa/{fname}.rlt",
            checkpoint_path=None,
            log_time=False)
    elif dialect == 'tun':
        tn_runner.infer(
            f"diacritizer/userdata/tun/{fname}.fmt",
            predictions_file=f"diacritizer/userdata/tun/{fname}.rlt",
            checkpoint_path=None,
            log_time=False)
    elif dialect == 'mor':
        ma_runner.infer(
            f"diacritizer/userdata/mor/{fname}.fmt",
            predictions_file=f"diacritizer/userdata/mor/{fname}.rlt",
            checkpoint_path=None,
            log_time=False)
    with codecs.open(f'diacritizer/userdata/{dialect}/{fname}.rlt',
                     mode='r', encoding='utf-8') as outfile:
        diacritized_tokens = list()
        counters = defaultdict(Counter)
        for i, line in enumerate(outfile):
            dtokens = line.strip().split(' _ ')
            for j, _ in enumerate(dtokens):
                tk = dtokens[j - 1 - i % 7]
                if tk not in [eos, sos]:
                    counters[j].update([tk])
                if sum(counters[j].values()) >= 7:
                    # Majority vote across the 7 overlapping predictions.
                    diacritized_tokens.append(
                        counters[j].most_common(1)[0][0].replace(' ', ''))
                    counters[j].clear()
        return ' '.join(diacritized_tokens)
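# Hedged call sketch for run_diac: it writes the 7-token context file, runs
# the dialect-specific runner, and majority-votes the diacritized tokens. It
# assumes the runners (msa_runner etc.), LastNTokens and the
# diacritizer/userdata directories are set up as in the original project.
diacritized = run_diac(u"ذهب الولد الى المدرسة", "msa")
print(diacritized)  # the input sentence with predicted diacritics restored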
u"غير مشكول", "Taha", ] word1 = u"" for word in word_list: print word.encode('utf8'), '\t', if araby.is_vocalized(word): print ' is vocalized', ## if araby.isArabicstring(word): print ' iisArabicstring', ## else:print ' invalid arabicstring', if araby.is_vocalizedtext(word): print ' is vocalized text', if araby.is_arabicword(word): print ' is valid word', else: print "invalid arabic word", print ' strip harakat', araby.strip_harakat(word).encode('utf8'), print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'), print ' strip tatweel', araby.strip_tatweel(word).encode('utf8'), print ' normalize ligature ', araby.normalize_ligature(word).encode( 'utf8'), if araby.vocalizedlike(word, word1): print "vocalized_like", print word1 = word if araby.vocalizedlike(u"العربية", u"العرَبية"): print "vocalized_like", word = u"الْعَرَيِيّةُ" word_list = [ u"الْعَرَيِيّةُ", u"العربية", u"الْعَرَيِيّةُ الفصحى", u"غير مشكول", "Taha", ] word1 = u"" for word in word_list: print word.encode('utf8'), '\t',
def ar_tokenizer(t):
    return [
        wordpunct_tokenize(
            normalize_ligature(normalize_hamza(strip_tashkeel(k))))
        for k in t
    ]
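# Example run of ar_tokenizer; wordpunct_tokenize is nltk's, and the
# normalizers come from pyarabic.
from nltk.tokenize import wordpunct_tokenize
from pyarabic.araby import strip_tashkeel, normalize_hamza, normalize_ligature

docs = [u"الْعَرَبِيَّةُ الفُصْحَى"]
print(ar_tokenizer(docs))  # -> [['العربية', 'الفصحى']]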
import sys
import pyarabic.araby as araby

ayah = sys.argv[1]
ayah = araby.strip_tatweel(ayah)
ayah = araby.strip_tashkeel(ayah)
ayah = araby.normalize_ligature(ayah)
ayah = araby.normalize_hamza(ayah)
ayah = araby.normalize_alef(ayah)
ayah = araby.normalize_teh(ayah)
ayah = ayah.replace("ے", "ى")  # map yeh barree to alef maksura
print(ayah)
sys.stdout.flush()