# model: a trained gensim word2vec model; RepresentsInt, stopwords_classified,
# clean_str, seg, and set_list come from the surrounding project.
def similar_word(model, sentence, word, named_entities, central_words):
    # Find the terms most similar to a word and return them as an
    # OR-group "(w1 | w2 | ...)"; central words get their own threshold.
    val_sim_central_words = 0.7
    val_sim = 0.7
    if RepresentsInt(word):
        an = number.ArNumbers()  # numeric token: ArNumbers converter (result unused here)
    elif stopwords_classified.this_is_stop_word(word):
        return ''  # remove stop words
    elif word in named_entities:
        return word  # Option 6: keep named entities unchanged
    elif word in central_words:
        val_sim = val_sim_central_words
    try:
        origin_word = word
        print(origin_word)
        if len(word.split()) > 1:  # multi-word expressions are returned as-is
            return word
        word = clean_str(word)
        most_similar = model.wv.most_similar(araby.normalize_ligature(word))
        lst = [x for x, y in most_similar if y > val_sim]
        lst = [''.join(c for c in x if c not in string.punctuation) for x in lst]
        if len(lst) == 0:
            return origin_word
        lst = [seg.stem(x) for x in lst]  # Option 10: stem the candidates
        lst.insert(0, word)  # inserting the source word is mandatory
        lst = set_list(lst)  # deduplicate
        if len(lst) == 1:
            return origin_word
        return '(' + (u' | '.join(lst)) + ')'  # Option 8: expand to similar words via word2vec
    except Exception:
        return origin_word
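The lookup above hinges on gensim's model.wv.most_similar. A minimal self-contained sketch of that call, with a toy corpus and the same 0.7 threshold (the corpus and vocabulary are illustrative, not from the original project):

from gensim.models import Word2Vec  # gensim >= 4 API

toy_corpus = [
    [u"ملك", u"ملكة", u"قصر"],
    [u"ملك", u"قصر", u"عرش"],
    [u"ملكة", u"عرش", u"قصر"],
]
toy_model = Word2Vec(toy_corpus, vector_size=50, window=2, min_count=1, seed=1)
pairs = toy_model.wv.most_similar(u"ملك")  # [(word, cosine similarity), ...]
print([w for w, score in pairs if score > 0.7])  # may be empty for such a tiny corpus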
import re
import pyarabic.araby as araby

# ALEFAT_PATTERN is referenced but not defined in the original snippet; a
# plausible definition matching any alef variant listed in araby.ALEFAT:
ALEFAT_PATTERN = re.compile(u"[" + u"".join(araby.ALEFAT) + u"]", re.UNICODE)

def preprocess(sentence):
    sentence = araby.strip_tatweel(sentence)
    sentence = sentence.replace(
        araby.SMALL_ALEF + araby.ALEF_MAKSURA, araby.ALEF_MAKSURA)
    sentence = sentence.replace(
        araby.ALEF_MAKSURA + araby.SMALL_ALEF, araby.ALEF_MAKSURA)
    sentence = re.sub(ALEFAT_PATTERN, araby.ALEF, sentence)
    sentence = araby.normalize_ligature(sentence)
    sentence = araby.normalize_teh(sentence)
    sentence = araby.strip_tashkeel(sentence)
    sentence = re.sub(r'[^\d\w]', r' ', sentence)   # keep only word characters and digits
    sentence = re.sub(r'( ){2,}', r'\1', sentence)  # collapse repeated spaces
    return sentence
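An illustrative call to the sentence-level normalizer above (the sample sentence is arbitrary):

print(preprocess(u"الْعَرَبِيّةُ لُغَةٌ جَمِيلَةٌ"))  # tashkeel stripped, alefs and teh normalized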
Example #3
    def create_arabic_node(self, cluster_name, label, **kwargs):
        """
        Checks that label is an Arabic string, removes tatweel, and
        normalizes ligatures. Adds unvocalized_label.
        """
        label = araby.normalize_ligature(araby.strip_tatweel(label))
        label = label.replace(araby.SMALL_ALEF, "")
        if not araby.is_arabicstring(label):
            raise RuntimeError("'%s' is not an Arabic string" % label)

        if "unvocalized_label" not in kwargs:
            kwargs["unvocalized_label"] = araby.strip_tashkeel(label)

        return self.create_node(cluster_name, label, **kwargs)
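create_node and the cluster machinery belong to the surrounding project; a minimal hypothetical stand-in, only to show what the method expects from its host class:

class NodeStore:
    # hypothetical stand-in: create_arabic_node above would be a method of a
    # class like this one, delegating storage to its create_node
    def create_node(self, cluster_name, label, **kwargs):
        return {"cluster": cluster_name, "label": label, **kwargs}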
import re
import unicodedata
import pyarabic.araby as araby

def clean_str(text):  # Option 1: normalization
    search = [u"أ", u"إ", u"آ", u"ة", u"_", u"-", u"/", u".", u"،", u" و ", u" يا ", u'"', u"ـ", u"'", u"ى", u"\\", u'\n', u'\t', u'"', u'?', u'؟', u'!']
    replace = [u"ا", u"ا", u"ا", u"ه", u" ", u" ", u"", u"", u"", u" و", u" يا", u' " ', u"", u"", u"ي", u"", u' ', u' ', u' ', u' ', u' ', u' ! ']
    text = araby.normalize_ligature(text)
    text = unicodedata.normalize('NFKD', text)
    text = araby.strip_tashkeel(text)  # remove tashkeel
    p_longation = re.compile(r'(.)\1+')  # collapse letter elongation to two characters
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    text = text.replace(u'وو', u'و')
    text = text.replace(u'يي', u'ي')
    text = text.replace(u'اا', u'ا')
    for s, r in zip(search, replace):
        text = text.replace(unicodedata.normalize('NFKD', s), unicodedata.normalize('NFKD', r))
    # recompose yeh + hamza above (U+064A U+0654) left decomposed by NFKD
    text = text.replace(u'\u064a\u0654', u'\u0626')
    text = text.strip()
    return text
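An illustrative call (input chosen to exercise hamza-seat folding, tatweel removal, and elongation collapse):

print(clean_str(u"أهلاااا وسهلاً بالعالــــم"))  # hamza seat folded; tashkeel, tatweel and elongation removed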
import regex  # third-party module; supports \p{Latin}, unlike the stdlib re
import pyarabic.araby as araby

# remove_stopwords() and clean_str() come from the surrounding project.
def preprocess(sentences, stopwords, isStopword=False):
    """
    Takes an array of complete Arabic sentences and performs the following
    operations on each of them:
        1) strip tashkeel
        2) strip harakat
        3) strip last haraka
        4) strip tatweel
        5) strip shadda
        6) normalize lam-alef ligatures
        7) normalize hamza
        8) tokenize

    Returns a 2D matrix where each row holds the normalized tokens of one sentence.
    """
    output = []
    for sentence in sentences:
        text = araby.strip_harakat(sentence)
        text = araby.strip_tashkeel(text)
        text = araby.strip_lastharaka(text)
        text = araby.strip_tatweel(text)
        text = araby.strip_shadda(text)
        text = araby.normalize_ligature(text)
        text = araby.normalize_hamza(text)
        text = clean_str(text)
        try:
            # keep the leading run of non-space, non-Latin characters
            text = regex.match(r'[^\n\s\p{Latin}]+', text).group()
            tokens = araby.tokenize(text)
            if not isStopword:
                tokens = remove_stopwords(stopwords, tokens)
            tokens = [t for t in tokens if t != '\n']
            output.append(tokens)
        except AttributeError:  # regex.match() returned None: nothing to keep
            pass

    return output
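A hypothetical call; remove_stopwords() and clean_str() are assumed from the surrounding project, and the stopword list is illustrative:

sentences = [u"هَذِهِ جُمْلَةٌ تَجْرِيبِيَّةٌ", u"وَهَذِهِ جُمْلَةٌ أُخْرَى"]
stopwords = [u"هذه", u"وهذه"]
print(preprocess(sentences, stopwords))  # one normalized token list per sentence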
Example #7
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
word1 = u""
for word in word_list:
    print(word, '\t', end=" ")
    if araby.is_vocalized(word): print(' is vocalized', end=" ")
    if araby.is_vocalizedtext(word): print(' is vocalized text', end=" ")
    if araby.is_arabicword(word): print(' is valid word', end=" ")
    else: print("invalid arabic word", end=" ")
    print(' strip harakat', araby.strip_harakat(word), end=" ")
    print(' strip tashkeel', araby.strip_tashkeel(word), end=" ")
    print(' strip tatweel', araby.strip_tatweel(word), end=" ")
    print(' normalize ligature ', araby.normalize_ligature(word), end=" ")
    if araby.vocalizedlike(word, word1): print("vocalized_like", end=" ")
    print()
    word1 = word
if araby.vocalizedlike(u"العربية", u"العرَبية"):
    print("vocalized_like", end=" ")
word = u"الْعَرَيِيّةُ"
word_list = [
    u"الْعَرَيِيّةُ",
    u"العربية",
    u"الْعَرَيِيّةُ الفصحى",
    u"غير مشكول",
    "Taha",
]
word1 = u""
for word in word_list:
Example #8
import pyarabic.araby as araby

# the loop head was cut off in this excerpt; araby.arabicrange() is an
# assumed, plausible source of characters for these per-letter checks
for c in araby.arabicrange():
    if araby.is_weak(c): print ('weak'),
    if araby.is_moon(c): print ('moon'),
    if araby.is_sun(c): print ('sun'),
    print (araby.order(c)),
    print ()
word=u"الْعَرَيِيّةُ"
word_list=[
u"الْعَرَيِيّةُ",
u"العربية",
u"الْعَرَيِيّةُ الفصحى",
u"غير مشكول",
"Taha",
u"سئل لأنه يؤم الإمام"
]
word1=u""
for word in word_list:
    print (word)
    if araby.is_vocalized(word): print (' is vocalized')
    if araby.is_vocalizedtext(word): print (' is vocalized text')
    if araby.is_arabicword(word): print (' is valid word')
    else: print ("invalid arabic word")
    print (' strip harakat', araby.strip_harakat(word))
    print (' strip tashkeel', araby.strip_tashkeel(word))
    print (' strip tatweel',araby.strip_tatweel(word))
    print (' normalize ligature ', araby.normalize_ligature(word))
    print (' normalize hamza', araby.normalize_hamza(word))
    if araby.vocalizedlike(word, word1): print ("vocalized_like")
    word1 = word
if araby.vocalizedlike(u"العربية",u"العرَبية"): print ("vocalized_like")

Example #9
u"غير مشكول",
"Taha",
]
word1=u""
for word in word_list:
    print word.encode('utf8'),'\t',
    if araby.is_vocalized(word): print ' is vocalized',
##    if araby.isArabicstring(word): print ' iisArabicstring',
##    else:print ' invalid arabicstring',
    if araby.is_vocalizedtext(word): print ' is vocalized text',
    if araby.is_arabicword(word): print ' is valid word',
    else: print "invalid arabic word",
    print ' strip harakat', araby.strip_harakat(word).encode('utf8'),
    print ' strip tashkeel', araby.strip_tashkeel(word).encode('utf8'),
    print ' strip tatweel',araby.strip_tatweel(word).encode('utf8'),
    print ' normalize ligature ', araby.normalize_ligature(word).encode('utf8'),
    if araby.vocalizedlike(word, word1): print "vocalized_like",
    print;
    word1=word;
if araby.vocalizedlike(u"العربية",u"العرَبية"): print "vocalized_like",
word=u"الْعَرَيِيّةُ"
word_list=[
u"الْعَرَيِيّةُ",
u"العربية",
u"الْعَرَيِيّةُ الفصحى",
u"غير مشكول",
"Taha",
]
word1=u""
for word in word_list:
    print word.encode('utf8'),'\t',
Example #10
import numpy as np
import pickle as pkl
from pyarabic.araby import normalize_hamza, normalize_ligature

trn_ids = np.load('data/wiki/ar/tmp/val_ids.npy')
it = pkl.load(open('data/wiki/ar/tmp/itos.pkl', 'rb'))

t = 'وإن لم يريد أن يفعل بي الفصحى'

print(normalize_ligature(t))
Example #11
import codecs
from random import randint
from collections import defaultdict, Counter

import pyarabic.araby as araby
from pyarabic.araby import strip_tatweel

# LastNTokens and the ca/msa/tn/ma runner objects come from the surrounding project.
def run_diac(gomla, dialect):
    sos = 'بدايةجملة' if dialect == 'ca' else 'بداية'  # start-of-sentence marker
    eos = 'نهايةجملة' if dialect == 'ca' else 'نهاية'  # end-of-sentence marker
    token_list_7 = LastNTokens(7, sos)  # sliding window over the last 7 tokens

    fname = randint(0, 100000)
    with codecs.open(f'diacritizer/userdata/{dialect}/{fname}.fmt',
                     mode='w',
                     encoding='utf-8') as infile:
        gomla = strip_tatweel(araby.normalize_ligature(gomla))
        gomla_list = araby.tokenize(gomla.replace('_', '-'),
                                    morphs=araby.strip_tashkeel)

        # feed each token, spelled out as space-separated characters,
        # through the 7-token sliding window
        for token in gomla_list:
            t = ' '.join(token)
            token_list_7.add_tokens_list(t, 0)
            infile.write(token_list_7.get_n_tokens() + '\n')

        # pad with six end-of-sentence markers so every real token
        # appears in seven consecutive windows
        for _ in range(6):
            token_list_7.add_tokens_list(eos, 0)
            infile.write(token_list_7.get_n_tokens() + '\n')

    runners = {'ca': ca_runner, 'msa': msa_runner, 'tun': tn_runner, 'mor': ma_runner}
    if dialect in runners:
        runners[dialect].infer(
            f"diacritizer/userdata/{dialect}/{fname}.fmt",
            predictions_file=f"diacritizer/userdata/{dialect}/{fname}.rlt",
            checkpoint_path=None,
            log_time=False)

    with codecs.open(f'diacritizer/userdata/{dialect}/{fname}.rlt',
                     mode='r',
                     encoding='utf-8') as outfile:
        diacritized_tokens = []
        counters = defaultdict(Counter)
        for i, line in enumerate(outfile):
            dtokens = line.strip().split(' _ ')
            for j, _ in enumerate(dtokens):
                # read the window diagonally so that votes for the same source
                # token line up across the 7 overlapping predictions
                tk = dtokens[j - 1 - i % 7]

                if tk not in [eos, sos]:
                    counters[j].update([tk])

                # once a token has collected its 7 votes, keep the most
                # common diacritization and reset its counter
                if sum(counters[j].values()) >= 7:
                    diacritized_tokens.append(
                        counters[j].most_common(1)[0][0].replace(' ', ''))
                    counters[j].clear()

    return ' '.join(diacritized_tokens)
Example #13
from nltk.tokenize import wordpunct_tokenize
from pyarabic.araby import strip_tashkeel, normalize_hamza, normalize_ligature

def ar_tokenizer(t):
    # strip diacritics, normalize hamza seats and lam-alef ligatures,
    # then word/punctuation-tokenize each sentence in t
    return [
        wordpunct_tokenize(
            normalize_ligature(normalize_hamza(strip_tashkeel(k)))) for k in t
    ]
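An illustrative call (the sentences are arbitrary):

print(ar_tokenizer([u"السَّلامُ عَلَيْكُم", u"أهلاً وسهلاً"]))  # one token list per sentence, tashkeel removed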
import sys
import pyarabic.araby as araby

ayah = sys.argv[1]
ayah = araby.strip_tatweel(ayah)
ayah = araby.strip_tashkeel(ayah)
ayah = araby.normalize_ligature(ayah)
ayah = araby.normalize_hamza(ayah)
ayah = araby.normalize_alef(ayah)
ayah = araby.normalize_teh(ayah)
ayah = ayah.replace("ے", "ى")  # map yeh barree (U+06D2) to alef maksura

print(ayah)
sys.stdout.flush()
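The script reads the verse from its first command-line argument; a hypothetical invocation (the script name is assumed):

# python normalize_ayah.py "فَسَيَكْفِيكَهُمُ اللَّهُ"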