def moushakeel_V1(self, text, smooth_const):
    """Vocalise ``text`` by scoring every candidate sentence and keeping the best.

    The text is split into sentences with ``MyToolKit.sents``.  Each sentence
    is stripped of diacritics, alif-normalised and wrapped in the ``#`` / ``$``
    markers.  For every word the candidate vocalisations are read from the
    ``dictionary`` table (the word itself is used when no row exists), all
    candidate sentences are produced by ``self.getPossibilities`` and each is
    scored with ``self.sentProbaility``.

    Args:
        text: Raw input text.
        smooth_const: Smoothing constant forwarded to ``self.sentProbaility``.

    Returns:
        A dict with the keys ``token`` (words processed, markers excluded),
        ``type`` (distinct words per sentence, summed, markers excluded),
        ``result`` (list with one vocalised string per sentence) and ``time``
        (elapsed seconds rounded to two decimals).
    """
    t1 = time.time()
    tool = MyToolKit()
    hdb = DBHandler('data/model.db')
    result = {}
    hdb.connect()
    result_teshkeel = []
    result['token'] = 0
    result['type'] = 0
    separators = ["\n", "\r", ".", ":", ",", ';']
    for sent in tool.sents(text.strip(), separators, subsent=['"', "'", '-']):
        marked = "# " + tool.normalizeArabicAlif(tool.DeleteDiacritic(sent)) + " $"
        list_words = tool.words(marked)

        # Collect the candidate vocalisations for each word.
        candidates = {}
        for word in list_words:
            # NOTE(review): the condition is built by string concatenation from
            # user text; a word containing '"' will break or alter the query.
            # Switch to a parameterised lookup if DBHandler offers one.
            res = hdb.SelectOne('dictionary', 'type="' + word + '"',
                                attribute='vocabularies')
            candidates[word] = word if res is None else res[0]
        candidates['#'] = '#'
        candidates['$'] = '$'

        possibilities = self.getPossibilities(list_words, candidates)
        max_p = 0
        best_sent = ''
        for possib in possibilities:
            p = self.sentProbaility(possib, smooth_const)
            if p > max_p:
                # Remember the running maximum so that only a strictly better
                # candidate can replace the current best sentence.
                max_p = p
                best_sent = possib

        # Drop the leading '#' and trailing '$' markers from the output.
        result_teshkeel.append(' '.join(tool.words(best_sent)[1:-1]))
        # Accumulate over all sentences (the counters start at 0 above).
        result['token'] += len(list_words) - 2
        result['type'] += len(candidates) - 2
    result['result'] = result_teshkeel
    result['time'] = round(time.time() - t1, 2)
    return result
def GetTestSents(request):
    """Return randomly chosen test sentences as a JSON response.

    Reads the POST field ``nbrTestSents``, draws that many random ids and
    fetches the matching rows of the ``sents_test`` table.  The first and last
    word of every stored sentence are dropped before the sentences are joined,
    one per line.

    The JSON body is ``null`` when the field is missing, empty, not an integer
    or not positive.  Otherwise it is an object with ``sents_diac`` (the
    sentences as stored) and ``sents_whitout_diac`` (the same text passed
    through ``MyToolKit.DeleteDiacritic``).
    """
    # A missing field is treated like an empty one instead of raising.
    nbr = request.POST.get('nbrTestSents', '').strip()
    result = None
    if nbr != "":
        try:
            count = int(nbr)
            # A non-positive count would produce a blank SQL condition.
            if count > 0:
                tool = MyToolKit()
                hdb = DBHandler('data/model.db')
                hdb.connect()
                # presumably the number of rows in sents_test; TODO confirm
                max_id = 36111
                cond = ' ' + ' or '.join(
                    "id=" + str(random.randint(1, max_id))
                    for _ in range(count))
                data_sents = hdb.getFromTable('sents_test', attribute='sent',
                                              condition=cond)
                sents_diac = ''.join(
                    ' '.join(tool.words(row[0])[1:-1]) + '\n'
                    for row in data_sents)
                result = {
                    'sents_diac': sents_diac,
                    'sents_whitout_diac': tool.DeleteDiacritic(sents_diac),
                }
        except ValueError:
            result = None
    return HttpResponse(json.dumps(result), content_type="application/json")
def LaplaceSmoothing(self, w2, w1, u, v):
    """Return the additively smoothed score of the word pair ``w1 w2``.

    Computes ``(freq("w1 w2") + u) / (freq(w2) + v * u)``, where the pair
    frequency is read from the ``grams2`` table and the single-word frequency
    from the ``words`` table.  A missing row counts as a frequency of 0.

    NOTE(review): the denominator uses the frequency of ``w2`` although the
    pair key is ``"w1 w2"``; confirm against the callers' argument order.
    """
    db = DBHandler('data/model.db')
    db.connect()

    unigram_row = db.SelectOne('words', 'word="' + w2 + '"', attribute='freq')
    unigram_freq = 0 if unigram_row is None else unigram_row[0]

    bigram_key = w1 + ' ' + w2
    bigram_row = db.SelectOne('grams2', 'grams="' + bigram_key + '"',
                              attribute='freq')
    bigram_freq = 0 if bigram_row is None else bigram_row[0]

    numerator = bigram_freq + u
    denominator = unigram_freq + v * u
    return numerator / denominator
def moushakeel_V2(self, text, smooth_const):
    """Vocalise ``text`` word by word with Viterbi, then letter by letter.

    For every sentence a lattice is built with one column per word; each
    column lists the candidate vocalisations read from the ``dictionary``
    table (the word itself when no row exists).  ``self.Viterbi`` picks a path
    through the lattice, and the resulting sentences are finally passed to
    ``self.LettersVocaliser``.

    Returns:
        A dict with the keys ``token``, ``type``, ``result`` and ``time``
        (elapsed seconds rounded to two decimals).
    """
    started = time.time()
    tool = MyToolKit()
    hdb = DBHandler('data/model.db')
    hdb.connect()

    vocalised = []
    token_total = 0
    type_total = 0

    # ---------------------- Vocalisation by word ----------------------
    separators = ["\n", "\r", ".", ":", ",", ';']
    for raw_sent in tool.sents(text.strip(), separators,
                               subsent=['"', "'", '-']):
        # TODO: revisit this replace; removing the tatweel slightly alters
        # the words.
        cleaned = tool.normalizeArabicAlif(tool.DeleteDiacritic(raw_sent))
        marked = "# " + cleaned.replace('ـ', '') + " $"
        list_words = tool.words(marked)

        # Collect the candidate vocalisations for each word.
        options = {}
        for word in list_words:
            row = hdb.SelectOne('dictionary', 'type="' + word + '"',
                                attribute='vocabularies')
            options[word] = word if row is None else row[0]
        options['#'] = '#'
        options['$'] = '$'

        # Lattice: one column per word, one [-1, candidate, score] cell per
        # candidate; only the '#' marker starts with a score of 1.
        lattice = [
            [[-1, cand, 1 if cand == "#" else 0]
             for cand in tool.words(options[word])]
            for word in list_words
        ]

        decoded = tool.words(self.Viterbi(lattice, smooth_const))
        # Strip the leading '#' and trailing '$' markers.
        vocalised.append(' '.join(decoded[1:-1]))
        token_total += len(list_words) - 2
        type_total += len(options) - 2

    # --------------------- Vocalisation by letter ---------------------
    vocalised = self.LettersVocaliser(vocalised, smooth_const)

    elapsed = round(time.time() - started, 2)
    return {
        'token': token_total,
        'type': type_total,
        'result': vocalised,
        'time': elapsed,
    }
def moushakeel_V2(self, text, smooth_const):
    """Add diacritics to ``text``: a word-level Viterbi pass, then a letter pass.

    Each sentence is stripped of diacritics, alif-normalised, cleared of the
    tatweel character and wrapped in ``#`` / ``$`` markers.  The candidate
    vocalisations of every word come from the ``dictionary`` table; words
    without a row are their own only candidate.  ``self.Viterbi`` decodes the
    candidate lattice and ``self.LettersVocaliser`` post-processes the list of
    decoded sentences.

    Returns:
        dict with ``token``, ``type``, ``result`` and ``time`` entries.
    """
    t_start = time.time()
    toolkit = MyToolKit()
    db = DBHandler('data/model.db')
    db.connect()

    summary = {'token': 0, 'type': 0}
    sentences_out = []

    # Word-level vocalisation.
    for piece in toolkit.sents(text.strip(), ["\n", "\r", ".", ":", ",", ';'],
                               subsent=['"', "'", '-']):
        # TODO: revisit the replace below; dropping the tatweel changes the
        # words a little.
        bare = toolkit.normalizeArabicAlif(toolkit.DeleteDiacritic(piece))
        words = toolkit.words("# " + bare.replace('ـ', '') + " $")

        # Candidate vocalisations for each word.
        vocab = {}
        for w in words:
            found = db.SelectOne('dictionary', 'type="' + w + '"',
                                 attribute='vocabularies')
            if found is None:
                vocab[w] = w
            else:
                vocab[w] = found[0]
        vocab['#'] = '#'
        vocab['$'] = '$'

        # Build the lattice column by column.
        columns = []
        for w in words:
            column = []
            for candidate in toolkit.words(vocab[w]):
                initial = 1 if candidate == "#" else 0
                column.append([-1, candidate, initial])
            columns.append(column)

        best_path = toolkit.words(self.Viterbi(columns, smooth_const))
        # The first and last entries are the '#' and '$' markers.
        sentences_out.append(' '.join(best_path[1:-1]))
        summary['token'] += len(words) - 2
        summary['type'] += len(vocab) - 2

    # Letter-level vocalisation.
    sentences_out = self.LettersVocaliser(sentences_out, smooth_const)

    summary['result'] = sentences_out
    summary['time'] = round(time.time() - t_start, 2)
    return summary
def LaplaceSmoothing_letters(self, w1, w2, u, v):
    """Return the additively smoothed score of ``w1`` following context ``w2``.

    Computes ``(freq(w2 + w1) + u) / (freq(w2) + v * u)``.  ``freq(w2)`` is
    read from the ``letters_grams3`` table and ``freq(w2 + w1)`` from
    ``letters_grams4``; a missing row counts as 0.

    The frequency of ``w1`` in the ``letters`` table used to be fetched here
    as well, but it never entered the result, so that query was removed.

    Args:
        w1: String appended to the context to form the longer gram key.
        w2: Context string used as the ``letters_grams3`` key.
        u: Constant added to the numerator.
        v: Multiplier of ``u`` in the denominator.
    """
    hdb = DBHandler('data/model.db')
    hdb.connect()

    context_row = hdb.SelectOne('letters_grams3', 'grams="' + w2 + '"',
                                attribute='freq')
    count_w2 = context_row[0] if context_row is not None else 0

    joint_row = hdb.SelectOne('letters_grams4', 'grams="' + w2 + w1 + '"',
                              attribute='freq')
    count_w1_w2 = joint_row[0] if joint_row is not None else 0

    return (count_w1_w2 + u) / (count_w2 + v * u)
def AbsoluteDiscountingSmoothing_letters(self, w2, w1, d, v):
    """Return the absolute-discounting score for ``w1`` after context ``w2``.

    Three frequencies are read (a missing row counts as 0):

    * ``letters`` row for ``w1``
    * ``letters_grams4`` row for ``w2 + w1``
    * ``abs_letters`` row for ``w1``

    The result is ``max(joint - d, 0) / letter + (d * abs * (1 / v)) / letter``,
    or ``0`` when the ``letters`` frequency of ``w1`` is 0.
    """
    db = DBHandler('data/model.db')
    db.connect()

    def first_or_zero(row):
        # SelectOne yields None when no row matches.
        return 0 if row is None else row[0]

    letter_row = db.SelectOne('letters', 'letter="' + w1 + '"',
                              attribute='freq')
    joint_row = db.SelectOne('letters_grams4', 'grams="' + w2 + w1 + '"',
                             attribute='freq')
    abs_row = db.SelectOne('abs_letters', 'letter="' + w1 + '"',
                           attribute='freq')

    letter_freq = first_or_zero(letter_row)
    joint_freq = first_or_zero(joint_row)
    abs_freq = first_or_zero(abs_row)

    if letter_freq == 0:
        return 0

    discounted = max([joint_freq - d, 0]) / letter_freq
    backoff = (d * abs_freq * (1 / v)) / letter_freq
    return discounted + backoff
def AbsoluteDiscountingSmoothing_letters(self, w2, w1, d, v):
    """Score ``w1`` given the preceding context ``w2`` with absolute discounting.

    Frequencies come from the ``letters`` (key ``w1``), ``letters_grams4``
    (key ``w2 + w1``) and ``abs_letters`` (key ``w1``) tables; a lookup that
    finds nothing contributes 0.  Returns ``0`` when ``w1`` has no ``letters``
    frequency, otherwise the discounted relative frequency plus the
    ``d * abs_freq * (1 / v)`` term, both divided by the ``letters`` frequency.
    """
    store = DBHandler('data/model.db')
    store.connect()

    # All three lookups are issued before any value is inspected.
    rows = (
        store.SelectOne('letters', 'letter="' + w1 + '"', attribute='freq'),
        store.SelectOne('letters_grams4', 'grams="' + w2 + w1 + '"',
                        attribute='freq'),
        store.SelectOne('abs_letters', 'letter="' + w1 + '"',
                        attribute='freq'),
    )
    n_letter, n_joint, n_abs = [
        row[0] if row is not None else 0 for row in rows
    ]

    if n_letter == 0:
        return 0
    return (max([n_joint - d, 0]) / n_letter) + (d * n_abs * (1 / v)) / n_letter
def LaplaceSmoothing_letters(self, w1, w2, u, v):
    """Additively smoothed score of ``w1`` given the context ``w2``.

    Returns ``(freq(w2 + w1) + u) / (freq(w2) + v * u)`` with ``freq(w2)``
    taken from ``letters_grams3`` and ``freq(w2 + w1)`` from
    ``letters_grams4``.  Rows that do not exist count as 0.

    The earlier lookup of ``w1`` in the ``letters`` table was dropped: its
    value was never used in the returned expression, so it only cost one
    extra query per call.
    """
    hdb = DBHandler('data/model.db')
    hdb.connect()

    row = hdb.SelectOne('letters_grams3', 'grams="' + w2 + '"',
                        attribute='freq')
    if row is None:
        count_w2 = 0
    else:
        count_w2 = row[0]

    row = hdb.SelectOne('letters_grams4', 'grams="' + w2 + w1 + '"',
                        attribute='freq')
    if row is None:
        count_w1_w2 = 0
    else:
        count_w1_w2 = row[0]

    return (count_w1_w2 + u) / (count_w2 + v * u)
def getdict(request):
    """Return the stored vocalisations of every word of the posted text as JSON.

    The POST field ``text`` is stripped, alif-normalised and cleared of
    diacritics, then split into words.  Each word is looked up in the
    ``dictionary`` table; its vocalisations are shown separated by `` - ``,
    or as an empty string when the word has no row.

    The JSON object carries ``results`` (word to vocalisations), ``time``
    (elapsed seconds, two decimals), ``type`` (distinct words) and ``token``
    (total words).
    """
    started = time.time()
    tool = MyToolKit()
    hdb = DBHandler('data/model.db')
    hdb.connect()

    posted = request.POST['text'].strip()
    text = tool.DeleteDiacritic(tool.normalizeArabicAlif(posted))
    list_words = tool.words(text)

    vocalisations = {}
    for word in list_words:
        row = hdb.SelectOne('dictionary', 'type="' + word + '"',
                            attribute='vocabularies')
        vocalisations[word] = '' if row is None else re.sub(' ', ' - ', row[0])

    payload = {
        'results': vocalisations,
        'time': round(time.time() - started, 2),
        'type': len(vocalisations),
        'token': len(list_words),
    }
    return HttpResponse(json.dumps(payload), content_type="application/json")
def moushakeel_V1(self, text, smooth_const):
    """Vocalise ``text`` by exhaustively scoring candidate sentences.

    Every sentence returned by ``MyToolKit.sents`` is stripped of diacritics,
    alif-normalised and wrapped in ``#`` / ``$`` markers.  Candidate
    vocalisations per word are read from the ``dictionary`` table (a word
    without a row is its own candidate).  ``self.getPossibilities`` expands
    them into candidate sentences, ``self.sentProbaility`` scores each one and
    the highest-scoring candidate is kept.

    Args:
        text: Raw input text.
        smooth_const: Smoothing constant passed on to ``self.sentProbaility``.

    Returns:
        dict with ``token`` and ``type`` (word and distinct-word counts summed
        over all sentences, markers excluded), ``result`` (one vocalised
        string per sentence) and ``time`` (elapsed seconds, two decimals).
    """
    t1 = time.time()
    tool = MyToolKit()
    hdb = DBHandler('data/model.db')
    result = {}
    hdb.connect()
    result_teshkeel = []
    result['token'] = 0
    result['type'] = 0
    for sent in tool.sents(text.strip(), ["\n", "\r", ".", ":", ",", ';'],
                           subsent=['"', "'", '-']):
        # Use a separate name so the ``text`` argument is not rebound.
        wrapped = "# " + tool.normalizeArabicAlif(
            tool.DeleteDiacritic(sent)) + " $"
        list_words = tool.words(wrapped)

        # Collect the candidate vocalisations for each word.
        word_options = {}
        for word in list_words:
            # NOTE(review): this condition is concatenated from user text; a
            # '"' inside a word breaks or alters the query.  Use a
            # parameterised lookup if DBHandler provides one.
            res = hdb.SelectOne('dictionary', 'type="' + word + '"',
                                attribute='vocabularies')
            if res is None:
                word_options[word] = word
            else:
                word_options[word] = res[0]
        word_options['#'] = '#'
        word_options['$'] = '$'

        possibilities = self.getPossibilities(list_words, word_options)
        max_p = 0
        best_sent = ''
        for possib in possibilities:
            p = self.sentProbaility(possib, smooth_const)
            if p > max_p:
                # Update the running maximum; without this every candidate
                # with a positive score would overwrite the previous one.
                max_p = p
                best_sent = possib

        # Slice off the '#' and '$' markers before joining.
        result_teshkeel.append(' '.join(tool.words(best_sent)[1:-1]))
        # Sum over sentences instead of keeping only the last sentence.
        result['token'] += len(list_words) - 2
        result['type'] += len(word_options) - 2
    result['result'] = result_teshkeel
    result['time'] = round(time.time() - t1, 2)
    return result
def LaplaceSmoothing(self, w2, w1, u, v):
    """Additively smoothed score for the word pair ``"w1 w2"``.

    The pair frequency is looked up in ``grams2`` and the frequency of ``w2``
    in ``words``; a lookup that finds no row contributes 0.  The value
    returned is ``(pair + u) / (single + v * u)``.

    NOTE(review): the pair key starts with ``w1`` but the denominator counts
    ``w2``; verify that this matches how callers order the arguments.
    """
    handler = DBHandler('data/model.db')
    handler.connect()

    def freq_of(row):
        # SelectOne gives None for a missing row.
        if row is None:
            return 0
        return row[0]

    single = freq_of(
        handler.SelectOne('words', 'word="' + w2 + '"', attribute='freq'))
    pair = freq_of(
        handler.SelectOne('grams2', 'grams="' + w1 + ' ' + w2 + '"',
                          attribute='freq'))
    return (pair + u) / (single + v * u)
def LettersVocaliser(self,sents,smooth_const):
    """Vocalise, letter by letter, the parts of ``sents`` that are still bare.

    The ``letters_dictionary`` table is loaded into a mapping from ``type`` to
    ``vocabularies``.  ``self.getNotVocalised(sents)`` supplies, per sentence,
    the strings to process.  For each string a lattice (``matrice``) is built
    with one column per character, decoded with ``self.ViterbiLetter``,
    aligned with ``self.alignLetter`` and substituted back into the sentence.

    Args:
        sents: List of sentence strings; it is modified in place.
        smooth_const: Smoothing constant forwarded to ``self.ViterbiLetter``.

    Returns:
        The same ``sents`` list with the substitutions applied.
    """
    tool = MyToolKit()
    hdb = DBHandler('data/model.db')
    hdb.connect()
    res = hdb.getFromTable('letters_dictionary',attribute='type,vocabularies')
    # NOTE(review): this local shadows the builtin ``dict``.
    dict = {}
    for r in res:
        dict[r[0]] = r[1]
    not_vocalised = self.getNotVocalised(sents)
    # Walk through the sentences.
    for i in range(len(sents)):
        # Walk through the non-vocalised strings of this sentence.
        for j in range(len(not_vocalised[i])):
            matrice = []
            char = tool.LettersDiac(not_vocalised[i][j])
            # The first three columns are fixed single-cell columns: the first
            # two entries of ``char`` followed by a '#' cell.
            # presumably these are already-vocalised context letters and a
            # word boundary; TODO confirm against getNotVocalised
            list_dict = []
            list_dict.append([-1,tool.HideChar(char[0],expect=['#',' ']),1])
            matrice.append(list_dict)
            list_dict = []
            list_dict.append([0,tool.HideChar(char[1],expect=['#',' ']),1])
            matrice.append(list_dict)
            list_dict = []
            list_dict.append([0,"#",1])
            matrice.append(list_dict)
            string = tool.DeleteDiacritic(not_vocalised[i][j])
            # Walk through the characters, starting after the three fixed
            # columns added above.
            k = 3
            while k < len(string):
                list_dict = []
                # Alif followed by lam right after a '#' gets two fixed
                # columns instead of a dictionary lookup.
                # NOTE(review): string[k+1] is read without a bounds check, so
                # an alif in last position after '#' would raise IndexError.
                if string[k-1] == "#" and string[k] == 'ا' and string[k+1] == 'ل':
                    list_dict = []
                    list_dict.append([0,'_',1])
                    matrice.append(list_dict)
                    list_dict = []
                    list_dict.append([0,'_ْ',1])
                    matrice.append(list_dict)
                    k += 2
                else :
                    # Walk through the candidates of this character.
                    # NOTE(review): a character that is not a key of ``dict``
                    # raises KeyError here.
                    for possib in tool.words(dict[string[k]]):
                        if possib == "#":
                            list_dict.append([-1,possib,1])
                        else :
                            list_dict.append([-1,tool.HideChar(possib,expect=['#',' ']),0])
                    matrice.append(list_dict)
                    k += 1
            v = self.ViterbiLetter(matrice,smooth_const)
            string = self.alignLetter(v,not_vocalised[i][j])
            # Replace the bare string by its vocalised form; '#' is turned
            # into a space and the ends are stripped on both sides first.
            sents[i] = sents[i].replace(not_vocalised[i][j].replace('#',' ').strip(),string.replace('#',' ').strip())
            # Disabled debugging code, kept as a no-op string literal.
            """ if v[:2] == "##" : n = 0 else: n = 1 string1 = re.sub('#+',' ',not_vocalised[i][j]).strip().split(' ')) string2 = re.sub('#+',' ',v).strip().split(' ') pprint([0]) pprint(string) pprint('------------') """
        # Disabled debugging code, kept as a no-op string literal.
        """ for a in matrice: #matrice[i-1][k][1] pprint(a[0][0]) exit() """
    return sents
def LettersVocaliser(self, sents, smooth_const):
    """Run the letter-level vocalisation pass over ``sents``.

    Loads ``letters_dictionary`` (``type`` to ``vocabularies``), asks
    ``self.getNotVocalised(sents)`` for the strings to process in each
    sentence, and for every such string builds a lattice with one column per
    character.  The lattice is decoded with ``self.ViterbiLetter``, the
    decoded text is aligned with ``self.alignLetter`` and written back into
    the sentence.

    Args:
        sents: List of sentence strings; entries are replaced in place.
        smooth_const: Smoothing constant forwarded to ``self.ViterbiLetter``.

    Returns:
        The ``sents`` list after all substitutions.
    """
    tool = MyToolKit()
    hdb = DBHandler('data/model.db')
    hdb.connect()
    res = hdb.getFromTable('letters_dictionary',
                           attribute='type,vocabularies')
    # NOTE(review): this local shadows the builtin ``dict``.
    dict = {}
    for r in res:
        dict[r[0]] = r[1]
    not_vocalised = self.getNotVocalised(sents)
    # Iterate over the sentences.
    for i in range(len(sents)):
        # Iterate over the non-vocalised strings of the sentence.
        for j in range(len(not_vocalised[i])):
            matrice = []
            char = tool.LettersDiac(not_vocalised[i][j])
            # Three fixed single-cell columns open the lattice: char[0],
            # char[1] and a '#' cell.
            # presumably two context letters and a word boundary; TODO
            # confirm against getNotVocalised
            list_dict = []
            list_dict.append(
                [-1, tool.HideChar(char[0], expect=['#', ' ']), 1])
            matrice.append(list_dict)
            list_dict = []
            list_dict.append(
                [0, tool.HideChar(char[1], expect=['#', ' ']), 1])
            matrice.append(list_dict)
            list_dict = []
            list_dict.append([0, "#", 1])
            matrice.append(list_dict)
            string = tool.DeleteDiacritic(not_vocalised[i][j])
            # Iterate over the characters that follow the fixed columns.
            k = 3
            while k < len(string):
                list_dict = []
                # Alif + lam directly after '#' is emitted as two fixed
                # columns and both characters are consumed.
                # NOTE(review): string[k + 1] has no bounds check; an alif in
                # last position after '#' would raise IndexError.
                if string[k - 1] == "#" and string[k] == 'ا' and string[
                        k + 1] == 'ل':
                    list_dict = []
                    list_dict.append([0, '_', 1])
                    matrice.append(list_dict)
                    list_dict = []
                    list_dict.append([0, '_ْ', 1])
                    matrice.append(list_dict)
                    k += 2
                else:
                    # Iterate over the candidates of this character.
                    # NOTE(review): a character missing from ``dict`` raises
                    # KeyError here.
                    for possib in tool.words(dict[string[k]]):
                        if possib == "#":
                            list_dict.append([-1, possib, 1])
                        else:
                            list_dict.append([
                                -1,
                                tool.HideChar(possib, expect=['#', ' ']), 0
                            ])
                    matrice.append(list_dict)
                    k += 1
            v = self.ViterbiLetter(matrice, smooth_const)
            string = self.alignLetter(v, not_vocalised[i][j])
            # Substitute the vocalised form for the bare one; '#' becomes a
            # space and the ends are stripped on both sides first.
            sents[i] = sents[i].replace(
                not_vocalised[i][j].replace('#', ' ').strip(),
                string.replace('#', ' ').strip())
            # Disabled debugging code, kept as a no-op string literal.
            """ if v[:2] == "##" : n = 0 else: n = 1 string1 = re.sub('#+',' ',not_vocalised[i][j]).strip().split(' ')) string2 = re.sub('#+',' ',v).strip().split(' ') pprint([0]) pprint(string) pprint('------------') """
        # Disabled debugging code, kept as a no-op string literal.
        """ for a in matrice: #matrice[i-1][k][1] pprint(a[0][0]) exit() """
    return sents