def LaplaceSmoothing_letters(self, w1, w2, u, v):
    """Return the additively smoothed ratio of a letter gram to its prefix.

    The frequency of ``w2`` is read from the ``letters_grams3`` table and the
    frequency of the concatenation ``w2 + w1`` from ``letters_grams4``. A
    lookup that returns ``None`` is counted as a frequency of 0.

    Args:
        w1: Letter appended to ``w2`` to build the longer gram.
        w2: Gram looked up in ``letters_grams3`` (presumably the letters that
            precede ``w1`` -- confirm against the callers).
        u: Smoothing constant added to the numerator.
        v: Factor multiplying ``u`` in the denominator (presumably the size
            of the letter vocabulary -- confirm).

    Returns:
        ``(freq(w2 + w1) + u) / (freq(w2) + v * u)``.
    """
    hdb = DBHandler('data/model.db')
    hdb.connect()

    def gram_freq(table, gram):
        # SelectOne may return None (presumably when no row matches);
        # such a gram contributes a frequency of 0.
        row = hdb.SelectOne(table, 'grams="' + gram + '"', attribute='freq')
        return row[0] if row is not None else 0

    # NOTE: the standalone frequency of w1 does not appear in the formula,
    # so the 'letters' table is not queried here.
    count_w2 = gram_freq('letters_grams3', w2)
    count_w1_w2 = gram_freq('letters_grams4', w2 + w1)

    return (count_w1_w2 + u) / (count_w2 + v * u)
def AbsoluteDiscountingSmoothing_letters(self, w2, w1, d, v):
    """Return a discounted letter-gram score built from three stored counts.

    Reads the frequency of ``w1`` from ``letters``, of ``w2 + w1`` from
    ``letters_grams4`` and of ``w1`` from ``abs_letters``; any lookup that
    returns ``None`` is counted as 0.

    Args:
        w2: Gram placed before ``w1`` when querying ``letters_grams4``.
        w1: Letter whose counts are read from ``letters`` and ``abs_letters``.
        d: Discount subtracted from the gram count (floored at 0).
        v: Divisor applied to the ``abs_letters`` term via ``1 / v``.

    Returns:
        0 when ``w1`` has no recorded frequency in ``letters``; otherwise
        ``max(gram - d, 0) / letter + (d * abs * (1 / v)) / letter``.
    """
    hdb = DBHandler('data/model.db')
    hdb.connect()

    def first_or_zero(row):
        # A None row (no result from SelectOne) stands for a zero count.
        return 0 if row is None else row[0]

    letter_row = hdb.SelectOne(
        'letters', 'letter="' + w1 + '"', attribute='freq')
    gram_row = hdb.SelectOne(
        'letters_grams4', 'grams="' + w2 + w1 + '"', attribute='freq')
    abs_row = hdb.SelectOne(
        'abs_letters', 'letter="' + w1 + '"', attribute='freq')

    letter_total = first_or_zero(letter_row)
    gram_total = first_or_zero(gram_row)
    abs_total = first_or_zero(abs_row)

    # Nothing to normalise by: avoid dividing by zero.
    if letter_total == 0:
        return 0

    discounted_term = max(gram_total - d, 0) / letter_total
    redistributed_term = (d * abs_total * (1 / v)) / letter_total
    return discounted_term + redistributed_term
def LaplaceSmoothing(self, w2, w1, u, v):
    """Return the additively smoothed ratio of a word bigram to a word count.

    The frequency of ``w2`` is read from the ``words`` table and the frequency
    of the string ``w1 + ' ' + w2`` from the ``grams2`` table. A lookup that
    returns ``None`` is counted as 0.

    Args:
        w2: Word looked up in ``words`` and placed second in the bigram.
        w1: Word placed first in the bigram.
        u: Smoothing constant added to the numerator.
        v: Factor multiplying ``u`` in the denominator (presumably the
            vocabulary size -- confirm).

    Returns:
        ``(freq(w1 + ' ' + w2) + u) / (freq(w2) + v * u)``.
    """
    hdb = DBHandler('data/model.db')
    hdb.connect()

    def stored_freq(table, condition):
        # Missing rows (None) are treated as a frequency of 0.
        row = hdb.SelectOne(table, condition, attribute='freq')
        if row is None:
            return 0
        return row[0]

    word_total = stored_freq('words', 'word="' + w2 + '"')
    bigram_total = stored_freq('grams2', 'grams="' + w1 + ' ' + w2 + '"')

    numerator = bigram_total + u
    denominator = word_total + v * u
    return numerator / denominator
def moushakeel_V2(self, text, smooth_const):
    """Vocalize ``text`` word by word with Viterbi, then letter by letter.

    The text is split into sentences; each sentence is stripped of
    diacritics, alif-normalized, cleared of the tatweel character and wrapped
    in ``#`` / ``$`` boundary markers. For every token the ``dictionary``
    table supplies the candidate forms (the token itself when there is no
    entry), the candidates are handed to ``self.Viterbi`` and the decoded
    sentence, minus its boundary markers, is collected. The collected
    sentences are finally passed through ``self.LettersVocaliser``.

    Args:
        text: Raw input text.
        smooth_const: Smoothing constant forwarded to ``self.Viterbi`` and
            ``self.LettersVocaliser``.

    Returns:
        A dict with the keys ``token`` (tokens processed, boundary markers
        excluded, summed over sentences), ``type`` (candidate-table sizes
        minus the two markers, summed over sentences), ``result`` (output of
        ``self.LettersVocaliser``) and ``time`` (elapsed seconds, rounded to
        two decimals).
    """
    started = time.time()
    tool = MyToolKit()
    hdb = DBHandler('data/model.db')
    hdb.connect()
    result = {'token': 0, 'type': 0}
    vocalized = []

    # ---------------------- Vocalization by word ----------------------
    sentences = tool.sents(text.strip(),
                           ["\n", "\r", ".", ":", ",", ';'],
                           subsent=['"', "'", '-'])
    for raw_sent in sentences:
        # TODO: revisit this replace, it slightly alters the words (tatweel).
        cleaned = tool.normalizeArabicAlif(tool.DeleteDiacritic(raw_sent))
        marked = "# " + cleaned.replace('ـ', '') + " $"
        tokens = tool.words(marked)

        # Fetch the candidate forms of every token.
        candidates = {}
        for token in tokens:
            row = hdb.SelectOne('dictionary',
                                'type="' + token + '"',
                                attribute='vocabularies')
            candidates[token] = token if row is None else row[0]
        candidates['#'] = '#'
        candidates['$'] = '$'

        # HMM lattice: one column per token, one [-1, form, score] cell per
        # candidate; only the start marker "#" begins with a score of 1.
        # (presumably [back-pointer, form, probability] -- confirm in Viterbi)
        lattice = [
            [[-1, form, 1 if form == "#" else 0]
             for form in tool.words(candidates[token])]
            for token in tokens
        ]

        decoded_words = tool.words(self.Viterbi(lattice, smooth_const))
        # Drop the leading "#" and trailing "$" markers.
        vocalized.append(' '.join(decoded_words[1:-1]))

        result['token'] += len(tokens) - 2
        result['type'] += len(candidates) - 2

    # --------------------- Vocalization by letter ---------------------
    vocalized = self.LettersVocaliser(vocalized, smooth_const)

    result['result'] = vocalized
    result['time'] = round(time.time() - started, 2)
    return result
def getdict(request):
    """Return, as JSON, the dictionary entries of every word in the posted text.

    The ``text`` POST field is stripped, alif-normalized and cleared of
    diacritics, then tokenized. Each distinct word is looked up in the
    ``dictionary`` table; the stored ``vocabularies`` string is returned with
    its spaces replaced by ``' - '``, or ``''`` when the word has no entry.

    Args:
        request: Request object exposing the submitted text as
            ``request.POST['text']``.

    Returns:
        An ``HttpResponse`` with content type ``application/json`` whose body
        is an object with the keys ``results`` (word -> entry), ``time``
        (elapsed seconds, rounded to two decimals), ``type`` (number of
        distinct words) and ``token`` (number of tokens).
    """
    started = time.time()
    tool = MyToolKit()
    hdb = DBHandler('data/model.db')
    hdb.connect()

    text = tool.DeleteDiacritic(
        tool.normalizeArabicAlif(request.POST['text'].strip()))
    list_words = tool.words(text)

    vocabularies = {}
    for word in list_words:
        # A repeated word would overwrite its entry with the same value,
        # so the lookup is done only once per distinct word.
        if word in vocabularies:
            continue
        # The word comes from user input and is embedded in a double-quoted
        # SQL literal: double any embedded quote so it cannot terminate the
        # literal and alter the condition.
        quoted = word.replace('"', '""')
        res = hdb.SelectOne('dictionary',
                            'type="' + quoted + '"',
                            attribute='vocabularies')
        if res is None:
            vocabularies[word] = ''
        else:
            vocabularies[word] = res[0].replace(' ', ' - ')

    result = {
        'results': vocabularies,
        'time': round(time.time() - started, 2),
        'type': len(vocabularies),
        'token': len(list_words),
    }
    return HttpResponse(json.dumps(result), content_type="application/json")
def moushakeel_V1(self, text, smooth_const):
    """Vocalize ``text`` by scoring every candidate sentence exhaustively.

    The text is split into sentences; each sentence is stripped of
    diacritics, alif-normalized and wrapped in ``#`` / ``$`` boundary
    markers. For every token the ``dictionary`` table supplies the candidate
    forms (the token itself when there is no entry).
    ``self.getPossibilities`` expands them into candidate sentences and the
    one with the highest ``self.sentProbaility`` score is kept, minus its
    boundary markers.

    Args:
        text: Raw input text.
        smooth_const: Smoothing constant forwarded to
            ``self.sentProbaility``.

    Returns:
        A dict with the keys ``token`` (tokens processed, boundary markers
        excluded, summed over sentences), ``type`` (candidate-table sizes
        minus the two markers, summed over sentences), ``result`` (list of
        the best sentence found for each input sentence) and ``time``
        (elapsed seconds, rounded to two decimals).
    """
    t1 = time.time()
    tool = MyToolKit()
    hdb = DBHandler('data/model.db')
    hdb.connect()
    result = {'token': 0, 'type': 0}
    result_teshkeel = []

    sentences = tool.sents(text.strip(),
                           ["\n", "\r", ".", ":", ",", ';'],
                           subsent=['"', "'", '-'])
    for sent in sentences:
        marked = "# " + tool.normalizeArabicAlif(
            tool.DeleteDiacritic(sent)) + " $"
        list_words = tool.words(marked)

        # Fetch the candidate forms of every token.
        candidates = {}
        for word in list_words:
            res = hdb.SelectOne('dictionary',
                                'type="' + word + '"',
                                attribute='vocabularies')
            candidates[word] = word if res is None else res[0]
        candidates['#'] = '#'
        candidates['$'] = '$'

        possibilities = self.getPossibilities(list_words, candidates)

        # Keep the candidate sentence with the highest probability. The
        # running maximum must be updated on every improvement, otherwise
        # any later candidate with a non-zero score would replace a better
        # earlier one.
        max_p = 0
        best_sent = ''
        for possib in possibilities:
            p = self.sentProbaility(possib, smooth_const)
            if p > max_p:
                max_p = p
                best_sent = possib

        # Drop the leading "#" and trailing "$" markers.
        best_words = tool.words(best_sent)
        result_teshkeel.append(' '.join(best_words[1:-1]))

        # Accumulate over all sentences instead of keeping only the last one.
        result['token'] += len(list_words) - 2
        result['type'] += len(candidates) - 2

    result['result'] = result_teshkeel
    result['time'] = round(time.time() - t1, 2)
    return result