def JOIN(text, trans, langin, langout, change, un): i = 0 j = 0 while i < len(trans): try: # print 'orig: ', text[i+1], 'trans:', trans[i] if change: prob = Translation.objects.get(orig=trans[i], lang_orig=langout, trans=text[i+1], lang_trans=langin).probability else: prob = Translation.objects.get(orig=text[i+1], lang_orig=langin, trans=trans[i], lang_trans=langout).probability # print 'ok' except: prob = 0.0 # print 'fail' if prob > 0.0: ngramm = make_string([text[i], text[i+1]]) res_text = text[0:i] + [ngramm] + text[i+2:len(text)] res_trans = trans[0:i] + [trans[i]] + trans[i+2:len(trans)] un_new = uncertainty(res_text, langin, res_trans, langout, change) if un_new < un: text = res_text trans = res_trans un = un_new print 'HYPO: ', make_string(trans), ' PP: ', un i += 1 return {'orig': text, 'trans': trans, 'un': un}
def JOIN(text, trans, langin, langout, change, un): i = 0 j = 0 while i < len(trans): try: # print 'orig: ', text[i+1], 'trans:', trans[i] if change: prob = Translation.objects.get(orig=trans[i], lang_orig=langout, trans=text[i + 1], lang_trans=langin).probability else: prob = Translation.objects.get(orig=text[i + 1], lang_orig=langin, trans=trans[i], lang_trans=langout).probability # print 'ok' except: prob = 0.0 # print 'fail' if prob > 0.0: ngramm = make_string([text[i], text[i + 1]]) res_text = text[0:i] + [ngramm] + text[i + 2:len(text)] res_trans = trans[0:i] + [trans[i]] + trans[i + 2:len(trans)] un_new = uncertainty(res_text, langin, res_trans, langout, change) if un_new < un: text = res_text trans = res_trans un = un_new print 'HYPO: ', make_string(trans), ' PP: ', un i += 1 return {'orig': text, 'trans': trans, 'un': un}
def join_by_n(words, n):
    """Return, for every start index k, the up-to-n words beginning at k
    joined into one string (shorter near the tail of the list)."""
    # words[k:k + n] clamps at len(words), which is exactly what the
    # separate tail branch of the loop version computed.
    return [make_string(words[k:k + n]) for k in range(len(words))]
def join_by_n(words, n):
    """Build overlapping n-word windows: element i is words[i:i + n]
    joined into a single string (truncated near the end of the list).

    NOTE(review): the window advances by 1 each step, so consecutive
    windows overlap; the original trailing `# n` comment hints a step of
    n (disjoint chunks) may have been intended — confirm with callers.
    """
    tmp = []
    i = 0
    while i < len(words):
        # Python slicing clamps at len(words), so the original's separate
        # tail branch (words[i:len(words)]) was redundant.
        tmp.append(make_string(words[i:i + n]))
        i += 1
    return tmp
def JOIN_P(text, trans, langin, langout, change, pp): print 'JOIN' i = 0 j = 0 while i < len(trans): try: # print 'orig: ', text[i+1], 'trans:', trans[i] if change: prob = Translation.objects.get(orig=trans[i], lang_orig=langout, trans=text[i+1], lang_trans=langin).probability else: prob = Translation.objects.get(orig=text[i+1], lang_orig=langin, trans=trans[i], lang_trans=langout).probability # print 'ok' except: prob = 0.0 # print 'fail' if prob > 0.0: ngramm = make_string([text[i], text[i+1]]) res_text = text[0:i] + [ngramm] + text[i+2:len(text)] res_trans = trans[0:i] + [trans[i]] + trans[i+2:len(trans)] new_pp = perplexity(res_trans, langout, 5) # print 'HYPO: ', make_string(res_trans), ' PP: ', new_pp if new_pp <= pp: text = res_text trans = res_trans pp = new_pp # print 'HYPO: ', make_string(trans), ' PP: ', new_pp i += 1 return {'orig': text, 'trans': trans, 'pp': pp}
def uncertainty(orig, langin, trans, langout, change):
    """Score a translation hypothesis (lower is better).

    Combines a 5-gram entropy term over the translated text with the
    per-token log-probabilities of the chosen translations.  Returns the
    exponent itself when it exceeds 10 (avoiding huge pow results),
    otherwise 2 ** exponent.
    """
    t = make_string(trans)
    words = split_n_gramm(t)
    sum_entropy = 0.0
    i = 0
    n = 5
    while i < len(words):
        try:
            if i + n < len(words):
                sum_entropy += log(
                    n_gramm_estimation(words[i:i + n], langout, n), 2)
            else:
                sum_entropy += log(
                    n_gramm_estimation(words[i:len(words)], langout,
                                       len(words) - i), 2)
        except Exception:
            # Unknown n-gram / log(0): heavy penalty instead of a crash.
            sum_entropy += -99999
        # BUGFIX: the increment used to live inside the try block, so a
        # failing estimation skipped it and the loop never terminated.
        # Moved out, matching cross_entropy.
        i += n
    sum_max_prob = 0.0
    e_log = sum_entropy
    for (i, n_gramm) in enumerate(trans):
        try:
            # Existence guards: get() raising (-> penalty branch) when
            # either side is not a known n-gram.
            # NOTE(review): orig[i] is looked up with lang=langout here,
            # not langin — confirm this is intentional.
            t_gramm = Ngramm.objects.get(n_gramm=n_gramm, lang=langout)
            o_gramm = Ngramm.objects.get(n_gramm=orig[i], lang=langout)
            if change:
                sum_max_prob += log(
                    Translation.objects.get(
                        orig=n_gramm, lang_orig=langout,
                        trans=orig[i], lang_trans=langin).probability, 2)
            else:
                sum_max_prob += log(
                    Translation.objects.get(
                        orig=orig[i], lang_orig=langin,
                        trans=n_gramm, lang_trans=langout).probability, 2)
        except Exception:
            sum_max_prob += -99999
    power = -1 * (e_log + sum_max_prob / len(trans))
    if power > 10:
        return power
    return pow(2, power)
def language_model(word, seq, lang, size):
    """Add-one-smoothed conditional probability P(word | seq) from the
    n-gram table: (1 + count(seq word)) / (count(seq *) + V), where V is
    the number of distinct size-grams stored for `lang`.
    """
    phrase = make_string([seq, word])
    try:
        phrase_freq = Ngramm.objects.get(n_gramm=phrase,
                                         lang=lang).frequence
    except Exception:
        phrase_freq = 0
    try:
        # BUGFIX: aggregate() returns a dict like {'frequence__sum': x},
        # and the original used that dict directly in arithmetic below,
        # raising TypeError.  Extract the value; it is None when no rows
        # match, hence the `or 0`.
        seq_freq = Ngramm.objects.filter(
            n_gramm__istartswith=seq, lang=lang,
            n=size).aggregate(Sum('frequence'))['frequence__sum'] or 0
    except Exception:
        seq_freq = 0
    V = Ngramm.objects.filter(n=size, lang=lang).count()
    p = float(1 + phrase_freq) / (seq_freq + V)
    return p
def cross_entropy(text, langout, size):
    """Average base-2 log-likelihood of `text` under the size-gram
    language model for `langout`.

    NOTE(review): raises ZeroDivisionError for empty input, as the
    original did — confirm callers never pass an empty text.
    """
    total = 0.0  # renamed from `sum`, which shadowed the builtin
    text = make_string(text)
    # split into highest-order n-grams
    words = split_n_gramm(text)
    i = 0
    while i < len(words):
        try:
            if i + size < len(words):
                total += log(
                    n_gramm_estimation(words[i:i + size], langout, size), 2)
            else:
                total += log(
                    n_gramm_estimation(words[i:len(words)], langout,
                                       len(words) - i), 2)
        except Exception:
            # unknown n-gram / log(0): heavy penalty
            total += -99999
        i += size
    return total / len(words)
def uncertainty(orig, langin, trans, langout, change):
    """Score a translation hypothesis (lower is better): a 5-gram
    entropy term over the translated text plus the average per-token
    translation log-probability, negated; returned as an exponent when
    it exceeds 10, otherwise as 2 ** exponent."""
    t = make_string(trans)
    words = split_n_gramm(t)
    sum_entropy = 0.0
    i = 0
    n = 5  # n-gram order for the entropy term
    while i < len(words):
        try:
            if i+n < len(words):
                sum_entropy += log(n_gramm_estimation(words[i:i+n], langout, n), 2)
            else:
                # tail: fewer than n words remain
                sum_entropy += log(n_gramm_estimation(words[i:len(words)], langout, len(words)-i), 2)
            # NOTE(review): this increment is inside the try, so when the
            # estimation raises, i is never advanced and the loop spins
            # forever adding the penalty — likely should sit after the
            # except clause (compare cross_entropy).
            i += n
        except:
            # unknown n-gram / log(0): heavy penalty instead of a crash
            sum_entropy += -99999
    sum_max_prob = 0.0
    e_log = sum_entropy
    for (i, n_gramm) in enumerate(trans):
        try:
            # Existence guards: get() raises when either side is not a
            # known n-gram, routing to the penalty branch.
            # NOTE(review): orig[i] is looked up with lang=langout, not
            # langin — confirm intentional.
            t_gramm = Ngramm.objects.get(n_gramm=n_gramm, lang=langout)
            o_gramm = Ngramm.objects.get(n_gramm=orig[i], lang=langout)
            if change:
                sum_max_prob += log(Translation.objects.get(orig=n_gramm, lang_orig=langout, trans=orig[i], lang_trans=langin).probability, 2)
            else:
                sum_max_prob += log(Translation.objects.get(orig=orig[i], lang_orig=langin, trans=n_gramm, lang_trans=langout).probability, 2)
        except:
            sum_max_prob += -99999
    power = -1*(e_log + sum_max_prob/len(trans))
    if power > 10:
        # very large exponent: return it directly instead of 2 ** power
        return power
    return pow(2, power)
def language_model(word, seq, lang, size):
    """Add-one-smoothed conditional probability P(word | seq):
    (1 + count(seq word)) / (count(seq *) + V), with V the number of
    distinct size-grams stored for `lang`.
    """
    phrase = make_string([seq, word])
    try:
        phrase_freq = Ngramm.objects.get(n_gramm=phrase,
                                         lang=lang).frequency
    except Exception:
        phrase_freq = 0
    try:
        # BUGFIX: aggregate() returns {'frequency__sum': x}, not a
        # number; the original fed the dict into arithmetic below,
        # raising TypeError.  The sum is None when no rows match, hence
        # the `or 0`.
        seq_freq = Ngramm.objects.filter(
            n_gramm__istartswith=seq, lang=lang,
            n=size).aggregate(Sum('frequency'))['frequency__sum'] or 0
    except Exception:
        seq_freq = 0
    V = Ngramm.objects.filter(n=size, lang=lang).count()
    p = float(1 + phrase_freq) / (seq_freq + V)
    return p
def cross_entropy(text, langout, size):
    """Mean base-2 log-likelihood of `text` under the `size`-gram model
    of language `langout`."""
    acc = 0.0
    text = make_string(text)
    words = split_n_gramm(text)  # split into highest-order n-grams
    for start in range(0, len(words), size):
        # Slicing clamps at the end, so the tail chunk is shorter and is
        # scored with its actual length as the order.
        chunk = words[start:start + size]
        order = size if start + size < len(words) else len(words) - start
        try:
            acc += log(n_gramm_estimation(chunk, langout, order), 2)
        except:
            acc += -99999
    return acc / len(words)
def CHANGE(words, trans, langin, langout, change, un): i = 0 j = 0 n = len(words) # получаем все комбинации для change # while i <= n*n: # создаем гипотезу hypo for (j, word) in enumerate(words): # print j changes = CHANGE10(word, trans[j], langin, langout, change)['top_ten'] changes = [item for sublist in changes for item in sublist] for c in changes: hypo = trans[0:j] + [c] + trans[j+1:len(trans)] # new_pp = perplexity(trans, langout, 5) un_new = uncertainty(words, langin, hypo, langout, change) if un_new < un: trans = hypo un = un_new print 'HYPO: ', make_string(hypo), ' UNS: ', un_new break else: i += 1 return {'orig': words, 'trans': trans, 'un': un}
def CHANGE(words, trans, langin, langout, change, un): i = 0 j = 0 n = len(words) # получаем все комбинации для change # while i <= n*n: # создаем гипотезу hypo for (j, word) in enumerate(words): # print j changes = CHANGE10(word, trans[j], langin, langout, change)['top_ten'] changes = [item for sublist in changes for item in sublist] for c in changes: hypo = trans[0:j] + [c] + trans[j + 1:len(trans)] # new_pp = perplexity(trans, langout, 5) un_new = uncertainty(words, langin, hypo, langout, change) if un_new < un: trans = hypo un = un_new print 'HYPO: ', make_string(hypo), ' UNS: ', un_new break else: i += 1 return {'orig': words, 'trans': trans, 'un': un}
def JOIN_P(text, trans, langin, langout, change, pp): print 'JOIN' i = 0 j = 0 while i < len(trans): try: # print 'orig: ', text[i+1], 'trans:', trans[i] if change: prob = Translation.objects.get(orig=trans[i], lang_orig=langout, trans=text[i + 1], lang_trans=langin).probability else: prob = Translation.objects.get(orig=text[i + 1], lang_orig=langin, trans=trans[i], lang_trans=langout).probability # print 'ok' except: prob = 0.0 # print 'fail' if prob > 0.0: ngramm = make_string([text[i], text[i + 1]]) res_text = text[0:i] + [ngramm] + text[i + 2:len(text)] res_trans = trans[0:i] + [trans[i]] + trans[i + 2:len(trans)] new_pp = perplexity(res_trans, langout, 5) # print 'HYPO: ', make_string(res_trans), ' PP: ', new_pp if new_pp <= pp: text = res_text trans = res_trans pp = new_pp # print 'HYPO: ', make_string(trans), ' PP: ', new_pp i += 1 return {'orig': text, 'trans': trans, 'pp': pp}
def n_gramm_estimation(n_gramm, lang, size):
    """Probability of the last word of `n_gramm` given the preceding
    size - 1 words, via the smoothed language model."""
    last_word = make_string([n_gramm[size - 1]])
    history = make_string(n_gramm[0:size - 1])
    return language_model(last_word, history, lang, size)
def translating(msg, langin, langout, change):
    """Top-level translation driver: direct translation followed by an
    iterative improvement loop guided by uncertainty and perplexity.
    Returns the detokenized final translation string."""
    print 'START TRANSLATION', datetime.datetime.now()
    t = time.time()
    text = split_by_sentences(msg)
    result = []
    words = []
    for s in text:
        words += encode_phrase(s)
    # direct (word-for-word) translation
    trans = simple_translation(words, langin, langout, change)
    # uncertainty value for the direct translation
    un = uncertainty(words, langin, trans, langout, change)
    un_old = un
    print 'TRANS', make_final_string(trans), 'uns', un
    pp = perplexity(trans, langout, 5)
    # i counts consecutive non-improving rounds (capped at 100); it is
    # reset after every accepted improvement.  k counts exhausted
    # improve_n sweeps (hard stop above 9).  reject marks whether the
    # previous round improved anything.
    i = 0
    j = 0
    k = 1
    reject = True
    n = len(words)
    ok = True
    while i < 100:
        if not reject:
            # previous round improved: restart the counters
            i = 0
            k = 1
            j = 0
        print k
        pair = improve(words, trans, langin, langout, change, pp)
        tmp_words = pair['orig']
        tmp_trans = pair['trans']
        pp_new = pair['pp']
        for j in range(1, 4):
            pair = improve_n(j, tmp_words, tmp_trans, langin, langout, change, pp_new)
            tmp2_words = pair['orig']
            tmp2_trans = pair['trans']
            pp_new2 = pair['pp']
            un_new = uncertainty(tmp2_words, langin, tmp2_trans, langout, change)
            # accept only meaningful improvements (>= 0.001)
            if un - un_new >= 0.001:
                un = un_new
                words = tmp2_words
                trans = tmp2_trans
                pp = pp_new2
                print 'HYPO: ', make_string(trans), ' PP: ', pp
                reject = False
                break
            else:
                reject = True
        i += 1
        # j leaks out of the for loop: j == 3 means the sweep finished
        # without an accepted improvement at the last order.
        if j == 3:
            k += 1
        if k > 9:
            break
    print 'TRANS', make_final_string(trans), 'uns-old', un_old, 'uns', un_new
    print i
    print 'END TRANSLATION', datetime.datetime.now()
    t = time.time() - t
    print 'TIME: ', t
    # somewhere around here the final detokenization happens
    return make_final_string(trans)
def translating(msg, langin, langout, change):
    """Translate `msg` from langin to langout: compute a direct
    translation, then repeatedly apply improve/improve_n passes while
    the uncertainty keeps dropping.  Returns the final string."""
    print 'START TRANSLATION', datetime.datetime.now()
    t = time.time()
    text = split_by_sentences(msg)
    result = []
    words = []
    for s in text:
        words += encode_phrase(s)
    # direct translation
    trans = simple_translation(words, langin, langout, change)
    # uncertainty of the direct translation (baseline)
    un = uncertainty(words, langin, trans, langout, change)
    un_old = un
    print 'TRANS', make_final_string(trans), 'uns', un
    pp = perplexity(trans, langout, 5)
    # i: consecutive rounds without improvement (cap 100, reset on
    # success); k: exhausted-sweep counter (hard stop above 9);
    # reject: did the previous round fail to improve?
    i = 0
    j = 0
    k = 1
    reject = True
    n = len(words)
    ok = True
    while i < 100:
        if not reject:
            # an improvement was accepted last round: restart counters
            i = 0
            k = 1
            j = 0
        print k
        pair = improve(words, trans, langin, langout, change, pp)
        tmp_words = pair['orig']
        tmp_trans = pair['trans']
        pp_new = pair['pp']
        for j in range(1,4):
            pair = improve_n(j, tmp_words, tmp_trans, langin, langout, change, pp_new)
            tmp2_words = pair['orig']
            tmp2_trans = pair['trans']
            pp_new2 = pair['pp']
            un_new = uncertainty(tmp2_words, langin, tmp2_trans, langout, change)
            # only accept improvements of at least 0.001
            if un - un_new >= 0.001:
                un = un_new
                words = tmp2_words
                trans = tmp2_trans
                pp = pp_new2
                print 'HYPO: ', make_string(trans), ' PP: ', pp
                reject = False
                break
            else:
                reject = True
        i += 1
        # j retains its final for-loop value: 3 means the full sweep ran
        # without breaking out early.
        if j == 3:
            k += 1
        if k > 9:
            break
    print 'TRANS', make_final_string(trans), 'uns-old', un_old, 'uns', un_new
    print i
    print 'END TRANSLATION', datetime.datetime.now()
    t = time.time() - t
    print 'TIME: ', t
    # final detokenization happens somewhere around here
    return make_final_string(trans)
def spec_split(sp):
    """Return the two one-word-off splits of `sp` as (left, right)
    pairs: all-but-last + last, and first + all-but-first."""
    head_split = [make_string(sp[:-1]), sp[-1]]
    tail_split = [sp[0], make_string(sp[1:])]
    return [head_split, tail_split]
def spec_split(sp):
    """Produce both single-word splits of the sequence `sp`:
    (everything but the last word, the last word) and
    (the first word, everything but the first word)."""
    left_join = make_string(sp[0:len(sp) - 1])
    variant_one = [left_join, sp[len(sp) - 1]]
    variant_two = [sp[0], make_string(sp[1:len(sp)])]
    return [variant_one, variant_two]
def n_gramm_estimation(n_gramm, lang, size):
    """Score an n-gram: P(final word | first size - 1 words) under the
    add-one-smoothed language model."""
    head = n_gramm[0:size - 1]
    tail = n_gramm[size - 1]
    return language_model(make_string([tail]), make_string(head), lang, size)