import io
import os
import re

# NOTE: the remaining helpers referenced below (utils, DIGIT, split_sentece,
# norm_token, is_skip_token, norm_fix_common, get_candidate, the *Regex fixer
# classes, Tokenizer, Vocabulary, Q_VOCAB_NAME, BigramFixing and the
# vnnorm_stats* modules) are assumed to be imported or defined earlier in
# this file.


def vn_fix():
    vn_vocab = vnnorm_stats_unibi.load_vn_vocab()
    # fixing_map = vnnorm_stats.load_hard_fixing()
    hard_regex = HardRegex()
    hard_regex.load_from_file()
    one_fix = OneFixRegex()
    one_fix.load_from_file()
    multi_fix = MultiFixRegex()
    multi_fix.load_from_file()
    common_fixing = CommonRegex()
    common_fixing.load_from_file()

    from load_products import load_questions
    questions = load_questions()

    # io.open is required on Python 2 for the encoding argument.
    f_fix = io.open("q_fixing", "w", encoding="utf-8")
    question_norm1 = []
    for qs in questions:
        s = qs
        qs = unicode(qs)
        qs = qs.lower()
        # Run each fixer in turn over the lowercased question.
        qs = hard_regex.replace(qs)
        qs = common_fixing.replace(qs)
        qs = one_fix.replace(qs)
        qs = multi_fix.replace(qs)
        question_norm1.append(qs)
        f_fix.write(u"%s | %s\n" % (qs, s))
    f_fix.close()
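# The four fixer classes used in vn_fix() are consumed through the same
# load_from_file() / replace(text) interface. Below is a minimal sketch of
# that assumed shape; only the interface is taken from the call sites, and
# the tab-separated rule-file format is hypothetical.


class RegexFixerSketch(object):
    def __init__(self, path):
        self.path = path
        self.rules = []  # list of (compiled_pattern, replacement)

    def load_from_file(self):
        # Hypothetical format: one "pattern<TAB>replacement" rule per line.
        rule_file = io.open(self.path, encoding="utf-8")
        for line in rule_file:
            parts = line.rstrip(u"\n").split(u"\t")
            if len(parts) == 2:
                self.rules.append((re.compile(parts[0], re.UNICODE), parts[1]))
        rule_file.close()

    def replace(self, text):
        # Apply every rule in file order.
        for pattern, repl in self.rules:
            text = pattern.sub(repl, text)
        return text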
def stats(data="", path=""):
    vn_vocab, vn_bare_vocab, vn_long_vocab, vn_long_bare_vocab = load_vn_vocab()
    fixing_map = load_hard_fixing()
    wrong_words_counters = dict()
    bigram_counters = dict()
    if data != "":
        questions = data
    elif path != "":
        from load_products import load_question_from_file
        questions = load_question_from_file(path)
    else:
        from load_products import load_questions
        questions = load_questions()
    cdir = os.path.abspath(os.path.dirname(__file__))
    for qs in questions:
        qs = unicode(qs)
        qs = qs.lower()
        qs = norm_fix_common(qs, fixing_map)
        _tokens = split_sentece(qs)
        tokens = [utils.accent2bare(token) for token in _tokens]
        # Count out-of-vocabulary bare tokens and every adjacent bigram.
        for i in xrange(len(tokens)):
            if tokens[i] not in vn_bare_vocab:
                utils.add_dict_counter(wrong_words_counters, tokens[i])
            if i < len(tokens) - 1:
                utils.add_dict_counter(bigram_counters, u"%s %s" % (tokens[i], tokens[i + 1]))
    sorted_wrong_tokens = utils.sort_dict(wrong_words_counters)
    sorted_bigram_counter = utils.sort_dict(bigram_counters)
    f_wrong = io.open("%s/models/data/out/wrong_tokens.dat" % cdir, "w", encoding="utf-8")
    f_bigram_stats = io.open("%s/models/data/out/bigram_tokens.dat" % cdir, "w", encoding="utf-8")
    for kv in sorted_wrong_tokens:
        # Tokens containing digits are not treated as misspellings.
        if DIGIT.search(kv[0]) is not None:
            continue
        f_wrong.write(u"%s : %s\n" % (kv[0], kv[1]))
    f_wrong.close()
    for kv in sorted_bigram_counter:
        f_bigram_stats.write(u"%s : %s\n" % (kv[0], kv[1]))
    f_bigram_stats.close()
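# stats() and the functions below lean on utils.add_dict_counter and
# utils.sort_dict. Reference sketches of their assumed behavior (the real
# utils module is authoritative; nothing in this file calls these copies):


def add_dict_counter(counter, key):
    # Increment counter[key], creating the entry on first sight.
    counter[key] = counter.get(key, 0) + 1


def sort_dict(counter):
    # Return (key, count) pairs sorted by descending count.
    return sorted(counter.iteritems(), key=lambda kv: kv[1], reverse=True)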
def fix_question():
    bi_gram_fixing = BigramFixing()
    from load_products import load_questions
    questions = load_questions()
    f = io.open("stats/bare_question_fixing.dat", "w", encoding="utf-8")
    cc = 0
    for qs in questions:
        cc += 1
        print "\r%s" % cc,
        if qs is None or qs == "":
            continue
        try:
            fixed_bare, fix_accent = bi_gram_fixing.fix(qs)
            f.write(u"%s | %s | %s\n" % (fixed_bare, qs, fix_accent))
        except:
            # Skip questions the fixer cannot handle.
            continue
    f.close()
    print "\nDone"
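# BigramFixing is only consumed through fix(); the return shape is taken from
# the call site in fix_question(), everything else below is a hypothetical
# sketch of how a bigram-based fixer could satisfy that contract.


class BigramFixingSketch(object):
    def __init__(self, one_fix=None):
        # one_fix: token -> single unambiguous correction (cf. one_fix.dat).
        self.one_fix = one_fix or {}

    def fix(self, sentence):
        tokens = unicode(sentence).lower().split()
        fixed_bare = u" ".join([self.one_fix.get(t, t) for t in tokens])
        # The real class presumably also restores diacritics; this sketch
        # just echoes the bare form.
        fix_accent = fixed_bare
        return fixed_bare, fix_accent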
def extract_wrong_words():
    vn_vocab = load_vn_vocab()
    fixing_map = load_hard_fixing()
    # Debug hook: any tokens listed here are sanity-checked against the vocabulary.
    cc = []
    for c in cc:
        if c not in vn_vocab:
            print "Wrong", c
    wrong_words_counters = dict()
    from load_products import load_questions
    questions = load_questions()
    for q in questions:
        q = unicode(q).lower()
        tokens = split_sentece(q)
        for token in tokens:
            token = norm_token(token)
            if token in fixing_map:
                continue  # token already has a known hard fix
            if is_skip_token(token):
                continue
            if token not in vn_vocab:
                utils.add_dict_counter(wrong_words_counters, token)
    # Sort ascending by count; the most frequent wrong words sit at the tail.
    kvs = []
    for key, value in sorted(wrong_words_counters.iteritems(), key=lambda (k, v): (v, k)):
        kvs.append([key, value])
    TOP = 1000
    f = io.open("models/data/out/popular_wrong_words.dat", "w", encoding="utf-8")
    for i in xrange(1, min(TOP, len(kvs)) + 1):
        f.write(u"%s\n" % kvs[-i][0])
        print kvs[-i][0], kvs[-i][1]
    f.close()
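# norm_token() and is_skip_token() are assumed to do light token cleanup and
# filtering before the vocabulary lookup. Hypothetical sketches (suffixed
# *_sketch so the real module-level helpers are not shadowed):


def norm_token_sketch(token):
    # Assumed: trim surrounding punctuation and lowercase.
    return token.strip(u".,!?:;\"'()[]").lower()


def is_skip_token_sketch(token):
    # Hypothetical rules: empty tokens and pure numbers are not spell-checked.
    return token == u"" or token.isdigit()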
def export_bare_questions():
    from load_products import load_questions
    questions = load_questions()
    fixing_map = load_hard_fixing()
    f = io.open("out/bare_questions.dat", "w", encoding="utf-8")
    for qs in questions:
        qs = unicode(qs)
        qs = qs.lower()
        qs = norm_fix_common(qs, fixing_map)
        _tokens = split_sentece(qs)
        tokens = [utils.accent2bare(token) for token in _tokens]
        sentence = u" ".join(tokens)
        f.write(u"%s\n" % sentence)
    f.close()
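# utils.accent2bare is assumed to strip Vietnamese diacritics, e.g.
# u"tiếng" -> u"tieng". A minimal sketch: NFD decomposition splits off the
# combining marks (category Mn), while đ/Đ need special-casing because NFD
# does not decompose them.
import unicodedata


def accent2bare_sketch(token):
    token = token.replace(u"\u0111", u"d").replace(u"\u0110", u"D")  # đ/Đ
    decomposed = unicodedata.normalize("NFD", token)
    return u"".join(c for c in decomposed if unicodedata.category(c) != "Mn")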
def first_stats():
    tokenizer = Tokenizer()
    tokenizer.run()
    question_vocabulary = Vocabulary()
    from load_products import load_questions
    questions = load_questions()
    question_list = []
    cc = 0
    for question in questions:
        if cc % 10 == 0:
            print "\r%s" % cc,
        cc += 1
        sen = tokenizer.predict(question)
        sen = sen.lower()
        tokens = question_vocabulary.get_sentence_token_ids(sen)
        question_list.append(tokens)
    print "\n Saving..."
    question_vocabulary.save(Q_VOCAB_NAME)
    utils.pickle_save(question_list, "question_tokens.dat")
    print "Done"
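# Vocabulary is used via get_sentence_token_ids() and save(); an assumed
# minimal shape (the real class is authoritative):


class VocabularySketch(object):
    def __init__(self):
        self.word2id = {}

    def get_sentence_token_ids(self, sentence):
        # Map whitespace tokens to integer ids, growing the vocabulary lazily.
        ids = []
        for word in sentence.split():
            if word not in self.word2id:
                self.word2id[word] = len(self.word2id)
            ids.append(self.word2id[word])
        return ids

    def save(self, path):
        utils.pickle_save(self.word2id, path)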
def fix_wrong_words_heuristic(data="", path=""):
    vn_vocab = load_vn_vocab()
    fixing_map = load_hard_fixing()
    hard_regex = HardRegex()
    hard_regex.load_from_file()
    # Debug hook: any tokens listed here are sanity-checked against the vocabulary.
    cc = []
    for c in cc:
        if c not in vn_vocab:
            print "Wrong", c
    wrong_words_counters = dict()
    if data != "":
        questions = data
    elif path != "":
        from load_products import load_question_from_file
        questions = load_question_from_file(path)
    else:
        from load_products import load_questions
        questions = load_questions()
    cdir = os.path.abspath(os.path.dirname(__file__))
    f_fix = io.open("%s/models/data/out/fixing" % cdir, "w", encoding="utf-8")

    # Pass 1: count out-of-vocabulary tokens and the words observed directly
    # before (bi_backward) and after (bi_forward) each of them.
    bi_forward = dict()
    bi_backward = dict()
    question_norm1 = []
    for qs in questions:
        qs = unicode(qs)
        qs = qs.lower()
        qs = hard_regex.replace(qs)
        tokens = split_sentece(qs)
        qq = []
        ii = -1
        for token in tokens:
            ii += 1
            token = norm_token(token)
            if token in fixing_map:
                qq.append(fixing_map[token])
                continue
            if is_skip_token(token):
                continue
            if token not in vn_vocab:
                if ii > 0:
                    mm = bi_backward.setdefault(token, dict())
                    utils.add_dict_counter(mm, tokens[ii - 1])
                if ii < len(tokens) - 1:
                    mm = bi_forward.setdefault(token, dict())
                    utils.add_dict_counter(mm, tokens[ii + 1])
                utils.add_dict_counter(wrong_words_counters, token)
            qq.append(token)
        question_norm1.append(qq)
        f_fix.write(u"%s\n" % u" ".join(qq))
    f_fix.close()

    # Sort ascending by count; the most frequent wrong words sit at the tail.
    kvs = []
    for key, value in sorted(wrong_words_counters.iteritems(), key=lambda (k, v): (v, k)):
        kvs.append([key, value])
    TOP = 400
    f = io.open("%s/models/data/out/popular_wrong_words.dat" % cdir, "w", encoding="utf-8")
    for i in xrange(1, min(TOP, len(kvs)) + 1):
        f.write(u"%s\n" % kvs[-i][0])
        # print kvs[-i][0], kvs[-i][1]
    f.close()

    # Pass 2: for each popular wrong word, keep its strongest context words.
    # revert_f / revert_b map a context word back to the wrong words that
    # frequently appear next to it.
    candidates_f = dict()
    candidates_b = dict()
    revert_f = dict()
    revert_b = dict()
    T_TOP = 2  # context words kept per wrong word
    T_MIN = 8  # minimum co-occurrence count for a context word
    f_forward_exist = dict()
    f_backward_exist = dict()
    for i in xrange(1, min(TOP, len(kvs)) + 1):
        k = kvs[-i][0]
        forward_exist = k in bi_forward
        backward_exist = k in bi_backward
        f_forward_exist[k] = forward_exist
        f_backward_exist[k] = backward_exist
        if forward_exist:
            f_forward = utils.sort_dict(bi_forward[k])
            for j in xrange(min(T_TOP, len(f_forward))):
                if f_forward[j][1] > T_MIN:
                    revert_f.setdefault(f_forward[j][0], set()).add(k)
        if backward_exist:
            f_backward = utils.sort_dict(bi_backward[k])
            for j in xrange(min(T_TOP, len(f_backward))):
                if f_backward[j][1] > T_MIN:
                    revert_b.setdefault(f_backward[j][0], set()).add(k)

    # Pass 3: collect correction candidates. If a wrong word usually follows
    # context c, every in-vocabulary word also seen after c is a candidate
    # correction for it (and symmetrically for the forward direction).
    # b_stores / f_stores remember, per in-vocabulary word, its own contexts.
    b_stores = dict()
    f_stores = dict()
    for q in question_norm1:
        i = -1
        for token in q:
            i += 1
            if i < len(q) - 1:
                w_next = q[i + 1]
                if w_next in vn_vocab and token in revert_b:
                    # Save the backward word context of w_next.
                    bb = b_stores.setdefault(w_next, dict())
                    utils.add_dict_counter(bb, token)
                    # Add w_next as a candidate for every wrong word sharing
                    # this context.
                    for w in revert_b[token]:
                        d_cand = candidates_b.setdefault(w, dict())
                        utils.add_dict_counter(d_cand, w_next)
            if i > 0:
                w_before = q[i - 1]
                if w_before in vn_vocab and token in revert_f:
                    ff = f_stores.setdefault(w_before, dict())
                    utils.add_dict_counter(ff, token)
                    for w in revert_f[token]:
                        d_cand = candidates_f.setdefault(w, dict())
                        utils.add_dict_counter(d_cand, w_before)

    # Pass 4: rank candidates and emit single-candidate fixes (one_fix.dat)
    # and context-disambiguated fixes (multi_fix.dat).
    f = io.open("%s/models/data/out/fix_candidates" % cdir, "w", encoding="utf-8")
    one_fix = dict()
    f_one_fix = io.open("%s/models/data/out/one_fix.dat" % cdir, "w", encoding="utf-8")
    f_multi_fix = io.open("%s/models/data/out/multi_fix.dat" % cdir, "w", encoding="utf-8")
    N_MULTI = 2
    N_CONTEXT = 3  # context words written with each multi-candidate fix
    THRES_2 = 0.7  # minimum score for keeping a second candidate
    for k, v in b_stores.iteritems():
        b_stores[k] = utils.sort_dict(v)
    for k, v in f_stores.iteritems():
        f_stores[k] = utils.sort_dict(v)
    for k, v in candidates_b.iteritems():
        if not f_backward_exist[k]:
            continue
        ss = utils.sort_dict(v)
        ll = []
        l_candidates = []
        l_ref_scores = []
        for s in ss:
            ll.append(u"%s:%s " % (s[0], s[1]))
            l_candidates.append(s[0])
            l_ref_scores.append(s[1])
        f.write(u"%s:\n" % k)
        f.write(u"\t%s\n" % " ".join(ll))
        true_candidates, sorted_score = get_candidate(k, l_candidates, l_ref_scores)
        ll2 = []
        for i in xrange(len(true_candidates)):
            ll2.append(u"%s:%s " % (true_candidates[i], sorted_score[i]))
        f.write(u"\t%s\n" % " ".join(ll2))
        if len(sorted_score) > 1:  # guard against single-candidate results
            if sorted_score[1] < 1 and sorted_score[0] > 1:
                # One unambiguous fix: a single strong candidate.
                one_fix[k] = true_candidates[0]
            elif sorted_score[1] > THRES_2:
                # Two plausible candidates: record each with its context words
                # so the right one can be picked at fix time.
                for i in reversed(xrange(2)):
                    fix = true_candidates[i]
                    try:
                        ll_context = []
                        back_context = b_stores[fix]
                        for j in xrange(N_CONTEXT):
                            ll_context.append(back_context[j][0])
                        f_multi_fix.write(u"B\t%s\t%s\t%s\n" % (k, fix, " ".join(ll_context)))
                    except:
                        pass
        f.write(u"\n\n\n")
    for k, v in candidates_f.iteritems():
        if not f_forward_exist[k]:
            continue
        ss = utils.sort_dict(v)
        ll = []
        l_candidates = []
        l_ref_scores = []
        for s in ss:
            ll.append(u"%s:%s " % (s[0], s[1]))
            l_candidates.append(s[0])
            l_ref_scores.append(s[1])
        f.write(u"%s:\n" % k)
        f.write(u"\t%s\n" % " ".join(ll))
        true_candidates, sorted_score = get_candidate(k, l_candidates, l_ref_scores)
        ll2 = []
        for i in xrange(len(true_candidates)):
            ll2.append(u"%s:%s " % (true_candidates[i], sorted_score[i]))
        f.write(u"\t%s\n" % " ".join(ll2))
        if len(sorted_score) > 1:
            if sorted_score[1] < 1 and sorted_score[0] > 1:
                one_fix[k] = true_candidates[0]
            elif sorted_score[1] > THRES_2:
                for i in reversed(xrange(2)):
                    fix = true_candidates[i]
                    try:
                        ll_context = []
                        forward_context = f_stores[fix]
                        for j in xrange(N_CONTEXT):
                            ll_context.append(forward_context[j][0])
                        f_multi_fix.write(u"F\t%s\t%s\t%s\n" % (k, fix, " ".join(ll_context)))
                    except:
                        pass
    f.close()
    for k, v in one_fix.iteritems():
        f_one_fix.write(u"%s\t%s\n" % (k, v))
    f_one_fix.close()
    f_multi_fix.close()
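# Hypothetical driver showing how the passes chain together:
# fix_wrong_words_heuristic() writes one_fix.dat / multi_fix.dat, which the
# OneFixRegex / MultiFixRegex instances in vn_fix() presumably load back.
if __name__ == "__main__":
    stats()
    fix_wrong_words_heuristic()
    vn_fix()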