def filter_id_set_on_conf(conf, cand_id_set, corr):
    # keep the qids whose prediction matches (corr=True) or
    # mismatches (corr=False) the ground truth under this config
    ret_id_set = set()
    feat_name = conf[0]
    clas_name = conf[1]
    pred_root_path = os.path.join(data_cfg.pred_root_path, clas_name, feat_name)
    gt_root_path = data_cfg.gt_root_path
    for qid in cand_id_set:
        gt_fn = os.path.join(gt_root_path, str(qid) + ".pkl")
        pred_fn = os.path.join(pred_root_path, str(qid) + ".pkl")
        pred_meta = gen_utils.read_dict_from_pkl(pred_fn)
        gt = gen_utils.read_dict_from_pkl(gt_fn)
        pred_term = pred_meta['pred_term']
        if corr and gt == pred_term:
            ret_id_set.add(qid)
            continue
        if (not corr) and gt != pred_term:
            ret_id_set.add(qid)
            continue
    return ret_id_set
def evaluate_a_conf(conf):
    # top-k accuracy for one (feature, classifier, qid-list) config
    feat_type = conf[0]
    cls_type = conf[1]
    qid_list = gen_utils.read_dict_from_pkl(conf[2])
    pred_root_path = os.path.join(data_cfg.pred_root_path, cls_type, feat_type)
    gt_root_path = data_cfg.gt_root_path
    total_ins = 0
    acc_dict = {}
    for k in topk_list:
        acc_dict[k] = 0
    for qid in qid_list:
        total_ins += 1
        gt_fn = os.path.join(gt_root_path, str(qid) + ".pkl")
        pred_fn = os.path.join(pred_root_path, str(qid) + ".pkl")
        pred_meta = gen_utils.read_dict_from_pkl(pred_fn)
        gt = gen_utils.read_dict_from_pkl(gt_fn)
        for k in topk_list:
            if pred_k_correct(pred_meta['all_pred_probs'], gt, k):
                acc_dict[k] += 1
    for k in topk_list:
        acc_dict[k] = float(acc_dict[k]) / float(total_ins)
    return acc_dict
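# `pred_k_correct` is called above but not shown in this snippet. A minimal
# sketch of the assumed behavior: `all_pred_probs` is taken to be a score list
# aligned with the global `cand_list`, and the prediction counts as correct if
# the ground-truth term is among the k highest-scoring candidates.
def pred_k_correct(all_pred_probs, gt, k):
    # indices of the k largest scores (assumed alignment with cand_list)
    topk_idx = sorted(range(len(all_pred_probs)),
                      key=lambda i: all_pred_probs[i], reverse=True)[:k]
    topk_terms = [cand_list[i] for i in topk_idx]
    return gt in topk_terms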
def vec_bm25_func(feat_root_path, qid):
    feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    ret_feat = np.asarray([0.0] * len(cand_list))
    try:
        # org_feat maps candidate term -> BM25 score
        org_feat = gen_utils.read_dict_from_pkl(feat_fn)
        for key, val in org_feat.iteritems():
            ret_feat[cand_list.index(key)] = val
    except:  # feature file may not exist
        pass
    return ret_feat
def tag_pos(qid):
    qtokens = gen_utils.read_dict_from_pkl(
        os.path.join(data_cfg.root_path, str(qid) + "_tokens.pkl"))
    qpos = pos_tagger.tag(qtokens)
    pos_list = []
    for tok, pos in qpos:
        pos_list.append(pos)
    pos_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    gen_utils.write_dict_to_pkl(pos_list, pos_fn)
    return
def evaluate_a_conf(conf):
    # top-1 accuracy for one (feature, classifier, qid-list) config
    feat_type = conf[0]
    cls_type = conf[1]
    qid_list = gen_utils.read_dict_from_pkl(conf[2])
    pred_root_path = os.path.join(data_cfg.pred_root_path, cls_type, feat_type)
    gt_root_path = data_cfg.gt_root_path
    acc_count = 0
    total_ins = 0
    for qid in qid_list:
        total_ins += 1
        gt_fn = os.path.join(gt_root_path, str(qid) + ".pkl")
        pred_fn = os.path.join(pred_root_path, str(qid) + ".pkl")
        pred_meta = gen_utils.read_dict_from_pkl(pred_fn)
        gt = gen_utils.read_dict_from_pkl(gt_fn)
        if gt == pred_meta['pred_term']:
            acc_count += 1
    return acc_count / float(total_ins)
def vec_embed_sim(feat_root_path, qid, bin_size=10):
    feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    ret_feat = np.asarray([0.0] * bin_size * len(cand_list))
    try:
        org_feat = gen_utils.read_dict_from_pkl(feat_fn)
        # one bin_size-dim similarity histogram per candidate token
        for i in xrange(0, len(cand_list)):
            token = cand_list[i]
            ret_feat[i * bin_size:(i + 1) * bin_size] = org_feat[token]
    except:  # feature file may not exist
        pass
    return ret_feat
def context(qid, threshold):
    qtokens = gen_utils.read_dict_from_pkl(
        os.path.join(data_cfg.root_path, str(qid) + "_tokens.pkl"))  # check actual address
    conVectorsSt = []
    conVectorsDm = []
    targVectorsSt = []
    targVectorsDm = []
    # split the question tokens into target (candidate) and context vectors,
    # in both the standard and the domain embedding spaces
    for token in qtokens:
        qvecSt = standard_vectors.lookup(token)
        qvecDm = domain_vectors.lookup(token)
        if token in candidate_list:
            targVectorsSt.append(qvecSt)
            targVectorsDm.append(qvecDm)
        else:
            conVectorsSt.append(qvecSt)
            conVectorsDm.append(qvecDm)
    count = 0
    stanDistTotal = 0
    domDistTotal = 0
    for i in range(len(targVectorsSt)):
        count += 1
        targSt = targVectorsSt[i]
        targDm = targVectorsDm[i]
        for j in range(len(conVectorsSt)):
            conSt = conVectorsSt[j]
            conDm = conVectorsDm[j]
            stanDistTotal += vecDistance(targSt, conSt)
            domDistTotal += vecDistance(targDm, conDm)
    stanDistTotal = float(stanDistTotal) / count
    domDistTotal = float(domDistTotal) / count
    # the feature encodes which embedding space keeps the target
    # closer to its context
    if stanDistTotal > domDistTotal:
        context_feat = domDistTotal / stanDistTotal
    else:
        context_feat = -1 * (stanDistTotal / domDistTotal)
    con_feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    gen_utils.write_dict_to_pkl(context_feat, con_feat_fn)
    return
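# `vecDistance` is used above but not defined in this snippet. A minimal
# sketch, assuming it means cosine distance between two embedding vectors:
from scipy import spatial

def vecDistance(vec_a, vec_b):
    # cosine distance: 0 for identical directions, up to 2 for opposite ones
    return spatial.distance.cosine(vec_a, vec_b)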
def pred_a_cfg(cfg):
    feat_type = cfg[0]
    trn_split = cfg[2]
    eval_list = cfg[-1]
    feat_root_path = os.path.join(data_cfg.feat_root_path, feat_type)
    pred_root_path = os.path.join(data_cfg.pred_root_path, model_type, feat_type)
    if not os.path.exists(pred_root_path):
        os.makedirs(pred_root_path)
    model_root_path = os.path.join(data_cfg.model_root_path, feat_type,
                                   trn_split, model_type)
    model_fn = os.path.join(model_root_path, "model.pkl")
    model = gen_utils.read_dict_from_pkl(model_fn)
    qid_list = gen_utils.read_dict_from_pkl(eval_list)
    for qid in qid_list:
        print "Loading feature: ", qid
        feat_fn = os.path.join(feat_root_path, str(qid) + ".npz")
        if not os.path.exists(feat_fn):
            print "Warning: feature does not exist:", qid
            continue
        feat = np.load(feat_fn)['feat']
        feat = np.expand_dims(feat, axis=0)
        pred_label = cand_list[model.predict(feat)[0]]
        all_pred_score = model.decision_function(feat)[0].tolist()
        q_pred = {'all_pred_probs': all_pred_score, 'pred_term': pred_label}
        pred_fn = os.path.join(pred_root_path, str(qid) + ".pkl")
        gen_utils.write_dict_to_pkl(q_pred, pred_fn)
    return
def vec_mfe_func(feat_root_path, qid):
    feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    ret_feat = np.asarray([0.0] * len(cand_list))
    try:
        org_feat = gen_utils.read_dict_from_pkl(feat_fn)
        # the most frequent candidate gets weight 1, the other candidates 0.5
        ret_feat[cand_list.index(org_feat['most_common'])] = 1
        for other_token in org_feat['cand_token_list']:
            ret_feat[cand_list.index(other_token)] = 0.5
    except:  # feature may not exist
        pass
    return ret_feat
def get_cand_gt_dist():
    # calculate the candidate ground-truth distribution
    qid_list = gen_utils.read_dict_from_pkl(qid_list_fn)
    count_dict = {}
    for cand in candidate_list:
        count_dict[cand] = 0
    for qid in qid_list:
        gt = data_utils.load_quaser_gt_by_id(qid)
        count_dict[gt] += 1
    for key, val in count_dict.iteritems():
        print key, val / float(sum(count_dict.values()))
    return
def load_quaser_qmeta_by_id(id):
    q_fn = os.path.join(q_root_path, str(id) + ".pkl")
    return gen_utils.read_dict_from_pkl(q_fn)

def load_quaser_sctx_by_id(id):
    sctx_fn = os.path.join(sctx_root_path, str(id) + ".pkl")
    return gen_utils.read_dict_from_pkl(sctx_fn)

def load_quaser_gt_by_id(id):
    gt_fn = os.path.join(gt_root_path, str(id) + ".pkl")
    return gen_utils.read_dict_from_pkl(gt_fn)
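# `load_quaser_all_by_id` is called elsewhere (see get_q_text below) but not
# shown here. A minimal sketch, assuming it simply bundles the individual
# loaders; `load_quaser_lctx_by_id` is assumed to mirror the short-context
# loader above, reading from lctx_root_path.
def load_quaser_all_by_id(id):
    gt = load_quaser_gt_by_id(id)
    lctx = load_quaser_lctx_by_id(id)
    sctx = load_quaser_sctx_by_id(id)
    q_meta = load_quaser_qmeta_by_id(id)
    return gt, lctx, sctx, q_meta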
import os
import sys
sys.path.append("../")
sys.path.append("../../")
import config.data_config as data_cfg
import utils.gen_utils as gen_utils
import utils.data_utils as data_utils
from modules.VocabEntry import VocabEntry

lst_fn = data_cfg.all_list_fn
qa_list = gen_utils.read_dict_from_pkl(lst_fn)
gt_root_path = data_cfg.gt_root_path
lctx_root_path = data_cfg.long_ctx_root_path
sctx_root_path = data_cfg.short_ctx_root_path
q_root_path = data_cfg.q_root_path
cand_list = gen_utils.read_lines_from_text_file(data_cfg.cand_lst_fn)

def get_q_text(qid):
    # collect every text field attached to a question: the ground truth,
    # the long/short context questions, and the question/answer pair itself
    ret_lines = []
    gt_text, lctx_dict, sctx_dict, q_meta = data_utils.load_quaser_all_by_id(qid)
    ret_lines.append(gt_text)
    for _, meta in lctx_dict.iteritems():
        ret_lines.append(meta['question'])
    for _, meta in sctx_dict.iteritems():
        ret_lines.append(meta['question'])
    ret_lines.append(q_meta['question'])
    ret_lines.append(q_meta['answer'])
    return ret_lines
'''
11/11/2018: Calculate the BM25 meta information (N and df_t).
'''
import os
import sys
sys.path.append("../")
sys.path.append("../../")
import config.data_config as data_cfg
import utils.gen_utils as gen_utils
import utils.data_utils as data_utils
from nltk import word_tokenize

# process all types of questions
lst_fn = data_cfg.all_list_fn
q_list = gen_utils.read_dict_from_pkl(lst_fn)
candidate_list = gen_utils.read_lines_from_text_file(data_cfg.cand_lst_fn)

def calc_bm25_N():
    doc_id_list = gen_utils.read_dict_from_pkl(lst_fn)
    N = len(doc_id_list)
    return N

def calc_candidate_df():
    cand_df_dict = {}
    for cand_word in candidate_list:
        if not cand_word in cand_df_dict:
            cand_df_dict[cand_word] = 0
        # obtain the token with maximum frequency
        most_common_token = most_common_in_list(all_context_tokens)
    else:
        most_common_token = None
    dst_meta = {
        'cand_token_list': all_context_tokens,
        'most_common': most_common_token
    }
    gen_utils.write_dict_to_pkl(dst_meta, dst_feat_fn)
    return

if __name__ == "__main__":
    qa_list = gen_utils.read_dict_from_pkl(lst_fn)
    # use multi-threading for fast processing
    thread_pool = []
    # extract the feature for each question/answer context
    for qid in qa_list:
        th = threading.Thread(target=extract_feat_on_qid, args=(qid, ))
        th.start()
        thread_pool.append(th)
        # busy-wait until the number of live threads drops below MAX_TH
        while len(threading.enumerate()) >= MAX_TH:
            pass
    for th in thread_pool:
        th.join()
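# `most_common_in_list` is called above but not defined in this fragment.
# A minimal sketch of the assumed behavior, using collections.Counter:
from collections import Counter

def most_common_in_list(token_list):
    # return the single most frequent token (ties broken arbitrarily)
    return Counter(token_list).most_common(1)[0][0]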
import os
import sys
sys.path.append("../")
import config.data_config as data_cfg
import utils.gen_utils as gen_utils
import utils.data_utils as data_utils
import threading
import nltk

lst_fn = data_cfg.all_list_fn
candidate_list = gen_utils.read_lines_from_text_file(data_cfg.cand_lst_fn)
standard_vectors = gen_utils.read_dict_from_pkl(
    data_cfg.standard_vectors)  # address
domain_vectors = gen_utils.read_dict_from_pkl(
    data_cfg.domain_vectors)  # address

def context(qid, threshold):
    qtokens = gen_utils.read_dict_from_pkl(
        os.path.join(data_cfg.root_path, str(qid) + "_tokens.pkl"))  # check actual address
    conVectorsSt = []
    conVectorsDm = []
    targVectorsSt = []
    targVectorsDm = []
    for token in qtokens:
# the name of the feature
feat_type = "w_embed_sim"
MAX_TH = 16
stop_words = set(stopwords.words('english'))

# create the feature root path
feat_root_path = os.path.join(data_cfg.feat_root_path, feat_type)
if not os.path.exists(feat_root_path):
    os.makedirs(feat_root_path)
candidate_list = gen_utils.read_lines_from_text_file(data_cfg.cand_lst_fn)
word_embed_mat = np.load(
    "/home/jiangl1/data/11791_data/quaser/raw/id2wordvec.npy")
vocab = gen_utils.read_dict_from_pkl(
    "/home/jiangl1/data/11791_data/quaser/vocab.pkl")
print "Initialize finished..."

def calc_bin_vec(cand_vec, token_list):
    # 10-bin histogram of cosine similarities between the candidate vector
    # and every context token vector (sim ranges from -1 to 1, 0.2 per bin)
    bin = np.zeros(10)
    for token in token_list:
        try:
            token_vec = word_embed_mat[vocab.word2id[token]]
        except:  # token may be out of vocabulary
            continue
        sim = 1 - spatial.distance.cosine(cand_vec, token_vec)
        # clamp so sim == -1 does not produce a negative (wrap-around) index
        sim_dim = max(int((sim + 1) / 0.2 - 1), 0)
        bin[sim_dim] += 1
from boom.modules import Module
from multiprocessing import Pool
from nltk import word_tokenize
from nltk.corpus import stopwords
import utils.gen_utils as gen_utils
import math

indri_meta = gen_utils.read_dict_from_pkl("data/Indri_meta.pkl")
stop_words = set(stopwords.words('english'))
candidate_list = gen_utils.read_lines_from_text_file("data/candidate.lst")
lam = indri_meta['lambda']
mu = indri_meta['mu']
C = indri_meta['C']
ctf_dict = indri_meta['ctf']

def get_token_dict(tokens):
    # term-frequency dict plus the total token count
    ret_dict = {}
    total_token = 0
    for token in tokens:
        if not token in ret_dict:
            ret_dict[token] = 0
        ret_dict[token] += 1
        total_token += 1
    return ret_dict, total_token

def multi_process_helper(args):
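# The scoring code is cut off above. A minimal sketch of the usual Indri
# two-stage smoothed query-likelihood term score that the loaded parameters
# (lambda, mu, the collection size C, and the collection term frequencies ctf)
# suggest; the exact formula used in this repo is an assumption:
def indri_term_score(tf, doc_len, ctf):
    # Dirichlet-smoothed document model, mixed (Jelinek-Mercer) with the
    # collection model p(t|C) = ctf / C
    p_c = float(ctf) / float(C)
    p_d = (tf + mu * p_c) / (doc_len + mu)
    return math.log(lam * p_d + (1 - lam) * p_c)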
import os
import sys
sys.path.append("../")
import config.data_config as data_cfg
import utils.gen_utils as gen_utils
import utils.data_utils as data_utils

trn_lst = gen_utils.read_dict_from_pkl(data_cfg.trn_list_fn)
print "Initialize finished..."
ins_lst = trn_lst[0:2]

if __name__ == "__main__":
    all_ins = []
    for ins_id in ins_lst:
        print ins_id
        question = data_utils.load_quaser_qmeta_by_id(ins_id)
        context = data_utils.load_quaser_lctx_by_id(ins_id)
        q_and_context = {'question': question, 'context': context}
        all_ins.append(q_and_context)
    gen_utils.write_dict_to_json(
        all_ins,
        "/home/jiangl1/data/11791_data/jiang_codes/BOOM/examples/QS/test_data.json")
    print "done."
from boom.modules import Module
from multiprocessing import Pool
from nltk import word_tokenize
from nltk.corpus import stopwords
import utils.gen_utils as gen_utils
import math

bm25_meta = gen_utils.read_dict_from_pkl("data/BM25_meta.pkl")
N = bm25_meta['N']
avg_doclen = bm25_meta['avg_doc_len']
df_dict = bm25_meta['df_dict']
stop_words = set(stopwords.words('english'))
candidate_list = gen_utils.read_lines_from_text_file("data/candidate.lst")

def get_token_dict(tokens):
    # term-frequency dict plus the total token count
    ret_dict = {}
    total_token = 0
    for token in tokens:
        if not token in ret_dict:
            ret_dict[token] = 0
        ret_dict[token] += 1
        total_token += 1
    return ret_dict, total_token

def multi_process_helper(args):
    q_and_context_list = args[0]
    k1 = args[1]
    b = args[2]
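# The BM25 scoring itself is cut off above. A minimal sketch of the standard
# Okapi BM25 term score that the loaded meta (N, avg_doclen, df_dict) and the
# k1/b arguments suggest; the exact variant used in this repo is an assumption:
def bm25_term_score(tf, doc_len, df, k1, b):
    # robust IDF with the usual +0.5 correction
    idf = math.log((N - df + 0.5) / (df + 0.5))
    # length-normalized term-frequency saturation
    norm = tf + k1 * (1 - b + b * float(doc_len) / avg_doclen)
    return idf * tf * (k1 + 1) / norm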
from boom.modules import Module
from multiprocessing import Pool
from nltk import word_tokenize
from nltk.corpus import stopwords
import utils.gen_utils as gen_utils
import numpy as np

stop_words = set(stopwords.words('english'))
candidate_list = gen_utils.read_lines_from_text_file("data/candidate.lst")
model = gen_utils.read_dict_from_pkl(
    "models/MF-e.MF-i.bm25_scores.indri_scores/train/linear_svm/model.pkl")

def multi_process_helper(args):
    q_and_context_list = args[0]
    ret_list = []
    print "In prediction: ", len(q_and_context_list), type(q_and_context_list)
    id = 0
    for q_and_context in q_and_context_list:
        id += 1
        print "In prediction, ind ", id, "/", len(q_and_context_list)
        feat = np.asarray(q_and_context['final_feat'])
        feat = np.expand_dims(feat, axis=0)
        pred_label = candidate_list[model.predict(feat)[0]]
        final_meta = {}
        final_meta['pred'] = pred_label
        final_meta['q_meta'] = q_and_context['question']
# process all types of questions
lst_fn = data_cfg.all_list_fn

# the name of the feature
feat_type = "indri_scores"
MAX_TH = 32

# create the feature root path
feat_root_path = os.path.join(data_cfg.feat_root_path, feat_type)
if not os.path.exists(feat_root_path):
    os.makedirs(feat_root_path)
candidate_list = gen_utils.read_lines_from_text_file(data_cfg.cand_lst_fn)
indri_meta = gen_utils.read_dict_from_pkl(
    os.path.join(data_cfg.dataset_root_path, "Indri_meta.pkl"))
lam = indri_meta['lambda']
mu = indri_meta['mu']
C = indri_meta['C']
ctf_dict = indri_meta['ctf']
print lam, mu, C
print "init finished..."

def get_token_dict(tokens):
    ret_dict = {}
    total_token = 0
    for token in tokens:
        if not token in ret_dict:
'''
11/22/2018: Visualize and compare the qualitative errors on the given data.
'''
import os
import sys
sys.path.append("../")
import config.data_config as data_cfg
import utils.gen_utils as gen_utils
import utils.data_utils as data_utils

qid_list = gen_utils.read_dict_from_pkl(data_cfg.tst_list_fn)

'''
# all correct
# a list of correct cfgs (the intersection will be applied internally)
corr_conf_list = [("MF-e.MF-i.bm25_scores.indri_scores", "linear_svm"),
                  ("MF-e.MF-i", "linear_svm")]
# a list of error cfgs (the intersection will be applied internally)
erro_conf_list = []
'''

MAX_VIZ = 10

'''
# one correct
# a list of correct cfgs (the intersection will be applied internally)
corr_conf_list = [("MF-e.MF-i.bm25_scores.indri_scores", "linear_svm")]
# a list of error cfgs (the intersection will be applied internally)
erro_conf_list = [("MF-e.MF-i", "linear_svm")]
'''
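# A minimal sketch of how the cfg lists above are assumed to drive
# `filter_id_set_on_conf` (defined elsewhere in this repo): the surviving qid
# set is the intersection of the qids every corr cfg gets right and every
# erro cfg gets wrong, matching the "intersection will be applied internally"
# comments.
def select_qids(qid_list, corr_conf_list, erro_conf_list):
    ret_set = set(qid_list)
    for conf in corr_conf_list:
        ret_set = filter_id_set_on_conf(conf, ret_set, True)
    for conf in erro_conf_list:
        ret_set = filter_id_set_on_conf(conf, ret_set, False)
    return ret_set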
        feat_fn = os.path.join(feat_root_path, str(qid) + ".npz")
        if not os.path.exists(feat_fn):
            print "Warning: feature does not exist:", qid
            continue
        gt_token = data_utils.load_quaser_gt_by_id(qid)
        gt_label = cand_list.index(gt_token)
        feat = np.load(feat_fn)['feat']
        ret_feats.append(feat)
        ret_labels.append(gt_label)
    return ret_feats, ret_labels

if __name__ == "__main__":
    for cfg in train_cfgs:
        qid_list = gen_utils.read_dict_from_pkl(cfg[1])
        model_root_path = os.path.join(data_cfg.model_root_path, cfg[0], cfg[2],
                                       model_type)
        print model_root_path
        all_train_feats, all_train_labels = load_feat_and_labels(qid_list, cfg)
        print "training model, cfg: ", cfg
        model = MultinomialNB()
        model = model.fit(all_train_feats, all_train_labels)
        if not os.path.exists(model_root_path):
            os.makedirs(model_root_path)
        model_fn = os.path.join(model_root_path, "model.pkl")
        gen_utils.write_dict_to_pkl(model, model_fn)
# process all types of questions
lst_fn = data_cfg.all_list_fn
feat_type = "bm25_scores"
classifier_type = "heuristic"
feat_root_path = os.path.join(data_cfg.feat_root_path, feat_type)
pred_root_path = os.path.join(data_cfg.pred_root_path, classifier_type,
                              feat_type)
if not os.path.exists(pred_root_path):
    os.makedirs(pred_root_path)
candidate_list = gen_utils.read_lines_from_text_file(data_cfg.cand_lst_fn)

if __name__ == "__main__":
    qa_list = gen_utils.read_dict_from_pkl(lst_fn)
    for qid in qa_list:
        print "classify on qid: ", qid
        feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
        bm25_dict = gen_utils.read_dict_from_pkl(feat_fn)
        # heuristic classifier: pick the candidate with the highest BM25 score
        cur_prob = []
        max_prob = -1
        max_label = None
        for cand_term in candidate_list:
            cur_prob.append(bm25_dict[cand_term])
            if bm25_dict[cand_term] > max_prob:
                max_prob = bm25_dict[cand_term]
                max_label = cand_term