def extract_feat_on_qid(qid): print "qid: ", qid q_meta = data_utils.load_quaser_qmeta_by_id(qid) #q_lctx=data_utils.load_quaser_lctx_by_id(qid) q_question = q_meta['question'] q_question_tokens = set(word_tokenize(q_question)) q_question_tokens = [w for w in q_question_tokens if not w in stop_words] feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl") cand2question_dict = {} #cand2context_dict={} for cand in candidate_list: #print cand #cid2bin_dict={} cand_vec = word_embed_mat[vocab.word2id[cand]] cand_question_vec = calc_bin_vec(cand_vec, q_question_tokens) cand2question_dict[cand] = cand_question_vec ''' for ctx_id,ctx_str in q_lctx.iteritems(): ctx_str=ctx_str['question'] ctx_str_tokens = set(word_tokenize(ctx_str)) ctx_str_tokens = [w for w in ctx_str_tokens if not w in stop_words] cur_bin=calc_bin_vec(cand_vec,ctx_str_tokens) cid2bin_dict[ctx_id]=cur_bin cand2context_dict[cand]=cid2bin_dict ''' gen_utils.write_dict_to_pkl(cand2question_dict, feat_fn) return
def extract_feat_on_qid(qid):
    print qid
    dst_feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    q_context_dict = data_utils.load_quaser_lctx_by_id(qid)
    all_q_tokens = []
    for cid, q_context in q_context_dict.iteritems():
        q_context_text = q_context['question']
        tokens = word_tokenize(q_context_text)
        all_q_tokens += tokens
    token_dict, doclen = get_token_dict(all_q_tokens)
    word_indri_score = {}
    for cand_word in candidate_list:
        p_mle_tc = ctf_dict[cand_word] / float(C)
        if not cand_word in token_dict:
            tf = 0.5
        else:
            tf = token_dict[cand_word]
        p_score = (1 - lam) * float(tf + mu * p_mle_tc) / float(doclen + mu) \
            + lam * p_mle_tc
        word_indri_score[cand_word] = p_score
    gen_utils.write_dict_to_pkl(word_indri_score, dst_feat_fn)
    return
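# The score above is the standard Indri-style query likelihood: a
# Dirichlet-smoothed document model interpolated (Jelinek-Mercer) with the
# collection model,
#   p(w|d) = (1 - lambda) * (tf + mu * p_mle(w|C)) / (doclen + mu) + lambda * p_mle(w|C)
# with p_mle(w|C) = ctf(w) / |C|. A worked toy example with assumed values
# (in this repo lam, mu, ctf_dict and C come from Indri_meta.pkl):
toy_tf, toy_doclen = 3, 200
toy_ctf, toy_C = 5000, 10 ** 7
toy_lam, toy_mu = 0.4, 2500
toy_p_mle = toy_ctf / float(toy_C)
toy_score = (1 - toy_lam) * (toy_tf + toy_mu * toy_p_mle) / float(toy_doclen + toy_mu) \
    + toy_lam * toy_p_mle
print toy_score  # roughly 1.1e-3 with these toy numbers

# get_token_dict is not shown in this excerpt; it is assumed to return a
# term-frequency dict over the token list plus the document length, e.g.:
def get_token_dict_sketch(tokens):
    tf_dict = {}
    for tok in tokens:
        tf_dict[tok] = tf_dict.get(tok, 0) + 1
    return tf_dict, len(tokens)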
def extract_feat_on_qid(qid):
    print qid
    q_context_dict = data_utils.load_quaser_lctx_by_id(qid)
    # load the query sentence, so that we can exclude its tokens accordingly
    q_query = data_utils.load_quaser_qmeta_by_id(qid)
    q_question = q_query['question']
    q_tokens = set(word_tokenize(q_question))
    dst_feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    all_context_tokens = []
    for cid, q_context in q_context_dict.iteritems():
        q_context_text = q_context['question']
        # LUKE: can you tokenize the text we need as a feature using nltk as follows?
        tokens = word_tokenize(q_context_text)
        # remove the stop words
        tokens = [w for w in tokens if not w in stop_words]
        # remove tokens that already appear in the original question
        tokens = [w for w in tokens if not w in q_tokens]
        all_context_tokens += tokens
    all_context_tokens = set(all_context_tokens)
    token_avg_dist_dict = {}
    for cand_token in candidate_list:
        #print cand_token
        dist = calc_avg_dist(set(list(cand_token)), all_context_tokens)
        token_avg_dist_dict[cand_token] = dist
    gen_utils.write_dict_to_pkl(token_avg_dist_dict, dst_feat_fn)
    return
def extract_feat_on_qid(qid):
    print qid
    q_context_dict = data_utils.load_quaser_lctx_by_id(qid)
    dst_feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    all_context_tokens = []
    for cid, q_context in q_context_dict.iteritems():
        q_context_text = q_context['question']
        # LUKE: can you tokenize the text we need as a feature using nltk as follows?
        tokens = word_tokenize(q_context_text)
        # remove the stop words
        tokens = [w for w in tokens if not w in stop_words]
        # remove those words that are not in the candidate list
        tokens = [w for w in tokens if w in candidate_list]
        all_context_tokens += tokens
    if len(all_context_tokens) != 0:
        # take the candidate with the maximum frequency in the contexts
        most_common_token = most_common_in_list(all_context_tokens)
    else:
        most_common_token = None
    dst_meta = {
        'cand_token_list': all_context_tokens,
        'most_common': most_common_token
    }
    gen_utils.write_dict_to_pkl(dst_meta, dst_feat_fn)
    return
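# most_common_in_list above is assumed to return the single most frequent
# element of a list; a minimal sketch using collections.Counter:
from collections import Counter

def most_common_in_list_sketch(tokens):
    return Counter(tokens).most_common(1)[0][0]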
def tag_pos(qid):
    qtokens = gen_utils.read_dict_from_pkl(
        os.path.join(data_cfg.root_path, str(qid) + "_tokens.pkl"))
    qpos = pos_tagger.tag(qtokens)
    pos_list = []
    for tok, pos in qpos:
        pos_list.append(pos)
    pos_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    gen_utils.write_dict_to_pkl(pos_list, pos_fn)
    return
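# pos_tagger above is assumed to expose an NLTK-style tag() method that returns
# (token, tag) pairs; nltk's built-in tagger produces the same kind of output,
# so a minimal stand-in looks like this:
from nltk import pos_tag
from nltk.tokenize import word_tokenize

print pos_tag(word_tokenize("how do i sort a list in python"))
# e.g. [('how', 'WRB'), ('do', 'VBP'), ('i', 'NN'), ('sort', 'VB'), ...]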
def context(qid, threshold):
    qtokens = gen_utils.read_dict_from_pkl(
        os.path.join(data_cfg.root_path, str(qid) + "_tokens.pkl"))  # check actual address
    conVectorsSt = []
    conVectorsDo = []
    targVectorsSt = []
    targVectorsDo = []
    for i in range(len(qtokens)):
        token = qtokens[i]
        qvecSt = standard_vectors.lookup(token)
        qvecDo = domain_vectors.lookup(token)
        if token in candidate_list:
            targVectorsSt.append(qvecSt)
            targVectorsDo.append(qvecDo)
        else:
            conVectorsSt.append(qvecSt)
            conVectorsDo.append(qvecDo)
    count = 0
    stanDistTotal = 0
    domDistTotal = 0
    for i in range(len(targVectorsSt)):
        count += 1
        targSt = targVectorsSt[i]
        targDo = targVectorsDo[i]
        for j in range(len(conVectorsSt)):
            conSt = conVectorsSt[j]
            conDo = conVectorsDo[j]
            stanDistTotal += vecDistance(targSt, conSt)
            domDistTotal += vecDistance(targDo, conDo)
    stanDistTotal = float(stanDistTotal) / count
    domDistTotal = float(domDistTotal) / count
    # positive feature value when the domain vectors place the candidates closer
    # to the context than the standard vectors do, negative otherwise
    if stanDistTotal > domDistTotal:
        context_feat = domDistTotal / stanDistTotal
    else:
        context_feat = -1 * (stanDistTotal / domDistTotal)
    con_feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    gen_utils.write_dict_to_pkl(context_feat, con_feat_fn)
    return
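# vecDistance above is not shown here; a plain Euclidean distance is assumed as
# a minimal sketch (cosine distance would serve the same purpose for this
# standard-vs-domain embedding comparison):
import numpy as np

def vecDistance_sketch(vec_a, vec_b):
    return float(np.linalg.norm(np.asarray(vec_a) - np.asarray(vec_b)))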
def extract_feat_on_qid(qid):
    print qid
    dst_feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
    q_context_dict = data_utils.load_quaser_lctx_by_id(qid)
    all_q_tokens = []
    for cid, q_context in q_context_dict.iteritems():
        q_context_text = q_context['question']
        tokens = word_tokenize(q_context_text)
        all_q_tokens += tokens
    token_dict, doclen = get_token_dict(all_q_tokens)
    word_bm25_score = {}
    for cand_word in candidate_list:
        df = df_dict.get(cand_word, 0)
        tf = token_dict.get(cand_word, 0)
        rsj_weight = math.log((N - df + 0.5) / float(df + 0.5))
        tf_weight = tf / float(tf + k1 * ((1 - b) + b * (doclen / float(avg_doclen))))
        # query term frequency is 1, so the user weight reduces to 1
        user_weight = (k3 + 1) * 1 / float(k3 + 1)
        all_score = rsj_weight * tf_weight * user_weight
        word_bm25_score[cand_word] = all_score
        if all_score != 0:
            print cand_word, qid
    gen_utils.write_dict_to_pkl(word_bm25_score, dst_feat_fn)
    return
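# The score above is Okapi BM25 for a single query term with query term
# frequency 1 (so the user weight (k3 + 1) * qtf / (k3 + qtf) reduces to 1):
#   score = log((N - df + 0.5) / (df + 0.5))
#           * tf / (tf + k1 * (1 - b + b * doclen / avg_doclen))
# A toy check with assumed constants (in this repo N, df_dict and avg_doclen
# come from BM25_meta.pkl, and k1, b, k3 are set elsewhere):
import math
toy_N, toy_df, toy_tf = 30000, 120, 4
toy_doclen, toy_avg_doclen, toy_k1, toy_b = 250, 220.0, 1.2, 0.75
toy_rsj = math.log((toy_N - toy_df + 0.5) / float(toy_df + 0.5))
toy_tf_w = toy_tf / float(toy_tf + toy_k1 * ((1 - toy_b) + toy_b * (toy_doclen / toy_avg_doclen)))
print toy_rsj * toy_tf_w  # roughly 4.1 with these toy numbers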
def pred_a_cfg(cfg):
    feat_type = cfg[0]
    trn_split = cfg[2]
    eval_list = cfg[-1]
    feat_root_path = os.path.join(data_cfg.feat_root_path, feat_type)
    pred_root_path = os.path.join(data_cfg.pred_root_path, model_type, feat_type)
    if not os.path.exists(pred_root_path):
        os.makedirs(pred_root_path)
    model_root_path = os.path.join(data_cfg.model_root_path, feat_type, trn_split, model_type)
    model_fn = os.path.join(model_root_path, "model.pkl")
    model = gen_utils.read_dict_from_pkl(model_fn)
    qid_list = gen_utils.read_dict_from_pkl(eval_list)
    for qid in qid_list:
        print "Loading feature: ", qid
        feat_fn = os.path.join(feat_root_path, str(qid) + ".npz")
        if not os.path.exists(feat_fn):
            print "Warning: feature does not exist:", qid
            continue
        feat = np.load(feat_fn)['feat']
        feat = np.expand_dims(feat, axis=0)
        pred_label = cand_list[model.predict(feat)[0]]
        all_pred_score = model.decision_function(feat)[0].tolist()
        q_pred = {'all_pred_probs': all_pred_score, 'pred_term': pred_label}
        pred_fn = os.path.join(pred_root_path, str(qid) + ".pkl")
        gen_utils.write_dict_to_pkl(q_pred, pred_fn)
    return
def handle_a_split(split):
    global gid
    lctx_fn = os.path.join(lctx_root_path, split + "_contexts.json.gz")
    sctx_fn = os.path.join(sctx_root_path, split + "_contexts.json.gz")
    q_fn = os.path.join(q_root_path, split + "_questions.json.gz")
    assert os.path.isfile(lctx_fn)
    assert os.path.isfile(sctx_fn)
    assert os.path.isfile(q_fn)
    print "Handling: ", lctx_fn
    f_lctx = gzip.open(lctx_fn)
    f_sctx = gzip.open(sctx_fn)
    f_q = gzip.open(q_fn)
    for lctx, sctx, q in zip(f_lctx, f_sctx, f_q):
        #print "Handling q:", q
        lctx = json.loads(lctx)
        sctx = json.loads(sctx)
        q = json.loads(q)
        lctx_dict = parse_ctx(lctx['contexts'])
        sctx_dict = parse_ctx(sctx['contexts'])
        q_meta = q
        gt = str(q_meta['answer'])
        # per-question output paths, keyed by the running global id
        dst_lctx_fn = os.path.join(data_cfg.long_ctx_root_path, str(gid) + ".pkl")
        dst_sctx_fn = os.path.join(data_cfg.short_ctx_root_path, str(gid) + ".pkl")
        q_meta_fn = os.path.join(data_cfg.q_root_path, str(gid) + ".pkl")
        gt_fn = os.path.join(data_cfg.gt_root_path, str(gid) + ".pkl")
        gid += 1
        gen_utils.write_dict_to_pkl(lctx_dict, dst_lctx_fn)
        gen_utils.write_dict_to_pkl(sctx_dict, dst_sctx_fn)
        gen_utils.write_dict_to_pkl(q_meta, q_meta_fn)
        gen_utils.write_dict_to_pkl(gt, gt_fn)
    f_lctx.close()
    f_sctx.close()
    f_q.close()
    return gid
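# parse_ctx above is not shown in this excerpt; it is assumed to turn the raw
# 'contexts' field of the QUASAR release (a list of (retrieval score, passage
# text) pairs) into a dict keyed by a context id, storing the text under
# 'question' as the other scripts in this repo expect; a minimal sketch:
def parse_ctx_sketch(contexts):
    ctx_dict = {}
    for ctx_id, (score, text) in enumerate(contexts):
        ctx_dict[ctx_id] = {'question': text, 'score': score}
    return ctx_dict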
def load_feat_and_labels(qid_list, cfg):
    # assumption: the feature directory follows the same layout as in the
    # prediction script, with cfg[0] being the feature type
    feat_root_path = os.path.join(data_cfg.feat_root_path, cfg[0])
    ret_feats = []
    ret_labels = []
    for qid in qid_list:
        feat_fn = os.path.join(feat_root_path, str(qid) + ".npz")
        if not os.path.exists(feat_fn):
            print "Warning: feature does not exist:", qid
            continue
        gt_token = data_utils.load_quaser_gt_by_id(qid)
        gt_label = cand_list.index(gt_token)
        feat = np.load(feat_fn)['feat']
        ret_feats.append(feat)
        ret_labels.append(gt_label)
    return ret_feats, ret_labels


if __name__ == "__main__":
    for cfg in train_cfgs:
        qid_list = gen_utils.read_dict_from_pkl(cfg[1])
        model_root_path = os.path.join(data_cfg.model_root_path, cfg[0], cfg[2], model_type)
        print model_root_path
        all_train_feats, all_train_labels = load_feat_and_labels(qid_list, cfg)
        print "training model, cfg: ", cfg
        model = MultinomialNB()
        model = model.fit(all_train_feats, all_train_labels)
        if not os.path.exists(model_root_path):
            os.makedirs(model_root_path)
        model_fn = os.path.join(model_root_path, "model.pkl")
        gen_utils.write_dict_to_pkl(model, model_fn)
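# Note: sklearn's MultinomialNB expects non-negative, count-like feature values.
# A quick, illustrative check that the pickled model written above round-trips
# through the same pkl helpers and still predicts:
import numpy as np
model_back = gen_utils.read_dict_from_pkl(model_fn)
sample = np.asarray(all_train_feats[:1])
print model_back.predict(sample), model_back.predict_proba(sample)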
candidate_list = gen_utils.read_lines_from_text_file(data_cfg.cand_lst_fn)

if __name__ == "__main__":
    qa_list = gen_utils.read_dict_from_pkl(lst_fn)
    for qid in qa_list:
        if qid % 1000 == 0:
            print "classify on qid: ", qid
        feat_fn = os.path.join(feat_root_path, str(qid) + ".pkl")
        wd_dict = gen_utils.read_dict_from_pkl(feat_fn)
        cur_probs = []
        max_prob = -10000
        max_label = None
        for cand_term in candidate_list:
            if cand_term in wd_dict:
                # negate so that the arg-max below selects the candidate with
                # the smallest stored value
                cur_prob = -wd_dict[cand_term]
            else:
                cur_prob = 0
            if cur_prob > max_prob:
                max_label = cand_term
                max_prob = cur_prob
            cur_probs.append(cur_prob)
        q_pred = {'all_pred_probs': cur_probs, 'pred_term': max_label}
        pred_fn = os.path.join(pred_root_path, str(qid) + ".pkl")
        gen_utils.write_dict_to_pkl(q_pred, pred_fn)
    print "done."
'''
if __name__ == "__main__":
    for split in split_list:
        cur_gid = handle_a_split(split)
        print split, cur_gid
    print "done."
'''

if __name__ == "__main__":
    trn_list = [_ for _ in xrange(0, 31049)]
    test_list = [_ for _ in xrange(31049, 34223)]
    val_list = [_ for _ in xrange(34223, 37362)]
    all_list = trn_list + test_list + val_list
    gen_utils.write_dict_to_pkl(trn_list, data_cfg.trn_list_fn)
    gen_utils.write_dict_to_pkl(val_list, data_cfg.val_list_fn)
    gen_utils.write_dict_to_pkl(test_list, data_cfg.tst_list_fn)
    gen_utils.write_dict_to_pkl(all_list, data_cfg.all_list_fn)
    print "done."
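# Quick check of the split sizes implied by the hard-coded id boundaries above
# (the ids are the global ids assigned sequentially by handle_a_split):
print len(trn_list), len(test_list), len(val_list), len(all_list)
# -> 31049 3174 3139 37362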
import numpy as np

def calc_avg_doc_len():
    # average long-context "document" length per question; assumption: the loop
    # below runs over the full id list written by the split script, of which
    # only the first ~2000 ids are actually used
    all_doc_len = []
    for qid in gen_utils.read_dict_from_pkl(data_cfg.all_list_fn):
        q_context_dict = data_utils.load_quaser_lctx_by_id(qid)
        all_context_tokens = []
        doc_len = 0
        for cid, context in q_context_dict.iteritems():
            cur_question = context['question']
            cur_question_tokens = word_tokenize(cur_question)
            all_context_tokens += cur_question_tokens
            doc_len = doc_len + len(cur_question_tokens)
        all_doc_len.append(doc_len)
        # only sample the first ~2000 questions to keep this fast
        if len(all_doc_len) > 2000:
            break
    return np.average(all_doc_len)


if __name__ == "__main__":
    dst_fn = os.path.join(data_cfg.dataset_root_path, "BM25_meta.pkl")
    bm25_meta = gen_utils.read_dict_from_pkl(dst_fn)
    avg_doc_len = calc_avg_doc_len()
    bm25_meta['avg_doc_len'] = avg_doc_len
    '''
    N = calc_bm25_N()
    cand_df_dict = calc_candidate_df()
    bm25_meta = {'N': N, 'df_dict': cand_df_dict}
    gen_utils.write_dict_to_pkl(bm25_meta, dst_fn)
    '''
    gen_utils.write_dict_to_pkl(bm25_meta, dst_fn)
    print "done."
def get_ctf_dict():
    # collection term frequencies of the candidates and the collection size |C|;
    # assumption: the collection is the long contexts of all question ids
    ret_dict = {cand: 0 for cand in candidate_list}
    c_len = 0
    for qid in gen_utils.read_dict_from_pkl(data_cfg.all_list_fn):
        q_context_dict = data_utils.load_quaser_lctx_by_id(qid)
        all_context_tokens = []
        for cid, context in q_context_dict.iteritems():
            cur_question = context['question']
            cur_question_tokens = word_tokenize(cur_question)
            all_context_tokens += cur_question_tokens
        token_cnt_dict = {}
        for token in all_context_tokens:
            if not token in token_cnt_dict:
                token_cnt_dict[token] = 0
            token_cnt_dict[token] += 1
        c_len += len(all_context_tokens)
        for cand in candidate_list:
            if cand in token_cnt_dict:
                ret_dict[cand] += token_cnt_dict[cand]
                print cand, ret_dict[cand]
        print qid
    return ret_dict, c_len


if __name__ == "__main__":
    indri_meta = {'lambda': lam, 'mu': mu}
    dst_fn = os.path.join(data_cfg.dataset_root_path, "Indri_meta.pkl")
    ctf_dict, c_len = get_ctf_dict()
    indri_meta['ctf'] = ctf_dict
    indri_meta['C'] = c_len
    gen_utils.write_dict_to_pkl(indri_meta, dst_fn)
    print "done."