def eval_by_qid_list_helper(self, qid_list, pair_generator):
    """Prepare evaluation inputs for a list of queries.

    Queries with fewer supervised documents than ``nb_supervised_doc``
    cannot provide d2d features, so their baseline ranking and scores are
    kept untouched; the remaining (qualified) queries are batched for
    re-ranking.

    Args:
        qid_list: iterable of query ids to evaluate.
        pair_generator: generator exposing ``generate_list_batch``.

    Returns:
        A 4-tuple ``([dd_q, dd_d, score_gate], len_indicator, res_dict,
        qualified_qid_list)`` where ``res_dict`` maps each non-reranked
        qid to its baseline ``Result``.
    """
    relevance_dict = load_pickle(self.config.relevance_dict_path)
    qid_list = sorted(qid_list)
    qualified_qid_list = []
    res_dict = OrderedDict()
    for qid in qid_list:
        relevance = relevance_dict.get(qid)
        supervised_docid_list = relevance.get_supervised_docid_list()
        if len(supervised_docid_list) < self.config.nb_supervised_doc:
            # Cannot construct the d2d feature: keep the baseline scores,
            # this query does not need to be updated.
            score_list = relevance.get_supervised_score_list()
            res = Result(qid, supervised_docid_list, score_list, self.config.runid)
            res_dict.update({qid: res})
            # logging.warn is deprecated since Python 3.3; use warning().
            logging.warning("query {0} not to be rerank".format(qid))
        else:
            qualified_qid_list.append(qid)
    # Generate re-rank inputs for the qualified queries in one batch.
    dd_q, dd_d, score_gate, len_indicator = \
        pair_generator.generate_list_batch(qualified_qid_list, self.config.rerank_topk)

    return [dd_q, dd_d, score_gate], len_indicator, res_dict, qualified_qid_list
def __init__(self, relevance_dict_path, dd_q_feature_path, dd_d_feature_path,
             sample_perquery_limit, sample_total_limit, query_maxlen=9,
             doc_topk_term=30, nb_supervised_doc=20, hist_size=30,
             batch_size=32, shuffle=True):
    """Pair generator for NPRF-DRMM.

    Delegates the common sampling configuration to the base generator,
    then records the DRMM-specific shape parameters, loads the query-side
    d2d gating features, and remembers where the document-side histogram
    features live on disk.
    """
    super(NPRFDRMMPairGenerator, self).__init__(relevance_dict_path, batch_size,
                                                shuffle, sample_perquery_limit,
                                                sample_total_limit)
    # Shape parameters of the histogram inputs.
    self.query_maxlen = query_maxlen
    self.doc_topk_term = doc_topk_term
    self.nb_supervised_doc = nb_supervised_doc
    self.hist_size = hist_size
    # Query-side gating features are small enough to load eagerly;
    # document-side features are read per-query from this path.
    self.dd_q_gating_dict = load_pickle(dd_q_feature_path)
    self.dd_d_feature_path = dd_d_feature_path
def hist_d2d(relevance_file, text_max_len, hist_size, sim_path, hist_path,
             d2d=False, processes=14):
    """Compute matching histograms for every query in the relevance file.

    Fans the per-query work out over a process pool; each worker runs
    ``hist_per_query`` for one qid.

    Args:
        relevance_file: dumped relevance dict (qid -> relevance object).
        text_max_len: number of histogram rows (query/doc top-term length).
        hist_size: number of histogram bins.
        sim_path: directory holding per-query similarity matrices.
        hist_path: output directory for the histogram .npy files.
        d2d: True for NPRF doc-to-doc matching, False for query-doc.
        processes: size of the worker pool (was previously hard-coded).
    """
    relevance_dict = load_pickle(relevance_file)
    worker = partial(hist_per_query, relevance_dict, text_max_len, hist_size,
                     sim_path, hist_path, d2d)
    with poolcontext(processes=processes) as pool:
        pool.map(worker, relevance_dict.keys())
    logging.info("Finish all!")
def hist_per_query(relevance_dict, text_max_len, hist_size, sim_path, hist_path, d2d, qid):
    """Build and save matching histograms for the top documents of one query.

    For each candidate document, the pre-computed similarity matrix is
    loaded and binned into a ``(text_max_len, hist_size)`` histogram.
    With ``d2d=True`` the input is a pickled list of matrices (one per
    supervised document); otherwise it is a single .npy matrix.
    Histogram files already on disk are skipped.
    """
    hist_output_dir = make_directory(hist_path, str(qid))
    relevance = relevance_dict.get(qid)
    supervised_docid_list = relevance.get_supervised_docid_list()
    # Only the top-1000 retrieved documents are processed.  (A previous
    # variant also appended judged docs found at ranks 500-2000 — the
    # "waitlist" — for training sufficiency; that code path was dead and
    # has been removed.)
    useful_docid_list = supervised_docid_list[:1000]
    for docid in useful_docid_list:
        if d2d:
            sim_file_name = os.path.join(sim_path, str(qid),
                                         'q{0}_d{1}.pickle'.format(qid, docid))
        else:
            sim_file_name = os.path.join(sim_path, str(qid),
                                         'q{0}_d{1}.npy'.format(qid, docid))
        hist_file_name = os.path.join(hist_output_dir,
                                      'q{0}_d{1}.npy'.format(qid, docid))
        # Skip work that is already materialized on disk.
        if os.path.exists(hist_file_name):
            continue
        if d2d:
            sim_list = load_pickle(sim_file_name)
            hist_array = np.zeros((len(sim_list), text_max_len, hist_size),
                                  dtype=np.float32)
            for i, sim_mat in enumerate(sim_list):
                # Truncate very long documents — presumably to bound
                # memory/time; TODO confirm 20000 matches preprocessing.
                sim_mat = sim_mat[:, :20000]
                hist_array[i] = hist_from_matrix(text_max_len, hist_size, sim_mat)
            np.save(hist_file_name, hist_array)
        else:
            sim_mat = np.load(sim_file_name)
            hist = hist_from_matrix(text_max_len, hist_size, sim_mat)
            np.save(hist_file_name, hist)
    logging.info("Finish for topic {0}".format(qid))
def sim_mat_and_kernel_d2d(relevance_file, topic_file, corpus_file, topk_corpus_file,
                           embedding_file, stop_file, sim_output_path, kernel_output_path,
                           kernel_mu_list, kernel_sigma_list, topk_supervised, d2d, test):
    '''Simultaneously compute similarity matrix and RBF kernel features

    Args:
      relevance_file: A dumped relevance dict file
      topic_file: a single line format topic file. format: qid term1 term2 ...
      corpus_file: corpus corresponding to docnolist file. format: docno\tdoclen\tterm1 term2
      topk_corpus_file: corpus that contain only the topk terms for each document, format: same as corpus_file
      embedding_file: output file from word2vec toolkit, binary format
      stop_file: a stopword list file, one word per line
      sim_output_path: output directory for similarity matrices
      kernel_output_path: output directory for kernel features
      kernel_mu_list: mu of each RBF kernel
      kernel_sigma_list: sigma of each RBF kernel
      topk_supervised: number of top-n documents for each query
      d2d: True for NPRF, False for simple query-document matching used by e.g. DRMM, K-NRM
      test: control the temporary output. Set false

    Returns:
      None.  Results are written under the two output paths.
    '''
    import time

    relevance_dict = load_pickle(relevance_file)
    topic_dict = parse_topic(topic_file)
    corpus = parse_corpus(corpus_file)
    topk_corpus = parse_corpus(topk_corpus_file)
    embeddings = KeyedVectors.load_word2vec_format(embedding_file, binary=True)
    stoplist = parse_stoplist(stop_file)

    # BUGFIX: the previous version broke out of this loop after the first
    # query (leftover debugging), so only one topic was ever processed.
    # Now every query in the relevance dict is handled; per-query timing
    # is reported through logging instead of print.
    for qid in relevance_dict.keys():
        start = time.time()
        sim_mat_and_kernel_per_query(relevance_dict, topic_dict, corpus, topk_corpus,
                                     embeddings, stoplist, sim_output_path,
                                     kernel_output_path, kernel_mu_list,
                                     kernel_sigma_list, topk_supervised, d2d, test, qid)
        logging.info("query {0} took {1:.2f}s".format(qid, time.time() - start))
def parse_idf_for_document(relevance_file, df_file, document_file, output_file,
                           rerank_topk=500, doc_topk_term=30, nb_doc=528155):
    '''Get the idf weight for top k terms in each document

    Args:
      relevance_file: dumped relevance dict (qid -> relevance object)
      df_file: document-frequency file consumed by df_map_from_file
      document_file: corpus holding only the top terms of each document
      output_file: destination pickle for the qid -> idf mapping
      rerank_topk: number of supervised docs per query to cover
      doc_topk_term: number of top terms kept per document
      nb_doc: collection size used in the idf computation

    Returns:
      None.  The qid -> idf mapping is pickled to output_file.
    '''
    relevance_dict = load_pickle(relevance_file)
    df_map = df_map_from_file(df_file)
    topk_term_corpus = parse_corpus(document_file)

    idf_map = OrderedDict()
    for qid, relevance in relevance_dict.items():
        logging.info("query {0}".format(qid))
        docid_list = relevance.get_supervised_docid_list()[:rerank_topk]
        idf_map[qid] = parse_idf_per_query(docid_list, df_map, doc_topk_term,
                                           topk_term_corpus, nb_doc)
    save_pickle(idf_map, output_file)
# Build the idf feature pickle for the disk12 description topics.
global_info_path = "/home/lcj/data/desc.disk12/features/global.info"
idf_params = {
    'df_file': os.path.join(global_info_path, 'disk12.dfcf.txt'),
    'topic_file': os.path.join(global_info_path, 'disk12.desc.porter.morefilter.txt'),
    'output_file': os.path.join(global_info_path, 'desc.idf.pickle'),
    'maxlen': 24,
    'nb_doc': 741856,
}
parse_idf_for_query(**idf_params)

# Sanity check: reload the pickle that was just written and print it.
idf = load_pickle(idf_params['output_file'])
print(idf)
def train(self, model, pair_generator, fold, output_file, use_nprf=False):
    '''Driver function for training

    Args:
      model: a keras Model
      pair_generator: an instantiated pair generator
      fold: which fold to run. partitions will be automatically rotated.
      output_file: temporary file for validation
      use_nprf: whether to use nprf

    Returns:
      The best evaluation metrics (as produced by _extract_max_metric)
      over all iterations of this fold.
    '''
    # Set tensorflow not to grab the full GPU memory up front.
    session = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
        allow_growth=True)))

    # Rotate the qid partitions so each fold gets a different
    # train/valid/test split.  BUGFIX: the previous code wrapped this in
    # map(qid_list.rotate(rotate), qid_list); the rotation only happened
    # as an argument-evaluation side effect and the map object was
    # discarded — the plain rotate() call is all that was ever needed.
    qid_list = deque(self.config.qid_list)
    qid_list.rotate(fold - 1)
    train_qid_list = qid_list[0] + qid_list[1] + qid_list[2]
    valid_qid_list, test_qid_list = qid_list[3], qid_list[4]
    print(train_qid_list, valid_qid_list, test_qid_list)

    relevance_dict = load_pickle(self.config.relevance_dict_path)
    nb_pair_train = pair_generator.count_pairs_balanced(
        train_qid_list, self.config.pair_sample_size)
    valid_params = self.eval_by_qid_list_helper(valid_qid_list, pair_generator)
    test_params = self.eval_by_qid_list_helper(test_qid_list, pair_generator)
    print(valid_params[-1], test_params[-1])

    batch_logger = NBatchLogger(50)
    batch_losses = []
    # met rows: valid MAP / P20 / NDCG20, then test MAP / P20 / NDCG20.
    met = [[], [], [], [], [], []]
    iteration = -1
    for i in range(self.config.nb_epoch):
        print("Epoch " + str(i))
        # Floor division: nb_batch is only used as a loop bound.
        nb_batch = nb_pair_train // self.config.batch_size
        train_generator = pair_generator.generate_pair_batch(
            train_qid_list, self.config.pair_sample_size)
        for j in range(int(nb_batch / 100)):
            iteration += 1
            # Train for 100 steps, then run a full valid/test evaluation.
            history = model.fit_generator(
                generator=train_generator,
                steps_per_epoch=100,
                epochs=1,
                shuffle=False,
                verbose=0,
                callbacks=[batch_logger],
            )
            batch_losses.append(batch_logger.losses)
            print("[Iter {0}]\tLoss: {1}".format(iteration,
                                                 history.history['loss']))

            kwargs = {'model': model,
                      'relevance_dict': relevance_dict,
                      'rerank_topk': self.config.rerank_topk,
                      'qrels_file': self.config.qrels_file,
                      'docnolist_file': self.config.docnolist_file,
                      'runid': self.config.runid,
                      'output_file': output_file}
            if use_nprf:
                kwargs.update({'nb_supervised_doc': self.config.nb_supervised_doc,
                               'doc_topk_term': self.config.doc_topk_term})

            valid_met = self.eval_by_qid_list(*valid_params, **kwargs)
            print("[Valid]\t\tMAP\tP20\tNDCG20")
            print("\t\t{0}\t{1}\t{2}".format(valid_met[0], valid_met[1],
                                             valid_met[2]))
            met[0].append(valid_met[0])
            met[1].append(valid_met[1])
            met[2].append(valid_met[2])

            # Persist this iteration's test run file so that only the best
            # iteration's file is retained at the end of the fold.
            kwargs['output_file'] = os.path.join(
                self.config.result_path,
                "fold{0}.iter{1}.res".format(fold, iteration))
            test_met = self.eval_by_qid_list(*test_params, **kwargs)
            print("[Test]\t\tMAP\tP20\tNDCG20")
            print("\t\t{0}\t{1}\t{2}".format(test_met[0], test_met[1],
                                             test_met[2]))
            met[3].append(test_met[0])
            met[4].append(test_met[1])
            met[5].append(test_met[2])
            print("[Attention]\t\tCurrent best iteration {0}\n".format(
                met[0].index(max(met[0]))))

            # NOTE(review): this break only exits the inner loop, so the
            # outer epoch loop still spins once per epoch after the cap is
            # reached — confirm whether a full stop was intended.
            if iteration > self.config.max_iteration:
                break

    # Keep only the result file of the best iteration (by valid MAP).
    best_iter, eval_met = self._extract_max_metric(met)
    retain_file(self.config.result_path, "fold{0}".format(fold),
                "fold{0}.iter{1}.res".format(fold, best_iter))

    return eval_met