def per_epoch_summary_step2(self, id_str, fold_k, list_fold_k_train_eval_track, list_fold_k_test_eval_track,
                            do_vali, list_fold_k_vali_eval_track):
    sy_prefix = '_'.join(['Fold', str(fold_k)])

    fold_k_train_eval = np.vstack(list_fold_k_train_eval_track)
    fold_k_test_eval = np.vstack(list_fold_k_test_eval_track)
    pickle_save(fold_k_train_eval, file=self.dir_run + '_'.join([sy_prefix, id_str, 'train_eval.np']))
    pickle_save(fold_k_test_eval, file=self.dir_run + '_'.join([sy_prefix, id_str, 'test_eval.np']))

    '''
    fold_k_epoch_loss = np.hstack(list_epoch_loss)
    pickle_save((fold_k_epoch_loss, train_data.__len__()),
                file=self.dir_run + '_'.join([sy_prefix, id_str, 'epoch_loss.np']))
    '''

    if do_vali:
        fold_k_vali_eval = np.hstack(list_fold_k_vali_eval_track)
        pickle_save(fold_k_vali_eval, file=self.dir_run + '_'.join([sy_prefix, id_str, 'vali_eval.np']))
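# Illustrative sketch (not part of the original pipeline): reload a fold's tracked arrays and
# report the best logged test step. Assumptions: the files were written by per_epoch_summary_step2
# above, and 'RankNet' is a hypothetical id_str.
def _demo_load_epoch_track(dir_run, fold_k=1, id_str='RankNet'):
    sy_prefix = '_'.join(['Fold', str(fold_k)])
    train_eval = pickle_load(dir_run + '_'.join([sy_prefix, id_str, 'train_eval.np']))  # [num_logged_steps, len(cutoffs)]
    test_eval = pickle_load(dir_run + '_'.join([sy_prefix, id_str, 'test_eval.np']))
    best_step = int(np.argmax(test_eval[:, 0]))  # best logged step w.r.t. the first cutoff
    print('best logged step:', best_step, 'train:', train_eval[best_step], 'test:', test_eval[best_step])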
def fold_summary(self, fold_k, dir_run, train_data_length):
    sy_prefix = '_'.join(['Fold', str(fold_k)])

    if self.do_validation:
        fold_k_vali_eval = np.hstack(self.list_fold_k_vali_track)
        pickle_save(fold_k_vali_eval, file=dir_run + '_'.join([sy_prefix, 'vali_eval.np']))

    fold_k_train_eval = np.vstack(self.list_fold_k_train_track)
    fold_k_test_eval = np.vstack(self.list_fold_k_test_track)
    pickle_save(fold_k_train_eval, file=dir_run + '_'.join([sy_prefix, 'train_eval.np']))
    pickle_save(fold_k_test_eval, file=dir_run + '_'.join([sy_prefix, 'test_eval.np']))

    fold_k_epoch_loss = np.hstack(self.list_epoch_loss)
    pickle_save((fold_k_epoch_loss, train_data_length),
                file=dir_run + '_'.join([sy_prefix, 'epoch_loss.np']))
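# Illustrative sketch (hypothetical demo): fold_summary above buffers an
# (epoch_loss, train_data_length) tuple; this reloads it and normalizes the final epoch
# loss by the number of training queries.
def _demo_load_fold_loss(dir_run, fold_k=1):
    sy_prefix = '_'.join(['Fold', str(fold_k)])
    fold_k_epoch_loss, train_data_length = pickle_load(dir_run + '_'.join([sy_prefix, 'epoch_loss.np']))
    print('final epoch loss: {:.4f} over {} training queries ({:.6f} per query)'.format(
        fold_k_epoch_loss[-1], train_data_length, fold_k_epoch_loss[-1] / train_data_length))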
def kfold_cv_eval(self, data_dict=None, eval_dict=None, model_para_dict=None):
    """
    Evaluation based on k-fold cross validation if multiple folds exist
    :param data_dict: settings w.r.t. data
    :param eval_dict: settings w.r.t. evaluation
    :param model_para_dict: settings w.r.t. model parameters
    :return:
    """
    self.display_information(data_dict=data_dict)
    self.setup_eval(data_dict=data_dict, eval_dict=eval_dict)

    model_id, data_id = self.model_parameter.model_id, data_dict['data_id']
    fold_num = data_dict['fold_num']  # may be updated in debug mode
    cutoffs, do_validation = eval_dict['cutoffs'], eval_dict['do_validation']

    tree_ranker = globals()[model_id](model_para_dict)

    time_begin = datetime.datetime.now()  # timing

    l2r_cv_avg_ndcg_scores = np.zeros(len(cutoffs))  # fold average
    l2r_cv_avg_nerr_scores = np.zeros(len(cutoffs))  # fold average
    l2r_cv_avg_ap_scores = np.zeros(len(cutoffs))  # fold average
    l2r_cv_avg_p_scores = np.zeros(len(cutoffs))  # fold average

    list_all_fold_ndcg_at_ks_per_q = []
    list_all_fold_err_at_ks_per_q = []
    list_all_fold_ap_at_ks_per_q = []
    list_all_fold_p_at_ks_per_q = []

    for fold_k in range(1, fold_num + 1):
        # determine the file paths
        file_train, file_vali, file_test = self.determine_files(data_dict=data_dict, fold_k=fold_k)

        self.update_save_model_dir(data_dict=data_dict, fold_k=fold_k)

        y_test, group_test, y_pred = tree_ranker.run(fold_k=fold_k, file_train=file_train, file_vali=file_vali,
                                                     file_test=file_test, data_dict=data_dict, eval_dict=eval_dict,
                                                     save_model_dir=self.save_model_dir)

        fold_avg_ndcg_at_ks, fold_avg_nerr_at_ks, fold_avg_ap_at_ks, fold_avg_p_at_ks, \
        list_ndcg_at_ks_per_q, list_err_at_ks_per_q, list_ap_at_ks_per_q, list_p_at_ks_per_q = \
            self.cal_metric_at_ks(model_id=model_id, all_std_labels=y_test, all_preds=y_pred,
                                  group=group_test, ks=cutoffs)

        performance_list = [model_id] if data_id in YAHOO_LTR or data_id in ISTELLA_LTR \
            else [model_id + ' Fold-' + str(fold_k)]

        for i, co in enumerate(cutoffs):
            performance_list.append('\nnDCG@{}:{:.4f}'.format(co, fold_avg_ndcg_at_ks[i]))
        for i, co in enumerate(cutoffs):
            performance_list.append('\nnERR@{}:{:.4f}'.format(co, fold_avg_nerr_at_ks[i]))
        for i, co in enumerate(cutoffs):
            performance_list.append('\nMAP@{}:{:.4f}'.format(co, fold_avg_ap_at_ks[i]))
        for i, co in enumerate(cutoffs):
            performance_list.append('\nP@{}:{:.4f}'.format(co, fold_avg_p_at_ks[i]))

        performance_str = '\t'.join(performance_list)
        print('\n\t', performance_str)

        l2r_cv_avg_ndcg_scores = np.add(l2r_cv_avg_ndcg_scores, fold_avg_ndcg_at_ks)  # sum for later cv-performance
        l2r_cv_avg_nerr_scores = np.add(l2r_cv_avg_nerr_scores, fold_avg_nerr_at_ks)  # sum for later cv-performance
        l2r_cv_avg_ap_scores = np.add(l2r_cv_avg_ap_scores, fold_avg_ap_at_ks)  # sum for later cv-performance
        l2r_cv_avg_p_scores = np.add(l2r_cv_avg_p_scores, fold_avg_p_at_ks)  # sum for later cv-performance

        list_all_fold_ndcg_at_ks_per_q.extend(list_ndcg_at_ks_per_q)
        list_all_fold_err_at_ks_per_q.extend(list_err_at_ks_per_q)
        list_all_fold_ap_at_ks_per_q.extend(list_ap_at_ks_per_q)
        list_all_fold_p_at_ks_per_q.extend(list_p_at_ks_per_q)

    time_end = datetime.datetime.now()  # overall timing
    elapsed_time_str = str(time_end - time_begin)
    print('Elapsed time:\t', elapsed_time_str + "\n")
    print()

    # begin to print either cv or average performance
    l2r_cv_avg_ndcg_scores = np.divide(l2r_cv_avg_ndcg_scores, fold_num)
    l2r_cv_avg_nerr_scores = np.divide(l2r_cv_avg_nerr_scores, fold_num)
    l2r_cv_avg_ap_scores = np.divide(l2r_cv_avg_ap_scores, fold_num)
    l2r_cv_avg_p_scores = np.divide(l2r_cv_avg_p_scores, fold_num)

    if do_validation:
        eval_prefix = str(fold_num) + '-fold cross validation scores:'
    else:
        eval_prefix = str(fold_num) + '-fold average scores:'

    print(model_id, eval_prefix, self.result_to_str(list_scores=l2r_cv_avg_ndcg_scores,
                                                    list_cutoffs=cutoffs, metric_str='nDCG'))
    print(model_id, eval_prefix, self.result_to_str(list_scores=l2r_cv_avg_nerr_scores,
                                                    list_cutoffs=cutoffs, metric_str='nERR'))
    print(model_id, eval_prefix, self.result_to_str(list_scores=l2r_cv_avg_ap_scores,
                                                    list_cutoffs=cutoffs, metric_str='MAP'))
    print(model_id, eval_prefix, self.result_to_str(list_scores=l2r_cv_avg_p_scores,
                                                    list_cutoffs=cutoffs, metric_str='P'))

    all_fold_ndcg_at_ks_per_q = np.vstack(list_all_fold_ndcg_at_ks_per_q)
    all_fold_err_at_ks_per_q = np.vstack(list_all_fold_err_at_ks_per_q)
    all_fold_ap_at_ks_per_q = np.vstack(list_all_fold_ap_at_ks_per_q)
    all_fold_p_at_ks_per_q = np.vstack(list_all_fold_p_at_ks_per_q)

    pickle_save(all_fold_ndcg_at_ks_per_q,
                file=self.output_root + '_'.join([data_id, model_id, 'all_fold_ndcg_at_ks_per_q.np']))
    pickle_save(all_fold_err_at_ks_per_q,
                file=self.output_root + '_'.join([data_id, model_id, 'all_fold_err_at_ks_per_q.np']))
    pickle_save(all_fold_ap_at_ks_per_q,
                file=self.output_root + '_'.join([data_id, model_id, 'all_fold_ap_at_ks_per_q.np']))
    pickle_save(all_fold_p_at_ks_per_q,
                file=self.output_root + '_'.join([data_id, model_id, 'all_fold_p_at_ks_per_q.np']))

    return l2r_cv_avg_ndcg_scores, l2r_cv_avg_nerr_scores, l2r_cv_avg_ap_scores, l2r_cv_avg_p_scores
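# Illustrative sketch (a hypothetical follow-up analysis, not part of the original evaluator):
# the per-query metric matrices buffered by kfold_cv_eval above make paired model comparisons
# straightforward, e.g., a paired t-test on nDCG at one cutoff. Assumes scipy is available;
# model ids are placeholders.
def _demo_paired_comparison(output_root, data_id, model_id_a, model_id_b, cutoff_index=0):
    from scipy import stats
    mat_a = pickle_load(output_root + '_'.join([data_id, model_id_a, 'all_fold_ndcg_at_ks_per_q.np']))
    mat_b = pickle_load(output_root + '_'.join([data_id, model_id_b, 'all_fold_ndcg_at_ks_per_q.np']))
    # rows are queries, columns are cutoffs; compare the same queries under both models
    t_stat, p_value = stats.ttest_rel(mat_a[:, cutoff_index], mat_b[:, cutoff_index])
    print('paired t-test on per-query nDCG: t={:.4f}, p={:.4f}'.format(t_stat, p_value))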
def kfold_cv_eval(self, data_dict=None, eval_dict=None, sf_para_dict=None, model_para_dict=None):
    """
    Evaluation of learning-to-rank methods via k-fold cross validation if there are k folds, otherwise one fold.
    :param data_dict: settings w.r.t. data
    :param eval_dict: settings w.r.t. evaluation
    :param sf_para_dict: settings w.r.t. scoring function
    :param model_para_dict: settings w.r.t. the ltr_adhoc model
    :return:
    """
    self.display_information(data_dict, model_para_dict)
    self.check_consistency(data_dict, eval_dict, sf_para_dict)
    self.setup_eval(data_dict, eval_dict, sf_para_dict, model_para_dict)

    model_id = model_para_dict['model_id']
    fold_num = data_dict['fold_num']
    # for quick access of common evaluation settings
    epochs, loss_guided = eval_dict['epochs'], eval_dict['loss_guided']
    vali_k, log_step, cutoffs = eval_dict['vali_k'], eval_dict['log_step'], eval_dict['cutoffs']
    do_vali, do_summary = eval_dict['do_validation'], eval_dict['do_summary']

    ranker = self.load_ranker(model_para_dict=model_para_dict, sf_para_dict=sf_para_dict)

    time_begin = datetime.datetime.now()  # timing
    l2r_cv_avg_scores = np.zeros(len(cutoffs))  # fold average

    for fold_k in range(1, fold_num + 1):  # evaluation over k-fold data
        ranker.reset_parameters()  # reset with the same random initialization

        train_data, test_data, vali_data = self.load_data(eval_dict, data_dict, fold_k)

        if do_vali:
            fold_optimal_ndcgk = 0.0
        if do_summary:
            list_epoch_loss, list_fold_k_train_eval_track, list_fold_k_test_eval_track, \
            list_fold_k_vali_eval_track = [], [], [], []
        if not do_vali and loss_guided:
            first_round = True
            threshold_epoch_loss = torch.cuda.FloatTensor([10000000.0]) if self.gpu \
                else torch.FloatTensor([10000000.0])

        for epoch_k in range(1, epochs + 1):
            torch_fold_k_epoch_k_loss, stop_training = self.train_ranker(
                ranker=ranker, train_data=train_data, model_para_dict=model_para_dict, epoch_k=epoch_k)

            ranker.scheduler.step()  # adaptive learning rate with step_size=40, gamma=0.5

            if stop_training:
                print('training failed!')
                break

            if (do_summary or do_vali) and (epoch_k % log_step == 0 or epoch_k == 1):  # stepwise check
                if do_vali:  # per-step validation score
                    vali_eval_tmp = ndcg_at_k(ranker=ranker, test_data=vali_data, k=vali_k,
                                              gpu=self.gpu, device=self.device,
                                              label_type=self.data_setting.data_dict['label_type'])
                    vali_eval_v = vali_eval_tmp.cpu().numpy() if self.gpu else vali_eval_tmp.data.numpy()
                    if epoch_k > 1:  # further validation comparison
                        curr_vali_ndcg = vali_eval_v
                        if (curr_vali_ndcg > fold_optimal_ndcgk) or \
                                (epoch_k == epochs and curr_vali_ndcg == fold_optimal_ndcgk):
                            # we need at least a reference, in case all zero
                            print('\t', epoch_k, '- nDCG@{} - '.format(vali_k), curr_vali_ndcg)
                            fold_optimal_ndcgk = curr_vali_ndcg
                            fold_optimal_checkpoint = '-'.join(['Fold', str(fold_k)])
                            fold_optimal_epoch_val = epoch_k
                            ranker.save(dir=self.dir_run + fold_optimal_checkpoint + '/',
                                        name='_'.join(['net_params_epoch', str(epoch_k)]) + '.pkl')
                            # buffer the currently optimal model
                        else:
                            print('\t\t', epoch_k, '- nDCG@{} - '.format(vali_k), curr_vali_ndcg)

                if do_summary:  # summarize per-step performance w.r.t. train, test
                    fold_k_epoch_k_train_ndcg_ks = ndcg_at_ks(ranker=ranker, test_data=train_data, ks=cutoffs,
                                                              gpu=self.gpu, device=self.device,
                                                              label_type=self.data_setting.data_dict['label_type'])
                    np_fold_k_epoch_k_train_ndcg_ks = fold_k_epoch_k_train_ndcg_ks.cpu().numpy() \
                        if self.gpu else fold_k_epoch_k_train_ndcg_ks.data.numpy()
                    list_fold_k_train_eval_track.append(np_fold_k_epoch_k_train_ndcg_ks)

                    fold_k_epoch_k_test_ndcg_ks = ndcg_at_ks(ranker=ranker, test_data=test_data, ks=cutoffs,
                                                             gpu=self.gpu, device=self.device,
                                                             label_type=self.data_setting.data_dict['label_type'])
                    np_fold_k_epoch_k_test_ndcg_ks = fold_k_epoch_k_test_ndcg_ks.cpu().numpy() \
                        if self.gpu else fold_k_epoch_k_test_ndcg_ks.data.numpy()
                    list_fold_k_test_eval_track.append(np_fold_k_epoch_k_test_ndcg_ks)

                    fold_k_epoch_k_loss = torch_fold_k_epoch_k_loss.cpu().numpy() \
                        if self.gpu else torch_fold_k_epoch_k_loss.data.numpy()
                    list_epoch_loss.append(fold_k_epoch_k_loss)

                    if do_vali:
                        list_fold_k_vali_eval_track.append(vali_eval_v)

            elif loss_guided:  # stopping check via epoch-loss
                if first_round and torch_fold_k_epoch_k_loss >= threshold_epoch_loss:
                    print('Bad threshold: ', torch_fold_k_epoch_k_loss, threshold_epoch_loss)

                if torch_fold_k_epoch_k_loss < threshold_epoch_loss:
                    first_round = False
                    print('\tFold-', str(fold_k), ' Epoch-', str(epoch_k), 'Loss: ', torch_fold_k_epoch_k_loss)
                    threshold_epoch_loss = torch_fold_k_epoch_k_loss
                else:
                    print('\tStopped according to epoch-loss!', torch_fold_k_epoch_k_loss, threshold_epoch_loss)
                    break

        if do_summary:  # track
            sy_prefix = '_'.join(['Fold', str(fold_k)])
            fold_k_train_eval = np.vstack(list_fold_k_train_eval_track)
            fold_k_test_eval = np.vstack(list_fold_k_test_eval_track)
            pickle_save(fold_k_train_eval, file=self.dir_run + '_'.join([sy_prefix, 'train_eval.np']))
            pickle_save(fold_k_test_eval, file=self.dir_run + '_'.join([sy_prefix, 'test_eval.np']))

            fold_k_epoch_loss = np.hstack(list_epoch_loss)
            pickle_save((fold_k_epoch_loss, train_data.__len__()),
                        file=self.dir_run + '_'.join([sy_prefix, 'epoch_loss.np']))
            if do_vali:
                fold_k_vali_eval = np.hstack(list_fold_k_vali_eval_track)
                pickle_save(fold_k_vali_eval, file=self.dir_run + '_'.join([sy_prefix, 'vali_eval.np']))

        if do_vali:  # using the fold-wise optimal model for later testing based on validation data
            buffered_model = '_'.join(['net_params_epoch', str(fold_optimal_epoch_val)]) + '.pkl'
            ranker.load(self.dir_run + fold_optimal_checkpoint + '/' + buffered_model)
            fold_optimal_ranker = ranker
        else:  # buffer the model after a fixed number of training epochs if no validation is deployed
            fold_optimal_checkpoint = '-'.join(['Fold', str(fold_k)])
            ranker.save(dir=self.dir_run + fold_optimal_checkpoint + '/',
                        name='_'.join(['net_params_epoch', str(epoch_k)]) + '.pkl')
            fold_optimal_ranker = ranker

        torch_fold_ndcg_ks = ndcg_at_ks(ranker=fold_optimal_ranker, test_data=test_data, ks=cutoffs,
                                        gpu=self.gpu, device=self.device,
                                        label_type=self.data_setting.data_dict['label_type'])
        fold_ndcg_ks = torch_fold_ndcg_ks.cpu().numpy() if self.gpu else torch_fold_ndcg_ks.data.numpy()

        performance_list = [model_id + ' Fold-' + str(fold_k)]  # fold-wise performance
        for i, co in enumerate(cutoffs):
            performance_list.append('nDCG@{}:{:.4f}'.format(co, fold_ndcg_ks[i]))
        performance_str = '\t'.join(performance_list)
        print('\t', performance_str)

        l2r_cv_avg_scores = np.add(l2r_cv_avg_scores, fold_ndcg_ks)  # sum for later cv-performance

    time_end = datetime.datetime.now()  # overall timing
    elapsed_time_str = str(time_end - time_begin)
    print('Elapsed time:\t', elapsed_time_str + "\n\n")

    l2r_cv_avg_scores = np.divide(l2r_cv_avg_scores, fold_num)
    eval_prefix = str(fold_num) + '-fold cross validation scores:' if do_vali \
        else str(fold_num) + '-fold average scores:'
    print(model_id, eval_prefix,
          metric_results_to_string(list_scores=l2r_cv_avg_scores, list_cutoffs=cutoffs))
    # print either cv or average performance

    return l2r_cv_avg_scores
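# Illustrative sketch (the key names mirror those read inside kfold_cv_eval above; the concrete
# values are placeholders, not recommended settings): a minimal eval_dict for this evaluator.
def _demo_eval_dict():
    return dict(epochs=100,             # number of training epochs per fold
                do_validation=True,     # select the per-fold optimal model on validation data
                vali_k=5,               # cutoff for the validation nDCG
                cutoffs=[1, 3, 5, 10, 20],
                log_step=2,             # stepwise-check interval
                do_summary=False,       # buffer per-step train/test/vali tracks
                loss_guided=False)      # epoch-loss-based stopping when validation is off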
def get_cv_performance(self):
    time_end = datetime.datetime.now()  # overall timing
    elapsed_time_str = str(time_end - self.time_begin)

    ndcg_cv_avg_scores = np.divide(self.ndcg_cv_avg_scores, self.fold_num)
    nerr_cv_avg_scores = np.divide(self.nerr_cv_avg_scores, self.fold_num)
    ap_cv_avg_scores = np.divide(self.ap_cv_avg_scores, self.fold_num)
    p_cv_avg_scores = np.divide(self.p_cv_avg_scores, self.fold_num)

    eval_prefix = str(self.fold_num) + '-fold cross validation scores:' if self.do_validation \
        else str(self.fold_num) + '-fold average scores:'

    list_metric_strs = []
    list_metric_strs.append(metric_results_to_string(list_scores=ndcg_cv_avg_scores,
                                                     list_cutoffs=self.cutoffs, metric='nDCG'))
    list_metric_strs.append(metric_results_to_string(list_scores=nerr_cv_avg_scores,
                                                     list_cutoffs=self.cutoffs, metric='nERR'))
    list_metric_strs.append(metric_results_to_string(list_scores=ap_cv_avg_scores,
                                                     list_cutoffs=self.cutoffs, metric='AP'))
    list_metric_strs.append(metric_results_to_string(list_scores=p_cv_avg_scores,
                                                     list_cutoffs=self.cutoffs, metric='P'))
    metric_string = '\n'.join(list_metric_strs)
    print("\n{} {}\n{}".format(self.model_id, eval_prefix, metric_string))
    print('Elapsed time:\t', elapsed_time_str + "\n\n")

    if self.reproduce:
        torch_mat_per_q_p = torch.cat(self.list_per_q_p, dim=0)
        torch_mat_per_q_ap = torch.cat(self.list_per_q_ap, dim=0)
        torch_mat_per_q_nerr = torch.cat(self.list_per_q_nerr, dim=0)
        torch_mat_per_q_ndcg = torch.cat(self.list_per_q_ndcg, dim=0)
        #print('torch_mat_per_q_ndcg', torch_mat_per_q_ndcg.size())

        mat_per_q_p = torch_mat_per_q_p.data.numpy()
        mat_per_q_ap = torch_mat_per_q_ap.data.numpy()
        mat_per_q_nerr = torch_mat_per_q_nerr.data.numpy()
        mat_per_q_ndcg = torch_mat_per_q_ndcg.data.numpy()

        pickle_save(target=mat_per_q_p,
                    file=self.dir_run + '_'.join([self.model_id, 'all_fold_p_at_ks_per_q.np']))
        pickle_save(target=mat_per_q_ap,
                    file=self.dir_run + '_'.join([self.model_id, 'all_fold_ap_at_ks_per_q.np']))
        pickle_save(target=mat_per_q_nerr,
                    file=self.dir_run + '_'.join([self.model_id, 'all_fold_nerr_at_ks_per_q.np']))
        pickle_save(target=mat_per_q_ndcg,
                    file=self.dir_run + '_'.join([self.model_id, 'all_fold_ndcg_at_ks_per_q.np']))

    return ndcg_cv_avg_scores
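# Illustrative sketch (standalone sanity check with hypothetical numbers): get_cv_performance
# assumes the *_cv_avg_scores buffers hold per-fold sums, so dividing by fold_num yields the
# cross-fold mean at each cutoff.
def _demo_fold_averaging():
    fold_scores = [np.array([0.40, 0.45]), np.array([0.50, 0.55]), np.array([0.42, 0.47])]  # per-fold nDCG@ks
    running_sum = np.zeros(2)
    for scores in fold_scores:
        running_sum = np.add(running_sum, scores)  # same accumulation pattern as kfold_cv_eval
    cv_avg = np.divide(running_sum, len(fold_scores))
    assert np.allclose(cv_avg, np.mean(np.vstack(fold_scores), axis=0))
    print('cv average scores:', cv_avg)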
def __init__(self, split_type, list_as_file, data_id=None, data_dict=None, fold_dir=None, presort=True, alpha=0.5,
             dictQueryRepresentation=None, dictDocumentRepresentation=None, dictQueryPermutaion=None,
             dictQueryDocumentSubtopics=None, buffer=True, add_noise=False, std_delta=1.0):
    self.presort = presort
    self.add_noise = add_noise
    ''' split-specific settings '''
    self.split_type = split_type
    self.data_id = data_dict['data_id']
    assert presort is True  # since it is time-consuming to generate the ideal diversified ranking dynamically

    if data_dict['data_id'] in TREC_DIV:  # supported datasets
        torch_buffer_file = fold_dir.replace('folder', 'Bufferedfolder') + split_type.name
        if self.presort:
            torch_buffer_file = '_'.join([torch_buffer_file, 'presort', '{:,g}'.format(alpha)])
        if self.add_noise:
            torch_buffer_file = '_'.join([torch_buffer_file, 'gaussian', '{:,g}'.format(std_delta)])
        torch_buffer_file += '.torch'

        if os.path.exists(torch_buffer_file):
            print('loading buffered file ...')
            self.list_torch_Qs = pickle_load(torch_buffer_file)
        else:
            self.list_torch_Qs = []
            for qid in list_as_file:
                np_q_repr = dictQueryRepresentation[str(qid)]  # [1, 100]
                alphaDCG = dictQueryPermutaion[str(qid)]['alphaDCG']
                q_doc_subtopics = dictQueryDocumentSubtopics[str(qid)]
                perm_docs = dictQueryPermutaion[str(qid)]['permutation']
                if self.presort:
                    # print('json-alphaDCG', alphaDCG) # TODO the meaning of json-alphaDCG needs to be confirmed
                    ''' the following comparison shows that the provided permutation of docs is the ideal ranking '''
                    #print('personal-computation for json', alpha_DCG_at_k(sorted_docs=perm_docs, q_doc_subtopics=q_doc_subtopics, k=4, alpha=0.5))
                    perm_docs = get_div_ideal_ranking(pool_docs=perm_docs, q_doc_subtopics=q_doc_subtopics, alpha=alpha)
                    #print('personal-computation for ideal', alpha_DCG_at_k(sorted_docs=perm_docs, q_doc_subtopics=q_doc_subtopics, k=4, alpha=0.5))
                    #print('===')

                list_doc_reprs = []
                for doc in perm_docs:
                    list_doc_reprs.append(dictDocumentRepresentation[doc])  # [1, 100]
                np_doc_reprs = np.vstack(list_doc_reprs)  # [permutation_size, 100]

                q_repr = torch.from_numpy(np_q_repr).type(torch.FloatTensor)
                doc_reprs = torch.from_numpy(np_doc_reprs).type(torch.FloatTensor)

                if self.add_noise:  # add gaussian noise
                    q_noise = torch.normal(mean=torch.zeros_like(q_repr), std=std_delta)
                    doc_noise = torch.normal(mean=torch.zeros_like(doc_reprs), std=std_delta)
                    q_repr = torch.add(q_repr, q_noise)
                    doc_reprs = torch.add(doc_reprs, doc_noise)

                np_rele_mat = to_matrix(perm_docs=perm_docs, q_doc_subtopics=q_doc_subtopics)
                q_doc_rele_mat = torch.from_numpy(np_rele_mat).type(torch.FloatTensor)

                self.list_torch_Qs.append((qid, q_repr, perm_docs, doc_reprs, alphaDCG,
                                           q_doc_subtopics, q_doc_rele_mat))
            #print('Num of q:', len(self.list_torch_Qs))

            if buffer:
                parent_dir = Path(torch_buffer_file).parent
                if not os.path.exists(parent_dir):
                    os.makedirs(parent_dir)
                pickle_save(self.list_torch_Qs, torch_buffer_file)
    else:
        raise NotImplementedError
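# Illustrative sketch (standalone toy tensors): this isolates the Gaussian augmentation applied
# above when add_noise is enabled; zero-mean noise with a shared std of std_delta is added to
# both the query and document representations.
def _demo_gaussian_noise(std_delta=1.0):
    q_repr = torch.zeros(1, 100)      # toy query representation, [1, 100]
    doc_reprs = torch.zeros(10, 100)  # toy document representations, [permutation_size, 100]
    q_repr = torch.add(q_repr, torch.normal(mean=torch.zeros_like(q_repr), std=std_delta))
    doc_reprs = torch.add(doc_reprs, torch.normal(mean=torch.zeros_like(doc_reprs), std=std_delta))
    return q_repr, doc_reprs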
def __init__(self, train, file, data_id=None, data_dict=None, sample_rankings_per_q=1, shuffle=True, hot=False,
             eval_dict=None, buffer=True, given_scaler=None):
    assert data_id is not None or data_dict is not None
    if data_dict is None:
        data_dict = self.get_default_data_dict(data_id=data_id)

    self.train = train

    if data_dict['data_id'] in MSLETOR or data_dict['data_id'] in MSLRWEB \
            or data_dict['data_id'] in YAHOO_LTR or data_dict['data_id'] in YAHOO_LTR_5Fold \
            or data_dict['data_id'] in ISTELLA_LTR \
            or data_dict['data_id'] == 'IRGAN_MQ2008_Semi':  # supported datasets
        self.check_load_setting(data_dict, eval_dict)

        perquery_file = get_buffer_file_name(data_id=data_id, file=file, data_dict=data_dict)

        if sample_rankings_per_q > 1:
            if hot:
                torch_perquery_file = perquery_file.replace(
                    '.np', '_'.join(['SP', str(sample_rankings_per_q), 'Hot', '.torch']))
            else:
                torch_perquery_file = perquery_file.replace(
                    '.np', '_'.join(['SP', str(sample_rankings_per_q), '.torch']))
        else:
            if hot:
                torch_perquery_file = perquery_file.replace('.np', '_Hot.torch')
            else:
                torch_perquery_file = perquery_file.replace('.np', '.torch')

        if eval_dict is not None:
            mask_label, mask_ratio, mask_type = eval_dict['mask_label'], eval_dict['mask_ratio'], \
                                                eval_dict['mask_type']
            print(eval_dict)
            if mask_label:
                mask_label_str = '_'.join([mask_type, 'Ratio', '{:,g}'.format(mask_ratio)])
                torch_perquery_file = torch_perquery_file.replace('.torch', '_' + mask_label_str + '.torch')
        else:
            mask_label = False

        if os.path.exists(torch_perquery_file):
            print('loading buffered file ...')
            self.list_torch_Qs = pickle_load(torch_perquery_file)
        else:
            self.list_torch_Qs = []

            scale_data = data_dict['scale_data']
            scaler_id = data_dict['scaler_id'] if 'scaler_id' in data_dict else None
            list_Qs = iter_queries(in_file=file, data_dict=data_dict, scale_data=scale_data,
                                   scaler_id=scaler_id, perquery_file=perquery_file, buffer=buffer)

            list_inds = list(range(len(list_Qs)))
            for ind in list_inds:
                qid, doc_reprs, doc_labels = list_Qs[ind]

                if sample_rankings_per_q > 1:
                    assert mask_label is not True  # not supported since it is rarely used

                    list_ranking = []
                    list_labels = []
                    for _ in range(sample_rankings_per_q):
                        des_inds = np_arg_shuffle_ties(doc_labels, descending=True)  # sampling by shuffling ties
                        list_ranking.append(doc_reprs[des_inds])
                        list_labels.append(doc_labels[des_inds])

                    batch_rankings = np.stack(list_ranking, axis=0)
                    batch_std_labels = np.stack(list_labels, axis=0)

                    torch_batch_rankings = torch.from_numpy(batch_rankings).type(torch.FloatTensor)
                    torch_batch_std_labels = torch.from_numpy(batch_std_labels).type(torch.FloatTensor)
                else:
                    torch_batch_rankings = torch.from_numpy(doc_reprs).type(torch.FloatTensor)
                    torch_batch_rankings = torch.unsqueeze(torch_batch_rankings,
                                                           dim=0)  # a consistent batch dimension of size 1
                    torch_batch_std_labels = torch.from_numpy(doc_labels).type(torch.FloatTensor)
                    torch_batch_std_labels = torch.unsqueeze(torch_batch_std_labels, dim=0)

                if mask_label:  # masking
                    if mask_type == 'rand_mask_rele':
                        torch_batch_rankings, torch_batch_std_labels = random_mask_rele_labels(
                            batch_ranking=torch_batch_rankings, batch_label=torch_batch_std_labels,
                            mask_ratio=mask_ratio, mask_value=0, presort=data_dict['presort'])
                    elif mask_type == 'rand_mask_all':
                        masked_res = random_mask_all_labels(
                            batch_ranking=torch_batch_rankings, batch_label=torch_batch_std_labels,
                            mask_ratio=mask_ratio, mask_value=0, presort=data_dict['presort'])
                        if masked_res is not None:
                            torch_batch_rankings, torch_batch_std_labels = masked_res
                        else:
                            continue
                    else:
                        raise NotImplementedError

                if hot:
                    assert mask_label is not True  # not supported since it is rarely used
                    max_rele_level = data_dict['max_rele_level']
                    assert max_rele_level is not None

                    torch_batch_std_hot_labels = get_one_hot_reprs(torch_batch_std_labels)
                    batch_cnts = batch_count(batch_std_labels=torch_batch_std_labels,
                                             max_rele_grade=max_rele_level, descending=True)

                    self.list_torch_Qs.append((qid, torch_batch_rankings, torch_batch_std_labels,
                                               torch_batch_std_hot_labels, batch_cnts))
                else:
                    self.list_torch_Qs.append((qid, torch_batch_rankings, torch_batch_std_labels))
            #print('Num of q:', len(self.list_torch_Qs))

            if buffer:
                parent_dir = Path(torch_perquery_file).parent
                if not os.path.exists(parent_dir):
                    os.makedirs(parent_dir)
                pickle_save(self.list_torch_Qs, torch_perquery_file)
    else:
        raise NotImplementedError

    self.hot = hot
    self.shuffle = shuffle
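# Illustrative sketch (a simplified stand-in shown only to clarify "sampling by shuffling ties";
# the library's own np_arg_shuffle_ties should be preferred). Documents are ordered by descending
# label, but positions *within* each tied label group are randomly permuted, so repeated calls
# yield different yet equally valid ideal rankings.
def _demo_arg_shuffle_ties(labels):
    labels = np.asarray(labels)
    shuffled = np.random.permutation(len(labels))  # random order, which survives within ties
    # a stable sort of the shuffled positions by descending label keeps the shuffle inside groups
    des_inds = shuffled[np.argsort(-labels[shuffled], kind='stable')]
    return des_inds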
def iter_queries(in_file, data_dict=None, scale_data=None, scaler_id=None, perquery_file=None, buffer=True):
    '''
    Transforms an iterator of rows to an iterator of queries (i.e., a unit of all the documents and labels
    associated with the same query). Each query is represented by a (qid, feature_mat, std_label_vec) tuple.
    :param in_file: the input file in LETOR format
    :param data_dict: settings w.r.t. data, e.g., min_docs, min_rele, has_comment, unknown_as_zero
    :param scale_data: whether to perform query-level feature scaling, say normalization
    :param scaler_id: the scaler to use, e.g., MinMaxScaler | RobustScaler
    :param perquery_file: the buffer file for the per-query objects
    :param buffer: whether to buffer the per-query objects
    :return: a list of (qid, feature_mat, std_label_vec) tuples
    '''
    if os.path.exists(perquery_file):
        return pickle_load(perquery_file)

    if scale_data:
        scaler = get_scaler(scaler_id=scaler_id)

    presort, min_docs, min_rele = data_dict['presort'], data_dict['min_docs'], data_dict['min_rele']
    unknown_as_zero, binary_rele, has_comment = data_dict['unknown_as_zero'], data_dict['binary_rele'], \
                                                data_dict['has_comment']

    clip_query = False
    if min_rele is not None and min_rele > 0:
        clip_query = True
    if min_docs is not None and min_docs > 0:
        clip_query = True

    list_Qs = []
    with open(in_file, encoding='iso-8859-1') as file_obj:
        dict_data = dict()
        if has_comment:
            all_features_mat, all_labels_vec, qids, docids = parse_letor(file_obj.readlines(), has_comment=True)

            for i in range(len(qids)):
                f_vec = all_features_mat[i, :]
                std_s = all_labels_vec[i]
                qid = qids[i]
                docid = docids[i]

                if qid in dict_data:
                    dict_data[qid].append((std_s, docid, f_vec))
                else:
                    dict_data[qid] = [(std_s, docid, f_vec)]

            del all_features_mat

            # unique qids, preserving the order of first occurrence
            seen = set()
            seen_add = seen.add
            qids_unique = [x for x in qids if not (x in seen or seen_add(x))]

            for qid in qids_unique:
                tmp = list(zip(*dict_data[qid]))
                list_labels_per_q = tmp[0]
                if data_dict['data_id'] in MSLETOR_LIST:
                    ''' convert the original rank-position into grade-labels '''
                    ranking_size = len(list_labels_per_q)
                    list_labels_per_q = [ranking_size - r for r in list_labels_per_q]

                #list_docids_per_q = tmp[1]
                list_features_per_q = tmp[2]
                feature_mat = np.vstack(list_features_per_q)

                if scale_data:
                    if data_dict['data_id'] in ISTELLA_LTR:
                        # due to the possible extremely large features, e.g., 1.79769313486e+308
                        feature_mat = scaler.fit_transform(np.clip(feature_mat, a_min=None, a_max=ISTELLA_MAX))
                    else:
                        feature_mat = scaler.fit_transform(feature_mat)

                Q = clip_query_data(qid=qid, feature_mat=feature_mat, std_label_vec=np.array(list_labels_per_q),
                                    binary_rele=binary_rele, unknown_as_zero=unknown_as_zero,
                                    clip_query=clip_query, min_docs=min_docs, min_rele=min_rele, presort=presort)
                if Q is not None:
                    list_Qs.append(Q)
        else:
            all_features_mat, all_labels_vec, qids = parse_letor(file_obj.readlines(), has_comment=False)

            for i in range(len(qids)):
                f_vec = all_features_mat[i, :]
                std_s = all_labels_vec[i]
                qid = qids[i]

                if qid in dict_data:
                    dict_data[qid].append((std_s, f_vec))
                else:
                    dict_data[qid] = [(std_s, f_vec)]

            del all_features_mat

            # unique qids, preserving the order of first occurrence
            seen = set()
            seen_add = seen.add
            qids_unique = [x for x in qids if not (x in seen or seen_add(x))]

            for qid in qids_unique:
                tmp = list(zip(*dict_data[qid]))
                list_labels_per_q = tmp[0]
                if data_dict['data_id'] in MSLETOR_LIST:
                    ''' convert the original rank-position into grade-labels '''
                    ranking_size = len(list_labels_per_q)
                    list_labels_per_q = [ranking_size - r for r in list_labels_per_q]

                list_features_per_q = tmp[1]
                feature_mat = np.vstack(list_features_per_q)

                if scale_data:  # guard added: the scaler only exists when scale_data is enabled
                    if data_dict['data_id'] in ISTELLA_LTR:
                        # due to the possible extremely large features, e.g., 1.79769313486e+308
                        feature_mat = scaler.fit_transform(np.clip(feature_mat, a_min=None, a_max=ISTELLA_MAX))
                    else:
                        feature_mat = scaler.fit_transform(feature_mat)

                Q = clip_query_data(qid=qid, feature_mat=feature_mat, std_label_vec=np.array(list_labels_per_q),
                                    binary_rele=binary_rele, unknown_as_zero=unknown_as_zero,
                                    clip_query=clip_query, min_docs=min_docs, min_rele=min_rele, presort=presort)
                if Q is not None:
                    list_Qs.append(Q)

    if buffer:
        assert perquery_file is not None
        parent_dir = Path(perquery_file).parent
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)
        pickle_save(list_Qs, file=perquery_file)

    return list_Qs
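# Illustrative sketch (the file paths and data_dict values are hypothetical placeholders):
# each element returned by iter_queries is a (qid, feature_mat, std_label_vec) tuple, which
# this loop merely inspects.
def _demo_iter_queries(in_file='./MQ2008/Fold1/train.txt', perquery_file='./buffer/train.np'):
    data_dict = dict(data_id='MQ2008_Super', presort=True, min_docs=1, min_rele=1,
                     unknown_as_zero=False, binary_rele=False, has_comment=True)
    list_Qs = iter_queries(in_file=in_file, data_dict=data_dict, scale_data=False,
                           perquery_file=perquery_file, buffer=False)
    for qid, feature_mat, std_label_vec in list_Qs:
        print(qid, feature_mat.shape, std_label_vec.shape)  # e.g., qid, [num_docs, num_features], [num_docs]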