def fold_evaluation(self, ranker, test_data, max_label, fold_k, model_id):
    """ Evaluate the ranker on the test data of one fold and accumulate the fold-wise scores. """
    avg_ndcg_at_ks, avg_nerr_at_ks, avg_ap_at_ks, avg_p_at_ks = ranker.adhoc_performance_at_ks(
        test_data=test_data, ks=self.cutoffs, device='cpu', max_label=max_label)
    fold_ndcg_ks = avg_ndcg_at_ks.data.numpy()
    fold_nerr_ks = avg_nerr_at_ks.data.numpy()
    fold_ap_ks = avg_ap_at_ks.data.numpy()
    fold_p_ks = avg_p_at_ks.data.numpy()

    # accumulate fold-wise scores for the later cross-validation average
    self.ndcg_cv_avg_scores = np.add(self.ndcg_cv_avg_scores, fold_ndcg_ks)
    self.nerr_cv_avg_scores = np.add(self.nerr_cv_avg_scores, fold_nerr_ks)
    self.ap_cv_avg_scores = np.add(self.ap_cv_avg_scores, fold_ap_ks)
    self.p_cv_avg_scores = np.add(self.p_cv_avg_scores, fold_p_ks)

    list_metric_strs = []
    list_metric_strs.append(metric_results_to_string(list_scores=fold_ndcg_ks, list_cutoffs=self.cutoffs, metric='nDCG'))
    list_metric_strs.append(metric_results_to_string(list_scores=fold_nerr_ks, list_cutoffs=self.cutoffs, metric='nERR'))
    list_metric_strs.append(metric_results_to_string(list_scores=fold_ap_ks, list_cutoffs=self.cutoffs, metric='AP'))
    list_metric_strs.append(metric_results_to_string(list_scores=fold_p_ks, list_cutoffs=self.cutoffs, metric='P'))
    metric_string = '\n\t'.join(list_metric_strs)
    print("\n{} on Fold - {}\n\t{}".format(model_id, str(fold_k), metric_string))
def fold_evaluation_reproduce(self, ranker, test_data, dir_run, max_label, fold_k, model_id, device='cpu'):
    """ Reload the buffered optimal model of one fold and evaluate it, keeping per-query results. """
    self.dir_run = dir_run
    subdir = '-'.join(['Fold', str(fold_k)])
    run_fold_k_dir = os.path.join(dir_run, subdir)
    fold_k_buffered_model_names = os.listdir(run_fold_k_dir)
    fold_opt_model_name = get_opt_model(fold_k_buffered_model_names)
    fold_opt_model = os.path.join(run_fold_k_dir, fold_opt_model_name)
    ranker.load(file_model=fold_opt_model, device=device)

    avg_ndcg_at_ks, avg_nerr_at_ks, avg_ap_at_ks, avg_p_at_ks, \
    list_per_q_ndcg, list_per_q_nerr, list_per_q_ap, list_per_q_p = \
        ranker.adhoc_performance_at_ks(test_data=test_data, ks=self.cutoffs, device='cpu',
                                       max_label=max_label, need_per_q=True)
    fold_ndcg_ks = avg_ndcg_at_ks.data.numpy()
    fold_nerr_ks = avg_nerr_at_ks.data.numpy()
    fold_ap_ks = avg_ap_at_ks.data.numpy()
    fold_p_ks = avg_p_at_ks.data.numpy()

    # keep per-query results across folds for later statistical analysis
    self.list_per_q_p.extend(list_per_q_p)
    self.list_per_q_ap.extend(list_per_q_ap)
    self.list_per_q_nerr.extend(list_per_q_nerr)
    self.list_per_q_ndcg.extend(list_per_q_ndcg)

    self.ndcg_cv_avg_scores = np.add(self.ndcg_cv_avg_scores, fold_ndcg_ks)
    self.nerr_cv_avg_scores = np.add(self.nerr_cv_avg_scores, fold_nerr_ks)
    self.ap_cv_avg_scores = np.add(self.ap_cv_avg_scores, fold_ap_ks)
    self.p_cv_avg_scores = np.add(self.p_cv_avg_scores, fold_p_ks)

    list_metric_strs = []
    list_metric_strs.append(metric_results_to_string(list_scores=fold_ndcg_ks, list_cutoffs=self.cutoffs, metric='nDCG'))
    list_metric_strs.append(metric_results_to_string(list_scores=fold_nerr_ks, list_cutoffs=self.cutoffs, metric='nERR'))
    list_metric_strs.append(metric_results_to_string(list_scores=fold_ap_ks, list_cutoffs=self.cutoffs, metric='AP'))
    list_metric_strs.append(metric_results_to_string(list_scores=fold_p_ks, list_cutoffs=self.cutoffs, metric='P'))
    metric_string = '\n\t'.join(list_metric_strs)
    print("\n{} on Fold - {}\n\t{}".format(model_id, str(fold_k), metric_string))
def log_max(self, data_dict=None, max_cv_avg_scores=None, sf_para_dict=None, eval_dict=None, log_para_str=None):
    """ Log the best performance across the grid search and the corresponding setting. """
    dir_root, cutoffs = eval_dict['dir_root'], eval_dict['cutoffs']
    data_id = data_dict['data_id']

    sf_str = self.sf_parameter.to_para_string(log=True)
    data_eval_str = self.data_setting.to_data_setting_string(log=True) + '\n' \
                    + self.eval_setting.to_eval_setting_string(log=True)

    with open(file=dir_root + '/' + '_'.join([data_id, sf_para_dict['sf_id'], 'max.txt']), mode='w') as max_writer:
        max_writer.write('\n\n'.join([data_eval_str, sf_str, log_para_str,
                                      metric_results_to_string(max_cv_avg_scores, cutoffs, metric='aNDCG')]))
def kfold_cv_eval(self, data_dict=None, eval_dict=None, sf_para_dict=None, model_para_dict=None):
    """
    Evaluate learning-to-rank methods via k-fold cross validation if there are k folds, otherwise one fold.
    :param data_dict: settings w.r.t. data
    :param eval_dict: settings w.r.t. evaluation
    :param sf_para_dict: settings w.r.t. the scoring function
    :param model_para_dict: settings w.r.t. the ltr_adhoc model
    :return: the cross-validation average scores
    """
    self.display_information(data_dict, model_para_dict)
    self.check_consistency(data_dict, eval_dict, sf_para_dict)
    self.setup_eval(data_dict, eval_dict, sf_para_dict, model_para_dict)

    model_id = model_para_dict['model_id']
    fold_num = data_dict['fold_num']
    # for quick access of common evaluation settings
    epochs, loss_guided = eval_dict['epochs'], eval_dict['loss_guided']
    vali_k, log_step, cutoffs = eval_dict['vali_k'], eval_dict['log_step'], eval_dict['cutoffs']
    do_vali, do_summary = eval_dict['do_validation'], eval_dict['do_summary']

    ranker = self.load_ranker(model_para_dict=model_para_dict, sf_para_dict=sf_para_dict)

    time_begin = datetime.datetime.now()  # timing
    l2r_cv_avg_scores = np.zeros(len(cutoffs))  # fold average

    for fold_k in range(1, fold_num + 1):  # evaluation over k-fold data
        ranker.reset_parameters()  # reset with the same random initialization

        train_data, test_data, vali_data = self.load_data(eval_dict, data_dict, fold_k)

        if do_vali: fold_optimal_ndcgk = 0.0
        if do_summary:
            list_epoch_loss, list_fold_k_train_eval_track, list_fold_k_test_eval_track, list_fold_k_vali_eval_track = [], [], [], []
        if not do_vali and loss_guided:
            first_round = True
            threshold_epoch_loss = torch.cuda.FloatTensor([10000000.0]) if self.gpu else torch.FloatTensor([10000000.0])

        for epoch_k in range(1, epochs + 1):
            torch_fold_k_epoch_k_loss, stop_training = self.train_ranker(
                ranker=ranker, train_data=train_data, model_para_dict=model_para_dict, epoch_k=epoch_k)

            ranker.scheduler.step()  # adaptive learning rate with step_size=40, gamma=0.5

            if stop_training:
                print('Training failed!')
                break

            if (do_summary or do_vali) and (epoch_k % log_step == 0 or epoch_k == 1):  # stepwise check
                if do_vali:  # per-step validation score
                    vali_eval_tmp = ndcg_at_k(ranker=ranker, test_data=vali_data, k=vali_k, gpu=self.gpu,
                                              device=self.device, label_type=self.data_setting.data_dict['label_type'])
                    vali_eval_v = vali_eval_tmp.data.numpy()
                    if epoch_k > 1:  # further validation comparison
                        curr_vali_ndcg = vali_eval_v
                        if (curr_vali_ndcg > fold_optimal_ndcgk) or \
                                (epoch_k == epochs and curr_vali_ndcg == fold_optimal_ndcgk):
                            # we need at least one reference model, in case all validation scores are zero
                            print('\t', epoch_k, '- nDCG@{} - '.format(vali_k), curr_vali_ndcg)
                            fold_optimal_ndcgk = curr_vali_ndcg
                            fold_optimal_checkpoint = '-'.join(['Fold', str(fold_k)])
                            fold_optimal_epoch_val = epoch_k
                            ranker.save(dir=self.dir_run + fold_optimal_checkpoint + '/',
                                        name='_'.join(['net_params_epoch', str(epoch_k)]) + '.pkl')  # buffer the currently optimal model
                        else:
                            print('\t\t', epoch_k, '- nDCG@{} - '.format(vali_k), curr_vali_ndcg)

                if do_summary:  # summarize per-step performance w.r.t. train, test
                    fold_k_epoch_k_train_ndcg_ks = ndcg_at_ks(ranker=ranker, test_data=train_data, ks=cutoffs,
                                                              gpu=self.gpu, device=self.device,
                                                              label_type=self.data_setting.data_dict['label_type'])
                    np_fold_k_epoch_k_train_ndcg_ks = fold_k_epoch_k_train_ndcg_ks.cpu().numpy() if self.gpu \
                        else fold_k_epoch_k_train_ndcg_ks.data.numpy()
                    list_fold_k_train_eval_track.append(np_fold_k_epoch_k_train_ndcg_ks)

                    fold_k_epoch_k_test_ndcg_ks = ndcg_at_ks(ranker=ranker, test_data=test_data, ks=cutoffs,
                                                             gpu=self.gpu, device=self.device,
                                                             label_type=self.data_setting.data_dict['label_type'])
                    np_fold_k_epoch_k_test_ndcg_ks = fold_k_epoch_k_test_ndcg_ks.cpu().numpy() if self.gpu \
                        else fold_k_epoch_k_test_ndcg_ks.data.numpy()
                    list_fold_k_test_eval_track.append(np_fold_k_epoch_k_test_ndcg_ks)

                    fold_k_epoch_k_loss = torch_fold_k_epoch_k_loss.cpu().numpy() if self.gpu \
                        else torch_fold_k_epoch_k_loss.data.numpy()
                    list_epoch_loss.append(fold_k_epoch_k_loss)

                    if do_vali: list_fold_k_vali_eval_track.append(vali_eval_v)

            elif loss_guided:  # stopping check via epoch loss
                if first_round and torch_fold_k_epoch_k_loss >= threshold_epoch_loss:
                    print('Bad threshold: ', torch_fold_k_epoch_k_loss, threshold_epoch_loss)
                if torch_fold_k_epoch_k_loss < threshold_epoch_loss:
                    first_round = False
                    print('\tFold-', str(fold_k), ' Epoch-', str(epoch_k), 'Loss: ', torch_fold_k_epoch_k_loss)
                    threshold_epoch_loss = torch_fold_k_epoch_k_loss
                else:
                    print('\tStopped according to epoch loss!', torch_fold_k_epoch_k_loss, threshold_epoch_loss)
                    break

        if do_summary:  # track per-step results of this fold
            sy_prefix = '_'.join(['Fold', str(fold_k)])
            fold_k_train_eval = np.vstack(list_fold_k_train_eval_track)
            fold_k_test_eval = np.vstack(list_fold_k_test_eval_track)
            pickle_save(fold_k_train_eval, file=self.dir_run + '_'.join([sy_prefix, 'train_eval.np']))
            pickle_save(fold_k_test_eval, file=self.dir_run + '_'.join([sy_prefix, 'test_eval.np']))

            fold_k_epoch_loss = np.hstack(list_epoch_loss)
            pickle_save((fold_k_epoch_loss, train_data.__len__()),
                        file=self.dir_run + '_'.join([sy_prefix, 'epoch_loss.np']))
            if do_vali:
                fold_k_vali_eval = np.hstack(list_fold_k_vali_eval_track)
                pickle_save(fold_k_vali_eval, file=self.dir_run + '_'.join([sy_prefix, 'vali_eval.np']))

        if do_vali:  # use the fold-wise optimal model (w.r.t. validation data) for later testing
            buffered_model = '_'.join(['net_params_epoch', str(fold_optimal_epoch_val)]) + '.pkl'
            ranker.load(self.dir_run + fold_optimal_checkpoint + '/' + buffered_model)
            fold_optimal_ranker = ranker
        else:  # buffer the model after a fixed number of training epochs if no validation is deployed
            fold_optimal_checkpoint = '-'.join(['Fold', str(fold_k)])
            ranker.save(dir=self.dir_run + fold_optimal_checkpoint + '/',
                        name='_'.join(['net_params_epoch', str(epoch_k)]) + '.pkl')
            fold_optimal_ranker = ranker

        torch_fold_ndcg_ks = ndcg_at_ks(ranker=fold_optimal_ranker, test_data=test_data, ks=cutoffs, gpu=self.gpu,
                                        device=self.device, label_type=self.data_setting.data_dict['label_type'])
        fold_ndcg_ks = torch_fold_ndcg_ks.data.numpy()

        performance_list = [model_id + ' Fold-' + str(fold_k)]  # fold-wise performance
        for i, co in enumerate(cutoffs):
            performance_list.append('nDCG@{}:{:.4f}'.format(co, fold_ndcg_ks[i]))
        performance_str = '\t'.join(performance_list)
        print('\t', performance_str)

        l2r_cv_avg_scores = np.add(l2r_cv_avg_scores, fold_ndcg_ks)  # sum for later cv-performance

    time_end = datetime.datetime.now()  # overall timing
    elapsed_time_str = str(time_end - time_begin)
    print('Elapsed time:\t', elapsed_time_str + "\n\n")

    l2r_cv_avg_scores = np.divide(l2r_cv_avg_scores, fold_num)
    eval_prefix = str(fold_num) + '-fold cross validation scores:' if do_vali \
        else str(fold_num) + '-fold average scores:'
    # print either cv or average performance
    print(model_id, eval_prefix, metric_results_to_string(list_scores=l2r_cv_avg_scores, list_cutoffs=cutoffs))

    return l2r_cv_avg_scores
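# --------------------------------------------------------------------------- #
# A minimal usage sketch of kfold_cv_eval() (illustrative only): the dict keys
# below are exactly the ones the method reads; the concrete values and the
# evaluator class name are hypothetical placeholders, not settings from this file.
#
#   data_dict = dict(data_id='MSLRWEB30K', fold_num=5, ...)        # plus data-loading keys
#   eval_dict = dict(dir_root='/path/to/output/', epochs=100, loss_guided=False,
#                    vali_k=5, log_step=2, cutoffs=[1, 3, 5, 10, 20],
#                    do_validation=True, do_summary=False)
#   sf_para_dict = ...                                             # scoring-function setting
#   model_para_dict = dict(model_id='RankNet', ...)                # ltr_adhoc model setting
#
#   evaluator = LTREvaluator(...)                                  # hypothetical wrapper class
#   cv_scores = evaluator.kfold_cv_eval(data_dict=data_dict, eval_dict=eval_dict,
#                                       sf_para_dict=sf_para_dict,
#                                       model_para_dict=model_para_dict)
# --------------------------------------------------------------------------- #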
def get_cv_performance(self):
    """ Average the accumulated fold-wise scores and, if reproducing, buffer the per-query results. """
    time_end = datetime.datetime.now()  # overall timing
    elapsed_time_str = str(time_end - self.time_begin)

    ndcg_cv_avg_scores = np.divide(self.ndcg_cv_avg_scores, self.fold_num)
    nerr_cv_avg_scores = np.divide(self.nerr_cv_avg_scores, self.fold_num)
    ap_cv_avg_scores = np.divide(self.ap_cv_avg_scores, self.fold_num)
    p_cv_avg_scores = np.divide(self.p_cv_avg_scores, self.fold_num)

    eval_prefix = str(self.fold_num) + '-fold cross validation scores:' if self.do_validation \
        else str(self.fold_num) + '-fold average scores:'

    list_metric_strs = []
    list_metric_strs.append(metric_results_to_string(list_scores=ndcg_cv_avg_scores, list_cutoffs=self.cutoffs, metric='nDCG'))
    list_metric_strs.append(metric_results_to_string(list_scores=nerr_cv_avg_scores, list_cutoffs=self.cutoffs, metric='nERR'))
    list_metric_strs.append(metric_results_to_string(list_scores=ap_cv_avg_scores, list_cutoffs=self.cutoffs, metric='AP'))
    list_metric_strs.append(metric_results_to_string(list_scores=p_cv_avg_scores, list_cutoffs=self.cutoffs, metric='P'))
    metric_string = '\n'.join(list_metric_strs)
    print("\n{} {}\n{}".format(self.model_id, eval_prefix, metric_string))
    print('Elapsed time:\t', elapsed_time_str + "\n\n")

    if self.reproduce:
        torch_mat_per_q_p = torch.cat(self.list_per_q_p, dim=0)
        torch_mat_per_q_ap = torch.cat(self.list_per_q_ap, dim=0)
        torch_mat_per_q_nerr = torch.cat(self.list_per_q_nerr, dim=0)
        torch_mat_per_q_ndcg = torch.cat(self.list_per_q_ndcg, dim=0)

        mat_per_q_p = torch_mat_per_q_p.data.numpy()
        mat_per_q_ap = torch_mat_per_q_ap.data.numpy()
        mat_per_q_nerr = torch_mat_per_q_nerr.data.numpy()
        mat_per_q_ndcg = torch_mat_per_q_ndcg.data.numpy()

        # buffer per-query results over all folds for later statistical analysis
        pickle_save(target=mat_per_q_p,
                    file=self.dir_run + '_'.join([self.model_id, 'all_fold_p_at_ks_per_q.np']))
        pickle_save(target=mat_per_q_ap,
                    file=self.dir_run + '_'.join([self.model_id, 'all_fold_ap_at_ks_per_q.np']))
        pickle_save(target=mat_per_q_nerr,
                    file=self.dir_run + '_'.join([self.model_id, 'all_fold_nerr_at_ks_per_q.np']))
        pickle_save(target=mat_per_q_ndcg,
                    file=self.dir_run + '_'.join([self.model_id, 'all_fold_ndcg_at_ks_per_q.np']))

    return ndcg_cv_avg_scores
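# --------------------------------------------------------------------------- #
# Why the per-query matrices are buffered above: with reproduce=True, each row of
# e.g. '<model_id>_all_fold_ndcg_at_ks_per_q.np' holds one query's scores at each
# cutoff, concatenated over all folds. A minimal sketch of a paired significance
# test between two such buffers, assuming a pickle_load counterpart to the
# pickle_save used above and scipy being available (both are assumptions, and the
# file names are illustrative):
#
#   from scipy import stats
#
#   mat_a = pickle_load('ModelA_all_fold_ndcg_at_ks_per_q.np')  # [num_queries, num_cutoffs]
#   mat_b = pickle_load('ModelB_all_fold_ndcg_at_ks_per_q.np')
#   k_index = 2                                                 # e.g., the column for nDCG@5
#   t_stat, p_value = stats.ttest_rel(mat_a[:, k_index], mat_b[:, k_index])
# --------------------------------------------------------------------------- #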
def ad_cv_eval(self, data_dict=None, eval_dict=None, ad_para_dict=None, sf_para_dict=None):
    """
    Adversarial training and evaluation.
    :param data_dict: settings w.r.t. data
    :param eval_dict: settings w.r.t. evaluation
    :param ad_para_dict: settings w.r.t. the adversarial model
    :param sf_para_dict: settings w.r.t. the scoring function
    :return:
    """
    self.check_consistency(data_dict, eval_dict)
    self.display_information(data_dict, model_para_dict=ad_para_dict)
    self.setup_eval(data_dict, eval_dict, sf_para_dict, model_para_dict=ad_para_dict)

    model_id = ad_para_dict['model_id']
    fold_num = data_dict['fold_num']
    # for quick access of common evaluation settings
    epochs, loss_guided = eval_dict['epochs'], eval_dict['loss_guided']
    vali_k, log_step, cutoffs = eval_dict['vali_k'], eval_dict['log_step'], eval_dict['cutoffs']
    do_vali, do_summary = eval_dict['do_validation'], eval_dict['do_summary']

    if sf_para_dict['id'] == 'ffnns':
        sf_para_dict['ffnns'].update(dict(num_features=data_dict['num_features']))
    else:
        raise NotImplementedError

    ad_machine = self.get_ad_machine(eval_dict=eval_dict, data_dict=data_dict, sf_para_dict=sf_para_dict,
                                     ad_para_dict=ad_para_dict)

    time_begin = datetime.datetime.now()  # timing
    g_l2r_cv_avg_scores, d_l2r_cv_avg_scores = np.zeros(len(cutoffs)), np.zeros(len(cutoffs))  # fold average

    for fold_k in range(1, fold_num + 1):
        dict_buffer = dict()  # for buffering frequently used objects
        ad_machine.reset_generator_discriminator()

        fold_optimal_checkpoint = '-'.join(['Fold', str(fold_k)])

        train_data, test_data, vali_data = self.load_data(eval_dict, data_dict, fold_k)

        if do_vali: g_fold_optimal_ndcgk, d_fold_optimal_ndcgk = 0.0, 0.0
        if do_summary:
            list_epoch_loss = []  # not used yet
            g_list_fold_k_train_eval_track, g_list_fold_k_test_eval_track, g_list_fold_k_vali_eval_track = [], [], []
            d_list_fold_k_train_eval_track, d_list_fold_k_test_eval_track, d_list_fold_k_vali_eval_track = [], [], []

        for _ in range(10):
            ad_machine.burn_in(train_data=train_data)

        for epoch_k in range(1, epochs + 1):
            if model_id == 'IR_GMAN_List':
                stop_training = ad_machine.mini_max_train(train_data=train_data, generator=ad_machine.generator,
                                                          pool_discriminator=ad_machine.pool_discriminator,
                                                          dict_buffer=dict_buffer)
                g_ranker = ad_machine.get_generator()
                d_ranker = ad_machine.pool_discriminator[0]
            else:
                stop_training = ad_machine.mini_max_train(train_data=train_data, generator=ad_machine.generator,
                                                          discriminator=ad_machine.discriminator,
                                                          dict_buffer=dict_buffer)
                g_ranker = ad_machine.get_generator()
                d_ranker = ad_machine.get_discriminator()

            if stop_training:
                print('Training failed!')
                break

            if (do_summary or do_vali) and (epoch_k % log_step == 0 or epoch_k == 1):  # stepwise check
                if do_vali:
                    g_vali_eval_tmp = ndcg_at_k(ranker=g_ranker, test_data=vali_data, k=vali_k,
                                                multi_level_rele=self.data_setting.data_dict['multi_level_rele'],
                                                batch_mode=True)
                    d_vali_eval_tmp = ndcg_at_k(ranker=d_ranker, test_data=vali_data, k=vali_k,
                                                multi_level_rele=self.data_setting.data_dict['multi_level_rele'],
                                                batch_mode=True)
                    g_vali_eval_v, d_vali_eval_v = g_vali_eval_tmp.data.numpy(), d_vali_eval_tmp.data.numpy()

                    if epoch_k > 1:
                        g_buffer, g_tmp_metric_val, g_tmp_epoch = self.per_epoch_validation(
                            ranker=g_ranker, curr_metric_val=g_vali_eval_v,
                            fold_optimal_metric_val=g_fold_optimal_ndcgk, curr_epoch=epoch_k, id_str='G',
                            fold_optimal_checkpoint=fold_optimal_checkpoint, epochs=epochs)
                        # observed better performance
                        if g_buffer: g_fold_optimal_ndcgk, g_fold_optimal_epoch_val = g_tmp_metric_val, g_tmp_epoch

                        d_buffer, d_tmp_metric_val, d_tmp_epoch = self.per_epoch_validation(
                            ranker=d_ranker, curr_metric_val=d_vali_eval_v,
                            fold_optimal_metric_val=d_fold_optimal_ndcgk, curr_epoch=epoch_k, id_str='D',
                            fold_optimal_checkpoint=fold_optimal_checkpoint, epochs=epochs)
                        if d_buffer: d_fold_optimal_ndcgk, d_fold_optimal_epoch_val = d_tmp_metric_val, d_tmp_epoch

                if do_summary:  # summarize per-step performance w.r.t. train, test
                    self.per_epoch_summary_step1(ranker=g_ranker, train_data=train_data, test_data=test_data,
                                                 list_fold_k_train_eval_track=g_list_fold_k_train_eval_track,
                                                 list_fold_k_test_eval_track=g_list_fold_k_test_eval_track,
                                                 vali_eval_v=g_vali_eval_v,
                                                 list_fold_k_vali_eval_track=g_list_fold_k_vali_eval_track,
                                                 cutoffs=cutoffs, do_vali=do_vali)
                    self.per_epoch_summary_step1(ranker=d_ranker, train_data=train_data, test_data=test_data,
                                                 list_fold_k_train_eval_track=d_list_fold_k_train_eval_track,
                                                 list_fold_k_test_eval_track=d_list_fold_k_test_eval_track,
                                                 vali_eval_v=d_vali_eval_v,
                                                 list_fold_k_vali_eval_track=d_list_fold_k_vali_eval_track,
                                                 cutoffs=cutoffs, do_vali=do_vali)

        if do_summary:
            self.per_epoch_summary_step2(id_str='G', fold_k=fold_k,
                                         list_fold_k_train_eval_track=g_list_fold_k_train_eval_track,
                                         list_fold_k_test_eval_track=g_list_fold_k_test_eval_track,
                                         do_vali=do_vali,
                                         list_fold_k_vali_eval_track=g_list_fold_k_vali_eval_track)
            self.per_epoch_summary_step2(id_str='D', fold_k=fold_k,
                                         list_fold_k_train_eval_track=d_list_fold_k_train_eval_track,
                                         list_fold_k_test_eval_track=d_list_fold_k_test_eval_track,
                                         do_vali=do_vali,
                                         list_fold_k_vali_eval_track=d_list_fold_k_vali_eval_track)

        if do_vali:  # use the fold-wise optimal models (w.r.t. validation data) for later testing
            g_buffered_model = '_'.join(['net_params_epoch', str(g_fold_optimal_epoch_val), 'G']) + '.pkl'
            g_ranker.load(self.dir_run + fold_optimal_checkpoint + '/' + g_buffered_model)
            g_fold_optimal_ranker = g_ranker

            d_buffered_model = '_'.join(['net_params_epoch', str(d_fold_optimal_epoch_val), 'D']) + '.pkl'
            d_ranker.load(self.dir_run + fold_optimal_checkpoint + '/' + d_buffered_model)
            d_fold_optimal_ranker = d_ranker
        else:  # using default G; buffer the models after a fixed number of training epochs if no validation is deployed
            g_ranker.save(dir=self.dir_run + fold_optimal_checkpoint + '/',
                          name='_'.join(['net_params_epoch', str(epoch_k), 'G']) + '.pkl')
            g_fold_optimal_ranker = g_ranker

            d_ranker.save(dir=self.dir_run + fold_optimal_checkpoint + '/',
                          name='_'.join(['net_params_epoch', str(epoch_k), 'D']) + '.pkl')
            d_fold_optimal_ranker = d_ranker

        g_torch_fold_ndcg_ks = ndcg_at_ks(ranker=g_fold_optimal_ranker, test_data=test_data, ks=cutoffs,
                                          multi_level_rele=self.data_setting.data_dict['multi_level_rele'],
                                          batch_mode=True)
        g_fold_ndcg_ks = g_torch_fold_ndcg_ks.data.numpy()

        d_torch_fold_ndcg_ks = ndcg_at_ks(ranker=d_fold_optimal_ranker, test_data=test_data, ks=cutoffs,
                                          multi_level_rele=self.data_setting.data_dict['multi_level_rele'],
                                          batch_mode=True)
        d_fold_ndcg_ks = d_torch_fold_ndcg_ks.data.numpy()

        performance_list = [' Fold-' + str(fold_k)]  # fold-wise performance
        performance_list.append('Generator')
        for i, co in enumerate(cutoffs):
            performance_list.append('nDCG@{}:{:.4f}'.format(co, g_fold_ndcg_ks[i]))
        performance_list.append('\nDiscriminator')
        for i, co in enumerate(cutoffs):
            performance_list.append('nDCG@{}:{:.4f}'.format(co, d_fold_ndcg_ks[i]))
        performance_str = '\t'.join(performance_list)
        print('\t', performance_str)

        g_l2r_cv_avg_scores = np.add(g_l2r_cv_avg_scores, g_fold_ndcg_ks)  # sum for later cv-performance
        d_l2r_cv_avg_scores = np.add(d_l2r_cv_avg_scores, d_fold_ndcg_ks)

    time_end = datetime.datetime.now()  # overall timing
    elapsed_time_str = str(time_end - time_begin)
    print('Elapsed time:\t', elapsed_time_str + "\n\n")

    # print either cv or average performance
    g_l2r_cv_avg_scores = np.divide(g_l2r_cv_avg_scores, fold_num)
    d_l2r_cv_avg_scores = np.divide(d_l2r_cv_avg_scores, fold_num)

    if do_vali:
        eval_prefix = str(fold_num) + '-fold cross validation scores:'
    else:
        eval_prefix = str(fold_num) + '-fold average scores:'

    print('Generator', eval_prefix,
          metric_results_to_string(list_scores=g_l2r_cv_avg_scores, list_cutoffs=cutoffs))
    print('Discriminator', eval_prefix,
          metric_results_to_string(list_scores=d_l2r_cv_avg_scores, list_cutoffs=cutoffs))
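# --------------------------------------------------------------------------- #
# Conceptually, each call to ad_machine.mini_max_train() above performs one round
# of the IRGAN-style minimax alternation. A schematic sketch (pseudo-Python; all
# method names other than mini_max_train are illustrative, not this module's API):
#
#   for epoch in range(epochs):
#       # discriminator step: learn to separate ground-truth rankings from generated ones
#       d_loss = discriminator.update(true_rankings, generator.sample(train_data))
#       # generator step: policy-gradient update using the discriminator's output as reward
#       g_loss = generator.update(reward=discriminator.score(generator.sample(train_data)))
# --------------------------------------------------------------------------- #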
def fold_evaluation_reproduce(self, ranker, test_data, dir_run, max_label, fold_k, model_id):
    """ Reload the buffered optimal model of one fold and evaluate it w.r.t. diversified ranking metrics. """
    self.dir_run = dir_run
    subdir = '-'.join(['Fold', str(fold_k)])
    run_fold_k_dir = os.path.join(dir_run, subdir)
    fold_k_buffered_model_names = os.listdir(run_fold_k_dir)
    fold_opt_model_name = get_opt_model(fold_k_buffered_model_names)
    fold_opt_model = os.path.join(run_fold_k_dir, fold_opt_model_name)
    ranker.load(file_model=fold_opt_model)

    avg_andcg_at_ks, avg_err_ia_at_ks, avg_nerr_ia_at_ks, list_per_q_andcg = \
        ranker.srd_performance_at_ks(test_data=test_data, ks=self.cutoffs, device='cpu', max_label=max_label,
                                     generate_div_run=True, dir=run_fold_k_dir, fold_k=fold_k, need_per_q_andcg=True)
    fold_andcg_ks = avg_andcg_at_ks.data.numpy()
    fold_err_ia_ks = avg_err_ia_at_ks.data.numpy()
    fold_nerr_ia_ks = avg_nerr_ia_at_ks.data.numpy()

    self.list_per_q_andcg.extend(list_per_q_andcg)

    self.andcg_cv_avg_scores = np.add(self.andcg_cv_avg_scores, fold_andcg_ks)
    self.err_ia_cv_avg_scores = np.add(self.err_ia_cv_avg_scores, fold_err_ia_ks)
    self.nerr_ia_cv_avg_scores = np.add(self.nerr_ia_cv_avg_scores, fold_nerr_ia_ks)

    list_metric_strs = []
    list_metric_strs.append(metric_results_to_string(list_scores=fold_andcg_ks, list_cutoffs=self.cutoffs, metric='aNDCG'))
    list_metric_strs.append(metric_results_to_string(list_scores=fold_err_ia_ks, list_cutoffs=self.cutoffs, metric='ERR-IA'))
    list_metric_strs.append(metric_results_to_string(list_scores=fold_nerr_ia_ks, list_cutoffs=self.cutoffs, metric='nERR-IA'))
    metric_string = '\n\t'.join(list_metric_strs)
    print("\n{} on Fold - {}\n\t{}".format(model_id, str(fold_k), metric_string))

    # cross-check with the official ndeval tool over the generated run file
    p_ndeval = subprocess.Popen(['../../ptranking/metric/srd/ndeval',
                                 '../../ptranking/metric/srd/WT_Div_0912_Implicit_qrels.txt',
                                 run_fold_k_dir + '/fold_run.txt'],
                                shell=False, stdout=subprocess.PIPE, bufsize=-1)
    output_eval_q = p_ndeval.communicate()
    # communicate() returns (stdout, stderr); stdout ends with a newline, so the
    # second-to-last split entry is the final aggregate ('amean') row of comma-separated scores
    output_eval_q = output_eval_q[-2].decode().split("\n")[-2]
    output_eval_q = output_eval_q.split(',')

    err_ia_5, err_ia_10, err_ia_20 = float(output_eval_q[2]), float(output_eval_q[3]), float(output_eval_q[4])
    nerr_ia_5, nerr_ia_10, nerr_ia_20 = float(output_eval_q[5]), float(output_eval_q[6]), float(output_eval_q[7])
    andcg_5, andcg_10, andcg_20 = float(output_eval_q[11]), float(output_eval_q[12]), float(output_eval_q[13])

    ndeval_err_ia_ks = np.asarray([err_ia_5, err_ia_10, err_ia_20])
    ndeval_nerr_ia_ks = np.asarray([nerr_ia_5, nerr_ia_10, nerr_ia_20])
    ndeval_andcg_ks = np.asarray([andcg_5, andcg_10, andcg_20])

    self.ndeval_err_ia_cv_avg_scores = np.add(self.ndeval_err_ia_cv_avg_scores, ndeval_err_ia_ks)
    self.ndeval_nerr_ia_cv_avg_scores = np.add(self.ndeval_nerr_ia_cv_avg_scores, ndeval_nerr_ia_ks)
    self.ndeval_andcg_cv_avg_scores = np.add(self.ndeval_andcg_cv_avg_scores, ndeval_andcg_ks)

    list_metric_strs = []
    list_metric_strs.append(metric_results_to_string(list_scores=ndeval_andcg_ks, list_cutoffs=self.ndeval_cutoffs, metric='aNDCG(ndeval)'))
    list_metric_strs.append(metric_results_to_string(list_scores=ndeval_err_ia_ks, list_cutoffs=self.ndeval_cutoffs, metric='ERR-IA(ndeval)'))
    list_metric_strs.append(metric_results_to_string(list_scores=ndeval_nerr_ia_ks, list_cutoffs=self.ndeval_cutoffs, metric='nERR-IA(ndeval)'))
    metric_string = '\n\t'.join(list_metric_strs)
    print("\n{} on Fold - {} (ndeval)\n\t{}".format(model_id, str(fold_k), metric_string))
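# --------------------------------------------------------------------------- #
# The hard-coded column indices above follow ndeval's comma-separated output
# (runid, topic, ERR-IA@5/10/20, nERR-IA@5/10/20, ..., alpha-nDCG@5/10/20, ...).
# A minimal sketch of the same parsing as a standalone helper; the exact column
# layout is inferred from the indexing above, i.e., an assumption about the local
# ndeval build rather than a documented guarantee:
#
#   def parse_ndeval_amean_line(line):
#       cols = line.split(',')
#       return {
#           'ERR-IA':  [float(c) for c in cols[2:5]],    # cutoffs 5, 10, 20
#           'nERR-IA': [float(c) for c in cols[5:8]],
#           'aNDCG':   [float(c) for c in cols[11:14]],
#       }
# --------------------------------------------------------------------------- #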