def test_major_sent(synthese):
    # logger.info('START: model testing...')
    dataset_type = 'synthese' if synthese else 'lead'
    data_loader = pipe.DomDetDataLoader(dataset_type=dataset_type)

    n_iter, total_loss = 0, 0.0
    n_samples, total_hamming = 0, 0.0
    cf_mats, precision_list, recall_list = list(), list(), list()

    for batch_idx, batch in enumerate(data_loader):
        n_iter += 1

        y_true = batch['labels'].cpu().numpy()
        d_batch = len(y_true)
        des_sent_info = batch['des_sent_info'].cpu().numpy()
        n_samples += np.sum(des_sent_info[:, -1])
        # logger.info('batch_size: {0}'.format(y_true.shape[0]))

        if synthese:
            hyp_scores = np.tile(y_pred_vec, (d_batch, 1))
            fids = batch['fids'].cpu().numpy()
            eval_args = {
                'hyp_scores': hyp_scores,
                'fids': fids,
                'is_hiernet': True,
            }
            eval_res = metrics.metric_eval_for_syn_doc(**eval_args)
        else:
            hyp_scores = np.tile(y_pred_vec, (d_batch, max_n_sents, 1))
            eval_args = {
                'y_true': y_true,
                'hyp_scores': hyp_scores,
                'des_sent_info': des_sent_info,
            }
            eval_res = metrics.metric_eval(**eval_args)

        cf_mats.append(eval_res['cf_mat_list'])
        precision_list.extend(eval_res['precision_list'])
        recall_list.extend(eval_res['recall_list'])
        total_hamming += eval_res['hamming']

    cls_f1, avg_f1 = metrics.compute_f1_with_confusion_mats(cf_mats)
    example_based_f1 = metrics.compute_example_based_f1(precision_list=precision_list,
                                                        recall_list=recall_list)
    hamming = total_hamming / n_samples

    eval_log_info = {
        'example_based_f1': example_based_f1,
        'avg_f1': avg_f1,
        'cls_f1': cls_f1,
        'hamming': hamming,
    }

    res_str = 'example_based_f1: {example_based_f1:.6f}, ' \
              'avg_f1: {avg_f1:.6f}, cls_f1: {cls_f1}, hamming: {hamming:.6f}'

    logger.info(res_str.format(**eval_log_info))
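
# --- Illustrative sketch (assumption, not part of the original module) ------
# The majority baselines in this module rely on module-level globals such as
# `y_pred_vec` (a per-domain score vector tiled over each batch) and
# `max_n_sents`. One plausible way to obtain such a vector is from per-class
# label frequencies in the training set; the helper below is hypothetical and
# only illustrates the 1-D shape that np.tile expects above.
def _build_majority_pred_vec_example(train_labels):
    """Return a 1-D vector of empirical per-class label frequencies.

    `train_labels` is assumed to be an (n_docs, n_doms) multi-hot array.
    """
    counts = np.asarray(train_labels).sum(axis=0).astype(float)  # n_doms
    return counts / max(counts.sum(), 1.0)
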
def test_model_sent_mturk():
    logger.info('START: testing Baseline [MAJOR] on [MTURK SENTS]')
    data_loader = pipe.DomDetDataLoader(dataset_type='mturk')

    n_iter, total_loss = 0, 0.0
    n_samples, total_hamming = 0, 0.0
    cf_mats, precision_list, recall_list = list(), list(), list()

    for batch_idx, batch in enumerate(data_loader):
        n_iter += 1

        y_true = batch['sent_labels'].cpu().numpy()  # d_batch * max_n_sents * n_doms
        d_batch = len(y_true)
        hyp_scores = np.tile(y_pred_vec, (d_batch, 1))
        # hyp_scores = np.tile(y_pred_vec, (d_batch, max_n_sents, 1))

        n_sents = batch['n_sents'].cpu().numpy()
        n_samples += np.sum(n_sents)
        logger.info('batch_size: {0}'.format(y_true.shape[0]))

        eval_args = {
            'y_true': y_true,
            'hyp_scores': hyp_scores,
            'n_sents': n_sents,
            'is_hiernet': True,
        }
        eval_res = metrics.metric_eval_for_mturk(**eval_args)

        cf_mats.append(eval_res['cf_mat_list'])
        precision_list.extend(eval_res['precision_list'])
        recall_list.extend(eval_res['recall_list'])
        total_hamming += eval_res['hamming']

    cls_f1, avg_f1 = metrics.compute_f1_with_confusion_mats(cf_mats)
    example_based_f1 = metrics.compute_example_based_f1(precision_list=precision_list,
                                                        recall_list=recall_list)
    hamming = total_hamming / n_samples

    eval_log_info = {
        'example_based_f1': example_based_f1,
        'avg_f1': avg_f1,
        'cls_f1': cls_f1,
        'hamming': hamming,
    }

    res_str = 'example_based_f1: {example_based_f1:.6f}, ' \
              'avg_f1: {avg_f1:.6f}, cls_f1: {cls_f1}, hamming: {hamming:.6f}'

    logger.info(res_str.format(**eval_log_info))
def test_model_word_mturk(matching_mode=None, corpus='wiki'):
    logger.info('START: model testing on [MTURK WORDS]')
    grain = 'word'
    dataset_type = '-'.join(('mturk', corpus, grain))
    data_loader = pipe.DomDetDataLoader(dataset_type=dataset_type)

    n_samples = 0
    p_list = list()
    r_list = list()

    for batch_idx, batch in enumerate(data_loader):
        # turn vars to numpy arrays
        y_true_sents = batch['sent_labels'].cpu().numpy()  # d_batch * max_n_sents * n_doms
        y_true_words = batch['word_labels'].cpu().numpy()  # d_batch * max_n_sents * max_n_words
        n_sents = batch['n_sents'].cpu().numpy()
        n_words = batch['n_words'].cpu().numpy()
        n_samples += np.sum(n_sents)

        d_batch = len(y_true_sents)
        hyp_scores = np.tile(y_pred_vec, (d_batch, 1))
        logger.info('batch_size: {0}'.format(y_true_words.shape[0]))

        eval_args = {
            'hyp_scores': hyp_scores,
            'y_true_sents': y_true_sents,
            'y_true_words': y_true_words,
            'n_sents': n_sents,
            'n_words': n_words,
            'pred_grain': 'doc',
            'max_alter': True,
            'matching_mode': matching_mode,
        }
        eval_res = metrics_word_eval_binary.metric_eval_for_mturk_words_with_ir(**eval_args)

        p_list.extend(eval_res['p_list'])
        r_list.extend(eval_res['r_list'])

    exam_f1 = metrics.compute_example_based_f1(p_list, r_list)
    logger.info('word-eval. exam_f1: {0:6f}'.format(exam_f1))
def test_major_doc():
    data_loader = pipe.DomDetDataLoader(dataset_type='test')

    n_iter, total_loss = 0, 0.0
    n_samples, total_hamming = 0, 0.0
    cf_mats, precision_list, recall_list = list(), list(), list()

    for batch_idx, batch in enumerate(data_loader):
        n_iter += 1

        y_true = batch['labels'].cpu().numpy()  # turn vars to numpy arrays
        d_batch = len(y_true)
        y_pred = np.tile(y_pred_vec, (d_batch, 1))

        eval_args = {
            'y_true': y_true,
            'hyp_scores': y_pred,
        }

        n_samples += d_batch
        # logger.info('batch_size: {0}'.format(d_batch))

        eval_res = metrics.metric_eval(**eval_args)

        cf_mats.append(eval_res['cf_mat_list'])
        precision_list.extend(eval_res['precision_list'])
        recall_list.extend(eval_res['recall_list'])
        total_hamming += eval_res['hamming']

    cls_f1, avg_f1 = metrics.compute_f1_with_confusion_mats(cf_mats)
    example_based_f1 = metrics.compute_example_based_f1(precision_list=precision_list,
                                                        recall_list=recall_list)
    hamming = total_hamming / n_samples

    eval_log_info = {
        'example_based_f1': example_based_f1,
        'avg_f1': avg_f1,
        'cls_f1': cls_f1,
        'hamming': hamming,
    }

    res_str = 'example_based_f1: {example_based_f1:.6f}, ' \
              'avg_f1: {avg_f1:.6f}, cls_f1: {cls_f1}, hamming: {hamming:.6f}'

    logger.info(res_str.format(**eval_log_info))
def eval_model(model, phase, save_pred=False, save_gold=False):
    assert phase in ('dev', 'test')
    data_loader = pipe.DomDetDataLoader(dataset_type=phase)
    model.eval()

    n_iter, total_loss = 0, 0.0
    n_samples, total_hamming = 0, 0.0
    cf_mats, precision_list, recall_list = list(), list(), list()

    for batch_idx, batch in enumerate(data_loader):
        n_iter += 1

        c = copy.deepcopy
        feed_dict = c(batch)

        for (k, v) in feed_dict.items():
            feed_dict[k] = Variable(v, requires_grad=False, volatile=True)  # fix ids and masks

        loss, doc_scores = model(**feed_dict)[:2]
        total_loss += loss.data[0]

        y_true = batch['labels'].cpu().numpy()  # turn vars to numpy arrays
        hyp_scores = doc_scores.data.cpu().numpy()

        eval_args = {
            'y_true': y_true,
            'hyp_scores': hyp_scores,
        }

        if save_pred:
            eval_args['save_pred_to'] = join(path_parser.pred_doc, config_loader.meta_model_name)
        if save_gold:
            eval_args['save_true_to'] = join(path_parser.pred_doc, 'gold')

        # del model_res
        n_samples += y_true.shape[0]
        # logger.info('batch_size: {0}'.format(y_true.shape[0]))

        eval_res = metrics.metric_eval(**eval_args)

        cf_mats.append(eval_res['cf_mat_list'])
        precision_list.extend(eval_res['precision_list'])
        recall_list.extend(eval_res['recall_list'])
        total_hamming += eval_res['hamming']

    avg_loss = total_loss / n_iter
    cls_f1, avg_f1 = metrics.compute_f1_with_confusion_mats(cf_mats)
    example_based_f1 = metrics.compute_example_based_f1(precision_list=precision_list,
                                                        recall_list=recall_list)
    hamming = total_hamming / n_samples

    eval_log_info = {
        'ph': phase,
        'loss': avg_loss,
        'example_based_f1': example_based_f1,
        'avg_f1': avg_f1,
        'cls_f1': cls_f1,
        'hamming': hamming,
    }

    return eval_log_info
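
# --- Usage sketch (assumption, not part of the original module) -------------
# A training driver might call eval_model on the dev set after each epoch and
# log the returned metrics dict. The helper name below is hypothetical; it
# only shows how the dict returned by eval_model can be formatted.
def _log_eval_result_example(model):
    eval_log_info = eval_model(model, phase='dev')
    res_str = '[{ph}] loss: {loss:.6f}, example_based_f1: {example_based_f1:.6f}, ' \
              'avg_f1: {avg_f1:.6f}, hamming: {hamming:.6f}'
    logger.info(res_str.format(**eval_log_info))
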
def test_model_word_mturk_with_checkpoints(model, matching_mode=None, corpus='wiki',
                                           save_pred=False, save_gold=False,
                                           n_iter=None, restore=False):
    if corpus == 'wiki':
        save_dir = path_parser.pred_mturk_wiki
    elif corpus == 'nyt':
        if lang != 'en':
            raise ValueError('Set lang to en when NYT corpus is used')
        save_dir = path_parser.pred_mturk_nyt
    else:
        raise ValueError('Invalid corpus: {}'.format(corpus))

    if config_loader.placement == 'auto':
        model = nn.DataParallel(model, device_ids=config_loader.device)

    if config_loader.placement in ('auto', 'single'):
        model.cuda()

    logger.info('START: model testing on [MTURK WORDS]')

    checkpoint = join(path_parser.model_save, config_loader.model_name)
    if restore:
        checkpoint = join(checkpoint, 'resume')

    filter_keys = None
    if config_loader.reset_size_for_test and not config_loader.set_sep_des_size:
        logger.info('Filter DES pretrained paras...')
        filter_keys = [
            'module.word_det.des_ids',
            'module.word_det.des_sent_mask',
            'module.word_det.des_word_mask',
        ]
    load_checkpoint(checkpoint=checkpoint, model=model, n_iter=n_iter, filter_keys=filter_keys)

    grain = 'word'
    dataset_type = '-'.join(('mturk', corpus, grain))
    data_loader = pipe.DomDetDataLoader(dataset_type=dataset_type)
    model.eval()

    c = copy.deepcopy
    pred_grain = get_model_pred_grain()

    p_list = list()
    r_list = list()
    y_true_sents_list = list()
    n_sents_list = list()

    for batch_idx, batch in enumerate(data_loader):
        feed_dict = c(batch)

        del feed_dict['sent_labels']
        del feed_dict['word_labels']
        del feed_dict['n_sents']
        del feed_dict['n_words']

        for (k, v) in feed_dict.items():
            feed_dict[k] = Variable(v, requires_grad=False, volatile=True)  # fix ids and masks

        if pred_grain == 'doc':
            _, doc_scores = model(**feed_dict)
            hyp_scores = doc_scores.data.cpu().numpy()
        elif pred_grain == 'sent':
            _, _, sent_scores = model(**feed_dict)
            hyp_scores = sent_scores.data.cpu().numpy()
        elif pred_grain == 'word':
            feed_dict['return_sent_attn'] = True
            feed_dict['return_word_attn'] = True
            _, _, _, word_scores, _, word_attn = model(**feed_dict)
            hyp_scores = word_scores.data.cpu().numpy()  # n_batch * n_sents * n_words * n_doms
        else:
            raise ValueError('Invalid prediction grain: {}'.format(pred_grain))

        # turn vars to numpy arrays
        y_true_sents = batch['sent_labels'].cpu().numpy()  # d_batch * max_n_sents * n_doms
        y_true_words = batch['word_labels'].cpu().numpy()  # d_batch * max_n_sents * max_n_words
        n_sents = batch['n_sents'].cpu().numpy()
        n_words = batch['n_words'].cpu().numpy()
        logger.info('batch_size: {0}'.format(y_true_words.shape[0]))

        eval_args = {
            'hyp_scores': hyp_scores,
            'y_true_sents': y_true_sents,
            'y_true_words': y_true_words,
            'n_sents': n_sents,
            'n_words': n_words,
            'pred_grain': pred_grain,
            'max_alter': True,
            'matching_mode': matching_mode,
        }

        if save_pred:
            fn = '_'.join((grain, config_loader.meta_model_name))
            pred_save_fp = join(save_dir, fn)
            eval_args['save_pred_to'] = pred_save_fp

        if save_gold:
            fn = '_'.join((grain, 'gold'))
            true_save_fp = join(save_dir, fn)
            eval_args['save_true_to'] = true_save_fp

        eval_res = metrics_word_eval_binary.metric_eval_for_mturk_words_with_ir(**eval_args)

        p_list.extend(eval_res['p_list'])
        r_list.extend(eval_res['r_list'])
        y_true_sents_list.append(y_true_sents)
        n_sents_list.append(n_sents)

    exam_f1 = metrics.compute_example_based_f1(p_list, r_list)
    logger.info('word-eval. exam_f1: {0:6f}'.format(exam_f1))

    report_dom_specific_f1(p_list, r_list, y_true_sents_list[0], n_sents_list[0])
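
# --- Usage sketch (assumption, not part of the original module) -------------
# The original project may drive these tests from a separate script or CLI;
# the guard below only illustrates typical invocations of the baseline tests
# defined above. The checkpointed word-level test additionally requires a
# constructed model instance, which is omitted here.
if __name__ == '__main__':
    test_major_doc()                        # majority baseline, doc level
    test_major_sent(synthese=True)          # majority baseline, synthese sentences
    test_model_sent_mturk()                 # majority baseline, MTurk sentences
    test_model_word_mturk(corpus='wiki')    # majority baseline, MTurk words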