def eval(epoch, opt, vis=None, vis_window=None):
    """Run one validation pass over ``dataloader_val`` and report the results.

    Depending on the flags in ``opt`` this function:

    * samples a sentence for every segment (``opt.eval_obj_grounding`` or
      ``opt.language_eval``) and stores it, with the rounded segment
      timestamps read from ``opt.grd_reference``, in ``predictions``;
    * records, for every generated word whose lemma maps to a detection
      class, the proposal box with the highest attention weight in each
      sampled frame (``opt.eval_obj_grounding``) and writes them to a JSON
      file that is scored with ``ANetGrdEval`` unless ``opt.test_mode``;
    * writes a dense-captioning submission file and scores it with
      ``ANETcaptions`` (``opt.language_eval``);
    * calls ``eval_grounding`` under ``torch.no_grad()`` when
      ``opt.att_model == 'topdown'`` and ``opt.eval_obj_grounding_gt``;
    * appends the scores to a visdom line plot (``opt.enable_visdom``).

    The function relies on names that are not defined in this block and are
    expected at module level: ``model``, ``dataloader_val``, ``dataset``,
    the preallocated buffers ``segs_feat``, ``input_num``, ``input_ppls``,
    ``mask_ppls`` and ``ppls_feat``, plus ``utils``, ``vis_infer``,
    ``ANETcaptions``, ``ANetGrdEval``, ``val_result_history`` and
    ``iteration``.

    Args:
        epoch: Used as the x coordinate of the visdom plot.
        opt: Option object; see the attribute reads in the body.
        vis: Object providing ``line(...)``; only used when
            ``opt.enable_visdom`` is set.
        vis_window: Mapping whose ``'score'`` entry holds the visdom window
            handle (``None`` until the first plot).

    Returns:
        ``defaultdict(float)`` mapping each metric in ``evaluator.scores`` to
        its mean. It is empty when ``opt.language_eval`` is off; the visdom
        branch looks up a fixed set of metric names and therefore inserts
        ``0.0`` for any that are missing.

    Raises:
        AssertionError: if ``opt.beam_size != 1`` while grounding evaluation
            or attention visualisation is enabled, or if visdom is enabled
            without ``opt.language_eval``.
    """
    model.eval()

    num_show = 0
    predictions = defaultdict(list)

    # Close the reference file deterministically instead of leaking the handle.
    with open(opt.grd_reference) as ref_file:
        timestamp_file = json.load(ref_file)

    if opt.eval_obj_grounding:
        grd_output = defaultdict(dict)
        # lemma -> detection-class index, for vocabulary words that have a lemma
        lemma_det_dict = {
            opt.wtol[key]: idx
            for key, idx in opt.wtod.items() if key in opt.wtol
        }
        print('{} classes have the associated lemma word!'.format(
            len(lemma_det_dict)))

    if opt.eval_obj_grounding or opt.language_eval:
        # Iterate the loader directly: the ``.next()`` method of the iterator
        # is a Python-2 leftover that Python 3 iterators do not provide.
        for step, data in enumerate(dataloader_val):
            if opt.vis_attn:
                (seg_feat, iseq, gts_seq, num, proposals, bboxs, box_mask,
                 seg_id, seg_show, seg_dim_info, region_feat, frm_mask,
                 sample_idx, ppl_mask) = data
            else:
                (seg_feat, iseq, gts_seq, num, proposals, bboxs, box_mask,
                 seg_id, region_feat, frm_mask, sample_idx, ppl_mask) = data

            # Trim the proposal dimension to the largest count in the batch
            # (at least one column is always kept).
            max_ppl = max(int(max(num[:, 1])), 1)
            proposals = proposals[:, :max_ppl, :]
            ppl_mask = ppl_mask[:, :max_ppl]
            region_feat = region_feat[:, :max_ppl, :]

            # Copy the batch into the preallocated module-level buffers.
            segs_feat.resize_(seg_feat.size()).data.copy_(seg_feat)
            input_num.resize_(num.size()).data.copy_(num)
            input_ppls.resize_(proposals.size()).data.copy_(proposals)
            mask_ppls.resize_(ppl_mask.size()).data.copy_(ppl_mask)
            # pad 1 column from a legacy reason
            pnt_mask = torch.cat(
                (mask_ppls.new(mask_ppls.size(0), 1).fill_(0), mask_ppls),
                dim=1)
            ppls_feat.resize_(region_feat.size()).data.copy_(region_feat)
            sample_idx = Variable(sample_idx.type(input_num.type()))

            eval_opt = {
                'sample_max': 1,
                'beam_size': opt.beam_size,
                'inference_mode': True
            }
            # Placeholder passed for the model inputs not used in 'sample' mode.
            dummy = input_ppls.new(input_ppls.size(0)).byte().fill_(0)
            batch_size = input_ppls.size(0)

            seq, att2_weights, sim_mat = model(
                segs_feat, dummy, dummy, input_num, input_ppls, dummy, dummy,
                ppls_feat, dummy, sample_idx, pnt_mask, 'sample', eval_opt)

            # save localization results on generated sentences
            if opt.eval_obj_grounding:
                assert opt.beam_size == 1, 'only support beam_size is 1'

                # Index of the most-attended proposal within each sampled frame.
                att2_ind = torch.max(
                    att2_weights.view(batch_size, att2_weights.size(1),
                                      opt.num_sampled_frm,
                                      opt.num_prop_per_frm),
                    dim=-1)[1]
                obj_bbox_att2 = torch.gather(
                    input_ppls.view(-1, opt.num_sampled_frm,
                                    opt.num_prop_per_frm, 7)
                    .permute(0, 2, 1, 3).contiguous(),
                    1,
                    att2_ind.unsqueeze(-1).expand(
                        (batch_size, att2_ind.size(1), opt.num_sampled_frm,
                         input_ppls.size(-1))))  # Bx20x10x7

                for i in range(seq.size(0)):
                    vid_id, seg_idx = seg_id[i].split('_segment_')
                    seg_idx = str(int(seg_idx))
                    tmp_result = {
                        'clss': [],
                        'idx_in_sent': [],
                        'bbox_for_all_frames': []
                    }
                    for j in range(seq.size(1)):
                        token = seq[i, j].item()
                        if token == 0:
                            # token 0 ends the generated sentence
                            break
                        lemma = opt.wtol[opt.itow[str(token)]]
                        if lemma in lemma_det_dict:
                            tmp_result['bbox_for_all_frames'].append(
                                obj_bbox_att2[i, j, :, :4].tolist())
                            tmp_result['clss'].append(
                                opt.itod[lemma_det_dict[lemma]])
                            # redundant, for the sake of output format
                            tmp_result['idx_in_sent'].append(j)
                    grd_output[vid_id][seg_idx] = tmp_result

            sents = utils.decode_sequence(
                dataset.itow, dataset.itod, dataset.ltow, dataset.itoc,
                dataset.wtod, seq.data, opt.vocab_size, opt)

            if opt.vis_attn:
                assert (opt.beam_size == 1)  # only support beam_size=1
                # Normalise once per batch. Doing this inside the sentence
                # loop re-applied softmax to its own output for every
                # sample after the first one.
                att2_probs = F.softmax(att2_weights, dim=2)

            for k, sent in enumerate(sents):
                vid_idx, seg_idx = seg_id[k].split('_segment_')
                seg_idx = str(int(seg_idx))
                segment_info = timestamp_file['annotations'][vid_idx][
                    'segments'][seg_idx]

                predictions[vid_idx].append({
                    'sentence': sent,
                    'timestamp': [
                        round(timestamp, 2)
                        for timestamp in segment_info['timestamps']
                    ]
                })

                if num_show < 20:
                    print('segment %s: %s' % (seg_id[k], sent))
                    num_show += 1

                # visualize some selected examples
                if opt.vis_attn and torch.sum(proposals[k]) != 0:
                    vis_infer(seg_show[k], seg_id[k], sent,
                              att2_probs[k].cpu().data, proposals[k],
                              num[k].long(), bboxs[k],
                              sim_mat[k].cpu().data, seg_dim_info[k])

            # coarse progress indicator
            if step % 2 == 0:
                print(step)

    lang_stats = defaultdict(float)
    if opt.language_eval:
        print('Total videos to be evaluated %d' % (len(predictions)))

        submission = ('./experiments/results/' + 'densecap-' + opt.val_split
                      + '-' + opt.id + '.json')
        dense_cap_all = {
            'version': 'VERSION 1.0',
            'results': predictions,
            'external_data': {
                'used': 'true',
                'details': 'Visual Genome for Faster R-CNN pre-training'
            }
        }
        with open(submission, 'w') as f:
            json.dump(dense_cap_all, f)

        evaluator = ANETcaptions(ground_truth_filenames=opt.densecap_references,
                                 prediction_filename=submission,
                                 tious=[0.3, 0.5, 0.7, 0.9],
                                 max_proposals=1000,
                                 verbose=opt.densecap_verbose)
        evaluator.evaluate()

        for m, v in evaluator.scores.items():
            lang_stats[m] = np.mean(v)

        print('\nResults Summary (lang eval):')
        print('Printing language evaluation metrics...')
        for m, s in lang_stats.items():
            print('{}: {:.3f}'.format(m, s * 100))
        print('\n')

    if opt.eval_obj_grounding:
        # write attention results to file
        attn_file = ('./experiments/results/attn-gen-sent-results-'
                     + opt.val_split + '-' + opt.id + '.json')
        with open(attn_file, 'w') as f:
            json.dump(
                {
                    'results': grd_output,
                    'eval_mode': 'gen',
                    'external_data': {
                        'used': True,
                        'details': 'Object detector pre-trained on Visual Genome on object detection task.'
                    }
                }, f)

        if not opt.test_mode:
            # offline eval
            evaluator = ANetGrdEval(reference_file=opt.grd_reference,
                                    submission_file=attn_file,
                                    split_file=opt.split_file,
                                    val_split=[opt.val_split],
                                    iou_thresh=0.5)

            print('\nResults Summary (generated sent):')
            print(
                'Printing attention accuracy on generated sentences, per class and per sentence, respectively...'
            )
            # The returned metrics are not used further in this function.
            evaluator.grd_eval(mode='all')
            evaluator.grd_eval(mode='loc')
        else:
            # Explicit 62-column lines; no reliance on backslash
            # continuations inside a string literal.
            print('*' * 62)
            print('*  [WARNING] Grounding eval unavailable for the test set!    *')
            print('*            Please submit your result files under directory *')
            print('*            results/ to the eval server!                    *')
            print('*' * 62)

    if opt.att_model == 'topdown' and opt.eval_obj_grounding_gt:
        with torch.no_grad():
            box_accu_att, box_accu_grd, cls_accu = eval_grounding(
                opt)  # eval grounding
    else:
        box_accu_att, box_accu_grd, cls_accu = 0, 0, 0

    if opt.enable_visdom:
        assert (opt.language_eval)
        # One x value (the epoch) replicated for each of the seven curves.
        x_vals = np.tile(np.arange(epoch, epoch + 1), (7, 1)).T
        y_vals = np.column_stack(
            (np.asarray(box_accu_att), np.asarray(box_accu_grd),
             np.asarray(cls_accu), np.asarray(lang_stats['Bleu_4']),
             np.asarray(lang_stats['METEOR']),
             np.asarray(lang_stats['CIDEr']),
             np.asarray(lang_stats['SPICE'])))
        plot_opts = dict(title='Validation Score',
                         xlabel='Validation Epoch',
                         ylabel='Score',
                         legend=[
                             'BA (alpha)', 'BA (beta)', 'CLS Accu', 'Bleu_4',
                             'METEOR', 'CIDEr', 'SPICE'
                         ])
        if vis_window['score'] is None:
            vis_window['score'] = vis.line(X=x_vals, Y=y_vals, opts=plot_opts)
        else:
            vis.line(X=x_vals,
                     Y=y_vals,
                     opts=plot_opts,
                     win=vis_window['score'],
                     update='append')

    print('Saving the predictions')

    # Write validation result into summary
    # NOTE(review): `iteration` and `val_result_history` are not defined in
    # this function; presumably module-level state of the training script.
    val_result_history[iteration] = {
        'lang_stats': lang_stats,
        'predictions': predictions
    }

    return lang_stats
def eval_grounding(opt, vis=None):
    """Evaluate attention/grounding boxes and region classification on GT sentences.

    Runs ``model`` in ``'GRD'`` mode over ``dataloader_val``. For every word
    position whose token id is greater than ``opt.vocab_size`` (an object
    word), the boxes selected by ``att2_ind`` and by ``grd_ind`` are
    gathered from the proposals and stored per video/segment. Both result
    sets are written as JSON under ``./experiments/results/``. Unless
    ``opt.test_mode`` is set, the files are scored with ``ANetGrdEval`` and
    a class-averaged classification accuracy is computed.

    The function relies on names that are not defined in this block and are
    expected at module level: ``model``, ``dataloader_val``, the
    preallocated buffers ``segs_feat``, ``input_seqs``, ``gt_seqs``,
    ``input_num``, ``input_ppls``, ``mask_ppls``, ``gt_bboxs``,
    ``mask_frms`` and ``ppls_feat``, plus ``ANetGrdEval``.

    Args:
        opt: Option object; see the attribute reads in the body.
        vis: Unused in this function; kept for interface compatibility.

    Returns:
        ``(attn_accu, grd_accu, cls_accu)``, or ``(0, 0, 0)`` when
        ``opt.test_mode`` is set.

    Raises:
        AssertionError: if the trimmed number of proposals differs from
            ``opt.num_sampled_frm * opt.num_prop_per_frm``.
    """
    model.eval()

    # NOTE(review): these two names are never used in this function. The
    # imports used to run on every loop iteration; they are hoisted here
    # and kept only in case importing has side effects - confirm and delete.
    from maskrcnn_benchmark.modeling.roi_heads.relation_head.relation_head import build_roi_relation_head
    from maskrcnn_benchmark.structures.bounding_box import BoxList

    cls_pred_lst = []
    cls_accu_score = defaultdict(list)
    att2_output = defaultdict(dict)
    grd_output = defaultdict(dict)
    vocab_in_split = set()

    # Iterate the loader directly: the ``.next()`` method of the iterator is
    # a Python-2 leftover that Python 3 iterators do not provide.
    for data in dataloader_val:
        (seg_feat, iseq, gts_seq, num, proposals, bboxs, box_mask, seg_id,
         region_feat, frm_mask, sample_idx, ppl_mask) = data

        # Trim padded dimensions to the largest counts in the batch
        # (at least one column is always kept).
        max_ppl = max(int(max(num[:, 1])), 1)
        max_box = max(int(max(num[:, 2])), 1)

        proposals = proposals[:, :max_ppl, :]
        ppl_mask = ppl_mask[:, :max_ppl]
        assert (max_ppl == opt.num_sampled_frm * opt.num_prop_per_frm)
        bboxs = bboxs[:, :max_box, :]
        frm_mask = frm_mask[:, :max_ppl, :max_box]
        region_feat = region_feat[:, :max_ppl, :]

        # Copy the batch into the preallocated module-level buffers.
        segs_feat.resize_(seg_feat.size()).data.copy_(seg_feat)
        input_seqs.resize_(iseq.size()).data.copy_(iseq)
        gt_seqs.resize_(gts_seq.size()).data.copy_(gts_seq)
        input_num.resize_(num.size()).data.copy_(num)
        input_ppls.resize_(proposals.size()).data.copy_(proposals)
        mask_ppls.resize_(ppl_mask.size()).data.copy_(ppl_mask)
        pnt_mask = torch.cat(
            (mask_ppls.new(mask_ppls.size(0), 1).fill_(0), mask_ppls), dim=1)
        gt_bboxs.resize_(bboxs.size()).data.copy_(
            bboxs)  # for region cls eval only
        mask_frms.resize_(frm_mask.size()).data.copy_(
            frm_mask)  # for region cls eval only
        ppls_feat.resize_(region_feat.size()).data.copy_(region_feat)
        sample_idx = Variable(sample_idx.type(input_seqs.type()))

        dummy = input_ppls.new(input_ppls.size(0)).byte().fill_(0)

        # cls_pred_hm_lst contains a list of tuples (clss_ind, hit/1 or miss/0)
        # NOTE(review): the accuracy code at the end compares column 0 with
        # column 1 for equality - confirm the column meaning with the model.
        cls_pred_hm_lst, att2_ind, grd_ind = model(
            segs_feat, input_seqs, gt_seqs, input_num, input_ppls, gt_bboxs,
            dummy, ppls_feat, mask_frms, sample_idx, pnt_mask, 'GRD')

        # save attention/grounding results on GT sentences
        obj_mask = (input_seqs[:, 0, 1:, 0] > opt.vocab_size)  # Bx20

        # Proposals with the per-frame proposal axis moved to dim 1; built
        # once and shared by both gathers (gather does not modify its input).
        ppls_by_prop = input_ppls.view(
            -1, opt.num_sampled_frm, opt.num_prop_per_frm,
            7).permute(0, 2, 1, 3).contiguous()
        obj_bbox_att2 = torch.gather(
            ppls_by_prop, 1,
            att2_ind.unsqueeze(-1).expand(
                (att2_ind.size(0), att2_ind.size(1), opt.num_sampled_frm,
                 7)))  # Bx20x10x7
        obj_bbox_grd = torch.gather(
            ppls_by_prop, 1,
            grd_ind.unsqueeze(-1).expand(
                (grd_ind.size(0), grd_ind.size(1), opt.num_sampled_frm,
                 7)))  # Bx20x10x7

        for i in range(obj_mask.size(0)):
            vid_id, seg_idx = seg_id[i].split('_segment_')
            seg_idx = str(int(seg_idx))
            tmp_result_grd = {
                'clss': [],
                'idx_in_sent': [],
                'bbox_for_all_frames': []
            }
            tmp_result_att2 = {
                'clss': [],
                'idx_in_sent': [],
                'bbox_for_all_frames': []
            }
            for j in range(obj_mask.size(1)):
                if not obj_mask[i, j]:
                    continue
                # obj_mask was computed from positions 1:, hence the j + 1
                cls_name = opt.itod[input_seqs[i, 0, j + 1, 0].item()
                                    - opt.vocab_size]
                vocab_in_split.add(cls_name)

                tmp_result_att2['clss'].append(cls_name)
                tmp_result_att2['idx_in_sent'].append(j)
                tmp_result_att2['bbox_for_all_frames'].append(
                    obj_bbox_att2[i, j, :, :4].tolist())

                tmp_result_grd['clss'].append(cls_name)
                tmp_result_grd['idx_in_sent'].append(j)
                tmp_result_grd['bbox_for_all_frames'].append(
                    obj_bbox_grd[i, j, :, :4].tolist())
            att2_output[vid_id][seg_idx] = tmp_result_att2
            grd_output[vid_id][seg_idx] = tmp_result_grd

        cls_pred_lst.append(cls_pred_hm_lst)

    # write results to file
    external_data = {
        'used': True,
        'details': 'Object detector pre-trained on Visual Genome on object detection task.'
    }
    attn_file = ('./experiments/results/attn-gt-sent-results-'
                 + opt.val_split + '-' + opt.id + '.json')
    with open(attn_file, 'w') as f:
        # The payload key must be 'results', matching the grounding file
        # below; it had been written as './experiments/results'.
        json.dump(
            {
                'results': att2_output,
                'eval_mode': 'GT',
                'external_data': external_data
            }, f)

    grd_file = ('./experiments/results/grd-gt-sent-results-'
                + opt.val_split + '-' + opt.id + '.json')
    with open(grd_file, 'w') as f:
        json.dump(
            {
                'results': grd_output,
                'eval_mode': 'GT',
                'external_data': external_data
            }, f)

    if opt.test_mode:
        # Explicit 62-column lines; no reliance on backslash continuations
        # inside a string literal.
        print('*' * 62)
        print('*  [WARNING] Grounding eval unavailable for the test set!    *')
        print('*            Please submit your result files under directory *')
        print('*            results/ to the eval server!                    *')
        print('*' * 62)
        return 0, 0, 0

    cls_pred_lst = torch.cat(cls_pred_lst, dim=0).cpu()
    # column 0: class index, column 1: 1 where columns 0 and 1 agree, else 0
    cls_accu_lst = torch.cat(
        (cls_pred_lst[:, 0:1],
         (cls_pred_lst[:, 0:1] == cls_pred_lst[:, 1:2]).long()),
        dim=1)
    for i in range(cls_accu_lst.size(0)):
        cls_accu_score[cls_accu_lst[i, 0].long().item()].append(
            cls_accu_lst[i, 1].item())
    print(
        'Total number of object classes in the split: {}. {} have classification results.'
        .format(len(vocab_in_split), len(cls_accu_score)))
    # Sum of per-class accuracies divided by the number of classes seen in
    # the split (classes without any result therefore count as zero).
    cls_accu = np.sum(
        [sum(hm) * 1. / len(hm)
         for hm in cls_accu_score.values()]) * 1. / len(vocab_in_split)

    # offline eval
    evaluator = ANetGrdEval(reference_file=opt.grd_reference,
                            submission_file=attn_file,
                            split_file=opt.split_file,
                            val_split=[opt.val_split],
                            iou_thresh=0.5)

    attn_accu = evaluator.gt_grd_eval()
    evaluator.import_sub(grd_file)
    grd_accu = evaluator.gt_grd_eval()

    print('\nResults Summary (GT sent):')
    print(
        'The averaged attention / grounding box accuracy across all classes is: {:.4f} / {:.4f}'
        .format(attn_accu, grd_accu))
    print(
        'The averaged classification accuracy across all classes is: {:.4f}\n'
        .format(cls_accu))

    return attn_accu, grd_accu, cls_accu