def eval(epoch, opt, vis=None, vis_window=None):
    # Relies on module-level state set up by the surrounding training script:
    # model, dataset, dataloader_val, and the pre-allocated input tensors
    # (segs_feat, input_num, input_ppls, mask_ppls, ppls_feat).
    model.eval()

    data_iter_val = iter(dataloader_val)
    start = time.time()

    num_show = 0
    predictions = defaultdict(list)
    count = 0
    timestamp_file = json.load(open(opt.grd_reference))
    min_value = -1e8

    if opt.eval_obj_grounding:
        grd_output = defaultdict(dict)

        lemma_det_dict = {
            opt.wtol[key]: idx
            for key, idx in opt.wtod.items() if key in opt.wtol
        }
        print('{} classes have the associated lemma word!'.format(
            len(lemma_det_dict)))

    if opt.eval_obj_grounding or opt.language_eval:
        for step in range(len(dataloader_val)):
            data = next(data_iter_val)
            if opt.vis_attn:
                seg_feat, iseq, gts_seq, num, proposals, bboxs, box_mask, seg_id, \
                    seg_show, seg_dim_info, region_feat, frm_mask, sample_idx, ppl_mask = data
            else:
                seg_feat, iseq, gts_seq, num, proposals, bboxs, box_mask, seg_id, \
                    region_feat, frm_mask, sample_idx, ppl_mask = data

            # Trim proposal tensors to the largest number of proposals in the batch
            proposals = proposals[:, :max(int(max(num[:, 1])), 1), :]
            ppl_mask = ppl_mask[:, :max(int(max(num[:, 1])), 1)]
            region_feat = region_feat[:, :max(int(max(num[:, 1])), 1), :]

            segs_feat.resize_(seg_feat.size()).data.copy_(seg_feat)
            input_num.resize_(num.size()).data.copy_(num)
            input_ppls.resize_(proposals.size()).data.copy_(proposals)
            mask_ppls.resize_(ppl_mask.size()).data.copy_(ppl_mask)
            pnt_mask = torch.cat(
                (mask_ppls.new(mask_ppls.size(0), 1).fill_(0), mask_ppls),
                dim=1)  # pad one column for legacy reasons
            ppls_feat.resize_(region_feat.size()).data.copy_(region_feat)
            sample_idx = Variable(sample_idx.type(input_num.type()))

            eval_opt = {
                'sample_max': 1,
                'beam_size': opt.beam_size,
                'inference_mode': True
            }
            dummy = input_ppls.new(input_ppls.size(0)).byte().fill_(0)

            batch_size = input_ppls.size(0)

            seq, att2_weights, sim_mat = model(
                segs_feat, dummy, dummy, input_num, input_ppls, dummy, dummy,
                ppls_feat, dummy, sample_idx, pnt_mask, 'sample', eval_opt)

            # Save localization results on the generated sentences
            if opt.eval_obj_grounding:
                assert opt.beam_size == 1, 'only beam_size = 1 is supported'

                att2_ind = torch.max(
                    att2_weights.view(batch_size, att2_weights.size(1),
                                      opt.num_sampled_frm, opt.num_prop_per_frm),
                    dim=-1)[1]
                obj_bbox_att2 = torch.gather(
                    input_ppls.view(-1, opt.num_sampled_frm, opt.num_prop_per_frm, 7)
                        .permute(0, 2, 1, 3).contiguous(), 1,
                    att2_ind.unsqueeze(-1).expand(
                        (batch_size, att2_ind.size(1), opt.num_sampled_frm,
                         input_ppls.size(-1))))  # B x 20 x 10 x 7

                for i in range(seq.size(0)):
                    vid_id, seg_idx = seg_id[i].split('_segment_')
                    seg_idx = str(int(seg_idx))
                    tmp_result = {
                        'clss': [],
                        'idx_in_sent': [],
                        'bbox_for_all_frames': []
                    }

                    for j in range(seq.size(1)):
                        if seq[i, j].item() != 0:
                            lemma = opt.wtol[opt.itow[str(seq[i, j].item())]]
                            if lemma in lemma_det_dict:
                                tmp_result['bbox_for_all_frames'].append(
                                    obj_bbox_att2[i, j, :, :4].tolist())
                                tmp_result['clss'].append(
                                    opt.itod[lemma_det_dict[lemma]])
                                tmp_result['idx_in_sent'].append(
                                    j)  # redundant, kept for the output format
                        else:
                            break

                    grd_output[vid_id][seg_idx] = tmp_result

            sents = utils.decode_sequence(dataset.itow, dataset.itod, dataset.ltow,
                                          dataset.itoc, dataset.wtod, seq.data,
                                          opt.vocab_size, opt)

            for k, sent in enumerate(sents):
                vid_idx, seg_idx = seg_id[k].split('_segment_')
                seg_idx = str(int(seg_idx))

                predictions[vid_idx].append({
                    'sentence': sent,
                    'timestamp': [
                        round(timestamp, 2) for timestamp in
                        timestamp_file['annotations'][vid_idx]['segments'][seg_idx]['timestamps']
                    ]
                })

                if num_show < 20:
                    print('segment %s: %s' % (seg_id[k], sent))
                    num_show += 1

                # visualization
                if opt.vis_attn:
                    assert opt.beam_size == 1  # only beam_size = 1 is supported
                    att2_weights = F.softmax(att2_weights, dim=2)
                    # visualize some selected examples
                    if torch.sum(proposals[k]) != 0:
                        vis_infer(seg_show[k], seg_id[k], sent,
                                  att2_weights[k].cpu().data, proposals[k],
                                  num[k].long(), bboxs[k],
                                  sim_mat[k].cpu().data, seg_dim_info[k])

            if count % 2 == 0:
                print(count)
            count += 1

    lang_stats = defaultdict(float)
    if opt.language_eval:
        print('Total videos to be evaluated %d' % (len(predictions)))

        submission = './experiments/results/' + 'densecap-' + opt.val_split + '-' + opt.id + '.json'
        dense_cap_all = {
            'version': 'VERSION 1.0',
            'results': predictions,
            'external_data': {
                'used': 'true',
                'details': 'Visual Genome for Faster R-CNN pre-training'
            }
        }
        with open(submission, 'w') as f:
            json.dump(dense_cap_all, f)

        references = opt.densecap_references
        verbose = opt.densecap_verbose
        tious_lst = [0.3, 0.5, 0.7, 0.9]
        evaluator = ANETcaptions(ground_truth_filenames=references,
                                 prediction_filename=submission,
                                 tious=tious_lst,
                                 max_proposals=1000,
                                 verbose=verbose)
        evaluator.evaluate()

        for m, v in evaluator.scores.items():
            lang_stats[m] = np.mean(v)

        print('\nResults Summary (lang eval):')
        print('Printing language evaluation metrics...')
        for m, s in lang_stats.items():
            print('{}: {:.3f}'.format(m, s * 100))
        print('\n')

    if opt.eval_obj_grounding:
        # write attention results to file
        attn_file = './experiments/results/attn-gen-sent-results-' + opt.val_split + '-' + opt.id + '.json'
        with open(attn_file, 'w') as f:
            json.dump(
                {
                    'results': grd_output,
                    'eval_mode': 'gen',
                    'external_data': {
                        'used': True,
                        'details': 'Object detector pre-trained on Visual Genome on object detection task.'
                    }
                }, f)

        if not opt.test_mode:
            # offline eval
            evaluator = ANetGrdEval(reference_file=opt.grd_reference,
                                    submission_file=attn_file,
                                    split_file=opt.split_file,
                                    val_split=[opt.val_split],
                                    iou_thresh=0.5)

            print('\nResults Summary (generated sent):')
            print('Printing attention accuracy on generated sentences, '
                  'per class and per sentence, respectively...')
            prec_all, recall_all, f1_all, prec_all_per_sent, rec_all_per_sent, \
                f1_all_per_sent = evaluator.grd_eval(mode='all')
            prec_loc, recall_loc, f1_loc, prec_loc_per_sent, rec_loc_per_sent, \
                f1_loc_per_sent = evaluator.grd_eval(mode='loc')
        else:
            print('*' * 62)
            print('* [WARNING] Grounding eval unavailable for the test set!    *\n'
                  '* Please submit your result files under the directory       *\n'
                  '* results/ to the eval server!                              *')
            print('*' * 62)

    if opt.att_model == 'topdown' and opt.eval_obj_grounding_gt:
        with torch.no_grad():
            box_accu_att, box_accu_grd, cls_accu = eval_grounding(opt)  # eval grounding
    else:
        box_accu_att, box_accu_grd, cls_accu = 0, 0, 0

    if opt.enable_visdom:
        assert opt.language_eval
        if vis_window['score'] is None:
            vis_window['score'] = vis.line(
                X=np.tile(np.arange(epoch, epoch + 1), (7, 1)).T,
                Y=np.column_stack(
                    (np.asarray(box_accu_att), np.asarray(box_accu_grd),
                     np.asarray(cls_accu), np.asarray(lang_stats['Bleu_4']),
                     np.asarray(lang_stats['METEOR']),
                     np.asarray(lang_stats['CIDEr']),
                     np.asarray(lang_stats['SPICE']))),
                opts=dict(title='Validation Score',
                          xlabel='Validation Epoch',
                          ylabel='Score',
                          legend=[
                              'BA (alpha)', 'BA (beta)', 'CLS Accu', 'Bleu_4',
                              'METEOR', 'CIDEr', 'SPICE'
                          ]))
        else:
            vis.line(X=np.tile(np.arange(epoch, epoch + 1), (7, 1)).T,
                     Y=np.column_stack(
                         (np.asarray(box_accu_att), np.asarray(box_accu_grd),
                          np.asarray(cls_accu),
                          np.asarray(lang_stats['Bleu_4']),
                          np.asarray(lang_stats['METEOR']),
                          np.asarray(lang_stats['CIDEr']),
                          np.asarray(lang_stats['SPICE']))),
                     opts=dict(title='Validation Score',
                               xlabel='Validation Epoch',
                               ylabel='Score',
                               legend=[
                                   'BA (alpha)', 'BA (beta)', 'CLS Accu',
                                   'Bleu_4', 'METEOR', 'CIDEr', 'SPICE'
                               ]),
                     win=vis_window['score'],
                     update='append')

    print('Saving the predictions')

    # Write validation result into summary
    # (val_result_history and iteration are maintained by the surrounding training script)
    val_result_history[iteration] = {
        'lang_stats': lang_stats,
        'predictions': predictions
    }

    return lang_stats
name_id = 'small_sample.json'
evaluation_type = 'fast'

# Generate scores for the captioning
if evaluation_type == 'slow':
    evaluator = old_ANETcaptions(
        ground_truth_filenames=['./data/val_1.json', './data/val_2.json'],
        prediction_filename='./' + name_id,
        tious=[0.3, 0.5, 0.7, 0.9],
        max_proposals=1000,
        verbose=True)
elif evaluation_type == 'fast':
    evaluator = ANETcaptions(
        ground_truth_filenames=['./data/val_1.json', './data/val_2.json'],
        prediction_filename='./' + name_id,
        tious=[0.3, 0.5, 0.7, 0.9],
        max_proposals=1000,
        verbose=True)  # verbose=args.verbose -> verbose=True

evaluator.evaluate()

# Output the results, one block per tIoU threshold
for i, tiou in enumerate([0.3, 0.5, 0.7, 0.9]):
    print('-' * 80)
    print('tIoU:', tiou)
    print('-' * 80)
    for metric in evaluator.scores:
        score = evaluator.scores[metric][i]
        print('| %s: %2.4f' % (metric, 100 * score))
def evaluate_gt_proposal(val_loader, model_att, model_tep, model_sg, idx_to_word, epoch=0):
    # Evaluate mode
    model_att.eval()
    model_tep.eval()
    model_sg.eval()

    out = {}
    out['version'] = 'VERSION 1.0'
    out['results'] = {}
    out['external_data'] = {}
    out['external_data']['used'] = 'false'
    out['external_data']['details'] = 'for evaluation'

    end = time.time()
    with torch.no_grad():
        for batch_idx, (data, boxes, duration, v_name, timestamp) in enumerate(val_loader):
            if data.shape[2] < 5:
                print("Pass this data (idx:%d) because of a very short video." % batch_idx)
                continue

            if args.use_gpu:
                data = Variable(data.cuda())
                boxes = Variable(boxes.cuda())
            else:
                data = Variable(data)
                boxes = Variable(boxes)

            # Predict proposals
            proposals = model_tep(data)

            # Obtain proposal features from the ground-truth proposals
            # using the weighted attention (descriptiveness) score
            pos_feats = get_gt_proposal(data, proposals[1], proposals[3], boxes,
                                        scale_ratios=args.scale_ratios,
                                        use_gpu=args.use_gpu)
            if args.use_gpu:
                pos_feats = Variable(pos_feats.cuda())
            else:
                pos_feats = Variable(pos_feats)

            att = model_att(pos_feats)
            if args.use_gpu:
                att = Variable(att.cuda())
            else:
                att = Variable(att)

            # Generate sentences
            gen_result, _ = model_sg.sample(pos_feats, att, greedy=True)
            gen_sents = idx_to_sent(gen_result, idx_to_word)

            start_times = timestamp[0, :, 0].data.cpu().numpy()
            end_times = timestamp[0, :, 1].data.cpu().numpy()

            out['results'][v_name[0]] = []
            for i in range(len(gen_sents)):
                temp = {}
                temp['sentence'] = gen_sents[i][0]
                temp['timestamp'] = [float(start_times[i]), float(end_times[i])]
                out['results'][v_name[0]].append(temp)

            # Print progress
            if (batch_idx + 1) % args.print_freq == 0:
                print("\tValidation: [{}/{}]\t"
                      "Time: {:.3f}".format((batch_idx + 1), len(val_loader),
                                            time.time() - end))

    # Write to JSON
    if not os.path.isdir('./output'):
        os.makedirs('./output')
    json_name = 'output/result_{}_{}.json'.format(args.file_name, str(epoch))
    json.dump(out, open(json_name, 'w'))

    # Evaluate scores
    scores = {}
    evaluator = ANETcaptions(ground_truth_filenames=args.references,
                             prediction_filename=json_name,
                             tious=args.tious,
                             max_proposals=args.max_proposals_per_video,
                             verbose=True)
    evaluator.evaluate()

    print("Validation Scores")
    for metric in evaluator.scores:
        score = evaluator.scores[metric]
        # Average over tIoU thresholds and report as a percentage
        scores[metric] = 100 * sum(score) / float(len(score))
        print('| %s: %2.4f' % (metric, scores[metric]))

    return scores
def evaluate_gt(val_loader, model_att, model_sg, criterion, idx_to_word, epoch=0):
    # Evaluate mode
    model_att.eval()
    model_sg.eval()

    count = 0
    losses = 0.0

    out = {}
    out['version'] = 'VERSION 1.0'
    out['results'] = {}
    out['external_data'] = {}
    out['external_data']['used'] = 'false'
    out['external_data']['details'] = 'for evaluation'

    with torch.no_grad():
        for batch_idx, (data, target, v_name, timestamp) in enumerate(val_loader):
            if args.use_gpu:
                data = data.cuda()
                target = target.cuda()
            data = Variable(data)
            target = Variable(target)

            # Generate sentences
            att = model_att(data)
            att = Variable(att)
            gen_result, output = model_sg(data, att)

            loss = criterion(output.view(-1, output.shape[2]),
                             target[:, 1:].contiguous().view(-1))
            losses += loss.item()

            # Into dict structure
            gen_sents = idx_to_sent(gen_result, idx_to_word)
            start_times = timestamp[:, 0].data.cpu().numpy()
            end_times = timestamp[:, 1].data.cpu().numpy()

            for i in range(len(gen_sents)):
                if v_name[i] not in out['results']:
                    out['results'][v_name[i]] = []
                temp = {}
                temp['sentence'] = gen_sents[i][0]
                temp['timestamp'] = [float(start_times[i]), float(end_times[i])]
                out['results'][v_name[i]].append(temp)
                count += 1

    print("Check Validation data : {} / {}".format(count, len(val_loader.dataset)))

    avg_loss = losses / len(val_loader.dataset)
    print("Validation average loss : {:.4f}".format(avg_loss))

    # Write to JSON
    if not os.path.isdir('./output'):
        os.makedirs('./output')
    json_name = 'output/result_{}_{}.json'.format(args.file_name, str(epoch))
    json.dump(out, open(json_name, 'w'))

    # Evaluate scores
    scores = {}
    evaluator = ANETcaptions(ground_truth_filenames=args.references,
                             prediction_filename=json_name,
                             tious=args.tious,
                             max_proposals=args.max_proposals_per_video,
                             verbose=True)
    evaluator.evaluate()

    print("Validation Scores")
    for metric in evaluator.scores:
        score = evaluator.scores[metric]
        # Average over tIoU thresholds and report as a percentage
        scores[metric] = 100 * sum(score) / float(len(score))
        print('| %s: %2.4f' % (metric, scores[metric]))

    return avg_loss, scores