Example #1
def eval(epoch, opt, vis=None, vis_window=None):
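    # note: this function assumes module-level state defined elsewhere in the
    # script: model, dataset, dataloader_val, the pre-allocated input tensors
    # (segs_feat, input_num, input_ppls, mask_ppls, ppls_feat) and the
    # val_result_history / iteration variables used at the end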
    model.eval()

    data_iter_val = iter(dataloader_val)
    start = time.time()

    num_show = 0
    predictions = defaultdict(list)
    count = 0
    with open(opt.grd_reference) as f:
        timestamp_file = json.load(f)
    min_value = -1e8

    if opt.eval_obj_grounding:
        grd_output = defaultdict(dict)

        lemma_det_dict = {
            opt.wtol[key]: idx
            for key, idx in opt.wtod.items() if key in opt.wtol
        }
        print('{} classes have an associated lemma word!'.format(
            len(lemma_det_dict)))

    if opt.eval_obj_grounding or opt.language_eval:
        for step in range(len(dataloader_val)):
            data = next(data_iter_val)
            if opt.vis_attn:
                seg_feat, iseq, gts_seq, num, proposals, bboxs, box_mask, seg_id, seg_show, seg_dim_info, region_feat, frm_mask, sample_idx, ppl_mask = data
            else:
                seg_feat, iseq, gts_seq, num, proposals, bboxs, box_mask, seg_id, region_feat, frm_mask, sample_idx, ppl_mask = data

            max_ppl = max(int(max(num[:, 1])), 1)  # max proposal count in the batch
            proposals = proposals[:, :max_ppl, :]
            ppl_mask = ppl_mask[:, :max_ppl]
            region_feat = region_feat[:, :max_ppl, :]

            segs_feat.resize_(seg_feat.size()).data.copy_(seg_feat)
            input_num.resize_(num.size()).data.copy_(num)
            input_ppls.resize_(proposals.size()).data.copy_(proposals)
            mask_ppls.resize_(ppl_mask.size()).data.copy_(ppl_mask)
            pnt_mask = torch.cat(
                (mask_ppls.new(mask_ppls.size(0), 1).fill_(0), mask_ppls),
                dim=1)  # pad one leading column (kept for legacy reasons)
            ppls_feat.resize_(region_feat.size()).data.copy_(region_feat)
            sample_idx = Variable(sample_idx.type(input_num.type()))

            eval_opt = {
                'sample_max': 1,
                'beam_size': opt.beam_size,
                'inference_mode': True
            }
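            # zero placeholder for inputs (GT sequences / boxes / masks) that
            # the model does not need in 'sample' mode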
            dummy = input_ppls.new(input_ppls.size(0)).byte().fill_(0)

            batch_size = input_ppls.size(0)

            seq, att2_weights, sim_mat = model(
                segs_feat, dummy, dummy, input_num, input_ppls, dummy, dummy,
                ppls_feat, dummy, sample_idx, pnt_mask, 'sample', eval_opt)
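            # seq: sampled word indices (batch x max_len); att2_weights: per-word
            # attention over region proposals (used for grounding below);
            # sim_mat: region-word similarity scores, only used for visualization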

            # save localization results on generated sentences
            if opt.eval_obj_grounding:
                assert opt.beam_size == 1, 'only beam_size = 1 is supported'

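                # for each generated word, find the proposal with the highest
                # attention weight in every sampled frame, then gather its 7-dim
                # proposal entry (the first 4 dims are the bbox coordinates)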
                att2_ind = torch.max(att2_weights.view(batch_size, att2_weights.size(1), \
                    opt.num_sampled_frm, opt.num_prop_per_frm), dim=-1)[1]
                obj_bbox_att2 = torch.gather(input_ppls.view(-1, opt.num_sampled_frm, opt.num_prop_per_frm, 7) \
                    .permute(0, 2, 1, 3).contiguous(), 1, att2_ind.unsqueeze(-1).expand((batch_size, \
                    att2_ind.size(1), opt.num_sampled_frm, input_ppls.size(-1)))) # Bx20x10x7

                for i in range(seq.size(0)):
                    vid_id, seg_idx = seg_id[i].split('_segment_')
                    seg_idx = str(int(seg_idx))
                    tmp_result = {
                        'clss': [],
                        'idx_in_sent': [],
                        'bbox_for_all_frames': []
                    }

                    for j in range(seq.size(1)):
                        if seq[i, j].item() != 0:
                            lemma = opt.wtol[opt.itow[str(seq[i, j].item())]]
                            if lemma in lemma_det_dict:
                                tmp_result['bbox_for_all_frames'].append(
                                    obj_bbox_att2[i, j, :, :4].tolist())
                                tmp_result['clss'].append(
                                    opt.itod[lemma_det_dict[lemma]])
                                tmp_result['idx_in_sent'].append(
                                    j
                                )  # redundant, for the sake of output format
                        else:
                            break
                    grd_output[vid_id][seg_idx] = tmp_result

            sents = utils.decode_sequence(dataset.itow, dataset.itod, dataset.ltow, dataset.itoc, \
                                          dataset.wtod, seq.data, opt.vocab_size, opt)

            for k, sent in enumerate(sents):
                vid_idx, seg_idx = seg_id[k].split('_segment_')
                seg_idx = str(int(seg_idx))

                predictions[vid_idx].append({
                    'sentence': sent,
                    'timestamp': [
                        round(timestamp, 2) for timestamp in timestamp_file[
                            'annotations'][vid_idx]['segments'][seg_idx]['timestamps']
                    ]
                })

                if num_show < 20:
                    print('segment %s: %s' % (seg_id[k], sent))
                    num_show += 1

                # visualization
                if opt.vis_attn:
                    assert opt.beam_size == 1  # only beam_size = 1 is supported
                    att2_weights = F.softmax(att2_weights, dim=2)
                    # visualize some selected examples
                    if torch.sum(proposals[k]) != 0:
                        vis_infer(seg_show[k], seg_id[k], sent, \
                            att2_weights[k].cpu().data, proposals[k], num[k].long(), \
                            bboxs[k], sim_mat[k].cpu().data, seg_dim_info[k])

            if count % 2 == 0:
                print(count)
            count += 1

    lang_stats = defaultdict(float)
    if opt.language_eval:
        print('Total videos to be evaluated: %d' % len(predictions))

        submission = './experiments/results/' + 'densecap-' + opt.val_split + '-' + opt.id + '.json'
        dense_cap_all = {
            'version': 'VERSION 1.0',
            'results': predictions,
            'external_data': {
                'used': 'true',
                'details': 'Visual Genome for Faster R-CNN pre-training'
            }
        }
        with open(submission, 'w') as f:
            json.dump(dense_cap_all, f)

        references = opt.densecap_references
        verbose = opt.densecap_verbose
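        # tIoU thresholds over which the dense-captioning metrics are averaged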
        tious_lst = [0.3, 0.5, 0.7, 0.9]
        evaluator = ANETcaptions(ground_truth_filenames=references,
                                 prediction_filename=submission,
                                 tious=tious_lst,
                                 max_proposals=1000,
                                 verbose=verbose)
        evaluator.evaluate()

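        # average each metric over the tIoU thresholds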
        for m, v in evaluator.scores.items():
            lang_stats[m] = np.mean(v)

        print('\nResults Summary (lang eval):')
        print('Printing language evaluation metrics...')
        for m, s in lang_stats.items():
            print('{}: {:.3f}'.format(m, s * 100))
        print('\n')

    if opt.eval_obj_grounding:
        # write attention results to file
        attn_file = './experiments/results/attn-gen-sent-results-' + opt.val_split + '-' + opt.id + '.json'
        with open(attn_file, 'w') as f:
            json.dump(
                {
                    'results': grd_output,
                    'eval_mode': 'gen',
                    'external_data': {
                        'used':
                        True,
                        'details':
                        'Object detector pre-trained on Visual Genome on object detection task.'
                    }
                }, f)

        if not opt.test_mode:
            # offline eval
            evaluator = ANetGrdEval(reference_file=opt.grd_reference,
                                    submission_file=attn_file,
                                    split_file=opt.split_file,
                                    val_split=[opt.val_split],
                                    iou_thresh=0.5)

            print('\nResults Summary (generated sent):')
            print(
                'Printing attention accuracy on generated sentences, per class and per sentence, respectively...'
            )
            prec_all, recall_all, f1_all, prec_all_per_sent, rec_all_per_sent, f1_all_per_sent = evaluator.grd_eval(
                mode='all')
            prec_loc, recall_loc, f1_loc, prec_loc_per_sent, rec_loc_per_sent, f1_loc_per_sent = evaluator.grd_eval(
                mode='loc')
        else:
            print('*' * 62)
            print('*  [WARNING] Grounding eval unavailable for the test set!    *')
            print('*            Please submit your result files under directory *')
            print('*            results/ to the eval server!                    *')
            print('*' * 62)

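    # attention / grounding / classification accuracy on GT sentences,
    # computed by eval_grounding() (see Example #2 below)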
    if opt.att_model == 'topdown' and opt.eval_obj_grounding_gt:
        with torch.no_grad():
            box_accu_att, box_accu_grd, cls_accu = eval_grounding(
                opt)  # eval grounding
    else:
        box_accu_att, box_accu_grd, cls_accu = 0, 0, 0

    if opt.enable_visdom:
        assert (opt.language_eval)
        if vis_window['score'] is None:
            vis_window['score'] = vis.line(
                X=np.tile(np.arange(epoch, epoch + 1), (7, 1)).T,
                Y=np.column_stack(
                    (np.asarray(box_accu_att), np.asarray(box_accu_grd),
                     np.asarray(cls_accu), np.asarray(lang_stats['Bleu_4']),
                     np.asarray(lang_stats['METEOR']),
                     np.asarray(lang_stats['CIDEr']),
                     np.asarray(lang_stats['SPICE']))),
                opts=dict(title='Validation Score',
                          xlabel='Validation Epoch',
                          ylabel='Score',
                          legend=[
                              'BA (alpha)', 'BA (beta)', 'CLS Accu', 'Bleu_4',
                              'METEOR', 'CIDEr', 'SPICE'
                          ]))
        else:
            vis.line(X=np.tile(np.arange(epoch, epoch + 1), (7, 1)).T,
                     Y=np.column_stack(
                         (np.asarray(box_accu_att), np.asarray(box_accu_grd),
                          np.asarray(cls_accu),
                          np.asarray(lang_stats['Bleu_4']),
                          np.asarray(lang_stats['METEOR']),
                          np.asarray(lang_stats['CIDEr']),
                          np.asarray(lang_stats['SPICE']))),
                     opts=dict(title='Validation Score',
                               xlabel='Validation Epoch',
                               ylabel='Score',
                               legend=[
                                   'BA (alpha)', 'BA (beta)', 'CLS Accu',
                                   'Bleu_4', 'METEOR', 'CIDEr', 'SPICE'
                               ]),
                     win=vis_window['score'],
                     update='append')

    print('Saving the predictions')

    # Write validation result into summary
    val_result_history[iteration] = {
        'lang_stats': lang_stats,
        'predictions': predictions
    }

    return lang_stats
Example #2
def eval_grounding(opt, vis=None):
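    # note: like eval() above, this relies on module-level state (model,
    # dataloader_val and the pre-allocated tensors segs_feat, input_seqs,
    # gt_seqs, input_num, input_ppls, mask_ppls, gt_bboxs, mask_frms, ppls_feat)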
    model.eval()

    data_iter = iter(dataloader_val)
    cls_pred_lst = []
    cls_accu_score = defaultdict(list)
    att2_output = defaultdict(dict)
    grd_output = defaultdict(dict)
    vocab_in_split = set()

    for step in range(len(dataloader_val)):
        data = next(data_iter)
        seg_feat, iseq, gts_seq, num, proposals, bboxs, box_mask, seg_id, region_feat, frm_mask, sample_idx, ppl_mask = data

        max_ppl = max(int(max(num[:, 1])), 1)  # max proposal count in the batch
        max_box = max(int(max(num[:, 2])), 1)  # max GT box count in the batch
        proposals = proposals[:, :max_ppl, :]
        ppl_mask = ppl_mask[:, :max_ppl]
        assert max_ppl == opt.num_sampled_frm * opt.num_prop_per_frm
        bboxs = bboxs[:, :max_box, :]
        frm_mask = frm_mask[:, :max_ppl, :max_box]
        region_feat = region_feat[:, :max_ppl, :]

        segs_feat.resize_(seg_feat.size()).data.copy_(seg_feat)
        input_seqs.resize_(iseq.size()).data.copy_(iseq)
        gt_seqs.resize_(gts_seq.size()).data.copy_(gts_seq)
        input_num.resize_(num.size()).data.copy_(num)
        input_ppls.resize_(proposals.size()).data.copy_(proposals)
        mask_ppls.resize_(ppl_mask.size()).data.copy_(ppl_mask)
        pnt_mask = torch.cat(
            (mask_ppls.new(mask_ppls.size(0), 1).fill_(0), mask_ppls),
            dim=1)  # pad one leading column (kept for legacy reasons)
        gt_bboxs.resize_(bboxs.size()).data.copy_(
            bboxs)  # for region cls eval only
        mask_frms.resize_(frm_mask.size()).data.copy_(
            frm_mask)  # for region cls eval only
        ppls_feat.resize_(region_feat.size()).data.copy_(region_feat)
        sample_idx = Variable(sample_idx.type(input_seqs.type()))

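        # zero placeholder for an input (the training-time box mask) that is
        # not used in 'GRD' mode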
        dummy = input_ppls.new(input_ppls.size(0)).byte().fill_(0)

        # cls_pred_hm_lst is a list of (class_ind, hit=1 / miss=0) tuples
        cls_pred_hm_lst, att2_ind, grd_ind = model(segs_feat, input_seqs,
                                                   gt_seqs, input_num,
                                                   input_ppls, gt_bboxs, dummy,
                                                   ppls_feat, mask_frms,
                                                   sample_idx, pnt_mask, 'GRD')

        # save attention/grounding results on GT sentences
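        # token ids greater than opt.vocab_size denote grounded object classes;
        # obj_mask marks those positions in the GT caption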
        obj_mask = (input_seqs[:, 0, 1:, 0] > opt.vocab_size)  # Bx20
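        # att2_ind / grd_ind index the attended / grounded proposal within each
        # sampled frame; gather pulls out the matching 7-dim proposal entries
        # (the first 4 dims are the bbox coordinates)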
        obj_bbox_att2 = torch.gather(input_ppls.view(-1, opt.num_sampled_frm, opt.num_prop_per_frm, 7) \
            .permute(0, 2, 1, 3).contiguous(), 1, att2_ind.unsqueeze(-1).expand((att2_ind.size(0), \
            att2_ind.size(1), opt.num_sampled_frm, 7))) # Bx20x10x7
        obj_bbox_grd = torch.gather(input_ppls.view(-1, opt.num_sampled_frm, opt.num_prop_per_frm, 7) \
            .permute(0, 2, 1, 3).contiguous(), 1, grd_ind.unsqueeze(-1).expand((grd_ind.size(0), \
            grd_ind.size(1), opt.num_sampled_frm, 7))) # Bx20x10x7

        for i in range(obj_mask.size(0)):
            vid_id, seg_idx = seg_id[i].split('_segment_')
            seg_idx = str(int(seg_idx))
            tmp_result_grd = {
                'clss': [],
                'idx_in_sent': [],
                'bbox_for_all_frames': []
            }
            tmp_result_att2 = {
                'clss': [],
                'idx_in_sent': [],
                'bbox_for_all_frames': []
            }
            for j in range(obj_mask.size(1)):
                if obj_mask[i, j]:
                    cls_name = opt.itod[input_seqs[i, 0, j + 1, 0].item() -
                                        opt.vocab_size]
                    vocab_in_split.update([cls_name])
                    tmp_result_att2['clss'].append(cls_name)
                    tmp_result_att2['idx_in_sent'].append(j)
                    tmp_result_att2['bbox_for_all_frames'].append(
                        obj_bbox_att2[i, j, :, :4].tolist())
                    tmp_result_grd['clss'].append(cls_name)
                    tmp_result_grd['idx_in_sent'].append(j)
                    tmp_result_grd['bbox_for_all_frames'].append(
                        obj_bbox_grd[i, j, :, :4].tolist())
            att2_output[vid_id][seg_idx] = tmp_result_att2
            grd_output[vid_id][seg_idx] = tmp_result_grd

        cls_pred_lst.append(cls_pred_hm_lst)

    # write results to file
    attn_file = './experiments/results/attn-gt-sent-results-' + opt.val_split + '-' + opt.id + '.json'
    with open(attn_file, 'w') as f:
        json.dump(
            {
                'results': att2_output,
                'eval_mode': 'GT',
                'external_data': {
                    'used':
                    True,
                    'details':
                    'Object detector pre-trained on Visual Genome on object detection task.'
                }
            }, f)
    grd_file = './experiments/results/grd-gt-sent-results-' + opt.val_split + '-' + opt.id + '.json'
    with open(grd_file, 'w') as f:
        json.dump(
            {
                'results': grd_output,
                'eval_mode': 'GT',
                'external_data': {
                    'used':
                    True,
                    'details':
                    'Object detector pre-trained on Visual Genome on object detection task.'
                }
            }, f)

    if not opt.test_mode:
        cls_pred_lst = torch.cat(cls_pred_lst, dim=0).cpu()
        cls_accu_lst = torch.cat(
            (cls_pred_lst[:, 0:1],
             (cls_pred_lst[:, 0:1] == cls_pred_lst[:, 1:2]).long()),
            dim=1)
        for i in range(cls_accu_lst.size(0)):
            cls_accu_score[cls_accu_lst[i, 0].long().item()].append(
                cls_accu_lst[i, 1].item())
        print(
            'Total number of object classes in the split: {}. {} have classification results.'
            .format(len(vocab_in_split), len(cls_accu_score)))
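        # per-class accuracy averaged over every class in the split; classes
        # without any classification result implicitly contribute zero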
        cls_accu = np.sum(
            [sum(hm) * 1. / len(hm)
             for i, hm in cls_accu_score.items()]) * 1. / len(vocab_in_split)

        # offline eval
        evaluator = ANetGrdEval(reference_file=opt.grd_reference,
                                submission_file=attn_file,
                                split_file=opt.split_file,
                                val_split=[opt.val_split],
                                iou_thresh=0.5)

        attn_accu = evaluator.gt_grd_eval()
        evaluator.import_sub(grd_file)
        grd_accu = evaluator.gt_grd_eval()

        print('\nResults Summary (GT sent):')
        print(
            'The averaged attention / grounding box accuracy across all classes is: {:.4f} / {:.4f}'
            .format(attn_accu, grd_accu))
        print(
            'The averaged classification accuracy across all classes is: {:.4f}\n'
            .format(cls_accu))

        return attn_accu, grd_accu, cls_accu
    else:
        print('*' * 62)
        print('*  [WARNING] Grounding eval unavailable for the test set!    *')
        print('*            Please submit your result files under directory *')
        print('*            results/ to the eval server!                    *')
        print('*' * 62)

        return 0, 0, 0