    def forward(self, att_feats, target_seq, masks):

        batch_size, len_q = target_seq.size()

        # target_pos: batch_size * len_q
        target_pos = np.array([
            [j + 1 for j in range(len_q)]
            for _ in range(batch_size)])
        target_pos = torch.LongTensor(target_pos).cuda()
        target_pos = Variable(target_pos, requires_grad=False)
        target_pos = target_pos * masks.long()

        output_enc = self.image_embed(att_feats)

        # target_seq: batch_size * len_q
        # target_pos: batch_size * len_q
        # input_seq:  batch_size * len_q
        # output_enc: batch_size * len_q * model_size
        # output_dec: batch_size * len_q * model_size
        # masks: batch_size * len_q
        # seq_logsofts: n_layers * batch_size * len_q * output_size
        seq_logsofts = self.decoder(target_seq, target_pos, output_enc, masks)

        if self.is_show_result:
            sampleLogprobs, seq = torch.max(self.get_logprob_data(seq_logsofts), 2)
            sents = utils.decode_sequence(self.vocab, seq)
            target_sents = utils.decode_sequence(self.vocab, target_seq.data[:, 1:])
            print("===============output=================")
            for k, sent in enumerate(sents):
                print(sent, " | ", target_sents[k])
            print("===============output=================")

        # seq_logsofts: n_layers * batch_size * len_q * output_size
        return seq_logsofts
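Note: every snippet on this page relies on a project-specific utils.decode_sequence helper that maps word-index tensors back to sentences. The exact implementation varies per project; the sketch below is only an illustration, assuming index 0 marks end-of-sentence/padding and that vocab maps stringified indices to words (as in neuraltalk-style loaders).

def decode_sequence_sketch(vocab, seq):
    # seq: LongTensor of shape (batch_size, seq_length) holding word indices
    sents = []
    for row in seq:
        words = []
        for ix in row:
            ix = int(ix)
            if ix == 0:  # end-of-sentence / padding in this convention
                break
            words.append(vocab[str(ix)])
        sents.append(' '.join(words))
    return sents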
Example #2
File: _loss.py, Project: xpertasks/attn2d
 def get_scores(self, preds, target):
     if self.bleu_scorer == 'coco':
         bleu_scorer = BleuScorer(n=self.bleu_order)
         coco = True
     else:
         coco = False
         scores = []
     # Go to sentence space to compute scores:
     hypo = decode_sequence(self.vocab, preds)  # candidate
     refs = decode_sequence(self.vocab, target.data)  # references
     num_img = target.size(0) // self.seq_per_img
     for e, h in enumerate(hypo):
         ix_start = e // self.seq_per_img * self.seq_per_img
         ix_end = ix_start + 5  # self.seq_per_img
         if coco:
             bleu_scorer += (h, refs[ix_start:ix_end])
         else:
             scores.append(
                 sentence_bleu(h,
                               ' '.join(refs[ix_start:ix_end]),
                               order=self.bleu_order))
     if coco:
         (score, scores) = bleu_scorer.compute_score()
         scores = scores[-1]
     self.logger.debug("Bleu scores: %s" % str(scores))
     return scores
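The ix_start/ix_end arithmetic above groups hypotheses by image: with seq_per_img captions per image, hypothesis e is scored only against the references of its own image. A toy illustration of that indexing (assuming seq_per_img = 5, as the hardcoded 5 suggests):

seq_per_img = 5
for e in range(12):
    ix_start = e // seq_per_img * seq_per_img
    print(e, '->', list(range(ix_start, ix_start + seq_per_img)))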
Example #3
    def forward(self, att_feats, target_seq, masks):

        # sents = utils.decode_sequence(self.vocab, target_seq.data[:,1:])
        # print("================target_seq================")
        # for k, sent in enumerate(sents):
        #     print(sent)
        # print("================target_seq================")

        batch_size, att_size, att_feat_size = att_feats.size()
        batch_size, len_q = target_seq.size()

        # input_pos: batch_size * len_q
        # input_pos = np.array([
        #     [pos_i + 1 for pos_i in range(att_size)]
        #     for i in range(batch_size)])
        # input_pos = torch.LongTensor(input_pos).cuda()
        # input_pos = Variable(input_pos, requires_grad=False)

        # target_pos: batch_size * len_q
        target_pos = np.array([[j + 1 for j in range(len_q)]
                               for _ in range(batch_size)])
        target_pos = torch.LongTensor(target_pos).cuda()
        target_pos = Variable(target_pos, requires_grad=False)
        target_pos = target_pos * masks.long()

        output_enc = self.image_embed(att_feats)

        # target_seq: batch_size * len_q
        # target_pos: batch_size * len_q
        # input_seq:  batch_size * len_q
        # output_enc: batch_size * len_q * model_size
        # output_dec: batch_size * len_q * model_size
        # masks: batch_size * len_q
        output_dec = self.decoder(target_seq, target_pos, output_enc, masks)

        if self.drop_prob_lm > 0:
            output_dec = F.dropout(output_dec, self.drop_prob_lm)

        # output_dec:   batch_size * len_q * model_size
        # seq_logsofts: batch_size * len_q * output_size
        seq_logsofts = F.log_softmax(self.proj(output_dec), -1)

        # sents = utils.decode_sequence(self.vocab, target_seq.data[:,1:])
        # print("================target_seq================")
        # for k, sent in enumerate(sents):
        #     print(sent)
        # print("================target_seq================")

        if self.is_show_result:
            sampleLogprobs, seq = torch.max(seq_logsofts.data, 2)
            sents = utils.decode_sequence(self.vocab, seq)
            target_sents = utils.decode_sequence(self.vocab,
                                                 target_seq.data[:, 1:])
            print("===============output=================")
            for k, sent in enumerate(sents):
                print(sent, " | ", target_sents[k])
            print("===============output=================")

        # seq_logsofts: batch_size * len_q * output_size
        return seq_logsofts
Example #4
def eval_split_n(model, n_predictions, loader, input_data, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    beam_size = eval_kwargs.get('beam_size', 1)
    sample_n = eval_kwargs.get('sample_n', 1)
    sample_n_method = eval_kwargs.get('sample_n_method', 'sample')

    fc_feats, att_feats, att_masks, data = input_data

    tmp_eval_kwargs = eval_kwargs.copy()
    if sample_n_method == 'bs':
        # case 1 sample_n == beam size
        tmp_eval_kwargs.update({'sample_n': 1, 'beam_size': sample_n, 'group_size': 1})  # randomness from softmax
        with torch.no_grad():
            model(fc_feats, att_feats, att_masks, opt=tmp_eval_kwargs, mode='sample')
        for k in range(loader.batch_size):
            _sents = utils.decode_sequence(loader.get_vocab(),
                                           torch.stack([model.done_beams[k][_]['seq'] for _ in range(sample_n)]))
            for sent in _sents:
                entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
                n_predictions.append(entry)
    # case 2 sample / gumbel / topk sampling/ nucleus sampling
    elif sample_n_method == 'sample' or \
            sample_n_method == 'gumbel' or \
            sample_n_method.startswith('top'):
        tmp_eval_kwargs.update(
            {'sample_n': sample_n, 'sample_method': sample_n_method, 'beam_size': 1})  # randomness from sample
        with torch.no_grad():
            _seq, _sampleLogprobs = model(fc_feats, att_feats, att_masks, opt=tmp_eval_kwargs, mode='sample')
        _sents = utils.decode_sequence(loader.get_vocab(), _seq)
        # per-caption mean negative log-likelihood over generated tokens (the +1 covers the end token)
        _perplexity = - _sampleLogprobs.gather(2, _seq.unsqueeze(2)).squeeze(2).sum(1) / ((_seq > 0).float().sum(1) + 1)
        for k, sent in enumerate(_sents):
            entry = {'image_id': data['infos'][k // sample_n]['id'], 'caption': sent,
                     'perplexity': _perplexity[k].item()}
            n_predictions.append(entry)
    elif sample_n_method == 'dbs':
        # Use diverse beam search
        tmp_eval_kwargs.update({'beam_size': sample_n * beam_size, 'group_size': sample_n})  # randomness from softmax
        with torch.no_grad():
            model(fc_feats, att_feats, att_masks, opt=tmp_eval_kwargs, mode='sample')
        for k in range(loader.batch_size):
            _sents = utils.decode_sequence(loader.get_vocab(), torch.stack(
                [model.done_beams[k][_]['seq'] for _ in range(0, sample_n * beam_size, beam_size)]))
            for sent in _sents:
                entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
                n_predictions.append(entry)
    else:
        tmp_eval_kwargs.update(
            {'sample_method': sample_n_method[1:], 'group_size': sample_n, 'beam_size': 1})  # randomness from softmax
        with torch.no_grad():
            _seq, _sampleLogprobs = model(fc_feats, att_feats, att_masks, opt=tmp_eval_kwargs, mode='sample')
        _sents = utils.decode_sequence(loader.get_vocab(), _seq)
        for k, sent in enumerate(_sents):
            entry = {'image_id': data['infos'][k // sample_n]['id'], 'caption': sent}
            n_predictions.append(entry)
    if verbose:
        for entry in sorted(n_predictions[-loader.batch_size * sample_n:], key=lambda x: x['image_id']):
            print('image %s: %s' % (entry['image_id'], entry['caption']))
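A hypothetical invocation of eval_split_n (the model, loader and input_data names are placeholders): drawing five diverse-beam-search captions per image.

n_predictions = []
eval_split_n(model, n_predictions, loader, input_data,
             eval_kwargs={'sample_n': 5, 'sample_n_method': 'dbs',
                          'beam_size': 2, 'verbose': True})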
Example #5
    def forward(self, fc_feats, att_feats, target_seq, masks):

        # sents = utils.decode_sequence(self.vocab, target_seq.data[:,1:])
        # print("================target_seq================")
        # for k, sent in enumerate(sents):
        #     print(sent)
        # print("================target_seq================")

        # with BOS
        batch_size, len_q = target_seq.size()

        # target_pos: batch_size * len_q
        target_pos = np.array([[j + 1 for j in range(len_q)]
                               for _ in range(batch_size)])
        target_pos = torch.LongTensor(target_pos).cuda()
        target_pos = Variable(target_pos, requires_grad=False)
        target_pos = target_pos * masks.long()

        # fc_feats: batch_size * model_size
        # att_feats: batch_size * att_size * model_size
        fc_feats, att_feats = self.embed_feats(fc_feats, att_feats)

        # target_seq: batch_size * len_q
        # target_pos: batch_size * len_q
        # fc_feats: batch_size * model_size
        # att_feats: batch_size * att_size * model_size
        # masks:  batch_size * len_q
        # output_dec: batch_size * len_q * model_size
        # proj_wg: batch_size * len_q * (vocab_size + 1)
        output_dec, proj_wg = self.decoder(target_seq, target_pos, fc_feats,
                                           att_feats, masks)

        if self.drop_prob_lm > 0:
            output_dec = F.dropout(output_dec, self.drop_prob_lm)

        # output_dec:   batch_size * len_q * model_size
        # seq_logsofts: batch_size * len_q * output_size
        seq_logsofts = F.log_softmax(self.proj(output_dec) * proj_wg, -1)

        # sents = utils.decode_sequence(self.vocab, target_seq.data[:,1:])
        # print("================target_seq================")
        # for k, sent in enumerate(sents):
        #     print(sent)
        # print("================target_seq================")

        if self.is_show_result:
            sampleLogprobs, seq = torch.max(seq_logsofts.data, 2)
            sents = utils.decode_sequence(self.vocab, seq)
            target_sents = utils.decode_sequence(self.vocab,
                                                 target_seq.data[:, 1:])
            print("===============output=================")
            for k, sent in enumerate(sents):
                print(sent, " | ", target_sents[k])
            print("===============output=================")

        # seq_logsofts: batch_size * len_q * output_size
        return seq_logsofts, proj_wg
Example #6
File: rewards.py, Project: cxqj/ECHR
def get_self_critical_reward2(greedy_res, vid_info, gen_result, vocab, opt):
    vid, sentences_batch = vid_info

    start = time.time()
    batch_size, sent_len = gen_result.size()  # batch_size = sample_size * seq_per_img

    # get greedy decoding baseline
    gen_result = utils.decode_sequence(vocab, gen_result)
    greedy_res = utils.decode_sequence(vocab, greedy_res)

    # pdb.set_trace()
    res = OrderedDict()

    #gen_result = gen_result.data.cpu().numpy()
    #greedy_res = greedy_res.data.cpu().numpy()
    for i in range(batch_size):
        #res[i] = [array_to_str(gen_result[i])]
        res[i] = [remove_nonascii(gen_result[i])]
    for i in range(batch_size):
        #res[batch_size + i] = [array_to_str(greedy_res[i])]
        res[batch_size + i] = [remove_nonascii(greedy_res[i])]

    gts = OrderedDict()
    for i in range(len(sentences_batch)):
        gts[i] = [remove_nonascii(sentences_batch[i])]

    #res_ = [{'image_id': i, 'caption': res[i]} for i in range(2 * batch_size)]

    res_ = [{
        'image_id': i,
        'caption': res[i]
    } for i in range(2 * batch_size)]  # for cider
    res__ = {i: res[i] for i in range(2 * batch_size)}
    gts = {i: gts[i % batch_size] for i in range(2 * batch_size)}

    #pdb.set_trace()
    if opt.meteor_reward_weight > 0:
        #print('vid:', vid)
        _, meteor_score = Meteor_scorer.compute_score(gts, res__)
    else:
        meteor_score = 0
    if opt.meteor_reward_weight < 1:
        _, cider_score = Cider_scorer.compute_score(gts, res_)
        #print('Meteor score:', _)
    else:
        cider_score = 0

    scores = opt.meteor_reward_weight * np.array(meteor_score) + (
        1 - opt.meteor_reward_weight) * np.array(cider_score)
    scores = scores[:batch_size] - scores[batch_size:]
    rewards = np.repeat(scores[:, np.newaxis], sent_len, 1)
    #print('time consuming:',time.time()-start)

    return rewards
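The rewards returned above (per token, positive when the sampled caption beats the greedy baseline) are typically plugged into a self-critical (SCST) policy-gradient loss. A minimal sketch, not taken from the ECHR repo, assuming sample_logprobs holds the log-probability of each sampled word and gen_result the sampled word indices:

import torch

def scst_loss_sketch(sample_logprobs, gen_result, rewards):
    rewards = torch.from_numpy(rewards).float().to(sample_logprobs.device)
    mask = (gen_result > 0).float()            # ignore padded positions
    loss = -sample_logprobs * rewards * mask   # REINFORCE with greedy baseline
    return loss.sum() / mask.sum().clamp(min=1.0)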
Example #7
def eval_print(loader, data, seq, predictions, n, eval_kwargs={}):
    beam_size = eval_kwargs.get('beam_size', 1)
    verbose_beam = eval_kwargs.get('verbose_beam', 1)
    num_images = eval_kwargs.get('num_images',
                                 eval_kwargs.get('val_images_use', -1))
    verbose = eval_kwargs.get('verbose', True)

    # print beam search (`model` with `done_beams` is assumed to be available in the enclosing scope)
    if beam_size > 1 and verbose_beam:
        for i in range(loader.batch_size):
            print('\n'.join(
                utils.decode_sequence(loader.get_vocab(), _['seq'].unsqueeze(
                    0))[0] for _ in model.done_beams[i]))
            print('---' * 10)
    sents = utils.decode_sequence(loader.get_vocab(), seq)

    for k, sent in enumerate(sents):
        entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
        predictions.append(entry)
        if eval_kwargs.get('dump_images', 0) == 1:
            # dump the raw image to vis/ folder
            cmd = 'cp "' + os.path.join(
                eval_kwargs['image_root'],
                data['infos'][k]['file_path']) + '" vis/imgs/img' + str(
                    len(predictions)) + '.jpg'  # bit gross
            print(cmd)
            os.system(cmd)

        if verbose:
            print('image %s: %s' % (entry['image_id'], entry['caption']))

    # if we wrapped around the split or used up val imgs budget then bail
    ix0 = data['bounds']['it_pos_now']
    ix1 = data['bounds']['it_max']
    if num_images != -1:
        ix1 = min(ix1, num_images)
    for i in range(n - ix1):
        predictions.pop()

    if verbose:
        if ix0 % 200 == 0:
            # report progress every 200 images; fall back to ix1 when ix0 is 0
            print('evaluating validation performance... %d/%d' % (ix0 if ix0 else ix1, ix1))

    if data['bounds']['wrapped']:
        return True
    if num_images >= 0 and n >= num_images:
        return True

    return False
Example #8
File: _loss.py, Project: xpertasks/attn2d
 def get_scores(self, preds, target):
     # The reward loss:
     cider_scorer = CiderScorer(n=4, sigma=6)
     # Go to sentence space to compute scores:
     hypo = decode_sequence(self.vocab, preds)  # candidate
     refs = decode_sequence(self.vocab, target.data)  # references
     num_img = target.size(0) // self.seq_per_img
     for e, h in enumerate(hypo):
         ix_start = e // self.seq_per_img * self.seq_per_img
         ix_end = ix_start + 5  # self.seq_per_img
         cider_scorer += (h, refs[ix_start:ix_end])
     (score, scores) = cider_scorer.compute_score()
     self.logger.debug("CIDEr score: %s" % str(scores))
     return scores
Example #9
File: _loss.py, Project: xpertasks/attn2d
 def get_scores(self, preds, target):
     hypo = decode_sequence(self.vocab, preds)  # candidate
     refs = decode_sequence(self.vocab, target.data)  # references
     num_img = target.size(0) // self.seq_per_img
     scores = []
     lr = len(refs)
     codes = self.infersent.encode(refs + hypo)
     refs = codes[:lr]
     hypo = codes[lr:]
     for e, h in enumerate(hypo):
         ix_start = e // self.seq_per_img * self.seq_per_img
         ix_end = ix_start + 5  # self.seq_per_img
         scores.append(group_similarity(h, refs[ix_start:ix_end]))
     self.logger.debug("infersent similairities: %s" % str(scores))
     return scores
Example #10
    def forward(self, input, input1, seq, seq1, target, vocab):
        # truncate to the same size
        # input (batch_size * (seq_length + 2) * (vocab_size + 1))
        # target (batch_size * (seq_length))
        batch_size, L, Mp1 = input.size(0), input.size(1), input.size(2)
        seq_length = target.size(1)

        loss = Variable(torch.FloatTensor(1).zero_(),
                        requires_grad=True).cuda()
        n = 0

        label = utils.decode_sequence(vocab, target.data)
        seq = utils.decode_sequence(vocab, seq)
        seq1 = utils.decode_sequence(vocab, seq1)

        # train
        reward = utils.get_reward(seq, label, "CIDEr")
        # test
        reward1 = utils.get_reward(seq1, label, "CIDEr")

        reward_diff = reward - reward1

        if reward_diff < 1:
            reward_diff = 1

        for b in range(batch_size):
            first_time = True

            for t in range(1, L):

                if t - 1 >= seq_length:
                    target_index = 0
                else:
                    target_index = target.data[b, t - 1]

                if target_index == 0 and first_time:
                    first_time = False
                elif target_index == 0 and not first_time:
                    break

                logsoft = input[b, t, target_index]
                loss.sub_(logsoft)
                n += 1

        loss.div_(n)
        loss.mul_(reward_diff)

        return loss, reward, reward1
Example #11
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    vocab = dataset.get_vocab()
    full_decoder = ConvS2VT(convnet, model, opt)

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray

    for video_path in opt['videos']:
        print(video_path)
        with torch.no_grad():
            frames = skvideo.io.vread(video_path)
            # bp ---
            batches = create_batches(frames, load_img_fn, tf_img_fn)
            seq_prob, seq_preds = full_decoder(batches, mode='inference')
            sents = utils.decode_sequence(vocab, seq_preds)

            for sent in sents:
                print(sent)
Example #12
def demo_simple(obj_det_model, model, img_name):
    model.eval()
    file_path = img_name
    max_proposal = 200

    num_proposal = 6
    num_nms = 6

    # load the image.
    input_imgs = torch.FloatTensor(1)
    img = Image.open(file_path).convert('RGB')
    # resize the image.
    img = transforms.Resize((opt.image_crop_size, opt.image_crop_size))(img)
    
    ppls = get_caption(obj_det_model, img_name)
    pad_proposals = torch.cat([i.unsqueeze(dim=0) for i in ppls]).unsqueeze(dim=0)
#     pad_proposals = torch.from_numpy(pad_proposals).float().unsqueeze(dim=0)
    num = torch.FloatTensor([1, len(ppls), len(ppls)]).unsqueeze(dim=0)

    img = transforms.ToTensor()(img)
    img = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])(img).unsqueeze(dim=0)
    
    img = img.cuda()
    pad_proposals = pad_proposals.cuda()
    num = num.cuda()

    eval_opt = {'sample_max':1, 'beam_size': opt.beam_size, 'inference_mode' : True, 'tag_size' : opt.cbs_tag_size}
    seq, bn_seq, fg_seq, _, _, _ = model._sample(img, pad_proposals, num, eval_opt)
    sents = utils.decode_sequence(dataset.itow, dataset.itod, dataset.ltow, dataset.itoc, dataset.wtod, \
                                    seq.data, bn_seq.data, fg_seq.data, opt.vocab_size, opt)
    return sents
Example #13
File: eval.py, Project: htt98/AI_View
def get_caption(model, crit, dataset, vocab, opt):
    model.eval()
    loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    samples = {}
    for data in loader:
        # forward the model to get loss
        fc_feats = Variable(data['fc_feats'], volatile=True).cuda()
        video_ids = data['video_ids']

        # forward the model to also get generated samples for each image
        seq_probs, seq_preds = model(fc_feats, mode='inference', opt=opt)
        #print(seq_preds)

        sents = utils.decode_sequence(vocab, seq_preds)

        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]

    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])

    with open(
            os.path.join(opt["results_path"],
                         opt["model"].split("/")[-1].split('.')[0] + ".json"),
            'w') as prediction_results:
        json.dump({"predictions": samples}, prediction_results)
Example #14
def eval_split(model, vocab, eval_kwargs):
    loader = get_eval_loader(kwargs=eval_kwargs)
    print(
        "assigned {} images for model evaluation in Karpathy {} split".format(
            len(loader), eval_kwargs['eval_split']))

    predictions = []
    eval_mode = eval_kwargs.get('eval_mode', 0)

    start = time.time()
    for i, batch in enumerate(loader):
        temp = [_.cuda() for _ in batch]
        cocoid, fc_feat, att_feat = temp
        word_idx, father_idx, mask = model._greedy_search(
            fc_feat, att_feat, 40)
        sents = utils.decode_sequence(vocab, word_idx, father_idx, mask)

        for j in range(len(sents)):
            entry = {'image_id': cocoid[j].item(), 'caption': sents[j]}
            print('{}: {}({})'.format(cocoid[j].item(), sents[j], i))
            predictions.append(entry)
        if i > eval_kwargs['eval_images'] >= 0:
            break
    print("inference took {} seconds.".format(time.time() - start))
    lang_stat = language_eval(predictions, eval_kwargs['id'],
                              eval_kwargs['eval_split'])

    if not eval_kwargs['eval_time']:
        model.train()
        model.set_gpu(eval_kwargs['use_cuda'] == 1)
    return lang_stat
Example #15
def eval_split(model_cnn, model, filepaths, ix_to_word, eval_kwargs={}):

    verbose_eval = eval_kwargs.get('verbose_eval', True)
    beam_size = eval_kwargs.get('beam_size', 1)
    caption_model = eval_kwargs.get('caption_model', '')
    batch_size = eval_kwargs.get('batch_size', 1)

    predictions = []

    data = get_batch(filepaths, batch_size)

    images = torch.from_numpy(data['images']).cuda()
    images = utils.prepro_norm(images, False)
    images = Variable(images, requires_grad=False)

    if models.is_only_fc_feat(caption_model):
        fc_feats = model_cnn(images)
    else:
        fc_feats, att_feats = model_cnn(images)

    if models.is_only_fc_feat(caption_model):
        seq, _ = model.sample(fc_feats, {'beam_size': beam_size})
    else:
        seq, _ = model.sample(fc_feats, att_feats, {'beam_size': beam_size})

    # sents
    sents = utils.decode_sequence(ix_to_word, seq)

    for k, sent in enumerate(sents):
        print(sent)
        sent = ''.join(sent.split())
        predictions.append(sent)

    return predictions
Example #16
def test(model, crit, dataset, vocab, opt):
    loader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=True)
    scorer = COCOScorer()
    gt_dataframe = json_normalize(json.load(open(opt.input_json))['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in loader:
        # forward the model to get loss
        fc_feats = Variable(data['fc_feats']).cuda()
        labels = Variable(data['labels']).long().cuda()
        with torch.no_grad():
            # forward the model to also get generated samples for each image
            seq_probs, seq_preds = model(fc_feats, labels, teacher_forcing_ratio=0)
            print(seq_preds)

        sents = utils.decode_sequence(vocab, seq_preds)

        for k, sent in enumerate(sents):
            video_id = 'video' + str(data['ix'][k])
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]

    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    if not os.path.exists(opt.results_path):
        os.makedirs(opt.results_path)

    with open(os.path.join(opt.results_path, "scores.txt"), 'a') as scores_table:
        scores_table.write(json.dumps(results[0]) + "\n")
    with open(os.path.join(opt.results_path, opt.model.split("/")[-1].split('.')[0] + ".json"), 'w') as prediction_results:
        json.dump({"predictions": samples, "scores": valid_score}, prediction_results)
Example #17
def test(model, crit, dataset, vocab, opt):
    model.eval()
    loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=False)
    scorer = COCOScorer()
    gt_dataframe = json_normalize(
        json.load(open(opt["input_json"]))['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    #results = []
    samples = {}
    for index, data in enumerate(loader):
        print('batch: ' + str((index + 1) * opt["batch_size"]))
        # forward the model to get loss
        fc_feats = Variable(data['fc_feats'], volatile=True).cuda()
        labels = Variable(data['labels'], volatile=True).long().cuda()
        masks = Variable(data['masks'], volatile=True).cuda()
        video_ids = data['video_ids']

        # forward the model to also get generated samples for each image
        seq_probs, seq_preds = model(fc_feats, mode='inference', opt=opt)
        # print(seq_preds)

        sents = utils.decode_sequence(vocab, seq_preds)

        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]
        # break
    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    #results.append(valid_score)
    #print(valid_score)

    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])
    result = OrderedDict()
    result['checkpoint'] = opt["saved_model"][opt["saved_model"].rfind('/') +
                                              1:]
    score_sum = 0
    for key, value in valid_score.items():
        score_sum += float(value)
    result['sum'] = str(score_sum)
    #result = OrderedDict(result, **valid_score)
    result = OrderedDict(list(result.items()) + list(valid_score.items()))
    print(result)
    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])
    with open(os.path.join(opt["results_path"], "scores.txt"),
              'a') as scores_table:
        scores_table.write(json.dumps(result) + "\n")
    with open(
            os.path.join(opt["results_path"],
                         opt["model"].split("/")[-1].split('.')[0] + ".json"),
            'w') as prediction_results:
        json.dump({
            "predictions": samples,
            "scores": valid_score
        }, prediction_results)
Example #18
def eval_split(model_cnn, model, loader, eval_kwargs={}):

    verbose_eval = eval_kwargs.get('verbose_eval', True)
    beam_size = eval_kwargs.get('beam_size', 1)
    caption_model = eval_kwargs.get('caption_model', '')
    batch_size = eval_kwargs.get('batch_size', 1)

    split = ''
    loader.reset_iterator(split)
    n = 0
    predictions = []
    vocab = loader.get_vocab()

    while True:
        data = loader.get_batch(split, batch_size)
        n = n + batch_size

        images = torch.from_numpy(data['images']).cuda()
        images = utils.prepro_norm(images, False)
        images = Variable(images, requires_grad=False)

        if models.is_only_fc_feat(caption_model):
            fc_feats = model_cnn(images)
        else:
            fc_feats, att_feats = model_cnn(images)

        if models.is_only_fc_feat(caption_model):
            seq, _ = model.sample(fc_feats, {'beam_size': beam_size})
        else:
            seq, _ = model.sample(fc_feats, att_feats,
                                  {'beam_size': beam_size})

        # sents
        sents = utils.decode_sequence(vocab, seq)

        for k, sent in enumerate(sents):
            image_id = data['infos'][k]['id']
            image_id = int(image_id.split('_')[2])
            entry = {'image_id': image_id, 'caption': sent}
            predictions.append(entry)
            if verbose_eval:
                print('image %s: %s' % (entry['image_id'], entry['caption']))

        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']

        for i in range(n - ix1):
            predictions.pop()
        if verbose_eval:
            print('evaluating validation performance... %d/%d' %
                  (ix0 - 1, ix1))

        if data['bounds']['wrapped']:
            break

    return predictions
Example #19
def predict(model, crit, loader, eval_kwargs={}):
    print('loader.batch_size', loader.batch_size)
    verbose = eval_kwargs.get('verbose', True)
    use_cpu = eval_kwargs.get('use_cpu', False)

    # Make sure in the evaluation mode
    model.eval()

    loader.reset_iterator('dummy')

    n = 0
    predictions = []
    while True:
        data = loader.get_batch('dummy')
        # print('batch data type and size', type(data), len(data))
        # print('data', data)
        n = n + loader.batch_size

        # forward the model to also get generated samples for each image
        # Only leave one feature for each image, in case duplicate sample
        tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 
            data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
            data['att_masks'][np.arange(loader.batch_size) * loader.seq_per_img]]
        tmp = [Variable(torch.from_numpy(_), volatile=True) for _ in tmp]
        if not use_cpu:
            tmp = [_.cuda() for _ in tmp]
        fc_feats, att_feats, att_masks = tmp
        # forward the model to also get generated samples for each image
        seq = model(fc_feats, att_feats, att_masks, opt=eval_kwargs, mode='sample')[0].data
        
        sents = utils.decode_sequence(loader.get_vocab(), seq)

        for k, sent in enumerate(sents):
            if verbose:
                print('image %s: ' %(data['infos'][k]['id']), sent.encode('utf8', 'replace'))
            entry = {'image_id': data['infos'][k]['id'], 
                'caption': sent,
                'file_path': data['infos'][k]['file_path']}
            if eval_kwargs.get('dump_path', 0) == 1:
                entry['file_name'] = data['infos'][k]['file_path']
            predictions.append(entry)

        # if we wrapped around the split or used up val imgs budget then bail
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        for i in range(n - ix1):
            predictions.pop()

        if data['bounds']['wrapped']:
            break

    # print('predictions', predictions)

    # Switch back to training mode
    model.train()
    return predictions
Example #20
File: eval.py, Project: stillarrow/S2VT_ACT
def test(model, crit, dataset, vocab, device, opt):
    model.eval()
    loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    scorer = COCOScorer()
    gt_dataframe = json_normalize(
        json.load(open(opt["input_json"]))['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in loader:
        # forward the model to get loss
        fc_feats = data['fc_feats'].to(device)
        labels = data['labels'].to(device)
        masks = data['masks'].to(device)
        video_ids = data['video_ids']
        if opt["model"] == "S2VTACTModel":
            action = data['action'].to(device)
        # forward the model to also get generated samples for each image
        with torch.no_grad():
            if opt["model"] == "S2VTModel":
                seq_probs, seq_preds = model(fc_feats,
                                             mode='inference',
                                             opt=opt)
            else:
                seq_probs, seq_preds = model(fc_feats,
                                             action=action,
                                             device=device,
                                             mode='inference',
                                             opt=opt)

        sents = utils.decode_sequence(vocab, seq_preds)

        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]

    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])

    with open(os.path.join(opt["results_path"], "scores.txt"),
              'a') as scores_table:
        scores_table.write(json.dumps(results[0]) + "\n")
    with open(
            os.path.join(opt["results_path"],
                         opt["model"].split("/")[-1].split('.')[0] + ".json"),
            'w') as prediction_results:
        json.dump({
            "predictions": samples,
            "scores": valid_score
        }, prediction_results)
Example #21
def test(model, crit, dataset, vocab, opt):
    model.eval()
    loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    scorer = COCOScorer()
    gt_dataframe = json_normalize(
        json.load(open(opt["input_json"]))['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    seq_probs_list = []
    seq_preds_list = []
    masks_list = []
    labels_list = []

    for data in loader:
        # forward the model to get loss
        fc_feats = data['fc_feats'].cuda()
        if(opt["with_mean"] == 0):
                feats_3d = data['feats_3d'].cuda()
        labels = data['labels'].cuda()
        masks = data['masks'].cuda()
        video_ids = data['video_ids']
      
        # forward the model to also get generated samples for each image
        with torch.no_grad():
            if(opt["with_mean"] == 1):
                seq_probs, seq_preds = model(
                    fc_feats, mode='inference', opt=opt)
            else:
                seq_probs, seq_preds = model(
                    fc_feats, feats_3d, mode='inference', opt=opt)

        sents = utils.decode_sequence(vocab, seq_preds)

        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]
        
        seq_preds_list.append(seq_preds)
        seq_probs_list.append(seq_probs)
        masks_list.append(masks)
        labels_list.append(labels)

    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    seq_probs_list = torch.cat(seq_probs_list, 0)
    seq_preds_list = torch.cat(seq_preds_list, 0)
    labels_list = torch.cat(labels_list, 0)
    masks_list = torch.cat(masks_list, 0)

    return valid_score, samples, seq_probs_list, seq_preds_list, labels_list, masks_list
Example #22
def eval_print_caption(loader, info, seq, predictions):
    # TODO: beam search > 1
    sent = utils.decode_sequence(loader.get_vocab(), seq)[0]
    seq_ = seq.squeeze().data.cpu().numpy()
    entry = {
        'image_id': info['id'],
        'caption': sent,
        'caption_ix': seq_.tolist()
    }
    predictions.append(entry)
    print('image %s: %s' % (entry['image_id'], entry['caption']))
Example #23
def test(model, crit, dataset, vocab, opt):
    model.eval()
    loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    scorer = COCOScorer()

    gt_dataframe = json_normalize(
        json.load(open('data_subset/vatex_subsample_v1.0.json')))

    gts = convert_data_to_coco_scorer_format(gt_dataframe, 'chinese')

    results = []
    samples = {}
    for data in loader:
        # forward the model to get loss
        i3d_feats = data['i3d_feats'].squeeze(1)  #.cuda()
        labels = data['labels']  #.cuda()
        masks = data['masks']  #.cuda()
        video_ids = data['video_ids']

        # forward the model to also get generated samples for each image
        with torch.no_grad():
            seq_probs, seq_preds = model(i3d_feats, mode='inference', opt=opt)
        sents = utils.decode_sequence(vocab, seq_preds)

        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]

    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)

    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])

    with open(
            os.path.join(opt["results_path"],
                         "chinese_LSTM_OPT_epoch601_scores.txt"),
            'a') as scores_table:
        scores_table.write(json.dumps(results[0]) + "\n")
    with open(
            os.path.join(
                opt["results_path"],
                opt["model"].split("/")[-1].split('.')[0] +
                "_chinese_LSTM_OPT_epoch601.json"), 'w') as prediction_results:
        json.dump({
            "predictions": samples,
            "scores": valid_score
        },
                  prediction_results,
                  indent=2)
Example #24
def image_captioning_body(img):

    #img = skimage.io.imread(opt.image)
    #img = skimage.io.imread('silicon_test_images/cellphone.jpg')
    if (img.shape[0]==2):
        img = img[0]
    print (img.shape)


    fc_batch = np.ndarray((batch_size, 2048), dtype = 'float32')
    att_batch = np.ndarray((batch_size, 14, 14, 2048), dtype = 'float32')
    if len(img.shape) == 2:
        img = img[:,:,np.newaxis]
        img = np.concatenate((img, img, img), axis=2)

    img = img.astype('float32')/255.0
    img = torch.from_numpy(img.transpose([2, 0, 1]))
    '''
    if use_cuda==False:
        img = torch.from_numpy(img.transpose([2, 0, 1]))
    else:
        img = torch.from_numpy(img.transpose([2, 0, 1])).cuda()
    '''
    with torch.no_grad():
        img = Variable(preprocess(img))
        if use_cuda==True:
            img = img.cuda()
        tmp_fc, tmp_att = my_resnet(img)

    fc_batch[0] = tmp_fc.data.cpu().float().numpy()
    att_batch[0] = tmp_att.data.cpu().float().numpy()
    data['fc_feats'] = fc_batch
    data['att_feats'] = att_batch

    tmp = [data['fc_feats'][np.arange(batch_size)], 
        data['att_feats'][np.arange(batch_size)]]

    with torch.no_grad():
        if use_cuda:
            tmp = [Variable(torch.from_numpy(_)).cuda() for _ in tmp]
        else:
            tmp = [Variable(torch.from_numpy(_)) for _ in tmp]
        fc_feats, att_feats = tmp
        # forward the model to also get generated samples for each image
        seq, _ = model.sample(fc_feats, att_feats, vars(opt))

    seq = seq.cpu().numpy()
    sents = utils.decode_sequence(vocab, seq)
    print (sents)
    return sents[0]
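Hypothetical usage of image_captioning_body, mirroring the commented-out skimage call at the top of the function (the file path is only an example):

import skimage.io
img = skimage.io.imread('silicon_test_images/cellphone.jpg')
caption = image_captioning_body(img)
print(caption)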
Example #25
def make_sents_mask(gen_result, vocab):
	length = gen_result.shape[1]
	sents_mask = []
	sents = utils.decode_sequence(vocab, gen_result)
	for sent in sents:
		sent_mask = [0]*length
		tokens = sent.lower().split()
		tag = nltk.pos_tag(tokens)
		for i in range(len(tag)):
			if tag[i][1] in ['NN', 'NNS','NNP','NNPS','JJ','JJR','JJS']:
				sent_mask[i] = 1
		sents_mask.append(sent_mask)

	return torch.Tensor(sents_mask).cuda()
Example #26
    def get_bert_ids(self, seqs):
        input_ids = []
        txts = utils.decode_sequence(self.vocab, seqs[:,1:], add_punct=True)

        txts = [txt.replace('<blank>', self.tokenizer.mask_token).replace('<UNK>', self.tokenizer.unk_token) for txt in txts]
        ids = [self.tokenizer.convert_tokens_to_ids(["[CLS]"] + self.tokenizer.tokenize(txt) + ["[SEP]"]) for txt in txts]
        max_seq_length = max([len(input_id) for input_id in ids])
        pad_token = self.tokenizer.pad_token
        for input_id in ids:
            padding = [self.tokenizer._convert_token_to_id(pad_token)] * (max_seq_length - len(input_id))
            input_id += padding
            input_ids.append(input_id)

        input_ids = torch.tensor(input_ids, dtype=torch.long).cuda().contiguous()
        return input_ids
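A hedged sketch of how the ids produced by get_bert_ids are typically consumed, assuming a HuggingFace masked language model (this is not part of the snippet above, and the exact output format depends on the transformers version):

import torch
from transformers import BertForMaskedLM

bert = BertForMaskedLM.from_pretrained('bert-base-uncased').cuda().eval()
with torch.no_grad():
    logits = bert(input_ids).logits        # (batch, seq_len, vocab_size)
log_probs = torch.log_softmax(logits, dim=-1)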
Example #27
def demov(model, crit, dataset, vocab, opt):
    loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=False)
    for i, data in enumerate(loader):
        # forward the model to get loss
        fc_feats = data['fc_feats'].cuda()

        # forward the model to also get generated samples for each image
        with torch.no_grad():
            seq_probs, seq_preds = model(fc_feats, mode='inference', opt=opt)

        sents = utils.decode_sequence(vocab, seq_preds)

        print(sents)
        if i == 0:
            break
Example #28
    def gen_caption(self, im_target, im_reference=None):
        if self.is_relative and im_reference is None:
            return ''

        if not self.is_relative and im_reference is not None:
            return ''

        fc_feat, att_feat = self.get_feat(im_target, im_reference)
        tmp = (fc_feat, att_feat)
        tmp = [torch.from_numpy(_).to(DEVICE) for _ in tmp]
        fc_feat, att_feat = tmp

        if not self.opt['use_att']:
            att_feat = torch.zeros(1, 1, 1, 1)

        seq, _ = self.model.sample(fc_feat, att_feat, self.opt)
        sents = utils.decode_sequence(self.vocab, seq)

        return seq, sents
Example #29
def find_type(greedy_res, vocab):
    #0: TE
    #1: TIE
    #2: TDE
    effect_type = []
    sents = utils.decode_sequence(vocab, greedy_res)[0]
    tokens = sents.lower().split()
    tag = nltk.pos_tag(tokens)
    for i in range(len(tag)):
        # if tag[i][1] in ['NN', 'NNS','NNP','NNPS'] and tag[i][0] !='unk' and tag[i][0] !='group'and tag[i][0] !='people':
        # if tag[i][1] in ['NN', 'NNS','NNP','NNPS'] and tag[i][0] !='unk'  and wtol[tag[i][0]] in wtod:
        # if tag[i][1] in ['NN', 'NNS','NNP','NNPS'] and tag[i][0] !='unk':
        if tag[i][1] in ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS']:
            # if t[1] in ['NN', 'NNS','NNP','NNPS','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP'] and t[0] !='unk':
            effect_type.append(1)
        elif tag[i][1] in ['CC', 'IN', 'RP']:
            effect_type.append(2)
        else:
            effect_type.append(0)
    return effect_type
Example #30
def find_type(seq,vocab,force=False):
	#0: TE
	#1: TIE
	#2: TDE
	if force:
		effect_type = [2]*seq.shape[1]
	else:
		effect_type = [0]*seq.shape[1]
		sents = utils.decode_sequence(vocab, seq)[0]
		tokens = sents.lower().split()
		tag = nltk.pos_tag(tokens)
		for i in range(len(tag)):
			if tag[i][1] in ['NN', 'NNS','NNP','NNPS','JJ','JJR','JJS']:
			# if t[1] in ['NN', 'NNS','NNP','NNPS','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP'] and t[0] !='unk':
				effect_type[i] = 1
			# elif tag[i][1] in ['CC','IN','RP']:
			#     effect_type[i] = 2
			# else:
			#     effect_type[i] = 0
	return effect_type
Example #31
def eval_split(model, crit, loader, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    num_images = eval_kwargs.get('num_images', eval_kwargs.get('val_images_use', -1))
    split = eval_kwargs.get('split', 'test')
    lang_eval = eval_kwargs.get('language_eval', 0)
    dataset = eval_kwargs.get('dataset', 'coco')
    print_all_beam = eval_kwargs.get('print_all_beam', False)

    print('> print_all_beam', print_all_beam)

    # Make sure in the evaluation mode
    model.eval()

    loader.reset_iterator(split)

    n = 0
    loss = 0
    loss_sum = 0
    loss_evals = 1e-8
    predictions = []
    while True:
        data = loader.get_batch(split)
        n = n + loader.batch_size

        if data.get('labels', None) is not None:
            # forward the model to get loss
            tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks'], data['attributes']]
            tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, attributes = tmp

            loss = crit(model(fc_feats, att_feats, labels), labels[:,1:], masks[:,1:], attributes).data[0]
            loss_sum = loss_sum + loss
            loss_evals = loss_evals + 1

        # forward the model to also get generated samples for each image
        # Only leave one feature for each image, in case duplicate sample
        tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 
            data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
        tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp]
        fc_feats, att_feats = tmp
        # forward the model to also get generated samples for each image
        seq, prob, attr = model.sample(fc_feats, att_feats, eval_kwargs)

        if print_all_beam is True:
            for p in range(seq.shape[0]):
                seq_this = seq[p, :, :]
                prob_this = prob[p, :, :]
                sents = utils.decode_sequence(loader.get_vocab(), seq_this)

                print('---------------------------------------------------------')
                print('> video id %s:' % data['infos'][p]['id'])

                for k, sent in enumerate(sents):
                    entry = {'image_id': data['infos'][p]['id'], 'caption': sent}
                    if eval_kwargs.get('dump_path', 0) == 1:
                        entry['file_name'] = data['infos'][p]['file_path']
                    predictions.append(entry)
                    if eval_kwargs.get('dump_images', 0) == 1:
                        # dump the raw image to vis/ folder
                        cmd = 'cp "' + os.path.join(eval_kwargs['image_root'], data['infos'][k]['file_path']) + '" vis/imgs/img' + str(len(predictions)) + '.jpg' # bit gross
                        print(cmd)
                        os.system(cmd)

                    if verbose:
                        print('    %s (%.5f)' %(entry['caption'], math.exp(sum(prob_this[k, :]))))

                print('---------------------------------------------------------')
                if split == 'show':
                    p = input()
            # seq [image_idx, beam_idx, sentence]
        else:
            
            #set_trace()
            sents = utils.decode_sequence(loader.get_vocab(), seq)

            for k, sent in enumerate(sents):
                entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
                if eval_kwargs.get('dump_path', 0) == 1:
                    entry['file_name'] = data['infos'][k]['file_path']
                predictions.append(entry)
                if eval_kwargs.get('dump_images', 0) == 1:
                    # dump the raw image to vis/ folder
                    cmd = 'cp "' + os.path.join(eval_kwargs['image_root'], data['infos'][k]['file_path']) + '" vis/imgs/img' + str(len(predictions)) + '.jpg' # bit gross
                    print(cmd)
                    os.system(cmd)

                this_attr = attr[k, :].data.cpu().numpy()
                assert this_attr.shape == (1000,)

                this_gt_attr = attributes[k * loader.seq_per_img, :].data.cpu().numpy()
                gt_attr_indices = this_gt_attr.argsort()[-5:][::-1]

                attr_indices = this_attr.argsort()[-5:][::-1]

                gt_label = labels[k * loader.seq_per_img, 1:].data.cpu().numpy()

                if verbose:
                    print('video %s: %s' % (entry['image_id'], entry['caption']))
                    print('   gt: %s' % ' '.join(([loader.ix_to_word[str(p)] for p in gt_label if p > 0])))
                    print('   attr: %s' % (' '.join([loader.attr_idx2word[id] for id in attr_indices])))
                    print('   gt labels: %s' % (' '.join([loader.attr_idx2word[id] for id in gt_attr_indices])))

        # if we wrapped around the split or used up val imgs budget then bail
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if num_images != -1:
            ix1 = min(ix1, num_images)
        for i in range(n - ix1):
            predictions.pop()

        if verbose and split != 'show':
            print('evaluating validation performance... %d/%d (%f)' % (ix0 - 1, ix1, loss))

        if data['bounds']['wrapped']:
            break
        if num_images >= 0 and n >= num_images:
            break

    lang_stats = None
    if lang_eval == 1:
        lang_stats = language_eval(dataset, predictions, eval_kwargs['id'], split)

    # Switch back to training mode
    model.train()
    return loss_sum/loss_evals, predictions, lang_stats
Example #32
def eval_split(model, crit, loader, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    num_images = eval_kwargs.get('num_images', eval_kwargs.get('val_images_use', -1))
    split = eval_kwargs.get('split', 'val')
    lang_eval = eval_kwargs.get('language_eval', 1)
    dataset = eval_kwargs.get('dataset', 'coco')
    beam_size = eval_kwargs.get('beam_size', 1)

    # Make sure in the evaluation mode
    model.eval()

    loader.reset_iterator(split)

    n = 0
    loss = 0
    loss_sum = 0
    loss_evals = 1e-8
    predictions = []
    while True:
        data = loader.get_batch(split)
        n = n + loader.batch_size

        if data.get('labels', None) is not None:
            # forward the model to get loss
            tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks']]
            tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks = tmp

            loss = crit(model(fc_feats, att_feats, labels), labels[:,1:], masks[:,1:]).data[0]
            loss_sum = loss_sum + loss
            loss_evals = loss_evals + 1

        # forward the model to also get generated samples for each image
        # Only leave one feature for each image, in case duplicate sample
        tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 
            data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
        tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp]
        fc_feats, att_feats = tmp
        # forward the model to also get generated samples for each image
        seq, _ = model.sample(fc_feats, att_feats, eval_kwargs)
        
        #set_trace()
        sents = utils.decode_sequence(loader.get_vocab(), seq)

        for k, sent in enumerate(sents):
            entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
            if eval_kwargs.get('dump_path', 0) == 1:
                entry['file_name'] = data['infos'][k]['file_path']
            predictions.append(entry)
            if eval_kwargs.get('dump_images', 0) == 1:
                # dump the raw image to vis/ folder
                cmd = 'cp "' + os.path.join(eval_kwargs['image_root'], data['infos'][k]['file_path']) + '" vis/imgs/img' + str(len(predictions)) + '.jpg' # bit gross
                print(cmd)
                os.system(cmd)

            if verbose:
                print('image %s: %s' %(entry['image_id'], entry['caption']))

        # if we wrapped around the split or used up val imgs budget then bail
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if num_images != -1:
            ix1 = min(ix1, num_images)
        for i in range(n - ix1):
            predictions.pop()

        if verbose:
            print('evaluating validation performance... %d/%d (%f)' % (ix0 - 1, ix1, loss))

        if data['bounds']['wrapped']:
            break
        if num_images >= 0 and n >= num_images:
            break

    lang_stats = {}
    if lang_eval == 1:
        lang_stats = language_eval(dataset, predictions, eval_kwargs['id'], split)

    # Switch back to training mode
    model.train()
    return loss_sum/loss_evals, predictions, lang_stats
Example #33
def eval_split(sess, model, loader, eval_kwargs):
    verbose = eval_kwargs.get('verbose', True)
    num_images = eval_kwargs.get('num_images', -1)
    split = eval_kwargs.get('split', 'test')
    language_eval = eval_kwargs.get('language_eval', 0)
    dataset = eval_kwargs.get('dataset', 'coco')

    # Make sure in the evaluation mode
    sess.run(tf.assign(model.training, False))
    sess.run(tf.assign(model.cnn_training, False))

    loader.reset_iterator(split)

    n = 0
    loss_sum = 0
    loss_evals = 1e-8
    predictions = []

    while True:
        # fetch a batch of data
        if opt.beam_size > 1:
            data = loader.get_batch(split, 1)
            n = n + 1
        else:
            data = loader.get_batch(split, opt.batch_size)
            n = n + opt.batch_size

        #evaluate loss if we have the labels
        loss = 0
        if data.get('labels', None) is not None:
            # forward the model to get loss
            feed = {model.images: data['images'], model.labels: data['labels'], model.masks: data['masks']}
            loss = sess.run(model.cost, feed)
            loss_sum = loss_sum + loss
            loss_evals = loss_evals + 1

        # forward the model to also get generated samples for each image
        if opt.beam_size == 1:
            # forward the model to also get generated samples for each image
            feed = {model.images: data['images']}
            #g_o,g_l,g_p, seq = sess.run([model.g_output, model.g_logits, model.g_probs, model.generator], feed)
            seq = sess.run(model.generator, feed)

            #set_trace()
            sents = utils.decode_sequence(vocab, seq)

            for k, sent in enumerate(sents):
                entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
                predictions.append(entry)
                if verbose:
                    print('image %s: %s' %(entry['image_id'], entry['caption']))
        else:
            seq = model.decode(data['images'], opt.beam_size, sess)
            sents = [' '.join([vocab.get(str(ix), '') for ix in sent]).strip() for sent in seq]
            sents = [sents[0]]
            entry = {'image_id': data['infos'][0]['id'], 'caption': sents[0]}
            predictions.append(entry)
            if verbose:
                for sent in sents:
                    print('image %s: %s' %(entry['image_id'], sent))

        for k, sent in enumerate(sents):
            entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
            if opt.dump_path == 1:
                entry['file_name'] = data['infos'][k]['file_path']
                predictions.append(entry)
            if opt.dump_images == 1:
                # dump the raw image to vis/ folder
                cmd = 'cp "' + os.path.join(opt.image_root, data['infos'][k]['file_path']) + '" vis/imgs/img' + str(len(predictions)) + '.jpg' # bit gross
                print(cmd)
                os.system(cmd)

            if verbose:
                print('image %s: %s' %(entry['image_id'], entry['caption']))

        # if we wrapped around the split or used up val imgs budget then bail
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if num_images != -1:
            ix1 = min(ix1, num_images)
        for i in range(n - ix1):
            predictions.pop()

        if verbose:
            print('evaluating validation performance... %d/%d (%f)' % (ix0 - 1, ix1, loss))

        if data['bounds']['wrapped']:
            break
        if num_images >= 0 and n >= num_images:
            break

    lang_stats = None
    if language_eval == 1:
        lang_stats = eval_utils.language_eval(dataset, predictions)

    # Switch back to training mode
    sess.run(tf.assign(model.training, True))
    sess.run(tf.assign(model.cnn_training, True))
    return loss_sum/loss_evals, predictions, lang_stats
Example #34
def eval_split(sess, model, loader, eval_kwargs):
    verbose = eval_kwargs.get('verbose', True)
    val_images_use = eval_kwargs.get('val_images_use', -1)
    split = eval_kwargs.get('split', 'val')
    language_eval = eval_kwargs.get('language_eval', 1)
    dataset = eval_kwargs.get('dataset', 'coco')

    # Make sure in the evaluation mode
    sess.run(tf.assign(model.training, False))
    sess.run(tf.assign(model.cnn_training, False))

    loader.reset_iterator(split)

    n = 0
    loss_sum = 0
    loss_evals = 0
    predictions = []
    while True:
        if opt.beam_size > 1:
            data = loader.get_batch(split, 1)
            n = n + 1
        else:
            data = loader.get_batch(split)
            n = n + loader.batch_size

        # forward the model to get loss
        feed = {model.images: data['images'], model.labels: data['labels'], model.masks: data['masks']}
        loss = sess.run(model.cost, feed)

        loss_sum = loss_sum + loss
        loss_evals = loss_evals + 1

        if opt.beam_size == 1:
            # forward the model to also get generated samples for each image
            feed = {model.images: data['images']}
            #g_o,g_l,g_p, seq = sess.run([model.g_output, model.g_logits, model.g_probs, model.generator], feed)
            seq = sess.run(model.generator, feed)

            #set_trace()
            sents = utils.decode_sequence(loader.get_vocab(), seq)

            for k, sent in enumerate(sents):
                entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
                predictions.append(entry)
                if verbose:
                    print('image %s: %s' %(entry['image_id'], entry['caption']))
        else:
            seq = model.decode(data['images'], opt.beam_size, sess)
            sents = [' '.join([loader.ix_to_word.get(str(ix), '') for ix in sent]).strip() for sent in seq]
            entry = {'image_id': data['infos'][0]['id'], 'caption': sents[0]}
            predictions.append(entry)
            if verbose:
                for sent in sents:
                    print('image %s: %s' %(entry['image_id'], sent))
        
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if val_images_use != -1:
            ix1 = min(ix1, val_images_use)
        for i in range(n - ix1):
            predictions.pop()
        if verbose:
            print('evaluating validation performance... %d/%d (%f)' % (ix0 - 1, ix1, loss))

        if data['bounds']['wrapped']:
            break
        if val_images_use >= 0 and n >= val_images_use:
            break

    if language_eval == 1:
        lang_stats = eval_utils.language_eval(dataset, predictions)

    # Switch back to training mode
    sess.run(tf.assign(model.training, True))
    sess.run(tf.assign(model.cnn_training, True))
    return loss_sum/loss_evals, predictions, lang_stats