Example #1
def language_eval_excoco(predictions, predictions_bleu, sents_label_eval,
                         loader):

    Scorer = CiderD()
    Bleu_scorer = Bleu(4)
    METEOR_scorer = Meteor()
    ROUGE_scorer = Rouge()

    c_score, _ = Scorer.compute_score(sents_label_eval, predictions)
    b_score, _ = Bleu_scorer.compute_score(sents_label_eval, predictions_bleu)
    m_score, _ = METEOR_scorer.compute_score(sents_label_eval,
                                             predictions_bleu)
    r_score, _ = ROUGE_scorer.compute_score(sents_label_eval, predictions_bleu)

    print('Evaluating {} samples'.format(len(predictions)))

    print('Bleu_1 : ' + str(b_score[0]))
    print('Bleu_2 : ' + str(b_score[1]))
    print('Bleu_3 : ' + str(b_score[2]))
    print('Bleu_4 : ' + str(b_score[3]))
    print('METEOR : ' + str(m_score))
    print('ROUGE_L : ' + str(r_score))
    print('CIDEr : ' + str(c_score))

    lang_stat = {}
    lang_stat['BLEU_1'] = b_score[0]
    lang_stat['BLEU_2'] = b_score[1]
    lang_stat['BLEU_3'] = b_score[2]
    lang_stat['BLEU_4'] = b_score[3]
    lang_stat['METEOR'] = m_score
    lang_stat['ROUGE_L'] = r_score
    lang_stat['CIDEr'] = c_score

    return lang_stat
Example #2
 def __init__(self, opt):
     super(get_self_critical_reward, self).__init__()
     self.vocab_size = opt.vocab_size
     self.st2towidx = opt.st2towidx
     self.opt = opt
     # self.st2towidx.requires_grad=False
     self.CiderD_scorer = CiderD(df=opt.cached_tokens)
Example #3
def get_reward_cirder(gen_result, gts_data, opt):
    global CiderD_scorer
    if CiderD_scorer is None:
        # type = 0
        # if type == 0:
        #     path_cider = "/media/amds/data/code/cider"
        #     path_idxs = "/media/amds/data/dataset/mscoco"
        # else:
        #     path_cider = "/home/scw4750/caption/cider"
        #     path_idxs = "/home/scw4750/caption/dataset/mscoco"

        path_cider = opt.path_cider
        path_idxs = opt.path_idxs

        # /home/scw4750/caption/cider
        # /media/amds/data/code/cider
        sys.path.append(path_cider)
        from pyciderevalcap.ciderD.ciderD import CiderD

        # /home/scw4750/caption/dataset/mscoco
        # /media/amds/data/dataset/mscoco
        CiderD_scorer = CiderD(df='coco-train-idxs', path=path_idxs)

    batch_size = gen_result.size(0)  # batch_size = sample_size * seq_per_img
    seq_per_img = batch_size // len(gts_data)

    res = OrderedDict()
    gen_result = gen_result.cpu().numpy()

    # sample result
    for i in range(batch_size):
        res[i] = [array_to_str(gen_result[i])]

    gts = OrderedDict()
    for i in range(len(gts_data)):
        gts[i] = [
            array_to_str(gts_data[i][j]) for j in range(len(gts_data[i]))
        ]

    res = [{'image_id': i, 'caption': res[i]} for i in range(batch_size)]

    gts = {i: gts[i % batch_size // seq_per_img] for i in range(batch_size)}

    _, scores = CiderD_scorer.compute_score(gts, res)
    sample_mean = np.mean(scores)
    print('Cider scores: {:.3f} sample:{:.3f}'.format(_, sample_mean))

    # diff_result = sample_result - greedy_result
    # batch_size
    # scores = scores[:batch_size] - scores[batch_size:]

    # batch_size * seq_length
    rewards = np.repeat(scores[:, np.newaxis], gen_result.shape[1], 1)

    return rewards, sample_mean
Example #4
def init_scorer(cached_tokens):
    global CiderD_scorer
    CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens)
    global Bleu_scorer
    Bleu_scorer = Bleu_scorer or Bleu(4)
    # Note: the argument to Bleu() is the number of n-gram orders computed (BLEU-1 through BLEU-4),
    # not a selection of a single BLEU metric.
Example #5
def init_scorer(cached_tokens):
    global CiderD_scorer
    CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens)
    global Cider_scorer
    Cider_scorer = Cider_scorer or Cider(df=cached_tokens)
    global Bleu_scorer
    Bleu_scorer = Bleu_scorer or Bleu(4)
Example #6
def get_scorers(cider_idx_path):
    return {
        'cider': CiderD(df=cider_idx_path),
        'bleu': Bleu(),
        'rouge': Rouge(),
        'meteor': Meteor()
    }
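For reference, a minimal usage sketch for the dictionary returned above, assuming the cached n-gram file ('coco-train-idxs' here) is available; the ids and captions are made up for illustration. CiderD.compute_score takes res as a list of {'image_id', 'caption'} dicts and gts as a mapping from image_id to reference strings, while (as Example #1 above suggests) the coco-caption Bleu/METEOR/ROUGE scorers usually expect both arguments as {id: [sentences]} dicts.

scorers = get_scorers('coco-train-idxs')

# Hypothetical data following the res/gts convention used throughout this listing;
# captions may be word strings or token-id strings, depending on how the df file was built.
gts = {0: ['a dog runs on the grass', 'a dog is running outside'],
       1: ['a man rides a horse']}
res = [{'image_id': 0, 'caption': ['a dog running on grass']},
       {'image_id': 1, 'caption': ['a man on a horse']}]

corpus_cider, per_caption_cider = scorers['cider'].compute_score(gts, res)
print(corpus_cider, per_caption_cider)   # one CIDEr-D value per entry in res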
Example #7
File: criterion.py  Project: zmskye/AREL
    def __init__(self, opt, dataset):
        super(ReinforceCriterion, self).__init__()
        self.dataset = dataset
        self.reward_type = opt.reward_type
        self.bleu = None

        if self.reward_type == 'METEOR':
            from vist_eval.meteor.meteor import Meteor
            self.reward_scorer = Meteor()
        elif self.reward_type == 'CIDEr':
            sys.path.append("cider")
            from pyciderevalcap.ciderD.ciderD import CiderD
            self.reward_scorer = CiderD(df=opt.cached_tokens)
        elif self.reward_type == 'Bleu_4' or self.reward_type == 'Bleu_3':
            from vist_eval.bleu.bleu import Bleu
            self.reward_scorer = Bleu(4)
            self.bleu = int(self.reward_type[-1]) - 1
        elif self.reward_type == 'ROUGE_L':
            from vist_eval.rouge.rouge import Rouge
            self.reward_scorer = Rouge()
        else:
            err_msg = "{} scorer hasn't been implemented".format(
                self.reward_type)
            logging.error(err_msg)
            raise Exception(err_msg)
Example #8
class Cider:
    def __init__(self, args):
        self.cider = CiderD(df='coco-train')
        with open('data/train_references.pkl') as fid:
            self.references = pickle.load(fid)

    def get_scores(self, seqs, images):
        captions = self._get_captions(seqs)
        res = [{
            'image_id': i,
            'caption': [caption]
        } for i, caption in enumerate(captions)]
        gts = {
            i: self.references[image_id]
            for i, image_id in enumerate(images)
        }
        _, scores = self.cider.compute_score(gts, res)
        return scores

    def _get_captions(self, seqs):
        captions = [self._get_caption(seq) for seq in seqs]
        return captions

    def _get_caption(self, seq):
        words = []
        for word in seq:
            words.append(str(word))
            if word == 0:
                break
        caption = ' '.join(words)
        return caption
Example #9
def init_scorer(cached_tokens):
    global CiderD_scorer
    CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens)
    global Bleu_scorer
    Bleu_scorer = Bleu_scorer or Bleu(4)
    global Bert_scorer
    Bert_scorer = Bert_scorer or BertScorer(
        verbose=False, all_layers=False, lang='en')
Example #10
def init_scorer(cached_tokens):
    global CiderD_scorer
    CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens)
    global Bleu_scorer
    Bleu_scorer = Bleu_scorer or Bleu(4)
    global Meteor_scorer
    Meteor_scorer = Meteor()
    global Rouge_scorer
    Rouge_scorer = Rouge()
Example #11
def init_scorer(cache_tokens):
    global CiderD_scorer
    if CiderD_scorer is None:
        CiderD_scorer = CiderD(df=cache_tokens)
    else:
        CiderD_scorer = CiderD_scorer
    # CiderD_scorer = CiderD_scorer or CiderD(df=cache_tokens)
    global Bleu_scorer
    if Bleu_scorer is None:
        Bleu_scorer = Bleu(4)
    else:
        Bleu_scorer = Bleu_scorer
Example #12
from __future__ import division
from __future__ import print_function

import numpy as np
import time
import misc.utils as utils
from collections import OrderedDict
import torch
from torch.autograd import Variable

import sys
sys.path.append("cider")
from pyciderevalcap.ciderD.ciderD import CiderD
#from pyciderevalcap.cider.cider import Cider

CiderD_scorer = CiderD(df='coco-train-idxs')
#CiderD_scorer = CiderD(df='corpus')

def array_to_str(arr):
    out = ''
    for i in range(len(arr)):
        out += str(arr[i]) + ' '
        if arr[i] == 0:
            break
    return out.strip()

def get_self_critical_reward(model, fc_feats, att_feats, data, gen_result):
    batch_size = gen_result.size(0)# batch_size = sample_size * seq_per_img
    seq_per_img = batch_size // len(data['gts'])
    
    # get greedy decoding baseline
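As a small aside, a sketch of what the array_to_str helper above produces, using made-up token ids (0 marks end-of-sequence, so everything after the first 0 is dropped and the EOS id itself is kept):

import numpy as np

seq = np.array([31, 7, 204, 0, 0, 0])   # hypothetical sampled ids
print(array_to_str(seq))                 # -> '31 7 204 0'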
Example #13
def init_cider_scorer(cached_tokens):
    global CiderD_scorer
    CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens)
Example #14
def train(model,
          criterion,
          optimizer,
          train_loader,
          val_loader,
          opt,
          rl_criterion=None):
    infos = {
        'iter': 0,
        'epoch': 0,
        'start_epoch': 0,
        'best_score': float('-inf'),
        'best_iter': 0,
        'best_epoch': opt.max_epochs
    }

    checkpoint_checked = False
    rl_training = False
    seq_per_img = train_loader.get_seq_per_img()
    infos_history = {}

    if os.path.exists(opt.start_from):
        # loading the same model file at a different experiment dir
        start_from_file = os.path.join(
            opt.start_from, os.path.basename(opt.model_file)) if os.path.isdir(
                opt.start_from) else opt.start_from
        logger.info('Loading state from: %s', start_from_file)
        checkpoint = torch.load(start_from_file)
        model.load_state_dict(checkpoint['model'])
        infos = checkpoint['infos']
        infos['start_epoch'] = infos['epoch']
        checkpoint_checked = True  # this epoch is already checked
    else:
        logger.info('No checkpoint found! Training from scratch')

    if opt.use_rl == 1 and opt.use_rl_after == 0:
        opt.use_rl_after = infos['epoch']
        opt.use_cst_after = infos['epoch']
        train_loader.set_current_epoch(infos['epoch'])

    while True:
        t_start = time.time()
        model.train()
        data = train_loader.get_batch()
        feats = [Variable(feat, volatile=False) for feat in data['feats']]
        labels = Variable(data['labels'], volatile=False)
        masks = Variable(data['masks'], volatile=False)

        if torch.cuda.is_available():
            feats = [feat.cuda() for feat in feats]
            labels = labels.cuda()
            masks = masks.cuda()

        # implement scheduled sampling
        opt.ss_prob = 0
        if opt.use_ss == 1 and infos['epoch'] >= opt.use_ss_after:
            annealing_prob = opt.ss_k / (opt.ss_k + np.exp(
                (infos['epoch'] - opt.use_ss_after) / opt.ss_k))
            opt.ss_prob = min(1 - annealing_prob, opt.ss_max_prob)
            model.set_ss_prob(opt.ss_prob)

        if opt.use_rl == 1 and infos[
                'epoch'] >= opt.use_rl_after and not rl_training:
            logger.info('Using RL objective...')
            rl_training = True
            bcmr_scorer = {
                'Bleu_4': Bleu(),
                'CIDEr': CiderD(df=opt.train_cached_tokens),
                'METEOR': Meteor(),
                'ROUGE_L': Rouge()
            }[opt.eval_metric]

            # logger.info('loading gt refs: %s', train_loader.cocofmt_file)
            # gt_refs = utils.load_gt_refs(train_loader.cocofmt_file)

        mixer_from = opt.mixer_from
        if opt.use_mixer == 1 and rl_training:
            # -1 for annealing
            if opt.mixer_from == -1:
                annealing_mixer = opt.seq_length - int(
                    np.ceil((infos['epoch'] - opt.use_rl_after + 1) /
                            float(opt.mixer_descrease_every)))
                mixer_from = max(1, annealing_mixer)

            model.set_mixer_from(mixer_from)

        scb_captions = opt.scb_captions
        if opt.use_cst == 1 and rl_training:
            if opt.scb_captions == -1:
                annealing_robust = int(
                    np.ceil((infos['epoch'] - opt.use_cst_after + 1) /
                            float(opt.cst_increase_every)))
                scb_captions = min(annealing_robust, seq_per_img - 1)

        optimizer.zero_grad()
        model.set_seq_per_img(seq_per_img)

        if rl_training:
            # using mixer
            pred, model_res, logprobs = model(feats, labels)

            if opt.use_cst == 0:
                # greedy decoding baseline in SCST paper
                greedy_baseline, _ = model.sample(
                    [Variable(f.data, volatile=True) for f in feats], {
                        'sample_max': 1,
                        'expand_feat': opt.expand_feat
                    })

            if opt.use_cst == 1:
                bcmrscores = data['bcmrscores']
                reward, m_score, g_score = utils.get_cst_reward(
                    model_res,
                    data['gts'],
                    bcmr_scorer,
                    bcmrscores=bcmrscores,
                    expand_feat=opt.expand_feat,
                    seq_per_img=train_loader.get_seq_per_img(),
                    scb_captions=scb_captions,
                    scb_baseline=opt.scb_baseline,
                    use_eos=opt.use_eos,
                    use_mixer=opt.use_mixer)
            else:
                # use greedy baseline by default, compute self-critical reward
                reward, m_score, g_score = utils.get_self_critical_reward(
                    model_res,
                    greedy_baseline,
                    data['gts'],
                    bcmr_scorer,
                    expand_feat=opt.expand_feat,
                    seq_per_img=train_loader.get_seq_per_img(),
                    use_eos=opt.use_eos)

            loss = rl_criterion(
                model_res, logprobs,
                Variable(torch.from_numpy(reward).float().cuda(),
                         requires_grad=False))

        else:
            pred = model(feats, labels)[0]
            loss = criterion(pred, labels[:, 1:], masks[:, 1:])

        loss.backward()
        clip_grad_norm(model.parameters(), opt.grad_clip)
        optimizer.step()
        infos['TrainLoss'] = loss.data[0]
        infos['mixer_from'] = mixer_from
        infos['scb_captions'] = scb_captions

        if infos['iter'] % opt.print_log_interval == 0:
            elapsed_time = time.time() - t_start
            log_info = [('Epoch', infos['epoch']), ('Iter', infos['iter']),
                        ('Loss', infos['TrainLoss'])]
            if rl_training:
                log_info += [('Reward', np.mean(reward[:, 0])),
                             ('{} (m)'.format(opt.eval_metric), m_score),
                             ('{} (b)'.format(opt.eval_metric), g_score)]
            if opt.use_ss == 1:
                log_info += [('ss_prob', opt.ss_prob)]
            if opt.use_mixer == 1:
                log_info += [('mixer_from', mixer_from)]
            if opt.use_cst == 1:
                log_info += [('scb_captions', scb_captions)]
            log_info += [('Time', elapsed_time)]
            logger.info(
                '%s',
                '\t'.join(['{}: {}'.format(k, v) for (k, v) in log_info]))

        infos['iter'] += 1

        if infos['epoch'] < train_loader.get_current_epoch():
            infos['epoch'] = train_loader.get_current_epoch()
            checkpoint_checked = False
            learning_rate = utils.adjust_learning_rate(
                opt, optimizer, infos['epoch'] - infos['start_epoch'])
            logger.info('===> Learning rate: %f: ', learning_rate)

        if (infos['epoch'] >= opt.save_checkpoint_from
                and infos['epoch'] % opt.save_checkpoint_every == 0
                and not checkpoint_checked):
            # evaluate the validation performance
            results = validate(model, criterion, val_loader, opt)
            logger.info(
                'Validation output: %s',
                json.dumps(results['scores'], indent=4, sort_keys=True))
            infos.update(results['scores'])

            check_model(model, opt, infos, infos_history)
            checkpoint_checked = True

        if (infos['epoch'] >= opt.max_epochs
                or infos['epoch'] - infos['best_epoch'] > opt.max_patience):
            logger.info('>>> Terminating...')
            break

    return infos
Example #15
import cPickle as pickle
import os
import sys
sys.path.append('/home/llj/caffe/examples/caption-eval/coco-caption')
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from pyciderevalcap.ciderD.ciderD import CiderD
from collections import defaultdict

CiderD_scorer = CiderD(df='msvd')  ### need to change for msvd


def score_all(ref, hypo):
    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
    final_scores = {}
    for scorer, method in scorers:
        score, scores = scorer.compute_score(ref, hypo)
        if type(score) == list:
            for m, s in zip(method, score):
                final_scores[m] = s
        else:
            final_scores[method] = score

    return final_scores


def score(ref, hypo):
Example #16
def get_sample_reward_aic(sample_res, gts_data, gamma, vocab, opt):

    batch_size = sample_res.size(0)
    seq_length = sample_res.size(1)

    global CiderD_scorer
    global Bleu_scorer
    global Rouge_scorer
    global Meteor_scorer  # needed so the Meteor scorer created below persists across calls
    if CiderD_scorer is None:
        # type = 0
        # if type == 0:
        #     path_cider = "/media/amds/data/code/cider"
        #     path_idxs = "/media/amds/data/dataset/mscoco"
        # else:
        #     path_cider = "/home/scw4750/caption/cider"
        #     path_idxs = "/home/scw4750/caption/dataset/mscoco"

        path_cider = opt.path_cider
        path_idxs = opt.path_idxs

        # /home/scw4750/caption/cider
        # /media/amds/data/code/cider
        sys.path.append(path_cider)
        from pyciderevalcap.ciderD.ciderD import CiderD
        from pyciderevalcap.bleu.bleu import Bleu
        from pyciderevalcap.rouge.rouge import Rouge
        from pyciderevalcap.meteor.meteor import Meteor

        # /home/scw4750/caption/dataset/mscoco
        # /media/amds/data/dataset/mscoco
        CiderD_scorer = CiderD(df=opt.cider_idxs, path=path_idxs)
        Bleu_scorer = Bleu()
        Rouge_scorer = Rouge()
        Meteor_scorer = Meteor()

    batch_size = sample_res.size(0)  # batch_size = sample_size * seq_per_img
    seq_per_img = batch_size // len(gts_data)

    res = OrderedDict()
    sample_res = sample_res.cpu().numpy()

    # sample result
    for i in range(batch_size):
        res[i] = [array_to_str_aic(sample_res[i], vocab)]

    gts = OrderedDict()
    for i in range(len(gts_data)):
        gts[i] = [
            array_to_str_aic(gts_data[i][j], vocab)
            for j in range(len(gts_data[i]))
        ]

    res = [{'image_id': i, 'caption': res[i]} for i in range(batch_size)]
    gts = {i: gts[i // seq_per_img] for i in range(batch_size)}

    if opt.rl_metric == 'CIDEr':
        _, scores = CiderD_scorer.compute_score(gts, res)
    elif opt.rl_metric == 'ROUGE_L':
        _, scores = Rouge_scorer.compute_score(gts, res)
    elif opt.rl_metric == 'Bleu_4':
        _, scores = Bleu_scorer.compute_score(gts, res)
        _ = _[-1]
        scores = np.array(scores[-1])
    elif opt.rl_metric == 'AVG':
        d_, d_scores = CiderD_scorer.compute_score(gts, res)
        b_, b_scores = Bleu_scorer.compute_score(gts, res)
        r_, r_scores = Rouge_scorer.compute_score(gts, res)

        b_ = b_[-1]
        b_scores = np.array(b_scores[-1])

        _ = (d_ + b_ + r_) / 3
        scores = (d_scores + b_scores + r_scores) / 3
    elif opt.rl_metric == 'Meteor':
        _, scores = Meteor_scorer.compute_score(gts, res)

    # sample batch
    sample_mean = np.mean(scores)
    print('scores: {:.3f} sample:{:.3f}'.format(_, sample_mean))

    # batch_size
    sample_reward = scores

    # seq_length
    list_gamma = np.logspace(seq_length - 1, 0, seq_length, base=gamma)
    # batch_size * seq_length
    batch_gamma = np.repeat(list_gamma[np.newaxis, :], batch_size, 0)
    # batch_size * seq_length
    batch_sample_reward = np.repeat(sample_reward[:, np.newaxis], seq_length,
                                    1)

    # batch_size * seq_length
    full_sample_reward = batch_gamma * batch_sample_reward

    # sample_reward : batch_size
    # sample_mean : 1
    # full_sample_reward : batch_size * seq_length
    return full_sample_reward, sample_mean
Example #17
class get_self_critical_reward(nn.Module):
    def __init__(self, opt):
        super(get_self_critical_reward, self).__init__()
        self.vocab_size = opt.vocab_size
        self.st2towidx = opt.st2towidx
        self.opt = opt
        # self.st2towidx.requires_grad=False
        self.CiderD_scorer = CiderD(df=opt.cached_tokens)

    def forward(self, gen_input, greedy_input, gt_gts, ncap):

        gen_txt_seq, gen_bn_seq, gen_vis_seq = gen_input
        greedy_txt_seq, greedy_bn_seq, greedy_vis_seq = greedy_input

        self.st2towidx = self.st2towidx.type_as(gen_txt_seq)
        batch_size = gen_txt_seq.size(0)
        seq_per_img = batch_size // gt_gts.size(0)

        gen_result = gen_txt_seq.new(gen_txt_seq.size()).zero_()
        greedy_result = greedy_txt_seq.new(greedy_txt_seq.size()).zero_()

        gen_mask = gen_txt_seq < self.vocab_size
        gen_vis_seq = gen_vis_seq.view(batch_size,-1)
        gen_bn_seq = gen_bn_seq.view(batch_size, -1)

        # compose the seq
        gen_result[gen_mask] = gen_txt_seq[gen_mask]
        gen_vis_idx = gen_vis_seq[gen_mask==0]*2 + gen_bn_seq[gen_mask==0] - 1

        gen_result[gen_mask==0] = self.st2towidx[gen_vis_idx]

        greedy_mask = greedy_txt_seq < self.vocab_size
        greedy_vis_seq = greedy_vis_seq.view(batch_size,-1)
        greedy_bn_seq = greedy_bn_seq.view(batch_size, -1)

        # compose the seq
        greedy_result[greedy_mask] = greedy_txt_seq[greedy_txt_seq < self.vocab_size]
        greedy_vis_idx = greedy_vis_seq[greedy_mask==0]*2 + greedy_bn_seq[greedy_mask==0] - 1
        greedy_result[greedy_mask==0] = self.st2towidx[greedy_vis_idx]

        res = OrderedDict()
        gen_result = gen_result.cpu().numpy()
        greedy_result = greedy_result.cpu().numpy()

        for i in range(batch_size):
            res[i] = [array_to_str(gen_result[i])]
        for i in range(batch_size):
            res[batch_size + i] = [array_to_str(greedy_result[i])]

        gts = OrderedDict()
        for i in range(batch_size):
            gts_np = gt_gts[i][:ncap.data[i]].data.cpu().numpy()
            gts[i] = [array_to_str(gts_np[j]) for j in range(len(gts_np))]

        # caption = utils.decode_normal(self.opt.itow, torch.from_numpy(gen_result))
        # pdb.set_trace()
        # print(caption[0])

        # utils.decode_normal(self.opt.itow, gt_gts.data.view(-1,20))
        #_, scores = Bleu(4).compute_score(gts, res)
        #scores = np.array(scores[3])
        res = [{'image_id':i, 'caption': res[i]} for i in range(2 * batch_size)]
        gts = {i: gts[i % batch_size // seq_per_img] for i in range(2 * batch_size)}
        _, scores = self.CiderD_scorer.compute_score(gts, res)
        # print(_)

        scores = scores[:batch_size] - scores[batch_size:]
        rewards = np.repeat(scores[:, np.newaxis], gen_result.shape[1], 1)

        return rewards, _
Example #18
 def __init__(self, args):
     self.cider = CiderD(df='coco-train')
     with open('data/train_references.pkl') as fid:
         self.references = pickle.load(fid)
Example #19
def train(opt):
    # setup dataloader
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    #set the checkpoint path
    opt.checkpoint_path = os.path.join(opt.checkpoint_path, opt.id)
    isExists = os.path.exists(opt.checkpoint_path)
    if not isExists:
        os.makedirs(opt.checkpoint_path)
        os.makedirs(opt.checkpoint_path + '/logs')
        print(opt.checkpoint_path + ' created!')
    else:
        print(opt.checkpoint_path + ' already exists!')

    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(
                os.path.join(
                    opt.checkpoint_path, 'infos_' + opt.id +
                    format(int(opt.start_from), '04') + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "att_feat_size", "rnn_size",
                "input_encoding_size"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(
                    opt.checkpoint_path, 'histories_' + opt.id +
                    format(int(opt.start_from), '04') + '.pkl')):
            with open(
                    os.path.join(
                        opt.checkpoint_path, 'histories_' + opt.id +
                        format(int(opt.start_from), '04') + '.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    word_loss_history = histories.get('word_loss_history', {})
    MAD_loss_history = histories.get('MAD_loss_history', {})
    SAP_loss_history = histories.get('SAP_loss_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    lr_history = histories.get('lr_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    #set up model, assure in training mode
    threshold = opt.threshold
    sc_flag = False
    num_gpu = opt.num_gpu

    model = models.setup(opt).cuda(device=0)
    model.train()
    update_lr_flag = True
    dp_model = torch.nn.parallel.DataParallel(model)

    optimizer = optim.Adam(model.parameters(),
                           opt.learning_rate,
                           (opt.optim_alpha, opt.optim_beta),
                           opt.optim_epsilon,
                           weight_decay=opt.weight_decay)
    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(
                opt.checkpoint_path, 'optimizer' + opt.id +
                format(int(opt.start_from), '04') + '.pth')):
        optimizer.load_state_dict(
            torch.load(
                os.path.join(
                    opt.checkpoint_path, 'optimizer' + opt.id +
                    format(int(opt.start_from), '04') + '.pth')))

    if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
        frac = (epoch - opt.scheduled_sampling_start
                ) // opt.scheduled_sampling_increase_every
        opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                          opt.scheduled_sampling_max_prob)
        model.ss_prob = opt.ss_prob

    optimizer.zero_grad()
    accumulate_iter = 0
    train_loss = 0

    subsequent_mat = np.load('data/markov_mat.npy')
    subsequent_mat = torch.from_numpy(subsequent_mat).cuda(device=0).float()
    subsequent_mat_all = subsequent_mat.clone()
    # for multi-GPU training
    for i in range(opt.num_gpu - 1):
        subsequent_mat_all = torch.cat([subsequent_mat_all, subsequent_mat],
                                       dim=0)

    while True:
        if update_lr_flag:
            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
            else:
                opt.current_lr = opt.learning_rate
            for group in optimizer.param_groups:
                group['lr'] = opt.current_lr

            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob

            # If start self critical training
            if sc_flag == False and opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                print('initializing CIDEr scorer...')
                s = time.time()
                global CiderD_scorer
                if (CiderD_scorer is None):
                    CiderD_scorer = CiderD(df=opt.cached_tokens)
                    #takes about 30s
                    print('initializing CIDEr scorer in {:.3f}s'.format(
                        time.time() - s))
                sc_flag = True
                opt.learning_rate_decay_every = opt.learning_rate_decay_every * 2  #default 5 for xe, 10 for scst

            update_lr_flag = False

        print('current_lr is {}'.format(opt.current_lr))
        start = time.time()
        data = loader.get_batch('train', opt.batch_size)

        torch.cuda.synchronize()

        fc_feats = None
        att_feats = None

        tmp = [
            data['fc_feats'], data['labels'], data['masks'], data['att_feats'],
            data['attr_labels'], data['subsequent_labels']
        ]
        tmp = [
            _ if _ is None else torch.from_numpy(_).cuda(device=0) for _ in tmp
        ]
        fc_feats, labels, masks, att_feats, attr_labels, subsequent_labels = tmp

        #convert 1-1000 to 0-999 (perhaps done in preprocessing)
        subsequent_labels = subsequent_labels - 1
        subsequent_mask = (subsequent_labels[:, 1:] >= 0).float()
        subsequent_labels = torch.where(
            subsequent_labels > 0, subsequent_labels,
            torch.zeros_like(subsequent_labels).int().cuda(device=0))

        print('Read and process data:', time.time() - start)

        if not sc_flag:
            SAP_loss, word_loss, MAD_loss = dp_model(
                fc_feats, att_feats, labels, masks, attr_labels,
                subsequent_labels, subsequent_mask, subsequent_mat_all)
            SAP_loss = SAP_loss.mean()
            word_loss = word_loss.mean()
            MAD_loss = MAD_loss.mean()
            accumulate_iter = accumulate_iter + 1
            loss = (word_loss + 0.2 * SAP_loss +
                    0.2 * MAD_loss) / opt.accumulate_number
            loss.backward()
        else:
            st = time.time()
            sm = torch.zeros([num_gpu, 1]).cuda(
                device=0)  # indices for sampling by probabilities
            gen_result, sample_logprobs, _ = dp_model(fc_feats,
                                                      att_feats,
                                                      attr_labels,
                                                      subsequent_mat_all,
                                                      sm,
                                                      mode='sample')
            dp_model.eval()
            with torch.no_grad():
                greedy_res, _, _ = dp_model(fc_feats,
                                            att_feats,
                                            attr_labels,
                                            subsequent_mat_all,
                                            mode='sample')
            dp_model.train()
            ed = time.time()
            print('GPU time is : {}s'.format(ed - st))
            reward = get_self_critical_reward(gen_result, greedy_res,
                                              data['gts'])
            word_loss = dp_model(sample_logprobs,
                                 gen_result.data,
                                 torch.from_numpy(reward).float().cuda(),
                                 mode='scst_forward')
            word_loss = word_loss.mean()

            loss = word_loss

            #forward to minimize SAP loss and MAD loss
            SAP_loss, _, MAD_loss = dp_model(fc_feats, att_feats, labels,
                                             masks, attr_labels,
                                             subsequent_labels,
                                             subsequent_mask,
                                             subsequent_mat_all)
            SAP_loss = SAP_loss.mean()
            MAD_loss = MAD_loss.mean()
            loss = loss + 0.2 * SAP_loss + 0.2 * MAD_loss
            loss.backward()
            accumulate_iter = accumulate_iter + 1

        if accumulate_iter % opt.accumulate_number == 0:
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            optimizer.zero_grad()

            iteration += 1
            accumulate_iter = 0
            train_loss = loss.item() * opt.accumulate_number
            end = time.time()
            #you can record the training log if you need
            #text_file = open(opt.checkpoint_path+'/logs/train_log_'+opt.id+'.txt', "aw")
            if not sc_flag:
                print("iter {} (epoch {}), SAP_loss = {:.3f}, word_loss = {:.3f}, MAD_loss = {:.3f} time/batch = {:.3f}" \
                      .format(iteration, epoch,SAP_loss, word_loss,MAD_loss, end - start))
                #text_file.write("iter {} (epoch {}),SAP_loss = {:.3f}, word_loss {:.3f}, MAD_loss {:.3f},time/batch = {:.3f}\n" \
                #      .format(iteration, epoch,SAP_loss, word_loss, MAD_loss, end - start))

            else:
                print("iter {} (epoch {}),SAP_loss = {:.3f}, avg_reward = {:.3f},MAD_loss = {:.3f} time/batch = {:.3f}" \
                      .format(iteration, epoch,SAP_loss,np.mean(reward[:, 0]),MAD_loss, end - start))
                #text_file.write("iter {} (epoch {}), avg_reward = {:.3f} MAD_loss ={:.3f}, time/batch = {:.3f}\n" \
                #      .format(iteration, epoch, np.mean(reward[:, 0]), MAD_loss, end - start))
            #text_file.close()
        torch.cuda.synchronize()

        # Update the iteration and epoch

        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every
                == 0) and (accumulate_iter % opt.accumulate_number == 0):
            add_summary_value(tb_summary_writer, 'word_loss', word_loss.item(),
                              iteration)
            add_summary_value(tb_summary_writer, 'MAD_loss', MAD_loss.item(),
                              iteration)
            add_summary_value(tb_summary_writer, 'SAP_loss', SAP_loss.item(),
                              iteration)
            add_summary_value(tb_summary_writer, 'learning_rate',
                              opt.current_lr, iteration)
            add_summary_value(tb_summary_writer, 'scheduled_sampling_prob',
                              model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward',
                                  np.mean(reward[:, 0]), iteration)

            loss_history[iteration] = train_loss if not sc_flag else np.mean(
                reward[:, 0])
            word_loss_history[iteration] = word_loss.item()
            SAP_loss_history[iteration] = SAP_loss.item()
            MAD_loss_history[iteration] = MAD_loss.item()

            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every
                == 0) and (accumulate_iter % opt.accumulate_number == 0):
            # eval model
            eval_kwargs = {
                'split': 'val',
                'dataset': opt.input_json,
                'num_images': -1,
                'index_eval': 1,
                'id': opt.id,
                'beam': opt.beam,
                'verbose_loss': 1,
                'checkpoint_path': opt.checkpoint_path
            }
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats, precision, recall = eval_utils.eval_split(
                dp_model, loader, subsequent_mat_all, eval_kwargs)

            # Write validation result into summary
            add_summary_value(tb_summary_writer, 'validation loss', val_loss,
                              iteration)
            if lang_stats is not None:
                for k, v in lang_stats.items():
                    add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            #save lang stats
            f_lang = open(
                opt.checkpoint_path + '/logs/lang_' + opt.id + '.txt', 'a')  # 'a': append ('aw' is not a valid file mode)
            f_lang.write(
                str(iteration) + ' ' +
                str(iteration / opt.save_checkpoint_every) + '\n')
            f_lang.write('val loss ' + str(val_loss) + '\n')
            for key_lang in lang_stats:
                f_lang.write(key_lang + ' ' + str(lang_stats[key_lang]) + '\n')
            f_lang.write('precision ' + str(precision) + ' recall ' +
                         str(recall) + '\n')
            f_lang.close()

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            save_id = iteration / opt.save_checkpoint_every

            if best_val_score is None or current_score > best_val_score or current_score > threshold:
                best_val_score = current_score
                best_flag = True

                ##only save the improved models or when the CIDEr-D is larger than a given threshold
                checkpoint_path = os.path.join(
                    opt.checkpoint_path,
                    'model' + opt.id + format(int(save_id), '04') + '.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(
                    opt.checkpoint_path,
                    'optimizer' + opt.id + format(int(save_id), '04') + '.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # record the lang stats for the saved model
                f_lang = open(
                    opt.checkpoint_path + '/logs/Best_lang_' + opt.id + '.txt',
                    'a')  # 'a': append ('aw' is not a valid file mode)
                f_lang.write(
                    str(iteration) + ' ' +
                    str(iteration / opt.save_checkpoint_every) + '\n')
                f_lang.write('val loss ' + str(val_loss) + '\n')
                for key_lang in lang_stats:
                    f_lang.write(key_lang + ' ' + str(lang_stats[key_lang]) +
                                 '\n')
                f_lang.write('precision ' + str(precision) + ' recall ' +
                             str(recall) + '\n')
                f_lang.close()

            # Dump miscellaneous information
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_ix'] = loader.split_ix
            infos['best_val_score'] = best_val_score
            infos['opt'] = opt
            infos['vocab'] = loader.get_vocab()

            histories['val_result_history'] = val_result_history
            histories['loss_history'] = loss_history
            histories['word_loss_history'] = word_loss_history
            histories['MAD_loss_history'] = MAD_loss_history
            histories['SAP_loss_history'] = SAP_loss_history

            histories['lr_history'] = lr_history
            histories['ss_prob_history'] = ss_prob_history

            with open(
                    os.path.join(
                        opt.checkpoint_path, 'infos_' + opt.id +
                        format(int(save_id), '04') + '.pkl'), 'wb') as f:
                cPickle.dump(infos, f)
            with open(
                    os.path.join(
                        opt.checkpoint_path, 'histories_' + opt.id +
                        format(int(save_id), '04') + '.pkl'), 'wb') as f:
                cPickle.dump(histories, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
Example #20
def eval_split(model, crit, loader, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    verbose_beam = eval_kwargs.get('verbose_beam', 1)
    verbose_loss = eval_kwargs.get('verbose_loss', 1)
    num_images = eval_kwargs.get('num_images',
                                 eval_kwargs.get('val_images_use', -1))
    split = eval_kwargs.get('split', None)
    if split == 'onlinetest':
        split = None
    lang_eval = eval_kwargs.get('language_eval', 0)
    dataset = eval_kwargs.get('dataset', 'coco')
    beam_size = eval_kwargs.get('beam_size', 1)
    ciderd = eval_kwargs.get('ciderd', False)
    annFile = eval_kwargs.get('annfile', None)

    # Make sure in the evaluation mode
    model.eval()

    loader.reset_iterator(split)

    n = 0
    loss = 0
    loss_sum = 0
    loss_evals = 1e-8
    predictions = []

    # produce CiderD score for each generated caption
    if ciderd:
        CiderD_scorer = CiderD(df='coco-train-idxs')

    while True:
        data = loader.get_batch(split)
        n = n + loader.batch_size

        if data.get('labels', None) is not None and verbose_loss:
            # forward the model to get loss
            tmp = [
                data['fc_feats'], data['att_feats'], data['labels'],
                data['masks'], data['att_masks']
            ]
            tmp = [
                torch.from_numpy(_).cuda() if _ is not None else _ for _ in tmp
            ]
            fc_feats, att_feats, labels, masks, att_masks = tmp

            with torch.no_grad():
                loss = crit(
                    model(fc_feats, att_feats, labels, att_masks)[0],
                    labels[:, 1:], masks[:, 1:]).item()
            loss_sum = loss_sum + loss
            loss_evals = loss_evals + 1

        # forward the model to also get generated samples for each image
        # Only leave one feature for each image, in case duplicate sample
        tmp = [
            data['fc_feats'][np.arange(loader.batch_size) *
                             loader.seq_per_img],
            data['att_feats'][np.arange(loader.batch_size) *
                              loader.seq_per_img],
            data['att_masks'][np.arange(loader.batch_size) *
                              loader.seq_per_img]
            if data['att_masks'] is not None else None
        ]
        tmp = [torch.from_numpy(_).cuda() if _ is not None else _ for _ in tmp]
        fc_feats, att_feats, att_masks = tmp
        # forward the model to also get generated samples for each image
        with torch.no_grad():
            outputs = model(fc_feats,
                            att_feats,
                            att_masks,
                            opt=eval_kwargs,
                            mode='sample')
            seq = outputs[0].data

        # produce the CIDEr-D score: first build the ground-truth dict gts and the result list res
        if ciderd:
            gts = {}
            for i in range(len(data['gts'])):
                gts[data['infos'][i]['id']] = [
                    array_to_str(data['gts'][i][j])
                    for j in range(len(data['gts'][i]))
                ]
            gen_result = seq.data.cpu().numpy()
            res = {}
            for i in range(len(gen_result)):
                res[data['infos'][i]['id']] = [array_to_str(gen_result[i])]
            res_ = [{'image_id': k, 'caption': v} for k, v in res.items()]
            _, cider_scores = CiderD_scorer.compute_score(gts, res_)

        # Print beam search
        if beam_size > 1 and verbose_beam:
            for i in range(loader.batch_size):
                print('\n'.join([
                    utils.decode_sequence(loader.get_vocab(),
                                          _['seq'].unsqueeze(0))[0]
                    for _ in model.done_beams[i]
                ]))
                print('--' * 10)
        sents = utils.decode_sequence(loader.get_vocab(), seq)

        for k, sent in enumerate(sents):
            if ciderd:
                entry = {
                    'image_id': data['infos'][k]['id'],
                    'caption': sent,
                    'cider': cider_scores[k]
                }
            else:
                entry = {'image_id': data['infos'][k]['id'], 'caption': sent}

            if eval_kwargs.get('dump_path', 0) == 1:
                entry['file_name'] = data['infos'][k]['file_path']
            predictions.append(entry)
            if eval_kwargs.get('dump_images', 0) == 1:
                # dump the raw image to vis/ folder
                cmd = 'cp "' + os.path.join(
                    eval_kwargs['image_root'],
                    data['infos'][k]['file_path']) + '" vis/imgs/img' + str(
                        len(predictions)) + '.jpg'  # bit gross
                print(cmd)
                os.system(cmd)

            if verbose:
                if ciderd:
                    print(
                        'image %s: %s; ciderd: %.3f' %
                        (entry['image_id'], entry['caption'], entry['cider']))
                else:
                    print('image %s: %s' %
                          (entry['image_id'], entry['caption']))

        # if we wrapped around the split or used up val imgs budget then bail
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if num_images != -1:
            ix1 = min(ix1, num_images)
        for i in range(n - ix1):
            predictions.pop()

        if verbose:
            print('evaluating validation performance... %d/%d (%f)' %
                  (ix0 - 1, ix1, loss))

        if data['bounds']['wrapped']:
            break
        if num_images >= 0 and n >= num_images:
            break

    lang_stats = None
    if lang_eval == 1:
        lang_stats = language_eval(dataset, predictions, eval_kwargs['id'],
                                   split, annFile)

    # Switch back to training mode
    model.train()
    return loss_sum / loss_evals, predictions, lang_stats
Example #21
 def _init_cider_scorer(self):
     cached_tokens, _ = os.path.splitext(os.path.basename(self.ngram_file))
     self.CiderD_scorer = self.CiderD_scorer or CiderD(
         df=cached_tokens, ngram_file=self.ngram_file)
Example #22
import time
import misc.utils as utils
from collections import OrderedDict
import torch

import sys
#sys.path.append("cider")
sys.path.append("cider-master")
from pyciderevalcap.ciderD.ciderD import CiderD
sys.path.append("coco-caption")
#from pycocoevalcap.cider.cider import CiderD
from pycocoevalcap.bleu.bleu import Bleu

CiderD_scorer = None
Bleu_scorer = None
CiderD_scorer = CiderD(df='corpus')


def init_scorer(cached_tokens):
    global CiderD_scorer
    CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens)
    global Bleu_scorer
    Bleu_scorer = Bleu_scorer or Bleu(4)


def array_to_str(arr):
    out = ''
    for i in range(len(arr)):
        out += str(arr[i]) + ' '
        if arr[i] == 0:
            break
Example #23
 def __init__(self, path_to_cached_tokens, metric_weights):
     self._scorer = dict(
             ciderD = CiderD(df=path_to_cached_tokens),
             cider = Cider(df=path_to_cached_tokens),
             bleu = BleuSilent(4))
     self.weights = metric_weights
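The snippet stores the scorers and metric_weights but does not show how they are combined; a plausible sketch is below, assuming weights is a {name: float} mapping keyed like self._scorer. This helper is not part of the quoted class and only illustrates one way the per-caption scores could be mixed:

import numpy as np

def weighted_scores(scorer_dict, weights, gts, res):
    # scorer_dict / weights are assumed to look like self._scorer / self.weights above.
    total = None
    for name, scorer in scorer_dict.items():
        _, per_caption = scorer.compute_score(gts, res)
        if isinstance(per_caption, list):        # Bleu returns one list of scores per n-gram order
            per_caption = np.array(per_caption[-1])
        else:
            per_caption = np.asarray(per_caption)
        contribution = weights.get(name, 0.0) * per_caption
        total = contribution if total is None else total + contribution
    return total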
Example #24
    tf.logging.info('restoring from checkpoint')
else:
    sess.run(init)
    
start = sess.run(models[0].step)

best_res = 0.0
history_cider = []
batch_size = FLAGS.batch_size*FLAGS.ngpu
# the number of batches(iterations) per epoch
nBatches_epoch = opts.nImgs//batch_size
vocab = vocabulary(opts)

# The cider_d scorer
# TODO create the df
CiderD_Scorer = CiderD(df=FLAGS.cached_tokens)

def score_seq(gts,seqs):
    
    # check if it is valid
    assert gts.shape[0]%opts.nSeqs_per_img==0
    assert len(seqs) == 2
    assert seqs[0].shape[0] == seqs[1].shape[0] ==gts.shape[0]
    assert seqs[0].shape[0]%opts.nSeqs_per_img == 0
    
    batch_size = gts.shape[0]
    nImage = batch_size//opts.nSeqs_per_img
    gts = np.reshape(gts,[nImage,opts.nSeqs_per_img,-1])
    gts = {i: list(chain.from_iterable(convert_to_str(gts[i]))) for i in xrange(nImage)}
    
    baseline_seqs, random_seqs = seqs
Example #25
def get_self_critical_reward_aic(greedy_res, gen_result, gts_data, alpha,
                                 vocab, opt):
    global CiderD_scorer
    global Bleu_scorer
    global Rouge_scorer
    global Meteor_scorer  # needed so the Meteor scorer created below persists across calls
    if CiderD_scorer is None:
        # type = 0
        # if type == 0:
        #     path_cider = "/media/amds/data/code/cider"
        #     path_idxs = "/media/amds/data/dataset/mscoco"
        # else:
        #     path_cider = "/home/scw4750/caption/cider"
        #     path_idxs = "/home/scw4750/caption/dataset/mscoco"

        path_cider = opt.path_cider
        path_idxs = opt.path_idxs

        # /home/scw4750/caption/cider
        # /media/amds/data/code/cider
        sys.path.append(path_cider)
        from pyciderevalcap.ciderD.ciderD import CiderD
        from pyciderevalcap.bleu.bleu import Bleu
        from pyciderevalcap.rouge.rouge import Rouge
        from pyciderevalcap.meteor.meteor import Meteor

        # /home/scw4750/caption/dataset/mscoco
        # /media/amds/data/dataset/mscoco
        CiderD_scorer = CiderD(df=opt.cider_idxs, path=path_idxs)
        Bleu_scorer = Bleu()
        Rouge_scorer = Rouge()
        Meteor_scorer = Meteor()

    batch_size = gen_result.size(0)  # batch_size = sample_size * seq_per_img
    seq_per_img = batch_size // len(gts_data)

    res = OrderedDict()
    gen_result = gen_result.cpu().numpy()
    greedy_res = greedy_res.cpu().numpy()

    # sample result
    for i in range(batch_size):
        res[i] = [array_to_str_aic(gen_result[i], vocab)]

    # greedy result
    for i in range(batch_size):
        res[batch_size + i] = [array_to_str_aic(greedy_res[i], vocab)]

    gts = OrderedDict()
    for i in range(len(gts_data)):
        gts[i] = [
            array_to_str_aic(gts_data[i][j], vocab)
            for j in range(len(gts_data[i]))
        ]

    res = [{'image_id': i, 'caption': res[i]} for i in range(2 * batch_size)]
    gts = {
        i: gts[i % batch_size // seq_per_img]
        for i in range(2 * batch_size)
    }

    if opt.rl_metric == 'CIDEr':
        _, scores = CiderD_scorer.compute_score(gts, res)
    elif opt.rl_metric == 'ROUGE_L':
        _, scores = Rouge_scorer.compute_score(gts, res)
    elif opt.rl_metric == 'Bleu_4':
        _, scores = Bleu_scorer.compute_score(gts, res)
        _ = _[-1]
        scores = np.array(scores[-1])
    elif opt.rl_metric == 'AVG':
        d_, d_scores = CiderD_scorer.compute_score(gts, res)
        b_, b_scores = Bleu_scorer.compute_score(gts, res)
        r_, r_scores = Rouge_scorer.compute_score(gts, res)

        b_ = b_[-1]
        b_scores = np.array(b_scores[-1])

        _ = (d_ + b_ + r_) / 3
        scores = (d_scores + b_scores + r_scores) / 3
    elif opt.rl_metric == 'Meteor':
        _, scores = Meteor_scorer.compute_score(gts, res)

    sample_mean = np.mean(scores[:batch_size])
    greedy_mean = np.mean(scores[batch_size:])
    print('scores: {:.3f} sample:{:.3f} greedy:{:.3f}'.format(
        _, sample_mean, greedy_mean))

    # diff_result = sample_result - greedy_result
    # batch_size
    scores = scores[:batch_size] - scores[batch_size:] * alpha

    # batch_size * seq_length
    rewards = np.repeat(scores[:, np.newaxis], gen_result.shape[1], 1)

    return rewards, sample_mean, greedy_mean
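The (batch_size x seq_length) reward matrices returned by functions like the one above are typically fed to a policy-gradient criterion, as in the rl_criterion and 'scst_forward' calls of Examples #14 and #19. Below is a minimal sketch of the usual masked formulation; the class name, signature, and the convention of keeping the end-of-sequence position in the mask are assumptions, not the exact criterion used by those repositories:

import torch
import torch.nn as nn

class RewardCriterion(nn.Module):
    # sample_logprobs: (batch, seq_len) log-probabilities of the sampled tokens
    # gen_result:      (batch, seq_len) sampled token ids, with 0 marking end-of-sequence
    # reward:          (batch, seq_len) tensor, e.g. torch.from_numpy(rewards).float()
    def forward(self, sample_logprobs, gen_result, reward):
        mask = (gen_result > 0).float()
        # one common convention: include the EOS position itself in the loss
        mask = torch.cat([mask.new_ones(mask.size(0), 1), mask[:, :-1]], dim=1)
        loss = -sample_logprobs * reward * mask
        return loss.sum() / mask.sum()

A call would then look roughly like loss = RewardCriterion()(sample_logprobs, gen_result, torch.from_numpy(rewards).float()).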