Example #1
    def test_story(self, model, dataset, loader, opt):
        logging.info("Evaluating...")
        start = time.time()
        model.eval()
        dataset.test()

        predictions = {}
        prediction_txt = open(self.prediction_file,
                              'w')  # open the text file to store the predictions

        for iter, batch in enumerate(loader):
            iter_start = time.time()

            semantic = batch['semantic'].cuda()
            feature_fc = Variable(batch['feature_fc'], volatile=True).cuda()
            feature_conv = Variable(
                batch['feature_conv'],
                volatile=True).cuda() if 'feature_conv' in batch else None
            if feature_conv is not None:
                results, _ = model.predict(feature_fc,
                                           feature_conv,
                                           beam_size=opt.beam_size)
            else:
                results, _ = model.predict(feature_fc,
                                           semantic,
                                           beam_size=opt.beam_size)

            sents = utils.decode_story(dataset.get_vocab(), results)

            indexes = batch['index'].numpy()
            for j, story in enumerate(sents):
                vid, _ = dataset.get_id(indexes[j])
                if vid not in predictions:  # only predict one story for an album
                    # write into txt file for evaluate metrics like Cider
                    prediction_txt.write('{}\t {}\n'.format(vid, story))
                    # save into predictions
                    predictions[vid] = story

            print("Evaluate iter {}/{}  {:04.2f}%. Time used: {}".format(
                iter, len(loader), iter * 100.0 / len(loader),
                time.time() - iter_start))

        prediction_txt.close()

        json_prediction_file = '{}.json'.format(self.prediction_file)
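        # The scorer expects a list of candidate stories per id,
        # so wrap each story in a single-element list before evaluating.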
        for vid in predictions.keys():
            predictions[vid] = [predictions[vid]]
        self.eval.evaluate(self.reference, predictions)
        with open(json_prediction_file, 'w') as f:
            json.dump(predictions, f)
        metrics = self.eval.eval_overall

        json.dump(metrics,
                  open(os.path.join(self.save_dir, 'test_scores.json'), 'w'))
        # Switch back to training mode
        print("Test finished. Time used: {}".format(time.time() - start))
        return predictions, metrics
Example #2
    def forward(self, seq, seq_log_probs, baseline, index, rewards=None):
        '''
        :param seq: (batch_size, 5, seq_length)
        :param seq_log_probs: (batch_size, 5, seq_length)
        :param baseline: (batch_size, 5, seq_length)
        :param index: (batch_size,)
        :param rewards: (batch_size, 5, seq_length)
        :return:
        '''
        if rewards is None:
            # compute the reward
            sents = utils.decode_story(self.dataset.get_vocab(), seq)

            rewards = []
            batch_size = seq.size(0)
            for i, story in enumerate(sents):
                vid, _ = self.dataset.get_id(index[i])
                GT_story = self.dataset.get_GT(index[i])
                result = {vid: [story]}
                gt = {vid: [GT_story]}
                score, _ = self.reward_scorer.compute_score(gt, result)
                if self.bleu is not None:
                    rewards.append(score[self.bleu])
                else:
                    rewards.append(score)
            rewards = torch.FloatTensor(rewards)  # (batch_size,)
            avg_reward = rewards.mean()
            rewards = Variable(rewards.view(batch_size, 1,
                                            1).expand_as(seq)).cuda()
        else:
            avg_reward = rewards.mean()
            rewards = rewards.view(-1, 5, 1)

        # get the mask
        # mask over generated tokens; its size is (batch_size, 5, seq_length)
        mask = (seq > 0).float()
        if mask.size(2) > 1:
            mask = torch.cat([
                mask.new(mask.size(0), mask.size(1), 1).fill_(1),
                mask[:, :, :-1]
            ], 2).contiguous()
        else:
            mask.fill_(1)
        mask = Variable(mask)

        # compute the loss
        advantage = Variable(rewards.data - baseline.data)
        value_loss = self._cal_value_loss(rewards, baseline, mask)
        action_loss = self._cal_action_loss(seq_log_probs, advantage, mask)

        return action_loss + value_loss, avg_reward
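The two helpers called above, `_cal_value_loss` and `_cal_action_loss`, are not included in this example. A minimal sketch of what they plausibly compute, assuming a masked mean-squared error for the baseline (value) loss and the standard REINFORCE policy-gradient term for the action loss:

    def _cal_value_loss(self, rewards, baseline, mask):
        # Hypothetical: regress the learned baseline towards the observed
        # rewards, counting only the time steps selected by the mask.
        return torch.sum((baseline - rewards) ** 2 * mask) / torch.sum(mask)

    def _cal_action_loss(self, seq_log_probs, advantage, mask):
        # Hypothetical REINFORCE term: negative log-probability weighted
        # by the advantage, averaged over unmasked time steps.
        return -torch.sum(seq_log_probs * advantage * mask) / torch.sum(mask)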
Example #3
def train(opt):
    logger = Logger(opt)
    flag = Flag(D_iters=opt.D_iter, G_iters=opt.G_iter, always=opt.always)
    ################### set up dataset and dataloader ########################
    dataset = VISTDataset(opt)
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.get_story_length()

    dataset.set_option(data_type={
        'whole_story': False,
        'split_story': True,
        'caption': False
    })

    dataset.train()
    train_loader = DataLoader(dataset,
                              batch_size=opt.batch_size,
                              shuffle=opt.shuffle,
                              num_workers=opt.workers)
    dataset.val()
    val_loader = DataLoader(dataset,
                            batch_size=opt.batch_size,
                            shuffle=False,
                            num_workers=opt.workers)

    ##################### set up model, criterion and optimizer ######
    bad_valid = 0

    # set up evaluator
    evaluator = Evaluator(opt, 'val')

    # set up criterion
    crit = criterion.LanguageModelCriterion()
    rl_crit = criterion.ReinforceCriterion(opt, dataset)

    # set up model
    model = models.setup(opt)
    model.cuda()
    disc_opt = copy.copy(opt)
    disc_opt.model = 'RewardModel'
    disc = models.setup(disc_opt)
    if os.path.exists(os.path.join(logger.log_dir, 'disc-model.pth')):
        logging.info("loading pretrained RewardModel")
        disc.load_state_dict(
            torch.load(os.path.join(logger.log_dir, 'disc-model.pth')))
    disc.cuda()

    # set up optimizer
    optimizer = setup_optimizer(opt, model)
    disc_optimizer = setup_optimizer(opt, disc)

    dataset.train()
    model.train()
    disc.train()
    ############################## training ##################################
    for epoch in range(logger.epoch_start, opt.max_epochs):
        # Assign the scheduled sampling prob

        start = time.time()
        for iter, batch in enumerate(train_loader):
            logger.iteration += 1
            torch.cuda.synchronize()

            feature_fc = Variable(batch['feature_fc']).cuda()
            target = Variable(batch['split_story']).cuda()
            index = batch['index']

            optimizer.zero_grad()
            disc_optimizer.zero_grad()

            if flag.flag == "Disc":
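                # Discriminator phase: sample stories from the current generator
                # and train the reward model to score ground truth above them.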
                model.eval()
                disc.train()
                if opt.decoding_method_DISC == 'sample':
                    seq, seq_log_probs, baseline = model.sample(
                        feature_fc,
                        sample_max=False,
                        rl_training=True,
                        pad=True)
                elif opt.decoding_method_DISC == 'greedy':
                    seq, seq_log_probs, baseline = model.sample(
                        feature_fc,
                        sample_max=True,
                        rl_training=True,
                        pad=True)
            else:
                model.train()
                disc.eval()
                seq, seq_log_probs, baseline = model.sample(feature_fc,
                                                            sample_max=False,
                                                            rl_training=True,
                                                            pad=True)

            seq = Variable(seq).cuda()
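            # Token mask, shifted right by one step so the end-of-sequence
            # prediction is also counted; used to average log-probs per sentence.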
            mask = (seq > 0).float()
            mask = to_contiguous(
                torch.cat([
                    Variable(
                        mask.data.new(mask.size(0), mask.size(1), 1).fill_(1)),
                    mask[:, :, :-1]
                ], 2))
            normed_seq_log_probs = (seq_log_probs *
                                    mask).sum(-1) / mask.sum(-1)

            gen_score = disc(seq.view(-1, seq.size(2)),
                             feature_fc.view(-1, feature_fc.size(2)))

            if flag.flag == "Disc":
                gt_score = disc(target.view(-1, target.size(2)),
                                feature_fc.view(-1, feature_fc.size(2)))
                loss = -torch.sum(gt_score) + torch.sum(gen_score)

                avg_pos_score = torch.mean(gt_score)
                avg_neg_score = torch.mean(gen_score)

                if logger.iteration % 5 == 0:
                    logging.info("pos reward {} neg reward {}".format(
                        avg_pos_score.data[0], avg_neg_score.data[0]))
                    print(
                        "PREDICTION: ",
                        utils.decode_story(dataset.get_vocab(),
                                           seq[:1].data)[0])
                    print(
                        "GROUND TRUTH: ",
                        utils.decode_story(dataset.get_vocab(),
                                           target[:1].data)[0])
            else:
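                # Generator phase: use the discriminator score, minus a small
                # log-probability term, as the sequence-level reward.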
                rewards = Variable(gen_score.data -
                                   0.001 * normed_seq_log_probs.data)
                #with open("/tmp/reward.txt", "a") as f:
                #    print(" ".join(map(str, rewards.data.cpu().numpy())), file=f)
                loss, avg_score = rl_crit(seq.data, seq_log_probs, baseline,
                                          index, rewards)
                # if logger.iteration % opt.losses_log_every == 0:
                avg_pos_score = torch.mean(gen_score)
                logging.info("average reward: {} average IRL score: {}".format(
                    avg_score.data[0], avg_pos_score.data[0]))

            if flag.flag == "Disc":
                loss.backward()
                nn.utils.clip_grad_norm(disc.parameters(),
                                        opt.grad_clip,
                                        norm_type=2)
                disc_optimizer.step()
            else:
                tf_loss = crit(model(feature_fc, target), target)
                print("rl_loss / tf_loss = ", loss.data[0] / tf_loss.data[0])
                loss = opt.rl_weight * loss + (1 - opt.rl_weight) * tf_loss
                loss.backward()
                nn.utils.clip_grad_norm(model.parameters(),
                                        opt.grad_clip,
                                        norm_type=2)
                optimizer.step()

            train_loss = loss.data[0]
            torch.cuda.synchronize()

            # Write the training loss summary
            if logger.iteration % opt.losses_log_every == 0:
                logger.log_training(epoch, iter, train_loss, opt.learning_rate,
                                    model.ss_prob)
                logging.info(
                    "Epoch {} Train {} - Iter {} / {}, loss = {:.5f}, time used = {:.3f}s"
                    .format(epoch, flag.flag, iter, len(train_loader),
                            train_loss,
                            time.time() - start))
                start = time.time()

            if logger.iteration % opt.save_checkpoint_every == 0:
                if opt.always is None:
                    # Evaluate on validation dataset and save model for every epoch
                    val_loss, predictions, metrics = evaluator.eval_story(
                        model, crit, dataset, val_loader, opt)
                    if opt.metric == 'XE':
                        score = -val_loss
                    else:
                        score = metrics[opt.metric]
                    logger.log_checkpoint(epoch, val_loss, metrics,
                                          predictions, opt, model, dataset,
                                          optimizer)
                    # halve the learning rate if not improving for a long time
                    if logger.best_val_score > score:
                        bad_valid += 1
                        if bad_valid >= 10:
                            opt.learning_rate = opt.learning_rate / 2.0
                            logging.info("halve learning rate to {}".format(
                                opt.learning_rate))
                            checkpoint_path = os.path.join(
                                logger.log_dir, 'model-best.pth')
                            model.load_state_dict(torch.load(checkpoint_path))
                            utils.set_lr(
                                optimizer,
                                opt.learning_rate)  # set the decayed rate
                            bad_valid = 0
                            logging.info("bad valid : {}".format(bad_valid))
                    else:
                        logging.info("achieving best {} score: {}".format(
                            opt.metric, score))
                        bad_valid = 0
                else:
                    torch.save(disc.state_dict(),
                               os.path.join(logger.log_dir, 'disc-model.pth'))
            flag.inc()
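The `Flag` helper constructed at the top of `train` is not shown in these examples. Judging from its use (`flag.flag` is compared against "Disc" and `flag.inc()` is called once per iteration), it alternates training between the discriminator and the generator every `D_iter` / `G_iter` iterations. A purely hypothetical sketch, in which the generator-phase label "Gen" and the handling of `always` are assumptions:

class Flag:
    """Hypothetical reconstruction; the real helper is not part of these examples."""

    def __init__(self, D_iters, G_iters, always=None):
        self.D_iters, self.G_iters, self.always = D_iters, G_iters, always
        # `always`, when set, is assumed to pin training to a single phase.
        self.flag = always if always is not None else "Disc"
        self.count = 0

    def inc(self):
        if self.always is not None:
            return  # never switch phases when `always` is set
        self.count += 1
        if self.flag == "Disc" and self.count >= self.D_iters:
            self.flag, self.count = "Gen", 0   # "Gen" label is an assumption
        elif self.flag == "Gen" and self.count >= self.G_iters:
            self.flag, self.count = "Disc", 0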
Example #4
    def test_challange(self, model, dataset, loader, opt, side_model=None):
        # Make sure in the evaluation mode
        logging.info("Evaluating...")
        start = time.time()
        model.eval()
        dataset.test()

        predictions = {
            "team_name": "",
            "evaluation_info": {
                "additional_description": ""
            },
            "output_stories": []
        }

        prediction_txt = open(self.prediction_file,
                              'w')  # open the file to store the predictions

        count = 0
        finished_flickr_ids = []
        for iter, batch in enumerate(loader):
            iter_start = time.time()

            semantic = batch['semantic'].cuda()
            feature_fc = Variable(batch['feature_fc'], volatile=True).cuda()
            conv_feature = Variable(
                batch['feature_conv'],
                volatile=True).cuda() if 'feature_conv' in batch else None
            count += feature_fc.size(0)
            if conv_feature is not None:
                results, _ = model.predict(feature_fc,
                                           conv_feature,
                                           beam_size=opt.beam_size)
            else:
                results, _ = model.predict(feature_fc,
                                           semantic,
                                           beam_size=opt.beam_size)
            stories = utils.decode_story(dataset.get_vocab(), results)

            indexes = batch['index'].numpy()
            for j, story in enumerate(stories):
                album_id, flickr_id = dataset.get_all_id(indexes[j])
                story_id = dataset.get_story_id(indexes[j])
                concat_flickr_id = "-".join(flickr_id)
                if concat_flickr_id not in finished_flickr_ids:
                    # if vid not in predictions:  # only predict one story for an album
                    # write into txt file for evaluate metrics like Cider
                    prediction_txt.write('{}\t {}\n'.format(album_id, story))
                    # save into predictions
                    predictions['output_stories'].append({
                        'story_id': story_id,
                        'album_id': album_id,
                        'photo_sequence': flickr_id,
                        'story_text_normalized': story
                    })
                    finished_flickr_ids.append(concat_flickr_id)

            logging.info(
                "Evaluate iter {}/{}  {:04.2f}%. Time used: {}".format(
                    iter, len(loader), iter * 100.0 / len(loader),
                    time.time() - iter_start))

        prediction_txt.close()
        json_prediction_file = os.path.join(self.save_dir, 'challenge.json')
        with open(json_prediction_file, 'w') as f:
            json.dump(predictions, f)

        logging.info(
            "Evaluation finished. Evaluated {} samples. Time used: {}".format(
                count,
                time.time() - start))
        return predictions
Example #5
    def eval_story(self, model, crit, dataset, loader, opt, side_model=None):
        # Make sure in the evaluation mode
        logging.info("Evaluating...")
        start = time.time()
        model.eval()
        dataset.val()

        loss_sum = 0
        loss_evals = 0
        predictions = {}

        prediction_txt = open(self.prediction_file,
                              'w')  # open the text file to store the predictions

        count = 0
        for iter, batch in enumerate(loader):
            iter_start = time.time()

            semantic = batch['semantic'].cuda()
            feature_fc = Variable(batch['feature_fc'], volatile=True).cuda()
            target = Variable(batch['split_story'], volatile=True).cuda()
            conv_feature = Variable(
                batch['feature_conv'],
                volatile=True).cuda() if 'feature_conv' in batch else None

            count += feature_fc.size(0)

            if side_model is not None:
                story, _ = side_model.predict(
                    feature_fc.view(-1, feature_fc.shape[2]), 1)
                story = Variable(story).cuda()
                if conv_feature is not None:
                    output = model(feature_fc, target, story, conv_feature)
                else:
                    output = model(feature_fc, target, story)
            else:
                if conv_feature is not None:
                    output = model(feature_fc, target, conv_feature)
                else:
                    output = model(feature_fc, target, semantic)

            loss = crit(output, target).data[0]
            loss_sum += loss
            loss_evals += 1

            # forward the model to also get generated samples for each video
            if side_model is not None:
                if conv_feature is not None:
                    results, _ = model.predict(feature_fc,
                                               story,
                                               conv_feature,
                                               beam_size=opt.beam_size)
                else:
                    results, _ = model.predict(feature_fc,
                                               conv_feature,
                                               beam_size=opt.beam_size)
            else:
                if conv_feature is not None:
                    results, _ = model.predict(feature_fc,
                                               conv_feature,
                                               beam_size=opt.beam_size)
                else:
                    results, _ = model.predict(feature_fc,
                                               semantic,
                                               beam_size=opt.beam_size)
            stories = utils.decode_story(dataset.get_vocab(), results)

            indexes = batch['index'].numpy()
            for j, story in enumerate(stories):
                vid, _ = dataset.get_id(indexes[j])
                if vid not in predictions:  # only predict one story for an album
                    # write into txt file for evaluate metrics like Cider
                    prediction_txt.write('{}\t {}\n'.format(vid, story))
                    # save into predictions
                    predictions[vid] = story

            logging.info(
                "Evaluate iter {}/{}  {:04.2f}%. Time used: {}".format(
                    iter, len(loader), iter * 100.0 / len(loader),
                    time.time() - iter_start))

        prediction_txt.close()
        metrics = self.measure()  # compute all the language metrics

        # Switch back to training mode
        model.train()
        dataset.train()
        logging.info(
            "Evaluation finished. Evaluated {} samples. Time used: {}".format(
                count,
                time.time() - start))
        return loss_sum / loss_evals, predictions, metrics
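For context, Example #3 above calls this method during training as `evaluator.eval_story(model, crit, dataset, val_loader, opt)` and uses the returned validation loss and metrics to decide when to checkpoint and when to halve the learning rate.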
Example #6
    def test_challange(self, model, dataset, loader, opt, side_model=None):
        # Make sure in the evaluation mode
        logging.info("Evaluating...")
        start = time.time()
        model.eval()
        dataset.test()

        predictions = {
            "team_name": "",
            "evaluation_info": {
                "additional_description": ""
            },
            "output_stories": []
        }

        prediction_txt = open(self.prediction_file,
                              'w')  # open the file to store the predictions

        count = 0
        finished_flickr_ids = []
        with torch.no_grad():
            for iter, batch in enumerate(loader):
                iter_start = time.time()

                feature_fc = batch['feature_fc'].cuda()
                feature_obj = batch['feature_obj'].cuda()
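                # Optional object-level features; each is loaded only when the
                # corresponding opt flag is set, otherwise None is passed.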
                if opt.use_spatial:
                    feature_obj_spatial = batch['feature_obj_spatial'].cuda()
                else:
                    feature_obj_spatial = None
                if opt.use_classes:
                    feature_obj_classes = batch['feature_obj_classes'].cuda()
                else:
                    feature_obj_classes = None
                if opt.use_attrs:
                    feature_obj_attrs = batch['feature_obj_attrs'].cuda()
                else:
                    feature_obj_attrs = None
                count += feature_fc.size(0)
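                # Beam-search decoding; the penalty, corpus word frequencies and
                # function-word list are extra options, presumably used to
                # discourage repetitive wording.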
                results, _ = model.predict(
                    feature_fc,
                    feature_obj,
                    beam_size=opt.beam_size,
                    spatial=feature_obj_spatial,
                    clss=feature_obj_classes,
                    attrs=feature_obj_attrs,
                    penalty=opt.penalty,
                    frequencies=dataset.frequency,
                    function_words=dataset.get_function_words())
                stories, _ = utils.decode_story(dataset.get_vocab(), results)

                indexes = batch['index'].numpy()
                for j, story in enumerate(stories):
                    album_id, flickr_id = dataset.get_all_id(indexes[j])
                    concat_flickr_id = "-".join(flickr_id)
                    if concat_flickr_id not in finished_flickr_ids:
                        # if vid not in predictions:  # only predict one story for an album
                        # write into txt file for evaluate metrics like Cider
                        prediction_txt.write('{}\t {}\n'.format(
                            album_id, story))
                        # save into predictions
                        predictions['output_stories'].append({
                            'album_id': album_id,
                            'photo_sequence': flickr_id,
                            'story_text_normalized': story
                        })
                        finished_flickr_ids.append(concat_flickr_id)

                logging.info(
                    "Evaluate iter {}/{}  {:04.2f}%. Time used: {}".format(
                        iter, len(loader), iter * 100.0 / len(loader),
                        time.time() - iter_start))

            prediction_txt.close()
            json_prediction_file = os.path.join(self.save_dir,
                                                'challenge.json')
            with open(json_prediction_file, 'w') as f:
                json.dump(predictions, f)

            logging.info(
                "Evaluation finished. Evaluated {} samples. Time used: {}".
                format(count,
                       time.time() - start))
            subprocess.call([
                "java", "-jar", opt.challenge_dir, "-testFile",
                os.path.join(self.save_dir, 'challenge.json'), "-gsFile",
                opt.sis_path
            ])
        return predictions
Example #7
    def test_story(self, model, dataset, loader, opt):
        logging.info("Evaluating...")
        start = time.time()
        model.eval()
        dataset.test()

        predictions = {}
        prediction_txt = open(self.prediction_file,
                              'w')  # open the file to store the predictions

        with torch.no_grad():
            for iter, batch in enumerate(loader):
                iter_start = time.time()

                feature_fc = batch['feature_fc'].cuda()
                feature_conv = (batch['feature_conv'].cuda()
                                if 'feature_conv' in batch else None)
                feature_obj = batch['feature_obj'].cuda()
                if opt.use_spatial:
                    feature_obj_spatial = batch['feature_obj_spatial'].cuda()
                else:
                    feature_obj_spatial = None
                if opt.use_classes:
                    feature_obj_classes = batch['feature_obj_classes'].cuda()
                else:
                    feature_obj_classes = None
                if opt.use_attrs:
                    feature_obj_attrs = batch['feature_obj_attrs'].cuda()
                else:
                    feature_obj_attrs = None

                results, _ = model.predict(
                    feature_fc,
                    feature_obj,
                    beam_size=opt.beam_size,
                    spatial=feature_obj_spatial,
                    clss=feature_obj_classes,
                    attrs=feature_obj_attrs,
                    penalty=opt.penalty,
                    frequencies=dataset.frequency,
                    function_words=dataset.get_function_words())

                sents, _ = utils.decode_story(dataset.get_vocab(), results)

                indexes = batch['index'].numpy()
                for j, story in enumerate(sents):
                    vid, _ = dataset.get_id(indexes[j])
                    if vid not in predictions:  # only predict one story for an album
                        # write into txt file for evaluate metrics like Cider
                        prediction_txt.write('{}\t {}\n'.format(vid, story))
                        # save into predictions
                        predictions[vid] = story

                print("Evaluate iter {}/{}  {:04.2f}%. Time used: {}".format(
                    iter, len(loader), iter * 100.0 / len(loader),
                    time.time() - iter_start))

            prediction_txt.close()
            metrics = self.measure()  # compute all the language metrics

            json.dump(
                metrics,
                open(self.prediction_file.replace('prediction', 'scores'),
                     'w'))
            # Switch back to training mode
            print("Test finished. Time used: {}".format(time.time() - start))
        return predictions, metrics
Example #8
    def eval_story(self, model, crit, dataset, loader, opt, side_model=None):
        # Make sure in the evaluation mode
        logging.info("Evaluating...")
        start = time.time()
        model.eval()
        dataset.val()

        loss_sum = 0
        loss_evals = 0
        predictions = {}

        prediction_txt = open(self.prediction_file,
                              'w')  # open the file to store the predictions

        count = 0
        with torch.no_grad():
            for iter, batch in enumerate(loader):
                iter_start = time.time()

                feature_fc = batch['feature_fc'].cuda()
                feature_obj = batch['feature_obj'].cuda()
                if opt.use_spatial:
                    feature_obj_spatial = batch['feature_obj_spatial'].cuda()
                else:
                    feature_obj_spatial = None
                if opt.use_classes:
                    feature_obj_classes = batch['feature_obj_classes'].cuda()
                else:
                    feature_obj_classes = None
                if opt.use_attrs:
                    feature_obj_attrs = batch['feature_obj_attrs'].cuda()
                else:
                    feature_obj_attrs = None
                target = batch['split_story'].cuda()
                prefix = batch['prefix_story'].cuda()
                history_count = batch['history_counter'].cuda()
                conv_feature = (batch['feature_conv'].cuda()
                                if 'feature_conv' in batch else None)

                count += feature_fc.size(0)

                output = model(feature_fc,
                               feature_obj,
                               target,
                               history_count,
                               spatial=feature_obj_spatial,
                               clss=feature_obj_classes,
                               attrs=feature_obj_attrs)

                loss = crit(output, target).item()
                loss_sum += loss
                loss_evals += 1

                # forward the model to also get generated samples for each video
                results, _ = model.predict(
                    feature_fc,
                    feature_obj,
                    beam_size=opt.beam_size,
                    penalty=opt.penalty,
                    spatial=feature_obj_spatial,
                    clss=feature_obj_classes,
                    attrs=feature_obj_attrs,
                    frequencies=dataset.frequency,
                    function_words=dataset.get_function_words())
                stories, _ = utils.decode_story(dataset.get_vocab(), results)

                indexes = batch['index'].numpy()
                for j, story in enumerate(stories):
                    vid, _ = dataset.get_id(indexes[j])
                    if vid not in predictions:  # only predict one story for an album
                        # write into txt file for evaluate metrics like Cider
                        prediction_txt.write('{}\t {}\n'.format(vid, story))
                        # save into predictions
                        predictions[vid] = story

                logging.info(
                    "Evaluate iter {}/{}  {:04.2f}%. Time used: {}".format(
                        iter, len(loader), iter * 100.0 / len(loader),
                        time.time() - iter_start))

            prediction_txt.close()
            metrics = self.measure()  # compute all the language metrics

            # Switch back to training mode
            model.train()
            dataset.train()
            logging.info(
                "Evaluation finished. Evaluated {} samples. Time used: {}".
                format(count,
                       time.time() - start))
        return loss_sum / loss_evals, predictions, metrics