def test_story(self, model, dataset, loader, opt):
    logging.info("Evaluating...")
    start = time.time()
    model.eval()
    dataset.test()

    predictions = {}
    # open the file to store the predictions
    prediction_txt = open(self.prediction_file, 'w')
    for iter, batch in enumerate(loader):
        iter_start = time.time()

        semantic = batch['semantic'].cuda()
        feature_fc = Variable(batch['feature_fc'], volatile=True).cuda()
        feature_conv = Variable(batch['feature_conv'], volatile=True).cuda() \
            if 'feature_conv' in batch else None

        if feature_conv is not None:
            results, _ = model.predict(feature_fc, feature_conv, beam_size=opt.beam_size)
        else:
            results, _ = model.predict(feature_fc, semantic, beam_size=opt.beam_size)

        sents = utils.decode_story(dataset.get_vocab(), results)

        indexes = batch['index'].numpy()
        for j, story in enumerate(sents):
            vid, _ = dataset.get_id(indexes[j])
            if vid not in predictions:  # only predict one story for an album
                # write into txt file for evaluating metrics like CIDEr
                prediction_txt.write('{}\t {}\n'.format(vid, story))
                # save into predictions
                predictions[vid] = story

        print("Evaluate iter {}/{} {:04.2f}%. Time used: {}".format(
            iter, len(loader), iter * 100.0 / len(loader),
            time.time() - iter_start))

    prediction_txt.close()

    json_prediction_file = '{}.json'.format(self.prediction_file)
    # the evaluator expects each album id to map to a list of stories
    for vid in predictions.keys():
        predictions[vid] = [predictions[vid]]
    self.eval.evaluate(self.reference, predictions)
    with open(json_prediction_file, 'w') as f:
        json.dump(predictions, f)

    metrics = self.eval.eval_overall
    with open(os.path.join(self.save_dir, 'test_scores.json'), 'w') as f:
        json.dump(metrics, f)

    print("Test finished. Time used: {}".format(time.time() - start))
    return predictions, metrics
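# The tab-separated prediction file written above holds one "<album id>\t <story>"
# pair per line. A minimal sketch (not part of the original code) of reading it
# back into the {vid: [story]} shape that self.eval.evaluate consumes; the
# load_predictions name and the example path are hypothetical.
def load_predictions(path):
    """Read lines of the form 'vid<TAB> story' into {vid: [story]}."""
    predictions = {}
    with open(path) as f:
        for line in f:
            vid, story = line.rstrip('\n').split('\t', 1)
            predictions[vid] = [story.strip()]
    return predictions

# usage: predictions = load_predictions('save/prediction.txt')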
def forward(self, seq, seq_log_probs, baseline, index, rewards=None):
    '''
    :param seq: (batch_size, 5, seq_length)
    :param seq_log_probs: (batch_size, 5, seq_length)
    :param baseline: (batch_size, 5, seq_length)
    :param index: (batch_size,)
    :param rewards: (batch_size, 5, seq_length)
    :return:
    '''
    if rewards is None:
        # compute the reward by scoring each sampled story against its ground truth
        sents = utils.decode_story(self.dataset.get_vocab(), seq)
        rewards = []
        batch_size = seq.size(0)
        for i, story in enumerate(sents):
            vid, _ = self.dataset.get_id(index[i])
            GT_story = self.dataset.get_GT(index[i])
            result = {vid: [story]}
            gt = {vid: [GT_story]}
            score, _ = self.reward_scorer.compute_score(gt, result)
            if self.bleu is not None:
                rewards.append(score[self.bleu])
            else:
                rewards.append(score)
        rewards = torch.FloatTensor(rewards)  # (batch_size,)
        avg_reward = rewards.mean()
        rewards = Variable(rewards.view(batch_size, 1, 1).expand_as(seq)).cuda()
    else:
        avg_reward = rewards.mean()
        rewards = rewards.view(-1, 5, 1)

    # get the mask; its size is supposed to be (batch_size, 5, seq_length)
    mask = (seq > 0).float()
    if mask.size(2) > 1:
        # shift the mask right by one step so the end token is still counted
        mask = torch.cat(
            [mask.new(mask.size(0), mask.size(1), 1).fill_(1), mask[:, :, :-1]],
            2).contiguous()
    else:
        mask.fill_(1)
    mask = Variable(mask)

    # compute the loss
    advantage = Variable(rewards.data - baseline.data)
    value_loss = self._cal_value_loss(rewards, baseline, mask)
    action_loss = self._cal_action_loss(seq_log_probs, advantage, mask)

    return action_loss + value_loss, avg_reward
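# The shifted mask above keeps the end-of-story token in play: shifting
# (seq > 0) right by one step marks the first zero after the last word as
# valid, so under the 0-as-end-token convention the log-probability of
# emitting the end token still receives the reward. A minimal toy sketch
# (not part of the original code) of the same mask construction and the
# masked policy-gradient term, written with plain tensors instead of
# Variable; the helper names are hypothetical.
import torch

def shifted_mask(seq):
    # seq: (batch, 5, seq_len) token ids, 0 = end token / padding
    mask = (seq > 0).float()
    if mask.size(2) > 1:
        ones = mask.new_ones(mask.size(0), mask.size(1), 1)
        mask = torch.cat([ones, mask[:, :, :-1]], 2)
    else:
        mask.fill_(1)
    return mask

def reinforce_loss(seq, seq_log_probs, rewards, baseline):
    mask = shifted_mask(seq)
    advantage = (rewards - baseline).detach()  # no gradient through the advantage
    # negative expected reward, averaged over valid tokens only
    return -(seq_log_probs * advantage * mask).sum() / mask.sum()

seq = torch.tensor([[[4, 7, 2, 0, 0]]])   # one story: 3 words, end token, padding
log_probs = torch.randn(1, 1, 5)
rewards = torch.full((1, 1, 5), 0.8)      # sequence-level reward, broadcast
baseline = torch.full((1, 1, 5), 0.5)
print(reinforce_loss(seq, log_probs, rewards, baseline))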
def train(opt):
    logger = Logger(opt)
    flag = Flag(D_iters=opt.D_iter, G_iters=opt.G_iter, always=opt.always)

    ################### set up dataset and dataloader ########################
    dataset = VISTDataset(opt)
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.get_story_length()

    dataset.set_option(data_type={
        'whole_story': False,
        'split_story': True,
        'caption': False
    })

    dataset.train()
    train_loader = DataLoader(dataset,
                              batch_size=opt.batch_size,
                              shuffle=opt.shuffle,
                              num_workers=opt.workers)
    dataset.val()
    val_loader = DataLoader(dataset,
                            batch_size=opt.batch_size,
                            shuffle=False,
                            num_workers=opt.workers)

    ##################### set up model, criterion and optimizer ##############
    bad_valid = 0

    # set up evaluator
    evaluator = Evaluator(opt, 'val')

    # set up criterion
    crit = criterion.LanguageModelCriterion()
    rl_crit = criterion.ReinforceCriterion(opt, dataset)

    # set up model
    model = models.setup(opt)
    model.cuda()
    disc_opt = copy.copy(opt)
    disc_opt.model = 'RewardModel'
    disc = models.setup(disc_opt)
    if os.path.exists(os.path.join(logger.log_dir, 'disc-model.pth')):
        logging.info("loading pretrained RewardModel")
        disc.load_state_dict(
            torch.load(os.path.join(logger.log_dir, 'disc-model.pth')))
    disc.cuda()

    # set up optimizer
    optimizer = setup_optimizer(opt, model)
    disc_optimizer = setup_optimizer(opt, disc)

    dataset.train()
    model.train()
    disc.train()

    ############################## training ##################################
    for epoch in range(logger.epoch_start, opt.max_epochs):
        # Assign the scheduled sampling prob
        start = time.time()
        for iter, batch in enumerate(train_loader):
            logger.iteration += 1
            torch.cuda.synchronize()

            feature_fc = Variable(batch['feature_fc']).cuda()
            target = Variable(batch['split_story']).cuda()
            index = batch['index']

            optimizer.zero_grad()
            disc_optimizer.zero_grad()

            if flag.flag == "Disc":
                model.eval()
                disc.train()
                if opt.decoding_method_DISC == 'sample':
                    seq, seq_log_probs, baseline = model.sample(
                        feature_fc, sample_max=False, rl_training=True, pad=True)
                elif opt.decoding_method_DISC == 'greedy':
                    seq, seq_log_probs, baseline = model.sample(
                        feature_fc, sample_max=True, rl_training=True, pad=True)
            else:
                model.train()
                disc.eval()
                seq, seq_log_probs, baseline = model.sample(
                    feature_fc, sample_max=False, rl_training=True, pad=True)

            seq = Variable(seq).cuda()
            mask = (seq > 0).float()
            mask = to_contiguous(
                torch.cat([
                    Variable(mask.data.new(mask.size(0), mask.size(1), 1).fill_(1)),
                    mask[:, :, :-1]
                ], 2))
            normed_seq_log_probs = (seq_log_probs * mask).sum(-1) / mask.sum(-1)

            gen_score = disc(seq.view(-1, seq.size(2)),
                             feature_fc.view(-1, feature_fc.size(2)))

            if flag.flag == "Disc":
                gt_score = disc(target.view(-1, target.size(2)),
                                feature_fc.view(-1, feature_fc.size(2)))
                loss = -torch.sum(gt_score) + torch.sum(gen_score)

                avg_pos_score = torch.mean(gt_score)
                avg_neg_score = torch.mean(gen_score)
                if logger.iteration % 5 == 0:
                    logging.info("pos reward {} neg reward {}".format(
                        avg_pos_score.data[0], avg_neg_score.data[0]))
                    print("PREDICTION: ",
                          utils.decode_story(dataset.get_vocab(), seq[:1].data)[0])
                    print("GROUND TRUTH: ",
                          utils.decode_story(dataset.get_vocab(), target[:1].data)[0])
            else:
                rewards = Variable(gen_score.data - 0.001 * normed_seq_log_probs.data)
                # with open("/tmp/reward.txt", "a") as f:
                #     print(" ".join(map(str, rewards.data.cpu().numpy())), file=f)
                loss, avg_score = rl_crit(seq.data, seq_log_probs, baseline, index, rewards)
                # if logger.iteration % opt.losses_log_every == 0:
                avg_pos_score = torch.mean(gen_score)
                logging.info("average reward: {} average IRL score: {}".format(
                    avg_score.data[0], avg_pos_score.data[0]))

            if flag.flag == "Disc":
                loss.backward()
                nn.utils.clip_grad_norm(disc.parameters(), opt.grad_clip, norm_type=2)
                disc_optimizer.step()
            else:
                tf_loss = crit(model(feature_fc, target), target)
                print("rl_loss / tf_loss = ", loss.data[0] / tf_loss.data[0])
                loss = opt.rl_weight * loss + (1 - opt.rl_weight) * tf_loss
                loss.backward()
                nn.utils.clip_grad_norm(model.parameters(), opt.grad_clip, norm_type=2)
                optimizer.step()

            train_loss = loss.data[0]
            torch.cuda.synchronize()

            # Write the training loss summary
            if logger.iteration % opt.losses_log_every == 0:
                logger.log_training(epoch, iter, train_loss, opt.learning_rate,
                                    model.ss_prob)
                logging.info(
                    "Epoch {} Train {} - Iter {} / {}, loss = {:.5f}, time used = {:.3f}s"
                    .format(epoch, flag.flag, iter, len(train_loader),
                            train_loss, time.time() - start))
                start = time.time()

            if logger.iteration % opt.save_checkpoint_every == 0:
                if opt.always is None:
                    # Evaluate on validation dataset and save model for every epoch
                    val_loss, predictions, metrics = evaluator.eval_story(
                        model, crit, dataset, val_loader, opt)
                    if opt.metric == 'XE':
                        score = -val_loss
                    else:
                        score = metrics[opt.metric]
                    logger.log_checkpoint(epoch, val_loss, metrics, predictions,
                                          opt, model, dataset, optimizer)
                    # halve the learning rate if not improving for a long time
                    if logger.best_val_score > score:
                        bad_valid += 1
                        if bad_valid >= 10:
                            opt.learning_rate = opt.learning_rate / 2.0
                            logging.info("halve learning rate to {}".format(
                                opt.learning_rate))
                            checkpoint_path = os.path.join(logger.log_dir,
                                                           'model-best.pth')
                            model.load_state_dict(torch.load(checkpoint_path))
                            utils.set_lr(optimizer, opt.learning_rate)  # set the decayed rate
                            bad_valid = 0
                            logging.info("bad valid : {}".format(bad_valid))
                    else:
                        logging.info("achieving best {} score: {}".format(
                            opt.metric, score))
                        bad_valid = 0
                else:
                    torch.save(disc.state_dict(),
                               os.path.join(logger.log_dir, 'disc-model.pth'))
            flag.inc()
def test_challange(self, model, dataset, loader, opt, side_model=None):
    # Make sure in the evaluation mode
    logging.info("Evaluating...")
    start = time.time()
    model.eval()
    dataset.test()

    predictions = {
        "team_name": "",
        "evaluation_info": {
            "additional_description": ""
        },
        "output_stories": []
    }
    # open the file to store the predictions
    prediction_txt = open(self.prediction_file, 'w')
    count = 0
    finished_flickr_ids = []
    for iter, batch in enumerate(loader):
        iter_start = time.time()

        semantic = batch['semantic'].cuda()
        feature_fc = Variable(batch['feature_fc'], volatile=True).cuda()
        conv_feature = Variable(batch['feature_conv'], volatile=True).cuda() \
            if 'feature_conv' in batch else None
        count += feature_fc.size(0)

        if conv_feature is not None:
            results, _ = model.predict(feature_fc, conv_feature, beam_size=opt.beam_size)
        else:
            results, _ = model.predict(feature_fc, semantic, beam_size=opt.beam_size)

        stories = utils.decode_story(dataset.get_vocab(), results)

        indexes = batch['index'].numpy()
        for j, story in enumerate(stories):
            album_id, flickr_id = dataset.get_all_id(indexes[j])
            story_id = dataset.get_story_id(indexes[j])
            concat_flickr_id = "-".join(flickr_id)
            if concat_flickr_id not in finished_flickr_ids:
                # only predict one story for each photo sequence;
                # write into txt file for evaluating metrics like CIDEr
                prediction_txt.write('{}\t {}\n'.format(album_id, story))
                # save into predictions
                predictions['output_stories'].append({
                    'story_id': story_id,
                    'album_id': album_id,
                    'photo_sequence': flickr_id,
                    'story_text_normalized': story
                })
                finished_flickr_ids.append(concat_flickr_id)

        logging.info("Evaluate iter {}/{} {:04.2f}%. Time used: {}".format(
            iter, len(loader), iter * 100.0 / len(loader),
            time.time() - iter_start))

    prediction_txt.close()

    json_prediction_file = os.path.join(self.save_dir, 'challenge.json')
    with open(json_prediction_file, 'w') as f:
        json.dump(predictions, f)

    logging.info("Evaluation finished. Evaluated {} samples. Time used: {}".format(
        count, time.time() - start))
    return predictions
def eval_story(self, model, crit, dataset, loader, opt, side_model=None):
    # Make sure in the evaluation mode
    logging.info("Evaluating...")
    start = time.time()
    model.eval()
    dataset.val()

    loss_sum = 0
    loss_evals = 0
    predictions = {}
    # open the file to store the predictions
    prediction_txt = open(self.prediction_file, 'w')
    count = 0
    for iter, batch in enumerate(loader):
        iter_start = time.time()

        semantic = batch['semantic'].cuda()
        feature_fc = Variable(batch['feature_fc'], volatile=True).cuda()
        target = Variable(batch['split_story'], volatile=True).cuda()
        conv_feature = Variable(batch['feature_conv'], volatile=True).cuda() \
            if 'feature_conv' in batch else None
        count += feature_fc.size(0)

        if side_model is not None:
            story, _ = side_model.predict(
                feature_fc.view(-1, feature_fc.shape[2]), 1)
            story = Variable(story).cuda()
            if conv_feature is not None:
                output = model(feature_fc, target, story, conv_feature)
            else:
                output = model(feature_fc, target, story)
        else:
            if conv_feature is not None:
                output = model(feature_fc, target, conv_feature)
            else:
                output = model(feature_fc, target, semantic)

        loss = crit(output, target).data[0]
        loss_sum += loss
        loss_evals += 1

        # forward the model to also get generated samples for each video
        if side_model is not None:
            if conv_feature is not None:
                results, _ = model.predict(feature_fc, story, conv_feature,
                                           beam_size=opt.beam_size)
            else:
                results, _ = model.predict(feature_fc, story,
                                           beam_size=opt.beam_size)
        else:
            if conv_feature is not None:
                results, _ = model.predict(feature_fc, conv_feature,
                                           beam_size=opt.beam_size)
            else:
                results, _ = model.predict(feature_fc, semantic,
                                           beam_size=opt.beam_size)

        stories = utils.decode_story(dataset.get_vocab(), results)

        indexes = batch['index'].numpy()
        for j, story in enumerate(stories):
            vid, _ = dataset.get_id(indexes[j])
            if vid not in predictions:  # only predict one story for an album
                # write into txt file for evaluating metrics like CIDEr
                prediction_txt.write('{}\t {}\n'.format(vid, story))
                # save into predictions
                predictions[vid] = story

        logging.info("Evaluate iter {}/{} {:04.2f}%. Time used: {}".format(
            iter, len(loader), iter * 100.0 / len(loader),
            time.time() - iter_start))

    prediction_txt.close()

    metrics = self.measure()  # compute all the language metrics

    # Switch back to training mode
    model.train()
    dataset.train()
    logging.info("Evaluation finished. Evaluated {} samples. Time used: {}".format(
        count, time.time() - start))
    return loss_sum / loss_evals, predictions, metrics
def test_challange(self, model, dataset, loader, opt, side_model=None):
    # Make sure in the evaluation mode
    logging.info("Evaluating...")
    start = time.time()
    model.eval()
    dataset.test()

    predictions = {
        "team_name": "",
        "evaluation_info": {
            "additional_description": ""
        },
        "output_stories": []
    }
    # open the file to store the predictions
    prediction_txt = open(self.prediction_file, 'w')
    count = 0
    finished_flickr_ids = []
    with torch.no_grad():
        for iter, batch in enumerate(loader):
            iter_start = time.time()

            feature_fc = batch['feature_fc'].cuda()
            feature_obj = batch['feature_obj'].cuda()
            feature_obj_spatial = batch['feature_obj_spatial'].cuda() if opt.use_spatial else None
            feature_obj_classes = batch['feature_obj_classes'].cuda() if opt.use_classes else None
            feature_obj_attrs = batch['feature_obj_attrs'].cuda() if opt.use_attrs else None
            count += feature_fc.size(0)

            results, _ = model.predict(
                feature_fc,
                feature_obj,
                beam_size=opt.beam_size,
                spatial=feature_obj_spatial,
                clss=feature_obj_classes,
                attrs=feature_obj_attrs,
                penalty=opt.penalty,
                frequencies=dataset.frequency,
                function_words=dataset.get_function_words())

            stories, _ = utils.decode_story(dataset.get_vocab(), results)

            indexes = batch['index'].numpy()
            for j, story in enumerate(stories):
                album_id, flickr_id = dataset.get_all_id(indexes[j])
                concat_flickr_id = "-".join(flickr_id)
                if concat_flickr_id not in finished_flickr_ids:
                    # only predict one story for each photo sequence;
                    # write into txt file for evaluating metrics like CIDEr
                    prediction_txt.write('{}\t {}\n'.format(album_id, story))
                    # save into predictions
                    predictions['output_stories'].append({
                        'album_id': album_id,
                        'photo_sequence': flickr_id,
                        'story_text_normalized': story
                    })
                    finished_flickr_ids.append(concat_flickr_id)

            logging.info("Evaluate iter {}/{} {:04.2f}%. Time used: {}".format(
                iter, len(loader), iter * 100.0 / len(loader),
                time.time() - iter_start))

    prediction_txt.close()

    json_prediction_file = os.path.join(self.save_dir, 'challenge.json')
    with open(json_prediction_file, 'w') as f:
        json.dump(predictions, f)

    logging.info("Evaluation finished. Evaluated {} samples. Time used: {}".format(
        count, time.time() - start))

    # score the generated file with the official VIST challenge scorer
    subprocess.call([
        "java", "-jar", opt.challenge_dir,
        "-testFile", os.path.join(self.save_dir, 'challenge.json'),
        "-gsFile", opt.sis_path
    ])
    return predictions
def test_story(self, model, dataset, loader, opt):
    logging.info("Evaluating...")
    start = time.time()
    model.eval()
    dataset.test()

    predictions = {}
    # open the file to store the predictions
    prediction_txt = open(self.prediction_file, 'w')
    with torch.no_grad():
        for iter, batch in enumerate(loader):
            iter_start = time.time()

            feature_fc = batch['feature_fc'].cuda()
            feature_conv = batch['feature_conv'].cuda() if 'feature_conv' in batch else None
            feature_obj = batch['feature_obj'].cuda()
            feature_obj_spatial = batch['feature_obj_spatial'].cuda() if opt.use_spatial else None
            feature_obj_classes = batch['feature_obj_classes'].cuda() if opt.use_classes else None
            feature_obj_attrs = batch['feature_obj_attrs'].cuda() if opt.use_attrs else None

            results, _ = model.predict(
                feature_fc,
                feature_obj,
                beam_size=opt.beam_size,
                spatial=feature_obj_spatial,
                clss=feature_obj_classes,
                attrs=feature_obj_attrs,
                penalty=opt.penalty,
                frequencies=dataset.frequency,
                function_words=dataset.get_function_words())

            sents, _ = utils.decode_story(dataset.get_vocab(), results)

            indexes = batch['index'].numpy()
            for j, story in enumerate(sents):
                vid, _ = dataset.get_id(indexes[j])
                if vid not in predictions:  # only predict one story for an album
                    # write into txt file for evaluating metrics like CIDEr
                    prediction_txt.write('{}\t {}\n'.format(vid, story))
                    # save into predictions
                    predictions[vid] = story

            print("Evaluate iter {}/{} {:04.2f}%. Time used: {}".format(
                iter, len(loader), iter * 100.0 / len(loader),
                time.time() - iter_start))

    prediction_txt.close()

    metrics = self.measure()  # compute all the language metrics
    with open(self.prediction_file.replace('prediction', 'scores'), 'w') as f:
        json.dump(metrics, f)

    print("Test finished. Time used: {}".format(time.time() - start))
    return predictions, metrics
def eval_story(self, model, crit, dataset, loader, opt, side_model=None):
    # Make sure in the evaluation mode
    logging.info("Evaluating...")
    start = time.time()
    model.eval()
    dataset.val()

    loss_sum = 0
    loss_evals = 0
    predictions = {}
    # open the file to store the predictions
    prediction_txt = open(self.prediction_file, 'w')
    count = 0
    with torch.no_grad():
        for iter, batch in enumerate(loader):
            iter_start = time.time()

            feature_fc = batch['feature_fc'].cuda()
            feature_obj = batch['feature_obj'].cuda()
            feature_obj_spatial = batch['feature_obj_spatial'].cuda() if opt.use_spatial else None
            feature_obj_classes = batch['feature_obj_classes'].cuda() if opt.use_classes else None
            feature_obj_attrs = batch['feature_obj_attrs'].cuda() if opt.use_attrs else None
            target = batch['split_story'].cuda()
            prefix = batch['prefix_story'].cuda()
            history_count = batch['history_counter'].cuda()
            conv_feature = batch['feature_conv'].cuda() if 'feature_conv' in batch else None
            count += feature_fc.size(0)

            output = model(feature_fc,
                           feature_obj,
                           target,
                           history_count,
                           spatial=feature_obj_spatial,
                           clss=feature_obj_classes,
                           attrs=feature_obj_attrs)
            loss = crit(output, target).item()
            loss_sum += loss
            loss_evals += 1

            # forward the model to also get generated samples for each video
            results, _ = model.predict(
                feature_fc,
                feature_obj,
                beam_size=opt.beam_size,
                penalty=opt.penalty,
                spatial=feature_obj_spatial,
                clss=feature_obj_classes,
                attrs=feature_obj_attrs,
                frequencies=dataset.frequency,
                function_words=dataset.get_function_words())

            stories, _ = utils.decode_story(dataset.get_vocab(), results)

            indexes = batch['index'].numpy()
            for j, story in enumerate(stories):
                vid, _ = dataset.get_id(indexes[j])
                if vid not in predictions:  # only predict one story for an album
                    # write into txt file for evaluating metrics like CIDEr
                    prediction_txt.write('{}\t {}\n'.format(vid, story))
                    # save into predictions
                    predictions[vid] = story

            logging.info("Evaluate iter {}/{} {:04.2f}%. Time used: {}".format(
                iter, len(loader), iter * 100.0 / len(loader),
                time.time() - iter_start))

    prediction_txt.close()

    metrics = self.measure()  # compute all the language metrics

    # Switch back to training mode
    model.train()
    dataset.train()
    logging.info("Evaluation finished. Evaluated {} samples. Time used: {}".format(
        count, time.time() - start))
    return loss_sum / loss_evals, predictions, metrics