def eval_split(model_cnn, model, filepaths, ix_to_word, eval_kwargs={}):
    """Caption one batch of images read from *filepaths*.

    Returns a list of sentences with all whitespace removed (one entry per
    image). Each generated sentence is also printed as it is decoded.
    """
    beam_size = eval_kwargs.get('beam_size', 1)
    caption_model = eval_kwargs.get('caption_model', '')
    batch_size = eval_kwargs.get('batch_size', 1)
    verbose_eval = eval_kwargs.get('verbose_eval', True)

    batch = get_batch(filepaths, batch_size)
    images = torch.from_numpy(batch['images']).cuda()
    images = utils.prepro_norm(images, False)
    images = Variable(images, requires_grad=False)

    # fc-only models take a single feature tensor; the rest take (fc, att)
    if models.is_only_fc_feat(caption_model):
        fc_feats = model_cnn(images)
        seq, _ = model.sample(fc_feats, {'beam_size': beam_size})
    else:
        fc_feats, att_feats = model_cnn(images)
        seq, _ = model.sample(fc_feats, att_feats, {'beam_size': beam_size})

    predictions = []
    for sent in utils.decode_sequence(ix_to_word, seq):
        print(sent)
        predictions.append(''.join(sent.split()))
    return predictions
def eval_split(model_cnn, model, loader, eval_kwargs={}):
    """Caption every image served by *loader* and return COCO-style entries.

    Each prediction is ``{'image_id': int, 'caption': str}``. Iteration ends
    once the loader reports it has wrapped around the split.
    """
    verbose_eval = eval_kwargs.get('verbose_eval', True)
    beam_size = eval_kwargs.get('beam_size', 1)
    caption_model = eval_kwargs.get('caption_model', '')
    batch_size = eval_kwargs.get('batch_size', 1)

    split = ''
    loader.reset_iterator(split)
    vocab = loader.get_vocab()

    predictions = []
    seen = 0
    while True:
        data = loader.get_batch(split, batch_size)
        seen += batch_size

        images = torch.from_numpy(data['images']).cuda()
        images = utils.prepro_norm(images, False)
        images = Variable(images, requires_grad=False)

        if models.is_only_fc_feat(caption_model):
            fc_feats = model_cnn(images)
            seq, _ = model.sample(fc_feats, {'beam_size': beam_size})
        else:
            fc_feats, att_feats = model_cnn(images)
            seq, _ = model.sample(fc_feats, att_feats, {'beam_size': beam_size})

        for k, sent in enumerate(utils.decode_sequence(vocab, seq)):
            raw_id = data['infos'][k]['id']
            # ids appear COCO-formatted ("XXX_XXX_000000123456"); keep the
            # numeric part only
            entry = {'image_id': int(raw_id.split('_')[2]), 'caption': sent}
            predictions.append(entry)
            if verbose_eval:
                print('image %s: %s' % (entry['image_id'], entry['caption']))

        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        # drop duplicates produced by the final, partially filled batch
        for _ in range(seen - ix1):
            predictions.pop()
        if verbose_eval:
            print('evaluating validation preformance... %d/%d' % (ix0 - 1, ix1))
        if data['bounds']['wrapped']:
            break
    return predictions
def train_cnn(model_cnn, images, bus, fc_expander, att_expander, bu_expander,
              use_reinforce):
    """Run the CNN forward pass and optionally tile features per caption.

    When ``opt.seq_per_img > 1`` and REINFORCE is off, each feature tensor is
    expanded so that every one of the seq_per_img captions gets a copy.

    Returns ``(fc_feats, att_feats, bu_feats)``; entries the current model
    family does not produce are None.

    Bug fixed: for ``has_bu`` models the original only assigned ``bu_feats``
    inside the expansion branch, so when expansion was skipped (seq_per_img
    == 1 or use_reinforce) the provided ``bus`` features were silently
    dropped and None was returned instead.
    """
    fc_feats = None
    att_feats = None
    bu_feats = None
    expand = opt.seq_per_img > 1 and not use_reinforce

    if models.is_only_fc_feat(opt.caption_model):
        fc_feats = model_cnn(images)
        if expand:
            fc_feats = fc_expander(fc_feats)
    elif models.is_only_att_feat(opt.caption_model):
        att_feats = model_cnn(images)
        if expand:
            att_feats = att_expander(att_feats)
    elif models.has_sub_region_bu(opt.caption_model):
        fc_feats, att_feats, bu_feats = model_cnn(images)
        if expand:
            fc_feats = fc_expander(fc_feats)
            att_feats = att_expander(att_feats)
            bu_feats = bu_expander(bu_feats)
    else:
        fc_feats, att_feats = model_cnn(images)
        if expand:
            fc_feats = fc_expander(fc_feats)
            att_feats = att_expander(att_feats)

    if models.has_bu(opt.caption_model):
        # always propagate the externally supplied bottom-up features
        bu_feats = bus
        if expand:
            bu_feats = bu_expander(bu_feats)

    return fc_feats, att_feats, bu_feats
def compute_output(caption_model, beam_size, model, fc_feats, att_feats,
                   bu_feats):
    """Call ``model.sample`` with the argument set this model family expects.

    Returns whatever ``model.sample`` returns, untouched.
    """
    opts = {'beam_size': beam_size}
    if models.is_only_fc_feat(caption_model):
        return model.sample(fc_feats, opts)
    if models.is_only_att_feat(caption_model):
        return model.sample(att_feats, opts)
    # bottom-up / sub-region / prob-weight-mul-out models take bu features too
    wants_bu = (models.has_bu(caption_model)
                or models.has_sub_region_bu(caption_model)
                or models.is_prob_weight_mul_out(caption_model))
    if wants_bu:
        return model.sample(fc_feats, att_feats, bu_feats, opts)
    return model.sample(fc_feats, att_feats, opts)
def train_normal(params, opt):
    """One supervised (cross-entropy) training step: forward, loss, backward.

    Returns ``(train_loss, reward_mean)``; ``reward_mean`` is always 0 here
    and exists only for interface parity with the RL trainers.
    """
    model = params['model']
    fc_feats = params['fc_feats']
    att_feats = params['att_feats']
    labels = params['labels']
    targets = params['targets']
    masks = params['masks']
    vocab = params['vocab']
    crit = params['crit']

    cm = opt.caption_model

    # forward — argument set depends on the model family
    tic = time.time()
    if models.is_transformer(cm):
        output = model(att_feats, targets, masks)
    elif models.is_ctransformer(cm):
        output = model(fc_feats, att_feats, targets, masks)
    elif models.is_only_fc_feat(cm):
        output = model(fc_feats, labels)
    elif models.is_only_att_feat(cm):
        output = model(att_feats, labels)
    elif models.has_bu(cm):
        bu_feats = params['bu_feats']
        output = model(fc_feats, att_feats, bu_feats, labels)
    else:
        output = model(fc_feats, att_feats, labels)
    if opt.verbose:
        print('model {:.3f}'.format(time.time() - tic))

    # loss
    tic = time.time()
    if models.is_prob_weight(cm):
        output = output[0]  # prob-weight models also return the weights
    loss = crit(output, labels, masks)
    if opt.verbose:
        print('crit {:.3f}'.format(time.time() - tic))

    # backward
    tic = time.time()
    loss.backward()
    if opt.verbose:
        print('loss {:.3f}'.format(time.time() - tic))

    # .data[0] — pre-0.4 PyTorch scalar extraction
    return loss.data[0], 0
def train_mix(params, iteration, opt):
    """Alternate REINFORCE (odd iterations) and cross-entropy (even) steps.

    Returns ``(train_loss, reward_mean)``.

    Bug fixed: the original executed ``train_loss = loss.data[0]`` after the
    if/else regardless of branch, which raised NameError on reinforce
    iterations (``loss`` was never bound there) and would have clobbered the
    loss/reward already returned by ``crit_pg.forward_backward``.
    """
    model = params['model']
    fc_feats = params['fc_feats']
    att_feats = params['att_feats']
    labels = params['labels']
    masks = params['masks']
    vocab = params['vocab']
    crit_pg = params['crit_pg']
    crit = params['crit']

    if iteration % 2 == 1:
        # policy-gradient step; crit_pg runs its own backward pass.
        # The original passed output=None here as well.
        train_loss, reward_mean = crit_pg.forward_backward(
            None, labels, masks, vocab)
        return train_loss, reward_mean

    # standard cross-entropy step
    start = time.time()
    if models.is_only_fc_feat(opt.caption_model):
        output = model(fc_feats, labels)
    else:
        output = model(fc_feats, att_feats, labels)
    if opt.verbose:
        print('model {:.3f}'.format(time.time() - start))

    start = time.time()
    loss = crit(output, labels, masks)
    if opt.verbose:
        print('crit {:.3f}'.format(time.time() - start))

    start = time.time()
    loss.backward()
    if opt.verbose:
        print('loss {:.3f}'.format(time.time() - start))

    # .data[0] — pre-0.4 PyTorch scalar extraction
    return loss.data[0], 0
def compute_cnn_feats(caption_model, model_cnn, images):
    """Forward *images* through the CNN and return ``(fc, att, bu)`` features.

    Entries a given model family does not produce are left as None. Only
    prob-weight models with sub-region bottom-up features get a third output
    from the CNN; every other family yields the standard (fc, att) pair.
    """
    fc_feats, att_feats, bu_feats = None, None, None
    if models.is_only_fc_feat(caption_model):
        fc_feats = model_cnn(images)
    elif models.is_only_att_feat(caption_model):
        att_feats = model_cnn(images)
    elif caption_model == "SCST":
        fc_feats, att_feats = model_cnn(images)
    elif (models.is_prob_weight(caption_model)
          and models.has_sub_region_bu(caption_model)):
        fc_feats, att_feats, bu_feats = model_cnn(images)
    else:
        # prob-weight without sub-region bu, prob-weight-mul-out, and the
        # default family all share the two-output path
        fc_feats, att_feats = model_cnn(images)
    return fc_feats, att_feats, bu_feats
def train_actor_critic(params, opt, type, retain_graph=False):
    # One actor-critic training step.
    #
    # type == 0 trains the critic alone (crit_c); type == 1 trains actor and
    # critic jointly (crit_ac). NOTE(review): any other value leaves `loss`
    # unbound and this function would raise NameError — callers are expected
    # to pass only 0 or 1.
    # Returns (train_loss, reward_mean, sample_mean).
    model = params['model']
    fc_feats = params['fc_feats']
    att_feats = params['att_feats']
    labels = params['labels']
    masks = params['masks']
    vocab = params['vocab']
    gts = params['gts']  # ground-truth sequences handed to the critics
    if type == 0:
        crit_c = params['crit_c']
    elif type == 1:
        crit_ac = params['crit_ac']
    if models.has_bu(opt.caption_model) or models.has_sub_region_bu(
            opt.caption_model):
        bu_feats = params['bu_feats']
    # forward: draw a stochastic rollout ({'sample_max': 0}); the model also
    # returns the critic's value estimates as a third output
    start = time.time()
    if models.is_only_fc_feat(opt.caption_model):
        sample_seq, sample_seqLogprobs, sample_value = model.sample(
            fc_feats, {'sample_max': 0})
    elif models.has_bu(opt.caption_model) or models.has_sub_region_bu(
            opt.caption_model):
        sample_seq, sample_seqLogprobs, sample_value = model.sample(
            fc_feats, att_feats, bu_feats, {'sample_max': 0})
    else:
        # sample_seq, sample_seqLogprobs = model.sample_forward(fc_feats, att_feats, labels, {'sample_max': 0})
        # greedy_seq, greedy_seqLogprobs = model.sample_forward(fc_feats, att_feats, labels, {'sample_max': 1})
        sample_output = model.sample(fc_feats, att_feats, {'sample_max': 0})
        sample_seq = sample_output[0]
        sample_seqLogprobs = sample_output[1]
        sample_value = sample_output[2]
    if opt.verbose:
        print('model {:.3f}'.format(time.time() - start))
    # compute the loss
    start = time.time()
    # 0. critic
    # 1. critic, actor
    if type == 0:
        # seq, seqLogprobs, seq1, target, vocab
        loss, reward_mean, sample_mean = crit_c(sample_seq, sample_value, gts)
    elif type == 1:
        # seq, seqLogprobs, seq1, target, vocab
        loss, reward_mean, sample_mean = crit_ac(sample_seq,
                                                 sample_seqLogprobs,
                                                 sample_value, gts)
    # loss, reward_mean = crit_rl(sample_seq, sample_seqLogprobs, gts)
    if opt.verbose:
        print('crit {:.3f}'.format(time.time() - start))
    # backward; retain_graph lets the caller backprop through the same graph
    # again afterwards
    start = time.time()
    loss.backward(retain_graph=retain_graph)
    if opt.verbose:
        print('loss {:.3f}'.format(time.time() - start))
    # show information (.data[0] — pre-0.4 PyTorch scalar extraction)
    train_loss = loss.data[0]
    return train_loss, reward_mean, sample_mean
def train_reinforce(params, opt):
    # One REINFORCE training step. Only opt.reinforce_type == 1
    # (self-critical sequence training) is usable; type 0 raises immediately.
    # Returns (train_loss, reward_mean, sample_mean, greedy_mean).
    model = params['model']
    fc_feats = params['fc_feats']
    att_feats = params['att_feats']
    labels = params['labels']
    masks = params['masks']
    vocab = params['vocab']
    crit_pg = params['crit_pg']
    crit_rl = params['crit_rl']
    targets = params['targets']
    gts = params['gts']
    if models.has_bu(opt.caption_model) or models.has_sub_region_bu(
            opt.caption_model):
        bu_feats = params['bu_feats']
    # compute policy gradient
    if opt.reinforce_type == 0:
        raise Exception('reinforce_type error, 0 is deprecated')
        # NOTE(review): everything below the raise is dead code retained from
        # the deprecated plain policy-gradient path.
        # forward
        start = time.time()
        if models.is_only_fc_feat(opt.caption_model):
            output = model(fc_feats, labels)
        else:
            output = model(fc_feats, att_feats, labels)
        if opt.verbose:
            print('model {:.3f}'.format(time.time() - start))
        train_loss, reward_mean = crit_pg.forward_backward(
            output, labels, masks, vocab)
    # self-critical
    elif opt.reinforce_type == 1:
        # forward: one stochastic rollout (sample_max=0) for the actor and
        # one greedy decode (sample_max=1) as the self-critical baseline
        start = time.time()
        if models.is_only_fc_feat(opt.caption_model):
            sample_seq, sample_seqLogprobs = model.sample(
                fc_feats, {'sample_max': 0})
            greedy_seq, greedy_seqLogprobs = model.sample(
                fc_feats, {'sample_max': 1})
        elif models.is_only_att_feat(opt.caption_model):
            sample_seq, sample_seqLogprobs = model.sample(
                att_feats, {'sample_max': 0})
            greedy_seq, greedy_seqLogprobs = model.sample(
                att_feats, {'sample_max': 1})
        elif models.has_bu(opt.caption_model) or models.has_sub_region_bu(
                opt.caption_model):
            sample_seq, sample_seqLogprobs = model.sample(
                fc_feats, att_feats, bu_feats, {'sample_max': 0})
            greedy_seq, greedy_seqLogprobs = model.sample(
                fc_feats, att_feats, bu_feats, {'sample_max': 1})
        else:
            # sample_seq, sample_seqLogprobs = model.sample_forward(fc_feats, att_feats, labels, {'sample_max': 0})
            # greedy_seq, greedy_seqLogprobs = model.sample_forward(fc_feats, att_feats, labels, {'sample_max': 1})
            sample_output = model.sample(fc_feats, att_feats,
                                         {'sample_max': 0})
            greedy_output = model.sample(fc_feats, att_feats,
                                         {'sample_max': 1})
            sample_seq = sample_output[0]
            sample_seqLogprobs = sample_output[1]
            greedy_seq = greedy_output[0]
            greedy_seqLogprobs = greedy_output[1]
        if opt.verbose:
            print('model {:.3f}'.format(time.time() - start))
        # compute the loss
        start = time.time()
        # seq, seqLogprobs, seq1, target, vocab
        loss, reward_mean, sample_mean, greedy_mean = crit_rl(
            sample_seq, sample_seqLogprobs, greedy_seq, gts, masks)
        # loss, reward_mean = crit_rl(sample_seq, sample_seqLogprobs, gts)
        if opt.verbose:
            print('crit {:.3f}'.format(time.time() - start))
        # backward
        start = time.time()
        loss.backward()
        if opt.verbose:
            print('loss {:.3f}'.format(time.time() - start))
        # show information (.data[0] — pre-0.4 PyTorch scalar extraction)
        train_loss = loss.data[0]
        return train_loss, reward_mean, sample_mean, greedy_mean
def eval_split_only(model_cnn, model, crit, loader, eval_kwargs={}):
    """Evaluate caption generation on a data split.

    Returns ``(loss, predictions, lang_stats, str_stats)``. ``loss`` is
    always 0 (``crit`` is accepted for interface parity but unused);
    ``lang_stats``/``str_stats`` are None when language_eval is disabled.

    Bugs fixed versus the original:
    - the ``sents = utils.decode_sequence(...)`` line was commented out, so
      the loop over ``sents`` raised NameError;
    - ``lang_stats``/``str_stats`` were unbound at the return when
      ``lang_eval != 1``;
    - ``n >= val_images_use`` broke after the first batch whenever
      ``val_images_use == -1`` (which means "use all images").
    """
    verbose_eval = eval_kwargs.get('verbose_eval', True)
    val_images_use = eval_kwargs.get('val_images_use', -1)
    split = eval_kwargs.get('split', 'val')
    lang_eval = eval_kwargs.get('language_eval', 1)
    dataset = eval_kwargs.get('dataset', 'coco')
    beam_size = eval_kwargs.get('beam_size', 1)
    coco_caption_path = eval_kwargs.get('coco_caption_path', 'coco-caption')
    caption_model = eval_kwargs.get('caption_model', '')
    batch_size = eval_kwargs.get('batch_size', 2)

    # Make sure we are in evaluation mode
    model_cnn.eval()
    model.eval()

    loader.reset_iterator(split)
    n = 0
    predictions = []
    vocab = loader.get_vocab()

    while True:
        data = loader.get_batch(split, batch_size)
        n = n + batch_size
        images = data['images']

        if models.is_only_fc_feat(caption_model):
            fc_feats = model_cnn(images)
            seq, _ = model.sample(fc_feats, {'beam_size': beam_size})
        elif models.is_only_att_feat(caption_model):
            att_feats = model_cnn(images)
            seq, _ = model.sample(att_feats, {'beam_size': beam_size})
        else:
            fc_feats, att_feats = model_cnn(images)
            seq, _ = model.sample(fc_feats, att_feats,
                                  {'beam_size': beam_size})

        sents = utils.decode_sequence(vocab, seq)
        for k, sent in enumerate(sents):
            entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
            predictions.append(entry)
            if verbose_eval:
                print('image %s: %s' % (entry['image_id'], entry['caption']))

        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if val_images_use != -1:
            ix1 = min(ix1, val_images_use)
        # drop duplicates produced by the final, partially filled batch
        for i in range(n - ix1):
            predictions.pop()
        if verbose_eval:
            print('evaluating validation preformance... %d/%d'
                  % (ix0 - 1, ix1))
        if data['bounds']['wrapped']:
            break
        if val_images_use != -1 and n >= val_images_use:
            break

    lang_stats = None
    str_stats = None
    if lang_eval == 1:
        lang_stats, str_stats = language_eval(dataset, predictions,
                                              coco_caption_path)

    # Switch back to training mode
    model_cnn.train()
    model.train()
    return 0, predictions, lang_stats, str_stats
def compute_loss(crit, model, caption_model, seq_per_img, fc_expander,
                 att_expander, bu_expander, fc_feats, att_feats, bu_feats,
                 labels, masks, tokens):
    # Forward pass + loss + backward for one batch, dispatching on the
    # caption-model family; returns the scalar loss value.
    #
    # When seq_per_img > 1 the per-image CNN features are tiled with the
    # *_expander modules so each of the seq_per_img captions gets a copy.
    if models.is_only_fc_feat(caption_model):
        if seq_per_img > 1:
            fc_feats_ext = fc_expander(fc_feats)
        else:
            fc_feats_ext = fc_feats
        batch_outputs = model(fc_feats_ext, labels)
    elif models.is_only_att_feat(caption_model):
        if seq_per_img > 1:
            att_feats_ext = att_expander(att_feats)
        else:
            att_feats_ext = att_feats
        batch_outputs = model(att_feats_ext, labels)
    elif caption_model == "SCST":
        if seq_per_img > 1:
            fc_feats_ext = fc_expander(fc_feats)
            att_feats_ext = att_expander(att_feats)
        else:
            fc_feats_ext = fc_feats
            att_feats_ext = att_feats
        # SCST forward takes a mode flag; second return value is unused here
        batch_outputs, _ = model(fc_feats_ext, att_feats_ext, labels, "train")
    elif models.is_prob_weight(caption_model):
        # prob-weight models additionally return the probability weights,
        # consumed by the criterion below
        if models.has_sub_region_bu(caption_model):
            if seq_per_img > 1:
                fc_feats_ext = fc_expander(fc_feats)
                att_feats_ext = att_expander(att_feats)
                bu_feats_ext = bu_expander(bu_feats)
            else:
                fc_feats_ext = fc_feats
                att_feats_ext = att_feats
                bu_feats_ext = bu_feats
            batch_outputs, prob_w = model(fc_feats_ext, att_feats_ext,
                                          bu_feats_ext, labels)
        else:
            if seq_per_img > 1:
                fc_feats_ext = fc_expander(fc_feats)
                att_feats_ext = att_expander(att_feats)
            else:
                fc_feats_ext = fc_feats
                att_feats_ext = att_feats
            if models.has_bu(caption_model):
                if seq_per_img > 1:
                    bu_feats_ext = bu_expander(bu_feats)
                else:
                    bu_feats_ext = bu_feats
                batch_outputs, prob_w = model(fc_feats_ext, att_feats_ext,
                                              bu_feats_ext, labels)
            else:
                batch_outputs, prob_w = model(fc_feats_ext, att_feats_ext,
                                              labels)
    elif models.is_prob_weight_mul_out(caption_model):
        if seq_per_img > 1:
            fc_feats_ext = fc_expander(fc_feats)
            att_feats_ext = att_expander(att_feats)
        else:
            fc_feats_ext = fc_feats
            att_feats_ext = att_feats
        if models.has_bu(caption_model):
            if seq_per_img > 1:
                bu_feats_ext = bu_expander(bu_feats)
            else:
                bu_feats_ext = bu_feats
            batch_outputs, prob_w = model(fc_feats_ext, att_feats_ext,
                                          bu_feats_ext, labels)
        else:
            batch_outputs, prob_w = model(fc_feats_ext, att_feats_ext, labels)
    else:
        # default family: (fc, att) features, optionally with bottom-up
        if seq_per_img > 1:
            fc_feats_ext = fc_expander(fc_feats)
            att_feats_ext = att_expander(att_feats)
        else:
            fc_feats_ext = fc_feats
            att_feats_ext = att_feats
        if models.has_bu(caption_model):
            if seq_per_img > 1:
                bu_feats_ext = bu_expander(bu_feats)
            else:
                bu_feats_ext = bu_feats
            batch_outputs = model(fc_feats_ext, att_feats_ext, bu_feats_ext,
                                  labels)
        else:
            batch_outputs = model(fc_feats_ext, att_feats_ext, labels)
    # prob-weight families get the extra (prob_w, tokens) loss terms
    if models.is_prob_weight(caption_model) or models.is_prob_weight_mul_out(
            caption_model):
        loss = crit(batch_outputs, labels, masks, prob_w, tokens)
    else:
        loss = crit(batch_outputs, labels, masks)
    loss.backward()
    # .data[0] — pre-0.4 PyTorch scalar extraction
    return loss.data[0]
def eval_split(model_cnn, model, loader, eval_kwargs={}):
    # Caption every image served by *loader*, printing progress plus an ETA,
    # and return a list of {'image_id', 'caption'} entries.
    # NOTE(review): decode_sequence_aic suggests the AI-Challenger dataset —
    # image ids are used verbatim (no COCO-style '_' split); confirm.
    verbose_eval = eval_kwargs.get('verbose_eval', True)
    beam_size = eval_kwargs.get('beam_size', 1)
    caption_model = eval_kwargs.get('caption_model', '')
    batch_size = eval_kwargs.get('batch_size', 1)
    split = ''
    loader.reset_iterator(split)
    n = 0
    predictions = []
    vocab = loader.get_vocab()
    while True:
        start = time.time()
        data = loader.get_batch(split, batch_size)
        n = n + batch_size
        # preprocess the image batch (normalize, no augmentation flag=False)
        images = torch.from_numpy(data['images']).cuda()
        images = utils.prepro_norm(images, False)
        images = Variable(images, requires_grad=False)
        if models.is_only_fc_feat(caption_model):
            fc_feats = model_cnn(images)
        else:
            fc_feats, att_feats = model_cnn(images)
        if models.is_only_fc_feat(caption_model):
            seq, _ = model.sample(fc_feats, {'beam_size': beam_size})
        else:
            seq, _ = model.sample(fc_feats, att_feats,
                                  {'beam_size': beam_size})
        # sents
        sents = utils.decode_sequence_aic(vocab, seq)
        for k, sent in enumerate(sents):
            image_id = data['infos'][k]['id']
            # print(image_id, sent)
            # image_id = int(image_id.split('_')[2])
            entry = {'image_id': image_id, 'caption': sent}
            predictions.append(entry)
            if verbose_eval:
                print('image %s: %s' % (entry['image_id'], entry['caption']))
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        # drop duplicates produced by the final, partially filled batch
        for i in range(n - ix1):
            predictions.pop()
        if verbose_eval:
            # ETA: scale this batch's wall time by the remaining image count
            span_time = time.time()-start
            left_time = (ix1-ix0)*span_time/batch_size
            if left_time > 3600:
                left_h = left_time//3600
                left_m = (left_time - left_h*3600)//60
                left_s = left_time - left_h*3600 - left_m*60
                s_left_time = '%dh:%dm:%.3fs' % (left_h, left_m, left_s)
            elif left_time > 60:
                left_m = left_time // 60
                left_s = left_time - left_m * 60
                s_left_time = '%dm:%.3fs' % (left_m, left_s)
            else:
                s_left_time = '%.3fs' % (left_time)
            print('evaluating validation preformance... %d/%d %.3fs left:%s'
                  % (ix0, ix1, span_time, s_left_time))
        if data['bounds']['wrapped']:
            break
    return predictions
def eval_split(model_cnn, model, loader, eval_kwargs={}):
    """Caption a whole loader split and return prediction dicts.

    Also prepares <output_dir>/fc and <output_dir>/att directories; the
    feature-dumping code that wrote into them is currently disabled.
    """
    verbose_eval = eval_kwargs.get('verbose_eval', True)
    beam_size = eval_kwargs.get('beam_size', 1)
    caption_model = eval_kwargs.get('caption_model', '')
    batch_size = eval_kwargs.get('batch_size', 1)
    output_dir = eval_kwargs.get('output_dir', '')

    split = ''
    loader.reset_iterator(split)
    vocab = loader.get_vocab()

    dir_fc = os.path.join(output_dir, 'fc')
    dir_att = os.path.join(output_dir, 'att')
    print(dir_fc)
    print(dir_att)
    if not os.path.isdir(dir_fc):
        os.mkdir(dir_fc)
    if not os.path.isdir(dir_att):
        os.mkdir(dir_att)

    predictions = []
    seen = 0
    while True:
        data = loader.get_batch(split, batch_size)
        seen += batch_size

        images = torch.from_numpy(data['images']).cuda()
        images = utils.prepro_norm(images, False)
        images = Variable(images, requires_grad=False)

        if models.is_only_fc_feat(caption_model):
            fc_feats = model_cnn(images)
            seq, _ = model.sample(fc_feats, {'beam_size': beam_size})
        else:
            fc_feats, att_feats = model_cnn(images)
            seq, _ = model.sample(fc_feats, att_feats, {'beam_size': beam_size})

        for k, sent in enumerate(utils.decode_sequence(vocab, seq)):
            entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
            predictions.append(entry)
            if verbose_eval:
                print('image %s: %s' % (entry['image_id'], entry['caption']))

        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        # drop duplicates produced by the final, partially filled batch
        for _ in range(seen - ix1):
            predictions.pop()
        if verbose_eval:
            print('evaluating validation preformance... %d/%d' % (ix0 - 1, ix1))
        if data['bounds']['wrapped']:
            break
    return predictions