def forward(self, att_feats, target_seq, masks):
    batch_size, len_q = target_seq.size()
    # target_pos: batch_size * len_q
    target_pos = np.array([[j + 1 for j in range(len_q)]
                           for _ in range(batch_size)])
    target_pos = torch.LongTensor(target_pos).cuda()
    target_pos = Variable(target_pos, requires_grad=False)
    target_pos = target_pos * masks.long()
    output_enc = self.image_embed(att_feats)
    # target_seq: batch_size * len_q
    # target_pos: batch_size * len_q
    # input_seq: batch_size * len_q
    # output_enc: batch_size * len_q * model_size
    # output_dec: batch_size * len_q * model_size
    # masks: batch_size * len_q
    # seq_logsofts: n_layers * batch_size * len_q * output_size
    seq_logsofts = self.decoder(target_seq, target_pos, output_enc, masks)
    if self.is_show_result:
        sampleLogprobs, seq = torch.max(self.get_logprob_data(seq_logsofts), 2)
        sents = utils.decode_sequence(self.vocab, seq)
        target_sents = utils.decode_sequence(self.vocab, target_seq.data[:, 1:])
        print("===============output=================")
        for k, sent in enumerate(sents):
            print(sent, " | ", target_sents[k])
        print("===============output=================")
    # seq_logsofts: n_layers * batch_size * len_q * output_size
    return seq_logsofts
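# --- Added sketch (not from the original repos): every snippet in this file
# depends on utils.decode_sequence, whose implementation is not shown. A
# minimal plausible version, assuming vocab maps stringified indices to words
# and index 0 marks PAD/EOS (the inline fallbacks in the TensorFlow
# eval_split variants near the end of this file decode exactly this way):
def decode_sequence_sketch(vocab, seq):
    """Turn an index tensor/array of shape (batch, length) into sentences,
    cutting each caption at the first 0."""
    sents = []
    for row in seq.tolist():
        words = []
        for ix in row:
            if ix == 0:  # assumed PAD/EOS index
                break
            words.append(vocab.get(str(ix), ''))
        sents.append(' '.join(words).strip())
    return sents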
def get_scores(self, preds, target):
    if self.bleu_scorer == 'coco':
        bleu_scorer = BleuScorer(n=self.bleu_order)
        coco = True
    else:
        coco = False
    scores = []
    # Go to sentence space to compute scores:
    hypo = decode_sequence(self.vocab, preds)        # candidates
    refs = decode_sequence(self.vocab, target.data)  # references
    num_img = target.size(0) // self.seq_per_img
    for e, h in enumerate(hypo):
        ix_start = e // self.seq_per_img * self.seq_per_img
        ix_end = ix_start + self.seq_per_img
        if coco:
            bleu_scorer += (h, refs[ix_start:ix_end])
        else:
            scores.append(sentence_bleu(h, ' '.join(refs[ix_start:ix_end]),
                                        order=self.bleu_order))
    if coco:
        (score, scores) = bleu_scorer.compute_score()
        scores = scores[-1]
    self.logger.debug("Bleu scores: %s" % str(scores))
    return scores
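# Worked example of the reference-grouping arithmetic above (hypothetical
# numbers, assuming seq_per_img = 5 captions per image):
#   e = 7 is the 8th hypothesis and belongs to image 7 // 5 = 1,
#   so ix_start = 7 // 5 * 5 = 5 and ix_end = 10, i.e. it is scored
#   against refs[5:10], the five references of that same image.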
def forward(self, att_feats, target_seq, masks):
    batch_size, att_size, att_feat_size = att_feats.size()
    batch_size, len_q = target_seq.size()
    # target_pos: batch_size * len_q
    target_pos = np.array([[j + 1 for j in range(len_q)]
                           for _ in range(batch_size)])
    target_pos = torch.LongTensor(target_pos).cuda()
    target_pos = Variable(target_pos, requires_grad=False)
    target_pos = target_pos * masks.long()
    output_enc = self.image_embed(att_feats)
    # target_seq: batch_size * len_q
    # target_pos: batch_size * len_q
    # input_seq: batch_size * len_q
    # output_enc: batch_size * len_q * model_size
    # output_dec: batch_size * len_q * model_size
    # masks: batch_size * len_q
    output_dec = self.decoder(target_seq, target_pos, output_enc, masks)
    if self.drop_prob_lm > 0:
        output_dec = F.dropout(output_dec, self.drop_prob_lm)
    # output_dec: batch_size * len_q * model_size
    # seq_logsofts: batch_size * len_q * output_size
    seq_logsofts = F.log_softmax(self.proj(output_dec), -1)
    if self.is_show_result:
        sampleLogprobs, seq = torch.max(seq_logsofts.data, 2)
        sents = utils.decode_sequence(self.vocab, seq)
        target_sents = utils.decode_sequence(self.vocab, target_seq.data[:, 1:])
        print("===============output=================")
        for k, sent in enumerate(sents):
            print(sent, " | ", target_sents[k])
        print("===============output=================")
    # seq_logsofts: batch_size * len_q * output_size
    return seq_logsofts
def eval_split_n(model, n_predictions, loader, input_data, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    beam_size = eval_kwargs.get('beam_size', 1)
    sample_n = eval_kwargs.get('sample_n', 1)
    sample_n_method = eval_kwargs.get('sample_n_method', 'sample')
    fc_feats, att_feats, att_masks, data = input_data
    tmp_eval_kwargs = eval_kwargs.copy()
    if sample_n_method == 'bs':
        # case 1: sample_n == beam_size; randomness from softmax
        tmp_eval_kwargs.update({'sample_n': 1, 'beam_size': sample_n, 'group_size': 1})
        with torch.no_grad():
            model(fc_feats, att_feats, att_masks, opt=tmp_eval_kwargs, mode='sample')
        for k in range(loader.batch_size):
            _sents = utils.decode_sequence(
                loader.get_vocab(),
                torch.stack([model.done_beams[k][_]['seq'] for _ in range(sample_n)]))
            for sent in _sents:
                entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
                n_predictions.append(entry)
    elif sample_n_method == 'sample' or \
            sample_n_method == 'gumbel' or \
            sample_n_method.startswith('top'):
        # case 2: plain / Gumbel / top-k / nucleus sampling; randomness from sampling
        tmp_eval_kwargs.update({'sample_n': sample_n,
                                'sample_method': sample_n_method,
                                'beam_size': 1})
        with torch.no_grad():
            _seq, _sampleLogprobs = model(fc_feats, att_feats, att_masks,
                                          opt=tmp_eval_kwargs, mode='sample')
        _sents = utils.decode_sequence(loader.get_vocab(), _seq)
        _perplexity = - _sampleLogprobs.gather(2, _seq.unsqueeze(2)).squeeze(2).sum(1) \
            / ((_seq > 0).float().sum(1) + 1)
        for k, sent in enumerate(_sents):
            entry = {'image_id': data['infos'][k // sample_n]['id'],
                     'caption': sent,
                     'perplexity': _perplexity[k].item()}
            n_predictions.append(entry)
    elif sample_n_method == 'dbs':
        # case 3: diverse beam search; randomness from softmax
        tmp_eval_kwargs.update({'beam_size': sample_n * beam_size,
                                'group_size': sample_n})
        with torch.no_grad():
            model(fc_feats, att_feats, att_masks, opt=tmp_eval_kwargs, mode='sample')
        for k in range(loader.batch_size):
            _sents = utils.decode_sequence(
                loader.get_vocab(),
                torch.stack([model.done_beams[k][_]['seq']
                             for _ in range(0, sample_n * beam_size, beam_size)]))
            for sent in _sents:
                entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
                n_predictions.append(entry)
    else:
        # randomness from softmax
        tmp_eval_kwargs.update({'sample_method': sample_n_method[1:],
                                'group_size': sample_n,
                                'beam_size': 1})
        with torch.no_grad():
            _seq, _sampleLogprobs = model(fc_feats, att_feats, att_masks,
                                          opt=tmp_eval_kwargs, mode='sample')
        _sents = utils.decode_sequence(loader.get_vocab(), _seq)
        for k, sent in enumerate(_sents):
            entry = {'image_id': data['infos'][k // sample_n]['id'], 'caption': sent}
            n_predictions.append(entry)
    if verbose:
        for entry in sorted(n_predictions[-loader.batch_size * sample_n:],
                            key=lambda x: x['image_id']):
            print('image %s: %s' % (entry['image_id'], entry['caption']))
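# Standalone sketch (not from the original repo) of the perplexity term in
# case 2 above, with made-up shapes; assumes _sampleLogprobs is
# batch x len x vocab and 0 is the padding index:
import torch

logprobs = torch.randn(2, 4, 10).log_softmax(-1)  # batch x len x vocab
seq = torch.randint(1, 10, (2, 4))                # sampled token ids
token_lp = logprobs.gather(2, seq.unsqueeze(2)).squeeze(2)  # batch x len
# negative mean log-prob per caption; the +1 accounts for the EOS step
perplexity = -token_lp.sum(1) / ((seq > 0).float().sum(1) + 1)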
def forward(self, fc_feats, att_feats, target_seq, masks):
    # target_seq includes BOS
    batch_size, len_q = target_seq.size()
    # target_pos: batch_size * len_q
    target_pos = np.array([[j + 1 for j in range(len_q)]
                           for _ in range(batch_size)])
    target_pos = torch.LongTensor(target_pos).cuda()
    target_pos = Variable(target_pos, requires_grad=False)
    target_pos = target_pos * masks.long()
    # fc_feats: batch_size * model_size
    # att_feats: batch_size * att_size * model_size
    fc_feats, att_feats = self.embed_feats(fc_feats, att_feats)
    # target_seq: batch_size * len_q
    # target_pos: batch_size * len_q
    # fc_feats: batch_size * model_size
    # att_feats: batch_size * att_size * model_size
    # masks: batch_size * len_q
    # output_dec: batch_size * len_q * model_size
    # proj_wg: batch_size * len_q * (vocab_size + 1)
    output_dec, proj_wg = self.decoder(target_seq, target_pos, fc_feats,
                                       att_feats, masks)
    if self.drop_prob_lm > 0:
        output_dec = F.dropout(output_dec, self.drop_prob_lm)
    # output_dec: batch_size * len_q * model_size
    # seq_logsofts: batch_size * len_q * output_size
    seq_logsofts = F.log_softmax(self.proj(output_dec) * proj_wg, -1)
    if self.is_show_result:
        sampleLogprobs, seq = torch.max(seq_logsofts.data, 2)
        sents = utils.decode_sequence(self.vocab, seq)
        target_sents = utils.decode_sequence(self.vocab, target_seq.data[:, 1:])
        print("===============output=================")
        for k, sent in enumerate(sents):
            print(sent, " | ", target_sents[k])
        print("===============output=================")
    # seq_logsofts: batch_size * len_q * output_size
    return seq_logsofts, proj_wg
def get_self_critical_reward2(greedy_res, vid_info, gen_result, vocab, opt):
    vid, sentences_batch = vid_info
    start = time.time()
    # batch_size = sample_size * seq_per_img
    batch_size, sent_len = gen_result.size()
    # decode the sampled and the greedy (baseline) sequences
    gen_result = utils.decode_sequence(vocab, gen_result)
    greedy_res = utils.decode_sequence(vocab, greedy_res)
    res = OrderedDict()
    for i in range(batch_size):
        res[i] = [remove_nonascii(gen_result[i])]
    for i in range(batch_size):
        res[batch_size + i] = [remove_nonascii(greedy_res[i])]
    gts = OrderedDict()
    for i in range(len(sentences_batch)):
        gts[i] = [remove_nonascii(sentences_batch[i])]
    res_ = [{'image_id': i, 'caption': res[i]}
            for i in range(2 * batch_size)]  # for CIDEr
    res__ = {i: res[i] for i in range(2 * batch_size)}
    gts = {i: gts[i % batch_size] for i in range(2 * batch_size)}
    if opt.meteor_reward_weight > 0:
        _, meteor_score = Meteor_scorer.compute_score(gts, res__)
    else:
        meteor_score = 0
    if opt.meteor_reward_weight < 1:
        _, cider_score = Cider_scorer.compute_score(gts, res_)
    else:
        cider_score = 0
    scores = opt.meteor_reward_weight * np.array(meteor_score) + \
        (1 - opt.meteor_reward_weight) * np.array(cider_score)
    # sampled scores minus the greedy baseline, broadcast over time steps
    scores = scores[:batch_size] - scores[batch_size:]
    rewards = np.repeat(scores[:, np.newaxis], sent_len, 1)
    return rewards
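# Toy illustration (made-up scores) of the self-critical baseline computed in
# the last three lines above: sampled-caption scores minus greedy-caption
# scores, tiled over every time step of the generated sequence.
import numpy as np

batch_size, sent_len = 2, 4
scores = np.array([0.8, 0.5, 0.6, 0.7])                     # [sampled..., greedy...]
advantage = scores[:batch_size] - scores[batch_size:]       # -> [0.2, -0.2]
rewards = np.repeat(advantage[:, np.newaxis], sent_len, 1)  # shape (2, 4)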
def eval_print(loader, data, seq, predictions, n, eval_kwargs={}):
    beam_size = eval_kwargs.get('beam_size', 1)
    verbose_beam = eval_kwargs.get('verbose_beam', 1)
    num_images = eval_kwargs.get('num_images', eval_kwargs.get('val_images_use', -1))
    verbose = eval_kwargs.get('verbose', True)
    # print beam search results
    if beam_size > 1 and verbose_beam:
        for i in range(loader.batch_size):
            print('\n'.join(
                utils.decode_sequence(loader.get_vocab(), _['seq'].unsqueeze(0))[0]
                for _ in model.done_beams[i]))
            print('---' * 10)
    sents = utils.decode_sequence(loader.get_vocab(), seq)
    for k, sent in enumerate(sents):
        entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
        predictions.append(entry)
        if eval_kwargs.get('dump_images', 0) == 1:
            # dump the raw image to the vis/ folder
            cmd = 'cp "' + os.path.join(eval_kwargs['image_root'],
                                        data['infos'][k]['file_path']) + \
                  '" vis/imgs/img' + str(len(predictions)) + '.jpg'  # bit gross
            print(cmd)
            os.system(cmd)
        if verbose:
            print('image %s: %s' % (entry['image_id'], entry['caption']))
    # if we wrapped around the split or used up the val image budget, bail
    ix0 = data['bounds']['it_pos_now']
    ix1 = data['bounds']['it_max']
    if num_images != -1:
        ix1 = min(ix1, num_images)
    for i in range(n - ix1):
        predictions.pop()
    if verbose:
        if ix0 % 200 == 0:
            print('evaluating validation performance... %d/%d' % (ix0, ix1)) if ix0 else \
                print('evaluating validation performance... %d/%d' % (ix1, ix1))
    if data['bounds']['wrapped']:
        return True
    if num_images >= 0 and n >= num_images:
        return True
    return False
def get_scores(self, preds, target):
    # The reward loss:
    cider_scorer = CiderScorer(n=4, sigma=6)
    # Go to sentence space to compute scores:
    hypo = decode_sequence(self.vocab, preds)        # candidates
    refs = decode_sequence(self.vocab, target.data)  # references
    num_img = target.size(0) // self.seq_per_img
    for e, h in enumerate(hypo):
        ix_start = e // self.seq_per_img * self.seq_per_img
        ix_end = ix_start + self.seq_per_img
        cider_scorer += (h, refs[ix_start:ix_end])
    (score, scores) = cider_scorer.compute_score()
    self.logger.debug("CIDEr scores: %s" % str(scores))
    return scores
def get_scores(self, preds, target):
    hypo = decode_sequence(self.vocab, preds)        # candidates
    refs = decode_sequence(self.vocab, target.data)  # references
    num_img = target.size(0) // self.seq_per_img
    scores = []
    lr = len(refs)
    # encode references and hypotheses in a single InferSent batch
    codes = self.infersent.encode(refs + hypo)
    refs = codes[:lr]
    hypo = codes[lr:]
    for e, h in enumerate(hypo):
        ix_start = e // self.seq_per_img * self.seq_per_img
        ix_end = ix_start + self.seq_per_img
        scores.append(group_similarity(h, refs[ix_start:ix_end]))
    self.logger.debug("InferSent similarities: %s" % str(scores))
    return scores
def forward(self, input, input1, seq, seq1, target, vocab):
    # truncate to the same size
    # input: batch_size * (seq_length + 2) * (vocab_size + 1)
    # target: batch_size * seq_length
    batch_size, L, Mp1 = input.size(0), input.size(1), input.size(2)
    seq_length = target.size(1)
    loss = Variable(torch.FloatTensor(1).zero_(), requires_grad=True).cuda()
    n = 0
    label = utils.decode_sequence(vocab, target.data)
    seq = utils.decode_sequence(vocab, seq)
    seq1 = utils.decode_sequence(vocab, seq1)
    reward = utils.get_reward(seq, label, "CIDEr")    # train
    reward1 = utils.get_reward(seq1, label, "CIDEr")  # test
    reward_diff = reward - reward1
    if reward_diff < 1:
        reward_diff = 1
    for b in range(batch_size):
        first_time = True
        for t in range(1, L):
            if t - 1 >= seq_length:
                target_index = 0
            else:
                target_index = target.data[b, t - 1]
            if target_index == 0 and first_time:
                first_time = False
            elif target_index == 0 and not first_time:
                break
            logsoft = input[b, t, target_index]
            loss.sub_(logsoft)
            n += 1
    loss.div_(n)
    loss.mul_(reward_diff)
    return loss, reward, reward1
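# Worked trace (hypothetical indices) of the double-zero check in the inner
# loop above, for one caption with targets [5, 9, 0, 0, 0] and L = 6:
#   t=1 -> index 5 (counted), t=2 -> index 9 (counted),
#   t=3 -> first 0: still counted, so the EOS position is trained,
#          and first_time flips to False,
#   t=4 -> second 0 -> break; trailing padding never contributes to the loss.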
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"],
                             opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return
    convnet = 'nasnetalarge'
    vocab = dataset.get_vocab()
    full_decoder = ConvS2VT(convnet, model, opt)
    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    for video_path in opt['videos']:
        print(video_path)
        with torch.no_grad():
            frames = skvideo.io.vread(video_path)
            batches = create_batches(frames, load_img_fn, tf_img_fn)
            seq_prob, seq_preds = full_decoder(batches, mode='inference')
            sents = utils.decode_sequence(vocab, seq_preds)
        for sent in sents:
            print(sent)
def demo_simple(obj_det_model, model, img_name):
    model.eval()
    file_path = img_name
    max_proposal = 200
    num_proposal = 6
    num_nms = 6
    # load the image
    input_imgs = torch.FloatTensor(1)
    img = Image.open(file_path).convert('RGB')
    # resize the image
    img = transforms.Resize((opt.image_crop_size, opt.image_crop_size))(img)
    ppls = get_caption(obj_det_model, img_name)
    pad_proposals = torch.cat([i.unsqueeze(dim=0) for i in ppls]).unsqueeze(dim=0)
    num = torch.FloatTensor([1, len(ppls), len(ppls)]).unsqueeze(dim=0)
    img = transforms.ToTensor()(img)
    img = transforms.Normalize([0.485, 0.456, 0.406],
                               [0.229, 0.224, 0.225])(img).unsqueeze(dim=0)
    img = img.cuda()
    pad_proposals = pad_proposals.cuda()
    num = num.cuda()
    eval_opt = {'sample_max': 1, 'beam_size': opt.beam_size,
                'inference_mode': True, 'tag_size': opt.cbs_tag_size}
    seq, bn_seq, fg_seq, _, _, _ = model._sample(img, pad_proposals, num, eval_opt)
    sents = utils.decode_sequence(dataset.itow, dataset.itod, dataset.ltow,
                                  dataset.itoc, dataset.wtod,
                                  seq.data, bn_seq.data, fg_seq.data,
                                  opt.vocab_size, opt)
    return sents
def get_caption(model, crit, dataset, vocab, opt):
    model.eval()
    loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    samples = {}
    for data in loader:
        fc_feats = Variable(data['fc_feats'], volatile=True).cuda()
        video_ids = data['video_ids']
        # forward the model to get generated samples for each video
        seq_probs, seq_preds = model(fc_feats, mode='inference', opt=opt)
        sents = utils.decode_sequence(vocab, seq_preds)
        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]
    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])
    with open(os.path.join(opt["results_path"],
                           opt["model"].split("/")[-1].split('.')[0] + ".json"),
              'w') as prediction_results:
        json.dump({"predictions": samples}, prediction_results)
def eval_split(model, vocab, eval_kwargs):
    loader = get_eval_loader(kwargs=eval_kwargs)
    print("assigned {} images for model evaluation in Karpathy {} split".format(
        len(loader), eval_kwargs['eval_split']))
    predictions = []
    eval_mode = eval_kwargs.get('eval_mode', 0)
    start = time.time()
    for i, batch in enumerate(loader):
        temp = [_.cuda() for _ in batch]
        cocoid, fc_feat, att_feat = temp
        word_idx, father_idx, mask = model._greedy_search(fc_feat, att_feat, 40)
        sents = utils.decode_sequence(vocab, word_idx, father_idx, mask)
        for j in range(len(sents)):
            entry = {'image_id': cocoid[j].item(), 'caption': sents[j]}
            print('{}: {}({})'.format(cocoid[j].item(), sents[j], i))
            predictions.append(entry)
        if i > eval_kwargs['eval_images'] >= 0:
            break
    print("inference took {} seconds.".format(time.time() - start))
    lang_stat = language_eval(predictions, eval_kwargs['id'],
                              eval_kwargs['eval_split'])
    if not eval_kwargs['eval_time']:
        model.train()
        model.set_gpu(eval_kwargs['use_cuda'] == 1)
    return lang_stat
def eval_split(model_cnn, model, filepaths, ix_to_word, eval_kwargs={}):
    verbose_eval = eval_kwargs.get('verbose_eval', True)
    beam_size = eval_kwargs.get('beam_size', 1)
    caption_model = eval_kwargs.get('caption_model', '')
    batch_size = eval_kwargs.get('batch_size', 1)
    predictions = []
    data = get_batch(filepaths, batch_size)
    images = torch.from_numpy(data['images']).cuda()
    images = utils.prepro_norm(images, False)
    images = Variable(images, requires_grad=False)
    if models.is_only_fc_feat(caption_model):
        fc_feats = model_cnn(images)
        seq, _ = model.sample(fc_feats, {'beam_size': beam_size})
    else:
        fc_feats, att_feats = model_cnn(images)
        seq, _ = model.sample(fc_feats, att_feats, {'beam_size': beam_size})
    # sents
    sents = utils.decode_sequence(ix_to_word, seq)
    for k, sent in enumerate(sents):
        print(sent)
        sent = ''.join(sent.split())
        predictions.append(sent)
    return predictions
def test(model, crit, dataset, vocab, opt):
    loader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=True)
    scorer = COCOScorer()
    gt_dataframe = json_normalize(json.load(open(opt.input_json))['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in loader:
        fc_feats = Variable(data['fc_feats']).cuda()
        labels = Variable(data['labels']).long().cuda()
        # forward the model to get generated samples for each video
        with torch.no_grad():
            seq_probs, seq_preds = model(fc_feats, labels, teacher_forcing_ratio=0)
        print(seq_preds)
        sents = utils.decode_sequence(vocab, seq_preds)
        for k, sent in enumerate(sents):
            video_id = 'video' + str(data['ix'][k])
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]
    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)
    if not os.path.exists(opt.results_path):
        os.makedirs(opt.results_path)
    with open(os.path.join(opt.results_path, "scores.txt"), 'a') as scores_table:
        scores_table.write(json.dumps(results[0]) + "\n")
    with open(os.path.join(opt.results_path,
                           opt.model.split("/")[-1].split('.')[0] + ".json"),
              'w') as prediction_results:
        json.dump({"predictions": samples, "scores": valid_score},
                  prediction_results)
def test(model, crit, dataset, vocab, opt):
    model.eval()
    loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=False)
    scorer = COCOScorer()
    gt_dataframe = json_normalize(json.load(open(opt["input_json"]))['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    samples = {}
    for index, data in enumerate(loader):
        print('batch: ' + str((index + 1) * opt["batch_size"]))
        fc_feats = Variable(data['fc_feats'], volatile=True).cuda()
        labels = Variable(data['labels'], volatile=True).long().cuda()
        masks = Variable(data['masks'], volatile=True).cuda()
        video_ids = data['video_ids']
        # forward the model to get generated samples for each video
        seq_probs, seq_preds = model(fc_feats, mode='inference', opt=opt)
        sents = utils.decode_sequence(vocab, seq_preds)
        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]
    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])
    result = OrderedDict()
    result['checkpoint'] = opt["saved_model"][opt["saved_model"].rfind('/') + 1:]
    score_sum = 0
    for key, value in valid_score.items():
        score_sum += float(value)
    result['sum'] = str(score_sum)
    result = OrderedDict(list(result.items()) + list(valid_score.items()))
    print(result)
    with open(os.path.join(opt["results_path"], "scores.txt"), 'a') as scores_table:
        scores_table.write(json.dumps(result) + "\n")
    with open(os.path.join(opt["results_path"],
                           opt["model"].split("/")[-1].split('.')[0] + ".json"),
              'w') as prediction_results:
        json.dump({"predictions": samples, "scores": valid_score},
                  prediction_results)
def eval_split(model_cnn, model, loader, eval_kwargs={}):
    verbose_eval = eval_kwargs.get('verbose_eval', True)
    beam_size = eval_kwargs.get('beam_size', 1)
    caption_model = eval_kwargs.get('caption_model', '')
    batch_size = eval_kwargs.get('batch_size', 1)
    split = ''
    loader.reset_iterator(split)
    n = 0
    predictions = []
    vocab = loader.get_vocab()
    while True:
        data = loader.get_batch(split, batch_size)
        n = n + batch_size
        images = torch.from_numpy(data['images']).cuda()
        images = utils.prepro_norm(images, False)
        images = Variable(images, requires_grad=False)
        if models.is_only_fc_feat(caption_model):
            fc_feats = model_cnn(images)
            seq, _ = model.sample(fc_feats, {'beam_size': beam_size})
        else:
            fc_feats, att_feats = model_cnn(images)
            seq, _ = model.sample(fc_feats, att_feats, {'beam_size': beam_size})
        # sents
        sents = utils.decode_sequence(vocab, seq)
        for k, sent in enumerate(sents):
            image_id = data['infos'][k]['id']
            image_id = int(image_id.split('_')[2])
            entry = {'image_id': image_id, 'caption': sent}
            predictions.append(entry)
            if verbose_eval:
                print('image %s: %s' % (entry['image_id'], entry['caption']))
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        for i in range(n - ix1):
            predictions.pop()
        if verbose_eval:
            print('evaluating validation performance... %d/%d' % (ix0 - 1, ix1))
        if data['bounds']['wrapped']:
            break
    return predictions
def predict(model, crit, loader, eval_kwargs={}):
    print('loader.batch_size', loader.batch_size)
    verbose = eval_kwargs.get('verbose', True)
    use_cpu = eval_kwargs.get('use_cpu', False)
    # Make sure we are in evaluation mode
    model.eval()
    loader.reset_iterator('dummy')
    n = 0
    predictions = []
    while True:
        data = loader.get_batch('dummy')
        n = n + loader.batch_size
        # Only keep one feature per image, in case of duplicate samples
        tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
               data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
               data['att_masks'][np.arange(loader.batch_size) * loader.seq_per_img]]
        tmp = [Variable(torch.from_numpy(_), volatile=True) for _ in tmp]
        if not use_cpu:
            tmp = [_.cuda() for _ in tmp]
        fc_feats, att_feats, att_masks = tmp
        # forward the model to get generated samples for each image
        seq = model(fc_feats, att_feats, att_masks,
                    opt=eval_kwargs, mode='sample')[0].data
        sents = utils.decode_sequence(loader.get_vocab(), seq)
        for k, sent in enumerate(sents):
            if verbose:
                print('image %s: ' % (data['infos'][k]['id']),
                      sent.encode('utf8', 'replace'))
            entry = {'image_id': data['infos'][k]['id'],
                     'caption': sent,
                     'file_path': data['infos'][k]['file_path']}
            if eval_kwargs.get('dump_path', 0) == 1:
                entry['file_name'] = data['infos'][k]['file_path']
            predictions.append(entry)
        # if we wrapped around the split, bail
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        for i in range(n - ix1):
            predictions.pop()
        if data['bounds']['wrapped']:
            break
    # Switch back to training mode
    model.train()
    return predictions
def test(model, crit, dataset, vocab, device, opt):
    model.eval()
    loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    scorer = COCOScorer()
    gt_dataframe = json_normalize(json.load(open(opt["input_json"]))['sentences'])
    gts = convert_data_to_coco_scorer_format(gt_dataframe)
    results = []
    samples = {}
    for data in loader:
        fc_feats = data['fc_feats'].to(device)
        labels = data['labels'].to(device)
        masks = data['masks'].to(device)
        video_ids = data['video_ids']
        if opt["model"] == "S2VTACTModel":
            action = data['action'].to(device)
        # forward the model to get generated samples for each video
        with torch.no_grad():
            if opt["model"] == "S2VTModel":
                seq_probs, seq_preds = model(fc_feats, mode='inference', opt=opt)
            else:
                seq_probs, seq_preds = model(fc_feats, action=action, device=device,
                                             mode='inference', opt=opt)
        sents = utils.decode_sequence(vocab, seq_preds)
        for k, sent in enumerate(sents):
            video_id = video_ids[k]
            samples[video_id] = [{'image_id': video_id, 'caption': sent}]
    with suppress_stdout_stderr():
        valid_score = scorer.score(gts, samples, samples.keys())
    results.append(valid_score)
    print(valid_score)
    if not os.path.exists(opt["results_path"]):
        os.makedirs(opt["results_path"])
    with open(os.path.join(opt["results_path"], "scores.txt"), 'a') as scores_table:
        scores_table.write(json.dumps(results[0]) + "\n")
    with open(os.path.join(opt["results_path"],
                           opt["model"].split("/")[-1].split('.')[0] + ".json"),
              'w') as prediction_results:
        json.dump({"predictions": samples, "scores": valid_score},
                  prediction_results)
def test(model, crit, dataset, vocab, opt): model.eval() loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True) scorer = COCOScorer() gt_dataframe = json_normalize( json.load(open(opt["input_json"]))['sentences']) gts = convert_data_to_coco_scorer_format(gt_dataframe) results = [] samples = {} seq_probs_list = [] seq_preds_list = [] masks_list = [] labels_list = [] for data in loader: # forward the model to get loss fc_feats = data['fc_feats'].cuda() if(opt["with_mean"] == 0): feats_3d = data['feats_3d'].cuda() labels = data['labels'].cuda() masks = data['masks'].cuda() video_ids = data['video_ids'] # forward the model to also get generated samples for each image with torch.no_grad(): if(opt["with_mean"] == 1): seq_probs, seq_preds = model( fc_feats, mode='inference', opt=opt) else: seq_probs, seq_preds = model( fc_feats, feats_3d, mode='inference', opt=opt) sents = utils.decode_sequence(vocab, seq_preds) for k, sent in enumerate(sents): video_id = video_ids[k] samples[video_id] = [{'image_id': video_id, 'caption': sent}] seq_preds_list.append(seq_preds) seq_probs_list.append(seq_probs) masks_list.append(masks) labels_list.append(labels) with suppress_stdout_stderr(): valid_score = scorer.score(gts, samples, samples.keys()) results.append(valid_score) print(valid_score) seq_probs_list = torch.cat(seq_probs_list, 0) seq_preds_list = torch.cat(seq_preds_list, 0) labels_list = torch.cat(labels_list, 0) masks_list = torch.cat(masks_list, 0) return valid_score, samples, seq_probs_list, seq_preds_list, labels_list, masks_list
def eval_print_caption(loader, info, seq, predictions):
    # TODO: support beam search > 1
    sent = utils.decode_sequence(loader.get_vocab(), seq)[0]
    seq_ = seq.squeeze().data.cpu().numpy()
    entry = {'image_id': info['id'],
             'caption': sent,
             'caption_ix': seq_.tolist()}
    predictions.append(entry)
    print('image %s: %s' % (entry['image_id'], entry['caption']))
def test(model, crit, dataset, vocab, opt): model.eval() loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True) scorer = COCOScorer() gt_dataframe = json_normalize( json.load(open('data_subset/vatex_subsample_v1.0.json'))) gts = convert_data_to_coco_scorer_format(gt_dataframe, 'chinese') results = [] samples = {} for data in loader: # forward the model to get loss i3d_feats = data['i3d_feats'].squeeze(1) #.cuda() labels = data['labels'] #.cuda() masks = data['masks'] #.cuda() video_ids = data['video_ids'] # forward the model to also get generated samples for each image with torch.no_grad(): seq_probs, seq_preds = model(i3d_feats, mode='inference', opt=opt) sents = utils.decode_sequence(vocab, seq_preds) for k, sent in enumerate(sents): video_id = video_ids[k] samples[video_id] = [{'image_id': video_id, 'caption': sent}] with suppress_stdout_stderr(): valid_score = scorer.score(gts, samples, samples.keys()) results.append(valid_score) print(valid_score) if not os.path.exists(opt["results_path"]): os.makedirs(opt["results_path"]) with open( os.path.join(opt["results_path"], "chinese_LSTM_OPT_epoch601_scores.txt"), 'a') as scores_table: scores_table.write(json.dumps(results[0]) + "\n") with open( os.path.join( opt["results_path"], opt["model"].split("/")[-1].split('.')[0] + "_chinese_LSTM_OPT_epoch601.json"), 'w') as prediction_results: json.dump({ "predictions": samples, "scores": valid_score }, prediction_results, indent=2)
def image_captioning_body(img):
    if img.shape[0] == 2:
        img = img[0]
    print(img.shape)
    fc_batch = np.ndarray((batch_size, 2048), dtype='float32')
    att_batch = np.ndarray((batch_size, 14, 14, 2048), dtype='float32')
    if len(img.shape) == 2:
        # grayscale image: replicate to three channels
        img = img[:, :, np.newaxis]
        img = np.concatenate((img, img, img), axis=2)
    img = img.astype('float32') / 255.0
    img = torch.from_numpy(img.transpose([2, 0, 1]))
    with torch.no_grad():
        img = Variable(preprocess(img))
        if use_cuda:
            img = img.cuda()
        tmp_fc, tmp_att = my_resnet(img)
    fc_batch[0] = tmp_fc.data.cpu().float().numpy()
    att_batch[0] = tmp_att.data.cpu().float().numpy()
    data['fc_feats'] = fc_batch
    data['att_feats'] = att_batch
    tmp = [data['fc_feats'][np.arange(batch_size)],
           data['att_feats'][np.arange(batch_size)]]
    with torch.no_grad():
        if use_cuda:
            tmp = [Variable(torch.from_numpy(_)).cuda() for _ in tmp]
        else:
            tmp = [Variable(torch.from_numpy(_)) for _ in tmp]
        fc_feats, att_feats = tmp
        # forward the model to get generated samples for the image
        seq, _ = model.sample(fc_feats, att_feats, vars(opt))
    seq = seq.cpu().numpy()
    sents = utils.decode_sequence(vocab, seq)
    print(sents)
    return sents[0]
def make_sents_mask(gen_result, vocab):
    length = gen_result.shape[1]
    sents_mask = []
    sents = utils.decode_sequence(vocab, gen_result)
    for sent in sents:
        sent_mask = [0] * length
        tokens = sent.lower().split()
        tag = nltk.pos_tag(tokens)
        for i in range(len(tag)):
            # mark nouns and adjectives
            if tag[i][1] in ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS']:
                sent_mask[i] = 1
        sents_mask.append(sent_mask)
    return torch.Tensor(sents_mask).cuda()
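# Quick standalone run (made-up sentence) of the same POS filter used above;
# needs nltk's 'averaged_perceptron_tagger' data to be downloaded:
import nltk

tokens = "a brown dog runs on the grass".split()
tags = nltk.pos_tag(tokens)
# e.g. [('a', 'DT'), ('brown', 'JJ'), ('dog', 'NN'), ('runs', 'VBZ'), ...]
mask = [1 if t in ('NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS') else 0
        for _, t in tags]
# -> [0, 1, 1, 0, 0, 0, 1] for the tagging above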
def get_bert_ids(self, seqs):
    input_ids = []
    txts = utils.decode_sequence(self.vocab, seqs[:, 1:], add_punct=True)
    txts = [txt.replace('<blank>', self.tokenizer.mask_token)
               .replace('<UNK>', self.tokenizer.unk_token) for txt in txts]
    ids = [self.tokenizer.convert_tokens_to_ids(
               ["[CLS]"] + self.tokenizer.tokenize(txt) + ["[SEP]"])
           for txt in txts]
    max_seq_length = max([len(input_id) for input_id in ids])
    pad_token = self.tokenizer.pad_token
    for input_id in ids:
        padding = [self.tokenizer._convert_token_to_id(pad_token)] * \
            (max_seq_length - len(input_id))
        input_id += padding
        input_ids.append(input_id)
    input_ids = torch.tensor(input_ids, dtype=torch.long).cuda().contiguous()
    return input_ids
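# Standalone sketch (made-up ids) of the padding step above; in practice the
# public tokenizer attribute pad_token_id is the counterpart of the private
# _convert_token_to_id(pad_token) call:
ids = [[101, 2023, 102], [101, 2003, 1037, 3231, 102]]  # made-up token ids
pad_id = 0                                              # tokenizer.pad_token_id
max_len = max(len(x) for x in ids)
input_ids = [x + [pad_id] * (max_len - len(x)) for x in ids]
# -> [[101, 2023, 102, 0, 0], [101, 2003, 1037, 3231, 102]]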
def demov(model, crit, dataset, vocab, opt):
    loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=False)
    for i, data in enumerate(loader):
        fc_feats = data['fc_feats'].cuda()
        # forward the model to get generated samples for each video
        with torch.no_grad():
            seq_probs, seq_preds = model(fc_feats, mode='inference', opt=opt)
        sents = utils.decode_sequence(vocab, seq_preds)
        print(sents)
        if i == 0:
            break
def gen_caption(self, im_target, im_reference=None):
    # A relative captioner needs a reference image; an absolute one must not get one.
    if self.is_relative and im_reference is None:
        return ''
    if not self.is_relative and im_reference is not None:
        return ''
    fc_feat, att_feat = self.get_feat(im_target, im_reference)
    tmp = (fc_feat, att_feat)
    tmp = [torch.from_numpy(_).to(DEVICE) for _ in tmp]
    fc_feat, att_feat = tmp
    if not self.opt['use_att']:
        att_feat = torch.zeros(1, 1, 1, 1)
    seq, _ = self.model.sample(fc_feat, att_feat, self.opt)
    sents = utils.decode_sequence(self.vocab, seq)
    return seq, sents
def find_type(greedy_res, vocab):
    # 0: TE, 1: TIE, 2: TDE
    effect_type = []
    sents = utils.decode_sequence(vocab, greedy_res)[0]
    tokens = sents.lower().split()
    tag = nltk.pos_tag(tokens)
    for i in range(len(tag)):
        if tag[i][1] in ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS']:
            effect_type.append(1)
        elif tag[i][1] in ['CC', 'IN', 'RP']:
            effect_type.append(2)
        else:
            effect_type.append(0)
    return effect_type
def find_type(seq, vocab, force=False):
    # 0: TE, 1: TIE, 2: TDE
    if force:
        effect_type = [2] * seq.shape[1]
    else:
        effect_type = [0] * seq.shape[1]
    sents = utils.decode_sequence(vocab, seq)[0]
    tokens = sents.lower().split()
    tag = nltk.pos_tag(tokens)
    for i in range(len(tag)):
        if tag[i][1] in ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS']:
            effect_type[i] = 1
    return effect_type
def eval_split(model, crit, loader, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    num_images = eval_kwargs.get('num_images', eval_kwargs.get('val_images_use', -1))
    split = eval_kwargs.get('split', 'test')
    lang_eval = eval_kwargs.get('language_eval', 0)
    dataset = eval_kwargs.get('dataset', 'coco')
    print_all_beam = eval_kwargs.get('print_all_beam', False)
    print('> print_all_beam', print_all_beam)
    # Make sure we are in evaluation mode
    model.eval()
    loader.reset_iterator(split)
    n = 0
    loss = 0
    loss_sum = 0
    loss_evals = 1e-8
    predictions = []
    while True:
        data = loader.get_batch(split)
        n = n + loader.batch_size
        if data.get('labels', None) is not None:
            # forward the model to get the loss
            tmp = [data['fc_feats'], data['att_feats'], data['labels'],
                   data['masks'], data['attributes']]
            tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, attributes = tmp
            loss = crit(model(fc_feats, att_feats, labels),
                        labels[:, 1:], masks[:, 1:], attributes).data[0]
            loss_sum = loss_sum + loss
            loss_evals = loss_evals + 1
        # forward the model to also get generated samples for each image
        # Only keep one feature per image, in case of duplicate samples
        tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
               data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
        tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp]
        fc_feats, att_feats = tmp
        seq, prob, attr = model.sample(fc_feats, att_feats, eval_kwargs)
        if print_all_beam:
            # seq: [image_idx, beam_idx, sentence]
            for p in range(seq.shape[0]):
                seq_this = seq[p, :, :]
                prob_this = prob[p, :, :]
                sents = utils.decode_sequence(loader.get_vocab(), seq_this)
                print('---------------------------------------------------------')
                print('> video id %s:' % data['infos'][p]['id'])
                for k, sent in enumerate(sents):
                    entry = {'image_id': data['infos'][p]['id'], 'caption': sent}
                    if eval_kwargs.get('dump_path', 0) == 1:
                        entry['file_name'] = data['infos'][p]['file_path']
                    predictions.append(entry)
                    if eval_kwargs.get('dump_images', 0) == 1:
                        # dump the raw image to the vis/ folder
                        cmd = 'cp "' + os.path.join(eval_kwargs['image_root'],
                                                    data['infos'][k]['file_path']) + \
                              '" vis/imgs/img' + str(len(predictions)) + '.jpg'  # bit gross
                        print(cmd)
                        os.system(cmd)
                    if verbose:
                        print('  %s (%.5f)' % (entry['caption'],
                                               math.exp(sum(prob_this[k, :]))))
                print('---------------------------------------------------------')
                if split == 'show':
                    p = input()
        else:
            sents = utils.decode_sequence(loader.get_vocab(), seq)
            for k, sent in enumerate(sents):
                entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
                if eval_kwargs.get('dump_path', 0) == 1:
                    entry['file_name'] = data['infos'][k]['file_path']
                predictions.append(entry)
                if eval_kwargs.get('dump_images', 0) == 1:
                    # dump the raw image to the vis/ folder
                    cmd = 'cp "' + os.path.join(eval_kwargs['image_root'],
                                                data['infos'][k]['file_path']) + \
                          '" vis/imgs/img' + str(len(predictions)) + '.jpg'  # bit gross
                    print(cmd)
                    os.system(cmd)
                this_attr = attr[k, :].data.cpu().numpy()
                assert this_attr.shape == (1000,)
                this_gt_attr = attributes[k * loader.seq_per_img, :].data.cpu().numpy()
                gt_attr_indices = this_gt_attr.argsort()[-5:][::-1]
                attr_indices = this_attr.argsort()[-5:][::-1]
                gt_label = labels[k * loader.seq_per_img, 1:].data.cpu().numpy()
                if verbose:
                    print('video %s: %s' % (entry['image_id'], entry['caption']))
                    print('  gt: %s' % ' '.join(
                        [loader.ix_to_word[str(p)] for p in gt_label if p > 0]))
                    print('  attr: %s' % ' '.join(
                        [loader.attr_idx2word[id] for id in attr_indices]))
                    print('  gt labels: %s' % ' '.join(
                        [loader.attr_idx2word[id] for id in gt_attr_indices]))
        # if we wrapped around the split or used up the val image budget, bail
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if num_images != -1:
            ix1 = min(ix1, num_images)
        for i in range(n - ix1):
            predictions.pop()
        if verbose and split != 'show':
            print('evaluating validation performance... %d/%d (%f)' % (ix0 - 1, ix1, loss))
        if data['bounds']['wrapped']:
            break
        if num_images >= 0 and n >= num_images:
            break
    lang_stats = None
    if lang_eval == 1:
        lang_stats = language_eval(dataset, predictions, eval_kwargs['id'], split)
    # Switch back to training mode
    model.train()
    return loss_sum / loss_evals, predictions, lang_stats
def eval_split(model, crit, loader, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    num_images = eval_kwargs.get('num_images', eval_kwargs.get('val_images_use', -1))
    split = eval_kwargs.get('split', 'val')
    lang_eval = eval_kwargs.get('language_eval', 1)
    dataset = eval_kwargs.get('dataset', 'coco')
    beam_size = eval_kwargs.get('beam_size', 1)
    # Make sure we are in evaluation mode
    model.eval()
    loader.reset_iterator(split)
    n = 0
    loss = 0
    loss_sum = 0
    loss_evals = 1e-8
    predictions = []
    while True:
        data = loader.get_batch(split)
        n = n + loader.batch_size
        if data.get('labels', None) is not None:
            # forward the model to get the loss
            tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks']]
            tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks = tmp
            loss = crit(model(fc_feats, att_feats, labels),
                        labels[:, 1:], masks[:, 1:]).data[0]
            loss_sum = loss_sum + loss
            loss_evals = loss_evals + 1
        # forward the model to also get generated samples for each image
        # Only keep one feature per image, in case of duplicate samples
        tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
               data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
        tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp]
        fc_feats, att_feats = tmp
        seq, _ = model.sample(fc_feats, att_feats, eval_kwargs)
        sents = utils.decode_sequence(loader.get_vocab(), seq)
        for k, sent in enumerate(sents):
            entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
            if eval_kwargs.get('dump_path', 0) == 1:
                entry['file_name'] = data['infos'][k]['file_path']
            predictions.append(entry)
            if eval_kwargs.get('dump_images', 0) == 1:
                # dump the raw image to the vis/ folder
                cmd = 'cp "' + os.path.join(eval_kwargs['image_root'],
                                            data['infos'][k]['file_path']) + \
                      '" vis/imgs/img' + str(len(predictions)) + '.jpg'  # bit gross
                print(cmd)
                os.system(cmd)
            if verbose:
                print('image %s: %s' % (entry['image_id'], entry['caption']))
        # if we wrapped around the split or used up the val image budget, bail
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if num_images != -1:
            ix1 = min(ix1, num_images)
        for i in range(n - ix1):
            predictions.pop()
        if verbose:
            print('evaluating validation performance... %d/%d (%f)' % (ix0 - 1, ix1, loss))
        if data['bounds']['wrapped']:
            break
        if num_images >= 0 and n >= num_images:
            break
    lang_stats = {}
    if lang_eval == 1:
        lang_stats = language_eval(dataset, predictions, eval_kwargs['id'], split)
    # Switch back to training mode
    model.train()
    return loss_sum / loss_evals, predictions, lang_stats
def eval_split(sess, model, loader, eval_kwargs):
    verbose = eval_kwargs.get('verbose', True)
    num_images = eval_kwargs.get('num_images', -1)
    split = eval_kwargs.get('split', 'test')
    language_eval = eval_kwargs.get('language_eval', 0)
    dataset = eval_kwargs.get('dataset', 'coco')
    # Make sure we are in evaluation mode
    sess.run(tf.assign(model.training, False))
    sess.run(tf.assign(model.cnn_training, False))
    loader.reset_iterator(split)
    n = 0
    loss_sum = 0
    loss_evals = 1e-8
    predictions = []
    while True:
        # fetch a batch of data
        if opt.beam_size > 1:
            data = loader.get_batch(split, 1)
            n = n + 1
        else:
            data = loader.get_batch(split, opt.batch_size)
            n = n + opt.batch_size
        # evaluate the loss if we have labels
        loss = 0
        if data.get('labels', None) is not None:
            feed = {model.images: data['images'],
                    model.labels: data['labels'],
                    model.masks: data['masks']}
            loss = sess.run(model.cost, feed)
            loss_sum = loss_sum + loss
            loss_evals = loss_evals + 1
        # forward the model to also get generated samples for each image
        if opt.beam_size == 1:
            feed = {model.images: data['images']}
            seq = sess.run(model.generator, feed)
            sents = utils.decode_sequence(vocab, seq)
            for k, sent in enumerate(sents):
                entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
                predictions.append(entry)
                if verbose:
                    print('image %s: %s' % (entry['image_id'], entry['caption']))
        else:
            seq = model.decode(data['images'], opt.beam_size, sess)
            sents = [' '.join([vocab.get(str(ix), '') for ix in sent]).strip()
                     for sent in seq]
            sents = [sents[0]]
            entry = {'image_id': data['infos'][0]['id'], 'caption': sents[0]}
            predictions.append(entry)
            if verbose:
                for sent in sents:
                    print('image %s: %s' % (entry['image_id'], sent))
        for k, sent in enumerate(sents):
            entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
            if opt.dump_path == 1:
                entry['file_name'] = data['infos'][k]['file_path']
            predictions.append(entry)
            if opt.dump_images == 1:
                # dump the raw image to the vis/ folder
                cmd = 'cp "' + os.path.join(opt.image_root,
                                            data['infos'][k]['file_path']) + \
                      '" vis/imgs/img' + str(len(predictions)) + '.jpg'  # bit gross
                print(cmd)
                os.system(cmd)
            if verbose:
                print('image %s: %s' % (entry['image_id'], entry['caption']))
        # if we wrapped around the split or used up the val image budget, bail
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if num_images != -1:
            ix1 = min(ix1, num_images)
        for i in range(n - ix1):
            predictions.pop()
        if verbose:
            print('evaluating validation performance... %d/%d (%f)' % (ix0 - 1, ix1, loss))
        if data['bounds']['wrapped']:
            break
        if num_images >= 0 and n >= num_images:
            break
    lang_stats = None
    if language_eval == 1:
        lang_stats = eval_utils.language_eval(dataset, predictions)
    # Switch back to training mode
    sess.run(tf.assign(model.training, True))
    sess.run(tf.assign(model.cnn_training, True))
    return loss_sum / loss_evals, predictions, lang_stats
def eval_split(sess, model, loader, eval_kwargs):
    verbose = eval_kwargs.get('verbose', True)
    val_images_use = eval_kwargs.get('val_images_use', -1)
    split = eval_kwargs.get('split', 'val')
    language_eval = eval_kwargs.get('language_eval', 1)
    dataset = eval_kwargs.get('dataset', 'coco')
    # Make sure we are in evaluation mode
    sess.run(tf.assign(model.training, False))
    sess.run(tf.assign(model.cnn_training, False))
    loader.reset_iterator(split)
    n = 0
    loss_sum = 0
    loss_evals = 0
    predictions = []
    while True:
        if opt.beam_size > 1:
            data = loader.get_batch(split, 1)
            n = n + 1
        else:
            data = loader.get_batch(split)
            n = n + loader.batch_size
        # forward the model to get the loss
        feed = {model.images: data['images'],
                model.labels: data['labels'],
                model.masks: data['masks']}
        loss = sess.run(model.cost, feed)
        loss_sum = loss_sum + loss
        loss_evals = loss_evals + 1
        if opt.beam_size == 1:
            # forward the model to also get generated samples for each image
            feed = {model.images: data['images']}
            seq = sess.run(model.generator, feed)
            sents = utils.decode_sequence(loader.get_vocab(), seq)
            for k, sent in enumerate(sents):
                entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
                predictions.append(entry)
                if verbose:
                    print('image %s: %s' % (entry['image_id'], entry['caption']))
        else:
            seq = model.decode(data['images'], opt.beam_size, sess)
            sents = [' '.join([loader.ix_to_word.get(str(ix), '')
                               for ix in sent]).strip() for sent in seq]
            entry = {'image_id': data['infos'][0]['id'], 'caption': sents[0]}
            predictions.append(entry)
            if verbose:
                for sent in sents:
                    print('image %s: %s' % (entry['image_id'], sent))
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if val_images_use != -1:
            ix1 = min(ix1, val_images_use)
        for i in range(n - ix1):
            predictions.pop()
        if verbose:
            print('evaluating validation performance... %d/%d (%f)' % (ix0 - 1, ix1, loss))
        if data['bounds']['wrapped']:
            break
        if n >= val_images_use:
            break
    lang_stats = None
    if language_eval == 1:
        lang_stats = eval_utils.language_eval(dataset, predictions)
    # Switch back to training mode
    sess.run(tf.assign(model.training, True))
    sess.run(tf.assign(model.cnn_training, True))
    return loss_sum / loss_evals, predictions, lang_stats