def validate(opt, val_loader, model, audio):
    # compute the encoding for all the validation images, captions and audio clips
    img_embs, cap_embs, aud_embs = encode_data(model, val_loader, audio, opt.log_step, logging.info)

    # image to caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs, cap_embs, measure=opt.measure)
    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1, r5, r10, medr, meanr))
    # caption to image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(img_embs, cap_embs, measure=opt.measure)
    logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1i, r5i, r10i, medri, meanri))
    # image to audio retrieval
    (r1ia, r5ia, r10ia, medria, meanria) = i2t(img_embs, aud_embs, measure=opt.measure)
    logging.info("Image to audio: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1ia, r5ia, r10ia, medria, meanria))
    # audio to image retrieval
    (r1ai, r5ai, r10ai, medrai, meanrai) = t2i(img_embs, aud_embs, measure=opt.measure)
    logging.info("Audio to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1ai, r5ai, r10ai, medrai, meanrai))
    # caption to audio retrieval
    (r1ca, r5ca, r10ca, medrca, meanrca) = i2t(cap_embs, aud_embs, measure=opt.measure, npts=5000)
    logging.info("Text to audio: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1ca, r5ca, r10ca, medrca, meanrca))
    # audio to caption retrieval
    (r1ac, r5ac, r10ac, medrac, meanrac) = i2t(aud_embs, cap_embs, measure=opt.measure, npts=5000)
    logging.info("Audio to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1ac, r5ac, r10ac, medrac, meanrac))

    # sum of recalls over all six retrieval directions, used for early stopping
    currscore = (r1 + r5 + r10 + r1i + r5i + r10i
                 + r1ia + r5ia + r10ia + r1ai + r5ai + r10ai
                 + r1ca + r5ca + r10ca + r1ac + r5ac + r10ac)

    # record metrics in tensorboard
    tb_logger.log_value('r1', r1, step=model.Eiters)
    tb_logger.log_value('r5', r5, step=model.Eiters)
    tb_logger.log_value('r10', r10, step=model.Eiters)
    tb_logger.log_value('medr', medr, step=model.Eiters)
    tb_logger.log_value('meanr', meanr, step=model.Eiters)
    tb_logger.log_value('r1i', r1i, step=model.Eiters)
    tb_logger.log_value('r5i', r5i, step=model.Eiters)
    tb_logger.log_value('r10i', r10i, step=model.Eiters)
    tb_logger.log_value('medri', medri, step=model.Eiters)
    tb_logger.log_value('meanri', meanri, step=model.Eiters)
    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
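# NOTE: the snippets in this section all delegate ranking to i2t/t2i helpers
# from a VSE++-style evaluation module. For reference, below is a minimal
# sketch of what such an i2t typically computes, assuming the usual convention
# of 5 captions per image (the caption matrix holds rows 5*i .. 5*i+4 for
# image i), L2-normalized embeddings, and the (r1, r5, r10, medr, meanr)
# return shape used above. The name i2t_sketch and the details are
# illustrative, not any specific repository's implementation.
import numpy as np

def i2t_sketch(images, captions, npts=None):
    """Image-to-text recall: rank all captions for each image query."""
    if npts is None:
        npts = images.shape[0] // 5
    ranks = np.zeros(npts)
    for index in range(npts):
        im = images[5 * index].reshape(1, -1)
        d = np.dot(im, captions.T).flatten()  # cosine scores (normalized rows)
        inds = np.argsort(d)[::-1]            # best match first
        # rank of the best-ranked ground-truth caption among the 5
        rank = 1e20
        for i in range(5 * index, 5 * index + 5):
            tmp = np.where(inds == i)[0][0]
            if tmp < rank:
                rank = tmp
        ranks[index] = rank
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    return (r1, r5, r10, medr, meanr)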
def validate(opt, val_loader, model):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs = encode_data(
        model, val_loader, opt.log_step, logging.info)
    print(img_embs.shape[0] // 5, "images,", cap_embs.shape[0], "texts for validation")

    # caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs, cap_embs, measure=opt.measure)
    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1, r5, r10, medr, meanr))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(
        img_embs, cap_embs, measure=opt.measure)
    logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1i, r5i, r10i, medri, meanri))

    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_logger.log_value('r1', r1, step=model.Eiters)
    tb_logger.log_value('r5', r5, step=model.Eiters)
    tb_logger.log_value('r10', r10, step=model.Eiters)
    tb_logger.log_value('medr', medr, step=model.Eiters)
    tb_logger.log_value('meanr', meanr, step=model.Eiters)
    tb_logger.log_value('r1i', r1i, step=model.Eiters)
    tb_logger.log_value('r5i', r5i, step=model.Eiters)
    tb_logger.log_value('r10i', r10i, step=model.Eiters)
    tb_logger.log_value('medri', medri, step=model.Eiters)
    tb_logger.log_value('meanri', meanri, step=model.Eiters)
    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
def validate(opt, val_loader, model, tb_writer):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs = encode_data(model, val_loader, opt.log_step, logging.info)

    # caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs, cap_embs, measure=opt.test_measure)
    print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr, meanr))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(img_embs, cap_embs, measure=opt.test_measure)
    print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri, meanri))

    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_writer.add_scalar('data/r1', r1, model.Eiters)
    tb_writer.add_scalar('data/r5', r5, model.Eiters)
    tb_writer.add_scalar('data/r10', r10, model.Eiters)
    tb_writer.add_scalar('data/medr', medr, model.Eiters)
    tb_writer.add_scalar('data/meanr', meanr, model.Eiters)
    tb_writer.add_scalar('data/r1i', r1i, model.Eiters)
    tb_writer.add_scalar('data/r5i', r5i, model.Eiters)
    tb_writer.add_scalar('data/r10i', r10i, model.Eiters)
    tb_writer.add_scalar('data/medri', medri, model.Eiters)
    tb_writer.add_scalar('data/meanri', meanri, model.Eiters)
    tb_writer.add_scalar('data/rsum', currscore, model.Eiters)

    return currscore
def eval_model():
    print('evaluating model...')
    weights = model.get_weights()
    emb_w = weights[0]
    im_w = weights[1]
    im_b = weights[2]
    gru_weights = weights[3:12]

    # rebuild the two encoder branches from the trained weights
    test_model_im = Model(inputs=image_input, outputs=emb_image)
    test_model_im.set_weights([im_w, im_b])
    test_model_im.compile(optimizer='adam', loss=contrastive_loss)

    test_model_cap = Model(inputs=cap_input, outputs=emb_cap)
    test_model_cap.set_weights([emb_w] + gru_weights)
    test_model_cap.compile(optimizer='adam', loss=contrastive_loss)

    test_cap, test_im = test_iter.all()
    # preallocated buffers (kept from the original; not used below)
    all_caps = numpy.zeros(shape=(len(test_cap), model_config['max_cap_length']))
    all_images = numpy.zeros(shape=(len(test_cap), model_config['dim_cnn']))

    pred_cap = test_model_cap.predict(test_cap)
    pred_im = test_model_im.predict(test_im)

    test_errs = compute_errors(pred_cap, pred_im)
    r10_c, rmean_c = t2i(test_errs)
    r10_i, rmean_i = i2t(test_errs)
    print("Image to text: %.1f %.1f" % (r10_i, rmean_i))
    print("Text to image: %.1f %.1f" % (r10_c, rmean_c))
def validate(opt, val_loader, model):
    # compute the encoding for all the validation videos, clips, paragraphs and captions
    vid_seq_embs, para_seq_embs, clip_embs, cap_embs, _, _, num_clips, cur_vid_total = encode_data(
        opt, model, val_loader, opt.log_step, logging.info, contextual_model=True)

    # clip-level retrieval (disabled)
    # vid_clip_rep, _, _ = i2t(clip_embs, cap_embs, measure=opt.measure)
    # cap_clip_rep, _, _ = t2i(clip_embs, cap_embs, measure=opt.measure)

    # paragraph retrieval
    vid_seq_rep, top1_v2p, rank_vid_v2p = i2t(vid_seq_embs, para_seq_embs, measure=opt.measure)
    # video retrieval
    para_seq_rep, top1_p2v, rank_para_p2v = t2i(vid_seq_embs, para_seq_embs, measure=opt.measure)

    currscore = vid_seq_rep['sum'] + para_seq_rep['sum']

    # logging.info("Clip to Sent: %.1f, %.1f, %.1f, %.1f, %.1f" %
    #              (vid_clip_rep['r1'], vid_clip_rep['r5'], vid_clip_rep['r10'],
    #               vid_clip_rep['medr'], vid_clip_rep['meanr']))
    # logging.info("Sent to Clip: %.1f, %.1f, %.1f, %.1f, %.1f" %
    #              (cap_clip_rep['r1'], cap_clip_rep['r5'], cap_clip_rep['r10'],
    #               cap_clip_rep['medr'], cap_clip_rep['meanr']))
    logging.info("Video to Paragraph: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (vid_seq_rep['r1'], vid_seq_rep['r5'], vid_seq_rep['r10'],
                  vid_seq_rep['medr'], vid_seq_rep['meanr']))
    logging.info("Paragraph to Video: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (para_seq_rep['r1'], para_seq_rep['r5'], para_seq_rep['r10'],
                  para_seq_rep['medr'], para_seq_rep['meanr']))
    logging.info("Currscore: %.1f" % currscore)

    # record metrics in tensorboard
    # LogReporter(tb_logger, vid_clip_rep, model.Eiters, 'clip')
    # LogReporter(tb_logger, cap_clip_rep, model.Eiters, 'clipi')
    LogReporter(tb_logger, vid_seq_rep, model.Eiters, 'seq')
    LogReporter(tb_logger, para_seq_rep, model.Eiters, 'seqi')
    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
def validate(opt, val_loader, model):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs = encode_data(model, val_loader, opt.log_step, logging.info)

    # caption retrieval
    (r1, r5, r10, medr, meanr), d_i2t = i2t(img_embs, cap_embs, measure=opt.measure)
    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1, r5, r10, medr, meanr))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri), d_t2i = t2i(img_embs, cap_embs, measure=opt.measure)
    logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1i, r5i, r10i, medri, meanri))

    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_logger.log_value('recall@1_text', r1, step=model.Eiters)
    tb_logger.log_value('recall@5_text', r5, step=model.Eiters)
    tb_logger.log_value('recall@10_text', r10, step=model.Eiters)
    tb_logger.log_value('med-r_text', medr, step=model.Eiters)
    tb_logger.log_value('mean-r_text', meanr, step=model.Eiters)
    tb_logger.log_value('recall@1_im', r1i, step=model.Eiters)
    tb_logger.log_value('recall@5_im', r5i, step=model.Eiters)
    tb_logger.log_value('recall@10_im', r10i, step=model.Eiters)
    tb_logger.log_value('med-r_im', medri, step=model.Eiters)
    tb_logger.log_value('mean-r_im', meanri, step=model.Eiters)
    tb_logger.log_value('recall_sum', currscore, step=model.Eiters)

    return currscore, d_i2t
def eval_model():
    print('evaluating model...')
    weights = model.get_weights()
    for j in range(len(weights)):
        print(weights[j].shape)
    emb_w = weights[0]
    im_w = weights[4]
    im_b = weights[5]
    gru_weights = weights[1:4]

    # rebuild the two encoder branches from the trained weights
    test_model_im = Model(inputs=image_input, outputs=emb_image)
    test_model_im.set_weights([im_w, im_b])
    test_model_im.compile(optimizer='adam', loss=contrastive_loss)

    test_model_cap = Model(inputs=cap_input, outputs=emb_cap)
    test_model_cap.set_weights([emb_w] + gru_weights)
    test_model_cap.compile(optimizer='adam', loss=contrastive_loss)

    test_cap, test_im = test_iter.all()
    # preallocated buffers (kept from the original; not used below)
    all_caps = numpy.zeros(shape=(len(test_cap), model_config['max_cap_length']))
    all_images = numpy.zeros(shape=(len(test_cap), model_config['dim_cnn']))

    pred_cap = test_model_cap.predict(test_cap)
    pred_im = test_model_im.predict(test_im)

    test_errs = compute_errors(pred_cap, pred_im)
    r10_c, rmean_c = t2i(test_errs)
    r10_i, rmean_i = i2t(test_errs)
    print("Image to text: %.1f %.1f" % (r10_i, rmean_i))
    print("Text to image: %.1f %.1f" % (r10_c, rmean_c))
def validate(opt, val_loader, model):
    # compute the encoding for all the validation images and captions
    with torch.no_grad():
        img_embs, cap_embs, cap_lens, freqs = encode_data(
            model, val_loader, opt.log_step, logging.info)

    img_embs = numpy.array(
        [img_embs[i] for i in range(0, len(img_embs), 1)])

    start = time.time()
    # compute the similarity between every caption and image in the validation set
    if opt.cross_attn == 't2i':
        sims, _ = shard_xattn_t2i(img_embs, cap_embs, cap_lens, freqs, opt,
                                  shard_size=opt.shard_size)
    elif opt.cross_attn == 'i2t':
        sims, _ = shard_xattn_i2t(img_embs, cap_embs, cap_lens, freqs, opt,
                                  shard_size=opt.shard_size)
    else:
        raise NotImplementedError
    end = time.time()
    print("calculate similarity time:", end - start)

    # caption retrieval (find the right text for every image)
    (r1, r5, r10, r20, r50, medr, meanr) = i2t(img_embs, cap_embs, cap_lens, sims)
    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1, r5, r10, r20, r50, medr, meanr))
    # image retrieval (find the right image for every text)
    (r1i, r5i, r10i, r20i, r50i, medri, meanri) = t2i(img_embs, cap_embs, cap_lens, sims)
    logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1i, r5i, r10i, r20i, r50i, medri, meanri))

    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_logger.log_value('r1', r1, step=model.Eiters)
    tb_logger.log_value('r5', r5, step=model.Eiters)
    tb_logger.log_value('r10', r10, step=model.Eiters)
    tb_logger.log_value('medr', medr, step=model.Eiters)
    tb_logger.log_value('meanr', meanr, step=model.Eiters)
    tb_logger.log_value('r1i', r1i, step=model.Eiters)
    tb_logger.log_value('r5i', r5i, step=model.Eiters)
    tb_logger.log_value('r10i', r10i, step=model.Eiters)
    tb_logger.log_value('medri', medri, step=model.Eiters)
    tb_logger.log_value('meanri', meanri, step=model.Eiters)
    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
def evaluate(img_ids, img_embs, t_embs, measure='cosine', n_caption=2,
             val_metric='map', direction='t2i'):
    # count how many captions each image id has
    count = {}
    for iid in img_ids:
        if int(iid) not in count:
            count[int(iid)] = (1, 0)
        else:
            count[int(iid)] = (count[int(iid)][0] + 1, 0)

    # keep a single copy of each image that has at least n_caption captions,
    # plus the captions belonging to the retained images
    img_mask, text_mask = [False for _ in img_ids], [True for _ in img_ids]
    for idx, iid in enumerate(img_ids):
        c, u = count[int(iid)]
        if c >= n_caption and u == 0:
            img_mask[idx] = True
            count[int(iid)] = (c, 1)
        elif c >= n_caption and u == 1:
            count[int(iid)] = (c, 2)
        else:
            text_mask[idx] = False
    img_ids = [x for idx, x in enumerate(img_ids) if img_mask[idx]]
    img_embs = img_embs[img_mask]
    t_embs = t_embs[text_mask]

    c2i_all_errors = evaluation.cal_error(img_embs, t_embs, measure)

    if val_metric == "recall":
        # meme retrieval
        (r1i, r5i, r10i, medri, meanri) = evaluation.t2i(c2i_all_errors, n_caption=n_caption)
        # caption retrieval
        (r1, r5, r10, medr, meanr) = evaluation.i2t(c2i_all_errors, n_caption=n_caption)
    elif val_metric == "map":
        # meme retrieval
        t2i_map_score = evaluation.t2i_map(c2i_all_errors, n_caption=n_caption)
        # caption retrieval
        i2t_map_score = evaluation.i2t_map(c2i_all_errors, n_caption=n_caption)

    currscore = 0
    if val_metric == "recall":
        if direction == 'i2t' or direction == 'all':
            rsum = r1 + r5 + r10
            currscore += rsum
        if direction == 't2i' or direction == 'all':
            rsumi = r1i + r5i + r10i
            currscore += rsumi
    elif val_metric == "map":
        if direction == 'i2t' or direction == 'all':
            currscore += i2t_map_score
        if direction == 't2i' or direction == 'all':
            currscore += t2i_map_score

    return currscore
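# The t2i_map/i2t_map helpers used above are not shown in this section. A
# plausible minimal version of text-to-image mean average precision over an
# error (distance) matrix is sketched below, assuming c2i_errors has one row
# per caption and one column per image, with the ground-truth image of
# caption j at column j // n_caption. Illustrative only; the real helpers may
# use a different convention.
import numpy as np

def t2i_map_sketch(c2i_errors, n_caption=2):
    """Mean average precision for text-to-image retrieval.

    With a single relevant image per caption, average precision reduces
    to the reciprocal rank of that image.
    """
    n_text, n_img = c2i_errors.shape
    aps = np.zeros(n_text)
    for j in range(n_text):
        order = np.argsort(c2i_errors[j])                # closest image first
        rank = np.where(order == j // n_caption)[0][0] + 1
        aps[j] = 1.0 / rank                              # AP with one relevant item
    return aps.mean()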
def valid_retrieval(self):
    """The task is performed 5 times on 1000-image subsets of the test set
    and the results are averaged. Our best results are obtained with a
    different strategy: images are resized to 400x400 irrespective of their
    size and aspect ratio."""
    embed_txts = []
    embed_imgs = []
    if self.init_from or (self.mode == 'val'):
        self.txt_enc, self.img_enc = self.restore_model(
            self.init_from, self.main_dir, self.model_name)
    self.txt_enc.eval()
    self.img_enc.eval()

    data_iter = iter(self.validloader)
    iters = len(data_iter)
    mean_loss = 0
    for j in range(iters):
        img, tokens, _ = next(data_iter)
        tokens = tokens.to(self.device)
        img = img.to(self.device)
        embed_txt = self.txt_enc(tokens)
        embed_img = self.img_enc(img)
        # compute the symmetric hard-negative loss
        loss = self.hard_negative_loss(embed_img, embed_txt) + \
            self.hard_negative_loss(embed_txt, embed_img)
        mean_loss += loss
        embed_txts.extend(embed_txt)
        embed_imgs.extend(embed_img)
    mean_loss /= iters

    # evaluate on five 1000-image folds and average the results
    r = np.zeros(4)
    ri = np.zeros(4)
    for i in range(5):
        r += i2t(embed_imgs[1000 * i:1000 * (i + 1)],
                 embed_txts[1000 * i:1000 * (i + 1)])
        ri += t2i(embed_imgs[1000 * i:1000 * (i + 1)],
                  embed_txts[1000 * i:1000 * (i + 1)])
    r /= 5
    ri /= 5

    print("Image to text: %.1f, %.1f, %.1f, %.1f" % (r[0], r[1], r[2], r[3]))
    print("Text to image: %.1f, %.1f, %.1f, %.1f" % (ri[0], ri[1], ri[2], ri[3]))
    return r, ri, mean_loss
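# hard_negative_loss above is presumably the VSE++-style max-of-hinges
# ranking loss. A minimal PyTorch sketch under that assumption follows; the
# function name and the margin default are illustrative, and the symmetric
# application over both directions happens in the caller, as in the snippet
# above.
import torch

def hard_negative_loss(anchors, positives, margin=0.2):
    """Max-violation triplet loss over in-batch negatives.

    anchors, positives: (B, D) L2-normalized embeddings where row i of
    each matrix forms the ground-truth pair.
    """
    scores = anchors @ positives.t()                 # (B, B) cosine scores
    diagonal = scores.diag().view(-1, 1)             # positive-pair scores
    # hinge: margin + negative score - positive score, clamped at 0
    cost = (margin + scores - diagonal.expand_as(scores)).clamp(min=0)
    # mask out the positive pairs on the diagonal
    mask = torch.eye(scores.size(0), dtype=torch.bool, device=scores.device)
    cost = cost.masked_fill(mask, 0)
    # keep only the hardest negative for each anchor
    return cost.max(dim=1)[0].sum()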
def validate(opt, val_loader, model):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs, cap_lens = encode_data(model, val_loader, opt.log_step, logging.info)
    # keep a single copy of each image embedding (every image has 5 captions)
    img_embs = numpy.array([img_embs[i] for i in range(0, len(img_embs), 5)])
    print("Img shape in validate:", img_embs.shape)

    start = time.time()
    if opt.cross_attn == 't2i':
        sims = shard_xattn_t2i(img_embs, cap_embs, cap_lens, opt, shard_size=128)
    elif opt.cross_attn == 'i2t':
        sims = shard_xattn_i2t(img_embs, cap_embs, cap_lens, opt, shard_size=128)
    else:
        raise NotImplementedError
    end = time.time()
    print("calculate similarity time:", end - start)

    # caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs, cap_embs, cap_lens, sims)
    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1, r5, r10, medr, meanr))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(img_embs, cap_embs, cap_lens, sims)
    logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1i, r5i, r10i, medri, meanri))

    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_logger.log_value('r1', r1, step=model.Eiters)
    tb_logger.log_value('r5', r5, step=model.Eiters)
    tb_logger.log_value('r10', r10, step=model.Eiters)
    tb_logger.log_value('medr', medr, step=model.Eiters)
    tb_logger.log_value('meanr', meanr, step=model.Eiters)
    tb_logger.log_value('r1i', r1i, step=model.Eiters)
    tb_logger.log_value('r5i', r5i, step=model.Eiters)
    tb_logger.log_value('r10i', r10i, step=model.Eiters)
    tb_logger.log_value('medri', medri, step=model.Eiters)
    tb_logger.log_value('meanri', meanri, step=model.Eiters)
    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
def validate(opt, val_loader, model, vocab):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs = encode_data(model, val_loader, opt.log_step, logger.info, vocab)

    # caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs, cap_embs, measure='cosine')
    logger.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                (r1, r5, r10, medr, meanr))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(img_embs, cap_embs, measure='cosine')
    logger.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                (r1i, r5i, r10i, medri, meanri))

    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i
    return currscore
def validate(opt, val_loader, model):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs, cap_lens = encode_data(model, val_loader, opt.log_step, logging.info)

    # drop the duplicated image embeddings (5 captions per image), keeping one per image
    img_embs = numpy.array([img_embs[i] for i in range(0, len(img_embs), 5)])

    # record computation time of validation
    start = time.time()
    sims = shard_attn_scores(model, img_embs, cap_embs, cap_lens, opt, shard_size=100)
    end = time.time()
    print("calculate similarity time:", end - start)

    # caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs, cap_embs, cap_lens, sims)
    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1, r5, r10, medr, meanr))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(img_embs, cap_embs, cap_lens, sims)
    logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1i, r5i, r10i, medri, meanri))

    # sum of recalls to be used for early stopping
    r_sum = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_logger.log_value('r1', r1, step=model.Eiters)
    tb_logger.log_value('r5', r5, step=model.Eiters)
    tb_logger.log_value('r10', r10, step=model.Eiters)
    tb_logger.log_value('medr', medr, step=model.Eiters)
    tb_logger.log_value('meanr', meanr, step=model.Eiters)
    tb_logger.log_value('r1i', r1i, step=model.Eiters)
    tb_logger.log_value('r5i', r5i, step=model.Eiters)
    tb_logger.log_value('r10i', r10i, step=model.Eiters)
    tb_logger.log_value('medri', medri, step=model.Eiters)
    tb_logger.log_value('meanri', meanri, step=model.Eiters)
    tb_logger.log_value('r_sum', r_sum, step=model.Eiters)

    return r_sum
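# Several of the variants above (shard_xattn_t2i, shard_xattn_i2t,
# shard_attn_scores) compute the full image-caption similarity matrix in
# shards to bound peak memory. A generic sketch of that pattern, assuming
# only a per-block score_fn, is below; the names are illustrative.
import numpy as np

def sharded_sims(img_embs, cap_embs, score_fn, shard_size=128):
    """Fill the (n_images, n_captions) similarity matrix shard by shard.

    score_fn(img_block, cap_block) returns a
    (len(img_block), len(cap_block)) block of similarity scores, so at
    most shard_size x shard_size pairs are scored at a time.
    """
    n_im, n_cap = len(img_embs), len(cap_embs)
    sims = np.zeros((n_im, n_cap))
    for i in range(0, n_im, shard_size):
        for j in range(0, n_cap, shard_size):
            sims[i:i + shard_size, j:j + shard_size] = score_fn(
                img_embs[i:i + shard_size], cap_embs[j:j + shard_size])
    return sims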
def validate(opt, val_loader, model, num_offsets=10):
    # compute the encoding for all the validation images and captions
    img_seq_embs, cap_seq_embs = encode_eval_data(
        model, val_loader, opt.log_step, logging.info, num_offsets=num_offsets)

    for _offset in range(num_offsets):
        logging.info("Offset: %d" % _offset)
        # caption retrieval
        (seq_r1, seq_r5, seq_r10, seq_medr, seq_meanr) = i2t(
            img_seq_embs[_offset], cap_seq_embs[_offset], measure=opt.measure)
        logging.info("seq_Image to seq_text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                     (seq_r1, seq_r5, seq_r10, seq_medr, seq_meanr))
        # image retrieval
        (seq_r1i, seq_r5i, seq_r10i, seq_medri, seq_meanri) = t2i(
            img_seq_embs[_offset], cap_seq_embs[_offset], measure=opt.measure)
        logging.info("seq_Text to seq_image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                     (seq_r1i, seq_r5i, seq_r10i, seq_medri, seq_meanri))
def validate(opt, val_loader, model, lang, n=5):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs, val_loss = encode_data(model, val_loader, opt.log_step, logging.info)

    # number of captions per image depends on the language split
    n = 5 if lang in ['en', 'de'] else 1

    # caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs, cap_embs, measure=opt.measure, n=n)
    logging.info(
        "%s Image to text: R@1 %.1f | R@5 %.1f | R@10 %.1f | Medr %.1f | Meanr %.1f" %
        (lang, r1, r5, r10, medr, meanr))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(img_embs, cap_embs, measure=opt.measure, n=n)
    logging.info(
        "%s Text to image: R@1 %.1f | R@5 %.1f | R@10 %.1f | Medr %.1f | Meanr %.1f" %
        (lang, r1i, r5i, r10i, medri, meanri))

    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_logger.log_value('r1', r1, step=model.Eiters)
    tb_logger.log_value('r5', r5, step=model.Eiters)
    tb_logger.log_value('r10', r10, step=model.Eiters)
    tb_logger.log_value('medr', medr, step=model.Eiters)
    tb_logger.log_value('meanr', meanr, step=model.Eiters)
    tb_logger.log_value('r1i', r1i, step=model.Eiters)
    tb_logger.log_value('r5i', r5i, step=model.Eiters)
    tb_logger.log_value('r10i', r10i, step=model.Eiters)
    tb_logger.log_value('medri', medri, step=model.Eiters)
    tb_logger.log_value('meanri', meanri, step=model.Eiters)
    tb_logger.log_value('rsum', currscore, step=model.Eiters)
    tb_logger.log_value('valid', val_loss, step=model.Eiters)

    return currscore
def validate(opt, val_loader, model, tb_logger):
    # compute the encoding for all the validation images and captions
    print("start validate")
    model.val_start()
    img_embs, cap_embs, cap_masks = encode_data(model, val_loader, opt.log_step, logging.info)

    # this i2t variant returns both retrieval directions in one pass
    (i2t_r1, i2t_r5, i2t_r10, i2t_medr, i2t_meanr), \
        (t2i_r1, t2i_r5, t2i_r10, t2i_medr, t2i_meanr) = i2t(
            img_embs, cap_embs, cap_masks, measure=opt.measure, model=model)
    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (i2t_r1, i2t_r5, i2t_r10, i2t_medr, i2t_meanr))
    # image retrieval
    # (r1i, r5i, r10i, medri, meanr) = t2i(
    #     img_embs, cap_embs, measure=opt.measure, model=model)
    logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (t2i_r1, t2i_r5, t2i_r10, t2i_medr, t2i_meanr))

    # sum of recalls to be used for early stopping
    currscore = i2t_r1 + i2t_r5 + i2t_r10 + t2i_r1 + t2i_r5 + t2i_r10

    # record metrics in tensorboard
    tb_logger.log_value('i2t_r1', i2t_r1, step=model.Eiters)
    tb_logger.log_value('i2t_r5', i2t_r5, step=model.Eiters)
    tb_logger.log_value('i2t_r10', i2t_r10, step=model.Eiters)
    tb_logger.log_value('i2t_medr', i2t_medr, step=model.Eiters)
    tb_logger.log_value('i2t_meanr', i2t_meanr, step=model.Eiters)
    tb_logger.log_value('t2i_r1', t2i_r1, step=model.Eiters)
    tb_logger.log_value('t2i_r5', t2i_r5, step=model.Eiters)
    tb_logger.log_value('t2i_r10', t2i_r10, step=model.Eiters)
    tb_logger.log_value('t2i_medr', t2i_medr, step=model.Eiters)
    tb_logger.log_value('t2i_meanr', t2i_meanr, step=model.Eiters)
    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
def validate(opt, val_loader, model):
    # compute the encoding for all the validation images and captions
    start = time.time()
    img_embs, cap_embs, cap_lens = encode_data(model, val_loader, opt, opt.log_step, logging.info)
    end = time.time()
    print("calculate backbone time:", end - start)

    # keep a single copy of each image embedding (5 captions per image)
    img_embs = numpy.array([img_embs[i] for i in range(0, len(img_embs), 5)])

    start = time.time()
    sims = 1 - cdist(img_embs, cap_embs, metric='cosine')
    end = time.time()
    print("calculate similarity time:", end - start)

    # caption retrieval
    (r1, r5, r10, medr, meanr) = i2t(img_embs, cap_embs, cap_lens, sims)
    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1, r5, r10, medr, meanr))
    # image retrieval
    (r1i, r5i, r10i, medri, meanri) = t2i(img_embs, cap_embs, cap_lens, sims)
    logging.info("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" %
                 (r1i, r5i, r10i, medri, meanri))

    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i

    # record metrics in tensorboard
    tb_logger.log_value('r1', r1, step=model.Eiters)
    tb_logger.log_value('r5', r5, step=model.Eiters)
    tb_logger.log_value('r10', r10, step=model.Eiters)
    tb_logger.log_value('medr', medr, step=model.Eiters)
    tb_logger.log_value('meanr', meanr, step=model.Eiters)
    tb_logger.log_value('r1i', r1i, step=model.Eiters)
    tb_logger.log_value('r5i', r5i, step=model.Eiters)
    tb_logger.log_value('r10i', r10i, step=model.Eiters)
    tb_logger.log_value('medri', medri, step=model.Eiters)
    tb_logger.log_value('meanri', meanri, step=model.Eiters)
    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
def trainer(data='coco',  # f8k, f30k, coco
            margin=0.2,
            dim=1024,
            dim_image=4096,
            dim_word=300,
            encoder='gru',  # gru OR bow
            max_epochs=15,
            dispFreq=10,
            decay_c=0.,
            grad_clip=2.,
            maxlen_w=100,
            optimizer='adam',
            batch_size=128,
            saveto='/ais/gobi3/u/rkiros/uvsmodels/coco.npz',
            validFreq=100,
            lrate=0.0002,
            reload_=False):
    # Model options
    model_options = {}
    model_options['data'] = data
    model_options['margin'] = margin
    model_options['dim'] = dim
    model_options['dim_image'] = dim_image
    model_options['dim_word'] = dim_word
    model_options['encoder'] = encoder
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['maxlen_w'] = maxlen_w
    model_options['optimizer'] = optimizer
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['validFreq'] = validFreq
    model_options['lrate'] = lrate
    model_options['reload_'] = reload_
    print(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        print('reloading...' + saveto)
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # Load training and development sets
    print('Loading dataset')
    train, dev = load_dataset(data)[:2]

    # Create and save dictionary
    print('Creating dictionary')
    worddict = build_dictionary(train[0] + dev[0])[0]
    n_words = len(worddict)
    model_options['n_words'] = n_words
    print('Dictionary size: ' + str(n_words))
    with open('%s.dictionary.pkl' % saveto, 'wb') as f:
        pkl.dump(worddict, f)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.items():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print('Building model')
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, inps, cost = build_model(tparams, model_options)

    # before any regularizer
    print('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=False)
    print('Done')

    # weight decay, if applicable
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.items():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print('Building f_cost...')
    f_cost = theano.function(inps, cost, profile=False)
    print('Done')

    print('Building sentence encoder')
    trng, inps_se, sentences = build_sentence_encoder(tparams, model_options)
    f_senc = theano.function(inps_se, sentences, profile=False)

    print('Building image encoder')
    trng, inps_ie, images = build_image_encoder(tparams, model_options)
    f_ienc = theano.function(inps_ie, images, profile=False)

    print('Building f_grad...')
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad_norm = theano.function(inps, [(g ** 2).sum() for g in grads], profile=False)
    f_weight_norm = theano.function([], [(t ** 2).sum() for k, t in tparams.items()],
                                    profile=False)

    # global-norm gradient clipping
    if grad_clip > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (grad_clip ** 2),
                              g / tensor.sqrt(g2) * grad_clip, g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print('Building optimizers...')
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print('Optimization')

    # Each sentence in the minibatch has the same length (for the encoder)
    train_iter = homogeneous_data.HomogeneousData([train[0], train[1]],
                                                  batch_size=batch_size,
                                                  maxlen=maxlen_w)

    uidx = 0
    curr = 0.
    n_samples = 0

    for eidx in range(max_epochs):
        print('Epoch ', eidx)

        for x, im in train_iter:
            n_samples += len(x)
            uidx += 1

            x, mask, im = homogeneous_data.prepare_data(x, im, worddict,
                                                        maxlen=maxlen_w,
                                                        n_words=n_words)
            if x is None:
                print('Minibatch with zero sample under length ', maxlen_w)
                uidx -= 1
                continue

            # Update
            ud_start = time.time()
            cost = f_grad_shared(x, mask, im)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print('NaN detected')
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud)

            if numpy.mod(uidx, validFreq) == 0:
                print('Computing results...')
                curr_model = {}
                curr_model['options'] = model_options
                curr_model['worddict'] = worddict
                curr_model['word_idict'] = word_idict
                curr_model['f_senc'] = f_senc
                curr_model['f_ienc'] = f_ienc

                ls = encode_sentences(curr_model, dev[0])
                lim = encode_images(curr_model, dev[1])

                (r1, r5, r10, medr) = i2t(lim, ls)
                print("Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr))
                (r1i, r5i, r10i, medri) = t2i(lim, ls)
                print("Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri))

                currscore = r1 + r5 + r10 + r1i + r5i + r10i
                if currscore > curr:
                    curr = currscore

                    # Save model
                    print('Saving...')
                    params = unzip(tparams)
                    numpy.savez(saveto, **params)
                    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                    print('Done')

        print('Seen %d samples' % n_samples)
def test(test_loader, model, tb_logger, measure='cosine', log_step=10, ndcg_scorer=None):
    # compute the encoding for all the test images and captions
    img_embs, cap_embs = encode_data(model, test_loader, log_step, logging.info)
    if measure == 'cosine':
        sim_fn = cosine_sim
    elif measure == 'dot':
        sim_fn = dot_sim

    # evaluate on five 1000-image folds (5000 caption rows each) and average
    results = []
    for i in range(5):
        r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000],
                     cap_embs[i * 5000:(i + 1) * 5000],
                     None, None, return_ranks=True,
                     ndcg_scorer=ndcg_scorer, fold_index=i)
        print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f ndcg_spice=%.4f" % r)
        ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000],
                       cap_embs[i * 5000:(i + 1) * 5000],
                       None, None, return_ranks=True,
                       ndcg_scorer=ndcg_scorer, fold_index=i)
        if i == 0:
            rt, rti = rt0, rti0
        print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f, ndcg_spice=%.4f" % ri)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
        results += [list(r) + list(ri) + [ar, ari, rsum]]

    print("-----------------------------------")
    print("Mean metrics: ")
    mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
    print("rsum: %.1f" % (mean_metrics[16] * 6))
    print("Average i2t Recall: %.1f" % mean_metrics[14])
    print("Image to text: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f" %
          mean_metrics[:7])
    print("Average t2i Recall: %.1f" % mean_metrics[15])
    print("Text to image: %.1f %.1f %.1f %.1f %.1f ndcg_rouge=%.4f ndcg_spice=%.4f" %
          mean_metrics[7:14])

    # record metrics in tensorboard
    tb_logger.add_scalar('test/r1', mean_metrics[0], model.Eiters)
    tb_logger.add_scalar('test/r5', mean_metrics[1], model.Eiters)
    tb_logger.add_scalar('test/r10', mean_metrics[2], model.Eiters)
    tb_logger.add_scalars('test/mean_ndcg', {
        'rougeL': mean_metrics[5],
        'spice': mean_metrics[6]
    }, model.Eiters)
    tb_logger.add_scalar('test/r1i', mean_metrics[7], model.Eiters)
    tb_logger.add_scalar('test/r5i', mean_metrics[8], model.Eiters)
    tb_logger.add_scalar('test/r10i', mean_metrics[9], model.Eiters)
    tb_logger.add_scalars('test/mean_ndcg_i', {
        'rougeL': mean_metrics[12],
        'spice': mean_metrics[13]
    }, model.Eiters)
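# cosine_sim and dot_sim above are simple pairwise similarity helpers.
# Minimal versions, assuming 2-D numpy embedding matrices (one row per item),
# might look like the following; the exact implementations in the source
# repository may differ.
import numpy as np

def dot_sim(images, captions):
    """Pairwise dot-product similarity: (n_img, n_cap) score matrix."""
    return np.dot(images, captions.T)

def cosine_sim(images, captions):
    """Pairwise cosine similarity: dot product of L2-normalized rows."""
    images = images / np.linalg.norm(images, axis=1, keepdims=True)
    captions = captions / np.linalg.norm(captions, axis=1, keepdims=True)
    return np.dot(images, captions.T)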
def trainer(load_from=None, save_dir='snapshots', name='anon', **kwargs):
    """
    :param load_from: location to load parameters + options from
    :param name: name of model, used as location to save parameters + options
    """
    curr_model = dict()

    # load old model, including parameters, but overwrite with new options
    if load_from:
        print 'reloading...' + load_from
        with open('%s.pkl' % load_from, 'rb') as f:
            curr_model = pkl.load(f)
    else:
        curr_model['options'] = {}

    for k, v in kwargs.iteritems():
        curr_model['options'][k] = v
    model_options = curr_model['options']

    # initialize logger
    import datetime
    timestampedName = datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '_' + name

    from logger import Log
    log = Log(name=timestampedName, hyperparams=model_options,
              saveDir='vis/training', xLabel='Examples Seen', saveFrequency=1)

    print curr_model['options']

    # Load training and development sets
    print 'Loading dataset'
    dataset = load_dataset(model_options['data'], cnn=model_options['cnn'],
                           load_train=True)
    train = dataset['train']
    dev = dataset['dev']

    # Create dictionary
    print 'Creating dictionary'
    worddict = build_dictionary(train['caps'] + dev['caps'])
    print 'Dictionary size: ' + str(len(worddict))
    curr_model['worddict'] = worddict
    curr_model['options']['n_words'] = len(worddict) + 2

    # save model
    pkl.dump(curr_model, open('%s/%s.pkl' % (save_dir, name), 'wb'))

    print 'Loading data'
    train_iter = datasource.Datasource(train, batch_size=model_options['batch_size'],
                                       worddict=worddict)
    dev = datasource.Datasource(dev, worddict=worddict)
    dev_caps, dev_ims = dev.all()

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if load_from is not None and os.path.exists(load_from):
        params = load_params(load_from, params)

    tparams = init_tparams(params)
    inps, cost = build_model(tparams, model_options)

    print 'Building sentence encoder'
    inps_se, sentences = build_sentence_encoder(tparams, model_options)
    f_senc = theano.function(inps_se, sentences, profile=False)

    print 'Building image encoder'
    inps_ie, images = build_image_encoder(tparams, model_options)
    f_ienc = theano.function(inps_ie, images, profile=False)

    print 'Building f_grad...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    print 'Building errors..'
    inps_err, errs = build_errors(model_options)
    f_err = theano.function(inps_err, errs, profile=False)

    curr_model['f_senc'] = f_senc
    curr_model['f_ienc'] = f_ienc
    curr_model['f_err'] = f_err

    if model_options['grad_clip'] > 0.:
        grads = [maxnorm(g, model_options['grad_clip']) for g in grads]

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(model_options['optimizer'])(
        lr, tparams, grads, inps, cost)

    print 'Optimization'

    uidx = 0
    curr = 0
    n_samples = 0

    for eidx in xrange(model_options['max_epochs']):
        print 'Epoch ', eidx

        for x, mask, im in train_iter:
            n_samples += x.shape[1]
            uidx += 1

            # Update
            ud_start = time.time()
            cost = f_grad_shared(x, mask, im)
            f_update(model_options['lrate'])
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, model_options['dispFreq']) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud
                log.update({'Error': float(cost)}, n_samples)

            if numpy.mod(uidx, model_options['validFreq']) == 0:
                print 'Computing results...'

                # encode sentences efficiently
                dev_s = encode_sentences(curr_model, dev_caps,
                                         batch_size=model_options['batch_size'])
                dev_i = encode_images(curr_model, dev_ims)

                # compute errors
                dev_errs = compute_errors(curr_model, dev_s, dev_i)

                # compute ranking error
                (r1, r5, r10, medr, meanr), vis_details = t2i(dev_errs, vis_details=True)
                (r1i, r5i, r10i, medri, meanri) = i2t(dev_errs)
                print "Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % (
                    r1, r5, r10, medr, meanr)
                log.update({'R@1': r1, 'R@5': r5, 'R@10': r10,
                            'median_rank': medr, 'mean_rank': meanr}, n_samples)
                print "Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % (
                    r1i, r5i, r10i, medri, meanri)
                log.update({'Image2Caption_R@1': r1i,
                            'Image2Caption_R@5': r5i,
                            'Image2Caption_R@10': r10i,
                            'Image2Caption_median_rank': medri,
                            'Image2Caption_mean_rank': meanri}, n_samples)

                tot = r1 + r5 + r10
                if tot > curr:
                    curr = tot

                    # Save parameters
                    print 'Saving...',
                    numpy.savez('%s/%s' % (save_dir, name), **unzip(tparams))
                    print 'Done'

                    vis_details['hyperparams'] = model_options
                    # Save visualization details
                    with open('vis/roc/%s/%s.json' % (model_options['data'],
                                                      timestampedName), 'w') as f:
                        json.dump(vis_details, f)
                    # Add the new model to the index
                    try:
                        index = json.load(open('vis/roc/index.json', 'r'))
                    except IOError:
                        index = {model_options['data']: []}

                    models = index[model_options['data']]
                    if timestampedName not in models:
                        models.append(timestampedName)

                    with open('vis/roc/index.json', 'w') as f:
                        json.dump(index, f)

        print 'Seen %d samples' % n_samples
def validate_split(opt, vid_data_loader, text_data_loader, model, measure='cosine'):
    # compute the encoding for all the validation videos and captions
    model.val_start()
    video_embs, video_ids = evaluation.encode_text_or_vid(model.embed_vis, vid_data_loader)
    cap_embs, caption_ids = evaluation.encode_text_or_vid(model.embed_txt, text_data_loader)
    c2i_all_errors = evaluation.cal_error(video_embs, cap_embs, measure)

    if opt.val_metric == "recall":
        # video retrieval
        if opt.testCollection.startswith('msvd'):
            (r1i, r5i, r10i, medri, meanri, t2i_map_score) = evaluation.t2i_varied(
                c2i_all_errors, caption_ids, video_ids)
        else:
            (r1i, r5i, r10i, medri, meanri) = evaluation.t2i(
                c2i_all_errors, n_caption=opt.n_caption)
        print(" * Text to video:")
        print(" * r_1_5_10: {}".format([round(r1i, 3), round(r5i, 3), round(r10i, 3)]))
        print(" * medr, meanr: {}".format([round(medri, 3), round(meanri, 3)]))
        print(" * " + '-' * 10)

        # caption retrieval
        if opt.testCollection.startswith('msvd'):
            (r1, r5, r10, medr, meanr, i2t_map_score) = evaluation.i2t_varied(
                c2i_all_errors, caption_ids, video_ids)
        else:
            (r1, r5, r10, medr, meanr) = evaluation.i2t(
                c2i_all_errors, n_caption=opt.n_caption)
        print(" * Video to text:")
        print(" * r_1_5_10: {}".format([round(r1, 3), round(r5, 3), round(r10, 3)]))
        print(" * medr, meanr: {}".format([round(medr, 3), round(meanr, 3)]))
        print(" * " + '-' * 10)

        # record metrics in tensorboard
        tb_logger.log_value('r1', r1, step=model.Eiters)
        tb_logger.log_value('r5', r5, step=model.Eiters)
        tb_logger.log_value('r10', r10, step=model.Eiters)
        tb_logger.log_value('medr', medr, step=model.Eiters)
        tb_logger.log_value('meanr', meanr, step=model.Eiters)
        tb_logger.log_value('r1i', r1i, step=model.Eiters)
        tb_logger.log_value('r5i', r5i, step=model.Eiters)
        tb_logger.log_value('r10i', r10i, step=model.Eiters)
        tb_logger.log_value('medri', medri, step=model.Eiters)
        tb_logger.log_value('meanri', meanri, step=model.Eiters)
    elif opt.val_metric == "map":
        i2t_map_score = evaluation.i2t_map(c2i_all_errors, n_caption=opt.n_caption)
        t2i_map_score = evaluation.t2i_map(c2i_all_errors, n_caption=opt.n_caption)
        tb_logger.log_value('i2t_map', i2t_map_score, step=model.Eiters)
        tb_logger.log_value('t2i_map', t2i_map_score, step=model.Eiters)
        print('i2t_map', i2t_map_score)
        print('t2i_map', t2i_map_score)

    currscore = 0
    if opt.val_metric == "recall":
        if opt.direction == 'i2t' or opt.direction == 'all':
            currscore += (r1 + r5 + r10)
        if opt.direction == 't2i' or opt.direction == 'all':
            currscore += (r1i + r5i + r10i)
    elif opt.val_metric == "map":
        if opt.direction == 'i2t' or opt.direction == 'all':
            currscore += i2t_map_score
        if opt.direction == 't2i' or opt.direction == 'all':
            currscore += t2i_map_score

    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
def trainer(data='f30k',
            margin=0.2,
            dim=1024,
            dim_image=4096,
            dim_word=300,
            max_epochs=15,
            encoder='lstm',
            dispFreq=10,
            grad_clip=2.0,
            maxlen_w=150,
            batch_size=128,
            saveto='vse/f30K',
            validFreq=100,
            early_stop=20,
            lrate=1e-3,
            reload_=False):
    # Model options
    model_options = {}
    model_options['data'] = data
    model_options['margin'] = margin
    model_options['dim'] = dim
    model_options['dim_image'] = dim_image
    model_options['dim_word'] = dim_word
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['grad_clip'] = grad_clip
    model_options['maxlen_w'] = maxlen_w
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['validFreq'] = validFreq
    model_options['lrate'] = lrate
    model_options['reload_'] = reload_
    logging.info(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        logging.info('reloading...' + saveto)
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # Load training and development sets
    logging.info('loading dataset')
    titles, album_ims, artist, genre = load_dataset(data)
    artist_string = artist
    genre_string = genre

    # Create and save dictionary
    if os.path.exists('%s.dictionary.pkl' % saveto):
        logging.info('loading dict from...' + saveto)
        with open('%s.dictionary.pkl' % saveto, 'rb') as wdict:
            worddict = pkl.load(wdict)
        n_words = len(worddict)
        model_options['n_words'] = n_words
        logging.info('Dictionary size: ' + str(n_words))
    else:
        logging.info('Create dictionary')
        worddict = build_dictionary(titles + artist + genre)[0]
        n_words = len(worddict)
        model_options['n_words'] = n_words
        logging.info('Dictionary words: ' + str(n_words))
        with open('%s.dictionary.pkl' % saveto, 'wb') as f:
            pkl.dump(worddict, f)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.items():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    model_options['worddict'] = worddict
    model_options['word_idict'] = word_idict

    # Each sentence in the minibatch has the same length (for the encoder)
    train_iter = homogeneous_data.HomogeneousData(
        [titles, album_ims, artist, genre], batch_size=batch_size, maxlen=maxlen_w)

    img_sen_model = Img_Sen_Artist_Ranking(model_options)
    # load a saved model state dict, if one exists
    if os.path.exists('%s_model_%s.pkl' % (saveto, encoder)):
        logging.info('Loading model...')
        img_sen_model.load_state_dict(
            torch.load('%s_model_%s.pkl' % (saveto, encoder)))
        logging.info('Done')

    img_sen_model = img_sen_model.cuda()

    loss_fn = PairwiseRankingLoss(margin=margin).cuda()

    # materialize the filter so the parameter list can be reused for clipping
    params = list(filter(lambda p: p.requires_grad, img_sen_model.parameters()))
    optimizer = torch.optim.Adam(params, lr=lrate)
    scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=40,
                                  mode='min', verbose=True, threshold=1e-8)

    uidx = 0
    curr = 0.0
    n_samples = 0

    # For early stopping
    best_r1, best_r5, best_r10, best_medr = 0.0, 0.0, 0.0, 0
    best_step = 0

    writer = SummaryWriter()
    for eidx in range(max_epochs):
        for x, im, artist, genre in train_iter:
            n_samples += len(x)
            uidx += 1

            x, im, artist, genre = homogeneous_data.prepare_data(
                x, im, artist, genre, worddict, maxlen=maxlen_w, n_words=n_words)
            if x is None:
                logging.info('Minibatch with zero samples under length %d' % maxlen_w)
                uidx -= 1
                continue

            x = Variable(torch.from_numpy(x).cuda())
            im = Variable(torch.from_numpy(im).cuda())
            artist = Variable(torch.from_numpy(artist).cuda())
            genre = Variable(torch.from_numpy(genre).cuda())

            # Update
            x1, im1, artist, genre = img_sen_model(x, im, artist, genre)

            # run validation on this input before the trainer sees it
            if numpy.mod(uidx, validFreq) == 0:
                img_sen_model.eval()
                with torch.no_grad():
                    print('Epoch ', eidx, '\tUpdate@ ', uidx, '\tCost ', cost.data.item())
                    writer.add_scalar('Evaluation/Validation_Loss', cost.data.item(), uidx)

                    (r1, r5, r10, medr) = i2t(im1, x)  # distances with l2norm
                    logging.info("Image to text: %.1f, %.1f, %.1f, %.1f" %
                                 (r1, r5, r10, medr))
                    (r1g, r5g, r10g, medrg) = i2t(im1, genre)
                    logging.info("Image to genre: %.1f, %.1f, %.1f, %.1f" %
                                 (r1g, r5g, r10g, medrg))
                    (r1a, r5a, r10a, medra) = i2t(im1, artist)
                    logging.info("Image to Artist: %.1f, %.1f, %.1f, %.1f" %
                                 (r1a, r5a, r10a, medra))
                    logging.info("Cal Recall@K ")

                    writer.add_scalars('Validation Recall/Image2Album',
                                       {'r@1': r1, 'r@5': r5, 'r@10': r10}, uidx)
                    writer.add_scalars('Validation Recall/Image2Genres',
                                       {'r@1': r1g, 'r@5': r5g, 'r@10': r10g}, uidx)
                    writer.add_scalars('Validation Recall/Image2Artist',
                                       {'r@1': r1a, 'r@5': r5a, 'r@10': r10a}, uidx)

                    curr_step = uidx / validFreq
                    currscore = (r1 + r5 + r10 + r1a + r5a + r10a
                                 + r1g + r5g + r10g - medr - medrg - medra)

                    if currscore > curr:
                        curr = currscore
                        best_r1, best_r5, best_r10, best_medr = r1, r5, r10, medr
                        best_r1g, best_r5g, best_r10g, best_medrg = r1g, r5g, r10g, medrg
                        best_step = curr_step

                        # Save model
                        logging.info('Saving model...')
                        pkl.dump(model_options,
                                 open('%s_params_%s.pkl' % (saveto, encoder), 'wb'))
                        torch.save(img_sen_model.state_dict(),
                                   '%s_model_%s.pkl' % (saveto, encoder))
                        logging.info('Done')

                    if curr_step - best_step > early_stop:
                        logging.info('early stopping, jumping now...')
                        logging.info("Image to text: %.1f, %.1f, %.1f, %.1f" %
                                     (best_r1, best_r5, best_r10, best_medr))
                        logging.info("Image to genre: %.1f, %.1f, %.1f, %.1f" %
                                     (best_r1g, best_r5g, best_r10g, best_medrg))
                        # return 0
                        lrate = 1e-4
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lrate

                img_sen_model.train()

            cost = loss_fn(im1, x1, artist, genre)
            writer.add_scalar('Evaluation/training_Loss', cost, uidx)
            optimizer.zero_grad()
            cost.backward()
            torch.nn.utils.clip_grad_norm_(params, grad_clip)
            scheduler.step(cost.data.item())
            optimizer.step()

        logging.info('Seen %d samples' % n_samples)
def main():
    # parse options
    parser = TrainOptions()
    opts = parser.parse()

    # data loader
    print('\n--- load dataset ---')
    vocab = pickle.load(
        open(os.path.join(opts.vocab_path, '%s_vocab.pkl' % opts.data_name), 'rb'))
    vocab_size = len(vocab)
    opts.vocab_size = vocab_size
    torch.backends.cudnn.enabled = False

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opts.data_name, vocab, opts.crop_size,
                                                opts.batch_size, opts.workers, opts)
    test_loader = data.get_test_loader('test', opts.data_name, vocab, opts.crop_size,
                                       opts.batch_size, opts.workers, opts)

    # model
    print('\n--- load subspace ---')
    subspace = model_2.VSE(opts)
    subspace.setgpu()

    print('\n--- load model ---')
    model = DRIT(opts)
    model.setgpu(opts.gpu)
    if opts.resume is None:  # no checkpoint has been saved before
        model.initialize()
        ep0 = -1
        total_it = 0
    else:
        ep0, total_it = model.resume(opts.resume)
    model.set_scheduler(opts, last_ep=ep0)
    ep0 += 1
    print('start the training at epoch %d' % (ep0))

    # saver for display and output
    saver = Saver(opts)

    # train
    print('\n--- train ---')
    max_it = 500000
    score = 0.0
    subspace.train_start()

    # autoencoder pretraining
    for ep in range(ep0, opts.pre_iter):
        print('-----ep:{} --------'.format(ep))
        for it, (images, captions, lengths, ids) in enumerate(train_loader):
            if it >= opts.train_iter:
                break
            # input data
            images = images.cuda(opts.gpu).detach()
            captions = captions.cuda(opts.gpu).detach()
            img, cap = subspace.train_emb(images, captions, lengths, ids, pre=True)  # [b, 1024]
            subspace.pre_optimizer.zero_grad()
            img = img.view(images.size(0), -1, 32, 32)
            cap = cap.view(images.size(0), -1, 32, 32)
            model.pretrain_ae(img, cap)
            if opts.grad_clip > 0:
                clip_grad_norm(subspace.params, opts.grad_clip)
            subspace.pre_optimizer.step()

    for ep in range(ep0, opts.n_ep):
        subspace.train_start()
        adjust_learning_rate(opts, subspace.optimizer, ep)
        for it, (images, captions, lengths, ids) in enumerate(train_loader):
            if it >= opts.train_iter:
                break
            # input data
            images = images.cuda(opts.gpu).detach()
            captions = captions.cuda(opts.gpu).detach()
            img, cap = subspace.train_emb(images, captions, lengths, ids)  # [b, 1024]
            img = img.view(images.size(0), -1, 32, 32)
            cap = cap.view(images.size(0), -1, 32, 32)
            subspace.optimizer.zero_grad()

            # update the discriminators
            for p in model.disA.parameters():
                p.requires_grad = True
            for p in model.disB.parameters():
                p.requires_grad = True
            for p in model.disA_attr.parameters():
                p.requires_grad = True
            for p in model.disB_attr.parameters():
                p.requires_grad = True
            for i in range(opts.niters_gan_d):  # 5
                model.update_D(img, cap)

            # update the encoders with the discriminators frozen
            for p in model.disA.parameters():
                p.requires_grad = False
            for p in model.disB.parameters():
                p.requires_grad = False
            for p in model.disA_attr.parameters():
                p.requires_grad = False
            for p in model.disB_attr.parameters():
                p.requires_grad = False
            for i in range(opts.niters_gan_enc):
                model.update_E(img, cap)  # uses the new content loss function
            subspace.optimizer.step()

            print('total_it: %d (ep %d, it %d), lr %09f' %
                  (total_it, ep, it, model.gen_opt.param_groups[0]['lr']))
            total_it += 1

        # decay learning rate
        if opts.n_ep_decay > -1:
            model.update_lr()

        # save result image
        # saver.write_img(ep, model)

        if (ep + 1) % opts.n_ep == 0:
            print('save model')
            filename = os.path.join(opts.result_dir, opts.name)
            model.save('%s/final_model.pth' % (filename), ep, total_it)
            torch.save(subspace.state_dict(), '%s/final_subspace.pth' % (filename))
        elif (ep + 1) % 10 == 0:
            print('save model')
            filename = os.path.join(opts.result_dir, opts.name)
            model.save('%s/%s_model.pth' % (filename, str(ep + 1)), ep, total_it)
            torch.save(subspace.state_dict(), '%s/%s_subspace.pth' % (filename, str(ep + 1)))

        if (ep + 1) % opts.model_save_freq == 0:
            # evaluate on a 640-sample subset of the test set
            a = None
            b = None
            c = None
            d = None
            subspace.val_start()
            for it, (images, captions, lengths, ids) in enumerate(test_loader):
                if it >= opts.val_iter:
                    break
                images = images.cuda(opts.gpu).detach()
                captions = captions.cuda(opts.gpu).detach()
                img_emb, cap_emb = subspace.forward_emb(images, captions, lengths,
                                                        volatile=True)
                img = img_emb.view(images.size(0), -1, 32, 32)
                cap = cap_emb.view(images.size(0), -1, 32, 32)
                image1, text1 = model.test_model2(img, cap)
                img2 = image1.view(images.size(0), -1)
                cap2 = text1.view(images.size(0), -1)
                if a is None:
                    a = np.zeros((opts.val_iter * opts.batch_size, img_emb.size(1)))
                    b = np.zeros((opts.val_iter * opts.batch_size, cap_emb.size(1)))
                    c = np.zeros((opts.val_iter * opts.batch_size, img2.size(1)))
                    d = np.zeros((opts.val_iter * opts.batch_size, cap2.size(1)))
                a[ids] = img_emb.data.cpu().numpy().copy()
                b[ids] = cap_emb.data.cpu().numpy().copy()
                c[ids] = img2.data.cpu().numpy().copy()
                d[ids] = cap2.data.cpu().numpy().copy()
            aa = torch.from_numpy(a)
            bb = torch.from_numpy(b)
            cc = torch.from_numpy(c)
            dd = torch.from_numpy(d)
            (r1, r5, r10, medr, meanr) = i2t(aa, bb, measure=opts.measure)
            print('test640: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(medr, r1, r5, r10))
            (r1i, r5i, r10i, medri, meanri) = t2i(aa, bb, measure=opts.measure)
            print('test640: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(medri, r1i, r5i, r10i))
            (r2, r3, r4, m1, m2) = i2t(cc, dd, measure=opts.measure)
            print('test640: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(m1, r2, r3, r4))
            (r2i, r3i, r4i, m1i, m2i) = t2i(cc, dd, measure=opts.measure)
            print('test640: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(m1i, r2i, r3i, r4i))

            curr = r2 + r3 + r4 + r2i + r3i + r4i
            if curr > score:
                score = curr
                print('save model')
                filename = os.path.join(opts.result_dir, opts.name)
                model.save('%s/best_model.pth' % (filename), ep, total_it)
                torch.save(subspace.state_dict(), '%s/subspace.pth' % (filename))

    # final evaluation on the full test set
    a = None
    b = None
    c = None
    d = None
    for it, (images, captions, lengths, ids) in enumerate(test_loader):
        images = images.cuda(opts.gpu).detach()
        captions = captions.cuda(opts.gpu).detach()
        img_emb, cap_emb = subspace.forward_emb(images, captions, lengths, volatile=True)
        img = img_emb.view(images.size(0), -1, 32, 32)
        cap = cap_emb.view(images.size(0), -1, 32, 32)
        image1, text1 = model.test_model2(img, cap)
        img2 = image1.view(images.size(0), -1)
        cap2 = text1.view(images.size(0), -1)
        if a is None:
            a = np.zeros((len(test_loader.dataset), img_emb.size(1)))
            b = np.zeros((len(test_loader.dataset), cap_emb.size(1)))
            c = np.zeros((len(test_loader.dataset), img2.size(1)))
            d = np.zeros((len(test_loader.dataset), cap2.size(1)))
        a[ids] = img_emb.data.cpu().numpy().copy()
        b[ids] = cap_emb.data.cpu().numpy().copy()
        c[ids] = img2.data.cpu().numpy().copy()
        d[ids] = cap2.data.cpu().numpy().copy()
    aa = torch.from_numpy(a)
    bb = torch.from_numpy(b)
    cc = torch.from_numpy(c)
    dd = torch.from_numpy(d)
    (r1, r5, r10, medr, meanr) = i2t(aa, bb, measure=opts.measure)
    print('test5000: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(medr, r1, r5, r10))
    (r1i, r5i, r10i, medri, meanri) = t2i(aa, bb, measure=opts.measure)
    print('test5000: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(medri, r1i, r5i, r10i))
    (r2, r3, r4, m1, m2) = i2t(cc, dd, measure=opts.measure)
    print('test5000: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(m1, r2, r3, r4))
    (r2i, r3i, r4i, m1i, m2i) = t2i(cc, dd, measure=opts.measure)
    print('test5000: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(m1i, r2i, r3i, r4i))
    return
# (fragment: tail of an evaluation routine; the allocation of `a`/`b` and the
# encoding loop that precede this block are missing from the source)
    c = np.zeros((opts.batch_size * opts.test_iter, img2.size(1)))
    d = np.zeros((opts.batch_size * opts.test_iter, cap2.size(1)))
    a[ids] = img_emb.data.cpu().numpy().copy()
    b[ids] = cap_emb.data.cpu().numpy().copy()
    c[ids] = img2.data.cpu().numpy().copy()
    d[ids] = cap2.data.cpu().numpy().copy()
    aa = torch.from_numpy(a)
    bb = torch.from_numpy(b)
    cc = torch.from_numpy(c)
    dd = torch.from_numpy(d)
    # print(image1.size())
    (r1, r5, r10, medr, meanr) = i2t(aa, bb)
    print('subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(medr, r1, r5, r10))
    (r1i, r5i, r10i, medri, meanri) = t2i(aa, bb)
    print('subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(medri, r1i, r5i, r10i))
    (r2, r3, r4, m1, m2) = i2t(cc, dd)
    print('encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(m1, r2, r3, r4))
    (r2i, r3i, r4i, m1i, m2i) = t2i(cc, dd)
    print('encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(m1i, r2i, r3i, r4i))
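# None of the snippets above define the i2t/t2i helpers they call. They are
# assumed to follow the standard VSE-style recall evaluation (5 captions per
# image, similarity by dot product on L2-normalized embeddings). A minimal
# sketch under that assumption -- the exact signatures above differ slightly
# (extra `measure`/`npts` arguments, 4- vs 5-tuple returns):
import numpy as np

def i2t_sketch(images, captions):
    """Image-to-text retrieval: Recall@1/5/10, median and mean rank."""
    npts = images.shape[0] // 5
    ranks = np.zeros(npts)
    for index in range(npts):
        im = images[5 * index].reshape(1, -1)   # one query image
        d = np.dot(im, captions.T).flatten()    # similarity to every caption
        inds = np.argsort(d)[::-1]              # captions ranked best-first
        # best rank among the 5 ground-truth captions of this image
        ranks[index] = min(np.where(inds == 5 * index + i)[0][0] for i in range(5))
    r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks)
    r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks)
    r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks)
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1
    return (r1, r5, r10, medr, meanr)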
def train_matching_gan(self):
    margin_ranking_loss = nn.MarginRankingLoss(self.margin)
    for epoch in range(self.epochs):
        for sample in self.match_data_loader:
            images = sample['images']
            sentences = sample['sentences']
            unmatched_images = sample['unmatched_images']
            unmatched_sentences = sample['unmatched_sentences']
            images = torch.tensor(images, requires_grad=False).cuda()
            sentences = torch.tensor(sentences, requires_grad=False).cuda()
            unmatched_images = torch.tensor(unmatched_images, requires_grad=False).cuda()
            unmatched_sentences = torch.tensor(unmatched_sentences, requires_grad=False).cuda()

            # update the discriminator
            self.match_discriminator_optimizer.zero_grad()
            fake_images = self.image_generator(sentences)

            fake_sentences = None
            if self.arguments['use_sentence_generator']:
                # fake_sentences = self.sentence_generator(images)
                with torch.no_grad():
                    image_features = self.downsapmle_block(images)
                    fake_sentences = self.sentence_decoder_block.sample(image_features)
                fake_sentences = convert_indexes2sentence(self.arguments['idx2word'], fake_sentences)
                xs = []
                for sentence in fake_sentences:
                    # map words back to indices, clipping OOV words to UNK (index 1)
                    sentence = [
                        self.arguments['word2idx'][w]
                        if self.arguments['word2idx'][w] < self.arguments['word_number'] else 1
                        for w in sentence.split()
                    ]
                    x = np.zeros(self.arguments['sentence_max_length']).astype('int64')
                    if len(sentence) < self.arguments['sentence_max_length']:
                        x[:len(sentence)] = sentence
                    else:
                        x[:] = sentence[:self.arguments['sentence_max_length']]
                    xs.append(x)
                fake_sentences = np.stack(xs, 0)
                fake_sentences = torch.LongTensor(fake_sentences)
                fake_sentences = torch.tensor(fake_sentences, requires_grad=False).cuda()

            matching_scores = self.match_discriminator(images, sentences)
            unmatched_sentence_scores = self.match_discriminator(images, unmatched_sentences)
            unmatched_image_scores = self.match_discriminator(unmatched_images, sentences)
            fake_image_scores = self.match_discriminator(fake_images, sentences)
            real_labels = torch.ones(images.size(0)).cuda()

            loss1 = margin_ranking_loss(matching_scores, unmatched_sentence_scores, real_labels)
            loss2 = margin_ranking_loss(matching_scores, unmatched_image_scores, real_labels)
            loss3 = margin_ranking_loss(fake_image_scores, unmatched_image_scores, real_labels)

            if self.arguments['use_sentence_generator']:
                # fake sentences can only be scored once the unmatched scores
                # and real_labels exist (the original computed loss4 before
                # defining them)
                fake_sentence_scores = self.match_discriminator(images, fake_sentences)
                loss4 = margin_ranking_loss(fake_sentence_scores, unmatched_sentence_scores, real_labels)
                discriminator_loss = loss1 + loss2 + loss3 + loss4
            else:
                discriminator_loss = loss1 + loss2 + loss3
            discriminator_loss.backward()
            self.match_discriminator_optimizer.step()

        print("Epoch: %d, discriminator_loss= %f" % (epoch, discriminator_loss.data))

        if (epoch + 1) == self.epochs:
            save_discriminator_checkpoint(self.match_discriminator, self.model_save_path, epoch)

        val_images, val_sentences = load_validation_set(self.arguments)
        val_sentences = torch.tensor(val_sentences, requires_grad=False).cuda()
        val_images = torch.tensor(val_images, requires_grad=False).cuda()
        i2t_r1, i2t_r5, i2t_r10, i2t_medr = i2t(self.match_discriminator, val_images, val_sentences)
        t2i_r1, t2i_r5, t2i_r10, t2i_medr = t2i(self.match_discriminator, val_sentences, val_images)
        print("Image to Text: %.2f, %.2f, %.2f, %.2f" % (i2t_r1, i2t_r5, i2t_r10, i2t_medr))
        print("Text to Image: %.2f, %.2f, %.2f, %.2f" % (t2i_r1, t2i_r5, t2i_r10, t2i_medr))
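# The discriminator objective above leans entirely on nn.MarginRankingLoss,
# which computes mean(max(0, -y * (x1 - x2) + margin)); with y = 1 (the
# real_labels above) it pushes the first score above the second by at least
# `margin`. A tiny self-contained check:
import torch
import torch.nn as nn

loss_fn = nn.MarginRankingLoss(margin=0.2)
matching = torch.tensor([0.9, 0.4])      # scores for matched pairs
unmatched = torch.tensor([0.1, 0.5])     # scores for mismatched pairs
y = torch.ones(2)
# per-pair: max(0, -(0.9 - 0.1) + 0.2) = 0.0, max(0, -(0.4 - 0.5) + 0.2) = 0.3
print(loss_fn(matching, unmatched, y))   # tensor(0.1500)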
def trainer(data='coco',  # f8k, f30k, coco
            margin=0.2,
            dim=1024,
            dim_image=4096,
            dim_word=300,
            encoder='gru',  # gru OR bow
            max_epochs=15,
            dispFreq=10,
            decay_c=0.,
            grad_clip=2.,
            maxlen_w=100,
            optimizer='adam',
            batch_size=128,
            saveto='/ais/gobi3/u/rkiros/uvsmodels/coco.npz',
            validFreq=100,
            lrate=0.0002,
            reload_=False):

    # Model options
    model_options = {}
    model_options['data'] = data
    model_options['margin'] = margin
    model_options['dim'] = dim
    model_options['dim_image'] = dim_image
    model_options['dim_word'] = dim_word
    model_options['encoder'] = encoder
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['maxlen_w'] = maxlen_w
    model_options['optimizer'] = optimizer
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['validFreq'] = validFreq
    model_options['lrate'] = lrate
    model_options['reload_'] = reload_

    print model_options

    # reload options
    if reload_ and os.path.exists(saveto):
        print 'reloading...' + saveto
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # Load training and development sets
    print 'Loading dataset'
    train, dev = load_dataset(data)[:2]

    # Create and save dictionary
    print 'Creating dictionary'
    worddict = build_dictionary(train[0] + dev[0])[0]
    n_words = len(worddict)
    model_options['n_words'] = n_words
    print 'Dictionary size: ' + str(n_words)
    with open('%s.dictionary.pkl' % saveto, 'wb') as f:
        pkl.dump(worddict, f)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, inps, cost = build_model(tparams, model_options)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=False)
    print 'Done'

    # weight decay, if applicable
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=False)
    print 'Done'

    print 'Building sentence encoder'
    trng, inps_se, sentences = build_sentence_encoder(tparams, model_options)
    f_senc = theano.function(inps_se, sentences, profile=False)

    print 'Building image encoder'
    trng, inps_ie, images = build_image_encoder(tparams, model_options)
    f_ienc = theano.function(inps_ie, images, profile=False)

    print 'Building f_grad...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad_norm = theano.function(inps, [(g ** 2).sum() for g in grads], profile=False)
    f_weight_norm = theano.function([], [(t ** 2).sum() for k, t in tparams.iteritems()], profile=False)

    if grad_clip > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (grad_clip ** 2),
                                           g / tensor.sqrt(g2) * grad_clip,
                                           g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)

    print 'Optimization'

    # Each sentence in the minibatch has the same length (for the encoder)
    train_iter = homogeneous_data.HomogeneousData([train[0], train[1]],
                                                  batch_size=batch_size,
                                                  maxlen=maxlen_w)

    uidx = 0
    curr = 0.
    n_samples = 0

    for eidx in xrange(max_epochs):
        print 'Epoch ', eidx

        for x, im in train_iter:
            n_samples += len(x)
            uidx += 1
            x, mask, im = homogeneous_data.prepare_data(x, im, worddict,
                                                        maxlen=maxlen_w,
                                                        n_words=n_words)
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen_w
                uidx -= 1
                continue

            # Update
            ud_start = time.time()
            cost = f_grad_shared(x, mask, im)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            if numpy.mod(uidx, validFreq) == 0:
                print 'Computing results...'
                curr_model = {}
                curr_model['options'] = model_options
                curr_model['worddict'] = worddict
                curr_model['word_idict'] = word_idict
                curr_model['f_senc'] = f_senc
                curr_model['f_ienc'] = f_ienc

                ls = encode_sentences(curr_model, dev[0])
                lim = encode_images(curr_model, dev[1])

                (r1, r5, r10, medr) = i2t(lim, ls)
                print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr)
                (r1i, r5i, r10i, medri) = t2i(lim, ls)
                print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri)

                currscore = r1 + r5 + r10 + r1i + r5i + r10i
                if currscore > curr:
                    curr = currscore

                    # Save model
                    print 'Saving...',
                    params = unzip(tparams)
                    numpy.savez(saveto, **params)
                    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                    print 'Done'

        print 'Seen %d samples' % n_samples
def trainer(data='coco',
            margin=0.2,
            dim=1024,
            dim_image=4096,
            dim_word=300,
            max_epochs=15,
            encoder='lstm',
            dispFreq=10,
            grad_clip=2.0,
            maxlen_w=150,
            batch_size=128,
            saveto='vse/coco',
            validFreq=100,
            early_stop=20,
            lrate=0.0002,
            reload_=False):

    # Model options
    model_options = {}
    model_options['data'] = data
    model_options['margin'] = margin
    model_options['dim'] = dim
    model_options['dim_image'] = dim_image
    model_options['dim_word'] = dim_word
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['grad_clip'] = grad_clip
    model_options['maxlen_w'] = maxlen_w
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['validFreq'] = validFreq
    model_options['lrate'] = lrate
    model_options['reload_'] = reload_

    print model_options

    # reload options
    if reload_ and os.path.exists(saveto):
        print 'reloading...' + saveto
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # Load training and development sets
    print 'loading dataset'
    train, dev = load_dataset(data)

    # Create and save dictionary
    print 'Create dictionary'
    worddict = build_dictionary(train[0] + dev[0])[0]
    n_words = len(worddict)
    model_options['n_words'] = n_words
    print 'Dictionary size: ' + str(n_words)
    with open('%s.dictionary.pkl' % saveto, 'wb') as f:
        pkl.dump(worddict, f)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    model_options['worddict'] = worddict
    model_options['word_idict'] = word_idict

    # Each sentence in the minibatch has the same length (for the encoder)
    train_iter = homogeneous_data.HomogeneousData([train[0], train[1]],
                                                  batch_size=batch_size,
                                                  maxlen=maxlen_w)

    img_sen_model = ImgSenRanking(model_options)
    img_sen_model = img_sen_model.cuda()

    loss_fn = PairwiseRankingLoss(margin=margin)
    loss_fn = loss_fn.cuda()

    params = filter(lambda p: p.requires_grad, img_sen_model.parameters())
    optimizer = torch.optim.Adam(params, lrate)

    uidx = 0
    curr = 0.0
    n_samples = 0

    # For early stopping
    best_r1, best_r5, best_r10, best_medr = 0.0, 0.0, 0.0, 0
    best_r1i, best_r5i, best_r10i, best_medri = 0.0, 0.0, 0.0, 0
    best_step = 0

    for eidx in xrange(max_epochs):
        print 'Epoch ', eidx

        for x, im in train_iter:
            n_samples += len(x)
            uidx += 1
            x, im = homogeneous_data.prepare_data(x, im, worddict,
                                                  maxlen=maxlen_w, n_words=n_words)
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen_w
                uidx -= 1
                continue

            x = Variable(torch.from_numpy(x).cuda())
            im = Variable(torch.from_numpy(im).cuda())

            # Update
            x, im = img_sen_model(x, im)
            cost = loss_fn(im, x)
            optimizer.zero_grad()
            cost.backward()
            torch.nn.utils.clip_grad_norm(params, grad_clip)
            optimizer.step()

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, '\tUpdate ', uidx, '\tCost ', cost.data.cpu().numpy()[0]

            if numpy.mod(uidx, validFreq) == 0:
                print 'Computing results...'
                curr_model = {}
                curr_model['options'] = model_options
                curr_model['worddict'] = worddict
                curr_model['word_idict'] = word_idict
                curr_model['img_sen_model'] = img_sen_model

                ls, lim = encode_sentences(curr_model, dev[0]), encode_images(curr_model, dev[1])

                r_time = time.time()
                (r1, r5, r10, medr) = i2t(lim, ls)
                print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr)
                (r1i, r5i, r10i, medri) = t2i(lim, ls)
                print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri)
                print "Cal Recall@K using %ss" % (time.time() - r_time)

                curr_step = uidx / validFreq

                currscore = r1 + r5 + r10 + r1i + r5i + r10i
                if currscore > curr:
                    curr = currscore
                    best_r1, best_r5, best_r10, best_medr = r1, r5, r10, medr
                    best_r1i, best_r5i, best_r10i, best_medri = r1i, r5i, r10i, medri
                    best_step = curr_step

                    # Save model
                    print 'Saving model...',
                    pkl.dump(model_options,
                             open('%s_params_%s.pkl' % (saveto, encoder), 'wb'))
                    torch.save(img_sen_model.state_dict(),
                               '%s_model_%s.pkl' % (saveto, encoder))
                    print 'Done'

                if curr_step - best_step > early_stop:
                    print 'Early stopping ...'
                    print "Image to text: %.1f, %.1f, %.1f, %.1f" % (
                        best_r1, best_r5, best_r10, best_medr)
                    print "Text to image: %.1f, %.1f, %.1f, %.1f" % (
                        best_r1i, best_r5i, best_r10i, best_medri)
                    return 0

        print 'Seen %d samples' % n_samples
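# PairwiseRankingLoss is not shown in this file; it is assumed to be the usual
# bidirectional max-margin ranking objective from the VSE line of work. A
# minimal PyTorch sketch under that assumption (embeddings are expected to be
# L2-normalized so the dot product is a cosine similarity):
import torch
import torch.nn as nn

class PairwiseRankingLossSketch(nn.Module):
    def __init__(self, margin=0.2):
        super(PairwiseRankingLossSketch, self).__init__()
        self.margin = margin

    def forward(self, im, s):
        scores = im.mm(s.t())                 # (batch, batch) similarity matrix
        diagonal = scores.diag().view(-1, 1)  # matched-pair scores
        # rank every caption against the matched one for each image (rows),
        # and every image against the matched one for each caption (columns)
        cost_s = (self.margin + scores - diagonal.expand_as(scores)).clamp(min=0)
        cost_im = (self.margin + scores - diagonal.t().expand_as(scores)).clamp(min=0)
        mask = torch.eye(scores.size(0), dtype=torch.bool, device=scores.device)
        cost_s = cost_s.masked_fill(mask, 0)   # drop the positive pairs themselves
        cost_im = cost_im.masked_fill(mask, 0)
        return cost_s.sum() + cost_im.sum()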
def validate(val_loader, model, tb_logger, measure='cosine', log_step=10,
             ndcg_scorer=None, alignment_mode=None):
    # compute the encoding for all the validation images and captions
    img_embs, cap_embs, img_lengths, cap_lengths = encode_data(
        model, val_loader, log_step, logging.info)

    # initialize similarity matrix evaluator
    sim_matrix_fn = AlignmentContrastiveLoss(
        aggregation=alignment_mode,
        return_similarity_mat=True) if alignment_mode is not None else None

    if measure == 'cosine':
        sim_fn = cosine_sim
    elif measure == 'dot':
        sim_fn = dot_sim

    # caption retrieval
    (r1, r5, r10, medr, meanr, mean_rougel_ndcg, mean_spice_ndcg) = i2t(
        img_embs, cap_embs, img_lengths, cap_lengths,
        measure=measure, ndcg_scorer=ndcg_scorer, sim_function=sim_matrix_fn)
    logging.info(
        "Image to text: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f ndcg_spice=%.4f"
        % (r1, r5, r10, medr, meanr, mean_rougel_ndcg, mean_spice_ndcg))

    # image retrieval
    (r1i, r5i, r10i, medri, meanri, mean_rougel_ndcg_i, mean_spice_ndcg_i) = t2i(
        img_embs, cap_embs, img_lengths, cap_lengths,
        ndcg_scorer=ndcg_scorer, measure=measure, sim_function=sim_matrix_fn)
    logging.info(
        "Text to image: %.1f, %.1f, %.1f, %.1f, %.1f, ndcg_rouge=%.4f ndcg_spice=%.4f"
        % (r1i, r5i, r10i, medri, meanri, mean_rougel_ndcg_i, mean_spice_ndcg_i))

    # sum of recalls to be used for early stopping
    currscore = r1 + r5 + r10 + r1i + r5i + r10i
    spice_ndcg_sum = mean_spice_ndcg + mean_spice_ndcg_i

    # record metrics in tensorboard
    tb_logger.add_scalar('r1', r1, model.Eiters)
    tb_logger.add_scalar('r5', r5, model.Eiters)
    tb_logger.add_scalar('r10', r10, model.Eiters)
    tb_logger.add_scalars('mean_ndcg', {
        'rougeL': mean_rougel_ndcg,
        'spice': mean_spice_ndcg
    }, model.Eiters)
    tb_logger.add_scalar('medr', medr, model.Eiters)
    tb_logger.add_scalar('meanr', meanr, model.Eiters)
    tb_logger.add_scalar('r1i', r1i, model.Eiters)
    tb_logger.add_scalar('r5i', r5i, model.Eiters)
    tb_logger.add_scalar('r10i', r10i, model.Eiters)
    tb_logger.add_scalars('mean_ndcg_i', {
        'rougeL': mean_rougel_ndcg_i,
        'spice': mean_spice_ndcg_i
    }, model.Eiters)
    tb_logger.add_scalar('medri', medri, model.Eiters)
    tb_logger.add_scalar('meanri', meanri, model.Eiters)
    tb_logger.add_scalar('rsum', currscore, model.Eiters)
    tb_logger.add_scalar('spice_ndcg_sum', spice_ndcg_sum, model.Eiters)

    return currscore, spice_ndcg_sum
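# The ndcg_scorer used above (rougeL / spice relevance) is external to this
# file; it is assumed to follow the standard discounted-cumulative-gain
# definition (some variants use 2^rel - 1 as the gain instead of rel). A
# minimal sketch of NDCG over a ranked list of relevance values:
import numpy as np

def ndcg_sketch(relevances_in_rank_order):
    rel = np.asarray(relevances_in_rank_order, dtype=float)
    discounts = 1.0 / np.log2(np.arange(2, len(rel) + 2))  # 1 / log2(rank + 1)
    dcg = (rel * discounts).sum()
    idcg = (np.sort(rel)[::-1] * discounts).sum()          # DCG of the ideal ordering
    return dcg / idcg if idcg > 0 else 0.0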
def trainer(load_from=None, save_dir='snapshots', name='anon', **kwargs):
    """
    :param load_from: location to load parameters + options from
    :param name: name of model, used as location to save parameters + options
    """
    curr_model = dict()

    # load old model, including parameters, but overwrite with new options
    if load_from:
        print 'reloading...' + load_from
        with open('%s.pkl' % load_from, 'rb') as f:
            curr_model = pkl.load(f)
    else:
        curr_model['options'] = {}

    for k, v in kwargs.iteritems():
        curr_model['options'][k] = v
    model_options = curr_model['options']

    # initialize logger
    import datetime
    timestampedName = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '_' + name

    from logger import Log
    log = Log(name=timestampedName, hyperparams=model_options,
              saveDir='vis/training', xLabel='Examples Seen', saveFrequency=1)

    print curr_model['options']

    # Load training and development sets
    print 'Loading dataset'
    dataset = load_dataset(model_options['data'], cnn=model_options['cnn'], load_train=True)
    train = dataset['train']
    dev = dataset['dev']

    # Create dictionary
    print 'Creating dictionary'
    worddict = build_dictionary(train['caps'] + dev['caps'])
    print 'Dictionary size: ' + str(len(worddict))
    curr_model['worddict'] = worddict
    curr_model['options']['n_words'] = len(worddict) + 2

    # save model
    pkl.dump(curr_model, open('%s/%s.pkl' % (save_dir, name), 'wb'))

    print 'Loading data'
    train_iter = datasource.Datasource(train, batch_size=model_options['batch_size'],
                                       worddict=worddict)
    dev = datasource.Datasource(dev, worddict=worddict)
    dev_caps, dev_ims = dev.all()

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if load_from is not None and os.path.exists(load_from):
        params = load_params(load_from, params)

    tparams = init_tparams(params)
    inps, cost = build_model(tparams, model_options)

    print 'Building sentence encoder'
    inps_se, sentences = build_sentence_encoder(tparams, model_options)
    f_senc = theano.function(inps_se, sentences, profile=False)

    print 'Building image encoder'
    inps_ie, images = build_image_encoder(tparams, model_options)
    f_ienc = theano.function(inps_ie, images, profile=False)

    print 'Building f_grad...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    print 'Building errors..'
    inps_err, errs = build_errors(model_options)
    f_err = theano.function(inps_err, errs, profile=False)

    curr_model['f_senc'] = f_senc
    curr_model['f_ienc'] = f_ienc
    curr_model['f_err'] = f_err

    if model_options['grad_clip'] > 0.:
        grads = [maxnorm(g, model_options['grad_clip']) for g in grads]

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(model_options['optimizer'])(lr, tparams, grads, inps, cost)

    print 'Optimization'

    uidx = 0
    curr = 0
    n_samples = 0

    for eidx in xrange(model_options['max_epochs']):
        print 'Epoch ', eidx

        for x, mask, im in train_iter:
            n_samples += x.shape[1]
            uidx += 1

            # Update
            ud_start = time.time()
            cost = f_grad_shared(x, mask, im)
            f_update(model_options['lrate'])
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, model_options['dispFreq']) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud
                log.update({'Error': float(cost)}, n_samples)

            if numpy.mod(uidx, model_options['validFreq']) == 0:
                print 'Computing results...'

                # encode sentences efficiently
                dev_s = encode_sentences(curr_model, dev_caps,
                                         batch_size=model_options['batch_size'])
                dev_i = encode_images(curr_model, dev_ims)

                # compute errors
                dev_errs = compute_errors(curr_model, dev_s, dev_i)

                # compute ranking error
                (r1, r5, r10, medr, meanr), vis_details = t2i(dev_errs, vis_details=True)
                (r1i, r5i, r10i, medri, meanri) = i2t(dev_errs)
                print "Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr, meanr)
                log.update({'R@1': r1, 'R@5': r5, 'R@10': r10,
                            'median_rank': medr, 'mean_rank': meanr}, n_samples)
                print "Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri, meanri)
                log.update({'Image2Caption_R@1': r1i, 'Image2Caption_R@5': r5i,
                            'Image2Caption_R@10': r10i,
                            'Image2Caption_median_rank': medri,
                            'Image2Caption_mean_rank': meanri}, n_samples)

                tot = r1 + r5 + r10
                if tot > curr:
                    curr = tot

                    # Save parameters
                    print 'Saving...',
                    numpy.savez('%s/%s' % (save_dir, name), **unzip(tparams))
                    print 'Done'

                    vis_details['hyperparams'] = model_options
                    # Save visualization details
                    with open('vis/roc/%s/%s.json' % (model_options['data'], timestampedName), 'w') as f:
                        json.dump(vis_details, f)
                    # Add the new model to the index
                    index = json.load(open('vis/roc/index.json', 'r'))
                    models = index[model_options['data']]
                    if timestampedName not in models:
                        models.append(timestampedName)
                    with open('vis/roc/index.json', 'w') as f:
                        json.dump(index, f)

        print 'Seen %d samples' % n_samples
def validate(opt, val_loader, model, measure='cosine'):
    # compute the encoding for all the validation videos and captions
    video_embs, cap_embs, video_ids, caption_ids = evaluation.encode_data(
        model, val_loader, opt.log_step, logging.info)

    # we load data as video-sentence pairs, but we only need to forward each
    # video once for evaluation, so we build the video set and mask out
    # duplicate videos with feature_mask
    feature_mask = []
    evaluate_videos = set()
    for video_id in video_ids:
        feature_mask.append(video_id not in evaluate_videos)
        evaluate_videos.add(video_id)
    video_embs = video_embs[feature_mask]
    video_ids = [x for idx, x in enumerate(video_ids) if feature_mask[idx]]

    c2i_all_errors = evaluation.cal_error(video_embs, cap_embs, measure)

    if opt.val_metric == "recall":
        # video retrieval
        (r1i, r5i, r10i, medri, meanri) = evaluation.t2i(c2i_all_errors, n_caption=opt.n_caption)
        print(" * Text to video:")
        print(" * r_1_5_10: {}".format([round(r1i, 3), round(r5i, 3), round(r10i, 3)]))
        print(" * medr, meanr: {}".format([round(medri, 3), round(meanri, 3)]))
        print(" * " + '-' * 10)

        # caption retrieval
        (r1, r5, r10, medr, meanr) = evaluation.i2t(c2i_all_errors, n_caption=opt.n_caption)
        print(" * Video to text:")
        print(" * r_1_5_10: {}".format([round(r1, 3), round(r5, 3), round(r10, 3)]))
        print(" * medr, meanr: {}".format([round(medr, 3), round(meanr, 3)]))
        print(" * " + '-' * 10)

        # record metrics in tensorboard
        tb_logger.log_value('r1', r1, step=model.Eiters)
        tb_logger.log_value('r5', r5, step=model.Eiters)
        tb_logger.log_value('r10', r10, step=model.Eiters)
        tb_logger.log_value('medr', medr, step=model.Eiters)
        tb_logger.log_value('meanr', meanr, step=model.Eiters)
        tb_logger.log_value('r1i', r1i, step=model.Eiters)
        tb_logger.log_value('r5i', r5i, step=model.Eiters)
        tb_logger.log_value('r10i', r10i, step=model.Eiters)
        tb_logger.log_value('medri', medri, step=model.Eiters)
        tb_logger.log_value('meanri', meanri, step=model.Eiters)
    elif opt.val_metric == "map":
        i2t_map_score = evaluation.i2t_map(c2i_all_errors, n_caption=opt.n_caption)
        t2i_map_score = evaluation.t2i_map(c2i_all_errors, n_caption=opt.n_caption)
        tb_logger.log_value('i2t_map', i2t_map_score, step=model.Eiters)
        tb_logger.log_value('t2i_map', t2i_map_score, step=model.Eiters)
        print('i2t_map', i2t_map_score)
        print('t2i_map', t2i_map_score)

    currscore = 0
    if opt.val_metric == "recall":
        if opt.direction == 'i2t' or opt.direction == 'all':
            currscore += (r1 + r5 + r10)
        if opt.direction == 't2i' or opt.direction == 'all':
            currscore += (r1i + r5i + r10i)
    elif opt.val_metric == "map":
        if opt.direction == 'i2t' or opt.direction == 'all':
            currscore += i2t_map_score
        if opt.direction == 't2i' or opt.direction == 'all':
            currscore += t2i_map_score

    tb_logger.log_value('rsum', currscore, step=model.Eiters)

    return currscore
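# The feature_mask logic above keeps only the first occurrence of each video id
# when the loader yields video-sentence pairs. A tiny standalone check of the
# same pattern:
video_ids = ['v1', 'v1', 'v2', 'v1', 'v2']
seen = set()
mask = []
for vid in video_ids:
    mask.append(vid not in seen)
    seen.add(vid)
print(mask)  # [True, False, True, False, False]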
def trainer(data='coco',
            margin=0.2,
            dim=1024,
            dim_image=4096,
            dim_word=300,
            encoder='gru',
            max_epochs=15,
            dispFreq=10,
            decay_c=0.0,
            grad_clip=2.0,
            maxlen_w=150,
            batch_size=128,
            saveto='vse/coco',
            validFreq=100,
            lrate=0.0002,
            concat=True,
            reload_=False):

    hyper_params = {
        'data': data,
        'encoder': encoder,
        'batch_size': batch_size,
        'time': cur_time,
        'lrate': lrate,
        'concat': concat,
    }
    i2t_r1 = dict([('i2t_recall', 'r1')] + hyper_params.items())
    i2t_r5 = dict([('i2t_recall', 'r5')] + hyper_params.items())
    i2t_r10 = dict([('i2t_recall', 'r10')] + hyper_params.items())
    t2i_r1 = dict([('t2i_recall', 'r1')] + hyper_params.items())
    t2i_r5 = dict([('t2i_recall', 'r5')] + hyper_params.items())
    t2i_r10 = dict([('t2i_recall', 'r10')] + hyper_params.items())
    i2t_med = dict([('i2t_med', 'i2t_med')] + hyper_params.items())
    t2i_med = dict([('t2i_med', 't2i_med')] + hyper_params.items())

    agent = Agent(port=5020)
    i2t_r1_agent = agent.register(i2t_r1, 'recall', overwrite=True)
    i2t_r5_agent = agent.register(i2t_r5, 'recall', overwrite=True)
    i2t_r10_agent = agent.register(i2t_r10, 'recall', overwrite=True)
    t2i_r1_agent = agent.register(t2i_r1, 'recall', overwrite=True)
    t2i_r5_agent = agent.register(t2i_r5, 'recall', overwrite=True)
    t2i_r10_agent = agent.register(t2i_r10, 'recall', overwrite=True)
    i2t_med_agent = agent.register(i2t_med, 'median', overwrite=True)
    t2i_med_agent = agent.register(t2i_med, 'median', overwrite=True)

    # Model options
    model_options = {}
    model_options['data'] = data
    model_options['margin'] = margin
    model_options['dim'] = dim
    model_options['dim_image'] = dim_image
    model_options['dim_word'] = dim_word
    model_options['encoder'] = encoder
    model_options['max_epochs'] = max_epochs
    model_options['dispFreq'] = dispFreq
    model_options['decay_c'] = decay_c
    model_options['grad_clip'] = grad_clip
    model_options['maxlen_w'] = maxlen_w
    model_options['batch_size'] = batch_size
    model_options['saveto'] = saveto
    model_options['validFreq'] = validFreq
    model_options['lrate'] = lrate
    model_options['reload_'] = reload_
    model_options['concat'] = concat

    print model_options

    # reload options
    if reload_ and os.path.exists(saveto):
        print 'reloading...' + saveto
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    # Load training and development sets
    print 'loading dataset'
    train, dev = load_dataset(data)[:2]

    # Create and save dictionary
    print 'Create dictionary'
    worddict = build_dictionary(train[0] + dev[0])[0]
    n_words = len(worddict)
    model_options['n_words'] = n_words
    print 'Dictionary size: ' + str(n_words)
    with open('%s.dictionary.pkl' % saveto, 'wb') as f:
        pkl.dump(worddict, f)

    # Inverse dictionary
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    model_options['worddict'] = worddict
    model_options['word_idict'] = word_idict

    # Each sentence in the minibatch has the same length (for the encoder)
    train_iter = homogeneous_data.HomogeneousData([train[0], train[1]],
                                                  batch_size=batch_size,
                                                  maxlen=maxlen_w)

    img_sen_model = ImgSenRanking(model_options)
    img_sen_model = img_sen_model.cuda()

    loss_fn = PairwiseRankingLoss(margin=margin)
    loss_fn = loss_fn.cuda()

    params = filter(lambda p: p.requires_grad, img_sen_model.parameters())
    optimizer = torch.optim.Adam(params, lrate)

    uidx = 0
    curr = 0.0
    n_samples = 0

    for eidx in xrange(max_epochs):
        print 'Epoch ', eidx

        for x, im in train_iter:
            n_samples += len(x)
            uidx += 1
            x_id, im = homogeneous_data.prepare_data(x, im, worddict,
                                                     maxlen=maxlen_w, n_words=n_words)
            if x_id is None:
                print 'Minibatch with zero sample under length ', maxlen_w
                uidx -= 1
                continue

            x_id = Variable(torch.from_numpy(x_id).cuda())
            im = Variable(torch.from_numpy(im).cuda())

            # Update
            ud_start = time.time()
            x, im = img_sen_model(x_id, im, x)
            cost = loss_fn(im, x)
            optimizer.zero_grad()
            cost.backward()
            torch.nn.utils.clip_grad_norm(params, grad_clip)
            optimizer.step()
            ud = time.time() - ud_start

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost.data.cpu().numpy()[0], 'UD ', ud

            if numpy.mod(uidx, validFreq) == 0:
                print 'Computing results...'
                curr_model = {}
                curr_model['options'] = model_options
                curr_model['worddict'] = worddict
                curr_model['word_idict'] = word_idict
                curr_model['img_sen_model'] = img_sen_model

                ls, lim = encode_sentences(curr_model, dev[0]), encode_images(curr_model, dev[1])

                r1, r5, r10, medr = 0.0, 0.0, 0.0, 0
                r1i, r5i, r10i, medri = 0.0, 0.0, 0.0, 0
                r_time = time.time()
                if data == 'arch' or data == 'arch_small':
                    (r1, r5, r10, medr) = i2t_arch(lim, ls)
                    print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr)
                    (r1i, r5i, r10i, medri) = t2i_arch(lim, ls)
                    print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri)
                else:
                    (r1, r5, r10, medr) = i2t(lim, ls)
                    print "Image to text: %.1f, %.1f, %.1f, %.1f" % (r1, r5, r10, medr)
                    (r1i, r5i, r10i, medri) = t2i(lim, ls)
                    print "Text to image: %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri)
                print "Cal Recall@K using %ss" % (time.time() - r_time)

                record_num = uidx / validFreq
                agent.append(i2t_r1_agent, record_num, r1)
                agent.append(i2t_r5_agent, record_num, r5)
                agent.append(i2t_r10_agent, record_num, r10)
                agent.append(t2i_r1_agent, record_num, r1i)
                agent.append(t2i_r5_agent, record_num, r5i)
                agent.append(t2i_r10_agent, record_num, r10i)
                agent.append(i2t_med_agent, record_num, medr)
                agent.append(t2i_med_agent, record_num, medri)

                currscore = r1 + r5 + r10 + r1i + r5i + r10i
                if currscore > curr:
                    curr = currscore

                    # Save model
                    print 'Saving model...',
                    pkl.dump(model_options,
                             open('%s_params_%s.pkl' % (saveto, encoder), 'wb'))
                    torch.save(img_sen_model.state_dict(),
                               '%s_model_%s.pkl' % (saveto, encoder))
                    print 'Done'

        print 'Seen %d samples' % n_samples
def trainer(**kwargs):
    """
    Train the model according to input params
    Info about input params is available in parameters.py
    """
    # Timing
    print('Starting time:', datetime.now())
    sys.stdout.flush()
    t_start_train = time.time()

    # Model options
    # load old model, including parameters, but overwrite with new options

    # Extract model options from arguments
    model_options = {}
    for k, v in kwargs.items():
        model_options[k] = v

    # Print input options
    print('PARAMETERS BEFORE LOADING:')
    for k, v in model_options.items():
        print('{:>26}: {}'.format(k, v))
    sys.stdout.flush()

    # Reload options if required
    curr_model = dict()
    if model_options['reload_']:
        # Reload model parameters
        opt_filename_reload = get_opt_filename(model_options, previous=True)
        print('reloading...', opt_filename_reload)
        sys.stdout.flush()
        try:
            with open(opt_filename_reload, 'rb') as f:
                curr_model = pkl.load(f)
        except Exception:
            print('Failed to reload parameters, using only the parameters passed in')
            curr_model['options'] = {}

        # Check if we reload from best model or last model
        if model_options['load_from'] in ['Best', 'best', 'B', 'b']:
            load_from_best = True
            print('Loading from Best saved model in validation results')
        elif model_options['load_from'] in ['Last', 'last', 'L', 'l']:
            load_from_best = False
            print('Loading from Last saved model')
        else:
            print('Unknown choice for "load_from" parameter', model_options['load_from'])
            print('Please choose one of:', ['Best', 'best', 'B', 'b'], ['Last', 'last', 'L', 'l'])
            print('Using Last as default')
            load_from_best = False

        # Reload end-point parameters
        state_filename = get_sol_filename(model_options, best=load_from_best, previous=True)
        print('reloading...', state_filename)
        sys.stdout.flush()
        try:
            with open(state_filename, 'rb') as f:
                state_params = pkl.load(f)
            if load_from_best:
                init_epoch = state_params['epoch']
                solution = state_params
            else:
                init_epoch = state_params['epoch_done'] + 1
                solution = state_params['solution']
            best_val_score = solution['best_val_score']
            n_samples = solution['samples_seen']
        except Exception:
            print('Failed to reload state parameters, starting from 0')
            init_epoch = 0
            best_val_score = 0
            n_samples = 0
    else:
        curr_model['options'] = {}
        init_epoch = 0
        best_val_score = 0
        n_samples = 0

    # Overwrite loaded options with input options
    for k, v in kwargs.items():
        curr_model['options'][k] = v
    model_options = curr_model['options']

    # Print final options loaded
    if model_options['reload_']:
        print('PARAMETERS AFTER LOADING:')
        for k, v in model_options.items():
            print('{:>26}: {}'.format(k, v))
        sys.stdout.flush()

    # Load training and development sets
    print('Loading dataset')
    sys.stdout.flush()
    dataset = load_dataset(dataset_name=model_options['data'],
                           embedding=model_options['embedding'],
                           path_to_data=model_options['data_path'],
                           test_subset=model_options['test_subset'],
                           load_train=True,
                           fold=0)
    train = dataset['train']
    dev = dataset['val']

    # Create word dictionary
    print('Creating dictionary')
    sys.stdout.flush()
    worddict = build_dictionary(train['caps'] + dev['caps'])
    print('Dictionary size: ' + str(len(worddict)))
    sys.stdout.flush()
    curr_model['worddict'] = worddict
    curr_model['options']['n_words'] = len(worddict) + 2

    # save model
    opt_filename_save = get_opt_filename(model_options, previous=False)
    print('Saving model parameters in', opt_filename_save)
    sys.stdout.flush()
    try:
        os.makedirs(os.path.dirname(opt_filename_save))
    except OSError:
        pass  # directory already exists
    pkl.dump(curr_model, open(opt_filename_save, 'wb'))

    # Load data from dataset
    print('Loading data')
    sys.stdout.flush()
    train_iter = datasource.Datasource(train, batch_size=model_options['batch_size'],
                                       worddict=worddict)
    dev = datasource.Datasource(dev, worddict=worddict)
    dev_caps, dev_ims = dev.all()

    print('Building model')
    sys.stdout.flush()
    params = init_params(model_options)

    # reload network parameters, i.e. weights
    if model_options['reload_']:
        params_filename = get_npz_filename(model_options, best=load_from_best, previous=True)
        params = load_params(params_filename, params)

    tparams = init_tparams(params)
    inps, cost = build_model(tparams, model_options)

    print('Building sentence encoder')
    sys.stdout.flush()
    inps_se, sentences = build_sentence_encoder(tparams, model_options)
    f_senc = theano.function(inps_se, sentences, profile=False)

    print('Building image encoder')
    sys.stdout.flush()
    inps_ie, images = build_image_encoder(tparams, model_options)
    f_ienc = theano.function(inps_ie, images, profile=False)

    print('Building f_grad...')
    sys.stdout.flush()
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    print('Building errors...')
    sys.stdout.flush()
    inps_err, errs = build_errors(model_options)
    f_err = theano.function(inps_err, errs, profile=False)

    curr_model['f_senc'] = f_senc
    curr_model['f_ienc'] = f_ienc
    curr_model['f_err'] = f_err

    if model_options['grad_clip'] > 0.:
        grads = [maxnorm(g, model_options['grad_clip']) for g in grads]

    lr = tensor.scalar(name='lr')
    print('Building optimizers...')
    sys.stdout.flush()
    # (compute gradients), (updates parameters)
    f_grad_shared, f_update = eval(model_options['optimizer'])(lr, tparams, grads, inps, cost)

    # Get names for the files to save model and solution
    sol_filename_best = get_sol_filename(model_options, best=True, previous=False)
    sol_filename_last = get_sol_filename(model_options, best=False, previous=False)
    params_filename_best = get_npz_filename(model_options, best=True, previous=False)
    params_filename_last = get_npz_filename(model_options, best=False, previous=False)

    print('PATHS TO MODELS:')
    for filename in [sol_filename_best, sol_filename_last,
                     params_filename_best, params_filename_last]:
        print(filename)
        sys.stdout.flush()
        try:
            os.makedirs(os.path.dirname(filename))
        except OSError:
            pass  # directory already exists

    # Start optimization
    print('Optimization')
    sys.stdout.flush()
    uidx = 0

    # Timing
    t_start = time.time()
    print('Starting time:', datetime.now())

    for eidx in range(init_epoch, model_options['max_epochs']):
        t_start_epoch = time.time()
        print('Epoch ', eidx)
        sys.stdout.flush()

        for x, mask, im in train_iter:
            n_samples += x.shape[1]
            uidx += 1

            # Update
            ud_start = time.time()
            cost = f_grad_shared(x, mask, im)
            f_update(model_options['lrate'])
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print('NaN detected')
                sys.stdout.flush()
                return 1., 1., 1.

            if numpy.mod(uidx, model_options['dispFreq']) == 0:
                print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud)
                sys.stdout.flush()

            if numpy.mod(uidx, model_options['validFreq']) == 0:
                print('Computing results...')
                sys.stdout.flush()

                # encode sentences efficiently
                dev_s = encode_sentences(curr_model, dev_caps,
                                         batch_size=model_options['batch_size'])
                dev_i = encode_images(curr_model, dev_ims)

                # compute errors
                dev_errs = compute_errors(curr_model, dev_s, dev_i)

                # compute ranking error
                (r1, r5, r10, medr, meanr) = i2t(dev_errs)
                (r1i, r5i, r10i, medri, meanri) = t2i(dev_errs)
                print("Text to image (dev set): %.1f, %.1f, %.1f, %.1f, %.1f" %
                      (r1i, r5i, r10i, medri, meanri))
                sys.stdout.flush()
                print("Image to text (dev set): %.1f, %.1f, %.1f, %.1f, %.1f" %
                      (r1, r5, r10, medr, meanr))
                sys.stdout.flush()

                # Score
                val_score = r1 + r5 + r10 + r1i + r5i + r10i
                if val_score > best_val_score:
                    print('BEST MODEL FOUND')
                    print('Score:', val_score)
                    print('Previous best score:', best_val_score)
                    best_val_score = val_score

                    # Join in a results dict
                    results_dict = build_results_dict(r1, r5, r10, medr,
                                                      r1i, r5i, r10i, medri)

                    # Save parameters
                    print('Saving...', end=' ')
                    sys.stdout.flush()
                    numpy.savez(params_filename_best, **unzip(tparams))
                    print('Done')
                    sys.stdout.flush()

                    # Update solution
                    solution = OrderedDict([
                        ('epoch', eidx),
                        ('update', uidx),
                        ('samples_seen', n_samples),
                        ('best_val_score', best_val_score),
                        ('best_val_res', results_dict),
                        ('time_until_results',
                         str(timedelta(seconds=(time.time() - t_start_train))))
                    ])
                    pkl.dump(solution, open(sol_filename_best, 'wb'))

        print('Seen %d samples' % n_samples)
        sys.stdout.flush()

        # Timing
        t_epoch = time.time() - t_start_epoch
        t_epoch_avg = (time.time() - t_start) / (eidx + 1 - init_epoch)
        print('Time for this epoch:', str(timedelta(seconds=t_epoch)),
              'Average:', str(timedelta(seconds=t_epoch_avg)))
        t_2_complete = t_epoch_avg * (model_options['max_epochs'] - (eidx + 1))
        print('Time since start session:', str(timedelta(seconds=time.time() - t_start)),
              'Estimated time to complete training:', str(timedelta(seconds=t_2_complete)))
        print('Current time:', datetime.now())
        sys.stdout.flush()

        # Save current model; `solution` only exists once validation has run,
        # so fall back to a fresh solution dict on the first epochs
        try:
            state_params = OrderedDict([('epoch_done', eidx), ('solution', solution)])
        except NameError:
            solution = OrderedDict([
                ('epoch', eidx),
                ('update', uidx),
                ('samples_seen', n_samples),
                ('best_val_score', best_val_score),
                ('time_until_results',
                 str(timedelta(seconds=(time.time() - t_start_train))))
            ])
            state_params = OrderedDict([('epoch_done', eidx), ('solution', solution)])
        pkl.dump(state_params, open(sol_filename_last, 'wb'))

        # Save parameters
        print('Saving LAST npz...', end=' ')
        sys.stdout.flush()
        numpy.savez(params_filename_last, **unzip(tparams))
        print('Done')
        sys.stdout.flush()

    return solution
def main():
    opt = parse_args()
    print(json.dumps(vars(opt), indent=2))

    rootpath = opt.rootpath
    testCollection = opt.testCollection
    n_caption = opt.n_caption
    resume = os.path.join(opt.logger_name, opt.checkpoint_name)

    if not os.path.exists(resume):
        logging.info(resume + ' does not exist.')
        sys.exit(0)

    checkpoint = torch.load(resume)
    start_epoch = checkpoint['epoch']
    best_rsum = checkpoint['best_rsum']
    print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
        resume, start_epoch, best_rsum))
    options = checkpoint['opt']
    if not hasattr(options, 'concate'):
        setattr(options, "concate", "full")

    trainCollection = options.trainCollection
    output_dir = resume.replace(trainCollection, testCollection)
    output_dir = output_dir.replace('/%s/' % options.cv_name,
                                    '/results/%s/' % trainCollection)
    result_pred_sents = os.path.join(output_dir, 'id.sent.score.txt')
    pred_error_matrix_file = os.path.join(output_dir, 'pred_errors_matrix.pth.tar')
    if checkToSkip(pred_error_matrix_file, opt.overwrite):
        sys.exit(0)
    makedirsforfile(pred_error_matrix_file)

    # data loader prepare
    caption_files = {
        'test': os.path.join(rootpath, testCollection, 'TextData',
                             '%s.caption.txt' % testCollection)
    }
    img_feat_path = os.path.join(rootpath, testCollection, 'FeatureData',
                                 options.visual_feature)
    visual_feats = {'test': BigFile(img_feat_path)}
    assert options.visual_feat_dim == visual_feats['test'].ndims
    video2frames = {
        'test': read_dict(os.path.join(rootpath, testCollection, 'FeatureData',
                                       options.visual_feature, 'video2frames.txt'))
    }

    # set bow vocabulary and encoding
    bow_vocab_file = os.path.join(rootpath, options.trainCollection, 'TextData',
                                  'vocabulary', 'bow', options.vocab + '.pkl')
    bow_vocab = pickle.load(open(bow_vocab_file, 'rb'))
    bow2vec = get_text_encoder('bow')(bow_vocab)
    options.bow_vocab_size = len(bow_vocab)

    # set rnn vocabulary
    rnn_vocab_file = os.path.join(rootpath, options.trainCollection, 'TextData',
                                  'vocabulary', 'rnn', options.vocab + '.pkl')
    rnn_vocab = pickle.load(open(rnn_vocab_file, 'rb'))
    options.vocab_size = len(rnn_vocab)

    # Construct the model
    model = get_model(options.model)(options)
    model.load_state_dict(checkpoint['model'])
    model.Eiters = checkpoint['Eiters']
    model.val_start()

    if testCollection.startswith('msvd'):  # or testCollection.startswith('msrvtt'):
        # set data loader
        video_ids_list = data.read_video_ids(caption_files['test'])
        vid_data_loader = data.get_vis_data_loader(visual_feats['test'],
                                                   opt.batch_size, opt.workers,
                                                   video2frames['test'],
                                                   video_ids=video_ids_list)
        text_data_loader = data.get_txt_data_loader(caption_files['test'],
                                                    rnn_vocab, bow2vec,
                                                    opt.batch_size, opt.workers)
        # mapping
        video_embs, video_ids = evaluation.encode_text_or_vid(model.embed_vis,
                                                              vid_data_loader)
        cap_embs, caption_ids = evaluation.encode_text_or_vid(model.embed_txt,
                                                              text_data_loader)
    else:
        # set data loader
        data_loader = data.get_test_data_loaders(caption_files, visual_feats,
                                                 rnn_vocab, bow2vec,
                                                 opt.batch_size, opt.workers,
                                                 opt.n_caption,
                                                 video2frames=video2frames)
        # mapping
        video_embs, cap_embs, video_ids, caption_ids = evaluation.encode_data(
            model, data_loader['test'], opt.log_step, logging.info)
        # remove duplicate videos (the loader yields one row per caption)
        idx = list(range(0, video_embs.shape[0], n_caption))
        video_embs = video_embs[idx, :]
        video_ids = video_ids[::opt.n_caption]

    c2i_all_errors = evaluation.cal_error(video_embs, cap_embs, options.measure)
    torch.save({'errors': c2i_all_errors,
                'videos': video_ids,
                'captions': caption_ids}, pred_error_matrix_file)
    print("write into: %s" % pred_error_matrix_file)

    if testCollection.startswith('msvd'):  # or testCollection.startswith('msrvtt'):
        # caption retrieval
        (r1, r5, r10, medr, meanr, i2t_map_score) = evaluation.i2t_varied(
            c2i_all_errors, caption_ids, video_ids)
        # video retrieval
        (r1i, r5i, r10i, medri, meanri, t2i_map_score) = evaluation.t2i_varied(
            c2i_all_errors, caption_ids, video_ids)
    else:
        # video retrieval (text to video)
        (r1i, r5i, r10i, medri, meanri) = evaluation.t2i(c2i_all_errors,
                                                         n_caption=n_caption)
        t2i_map_score = evaluation.t2i_map(c2i_all_errors, n_caption=n_caption)
        # caption retrieval (video to text)
        (r1, r5, r10, medr, meanr) = evaluation.i2t(c2i_all_errors,
                                                    n_caption=n_caption)
        i2t_map_score = evaluation.i2t_map(c2i_all_errors, n_caption=n_caption)

    # video retrieval
    print(" * Text to Video:")
    print(" * r_1_5_10, medr, meanr: {}".format([
        round(r1i, 1), round(r5i, 1), round(r10i, 1),
        round(medri, 1), round(meanri, 1)
    ]))
    print(" * recall sum: {}".format(round(r1i + r5i + r10i, 1)))
    print(" * mAP: {}".format(round(t2i_map_score, 3)))
    print(" * " + '-' * 10)

    # caption retrieval
    print(" * Video to text:")
    print(" * r_1_5_10, medr, meanr: {}".format([
        round(r1, 1), round(r5, 1), round(r10, 1),
        round(medr, 1), round(meanr, 1)
    ]))
    print(" * recall sum: {}".format(round(r1 + r5 + r10, 1)))
    print(" * mAP: {}".format(round(i2t_map_score, 3)))
    print(" * " + '-' * 10)
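# evaluation.i2t_map / t2i_map are not defined here; they are assumed to
# compute mean average precision over the ranked error matrix. A generic
# average-precision sketch for one query (hypothetical helper, not the exact
# code used above):
import numpy as np

def average_precision(sorted_relevance):
    """sorted_relevance: 1/0 relevance flags in ranked order for one query."""
    sorted_relevance = np.asarray(sorted_relevance, dtype=float)
    hits = np.cumsum(sorted_relevance)
    precisions = hits / (np.arange(len(sorted_relevance)) + 1.0)
    denom = max(sorted_relevance.sum(), 1.0)
    return (precisions * sorted_relevance).sum() / denom

print(average_precision([1, 0, 1, 0]))  # (1/1 + 2/3) / 2 ~= 0.833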