def debug_show_similarity_with_manually_created_examples(model_path, data_path=None, split='test'):
    """Dump raw image-caption similarity scores for two manually created
    caption sets alongside the original captions.

    Per-image score vectors are saved to 'shy_runs/debug.pt', keyed by
    caption source.
    """
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.use_external_captions = False
    if data_path is not None:
        opt.data_path = data_path

    # Restore the vocabulary the model was trained with.
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # Rebuild the model and restore its weights.
    model = VSE(opt)
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    img_embs, cap_embs = encode_data(model, data_loader)
    # This debug helper only inspects the first 100 embeddings.
    img_embs, cap_embs = img_embs[:100], cap_embs[:100]

    def _encode_manual(idx):
        # Encode one manually created caption file ('manually_ex_<idx>').
        loader = get_text_loader(split, opt.data_name, vocab, opt.batch_size,
                                 opt.workers, opt, 'manually_ex_%d' % idx)
        return encode_data(model, loader)[1]

    encoding_0 = _encode_manual(0)
    encoding_1 = _encode_manual(1)

    print('Computing results...')
    result, result_0, result_1 = [], [], []
    npts = img_embs.shape[0] // 5  # 5 captions per image
    for index in range(npts):
        # Query image embedding as a 1 x dim row vector.
        im = img_embs[5 * index].reshape(1, img_embs.shape[1])
        if opt.measure == 'order':
            raise Exception('Measure order not supported.')
        result.append(numpy.dot(im, cap_embs.T).flatten())
        result_0.append(numpy.dot(im, encoding_0.T).flatten())
        result_1.append(numpy.dot(im, encoding_1.T).flatten())

    torch.save({'orig': result, 'Tete': result_0, 'Haoyue': result_1},
               'shy_runs/debug.pt')
def eval_with_single_extended(model_path, data_path=None, data_name=None, split='test', backup_vec_ex=None):
    """Evaluate i2t retrieval where each image gets an extra, individually
    encoded set of extended captions (or a cached copy via *backup_vec_ex*).

    Ranks are saved next to the checkpoint as 'ranks_single_extended.pth.tar'.
    """
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.use_external_captions = False
    if data_path is not None:
        opt.data_path = data_path
    if data_name is not None:
        opt.data_name = data_name

    # Restore the vocabulary the model was trained with.
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    model = VSE(opt)
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    img_embs, cap_embs = encode_data(model, data_loader)

    if backup_vec_ex is None:
        # Encode extended captions image by image from the 'ex/<i>' files.
        cap_embs_ex = []
        for i in range(img_embs.shape[0]):
            loader_ex = get_text_loader(split, opt.data_name, vocab, opt.batch_size,
                                        opt.workers, opt, 'ex/%d' % i)
            encoding = encode_data(model, loader_ex)[1]
            if encoding is not None:
                cap_embs_ex.append(encoding.copy())
            else:
                # No extended captions for this image: pad with zeros.
                cap_embs_ex.append(np.zeros(cap_embs[:1].shape))
            print('Caption Embedding: %d' % i)
        # torch.save(cap_embs_ex, 'data/coco_precomp/cap_embs_ex.pth')
    else:
        # Reuse previously computed extended-caption embeddings.
        cap_embs_ex = torch.load(backup_vec_ex)

    print('Computing results...')
    r, rt = i2t_split(img_embs, cap_embs, cap_embs_ex,
                      measure=opt.measure, return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f\t%.1f\t%.1f\t%.1f\t%.1f" % r)
    torch.save({'rt': rt},
               model_path[:model_path.find('model_best')] + 'ranks_single_extended.pth.tar')
def post_init(self):
    """Load the checkpointed VSE model and keep only its text encoder."""
    device = torch.device('cpu' if not self.on_gpu else 'cuda')
    checkpoint = torch.load(self.path, map_location=device)
    opt = checkpoint['opt']
    # The vocabulary is needed to size the text embedding layer.
    with open(self.vocab_path, 'rb') as f:
        self.vocab = CustomUnpickler(f).load()
    opt.vocab_size = len(self.vocab)
    model = VSE(opt)
    model.load_state_dict(checkpoint['model'])
    model.txt_enc.eval()
    # Keep only the text branch; drop the image encoder to release memory.
    self.model = model.txt_enc
    del model.img_enc
def eval_with_manually_extended(model_path, data_path=None, split='test'):
    """Evaluate i2t retrieval on the first 100 test embeddings, extending each
    image's candidate pool with two pairs of manually written captions.

    Ranks are saved next to the checkpoint as
    'ranks_manually_extended_1.pth.tar'.
    """
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.use_external_captions = False
    if data_path is not None:
        opt.data_path = data_path

    # Restore the vocabulary the model was trained with.
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    model = VSE(opt)
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                             opt.batch_size, opt.workers, opt)
    img_embs, cap_embs = encode_data(model, loader)
    # Restrict evaluation to the first 100 items.
    img_embs, cap_embs = img_embs[:100], cap_embs[:100]

    # Encode both manually created caption files (two captions per image each).
    encodings = []
    for k in (0, 1):
        loader_ex = get_text_loader(split, opt.data_name, vocab, opt.batch_size,
                                    opt.workers, opt, 'manually_ex_%d' % k)
        encodings.append(encode_data(model, loader_ex)[1])
    encoding_0, encoding_1 = encodings

    # For image i, stack its two captions from each manual set (4 rows total).
    cap_embs_ex = [
        np.concatenate((encoding_0[2 * i:2 * i + 2],
                        encoding_1[2 * i:2 * i + 2]), axis=0)
        for i in range(100)
    ]

    print('Computing results...')
    r, rt = i2t_split(img_embs, cap_embs, cap_embs_ex,
                      measure=opt.measure, return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
    torch.save({'rt': rt},
               model_path[:model_path.find('model_best')] + 'ranks_manually_extended_1.pth.tar')
class FeatureExtractor(object):
    """Wraps a frozen, checkpointed VSE model together with a dataset and an
    image encoder, exposing per-index embedding extraction for analysis."""

    def __init__(self, checkpoint, image_encoder, dataset):
        self.load_checkpoint(checkpoint)
        self.image_encoder = image_encoder
        self.dataset = dataset

    def load_checkpoint(self, checkpoint):
        """Restore the VSE model from *checkpoint*, freeze both encoders, and
        keep the vocabulary as the text projector."""
        state = torch.load(checkpoint)
        opt = state['opt']
        opt.use_external_captions = False
        vocab = Vocab.from_pickle(pjoin(opt.vocab_path, '%s_vocab.pkl' % opt.data_name))
        opt.vocab_size = len(vocab)
        from model import VSE
        self.model = VSE(opt)
        self.model.load_state_dict(state['model'])
        self.projector = vocab
        # Inference only: eval mode and gradients disabled on both branches.
        for enc in (self.model.img_enc, self.model.txt_enc):
            enc.eval()
            for p in enc.parameters():
                p.requires_grad = False

    def __call__(self, ind):
        """Extract image/text embeddings for dataset item *ind* as a Record."""
        raw_img, img, img_embedding, cap, cap_ext = self.dataset[ind]
        # Embedding from the precomputed image feature.
        img_embedding_precomp = self.model.img_enc(
            as_cuda(as_variable(img_embedding).unsqueeze(0)))
        # Embedding from the raw image; keep gradients w.r.t. the pixels.
        img = as_variable(img)
        img.requires_grad = True
        img_embedding = self.model.img_enc(
            self.image_encoder(as_cuda(img.unsqueeze(0))))
        # Encode the ground-truth caption followed by the extended ones.
        txt = [cap] + list(cap_ext)
        txt_embeddings, txt_var = self.enc_txt(txt)
        return Record(
            raw_img, cap, cap_ext,
            img, img_embedding, img_embedding_precomp,
            txt_var, txt_embeddings[0], txt_embeddings[1:]
        )

    def enc_txt(self, caps):
        """Encode *caps* and return (embeddings in original order, raw output)."""
        sents, lengths, _, inv = _prepare_batch(caps, self.projector)
        # `inv` restores the original caption order after length-sorting.
        inv = var_with(as_variable(inv), sents)
        out, x = self.model.txt_enc.forward(sents, lengths, True)
        return out[inv], x
def post_init(self):
    """Load the checkpointed VSE model and keep only its image encoder."""
    if self.pool_strategy is not None:
        # Resolve the numpy pooling function by name (e.g. 'mean' -> np.mean).
        self.pool_fn = getattr(np, self.pool_strategy)
    device = torch.device('cpu' if not self.on_gpu else 'cuda')
    checkpoint = torch.load(self.path, map_location=device)
    model = VSE(checkpoint['opt'])
    model.load_state_dict(checkpoint['model'])
    model.img_enc.eval()
    # Keep only the image branch; the text encoder is unused here.
    self.model = model.img_enc
    self.to_device(self.model)
    del model.txt_enc
def load_checkpoint(self, checkpoint):
    """Restore a VSE model from *checkpoint*, freeze both encoders, and keep
    the vocabulary as the text projector."""
    state = torch.load(checkpoint)
    opt = state['opt']
    opt.use_external_captions = False
    vocab = Vocab.from_pickle(pjoin(opt.vocab_path, '%s_vocab.pkl' % opt.data_name))
    opt.vocab_size = len(vocab)
    from model import VSE
    self.model = VSE(opt)
    self.model.load_state_dict(state['model'])
    self.projector = vocab
    # Inference only: eval mode and gradients disabled on both branches.
    for enc in (self.model.img_enc, self.model.txt_enc):
        enc.eval()
        for p in enc.parameters():
            p.requires_grad = False
def eval_with_extended(model_path, data_path=None, data_name=None, split='test'):
    """Evaluate image-to-text retrieval with external (extended) captions
    enabled in the data loader.

    Loads a trained VSE checkpoint, turns on the loader's external-caption
    mode, ranks all captions against each image via `i2t_text_only`, prints
    Recall@K, and saves the ranks next to the checkpoint as
    'ranks_extended.pth.tar'.

    :param model_path: path to a checkpoint holding 'opt' and 'model'.
    :param data_path: optional override for opt.data_path.
    :param data_name: optional override for opt.data_name.
    :param split: dataset split to evaluate.
    """
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    # Enable the external-caption code path of the dataset/loader.
    # (Fix: this flag was previously assigned twice — once here and again
    # after the vocabulary load; the redundant duplicate has been removed.)
    opt.use_external_captions = True
    opt.negative_number = 5
    if data_path is not None:
        opt.data_path = data_path
    if data_name is not None:
        opt.data_name = data_name

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)
    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    # 5 ground-truth captions per image in the original data.
    print('Images: %d, Captions: %d' % (img_embs.shape[0] // 5, cap_embs.shape[0]))

    r, rt = i2t_text_only(img_embs, cap_embs, measure=opt.measure, return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f\t%.1f\t%.1f\t%.1f\t%.1f" % r)
    torch.save({'rt': rt},
               model_path[:model_path.find('model_best')] + 'ranks_extended.pth.tar')
def evalrank(model_path, data_path=None, split='dev', fold5=False, lang=None):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.

    :param lang: optional language override; a dash-separated value (e.g.
        'en-de') triggers per-language evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path
    # Never use undersample when testing.
    opt.undersample = False
    print(opt)

    # Load vocabulary used by the model
    if opt.data_name != "m30k":
        with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
            vocab = pickle.load(f)
        opt.vocab_size = len(vocab)
    else:
        # Fix: the file was opened in text mode and never closed; pickle
        # requires binary mode under Python 3.
        with open(os.path.join(opt.logger_name, 'vocab.pkl'), 'rb') as f:
            vocab = pickle.load(f)
        # NOTE(review): opt.vocab_size is not refreshed on this branch —
        # presumably the checkpointed opt already carries it; confirm.

    # construct model
    model = VSE(opt)
    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    if lang is not None:
        opt.lang = lang
    langs = opt.lang.split('-')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    if len(langs) > 1:
        # Multilingual case: one loader per language, evaluated separately.
        for loader, loader_lang in zip(data_loader, langs):
            run_eval(model, loader, fold5, opt, loader_lang)
    else:
        run_eval(model, data_loader, fold5, opt, opt.lang)
def evalrank(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.

    This variant only prints the mean aligned image-caption dot product.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path
    print(opt)

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model and restore its weights
    model = VSE(opt)
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                             opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, loader)
    print('Images: %d, Captions: %d' % (img_embs.shape[0] / 5, cap_embs.shape[0]))

    # Row-wise dot product between aligned image and caption embeddings.
    scores = numpy.sum(numpy.multiply(img_embs, cap_embs), -1)
    print(scores.shape)
    print('scores:', np.mean(scores))
def main():
    """Train a VSE model (JSR variant), optionally resuming from a checkpoint.

    Fix: `del checkpoint` previously executed on paths where `checkpoint`
    was never bound (no --resume given, or the resume file missing), raising
    NameError; it now runs only after a successful load.
    """
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='data',
                        help='path to datasets')
    parser.add_argument('--data_name', default='f30k',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path', default='vocab',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=30, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=1024, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=2e-4, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=15, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=100, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='runs/test',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--img_dim', default=2048, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune', action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--use_restval', action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--reset_train', action='store_true',
                        help='Ensure the training is always done in '
                             'train mode (Not recommended).')
    parser.add_argument('--K', default=2, type=int, help='num of JSR.')
    parser.add_argument('--feature_path',
                        default='data/joint-pretrain/flickr30k/region_feat_gvd_wo_bgd/trainval/',
                        type=str, help='path to the pre-computed image features')
    parser.add_argument('--region_bbox_file',
                        default='data/joint-pretrain/flickr30k/region_feat_gvd_wo_bgd/flickr30k_detection_vg_thresh0.2_feat_gvd_checkpoint_trainvaltest.h5',
                        type=str, help='path to the region_bbox_file(.h5)')
    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(open(os.path.join(
        opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb'))
    opt.vocab_size = len(vocab)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(
        opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt)

    # Construct the model
    model = VSE(opt)

    best_rsum = 0
    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
                  .format(opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
            # Free the (potentially large) checkpoint only after a
            # successful load — see docstring for the fixed NameError.
            del checkpoint
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # NOTE(review): start_epoch is read from the checkpoint but training
    # still begins at epoch 0 — confirm whether resuming the epoch counter
    # was intended.

    # Train the Model
    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)
        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)
        # evaluate on validation set
        rsum = validate(opt, val_loader, model)
        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint({
            'epoch': epoch + 1,
            'model': model.state_dict(),
            'best_rsum': best_rsum,
            'opt': opt,
            'Eiters': model.Eiters,
        }, is_best, epoch, prefix=opt.logger_name + '/')
def evalrank(model_path1, model_path2, data_path=None, split='dev', fold5=False, shared_space='both'):
    """Evaluate a pair of trained models jointly on i2t and t2i retrieval.

    Both checkpoints are loaded and encoded with their respective test
    loaders; ranks are saved to 'ranks.pth.tar'.
    """
    def _load_and_encode(path, loader_fn):
        # Load one checkpoint, rebuild its model and encode the test split.
        checkpoint = torch.load(path)
        opt = checkpoint['opt']
        print(opt)
        if data_path is not None:
            opt.data_path = data_path
        opt.vocab_path = "./vocab/"
        # load vocabulary used by the model
        vocab = pickle.load(open(os.path.join(opt.vocab_path, 'vocab.pkl'), 'rb'))
        opt.vocab_size = len(vocab)
        model = VSE(opt)
        model.load_state_dict(checkpoint['model'])
        print('Loading dataset')
        loader = loader_fn(split, opt.data_name, vocab, opt.crop_size,
                           opt.batch_size, opt.workers, opt)
        print('Computing results...')
        img_embs, cap_embs = encode_data(model, loader)
        return opt, img_embs, cap_embs

    _, img_embs1, cap_embs1 = _load_and_encode(model_path1, get_test_loader1)
    opt, img_embs2, cap_embs2 = _load_and_encode(model_path2, get_test_loader2)

    print('Images: %d, Captions: %d' % (img_embs2.shape[0] / 20, cap_embs2.shape[0]))

    # no cross-validation, full evaluation
    r, rt = i2t(img_embs1, cap_embs1, img_embs2, cap_embs2, shared_space,
                measure=opt.measure, return_ranks=True)
    ri, rti = t2i(img_embs1, cap_embs1, img_embs2, cap_embs2, shared_space,
                  measure=opt.measure, return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    ari = (ri[0] + ri[1] + ri[2]) / 3
    rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
    print("rsum: %.1f" % rsum)
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
    print("Average t2i Recall: %.1f" % ari)
    print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')
def main():
    """Active-learning training driver.

    Repeatedly asks a selection strategy for the most informative samples,
    moves them from the training pool into the active set, then retrains a
    VSE model from scratch on the grown active set each round.
    """
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='/data/stud/jorgebjorn/data/',
                        help='path to datasets')
    parser.add_argument('--data_name', default='f8k_precomp',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path', default='/data/stud/jorgebjorn/data/vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=30, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=1024, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=.0002, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=15, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument(
        '--logger_name',
        default='/data/stud/jorgebjorn/runs/{}/{}'.format(
            getpass.getuser(),
            datetime.datetime.now().strftime("%d-%m-%y_%H:%M")),
        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--selection', default='uncertainty',
                        help='Active learning selection algorithm')
    parser.add_argument('--primary', default='images',
                        help='Image- or caption-centric active learning')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation', action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim', default=4096, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune', action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--device', default=0, type=int,
                        help='which gpu to use')
    parser.add_argument('--cnn_type', default='vgg19',
                        help="""The CNN used for image encoder (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval', action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure', default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs', action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm', action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--reset_train', action='store_true',
                        help='Ensure the training is always done in '
                             'train mode (Not recommended).')
    parser.add_argument('--no_log', action='store_true', default=False,
                        help='Disable logging')
    opt = parser.parse_args()
    # Tag the run directory with the selection strategy and centricity.
    opt.logger_name += "_" + opt.selection + "_" + opt.primary
    print(opt)

    if torch.cuda.is_available():
        torch.cuda.set_device(opt.device)

    # Setup tensorboard logger
    if not opt.no_log:
        logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
        tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(
        open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb'))
    opt.vocab_size = len(vocab)

    # Load data loaders (active set, remaining training pool, validation)
    active_loader, train_loader, val_loader = data.get_loaders(
        opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt)

    # Construct the model
    model = VSE(opt)
    if torch.cuda.is_available():
        model.cuda()

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Number of active-learning acquisition rounds.
    n_rounds = 234

    # Map the CLI choice to a selection function (default: uncertainty).
    if opt.selection == "uncertainty":
        selection = select_uncertainty
    elif opt.selection == "margin":
        selection = select_margin
    elif opt.selection == "random":
        selection = select_random
    elif opt.selection == "hybrid":
        selection = select_hybrid
    elif opt.selection == "all":
        selection = select_all
    elif opt.selection == "capsim":
        selection = select_captionSimilarity
    else:
        selection = select_uncertainty

    for r in range(n_rounds):
        # Pick the next samples and move them from the pool to the active set.
        best_indices = selection(r, model, train_loader)
        for index in best_indices:
            active_loader.dataset.add_single(train_loader.dataset[index][0],
                                             train_loader.dataset[index][1])
        train_loader.dataset.delete_indices(best_indices)

        # Train the Model
        print("Training on {} items ".format(len(active_loader)))

        # Reset the model: each round retrains from scratch on the active set.
        model = VSE(opt)
        if torch.cuda.is_available():
            model.cuda()

        best_rsum = 0
        for epoch in range(opt.num_epochs):
            adjust_learning_rate(opt, model.optimizer, epoch)
            # train for one epoch
            train(opt, active_loader, model, epoch, val_loader)
            # evaluate on validation set
            rsum = validate(opt, val_loader, model, not opt.no_log, r)
def main():
    """Train the video/text VSE model configured by the module-level `opt`.

    NOTE(review): this entry point reads a module-level `opt` (argparse
    namespace with gpu_id, logger_name, resume, eval_only, …) rather than
    parsing arguments itself — confirm it is defined before main() runs.
    """
    # Hyper Parameters
    torch.cuda.set_device(opt.gpu_id)
    tb_logger.configure(opt.logger_name, flush_secs=5)
    # Log to a file inside the run directory and mirror INFO logs to console.
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO,
                        filename=opt.logger_name + '/log.log')
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)
    logging.info(opt)

    # Load Vocabulary Wrapper
    vocab_path = os.path.join(opt.vocab_path, '%s_vocab_total.pkl' % opt.data_name)
    print(vocab_path)
    vocab = pickle.load(open(vocab_path, 'rb'))
    opt.vocab_size = len(vocab)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(
        opt.data_name, vocab, opt.batch_size, opt.workers, opt)

    # Construct the model
    model = VSE(opt)
    print('Print out models:')
    print(model.clip_enc)
    print(model.txt_enc)
    print(model.vid_seq_enc)
    print(model.txt_seq_enc)

    start_epoch = 0
    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            # NOTE: this model's load_state_dict takes opt as a second arg.
            model.load_state_dict(checkpoint['model'], opt)
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
                  .format(opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
            if opt.eval_only:
                return
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    best_rsum = 0
    for epoch in range(start_epoch, opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)
        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)
        # evaluate on validation set
        rsum = validate(opt, val_loader, model)
        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint({
            'epoch': epoch + 1,
            'model': model.state_dict(opt),
            'best_rsum': best_rsum,
            'opt': opt,
            'Eiters': model.Eiters,
        }, is_best, prefix=opt.logger_name + '/', epoch=epoch)
def main():
    """Train a VSE model with optional memory-bank mining.

    When --memory_bank is set, the memory bank is (re)loaded before every
    epoch after the first and cleared again after each checkpoint save.
    """
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='/w/31/faghri/vsepp_data/',
                        help='path to datasets')
    parser.add_argument('--data_name', default='precomp',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path', default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=15, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=1024, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=.0002, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=8, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='runs/runX',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation', action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--sum_violation', action='store_true')
    parser.add_argument('--img_dim', default=4096, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune', action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type', default='vgg19',
                        help="""The CNN used for image encoder (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval', action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure', default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs', action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm', action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--reset_train', action='store_true',
                        help='Ensure the training is always done in '
                             'train mode (Not recommended).')
    parser.add_argument('--save_all', action='store_true',
                        help="Save model after the training of each epoch")
    parser.add_argument('--memory_bank', action='store_true',
                        help="Train model with memory bank")
    parser.add_argument('--record_val', action='store_true',
                        help="Record the rsum values on validation set in file during training")
    # Loss-shaping hyperparameters for the local/global objectives.
    parser.add_argument('--local_alpha', default=30.0, type=float)
    parser.add_argument('--local_ep', default=0.3, type=float)
    parser.add_argument('--global_alpha', default=40.0, type=float)
    parser.add_argument('--global_beta', default=40.0, type=float)
    parser.add_argument('--global_ep_posi', default=0.2, type=float,
                        help="Global epsilon for positive pairs")
    parser.add_argument('--global_ep_nega', default=0.1, type=float,
                        help="Global epsilon for negative pairs")
    parser.add_argument('--mb_k', default=250, type=int,
                        help="Use top K items in memory bank")
    parser.add_argument('--mb_rate', default=0.05, type=float, help="-")
    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(open(os.path.join(
        opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb'))
    opt.vocab_size = len(vocab)
    print("Vocab Size: %d" % opt.vocab_size)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(
        opt.data_name, vocab, opt.crop_size, opt.batch_size, opt.workers, opt)

    # Construct the model
    model = VSE(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
                  .format(opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    best_rsum = 0
    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)
        # Refresh the memory bank from the current model (skip epoch 0:
        # the untrained model would fill it with noise).
        memory_bank = opt.memory_bank
        if memory_bank and epoch > 0:
            load_memory_bank(opt, train_loader, model)
        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)
        # evaluate on validation set
        rsum = validate(opt, val_loader, model)
        print("rsum: %.1f" % rsum)
        if opt.record_val:
            # Append the per-epoch validation score to a run-named file.
            with open("rst_val_" + opt.logger_name[5:], "a") as f:
                f.write("Epoch: %d ; rsum: %.1f\n" % (epoch, rsum))
        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint({
            'epoch': epoch + 1,
            'model': model.state_dict(),
            'best_rsum': best_rsum,
            'opt': opt,
            'Eiters': model.Eiters,
        }, is_best, prefix=opt.logger_name + '/', save_all=opt.save_all)
        # reset memory bank
        model.mb_img = None
        model.mb_cap = None
def main():
    """Train the 'beenburger' VSE model.

    Parses options, builds data loaders and the model, optionally resumes
    from a checkpoint, then runs the training loop with staged
    change_training_state() transitions and per-epoch checkpointing.
    """
    parser = argparse.ArgumentParser()
    # Directories.
    parser.add_argument('--data_path', default='/DATA/cvpr19',
                        help='path to datasets')
    parser.add_argument('--vocab_path', default='../vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--model', type=str, default='beenburger',
                        help='model_name')
    parser.add_argument(
        '--save_dir', type=str, default='coco2',
        help='save checkpoint and results in DATA_PATH/MODEL_NAME/SAVE_DIR')
    # Dataset.
    parser.add_argument('--data_name', default='coco', help='{coco|ours}')
    parser.add_argument('--use_restval', default='True', type=str2bool,
                        help='Use the restval data for training on MSCOCO.')
    # Model configurations.
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--K', default=620, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--num_layers', default=4, type=int,
                        help='Number of SRU layers.')
    # These defaults were string literals ('2048', ...); argparse passes
    # string defaults through `type`, so plain ints are equivalent and clearer.
    parser.add_argument('--D', type=int, default=2048,
                        help='dimension of image feature from ResNet')
    parser.add_argument('--D_prime', type=int, default=2400,
                        help='dimension of adaptation + pooling')
    parser.add_argument('--d', type=int, default=2400,
                        help='Dimensionality of the joint embedding')
    # Training configurations.
    parser.add_argument('--num_epochs', default=30, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=160, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--img_size', default=256, type=int,
                        help='image_size')
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--learning_rate', default=.001, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_decay', type=float, default=0.5,
                        help='learning rate decay')
    parser.add_argument('--workers', default=4, type=int,
                        help='Number of data loader workers.')
    # Miscellaneous.
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='../runs/runX',
                        help='Path to save the model and Tensorboard log.')

    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(
        open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
             'rb'))
    opt.vocab_size = len(vocab)
    opt.vocab = vocab

    # Create directories
    create_directory(opt.data_path, opt.model, opt.save_dir)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab,
                                                opt.crop_size, opt.batch_size,
                                                opt.workers, opt)

    # Construct the model
    model = VSE(opt)

    # BUG FIX: initialise best_rsum/start_epoch *before* the resume branch.
    # Previously they were reset to 0 unconditionally after resuming, so a
    # resumed run restarted at epoch 0 and forgot the best validation score
    # even though the checkpoint's values had just been loaded.
    best_rsum = 0
    start_epoch = 0

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            # Replay the staged training-state transitions up to start_epoch.
            model.change_training_state(0)
            if start_epoch > 2:
                model.change_training_state(2)
            if start_epoch > 8:
                model.change_training_state(8)
            best_rsum = checkpoint['best_rsum']
            model.optimizer = checkpoint['optim']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    for epoch in range(start_epoch, opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)
        model.change_training_state(epoch)

        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)

        # evaluate on validation set
        rsum = validate(opt, val_loader, model)

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
                'optim': model.optimizer,
            },
            is_best,
            prefix=os.path.join(opt.data_path, opt.model, opt.save_dir))
def evalrank(model_path, data_path=None, split='dev', fold5=False,
             region_bbox_file=None, feature_path=None):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.

    Side effects: writes the image-caption and caption-caption similarity
    matrices to .npy files in the working directory and prints recall
    metrics computed by simrank().
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    # Optional overrides of the paths stored in the checkpoint.
    if data_path is not None:
        opt.data_path = data_path
    if region_bbox_file is not None:
        opt.region_bbox_file = region_bbox_file
    if feature_path is not None:
        opt.feature_path = feature_path

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
              'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)
    print(opt)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    time_sim_start = time.time()

    if not fold5:
        # Every image is repeated 5 times (one per caption): keep one copy.
        img_emb_new = img_embs[0:img_embs.size(0):5]
        print(img_emb_new.size())

        # image-to-caption and caption-to-caption similarity matrices
        sims = torch.mm(img_emb_new, cap_embs.t())
        sims_T = torch.mm(cap_embs, cap_embs.t())
        sims_T = sims_T.cpu().numpy()
        sims = sims.cpu().numpy()
        np.save('sims_f.npy', sims)
        np.save('sims_f_T.npy', sims_T)

        print('Images: %d, Captions: %d' %
              (img_embs.shape[0] / 5, cap_embs.shape[0]))
        r = simrank(sims)

        time_sim_end = time.time()
        print('sims_time:%f' % (time_sim_end - time_sim_start))
        del sims
    else:  # fold5-especially for coco
        # First: full 5k evaluation over the entire test set.
        print('5k---------------')
        img_emb_new = img_embs[0:img_embs.size(0):5]
        print(img_emb_new.size())
        sims = torch.mm(img_emb_new, cap_embs.t())
        sims_T = torch.mm(cap_embs, cap_embs.t())
        sims = sims.cpu().numpy()
        sims_T = sims_T.cpu().numpy()
        np.save('sims_full_5k.npy', sims)
        np.save('sims_full_T_5k.npy', sims_T)

        print('Images: %d, Captions: %d' %
              (img_embs.shape[0] / 5, cap_embs.shape[0]))
        r = simrank(sims)
        time_sim_end = time.time()
        print('sims_time:%f' % (time_sim_end - time_sim_start))
        del sims, sims_T

        # Then: five 1k folds, metrics averaged over the folds.
        print('1k---------------')
        r_ = [0, 0, 0, 0, 0, 0, 0]
        for i in range(5):
            print(i)
            # Slice out fold i (1000 images = one fifth of the 5000 pairs).
            img_emb_new = img_embs[
                i * 5000:int(i * 5000 + img_embs.size(0) / 5):5]
            cap_emb_new = cap_embs[
                i * 5000:int(i * 5000 + cap_embs.size(0) / 5)]
            sims = torch.mm(img_emb_new, cap_emb_new.t())
            sims_T = torch.mm(cap_emb_new, cap_emb_new.t())
            sims_T = sims_T.cpu().numpy()
            sims = sims.cpu().numpy()
            # NOTE: np.save appends '.npy' automatically when missing.
            np.save('sims_full_%d.npy' % i, sims)
            np.save('sims_full_T_%d' % i, sims_T)
            print('Images: %d, Captions: %d' %
                  (img_emb_new.size(0), cap_emb_new.size(0)))
            r = simrank(sims)
            # accumulate per-fold metrics for averaging below
            r_ = np.array(r_) + np.array(r)
            del sims
            print('--------------------')

        # average over the 5 folds and report
        r_ = tuple(r_ / 5)
        print('I2T:%.1f %.1f %.1f' % r_[0:3])
        print('T2I:%.1f %.1f %.1f' % r_[3:6])
        print('Rsum:%.1f' % r_[-1])
def main():
    """Evaluate an ActivityNet-captions VSE model from a checkpoint.

    Parses hyper-parameters, rebuilds the model and data loaders, restores
    the (required) --resume checkpoint, and runs validate() on the val set.
    No training is performed by this entry point.
    """
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='/data1/hexianghu/activitynet/captions/',
                        help='path to datasets')
    parser.add_argument('--data_name', default='anet_precomp',
                        help='anet_precomp')
    parser.add_argument('--vocab_path', default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=50, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=64, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=1024, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=0., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=.001, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=10, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='runs/runX',
                        help='Path to save the model and Tensorboard log.')
    # --resume is required: this script only evaluates an existing checkpoint.
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        required=True,
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation', action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim', default=500, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--measure', default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs', action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm', action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--gpu_id', default=0, type=int,
                        help='GPU to use.')
    parser.add_argument('--rnn_type', default='maxout',
                        choices=['maxout', 'seq2seq', 'attention'],
                        help='Type of recurrent model.')
    parser.add_argument('--img_first_size', default=1024, type=int,
                        help='first img layer emb size')
    parser.add_argument('--cap_first_size', default=1024, type=int,
                        help='first cap layer emb size')
    parser.add_argument('--img_first_dropout', default=0, type=float,
                        help='first img layer emb size')
    parser.add_argument('--cap_first_dropout', default=0, type=float,
                        help='first cap layer emb size')

    opt = parser.parse_args()
    print(opt)

    torch.cuda.set_device(opt.gpu_id)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    vocab = pickle.load(open(os.path.join(
        opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb'))
    opt.vocab_size = len(vocab)

    # Load data loaders (precomp data: no crop_size argument here)
    train_loader, val_loader = data.get_loaders(
        opt.data_name, vocab, opt.batch_size, opt.workers, opt)

    # Construct the model
    model = VSE(opt)
    print('Print out models:')
    print(model.img_enc)
    print(model.txt_enc)
    print(model.img_seq_enc)
    print(model.txt_seq_enc)

    # optionally resume from a checkpoint
    if os.path.isfile(opt.resume):
        print("=> loading checkpoint '{}'".format(opt.resume))
        checkpoint = torch.load(opt.resume)
        start_epoch = checkpoint['epoch']
        best_rsum = checkpoint['best_rsum']
        model.load_state_dict(checkpoint['model'])
        # Eiters is used to show logs as the continuation of another
        # training
        model.Eiters = checkpoint['Eiters']
        print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
              .format(opt.resume, start_epoch, best_rsum))
        validate(opt, val_loader, model)
    else:
        print("=> no checkpoint found at '{}'".format(opt.resume))
def evalrank(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.

    Prints recall@K metrics in both directions (image->text, text->image)
    and saves the rank arrays to 'ranks.pth.tar'.
    """
    # Restore the training options stored inside the checkpoint, optionally
    # overriding the dataset location.
    state = torch.load(model_path)
    opt = state['opt']
    if data_path is not None:
        opt.data_path = data_path

    # Vocabulary the checkpoint was trained with.
    vocab_file = os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name)
    with open(vocab_file, 'rb') as fin:
        vocab = pickle.load(fin)
    opt.vocab_size = len(vocab)

    # Rebuild the model and restore its weights.
    model = VSE(opt)
    model.load_state_dict(state['model'])

    print('Loading dataset')
    loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                             opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, loader)
    print('Images: %d, Captions: %d' %
          (img_embs.shape[0] / 5, cap_embs.shape[0]))

    if not fold5:
        # no cross-validation, full evaluation
        r, rt = i2t(img_embs, cap_embs, measure=opt.measure,
                    return_ranks=True)
        ri, rti = t2i(img_embs, cap_embs, measure=opt.measure,
                      return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:
        # 5fold cross-validation, only for MSCOCO
        results = []
        for fold in range(5):
            # Each fold covers 1000 images x 5 captions = 5000 rows.
            lo, hi = fold * 5000, (fold + 1) * 5000
            fold_imgs = img_embs[lo:hi]
            fold_caps = cap_embs[lo:hi]

            r, rt0 = i2t(fold_imgs, fold_caps, measure=opt.measure,
                         return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i(fold_imgs, fold_caps, measure=opt.measure,
                           return_ranks=True)
            # Keep the rank arrays of the first fold for saving below.
            if fold == 0:
                rt, rti = rt0, rti0
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)

            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % (mean_metrics[10] * 6))
        print("Average i2t Recall: %.1f" % mean_metrics[11])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[12])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[5:10])

    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')
    def __init__(self, model_path, img_path, precomp_path, split, vocab_path,
                 batch_size):
        """Load a trained VSE checkpoint and precompute all image embeddings.

        model_path: checkpoint file containing 'opt' and 'model' entries.
        img_path: root of the raw 'coco' image data (loader kept for display).
        precomp_path: root of the 'coco_precomp' precomputed features.
        split: dataset split to load (e.g. 'dev', 'test').
        vocab_path: pickle file with the vocabulary wrapper.
        batch_size: batch size used by both loaders.
        """
        checkpoint = torch.load(model_path)
        self.opt = checkpoint['opt']

        # Loader over raw images (no vocab needed) — used for visualisation.
        self.opt.data_path = img_path
        self.opt.data_name = 'coco'
        self.opt.batch_size = batch_size
        self.img_loader = get_test_loader(split, self.opt.data_name, None,
                                          self.opt.crop_size,
                                          self.opt.batch_size,
                                          self.opt.workers, self.opt)

        # Loader over precomputed features — the source of the embeddings.
        self.opt.data_path = precomp_path
        self.opt.data_name = 'coco_precomp'

        # load vocabulary used by the model
        with open(vocab_path, 'rb') as f:
            self.vocab = pickle.load(f)
        self.vocab_size = len(self.vocab)
        self.precomp_loader = get_test_loader(split, self.opt.data_name,
                                              self.vocab, self.opt.crop_size,
                                              self.opt.batch_size,
                                              self.opt.workers, self.opt)
        self.model = VSE(self.opt)

        # load model state
        self.model.load_state_dict(checkpoint['model'])

        # precompute all image embeddings
        self.model.val_start()

        # numpy array to keep all the embeddings
        self.img_embs = None
        batch_time = AverageMeter()
        val_logger = LogCollector()
        end = time.time()
        log_step = 10
        for i, (images, captions, lengths, ids) in enumerate(
                self.precomp_loader):
            self.model.logger = val_logger

            # compute the embeddings
            # NOTE(review): Variable(..., volatile=True) is the pre-0.4
            # PyTorch inference API — this code targets an old torch version.
            images = Variable(images, volatile=True)
            if torch.cuda.is_available():
                images = images.cuda()

            # Forward
            img_emb = self.model.img_enc(images)

            # initialize the numpy arrays given the size of the embeddings
            if self.img_embs is None:
                self.img_embs = np.zeros(
                    (len(self.precomp_loader.dataset), img_emb.size(1)))

            # preserve the embeddings by copying from gpu and converting to numpy
            self.img_embs[ids] = img_emb.data.cpu().numpy().copy()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % log_step == 0:
                print('Test: [{0}/{1}]\t'
                      '{e_log}\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'.
                      format(i, len(self.precomp_loader),
                             batch_time=batch_time,
                             e_log=str(self.model.logger)))
            del images, captions
class ImageRetriever(object):
    """Text-to-image retrieval helper around a trained VSE model.

    On construction it embeds every image of the chosen split once; get_NN()
    then embeds a text query and returns the nearest image indices, and
    visualise_NN() renders the retrieved images into an HTML page.
    """

    def __init__(self, model_path, img_path, precomp_path, split, vocab_path,
                 batch_size):
        """Load the checkpoint, build both loaders, precompute image embeddings.

        model_path: checkpoint file containing 'opt' and 'model' entries.
        img_path: root of the raw 'coco' image data (loader kept for display).
        precomp_path: root of the 'coco_precomp' precomputed features.
        split: dataset split to load (e.g. 'dev', 'test').
        vocab_path: pickle file with the vocabulary wrapper.
        batch_size: batch size used by both loaders.
        """
        checkpoint = torch.load(model_path)
        self.opt = checkpoint['opt']

        # Loader over raw images (no vocab needed) — used for visualisation.
        self.opt.data_path = img_path
        self.opt.data_name = 'coco'
        self.opt.batch_size = batch_size
        self.img_loader = get_test_loader(split, self.opt.data_name, None,
                                          self.opt.crop_size,
                                          self.opt.batch_size,
                                          self.opt.workers, self.opt)

        # Loader over precomputed features — the source of the embeddings.
        self.opt.data_path = precomp_path
        self.opt.data_name = 'coco_precomp'

        # load vocabulary used by the model
        with open(vocab_path, 'rb') as f:
            self.vocab = pickle.load(f)
        self.vocab_size = len(self.vocab)
        self.precomp_loader = get_test_loader(split, self.opt.data_name,
                                              self.vocab, self.opt.crop_size,
                                              self.opt.batch_size,
                                              self.opt.workers, self.opt)
        self.model = VSE(self.opt)

        # load model state
        self.model.load_state_dict(checkpoint['model'])

        # precompute all image embeddings
        self.model.val_start()

        # numpy array to keep all the embeddings
        self.img_embs = None
        batch_time = AverageMeter()
        val_logger = LogCollector()
        end = time.time()
        log_step = 10
        for i, (images, captions, lengths, ids) in enumerate(
                self.precomp_loader):
            self.model.logger = val_logger

            # compute the embeddings
            # NOTE(review): Variable(..., volatile=True) is the pre-0.4
            # PyTorch inference API — this code targets an old torch version.
            images = Variable(images, volatile=True)
            if torch.cuda.is_available():
                images = images.cuda()

            # Forward
            img_emb = self.model.img_enc(images)

            # initialize the numpy arrays given the size of the embeddings
            if self.img_embs is None:
                self.img_embs = np.zeros(
                    (len(self.precomp_loader.dataset), img_emb.size(1)))

            # preserve the embeddings by copying from gpu and converting to numpy
            self.img_embs[ids] = img_emb.data.cpu().numpy().copy()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % log_step == 0:
                print('Test: [{0}/{1}]\t'
                      '{e_log}\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'.
                      format(i, len(self.precomp_loader),
                             batch_time=batch_time,
                             e_log=str(self.model.logger)))
            del images, captions

    def get_NN(self, query, measure='dot', k=5):
        """Embed a text `query` and return its k nearest image embeddings."""
        # convert query from string to index
        # NOTE(review): .decode('utf-8') on a str only exists on Python 2 —
        # this line breaks on Python 3; confirm the target interpreter.
        tokens = nltk.tokenize.word_tokenize(
            str(query).lower().decode('utf-8'))
        query = []
        query.append(self.vocab('<start>'))
        query.extend([self.vocab(token) for token in tokens])
        query.append(self.vocab('<end>'))

        # embedd query
        query = Variable(torch.LongTensor(query), volatile=True)
        if torch.cuda.is_available():
            query = query.cuda()

        # Forward
        q_emb = self.model.txt_enc(query, [query.size(0)])
        q_embs = q_emb.data.cpu().numpy().copy()

        # run nearest neighbours searching text -> image
        return self.find_NN(self.img_embs, q_embs, measure=measure, k=k)

    def find_NN(self, images, query, measure, k, npts=None):
        """
        Text->Images (Image Search)
        Images: (5N, K) matrix of images
        query: (1, K) matrix of captions
        """
        if npts is None:
            # NOTE(review): true division yields a float on Python 3, but
            # npts is never used below, so this is harmless here.
            npts = images.shape[0] / 5
        # every image appears 5 times (once per caption); keep one copy each
        ims = numpy.array([images[i] for i in range(0, len(images), 5)])

        # Compute scores
        # NOTE(review): time.clock() was removed in Python 3.8 — another
        # sign this module targets Python 2 / early Python 3.
        tic = time.clock()
        if measure == 'order':
            d2 = order_sim(
                torch.Tensor(ims).cuda(), torch.Tensor(query).cuda())
            d2 = d2.cpu().numpy()
            d = d2.T
        else:
            d = numpy.dot(
                query, ims.T
            )  # TODO Try to optimize this computation, see if sorting is bottleneck
        imgs_by_similarity = numpy.argsort(numpy.squeeze(d))[::-1]
        toc = time.clock()
        print('NN search took {} ms over {} images'.format(
            (toc - tic) * 1000.0, ims.shape[0]))
        # indices are scaled by 5 to map back into the 5-captions-per-image
        # layout used by the raw image loader
        return imgs_by_similarity[0:k] * 5, query, ims

    def visualise_NN(self, inds, img_identifier, file_name):
        """Save the retrieved images to ./out/ and write an HTML page
        embedding them (one <img> tag per retrieved index)."""
        html_file = open(file_name, "w")
        for i, ind in enumerate(inds):
            root, caption, img_id, path, image = \
                self.img_loader.dataset.get_raw_item(ind)
            image.save("./out/{}_img{}.png".format(img_identifier, i + 1))
            img_tag = '<img src="./out/{}_img{}.png" style="max-height: 400px; max-width: 400px;">'\
                .format(img_identifier, i + 1)
            html_file.write(img_tag)
        html_file.close()
def main():
    """Evaluation entry point for the minicsdv2 experiments.

    Parses hyper-parameters, builds the loaders and model, restores the
    --resume checkpoint if present, and runs a single validate() pass.
    """
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='/home/dcsaero01/data/datasets/vsepp/',
                        help='path to datasets')
    parser.add_argument('--data_name', default='minicsdv2_precomp',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path', default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=300, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=1024, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=.0002, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=15, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument(
        '--dropout_value', default=0, type=float,
        help='Probability value for dropout after linear layer')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='runs/runX',
                        help='Path to save the model and Tensorboard log.')
    # Resume defaults to a concrete checkpoint path (evaluation use case).
    parser.add_argument(
        '--resume',
        default=
        '/home/dcsaero01/data/projects/vsepp/runs/minicsdv2/checkpoint.pth.tar',
        type=str, metavar='PATH',
        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation', action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim', default=4096, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--text_dim', default=6000, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune', action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type', default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval', action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure', default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs', action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm', action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--reset_train', action='store_true',
                        help='Ensure the training is always done in '
                        'train mode (Not recommended).')
    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    # The self-trained caption variants carry their own text features and
    # need no vocabulary.
    if opt.data_name == 'coco_st_precomp' or opt.data_name == 'coco_st_ner_precomp':
        vocab = None
        opt.vocab_size = 0
    else:
        vocab = pickle.load(
            open(os.path.join(opt.vocab_path,
                              '%s_vocab.pkl' % opt.data_name), 'rb'))
        opt.vocab_size = len(vocab)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab,
                                                opt.crop_size, opt.batch_size,
                                                opt.workers, opt)

    # Construct the model
    model = VSE(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            # NOTE(review): validates on the *train* loader — possibly
            # intentional (sanity check), but verify against val_loader.
            validate(opt, train_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))
# NOTE(review): stray triple-quote below appears to open a commented-out
# region that continues beyond this chunk of the file.
"""
def main():
    """Train a (multilingual) VSE model on Multi30K / COCO / F30K.

    Parses hyper-parameters, seeds all RNGs, builds the loaders and model,
    optionally restores a checkpoint, then either trains on a single
    dataset with early stopping (patience on validation rsum) or delegates
    to joint_train() for multi-language training.
    """
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default=".", help='path to datasets')
    parser.add_argument(
        '--data_name', default='m30k',
        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k|m30k')
    parser.add_argument(
        '--lang', default='en',
        help='Which language(s) to use from m30k, en-de, trains on en+de.')
    parser.add_argument('--sentencepair', action='store_true',
                        help='Train caption-caption ranking as well.')
    parser.add_argument(
        '--sentencepair_p', default=0.5, type=float,
        help='Probability of training on caption-caption and not image-caption.'
    )
    parser.add_argument(
        '--primary', default=None,
        help=
        'Which language to monitor for early stopping. Multiple with l1-l2-l3')
    parser.add_argument(
        '--undersample', action='store_true',
        help='Pick only one of the 5 possilbe captions for m30k task 2.')
    parser.add_argument('--half', action='store_true',
                        help='Use only half of the M30K from task 2.')
    parser.add_argument('--disaligned', action='store_true',
                        help='Use only half of the M30K from task 2.')
    parser.add_argument('--vocab_path', default='.',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=30, type=int,
                        help='Number of training epochs.')
    parser.add_argument(
        '--patience', default=10, type=int,
        help='Number of validation steps to tolerate without improvement.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=1024, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=.0002, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=15, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument(
        '--logger_path', default='.',
        help='Path where to save the model and Tensorboard log.')
    parser.add_argument(
        '--logger_name',
        help='Name of the folder where to save the model and Tensorboard log.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    # --sum_violation *clears* max_violation (store_false into
    # dest="max_violation"), so the max-violation loss is the default.
    parser.add_argument('--sum_violation', dest="max_violation",
                        action='store_false',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim', default=2048, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune', action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type', default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval', action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure', default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs', action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm', action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--reset_train', action='store_true',
                        help='Ensure the training is always done in '
                        'train mode (Not recommended).')
    parser.add_argument('--seed', default=42, type=int, help='Random seed.')
    opt = parser.parse_args()

    # Newer PyTorch requires explicit train()/eval() switching every epoch.
    if torch.__version__ >= "0.3":
        opt.reset_train = True

    opt.vocab_path = os.path.join(opt.vocab_path, "vocab")
    # Derive a descriptive run name when none was given.
    if opt.logger_name is None:
        name = "lang{}_half-{}_undersample-{}_disaligned-{}_sentencepair-{}_primary-{}_epochs-{}"
        name = name.format(opt.lang, opt.half, opt.undersample, opt.disaligned,
                           opt.sentencepair, opt.primary, opt.num_epochs)
        opt.logger_name = os.path.join(opt.data_name, name)
    opt.logger_name = os.path.join(opt.logger_path, opt.logger_name,
                                   str(opt.seed))
    print(opt)

    # Seed all RNGs (rseed is a module-level base seed).
    random.seed(rseed + opt.seed)
    np.random.seed(rseed + opt.seed)
    torch.cuda.manual_seed(rseed + opt.seed)
    torch.cuda.manual_seed_all(rseed + opt.seed)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # For multi30k compute vocabulary mappings on the fly.
    if opt.data_name == "m30k":
        vocab = None
        langs = opt.lang.split("-")
    # Load Vocabulary Wrapper for COCO or F30K
    else:
        vocab = pickle.load(
            open(os.path.join(opt.vocab_path,
                              '%s_vocab.pkl' % opt.data_name), 'rb'))
        opt.vocab_size = len(vocab)
        langs = [opt.data_name]

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab,
                                                opt.crop_size, opt.batch_size,
                                                opt.workers, opt)

    # Construct the model
    model = VSE(opt)
    print(model.txt_enc)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model, "")
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    if len(langs) == 1 or opt.data_name != 'm30k':
        # Train the Model on a single data set
        best_rsum = 0
        # BUG FIX: patience_count must exist before the first epoch without
        # improvement; previously it was only created inside the `is_best`
        # branch, so a run whose very first epoch did not improve raised
        # NameError on `patience_count += 1`.
        patience_count = 0
        model.train_start()
        for epoch in range(opt.num_epochs):
            if opt.reset_train:
                # Always reset to train mode, this is not the default behavior
                model.train_start()
            adjust_learning_rate(opt, model.optimizer, epoch)

            # train for one epoch
            train(opt, train_loader, model, epoch, val_loader)

            # evaluate on validation set
            rsum = validate(opt, val_loader, model, langs[0])

            # remember best R@ sum and save checkpoint
            is_best = rsum > best_rsum
            best_rsum = max(rsum, best_rsum)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model': model.state_dict(),
                    'best_rsum': best_rsum,
                    'opt': opt,
                    'Eiters': model.Eiters,
                },
                is_best,
                prefix=opt.logger_name + '/')

            # Early stopping on validation rsum.
            if is_best:
                patience_count = 0
                print("New best: {}".format(best_rsum))
            else:
                patience_count += 1
                print("No improvement in {}".format(patience_count))
            if patience_count == opt.patience:
                # (message typo "stoppin" fixed)
                print("No improvement in {} epochs, stopping".format(
                    patience_count))
                break
    else:
        # Multiple languages: delegate to the joint training loop.
        joint_train(opt, train_loader, model, val_loader)
def main():
    """Train a VSE model on COCO/F8K/F30K data.

    Parses CLI hyper-parameters, loads the vocabulary and data loaders,
    builds the model, optionally resumes from a checkpoint, then runs the
    per-epoch train / validate / save-checkpoint loop.
    """
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='/media/shenkev/data/Ubuntu/vsepp/data/data',
                        help='path to datasets')
    parser.add_argument('--data_name', default='coco_precomp',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path', default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=30, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=1024, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=.0002, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=15, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='runs/coco_vse',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    # NOTE(review): default=True combined with action='store_true' means this
    # flag is ALWAYS True and cannot be disabled from the CLI — confirm this
    # is intended before changing (fixing the default alters training loss).
    parser.add_argument('--max_violation', default=True, action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim', default=4096, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune', action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type', default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval', action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure', default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs', action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm', action='store_true',
                        help='Do not normalize the image embeddings.')
    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper; model/CLI vocab size is derived from it.
    vocab = pickle.load(
        open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name),
             'rb'))
    opt.vocab_size = len(vocab)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab,
                                                opt.crop_size, opt.batch_size,
                                                opt.workers, opt)

    # Construct the model
    model = VSE(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            # Sanity-check the restored weights before resuming training.
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    best_rsum = 0
    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)

        # evaluate on validation set
        rsum = validate(opt, val_loader, model)

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
            },
            is_best,
            prefix=opt.logger_name + '/')
def main():
    """Train a VSE model on video features, then dump validation features.

    Parses CLI hyper-parameters, builds the data loaders and model,
    optionally resumes from a checkpoint, runs the training loop (saving a
    checkpoint each epoch), and finally extracts validation-set features
    with ``eval_feat`` into ``opt.storage_place``.
    """
    # Training hyper-parameters (comments translated from Chinese)
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_type', default='rgb',
                        help='path to datasets')
    parser.add_argument('--feature_path', default='',
                        help='path to datasets')
    parser.add_argument('--anno_path', default='',
                        help='path to datasets')
    parser.add_argument('--feature_prefix', default='',
                        help='prefix of feaeture')
    # FIX: explicit type= added to the three options below so values passed
    # on the command line are parsed (previously they stayed raw strings,
    # while the defaults were already float/int).
    parser.add_argument('--dropout', default=0.5, type=float,
                        help='prefix of feature')
    parser.add_argument('--split_video_file', default='',
                        help='prefix of feature')
    parser.add_argument('--num_pos_sample', default=10, type=int,
                        help='prefix of feature')
    parser.add_argument('--num_neg_sample', default=10, type=int,
                        help='prefix of feature')
    parser.add_argument('--num_epochs', default=30, type=int,
                        help='Number of training epochs')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch')
    # FIX: was `typr=int` — argparse raises
    # TypeError("unexpected keyword argument 'typr'") as soon as this
    # add_argument call runs, so main() could never even parse its args.
    parser.add_argument('--embed_size', default=10240, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2, type=float,
                        help='Gradient clipping threshold')
    parser.add_argument('--learning_rate', default=.001, type=float,
                        help='Initial learning rate')
    parser.add_argument('--lr_update', default=15, type=int,
                        help='Number of epochs to update the learning rate')
    # NOTE(review): flag name looks like a typo for --workers, but renaming
    # changes both the CLI and the resulting `opt.wrokers` attribute —
    # confirm against data.get_loaders before fixing.
    parser.add_argument('--wrokers', default=10, type=int,
                        help='Number of data loader workers')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation')
    parser.add_argument('--logger_name', default='runs/runX',
                        help='Path to save the model and Tensorboard log')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint default=none')
    parser.add_argument('--storage_place', default='', type=str,
                        metavar='Path',
                        help='path to latest checkpoint default=none')
    parser.add_argument('--instance_data_path', default='', type=str,
                        metavar='PATH',
                        help='path to the latest checkpoint')
    opt = parser.parse_args()

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load data loaders  #1
    train_loader, val_loader = data.get_loaders(opt)

    # Construct the model  #2
    model = VSE(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    best_rsum = 0
    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        # train for one epoch  #3
        train(opt, train_loader, model, epoch)

        # NOTE(review): no validation is run — rsum is hard-coded to 0, so
        # is_best is always False and a "best" checkpoint is never written.
        # Confirm this is intentional (feature extraction below may be the
        # real evaluation path).
        rsum = 0

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
            },
            is_best,
            prefix=opt.logger_name + '/')

    # #4: dump features for the validation set; val_loader yields
    # (feat, id, labels) per the original comment.
    eval_feat(val_loader, model, opt.storage_place)
def main():
    """Train a VSE model with a configurable text encoder (seam-e et al.).

    Parses CLI hyper-parameters, builds tokenizer/data loaders/model,
    optionally resumes from a checkpoint, then runs the per-epoch
    train / validate / save-checkpoint loop.
    """
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='/A/VSE/data/',
                        help='path to datasets')
    parser.add_argument(
        '--data_name',
        default='resnet152_precomp',
        help='{coco,f8k,f30k,10crop,irv2,resnet152}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path', default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.05, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=30, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument(
        '--embed_size',
        default=1024,
        type=int,
        help=
        'Dimensionality of the joint embedding. [NOTE: this is used only if <embed_size> differs from <gru_units>]'
    )
    parser.add_argument('--gru_units', default=1024, type=int,
                        help='Number of GRU neurons.')
    parser.add_argument('--grad_clip', default=1., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=.001, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=15, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='runs/runX',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation', action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim', default=2048, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--finetune', action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type', default='vgg19',
                        help="""The CNN used for image encoder
                        (e.g. vgg19, resnet152)""")
    parser.add_argument('--use_restval', action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure', default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument(
        '--test_measure',
        default=None,
        help=
        'Similarity used for retrieval (None<same used for training>|cosine|order)'
    )
    parser.add_argument('--use_abs', action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--no_imgnorm', action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--text_encoder', default='seam-e',
                        choices=text_encoders.text_encoders_alias.keys())
    parser.add_argument(
        '--att_units',
        default=300,
        type=int,
        help=
        'Number of tanh neurons. When using --att_dim=None we apply a tanh directly to the att input. '
    )
    parser.add_argument('--att_hops', default=30, type=int,
                        help='Number of attention hops (viewpoints).')
    parser.add_argument(
        '--att_coef',
        default=0.,
        type=float,
        help='Influence of Frobenius divergence in the loss function.')
    opt = parser.parse_args()
    # Retrieval falls back to the training similarity measure when no
    # explicit --test_measure is given.
    if opt.test_measure is None:
        opt.test_measure = opt.measure
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Tokenizer replaces the pickled vocab wrapper used by other trainers.
    tokenizer, vocab_size = data.get_tokenizer(opt.vocab_path, opt.data_name)
    opt.vocab_size = vocab_size
    # NOTE(review): a string, not a callable — presumably resolved by name
    # inside data.get_loaders; confirm against that function.
    collate_fn = 'collate_fn'

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, tokenizer,
                                                opt.crop_size, opt.batch_size,
                                                opt.workers, opt, collate_fn)

    # Construct the model
    model = VSE(opt)
    print(model.txt_enc)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            # Sanity-check the restored weights before resuming training.
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    best_rsum = 0
    for epoch in range(opt.num_epochs):
        adjust_learning_rate(opt, model.optimizer, epoch)

        # train for one epoch
        train(opt, train_loader, model, epoch, val_loader)

        # evaluate on validation set
        rsum = validate(opt, val_loader, model)

        # remember best R@ sum and save checkpoint
        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
            },
            is_best,
            prefix=opt.logger_name + '/')
def create_model(opt, ema=False): model = VSE(opt, ema) return model