def eval_model(model, dataset_config, image_size, device):
    # extract query features
    query = get_test_loader(root=os.path.join(dataset_config.root, dataset_config.query),
                            batch_size=512,
                            image_size=image_size,
                            num_workers=16)
    query_feat = []
    query_label = []
    query_cam_id = []
    for data, label, cam_id, _ in query:
        feat = model(data.cuda(non_blocking=True))
        query_feat.append(feat.data.cpu().numpy())
        query_label.append(label.data.cpu().numpy())
        query_cam_id.append(cam_id.data.cpu().numpy())
    query_feat = np.concatenate(query_feat, axis=0)
    query_label = np.concatenate(query_label, axis=0)
    query_cam_id = np.concatenate(query_cam_id, axis=0)

    # extract gallery features
    gallery = get_test_loader(root=os.path.join(dataset_config.root, dataset_config.gallery),
                              batch_size=512,
                              image_size=image_size,
                              num_workers=16)
    gallery_feat = []
    gallery_label = []
    gallery_cam_id = []
    for data, label, cam_id, _ in gallery:
        feat = model(data.cuda(non_blocking=True))
        gallery_feat.append(feat.data.cpu().numpy())
        # move labels and camera ids to numpy, mirroring the query branch
        gallery_label.append(label.data.cpu().numpy())
        gallery_cam_id.append(cam_id.data.cpu().numpy())
    gallery_feat = np.concatenate(gallery_feat, axis=0)
    gallery_label = np.concatenate(gallery_label, axis=0)
    gallery_cam_id = np.concatenate(gallery_cam_id, axis=0)

    mAP, r1, r5, r10 = eval_feature(query_feat, gallery_feat,
                                    query_label, query_cam_id,
                                    gallery_label, gallery_cam_id, device)
    print('mAP = %f , r1 precision = %f , r5 precision = %f , r10 precision = %f'
          % (mAP, r1, r5, r10))
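# Note: eval_feature is defined elsewhere in this repository. As a reference,
# here is a minimal, hypothetical sketch of the cosine-similarity ranking such
# a re-ID evaluator typically performs (the helper name and the exact protocol,
# e.g. the usual same-camera filtering, are assumptions, not this repo's code):
def rank_queries_sketch(query_feat, gallery_feat, query_label, gallery_label,
                        topk=(1, 5, 10)):
    # L2-normalize, then rank the gallery for every query by cosine similarity
    q = query_feat / np.linalg.norm(query_feat, axis=1, keepdims=True)
    g = gallery_feat / np.linalg.norm(gallery_feat, axis=1, keepdims=True)
    order = np.argsort(-(q @ g.T), axis=1)          # best gallery match first
    matches = gallery_label[order] == query_label[:, None]
    # CMC-style top-k accuracy (camera-id filtering omitted for brevity)
    return [float(matches[:, :k].any(axis=1).mean()) for k in topk]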
def debug_show_similarity_with_manually_created_examples(model_path, data_path=None,
                                                         split='test'):
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.use_external_captions = False
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    img_embs, cap_embs = encode_data(model, data_loader)
    img_embs = img_embs[:100]
    cap_embs = cap_embs[:100]

    data_loader_ex_0 = get_text_loader(split, opt.data_name, vocab, opt.batch_size,
                                       opt.workers, opt, 'manually_ex_%d' % 0)
    encoding_0 = encode_data(model, data_loader_ex_0)[1]
    data_loader_ex_1 = get_text_loader(split, opt.data_name, vocab, opt.batch_size,
                                       opt.workers, opt, 'manually_ex_%d' % 1)
    encoding_1 = encode_data(model, data_loader_ex_1)[1]

    print('Computing results...')
    # compute similarity of each query image against the three caption sets
    result = list()
    result_0 = list()
    result_1 = list()
    npts = img_embs.shape[0] // 5
    for index in range(npts):
        # Get query image
        im = img_embs[5 * index].reshape(1, img_embs.shape[1])
        # Compute scores
        if opt.measure == 'order':
            raise Exception('Measure order not supported.')
        else:
            result.append(numpy.dot(im, cap_embs.T).flatten())
            result_0.append(numpy.dot(im, encoding_0.T).flatten())
            result_1.append(numpy.dot(im, encoding_1.T).flatten())

    torch.save({'orig': result, 'Tete': result_0, 'Haoyue': result_1},
               'shy_runs/debug.pt')
def evalrank(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    print(opt)
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    vocab = deserialize_vocab(os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    opt.vocab_size = len(vocab)

    # construct model
    model = Local_Alignment(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs, cap_lens = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' % (img_embs.shape[0], cap_embs.shape[0]))

    # keep one image embedding per group of 5 captions
    img_embs = np.array([img_embs[i] for i in range(0, len(img_embs), 5)])
    print('Images: ', img_embs.shape)
    print('Captions: ', cap_embs.shape)

    start = time.time()
    sims = compute_sims(img_embs, cap_embs, cap_lens, opt, shard_size=128)
    print(sims[:20, :4])
    end = time.time()
    print("calculate similarity time:", end - start)

    print('Saving results...')
    sio.savemat('%s_relation.mat' % opt.data_name, {'similarity': sims})
    print('Saving success...')

    r, rt = i2t(img_embs, cap_embs, cap_lens, sims, return_ranks=True)
    ri, rti = t2i(img_embs, cap_embs, cap_lens, sims, return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    ari = (ri[0] + ri[1] + ri[2]) / 3
    rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
    print("rsum: %.1f" % rsum)
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
    print("Average t2i Recall: %.1f" % ari)
    print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')
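# Note: i2t/t2i above come from the accompanying evaluation module. For
# reference, a minimal sketch of image-to-text recall over a precomputed
# similarity matrix, assuming the usual 5-captions-per-image layout (an
# illustration under those assumptions, not the module's exact code):
def i2t_recall_sketch(sims):
    # sims: (n_images, 5 * n_images); captions 5*i .. 5*i+4 belong to image i
    n_images = sims.shape[0]
    ranks = np.zeros(n_images)
    for i in range(n_images):
        order = np.argsort(-sims[i])               # caption indices, best first
        gold = np.arange(5 * i, 5 * i + 5)         # the 5 matching captions
        ranks[i] = np.where(np.in1d(order, gold))[0].min()  # best gold rank
    r1, r5, r10 = (100.0 * np.mean(ranks < k) for k in (1, 5, 10))
    return r1, r5, r10, np.median(ranks) + 1, ranks.mean() + 1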
def eval_with_single_extended(model_path, data_path=None, data_name=None,
                              split='test', backup_vec_ex=None):
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.use_external_captions = False
    if data_path is not None:
        opt.data_path = data_path
    if data_name is not None:
        opt.data_name = data_name

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    img_embs, cap_embs = encode_data(model, data_loader)

    if backup_vec_ex is None:
        # encode one extended-caption set per image
        cap_embs_ex = list()
        for i in range(img_embs.shape[0]):
            data_loader_ex = get_text_loader(split, opt.data_name, vocab, opt.batch_size,
                                             opt.workers, opt, 'ex/%d' % i)
            encoding = encode_data(model, data_loader_ex)[1]
            if encoding is not None:
                cap_embs_ex.append(encoding.copy())
            else:
                cap_embs_ex.append(np.zeros(cap_embs[:1].shape))
            print('Caption Embedding: %d' % i)
        # torch.save(cap_embs_ex, 'data/coco_precomp/cap_embs_ex.pth')
    else:
        cap_embs_ex = torch.load(backup_vec_ex)

    print('Computing results...')
    r, rt = i2t_split(img_embs, cap_embs, cap_embs_ex,
                      measure=opt.measure, return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f\t%.1f\t%.1f\t%.1f\t%.1f" % r)
    torch.save({'rt': rt},
               model_path[:model_path.find('model_best')] + 'ranks_single_extended.pth.tar')
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='./data/', help='path to datasets')
    parser.add_argument('--model_path', default='./data/', help='path to model')
    parser.add_argument('--split', default='test', help='val/test')
    parser.add_argument('--gpuid', default='0', type=str, help='gpuid')
    parser.add_argument('--fold5', action='store_true', help='fold5')
    opts = parser.parse_args()

    device_id = opts.gpuid
    print("use GPU:", device_id)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device_id)
    # after masking, the selected GPU is visible as device 0
    device_id = 0
    torch.cuda.set_device(0)

    # load model and options
    checkpoint = torch.load(opts.model_path)
    opt = checkpoint['opt']
    opt.loss_verbose = False
    opt.split = opts.split
    opt.data_path = opts.data_path
    opt.fold5 = opts.fold5

    # load vocabulary used by the model
    vocab = deserialize_vocab(os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    opt.vocab_size = len(vocab)

    # construct model
    model = SCAN(opt)
    model.cuda()
    model = nn.DataParallel(model)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = data.get_test_loader(opt.split, opt.data_name, vocab,
                                       opt.batch_size, opt.workers, opt)
    print(opt)
    print('Computing results...')
    evaluation.evalrank(model.module, data_loader, opt,
                        split=opt.split, fold5=opt.fold5)
def eval_with_manually_extended(model_path, data_path=None, split='test'):
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.use_external_captions = False
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    img_embs, cap_embs = encode_data(model, data_loader)
    img_embs = img_embs[:100]
    cap_embs = cap_embs[:100]

    # two captions per image from each manually created set
    cap_embs_ex = list()
    data_loader_ex_0 = get_text_loader(split, opt.data_name, vocab, opt.batch_size,
                                       opt.workers, opt, 'manually_ex_%d' % 0)
    encoding_0 = encode_data(model, data_loader_ex_0)[1]
    data_loader_ex_1 = get_text_loader(split, opt.data_name, vocab, opt.batch_size,
                                       opt.workers, opt, 'manually_ex_%d' % 1)
    encoding_1 = encode_data(model, data_loader_ex_1)[1]
    for i in range(100):
        cap_emb = np.concatenate((encoding_0[i * 2:i * 2 + 2],
                                  encoding_1[i * 2:i * 2 + 2]), axis=0)
        cap_embs_ex.append(cap_emb)

    print('Computing results...')
    r, rt = i2t_split(img_embs, cap_embs, cap_embs_ex,
                      measure=opt.measure, return_ranks=True)
    # r, rt = i2t(img_embs, cap_embs, measure=opt.measure, return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
    torch.save({'rt': rt},
               model_path[:model_path.find('model_best')] + 'ranks_manually_extended_1.pth.tar')
def online_test(data_path, model_dir_list, loggits_save_path, test_img_size=128):
    # load every model of the ensemble, picking the architecture from the dir name
    models = []
    device = torch.device("cuda:0")
    for inx, i in enumerate(model_dir_list):
        if 'se50' in i:
            model = GermanNetSE50().to(device)
        elif 'xcep' in i:
            model = GermanNetXcep().to(device)
        else:
            model = GermanNetIncepRes().to(device)
        model.eval()
        model_path = os.path.join(i, 'model_best.pth')
        print(model_path)
        model.load_state_dict(torch.load(model_path))
        models.append(model)

    dataloader = get_test_loader(data_path, bsize=32, img_size=test_img_size)
    pred_npy = np.zeros((len(dataloader.dataset), 17), np.float32)

    print('online test predicting.....')
    utils.create_dir(config_dict['commit_outdir'])
    submit_csv = config_dict['commit_outdir'] + \
        datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + "_submit.csv"
    fout = open(submit_csv, 'w')
    inx = 0
    for data, label in tqdm.tqdm(dataloader):
        data = data.to(device)
        # sum the logits of all ensemble members
        pred = models[0](data)
        for i in range(1, len(models)):
            pred_sub = models[i](data)
            pred += pred_sub
        pred = pred.data.cpu().numpy()
        pred_npy[inx:inx + pred.shape[0], :] = pred
        inx += pred.shape[0]
        # write one-hot predictions over the 17 classes
        pred = np.argmax(pred, 1)
        for i in range(pred.shape[0]):
            one_hot = [0] * 17
            one_hot[pred[i]] = 1
            for j in range(16):
                fout.write(str(one_hot[j]) + ',')
            fout.write(str(one_hot[16]) + '\n')
    fout.close()
    np.save(loggits_save_path, pred_npy)
    print('pred logits saved in: ' + loggits_save_path)
    print('submit csv saved in: ' + submit_csv)
def eval_with_extended(model_path, data_path=None, data_name=None, split='test'):
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.use_external_captions = True
    opt.negative_number = 5
    if data_path is not None:
        opt.data_path = data_path
    if data_name is not None:
        opt.data_name = data_name

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' % (img_embs.shape[0] // 5, cap_embs.shape[0]))

    r, rt = i2t_text_only(img_embs, cap_embs, measure=opt.measure, return_ranks=True)
    ar = (r[0] + r[1] + r[2]) / 3
    print("Average i2t Recall: %.1f" % ar)
    print("Image to text: %.1f\t%.1f\t%.1f\t%.1f\t%.1f" % r)
    torch.save({'rt': rt},
               model_path[:model_path.find('model_best')] + 'ranks_extended.pth.tar')
def evalrank(model_path, data_path=None, split='dev', fold5=False, lang=None):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path
    # never use undersampling when testing
    opt.undersample = False
    print(opt)

    # load vocabulary used by the model
    if opt.data_name != "m30k":
        with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
            vocab = pickle.load(f)
        opt.vocab_size = len(vocab)
    else:
        with open(os.path.join(opt.logger_name, 'vocab.pkl'), 'rb') as f:
            vocab = pickle.load(f)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    if lang is not None:
        opt.lang = lang
    langs = opt.lang.split('-')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)
    if len(langs) > 1:
        # one loader per language
        for loader, loader_lang in zip(data_loader, langs):
            run_eval(model, loader, fold5, opt, loader_lang)
    else:
        run_eval(model, data_loader, fold5, opt, opt.lang)
def extract_feats(model_path, data_path=None, split='dev', fold5=False):
    """
    Extract image and caption embeddings on either dev or test and save them
    to disk (1K features for dev, 5K for test).
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSRN(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' % (img_embs.shape[0] // 5, cap_embs.shape[0]))

    # save SCAN feats
    if split == 'dev':
        np.save('/media/sounak/4tbdisk/VSRN/img_embs_1K.npy', img_embs)
        np.save('/media/sounak/4tbdisk/VSRN/cap_embs_1K.npy', cap_embs)
    else:
        np.save('/media/sounak/4tbdisk/VSRN/img_embs_5K.npy', img_embs)
        np.save('/media/sounak/4tbdisk/VSRN/cap_embs_5K.npy', cap_embs)
    return
def evalrank(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path
    print(opt)

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' % (img_embs.shape[0] // 5, cap_embs.shape[0]))

    # per-pair score between each caption and its aligned image embedding
    scores = np.sum(np.multiply(img_embs, cap_embs), -1)
    print(scores.shape)
    print('scores:', np.mean(scores))
def evalrank(model_path, data_path=None, split='dev', fold5=False,
             region_bbox_file=None, feature_path=None):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path
    if region_bbox_file is not None:
        opt.region_bbox_file = region_bbox_file
    if feature_path is not None:
        opt.feature_path = feature_path

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)
    print(opt)

    # construct model
    model = VSE(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)

    time_sim_start = time.time()
    if not fold5:
        # keep one image embedding per group of 5 captions
        img_emb_new = img_embs[0:img_embs.size(0):5]
        print(img_emb_new.size())
        sims = torch.mm(img_emb_new, cap_embs.t())
        sims_T = torch.mm(cap_embs, cap_embs.t())
        sims_T = sims_T.cpu().numpy()
        sims = sims.cpu().numpy()
        np.save('sims_f.npy', sims)
        np.save('sims_f_T.npy', sims_T)

        print('Images: %d, Captions: %d' % (img_embs.size(0) // 5, cap_embs.size(0)))
        r = simrank(sims)
        time_sim_end = time.time()
        print('sims_time:%f' % (time_sim_end - time_sim_start))
        del sims
    else:
        # fold5, especially for coco
        print('5k---------------')
        img_emb_new = img_embs[0:img_embs.size(0):5]
        print(img_emb_new.size())
        sims = torch.mm(img_emb_new, cap_embs.t())
        sims_T = torch.mm(cap_embs, cap_embs.t())
        sims = sims.cpu().numpy()
        sims_T = sims_T.cpu().numpy()
        np.save('sims_full_5k.npy', sims)
        np.save('sims_full_T_5k.npy', sims_T)

        print('Images: %d, Captions: %d' % (img_embs.size(0) // 5, cap_embs.size(0)))
        r = simrank(sims)
        time_sim_end = time.time()
        print('sims_time:%f' % (time_sim_end - time_sim_start))
        del sims, sims_T

        print('1k---------------')
        r_ = [0, 0, 0, 0, 0, 0, 0]
        for i in range(5):
            print(i)
            img_emb_new = img_embs[i * 5000:int(i * 5000 + img_embs.size(0) / 5):5]
            cap_emb_new = cap_embs[i * 5000:int(i * 5000 + cap_embs.size(0) / 5)]
            sims = torch.mm(img_emb_new, cap_emb_new.t())
            sims_T = torch.mm(cap_emb_new, cap_emb_new.t())
            sims_T = sims_T.cpu().numpy()
            sims = sims.cpu().numpy()
            np.save('sims_full_%d.npy' % i, sims)
            np.save('sims_full_T_%d' % i, sims_T)
            print('Images: %d, Captions: %d' % (img_emb_new.size(0), cap_emb_new.size(0)))
            r = simrank(sims)
            r_ = np.array(r_) + np.array(r)
            del sims
            print('--------------------')
        r_ = tuple(r_ / 5)
        print('I2T:%.1f %.1f %.1f' % r_[0:3])
        print('T2I:%.1f %.1f %.1f' % r_[3:6])
        print('Rsum:%.1f' % r_[-1])
def evalrank(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    print(opt)
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    vocab = deserialize_vocab(os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    opt.vocab_size = len(vocab)

    # construct model
    model = BFAN(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs, cap_lens = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' % (img_embs.shape[0] // 5, cap_embs.shape[0]))

    if not fold5:
        # no cross-validation, full evaluation
        img_embs = np.array([img_embs[i] for i in range(0, len(img_embs), 5)])
        start = time.time()
        sims = shard_xattn(img_embs, cap_embs, cap_lens, opt, shard_size=128)
        end = time.time()
        print("calculate similarity time:", end - start)

        batch_size = img_embs.shape[0]
        r, rt = i2t(batch_size, sims, return_ranks=True)
        ri, rti = t2i(batch_size, sims, return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:
        # 5fold cross-validation, only for MSCOCO
        results = []
        for i in range(5):
            img_embs_shard = img_embs[i * 5000:(i + 1) * 5000:5]
            cap_embs_shard = cap_embs[i * 5000:(i + 1) * 5000]
            cap_lens_shard = cap_lens[i * 5000:(i + 1) * 5000]
            start = time.time()
            sims = shard_xattn(img_embs_shard, cap_embs_shard, cap_lens_shard,
                               opt, shard_size=128)
            end = time.time()
            print("calculate similarity time:", end - start)

            batch_size = img_embs_shard.shape[0]
            r, rt0 = i2t(batch_size, sims, return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i(batch_size, sims, return_ranks=True)
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)
            if i == 0:
                rt, rti = rt0, rti0
            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % (mean_metrics[10] * 6))
        print("Average i2t Recall: %.1f" % mean_metrics[11])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[12])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[5:10])

    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')
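# Note: shard_xattn evaluates the cross-attention similarity in blocks so the
# full (n_images, n_captions) matrix never has to fit on the GPU at once. A
# minimal sketch of that sharding pattern, with a hypothetical similarity_fn
# standing in for the model's cross-attention score:
def shard_sims_sketch(img_embs, cap_embs, similarity_fn, shard_size=128):
    n_im, n_cap = len(img_embs), len(cap_embs)
    sims = np.zeros((n_im, n_cap), dtype=np.float32)
    # fill the similarity matrix one (shard_size x shard_size) block at a time
    for i in range(0, n_im, shard_size):
        for j in range(0, n_cap, shard_size):
            sims[i:i + shard_size, j:j + shard_size] = similarity_fn(
                img_embs[i:i + shard_size], cap_embs[j:j + shard_size])
    return sims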
logger.info(pprint.pformat(customized_cfg))

# data loader
train_loader = get_train_loader(root=os.path.join(cfg.root, cfg.train),
                                batch_size=cfg.batch_size,
                                image_size=cfg.image_size,
                                random_crop=cfg.random_crop,
                                random_erase=cfg.random_erase,
                                random_mirror=cfg.random_mirror,
                                num_workers=4)

query_loader = None
gallery_loader = None
if cfg.validate_interval > 0:
    query_loader = get_test_loader(root=os.path.join(cfg.root, cfg.query),
                                   batch_size=512,
                                   image_size=cfg.image_size,
                                   num_workers=4)
    gallery_loader = get_test_loader(root=os.path.join(cfg.root, cfg.gallery),
                                     batch_size=512,
                                     image_size=cfg.image_size,
                                     num_workers=4)

# model
model = PCBModel(num_class=cfg.num_id,
                 num_parts=cfg.num_parts,
                 bottleneck_dims=cfg.bottleneck_dims,
                 pool_type=cfg.pool_type,
                 share_embed=cfg.share_embed)
def main():
    # parse options
    parser = TrainOptions()
    opts = parser.parse()

    # data loader
    print('\n--- load dataset ---')
    vocab = pickle.load(open(os.path.join(opts.vocab_path,
                                          '%s_vocab.pkl' % opts.data_name), 'rb'))
    vocab_size = len(vocab)
    opts.vocab_size = vocab_size
    torch.backends.cudnn.enabled = False

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opts.data_name, vocab, opts.crop_size,
                                                opts.batch_size, opts.workers, opts)
    test_loader = data.get_test_loader('test', opts.data_name, vocab, opts.crop_size,
                                       opts.batch_size, opts.workers, opts)

    # model
    print('\n--- load subspace ---')
    subspace = model_2.VSE(opts)
    subspace.setgpu()

    print('\n--- load model ---')
    model = DRIT(opts)
    model.setgpu(opts.gpu)
    if opts.resume is None:  # no previously saved model to resume from
        model.initialize()
        ep0 = -1
        total_it = 0
    else:
        ep0, total_it = model.resume(opts.resume)
    model.set_scheduler(opts, last_ep=ep0)
    ep0 += 1
    print('start the training at epoch %d' % (ep0))

    # saver for display and output
    saver = Saver(opts)

    # train
    print('\n--- train ---')
    max_it = 500000
    score = 0.0
    subspace.train_start()

    # pretraining: autoencoder on the joint-subspace embeddings
    for ep in range(ep0, opts.pre_iter):
        print('-----ep:{} --------'.format(ep))
        for it, (images, captions, lengths, ids) in enumerate(train_loader):
            if it >= opts.train_iter:
                break
            # input data
            images = images.cuda(opts.gpu).detach()
            captions = captions.cuda(opts.gpu).detach()
            img, cap = subspace.train_emb(images, captions, lengths, ids, pre=True)  # [b, 1024]
            subspace.pre_optimizer.zero_grad()
            img = img.view(images.size(0), -1, 32, 32)
            cap = cap.view(images.size(0), -1, 32, 32)
            model.pretrain_ae(img, cap)
            if opts.grad_clip > 0:
                clip_grad_norm(subspace.params, opts.grad_clip)
            subspace.pre_optimizer.step()

    for ep in range(ep0, opts.n_ep):
        subspace.train_start()
        adjust_learning_rate(opts, subspace.optimizer, ep)
        for it, (images, captions, lengths, ids) in enumerate(train_loader):
            if it >= opts.train_iter:
                break
            # input data
            images = images.cuda(opts.gpu).detach()
            captions = captions.cuda(opts.gpu).detach()
            img, cap = subspace.train_emb(images, captions, lengths, ids)  # [b, 1024]
            img = img.view(images.size(0), -1, 32, 32)
            cap = cap.view(images.size(0), -1, 32, 32)
            subspace.optimizer.zero_grad()

            # update discriminators
            for p in model.disA.parameters():
                p.requires_grad = True
            for p in model.disB.parameters():
                p.requires_grad = True
            for p in model.disA_attr.parameters():
                p.requires_grad = True
            for p in model.disB_attr.parameters():
                p.requires_grad = True
            for i in range(opts.niters_gan_d):  # 5
                model.update_D(img, cap)

            # update encoder with discriminators frozen
            for p in model.disA.parameters():
                p.requires_grad = False
            for p in model.disB.parameters():
                p.requires_grad = False
            for p in model.disA_attr.parameters():
                p.requires_grad = False
            for p in model.disB_attr.parameters():
                p.requires_grad = False
            for i in range(opts.niters_gan_enc):
                model.update_E(img, cap)  # uses the new content loss
            subspace.optimizer.step()
            print('total_it: %d (ep %d, it %d), lr %09f'
                  % (total_it, ep, it, model.gen_opt.param_groups[0]['lr']))
            total_it += 1

        # decay learning rate
        if opts.n_ep_decay > -1:
            model.update_lr()

        # save result image
        # saver.write_img(ep, model)

        if (ep + 1) % opts.n_ep == 0:
            print('save model')
            filename = os.path.join(opts.result_dir, opts.name)
            model.save('%s/final_model.pth' % (filename), ep, total_it)
            torch.save(subspace.state_dict(), '%s/final_subspace.pth' % (filename))
        elif (ep + 1) % 10 == 0:
            print('save model')
            filename = os.path.join(opts.result_dir, opts.name)
            model.save('%s/%s_model.pth' % (filename, str(ep + 1)), ep, total_it)
            torch.save(subspace.state_dict(), '%s/%s_subspace.pth' % (filename, str(ep + 1)))

        if (ep + 1) % opts.model_save_freq == 0:
            # quick validation on the first val_iter batches
            a = None
            b = None
            c = None
            d = None
            subspace.val_start()
            for it, (images, captions, lengths, ids) in enumerate(test_loader):
                if it >= opts.val_iter:
                    break
                images = images.cuda(opts.gpu).detach()
                captions = captions.cuda(opts.gpu).detach()
                img_emb, cap_emb = subspace.forward_emb(images, captions, lengths,
                                                        volatile=True)
                img = img_emb.view(images.size(0), -1, 32, 32)
                cap = cap_emb.view(images.size(0), -1, 32, 32)
                image1, text1 = model.test_model2(img, cap)
                img2 = image1.view(images.size(0), -1)
                cap2 = text1.view(images.size(0), -1)
                if a is None:
                    a = np.zeros((opts.val_iter * opts.batch_size, img_emb.size(1)))
                    b = np.zeros((opts.val_iter * opts.batch_size, cap_emb.size(1)))
                    c = np.zeros((opts.val_iter * opts.batch_size, img2.size(1)))
                    d = np.zeros((opts.val_iter * opts.batch_size, cap2.size(1)))
                a[ids] = img_emb.data.cpu().numpy().copy()
                b[ids] = cap_emb.data.cpu().numpy().copy()
                c[ids] = img2.data.cpu().numpy().copy()
                d[ids] = cap2.data.cpu().numpy().copy()
            aa = torch.from_numpy(a)
            bb = torch.from_numpy(b)
            cc = torch.from_numpy(c)
            dd = torch.from_numpy(d)
            (r1, r5, r10, medr, meanr) = i2t(aa, bb, measure=opts.measure)
            print('test640: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                medr, r1, r5, r10))
            (r1i, r5i, r10i, medri, meanr) = t2i(aa, bb, measure=opts.measure)
            print('test640: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                medri, r1i, r5i, r10i))
            (r2, r3, r4, m1, m2) = i2t(cc, dd, measure=opts.measure)
            print('test640: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                m1, r2, r3, r4))
            (r2i, r3i, r4i, m1i, m2i) = t2i(cc, dd, measure=opts.measure)
            print('test640: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(
                m1i, r2i, r3i, r4i))
            curr = r2 + r3 + r4 + r2i + r3i + r4i
            if curr > score:
                score = curr
                print('save model')
                filename = os.path.join(opts.result_dir, opts.name)
                model.save('%s/best_model.pth' % (filename), ep, total_it)
                torch.save(subspace.state_dict(), '%s/subspace.pth' % (filename))

    # final evaluation on the full test set
    a = None
    b = None
    c = None
    d = None
    for it, (images, captions, lengths, ids) in enumerate(test_loader):
        images = images.cuda(opts.gpu).detach()
        captions = captions.cuda(opts.gpu).detach()
        img_emb, cap_emb = subspace.forward_emb(images, captions, lengths,
                                                volatile=True)
        img = img_emb.view(images.size(0), -1, 32, 32)
        cap = cap_emb.view(images.size(0), -1, 32, 32)
        image1, text1 = model.test_model2(img, cap)
        img2 = image1.view(images.size(0), -1)
        cap2 = text1.view(images.size(0), -1)
        if a is None:
            a = np.zeros((len(test_loader.dataset), img_emb.size(1)))
            b = np.zeros((len(test_loader.dataset), cap_emb.size(1)))
            c = np.zeros((len(test_loader.dataset), img2.size(1)))
            d = np.zeros((len(test_loader.dataset), cap2.size(1)))
        a[ids] = img_emb.data.cpu().numpy().copy()
        b[ids] = cap_emb.data.cpu().numpy().copy()
        c[ids] = img2.data.cpu().numpy().copy()
        d[ids] = cap2.data.cpu().numpy().copy()
    aa = torch.from_numpy(a)
    bb = torch.from_numpy(b)
    cc = torch.from_numpy(c)
    dd = torch.from_numpy(d)
    (r1, r5, r10, medr, meanr) = i2t(aa, bb, measure=opts.measure)
    print('test5000: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(
        medr, r1, r5, r10))
    (r1i, r5i, r10i, medri, meanr) = t2i(aa, bb, measure=opts.measure)
    print('test5000: subspace: Med:{}, r1:{}, r5:{}, r10:{}'.format(
        medri, r1i, r5i, r10i))
    (r2, r3, r4, m1, m2) = i2t(cc, dd, measure=opts.measure)
    print('test5000: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(
        m1, r2, r3, r4))
    (r2i, r3i, r4i, m1i, m2i) = t2i(cc, dd, measure=opts.measure)
    print('test5000: encoder: Med:{}, r1:{}, r5:{}, r10:{}'.format(
        m1i, r2i, r3i, r4i))
    return
parser.add_argument('--model', type=str, default='model.pt', metavar='M',
                    help="the model file to be evaluated. (default: model.pt)")
parser.add_argument('--outfile', type=str, default='visualize_stn.png', metavar='O',
                    help="visualize the STN transformation on some input batch "
                         "(default: visualize_stn.png)")
args = parser.parse_args()

# Load model checkpoint
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = torch.load(args.model, map_location=device)

# Neural Network and Loss Function
model = TrafficSignNet().to(device)
model.load_state_dict(checkpoint)
model.eval()
criterion = nn.CrossEntropyLoss()

# Data Initialization and Loading
test_loader = get_test_loader(args.data, device)
evaluate(model, criterion, test_loader)
visualize_stn(test_loader, args.outfile)
def evalrank(model_path, data_path=None, data_name=None, data_name_vocab=None,
             split='dev', fold5=False, VSE_model=None, data_loader=None,
             concept_path=None, transfer_test=False, concept_name=None):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    start_epoch = checkpoint['epoch']
    best_rsum = checkpoint['best_rsum']
    print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
          .format(opt.resume, start_epoch, best_rsum))
    if data_path is not None:
        opt.data_path = data_path
    if data_name is not None:
        opt.data_name = data_name

    # judge whether to use transfer-testing results
    if transfer_test == True:
        opt.attribute_path = concept_path
        if concept_name is not None:
            opt.concept_name = concept_name

    if 'coco' in opt.data_name:
        fuse_weight = 0.9
    elif 'f30k' in opt.data_name:
        fuse_weight = 0.85
    print(opt)

    print("=> loading checkpoint '{}'".format(opt.resume))
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % data_name_vocab), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)
    word2idx = vocab.word2idx

    # construct model (with channel attention if configured)
    model = VSE_model(word2idx, opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab, opt.batch_size,
                                  opt.workers, transfer_test, opt)

    print('Computing results...')
    img_embs, cap_embs, img_emb_cons, cap_emb_cons, concept_labels = encode_data(
        model=model, data_loader=data_loader, alpha=fuse_weight)

    # 2) perform label completion
    ind_cap_complete = label_complete(concept_label=concept_labels,
                                      img_embs=img_embs,
                                      cap_embs=cap_embs,
                                      data_name=opt.data_name)
    img_embs, cap_embs, img_emb_cons, cap_emb_cons, completion_labels = \
        encode_data_KNN_rerank(model=model,
                               data_loader=data_loader,
                               index_KNN_neighbour=ind_cap_complete,
                               concept_labels=concept_labels,
                               alpha=fuse_weight)
    print('Images: %d, Captions: %d' % (img_embs.shape[0] // 5, cap_embs.shape[0]),
          " for testing")

    if not fold5:
        # no cross-validation, full evaluation
        r, rt = i2t_sep_sim(img_embs, cap_embs, img_emb_cons, cap_emb_cons,
                            opt.data_name, weight_fused=0.95,
                            measure=opt.measure, return_ranks=True)
        ri, rti = t2i_sep_sim(img_embs, cap_embs, img_emb_cons, cap_emb_cons,
                              opt.data_name, weight_fused=0.95,
                              measure=opt.measure, return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:
        # 5fold cross-validation, only for MSCOCO
        results = []
        for i in range(5):
            r, rt0 = i2t_sep_sim(img_embs[i * 5000:(i + 1) * 5000],
                                 cap_embs[i * 5000:(i + 1) * 5000],
                                 img_emb_cons[i * 5000:(i + 1) * 5000],
                                 cap_emb_cons[i * 5000:(i + 1) * 5000],
                                 opt.data_name, weight_fused=0.95,
                                 measure=opt.measure, return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i_sep_sim(img_embs[i * 5000:(i + 1) * 5000],
                                   cap_embs[i * 5000:(i + 1) * 5000],
                                   img_emb_cons[i * 5000:(i + 1) * 5000],
                                   cap_emb_cons[i * 5000:(i + 1) * 5000],
                                   opt.data_name, weight_fused=0.95,
                                   measure=opt.measure, return_ranks=True)
            if i == 0:
                rt, rti = rt0, rti0
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)
            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % (mean_metrics[10] * 6))
        print("Average i2t Recall: %.1f" % mean_metrics[11])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[12])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[5:10])

    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')
is_cuda = torch.cuda.is_available()
print('is cuda : ', is_cuda)
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
# with CUDA_VISIBLE_DEVICES='1', the selected GPU is exposed as cuda:0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if __name__ == "__main__":
    code_intent_pair = Code_Intent_Pairs()
    path = 'vocab/'
    code_intent_pair.load_dict(path)
    special_symbols = code_intent_pair.get_special_symbols()
    word_size = code_intent_pair.get_word_size()
    code_size = code_intent_pair.get_code_size()

    test_path = 'processed_corpus/test.json'
    test_entries = code_intent_pair.load_entries(test_path)
    testloader = get_test_loader(test_entries)

    model = Seq2Seq(word_size, code_size, hyperP)
    if hyperP['load_pretrain_code_embed']:
        model.decoder.embed[0].load_state_dict(
            torch.load('./pretrain_code_lm/embedding-1556211835.t7'))
    if hyperP['freeze_embed']:
        for param in model.decoder.embed[0].parameters():
            param.requires_grad = False
    model.load('model_100.t7')

    beam_decoder = Decoder(model)
    if is_cuda:
        model.to(device)
        # beam_decoder.to(device)
    model.eval()
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='/home/dcsaero01/data/datasets/vsepp/',
                        help='path to datasets')
    parser.add_argument('--data_name', default='minicsdv2',
                        help='{coco,f8k,f30k,10crop}_precomp|coco|f8k|f30k')
    parser.add_argument('--vocab_path', default='./vocab/',
                        help='Path to saved vocabulary pickle files.')
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=20, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=1024, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--crop_size', default=224, type=int,
                        help='Size of an image crop as the CNN input.')
    parser.add_argument('--num_layers', default=1, type=int,
                        help='Number of GRU layers.')
    parser.add_argument('--learning_rate', default=.0002, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=15, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--dropout_value', default=0, type=float,
                        help='Probability value for dropout after linear layer')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=500, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='runs/minicsdv2/',
                        help='Path to save the model and Tensorboard log.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation', action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim', default=4096, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--text_dim', default=500, type=int,
                        help='Dimensionality of the text embedding.')
    parser.add_argument('--seq_length', default=10, type=int,
                        help='Max sentence sequence length of the GRU')
    parser.add_argument('--finetune', action='store_true',
                        help='Fine-tune the image encoder.')
    parser.add_argument('--cnn_type', default='resnet152',
                        help='The CNN used for image encoder (e.g. vgg19, resnet152)')
    parser.add_argument('--use_restval', action='store_true',
                        help='Use the restval data for training on MSCOCO.')
    parser.add_argument('--measure', default='cosine',
                        help='Similarity measure used (cosine|order)')
    parser.add_argument('--use_abs', action='store_true',
                        help='Take the absolute value of embedding vectors.')
    parser.add_argument('--test_mode', action='store_true', default=False,
                        help='Set this flag to run the script in testing mode')
    parser.add_argument('--skip_model', action='store_true',
                        help='Whether to train with Skipthoughts RNN Model')
    parser.add_argument('--no_imgnorm', action='store_true',
                        help='Do not normalize the image embeddings.')
    parser.add_argument('--reset_train', action='store_true',
                        help='Ensure the training is always done in '
                             'train mode (Not recommended).')
    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Load Vocabulary Wrapper
    if opt.data_name in ('coco_st_precomp', 'coco_st_ner_precomp',
                         'csd_ner_precomp', 'breakingnews_precomp'):
        vocab = None
        opt.vocab_size = 0
    elif opt.data_name == 'csd_skip_precomp':
        opt.vocab_size = -1
        vocab = None
    else:
        vocab = pickle.load(open(os.path.join(opt.vocab_path,
                                              '%s_vocab.pkl' % opt.data_name), 'rb'))
        opt.vocab_size = len(vocab)

    # Load data loaders
    train_loader, val_loader = data.get_loaders(opt.data_name, vocab, opt.crop_size,
                                                opt.batch_size, opt.workers, opt)

    # Construct the model
    model = VSE(opt)

    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})"
                  .format(opt.resume, start_epoch, best_rsum))
            validate(opt, val_loader, model)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    if opt.test_mode:
        # Test the Model
        test_loader = data.get_test_loader('test', opt.data_name, vocab, opt.crop_size,
                                           opt.batch_size, opt.workers, opt)
        validate(opt, test_loader, model)
    else:
        # Train the Model
        best_rsum = 0
        for epoch in range(opt.num_epochs):
            adjust_learning_rate(opt, model.optimizer, epoch)

            # train for one epoch
            train(opt, train_loader, model, epoch, val_loader)

            # evaluate on validation set
            rsum = validate(opt, val_loader, model)

            # remember best R@ sum and save checkpoint
            is_best = rsum > best_rsum
            best_rsum = max(rsum, best_rsum)
            save_checkpoint({
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
            }, is_best, prefix=opt.logger_name + '/')
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='./data/',
                        help='path to datasets')
    parser.add_argument('--data_name', default='precomp',
                        help='{coco,f30k}_precomp')
    parser.add_argument('--margin', default=0.2, type=float,
                        help='Rank loss margin.')
    parser.add_argument('--num_epochs', default=30, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=64, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--word_dim', default=300, type=int,
                        help='Dimensionality of the word embedding.')
    parser.add_argument('--embed_size', default=1024, type=int,
                        help='Dimensionality of the joint embedding.')
    parser.add_argument('--grad_clip', default=2., type=float,
                        help='Gradient clipping threshold.')
    parser.add_argument('--learning_rate', default=.0001, type=float,
                        help='Initial learning rate.')
    parser.add_argument('--lr_update', default=10, type=int,
                        help='Number of epochs to update the learning rate.')
    parser.add_argument('--workers', default=10, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--log_step', default=10, type=int,
                        help='Number of steps to print and record the log.')
    parser.add_argument('--val_step', default=1000, type=int,
                        help='Number of steps to run validation.')
    parser.add_argument('--logger_name', default='./runs/runX/log',
                        help='Path to save Tensorboard log.')
    parser.add_argument('--model_name', default='./runs/runX/checkpoint',
                        help='Path to save the model.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--max_violation', default=True, action='store_true',
                        help='Use max instead of sum in the rank loss.')
    parser.add_argument('--img_dim', default=2048, type=int,
                        help='Dimensionality of the image embedding.')
    parser.add_argument('--final_dims', default=256, type=int,
                        help='dimension of final codes.')
    parser.add_argument('--max_words', default=32, type=int,
                        help='maximum number of words in a sentence.')
    parser.add_argument('--bert_path',
                        default='/media/ling/datum/Datasets/word_embeddings/uncased_L-12_H-768_A-12/',
                        type=str, help='The BERT model path.')
    parser.add_argument('--txt_stru', default='cnn',
                        help='Whether to use pooling or cnn or rnn')
    parser.add_argument('--trans_cfg',
                        default='/media/ling/datum/workspace/SCAN/image_bert.json',
                        help='config file for image transformer')
    opt = parser.parse_args()
    print(opt)

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now())
    opt.logger_name = opt.logger_name + TIMESTAMP
    tb_logger.configure(opt.logger_name, flush_secs=5)

    f = open(opt.logger_name + "opt.txt", 'w')
    f.write(opt.__str__())
    f.close()

    opt.vocab_file = opt.bert_path + 'vocab.txt'
    opt.bert_config_file = opt.bert_path + 'bert_config.json'
    opt.init_checkpoint = opt.bert_path + 'pytorch_model.bin'
    opt.do_lower_case = True

    # Load data loaders
    test_loader = data.get_test_loader('test', opt.data_name,
                                       opt.batch_size, opt.workers, opt)

    # Construct the model
    model = SAEM(opt)

    # resume from a checkpoint (note: hard-coded to the best f30k model)
    opt.resume = 'runs/f30k/log/model_best.pth.tar'
    # opt.resume = 'runs/f30k/log/checkpoint_27.pth.tar'
    if os.path.isfile(opt.resume):
        print("=> loading checkpoint '{}'".format(opt.resume))
        checkpoint = torch.load(opt.resume)
        start_epoch = checkpoint['epoch']
        best_rsum = checkpoint['best_rsum']
        model.load_state_dict(checkpoint['model'])
        # Eiters is used to show logs as the continuation of another training
        model.Eiters = checkpoint['Eiters']
        print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
            opt.resume, start_epoch, best_rsum))
        validate(opt, test_loader, model)
    else:
        print("=> no checkpoint found at '{}'".format(opt.resume))
def evalrank(model_path, data_path=None, split='dev', fold5=False):
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    print(opt)
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    vocab = deserialize_vocab(os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    opt.vocab_size = len(vocab)

    # load precomputed caption vectors
    captions_w = np.load(opt.caption_np + 'caption_np.npy')
    captions_w = torch.from_numpy(captions_w)
    captions_w = captions_w.cuda()

    # construct model
    model = SCAN(opt, captions_w)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs, cap_lens = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' % (img_embs.shape[0] // 5, cap_embs.shape[0]))

    if not fold5:
        img_embs = np.array([img_embs[i] for i in range(0, len(img_embs), 5)])
        start = time.time()
        if opt.cross_attn == 't2i':
            sims = shard_xattn_t2i(img_embs, cap_embs, cap_lens, opt, shard_size=128)
        elif opt.cross_attn == 'i2t':
            sims = shard_xattn_i2t(img_embs, cap_embs, cap_lens, opt, shard_size=128)
        elif opt.cross_attn == 'all':
            sims, label = shard_xattn_all(model, img_embs, cap_embs, cap_lens,
                                          opt, shard_size=128)
        else:
            raise NotImplementedError
        end = time.time()
        print("calculate similarity time:", end - start)
        np.save('sim_stage1', sims)

        # note: `label` is only produced by the 'all' branch above
        r, rt = i2t(label, img_embs, cap_embs, cap_lens, sims, return_ranks=True)
        ri, rti = t2i(label, img_embs, cap_embs, cap_lens, sims, return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:
        results = []
        for i in range(5):
            img_embs_shard = img_embs[i * 5000:(i + 1) * 5000:5]
            cap_embs_shard = cap_embs[i * 5000:(i + 1) * 5000]
            cap_lens_shard = cap_lens[i * 5000:(i + 1) * 5000]
            start = time.time()
            if opt.cross_attn == 't2i':
                sims = shard_xattn_t2i(img_embs_shard, cap_embs_shard, cap_lens_shard,
                                       opt, shard_size=128)
            elif opt.cross_attn == 'i2t':
                sims = shard_xattn_i2t(img_embs_shard, cap_embs_shard, cap_lens_shard,
                                       opt, shard_size=128)
            else:
                raise NotImplementedError
            end = time.time()
            print("calculate similarity time:", end - start)

            r, rt0 = i2t(img_embs_shard, cap_embs_shard, cap_lens_shard, sims,
                         return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i(img_embs_shard, cap_embs_shard, cap_lens_shard, sims,
                           return_ranks=True)
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)
            if i == 0:
                rt, rti = rt0, rti0
            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % (mean_metrics[10] * 6))
        print("Average i2t Recall: %.1f" % mean_metrics[11])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[12])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[5:10])

    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')
def main(opt):
    logging.basicConfig(format='%(message)s', level=logging.INFO)
    tb_logger.configure(opt.logger_name, flush_secs=5)

    # Construct the model
    model = UDAG(opt)

    if opt.evaluation:
        # evaluate a checkpoint on the test split
        val_loader = data.get_test_loader(opt.data_name, opt.batch_size,
                                          opt.workers, opt)
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            _, sims = validate(opt, val_loader, model)
            np.save(opt.data_name + '_sims', sims)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))
    else:
        # Train the Model
        # Load data loaders
        train_loader, val_loader = data.get_loaders(opt.data_name, opt.batch_size,
                                                    opt.workers, opt)
        best_rsum = 0
        for epoch in range(opt.num_epochs):
            adjust_learning_rate(opt, model.optimizer, epoch)

            # train for one epoch
            train(opt, train_loader, model, epoch, val_loader)

            # evaluate on validation set
            rsum = validate(opt, val_loader, model)

            # remember best R@ sum and save checkpoint
            is_best = rsum > best_rsum
            best_rsum = max(rsum, best_rsum)
            save_checkpoint({
                'epoch': epoch + 1,
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'Eiters': model.Eiters,
            }, is_best, prefix=opt.logger_name + '_' + opt.model_name + '/')
def evalrank(model, args, split='test'):
    print('Loading dataset')
    data_loader = get_test_loader(args, vocab)

    print('Computing results... (eval_on_gpu={})'.format(args.eval_on_gpu))
    img_embs, txt_embs = encode_data(model, data_loader, args.eval_on_gpu)
    n_samples = img_embs.shape[0]
    nreps = 5 if args.data_name == 'coco' else 1
    print('Images: %d, Sentences: %d' % (img_embs.shape[0] // nreps, txt_embs.shape[0]))

    # 5fold cross-validation, only for MSCOCO
    mean_metrics = None
    if args.data_name == 'coco':
        results = []
        for i in range(5):
            r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000],
                         txt_embs[i * 5000:(i + 1) * 5000],
                         nreps=nreps, return_ranks=True,
                         order=args.order, use_gpu=args.eval_on_gpu)
            r = (r[0], r[1], r[2], r[3], r[3] / n_samples, r[4], r[4] / n_samples)
            print("Image to text: %.2f, %.2f, %.2f, %.2f (%.2f), %.2f (%.2f)" % r)
            ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000],
                           txt_embs[i * 5000:(i + 1) * 5000],
                           nreps=nreps, return_ranks=True,
                           order=args.order, use_gpu=args.eval_on_gpu)
            if i == 0:
                rt, rti = rt0, rti0
            ri = (ri[0], ri[1], ri[2], ri[3], ri[3] / n_samples, ri[4], ri[4] / n_samples)
            print("Text to image: %.2f, %.2f, %.2f, %.2f (%.2f), %.2f (%.2f)" % ri)
            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.2f ar: %.2f ari: %.2f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("-----------------------------------")
        print("Mean metrics from 5-fold evaluation: ")
        print("rsum: %.2f" % (mean_metrics[-1] * 6))
        print("Average i2t Recall: %.2f" % mean_metrics[-3])
        print("Image to text: %.2f %.2f %.2f %.2f (%.2f) %.2f (%.2f)" % mean_metrics[:7])
        print("Average t2i Recall: %.2f" % mean_metrics[-2])
        print("Text to image: %.2f %.2f %.2f %.2f (%.2f) %.2f (%.2f)" % mean_metrics[7:14])

    # no cross-validation, full evaluation
    r, rt = i2t(img_embs, txt_embs, nreps=nreps, return_ranks=True,
                use_gpu=args.eval_on_gpu)
    ri, rti = t2i(img_embs, txt_embs, nreps=nreps, return_ranks=True,
                  use_gpu=args.eval_on_gpu)
    ar = (r[0] + r[1] + r[2]) / 3
    ari = (ri[0] + ri[1] + ri[2]) / 3
    rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
    r = (r[0], r[1], r[2], r[3], r[3] / n_samples, r[4], r[4] / n_samples)
    ri = (ri[0], ri[1], ri[2], ri[3], ri[3] / n_samples, ri[4], ri[4] / n_samples)
    print("rsum: %.2f" % rsum)
    print("Average i2t Recall: %.2f" % ar)
    print("Image to text: %.2f %.2f %.2f %.2f (%.2f) %.2f (%.2f)" % r)
    print("Average t2i Recall: %.2f" % ari)
    print("Text to image: %.2f %.2f %.2f %.2f (%.2f) %.2f (%.2f)" % ri)
    return mean_metrics
def train(cfg):
    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        torch.distributed.init_process_group(backend="nccl", world_size=num_gpus)

    # set logger
    log_dir = os.path.join("logs/", cfg.source_dataset, cfg.prefix)
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir, exist_ok=True)
    logging.basicConfig(format="%(asctime)s %(message)s",
                        filename=log_dir + "/" + "log.txt",
                        filemode="a")
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    logger.addHandler(stream_handler)
    # writer = SummaryWriter(log_dir, purge_step=0)

    if dist.is_initialized() and dist.get_rank() != 0:
        logger = writer = None
    else:
        logger.info(pprint.pformat(cfg))

    # training data loader
    if not cfg.joint_training:
        # single domain
        train_loader = get_train_loader(root=os.path.join(cfg.source.root, cfg.source.train),
                                        batch_size=cfg.batch_size,
                                        image_size=cfg.image_size,
                                        random_flip=cfg.random_flip,
                                        random_crop=cfg.random_crop,
                                        random_erase=cfg.random_erase,
                                        color_jitter=cfg.color_jitter,
                                        padding=cfg.padding,
                                        num_workers=4)
    else:
        # cross domain
        source_root = os.path.join(cfg.source.root, cfg.source.train)
        target_root = os.path.join(cfg.target.root, cfg.target.train)
        train_loader = get_cross_domain_train_loader(source_root=source_root,
                                                     target_root=target_root,
                                                     batch_size=cfg.batch_size,
                                                     random_flip=cfg.random_flip,
                                                     random_crop=cfg.random_crop,
                                                     random_erase=cfg.random_erase,
                                                     color_jitter=cfg.color_jitter,
                                                     padding=cfg.padding,
                                                     image_size=cfg.image_size,
                                                     num_workers=8)

    # evaluation data loader
    query_loader = None
    gallery_loader = None
    if cfg.eval_interval > 0:
        query_loader = get_test_loader(root=os.path.join(cfg.target.root, cfg.target.query),
                                       batch_size=512,
                                       image_size=cfg.image_size,
                                       num_workers=4)
        gallery_loader = get_test_loader(root=os.path.join(cfg.target.root, cfg.target.gallery),
                                         batch_size=512,
                                         image_size=cfg.image_size,
                                         num_workers=4)

    # model
    num_classes = cfg.source.num_id
    num_cam = cfg.source.num_cam + cfg.target.num_cam
    cam_ids = train_loader.dataset.target_dataset.cam_ids if cfg.joint_training \
        else train_loader.dataset.cam_ids
    num_instances = len(train_loader.dataset.target_dataset) if cfg.joint_training else None
    model = Model(num_classes=num_classes,
                  drop_last_stride=cfg.drop_last_stride,
                  joint_training=cfg.joint_training,
                  num_instances=num_instances,
                  cam_ids=cam_ids,
                  num_cam=num_cam,
                  neighbor_mode=cfg.neighbor_mode,
                  neighbor_eps=cfg.neighbor_eps,
                  scale=cfg.scale,
                  mix=cfg.mix,
                  alpha=cfg.alpha)
    model.cuda()

    # optimizer
    ft_params = model.backbone.parameters()
    new_params = [param for name, param in model.named_parameters()
                  if not name.startswith("backbone.")]
    param_groups = [{'params': ft_params, 'lr': cfg.ft_lr},
                    {'params': new_params, 'lr': cfg.new_params_lr}]
    optimizer = optim.SGD(param_groups, momentum=0.9, weight_decay=cfg.wd)

    # convert model for mixed precision distributed training
    model, optimizer = amp.initialize(model, optimizer, enabled=cfg.fp16, opt_level="O2")
    lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                                  milestones=cfg.lr_step,
                                                  gamma=0.1)
    if dist.is_initialized():
        model = parallel.DistributedDataParallel(model, delay_allreduce=True)

    # engine
    checkpoint_dir = os.path.join("checkpoints", cfg.source_dataset, cfg.prefix)
    engine = get_trainer(model=model,
                         optimizer=optimizer,
                         lr_scheduler=lr_scheduler,
                         logger=logger,
                         # writer=writer,
                         non_blocking=True,
                         log_period=cfg.log_period,
                         save_interval=10,
                         save_dir=checkpoint_dir,
                         prefix=cfg.prefix,
                         eval_interval=cfg.eval_interval,
                         query_loader=query_loader,
                         gallery_loader=gallery_loader)

    # training
    engine.run(train_loader, max_epochs=cfg.num_epoch)

    if dist.is_initialized():
        dist.destroy_process_group()
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu) model = Baseline(eval=True, drop_last_stride=True, dual_path=False) # model = MixNet(eval=True, drop_last_stride=True) state_dict = torch.load(model_path) model.load_state_dict(state_dict, strict=False) model.float() model.eval() model.cuda() # extract test feature gallery_loader, query_loader = get_test_loader( dataset=dataset, root=dataset_config.data_root, batch_size=512, image_size=image_size, num_workers=16) # extract query features feats = [] labels = [] cam_ids = [] img_paths = [] for data, label, cam_id, img_path, _ in query_loader: with torch.autograd.no_grad(): feat = model(data.cuda(non_blocking=True), cam_ids=cam_id) feats.append(feat.data.cpu().numpy()) labels.append(label.data.cpu().numpy()) cam_ids.append(cam_id.data.cpu().numpy()) img_paths.extend(img_path)
def evalstack(model_path, data_path=None, split='dev', fold5=False, is_sparse=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    opt.is_sparse = is_sparse
    print(opt)
    if data_path is not None:
        opt.data_path = data_path

    opt.vocab_path = "/media/ubuntu/data/chunxiao/vocab"

    # load vocabulary used by the model
    vocab = deserialize_vocab(os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    opt.vocab_size = len(vocab)

    # construct model
    model = GSMN(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab,
                                  opt.batch_size, opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs, bbox, depends, cap_lens = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' % (img_embs.shape[0] // 5, cap_embs.shape[0]))

    if not fold5:
        # no cross-validation, full evaluation
        # keep one image embedding per group of 5 captions
        img_embs = np.array([img_embs[i] for i in range(0, len(img_embs), 5)])
        start = time.time()
        sims = shard_xattn(model, img_embs, cap_embs, bbox, depends, cap_lens,
                           opt, shard_size=80)
        end = time.time()
        print("calculate similarity time:", end - start)
        return sims
    else:
        # 5fold cross-validation, only for MSCOCO
        sims_a = []
        for i in range(5):
            img_embs_shard = img_embs[i * 5000:(i + 1) * 5000:5]
            cap_embs_shard = cap_embs[i * 5000:(i + 1) * 5000]
            cap_lens_shard = cap_lens[i * 5000:(i + 1) * 5000]
            bbox_shard = bbox[i * 5000:(i + 1) * 5000:5]
            depend_shard = depends[i * 5000:(i + 1) * 5000]
            start = time.time()
            sims = shard_xattn(model, img_embs_shard, cap_embs_shard,
                               bbox_shard, depend_shard, cap_lens_shard,
                               opt, shard_size=80)
            end = time.time()
            print("calculate similarity time:", end - start)
            sims_a.append(sims)
        return sims_a
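# Because `evalstack` returns raw similarity matrices rather than printing
# ranks, it lends itself to ensembling several checkpoints. A hypothetical
# sketch: the checkpoint paths and the plain averaging rule below are
# assumptions, not the original authors' protocol.
sims_dense = evalstack('runs/gsmn_dense/model_best.pth.tar', split='test')
sims_sparse = evalstack('runs/gsmn_sparse/model_best.pth.tar',
                        split='test', is_sparse=True)
sims_ens = (np.asarray(sims_dense) + np.asarray(sims_sparse)) / 2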
import model_2
import pickle

"""
f = open('./test_recall.log', 'a')
sys.stdout = f
sys.stderr = f
"""

parser = TestOptions()
opts = parser.parse()

vocab = pickle.load(open(os.path.join(opts.vocab_path, '%s_vocab.pkl' % opts.data_name), 'rb'))
opts.vocab_size = len(vocab)

test_loader = data.get_test_loader('test', opts.data_name, vocab,
                                   opts.crop_size, opts.batch_size,
                                   opts.workers, opts)

subspace = model_2.VSE(opts)
subspace.setgpu()
subspace.load_state_dict(torch.load(opts.resume2))
subspace.val_start()

# model
print('\n--- load model ---')
model = DRIT(opts)
model.setgpu(opts.gpu)
model.resume(opts.resume, train=False)
model.eval()

a = None
def train(cfg):
    # set logger
    log_dir = os.path.join("logs/", cfg.dataset, cfg.prefix)
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(format="%(asctime)s %(message)s",
                        filename=log_dir + "/" + "log.txt",
                        filemode="a")

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    logger.addHandler(stream_handler)
    logger.info(pprint.pformat(cfg))

    # training data loader
    train_loader = get_train_loader(dataset=cfg.dataset,
                                    root=cfg.data_root,
                                    sample_method=cfg.sample_method,
                                    batch_size=cfg.batch_size,
                                    p_size=cfg.p_size,
                                    k_size=cfg.k_size,
                                    random_flip=cfg.random_flip,
                                    random_crop=cfg.random_crop,
                                    random_erase=cfg.random_erase,
                                    color_jitter=cfg.color_jitter,
                                    padding=cfg.padding,
                                    image_size=cfg.image_size,
                                    num_workers=8)

    # evaluation data loader
    gallery_loader, query_loader = None, None
    if cfg.eval_interval > 0:
        gallery_loader, query_loader = get_test_loader(dataset=cfg.dataset,
                                                       root=cfg.data_root,
                                                       batch_size=512,
                                                       image_size=cfg.image_size,
                                                       num_workers=4)

    # model
    model = Baseline(num_classes=cfg.num_id,
                     dual_path=cfg.dual_path,
                     drop_last_stride=cfg.drop_last_stride,
                     triplet=cfg.triplet,
                     classification=cfg.classification)
    model.cuda()

    # optimizer
    assert cfg.optimizer in ['adam', 'sgd']
    if cfg.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=cfg.lr, weight_decay=cfg.wd)
    else:
        optimizer = optim.SGD(model.parameters(), lr=cfg.lr,
                              momentum=0.9, weight_decay=cfg.wd)

    # convert model for mixed precision training
    model, optimizer = amp.initialize(model, optimizer,
                                      enabled=cfg.fp16, opt_level="O2")
    lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                                  milestones=cfg.lr_step,
                                                  gamma=0.1)

    # engine
    checkpoint_dir = os.path.join("checkpoints", cfg.dataset, cfg.prefix)
    engine = get_trainer(model=model,
                         optimizer=optimizer,
                         lr_scheduler=lr_scheduler,
                         logger=logger,
                         non_blocking=True,
                         log_period=cfg.log_period,
                         save_dir=checkpoint_dir,
                         prefix=cfg.prefix,
                         eval_interval=cfg.eval_interval,
                         gallery_loader=gallery_loader,
                         query_loader=query_loader,
                         dataset=cfg.dataset)

    # training
    engine.run(train_loader, max_epochs=cfg.num_epoch)
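# Note on the loader arguments above (an inference from the parameter names,
# not documented in the original): identity-balanced sampling typically draws
# p_size identities with k_size images each per batch, so a consistent config
# would satisfy cfg.batch_size == cfg.p_size * cfg.k_size (e.g. 16 * 8 = 128).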
def evalrank(model_path, data_path=None, split='dev', fold5=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path
    opt.vocab_path = 'vocab'

    # load vocabulary used by the model
    with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
        vocab = pickle.load(f)
    opt.vocab_size = len(vocab)

    # construct model
    model = XRN(opt)

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab,
                                  opt.crop_size, opt.batch_size,
                                  opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' % (img_embs.shape[0] // 5, cap_embs.shape[0]))

    if not fold5:
        # no cross-validation, full evaluation
        r, rt = i2t(img_embs, cap_embs, measure=opt.measure, return_ranks=True)
        ri, rti = t2i(img_embs, cap_embs, measure=opt.measure, return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:
        # 5fold cross-validation, only for MSCOCO
        results = []
        for i in range(5):
            r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000],
                         cap_embs[i * 5000:(i + 1) * 5000],
                         measure=opt.measure, return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000],
                           cap_embs[i * 5000:(i + 1) * 5000],
                           measure=opt.measure, return_ranks=True)
            if i == 0:
                rt, rti = rt0, rti0
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)
            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % (mean_metrics[10] * 6))
        print("Average i2t Recall: %.1f" % mean_metrics[11])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[12])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[5:10])

    torch.save({'rt': rt, 'rti': rti}, 'ranks.pth.tar')
def evalrank(model_path, data_path=None, split='dev', fold5=False, return_ranks=False):
    """
    Evaluate a trained model on either dev or test. If `fold5=True`, 5 fold
    cross-validation is done (only for MSCOCO). Otherwise, the full data is
    used for evaluation.
    """
    # load model and options
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    checkpoint = torch.load(model_path)
    opt = checkpoint['opt']
    if data_path is not None:
        opt.data_path = data_path

    # load vocabulary used by the model
    vocab = pickle.load(open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb'))
    opt.vocab_size = len(vocab)
    opt.distributed = False
    opt.use_all = True
    opt.instance_loss = False
    opt.attention = False
    print(opt)

    # construct model
    model = VSE(opt)

    # remap classifier parameters saved under older layer indices
    if "cnn.classifier.1.weight" in checkpoint['model'][0]:
        state = checkpoint['model'][0]
        state["cnn.classifier.0.weight"] = state.pop("cnn.classifier.1.weight")
        state["cnn.classifier.0.bias"] = state.pop("cnn.classifier.1.bias")
        state["cnn.classifier.3.weight"] = state.pop("cnn.classifier.4.weight")
        state["cnn.classifier.3.bias"] = state.pop("cnn.classifier.4.bias")

    # load model state
    model.load_state_dict(checkpoint['model'])

    print('Loading dataset')
    data_loader = get_test_loader(split, opt.data_name, vocab,
                                  opt.crop_size, opt.batch_size,
                                  opt.workers, opt)

    print('Computing results...')
    img_embs, cap_embs = encode_data(model, data_loader)
    print('Images: %d, Captions: %d' % (img_embs.shape[0] // 5, cap_embs.shape[0]))

    if not fold5:
        # no cross-validation, full evaluation
        r, rt = i2t(img_embs, cap_embs, measure=opt.measure, return_ranks=True)
        ri, rti = t2i(img_embs, cap_embs, measure=opt.measure, return_ranks=True)
        ar = (r[0] + r[1] + r[2]) / 3
        ari = (ri[0] + ri[1] + ri[2]) / 3
        rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
        print("rsum: %.1f" % rsum)
        print("Average i2t Recall: %.1f" % ar)
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % r)
        print("Average t2i Recall: %.1f" % ari)
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % ri)
    else:
        # 5fold cross-validation, only for MSCOCO
        results = []
        for i in range(5):
            r, rt0 = i2t(img_embs[i * 5000:(i + 1) * 5000],
                         cap_embs[i * 5000:(i + 1) * 5000],
                         measure=opt.measure, return_ranks=True)
            print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % r)
            ri, rti0 = t2i(img_embs[i * 5000:(i + 1) * 5000],
                           cap_embs[i * 5000:(i + 1) * 5000],
                           measure=opt.measure, return_ranks=True)
            if i == 0:
                rt, rti = rt0, rti0
            print("Text to image: %.1f, %.1f, %.1f, %.1f, %.1f" % ri)
            ar = (r[0] + r[1] + r[2]) / 3
            ari = (ri[0] + ri[1] + ri[2]) / 3
            rsum = r[0] + r[1] + r[2] + ri[0] + ri[1] + ri[2]
            print("rsum: %.1f ar: %.1f ari: %.1f" % (rsum, ar, ari))
            results += [list(r) + list(ri) + [ar, ari, rsum]]

        print("-----------------------------------")
        print("Mean metrics: ")
        mean_metrics = tuple(np.array(results).mean(axis=0).flatten())
        print("rsum: %.1f" % (mean_metrics[10] * 6))
        print("Average i2t Recall: %.1f" % mean_metrics[11])
        print("Image to text: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[:5])
        print("Average t2i Recall: %.1f" % mean_metrics[12])
        print("Text to image: %.1f %.1f %.1f %.1f %.1f" % mean_metrics[5:10])

    if return_ranks:
        return rt, rti
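# A hypothetical invocation of the evaluator above; the checkpoint and data
# paths are placeholders, not files shipped with the repo.
if __name__ == '__main__':
    rt, rti = evalrank('runs/coco_vse/model_best.pth.tar',
                       data_path='data/', split='test',
                       fold5=True, return_ranks=True)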