os.path.join('data', vocab_source, 'explain_dictionary.pkl'))  # NOTE(review): tail of a call started outside this excerpt — presumably loading the explanation dictionary; confirm against the preceding lines.
# --- VQA-E evaluation driver: load cached datasets, restore a trained model,
# --- run evaluation on the val split and persist the results. ---
#train_dset = VQAFeatureDataset('train', q_dict, c_dict, 'cache/VQAE2',args.att_thr)
#eval_dset = VQAFeatureDataset('val', q_dict, c_dict, 'cache/VQAE2',args.att_thr)
# Datasets built from the pre-extracted VQA-E cache.
train_dset = VQAEDataset('train', q_dict, c_dict, 'cache/VQAE2')
eval_dset = VQAEDataset('val', q_dict, c_dict, 'cache/VQAE2')
#train_dset = VQAEVQA2Dataset('train', q_dict, c_dict, 'cache')
#eval_dset = VQAEVQA2Dataset('val', q_dict, c_dict, 'cache')
batch_size = args.batch_size
# Model class is selected by name: utils must expose a 'build_<model>' constructor.
constructor = 'build_%s' % args.model
model = utils.model_factory(constructor, train_dset, args.num_hid, args.att_dim,
                            args.decode_dim).cuda()
# Restore the trained weights saved under the experiment output directory.
model_path = os.path.join(args.output, 'model.pth')
model_state = torch.load(model_path)
model.load_state_dict(model_state)
print('Model has {} parameters in total'.format(utils.params_count(model)))
#model = nn.DataParallel(model).cuda()
eval_loader = DataLoader(eval_dset, batch_size, shuffle=False, num_workers=1)
model.train(False)  # switch to eval mode (disables dropout/batchnorm updates)
vqa_score, results = evaluate(model, eval_loader, q_dict, c_dict)
# Persist both the aggregate score and the per-question results together.
save_obj = {'vqa_score': vqa_score, 'results': results}
save_results(save_obj, args.output)
#save_results(results, args.output)
def run(config, is_train, eval_name):
    """Train a model, or evaluate a saved checkpoint, as described by *config*.

    Args:
        config: parsed YAML config with at least 'training_parameters'
            (seed, gpu, batch_size, num_workers, ...) and 'logs' (dir_logs).
        is_train: True to train; False to only run inference with a
            previously saved checkpoint and return.
        eval_name: "val" to hold the val split out for per-epoch evaluation,
            or "test" to train on train+val and score the test split.

    Side effects: writes log.json, a copy of the config, model checkpoints
    and result files under config['logs']['dir_logs'].
    """
    torch.manual_seed(config['training_parameters']['seed'])
    args.gpu = config['training_parameters']['gpu']
    output = config['logs']['dir_logs']
    batch_size = config['training_parameters']['batch_size']
    if args.gpu:
        torch.cuda.manual_seed(config['training_parameters']['seed'])
        # cudnn autotuner: faster when input sizes are fixed across batches.
        torch.backends.cudnn.benchmark = True

    if is_train:
        # When eval_name == "test", load_model_data also loads the test split.
        print("training . . .")
        model, train_dset, eval_dset, embedding_weight, test_dset = load_model_data(
            config, is_train=is_train, eval_name=eval_name)
    else:
        # Inference-only path: restore a checkpoint, score eval split, and exit.
        print("testing . . .")
        model, eval_dset = load_model_data(config, is_train=is_train,
                                           eval_name=eval_name)
        if args.gpu:
            model = nn.DataParallel(model).cuda()
        # NOTE(review): checkpoint epoch is hard-coded — confirm this is the
        # intended snapshot or make it configurable.
        model_dir = os.path.join(output, "model_epoch16.pth")
        eval_loader = DataLoader(
            eval_dset, batch_size, shuffle=False,
            num_workers=config['training_parameters']['num_workers'],
            collate_fn=utils.trim_collate)
        utils.compute_result(eval_name, model, model_dir, eval_loader, output)
        return

    logger = utils.logger(os.path.join(output, 'log.json'))
    model_size = utils.params_count(model)
    print("nParams:", model_size)
    logger.add("model size(Params)", model_size)
    logger.add("train set", len(train_dset))
    logger.add("val set", len(eval_dset))
    # BUGFIX: was `output + "config.yaml"`, which writes e.g.
    # "logs/expconfig.yaml" whenever dir_logs lacks a trailing slash;
    # every other path in this function already uses os.path.join.
    with open(os.path.join(output, "config.yaml"), "w") as yaml_file:
        yaml.dump(config, yaml_file)
    # model.embedding.init_embedding(embedding_weight)
    if args.gpu:
        model = nn.DataParallel(model).cuda()
    print("success to create model.")  # typo fixed (was "sucees")

    # "val" runs keep the val split aside for evaluation; otherwise we train
    # on train+val for the final test run.
    evaluation = eval_name == "val"
    if evaluation:
        print("train with train dataset")
        eval_loader = DataLoader(
            eval_dset, batch_size, shuffle=False,
            num_workers=config['training_parameters']['num_workers'],
            collate_fn=utils.trim_collate)
        train_loader = DataLoader(
            train_dset, batch_size, shuffle=True,
            num_workers=config['training_parameters']['num_workers'],
            collate_fn=utils.trim_collate)
    else:
        print("train with train and val dataset")
        eval_loader = None
        train_dset = ConcatDataset([train_dset, eval_dset])
        train_loader = DataLoader(
            train_dset, batch_size, shuffle=True,
            num_workers=config['training_parameters']['num_workers'],
            collate_fn=utils.trim_collate)

    # Optimizer: Adamax over trainable parameters only (frozen params skipped).
    optim = torch.optim.Adamax(
        filter(lambda p: p.requires_grad, model.parameters()), lr=0.0015)

    train(model, train_loader, eval_loader, logger, optim, output,
          **config['training_parameters'])

    if eval_name == "val":
        # Score the best checkpoint selected during training.
        model_dir = os.path.join(output, "model_best.pth")
        utils.compute_result(eval_name, model, model_dir, eval_loader, output)
    else:  # test
        # NOTE(review): hard-coded epoch-5 checkpoint — confirm intended.
        model_dir = os.path.join(output, "model_epoch5.pth")
        test_loader = DataLoader(
            test_dset, batch_size, shuffle=False,
            num_workers=config['training_parameters']['num_workers'],
            collate_fn=utils.trim_collate)
        utils.compute_result(eval_name, model, model_dir, test_loader, output)
# --- Two-model comparison driver: restore two checkpoints of the same
# --- architecture and compare their predictions on the eval split. ---
# Both models are built from the same 'build_<model>' constructor in utils.
constructor = 'build_%s' % args.model
model1 = utils.model_factory(constructor, train_dset, args.num_hid, args.att_dim,
                             args.decode_dim).cuda()
model2 = utils.model_factory(constructor, train_dset, args.num_hid, args.att_dim,
                             args.decode_dim).cuda()
# Restore weights from the two experiment directories being compared.
model1_path = os.path.join(args.source1, 'model.pth')
model1_state = torch.load(model1_path)
model1.load_state_dict(model1_state)
model2_path = os.path.join(args.source2, 'model.pth')
model2_state = torch.load(model2_path)
model2.load_state_dict(model2_state)
# Both models share one architecture, so one parameter count suffices.
print('Model has {} parameters in total'.format(
    utils.params_count(model1)))
eval_loader = DataLoader(eval_dset, batch_size, shuffle=False, num_workers=1)
model1.train(False)  # eval mode for both models
model2.train(False)
# Side-by-side comparison written to args.output.
compare(model1, model2, eval_loader, q_dict, c_dict, label2ans, args.output)
# NOTE(review): the trailing True/False flag's meaning isn't visible here —
# presumably selects which model's results get tagged/saved; confirm in evaluate().
evaluate(model1, eval_loader, q_dict, c_dict, True)
evaluate(model2, eval_loader, q_dict, c_dict, False)
#vqa_score, results = evaluate(model1, eval_loader, q_dict, c_dict)
#save_obj = {'vqa_score': vqa_score, 'results': results}
#save_results(save_obj, args.output)
# exp_logger = load_checkpoint(model.module, optimizer, cf.resume) # else: exp_logger = utils.Experiment(os.path.basename(cf.log_dir)) meters = { 'loss': utils.AvgMeter(), 'acc1': utils.AvgMeter(), 'acc5': utils.AvgMeter(), 'batch_time': utils.AvgMeter(), 'data_time': utils.AvgMeter(), 'epoch_time': utils.SumMeter() } for split in vqa.data['qa'].keys(): exp_logger.add_meters(split, meters) exp_logger.info['model_params'] = utils.params_count(model) # print('Model has {} parameters'.format(exp_logger.info['model_params'])) print('<train.py> Start training...') max_step = None if cf.debug: # max_step = 5 print('<train.py>: You are in debugging mode...') auto_find = { 'train': ['train'] + [False] * cf.epochs, 'test_dev': ['test_dev'] + [False] * cf.epochs, 'test': ['test'] + [False] * cf.epochs, 'test_local': ['test_local'] + [False] * cf.epochs
def main():
    """Entry point: parse CLI args, build the data loaders, model, loss and
    optimizer, optionally resume from a checkpoint, then train and save a
    checkpoint (plus a "best" copy) every epoch.
    """
    import shutil  # local import: file-level import block is outside this excerpt

    # Hyper-parameter / path settings
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='/mnt/data/linkaiyi/scan/data/f30k_precomp',
                        help='path to datasets')
    parser.add_argument('--path_opt', default='option/FusionNoattn_baseline.yaml',
                        type=str, help='path to a yaml options file')
    parser.add_argument('--data_name', default='flickr30k_splits',
                        help='{coco,f30k}_splits')
    parser.add_argument('--logger_name', default='./log_2',
                        help='Path to save Tensorboard log.')
    parser.add_argument(
        '--vocab_path',
        default='/home/linkaiyi/fusion_wangtan/Fusion_flickr/Fusion_10.28/vocab',
        help='Path to saved vocabulary json files.')
    parser.add_argument(
        '--model_name',
        default='/mnt/data/linkaiyi/mscoco/fusion/Fusion_flic/runs/checkpoint',
        help='Path to save the model.')
    parser.add_argument('--num_epochs', default=120, type=int,
                        help='Number of training epochs.')
    parser.add_argument('--batch_size', default=128, type=int,
                        help='Size of a training mini-batch.')
    parser.add_argument('--workers', default=2, type=int,
                        help='Number of data loader workers.')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--lr_update', default=20, type=int,
                        help='Number of epochs to update the learning rate.')
    opt = parser.parse_args()

    if os.path.isdir(opt.logger_name):
        # BUGFIX: `default=False` was being passed to str.format() (where it
        # was silently ignored) instead of to click.confirm().
        if click.confirm('Logs directory already exists in {}. Erase?'.format(
                opt.logger_name), default=False):
            # shutil.rmtree replaces `os.system('rm -r ' + path)`: no shell,
            # portable, and safe for paths containing spaces.
            shutil.rmtree(opt.logger_name)
    tb_logger.configure(opt.logger_name, flush_secs=5)
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

    #########################################################################
    # Create options: hard-coded skeleton overridden by the YAML file if given.
    #########################################################################
    options = {'logs': {}, 'coco': {}, 'model': {'seq2vec': {}}, 'optim': {}}
    if opt.path_opt is not None:
        with open(opt.path_opt, 'r') as handle:
            # BUGFIX: bare yaml.load() is deprecated and unsafe on untrusted
            # input; the options file is plain YAML, so safe_load suffices.
            options_yaml = yaml.safe_load(handle)
        options = utils.update_values(options, options_yaml)

    vocab = deserialize_vocab(
        os.path.join(opt.vocab_path, '%s_vocab.json' % opt.data_name))
    # Word list ordered by vocabulary index, so position i names token id i.
    vocab_word = sorted(vocab.word2idx.items(), key=lambda x: x[1],
                        reverse=False)
    vocab_word = [tup[0] for tup in vocab_word]
    opt.vocab_size = len(vocab)

    # Create dataset, model, criterion and optimizer
    train_loader, val_loader = data.get_loaders(opt.data_path, vocab,
                                                opt.batch_size, opt.workers,
                                                opt)
    model = models.factory(options['model'], vocab_word,
                           cuda=True, data_parallel=False)
    # NOTE(review): class weights [1, 128] presumably rebalance a 1-vs-127
    # negatives contrastive setup tied to batch_size=128 — confirm.
    criterion = nn.CrossEntropyLoss(weight=torch.Tensor([1, 128])).cuda()
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=options['optim']['lr'])
    print('Model has {} parameters'.format(utils.params_count(model)))

    # BUGFIX: previously start_epoch/best_rsum were unbound (NameError at the
    # training loop) when --resume pointed at a missing file, and best_rsum
    # was unconditionally reset to 0 even after a successful resume.
    start_epoch = 0
    best_rsum = 0
    # optionally resume from a checkpoint
    if opt.resume:
        if os.path.isfile(opt.resume):
            print("=> loading checkpoint '{}'".format(opt.resume))
            checkpoint = torch.load(opt.resume)
            start_epoch = checkpoint['epoch']
            best_rsum = checkpoint['best_rsum']
            model.load_state_dict(checkpoint['model'])
            # Eiters is used to show logs as the continuation of another
            # training
            model.Eiters = checkpoint['Eiters']
            print("=> loaded checkpoint '{}' (epoch {}, best_rsum {})".format(
                opt.resume, start_epoch, best_rsum))
            engine.validate(val_loader, model, criterion, optimizer,
                            opt.batch_size)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))

    # Train the Model
    for epoch in range(start_epoch, opt.num_epochs):
        adjust_learning_rate(opt, options, optimizer, epoch)

        # train for one epoch
        engine.train(train_loader, model, criterion, optimizer, epoch,
                     print_freq=10)

        # evaluate on validation set
        rsum = engine.validate(val_loader, model, criterion, optimizer,
                               opt.batch_size)

        is_best = rsum > best_rsum
        best_rsum = max(rsum, best_rsum)
        # makedirs(..., exist_ok=True) is race-free and creates parents,
        # unlike the old exists()-then-mkdir pair.
        os.makedirs(opt.model_name, exist_ok=True)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': 'baseline',
                'model': model.state_dict(),
                'best_rsum': best_rsum,
                'opt': opt,
                'options': options,
                'Eiters': model.Eiters,
            },
            is_best,
            filename='checkpoint_{}{}.pth.tar'.format(epoch, best_rsum),
            prefix=opt.model_name + '/')