def main_merge():
    global args, best_corr
    args.store_name = '{}_merged'.format(args.model)
    args.store_name = args.store_name + datetime.now().strftime('_%m-%d_%H-%M')
    args.start_epoch = 0
    check_rootfolders(args)

    model = Baseline(args.img_feat_size, args.au_feat_size)
    model = torch.nn.DataParallel(model).cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    if args.use_multistep:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, args.step_milestones, args.step_decay)

    # ckpt structure {epoch, state_dict, optimizer, best_corr}
    if args.resume and os.path.isfile(args.resume):
        print('Load checkpoint:', args.resume)
        ckpt = torch.load(args.resume)
        args.start_epoch = ckpt['epoch']
        best_corr = ckpt['best_corr']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        print('Loaded ckpt at epoch:', args.start_epoch)

    # initialize datasets
    train_loader = torch.utils.data.DataLoader(
        dataset=EEV_Dataset(
            csv_path=[args.train_csv, args.val_csv],
            vidmap_path=[args.train_vidmap, args.val_vidmap],
            image_feat_path=args.image_features,
            audio_feat_path=args.audio_features,
            mode='merge'),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=False, drop_last=True)

    log_training = open(
        os.path.join(args.root_log, args.store_name, 'log.csv'), 'w')
    with open(os.path.join(args.root_log, args.store_name, 'args.txt'), 'w') as f:
        f.write(str(args))
    tb_writer = SummaryWriter(
        log_dir=os.path.join(args.root_log, args.store_name))

    for epoch in range(args.start_epoch, args.epochs):
        train(train_loader, model, optimizer, epoch, log_training, tb_writer)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_corr': 0.0,
            }, False)
        if args.use_multistep:
            scheduler.step()
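The training entry points above and below call a save_checkpoint helper that is defined elsewhere in the repo. A minimal sketch of what it presumably does, assuming a per-run checkpoint folder under an args.root_model attribute (hypothetical name) and fixed file names:

import os
import shutil
import torch

def save_checkpoint(state, is_best):
    # Hedged sketch only: the real save_checkpoint is not shown in this snippet.
    # args.root_model is an assumed attribute created by check_rootfolders(args).
    filename = os.path.join(args.root_model, args.store_name, 'ckpt.pth.tar')
    torch.save(state, filename)
    if is_best:
        # keep a separate copy of the best-scoring checkpoint
        shutil.copyfile(filename, filename.replace('ckpt.pth.tar', 'best.pth.tar'))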
def train():
    # Prepare gym
    env = create_env()
    h, w, c = env.observation_space.shape

    # Prepare models
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_dir, fn = "./policy_grad", '{}.pth'
    model = Baseline(h, w).to(device)
    model.train()
    optimizer = optim.RMSprop(model.parameters(), lr=LEARN_RATE,
                              weight_decay=WEIGHT_DECAY)

    # Train
    steps_done = 0
    num_episodes = 2000
    episode_rewards = []
    for i_episode in tqdm(range(num_episodes)):
        # Complete 1 episode
        print("Episode {}".format(i_episode + 1))
        i_rewards, i_states, i_actions, steps_done = generate_episode(
            env, model, device, steps_done, episode_rewards)
        # Update model
        optimize_model(device, model, optimizer, i_rewards, i_actions, i_states)
        # Save the model every SAVE_EPI episodes
        if (i_episode + 1) % SAVE_EPI == 0:
            path = os.path.join(model_dir, fn.format(episode_rewards[-1]))
            torch.save(model.state_dict(), path)

    print('Complete')
    np.save('./rewards_policy_grad.npy', episode_rewards)
    env.close()
    plt.ioff()
    plt.show()
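generate_episode and optimize_model are defined elsewhere; the loop above only wires them together. A minimal REINFORCE-style sketch of the update that optimize_model likely performs, assuming the model returns action logits, the episode states are already tensors, and GAMMA is the discount factor (all assumptions, not taken from the original):

import torch
import torch.nn.functional as F

GAMMA = 0.99  # assumed discount factor

def optimize_model(device, model, optimizer, rewards, actions, states):
    # Discounted returns, computed backwards over the episode
    returns, running = [], 0.0
    for r in reversed(rewards):
        running = r + GAMMA * running
        returns.insert(0, running)
    returns = torch.tensor(returns, dtype=torch.float32, device=device)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    # Assumes each state is already a torch tensor (e.g. shape (C, H, W))
    states = torch.stack(states).to(device)
    actions = torch.as_tensor(actions, dtype=torch.long, device=device)

    # REINFORCE loss: -log pi(a|s) * discounted return
    log_probs = F.log_softmax(model(states), dim=-1)
    chosen = log_probs.gather(1, actions.unsqueeze(1)).squeeze(1)
    loss = -(chosen * returns).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()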
def main_train(config, checkpoint_dir=None):
    global args, best_corr
    best_corr = 0.0
    args.store_name = '{}'.format(args.model)
    args.store_name = args.store_name + datetime.now().strftime('_%m-%d_%H-%M-%S')
    args.start_epoch = 0
    # check_rootfolders(args)

    if args.model == 'Baseline':
        model = Baseline()
    elif args.model == 'TCFPN':
        model = TCFPN(layers=[48, 64, 96], in_channels=(2048 + 128),
                      num_classes=15, kernel_size=11)
    model = torch.nn.DataParallel(model).cuda()

    if config['optimizer'] == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    elif config['optimizer'] == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])
    # custom optimizer
    if args.use_sam:
        base_optim = torch.optim.Adam
        optimizer = SAM(model.parameters(), base_optim, lr=config['lr'])
    # custom lr scheduler
    if args.use_cos_wr:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=args.cos_wr_t0, T_mult=args.cos_wr_t_mult)
    elif args.use_cos:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.cos_t_max)
    # SWA
    if args.use_swa:
        swa_model = torch.optim.swa_utils.AveragedModel(model)
        swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, swa_lr=config['lr'])

    # ckpt structure {epoch, state_dict, optimizer, best_corr}
    # if args.resume and os.path.isfile(args.resume):
    #     print('Load checkpoint:', args.resume)
    #     ckpt = torch.load(args.resume)
    #     args.start_epoch = ckpt['epoch']
    #     best_corr = ckpt['best_corr']
    #     model.load_state_dict(ckpt['state_dict'])
    #     optimizer.load_state_dict(ckpt['optimizer'])
    #     print('Loaded ckpt at epoch:', args.start_epoch)
    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    # initialize datasets
    train_loader = torch.utils.data.DataLoader(
        dataset=EEV_Dataset(
            csv_path=args.train_csv,
            vidmap_path=args.train_vidmap,
            image_feat_path=args.image_features,
            audio_feat_path=args.audio_features,
            mode='train',
            lpfilter=args.lp_filter
        ),
        batch_size=config['batch_size'], shuffle=True,
        num_workers=args.workers, pin_memory=False, drop_last=True
    )

    val_loader = torch.utils.data.DataLoader(
        dataset=EEV_Dataset(
            csv_path=args.val_csv,
            vidmap_path=args.val_vidmap,
            image_feat_path=args.image_features,
            audio_feat_path=args.audio_features,
            mode='val'
        ),
        batch_size=None, shuffle=False,
        num_workers=args.workers, pin_memory=False
    )

    accuracy = correlation

    # with open(os.path.join(args.root_log, args.store_name, 'args.txt'), 'w') as f:
    #     f.write(str(args))
    # tb_writer = SummaryWriter(log_dir=os.path.join(args.root_log, args.store_name))

    for epoch in range(args.start_epoch, args.epochs):
        # train
        train(train_loader, model, optimizer, epoch, None, None)
        # do lr scheduling after epoch
        if args.use_swa and epoch >= args.swa_start:
            print('swa stepping...')
            swa_model.update_parameters(model)
            swa_scheduler.step()
        elif args.use_cos_wr:
            print('cos warm restart (T0:{} Tm:{}) stepping...'.format(
                args.cos_wr_t0, args.cos_wr_t_mult))
            scheduler.step()
        elif args.use_cos:
            print('cos (Tmax:{}) stepping...'.format(args.cos_t_max))
            scheduler.step()

        # validate
        if args.use_swa and epoch >= args.swa_start:
            # validate using the SWA model
            corr, loss = validate(val_loader, swa_model, accuracy, epoch, None, None)
        else:
            corr, loss = validate(val_loader, model, accuracy, epoch, None, None)
        is_best = corr > best_corr
        best_corr = max(corr, best_corr)
        # tb_writer.add_scalar('acc/validate_corr_best', best_corr, epoch)
        # output_best = 'Best corr: %.4f\n' % (best_corr)
        # print(output_best)
        # save_checkpoint({
        #     'epoch': epoch + 1,
        #     'state_dict': model.state_dict(),
        #     'optimizer': optimizer.state_dict(),
        #     'best_corr': best_corr,
        # }, is_best)

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            if is_best:
                path = os.path.join(checkpoint_dir, "checkpoint_best")
            torch.save((model.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=loss, accuracy=corr, best_corr=best_corr)
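This variant reports metrics through tune.report and writes checkpoints through tune.checkpoint_dir, i.e. the legacy Ray Tune function API. A hedged example of how it could be launched; the search space and trial budget below are illustrative, not taken from the repo:

from ray import tune
from ray.tune.schedulers import ASHAScheduler

# Illustrative search space over the config keys main_train actually reads
search_space = {
    'optimizer': tune.choice(['adam', 'adamw']),
    'lr': tune.loguniform(1e-5, 1e-2),
    'batch_size': tune.choice([32, 64, 128]),
}

analysis = tune.run(
    main_train,
    config=search_space,
    num_samples=16,
    scheduler=ASHAScheduler(metric='best_corr', mode='max'),
    resources_per_trial={'cpu': 4, 'gpu': 1},
)
print('Best config:', analysis.get_best_config(metric='best_corr', mode='max'))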
def main_train():
    global args, best_corr
    args.store_name = '{}'.format(args.model)
    args.store_name = args.store_name + datetime.now().strftime('_%m-%d_%H-%M-%S')
    args.start_epoch = 0
    if not args.val_only:
        check_rootfolders(args)

    if args.model == 'Baseline':
        if args.cls_indices:
            model = Baseline(args.img_feat_size, args.au_feat_size,
                             num_classes=len(args.cls_indices))
        else:
            print('Feature size:', args.img_feat_size, args.au_feat_size)
            model = Baseline(args.img_feat_size, args.au_feat_size)
    elif args.model == 'TCFPN':
        model = TCFPN(layers=[48, 64, 96], in_channels=(128),
                      num_classes=15, kernel_size=11)
    elif args.model == 'BaseAu':
        model = Baseline_Au(args.au_feat_size)
    elif args.model == 'BaseImg':
        model = Baseline_Img(args.img_feat_size)
    elif args.model == 'EmoBase':
        model = EmoBase()
    model = torch.nn.DataParallel(model).cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=args.weight_decay)
    # optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate)
    # custom optimizer
    if args.use_sam:
        base_optim = torch.optim.Adam
        optimizer = SAM(model.parameters(), base_optim, lr=args.learning_rate)
    # custom lr scheduler
    if args.use_cos_wr:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=args.cos_wr_t0, T_mult=args.cos_wr_t_mult)
    elif args.use_cos:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.cos_t_max)
    elif args.use_multistep:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, args.step_milestones, args.step_decay)
    # SWA
    if args.use_swa:
        swa_model = torch.optim.swa_utils.AveragedModel(model)
        swa_scheduler = torch.optim.swa_utils.SWALR(optimizer,
                                                    swa_lr=args.learning_rate)

    # ckpt structure {epoch, state_dict, optimizer, best_corr}
    if args.resume and os.path.isfile(args.resume):
        print('Load checkpoint:', args.resume)
        ckpt = torch.load(args.resume)
        args.start_epoch = ckpt['epoch']
        best_corr = ckpt['best_corr']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        print('Loaded ckpt at epoch:', args.start_epoch)

    # initialize datasets
    train_loader = torch.utils.data.DataLoader(
        dataset=EEV_Dataset(
            csv_path=args.train_csv,
            vidmap_path=args.train_vidmap,
            image_feat_path=args.image_features,
            audio_feat_path=args.audio_features,
            mode='train',
            lpfilter=args.lp_filter,
            train_freq=args.train_freq,
            val_freq=args.val_freq,
            cls_indices=args.cls_indices),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=False, drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        dataset=EEV_Dataset(
            csv_path=args.val_csv,
            vidmap_path=args.val_vidmap,
            image_feat_path=args.image_features,
            audio_feat_path=args.audio_features,
            mode='val',
            train_freq=args.train_freq,
            val_freq=args.val_freq,
            cls_indices=args.cls_indices,
            repeat_sample=args.repeat_sample),
        batch_size=None, shuffle=False,
        num_workers=args.workers, pin_memory=False)

    accuracy = correlation

    if args.val_only:
        print('Run validation ...')
        print('start epoch:', args.start_epoch, 'model:', args.resume)
        validate(val_loader, model, accuracy, args.start_epoch, None, None)
        return

    log_training = open(
        os.path.join(args.root_log, args.store_name, 'log.csv'), 'w')
    with open(os.path.join(args.root_log, args.store_name, 'args.txt'), 'w') as f:
        f.write(str(args))
    tb_writer = SummaryWriter(
        log_dir=os.path.join(args.root_log, args.store_name))

    for epoch in range(args.start_epoch, args.epochs):
        train(train_loader, model, optimizer, epoch, log_training, tb_writer)
        # do lr scheduling after epoch
        if args.use_swa and epoch >= args.swa_start:
            print('swa stepping...')
            swa_model.update_parameters(model)
            swa_scheduler.step()
        elif args.use_cos_wr or args.use_cos or args.use_multistep:
            scheduler.step()

        if (epoch + 1) > 2 and ((epoch + 1) % args.eval_freq == 0
                                or (epoch + 1) == args.epochs):
            # validate
            if args.use_swa and epoch >= args.swa_start:
                # validate using the SWA model
                corr = validate(val_loader, swa_model, accuracy, epoch,
                                log_training, tb_writer)
            else:
                corr = validate(val_loader, model, accuracy, epoch,
                                log_training, tb_writer)
            is_best = corr > best_corr
            best_corr = max(corr, best_corr)
            tb_writer.add_scalar('acc/validate_corr_best', best_corr, epoch)
            output_best = 'Best corr: %.4f\n' % (best_corr)
            print(output_best)
            log_training.write(output_best + '\n')
            log_training.flush()
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'best_corr': best_corr,
                }, is_best)
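When args.use_sam is set, the optimizer becomes a SAM wrapper, which needs two forward/backward passes per batch rather than a single optimizer.step(). The repo's SAM class is not shown here; a minimal sketch of the inner training step, assuming it follows the common first_step / second_step interface (an assumption):

def sam_train_step(model, criterion, optimizer, inputs, targets):
    # First pass: compute gradients and move the weights to the local "worst-case" point
    loss = criterion(model(inputs), targets)
    loss.backward()
    optimizer.first_step(zero_grad=True)
    # Second pass: recompute gradients at the perturbed weights, then do the real update
    criterion(model(inputs), targets).backward()
    optimizer.second_step(zero_grad=True)
    return loss.item()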
# model_path = ' '
model = Baseline(model='train', model_name=model_name, model_path=model_path)
# model.load_param('models/model_1_180000.pth')
model = model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# kd_id = 0
# kd_num = 7
# batch_size = 48
# instance_num = 1
train_data, val_data, trains, vals = make_dataloader(kd_id, kd_num)
train_loader = DataLoader(dataset=train_data, batch_size=batch_size,
                          sampler=RandomSampler(trains, batch_size, instance_num),
                          shuffle=False, num_workers=2, collate_fn=train_collate)
# train_loader = DataLoader(dataset=train_data, batch_size=48, shuffle=False,
#                           num_workers=2, collate_fn=train_collate)
val_loader = DataLoader(dataset=val_data, batch_size=64, shuffle=False,
                        num_workers=2, collate_fn=train_collate)
train_length = len(train_loader)
val_length = len(val_loader)

if __name__ == '__main__':
    max_epoch = 50
    max_val_acc = 0
    for epoch in range(0, max_epoch):
        adjust_lr(optimizer, epoch)
        train_fuc(model, epoch)
        val_acc = val_fuc(model, epoch)
        torch.save(model.state_dict(),
                   'models/' + str(kd_id) + '_' + model_name + '_' +
                   '%.5f' % (val_acc) + '_' + str(epoch) + '.pth')
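adjust_lr, train_fuc and val_fuc are imported from elsewhere in the project. A plausible sketch of adjust_lr as a simple step decay of the 1e-4 base rate; the schedule itself is an assumption, not taken from the original code:

def adjust_lr(optimizer, epoch, base_lr=1e-4, decay=0.1, step=20):
    # Hypothetical schedule: multiply the base learning rate by `decay` every `step` epochs
    lr = base_lr * (decay ** (epoch // step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr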