def main(args):
    """Evaluate a trained LSTM trajectory model on the test split.

    Loads a fixed checkpoint, runs the network over every test batch, and
    logs the average displacement error (ADE) and final displacement error
    (FDE), each averaged over batches.
    """
    test_path = get_dset_path(args.dataset_name, 'test')
    logger.info("Initializing test dataset")
    test_dset, test_loader = data_loader(args, test_path)

    net = LSTM_model(args)
    net = net.cuda()
    # Bug fix: the original literal ".\model\lstm767.tar" used Windows-only
    # backslashes; forward slashes work everywhere and match the sibling
    # script's "./model/..." convention.
    checkpoint_path = "./model/lstm767.tar"
    checkpoint = torch.load(checkpoint_path)
    net.load_state_dict(checkpoint['state_dict'])
    net.eval()

    batch_error = 0
    batch_fde = 0
    num_batches = 0  # avoids an unbound `idx` if the loader is empty
    for idx, batch in enumerate(test_loader):
        (obs_traj, pred_traj_gt, obs_traj_rel, pred_traj_gt_rel,
         non_linear_ped, loss_mask, seq_start_end) = batch
        # (8 n 2) -- presumably (obs_len, num_ped, 2); TODO confirm
        num_ped = obs_traj.size(1)
        pred_traj_gt = pred_traj_gt.cuda()
        pred_traj = net(obs_traj.cuda(), num_ped, pred_traj_gt)
        # (Removed unused `ade_1 = get_mean_error(...)` -- its result was
        # never read.)
        # ADE: summed displacement error normalised by pedestrians * 12
        # prediction steps.
        ade_batch = displacement_error(pred_traj, pred_traj_gt) / (
            pred_traj.size(1) * 12)
        fde_batch = final_displacement_error(
            pred_traj, pred_traj_gt) / pred_traj.size(1)
        batch_error += ade_batch
        batch_fde += fde_batch
        num_batches = idx + 1

    ade = batch_error / num_batches
    fin_fde = batch_fde / num_batches
    logger.info("ade is {:.2f}".format(ade))
    # Bug fix: this line previously logged the FDE value labelled "ade is".
    logger.info("fde is {:.2f}".format(fin_fde))
def main(args):
    """Evaluate one checkpoint (or every checkpoint in a directory) and dump
    the predicted trajectories to a pickle file.

    For each checkpoint: rebuild the generator, load the split named by
    args.dset_type using the *training-time* arguments stored in the
    checkpoint, print ADE/FDE, and pickle the trajectories from evaluate().
    """
    if os.path.isdir(args.model_path):
        # Directory of checkpoints: evaluate all of them in sorted order.
        filenames = os.listdir(args.model_path)
        filenames.sort()
        paths = [os.path.join(args.model_path, file_) for file_ in filenames]
    else:
        paths = [args.model_path]
    for path in paths:
        checkpoint = torch.load(path)
        generator = get_generator(checkpoint)
        # Arguments the model was trained with (dataset name, pred_len, ...).
        _args = AttrDict(checkpoint['args'])
        # NOTE: `path` is reused here -- from this point it names the dataset
        # location, not the checkpoint file.
        path = get_dset_path(_args.dataset_name, args.dset_type)
        _, loader = data_loader(_args, path)
        ade, fde, trajs = evaluate(_args, loader, generator, args.num_samples)
        print('Dataset: {}, Pred Len: {}, ADE: {:.2f}, FDE: {:.2f}'.format(
            _args.dataset_name, _args.pred_len, ade, fde))
        # Ensure the dump directory exists.
        path = "trajs_dumped/" + "/".join(_args.dataset_name.split("/")[:-1])
        pathlib.Path(path).mkdir(parents=True, exist_ok=True)
        # NOTE(review): the pickle below is written to "trajs_dumped/<model>_..."
        # while the directory created above derives from the dataset name --
        # these only coincide when dataset_name has no subdirectory; confirm.
        with open(
                "trajs_dumped/" + args.model_path.split("/")[-1].split(".")[0]
                + "_" + args.dset_type + "_trajs.pkl", 'wb+') as f:
            pickle.dump(trajs, f)
        print(
            "trajs dumped at ",
            args.model_path.split("/")[-1].split(".")[0] + "_" +
            args.dset_type + "_trajs.pkl")
def main(args):
    """Restore a generator from ``args.resume`` and plot its trajectories."""
    ckpt = torch.load(args.resume)
    model = get_generator(ckpt)
    dset_path = get_dset_path(args.dataset_name, args.dset_type)
    _, eval_loader = data_loader(args, dset_path)
    plot_trajectory(args, eval_loader, model)
def main(args):
    """Restore a generator from ``args.resume``, evaluate it, and print
    ADE/FDE at high precision."""
    ckpt = torch.load(args.resume)
    model = get_generator(ckpt)
    dset_path = get_dset_path(args.dataset_name, args.dset_type)
    _, eval_loader = data_loader(args, dset_path)
    avg_err, final_err = evaluate(args, eval_loader, model)
    print("Dataset: {}, Pred Len: {}, ADE: {:.12f}, FDE: {:.12f}".format(
        args.dataset_name, args.pred_len, avg_err, final_err))
def main(args):
    """Restore a generator from ``args.resume``, run it over the requested
    split, and print the prediction count plus the first prediction."""
    ckpt = torch.load(args.resume)
    model = get_generator(ckpt)
    dset_path = get_dset_path(args.dataset_name, args.dset_type)
    _, eval_loader = data_loader(args, dset_path)
    prediction = evaluate(args, eval_loader, model)
    print(len(prediction))
    print(prediction[0])
def main(args):
    """Evaluate one checkpoint, or every checkpoint inside a directory,
    printing ADE/FDE for each."""
    if os.path.isdir(args.model_path):
        # Evaluate every checkpoint in the directory, in sorted order.
        ckpt_paths = [
            os.path.join(args.model_path, name)
            for name in sorted(os.listdir(args.model_path))
        ]
    else:
        ckpt_paths = [args.model_path]
    for ckpt_path in ckpt_paths:
        ckpt = torch.load(ckpt_path)
        model = get_generator(ckpt)
        # Rebuild the argument namespace the model was trained with.
        _args = AttrDict(ckpt['args'])
        dset_path = get_dset_path(_args.dataset_name, args.dset_type)
        _, eval_loader = data_loader(_args, dset_path)
        ade, fde = evaluate(_args, eval_loader, model, args.num_samples)
        print('Dataset: {}, Pred Len: {}, ADE: {:.2f}, FDE: {:.2f}'.format(
            _args.dataset_name, _args.pred_len, ade, fde))
def main(args):
    """Evaluate a trained LSTM trajectory model on the test split and log
    averaged ADE/FDE.  All .cuda() transfers are commented out, so this runs
    on CPU despite the CUDA device setup below."""
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    device_ids = [0, 1]  # GPU ids; unused while the cuda() lines are disabled
    test_path = get_dset_path(args.dataset_name, 'test')
    logger.info("Initializing test dataset")
    test_dset, test_loader = data_loader(args, test_path)
    net = LSTM_model(args)
    #net = net.cuda(device_ids[1])
    checkpoint_path = "./model/lstm348.tar"
    checkpoint = torch.load(checkpoint_path)
    net.load_state_dict(checkpoint['state_dict'])
    net.eval()
    count = 0
    total_ade = 0
    total_fde = 0
    for batch in test_loader:
        (obs_traj, pred_traj_gt, obs_traj_rel, pred_traj_gt_rel,
         non_linear_ped, loss_mask, seq_start_end) = batch
        # (8 n 2) -- presumably (obs_len, num_ped, 2); TODO confirm
        num_ped = obs_traj.size(1)
        #pred_traj_gt = pred_traj_gt.cuda(device_ids[1])
        pred_traj = net(obs_traj, num_ped, pred_traj_gt, seq_start_end)
        ade = get_mean_error(pred_traj, pred_traj_gt)
        total_ade += ade
        # FDE: error on the final predicted timestep, divided by pedestrians.
        fde = final_displacement_error(pred_traj[-1], pred_traj_gt[-1])
        total_fde += (fde / num_ped)
        #logger.info("ade is {:.2f}".format(ade))
        count += 1
    # NOTE(review): both metrics are per-batch means, not weighted by the
    # number of pedestrians in each batch -- confirm intended.
    ade_fin = total_ade / count
    fde_fin = total_fde / count
    logger.info("ade is {:.2f}".format(ade_fin))
    logger.info("fde is {:.2f}".format(fde_fin))
def objective(trial):
    """Optuna objective: train the GAN (generator + discriminator) for a
    fixed 50-iteration budget with trial-sampled hyper-parameters and return
    the validation ADE.

    Relies on module-level ``args`` and ``logger`` plus the shared
    train/eval helpers.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_num
    train_path = get_dset_path(args.dataset_name, 'train')
    val_path = get_dset_path(args.dataset_name, 'val')
    long_dtype, float_dtype = get_dtypes(args)
    # Per-trial hyper-parameters.  'discriminator_wight' [sic] is the study's
    # parameter key and must stay spelled this way.
    discriminator_wight = trial.suggest_categorical('discriminator_wight', [0, 1])
    optim_name = trial.suggest_categorical('optim_name',
                                           ['Adam', 'Adamax', 'RMSprop'])
    # args.batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    args.dropout = trial.suggest_categorical('drop_out', [0, 0.2, 0.5])
    args.batch_norm = trial.suggest_categorical('batch_norm', [0, 1])
    # Cap how much data each trial touches so trials finish quickly.
    N_TRAIN_EXAMPLES = args.batch_size * 30
    N_VALID_EXAMPLES = args.batch_size * 10
    logger.info("Initializing train dataset")
    train_dset, train_loader = data_loader(args, train_path)
    logger.info("Initializing val dataset")
    _, val_loader = data_loader(args, val_path)
    generator = TrajectoryGenerator(
        obs_len=args.obs_len,
        pred_len=args.pred_len,
        embedding_dim=args.embedding_dim,
        encoder_h_dim=args.encoder_h_dim_g,
        decoder_h_dim=args.decoder_h_dim_g,
        mlp_dim=args.mlp_dim,
        num_layers=args.num_layers,
        noise_dim=args.noise_dim,
        noise_type=args.noise_type,
        noise_mix_type=args.noise_mix_type,
        pooling_type=args.pooling_type,
        pool_every_timestep=args.pool_every_timestep,
        dropout=args.dropout,
        bottleneck_dim=args.bottleneck_dim,
        neighborhood_size=args.neighborhood_size,
        grid_size=args.grid_size,
        batch_norm=args.batch_norm,
        use_cuda=args.use_gpu)
    generator.apply(init_weights)
    generator.type(float_dtype).train()
    logger.info('Here is the generator:')
    logger.info(generator)
    discriminator = TrajectoryDiscriminator(
        obs_len=args.obs_len,
        pred_len=args.pred_len,
        embedding_dim=args.embedding_dim,
        h_dim=args.encoder_h_dim_d,
        mlp_dim=args.mlp_dim,
        num_layers=args.num_layers,
        dropout=args.dropout,
        batch_norm=args.batch_norm,
        d_type=args.d_type,
        use_cuda=args.use_gpu)
    discriminator.apply(init_weights)
    discriminator.type(float_dtype).train()
    logger.info('Here is the discriminator:')
    logger.info(discriminator)
    g_loss_fn = gan_g_loss
    d_loss_fn = gan_d_loss
    # Same optimizer family for G and D; 'initial_lr' is stored so a
    # scheduler resumed with last_epoch != -1 could pick up the base LR.
    if optim_name == 'Adam':
        optimizer_g = optim.Adam([{
            'params': generator.parameters(),
            'initial_lr': args.g_learning_rate
        }], lr=args.g_learning_rate)
        optimizer_d = optim.Adam([{
            'params': discriminator.parameters(),
            'initial_lr': args.d_learning_rate
        }], lr=args.d_learning_rate)
    elif optim_name == 'Adamax':
        optimizer_g = optim.Adamax([{
            'params': generator.parameters(),
            'initial_lr': args.g_learning_rate
        }], lr=args.g_learning_rate)
        optimizer_d = optim.Adamax([{
            'params': discriminator.parameters(),
            'initial_lr': args.d_learning_rate
        }], lr=args.d_learning_rate)
    else:
        optimizer_g = optim.RMSprop([{
            'params': generator.parameters(),
            'initial_lr': args.g_learning_rate
        }], lr=args.g_learning_rate)
        optimizer_d = optim.RMSprop([{
            'params': discriminator.parameters(),
            'initial_lr': args.d_learning_rate
        }], lr=args.d_learning_rate)
    # Halve both learning rates every 100 scheduler steps.
    scheduler_g = optim.lr_scheduler.StepLR(optimizer_g,
                                            step_size=100,
                                            gamma=0.5,
                                            last_epoch=-1)
    scheduler_d = optim.lr_scheduler.StepLR(optimizer_d,
                                            step_size=100,
                                            gamma=0.5,
                                            last_epoch=-1)
    t, epoch = 0, 0
    while t < 50:  # fixed iteration budget per trial
        gc.collect()
        d_steps_left = args.d_steps
        g_steps_left = args.g_steps
        for batch_idx, batch in enumerate(train_loader):
            # Limiting training utils for faster epochs.
            if batch_idx * args.batch_size >= N_TRAIN_EXAMPLES:
                break
            # Decide whether to use the batch for stepping on discriminator or
            # generator; an iteration consists of args.d_steps steps on the
            # discriminator followed by args.g_steps steps on the generator.
            if d_steps_left > 0:
                step_type = 'd'
                losses_d = discriminator_step(args, batch, generator,
                                              discriminator, d_loss_fn,
                                              optimizer_d)
                d_steps_left -= 1
            elif g_steps_left > 0:
                step_type = 'g'
                losses_g = generator_step(args, batch, generator,
                                          discriminator, g_loss_fn,
                                          optimizer_g, discriminator_wight)
                g_steps_left -= 1
            # Skip the rest if we are not at the end of an iteration
            if d_steps_left > 0 or g_steps_left > 0:
                continue
            t += 1
            d_steps_left = args.d_steps
            g_steps_left = args.g_steps
            if t >= args.num_iterations:
                break
        scheduler_g.step()
        scheduler_d.step()
    # Score the trial on (a capped slice of) the validation set.
    metrics_val = check_accuracy(args, val_loader, generator, discriminator,
                                 d_loss_fn, N_VALID_EXAMPLES)
    ade = metrics_val['ade']
    trial.report(ade, t)
    return ade
def main(args):
    """Train the trajectory generator, checkpointing to args.output_dir.

    The adversarial path is disabled -- every discriminator-related line is
    commented out -- so this reduces to supervised generator training.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_num
    train_path = get_dset_path(args.dataset_name, 'train')
    val_path = get_dset_path(args.dataset_name, 'val')
    long_dtype, float_dtype = get_dtypes(args)
    logger.info("Initializing train dataset")
    train_dset, train_loader = data_loader(args, train_path)
    logger.info("Initializing val dataset")
    _, val_loader = data_loader(args, val_path)
    # An "iteration" spans args.d_steps batches (kept from the GAN schedule).
    iterations_per_epoch = len(train_dset) / args.batch_size / args.d_steps
    if args.num_epochs:
        args.num_iterations = int(iterations_per_epoch * args.num_epochs)
    logger.info(
        'There are {} iterations per epoch'.format(iterations_per_epoch))
    generator = TrajectoryGenerator(
        obs_len=args.obs_len,
        pred_len=args.pred_len,
        embedding_dim=args.embedding_dim,
        encoder_h_dim=args.encoder_h_dim_g,
        decoder_h_dim=args.decoder_h_dim_g,
        mlp_dim=args.mlp_dim,
        num_layers=args.num_layers,
        noise_dim=args.noise_dim,
        noise_type=args.noise_type,
        noise_mix_type=args.noise_mix_type,
        pooling_type=args.pooling_type,
        pool_every_timestep=args.pool_every_timestep,
        dropout=args.dropout,
        bottleneck_dim=args.bottleneck_dim,
        neighborhood_size=args.neighborhood_size,
        grid_size=args.grid_size,
        batch_norm=args.batch_norm)
    generator.apply(init_weights)
    generator.type(float_dtype).train()
    logger.info('Here is the generator:')
    logger.info(generator)
    # discriminator = TrajectoryDiscriminator(
    #     obs_len=args.obs_len,
    #     pred_len=args.pred_len,
    #     embedding_dim=args.embedding_dim,
    #     h_dim=args.encoder_h_dim_d,
    #     mlp_dim=args.mlp_dim,
    #     num_layers=args.num_layers,
    #     dropout=args.dropout,
    #     batch_norm=args.batch_norm,
    #     d_type=args.d_type)
    # discriminator.apply(init_weights)
    # discriminator.type(float_dtype).train()
    logger.info('Here is the discriminator:')
    # logger.info(discriminator)
    g_loss_fn = gan_g_loss
    d_loss_fn = gan_d_loss
    optimizer_g = optim.Adam(generator.parameters(), lr=args.g_learning_rate)
    # optimizer_d = optim.Adam(
    #     discriminator.parameters(), lr=args.d_learning_rate
    # )
    # Maybe restore from checkpoint
    restore_path = None
    if args.checkpoint_start_from is not None:
        restore_path = args.checkpoint_start_from
    elif args.restore_from_checkpoint == 1:
        restore_path = os.path.join(args.output_dir,
                                    '%s_with_model.pt' % args.checkpoint_name)
    if restore_path is not None and os.path.isfile(restore_path):
        logger.info('Restoring from checkpoint {}'.format(restore_path))
        checkpoint = torch.load(restore_path)
        generator.load_state_dict(checkpoint['g_state'])
        # discriminator.load_state_dict(checkpoint['d_state'])
        optimizer_g.load_state_dict(checkpoint['g_optim_state'])
        # optimizer_d.load_state_dict(checkpoint['d_optim_state'])
        t = checkpoint['counters']['t']
        epoch = checkpoint['counters']['epoch']
        checkpoint['restore_ts'].append(t)
    else:
        # Starting from scratch, so initialize checkpoint data structure
        t, epoch = 0, 0
        checkpoint = {
            'args': args.__dict__,
            'G_losses': defaultdict(list),
            'D_losses': defaultdict(list),
            'losses_ts': [],
            'metrics_val': defaultdict(list),
            'metrics_train': defaultdict(list),
            'sample_ts': [],
            'restore_ts': [],
            'norm_g': [],
            'norm_d': [],
            'counters': {
                't': None,
                'epoch': None,
            },
            'g_state': None,
            'g_optim_state': None,
            'd_state': None,
            'd_optim_state': None,
            'g_best_state': None,
            'd_best_state': None,
            'best_t': None,
            'g_best_nl_state': None,
            'd_best_state_nl': None,
            'best_t_nl': None,
        }
    t0 = None
    while t < args.num_iterations:
        gc.collect()
        d_steps_left = args.d_steps
        g_steps_left = args.g_steps
        epoch += 1
        logger.info('Starting epoch {}'.format(epoch))
        for batch in train_loader:
            # if args.timing == 1:
            #     torch.cuda.synchronize()
            #     t1 = time.time()
            # Decide whether to use the batch for stepping on discriminator or
            # generator; an iteration consists of args.d_steps steps on the
            # discriminator followed by args.g_steps steps on the generator.
            # if d_steps_left > 0:
            #     step_type = 'd'
            #     losses_d = discriminator_step(args, batch, generator,
            #                                   discriminator, d_loss_fn,
            #                                   optimizer_d)
            #     checkpoint['norm_d'].append(
            #         get_total_norm(discriminator.parameters()))
            #     d_steps_left -= 1
            # elif g_steps_left > 0:
            step_type = 'g'
            losses_g = generator_step(args, batch, generator, optimizer_g)
            checkpoint['norm_g'].append(get_total_norm(generator.parameters()))
            g_steps_left -= 1
            # if args.timing == 1:
            #     torch.cuda.synchronize()
            #     t2 = time.time()
            #     logger.info('{} step took {}'.format(step_type, t2 - t1))
            # Skip the rest if we are not at the end of an iteration
            # if d_steps_left > 0 or g_steps_left > 0:
            #     continue
            # if args.timing == 1:
            #     if t0 is not None:
            #         logger.info('Interation {} took {}'.format(
            #             t - 1, time.time() - t0
            #         ))
            #     t0 = time.time()
            # Maybe save loss
            if t % args.print_every == 0:
                # Debug marker left in by a previous author.
                print(
                    "ARSAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
                )
                logger.info('t = {} / {}'.format(t + 1, args.num_iterations))
                # for k, v in sorted(losses_d.items()):
                #     logger.info(' [D] {}: {:.3f}'.format(k, v))
                #     checkpoint['D_losses'][k].append(v)
                for k, v in sorted(losses_g.items()):
                    print(k)
                    print(v)
                    logger.info(' [G] {}: {:.3f}'.format(k, v))
                    checkpoint['G_losses'][k].append(v)
                checkpoint['losses_ts'].append(t)
            # Maybe save a checkpoint
            # if t > 0 and t % args.checkpoint_every == 0:
            # NOTE(review): with checkpoint_every disabled this validates and
            # saves after *every* batch past the first -- confirm intended.
            if t > 0:
                checkpoint['counters']['t'] = t
                checkpoint['counters']['epoch'] = epoch
                checkpoint['sample_ts'].append(t)
                # Check stats on the validation set
                logger.info('Checking stats on val ...')
                metrics_val = check_accuracy(args, val_loader, generator)
                logger.info('Checking stats on train ...')
                metrics_train = check_accuracy(
                    args,
                    train_loader,
                    generator,
                    # d_loss_fn,
                    limit=True)
                for k, v in sorted(metrics_val.items()):
                    logger.info(' [val] {}: {:.3f}'.format(k, v))
                    checkpoint['metrics_val'][k].append(v)
                for k, v in sorted(metrics_train.items()):
                    logger.info(' [train] {}: {:.3f}'.format(k, v))
                    checkpoint['metrics_train'][k].append(v)
                min_ade = min(checkpoint['metrics_val']['ade'])
                min_ade_nl = min(checkpoint['metrics_val']['ade_nl'])
                if metrics_val['ade'] == min_ade:
                    logger.info('New low for avg_disp_error')
                    checkpoint['best_t'] = t
                    checkpoint['g_best_state'] = generator.state_dict()
                    # checkpoint['d_best_state'] = discriminator.state_dict()
                if metrics_val['ade_nl'] == min_ade_nl:
                    logger.info('New low for avg_disp_error_nl')
                    checkpoint['best_t_nl'] = t
                    checkpoint['g_best_nl_state'] = generator.state_dict()
                    # checkpoint['d_best_nl_state'] = discriminator.state_dict()
                # Save another checkpoint with model weights and
                # optimizer state
                checkpoint['g_state'] = generator.state_dict()
                checkpoint['g_optim_state'] = optimizer_g.state_dict()
                # checkpoint['d_state'] = discriminator.state_dict()
                # checkpoint['d_optim_state'] = optimizer_d.state_dict()
                checkpoint_path = os.path.join(
                    args.output_dir, '%s_with_model.pt' % args.checkpoint_name)
                logger.info('Saving checkpoint to {}'.format(checkpoint_path))
                torch.save(checkpoint, checkpoint_path)
                logger.info('Done.')
                # Save a checkpoint with no model weights by making a shallow
                # copy of the checkpoint excluding some items
                checkpoint_path = os.path.join(
                    args.output_dir, '%s_no_model.pt' % args.checkpoint_name)
                logger.info('Saving checkpoint to {}'.format(checkpoint_path))
                key_blacklist = [
                    'g_state', 'd_state', 'g_best_state', 'g_best_nl_state',
                    'g_optim_state', 'd_optim_state', 'd_best_state',
                    'd_best_nl_state'
                ]
                small_checkpoint = {}
                for k, v in checkpoint.items():
                    if k not in key_blacklist:
                        small_checkpoint[k] = v
                torch.save(small_checkpoint, checkpoint_path)
                logger.info('Done.')
            t += 1
            # d_steps_left = args.d_steps
            g_steps_left = args.g_steps
            if t >= args.num_iterations:
                break
def main(args):
    """Train the LSTM trajectory model, validating each epoch and saving a
    checkpoint whenever the validation loss improves.

    Writes per-epoch train/val losses and val ADE to text files in ./log/.
    """
    train_path = get_dset_path(args.dataset_name, 'train')
    val_path = get_dset_path(args.dataset_name, 'val')
    # Random seed (disabled)
    # torch.manual_seed(2)
    # np.random.seed(2)
    # if args.use_gpu:
    #     torch.cuda.manual_seed_all(2)
    logger.info("Initializing train dataset")
    train_dset, train_loader = data_loader(args, train_path)
    logger.info("Initializing val dataset")
    _, val_loader = data_loader(args, val_path)
    log_path = './log/'
    # NOTE(review): these three log files are never closed -- confirm OK.
    log_file_curve = open(os.path.join(log_path, 'log_loss.txt'), 'w+')
    log_file_curve_val = open(os.path.join(log_path, 'log_loss_val.txt'), 'w+')
    log_file_curve_val_ade = open(
        os.path.join(log_path, 'log_loss_val_ade.txt'), 'w+')
    net = LSTM_model(args)
    if args.use_gpu:
        net = net.cuda()
    optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate)
    #scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.98)
    # Resume training from where the previous run stopped (disabled):
    # restore_path = '.\model\lstm294.tar'
    # logger.info('Restoring from checkpoint {}'.format(restore_path))
    # checkpoint = torch.load(restore_path)
    # net.load_state_dict(checkpoint['state_dict'])
    # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    #
    # for i_epoch in range(checkpoint['epoch']+1):
    #     if (i_epoch + 1) % 100 == 0:
    #         args.learning_rate *= 0.98
    # Initial "best" val loss; only epochs beating it trigger a checkpoint.
    epoch_loss_min = 160
    epoch_smallest = 0
    #for epoch in range(checkpoint['epoch']+1, args.num_epochs):
    for epoch in range(args.num_epochs):
        count = 0
        batch_loss = 0
        for batch in train_loader:
            # Zero out gradients
            net.zero_grad()
            optimizer.zero_grad()
            (obs_traj, pred_traj_gt, obs_traj_rel, pred_traj_gt_rel,
             non_linear_ped, loss_mask, seq_start_end) = batch
            num_ped = obs_traj.size(1)
            pred_traj_gt = pred_traj_gt
            #model_teacher.py
            pred_traj = net(obs_traj, num_ped, pred_traj_gt, seq_start_end)
            loss = displacement_error(pred_traj, pred_traj_gt)
            #loss = get_mean_error(pred_traj, pred_traj_gt)
            # Compute gradients
            loss.backward()
            # Clip gradients
            torch.nn.utils.clip_grad_norm_(net.parameters(), args.grad_clip)
            # Update parameters
            optimizer.step()
            batch_loss += loss
            count += 1
            #print(loss / num_ped)
        if (epoch + 1) % 6 == 0:
            pass
            #scheduler.step()
        logger.info('epoch {} train loss is {}'.format(epoch,
                                                       batch_loss / count))
        log_file_curve.write(str(batch_loss.item() / count) + "\n")
        batch_loss = 0
        val_ade = 0
        total_ade = 0
        # Validation pass.  NOTE(review): `count` is NOT reset here, so
        # epoch_loss below divides by train+val batch count, and val_ade
        # accumulates cumulatively into total_ade -- confirm intended.
        for idx, batch in enumerate(val_loader):
            (obs_traj, pred_traj_gt, obs_traj_rel, pred_traj_gt_rel,
             non_linear_ped, loss_mask, seq_start_end) = batch
            num_ped = obs_traj.size(1)
            pred_traj_gt = pred_traj_gt
            # model_teacher.py
            pred_traj = net(obs_traj, num_ped, pred_traj_gt, seq_start_end)
            loss = displacement_error(pred_traj, pred_traj_gt)
            batch_loss += loss
            val_ade += loss / (num_ped * 12)
            total_ade += val_ade
            count += 1
        fin_ade = total_ade / (idx + 1)
        log_file_curve_val_ade.write(str(fin_ade.item()) + "\n")
        epoch_loss = batch_loss / count
        if epoch_loss_min > epoch_loss:
            # New best validation loss: remember it and save a checkpoint.
            epoch_loss_min = epoch_loss
            epoch_smallest = epoch
            logger.info('Saving model')
            torch.save(
                {
                    'epoch': epoch,
                    'state_dict': net.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict()
                }, checkpoint_path(epoch))
        logger.info('epoch {} val loss is {}'.format(epoch, epoch_loss))
        log_file_curve_val.write(str(epoch_loss.item()) + "\n")
        logger.info('epoch {} is smallest loss is {}'.format(
            epoch_smallest, epoch_loss_min))
        logger.info('the smallest ade is {}'.format(total_ade / (idx + 1)))
        logger.info("-" * 50)
def main(args):
    """Train the GAT-based TrajectoryGenerator with a three-phase curriculum
    (epoch-based training_step 1/2/3), validating and checkpointing only in
    the final phase."""
    # Determinism: seed python/numpy/torch and pin cuDNN behavior.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_num
    train_path = get_dset_path(args.dataset_name, "train")
    # NOTE(review): the "val" loader is built from the *test* split -- confirm
    # this is intentional.
    val_path = get_dset_path(args.dataset_name, "test")
    logging.info("Initializing train dataset")
    train_dset, train_loader = data_loader(args, train_path)
    logging.info("Initializing val dataset")
    _, val_loader = data_loader(args, val_path)
    writer = SummaryWriter()  # TensorBoard logging
    # GAT layer widths: traj-LSTM hidden -> user units -> graph-LSTM hidden.
    n_units = ([args.traj_lstm_hidden_size] +
               [int(x) for x in args.hidden_units.strip().split(",")] +
               [args.graph_lstm_hidden_size])
    n_heads = [int(x) for x in args.heads.strip().split(",")]
    model = TrajectoryGenerator(
        obs_len=args.obs_len,
        pred_len=args.pred_len,
        traj_lstm_input_size=args.traj_lstm_input_size,
        traj_lstm_hidden_size=args.traj_lstm_hidden_size,
        n_units=n_units,
        n_heads=n_heads,
        graph_network_out_dims=args.graph_network_out_dims,
        dropout=args.dropout,
        alpha=args.alpha,
        graph_lstm_hidden_size=args.graph_lstm_hidden_size,
        noise_dim=args.noise_dim,
        noise_type=args.noise_type,
    )
    model.cuda()
    # Per-module learning rates; groups without an explicit "lr" use args.lr.
    optimizer = optim.Adam(
        [
            {
                "params": model.traj_lstm_model.parameters(),
                "lr": 1e-2
            },
            {
                "params": model.traj_hidden2pos.parameters()
            },
            {
                "params": model.gatencoder.parameters(),
                "lr": 3e-2
            },
            {
                "params": model.graph_lstm_model.parameters(),
                "lr": 1e-2
            },
            {
                "params": model.traj_gat_hidden2pos.parameters()
            },
            {
                "params": model.pred_lstm_model.parameters()
            },
            {
                "params": model.pred_hidden2pos.parameters()
            },
        ],
        lr=args.lr,
    )
    global best_ade
    if args.resume:
        # Optionally resume model weights; optimizer state is not restored.
        if os.path.isfile(args.resume):
            logging.info("Restoring from checkpoint {}".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint["epoch"]
            model.load_state_dict(checkpoint["state_dict"])
            logging.info("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint["epoch"]))
        else:
            logging.info("=> no checkpoint found at '{}'".format(args.resume))
    training_step = 1
    for epoch in range(args.start_epoch, args.num_epochs + 1):
        # Curriculum: phase 1 (<150), phase 2 (<250), phase 3 (>=250) with a
        # one-time LR drop to 5e-3 when entering phase 3.
        if epoch < 150:
            training_step = 1
        elif epoch < 250:
            training_step = 2
        else:
            if epoch == 250:
                for param_group in optimizer.param_groups:
                    param_group["lr"] = 5e-3
            training_step = 3
        train(args, model, train_loader, optimizer, epoch, training_step,
              writer)
        if training_step == 3:
            # Only the full model (phase 3) is validated and checkpointed.
            ade = validate(args, model, val_loader, epoch, writer)
            is_best = ade < best_ade
            best_ade = min(ade, best_ade)
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "state_dict": model.state_dict(),
                    "best_ade": best_ade,
                    "optimizer": optimizer.state_dict(),
                },
                is_best,
                f"./checkpoint/checkpoint{epoch}.pth.tar",
            )
    writer.close()
def main(args):
    """Train My_Net_V2 (plain predictor, no discriminator), periodically
    validating and keeping the best-ADE weights inside the checkpoint."""
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_num
    train_path = get_dset_path(args.dataset_name, 'train')
    val_path = get_dset_path(args.dataset_name, 'val')
    long_dtype, float_dtype = get_dtypes(args)
    logger.info("Initializing train dataset")
    train_dset, train_loader = data_loader(args, train_path)
    logger.info("Initializing val dataset")
    _, val_loader = data_loader(args, val_path)
    iterations_per_epoch = len(train_dset) / args.batch_size
    if args.num_epochs:
        args.num_iterations = int(iterations_per_epoch * args.num_epochs)
    logger.info(
        'There are {} iterations per epoch'.format(iterations_per_epoch))
    lstm = My_Net_V2(seq_len=args.pred_len,
                     mlp_dim=args.mlp_dim,
                     dropout=args.dropout,
                     use_cuda=args.use_gpu)
    lstm.apply(init_weights)
    lstm.type(float_dtype).train()
    logger.info('Here is the lstm:')
    logger.info(lstm)
    # optimizer = optim.Adam(lstm.parameters(), lr=args.g_learning_rate)
    optimizer = optim.RMSprop(lstm.parameters(), lr=args.g_learning_rate)
    # Maybe restore from checkpoint
    restore_path = None
    if args.checkpoint_start_from is not None:
        restore_path = args.checkpoint_start_from
    elif args.restore_from_checkpoint == 1:
        restore_path = os.path.join(args.output_dir,
                                    '%s_with_model.pt' % args.checkpoint_name)
    if restore_path is not None and os.path.isfile(restore_path):
        logger.info('Restoring from checkpoint {}'.format(restore_path))
        checkpoint = torch.load(restore_path)
        lstm.load_state_dict(checkpoint['state'])
        optimizer.load_state_dict(checkpoint['optim_state'])
        t = checkpoint['counters']['t']
        epoch = checkpoint['counters']['epoch']
        checkpoint['restore_ts'].append(t)
    else:
        # Starting from scratch, so initialize checkpoint dataset structure
        t, epoch = 0, 0
        checkpoint = {
            'args': args.__dict__,
            'losses': defaultdict(list),
            'losses_ts': [],
            'metrics_val': defaultdict(list),
            'metrics_train': defaultdict(list),
            'sample_ts': [],
            'restore_ts': [],
            'norm': [],
            'counters': {
                't': None,
                'epoch': None,
            },
            'state': None,
            'optim_state': None,
            'best_state': None,
            'best_t': None
        }
    t0 = None
    while t < args.num_iterations:
        gc.collect()
        epoch += 1
        logger.info('Starting epoch {}'.format(epoch))
        for batch in train_loader:
            if args.timing == 1:
                torch.cuda.synchronize()
                t1 = time.time()
            # Decide whether to use the batch for stepping on discriminator or
            # generator; an iteration consists of args.d_steps steps on the
            # discriminator followed by args.g_steps steps on the generator.
            losses = generator_step(args, batch, lstm, optimizer)
            # checkpoint['norm_g'].append(
            #     get_total_norm(lstm.parameters())
            # )
            if args.timing == 1:
                if t0 is not None:
                    logger.info('Interation {} took {}'.format(
                        t - 1, time.time() - t0))
                t0 = time.time()
            # Maybe save loss
            if t % args.print_every == 0:
                logger.info('t = {} / {}'.format(t + 1, args.num_iterations))
                # for k, v in sorted(losses.items()):
                #     logger.info(' [D] {}: {:.7f}'.format(k, v))
                #     checkpoint['losses'][k].append(v)
                # checkpoint['losses_ts'].append(t)
            # Maybe save a checkpoint
            if t > 0 and t % args.checkpoint_every == 0:
                checkpoint['counters']['t'] = t
                checkpoint['counters']['epoch'] = epoch
                checkpoint['sample_ts'].append(t)
                # Check stats on the validation set
                logger.info('Checking stats on val ...')
                metrics_val = check_accuracy(args,
                                             val_loader,
                                             lstm,
                                             is_train=False)
                logger.info('Checking stats on train ...')
                metrics_train = check_accuracy(args,
                                               train_loader,
                                               lstm,
                                               limit=True,
                                               is_train=True)
                for k, v in sorted(metrics_val.items()):
                    logger.info(' [val] {}: {:.7f}'.format(k, v))
                    checkpoint['metrics_val'][k].append(v)
                for k, v in sorted(metrics_train.items()):
                    logger.info(' [train] {}: {:.7f}'.format(k, v))
                    checkpoint['metrics_train'][k].append(v)
                # Track the best-so-far validation ADE and snapshot weights.
                min_ade = min(checkpoint['metrics_val']['ade'])
                if metrics_val['ade'] == min_ade:
                    logger.info('New low for avg_disp_error')
                    checkpoint['best_t'] = t
                    checkpoint['best_state'] = lstm.state_dict()
                # Save another checkpoint with model weights and
                # optimizer state
                checkpoint['state'] = lstm.state_dict()
                checkpoint['optim_state'] = optimizer.state_dict()
                checkpoint_path = os.path.join(
                    args.output_dir, '%s_with_model.pt' % args.checkpoint_name)
                logger.info('Saving checkpoint to {}'.format(checkpoint_path))
                torch.save(checkpoint, checkpoint_path)
                logger.info('Done.')
            t += 1
            if t >= args.num_iterations:
                break
def main():
    """Train the GAN (generator + discriminator) using module-level
    hyper-parameter constants, saving "model.pt" whenever the validation ADE
    hits a new minimum."""
    train_path = get_dset_path(DATASET_NAME, 'train')
    val_path = get_dset_path(DATASET_NAME, 'val')
    long_dtype, float_dtype = get_dtypes()
    print("Initializing train dataset")
    train_dset, train_loader = data_loader(train_path)
    print("Initializing val dataset")
    _, val_loader = data_loader(val_path)
    iterations_per_epoch = len(train_dset) / D_STEPS
    NUM_ITERATIONS = int(iterations_per_epoch * NUM_EPOCHS)
    print('There are {} iterations per epoch'.format(iterations_per_epoch))
    generator = TrajectoryGenerator()
    generator.apply(init_weights)
    generator.type(float_dtype).train()
    print('Here is the generator:')
    print(generator)
    discriminator = TrajectoryDiscriminator()
    discriminator.apply(init_weights)
    discriminator.type(float_dtype).train()
    print('Here is the discriminator:')
    print(discriminator)
    optimizer_g = optim.Adam(generator.parameters(), lr=G_LR)
    optimizer_d = optim.Adam(discriminator.parameters(), lr=D_LR)
    t, epoch = 0, 0
    t0 = None
    min_ade = None
    while t < NUM_ITERATIONS:
        gc.collect()
        d_steps_left = D_STEPS
        g_steps_left = G_STEPS
        epoch += 1
        print('Starting epoch {}'.format(epoch))
        for batch in train_loader:
            # Alternate D_STEPS discriminator updates, then G_STEPS generator
            # updates; one "iteration" t is a complete D+G cycle.
            if d_steps_left > 0:
                losses_d = discriminator_step(batch, generator, discriminator,
                                              gan_d_loss, optimizer_d)
                d_steps_left -= 1
            elif g_steps_left > 0:
                losses_g = generator_step(batch, generator, discriminator,
                                          gan_g_loss, optimizer_g)
                g_steps_left -= 1
            # Not at the end of a full cycle yet.
            if d_steps_left > 0 or g_steps_left > 0:
                continue
            if t % PRINT_EVERY == 0:
                print('t = {} / {}'.format(t + 1, NUM_ITERATIONS))
                for k, v in sorted(losses_d.items()):
                    print(' [D] {}: {:.3f}'.format(k, v))
                for k, v in sorted(losses_g.items()):
                    print(' [G] {}: {:.3f}'.format(k, v))
                print('Checking stats on val ...')
                metrics_val = check_accuracy(val_loader, generator,
                                             discriminator, gan_d_loss)
                print('Checking stats on train ...')
                metrics_train = check_accuracy(train_loader,
                                               generator,
                                               discriminator,
                                               gan_d_loss,
                                               limit=True)
                for k, v in sorted(metrics_val.items()):
                    print(' [val] {}: {:.3f}'.format(k, v))
                for k, v in sorted(metrics_train.items()):
                    print(' [train] {}: {:.3f}'.format(k, v))
                if min_ade is None or metrics_val['ade'] < min_ade:
                    # New best validation ADE: snapshot everything.
                    min_ade = metrics_val['ade']
                    checkpoint = {
                        't': t,
                        'g': generator.state_dict(),
                        'd': discriminator.state_dict(),
                        'g_optim': optimizer_g.state_dict(),
                        'd_optim': optimizer_d.state_dict()
                    }
                    print("Saving checkpoint to model.pt")
                    torch.save(checkpoint, "model.pt")
                    print("Done.")
            t += 1
            d_steps_left = D_STEPS
            g_steps_left = G_STEPS
            if t >= NUM_ITERATIONS:
                break
def load_and_evaluate(generator, version):
    """Evaluate *generator* on the named dataset split and print ADE/FDE."""
    print("Initializing {} dataset".format(version))
    dset_path = get_dset_path(DATASET_NAME, version)
    _, split_loader = data_loader(dset_path)
    avg_err, final_err = evaluate(split_loader, generator)
    print('{} Dataset: {}, Pred Len: {}, ADE: {:.2f}, FDE: {:.2f}'.format(
        version, DATASET_NAME, PRED_LEN, avg_err, final_err))