def main(args):
    utils.set_seed_everywhere(args.seed)
    cfg = hyperparameters.get_config(args)
    cfg.seed = args.seed

    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # Timestamped experiment directory with checkpoint and log subdirectories.
    time_str = datetime.now(
        timezone('US/Eastern')).strftime("%Y-%m-%d-%H-%M-%S")
    exp_dir = os.path.join(cfg.base_dir, time_str)
    checkpoint_dir = os.path.join(exp_dir, cfg.checkpoint_dir)
    log_dir = os.path.join(exp_dir, cfg.log_dir)

    save_config(cfg, exp_dir, "config.json")
    print("Log path: ", log_dir, "Checkpoint Dir: ", checkpoint_dir)

    # Full sequence length = observed context + prediction horizon.
    num_timesteps = cfg.observed_steps + cfg.predicted_steps
    data_shape = {'image': (None, num_timesteps, 3, 64, 64)}
    cfg.data_shapes = data_shape

    model = KeypointModel(cfg)

    # Save a checkpoint every 25 epochs and keep all of them.
    cp_callback = ModelCheckpoint(
        filepath=os.path.join(checkpoint_dir, "model_"),
        period=25,
        save_top_k=-1)
    logger = TensorBoardLogger(log_dir, name="", version=None)

    gpus = 1 if args.cuda else None

    # Optionally resume from the most recent pretrained checkpoint.
    if args.pretrained_path:
        checkpoint_path = get_latest_checkpoint(args.pretrained_path)
        import json
        model = KeypointModel.load_from_checkpoint(checkpoint_path)
        print(json.dumps(model.cfg, indent=4))

    print("On GPU Device: ", gpus)
    trainer = Trainer(
        max_epochs=args.num_epochs,
        logger=logger,
        checkpoint_callback=cp_callback,
        gpus=gpus,
        # distributed_backend='dp',
        progress_bar_refresh_rate=1,
        # gradient_clip_val=cfg.clipnorm,
        fast_dev_run=False,
        # train_percent_check=0.1, val_percent_check=0.0,
        # val_percent_check=0.3,
        track_grad_norm=2,
        show_progress_bar=True)
    trainer.fit(model)

    save_path = os.path.join(
        checkpoint_dir, "model_final_" + str(args.num_epochs) + ".ckpt")
    print("Saving model finally:")
    trainer.save_checkpoint(save_path)
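# `get_latest_checkpoint` is called above but not defined in this file. The
# helper below is only a minimal sketch of what it might do (an assumption,
# not the repo's actual implementation): return the most recently modified
# .ckpt file under the given directory.
import glob


def get_latest_checkpoint_sketch(checkpoint_dir):
    """Return the newest .ckpt file found anywhere under `checkpoint_dir`."""
    ckpts = glob.glob(
        os.path.join(checkpoint_dir, "**", "*.ckpt"), recursive=True)
    if not ckpts:
        raise FileNotFoundError(
            "No .ckpt files found in {}".format(checkpoint_dir))
    return max(ckpts, key=os.path.getmtime)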
def main(args):
    global LOG_INTERVAL, SAVE_PATH
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    cfg = hyperparameters.get_config(args)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if args.cuda else "cpu")
    print("Using device: ", device)

    # Timestamped experiment directory with checkpoint and log subdirectories.
    time_str = datetime.now(
        timezone('US/Eastern')).strftime("%Y-%m-%d-%H-%M-%S")
    exp_dir = os.path.join(cfg.base_dir, time_str)
    checkpoint_dir = os.path.join(exp_dir, cfg.checkpoint_dir)
    log_dir = os.path.join(exp_dir, cfg.log_dir)
    print("Log path: ", log_dir, "Checkpoint Dir: ", checkpoint_dir)

    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # Globals consumed by the training loop for logging and checkpointing.
    LOG_INTERVAL = args.log_interval
    SAVE_PATH = os.path.join(checkpoint_dir, "model-")

    save_config(cfg, exp_dir, "config.json")

    train_loader, data_shapes = datasets.get_dataset(
        data_dir=os.path.join(cfg.data_dir, cfg.train_dir),
        batch_size=cfg.batch_size,
        shuffle=False)
    test_loader, _ = datasets.get_dataset(
        data_dir=os.path.join(cfg.data_dir, cfg.test_dir),
        batch_size=cfg.batch_size,
        shuffle=False)

    models = build_model_noseq(cfg, data_shapes).to(device)
    optimizer = optim.Adam(models.parameters(),
                           lr=cfg.learning_rate,
                           weight_decay=1e-4)

    # Everything train_epoch needs, bundled into a single dict.
    model_dict = {
        'models': models,
        'optimizer': optimizer,
        'train_loader': train_loader,
        'test_loader': test_loader,
        'writer': SummaryWriter(log_dir),
        'device': device
    }

    for i in tqdm(range(args.num_epochs)):
        train_epoch(i, model_dict, cfg)
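# `train_epoch` is defined elsewhere in the repo. The skeleton below is only an
# illustrative assumption of how the pieces packed into `model_dict` fit
# together: one optimization pass over the train loader, with TensorBoard
# logging every LOG_INTERVAL batches. The batch format and the loss term are
# placeholders, not the model's actual objective.
def train_epoch_sketch(epoch, model_dict, cfg):
    models = model_dict['models']
    optimizer = model_dict['optimizer']
    writer = model_dict['writer']
    device = model_dict['device']

    models.train()
    for batch_idx, data in enumerate(model_dict['train_loader']):
        img = data['image'].to(device)  # assumes dict batches with an 'image' key
        optimizer.zero_grad()
        output = models(img)
        loss = ((output - img) ** 2).mean()  # placeholder reconstruction loss
        loss.backward()
        optimizer.step()
        if batch_idx % LOG_INTERVAL == 0:
            step = epoch * len(model_dict['train_loader']) + batch_idx
            writer.add_scalar('train/loss', loss.item(), step)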
def main(args):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    cfg = hyperparameters.get_config(args)

    args.cuda = not args.no_cuda and torch.cuda.is_available()

    time_str = datetime.now(
        timezone('US/Eastern')).strftime("%Y-%m-%d-%H-%M-%S")
    exp_dir = os.path.join(cfg.base_dir, time_str)
    checkpoint_dir = os.path.join(exp_dir, cfg.checkpoint_dir)
    log_dir = os.path.join(exp_dir, cfg.log_dir)

    save_config(cfg, exp_dir, "config.json")
    print("Log path: ", log_dir, "Checkpoint Dir: ", checkpoint_dir)

    data_shape = {'image': (None, 3, 64, 64)}
    cfg.data_shapes = data_shape

    model = KeypointModel(cfg)

    cp_callback = ModelCheckpoint(
        filepath=os.path.join(checkpoint_dir, "model_"),
        period=2,
        save_top_k=-1)
    logger = TensorBoardLogger(log_dir, name="", version=None)

    gpus = 1 if args.cuda else None

    if args.pretrained_path:
        checkpoint_path = get_latest_checkpoint(args.pretrained_path)
        import json
        model = KeypointModel.load_from_checkpoint(checkpoint_path)
        print(json.dumps(model.cfg, indent=4))

    print("On GPU Device: ", gpus)
    trainer = Trainer(max_epochs=args.num_epochs,
                      logger=logger,
                      checkpoint_callback=cp_callback,
                      gpus=gpus,
                      progress_bar_refresh_rate=1,
                      gradient_clip_val=cfg.clipnorm,
                      fast_dev_run=False)
    trainer.fit(model)
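# None of the `main` functions above parse their own command-line arguments. A
# minimal CLI entry point might look like the sketch below; the flag names are
# inferred from the attribute accesses above (args.seed, args.no_cuda,
# args.num_epochs, args.log_interval, args.pretrained_path), and the defaults
# are assumptions. `hyperparameters.get_config(args)` may expect additional
# flags that are not shown here.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--no_cuda", action="store_true",
                        help="disable CUDA even if it is available")
    parser.add_argument("--num_epochs", type=int, default=100)
    parser.add_argument("--log_interval", type=int, default=10)
    parser.add_argument("--pretrained_path", type=str, default="",
                        help="directory holding a checkpoint to resume from")
    main(parser.parse_args())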