Example #1
def main(args):
    utils.set_seed_everywhere(args.seed)

    cfg = hyperparameters.get_config(args)
    cfg.seed = args.seed

    args.cuda = not args.no_cuda and torch.cuda.is_available()

    time_str = datetime.now(
        timezone('US/Eastern')).strftime("%Y-%m-%d-%H-%M-%S")
    exp_dir = os.path.join(cfg.base_dir, time_str)
    checkpoint_dir = os.path.join(exp_dir, cfg.checkpoint_dir)
    log_dir = os.path.join(exp_dir, cfg.log_dir)

    save_config(cfg, exp_dir, "config.json")

    print("Log path: ", log_dir, "Checkpoint Dir: ", checkpoint_dir)

    num_timesteps = cfg.observed_steps + cfg.predicted_steps
    data_shape = {'image': (None, num_timesteps, 3, 64, 64)}
    cfg.data_shapes = data_shape

    model = KeypointModel(cfg)

    cp_callback = ModelCheckpoint(filepath=os.path.join(
        checkpoint_dir, "model_"),
                                  period=25,
                                  save_top_k=-1)

    logger = TensorBoardLogger(log_dir, name="", version=None)

    gpus = 1 if args.cuda else None

    if args.pretrained_path:
        # Resume from the most recent checkpoint found under the pretrained
        # path, replacing the freshly constructed model.
        checkpoint_path = get_latest_checkpoint(args.pretrained_path)
        import json
        model = KeypointModel.load_from_checkpoint(checkpoint_path)
        print(json.dumps(model.cfg, indent=4))

    print("On GPU Device: ", gpus)
    trainer = Trainer(
        max_epochs=args.num_epochs,
        logger=logger,
        checkpoint_callback=cp_callback,
        gpus=gpus,
        #distributed_backend='dp',
        progress_bar_refresh_rate=1,
        #gradient_clip_val=cfg.clipnorm,
        fast_dev_run=False,
        #train_percent_check=0.1,val_percent_check=0.0,
        #val_percent_check=0.3,
        track_grad_norm=2,
        show_progress_bar=True)
    trainer.fit(model)
    save_path = os.path.join(checkpoint_dir,
                             "model_final_" + str(args.num_epochs) + ".ckpt")
    print("Saving model finally:")
    trainer.save_checkpoint(save_path)
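
Both this example and Example #3 resume from a checkpoint via get_latest_checkpoint(args.pretrained_path), which is not shown here. Below is a minimal sketch of such a helper, assuming checkpoints are written as *.ckpt files directly under the given directory; the project's real helper may resolve the path differently.

import glob
import os


def get_latest_checkpoint(checkpoint_dir, pattern="*.ckpt"):
    """Return the most recently modified checkpoint file, or None if absent."""
    candidates = glob.glob(os.path.join(checkpoint_dir, pattern))
    if not candidates:
        return None
    return max(candidates, key=os.path.getmtime)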
Example #2
def main(args):
    global LOG_INTERVAL, SAVE_PATH

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    cfg = hyperparameters.get_config(args)
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if args.cuda else "cpu")
    print("Using device: ", device)

    time_str = datetime.now(
        timezone('US/Eastern')).strftime("%Y-%m-%d-%H-%M-%S")
    exp_dir = os.path.join(cfg.base_dir, time_str)
    checkpoint_dir = os.path.join(exp_dir, cfg.checkpoint_dir)
    log_dir = os.path.join(exp_dir, cfg.log_dir)

    print("Log path: ", log_dir, "Checkpoint Dir: ", checkpoint_dir)
    if not os.path.isdir(log_dir): os.makedirs(log_dir)
    if not os.path.isdir(checkpoint_dir): os.makedirs(checkpoint_dir)

    LOG_INTERVAL = args.log_interval
    SAVE_PATH = os.path.join(checkpoint_dir, "model-")

    save_config(cfg, exp_dir, "config.json")

    train_loader, data_shapes = datasets.get_dataset(data_dir=os.path.join(
        cfg.data_dir, cfg.train_dir),
                                                     batch_size=cfg.batch_size,
                                                     shuffle=False)

    test_loader, _ = datasets.get_dataset(data_dir=os.path.join(
        cfg.data_dir, cfg.test_dir),
                                          batch_size=cfg.batch_size,
                                          shuffle=False)

    models = build_model_noseq(cfg, data_shapes).to(device)
    optimizer = optim.Adam(models.parameters(),
                           lr=cfg.learning_rate,
                           weight_decay=1e-4)

    model_dict = {
        'models': models,
        'optimizer': optimizer,
        'train_loader': train_loader,
        'test_loader': test_loader,
        'writer': SummaryWriter(log_dir),
        'device': device
    }

    for i in tqdm(range(args.num_epochs)):
        train_epoch(i, model_dict, cfg)
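
The per-epoch training step train_epoch(i, model_dict, cfg) is defined elsewhere. The sketch below is a rough illustration under stated assumptions: batches are dicts holding an 'image' tensor, the models return a reconstruction trained with an MSE loss (a placeholder objective, not necessarily the project's), and it relies on the module-level LOG_INTERVAL and SAVE_PATH set in main().

import torch
import torch.nn.functional as F


def train_epoch(epoch, model_dict, cfg):
    models = model_dict['models']
    optimizer = model_dict['optimizer']
    writer = model_dict['writer']
    device = model_dict['device']

    models.train()
    for batch_idx, batch in enumerate(model_dict['train_loader']):
        img = batch['image'].to(device)   # assumed batch layout
        optimizer.zero_grad()
        recon = models(img)               # assumed forward signature
        loss = F.mse_loss(recon, img)     # placeholder objective
        loss.backward()
        optimizer.step()

        if batch_idx % LOG_INTERVAL == 0:
            step = epoch * len(model_dict['train_loader']) + batch_idx
            writer.add_scalar('train/loss', loss.item(), step)

    # Snapshot weights each epoch using the globally configured prefix.
    torch.save(models.state_dict(), SAVE_PATH + str(epoch) + ".pth")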
Example #3
def main(args):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    cfg = hyperparameters.get_config(args)

    args.cuda = not args.no_cuda and torch.cuda.is_available()

    time_str = datetime.now(
        timezone('US/Eastern')).strftime("%Y-%m-%d-%H-%M-%S")
    exp_dir = os.path.join(cfg.base_dir, time_str)
    checkpoint_dir = os.path.join(exp_dir, cfg.checkpoint_dir)
    log_dir = os.path.join(exp_dir, cfg.log_dir)

    save_config(cfg, exp_dir, "config.json")

    print("Log path: ", log_dir, "Checkpoint Dir: ", checkpoint_dir)

    data_shape = {'image': (None, 3, 64, 64)}
    cfg.data_shapes = data_shape

    model = KeypointModel(cfg)

    cp_callback = ModelCheckpoint(filepath=os.path.join(
        checkpoint_dir, "model_"),
                                  period=2,
                                  save_top_k=-1)
    logger = TensorBoardLogger(log_dir, name="", version=None)

    gpus = 1 if args.cuda else None

    if args.pretrained_path:
        checkpoint_path = get_latest_checkpoint(args.pretrained_path)
        import json
        model = KeypointModel.load_from_checkpoint(checkpoint_path)
        print(json.dumps(model.cfg, indent=4))

    print("On GPU Device: ", gpus)
    trainer = Trainer(max_epochs=args.num_epochs,
                      logger=logger,
                      checkpoint_callback=cp_callback,
                      gpus=gpus,
                      progress_bar_refresh_rate=1,
                      gradient_clip_val=cfg.clipnorm,
                      fast_dev_run=False)
    trainer.fit(model)
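
All three examples persist the configuration with save_config(cfg, exp_dir, "config.json") before training. A minimal sketch, assuming cfg is JSON-serializable (for instance an ml_collections.ConfigDict exposing to_json_best_effort, or a plain dict-like object); the project's actual helper may serialize differently.

import json
import os


def save_config(cfg, exp_dir, filename):
    """Write the experiment configuration to <exp_dir>/<filename> as JSON."""
    os.makedirs(exp_dir, exist_ok=True)
    path = os.path.join(exp_dir, filename)
    with open(path, "w") as f:
        if hasattr(cfg, "to_json_best_effort"):  # e.g. ml_collections.ConfigDict
            f.write(cfg.to_json_best_effort(indent=4))
        else:
            json.dump(dict(cfg), f, indent=4, default=str)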