示例#1
0
def train_model(args, metadata, device='cuda'):
    """Train the model selected by ``args.i_what`` and return it.

    The function loads the dataset at ``args.file``, builds either an
    ``iVAE`` or an ``iFlow`` model, optimises it with Adam for
    ``args.epochs`` epochs under a ``ReduceLROnPlateau`` schedule, records
    loss and ``mcc`` performance through ``Logger`` and a tensorboard
    ``SummaryWriter``, writes periodic checkpoints plus a final one, and
    optionally dumps the logs to JSON/NPZ.

    Args:
        args: Option namespace. Attributes read here: ``cuda``, ``preload``,
            ``file``, ``batch_size``, ``epochs``, ``max_iter``,
            ``latent_dim``, ``i_what``, ``depth``, ``hidden_dim``,
            ``anneal``, ``lr``, ``lr_drop_factor``, ``lr_patience``,
            ``log_freq`` and ``no_log``. ``args.N`` is overwritten with the
            dataset length and ``args.max_iter`` is filled in when ``None``.
        metadata: Dict updated in place with the dataset metadata (and with
            ``"train_latent_dim"`` / ``"device"`` where applicable). It must
            already contain ``'file'`` when training an iVAE, and it is
            passed as ``args`` to ``iFlow``.
        device: Device the model (and, without preloading, each batch) is
            placed on.

    Returns:
        The trained model.

    Raises:
        ValueError: If ``args.i_what`` is neither ``'iVAE'`` nor ``'iFlow'``.
    """
    print('training on {}'.format(torch.cuda.get_device_name(device) if args.cuda else 'cpu'))

    # load data
    if not args.preload:
        # The dataset is created on the CPU; batches are moved to `device`
        # inside the training loop when args.cuda is set.
        dset = SyntheticDataset(args.file, 'cpu')
        train_loader = DataLoader(dset, shuffle=True, batch_size=args.batch_size)
        data_dim, latent_dim, aux_dim = dset.get_dims()
        args.N = len(dset)
        metadata.update(dset.get_metadata())
    else:
        train_loader = DataLoaderGPU(args.file, shuffle=True, batch_size=args.batch_size)
        data_dim, latent_dim, aux_dim = train_loader.get_dims()
        args.N = train_loader.dataset_len
        metadata.update(train_loader.get_metadata())

    if args.max_iter is None:
        args.max_iter = len(train_loader) * args.epochs

    # An explicit latent size on the command line overrides the dataset's.
    if args.latent_dim is not None:
        latent_dim = args.latent_dim
        metadata.update({"train_latent_dim": latent_dim})

    # define model and optimizer
    if args.i_what == 'iVAE':
        model = iVAE(latent_dim,
                     data_dim,
                     aux_dim,
                     n_layers=args.depth,
                     activation='lrelu',
                     device=device,
                     hidden_dim=args.hidden_dim,
                     anneal=args.anneal,  # False
                     file=metadata['file'],  # Added dataset location for easier checkpoint loading
                     seed=1,
                     epochs=args.epochs)
    elif args.i_what == 'iFlow':
        metadata.update({"device": device})
        model = iFlow(args=metadata).to(device)
    else:
        # Previously this fell through with model=None and crashed later
        # with an opaque AttributeError on model.parameters().
        raise ValueError(
            "unknown model type {!r}; expected 'iVAE' or 'iFlow'".format(args.i_what))

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=args.lr_drop_factor,
                                                     patience=args.lr_patience,
                                                     verbose=True)  # factor=0.1 and patience=4

    ste = time.time()
    # NOTE(review): `st` is not defined inside this function; it is presumably
    # a module-level start timestamp -- confirm it exists wherever this
    # function is called from.
    print('setup time: {}s'.format(ste - st))

    # setup loggers
    logger = Logger(logdir=LOG_FOLDER)  # 'log/'
    exp_id = logger.get_id()  # 1

    tensorboard_run_name = TENSORBOARD_RUN_FOLDER + 'exp' + str(exp_id) + '_'.join(
        map(str, ['', args.batch_size, args.max_iter, args.lr, args.hidden_dim, args.depth, args.anneal]))
    # 'runs/exp1_64_12500_0.001_50_3_False'

    writer = SummaryWriter(logdir=tensorboard_run_name)

    if args.i_what == 'iFlow':
        logger.add('log_normalizer')
        logger.add('neg_log_det')
        logger.add('neg_trace')

    logger.add('loss')
    logger.add('perf')
    print('Beginning training for exp: {}'.format(exp_id))

    # Checkpoint roughly five times over the run. Clamp to at least 1 so a
    # very short run (max_iter < 5) does not hit a modulo-by-zero.
    checkpoint_every = max(1, int(args.max_iter / 5))

    # training loop
    epoch = 0
    model.train()
    while epoch < args.epochs:
        est = time.time()
        for itr, (x, u, z) in enumerate(train_loader):
            # Global iteration counter across epochs.
            acc_itr = itr + epoch * len(train_loader)

            # x is of shape [64, 4]
            # u is of shape [64, 40], one-hot coding of 40 classes
            # z is of shape [64, 2]

            optimizer.zero_grad()

            # Preloaded data is served by DataLoaderGPU and is not moved here.
            if args.cuda and not args.preload:
                x = x.cuda(device=device, non_blocking=True)
                u = u.cuda(device=device, non_blocking=True)

            if args.i_what == 'iVAE':
                elbo, z_est = model.elbo(x, u)  # elbo is a scalar loss while z_est is of shape [64, 2]
                loss = elbo.mul(-1)  # maximise the ELBO by minimising its negative

            elif args.i_what == 'iFlow':
                (log_normalizer, neg_trace, neg_log_det), z_est = model.neg_log_likelihood(x, u)
                loss = log_normalizer + neg_trace + neg_log_det

            loss.backward()
            optimizer.step()

            logger.update('loss', loss.item())
            if args.i_what == 'iFlow':
                logger.update('log_normalizer', log_normalizer.item())
                logger.update('neg_trace', neg_trace.item())
                logger.update('neg_log_det', neg_log_det.item())

            # Performance is mcc() between the loader's z and the estimate.
            perf = mcc(z.cpu().numpy(), z_est.cpu().detach().numpy())
            logger.update('perf', perf)

            if acc_itr % args.log_freq == 0:  # % 25
                logger.log()
                writer.add_scalar('data/performance', logger.get_last('perf'), acc_itr)
                writer.add_scalar('data/loss', logger.get_last('loss'), acc_itr)

                if args.i_what == 'iFlow':
                    writer.add_scalar('data/log_normalizer', logger.get_last('log_normalizer'), acc_itr)
                    writer.add_scalar('data/neg_trace', logger.get_last('neg_trace'), acc_itr)
                    writer.add_scalar('data/neg_log_det', logger.get_last('neg_log_det'), acc_itr)

                # The LR schedule is stepped on the logged loss, i.e. once
                # per logging interval rather than once per epoch.
                scheduler.step(logger.get_last('loss'))

            if acc_itr % checkpoint_every == 0 and not args.no_log:
                checkpoint(TORCH_CHECKPOINT_FOLDER,
                           exp_id,
                           acc_itr,
                           model,
                           optimizer,
                           logger.get_last('loss'),
                           logger.get_last('perf'))

        epoch += 1
        eet = time.time()
        if args.i_what == 'iVAE':
            print('epoch {}: {:.4f}s;\tloss: {:.4f};\tperf: {:.4f}'.format(epoch,
                                                                           eet - est,
                                                                           logger.get_last('loss'),
                                                                           logger.get_last('perf')))
        elif args.i_what == 'iFlow':
            print('epoch {}: {:.4f}s;\tloss: {:.4f} (l1: {:.4f}, l2: {:.4f}, l3: {:.4f});\tperf: {:.4f}'.format(
                epoch,
                eet - est,
                logger.get_last('loss'),
                logger.get_last('log_normalizer'),
                logger.get_last('neg_trace'),
                logger.get_last('neg_log_det'),
                logger.get_last('perf')))

    et = time.time()
    print('training time: {}s'.format(et - ste))

    # Save final model (written even when args.no_log is set)
    checkpoint(PT_MODELS_FOLDER,
               "",
               'final',
               model,
               optimizer,
               logger.get_last('loss'),
               logger.get_last('perf'))

    writer.close()
    if not args.no_log:
        logger.add_metadata(**metadata)
        logger.save_to_json()
        logger.save_to_npz()

    print('total time: {}s'.format(et - st))
    return model
示例#2
0
文件: main.py 项目: TimStolp/iFlow
        args.max_iter = len(train_loader) * args.epochs

    if args.latent_dim is not None:
        latent_dim = args.latent_dim
        metadata.update({"train_latent_dim": latent_dim})

    # define model and optimizer
    model = None
    if args.i_what == 'iVAE':
        model = iVAE(
            latent_dim,
            data_dim,
            aux_dim,
            n_layers=args.depth,
            activation='lrelu',
            device=device,
            hidden_dim=args.hidden_dim,
            anneal=args.anneal,  #False
            file=metadata[
                'file'],  # Added dataset location for easier checkpoint loading
            seed=1,
            epochs=args.epochs)
    elif args.i_what == 'iFlow':
        metadata.update({"device": device})
        model = iFlow(args=metadata).to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, \
                                                     factor=args.lr_drop_factor, \
                                                     patience=args.lr_patience, \
                                                     verbose=True) # factor=0.1 and patience=4
示例#3
0
文件: main.py 项目: diadochos/iVAE
                                     batch_size=args.batch_size)
        data_dim, latent_dim, aux_dim = train_loader.get_dims()
        args.N = train_loader.dataset_len
        metadata.update(train_loader.get_metadata())
    if args.max_iter is None:
        args.max_iter = len(train_loader) * args.epochs

    if args.latent_dim is not None:
        latent_dim = args.latent_dim
        metadata.update({"train_latent_dim": latent_dim})

    # define model and optimizer
    model = iVAE(latent_dim,
                 data_dim,
                 aux_dim,
                 activation='lrelu',
                 device=device,
                 hidden_dim=args.hidden_dim,
                 anneal=args.anneal)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=0.1,
                                                     patience=4,
                                                     verbose=True)

    ste = time.time()
    print('setup time: {}s'.format(ste - st))

    # setup loggers
    logger = Logger(path=LOG_FOLDER)
    exp_id = logger.get_id()
示例#4
0
文件: main.py 项目: fsun2000/iFlow
        metadata.update(train_loader.get_metadata())

    if args.max_iter is None:
        args.max_iter = len(train_loader) * args.epochs

    if args.latent_dim is not None:
        latent_dim = args.latent_dim
        metadata.update({"train_latent_dim": latent_dim})

    # define model and optimizer
    model = None
    if args.i_what == 'iVAE':
        model = iVAE(latent_dim, \
                 data_dim, \
                 aux_dim, \
                 n_layers=args.depth, \
                 activation='lrelu', \
                 device=device, \
                 hidden_dim=args.hidden_dim, \
                 anneal=args.anneal) # False
    elif args.i_what == 'iFlow':
        metadata.update({"device": device})
        model = iFlow(args=metadata).to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, \
                                                     factor=args.lr_drop_factor, \
                                                     patience=args.lr_patience, \
                                                     verbose=True) # factor=0.1 and patience=4

    ste = time.time()
    print('setup time: {}s'.format(ste - st))
示例#5
0
def load_model_from_checkpoint(ckpt_path, device, model_seed=1):
    """Rebuild an iFlow or iVAE model and load its weights from a checkpoint.

    The model and dataset configuration are parsed from the second
    ``'/'``-separated component of ``ckpt_path``, which must be a folder name
    of the form ``<data args>_<model name>_<epochs>``. The underscore-separated
    data args must start with ``nps``, ``ns``, ``latent_dim`` and ``data_dim``
    (``aux_dim`` is taken equal to ``ns``). The matching dataset is resolved
    through ``create_if_not_exist_dataset`` and its array shapes are printed.

    Args:
        ckpt_path: Path to the checkpoint file, laid out as
            ``<root>/<config folder>/.../<file>``.
        device: Device the model is built on and the checkpoint tensors are
            mapped to.
        model_seed: Selects the dataset root ``data/<model_seed>/``.

    Returns:
        The model with ``checkpoint['model_state_dict']`` loaded.

    Raises:
        ValueError: If the model name in the folder is neither ``'iFlow'``
            nor ``'iVAE'``.

    Note:
        ``torch.load`` unpickles the file; only load checkpoints from
        trusted sources.
    """
    print('checkpoint path:', ckpt_path)
    # get folder name containing model and data properties
    model_args = ckpt_path.split('/')[1].split('_')
    epochs = model_args[-1]
    model_name = model_args[-2]
    data_args = model_args[:-2]

    # Fail before any dataset is created or loaded; previously an unknown
    # name surfaced only at the very end as an UnboundLocalError on `model`.
    if model_name not in ('iFlow', 'iVAE'):
        raise ValueError(
            "unknown model name {!r} in checkpoint path {!r}; "
            "expected 'iFlow' or 'iVAE'".format(model_name, ckpt_path))

    data_file = create_if_not_exist_dataset(root='data/{}/'.format(model_seed),
                                            arg_str="_".join(data_args))

    nps = int(data_args[0])
    ns = int(data_args[1])
    aux_dim = int(data_args[1])  # same field as ns
    n = nps * ns
    latent_dim = int(data_args[2])
    data_dim = int(data_args[3])

    print('Loading model', model_name)
    model_path = ckpt_path

    print('Loading data', data_file)
    # The arrays are only inspected for their shapes, so they are not copied
    # to `device`. Keyed access implies an .npz archive, which is closed as
    # soon as the shapes have been reported.
    with np.load(data_file) as A:
        print("x.shape ==", torch.from_numpy(A['x']).shape)
        print("s.shape ==", A['s'].shape)
        print("u.shape ==", torch.from_numpy(A['u']).shape)

    # map_location lets a checkpoint saved on one device (e.g. a GPU) be
    # restored on whatever `device` the caller asked for.
    checkpoint = torch.load(model_path, map_location=device)

    # Arguments (metadata, from argparse in main.py), have to correspond to selected dataset and model properties
    # Hyperparameters and configuration as prescribed in the paper.
    metadata = {
        'file': data_file,
        'path': data_file,
        'batch_size': 64,
        # NOTE(review): this is the raw string from the folder name, not an
        # int -- confirm consumers of metadata accept that.
        'epochs': epochs,
        'device': device,
        'seed': 1,
        'i_what': model_name,
        'max_iter': None,
        'hidden_dim': 50,
        'depth': 3,
        'lr': 1e-3,
        'cuda': True,
        'preload': True,
        'anneal': False,
        'log_freq': 25,
        'flow_type': 'RQNSF_AG',
        'num_bins': 8,
        'nat_param_act': 'Softplus',
        'gpu_id': '0',
        'flow_length': 10,
        'lr_drop_factor': 0.25,
        'lr_patience': 10
    }

    # Get dataset properties
    metadata.update({
        'nps': nps,
        'ns': ns,
        'n': n,
        'latent_dim': latent_dim,
        'data_dim': data_dim,
        'aux_dim': aux_dim
    })

    if model_name == 'iFlow':
        model = iFlow(args=metadata).to(device)
    else:  # 'iVAE' -- validated above
        model = iVAE(
            latent_dim,  # latent_dim
            data_dim,  # data_dim
            aux_dim,  # aux_dim
            n_layers=metadata['depth'],
            activation='lrelu',
            device=device,
            hidden_dim=metadata['hidden_dim'],
            anneal=metadata['anneal'],  # False
            file=metadata['file'],
            seed=1)

    model.load_state_dict(checkpoint['model_state_dict'])
    return model