Example #1
    # get the options
    options['dataset_options'] = dynsys_params.get_dataset_options(
        options['dataset'])
    options['model_options'] = model_params.get_model_options(
        options['model'], options['dataset'], options['dataset_options'])
    options['train_options'] = train_params.get_train_options(
        options['dataset'])
    options['test_options'] = train_params.get_test_options()

    # optimal model parameters
    options['model_options'].h_dim = options['optValue']['h_opt']
    options['model_options'].z_dim = options['optValue']['z_opt']
    options['model_options'].n_layers = options['optValue']['n_opt']

    # save the options
    save_options(options, path_general, 'options.txt')

    # allocation
    rmse_all = np.zeros([options['MCsamples']])
    vaf_all = np.zeros([options['MCsamples']])
    logLikelihood_all = np.zeros([options['MCsamples']])
    rmse_KF_all = np.zeros([options['MCsamples']])
    vaf_KF_all = np.zeros([options['MCsamples']])

    # print model type and dynamic system type
    print('\n\tModel Type: {}'.format(options['model']))
    print('\tDynamic System: {}\n'.format(options['dataset']))

    file_name_general = '{}_h{}_z{}_n{}'.format(
        options['dataset'], options['model_options'].h_dim,
        options['model_options'].z_dim, options['model_options'].n_layers)
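
For context, here is a hypothetical sketch of the options dict this snippet expects. The keys are exactly the ones read in the code above; the concrete values are purely illustrative assumptions.

# Hypothetical options dict for the snippet above (all values are assumptions)
options = {
    'dataset': 'narendra_li',   # assumed dataset identifier
    'model': 'VRNN-Gauss',      # assumed model identifier
    'MCsamples': 20,            # number of Monte Carlo runs to allocate for
    'optValue': {               # tuned hyperparameters applied above
        'h_opt': 60,
        'z_opt': 10,
        'n_opt': 1,
    },
}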
Example #2
def main(config):
    save_path = config['save_path']
    epochs = config['epochs']
    os.environ['TORCH_HOME'] = config['torch_home']
    distributed = config['use_DDP']
    start_ep = 0
    start_cnt = 0

    # initialize model
    print("Initializing model...")
    if distributed:
        initialize_distributed(config)
    rank = config['rank']

    # map string name to class constructor
    model = get_model(config)
    model.apply(init_weights)
    if config['resume_ckpt'] is not None:
        # load weights from checkpoint
        state_dict = load_weights(config['resume_ckpt'])
        model.load_state_dict(state_dict)

    print("Moving model to GPU")
    model.cuda(torch.cuda.current_device())
    print("Setting up losses")

    if config['use_vgg']:
        criterionVGG = Vgg19PerceptualLoss(config['reduced_w'])
        criterionVGG.cuda()
        validationLoss = criterionVGG
    if config['use_gan']:
        use_sigmoid = config['no_lsgan']
        disc_input_channels = 3
        discriminator = MultiscaleDiscriminator(disc_input_channels,
                                                config['ndf'],
                                                config['n_layers_D'],
                                                'instance', use_sigmoid,
                                                config['num_D'], False, False)
        discriminator.apply(init_weights)
        if config['resume_ckpt_D'] is not None:
            # load weights from checkpoint
            print("Resuming discriminator from %s" % (config['resume_ckpt_D']))
            state_dict = load_weights(config['resume_ckpt_D'])
            discriminator.load_state_dict(state_dict)

        discriminator.cuda(torch.cuda.current_device())
        criterionGAN = GANLoss(use_lsgan=not config['no_lsgan'])
        criterionGAN.cuda()
        criterionFeat = nn.L1Loss().cuda()
    if config['use_l2']:
        criterionMSE = nn.MSELoss()
        criterionMSE.cuda()
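        # note: run_val below assumes validationLoss is set, so use_vgg or use_l2 must be enabled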
        validationLoss = criterionMSE

    # initialize dataloader
    print("Setting up dataloaders...")
    train_dataloader, val_dataloader, train_sampler = setup_dataloaders(config)
    print("Done!")
    # run the training loop
    print("Initializing optimizers...")
    optimizer_G = optim.Adam(model.parameters(),
                             lr=config['learning_rate'],
                             weight_decay=config['weight_decay'])
    if config['resume_ckpt_opt_G'] is not None:
        optimizer_G_state_dict = torch.load(
            config['resume_ckpt_opt_G'],
            map_location=lambda storage, loc: storage)
        optimizer_G.load_state_dict(optimizer_G_state_dict)
    if config['use_gan']:
        optimizer_D = optim.Adam(discriminator.parameters(),
                                 lr=config['learning_rate'])
        if config['resume_ckpt_opt_D'] is not None:
            optimizer_D_state_dict = torch.load(
                config['resume_ckpt_opt_D'],
                map_location=lambda storage, loc: storage)
            optimizer_D.load_state_dict(optimizer_D_state_dict)

    print("Done!")

    if distributed:
        print("Moving model to DDP...")
        model = DDP(model)
        if config['use_gan']:
            discriminator = DDP(discriminator, delay_allreduce=True)
        print("Done!")

    tb_logger = None
    if rank == 0:
        tb_logdir = os.path.join(save_path, 'tbdir')
        if not os.path.exists(tb_logdir):
            os.makedirs(tb_logdir)
        tb_logger = SummaryWriter(tb_logdir)
        # run training
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        log_name = os.path.join(save_path, 'loss_log.txt')
        opt_name = os.path.join(save_path, 'opt.yaml')
        print(config)
        save_options(opt_name, config)
        log_handle = open(log_name, 'a')

    print("Starting training")
    cnt = start_cnt
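    # at least one of these modes must be enabled, otherwise input_b is never assigned below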
    assert (config['use_warped'] or config['use_temporal'])

    for ep in range(start_ep, epochs):
        if train_sampler is not None:
            train_sampler.set_epoch(ep)

        for curr_batch in train_dataloader:
            optimizer_G.zero_grad()
            input_a = curr_batch['input_a'].cuda()
            target = curr_batch['target'].cuda()
            if config['use_warped'] and config['use_temporal']:
                input_a = torch.cat((input_a, input_a), 0)
                input_b = torch.cat((curr_batch['input_b'].cuda(),
                                     curr_batch['input_temporal'].cuda()), 0)
                target = torch.cat((target, target), 0)
            elif config['use_temporal']:
                input_b = curr_batch['input_temporal'].cuda()
            elif config['use_warped']:
                input_b = curr_batch['input_b'].cuda()

            output_dict = model(input_a, input_b)
            output_recon = output_dict['reconstruction']

            loss_vgg = loss_G_GAN = loss_G_feat = loss_l2 = 0
            if config['use_vgg']:
                loss_vgg = criterionVGG(output_recon,
                                        target) * config['vgg_lambda']
            if config['use_gan']:
                predicted_landmarks = output_dict['input_a_gauss_maps']
                # output_dict['reconstruction'] can be considered normalized
                loss_G_GAN, loss_D_real, loss_D_fake = apply_GAN_criterion(
                    output_recon, target, predicted_landmarks.detach(),
                    discriminator, criterionGAN)
                loss_D = (loss_D_fake + loss_D_real) * 0.5
            if config['use_l2']:
                loss_l2 = criterionMSE(output_recon,
                                       target) * config['l2_lambda']

            loss_G = loss_G_GAN + loss_G_feat + loss_vgg + loss_l2
            loss_G.backward()
            # grad_norm clipping
            if not config['no_grad_clip']:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer_G.step()
            if config['use_gan']:
                optimizer_D.zero_grad()
                loss_D.backward()
                # grad_norm clipping
                if not config['no_grad_clip']:
                    torch.nn.utils.clip_grad_norm_(discriminator.parameters(),
                                                   1.0)
                optimizer_D.step()

            if distributed:
                if config['use_vgg']:
                    loss_vgg = reduce_tensor(loss_vgg, config['world_size'])

            if rank == 0:
                if cnt % 10 == 0:
                    run_visualization(output_dict, output_recon, target,
                                      input_a, input_b, save_path, tb_logger,
                                      cnt)

                print_dict = {"learning_rate": get_learning_rate(optimizer_G)}
                if config['use_vgg']:
                    tb_logger.add_scalar('vgg.loss', loss_vgg, cnt)
                    print_dict['Loss_VGG'] = loss_vgg.data
                if config['use_gan']:
                    tb_logger.add_scalar('gan.loss', loss_G_GAN, cnt)
                    tb_logger.add_scalar('d_real.loss', loss_D_real, cnt)
                    tb_logger.add_scalar('d_fake.loss', loss_D_fake, cnt)
                    print_dict['Loss_G_GAN'] = loss_G_GAN
                    print_dict['Loss_real'] = loss_D_real.data
                    print_dict['Loss_fake'] = loss_D_fake.data
                if config['use_l2']:
                    tb_logger.add_scalar('l2.loss', loss_l2, cnt)
                    print_dict['Loss_L2'] = loss_l2.data

                log_iter(ep,
                         cnt % len(train_dataloader),
                         len(train_dataloader),
                         print_dict,
                         log_handle=log_handle)

            if loss_G != loss_G:  # NaN is the only value not equal to itself
                print("NaN loss detected, aborting")
                exit(-2)

            cnt = cnt + 1
            # end of train iter loop

            # check val_freq > 0 first to avoid a modulo-by-zero
            if config['val_freq'] > 0 and cnt % config['val_freq'] == 0:
                val_loss = run_val(
                    model, validationLoss, val_dataloader,
                    os.path.join(save_path, 'val_%d_renders' % (ep)))

                if distributed:
                    val_loss = reduce_tensor(val_loss, config['world_size'])
                if rank == 0:
                    tb_logger.add_scalar('validation.loss', val_loss, cnt)
                    log_iter(ep,
                             cnt % len(train_dataloader),
                             len(train_dataloader), {"Loss_VGG": val_loss},
                             header="Validation loss: ",
                             log_handle=log_handle)

        if rank == 0:
            if (ep % config['save_freq'] == 0):
                fname = 'checkpoint_%d.ckpt' % (ep)
                fname = os.path.join(save_path, fname)
                print("Saving model...")
                save_weights(model, fname, distributed)
                optimizer_g_fname = os.path.join(
                    save_path, 'latest_optimizer_g_state.ckpt')
                torch.save(optimizer_G.state_dict(), optimizer_g_fname)
                if config['use_gan']:
                    fname = 'checkpoint_D_%d.ckpt' % (ep)
                    fname = os.path.join(save_path, fname)
                    save_weights(discriminator, fname, distributed)
                    optimizer_d_fname = os.path.join(
                        save_path, 'latest_optimizer_d_state.ckpt')
                    torch.save(optimizer_D.state_dict(), optimizer_d_fname)
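
A hedged usage sketch for the function above: the keys listed are the ones main() actually reads; the file name and YAML loader are illustrative assumptions, not the project's actual entry point.

# Hypothetical invocation of main() (config keys taken from the code above)
import yaml

if __name__ == '__main__':
    with open('config.yaml') as f:   # assumed config file
        config = yaml.safe_load(f)
    # among the keys main() expects: save_path, epochs, torch_home, use_DDP,
    # rank, resume_ckpt, use_vgg, use_gan, use_l2, learning_rate, weight_decay,
    # use_warped, use_temporal, val_freq, save_freq, ...
    main(config)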
Example #3
def run_main_single(options, path_general, file_name_general):
    start_time = time.time()
    print('Run file: main_single.py')
    print(time.strftime("%c"))

    # get correct computing device
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Device: {}'.format(device))

    # get the options
    options['device'] = device
    options['dataset_options'] = dynsys_params.get_dataset_options(
        options['dataset'])
    options['model_options'] = model_params.get_model_options(
        options['model'], options['dataset'], options['dataset_options'])
    options['train_options'] = train_params.get_train_options(
        options['dataset'])
    options['test_options'] = train_params.get_test_options()

    # print model type and dynamic system type
    print('\n\tModel Type: {}'.format(options['model']))
    print('\tDynamic System: {}\n'.format(options['dataset']))

    file_name_general = file_name_general + '_h{}_z{}_n{}'.format(
        options['model_options'].h_dim, options['model_options'].z_dim,
        options['model_options'].n_layers)
    path = path_general + 'data/'
    # check if path exists and create otherwise
    if not os.path.exists(path):
        os.makedirs(path)
    # set logger
    set_redirects(path, file_name_general)

    # Specifying datasets
    loaders = loader.load_dataset(
        dataset=options["dataset"],
        dataset_options=options["dataset_options"],
        train_batch_size=options["train_options"].batch_size,
        test_batch_size=options["test_options"].batch_size,
    )

    # Compute normalizers
    if options["normalize"]:
        normalizer_input, normalizer_output = compute_normalizer(
            loaders['train'])
    else:
        normalizer_input = normalizer_output = None

    # Define model
    modelstate = ModelState(seed=options["seed"],
                            nu=loaders["train"].nu,
                            ny=loaders["train"].ny,
                            model=options["model"],
                            options=options,
                            normalizer_input=normalizer_input,
                            normalizer_output=normalizer_output)
    modelstate.model.to(options['device'])

    # save the options
    save_options(options, path_general, 'options.txt')

    # allocation
    df = {}
    if options['do_train']:
        # train the model
        df = training.run_train(modelstate=modelstate,
                                loader_train=loaders['train'],
                                loader_valid=loaders['valid'],
                                options=options,
                                dataframe=df,
                                path_general=path_general,
                                file_name_general=file_name_general)

    if options['do_test']:
        # test the model
        df = testing.run_test(options, loaders, df, path_general,
                              file_name_general)

    # save data
    # get saving path
    path = path_general + 'data/'
    # check if path exists and create otherwise
    if not os.path.exists(path):
        os.makedirs(path)
    # to pandas
    df = pd.DataFrame(df)
    # filename
    file_name = file_name_general + '.csv'
    # save data
    df.to_csv(path + file_name)

    # time output
    time_el = time.time() - start_time
    hours = time_el // 3600
    minutes = time_el // 60 - hours * 60  # renamed: 'min' shadows the built-in
    seconds = time_el - minutes * 60 - hours * 3600
    print('Total time of file execution: {:.0f}:{:02.0f}:{:02.0f} [h:min:sec]'.format(
        hours, minutes, seconds))
    print(time.strftime("%c"))
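
As a side note, the elapsed-time breakdown at the end of run_main_single can be written more compactly with divmod; this is just an equivalent sketch, not part of the original code.

# Equivalent elapsed-time formatting using divmod (sketch)
minutes, seconds = divmod(time_el, 60)
hours, minutes = divmod(minutes, 60)
print('Total time of file execution: {:.0f}:{:02.0f}:{:02.0f} [h:min:sec]'.format(
    hours, minutes, seconds))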