Example #1

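# Positional CLI arguments, inferred from the usage below: argv[1]=env name,
# argv[2]=num_epochs, argv[3]=reward_scale, argv[5]=seed, argv[6]=extra tag
# appended to the log name (argv[4] is unused in this snippet).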
if __name__ == "__main__":
    variant = dict(
        algo_params=dict(
            num_epochs=int(sys.argv[2]),
            num_steps_per_epoch=1000,
            num_steps_per_eval=1000,
            batch_size=128,
            max_path_length=999,
            discount=0.99,
            reward_scale=float(sys.argv[3]),
            soft_target_tau=0.001,
            policy_lr=3E-4,
            qf_lr=3E-4,
            vf_lr=3E-4,
        ),
        net_size=300,
        env=sys.argv[1],
        algo_name="virel",
        algo_seed=int(sys.argv[5]),
    )
    seed = int(sys.argv[5])
    random.seed(seed)
    np.random.seed(seed)
    name = "virel_" + "_" + sys.argv[1] + "_" + sys.argv[5] + "_" + sys.argv[
        3] + "_" + sys.argv[6]
    setup_logger(name, variant=variant)
    ptu.set_gpu_mode(True)
    experiment(variant)
Example #2
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()


if __name__ == "__main__":
    # noinspection PyTypeChecker
    variant = dict(
        algo_params=dict(
            num_epochs=1000,
            num_steps_per_epoch=1000,
            num_steps_per_eval=1000,
            batch_size=128,
            max_path_length=999,
            discount=0.99,
            reward_scale=1,
            soft_target_tau=0.001,
            policy_lr=3E-4,
            qf_lr=3E-4,
            vf_lr=3E-4,
        ),
        net_size=300,
    )
    setup_logger('name-of-experiment', variant=variant)
    experiment(variant)
Example #3
def experiment(log_dir, variant_overwrite, cpu=False):
    if not cpu:
        ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

    # Load experiment from file.
    env, _, data, variant = load_experiment(log_dir, variant_overwrite)
    assert all([
        a == b
        for a, b in zip(env.sampled_goal, variant['env_kwargs']['goal_prior'])
    ])

    # Set log directory.
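    # The directory name encodes ne=num_episodes, mpl=max_path_length, the
    # shaped-reward names, rs=reward_scale, and nhp=num_historical_policies.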
    exp_id = 'eval/ne{}-mpl{}-{}-rs{}/nhp{}'.format(
        variant['algo_kwargs']['num_episodes'],
        variant['algo_kwargs']['max_path_length'],
        ','.join(variant_overwrite['env_kwargs']['shaped_rewards']),
        variant['algo_kwargs']['reward_scale'],
        variant['historical_policies_kwargs']['num_historical_policies'],
    )
    exp_id = create_exp_name(exp_id)
    out_dir = os.path.join(log_dir, exp_id)
    print('Logging to:', out_dir)
    setup_logger(
        log_dir=out_dir,
        variant=variant,
        snapshot_mode='none',
        snapshot_gap=50,
    )

    # Load trained model from file.
    policy = data['policy']
    vf = data['vf']
    qf = data['qf']
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs'],
    )

    # Overwrite algorithm for p(z) adaptation (if model is SMM).
    if variant['intrinsic_reward'] == 'smm':
        discriminator = data['discriminator']
        density_model = data['density_model']
        SMMHook(base_algorithm=algorithm,
                discriminator=discriminator,
                density_model=density_model,
                **variant['smm_kwargs'])

    # Overwrite algorithm for historical averaging.
    if variant['historical_policies_kwargs']['num_historical_policies'] > 0:
        HistoricalPoliciesHook(
            base_algorithm=algorithm,
            log_dir=log_dir,
            **variant['historical_policies_kwargs'],
        )

    algorithm.to(ptu.device)
    algorithm.train()
Example #4
def experiment(exp_specs):
    ptu.set_gpu_mode(exp_specs['use_gpu'])
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    replay_dict = joblib.load(exp_specs['replay_dict_path'])
    next_obs_array = replay_dict['next_observations']
    acts_array = replay_dict['actions']
    data_loader = BasicDataLoader(next_obs_array[:40000],
                                  acts_array[:40000],
                                  exp_specs['episode_length'],
                                  exp_specs['batch_size'],
                                  use_gpu=ptu.gpu_enabled())
    val_data_loader = BasicDataLoader(next_obs_array[40000:],
                                      acts_array[40000:],
                                      exp_specs['episode_length'],
                                      exp_specs['batch_size'],
                                      use_gpu=ptu.gpu_enabled())

    # Model Definition --------------------------------------------------------
    conv_encoder = nn.Sequential(
        nn.Conv2d(3, 32, 1, stride=1, padding=0, bias=False),
        nn.BatchNorm2d(32), nn.ReLU(),
        nn.Conv2d(32, 32, 1, stride=1, padding=0, bias=False),
        nn.BatchNorm2d(32), nn.ReLU())
    ae_dim = 128
    z_dim = 128
    pre_gru = nn.Sequential(nn.Linear(288 + z_dim + 4, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU(),
                            nn.Linear(ae_dim, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU())
    post_fc = nn.Sequential(nn.Linear(ae_dim + 288 + 4, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU(),
                            nn.Linear(ae_dim, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU())
    post_mean_fc = nn.Linear(ae_dim, z_dim, bias=True)
    post_log_cov_fc = nn.Linear(ae_dim, z_dim, bias=True)
    prior_fc = nn.Sequential(nn.Linear(ae_dim + 4, ae_dim, bias=False),
                             nn.BatchNorm1d(ae_dim), nn.ReLU(),
                             nn.Linear(ae_dim, ae_dim, bias=False),
                             nn.BatchNorm1d(ae_dim), nn.ReLU())
    prior_mean_fc = nn.Linear(ae_dim, z_dim, bias=True)
    prior_log_cov_fc = nn.Linear(ae_dim, z_dim, bias=True)
    gru = nn.GRUCell(ae_dim, ae_dim, bias=True)
    fc_decoder = nn.Sequential(
        nn.Linear(ae_dim + z_dim + 4, ae_dim, bias=False),
        nn.BatchNorm1d(ae_dim),
        nn.ReLU(),
        nn.Linear(ae_dim, ae_dim, bias=False),
        nn.BatchNorm1d(ae_dim),
        nn.ReLU(),
        nn.Linear(ae_dim, 288, bias=False),
        nn.BatchNorm1d(288),
        nn.ReLU(),
    )
    conv_decoder = nn.Sequential(
        nn.ConvTranspose2d(32,
                           32,
                           1,
                           stride=1,
                           padding=0,
                           output_padding=0,
                           bias=False), nn.BatchNorm2d(32), nn.ReLU(),
        nn.ConvTranspose2d(32,
                           32,
                           1,
                           stride=1,
                           padding=0,
                           output_padding=0,
                           bias=False), nn.BatchNorm2d(32), nn.ReLU(),
        nn.Conv2d(32, 3, 1, stride=1, padding=0, bias=True), nn.Sigmoid())
    if ptu.gpu_enabled():
        conv_encoder.cuda()
        pre_gru.cuda()
        post_fc.cuda()
        post_mean_fc.cuda()
        post_log_cov_fc.cuda()
        prior_fc.cuda()
        prior_mean_fc.cuda()
        prior_log_cov_fc.cuda()
        gru.cuda()
        fc_decoder.cuda()
        conv_decoder.cuda()

    # Optimizer ---------------------------------------------------------------
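    # Gather the parameters of every sub-module into one flat list so a single
    # Adam optimizer updates the whole encoder/GRU/decoder model.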
    model_optim = Adam([
        item for sublist in map(lambda x: list(x.parameters()), [
            pre_gru, conv_encoder, gru, fc_decoder, conv_decoder, post_fc,
            post_log_cov_fc, post_mean_fc, prior_fc, prior_log_cov_fc,
            prior_mean_fc
        ]) for item in sublist
    ],
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
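    # Training loop: reconstruction and KL terms are accumulated for freq_bptt
    # steps, then backpropagated with a single optimizer step; the GRU hidden
    # state is re-initialized at every window boundary.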
    freq_bptt = exp_specs['freq_bptt']
    episode_length = exp_specs['episode_length']
    losses = []
    KLs = []
    for iter_num in range(int(float(exp_specs['max_iters']))):
        if iter_num % freq_bptt == 0:
            if iter_num > 0:
                # loss = loss / freq_bptt
                loss = loss + total_KL
                loss.backward()
                model_optim.step()
            loss = 0
            total_KL = 0
            prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], ae_dim))
            if ptu.gpu_enabled():
                prev_h_batch = prev_h_batch.cuda()

            if iter_num % exp_specs['freq_val'] == 0:
                train_loss_print = '\t'.join(losses)
                train_KLs_print = '\t'.join(KLs)
            losses = []
            KLs = []

        obs_batch, act_batch = data_loader.get_next_batch()

        enc = conv_encoder(obs_batch).view(obs_batch.size(0), -1)

        hidden = post_fc(torch.cat([prev_h_batch, enc, act_batch], 1))
        post_mean = post_mean_fc(hidden)
        post_log_cov = post_log_cov_fc(hidden)

        hidden = prior_fc(torch.cat([prev_h_batch, act_batch], 1))
        prior_mean = prior_mean_fc(hidden)
        prior_log_cov = prior_log_cov_fc(hidden)

        recon = fc_decoder(torch.cat([prev_h_batch, act_batch, post_mean],
                                     1)).view(obs_batch.size(0), 32, 3, 3)
        recon = conv_decoder(recon)

        hidden = pre_gru(torch.cat([enc, post_mean, act_batch], 1))
        prev_h_batch = gru(hidden, prev_h_batch)

        KL = compute_KL(prior_mean, prior_log_cov, post_mean, post_log_cov)
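        # Loss terms are only accumulated after the first step of an episode
        # (i.e. when iter_num is not a multiple of episode_length).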
        if iter_num % episode_length != 0:
            loss = loss + torch.sum(
                (obs_batch.view(obs_batch.size(0), -1) -
                 recon.view(obs_batch.size(0), -1))**2, 1).mean()
            total_KL = total_KL + KL
        losses.append('%.4f' % ((obs_batch - recon)**2).mean())
        KLs.append('%.4f' % KL)

        if iter_num % (50 * exp_specs['episode_length']) in range(
                2 * exp_specs['episode_length']):
            save_pytorch_tensor_as_img(
                recon[0].data.cpu(),
                'junk_vis/full_KL_mem_grid_%d_recon.png' % iter_num)
            save_pytorch_tensor_as_img(
                obs_batch[0].data.cpu(),
                'junk_vis/full_KL_mem_grid_%d_obs.png' % iter_num)

        if iter_num % exp_specs['freq_val'] == 0:
            print('\nValidating Iter %d...' % iter_num)
            list(
                map(lambda x: x.eval(), [
                    pre_gru, conv_encoder, gru, fc_decoder, conv_decoder,
                    post_fc, post_log_cov_fc, post_mean_fc, prior_fc,
                    prior_log_cov_fc, prior_mean_fc
                ]))

            val_prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], ae_dim))
            if ptu.gpu_enabled():
                val_prev_h_batch = val_prev_h_batch.cuda()

            val_losses = []
            val_KLs = []
            for i in range(freq_bptt):
                # Use the held-out loader and the validation hidden state so
                # evaluation does not disturb the training rollout.
                obs_batch, act_batch = val_data_loader.get_next_batch()

                enc = conv_encoder(obs_batch).view(obs_batch.size(0), -1)

                hidden = post_fc(
                    torch.cat([val_prev_h_batch, enc, act_batch], 1))
                post_mean = post_mean_fc(hidden)
                post_log_cov = post_log_cov_fc(hidden)

                hidden = prior_fc(
                    torch.cat([val_prev_h_batch, act_batch], 1))
                prior_mean = prior_mean_fc(hidden)
                prior_log_cov = prior_log_cov_fc(hidden)

                recon = fc_decoder(
                    torch.cat([val_prev_h_batch, act_batch, post_mean],
                              1)).view(obs_batch.size(0), 32, 3, 3)
                recon = conv_decoder(recon)

                hidden = pre_gru(torch.cat([enc, post_mean, act_batch], 1))
                val_prev_h_batch = gru(hidden, val_prev_h_batch)

                val_losses.append('%.4f' % ((obs_batch - recon)**2).mean())
                val_KL = compute_KL(prior_mean, prior_log_cov, post_mean,
                                    post_log_cov)
                val_KLs.append('%.4f' % val_KL)

            val_loss_print = '\t'.join(val_losses)
            val_KLs_print = '\t'.join(val_KLs)
            print('Val MSE:\t' + val_loss_print)
            print('Train MSE:\t' + train_loss_print)
            print('Val KL:\t\t' + val_KLs_print)
            print('Train KL:\t' + train_KLs_print)

            list(
                map(lambda x: x.train(), [
                    pre_gru, conv_encoder, gru, fc_decoder, conv_decoder,
                    post_fc, post_log_cov_fc, post_mean_fc, prior_fc,
                    prior_log_cov_fc, prior_mean_fc
                ]))
Example #5
def experiment(variant):

    # create multi-task environment and sample tasks
    env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']))
    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    context_encoder_input_dim = (
        2 * obs_dim + action_dim + reward_dim
        if variant['algo_params']['use_next_obs_in_context'] else
        obs_dim + action_dim + reward_dim)
    context_encoder_output_dim = (
        latent_dim * 2
        if variant['algo_params']['use_information_bottleneck'] else
        latent_dim)

    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder
    hidden_sizes = [200, 200, 200]
    if variant['algo_params']['snail']:
        encoder_model = SnailEncoder
        hidden_sizes = [20]

    context_encoder = encoder_model(
        hidden_sizes=hidden_sizes,
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    context_encoder.use_next_obs_in_context = variant['algo_params'][
        'use_next_obs_in_context']
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = PEARLTanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])

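    # A second set of networks for a separate exploration agent; its Q- and
    # V-functions condition only on raw observations and actions, without the
    # latent context appended.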
    qf1_exp = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2_exp = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf_exp = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy_exp = PEARLTanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
        latent_dim=latent_dim)
    agent_exp = ExpAgentSimple(latent_dim, context_encoder, policy_exp,
                               **variant['algo_params'])
    algorithm = ExpSACSimple(env=env,
                             train_tasks=list(
                                 tasks[:variant['n_train_tasks']]),
                             eval_tasks=list(tasks[-variant['n_eval_tasks']:]),
                             nets=[agent, qf1, qf2, vf],
                             nets_exp=[agent_exp, qf1_exp, qf2_exp, vf_exp],
                             encoder=context_encoder,
                             **variant['algo_params'])

    # optionally load pre-trained weights
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        context_encoder.load_state_dict(
            torch.load(os.path.join(path, 'context_encoder.pth')))
        qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth')))
        vf.load_state_dict(torch.load(os.path.join(path, 'vf.pth')))
        # TODO hacky, revisit after model refactor
        algorithm.networks[-6].load_state_dict(
            torch.load(os.path.join(path, 'target_vf.pth')))
        policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth')))

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        device = torch.device('cuda:0')
        print(device)
        algorithm.to(device)
        context_encoder.to(device)

    # debugging triggers a lot of printing and logs to a debug directory
    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))

    # create logging directory
    # TODO support Docker
    exp_id = 'debug' if DEBUG else None
    experiment_log_dir = setup_logger(
        variant['env_name'],
        variant=variant,
        exp_id=exp_id,
        base_log_dir=variant['util_params']['base_log_dir'])

    # optionally save eval trajectories as pkl files
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    # run the algorithm
    algorithm.train()
Example #6
File: sac.py  Project: mihdalal/rlkit
            num_eval_steps_per_epoch=500 * 5,
            num_trains_per_train_loop=1000,
            num_expl_steps_per_train_loop=1000,
            min_num_steps_before_training=1000,
            max_path_length=500,
            batch_size=256,
        ),
        trainer_kwargs=dict(
            discount=0.99,
            soft_target_tau=5e-3,
            target_update_period=1,
            policy_lr=3e-4,
            qf_lr=3e-4,
            reward_scale=1,
            use_automatic_entropy_tuning=True,
        ),
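        # Environment configuration (robosuite-style kwargs): the Panda arm on
        # the "Lift" task with shaped rewards and rendering disabled.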
        env_kwargs=dict(
            robots="Panda",
            has_renderer=False,
            has_offscreen_renderer=False,
            use_camera_obs=False,
            camera_heights=64,
            camera_widths=64,
            reward_shaping=True,
        ),
        env_name="Lift",
    )
    setup_logger("name-of-experiment", variant=variant)
    ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)
    experiment(variant)
Example #7
def experiment(exp_specs):
    ptu.set_gpu_mode(exp_specs['use_gpu'])
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    env_specs = {
        'flat_repr': False,
        'one_hot_repr': False,
        'maze_h': 9,
        'maze_w': 9,
        'obs_h': 5,
        'obs_w': 5,
        'scale': 4,
        'num_objs': 10
    }
    maze_constructor = lambda: PartiallyObservedGrid(env_specs)
    data_loader = VerySpecificOnTheFLyDataLoader(maze_constructor,
                                                 exp_specs['episode_length'],
                                                 exp_specs['batch_size'],
                                                 use_gpu=ptu.gpu_enabled())
    val_data_loader = VerySpecificOnTheFLyDataLoader(
        maze_constructor,
        exp_specs['episode_length'],
        exp_specs['batch_size'],
        use_gpu=ptu.gpu_enabled())

    # Model Definition --------------------------------------------------------
    conv_channels = 32
    conv_encoder = nn.Sequential(
        nn.Conv2d(3, conv_channels, 4, stride=2, padding=1, bias=False),
        nn.BatchNorm2d(conv_channels), nn.ReLU(),
        nn.Conv2d(conv_channels,
                  conv_channels,
                  4,
                  stride=2,
                  padding=1,
                  bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU())
    gru_channels = 128
    inter_h = 5
    act_channels = 4
    act_proc = nn.Linear(4, act_channels * inter_h * inter_h, bias=True)
    pre_gru_conv = nn.Sequential(
        nn.Conv2d(act_channels + conv_channels,
                  conv_channels,
                  3,
                  stride=1,
                  padding=1,
                  bias=False),
        nn.BatchNorm2d(conv_channels),
        nn.ReLU(),
    )
    gru = ConvGRUCell(conv_channels, gru_channels, 3)
    post_gru_conv = nn.Sequential(
        nn.Conv2d(act_channels + gru_channels,
                  conv_channels,
                  3,
                  stride=1,
                  padding=1,
                  bias=False),
        nn.BatchNorm2d(conv_channels),
        nn.ReLU(),
    )
    conv_decoder = nn.Sequential(
        nn.ConvTranspose2d(conv_channels,
                           conv_channels,
                           4,
                           stride=2,
                           padding=1,
                           output_padding=0,
                           bias=False),
        nn.BatchNorm2d(conv_channels),
        nn.ReLU(),
        # nn.Conv2d(conv_channels, conv_channels, 3, stride=1, padding=1, bias=False),
        # nn.BatchNorm2d(conv_channels),
        # nn.ReLU(),
        nn.ConvTranspose2d(conv_channels,
                           conv_channels,
                           4,
                           stride=2,
                           padding=1,
                           output_padding=0,
                           bias=False),
        nn.BatchNorm2d(conv_channels),
        nn.ReLU(),
        # nn.Conv2d(conv_channels, conv_channels, 3, stride=1, padding=1, bias=False),
        # nn.BatchNorm2d(conv_channels),
        # nn.ReLU(),
    )
    mean_decoder = nn.Sequential(
        nn.Conv2d(conv_channels, 3, 1, stride=1, padding=0, bias=True),
        nn.Sigmoid())
    log_cov_decoder = nn.Sequential(
        nn.Conv2d(conv_channels, 3, 1, stride=1, padding=0, bias=True), )
    if ptu.gpu_enabled():
        conv_encoder.cuda()
        pre_gru_conv.cuda()
        gru.cuda()
        post_gru_conv.cuda()
        conv_decoder.cuda()
        mean_decoder.cuda()
        log_cov_decoder.cuda()
        act_proc.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam([
        item for sublist in map(lambda x: list(x.parameters()), [
            conv_encoder, pre_gru_conv, gru, post_gru_conv, conv_decoder,
            mean_decoder, log_cov_decoder
        ]) for item in sublist
    ],
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
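    # Training loop: the loss is accumulated for freq_bptt steps and
    # backpropagated with a single optimizer step (truncated BPTT); the hidden
    # state is detached at each window boundary and re-initialized at the
    # start of every episode.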
    freq_bptt = exp_specs['freq_bptt']
    episode_length = exp_specs['episode_length']
    losses = []
    for iter_num in range(int(float(exp_specs['max_iters']))):
        if iter_num % freq_bptt == 0:
            if iter_num > 0:
                # loss = loss / freq_bptt
                loss.backward()
                model_optim.step()
                prev_h_batch = prev_h_batch.detach()
            loss = 0
        if iter_num % episode_length == 0:
            prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], gru_channels, inter_h,
                            inter_h))
            if ptu.gpu_enabled():
                prev_h_batch = prev_h_batch.cuda()

            train_loss_print = '\t'.join(losses)
            losses = []

        obs_batch, act_batch = data_loader.get_next_batch()
        act_batch = act_proc(act_batch).view(act_batch.size(0), act_channels,
                                             inter_h, inter_h)

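        # Predict the current frame from the previous hidden state and the
        # action, then encode the observed frame to update the recurrent state.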
        hidden = post_gru_conv(torch.cat([prev_h_batch, act_batch], 1))
        hidden = conv_decoder(hidden)
        recon = mean_decoder(hidden)
        log_cov = log_cov_decoder(hidden)
        log_cov = torch.clamp(log_cov, LOG_COV_MIN, LOG_COV_MAX)

        enc = conv_encoder(obs_batch)
        enc = pre_gru_conv(torch.cat([enc, act_batch], 1))
        prev_h_batch = gru(enc, prev_h_batch)

        losses.append('%.4f' % ((obs_batch - recon)**2).mean())
        if iter_num % episode_length != 0:
            loss = loss + (
                (obs_batch - recon)**2).sum() / float(exp_specs['batch_size'])
            # loss = loss + compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size'])

        if iter_num % (500 * episode_length) in range(2 * episode_length):
            save_pytorch_tensor_as_img(
                recon[0].data.cpu(),
                'junk_vis/onthefly_conv_gru_pogrid_len_8_scale_4/rnn_recon_%d.png'
                % iter_num)
            save_pytorch_tensor_as_img(
                obs_batch[0].data.cpu(),
                'junk_vis/onthefly_conv_gru_pogrid_len_8_scale_4/rnn_obs_%d.png'
                % iter_num)

        if iter_num % exp_specs['freq_val'] == 0:
            print('\nValidating Iter %d...' % iter_num)
            list(
                map(lambda x: x.eval(), [
                    conv_encoder, pre_gru_conv, gru, post_gru_conv,
                    conv_decoder, mean_decoder, log_cov_decoder, act_proc
                ]))

            val_prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], gru_channels, inter_h,
                            inter_h))
            if ptu.gpu_enabled():
                val_prev_h_batch = val_prev_h_batch.cuda()

            losses = []
            for i in range(episode_length):
                obs_batch, act_batch = val_data_loader.get_next_batch()
                act_batch = act_proc(act_batch).view(act_batch.size(0),
                                                     act_channels, inter_h,
                                                     inter_h)

                hidden = post_gru_conv(
                    torch.cat([val_prev_h_batch, act_batch], 1))
                hidden = conv_decoder(hidden)
                recon = mean_decoder(hidden)
                log_cov = log_cov_decoder(hidden)
                log_cov = torch.clamp(log_cov, LOG_COV_MIN, LOG_COV_MAX)

                enc = conv_encoder(obs_batch)
                enc = pre_gru_conv(torch.cat([enc, act_batch], 1))
                val_prev_h_batch = gru(enc, val_prev_h_batch)

                # val_loss = compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size'])
                losses.append('%.4f' % ((obs_batch - recon)**2).mean())

            loss_print = '\t'.join(losses)
            print('Val MSE:\t' + loss_print)
            print('Train MSE:\t' + train_loss_print)

            list(
                map(lambda x: x.train(), [
                    conv_encoder, pre_gru_conv, gru, post_gru_conv,
                    conv_decoder, mean_decoder, log_cov_decoder, act_proc
                ]))
Example #8
    algorithm = MetaSoftActorCritic(env_sampler=env_sampler,
                                    policy=policy,
                                    qf=qf,
                                    vf=vf,
                                    **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1


if __name__ == '__main__':
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-e',
                        '--experiment',
                        help='experiment specification file')
    args = parser.parse_args()
    with open(args.experiment, 'r') as spec_file:
        spec_string = spec_file.read()
        exp_specs = yaml.load(spec_string, Loader=yaml.FullLoader)

    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    experiment(exp_specs)
Example #9
    variant = dict(
        algorithm='HER-TD3',
        version='normal',
        algo_kwargs=dict(
            batch_size=256,
            num_epochs=100,
            num_eval_steps_per_epoch=5000,
            num_expl_steps_per_train_loop=1000,
            num_trains_per_train_loop=1000,
            min_num_steps_before_training=1000,
            max_path_length=50,
        ),
        td3_trainer_kwargs=dict(
            discount=0.95,
            reward_scale=1
        ),
        replay_buffer_kwargs=dict(
            max_size=int(1E6),
            fraction_goals_rollout_goals=0.2,  # equal to k = 4 in HER paper
            fraction_goals_env_goals=0,
        ),
        qf_kwargs=dict(
            hidden_sizes=[400, 300],
        ),
        policy_kwargs=dict(
            hidden_sizes=[400, 300],
        ),
    )
    setup_logger('her-td3-fetch-experiment', variant=variant)
    experiment(variant)
Example #10
def experiment(exp_specs):
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Load the data -----------------------------------------------------------
    extra_data_path = exp_specs['extra_data_path']
    train_replay_buffer = joblib.load(extra_data_path)['replay_buffer']
    train_replay_buffer.change_max_size_to_cur_size()
    train_replay_buffer._next_obs = train_replay_buffer._next_obs[:,exp_specs['extra_obs_dim']:]
    if exp_specs['remove_env_info']:
        train_replay_buffer._observations = train_replay_buffer._observations[:,exp_specs['extra_obs_dim']:]
    else:
        if exp_specs['normalize_env_info']:
            low, high = exp_specs['env_info_range'][0], exp_specs['env_info_range'][1]
            train_replay_buffer._observations[:,:exp_specs['extra_obs_dim']] -= (low + high)/2.0
            train_replay_buffer._observations[:,:exp_specs['extra_obs_dim']] /= (high - low)/2.0

    print('\nRewards: {} +/- {}'.format(
        np.mean(train_replay_buffer._rewards),
        np.std(train_replay_buffer._rewards)
    ))

    next_obs_mean = np.mean(train_replay_buffer._next_obs, 0)
    next_obs_std = np.std(train_replay_buffer._next_obs, 0)
    print('\nNext Obs:\n{}\n+/-\n{}'.format(
        next_obs_mean,
        next_obs_std
    ))

    print('\nAvg Next Obs Square Norm: {}'.format(
        np.mean(np.linalg.norm(train_replay_buffer._next_obs, axis=1)**2)
    ))

    sample_batch = train_replay_buffer.random_batch(exp_specs['train_batch_size'])
    obs_dim = sample_batch['observations'].shape[-1]
    act_dim = sample_batch['actions'].shape[-1]

    val_replay_buffer = SimpleReplayBuffer(exp_specs['val_set_size'], obs_dim, act_dim)
    val_replay_buffer.set_buffer_from_dict(
        train_replay_buffer.sample_and_remove(exp_specs['val_set_size'])
    )
    if exp_specs['train_from_beginning_transitions']:
        trans_dict = dict(
            observations=train_replay_buffer._observations[:exp_specs['train_set_size']],
            actions=train_replay_buffer._actions[:exp_specs['train_set_size']],
            rewards=train_replay_buffer._rewards[:exp_specs['train_set_size']],
            terminals=train_replay_buffer._terminals[:exp_specs['train_set_size']],
            next_observations=train_replay_buffer._next_obs[:exp_specs['train_set_size']],
        )
        train_replay_buffer.set_buffer_from_dict(trans_dict)
    else:
        train_replay_buffer.set_buffer_from_dict(
            train_replay_buffer.sample_and_remove(exp_specs['train_set_size'])
        )

    # Model Definitions -------------------------------------------------------
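    # The dynamics model maps (obs, action) to the concatenation of the
    # predicted next observation and the scalar reward (hence the +1 below).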
    if exp_specs['remove_env_info']:
        output_dim = [obs_dim + 1]
    else:
        output_dim = [obs_dim - exp_specs['extra_obs_dim'] + 1]
    model = GenericMap(
        [obs_dim + act_dim],
        output_dim,
        siamese_input=False,
        siamese_output=False,
        num_hidden_layers=exp_specs['num_hidden_layers'],
        hidden_dim=exp_specs['hidden_dim'],
        act='relu',
        use_bn=True,
        deterministic=True
    )

    model_optim = Adam(model.parameters(), lr=float(exp_specs['lr']))

    # Train -------------------------------------------------------------------
    model.train()
    for iter_num in range(exp_specs['max_iters']):
        model_optim.zero_grad()

        batch = train_replay_buffer.random_batch(exp_specs['train_batch_size'])
        batch = convert_numpy_dict_to_pytorch(batch)
        inputs = Variable(torch.cat([batch['observations'], batch['actions']], -1))
        outputs = Variable(torch.cat([batch['next_observations'], batch['rewards']], -1))

        preds = model([inputs])[0]
        if exp_specs['residual']:
            # residual for observations
            preds = preds + Variable(
                        torch.cat(
                            [
                                batch['observations'][:,exp_specs['extra_obs_dim']:],
                                torch.zeros(exp_specs['train_batch_size'], 1)
                            ],
                        1)
                    )
        
        loss = torch.mean(torch.sum((outputs - preds)**2, -1))

        loss.backward()
        model_optim.step()

        if iter_num % exp_specs['freq_val'] == 0:
            model.eval()

            val_batch = val_replay_buffer.random_batch(exp_specs['val_batch_size'])
            val_batch = convert_numpy_dict_to_pytorch(val_batch)
            inputs = Variable(torch.cat([val_batch['observations'], val_batch['actions']], -1))
            outputs = Variable(torch.cat([val_batch['next_observations'], val_batch['rewards']], -1))

            # print(exp_specs['remove_env_info'])
            # print(inputs)
            # print(outputs)
            # sleep(5)
            
            preds = model([inputs])[0]
            if exp_specs['residual']:
                # residual for observations
                preds = preds + Variable(
                            torch.cat(
                                [
                                    val_batch['observations'][:,exp_specs['extra_obs_dim']:],
                                    torch.zeros(exp_specs['val_batch_size'], 1)
                                ],
                            1)
                        )

            loss = torch.mean(torch.sum((outputs - preds)**2, -1))
            next_obs_loss = torch.mean(torch.sum((outputs[:,:-1] - preds[:,:-1])**2, -1))
            rew_loss = torch.mean(torch.sum((outputs[:,-1:] - preds[:,-1:])**2, -1))

            print('\n')
            print('-'*20)
            logger.record_tabular('Iter', iter_num)
            logger.record_tabular('Loss', loss.data[0])
            logger.record_tabular('Obs Loss', next_obs_loss.data[0])
            logger.record_tabular('Rew Loss', rew_loss.data[0])
            logger.dump_tabular(with_prefix=False, with_timestamp=False)

            model.train()
Example #11

if __name__ == "__main__":

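    # Resume an existing run: load its variant and saved parameters from
    # exp_dir and continue logging into the same directory.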
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "exp_dir",
        type=str,
        help="Experiment directory to load params and append logs")
    parser.add_argument('start_epoch',
                        type=int,
                        help="Start epoch for continue training logs")
    parser.add_argument("--params_fname", default="params.pkl", type=str)
    parser.add_argument('--gui', action='store_true')
    parser.add_argument('--no_gpu', action='store_true')

    args = parser.parse_args()

    variant = load_variant(args.exp_dir)
    variant["start_epoch"] = args.start_epoch
    variant['headless'] = not args.gui

    gpu_str = "0"
    if not args.no_gpu:
        ptu.enable_gpus(gpu_str)
        ptu.set_gpu_mode(True)

    params_data = load_params(os.path.join(args.exp_dir, args.params_fname))
    setup_logger(log_dir=args.exp_dir, variant=variant)

    experiment(variant, params_data)
Example #12
def experiment(exp_specs):
    ptu.set_gpu_mode(exp_specs['use_gpu'])
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    env_specs = {
        'flat_repr': False,
        'one_hot_repr': False,
        'maze_h': 9,
        'maze_w': 9,
        'obs_h': 5,
        'obs_w': 5,
        'scale': 4,
        'num_objs': 10
    }
    maze_constructor = lambda: PartiallyObservedGrid(env_specs)
    data_loader = VerySpecificOnTheFLyDataLoader(maze_constructor,
                                                 exp_specs['episode_length'],
                                                 exp_specs['batch_size'],
                                                 use_gpu=ptu.gpu_enabled())
    val_data_loader = VerySpecificOnTheFLyDataLoader(
        maze_constructor,
        exp_specs['episode_length'],
        exp_specs['batch_size'],
        use_gpu=ptu.gpu_enabled())

    # Model Definition --------------------------------------------------------
    model = RecurrentModel()
    if ptu.gpu_enabled():
        model.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam(model.parameters(),
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
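    # Training loop: the loss is accumulated for freq_bptt steps, then
    # backpropagated once; both LSTM states are detached at each window
    # boundary and re-initialized at the start of every episode.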
    freq_bptt = exp_specs['freq_bptt']
    episode_length = exp_specs['episode_length']
    losses = []
    for iter_num in range(int(float(exp_specs['max_iters']))):
        if iter_num % freq_bptt == 0:
            if iter_num > 0:
                # loss = loss / freq_bptt
                loss.backward()
                model_optim.step()
                prev_h_batch = prev_h_batch.detach()
                prev_c_batch = prev_c_batch.detach()
            loss = 0
        if iter_num % episode_length == 0:
            prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], model.lstm_dim))
            prev_c_batch = Variable(
                torch.zeros(exp_specs['batch_size'], model.lstm_dim))
            if ptu.gpu_enabled():
                prev_h_batch = prev_h_batch.cuda()
                prev_c_batch = prev_c_batch.cuda()

            train_loss_print = '\t'.join(losses)
            losses = []

        obs_batch, act_batch = data_loader.get_next_batch()
        recon, log_cov, prev_h_batch, prev_c_batch = model.forward(
            obs_batch, act_batch, prev_h_batch, prev_c_batch)

        losses.append('%.4f' % ((obs_batch - recon)**2).mean())
        if iter_num % episode_length != 0:
            # temp = (obs_batch - recon)**2 / 4.
            # temp[:,:,1:4,1:4] = temp[:,:,1:4,1:4] * 4.

            temp = (obs_batch - recon)**2
            loss = loss + temp.sum() / float(
                exp_specs['batch_size']) + model.reg_loss

            # loss = loss - compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size'])

        if iter_num % (500 * episode_length) in range(2 * episode_length):
            save_pytorch_tensor_as_img(
                recon[0].data.cpu(),
                'junk_vis/recurrent_deconv_stronger_2/rnn_recon_%d.png' %
                iter_num)
            save_pytorch_tensor_as_img(
                obs_batch[0].data.cpu(),
                'junk_vis/recurrent_deconv_stronger_2/rnn_obs_%d.png' %
                iter_num)

        if iter_num % exp_specs['freq_val'] == 0:
            model.eval()
            # print(mask[0], torch.mean(mask, 1), torch.std(mask, 1), torch.min(mask, 1), torch.max(mask, 1))
            print('\nValidating Iter %d...' % iter_num)

            val_prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], model.lstm_dim))
            val_prev_c_batch = Variable(
                torch.zeros(exp_specs['batch_size'], model.lstm_dim))
            if ptu.gpu_enabled():
                val_prev_h_batch = val_prev_h_batch.cuda()
                val_prev_c_batch = val_prev_c_batch.cuda()

            losses = []
            for i in range(episode_length):
                obs_batch, act_batch = val_data_loader.get_next_batch()

                recon, log_cov, val_prev_h_batch, val_prev_c_batch = model.forward(
                    obs_batch, act_batch, val_prev_h_batch, val_prev_c_batch)

                # val_loss = compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size'])
                losses.append('%.4f' % ((obs_batch - recon)**2).mean())

            loss_print = '\t'.join(losses)
            print('Val MSE:\t' + loss_print)
            print('Train MSE:\t' + train_loss_print)
            model.train()
Example #13
def experiment(variant):
    setup_logger("name-of-experiment", variant=variant)
    ptu.set_gpu_mode(True)
    log_dir = os.path.expanduser(variant["log_dir"])
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    # missing - set torch seed and num threads=1

    # expl_env = gym.make(variant["env_name"])
    expl_envs = make_vec_envs(
        variant["env_name"],
        variant["seed"],
        variant["num_processes"],
        variant["gamma"],
        variant["log_dir"],  # probably change this?
        ptu.device,
        False,
        pytorch=False,
    )
    # eval_env = gym.make(variant["env_name"])
    eval_envs = make_vec_envs(
        variant["env_name"],
        variant["seed"],
        variant["num_processes"],
        variant["gamma"],
        variant["log_dir"],
        ptu.device,
        False,
        pytorch=False,
    )
    obs_shape = expl_envs.observation_space.image.shape
    # if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:  # convert WxHxC into CxWxH
    #     expl_env = TransposeImage(expl_env, op=[2, 0, 1])
    #     eval_env = TransposeImage(eval_env, op=[2, 0, 1])
    # obs_shape = expl_env.observation_space.shape

    channels, obs_width, obs_height = obs_shape
    action_space = expl_envs.action_space
    action_space = gym.spaces.Box(-np.inf, np.inf, (10, ))
    expl_envs.action_space = action_space  # not sure if this works... lets see?!
    eval_envs.action_space = action_space

    base_kwargs = {
        "num_inputs": channels,
        "recurrent": variant["recurrent_policy"]
    }

    base = CNNBase(**base_kwargs)

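    # The 10-dim action is modelled as a tuple of distributions: 4 Bernoulli
    # dimensions plus a 6-dim diagonal Gaussian.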
    bernoulli_dist = distributions.Bernoulli(base.output_size, 4)
    continuous_dist = distributions.DiagGaussian(base.output_size, 6)
    dist = distributions.DistributionGeneratorTuple(
        (bernoulli_dist, continuous_dist))

    eval_policy = LearnPlanPolicy(
        WrappedPolicy(
            obs_shape,
            action_space,
            ptu.device,
            base=base,
            deterministic=True,
            dist=dist,
            num_processes=variant["num_processes"],
        ),
        num_processes=variant["num_processes"],
        vectorised=True,
    )
    expl_policy = LearnPlanPolicy(
        WrappedPolicy(
            obs_shape,
            action_space,
            ptu.device,
            base=base,
            deterministic=False,
            dist=dist,
            num_processes=variant["num_processes"],
        ),
        num_processes=variant["num_processes"],
        vectorised=True,
    )

    # missing: at this stage, policy hasn't been sent to device, but happens later
    eval_path_collector = HierarchicalStepCollector(
        eval_envs,
        eval_policy,
        ptu.device,
        max_num_epoch_paths_saved=variant["algorithm_kwargs"]
        ["num_eval_steps_per_epoch"],
        num_processes=variant["num_processes"],
        render=variant["render"],
    )
    expl_path_collector = HierarchicalStepCollector(
        expl_envs,
        expl_policy,
        ptu.device,
        max_num_epoch_paths_saved=variant["num_steps"],
        num_processes=variant["num_processes"],
        render=variant["render"],
    )
    # added: created rollout(5,1,(4,84,84),Discrete(6),1), reset env and added obs to rollout[step]

    trainer = A2CTrainer(actor_critic=expl_policy.learner,
                         **variant["trainer_kwargs"])
    # missing: by this point, rollout back in sync.
    replay_buffer = EnvReplayBuffer(variant["replay_buffer_size"], expl_envs)
    # added: replay buffer is new
    algorithm = TorchIkostrikovRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_envs,
        evaluation_env=eval_envs,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"],
        # batch_size,
        # max_path_length,
        # num_epochs,
        # num_eval_steps_per_epoch,
        # num_expl_steps_per_train_loop,
        # num_trains_per_train_loop,
        # num_train_loops_per_epoch=1,
        # min_num_steps_before_training=0,
    )

    algorithm.to(ptu.device)
    # missing: device back in sync
    algorithm.train()
Example #14
def experiment(exp_specs):
    ptu.set_gpu_mode(exp_specs['use_gpu'])
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
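    # multi_mnist renders 48x48 canvases containing up to two digits;
    # convert_dict turns the digit count into a two-slot presence indicator
    # that accompanies each image.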
    path = 'junk_vis/debug_att_vae_shallower_48_64_dim_0p1_kl_stronger_seg_conv'
    (X_train, Y_train), (X_test, Y_test) = multi_mnist(path,
                                                       max_digits=2,
                                                       canvas_size=48,
                                                       seed=42,
                                                       use_max=False)
    convert_dict = {0: [0., 0.], 1: [1., 0.], 2: [1., 1.]}
    Num_train = np.array([convert_dict[a.shape[0]] for a in Y_train])
    Num_test = np.array([convert_dict[a.shape[0]] for a in Y_test])
    X_train = X_train[:, None, ...]
    X_test = X_test[:, None, ...]
    X_train, X_test = torch.FloatTensor(X_train) / 255.0, torch.FloatTensor(
        X_test) / 255.0
    mask_train, mask_test = torch.FloatTensor(Num_train), torch.FloatTensor(
        Num_test)
    train_ds = TensorDataset(X_train, mask_train)
    val_ds = TensorDataset(X_test, mask_test)

    # Model Definition --------------------------------------------------------
    model = AttentiveVAE([1, 48, 48], exp_specs['vae_specs']['z_dim'],
                         exp_specs['vae_specs']['x_encoder_specs'],
                         exp_specs['vae_specs']['z_seg_conv_specs'],
                         exp_specs['vae_specs']['z_seg_fc_specs'],
                         exp_specs['vae_specs']['z_obj_conv_specs'],
                         exp_specs['vae_specs']['z_obj_fc_specs'],
                         exp_specs['vae_specs']['z_seg_recon_fc_specs'],
                         exp_specs['vae_specs']['z_seg_recon_upconv_specs'],
                         exp_specs['vae_specs']['z_obj_recon_fc_specs'],
                         exp_specs['vae_specs']['z_obj_recon_upconv_specs'],
                         exp_specs['vae_specs']['recon_upconv_part_specs'])
    if ptu.gpu_enabled():
        model.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam(model.parameters(),
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
    global_iter = 0
    for epoch in range(exp_specs['epochs']):
        train_loader = DataLoader(train_ds,
                                  batch_size=exp_specs['batch_size'],
                                  shuffle=True,
                                  num_workers=4,
                                  pin_memory=False,
                                  drop_last=True)
        for iter_num, img_batch in enumerate(train_loader):
            img_batch, num_batch = img_batch[0], img_batch[1]
            if ptu.gpu_enabled(): img_batch = img_batch.cuda()

            what_means, what_log_covs, where_means, where_log_covs, masks, recon_mean, recon_log_cov = model(
                img_batch, num_batch)
            elbo, KL = model.compute_ELBO(what_means + where_means,
                                          what_log_covs + where_log_covs,
                                          recon_mean,
                                          recon_log_cov,
                                          img_batch,
                                          average_over_batch=True)
            loss = -1. * elbo
            loss = loss + 1. * sum([m.mean() for m in masks])
            model_optim.zero_grad()
            loss.backward()
            model_optim.step()

            if global_iter % exp_specs['freq_val'] == 0:
                with torch.no_grad():
                    print('\nValidating Iter %d...' % global_iter)
                    model.eval()

                    idxs = np.random.choice(int(X_test.size(0)),
                                            size=exp_specs['batch_size'],
                                            replace=False)
                    img_batch, num_batch = X_test[idxs], mask_test[idxs]
                    if ptu.gpu_enabled(): img_batch = img_batch.cuda()

                    what_means, what_log_covs, where_means, where_log_covs, masks, recon_mean, recon_log_cov = model(
                        img_batch, num_batch)
                    elbo, KL = model.compute_ELBO(what_means + where_means,
                                                  what_log_covs +
                                                  where_log_covs,
                                                  recon_mean,
                                                  recon_log_cov,
                                                  img_batch,
                                                  average_over_batch=True)

                    mse = ((recon_mean - img_batch)**2).mean()

                    print('ELBO:\t%.4f' % elbo)
                    print('MSE:\t%.4f' % mse)
                    print('KL:\t%.4f' % KL)

                    for i in range(1):
                        save_pytorch_tensor_as_img(
                            img_batch[i].data.cpu(),
                            os.path.join(path,
                                         '%d_%d_img.png' % (global_iter, i)))
                        save_pytorch_tensor_as_img(
                            recon_mean[i].data.cpu(),
                            os.path.join(path,
                                         '%d_%d_recon.png' % (global_iter, i)))
                        save_pytorch_tensor_as_img(
                            masks[0][i].data.cpu(),
                            os.path.join(path, '%d_%d_mask_0.png' %
                                         (global_iter, i)))
                        # save_pytorch_tensor_as_img(masks[1][i].data.cpu(), os.path.join(path, '%d_%d_mask_1.png'%(global_iter, i)))

                    model.train()

            global_iter += 1
Example #15
File: ppo.py  Project: naruya/DIAYN
if __name__ == "__main__":
    # noinspection PyTypeChecker
    T = 2048
    max_ep_len = 1000
    epochs = 10
    minibatch_size = 64

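    # With rollouts of T steps, int(1e6 // T) iterations give a ~1e6-step
    # budget; each rollout is split into T // minibatch_size minibatches and
    # reused for `epochs` optimization passes.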
    variant = dict(
        algorithm="PPO",
        version="normal",
        layer_size=64,
        replay_buffer_size=T,
        algorithm_kwargs=dict(
            num_iter=int(1e6 // T),
            num_eval_steps_per_epoch=max_ep_len,
            num_trains_per_train_loop=T // minibatch_size * epochs,
            num_expl_steps_per_train_loop=T,
            min_num_steps_before_training=0,
            max_path_length=max_ep_len,
            minibatch_size=minibatch_size,
        ),
        trainer_kwargs=dict(
            epsilon=0.2,
            reward_scale=1.0,
            lr=3e-4,
        ),
    )
    setup_logger('PPOBipedalWalkerV2', variant=variant)
    #ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)
    experiment(variant)
Example #16
            qf_lr=args.qf_lr,
            reward_scale=1,

            # BEAR specific params
            mode='auto',
            kernel_choice=args.kernel_type,
            policy_update_style='0',
            mmd_sigma=args.mmd_sigma,
            target_mmd_thresh=args.target_mmd_thresh,
        ),
    )

    setup_logger(
        exp_prefix='bear-' + args.env,
        variant=variant,
        text_log_file="debug.log",
        variant_log_file="variant.json",
        tabular_log_file="progress.csv",
        snapshot_mode="gap_and_last",
        snapshot_gap=100,
        log_tabular_only=False,
        log_dir=None,
        git_infos=None,
        script_name=None,
        # **create_log_dir_kwargs
        base_log_dir='./data',
        exp_id=9999,
        seed=0)
    ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)
    experiment(variant)
Example #17
            sup_lr=(args.lr if args.lr else 1e-3),
        ),
        load_kwargs=dict(
            load=args.load,
            load_dir=log_dir,
        ),
    )
    if args.load:
        log_dir = log_dir + '_load'
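    # Persist the variant and the exact command line into the log directory
    # for reproducibility.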
    import os
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    with open(osp.join(log_dir, 'variant.json'), 'w') as out_json:
        import json
        json.dump(variant, out_json, indent=2)
    import sys
    cmd_input = 'python ' + ' '.join(sys.argv) + '\n'
    with open(osp.join(log_dir, 'cmd_input.txt'), 'a') as f:
        f.write(cmd_input)
    setup_logger(args.exp_name + '/' + main_dir,
                 variant=variant,
                 snapshot_mode=args.snapshot_mode,
                 snapshot_gap=args.snapshot_gap,
                 log_dir=log_dir)
    import numpy as np
    import torch
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)
    experiment(variant)
Example #18
def experiment(variant, args):
    # expl_env = NormalizedBoxEnv(gym.make(str(args.env)))
    # eval_env = NormalizedBoxEnv(gym.make(str(args.env)))
    print(os.getpid())
    ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)
    set_seed(args.seed)
    setup_logger('DIAYN_' + str(args.skill_dim) + '_' + args.env +
                 str(args.seed),
                 variant=variant,
                 snapshot_mode="last")

    expl_env = NormalizedBoxEnv(Mani2dEnv())
    eval_env = NormalizedBoxEnv(Mani2dEnv())

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    skill_dim = args.skill_dim

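    # DIAYN networks: twin Q-functions (and targets) condition on
    # (obs, action, skill); the discriminator df predicts the skill from the
    # observation alone.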
    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + skill_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + skill_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + skill_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + skill_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    df = FlattenMlp(
        input_size=obs_dim,
        output_size=skill_dim,
        hidden_sizes=[M, M],
    )
    policy = SkillTanhGaussianPolicy(obs_dim=obs_dim + skill_dim,
                                     action_dim=action_dim,
                                     hidden_sizes=[M, M],
                                     skill_dim=skill_dim)
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = DIAYNMdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_step_collector = MdpStepCollector(
        expl_env,
        policy,
    )
    replay_buffer = DIAYNEnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        skill_dim,
    )
    trainer = DIAYNTrainer(env=eval_env,
                           policy=policy,
                           qf1=qf1,
                           qf2=qf2,
                           df=df,
                           target_qf1=target_qf1,
                           target_qf2=target_qf2,
                           **variant['trainer_kwargs'])
    algorithm = DIAYNTorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_step_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()