Example #1
def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = NormalizedBoxEnv(InvertedPendulumEnv())
    env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    training_env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
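
For orientation, here is a minimal sketch of the kind of variant dict this entry point reads. The key names come from the accesses above; the concrete values and the algo_params contents are illustrative assumptions, not the repo's actual defaults.

# Illustrative only: the values and the algo_params keys are assumptions.
example_variant = dict(
    env_specs=dict(base_env_name='meta_simple_meta_reacher', normalized=False),
    net_size=256,
    algo_params=dict(
        num_epochs=100,
        num_steps_per_epoch=1000,
        batch_size=256,
        discount=0.99,
    ),
)
# experiment(example_variant)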
Example #2
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # load the expert demonstration buffers (meta-train / meta-test, context / test splits)
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the policy and policy optimizer
    hidden_sizes = [variant['algo_params']['policy_net_size']] * variant['algo_params']['policy_num_layers']
    z_dim = variant['algo_params']['np_params']['z_dim']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )
    
    # disc_model = GAILDiscModel(obs_dim + action_dim + z_dim, hid_dim=variant['algo_params']['disc_net_size'])
    disc_model = MlpGAILDisc(
        hidden_sizes=variant['disc_hidden_sizes'],
        output_size=1,
        input_size=obs_dim + action_dim + z_dim,
        hidden_activation=torch.nn.functional.tanh,
        layer_norm=variant['disc_uses_layer_norm']
        # output_activation=identity,
    )

    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_params']['policy_params']
    )

    # Make the neural process
    # in the initial version we are assuming all trajectories have the same length
    timestep_enc_params = variant['algo_params']['np_params']['traj_enc_params']['timestep_enc_params']
    traj_enc_params = variant['algo_params']['np_params']['traj_enc_params']['traj_enc_params']
    timestep_enc_params['input_size'] = obs_dim + action_dim
    
    traj_samples, _ = train_context_buffer.sample_trajs(1, num_tasks=1)
    # len_context_traj = traj_samples[0][0]['observations'].shape[0]
    len_context_traj = 5  # NOTE: the context trajectory length is hard-coded rather than read from the sampled trajectory
    traj_enc_params['input_size'] = timestep_enc_params['output_size'] * len_context_traj

    traj_enc = TrivialTrajEncoder(
        timestep_enc_params,
        traj_enc_params
    )

    trunk_params = variant['algo_params']['np_params']['r2z_map_params']['trunk_params']
    trunk_params['input_size'] = traj_enc.output_size
    
    split_params = variant['algo_params']['np_params']['r2z_map_params']['split_heads_params']
    split_params['input_size'] = trunk_params['output_size']
    split_params['output_size'] = variant['algo_params']['np_params']['z_dim']
    
    r2z_map = TrivialR2ZMap(
        trunk_params,
        split_params
    )
    
    np_enc = TrivialNPEncoder(
        variant['algo_params']['np_params']['np_enc_params']['agg_type'],
        traj_enc,
        r2z_map
    )
    
    # class StupidDistFormat():
    #     def __init__(self, var):
    #         self.mean = var
    # class ZeroModule(nn.Module):
    #     def __init__(self, z_dim):
    #         super().__init__()
    #         self.z_dim = z_dim
    #         self.fc = nn.Linear(10,10)
        
    #     def forward(self, context):
    #         c_len = len(context)
    #         return StupidDistFormat(Variable(torch.zeros(c_len, self.z_dim), requires_grad=False))
    # np_enc = ZeroModule(variant['algo_params']['np_params']['z_dim'])


    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)
    algorithm = NeuralProcessAIRL(
        meta_test_env, # env is the test env, training_env is the training env (following rlkit original setup)
        
        policy,
        disc_model,

        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,

        np_enc,

        policy_optimizer,

        training_env=meta_train_env, # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example #3
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # load the expert demonstration buffers (meta-train / meta-test, context / test splits)
    train_context_buffer, train_test_buffer = extra_data['meta_train'][
        'context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test'][
        'context'], extra_data['meta_test']['test']

    # load the expert
    expert_policy = joblib.load(variant['expert_policy'])['algorithm']
    expert_policy.replay_buffer = None

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        meta_train_env = ScaledMetaEnv(
            meta_train_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        meta_test_env = ScaledMetaEnv(
            meta_test_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
    print(meta_train_env)
    print(meta_test_env)

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError(
                'Not implemented pixel version of things!')
        else:
            obs_dim = int(
                np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    z_dim = variant['algo_params']['z_dim']
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']

    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    # Make the encoder
    encoder = TimestepBasedEncoder(
        2 * obs_dim + action_dim,  #(s,a,s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg'])

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(
        env_specs)

    algorithm = MetaDagger(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        expert_policy,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        encoder,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params'])

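    # Seed the algorithm's replay buffer with the expert context data: for each
    # task, copy the expert transitions into the corresponding (empty) task
    # replay buffer and set its _size/_top pointers to match.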
    for task_id in train_context_buffer.task_replay_buffers:
        erb = train_context_buffer.task_replay_buffers[task_id]
        rb = algorithm.replay_buffer.task_replay_buffers[task_id]
        erb_size = erb._size
        print(erb_size)
        for k in erb._observations:
            rb._observations[k][:erb_size] = erb._observations[k][:erb_size]
            rb._next_obs[k][:erb_size] = erb._next_obs[k][:erb_size]
        rb._actions[:erb_size] = erb._actions[:erb_size]
        rb._rewards[:erb_size] = erb._rewards[:erb_size]
        rb._terminals[:erb_size] = erb._terminals[:erb_size]
        rb._absorbing[:erb_size] = erb._absorbing[:erb_size]
        rb._size = erb_size
        rb._top = erb_size

    # print('\n\n')
    # for task_id in algorithm.replay_buffer.task_replay_buffers:
    #     rb = algorithm.replay_buffer.task_replay_buffers[task_id]
    #     print(rb._size)
    #     print(rb._top)
    #     print(rb._max_replay_buffer_size)

    if ptu.gpu_enabled():
        expert_policy.cuda()
        algorithm.cuda()
    algorithm.train()

    return 1
Example #4
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # load the expert demonstration buffers (meta-train / meta-test, context / test splits)
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    if variant['algo_params']['state_only']: print('\n\nUSING STATE ONLY DISC\n\n')
    if variant['algo_params']['transfer_version']:
        disc_model = TransferVersionSingleColorFetchCustomDisc(
            clamp_magnitude=variant['disc_clamp_magnitude'],
            z_dim=variant['algo_params']['np_params']['z_dim'],
            gamma=0.99
        )
    else:
        disc_model = ThirdVersionSingleColorFetchCustomDisc(
            clamp_magnitude=variant['disc_clamp_magnitude'],
            state_only=variant['algo_params']['state_only'],
            wrap_absorbing=variant['algo_params']['wrap_absorbing'],
            z_dim=variant['algo_params']['np_params']['z_dim']
        )
    if variant['algo_params']['use_target_disc']:
        target_disc = disc_model.copy()
    else:
        target_disc = None
    print(disc_model)
    print(disc_model.clamp_magnitude)

    z_dim = variant['algo_params']['np_params']['z_dim']
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
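    # the Q-functions, V-function, and policy below all reuse the discriminator's
    # observation processor (the target discriminator's copy when one is used)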
    qf1 = ObsPreprocessedQFunc(
        target_disc.obs_processor if target_disc is not None else disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 4 + 1*variant['algo_params']['wrap_absorbing'],
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing']
    )
    qf2 = ObsPreprocessedQFunc(
        target_disc.obs_processor if target_disc is not None else disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 4 + 1*variant['algo_params']['wrap_absorbing'],
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing']
    )
    vf = ObsPreprocessedVFunc(
        target_disc.obs_processor if target_disc is not None else disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 1*variant['algo_params']['wrap_absorbing'],
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing']
    )
    policy = WithZObsPreprocessedReparamTanhMultivariateGaussianPolicy(
        target_disc.obs_processor if target_disc is not None else disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        obs_dim=6 + 4,
        action_dim=4
    )

    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'],
        **variant['policy_params']
    )

    # Make the neural process
    traj_enc = TrivialTrajEncoder(state_only=variant['algo_params']['state_only'])
    context_enc = TrivialContextEncoder(
        variant['algo_params']['np_params']['agg_type'],
        traj_enc,
        state_only=variant['algo_params']['state_only']
    )
    r2z_map = TrivialR2ZMap(z_dim)
    
    np_enc = TrivialNPEncoder(
        context_enc,
        r2z_map,
        state_only=variant['algo_params']['state_only']
    )
    
    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    if variant['meta_fairl']:
        print('\n\nUSING META-FAIRL\n\n')
        algorithm_class = MetaFAIRL
    else:
        print('\n\nUSING META-AIRL\n\n')
        algorithm_class = NeuralProcessAIRL
    
    algorithm = algorithm_class(
        meta_test_env, # env is the test env, training_env is the training env (following rlkit original setup)
        
        policy,
        disc_model,

        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,

        np_enc,

        policy_optimizer,

        training_env=meta_train_env, # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,

        target_disc=target_disc,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
        # print(target_disc)
        # print(next(algorithm.discriminator.obs_processor.parameters()).is_cuda)
        # print(next(algorithm.main_policy.preprocess_model.parameters()).is_cuda)
        # print(algorithm.main_policy.preprocess_model is algorithm.main_policy.copy().preprocess_model)
        # print(algorithm.main_policy.preprocess_model is algorithm.main_policy.preprocess_model.copy())
        # 1/0
    algorithm.train()

    return 1
Example #5
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # load the expert demonstration buffers (meta-train / meta-test, context / test splits)
    train_context_buffer, train_test_buffer = extra_data['meta_train'][
        'context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test'][
        'context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError(
                'Not implemented pixel version of things!')
        else:
            obs_dim = int(
                np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    if variant['algo_params']['state_only']:
        print('\n\nUSING STATE ONLY DISC\n\n')
    assert 'transfer_version' not in variant['algo_params']
    if variant['algo_params']['only_Dc'] or variant['algo_params'][
            'disc_ignores_z']:
        disc_model = OnlyDcTFuncForFetch(
            T_clamp_magnitude=variant['T_clamp_magnitude'],
            gating_clamp_magnitude=variant['gating_clamp_magnitude'],
            state_only=variant['algo_params']['state_only'],
            wrap_absorbing=variant['algo_params']['wrap_absorbing'],
            D_c_repr_dim=variant['algo_params']['D_c_repr_dim'],
        )
    else:
        disc_model = TFuncForFetch(
            T_clamp_magnitude=variant['T_clamp_magnitude'],
            gating_clamp_magnitude=variant['gating_clamp_magnitude'],
            state_only=variant['algo_params']['state_only'],
            wrap_absorbing=variant['algo_params']['wrap_absorbing'],
            D_c_repr_dim=variant['algo_params']['D_c_repr_dim'],
            z_dim=variant['algo_params']['np_params']['z_dim'])
    if variant['algo_params']['use_target_disc']:
        target_disc = disc_model.copy()
    else:
        target_disc = None
    print(disc_model)
    print(disc_model.T_clamp_magnitude)
    print(disc_model.gating_clamp_magnitude)

    z_dim = variant['algo_params']['np_params']['z_dim']
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']

    # make the policy and its obs gating model
    if not variant['algo_params']['only_Dc']:
        latent_dim = z_dim
    else:
        latent_dim = variant['algo_params']['D_c_repr_dim']

    if variant['algo_params']['use_disc_obs_processor'] and variant[
            'algo_params']['only_Dc']:
        assert variant['algo_params']['only_Dc']
        print('\n\nUSING DISC OBS PROCESSOR\n\n')
        policy_obs_gating = disc_model.D_c_repr_obs_processor
        policy = WithZObsPreprocessedReparamTanhMultivariateGaussianPolicy(
            policy_obs_gating,
            latent_dim,
            train_preprocess_model=False,
            hidden_sizes=hidden_sizes,
            obs_dim=6 + 4,
            action_dim=4)
    else:
        # print('\n\n$$$$$$$$\nNO BN IN POL GATING\n$$$$$$$$$\n\n')
        policy_obs_gating = ObsGating(variant['gating_clamp_magnitude'],
                                      z_dim=latent_dim)
        policy = WithZObsPreprocessedReparamTanhMultivariateGaussianPolicy(
            policy_obs_gating,
            latent_dim,
            train_preprocess_model=True,
            hidden_sizes=hidden_sizes,
            obs_dim=6 + 4,
            action_dim=4)
    print(policy)
    qf1 = ObsPreprocessedQFunc(
        policy.preprocess_model,
        latent_dim,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 4 + 1 * variant['algo_params']['wrap_absorbing'],
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'])
    qf2 = ObsPreprocessedQFunc(
        policy.preprocess_model,
        latent_dim,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 4 + 1 * variant['algo_params']['wrap_absorbing'],
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'])
    vf = ObsPreprocessedVFunc(
        policy.preprocess_model,
        latent_dim,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 1 * variant['algo_params']['wrap_absorbing'],
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'])

    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'],
        **variant['policy_params'])

    # make the context encoder for the discriminator
    traj_enc = TrivialTrajEncoder(
        state_only=variant['algo_params']['state_only'])
    disc_r_getter = TrivialContextEncoder(
        variant['algo_params']['np_params']['agg_type'],
        traj_enc,
        state_only=variant['algo_params']['state_only'])
    disc_encoder = TrivialDiscDcEncoder(disc_r_getter,
                                        variant['algo_params']['D_c_repr_dim'])

    # make the amortized q distribution
    if variant['algo_params']['q_uses_disc_r_getter']:
        r2z_map = TrivialR2ZMap(z_dim)
        q_model = TrivialNPEncoder(disc_r_getter,
                                   r2z_map,
                                   train_context_encoder=False)
    else:
        traj_enc = TrivialTrajEncoder(
            state_only=variant['algo_params']['state_only'])
        r2z_map = TrivialR2ZMap(z_dim)
        q_context_encoder = TrivialContextEncoder(
            variant['algo_params']['np_params']['agg_type'],
            traj_enc,
            state_only=variant['algo_params']['state_only'])
        q_model = TrivialNPEncoder(q_context_encoder,
                                   r2z_map,
                                   train_context_encoder=True)

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(
        env_specs)

    algorithm = NeuralProcessMetaIRL(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        disc_model,
        disc_encoder,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        q_model,
        policy_optimizer,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        target_disc=target_disc,
        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
        # print(target_disc)
        # print(next(algorithm.discriminator.obs_processor.parameters()).is_cuda)
        # print(next(algorithm.main_policy.preprocess_model.parameters()).is_cuda)
        # print(algorithm.main_policy.preprocess_model is algorithm.main_policy.copy().preprocess_model)
        # print(algorithm.main_policy.preprocess_model is algorithm.main_policy.preprocess_model.copy())
        # 1/0
    algorithm.train()

    return 1
Example #6
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # load the expert demonstration buffers (meta-train / meta-test, context / test splits)
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        meta_train_env = ScaledMetaEnv(
            meta_train_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        meta_test_env = ScaledMetaEnv(
            meta_test_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
    print(meta_train_env)
    print(meta_test_env)

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    z_dim = variant['algo_params']['z_dim']
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']

    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    # Make the encoder
    encoder = TimestepBasedEncoder(
        2*obs_dim + action_dim, #(s,a,s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg']
    )
    # ---------------
    # encoder = WeightShareTimestepBasedEncoder(
    #     obs_dim,
    #     action_dim,
    #     64,
    #     variant['algo_params']['r_dim'],
    #     variant['algo_params']['z_dim'],
    #     variant['algo_params']['enc_hid_dim'],
    #     variant['algo_params']['r2z_hid_dim'],
    #     variant['algo_params']['num_enc_layer_blocks'],
    #     hid_act='relu',
    #     use_bn=True,
    #     within_traj_agg=variant['algo_params']['within_traj_agg']
    # )
    # ---------------
    # traj_enc = ConvTrajEncoder(
    #     variant['algo_params']['np_params']['traj_enc_params']['num_conv_layers'],
    #     # obs_dim + action_dim,
    #     obs_dim + action_dim + obs_dim,
    #     variant['algo_params']['np_params']['traj_enc_params']['channels'],
    #     variant['algo_params']['np_params']['traj_enc_params']['kernel'],
    #     variant['algo_params']['np_params']['traj_enc_params']['stride'],
    # )
    # Dc2R_map = Dc2RMap(
    #     variant['algo_params']['np_params']['Dc2r_params']['agg_type'],
    #     traj_enc,
    #     state_only=False
    # )
    # r2z_map = R2ZMap(
    #     variant['algo_params']['np_params']['r2z_params']['num_layers'],
    #     variant['algo_params']['np_params']['traj_enc_params']['channels'],
    #     variant['algo_params']['np_params']['r2z_params']['hid_dim'],
    #     variant['algo_params']['z_dim']
    # )
    # encoder = NPEncoder(
    #     Dc2R_map,
    #     r2z_map,
    # )

    
    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    algorithm = NeuralProcessBC(
        meta_test_env, # env is the test env, training_env is the training env (following rlkit original setup)
        
        policy,

        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,

        encoder,

        training_env=meta_train_env, # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,

        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example #7
def experiment(specs):
    # 'expert_dir' is the directory containing the specific experiment run
    # (with a particular seed, etc.) of the expert policy used for generating
    # trajectories
    if not specs['use_scripted_policy']:
        policy_is_scripted = False
        expert = joblib.load(path.join(specs['expert_dir'],
                                       'extra_data.pkl'))['algorithm']
        # max_path_length = expert.max_path_length
        max_path_length = specs['max_path_length']
        if max_path_length != expert.max_path_length:
            print('\n\nUsing max_path_length {}! Expert\'s was {}!'.format(
                max_path_length, expert.max_path_length))
        attrs = [
            'max_path_length', 'policy_uses_pixels', 'policy_uses_task_params',
            'no_terminal'
        ]
        expert_policy_specs = {att: getattr(expert, att) for att in attrs}
        expert_policy_specs['wrap_absorbing'] = specs['wrap_absorbing']
        no_terminal = specs['no_terminal']
    else:
        policy_is_scripted = True
        max_path_length = specs['max_path_length']
        wrap_absorbing = specs['wrap_absorbing']
        expert_policy_specs = {
            'policy_uses_pixels':
            specs['policy_uses_pixels'],
            'policy_uses_task_params':
            specs['policy_uses_task_params'],
            'concat_task_params_to_policy_obs':
            specs['concat_task_params_to_policy_obs']
        }
        no_terminal = specs['no_terminal']
        expert = get_scripted_policy(specs['scripted_policy_name'])

    # set up the envs
    env_specs = specs['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)

    # get the task param iterators for the meta envs
    meta_train_params_sampler, meta_test_params_sampler = get_meta_env_params_iters(
        env_specs)

    # make the replay buffers
    if specs['wrap_absorbing']:
        _max_buffer_size = (max_path_length +
                            2) * specs['num_rollouts_per_task']
    else:
        _max_buffer_size = max_path_length * specs['num_rollouts_per_task']
    _max_buffer_size = int(
        np.ceil(_max_buffer_size / float(specs['subsample_factor']))) + 10
    # the + 10 is a safety margin in case ._size of a replay buffer is used incorrectly somewhere
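    # e.g. max_path_length=100, num_rollouts_per_task=2, subsample_factor=4,
    # wrap_absorbing=True -> ceil((100 + 2) * 2 / 4) + 10 = 61 transitions per task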

    buffer_constructor = lambda env_for_buffer: MetaEnvReplayBuffer(
        _max_buffer_size,
        env_for_buffer,
        policy_uses_pixels=specs['student_policy_uses_pixels'],
        # we don't want the student policy to be looking at true task parameters
        policy_uses_task_params=False,
        concat_task_params_to_policy_obs=False)

    train_context_buffer = buffer_constructor(meta_train_env)
    test_context_buffer = buffer_constructor(meta_test_env)

    render = specs['render']
    check_for_success = specs['check_for_success']
    # fill the train buffers
    fill_buffer(train_context_buffer,
                meta_train_env,
                expert,
                expert_policy_specs,
                meta_train_params_sampler,
                specs['num_rollouts_per_task'],
                max_path_length,
                no_terminal=no_terminal,
                wrap_absorbing=specs['wrap_absorbing'],
                policy_is_scripted=policy_is_scripted,
                render=render,
                check_for_success=check_for_success,
                subsample_factor=specs['subsample_factor'],
                deterministic=specs['get_deterministic_expert_demos'])
    train_test_buffer = deepcopy(train_context_buffer)

    # fill the test buffers
    fill_buffer(test_context_buffer,
                meta_train_env,
                expert,
                expert_policy_specs,
                meta_test_params_sampler,
                specs['num_rollouts_per_task'],
                max_path_length,
                no_terminal=no_terminal,
                wrap_absorbing=specs['wrap_absorbing'],
                policy_is_scripted=policy_is_scripted,
                render=render,
                check_for_success=check_for_success,
                subsample_factor=specs['subsample_factor'],
                deterministic=specs['get_deterministic_expert_demos'])
    test_test_buffer = deepcopy(test_context_buffer)

    # save the replay buffers
    d = {
        'meta_train': {
            'context': train_context_buffer,
            'test': train_test_buffer
        },
        'meta_test': {
            'context': test_context_buffer,
            'test': test_test_buffer
        }
    }
    logger.save_extra_data(d)

    return 1
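
The dict saved above is what the training entry points in the other examples load back. Below is a minimal sketch of that consumer side; the log directory path is a placeholder, not a real path from the repo.

# Sketch of the consumer side; '<log_dir>' stands in for wherever
# logger.save_extra_data wrote the pickle.
extra_data = joblib.load(path.join('<log_dir>', 'extra_data.pkl'))
train_context_buffer = extra_data['meta_train']['context']
train_test_buffer = extra_data['meta_train']['test']
test_context_buffer = extra_data['meta_test']['context']
test_test_buffer = extra_data['meta_test']['test']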
Example #8
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # load the expert demonstration buffers (meta-train / meta-test, context / test splits)
    train_context_buffer, train_test_buffer = extra_data['meta_train'][
        'context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test'][
        'context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)

    # student policy should not have access to any task information
    print(variant['algo_params'].keys())
    meta_train_env.policy_uses_pixels = variant['algo_params'][
        'policy_uses_pixels']
    meta_train_env.policy_uses_task_params = False
    meta_train_env.concat_task_params_to_policy_obs = False

    meta_test_env.policy_uses_pixels = variant['algo_params'][
        'policy_uses_pixels']
    meta_test_env.policy_uses_task_params = False
    meta_test_env.concat_task_params_to_policy_obs = False

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError(
                'Not implemented pixel version of things!')
        else:
            obs_dim = int(
                np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    policy_net_size = variant['algo_params']['policy_net_size']
    policy_num_layers = variant['algo_params']['policy_num_layers']
    hidden_sizes = [policy_net_size] * policy_num_layers
    # policy = MlpPolicy(
    #     [policy_net_size, policy_net_size],
    #     action_dim,
    #     obs_dim + variant['algo_params']['np_params']['z_dim'],
    #     hidden_activation=torch.nn.functional.tanh,
    #     layer_norm=variant['algo_params']['use_layer_norm']
    # )
    policy = MlpPolicy(
        hidden_sizes,
        action_dim,
        obs_dim + variant['algo_params']['np_params']['z_dim'],
        # hidden_activation=torch.nn.functional.relu,
        hidden_activation=torch.nn.functional.tanh,
        output_activation=torch.nn.functional.tanh,
        layer_norm=variant['algo_params']['use_layer_norm']
        # batch_norm=True
    )

    # Make the neural process
    # in the initial version we are assuming all trajectories have the same length
    timestep_enc_params = variant['algo_params']['np_params'][
        'traj_enc_params']['timestep_enc_params']
    traj_enc_params = variant['algo_params']['np_params']['traj_enc_params'][
        'traj_enc_params']
    timestep_enc_params['input_size'] = obs_dim + action_dim

    traj_samples, _ = train_context_buffer.sample_trajs(1, num_tasks=1)
    # len_context_traj = traj_samples[0][0]['observations'].shape[0]
    len_context_traj = 5  # NOTE: hard-coded, overriding the sampled trajectory length above
    traj_enc_params[
        'input_size'] = timestep_enc_params['output_size'] * len_context_traj

    traj_enc = TrivialTrajEncoder(timestep_enc_params, traj_enc_params)

    trunk_params = variant['algo_params']['np_params']['r2z_map_params'][
        'trunk_params']
    trunk_params['input_size'] = traj_enc.output_size

    split_params = variant['algo_params']['np_params']['r2z_map_params'][
        'split_heads_params']
    split_params['input_size'] = trunk_params['output_size']
    split_params['output_size'] = variant['algo_params']['np_params']['z_dim']

    r2z_map = TrivialR2ZMap(trunk_params, split_params)

    np_enc = TrivialNPEncoder(
        variant['algo_params']['np_params']['np_enc_params']['agg_type'],
        traj_enc, r2z_map)

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(
        env_specs)
    algorithm = NeuralProcessBC(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        np_enc,
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        training_env=meta_train_env,  # the env used for generating trajectories
        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example #9
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # load the expert demonstration buffers (meta-train / meta-test, context / test splits)
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    z_dim = variant['algo_params']['np_params']['z_dim']
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']

    obs_processor = ObsGatingV1(
        clamp_magnitude=variant['gate_logit_clamp_magnitude'],
        z_dim=z_dim
    )
    policy = WithZObsPreprocessedReparamTanhMultivariateGaussianPolicy(
        obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        obs_dim=6 + 4,
        action_dim=4,

        train_preprocess_model=True
    )

    # Make the neural process
    traj_enc = TrivialTrajEncoder(state_only=variant['algo_params']['state_only'])
    context_enc = TrivialContextEncoder(
        variant['algo_params']['np_params']['agg_type'],
        traj_enc,
        state_only=variant['algo_params']['state_only']
    )
    r2z_map = TrivialR2ZMap(z_dim)
    
    np_enc = TrivialNPEncoder(
        context_enc,
        r2z_map,
        state_only=variant['algo_params']['state_only']
    )
    
    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    algorithm = NeuralProcessBC(
        meta_test_env, # env is the test env, training_env is the training env (following rlkit original setup)
        
        policy,

        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,

        np_enc,

        training_env=meta_train_env, # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,

        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example #10
def experiment(variant):
    env_specs = variant['env_specs']
    if variant['algo_params']['meta']:
        env, training_env = get_meta_env(env_specs)
    else:
        if env_specs['train_test_env']:
            env, training_env = get_env(env_specs)
        else:
            env, _ = get_env(env_specs)
            training_env, _ = get_env(env_specs)

    if variant['algo_params']['meta']:
        train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(
            env_specs)

    print(env.observation_space)

    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                obs_dim += int(
                    np.prod(
                        env.observation_space.spaces['obs_task_params'].shape))
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    hidden_sizes = [net_size] * variant['num_hidden_layers']
    if variant['use_custom_ant_models']:
        assert isinstance(env.observation_space, Dict)
        print('CUSTOM ANT WITH LINEAR EMBEDDING OF THE TARGET POSITION')
        qf1 = AntRandGoalCustomQFunc(
            int(np.prod(
                env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            input_size=int(np.prod(env.observation_space.spaces['obs'].shape))
            + action_dim,
            output_size=1,
        )
        qf2 = AntRandGoalCustomQFunc(
            int(np.prod(
                env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            input_size=int(np.prod(env.observation_space.spaces['obs'].shape))
            + action_dim,
            output_size=1,
        )
        vf = AntRandGoalCustomVFunc(
            int(np.prod(
                env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            input_size=int(np.prod(env.observation_space.spaces['obs'].shape)),
            output_size=1,
        )
        policy = AntRandGoalCustomReparamTanhMultivariateGaussianPolicy(
            int(np.prod(
                env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            obs_dim=int(np.prod(env.observation_space.spaces['obs'].shape)),
            action_dim=action_dim,
        )

        # CUSTOM ANT WITH GATING ACTIVATIONS OF EACH LAYER
        # qf1 = AntCustomGatingQFuncV1()
        # qf2 = AntCustomGatingQFuncV1()
        # vf = AntCustomGatingVFuncV1()
        # policy = AntCustomGatingV1ReparamTanhMultivariateGaussianPolicy()
    else:
        print('Using simple model')
        qf1 = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim + action_dim,
            output_size=1,
        )
        qf2 = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim + action_dim,
            output_size=1,
        )
        vf = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim,
            output_size=1,
        )
        policy = ReparamTanhMultivariateGaussianPolicy(
            hidden_sizes=hidden_sizes,
            obs_dim=obs_dim,
            action_dim=action_dim,
        )

    if variant['algo_params']['meta']:
        algorithm = MetaNewSoftActorCritic(
            env=env,
            training_env=training_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            vf=vf,
            train_task_params_sampler=train_task_params_sampler,
            test_task_params_sampler=test_task_params_sampler,
            true_env_obs_dim=int(
                np.prod(env.observation_space.spaces['obs'].shape)),
            **variant['algo_params'])
    else:
        algorithm = NewSoftActorCritic(env=env,
                                       training_env=training_env,
                                       policy=policy,
                                       qf1=qf1,
                                       qf2=qf2,
                                       vf=vf,
                                       **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example #11
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    train_context_buffer, train_test_buffer = extra_data['meta_train'][
        'context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test'][
        'context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        meta_train_env = ScaledMetaEnv(
            meta_train_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        meta_test_env = ScaledMetaEnv(
            meta_test_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError(
                'Not implemented pixel version of things!')
        else:
            obs_dim = int(
                np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    if variant['algo_params']['state_only']:
        print('\n\nUSING STATE ONLY DISC\n\n')
    if variant['algo_params']['state_only']: raise NotImplementedError()
    disc_model = AntLinClassDisc(
        obs_dim - 12 + 2 + action_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'],
        z_dim=variant['algo_params']['z_dim'])
    print(disc_model)
    print(disc_model.clamp_magnitude)

    if variant['algo_params']['use_target_disc']:
        target_disc = disc_model.copy()
    else:
        target_disc = None
    print(disc_model)
    print(disc_model.clamp_magnitude)

    z_dim = variant['algo_params']['z_dim']
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    qf1 = ObsPreprocessedQFunc(
        disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        input_size=obs_dim - 12 + 2 + action_dim,
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'])
    qf2 = ObsPreprocessedQFunc(
        disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        input_size=obs_dim - 12 + 2 + action_dim,
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'])
    vf = ObsPreprocessedVFunc(
        disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        input_size=obs_dim - 12 + 2,
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'])
    policy = WithZObsPreprocessedReparamTanhMultivariateGaussianPolicy(
        disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim - 12 + 2,
        action_dim=action_dim)
    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'],
        **variant['policy_params'])

    # make the encoder
    encoder = TimestepBasedEncoder(
        2 * obs_dim if variant['algo_params']['state_only'] else 2 * obs_dim + action_dim,  # (s,s') or (s,a,s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg'],
        state_only=variant['algo_params']['state_only'])
    # ---------------
    # traj_enc = ConvTrajEncoder(
    #     variant['algo_params']['np_params']['traj_enc_params']['num_conv_layers'],
    #     obs_dim + action_dim if not variant['algo_params']['state_only'] else obs_dim,
    #     variant['algo_params']['np_params']['traj_enc_params']['channels'],
    #     variant['algo_params']['np_params']['traj_enc_params']['kernel'],
    #     variant['algo_params']['np_params']['traj_enc_params']['stride'],
    # )
    # Dc2R_map = Dc2RMap(
    #     variant['algo_params']['np_params']['Dc2r_params']['agg_type'],
    #     traj_enc,
    #     state_only=variant['algo_params']['state_only']
    # )
    # r2z_map = R2ZMap(
    #     variant['algo_params']['np_params']['r2z_params']['num_layers'],
    #     variant['algo_params']['np_params']['traj_enc_params']['channels'],
    #     variant['algo_params']['np_params']['r2z_params']['hid_dim'],
    #     variant['algo_params']['z_dim']
    # )
    # encoder = NPEncoder(
    #     Dc2R_map,
    #     r2z_map,
    # )

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(
        env_specs)

    algorithm = NeuralProcessAIRL(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        disc_model,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        encoder,
        policy_optimizer,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        target_disc=target_disc,
        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example #12
# env_specs = {
#     'base_env_name': 'meta_simple_meta_reacher',
#     'normalized': False
# }
env_specs = {
    'base_env_name': 'meta_simple_meta_reacher',
    'normalized': False,
    'need_pixels': True,
    'render_kwargs': {
        'height': 64,
        'width': 64,
        'camera_id': 0
    }
}

meta_train_env, meta_test_env = get_meta_env(env_specs)

meta_train_params_sampler, meta_test_params_sampler = get_meta_env_params_iters(
    env_specs)
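# NOTE: 'd' below is assumed to be the dict saved by logger.save_extra_data in
# the buffer-generation script above (e.g. d = joblib.load('<log_dir>/extra_data.pkl')).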
buffer = d['meta_train']['context']

buffer.policy_uses_pixels = True

task_params, obs_task_params = meta_train_params_sampler.sample()
meta_train_env.reset(task_params=task_params, obs_task_params=obs_task_params)
task_id = meta_train_env.task_identifier

# print(buffer.num_steps_can_sample())

# print(buffer.task_replay_buffers.keys())
Example #13
def experiment(variant):
    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError(
                'Not implemented pixel version of things!')
        else:
            obs_dim = int(
                np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    z_dim = variant['algo_params']['z_dim']
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    # make the encoder
    encoder = TimestepBasedEncoder(
        2 * obs_dim + action_dim,  #(s,a,s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg'],
        state_only=False)
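
The leading argument's 2 * obs_dim + action_dim corresponds to feeding the encoder one (s, a, s') transition per timestep. Below is a minimal standalone sketch of that concatenation, using hypothetical dimensions rather than values from this example:

# Hedged illustration (not part of the original snippet): the encoder's
# per-timestep input is the concatenation of (s, a, s').
import numpy as np

obs_dim, action_dim = 6, 2  # hypothetical dimensions
s, a, s_next = np.zeros(obs_dim), np.zeros(action_dim), np.zeros(obs_dim)
enc_input = np.concatenate([s, a, s_next])
assert enc_input.shape[0] == 2 * obs_dim + action_dim  # 14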

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(
        env_specs)

    algorithm = PEARL(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        qf1,
        qf2,
        vf,
        encoder,
        # z_dim,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example #14
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.safe_load(f)
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # load the expert demonstration buffers for the meta-train and meta-test tasks
    train_context_buffer, train_test_buffer = extra_data['meta_train'][
        'context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test'][
        'context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError(
                'Not implemented pixel version of things!')
        else:
            obs_dim = int(
                np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    if variant['algo_params']['state_only']:
        print('\n\nUSING STATE ONLY DISC\n\n')
    disc_input_dim = 2 * obs_dim + variant['algo_params']['z_dim']
    if not variant['algo_params']['state_only']:
        disc_input_dim += action_dim
    disc_model = StandardMetaDisc(
        disc_input_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])
    print(disc_model)
    print(disc_model.clamp_magnitude)
    if variant['algo_params']['use_target_disc']:
        target_disc = disc_model.copy()
    else:
        target_disc = None
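
The discriminator input size chosen above is pure dimension arithmetic; here is a short hedged check with made-up dimensions (not from the original config):

# Hedged check (hypothetical dimensions): the two branches of disc_input_dim
# correspond to scoring (s, s', z) vs. (s, a, s', z) concatenations.
obs_dim, action_dim, z_dim = 6, 2, 8
state_only_input = 2 * obs_dim + z_dim                  # 20
state_action_input = 2 * obs_dim + action_dim + z_dim   # 22
print(state_only_input, state_action_input)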

    z_dim = variant['algo_params']['z_dim']
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )
    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'],
        **variant['policy_params'])

    # make the encoder
    encoder = TimestepBasedEncoder(
        2 * obs_dim + action_dim,  # per-timestep input: (s, a, s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
    )

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(
        env_specs)

    algorithm = MetaFAIRL(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        disc_model,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        encoder,
        policy_optimizer,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        target_disc=target_disc,
        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example #15
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.safe_load(f)
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # load the expert demonstration buffers for the meta-train and meta-test tasks
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    
    # set up the policy and training algorithm
    obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # latent dimension of the task embedding
    z_dim = variant['algo_params']['z_dim']

    # make the MLP
    hidden_sizes = [variant['algo_params']['mlp_hid_dim']] * variant['algo_params']['mlp_layers']
    obs_task_params_dim = int(np.prod(meta_train_env.observation_space.spaces['obs_task_params'].shape))
    if variant['algo_params']['training_regression']:
        # regression: predict the task parameters from z
        mlp_input_size, mlp_output_size = z_dim, obs_task_params_dim
    else:
        # otherwise: take z plus two task-parameter vectors and output a single value
        mlp_input_size, mlp_output_size = z_dim + 2 * obs_task_params_dim, 1
    mlp = Mlp(
        hidden_sizes,
        output_size=mlp_output_size,
        input_size=mlp_input_size,
        batch_norm=variant['algo_params']['mlp_use_bn']
    )

    # Make the encoder
    encoder = TimestepBasedEncoder(
        2*obs_dim + action_dim,  # per-timestep input: (s, a, s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg']
    )
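
For orientation, the variant['algo_params'] keys this example has read explicitly so far can be collected into one sketch config; the values below are placeholders, not taken from the original experiment:

# Hedged sketch of the algo_params keys used above; every value is hypothetical.
algo_params_sketch = {
    'z_dim': 8,                   # latent task embedding size
    'r_dim': 16,                  # aggregated representation size
    'enc_hid_dim': 64,            # encoder hidden width
    'r2z_hid_dim': 64,            # hidden width of the r -> z map
    'num_enc_layer_blocks': 2,    # encoder depth
    'within_traj_agg': 'mean',    # aggregation within a trajectory (value/type assumed)
    'mlp_hid_dim': 128,           # MLP hidden width
    'mlp_layers': 2,              # MLP depth
    'mlp_use_bn': False,          # batch norm in the MLP
    'training_regression': True,  # regression vs. scoring mode for the MLP
}
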
    # ---------------
    # encoder = WeightShareTimestepBasedEncoder(
    #     obs_dim,
    #     action_dim,
    #     64,
    #     variant['algo_params']['r_dim'],
    #     variant['algo_params']['z_dim'],
    #     variant['algo_params']['enc_hid_dim'],
    #     variant['algo_params']['r2z_hid_dim'],
    #     variant['algo_params']['num_enc_layer_blocks'],
    #     hid_act='relu',
    #     use_bn=True,
    #     within_traj_agg=variant['algo_params']['within_traj_agg']
    # )
    # ---------------
    # traj_enc = ConvTrajEncoder(
    #     variant['algo_params']['np_params']['traj_enc_params']['num_conv_layers'],
    #     # obs_dim + action_dim,
    #     obs_dim + action_dim + obs_dim,
    #     variant['algo_params']['np_params']['traj_enc_params']['channels'],
    #     variant['algo_params']['np_params']['traj_enc_params']['kernel'],
    #     variant['algo_params']['np_params']['traj_enc_params']['stride'],
    # )
    # Dc2R_map = Dc2RMap(
    #     variant['algo_params']['np_params']['Dc2r_params']['agg_type'],
    #     traj_enc,
    #     state_only=False
    # )
    # r2z_map = R2ZMap(
    #     variant['algo_params']['np_params']['r2z_params']['num_layers'],
    #     variant['algo_params']['np_params']['traj_enc_params']['channels'],
    #     variant['algo_params']['np_params']['r2z_params']['hid_dim'],
    #     variant['algo_params']['z_dim']
    # )
    # encoder = NPEncoder(
    #     Dc2R_map,
    #     r2z_map,
    # )

    
    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    algorithm = UpperBound(
        meta_train_env,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        mlp,
        encoder,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1