Example #1
def main(exp_name=None, fusion=False):
    env = TfEnv(
        CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

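    # AIRL discriminator; state_only=True learns a reward that depends on the state alone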
    irl_model = AIRL(env=env,
                     expert_trajs=experts,
                     state_only=True,
                     fusion=fusion,
                     max_itrs=10)

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session():
            algo.train()
Example #2
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))

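    # restrict TensorFlow to the requested GPU(s) and let it allocate memory on demand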
    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2, visible_gpus=visible_gpus)

    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts, state_only=True, fusion=fusion, max_itrs=10)

    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=discount,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
Example #3
def main(exp_name=None, fusion=True):
    # env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))
    env = TfEnv(
        CustomGymEnv('CustomAnt-v0', record_video=False, record_log=True))
    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)
    # experts = load_latest_experts('data/ant_data_collect', n=5)

    # qvar: inverse model q(a|s,s')
    qvar = GaussianMLPInversePolicy(name='qvar_model',
                                    env_spec=env.spec,
                                    hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env,
                      qvar=qvar,
                      expert_trajs=experts,
                      fusion=True,
                      max_itrs=10)
    # Empowerment-based Adversarial Inverse Reinforcement Learning; set score_discrim=True
    irl_model = EAIRL(env=env,
                      expert_trajs=experts,
                      state_only=False,
                      fusion=fusion,
                      max_itrs=10,
                      score_discrim=True)

    # Empowerment-based potential function: gamma * Phi(s') - Phi(s)
    empw_model = Empowerment(env=env, fusion=True, max_itrs=4)
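    # second empowerment network under scope 't_efn', presumably the target network (cf. target_empw_update below)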
    t_empw_model = Empowerment(env=env,
                               scope='t_efn',
                               fusion=True,
                               max_itrs=2,
                               name='empowerment2')

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=3000,  #130,
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        target_empw_update=5,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        lambda_i=1.0,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        plot=False)
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl'):
        #with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name): # if you use multiple runs, use this line instead of above
        with tf.Session():
            algo.train()
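Example #4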
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99, debug=False,
         n_val=1, n_rew=1, max_nstep=1, exp_folder=None, state_only=False,
         score_discrim=True, score_method=None):
    env = TfEnv(
        CustomGymEnv('PointMazeRight-v0', record_video=False,
                     record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/maze_right_data_collect',
                                                n=2,
                                                visible_gpus=visible_gpus)

    sess = tf.Session(config=tf_config)

    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    max_path_length = 500
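    # AIRL_Bootstrap: AIRL with ensembles of n_rew reward and n_val value functions
    # (bootstrapping over up to max_nstep-step transitions, judging by the arguments)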
    irl_model = AIRL_Bootstrap(discount=discount, env=env, expert_trajs=experts, state_only=state_only,
                               fusion=fusion, max_itrs=10, score_discrim=score_discrim, debug=debug,
                               max_nstep=max_nstep, n_value_funct=n_val, n_rew_funct=n_rew,
                               score_method=score_method)

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=max_path_length,
        discount=discount,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )

    # temp_folder = '/media/data/temp_exp_nstep/maze_right_state_bootstrap_%d_irl/%s'
    if exp_folder is not None:
        dirname = 'data/maze_right_state_bootstrap_%d_irl/%s/%s' % (max_nstep, exp_folder, exp_name)
    else:
        dirname = 'data/maze_right_state_bootstrap_%d_irl/%s' % (max_nstep, exp_name)
    with rllab_logdir(algo=algo, dirname=dirname):
        with sess:
            algo.train(sess)
Example #5
def main(exp_name=None, params_folder='data/ant_state_irl'):
    # env = TfEnv(CustomGymEnv('PointMazeLeft-v0', record_video=True, record_log=True,force_reset=True))
    env = TfEnv(
        CustomGymEnv('DisabledAnt-v0',
                     record_video=False,
                     record_log=False,
                     force_reset=False))

    irl_itr = 90  # earlier IRL iterations overfit less; 80 or 90 seems to work well, but I usually search over 60, 65, 70, ..., 100
    #params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (irl_itr))
    prior_params = load_prior_params(params_file)
    '''q_itr = 400  # earlier IRL iterations overfit less; 100 seems to work well.
    #params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (q_itr))
    prior_params_q = load_prior_params(params_file)'''

    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    qvar = GaussianMLPInversePolicy(name='qvar_model',
                                    env_spec=env.spec,
                                    hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env, qvar=qvar, expert_trajs=None, max_itrs=10)
    irl_model = EAIRL(env=env,
                      expert_trajs=experts,
                      state_only=False,
                      score_discrim=False)
    empw_model = Empowerment(env=env, max_itrs=1)
    t_empw_model = Empowerment(env=env,
                               scope='t_efn',
                               max_itrs=2,
                               name='empowerment2')

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

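    # warm-start the reward (irl_params) and policy from the CustomAnt IRL run, then retrain on the DisabledAnt transfer task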
    algo = IRLTRPO(
        init_irl_params=prior_params['irl_params'],
        init_empw_params=None,  #prior_params['empw_params'],
        init_qvar_params=None,  #prior_params['qvar_params'],
        init_policy_params=prior_params['policy_params'],  #None
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=True,
        train_empw=True,
        train_qvar=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
        # plot=True,
    )

    with rllab_logdir(algo=algo, dirname='data/ant_transfer'):  #%s'%exp_name):
        #with rllab_logdir(algo=algo, dirname='data/ant_transfer%s'%exp_name):
        with tf.Session():
            algo.train()
Example #6
def main(exp_name=None, fusion=False, latent_dim=3):
    max_path_length = 100
    info_coeff = 0.1
    imitation_coeff = 0.01
    batch_size = 16
    meta_batch_size = 50
    max_itrs = 20
    pre_epoch = 1000
    entropy_weight = 1.0
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('PointMazeLeft-v0', record_video=False, record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        'data/maze_left_data_collect_discrete-15', n=4, latent_dim=latent_dim)

    # contexual policy pi(a|s,m)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    # approximate posterior q(m|tau)
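    # its "observation" is a full (state, action) trajectory tiled over max_path_length; its "action" is the latent context m in [0, 1]^latent_dim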
    context_encoder_spec = EnvSpec(
        observation_space=Box(
            np.tile(
                np.concatenate((env.observation_space.low[:-latent_dim],
                                env.action_space.low)), max_path_length),
            np.tile(
                np.concatenate((env.observation_space.high[:-latent_dim],
                                env.action_space.high)), max_path_length)),
        action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
    )
    context_encoder = GaussianMLPPolicy(name='context_encoder',
                                        env_spec=context_encoder_spec,
                                        hidden_sizes=(128, 128))

    pretrain_model = Pretrain(experts,
                              policy,
                              context_encoder,
                              env,
                              latent_dim,
                              batch_size=400,
                              kl_weight=0.1,
                              epoch=pre_epoch)
    # pretrain_model = None
    if pretrain_model is None:
        pre_epoch = 0

    irl_model = InfoAIRL(env=env,
                         policy=policy,
                         context_encoder=context_encoder,
                         reward_arch=reward_arch,
                         reward_arch_args=reward_arch_args,
                         expert_trajs=experts,
                         state_only=True,
                         max_path_length=max_path_length,
                         fusion=fusion,
                         max_itrs=max_itrs,
                         meta_batch_size=meta_batch_size,
                         imitation_coeff=imitation_coeff,
                         info_coeff=info_coeff,
                         latent_dim=latent_dim)

    algo = MetaIRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        randomize_policy=True,
        pretrain_model=pretrain_model,
        n_itr=3000,
        meta_batch_size=meta_batch_size,
        batch_size=batch_size,
        max_path_length=max_path_length,
        discount=0.99,
        store_paths=True,
        train_irl=True,
        irl_model_wt=1.0,
        entropy_weight=entropy_weight,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    if fusion:
        dirname = 'data_fusion_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)
    else:
        dirname = 'data_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)

    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.train()
Example #7
def main(exp_name=None, latent_dim=3, params_folder=None):
    max_path_length = 100
    batch_size = 16
    meta_batch_size = 1
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    # tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('PointMazeRight-v0', record_video=False,
                     record_log=False))
    barrier_range = [0.2, 0.6]
    barrier_y = 0.3

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        '/atlas/u/lantaoyu/projects/InfoAIRL/data/maze_left_data_collect',
        n=4,
        latent_dim=latent_dim)

    irl_itr_list = [2800]

    for irl_itr in irl_itr_list:
        # params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
        params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
        prior_params = load_prior_params(params_file)
        init_context_encoder_params = load_prior_params(
            params_file, 'context_params')

        # params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (irl_itr-800))
        policy_prior_params = load_prior_params(params_file, 'policy_params')
        # policy_prior_params = None

        # contexual policy pi(a|s,m)
        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        # approximate posterior q(m|tau)
        context_encoder_spec = EnvSpec(
            observation_space=Box(
                np.tile(
                    np.concatenate((env.observation_space.low[:-latent_dim],
                                    env.action_space.low)), max_path_length),
                np.tile(
                    np.concatenate((env.observation_space.high[:-latent_dim],
                                    env.action_space.high)), max_path_length)),
            action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
        )
        context_encoder = GaussianMLPPolicy(name='context_encoder',
                                            env_spec=context_encoder_spec,
                                            hidden_sizes=(128, 128))

        irl_model = InfoAIRL(env=env,
                             expert_trajs=experts,
                             reward_arch=reward_arch,
                             reward_arch_args=reward_arch_args,
                             context_encoder=context_encoder,
                             state_only=True,
                             max_path_length=max_path_length,
                             meta_batch_size=meta_batch_size,
                             latent_dim=latent_dim)

        savedir = 'data_fusion_discrete/visualize_reward_right-%s' % irl_itr
        if not os.path.isdir(savedir):
            os.mkdir(savedir)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            irl_model.context_encoder.set_param_values(
                init_context_encoder_params)
            policy.set_param_values(policy_prior_params)
            irl_model.set_params(prior_params)
            boundary_low = -0.1
            boundary_high = 0.6

            expert_obs, expert_acts, expert_contexts = irl_model.extract_paths(
                irl_model.expert_trajs,
                keys=('observations', 'actions', 'contexts'),
                T=max_path_length)
            expert_trajs = np.concatenate(
                (expert_obs, expert_acts),
                axis=-1)  # num_experts x T x (state_dim + act_dim)

            grid_size = 0.005
            rescale = 1. / grid_size

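            # for each sampled expert context, evaluate the learned reward on a dense (x, y) grid
            # and save a heatmap (star: sampled context, line: barrier)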
            for itr in range(100):
                expert_traj_batch, m_batch = irl_model.sample_batch(
                    expert_trajs,
                    expert_contexts,
                    batch_size=1,
                    warm_up=False,
                    warm_up_idx=False)
                obs_batch = []
                num_y = 0
                for pos_y in np.arange(boundary_low, boundary_high, grid_size):
                    num_y += 1
                    num_x = 0
                    for pos_x in np.arange(boundary_low, boundary_high,
                                           grid_size):
                        num_x += 1
                        obs_batch.append([pos_x, pos_y, 0.])
                obs_batch = np.array(obs_batch).reshape(
                    [1, -1, max_path_length, 3])
                expert_traj_batch = np.tile(
                    np.reshape(expert_traj_batch, [1, 1, max_path_length, -1]),
                    [1, obs_batch.shape[1], 1, 1])
                reward = tf.get_default_session().run(
                    irl_model.reward,
                    feed_dict={
                        irl_model.expert_traj_var: expert_traj_batch,
                        irl_model.obs_t: obs_batch
                    })
                score = reward[:, 0]
                ax = sns.heatmap(score.reshape([num_x, num_y]),
                                 cmap="YlGnBu_r")
                ax.scatter((m_batch[0][0][0] - boundary_low) * rescale,
                           (m_batch[0][0][1] - boundary_low) * rescale,
                           marker='*',
                           s=150,
                           c='r',
                           edgecolors='k',
                           linewidths=0.5)
                ax.scatter((0.3 - boundary_low +
                            np.random.uniform(low=-0.05, high=0.05)) * rescale,
                           (0. - boundary_low +
                            np.random.uniform(low=-0.05, high=0.05)) * rescale,
                           marker='o',
                           s=120,
                           c='white',
                           linewidths=0.5,
                           edgecolors='k')
                ax.plot([(barrier_range[0] - boundary_low) * rescale,
                         (barrier_range[1] - boundary_low) * rescale],
                        [(barrier_y - boundary_low) * rescale,
                         (barrier_y - boundary_low) * rescale],
                        color='k',
                        linewidth=10)
                ax.invert_yaxis()
                plt.axis('off')
                plt.savefig(savedir + '/%s.png' % itr)
                print('Save Itr', itr)
                plt.close()
Example #8
from inverse_rl.utils.log_utils import load_latest_experts_multiple_runs
import pdb

experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)
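
# A minimal, hypothetical sanity check of the loaded demonstrations, assuming
# (as the other examples suggest) each entry is a dict of numpy arrays keyed by
# 'observations' and 'actions':
import numpy as np

print('loaded %d expert trajectories' % len(experts))
first = experts[0]
print('observations:', np.asarray(first['observations']).shape)
print('actions:', np.asarray(first['actions']).shape)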


Example #9
def main(exp_name=None, latent_dim=3, params_folder=None):
    max_path_length = 100
    batch_size = 32
    meta_batch_size = 50
    entropy_weight = 0.1
    left = 'right'
    if_filtered = True

    # tf.reset_default_graph()
    if left == 'left':
        env = TfEnv(
            CustomGymEnv('PointMazeLeft-v0',
                         record_video=False,
                         record_log=False))
    else:
        env = TfEnv(
            CustomGymEnv('PointMazeRight-v0',
                         record_video=False,
                         record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        '/atlas/u/lantaoyu/projects/InfoAIRL/data/maze_left_data_collect',
        n=4,
        latent_dim=latent_dim)
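    # keep only demonstrations whose initial context lies in good_range, then trim the list to a multiple of meta_batch_size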
    if if_filtered:
        experts_filtered = []
        good_range = [0.1, 0.4]  # [0.3, 0.5]
        for expert in experts:
            if good_range[0] <= expert['contexts'][0, 0] <= good_range[1]:
                experts_filtered.append(expert)
        assert len(experts_filtered) >= meta_batch_size
        # note: slicing with [:-0] would drop every demo, so guard the zero-remainder case
        remainder = len(experts_filtered) % meta_batch_size
        if remainder:
            experts_filtered = experts_filtered[:-remainder]
        experts = experts_filtered

    irl_itr_list = [2800]

    results = []
    for irl_itr in irl_itr_list:
        params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
        prior_params = load_prior_params(params_file)
        init_context_encoder_params = load_prior_params(
            params_file, 'context_params')

        policy_prior_params = None

        # contexual policy pi(a|s,m)
        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        # approximate posterior q(m|tau)
        context_encoder_spec = EnvSpec(
            observation_space=Box(
                np.tile(
                    np.concatenate((env.observation_space.low[:-latent_dim],
                                    env.action_space.low)), max_path_length),
                np.tile(
                    np.concatenate((env.observation_space.high[:-latent_dim],
                                    env.action_space.high)), max_path_length)),
            action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
        )
        context_encoder = GaussianMLPPolicy(name='context_encoder',
                                            env_spec=context_encoder_spec,
                                            hidden_sizes=(128, 128))

        irl_model = InfoAIRL(env=env,
                             expert_trajs=experts,
                             context_encoder=context_encoder,
                             state_only=True,
                             max_path_length=max_path_length,
                             meta_batch_size=meta_batch_size,
                             latent_dim=latent_dim)

        algo = MetaIRLTRPO(
            init_irl_params=prior_params,
            init_pol_params=policy_prior_params,  #policy_prior_params,
            init_context_encoder_params=init_context_encoder_params,
            env=env,
            policy=policy,
            irl_model=irl_model,
            n_itr=150,
            meta_batch_size=meta_batch_size,
            batch_size=batch_size,
            max_path_length=max_path_length,
            discount=0.99,
            store_paths=True,
            train_irl=True,  # True
            train_context_only=True,
            train_policy=True,
            irl_model_wt=1.0,
            entropy_weight=entropy_weight,
            zero_environment_reward=True,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            log_params_folder=params_folder,
            log_experiment_name=exp_name,
        )
        with rllab_logdir(
                algo=algo,
                dirname=
                'data_finetune/maze_finetune_discrete-entropy-%s-irl_itr-%s-%s-%s-generalize/%s'
                % (entropy_weight, irl_itr, left,
                   'filter' if if_filtered else '', exp_name)):
            with tf.Session():
                algo.train()
        results.append((irl_itr, np.max(algo.pol_ret)))
        tf.reset_default_graph()
    print(results)