def main(exp_name, ent_wt=0.1, visible_gpus='0', discount=0.99):
    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    env = TfEnv(GymEnv('Swimmer-v3', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    with tf.Session(config=tf_config) as sess:
        algo = TRPO(
            env=env,
            policy=policy,
            n_itr=3000,
            batch_size=20000,
            max_path_length=1000,
            discount=discount,
            store_paths=True,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            step_size=0.01,
            entropy_weight=ent_wt,
            sess=sess,
            exp_name=exp_name,
        )

        with rllab_logdir(algo=algo, dirname='data/swimmer'):
            algo.train(sess)
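These listings omit their command-line entry points. A minimal, hypothetical way to invoke the `main` above from a shell (the flag names below are illustrative additions, not part of the original script):

if __name__ == '__main__':
    import argparse

    # Hypothetical CLI wrapper for the main() defined above.
    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name', type=str, default='swimmer_trpo')
    parser.add_argument('--ent_wt', type=float, default=0.1)
    parser.add_argument('--visible_gpus', type=str, default='0')
    parser.add_argument('--discount', type=float, default=0.99)
    cli_args = parser.parse_args()

    main(exp_name=cli_args.exp_name, ent_wt=cli_args.ent_wt,
         visible_gpus=cli_args.visible_gpus, discount=cli_args.discount)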
Example #2
def main():
    env = TfEnv(GymEnv('Ant-v1', record_video=False, record_log=False))
    
    experts = load_latest_experts('data/ant', n=50)

    irl_model = GCLDiscrim(
        env_spec=env.spec,
        expert_trajs=experts,
        discrim_arch=disentangled_net)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=1000,
        discount=0.995,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/ant_airl'):
        with tf.Session():
            algo.train()
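This example passes a custom discriminator architecture (`discrim_arch=disentangled_net`), which is defined elsewhere. Purely to illustrate the kind of callable such an argument typically expects, a plain ReLU-MLP discriminator might look like the following hypothetical sketch (not the original `disentangled_net`):

import tensorflow as tf

def relu_mlp_discrim(x, layers=2, d_hidden=32, scope='discrim'):
    # Hypothetical stand-in: a few ReLU hidden layers followed by a
    # scalar score per input row.
    out = x
    with tf.variable_scope(scope):
        for i in range(layers):
            out = tf.layers.dense(out, d_hidden, activation=tf.nn.relu,
                                  name='h%d' % i)
        return tf.layers.dense(out, 1, activation=None, name='score')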
Example #3
def main(eval_reward=True):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    
    n_experts = 10
    experts = load_latest_experts('plotting/pendulum_final', n=n_experts)
    dirname = 'data/pendulum'  # directory to save logs and images

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1, # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        eval_reward=eval_reward,
        fig_dir=dirname
    )

    # with rllab_logdir(algo=algo, dirname='data/pendulum_gcl{}'.format(n_experts)):
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.fig_dirname = dirname
            algo.train()
Example #4
def main(num_examples=50, discount=0.99):
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/pendulum', n=num_examples)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=2000,
        max_path_length=100,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_traj'):
        with tf.Session():
            algo.train()
Example #5
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_gail'):
        with tf.Session():
            algo.train()
Example #6
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(GymEnv('Swimmer-v3', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    experts = load_latest_experts('data/swimmer', n=5, visible_gpus=visible_gpus)

    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts,
                     state_only=False, fusion=fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=1000,
        discount=discount,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1, # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/swimmer_airl_state_action'):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
Example #7
def main(exp_name, params_folder=None):
    env = TfEnv(CustomGymEnv('DisabledAnt-v0', record_video=False, record_log=False))

    irl_itr = 100  # earlier IRL iterations overfit less; 100 seems to work well.
    params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(params_file)

    irl_model = AIRL(env=env, expert_trajs=None, state_only=True)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params,
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=False,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )
    with rllab_logdir(algo=algo, dirname='data/ant_transfer/%s'%exp_name):
        with tf.Session():
            algo.train()
Example #8
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    
    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.0, # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/pendulum_gail'):
        with tf.Session():
            algo.train()
Example #9
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2, visible_gpus=visible_gpus)

    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts, state_only=True, fusion=fusion, max_itrs=10)

    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=discount,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
Example #10
def main():
    env = TfEnv(
        GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))

    # VGG 11/29/18: added support for CSV files.
    # This loads expert data saved as a pickle file:
    # experts = load_latest_experts('data/airsim_final', n=1)
    # This one uses a CSV file instead:
    experts = load_experts('data/airsim_human_data/log.csv',
                           pickle_format=False)

    irl_model = GAIL(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=5000,
        batch_size=60,
        max_path_length=60,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=100,
        irl_model_wt=1.0,
        entropy_weight=0.0,  # GAIL should not use entropy unless for exploration
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        n_parallel=0)

    with rllab_logdir(algo=algo, dirname='data/airsim_gail'):
        with tf.Session():
            algo.train()
Example #11
def main(exp_name=None, fusion=False):
    env = TfEnv(
        CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    irl_model = AIRL(env=env,
                     expert_trajs=experts,
                     state_only=True,
                     fusion=fusion,
                     max_itrs=10)

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session():
            algo.train()
Example #12
def main():
    env = TfEnv(
        GymEnv('HRI_AirSim_Landing-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/airsim', n=5)

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=10,
        batch_size=100,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/airsim_gcl'):
        with tf.Session():
            algo.train()
Example #13
def main():
    env = TfEnv(CustomGymEnv('PointMazeLeft-v0'))
    
    experts = load_latest_experts('data/point', n=50)

    irl_model = GCLDiscrimTrajectory(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=10000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1, # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/point_traj'):
        with tf.Session():
            algo.train()
            test_pointmaze(policy)
Example #14
def main(exp_name=None, fusion=True):
    # env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))
    env = TfEnv(
        CustomGymEnv('CustomAnt-v0', record_video=False, record_log=True))
    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)
    # experts = load_latest_experts('data/ant_data_collect', n=5)

    # qvar: inverse model q(a|s,s')
    qvar = GaussianMLPInversePolicy(name='qvar_model',
                                    env_spec=env.spec,
                                    hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env,
                      qvar=qvar,
                      expert_trajs=experts,
                      fusion=True,
                      max_itrs=10)
    # Empowerment-based Adversarial Inverse Reinforcement Learning; set score_discrim=True
    irl_model = EAIRL(env=env,
                      expert_trajs=experts,
                      state_only=False,
                      fusion=fusion,
                      max_itrs=10,
                      score_discrim=True)

    # Empowerment-based potential function: gamma * Phi(s') - Phi(s)
    empw_model = Empowerment(env=env, fusion=True, max_itrs=4)
    t_empw_model = Empowerment(env=env,
                               scope='t_efn',
                               fusion=True,
                               max_itrs=2,
                               name='empowerment2')

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=3000,  #130,
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        target_empw_update=5,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        lambda_i=1.0,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        plot=False)
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl'):
        #with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name): # if you use multiple runs, use this line instead of above
        with tf.Session():
            algo.train()
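The comment above refers to the standard potential-based shaping term gamma * Phi(s') - Phi(s), with the empowerment network playing the role of the potential Phi. As a generic illustration only (not the original code; `phi` is a stand-in for the learned potential):

def shaped_reward(base_reward, phi, s, s_next, gamma=0.99):
    # Generic potential-based shaping: add gamma * Phi(s') - Phi(s)
    # to the base reward; `phi` is any state-potential function.
    return base_reward + gamma * phi(s_next) - phi(s)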
Example #15
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99,
         debug=False, n_val=1, n_rew=1, max_nstep=1, exp_folder=None,
         state_only=False, score_discrim=True, score_method=None):
    env = TfEnv(
        CustomGymEnv('PointMazeRight-v0', record_video=False,
                     record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/maze_right_data_collect',
                                                n=2,
                                                visible_gpus=visible_gpus)

    sess = tf.Session(config=tf_config)

    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    max_path_length = 500
    irl_model = AIRL_Bootstrap(discount=discount, env=env, expert_trajs=experts,
                               state_only=state_only, fusion=fusion, max_itrs=10,
                               score_discrim=score_discrim, debug=debug,
                               max_nstep=max_nstep, n_value_funct=n_val,
                               n_rew_funct=n_rew, score_method=score_method)

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=max_path_length,
        discount=discount,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )

    # temp_folder = '/media/data/temp_exp_nstep/maze_right_state_bootstrap_%d_irl/%s'
    if exp_folder is not None:
        dirname = 'data/maze_right_state_bootstrap_%d_irl/%s/%s' % (
            max_nstep, exp_folder, exp_name)
    else:
        dirname = 'data/maze_right_state_bootstrap_%d_irl/%s' % (
            max_nstep, exp_name)
    with rllab_logdir(algo=algo, dirname=dirname):
        with sess:
            algo.train(sess)
Example #16
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = TRPO(
        env=env,
        policy=policy,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/pendulum'):
        algo.train()
Example #17
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = TRPO(env=env,
                policy=policy,
                n_itr=200,
                batch_size=1000,
                max_path_length=100,
                discount=0.99,
                store_paths=True,
                baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum'):
        algo.train()
Example #18
def main():
    env = TfEnv(CustomGymEnv('PointMazeRight-v0'))
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = TRPO(
        env=env,
        policy=policy,
        n_itr=2000,
        batch_size=10000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec)
    )

    with rllab_logdir(algo=algo, dirname='data/point_trpo'):
        algo.train()
Example #19
def main(exp_name, params_folder=None, visible_gpus='0', discount=0.99):
    env = TfEnv(
        CustomGymEnv('PointMazeLeft-v0', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    irl_itr = 100  # earlier IRL iterations overfit less; 100 seems to work well.
    params_file = os.path.join(DATA_DIR,
                               '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(params_file, tf_config)

    irl_model = AIRL(discount=discount,
                     env=env,
                     expert_trajs=None,
                     state_only=True)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params,
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=discount,
        store_paths=False,
        train_irl=False,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )
    with rllab_logdir(algo=algo,
                      dirname='data/maze_left_transfer/%s' % exp_name):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
Example #20
def main(exp_name, ent_wt=1.0, discrete=True):
    tf.reset_default_graph()
    if discrete:
        env = TfEnv(
            CustomGymEnv('PointMazeLeft-v0',
                         record_video=False,
                         record_log=False))
    else:
        env = TfEnv(
            CustomGymEnv('PointMazeLeftCont-v0',
                         record_video=False,
                         record_log=False))

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    with tf.Session(config=get_session_config()) as sess:
        algo = TRPO(
            env=env,
            sess=sess,
            policy=policy,
            n_itr=2000,
            batch_size=20000,
            max_path_length=500,
            discount=0.99,
            store_paths=True,
            entropy_weight=ent_wt,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            exp_name=exp_name,
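            # NOTE: `args` below is assumed to be a module-level argparse
            # namespace that is not shown in this snippet.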
            turn_on_wandb=args.turn_on_wandb,
            render_env=True,
            gif_dir='logs/maze_wall_meta_irl',
            gif_header='',
            wandb_entity=args.wandb_entity,
            wandb_project=args.wandb_project,
            wandb_run_name=args.wandb_run_name,
            wandb_monitor_gym=args.wandb_monitor_gym,
        )
        if discrete:
            output = 'data/maze_left_data_collect_discrete-15/%s' % exp_name
        else:
            output = 'data/maze_left_data_collect/%s' % exp_name
        with rllab_logdir(algo=algo, dirname=output):
            algo.train(sess)
Example #21
def main(exp_name,
         ent_wt=1.0,
         trpo_step=0.01,
         env_name='Shaped_PM_MazeRoom_Small-v0',
         rundir='data',
         init_pol_std=1.0,
         many_runs=False,
         hid_size=None,
         hid_layers=None):
    os.makedirs(rundir, exist_ok=True)

    env = TfEnv(CustomGymEnv(env_name, record_video=False, record_log=False))
    if hid_size is None or hid_layers is None:
        assert hid_size is None and hid_layers is None, \
            "must specify both size & layers, not one or the other"
        hid_layers, hid_size, init_pol_std \
            = min_layers_hidsize_polstd_for(env_name)
    hidden_sizes = (hid_size, ) * hid_layers
    env_trpo_params = irltrpo_params_for(env_name, 'expert')
    policy = GaussianMLPPolicy(
        name='policy',
        env_spec=env.spec,
        hidden_sizes=hidden_sizes,
        init_std=init_pol_std)
    algo = TRPO(
        env=env,
        policy=policy,
        entropy_weight=ent_wt,
        step_size=trpo_step,
        discount=0.99,
        store_paths=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        force_batch_sampler=True,
        # extra environment-specific params like batch size, max path length,
        # etc.
        **env_trpo_params, )

    if many_runs:
        logdir = os.path.join(rundir, 'env_%s' % (env_name.lower()), exp_name)
    else:
        logdir = os.path.join(rundir, 'env_%s' % (env_name.lower()))
    with rllab_logdir(algo=algo, dirname=logdir):
        algo.train()
Example #22
def main(exp_name, ent_wt=1.0):
    tf.reset_default_graph()
    env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    with tf.Session(config=get_session_config()) as sess:
        algo = TRPO(
            env=env,
            sess=sess,
            policy=policy,
            n_itr=1500,
            batch_size=20000,
            max_path_length=500,
            discount=0.99,
            store_paths=True,
            entropy_weight=ent_wt,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            exp_name=exp_name,
        )
        with rllab_logdir(algo=algo, dirname='data/ant_data_collect/%s'%exp_name):
            algo.train(sess)
Example #23
def main(exp_name, ent_wt=1.0, visible_gpus='0', discount=0.99):
    tf.reset_default_graph()
    env = TfEnv(CustomGymEnv('PointMazeRight-v0', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))

    gpu_options = tf.GPUOptions(allow_growth=True,
                                visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)
    with tf.Session(config=tf_config) as sess:
        algo = TRPO(
            env=env,
            sess=sess,
            policy=policy,
            n_itr=1500,
            batch_size=20000,
            max_path_length=500,
            discount=discount,
            store_paths=True,
            entropy_weight=ent_wt,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            exp_name=exp_name,
        )
        with rllab_logdir(algo=algo, dirname='data/maze_right_data_collect/%s'%exp_name):
            algo.train(sess)
Example #24
def airl(log_dir,
         *,
         tf_cfg,
         env_config,
         reward_model_cfg={},
         policy_cfg={},
         training_cfg={},
         ablation='normal'):
    with TfEnvContext(tf_cfg, env_config) as context:
        training_kwargs, policy_cfg, reward_model_cfg, training_cfg = get_training_kwargs(
            venv=context.env_context.environments,
            reward_model_cfg=reward_model_cfg,
            policy_cfg=policy_cfg,
            training_cfg=training_cfg,
            ablation=ablation,
        )
        print("Training arguments: ", training_kwargs)
        algo = IRLRunner(
            **training_kwargs,
            sampler_cls=sampling.PPOBatchSampler,
        )
        irl_model = algo.irl_model
        policy = algo.policy

        with rllab_logdir(algo=algo, dirname=log_dir):
            print("Training!")
            algo.buffered_train()
            #algo.train()
            # need to return these explicitly because they don't survive
            # across tensorflow sessions
            reward_params = irl_model.get_params()
            policy_params = policy.tensor_values()

    policy = policy_cfg, policy_params
    reward = reward_model_cfg, reward_params
    return reward, policy
Example #25
def main(exp_name=None, params_folder='data/ant_state_irl'):
    # env = TfEnv(CustomGymEnv('PointMazeLeft-v0', record_video=True, record_log=True,force_reset=True))
    env = TfEnv(
        CustomGymEnv('DisabledAnt-v0',
                     record_video=False,
                     record_log=False,
                     force_reset=False))

    irl_itr = 90  # earlier IRL iterations overfit less; 80 or 90 seems to work well, but I usually search over 60, 65, 70, ..., up to 100
    #params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (irl_itr))
    prior_params = load_prior_params(params_file)
    '''q_itr = 400  # earlier IRL iterations overfit less; 100 seems to work well.
    #params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (q_itr))
    prior_params_q = load_prior_params(params_file)'''

    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    qvar = GaussianMLPInversePolicy(name='qvar_model',
                                    env_spec=env.spec,
                                    hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env, qvar=qvar, expert_trajs=None, max_itrs=10)
    irl_model = EAIRL(env=env,
                      expert_trajs=experts,
                      state_only=False,
                      score_discrim=False)
    empw_model = Empowerment(env=env, max_itrs=1)
    t_empw_model = Empowerment(env=env,
                               scope='t_efn',
                               max_itrs=2,
                               name='empowerment2')

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    algo = IRLTRPO(
        init_irl_params=prior_params['irl_params'],
        init_empw_params=None,  #prior_params['empw_params'],
        init_qvar_params=None,  #prior_params['qvar_params'],
        init_policy_params=prior_params['policy_params'],  #None
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=True,
        train_empw=True,
        train_qvar=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
        # plot=True,
    )

    with rllab_logdir(algo=algo, dirname='data/ant_transfer'):  #%s'%exp_name):
        #with rllab_logdir(algo=algo, dirname='data/ant_transfer%s'%exp_name):
        with tf.Session():
            algo.train()
Example #26
def main(exp_name,
         rundir,
         ent_wt=1.0,
         env_name='Shaped_PM_MazeRoom_Small-v0',
         method="airl",
         beta=1e-2,
         disc_step=1e-3,
         disc_iters=100,
         disc_batch_size=32,
         disc_gp=None,
         trpo_step=1e-2,
         init_pol_std=1.0,
         adaptive_beta=False,
         target_kl=0.5,
         beta_step=1e-6,
         hid_size=None,
         hid_layers=None,
         max_traj=None):
    os.makedirs(rundir, exist_ok=True)

    if hid_size is None or hid_layers is None:
        assert hid_size is None and hid_layers is None, \
            "must specify both size & layers, not one or the other"
        hid_layers, hid_size, init_pol_std \
            = min_layers_hidsize_polstd_for(env_name)
    env_trpo_params = irltrpo_params_for(env_name, 'irl')

    env = TfEnv(CustomGymEnv(env_name, record_video=False, record_log=False))

    expert_dir = os.path.join(rundir, 'env_%s/' % env_name.lower())
    experts = load_latest_experts_walky(expert_dir, n=5, max_traj=max_traj)

    disc_net_kwargs = {
        'layers': hid_layers,
        'd_hidden': hid_size,
    }
    if method in {'airl', 'vairl'}:
        is_vairl = method == 'vairl'
        irl_model = shaped_airl.AIRL(
            env=env,
            expert_trajs=experts,
            state_only=True,
            fusion=True,
            discrim_arch_args=disc_net_kwargs,
            fitted_value_fn_arch_args=disc_net_kwargs,
            gp_coeff=disc_gp,
            # vairl flag
            vairl=is_vairl,
            # vairl fixed beta settings
            vairl_beta=beta,
            # vairl adaptive beta settings
            vairl_adaptive_beta=adaptive_beta,
            vairl_beta_step_size=beta_step,
            vairl_kl_target=target_kl)
    elif method in {'gail', 'vail'}:
        is_vail = method == 'vail'
        assert disc_gp is None, "no GAIL/VAIL support for GP coeff"
        irl_model = GAIL(
            env,
            expert_trajs=experts,
            discrim_arch_args=disc_net_kwargs,
            name=method,
            # vail stuff (only adaptive beta for VAIL, no fixed beta like
            # VAIRL)
            vail=is_vail,
            # initial beta
            vail_init_beta=beta,
            vail_beta_step_size=beta_step,
            vail_kl_target=target_kl)
    else:
        raise NotImplementedError("don't know how to handle method '%s'" %
                                  (method, ))

    pol_hid_sizes = (hid_size, ) * hid_layers
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=pol_hid_sizes,
                               init_std=init_pol_std)
    irltrpo_kwargs = dict(
        env=env,
        policy=policy,
        irl_model=irl_model,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=disc_iters,
        irl_model_wt=1.0,
        irl_lr=disc_step,
        irl_batch_size=disc_batch_size,
        step_size=trpo_step,
        # entropy_weight should be 1.0 but 0.1 seems to work better
        entropy_weight=ent_wt,
        force_batch_sampler=True,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    irltrpo_kwargs.update(env_trpo_params)
    n_itr = irltrpo_kwargs['n_itr']
    print('irltrpo_kwargs:', irltrpo_kwargs)
    algo = IRLTRPO(**irltrpo_kwargs)

    run_name = 'env_{env_name}_{method}'.format(env_name=env_name.lower(),
                                                method=method)
    exp_folder = os.path.join(rundir, '%s/%s' % (run_name, exp_name))
    with rllab_logdir(algo=algo, dirname=exp_folder):
        with tf.Session():
            algo.train()
    this_dir = os.path.dirname(__file__)
    maze_retrain_path = os.path.join(this_dir, 'env_retrain.py')
    latest_irl_snap = '%s/itr_%d.pkl' % (exp_folder, n_itr - 1)
    subproc_cmd = [
        # script
        'python',
        maze_retrain_path,
        # experiment info
        latest_irl_snap,
        '--rundir',
        rundir,
        # TRPO params
        '--trpo-step',
        '%f' % trpo_step,
        '--trpo-ent',
        '%f' % ent_wt,
        # network params
        '--hid-layers',
        '%d' % hid_layers,
        '--hid-size',
        '%d' % hid_size,
        # we don't care about precise args relevant to given method because
        # we're just reloading a frozen model
        '--method',
        method,
    ]
    subprocess.run(subproc_cmd, check=True)
Example #27
def main(exp_name=None, latent_dim=3, params_folder=None):
    max_path_length = 100
    batch_size = 32
    meta_batch_size = 50
    entropy_weight = 0.1
    left = 'right'
    if_filtered = True

    # tf.reset_default_graph()
    if left == 'left':
        env = TfEnv(
            CustomGymEnv('PointMazeLeft-v0',
                         record_video=False,
                         record_log=False))
    else:
        env = TfEnv(
            CustomGymEnv('PointMazeRight-v0',
                         record_video=False,
                         record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        '/atlas/u/lantaoyu/projects/InfoAIRL/data/maze_left_data_collect',
        n=4,
        latent_dim=latent_dim)
    if if_filtered:
        experts_filtered = []
        good_range = [0.1, 0.4]  # [0.3, 0.5]
        for expert in experts:
            context = expert['contexts'][0, 0]
            if good_range[0] <= context <= good_range[1]:
                experts_filtered.append(expert)
        assert len(experts_filtered) >= meta_batch_size
        # Trim so the number of demos is a multiple of meta_batch_size
        # (the original [:-remainder] slice would empty the list when the
        # remainder is zero).
        remainder = len(experts_filtered) % meta_batch_size
        if remainder:
            experts_filtered = experts_filtered[:-remainder]
        experts = experts_filtered

    irl_itr_list = [2800]

    results = []
    for irl_itr in irl_itr_list:
        params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
        prior_params = load_prior_params(params_file)
        init_context_encoder_params = load_prior_params(
            params_file, 'context_params')

        policy_prior_params = None

        # contexual policy pi(a|s,m)
        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))

        # approximate posterior q(m|tau)
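        # The encoder consumes a whole trajectory: its "observation" is the
        # (state, action) pair, with the latent dims stripped from the state,
        # tiled max_path_length times; its "action" is the latent code m.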
        context_encoder_spec = EnvSpec(
            observation_space=Box(
                np.tile(
                    np.concatenate((env.observation_space.low[:-latent_dim],
                                    env.action_space.low)), max_path_length),
                np.tile(
                    np.concatenate((env.observation_space.high[:-latent_dim],
                                    env.action_space.high)), max_path_length)),
            action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
        )
        context_encoder = GaussianMLPPolicy(name='context_encoder',
                                            env_spec=context_encoder_spec,
                                            hidden_sizes=(128, 128))

        irl_model = InfoAIRL(env=env,
                             expert_trajs=experts,
                             context_encoder=context_encoder,
                             state_only=True,
                             max_path_length=max_path_length,
                             meta_batch_size=meta_batch_size,
                             latent_dim=latent_dim)

        algo = MetaIRLTRPO(
            init_irl_params=prior_params,
            init_pol_params=policy_prior_params,  #policy_prior_params,
            init_context_encoder_params=init_context_encoder_params,
            env=env,
            policy=policy,
            irl_model=irl_model,
            n_itr=150,
            meta_batch_size=meta_batch_size,
            batch_size=batch_size,
            max_path_length=max_path_length,
            discount=0.99,
            store_paths=True,
            train_irl=True,  # True
            train_context_only=True,
            train_policy=True,
            irl_model_wt=1.0,
            entropy_weight=entropy_weight,
            zero_environment_reward=True,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            log_params_folder=params_folder,
            log_experiment_name=exp_name,
        )
        dirname = (
            'data_finetune/maze_finetune_discrete-entropy-%s-irl_itr-%s-%s-%s-generalize/%s'
            % (entropy_weight, irl_itr, left,
               'filter' if if_filtered else '', exp_name))
        with rllab_logdir(algo=algo, dirname=dirname):
            with tf.Session():
                algo.train()
        results.append((irl_itr, np.max(algo.pol_ret)))
        tf.reset_default_graph()
    print(results)
Example #28
def main(
    exp_name,
    rundir='data',
    irl_pkl='',
    ent_wt=1.0,
    trpo_anneal_steps=None,
    trpo_anneal_init_ent=None,
    trpo_step=0.01,
    init_pol_std=1.0,
    method=None,
    hid_size=None,
    hid_layers=None,
    switch_env=None,
):
    orig_env_name = get_name(irl_pkl)
    if switch_env is not None:
        this_env_name = switch_env
    else:
        this_env_name = orig_env_name
    print("Running on environment '%s'" % this_env_name)
    env = TfEnv(
        CustomGymEnv(this_env_name, record_video=False, record_log=False))

    if hid_size is None or hid_layers is None:
        assert hid_size is None and hid_layers is None, \
            "must specify both size & layers, not one or the other"
        hid_layers, hid_size, init_pol_std \
            = min_layers_hidsize_polstd_for(orig_env_name)
    env_trpo_params = irltrpo_params_for(orig_env_name, 'retrain')

    folder = os.path.dirname(irl_pkl)

    prior_params = load_prior_params(irl_pkl)
    expert_dir = os.path.join(rundir, 'env_%s/' % orig_env_name.lower())
    experts = load_latest_experts_walky(expert_dir, n=5)

    # For some reason IRLTRPO is responsible for setting weights in this code.
    # It would equally be possible to run global_variables_initializer()
    # ourselves and then do irl_model.set_params(prior_params) if we just
    # wanted to query energy, reward, etc. from the trained AIRL model without
    # using IRLTRPO.
    disc_net_kwargs = {
        'layers': hid_layers,
        'd_hidden': hid_size,
    }
    if method in {'airl', 'vairl'}:
        irl_model = AIRL(env=env,
                         expert_trajs=experts,
                         state_only=True,
                         freeze=True,
                         vairl=method == 'vairl',
                         vairl_beta=1e-4,
                         discrim_arch_args=disc_net_kwargs,
                         fitted_value_fn_arch_args=disc_net_kwargs)
    elif method in {'gail', 'vail'}:
        irl_model = GAIL(env,
                         expert_trajs=experts,
                         discrim_arch_args=disc_net_kwargs,
                         name=method,
                         freeze=True,
                         vail=method == 'vail')
    else:
        raise NotImplementedError("Don't know how to handle method '%s'" %
                                  method)

    pol_hid_sizes = (hid_size, ) * hid_layers
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=pol_hid_sizes,
                               init_std=init_pol_std)
    irltrpo_kwargs = dict(
        env=env,
        policy=policy,
        irl_model=irl_model,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=ent_wt,  # should be 1.0 but 0.1 seems to work better
        step_size=trpo_step,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        init_irl_params=prior_params,
        force_batch_sampler=True,
        entropy_anneal_init_weight=trpo_anneal_init_ent,
        entropy_anneal_steps=trpo_anneal_steps,
        retraining=True)
    irltrpo_kwargs.update(env_trpo_params)
    algo = IRLTRPO(**irltrpo_kwargs)
    folder_suffix = ''
    if switch_env is not None:
        # append lower case environment name to retrain folder path
        folder_suffix = '_%s' % switch_env.lower()
    with rllab_logdir(algo=algo,
                      dirname='%s/retrain%s' % (folder, folder_suffix)):
        with tf.Session():
            algo.train()
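As the comment in this example notes, IRLTRPO is what actually assigns the loaded weights here. A rough sketch of the alternative it describes, assuming `irl_model.set_params` accepts the unpickled `prior_params` as stated; how rewards or energies are then queried depends on the particular AIRL implementation and is left abstract:

# Initialize the graph ourselves and load the trained AIRL parameters
# directly, without going through IRLTRPO.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    irl_model.set_params(prior_params)
    # ... query energy / reward from irl_model here ...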
Example #29
def main(exp_name=None, fusion=False, latent_dim=3):
    max_path_length = 100
    info_coeff = 0.1
    imitation_coeff = 0.01
    batch_size = 16
    meta_batch_size = 50
    max_itrs = 20
    pre_epoch = 1000
    entropy_weight = 1.0
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('PointMazeLeft-v0', record_video=False, record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        'data/maze_left_data_collect_discrete-15', n=4, latent_dim=latent_dim)

    # contexual policy pi(a|s,m)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))

    # approximate posterior q(m|tau)
    context_encoder_spec = EnvSpec(
        observation_space=Box(
            np.tile(
                np.concatenate((env.observation_space.low[:-latent_dim],
                                env.action_space.low)), max_path_length),
            np.tile(
                np.concatenate((env.observation_space.high[:-latent_dim],
                                env.action_space.high)), max_path_length)),
        action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
    )
    context_encoder = GaussianMLPPolicy(name='context_encoder',
                                        env_spec=context_encoder_spec,
                                        hidden_sizes=(128, 128))

    pretrain_model = Pretrain(experts,
                              policy,
                              context_encoder,
                              env,
                              latent_dim,
                              batch_size=400,
                              kl_weight=0.1,
                              epoch=pre_epoch)
    # pretrain_model = None
    if pretrain_model is None:
        pre_epoch = 0

    irl_model = InfoAIRL(env=env,
                         policy=policy,
                         context_encoder=context_encoder,
                         reward_arch=reward_arch,
                         reward_arch_args=reward_arch_args,
                         expert_trajs=experts,
                         state_only=True,
                         max_path_length=max_path_length,
                         fusion=fusion,
                         max_itrs=max_itrs,
                         meta_batch_size=meta_batch_size,
                         imitation_coeff=imitation_coeff,
                         info_coeff=info_coeff,
                         latent_dim=latent_dim)

    algo = MetaIRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        randomize_policy=True,
        pretrain_model=pretrain_model,
        n_itr=3000,
        meta_batch_size=meta_batch_size,
        batch_size=batch_size,
        max_path_length=max_path_length,
        discount=0.99,
        store_paths=True,
        train_irl=True,
        irl_model_wt=1.0,
        entropy_weight=entropy_weight,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    if fusion:
        dirname = 'data_fusion_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)
    else:
        dirname = 'data_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
            pre_epoch, entropy_weight, layers, d_hidden, exp_name)

    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.train()