Example #1
def main(exp_name=None, fusion=False):
    env = TfEnv(
        CustomGymEnv('airl/CustomAnt-v0', record_video=False,
                     record_log=False))

    # Load ~2 iterations' worth of data from each forward RL experiment as demos.
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    irl_model = AIRL(env=env,
                     expert_trajs=experts,
                     state_only=True,
                     fusion=fusion,
                     max_itrs=10)

    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session():
            algo.train()
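Example #1 assumes demonstration data is already present under data/ant_data_collect (for instance, produced by a data-collection run like Example #2 below). A minimal driver for this snippet might look as follows; the experiment name is an illustrative placeholder, not something prescribed by the original script.

# Hypothetical entry point; the experiment name is illustrative.
if __name__ == '__main__':
    main(exp_name='ant_state_irl_run0', fusion=False)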
Example #2
def main(exp_name, ent_wt=1.0):
    tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('airl/CustomAnt-v0', record_video=False,
                     record_log=False))
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    with tf.Session(config=get_session_config()) as sess:
        algo = TRPO(
            env=env,
            sess=sess,
            policy=policy,
            n_itr=1500,
            batch_size=20000,
            max_path_length=500,
            discount=0.99,
            store_paths=True,
            entropy_weight=ent_wt,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            exp_name=exp_name,
        )
        with rllab_logdir(algo=algo,
                          dirname='data/ant_data_collect/%s' % exp_name):
            algo.train()
Example #3
def run_expt(config):
    env_name = config['environment']
    env = get_env(env_name)
    experts = get_demos(env_name)
    irl_model = algo_string_to_model[config['algo']](env_spec=env.spec,
                                                     expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    # use params for each env
    algo = IRLTRPO(env=env,
                   policy=policy,
                   irl_model=irl_model,
                   n_itr=200,
                   batch_size=2000 if env_name == 'pendulum' else 10000,
                   max_path_length=100,
                   discount=0.99,
                   store_paths=True,
                   discrim_train_itrs=50,
                   irl_model_wt=1.0,
                   entropy_weight=1.0 if env_name == 'pointmass' else 0.1,
                   zero_environment_reward=True,
                   baseline=LinearFeatureBaseline(env_spec=env.spec))
    dirname = DATA_DIR + "/" + "___".join(
        [str(k) + "=" + str(v) for k, v in config.items()])
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.train()
    # A little clumsy, but this is the easiest way: the rllab logger doesn't
    # keep data around in memory after it has been written to disk.
    train_results = pd.read_csv(dirname + '/progress.csv')
    # Return OriginalTaskAverageReturn for the last iteration.
    output = config.copy()
    output['return'] = train_results.iloc[-1]['OriginalTaskAverageReturn']
    return output
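Because run_expt derives its log directory from the config items and reads the final OriginalTaskAverageReturn back out of progress.csv, it lends itself to small parameter sweeps. Below is a hedged sketch of such a sweep; the config values are assumptions about what algo_string_to_model and get_env accept, and in practice each call may need a fresh TensorFlow graph (or its own process) to avoid variable-name collisions.

import itertools

import pandas as pd

def sweep():
    # Illustrative grid; the 'environment' and 'algo' values are guesses.
    grid = itertools.product(['pendulum', 'pointmass'], ['airl', 'gail'])
    rows = [run_expt({'environment': env_name, 'algo': algo})
            for env_name, algo in grid]
    # Each row already carries the config plus its final return.
    return pd.DataFrame(rows)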
Example #4
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))

    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_gcl'):
        with tf.Session():
            algo.train()
Example #5
def main(
    log_dir,
    env_name,
    ent_coef,
    n_steps,
    total_timesteps,
    num_vec,
):
    tf.reset_default_graph()
    # n_steps is the `batch_size // num_vec` in `imitation`.
    batch_size = n_steps * num_vec
    n_itr = int(math.ceil(total_timesteps / batch_size))

    if env_name.startswith("airl/"):
        env_cls = CustomGymEnv
    else:
        env_cls = GymEnv
    env = TfEnv(env_cls(env_name, record_video=False, record_log=False))

    # NOTE: we haven't yet checked whether hidden_sizes=(32, 32) matches the
    # settings in the `imitation` repo, which uses the default Stable Baselines
    # MLP policy.
    if isinstance(env.spec.action_space, Box):
        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(32, 32))
    else:
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

    with tf.Session(config=get_session_config()) as sess:
        algo = TRPO(
            env=env,
            policy=policy,
            n_itr=n_itr,
            batch_size=batch_size,
            max_path_length=500,
            discount=0.99,
            store_paths=True,
            entropy_weight=ent_coef,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            # It may be the case that not every policy is compatible with the
            # VectorizedSampler. If so, consider changing to `sampler_cls=None`
            # and adding a dummy `n_envs` kwarg to BatchSampler.
            sampler_cls=VectorizedSampler,
            sampler_args=dict(n_envs=num_vec),
        )
        with rllab_logdir(algo=algo, dirname=log_dir):
            algo.train(sess)
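As a quick sanity check on the iteration-count arithmetic above (batch_size = n_steps * num_vec, n_itr = ceil(total_timesteps / batch_size)), here is a small worked example; the numbers are illustrative rather than taken from the original experiments.

import math

n_steps, num_vec, total_timesteps = 2048, 8, 1_000_000
batch_size = n_steps * num_vec                        # 16384 transitions per iteration
n_itr = int(math.ceil(total_timesteps / batch_size))  # ceil(61.03...) == 62
print(batch_size, n_itr)                              # 16384 62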
Example #6
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = TRPO(env=env,
                policy=policy,
                n_itr=200,
                batch_size=2000,
                max_path_length=100,
                discount=0.99,
                store_paths=True,
                baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum'):
        algo.train()
Example #7
def main(env_name, n_itr, batch_size, max_path_length):
    env_id = env_names_to_ids[env_name]
    env = TfEnv(GymEnv(env_id, record_video=False, record_log=False))
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = TRPO(env=env,
                policy=policy,
                n_itr=n_itr,
                batch_size=batch_size,
                max_path_length=max_path_length,
                discount=0.99,
                store_paths=True,
                baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname=DATA_DIR + '/' + env_name):
        algo.train()
Example #8
def main(exp_name, params_folder=None):
    env = TfEnv(
        CustomGymEnv('airl/DisabledAnt-v0',
                     record_video=False,
                     record_log=False))

    irl_itr = 100  # earlier IRL iterations overfit less; 100 seems to work well.
    params_file = os.path.join(DATA_DIR,
                               '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(params_file)

    irl_model = AIRL(env=env, expert_trajs=None, state_only=True)
    policy = GaussianMLPPolicy(name='policy',
                               env_spec=env.spec,
                               hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params,
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=False,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )
    with rllab_logdir(algo=algo, dirname='data/ant_transfer/%s' % exp_name):
        with tf.Session():
            algo.train()
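For reference, a hedged sketch of what a helper like load_prior_params might do: it assumes each rllab snapshot itr_<N>.pkl is a joblib-pickled dict that stores the learned reward parameters under some key. The key name 'irl_params' and the helper name are guesses, not the repository's actual implementation.

import joblib
import tensorflow as tf

def load_prior_params_sketch(pkl_fname, key='irl_params'):
    # Unpickling rllab TF snapshots generally needs an active default session.
    with tf.Session():
        snapshot = joblib.load(pkl_fname)
    return snapshot[key]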
Example #9
def airl(log_dir,
         *,
         tf_cfg,
         env_config,
         reward_model_cfg={},
         policy_cfg={},
         training_cfg={},
         ablation='normal'):
    with TfEnvContext(tf_cfg, env_config) as context:
        training_kwargs, policy_cfg, reward_model_cfg, training_cfg = get_training_kwargs(
            venv=context.env_context.environments,
            reward_model_cfg=reward_model_cfg,
            policy_cfg=policy_cfg,
            training_cfg=training_cfg,
            ablation=ablation,
        )
        print("Training arguments: ", training_kwargs)
        algo = IRLRunner(
            **training_kwargs,
            sampler_cls=sampling.PPOBatchSampler,
        )
        irl_model = algo.irl_model
        policy = algo.policy

        with rllab_logdir(algo=algo, dirname=log_dir):
            print("Training!")
            algo.buffered_train()
            #algo.train()
            # need to return these explicitly because they don't survive
            # across tensorflow sessions
            reward_params = irl_model.get_params()
            policy_params = policy.tensor_values()

    policy = policy_cfg, policy_params
    reward = reward_model_cfg, reward_params
    return reward, policy
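Since airl() returns plain (config, parameters) tuples rather than live TensorFlow objects, they can be persisted with the standard library, assuming the configs and parameter lists are picklable (e.g. NumPy arrays and importable classes). The file layout below is an illustrative assumption.

import os
import pickle

def save_airl_outputs(reward, policy, out_dir):
    # reward and policy are the (config, params) tuples returned by airl().
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, 'reward.pkl'), 'wb') as f:
        pickle.dump(reward, f)
    with open(os.path.join(out_dir, 'policy.pkl'), 'wb') as f:
        pickle.dump(policy, f)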
Example #10
def finetune(metainit,
             venv,
             trajectories,
             discount,
             seed,
             log_dir,
             *,
             tf_cfg,
             pol_itr=100,
             irl_itr=100,
             model_cfg=None,
             policy_cfg=None,
             training_cfg={}):
    envs = VecGymEnv(venv)
    envs = TfEnv(envs)
    experts = _convert_trajectories(trajectories)

    train_graph = tf.Graph()
    with train_graph.as_default():
        tf.set_random_seed(seed)

        if model_cfg is None:
            model_cfg = {
                'model': AIRLStateOnly,
                'state_only': True,
                'max_itrs': 10
            }
        model_kwargs = dict(model_cfg)
        model_cls = model_kwargs.pop('model')
        irl_model = model_cls(env_spec=envs.spec,
                              expert_trajs=experts,
                              **model_kwargs)

        if policy_cfg is None:
            policy_cfg = {
                'policy': GaussianMLPPolicy,
                'hidden_sizes': (32, 32)
            }
        else:
            policy_cfg = dict(policy_cfg)
        policy_fn = policy_cfg.pop('policy')
        policy = policy_fn(name='policy', env_spec=envs.spec, **policy_cfg)

        training_kwargs = {
            'batch_size': 10000,
            'max_path_length': 500,
            'irl_model_wt': 1.0,
            'entropy_weight': 0.1,
            # paths substantially increase storage requirements
            'store_paths': False,
        }
        training_kwargs.update(training_cfg)
        _kwargs, reward_params = metainit
        algo = IRLTRPO(env=envs,
                       policy=policy,
                       irl_model=irl_model,
                       discount=discount,
                       sampler_args=dict(n_envs=venv.num_envs),
                       zero_environment_reward=True,
                       baseline=LinearFeatureBaseline(env_spec=envs.spec),
                       init_irl_params=reward_params,
                       train_irl=False,
                       n_itr=pol_itr,
                       **training_kwargs)

        with tf.Session(config=tf_cfg):
            # First round: just optimize the policy, do not update IRL model
            with rllab_logdir(algo=algo, dirname=osp.join(log_dir, 'pol')):
                with rl_logger.prefix('finetune policy |'):
                    algo.train()
                    pol_params = policy.get_param_values()

            # Second round: we have a good policy (generator), update IRL
            with rllab_logdir(algo=algo, dirname=osp.join(log_dir, 'all')):
                with rl_logger.prefix('finetune all |'):
                    algo.train_irl = True
                    algo.init_pol_params = pol_params
                    algo.n_itr = irl_itr
                    algo.train()

            reward_params = irl_model.get_params()

            # Side-effect: forces policy to cache all parameters.
            # This ensures they are saved/restored during pickling.
            policy.get_params()
            # Must pickle policy rather than returning it directly,
            # since parameters in policy will not survive across tf sessions.
            policy_pkl = pickle.dumps(policy)

    reward = model_cfg, reward_params
    return reward, policy_pkl
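A hedged sketch of how the pickled policy returned by finetune() might be restored later. It assumes, as is typical for sandbox.rocky.tf policies, that unpickling has to happen inside a graph with an active default session so that parameter-assignment ops can run; adapt as needed for the actual policy class.

import pickle
import tensorflow as tf

def restore_policy(policy_pkl, tf_cfg=None):
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session(config=tf_cfg)
        with sess.as_default():
            policy = pickle.loads(policy_pkl)
    # Caller owns the session; keep it open while using the policy.
    return policy, sess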
Example #11
def metalearn(venvs,
              trajectories,
              discount,
              seed,
              log_dir,
              *,
              tf_cfg,
              outer_itr=1000,
              lr=1e-2,
              model_cfg=None,
              policy_cfg=None,
              training_cfg={},
              policy_per_task=False):
    envs = {k: TfEnv(VecGymEnv(v)) for k, v in venvs.items()}
    env_spec = list(envs.values())[0].spec
    num_envs = list(venvs.values())[0].num_envs
    tasks = list(envs.keys())

    experts = {k: _convert_trajectories(v) for k, v in trajectories.items()}

    train_graph = tf.Graph()
    with train_graph.as_default():
        tf.set_random_seed(seed)

        if model_cfg is None:
            model_cfg = {
                'model': AIRLStateOnly,
                'state_only': True,
                'max_itrs': 10
            }
        model_kwargs = dict(model_cfg)
        model_cls = model_kwargs.pop('model')
        irl_model = model_cls(env_spec=env_spec, **model_kwargs)

        if policy_cfg is None:
            policy_cfg = {
                'policy': GaussianMLPPolicy,
                'hidden_sizes': (32, 32)
            }
        else:
            policy_cfg = dict(policy_cfg)
        policy_fn = policy_cfg.pop('policy')
        policy = policy_fn(name='policy', env_spec=env_spec, **policy_cfg)
        pol_params = {}

        training_kwargs = {
            'n_itr': 10,
            'batch_size': 10000,
            'max_path_length': 500,
            'irl_model_wt': 1.0,
            'entropy_weight': 0.1,
            # paths substantially increase storage requirements
            'store_paths': False,
        }
        training_kwargs.update(training_cfg)
        algos = {
            k: IRLTRPO(env=env,
                       policy=policy,
                       irl_model=irl_model,
                       discount=discount,
                       sampler_args=dict(n_envs=num_envs),
                       zero_environment_reward=True,
                       baseline=LinearFeatureBaseline(env_spec=env_spec),
                       **training_kwargs)
            for k, env in envs.items()
        }

        with tf.Session(config=tf_cfg) as sess:
            sess.run(tf.global_variables_initializer())
            meta_reward_params = irl_model.get_params()
            for i in range(outer_itr):
                task = random.choice(tasks)
                pol_task = task if policy_per_task else None
                itr_logdir = osp.join(
                    log_dir, '{}_{}'.format(i, sanitize_env_name(task)))
                with rllab_logdir(algo=algos[task], dirname=itr_logdir):
                    with rl_logger.prefix('outer itr {} | task {}'.format(
                            i, task)):
                        irl_model.set_demos(experts[task])
                        # TODO: rather than specifying these as initializers,
                        # it might be more efficient to have AIRL not overwrite
                        # these variables on each call to train()?
                        algos[task].init_irl_params = meta_reward_params
                        algos[task].init_pol_params = pol_params.get(pol_task)
                        algos[task].train()

                        # Meta-update reward
                        # {meta,task}_reward_params are lists of NumPy arrays
                        task_reward_params = irl_model.get_params()
                        assert len(task_reward_params) == len(
                            meta_reward_params)
                        for j in range(len(task_reward_params)):
                            meta_p = meta_reward_params[j]
                            task_p = task_reward_params[j]
                            # Reptile update: meta <- meta + lr * (task - meta)
                            # TODO: use an Adam optimizer?
                            meta_reward_params[j] = (1 - lr) * meta_p + lr * task_p

                        # Store policy update (joint if not policy_per_task)
                        pol_params[pol_task] = policy.get_param_values()

    reward = model_kwargs, meta_reward_params

    return reward
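The meta-update in the inner loop is the Reptile rule applied per parameter array: meta <- meta + lr * (task - meta), which is equivalent to (1 - lr) * meta + lr * task. A minimal standalone illustration with NumPy (shapes and values are arbitrary):

import numpy as np

def reptile_update(meta_params, task_params, lr):
    # Move each meta parameter array a fraction lr toward its task-adapted value.
    return [(1 - lr) * m + lr * t for m, t in zip(meta_params, task_params)]

meta = [np.zeros((2, 2)), np.zeros(3)]
task = [np.ones((2, 2)), np.full(3, 2.0)]
print(reptile_update(meta, task, lr=1e-2))  # each entry moves 1% toward the task value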