def main(exp_name=None, fusion=False):
    env = TfEnv(CustomGymEnv('airl/CustomAnt-v0', record_video=False, record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    irl_model = AIRL(env=env, expert_trajs=experts, state_only=True,
                     fusion=fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )

    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session():
            algo.train()
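# Hedged sketch of the imports these snippets assume. The module paths below follow the
# usual rllab sandbox and inverse_rl layouts and should be treated as assumptions (adjust
# to your checkout); project-local helpers such as DATA_DIR, get_env, get_demos,
# algo_string_to_model, load_prior_params, VecGymEnv, AIRLStateOnly, _convert_trajectories,
# rl_logger, sanitize_env_name, get_raw_env, sample_trajectories, to_rllab_trajectories
# and LightweightRLModel are assumed to be defined elsewhere in the respective codebases.
import os
import os.path as osp
import pickle
import random
import sys

import pandas as pd
import tensorflow as tf
from gym.spaces import Discrete

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.categorical_mlp_policy import CategoricalMLPPolicy
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy

from inverse_rl.algos.irl_trpo import IRLTRPO
from inverse_rl.envs.env_utils import CustomGymEnv
from inverse_rl.models.airl_state import AIRL
from inverse_rl.models.imitation_learning import AIRLStateAction, GAIL
from inverse_rl.utils.log_utils import (rllab_logdir, load_latest_experts,
                                        load_latest_experts_multiple_runs)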
def run_expt(config):
    env_name = config['environment']
    env = get_env(env_name)
    experts = get_demos(env_name)

    irl_model = algo_string_to_model[config['algo']](env_spec=env.spec,
                                                     expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    # use per-environment hyperparameters
    algo = IRLTRPO(env=env,
                   policy=policy,
                   irl_model=irl_model,
                   n_itr=200,
                   batch_size=2000 if env_name == 'pendulum' else 10000,
                   max_path_length=100,
                   discount=0.99,
                   store_paths=True,
                   discrim_train_itrs=50,
                   irl_model_wt=1.0,
                   entropy_weight=1.0 if env_name == 'pointmass' else 0.1,
                   zero_environment_reward=True,
                   baseline=LinearFeatureBaseline(env_spec=env.spec))

    dirname = DATA_DIR + "/" + "___".join(
        [str(k) + "=" + str(v) for k, v in config.items()])
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.train()

    # A little clumsy, but it's the easiest way: the rllab logger doesn't keep data
    # around after it has been written to disk.
    train_results = pd.read_csv(dirname + '/progress.csv')

    # Report OriginalTaskAverageReturn for the last iteration.
    output = config.copy()
    output['return'] = train_results.iloc[-1]['OriginalTaskAverageReturn']
    return output
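# Hedged usage sketch for run_expt(): the config keys 'environment' and 'algo' are the
# ones the function reads above, and 'pendulum' matches its batch-size special case;
# the value 'airl' is only an assumed key of the algo_string_to_model mapping (not shown
# here), and DATA_DIR must point at a writable log directory.
result = run_expt({'environment': 'pendulum', 'algo': 'airl'})
print(result['return'])  # OriginalTaskAverageReturn of the final training iteration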
def main():
    env = TfEnv(GymEnv('Pendulum-v0', record_video=False, record_log=False))
    experts = load_latest_experts('data/pendulum', n=5)

    irl_model = AIRLStateAction(env_spec=env.spec, expert_trajs=experts)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=200,
        batch_size=1000,
        max_path_length=100,
        discount=0.99,
        store_paths=True,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=0.1,  # this should be 1.0 but 0.1 seems to work better
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec))

    with rllab_logdir(algo=algo, dirname='data/pendulum_gcl'):
        with tf.Session():
            algo.train()
def main(exp_name, params_folder=None):
    env = TfEnv(CustomGymEnv('airl/DisabledAnt-v0', record_video=False, record_log=False))

    irl_itr = 100  # earlier IRL iterations overfit less; 100 seems to work well
    params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    prior_params = load_prior_params(params_file)

    irl_model = AIRL(env=env, expert_trajs=None, state_only=True)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params,
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=False,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
    )

    with rllab_logdir(algo=algo, dirname='data/ant_transfer/%s' % exp_name):
        with tf.Session():
            algo.train()
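# Hedged invocation sketch for the transfer experiment above: params_folder is expected
# to name a prior state-only AIRL run under DATA_DIR containing an itr_100.pkl snapshot.
# Both names below are made up for illustration.
main(exp_name='ant_transfer_run0', params_folder='ant_state_irl/example_run')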
def fu_irl(
    venv,
    is_airl,
    expert=None,
    expert_venv=None,
    expert_trajectories=None,
    total_timesteps=10000,
    gen_batch_size=200,
    policy_lr=1e-3,
    callback=None,
    **kwargs,
):
    # Disable algorithm's internal prints
    old_stdout = sys.stdout
    sys.stdout = open(os.devnull, 'w')

    raw_env = get_raw_env(venv)
    tf_env = TfEnv(GymEnv(env=raw_env, record_video=False, record_log=False))

    if expert_trajectories is None:
        expert_trajectories = sample_trajectories(
            expert_venv, expert, n_episodes=total_timesteps
        )
    expert_trajectories = to_rllab_trajectories(expert_trajectories, venv)

    if is_airl:
        irl_model = AIRLStateAction(
            env_spec=tf_env.spec, expert_trajs=expert_trajectories
        )
        entropy_weight = 1.0
    else:
        irl_model = GAIL(env_spec=tf_env.spec, expert_trajs=expert_trajectories)
        entropy_weight = 0.0

    if isinstance(venv.action_space, Discrete):
        policy = CategoricalMLPPolicy(
            name="policy", env_spec=tf_env.spec, hidden_sizes=(32, 32)
        )
    else:
        policy = GaussianMLPPolicy(
            name="policy", env_spec=tf_env.spec, hidden_sizes=(32, 32)
        )

    num_epochs = int(total_timesteps // gen_batch_size)

    algo = IRLTRPO(
        env=tf_env,
        policy=policy,
        irl_model=irl_model,
        n_itr=num_epochs,
        batch_size=gen_batch_size,
        max_path_length=100,
        discount=0.99,
        discrim_train_itrs=50,
        irl_model_wt=1.0,
        entropy_weight=entropy_weight,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=tf_env.spec),
    )
    algo.train()

    sys.stdout = old_stdout

    def predict_fn(ob, state=None, deterministic=False):
        act, _ = algo.policy.get_action(ob)
        return act, state

    results = {}
    results["policy"] = LightweightRLModel(predict_fn=predict_fn, env=venv)
    return results
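# Hedged call sketch for fu_irl(): with is_airl=True it trains the AIRL state-action
# discriminator, otherwise GAIL. venv, expert and expert_venv are placeholders for a
# vectorized training environment, an expert policy, and the environment to sample
# demonstrations from; only the returned dict's "policy" entry is used afterwards.
results = fu_irl(venv, is_airl=True, expert=expert, expert_venv=expert_venv,
                 total_timesteps=100000, gen_batch_size=1000)
learner = results["policy"]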
def finetune(metainit, venv, trajectories, discount, seed, log_dir, *,
             tf_cfg, pol_itr=100, irl_itr=100, model_cfg=None,
             policy_cfg=None, training_cfg={}):
    envs = VecGymEnv(venv)
    envs = TfEnv(envs)
    experts = _convert_trajectories(trajectories)

    train_graph = tf.Graph()
    with train_graph.as_default():
        tf.set_random_seed(seed)

        if model_cfg is None:
            model_cfg = {
                'model': AIRLStateOnly,
                'state_only': True,
                'max_itrs': 10
            }
        model_kwargs = dict(model_cfg)
        model_cls = model_kwargs.pop('model')
        irl_model = model_cls(env_spec=envs.spec, expert_trajs=experts,
                              **model_kwargs)

        if policy_cfg is None:
            policy_cfg = {
                'policy': GaussianMLPPolicy,
                'hidden_sizes': (32, 32)
            }
        else:
            policy_cfg = dict(policy_cfg)
        policy_fn = policy_cfg.pop('policy')
        policy = policy_fn(name='policy', env_spec=envs.spec, **policy_cfg)

        training_kwargs = {
            'batch_size': 10000,
            'max_path_length': 500,
            'irl_model_wt': 1.0,
            'entropy_weight': 0.1,
            # paths substantially increase storage requirements
            'store_paths': False,
        }
        training_kwargs.update(training_cfg)

        _kwargs, reward_params = metainit
        algo = IRLTRPO(env=envs,
                       policy=policy,
                       irl_model=irl_model,
                       discount=discount,
                       sampler_args=dict(n_envs=venv.num_envs),
                       zero_environment_reward=True,
                       baseline=LinearFeatureBaseline(env_spec=envs.spec),
                       init_irl_params=reward_params,
                       train_irl=False,
                       n_itr=pol_itr,
                       **training_kwargs)

        with tf.Session(config=tf_cfg):
            # First round: just optimize the policy, do not update IRL model
            with rllab_logdir(algo=algo, dirname=osp.join(log_dir, 'pol')):
                with rl_logger.prefix('finetune policy |'):
                    algo.train()
                    pol_params = policy.get_param_values()

            # Second round: we have a good policy (generator), update IRL
            with rllab_logdir(algo=algo, dirname=osp.join(log_dir, 'all')):
                with rl_logger.prefix('finetune all |'):
                    algo.train_irl = True
                    algo.init_pol_params = pol_params
                    algo.n_itr = irl_itr
                    algo.train()

            reward_params = irl_model.get_params()

            # Side-effect: forces policy to cache all parameters.
            # This ensures they are saved/restored during pickling.
            policy.get_params()
            # Must pickle policy rather than returning it directly,
            # since parameters in policy will not survive across tf sessions.
            policy_pkl = pickle.dumps(policy)

    reward = model_cfg, reward_params
    return reward, policy_pkl
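# Hedged sketch of consuming finetune()'s return values. metainit, venv and trajectories
# are placeholders for the meta-initialisation, a vectorized environment, and expert
# demonstrations built elsewhere; per the comments above, the policy comes back pickled
# because its parameters do not survive across TF sessions, so it is unpickled inside a
# fresh graph and active session before use (an assumption about the intended workflow).
reward, policy_pkl = finetune(metainit, venv, trajectories, discount=0.99, seed=0,
                              log_dir='logs/finetune', tf_cfg=tf.ConfigProto())
with tf.Graph().as_default(), tf.Session():
    restored_policy = pickle.loads(policy_pkl)
    # restored_policy.get_action(obs) can then drive evaluation rollouts.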
def metalearn(venvs, trajectories, discount, seed, log_dir, *, tf_cfg,
              outer_itr=1000, lr=1e-2, model_cfg=None, policy_cfg=None,
              training_cfg={}, policy_per_task=False):
    envs = {k: TfEnv(VecGymEnv(v)) for k, v in venvs.items()}
    env_spec = list(envs.values())[0].spec
    num_envs = list(venvs.values())[0].num_envs
    tasks = list(envs.keys())
    experts = {k: _convert_trajectories(v) for k, v in trajectories.items()}

    train_graph = tf.Graph()
    with train_graph.as_default():
        tf.set_random_seed(seed)

        if model_cfg is None:
            model_cfg = {
                'model': AIRLStateOnly,
                'state_only': True,
                'max_itrs': 10
            }
        model_kwargs = dict(model_cfg)
        model_cls = model_kwargs.pop('model')
        irl_model = model_cls(env_spec=env_spec, **model_kwargs)

        if policy_cfg is None:
            policy_cfg = {
                'policy': GaussianMLPPolicy,
                'hidden_sizes': (32, 32)
            }
        else:
            policy_cfg = dict(policy_cfg)
        policy_fn = policy_cfg.pop('policy')
        policy = policy_fn(name='policy', env_spec=env_spec, **policy_cfg)
        pol_params = {}

        training_kwargs = {
            'n_itr': 10,
            'batch_size': 10000,
            'max_path_length': 500,
            'irl_model_wt': 1.0,
            'entropy_weight': 0.1,
            # paths substantially increase storage requirements
            'store_paths': False,
        }
        training_kwargs.update(training_cfg)

        algos = {
            k: IRLTRPO(env=env,
                       policy=policy,
                       irl_model=irl_model,
                       discount=discount,
                       sampler_args=dict(n_envs=num_envs),
                       zero_environment_reward=True,
                       baseline=LinearFeatureBaseline(env_spec=env_spec),
                       **training_kwargs)
            for k, env in envs.items()
        }

        with tf.Session(config=tf_cfg) as sess:
            sess.run(tf.global_variables_initializer())
            meta_reward_params = irl_model.get_params()

            for i in range(outer_itr):
                task = random.choice(tasks)
                pol_task = task if policy_per_task else None
                itr_logdir = osp.join(
                    log_dir, '{}_{}'.format(i, sanitize_env_name(task)))
                with rllab_logdir(algo=algos[task], dirname=itr_logdir):
                    with rl_logger.prefix('outer itr {} | task {}'.format(i, task)):
                        irl_model.set_demos(experts[task])
                        # TODO: rather than specifying these as initializers,
                        # might be more efficient to have AIRL not overwrite
                        # these variables each call to train()?
                        algos[task].init_irl_params = meta_reward_params
                        algos[task].init_pol_params = pol_params.get(pol_task)
                        algos[task].train()

                        # Meta-update reward
                        # {meta,task}_reward_params are lists of NumPy arrays
                        task_reward_params = irl_model.get_params()
                        assert len(task_reward_params) == len(meta_reward_params)
                        for j in range(len(task_reward_params)):
                            meta_p = meta_reward_params[j]
                            task_p = task_reward_params[j]
                            # Reptile update: meta <- meta + lr * (task - meta)
                            # TODO: use Adam optimizer?
                            meta_reward_params[j] = (1 - lr) * meta_p + lr * task_p

                        # Store policy update (joint if not policy_per_task)
                        pol_params[pol_task] = policy.get_param_values()

    reward = model_kwargs, meta_reward_params
    return reward
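# Hedged pipeline sketch: metalearn() returns (model_kwargs, meta_reward_params), i.e. a
# Reptile meta-initialisation of the reward, which finetune() above consumes as its
# metainit argument. venvs and trajectories are placeholder dicts keyed by task name,
# mapping to vectorized environments and expert demonstrations respectively.
metainit = metalearn(venvs, trajectories, discount=0.99, seed=0,
                     log_dir='logs/meta', tf_cfg=tf.ConfigProto(),
                     outer_itr=1000, lr=1e-2)
# metainit can then be passed straight to finetune() as its first argument.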