def get_name(irl_pkl):
    """Recover the environment name stored in a pickled IRL snapshot."""
    with tf.Session(config=get_session_config()):
        irl_pkl_data = joblib.load(irl_pkl)
        env_name = get_inner_env(irl_pkl_data['env']).env_name
        del irl_pkl_data
    tf.reset_default_graph()
    return env_name
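# Example usage (illustrative only; the snapshot path below is a hypothetical
# placeholder, not a file produced by this repository):
# env_name = get_name('data/ant_state_irl/itr_999.pkl')
# print('Snapshot was trained on', env_name)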
def train(self, sess=None):
    created_session = sess is None
    if sess is None:
        sess = tf.Session(config=get_session_config())
        sess.__enter__()
    sess.run(tf.global_variables_initializer())
    self.start_worker()
    start_time = time.time()
    for itr in range(self.start_itr, self.n_itr):
        itr_start_time = time.time()
        with logger.prefix('itr #%d | ' % itr):
            logger.log("Obtaining samples...")
            paths = self.obtain_samples(itr)
            logger.log("Processing samples...")
            samples_data = self.process_samples(itr, paths)
            logger.log("Logging diagnostics...")
            self.log_diagnostics(paths)
            logger.log("Optimizing policy...")
            self.optimize_policy(itr, samples_data)
            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr, samples_data)  # , **kwargs)
            if self.store_paths:
                params["paths"] = samples_data["paths"]
            save_itr_params_pickle(itr, params)
            prune_old_snapshots(itr, keep_every=self.snap_keep_every,
                                keep_latest=self.snap_keep_latest)
            logger.log("Saved")
            logger.record_tabular('Time', time.time() - start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            logger.dump_tabular(with_prefix=False)
            if self.plot:
                rollout(self.env, self.policy, animated=True,
                        max_path_length=self.max_path_length)
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to "
                          "continue...")
    self.shutdown_worker()
    if created_session:
        sess.close()
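# `prune_old_snapshots` is only referenced above; the sketch below illustrates
# the retention policy its keyword arguments suggest (keep recent snapshots plus
# periodic ones). The body and the `itr_*.pkl` naming are assumptions, not the
# project's actual helper.
def prune_old_snapshots_sketch(itr, keep_every=100, keep_latest=5,
                               snapshot_dir='.'):
    """Delete itr_*.pkl files that are neither recent nor periodic keepers."""
    import os
    import re
    for fname in os.listdir(snapshot_dir):
        match = re.match(r'itr_(\d+)\.pkl$', fname)
        if match is None:
            continue
        snap_itr = int(match.group(1))
        is_recent = snap_itr > itr - keep_latest
        is_periodic = keep_every and snap_itr % keep_every == 0
        if not (is_recent or is_periodic):
            os.remove(os.path.join(snapshot_dir, fname))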
def main(exp_name, ent_wt=1.0, discrete=True):
    tf.reset_default_graph()
    if discrete:
        env = TfEnv(
            CustomGymEnv('PointMazeLeft-v0', record_video=False,
                         record_log=False))
    else:
        env = TfEnv(
            CustomGymEnv('PointMazeLeftCont-v0', record_video=False,
                         record_log=False))
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))
    with tf.Session(config=get_session_config()) as sess:
        algo = TRPO(
            env=env,
            sess=sess,
            policy=policy,
            n_itr=2000,
            batch_size=20000,
            max_path_length=500,
            discount=0.99,
            store_paths=True,
            entropy_weight=ent_wt,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            exp_name=exp_name,
            turn_on_wandb=args.turn_on_wandb,
            render_env=True,
            gif_dir='logs/maze_wall_meta_irl',
            gif_header='',
            wandb_entity=args.wandb_entity,
            wandb_project=args.wandb_project,
            wandb_run_name=args.wandb_run_name,
            wandb_monitor_gym=args.wandb_monitor_gym,
        )
        if discrete:
            output = 'data/maze_left_data_collect_discrete-15/%s' % exp_name
        else:
            output = 'data/maze_left_data_collect/%s' % exp_name
        with rllab_logdir(algo=algo, dirname=output):
            algo.train()
def main(exp_name, ent_wt=1.0):
    tf.reset_default_graph()
    env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False,
                             record_log=False))
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))
    with tf.Session(config=get_session_config()) as sess:
        algo = TRPO(
            env=env,
            sess=sess,
            policy=policy,
            n_itr=1500,
            batch_size=20000,
            max_path_length=500,
            discount=0.99,
            store_paths=True,
            entropy_weight=ent_wt,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            exp_name=exp_name,
        )
        with rllab_logdir(algo=algo,
                          dirname='data/ant_data_collect/%s' % exp_name):
            algo.train(sess)
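# Hypothetical entry point for the Ant data-collection script above; the
# experiment name is a placeholder, not a value from the original run
# configuration.
if __name__ == '__main__':
    main(exp_name='ant_data_collect_run0', ent_wt=1.0)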
def main(exp_name=None, fusion=False, latent_dim=3):
    max_path_length = 100
    info_coeff = 0.1
    imitation_coeff = 0.01
    batch_size = 16
    meta_batch_size = 50
    max_itrs = 20
    pre_epoch = args.pre_epoch
    entropy_weight = 1.0
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('PointMazeLeft-v0', record_video=False,
                     record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        'data/maze_left_data_collect_discrete-15', n=4, latent_dim=latent_dim)

    # contextual policy pi(a|s,m)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                               hidden_sizes=(32, 32))

    # approximate posterior q(m|tau): the encoder consumes a whole flattened
    # trajectory of (state, action) pairs (latent dims stripped from the
    # observation) and outputs a latent code m of dimension latent_dim
    context_encoder_spec = EnvSpec(
        observation_space=Box(
            np.tile(
                np.concatenate((env.observation_space.low[:-latent_dim],
                                env.action_space.low)), max_path_length),
            np.tile(
                np.concatenate((env.observation_space.high[:-latent_dim],
                                env.action_space.high)), max_path_length)),
        action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
    )
    context_encoder = GaussianMLPPolicy(name='context_encoder',
                                        env_spec=context_encoder_spec,
                                        hidden_sizes=(128, 128))

    pretrain_model = Pretrain(experts,
                              policy,
                              context_encoder,
                              env,
                              latent_dim,
                              batch_size=400,
                              kl_weight=0.1,
                              epoch=pre_epoch)
    # pretrain_model = None
    if pretrain_model is None:
        pre_epoch = 0

    irl_model = InfoAIRL(env=env,
                         policy=policy,
                         context_encoder=context_encoder,
                         reward_arch=reward_arch,
                         reward_arch_args=reward_arch_args,
                         expert_trajs=experts,
                         state_only=True,
                         max_path_length=max_path_length,
                         fusion=fusion,
                         max_itrs=max_itrs,
                         meta_batch_size=meta_batch_size,
                         imitation_coeff=imitation_coeff,
                         info_coeff=info_coeff,
                         latent_dim=latent_dim)

    algo = MetaIRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        randomize_policy=True,
        pretrain_model=pretrain_model,
        n_itr=args.n_itr,
        meta_batch_size=meta_batch_size,
        batch_size=batch_size,
        max_path_length=max_path_length,
        discount=0.99,
        store_paths=True,
        train_irl=True,
        irl_model_wt=1.0,
        entropy_weight=entropy_weight,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        turn_on_wandb=args.turn_on_wandb,
        render_env=True,
        gif_dir='logs/maze_wall_meta_irl',
        gif_header='',
        wandb_entity=args.wandb_entity,
        wandb_project=args.wandb_project,
        wandb_run_name=args.wandb_run_name,
        wandb_monitor_gym=args.wandb_monitor_gym,
    )

    if fusion:
        dirname = 'data_fusion_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size,
            max_itrs, pre_epoch, entropy_weight, layers, d_hidden, exp_name)
    else:
        dirname = 'data_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' % (
            imitation_coeff, info_coeff, meta_batch_size, batch_size,
            max_itrs, pre_epoch, entropy_weight, layers, d_hidden, exp_name)

    config = get_session_config()
    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session(config=config):
            algo.train()
# logdir = '/home/usaywook/ext256/inverse_rl/data/ant_state_irl/itr_2999.pkl'
logdir = '/home/usaywook/ext256/inverse_rl/data/ant_transfer/itr_1500.pkl'
params = load_prior_params(logdir)
loaded_params = params['policy_params']

policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                           hidden_sizes=(32, 32))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    if loaded_params is not None:
        # x = list(params['policy']._cached_params.values())[0]
        # y = list(params['policy']._cached_param_dtypes.values())[0]
        policy.set_param_values(loaded_params)
        # pdb.set_trace()

with tf.Session(config=get_session_config()) as sess:
    # The evaluation/fine-tuning run below is currently disabled; `pass` keeps
    # the block syntactically valid.
    pass
    # algo = TRPO(
    #     env=env,
    #     sess=sess,
    #     policy=policy,
    #     n_itr=1,
    #     batch_size=20000,
    #     max_path_length=500,
    #     discount=0.99,
    #     store_paths=True,
    #     entropy_weight=0.1,
    #     baseline=LinearFeatureBaseline(env_spec=env.spec),
    #     exp_name=None,
    #     plot=True
    # )
    # algo.train()
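# `load_prior_params` is not shown in this excerpt; below is a minimal sketch
# consistent with its usage above (a joblib snapshot dict containing
# 'policy_params'), mirroring the session handling in `get_name`. The project's
# actual helper may differ.
def load_prior_params_sketch(pkl_fname):
    """Load a pickled snapshot dict inside a throwaway TF graph/session."""
    with tf.Session(config=get_session_config()):
        params = joblib.load(pkl_fname)
    tf.reset_default_graph()
    return params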
def main(
        rundir='data',
        irl_pkl='',
        pol_pkl=None,
        method=None,
        hid_size=None,
        hid_layers=None,
        switch_env=None,
):
    print('irl_pkl =', irl_pkl, 'and pol_pkl =', pol_pkl)
    orig_env_name = get_name(irl_pkl)
    if switch_env is not None:
        this_env_name = switch_env
    else:
        this_env_name = orig_env_name
    print("Running on environment '%s'" % this_env_name)
    env = TfEnv(
        CustomGymEnv(this_env_name, record_video=False, record_log=False))
    if hid_size is None or hid_layers is None:
        # we want hidden size & layer count for the *original* environment,
        # since that's what the IRL model that we're trying to reconstruct was
        # trained on
        assert hid_size is None and hid_layers is None, \
            "must specify both size & layers, not one or the other"
        hid_layers, hid_size = min_layers_hidsize_for(orig_env_name)
    # we want trajectory length for the new environment rather than the
    # original environment, though
    traj_length = irltrpo_params_for(this_env_name,
                                     'retrain')['max_path_length']
    print('Horizon is', traj_length)
    expert_dir = os.path.join(rundir, 'env_%s/' % orig_env_name.lower())
    experts = load_latest_experts_walky(expert_dir, n=1)

    with tf.Session(config=get_session_config(), graph=tf.Graph()):
        irl_pkl_data = joblib.load(irl_pkl)
        disc_net_kwargs = {
            'layers': hid_layers,
            'd_hidden': hid_size,
        }
        if method in {'airl', 'vairl'}:
            irl_model = AIRL(env=env,
                             expert_trajs=experts,
                             state_only=True,
                             freeze=True,
                             vairl=method == 'vairl',
                             vairl_beta=1e-4,
                             discrim_arch_args=disc_net_kwargs,
                             fitted_value_fn_arch_args=disc_net_kwargs)
        elif method in {'gail', 'vail'}:
            irl_model = GAIL(env,
                             expert_trajs=experts,
                             discrim_arch_args=disc_net_kwargs,
                             name=method,
                             freeze=True,
                             vail=method == 'vail')
        else:
            raise NotImplementedError("Don't know how to handle method '%s'" %
                                      method)
        irl_model.set_params(irl_pkl_data['irl_params'])
        if pol_pkl is not None:
            with tf.variable_scope('please-work'):
                pol_pkl_data = joblib.load(pol_pkl)
                policy = pol_pkl_data['policy']
            print('Using policy loaded from %s' % pol_pkl)
        else:
            print('Using original IRL policy')
            policy = irl_pkl_data['policy']

        # do a few rollouts with given policy on given reward
        # report both the IRL reward AND the mean reward for the policy
        n_rollouts = 30
        irl_rets = np.zeros((n_rollouts, ))
        env_rets = np.zeros((n_rollouts, ))
        for i in tqdm.trange(n_rollouts):
            # how do I get final return? Hmm
            path = rollout(env, policy, max_path_length=traj_length)
            env_rets[i] = np.sum(path['rewards'])
            irl_rew = irl_model.eval([path])
            irl_rets[i] = np.sum(irl_rew)
        print('Env mean %.2f (std %.2f)' % (np.mean(env_rets),
                                            np.std(env_rets)))
        print('IRL mean %.2f (std %.2f)' % (np.mean(irl_rets),
                                            np.std(irl_rets)))
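# Hypothetical invocation of the evaluation script above; the snapshot path and
# method name are placeholders for whichever trained run you want to score, not
# files guaranteed to exist in this repository.
if __name__ == '__main__':
    main(rundir='data',
         irl_pkl='data/env_customant/itr_999.pkl',
         pol_pkl=None,
         method='airl',
         switch_env=None)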