def main(exp_name=None, fusion=False):
    env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    irl_model = AIRL(env=env, expert_trajs=experts, state_only=True, fusion=fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )

    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session():
            algo.train()
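# Usage sketch (not part of the original script): a minimal entry point for the
# main() above. The module paths follow the upstream inverse_rl / rllab layout
# and should be treated as assumptions, as should the example experiment name.
import tensorflow as tf
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from inverse_rl.algos.irl_trpo import IRLTRPO
from inverse_rl.models.airl_state import AIRL
from inverse_rl.envs.env_utils import CustomGymEnv
from inverse_rl.utils.log_utils import rllab_logdir, load_latest_experts_multiple_runs

if __name__ == "__main__":
    # 'ant_state_irl_run0' is an illustrative placeholder run name
    main(exp_name='ant_state_irl_run0', fusion=False)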
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99):
    env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2,
                                                visible_gpus=visible_gpus)

    irl_model = AIRL(discount=discount, env=env, expert_trajs=experts,
                     state_only=True, fusion=fusion, max_itrs=10)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=500,
        discount=discount,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )

    with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
        with tf.Session(config=tf_config) as sess:
            algo.train(sess)
def main(exp_name=None, fusion=True):
    # env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=False))
    env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False, record_log=True))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)
    # experts = load_latest_experts('data/ant_data_collect', n=5)

    # qvar: inverse model q(a|s,s')
    qvar = GaussianMLPInversePolicy(name='qvar_model', env_spec=env.spec, hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env, qvar=qvar, expert_trajs=experts, fusion=True, max_itrs=10)

    # Empowerment-based Adversarial Inverse Reinforcement Learning; set score_discrim=True
    irl_model = EAIRL(env=env, expert_trajs=experts, state_only=False, fusion=fusion,
                      max_itrs=10, score_discrim=True)

    # Empowerment-based potential function: gamma * Phi(s') - Phi(s)
    empw_model = Empowerment(env=env, fusion=True, max_itrs=4)
    t_empw_model = Empowerment(env=env, scope='t_efn', fusion=True, max_itrs=2, name='empowerment2')

    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=3000,  # 130
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=True,
        target_empw_update=5,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        lambda_i=1.0,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        plot=False)

    # If you launch multiple runs, log each one to its own subdirectory instead:
    # with rllab_logdir(algo=algo, dirname='data/ant_state_irl/%s' % exp_name):
    with rllab_logdir(algo=algo, dirname='data/ant_state_irl'):
        with tf.Session():
            algo.train()
def main(exp_name=None, fusion=False, visible_gpus='0', discount=0.99, debug=False,
         n_val=1, n_rew=1, max_nstep=1, exp_folder=None, state_only=False,
         score_discrim=True, score_method=None):
    env = TfEnv(CustomGymEnv('PointMazeRight-v0', record_video=False, record_log=False))

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=visible_gpus)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs('data/maze_right_data_collect', n=2,
                                                visible_gpus=visible_gpus)

    sess = tf.Session(config=tf_config)
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    max_path_length = 500
    irl_model = AIRL_Bootstrap(discount=discount, env=env, expert_trajs=experts,
                               state_only=state_only, fusion=fusion, max_itrs=10,
                               score_discrim=score_discrim, debug=debug,
                               max_nstep=max_nstep, n_value_funct=n_val,
                               n_rew_funct=n_rew, score_method=score_method)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        n_itr=1000,
        batch_size=10000,
        max_path_length=max_path_length,
        discount=discount,
        store_paths=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )

    # temp_folder = '/media/data/temp_exp_nstep/maze_right_state_bootstrap_%d_irl/%s'
    dirname = ('data/maze_right_state_bootstrap_%d_irl/%s/%s' % (max_nstep, exp_folder, exp_name)
               if exp_folder is not None
               else 'data/maze_right_state_bootstrap_%d_irl/%s' % (max_nstep, exp_name))

    with rllab_logdir(algo=algo, dirname=dirname):
        sess.__enter__()
        algo.train(sess)
        sess.close()
def main(exp_name=None, params_folder='data/ant_state_irl'):
    # env = TfEnv(CustomGymEnv('PointMazeLeft-v0', record_video=True, record_log=True, force_reset=True))
    env = TfEnv(CustomGymEnv('DisabledAnt-v0', record_video=False, record_log=False, force_reset=False))

    # Earlier IRL iterations overfit less; either 80 or 90 seems to work well,
    # but I usually search over 60, 65, 70, 75, ... up to 100.
    irl_itr = 90
    # params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
    prior_params = load_prior_params(params_file)

    '''
    q_itr = 400  # earlier IRL iterations overfit less; 100 seems to work well
    # params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, q_itr))
    params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % q_itr)
    prior_params_q = load_prior_params(params_file)
    '''

    experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)

    qvar = GaussianMLPInversePolicy(name='qvar_model', env_spec=env.spec, hidden_sizes=(32, 32))
    qvar_model = Qvar(env=env, qvar=qvar, expert_trajs=None, max_itrs=10)
    irl_model = EAIRL(env=env, expert_trajs=experts, state_only=False, score_discrim=False)
    empw_model = Empowerment(env=env, max_itrs=1)
    t_empw_model = Empowerment(env=env, scope='t_efn', max_itrs=2, name='empowerment2')

    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))
    algo = IRLTRPO(
        init_irl_params=prior_params['irl_params'],
        init_empw_params=None,   # prior_params['empw_params']
        init_qvar_params=None,   # prior_params['qvar_params']
        init_policy_params=prior_params['policy_params'],  # None
        env=env,
        policy=policy,
        empw=empw_model,
        tempw=t_empw_model,
        qvar_model=qvar_model,
        irl_model=irl_model,
        n_itr=2000,
        batch_size=20000,
        max_path_length=500,
        discount=0.99,
        store_paths=False,
        train_irl=True,
        train_empw=True,
        train_qvar=True,
        irl_model_wt=1.0,
        entropy_weight=0.1,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
        log_params_folder=params_folder,
        log_experiment_name=exp_name,
        # plot=True,
    )

    # with rllab_logdir(algo=algo, dirname='data/ant_transfer%s' % exp_name):
    with rllab_logdir(algo=algo, dirname='data/ant_transfer'):
        with tf.Session():
            algo.train()
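# Sketch (not part of the original script): one plausible implementation of the
# load_prior_params() helper used above. It assumes the rllab snapshot
# itr_<N>.pkl is a joblib pickle holding a dict of parameter values with keys
# such as 'irl_params' and 'policy_params' (as the indexing above suggests);
# treat the details as assumptions rather than the actual project helper.
import joblib
import tensorflow as tf

def load_prior_params(pkl_fname):
    # rllab snapshots can contain TF-backed objects, so load inside a session.
    with tf.Session():
        params = joblib.load(pkl_fname)
    return params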
def main(exp_name=None, fusion=False, latent_dim=3):
    max_path_length = 100
    info_coeff = 0.1
    imitation_coeff = 0.01
    batch_size = 16
    meta_batch_size = 50
    max_itrs = 20
    pre_epoch = 1000
    entropy_weight = 1.0
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    tf.reset_default_graph()
    env = TfEnv(CustomGymEnv('PointMazeLeft-v0', record_video=False, record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        'data/maze_left_data_collect_discrete-15', n=4, latent_dim=latent_dim)

    # contextual policy pi(a|s,m)
    policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))

    # approximate posterior q(m|tau)
    context_encoder_spec = EnvSpec(
        observation_space=Box(
            np.tile(
                np.concatenate((env.observation_space.low[:-latent_dim],
                                env.action_space.low)), max_path_length),
            np.tile(
                np.concatenate((env.observation_space.high[:-latent_dim],
                                env.action_space.high)), max_path_length)),
        action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
    )
    context_encoder = GaussianMLPPolicy(name='context_encoder',
                                        env_spec=context_encoder_spec,
                                        hidden_sizes=(128, 128))

    pretrain_model = Pretrain(experts, policy, context_encoder, env, latent_dim,
                              batch_size=400, kl_weight=0.1, epoch=pre_epoch)
    # pretrain_model = None
    if pretrain_model is None:
        pre_epoch = 0

    irl_model = InfoAIRL(env=env,
                         policy=policy,
                         context_encoder=context_encoder,
                         reward_arch=reward_arch,
                         reward_arch_args=reward_arch_args,
                         expert_trajs=experts,
                         state_only=True,
                         max_path_length=max_path_length,
                         fusion=fusion,
                         max_itrs=max_itrs,
                         meta_batch_size=meta_batch_size,
                         imitation_coeff=imitation_coeff,
                         info_coeff=info_coeff,
                         latent_dim=latent_dim)

    algo = MetaIRLTRPO(
        env=env,
        policy=policy,
        irl_model=irl_model,
        randomize_policy=True,
        pretrain_model=pretrain_model,
        n_itr=3000,
        meta_batch_size=meta_batch_size,
        batch_size=batch_size,
        max_path_length=max_path_length,
        discount=0.99,
        store_paths=True,
        train_irl=True,
        irl_model_wt=1.0,
        entropy_weight=entropy_weight,
        zero_environment_reward=True,
        baseline=LinearFeatureBaseline(env_spec=env.spec),
    )

    if fusion:
        dirname = ('data_fusion_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s'
                   '_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' %
                   (imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
                    pre_epoch, entropy_weight, layers, d_hidden, exp_name))
    else:
        dirname = ('data_discrete_new/maze_wall_meta_irl_imitcoeff-%s_infocoeff-%s'
                   '_mbs-%s_bs-%s_itr-%s_preepoch-%s_entropy-%s_RandomPol_Rew-%s-%s/%s' %
                   (imitation_coeff, info_coeff, meta_batch_size, batch_size, max_itrs,
                    pre_epoch, entropy_weight, layers, d_hidden, exp_name))

    with rllab_logdir(algo=algo, dirname=dirname):
        with tf.Session():
            algo.train()
def main(exp_name=None, latent_dim=3, params_folder=None):
    max_path_length = 100
    batch_size = 16
    meta_batch_size = 1
    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    # tf.reset_default_graph()
    env = TfEnv(CustomGymEnv('PointMazeRight-v0', record_video=False, record_log=False))
    barrier_range = [0.2, 0.6]
    barrier_y = 0.3

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        '/atlas/u/lantaoyu/projects/InfoAIRL/data/maze_left_data_collect',
        n=4, latent_dim=latent_dim)

    irl_itr_list = [2800]
    for irl_itr in irl_itr_list:
        # params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
        params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
        prior_params = load_prior_params(params_file)
        init_context_encoder_params = load_prior_params(params_file, 'context_params')
        # params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (irl_itr - 800))
        policy_prior_params = load_prior_params(params_file, 'policy_params')
        # policy_prior_params = None

        # contextual policy pi(a|s,m)
        policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))

        # approximate posterior q(m|tau)
        context_encoder_spec = EnvSpec(
            observation_space=Box(
                np.tile(
                    np.concatenate((env.observation_space.low[:-latent_dim],
                                    env.action_space.low)), max_path_length),
                np.tile(
                    np.concatenate((env.observation_space.high[:-latent_dim],
                                    env.action_space.high)), max_path_length)),
            action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
        )
        context_encoder = GaussianMLPPolicy(name='context_encoder',
                                            env_spec=context_encoder_spec,
                                            hidden_sizes=(128, 128))

        irl_model = InfoAIRL(env=env,
                             expert_trajs=experts,
                             reward_arch=reward_arch,
                             reward_arch_args=reward_arch_args,
                             context_encoder=context_encoder,
                             state_only=True,
                             max_path_length=max_path_length,
                             meta_batch_size=meta_batch_size,
                             latent_dim=latent_dim)

        savedir = 'data_fusion_discrete/visualize_reward_right-%s' % irl_itr
        if not os.path.isdir(savedir):
            os.mkdir(savedir)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            irl_model.context_encoder.set_param_values(init_context_encoder_params)
            policy.set_param_values(policy_prior_params)
            irl_model.set_params(prior_params)

            boundary_low = -0.1
            boundary_high = 0.6

            expert_obs, expert_acts, expert_contexts = irl_model.extract_paths(
                irl_model.expert_trajs,
                keys=('observations', 'actions', 'contexts'),
                T=max_path_length)
            # num_experts x T x (state_dim + act_dim)
            expert_trajs = np.concatenate((expert_obs, expert_acts), axis=-1)

            grid_size = 0.005
            rescale = 1. / grid_size

            for itr in range(100):
                expert_traj_batch, m_batch = irl_model.sample_batch(
                    expert_trajs, expert_contexts, batch_size=1,
                    warm_up=False, warm_up_idx=False)

                # evaluate the learned reward on a grid of (x, y) positions
                obs_batch = []
                num_y = 0
                for pos_y in np.arange(boundary_low, boundary_high, grid_size):
                    num_y += 1
                    num_x = 0
                    for pos_x in np.arange(boundary_low, boundary_high, grid_size):
                        num_x += 1
                        obs_batch.append([pos_x, pos_y, 0.])
                obs_batch = np.array(obs_batch).reshape([1, -1, max_path_length, 3])
                expert_traj_batch = np.tile(
                    np.reshape(expert_traj_batch, [1, 1, max_path_length, -1]),
                    [1, obs_batch.shape[1], 1, 1])
                reward = tf.get_default_session().run(
                    irl_model.reward,
                    feed_dict={
                        irl_model.expert_traj_var: expert_traj_batch,
                        irl_model.obs_t: obs_batch
                    })
                score = reward[:, 0]

                # heatmap of the learned reward, with the sampled context m (star),
                # a jittered reference start location (circle), and the barrier (thick line)
                ax = sns.heatmap(score.reshape([num_x, num_y]), cmap="YlGnBu_r")
                ax.scatter((m_batch[0][0][0] - boundary_low) * rescale,
                           (m_batch[0][0][1] - boundary_low) * rescale,
                           marker='*', s=150, c='r', edgecolors='k', linewidths=0.5)
                ax.scatter((0.3 - boundary_low + np.random.uniform(low=-0.05, high=0.05)) * rescale,
                           (0. - boundary_low + np.random.uniform(low=-0.05, high=0.05)) * rescale,
                           marker='o', s=120, c='white', linewidths=0.5, edgecolors='k')
                ax.plot([(barrier_range[0] - boundary_low) * rescale,
                         (barrier_range[1] - boundary_low) * rescale],
                        [(barrier_y - boundary_low) * rescale,
                         (barrier_y - boundary_low) * rescale],
                        color='k', linewidth=10)
                ax.invert_yaxis()
                plt.axis('off')
                plt.savefig(savedir + '/%s.png' % itr)
                print('Save Itr', itr)
                plt.close()
from inverse_rl.utils.log_utils import load_latest_experts_multiple_runs
import pdb

experts = load_latest_experts_multiple_runs('data/ant_data_collect', n=2)
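# Sketch (not in the original snippet): quick sanity checks on the loaded demos,
# assuming each entry is an rllab-style path dict with 'observations' and
# 'actions' arrays; drop into pdb afterwards to explore interactively.
print('number of expert paths:', len(experts))
print('keys in first path:', sorted(experts[0].keys()))
print('observations shape:', experts[0]['observations'].shape)
pdb.set_trace()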
def main(exp_name=None, latent_dim=3, params_folder=None):
    max_path_length = 100
    batch_size = 32
    meta_batch_size = 50
    entropy_weight = 0.1
    left = 'right'
    if_filtered = True

    # tf.reset_default_graph()
    if left == 'left':
        env = TfEnv(CustomGymEnv('PointMazeLeft-v0', record_video=False, record_log=False))
    else:
        env = TfEnv(CustomGymEnv('PointMazeRight-v0', record_video=False, record_log=False))

    # load ~2 iterations worth of data from each forward RL experiment as demos
    experts = load_latest_experts_multiple_runs(
        '/atlas/u/lantaoyu/projects/InfoAIRL/data/maze_left_data_collect',
        n=4, latent_dim=latent_dim)

    if if_filtered:
        # keep only demos whose first context coordinate lies in good_range, then
        # truncate so the number of demos is a multiple of meta_batch_size
        experts_filtered = []
        good_range = [0.1, 0.4]  # [0.3, 0.5]
        for expert in experts:
            if good_range[0] <= expert['contexts'][0, 0] <= good_range[1]:
                experts_filtered.append(expert)
        assert len(experts_filtered) >= meta_batch_size
        remainder = len(experts_filtered) % meta_batch_size
        if remainder > 0:
            experts_filtered = experts_filtered[:-remainder]
        experts = experts_filtered

    irl_itr_list = [2800]

    results = []
    for irl_itr in irl_itr_list:
        params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
        prior_params = load_prior_params(params_file)
        init_context_encoder_params = load_prior_params(params_file, 'context_params')
        policy_prior_params = None

        # contextual policy pi(a|s,m)
        policy = GaussianMLPPolicy(name='policy', env_spec=env.spec, hidden_sizes=(32, 32))

        # approximate posterior q(m|tau)
        context_encoder_spec = EnvSpec(
            observation_space=Box(
                np.tile(
                    np.concatenate((env.observation_space.low[:-latent_dim],
                                    env.action_space.low)), max_path_length),
                np.tile(
                    np.concatenate((env.observation_space.high[:-latent_dim],
                                    env.action_space.high)), max_path_length)),
            action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
        )
        context_encoder = GaussianMLPPolicy(name='context_encoder',
                                            env_spec=context_encoder_spec,
                                            hidden_sizes=(128, 128))

        irl_model = InfoAIRL(env=env,
                             expert_trajs=experts,
                             context_encoder=context_encoder,
                             state_only=True,
                             max_path_length=max_path_length,
                             meta_batch_size=meta_batch_size,
                             latent_dim=latent_dim)

        algo = MetaIRLTRPO(
            init_irl_params=prior_params,
            init_pol_params=policy_prior_params,
            init_context_encoder_params=init_context_encoder_params,
            env=env,
            policy=policy,
            irl_model=irl_model,
            n_itr=150,
            meta_batch_size=meta_batch_size,
            batch_size=batch_size,
            max_path_length=max_path_length,
            discount=0.99,
            store_paths=True,
            train_irl=True,
            train_context_only=True,
            train_policy=True,
            irl_model_wt=1.0,
            entropy_weight=entropy_weight,
            zero_environment_reward=True,
            baseline=LinearFeatureBaseline(env_spec=env.spec),
            log_params_folder=params_folder,
            log_experiment_name=exp_name,
        )

        dirname = ('data_finetune/maze_finetune_discrete-entropy-%s-irl_itr-%s-%s-%s-generalize/%s' %
                   (entropy_weight, irl_itr, left,
                    'filter' if if_filtered else '', exp_name))
        with rllab_logdir(algo=algo, dirname=dirname):
            with tf.Session():
                algo.train()
            results.append((irl_itr, np.max(algo.pol_ret)))
        tf.reset_default_graph()

    print(results)