# NOTE: the imports below are reconstructed assuming the standard
# rllab / inverse_rl layout; the InfoAIRL-specific names (InfoAIRL,
# load_prior_params, DATA_DIR, get_session_config) are project-local and
# their exact module paths may differ in this repo.
import os

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf

from rllab.envs.env_spec import EnvSpec
from sandbox.rocky.tf.envs.base import TfEnv
from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
from sandbox.rocky.tf.spaces.box import Box

from inverse_rl.envs.env_utils import CustomGymEnv
from inverse_rl.models.architectures import relu_net
from inverse_rl.utils.log_utils import load_latest_experts_multiple_runs


def main(exp_name=None, latent_dim=3, params_folder=None):
    max_path_length = 100
    batch_size = 16  # unused here; kept from the original script
    meta_batch_size = 1

    reward_arch = relu_net
    if reward_arch == relu_net:
        layers = 2
        d_hidden = 32
        reward_arch_args = {
            'layers': layers,
            'd_hidden': d_hidden,
        }
    else:
        layers, d_hidden = 0, 0
        reward_arch_args = None

    # tf.reset_default_graph()
    env = TfEnv(
        CustomGymEnv('PointMazeRight-v0', record_video=False,
                     record_log=False))

    barrier_range = [0.2, 0.6]
    barrier_y = 0.3

    # Load ~2 iterations' worth of data from each forward RL experiment as demos.
    experts = load_latest_experts_multiple_runs(
        '/atlas/u/lantaoyu/projects/InfoAIRL/data/maze_left_data_collect',
        n=4,
        latent_dim=latent_dim)

    irl_itr_list = [2800]
    for irl_itr in irl_itr_list:
        # params_file = os.path.join(DATA_DIR, '%s/itr_%d.pkl' % (params_folder, irl_itr))
        params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % irl_itr)
        prior_params = load_prior_params(params_file)
        init_context_encoder_params = load_prior_params(params_file,
                                                        'context_params')
        # params_file = os.path.join(DATA_DIR, 'itr_%d.pkl' % (irl_itr - 800))
        policy_prior_params = load_prior_params(params_file, 'policy_params')
        # policy_prior_params = None

        # Contextual policy pi(a|s,m).
        policy = GaussianMLPPolicy(
            name='policy', env_spec=env.spec, hidden_sizes=(32, 32))

        # Approximate posterior q(m|tau): the encoder reads a whole flattened
        # trajectory of (state, action) pairs (the latent dims are stripped
        # from the state) and outputs a latent code of size latent_dim.
        context_encoder_spec = EnvSpec(
            observation_space=Box(
                np.tile(
                    np.concatenate((env.observation_space.low[:-latent_dim],
                                    env.action_space.low)), max_path_length),
                np.tile(
                    np.concatenate((env.observation_space.high[:-latent_dim],
                                    env.action_space.high)), max_path_length)),
            action_space=Box(np.zeros(latent_dim), np.ones(latent_dim)),
        )
        context_encoder = GaussianMLPPolicy(
            name='context_encoder',
            env_spec=context_encoder_spec,
            hidden_sizes=(128, 128))

        irl_model = InfoAIRL(
            env=env,
            expert_trajs=experts,
            reward_arch=reward_arch,
            reward_arch_args=reward_arch_args,
            context_encoder=context_encoder,
            state_only=True,
            max_path_length=max_path_length,
            meta_batch_size=meta_batch_size,
            latent_dim=latent_dim)

        savedir = 'data_fusion_discrete/visualize_reward_right-%s' % irl_itr
        if not os.path.isdir(savedir):
            os.mkdir(savedir)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            irl_model.context_encoder.set_param_values(
                init_context_encoder_params)
            policy.set_param_values(policy_prior_params)
            irl_model.set_params(prior_params)

            boundary_low = -0.1
            boundary_high = 0.6

            expert_obs, expert_acts, expert_contexts = irl_model.extract_paths(
                irl_model.expert_trajs,
                keys=('observations', 'actions', 'contexts'),
                T=max_path_length)
            # num_experts x T x (state_dim + act_dim)
            expert_trajs = np.concatenate((expert_obs, expert_acts), axis=-1)

            grid_size = 0.005
            rescale = 1. / grid_size

            for itr in range(100):
                expert_traj_batch, m_batch = irl_model.sample_batch(
                    expert_trajs,
                    expert_contexts,
                    batch_size=1,
                    warm_up=False,
                    warm_up_idx=False)

                # Evaluate the learned reward on a dense grid over the maze.
                obs_batch = []
                num_y = 0
                for pos_y in np.arange(boundary_low, boundary_high, grid_size):
                    num_y += 1
                    num_x = 0
                    for pos_x in np.arange(boundary_low, boundary_high,
                                           grid_size):
                        num_x += 1
                        obs_batch.append([pos_x, pos_y, 0.])
                obs_batch = np.array(obs_batch).reshape(
                    [1, -1, max_path_length, 3])
                expert_traj_batch = np.tile(
                    np.reshape(expert_traj_batch, [1, 1, max_path_length, -1]),
                    [1, obs_batch.shape[1], 1, 1])
                reward = tf.get_default_session().run(
                    irl_model.reward,
                    feed_dict={
                        irl_model.expert_traj_var: expert_traj_batch,
                        irl_model.obs_t: obs_batch
                    })
                score = reward[:, 0]

                # Rows index y (the outer loop), columns x; the grid is square
                # here, so this reshape matches the original [num_x, num_y].
                ax = sns.heatmap(score.reshape([num_y, num_x]),
                                 cmap="YlGnBu_r")
                # Red star: the sampled context m, plotted as a goal position.
                ax.scatter((m_batch[0][0][0] - boundary_low) * rescale,
                           (m_batch[0][0][1] - boundary_low) * rescale,
                           marker='*', s=150, c='r',
                           edgecolors='k', linewidths=0.5)
                # White dot: the (jittered) start region of the point robot.
                ax.scatter(
                    (0.3 - boundary_low +
                     np.random.uniform(low=-0.05, high=0.05)) * rescale,
                    (0. - boundary_low +
                     np.random.uniform(low=-0.05, high=0.05)) * rescale,
                    marker='o', s=120, c='white',
                    linewidths=0.5, edgecolors='k')
                # Black bar: the maze wall.
                ax.plot([(barrier_range[0] - boundary_low) * rescale,
                         (barrier_range[1] - boundary_low) * rescale],
                        [(barrier_y - boundary_low) * rescale,
                         (barrier_y - boundary_low) * rescale],
                        color='k', linewidth=10)
                ax.invert_yaxis()
                plt.axis('off')
                plt.savefig(savedir + '/%s.png' % itr)
                print('Save Itr', itr)
                plt.close()
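
# Hedged entry-point sketch (assumption: this script is meant to be run
# directly; the original section did not show how main() is invoked, and the
# default latent_dim=3 below simply mirrors the signature above).
if __name__ == '__main__':
    main(latent_dim=3)
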
# Ant transfer evaluation: load a saved policy and run it on the disabled ant.
# env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False,
#                          record_log=False, force_reset=False))
env = TfEnv(
    CustomGymEnv('DisabledAnt-v0', record_video=False, record_log=False,
                 force_reset=False))

# logdir = '/home/usaywook/ext256/inverse_rl/data/ant_state_irl/itr_2999.pkl'
logdir = '/home/usaywook/ext256/inverse_rl/data/ant_transfer/itr_1500.pkl'
params = load_prior_params(logdir)
loaded_params = params['policy_params']

policy = GaussianMLPPolicy(name='policy', env_spec=env.spec,
                           hidden_sizes=(32, 32))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    if loaded_params is not None:
        # x = list(params['policy']._cached_params.values())[0]
        # y = list(params['policy']._cached_param_dtypes.values())[0]
        policy.set_param_values(loaded_params)
    # pdb.set_trace()

with tf.Session(config=get_session_config()) as sess:
    # algo = TRPO(
    #     env=env,
    #     sess=sess,
    #     policy=policy,
    #     n_itr=1,
    #     batch_size=20000,
    #     max_path_length=500,
    #     discount=0.99,
    #     store_paths=True,
    #     entropy_weight=0.1,
    #     baseline=LinearFeatureBaseline(env_spec=env.spec),
    #     exp_name=None,
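    # Hedged evaluation sketch (assumption): with the TRPO fine-tuning call
    # left commented out above, roll out the loaded policy instead. rollout()
    # is rllab's standard sampler utility (rllab.sampler.utils.rollout);
    # variables must be re-initialized and re-loaded in this fresh session.
    from rllab.sampler.utils import rollout
    sess.run(tf.global_variables_initializer())
    if loaded_params is not None:
        policy.set_param_values(loaded_params)
    paths = [rollout(env, policy, max_path_length=500) for _ in range(10)]
    print('DisabledAnt-v0 mean return over 10 rollouts:',
          np.mean([np.sum(p['rewards']) for p in paths]))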