def test_get_action_dict_space(self):
    """Actions sampled for a dict-observation env lie in the action space."""
    env = GymEnv(DummyDictEnv(obs_space_type='box', act_space_type='box'))
    policy = GaussianLSTMPolicy(env_spec=env.spec,
                                hidden_dim=4,
                                state_include_action=False)
    policy.reset(do_resets=None)
    # GymEnv.reset() returns (observation, info); keep only the observation.
    first_obs = env.reset()[0]

    single_action, _ = policy.get_action(first_obs)
    assert env.action_space.contains(single_action)

    batch_actions, _ = policy.get_actions([first_obs, first_obs])
    assert all(env.action_space.contains(a) for a in batch_actions)
def test_get_action(self, obs_dim, action_dim, hidden_dim):
    """Single and batched sampled actions lie inside the action space."""
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = GaussianLSTMPolicy(env_spec=env.spec,
                                hidden_dim=hidden_dim,
                                state_include_action=False)
    policy.reset()
    # Recurrent policies consume flat observations.
    flat_obs = env.reset().flatten()

    single_action, _ = policy.get_action(flat_obs)
    assert env.action_space.contains(single_action)

    batch_actions, _ = policy.get_actions([flat_obs])
    assert all(env.action_space.contains(a) for a in batch_actions)
def test_get_action(self, obs_dim, action_dim, hidden_dim):
    """Sampling works after the policy graph is built explicitly."""
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    obs_var = tf.compat.v1.placeholder(
        tf.float32,
        shape=[None, None, env.observation_space.flat_dim],
        name='obs')
    policy = GaussianLSTMPolicy(env_spec=env.spec,
                                hidden_dim=hidden_dim,
                                state_include_action=False)
    # Build the network against a caller-supplied input placeholder.
    policy.build(obs_var)
    policy.reset()
    flat_obs = env.reset().flatten()

    single_action, _ = policy.get_action(flat_obs)
    assert env.action_space.contains(single_action)

    batch_actions, _ = policy.get_actions([flat_obs])
    assert all(env.action_space.contains(a) for a in batch_actions)
def test_build_state_include_action(self, obs_dim, action_dim, hidden_dim):
    """A rebuilt symbolic network matches the default one (with prev action).

    With state_include_action=True the network input is the observation
    concatenated with the previous action (zeros right after reset), so both
    graphs are fed that concatenated vector.
    """
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = GaussianLSTMPolicy(env_spec=env.spec,
                                hidden_dim=hidden_dim,
                                state_include_action=True)
    policy.reset(do_resets=None)
    obs = env.reset()

    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None,
                                                  policy.input_dim))
    dist_sym = policy.build(state_input, name='dist_sym').dist

    concat_obs = np.concatenate([obs.flatten(), np.zeros(action_dim)])
    batch = [[concat_obs], [concat_obs]]  # batch size 2, one time step each
    default_mean = self.sess.run([policy.distribution.loc],
                                 feed_dict={policy.model.input: batch})
    rebuilt_mean = self.sess.run([dist_sym.loc],
                                 feed_dict={state_input: batch})
    assert np.array_equal(default_mean, rebuilt_mean)
def test_build_state_not_include_action(self, obs_dim, action_dim,
                                        hidden_dim):
    """Two builds of the same policy produce identical means."""
    env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = GaussianLSTMPolicy(env_spec=env.spec,
                                hidden_dim=hidden_dim,
                                state_include_action=False)
    policy.reset(do_resets=None)
    obs = env.reset()

    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None,
                                                  policy.input_dim))
    # Both networks share parameters, so their outputs must agree exactly.
    dist_sym = policy.build(state_input, name='dist_sym').dist
    dist_sym2 = policy.build(state_input, name='dist_sym2').dist

    batch = [[obs.flatten()], [obs.flatten()]]  # batch size 2
    first_mean = self.sess.run([dist_sym.loc],
                               feed_dict={state_input: batch})
    second_mean = self.sess.run([dist_sym2.loc],
                                feed_dict={state_input: batch})
    assert np.array_equal(first_mean, second_mean)
def test_dist_info_sym_wrong_input(self):
    """A batch-size mismatch between obs and prev_action fails at run time."""
    env = TfEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))
    obs_ph = tf.compat.v1.placeholder(
        tf.float32, shape=(None, None, env.observation_space.flat_dim))
    with mock.patch(('garage.tf.policies.'
                     'gaussian_lstm_policy.GaussianLSTMModel'),
                    new=SimpleGaussianLSTMModel):
        policy = GaussianLSTMPolicy(env_spec=env.spec,
                                    state_include_action=True)
        policy.reset()
        obs = env.reset()
        # prev_action is batch size 3 here ...
        policy.dist_info_sym(
            obs_var=obs_ph,
            state_info_vars={'prev_action': np.zeros((3, 1, 1))},
            name='p2_sym')
        # ... but the observations fed below are batch size 2, so TF must
        # reject the concatenation.
        with pytest.raises(tf.errors.InvalidArgumentError):
            self.sess.run(
                policy.model.networks['p2_sym'].input,
                feed_dict={obs_ph: [[obs.flatten()], [obs.flatten()]]})
def test_get_action_state_include_action(self, mock_normal, obs_dim,
                                         action_dim, hidden_dim):
    """With mocked noise, sampled actions and agent infos are deterministic.

    The simple model outputs mean = log_std = 0.5, so with noise fixed at
    0.5 the sampled action is mean + exp(log_std) * noise.
    """
    mock_normal.return_value = 0.5
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.policies.'
                     'gaussian_lstm_policy.GaussianLSTMModel'),
                    new=SimpleGaussianLSTMModel):
        policy = GaussianLSTMPolicy(env_spec=env.spec,
                                    state_include_action=True)
        policy.reset()
        obs = env.reset()

        want_action = np.full(action_dim, 0.5 * np.exp(0.5) + 0.5)
        want_mean = np.full(action_dim, 0.5)
        want_log_std = np.full(action_dim, 0.5)
        # First step after reset: previous action is all zeros.
        want_prev_action = np.full(action_dim, 0)

        action, agent_info = policy.get_action(obs)
        assert env.action_space.contains(action)
        assert np.allclose(action, want_action, atol=1e-6)
        assert np.array_equal(agent_info['mean'], want_mean)
        assert np.array_equal(agent_info['log_std'], want_log_std)
        assert np.array_equal(agent_info['prev_action'], want_prev_action)

        policy.reset()
        actions, agent_infos = policy.get_actions([obs])
        for act, mean, log_std, prev in zip(actions, agent_infos['mean'],
                                            agent_infos['log_std'],
                                            agent_infos['prev_action']):
            assert env.action_space.contains(act)
            assert np.allclose(act,
                               np.full(action_dim, want_action),
                               atol=1e-6)
            assert np.array_equal(mean, want_mean)
            assert np.array_equal(log_std, want_log_std)
            assert np.array_equal(prev, want_prev_action)
class TestGaussianLSTMPolicyWithModelTransit(TfGraphTestCase):
    """Transition tests comparing GaussianLSTMPolicy to its model-based twin.

    Builds two pairs of policies with identical initializers (P1/P3 and
    P2/P4, the second pair with output weights initialized to 2) and asserts
    that the legacy and model-based implementations agree on symbolic
    distributions, sampled actions, KL, log-likelihood, entropy, and
    likelihood ratios.
    """

    def setup_method(self):
        """Construct the four policies and their shared symbolic inputs."""
        super().setup_method()
        # tf.random.normal is mocked during graph construction so any
        # sampling baked into the graphs is deterministic (0.5).
        with mock.patch('tensorflow.random.normal') as mock_rand:
            mock_rand.return_value = 0.5
            env = TfEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))
            self.default_initializer = tf.constant_initializer(1)
            self.default_hidden_nonlinearity = tf.nn.tanh
            self.default_recurrent_nonlinearity = tf.nn.sigmoid
            self.default_output_nonlinearity = None
            self.time_step = 1
            self.policy1 = GaussianLSTMPolicy(
                env_spec=env.spec,
                hidden_dim=4,
                hidden_nonlinearity=self.default_hidden_nonlinearity,
                recurrent_nonlinearity=self.default_recurrent_nonlinearity,
                recurrent_w_x_init=self.default_initializer,
                recurrent_w_h_init=self.default_initializer,
                output_nonlinearity=self.default_output_nonlinearity,
                output_w_init=self.default_initializer,
                state_include_action=True,
                name='P1')
            # P2 differs from P1 only in output weight init (2 instead of 1).
            self.policy2 = GaussianLSTMPolicy(
                env_spec=env.spec,
                hidden_dim=4,
                hidden_nonlinearity=self.default_hidden_nonlinearity,
                recurrent_nonlinearity=self.default_recurrent_nonlinearity,
                recurrent_w_x_init=self.default_initializer,
                recurrent_w_h_init=self.default_initializer,
                output_nonlinearity=self.default_output_nonlinearity,
                output_w_init=tf.constant_initializer(2),
                state_include_action=True,
                name='P2')
            # NOTE(review): the initializer run sits between the legacy and
            # model-based policy constructions — presumably the WithModel
            # policies initialize their own variables; confirm before moving.
            self.sess.run(tf.global_variables_initializer())
            self.policy3 = GaussianLSTMPolicyWithModel(
                env_spec=env.spec,
                hidden_dim=4,
                hidden_nonlinearity=self.default_hidden_nonlinearity,
                hidden_w_init=self.default_initializer,
                recurrent_nonlinearity=self.default_recurrent_nonlinearity,
                recurrent_w_init=self.default_initializer,
                output_nonlinearity=self.default_output_nonlinearity,
                output_w_init=self.default_initializer,
                state_include_action=True,
                name='P3')
            # P4 mirrors P2 (output weights initialized to 2).
            self.policy4 = GaussianLSTMPolicyWithModel(
                env_spec=env.spec,
                hidden_dim=4,
                hidden_nonlinearity=self.default_hidden_nonlinearity,
                hidden_w_init=self.default_initializer,
                recurrent_nonlinearity=self.default_recurrent_nonlinearity,
                recurrent_w_init=self.default_initializer,
                output_nonlinearity=self.default_output_nonlinearity,
                output_w_init=tf.constant_initializer(2),
                state_include_action=True,
                name='P4')

        self.policy1.reset()
        self.policy2.reset()
        self.policy3.reset()
        self.policy4.reset()
        # Replicate the single reset observation over the time dimension.
        self.obs = [env.reset()]
        self.obs = np.concatenate(
            [self.obs for _ in range(self.time_step)], axis=0)

        self.obs_ph = tf.placeholder(
            tf.float32, shape=(None, None, env.observation_space.flat_dim))
        self.action_ph = tf.placeholder(tf.float32,
                                        shape=(None, None,
                                               env.action_space.flat_dim))

        # One symbolic dist per policy, all fed a zero previous action
        # (batch size 2 is assumed by every test below).
        self.dist1_sym = self.policy1.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={
                'prev_action': np.zeros((2, self.time_step, 1))
            },
            name='p1_sym')
        self.dist2_sym = self.policy2.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={
                'prev_action': np.zeros((2, self.time_step, 1))
            },
            name='p2_sym')
        self.dist3_sym = self.policy3.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={
                'prev_action': np.zeros((2, self.time_step, 1))
            },
            name='p3_sym')
        self.dist4_sym = self.policy4.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={
                'prev_action': np.zeros((2, self.time_step, 1))
            },
            name='p4_sym')

    def test_dist_info_sym_output(self):
        """Legacy and model-based policies emit identical dist_info tensors."""
        # batch size = 2
        dist1 = self.sess.run(self.dist1_sym,
                              feed_dict={self.obs_ph: [self.obs, self.obs]})
        dist2 = self.sess.run(self.dist2_sym,
                              feed_dict={self.obs_ph: [self.obs, self.obs]})
        dist3 = self.sess.run(self.dist3_sym,
                              feed_dict={self.obs_ph: [self.obs, self.obs]})
        dist4 = self.sess.run(self.dist4_sym,
                              feed_dict={self.obs_ph: [self.obs, self.obs]})

        assert np.array_equal(dist1['mean'], dist3['mean'])
        assert np.array_equal(dist1['log_std'], dist3['log_std'])
        assert np.array_equal(dist2['mean'], dist4['mean'])
        assert np.array_equal(dist2['log_std'], dist4['log_std'])

    @mock.patch('numpy.random.normal')
    def test_get_action(self, mock_rand):
        """Matching policies sample identical actions with fixed noise."""
        mock_rand.return_value = 0.5
        action1, agent_info1 = self.policy1.get_action(self.obs)
        action2, agent_info2 = self.policy2.get_action(self.obs)
        action3, agent_info3 = self.policy3.get_action(self.obs)
        action4, agent_info4 = self.policy4.get_action(self.obs)

        assert np.array_equal(action1, action3)
        assert np.array_equal(action2, action4)
        assert np.array_equal(agent_info1['mean'], agent_info3['mean'])
        assert np.array_equal(agent_info1['log_std'], agent_info3['log_std'])
        assert np.array_equal(agent_info2['mean'], agent_info4['mean'])
        assert np.array_equal(agent_info2['log_std'], agent_info4['log_std'])

        actions1, agent_infos1 = self.policy1.get_actions([self.obs])
        actions2, agent_infos2 = self.policy2.get_actions([self.obs])
        actions3, agent_infos3 = self.policy3.get_actions([self.obs])
        actions4, agent_infos4 = self.policy4.get_actions([self.obs])

        assert np.array_equal(actions1, actions3)
        assert np.array_equal(actions2, actions4)
        assert np.array_equal(agent_infos1['mean'], agent_infos3['mean'])
        assert np.array_equal(agent_infos1['log_std'],
                              agent_infos3['log_std'])
        assert np.array_equal(agent_infos2['mean'], agent_infos4['mean'])
        assert np.array_equal(agent_infos2['log_std'],
                              agent_infos4['log_std'])

    def test_kl_sym(self):
        """KL(P1 || P2) equals KL(P3 || P4)."""
        kl_diff_sym1 = self.policy1.distribution.kl_sym(
            self.dist1_sym, self.dist2_sym)
        objective1 = tf.reduce_mean(kl_diff_sym1)
        kl_func = tensor_utils.compile_function([self.obs_ph], objective1)
        kl1 = kl_func([self.obs, self.obs])

        kl_diff_sym2 = self.policy3.distribution.kl_sym(
            self.dist3_sym, self.dist4_sym)
        objective2 = tf.reduce_mean(kl_diff_sym2)
        kl_func = tensor_utils.compile_function([self.obs_ph], objective2)
        kl2 = kl_func([self.obs, self.obs])

        assert np.array_equal(kl1, kl2)

    # NOTE(review): "likehihood" in this name looks like a typo for
    # "likelihood" — renaming would change which test id pytest reports.
    def test_log_likehihood_sym(self):
        """Log-likelihoods agree across both policy pairs."""
        log_prob_sym1 = self.policy1.distribution.log_likelihood_sym(
            self.action_ph, self.dist1_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func([self.obs, self.obs],
                                  np.ones((2, self.time_step, 1)))
        log_prob_sym2 = self.policy3.distribution.log_likelihood_sym(
            self.action_ph, self.dist3_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2([self.obs, self.obs],
                                   np.ones((2, self.time_step, 1)))
        assert np.array_equal(log_prob1, log_prob2)

        log_prob_sym1 = self.policy2.distribution.log_likelihood_sym(
            self.action_ph, self.dist2_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func([self.obs, self.obs],
                                  np.ones((2, self.time_step, 1)))
        log_prob_sym2 = self.policy4.distribution.log_likelihood_sym(
            self.action_ph, self.dist4_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2([self.obs, self.obs],
                                   np.ones((2, self.time_step, 1)))
        assert np.array_equal(log_prob1, log_prob2)

    def test_policy_entropy_sym(self):
        """Entropies of P1 and P3 distributions are identical."""
        entropy_sym1 = self.policy1.distribution.entropy_sym(
            self.dist1_sym, name='entropy_sym1')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym1)
        entropy1 = entropy_func([self.obs, self.obs])

        # NOTE(review): the name 'entropy_sym1' is reused here — presumably
        # harmless for these ops, but 'entropy_sym2' may have been intended.
        entropy_sym2 = self.policy3.distribution.entropy_sym(
            self.dist3_sym, name='entropy_sym1')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym2)
        entropy2 = entropy_func([self.obs, self.obs])
        assert np.array_equal(entropy1, entropy2)

    def test_likelihood_ratio_sym(self):
        """Likelihood ratios (P1 vs P2) match (P3 vs P4)."""
        likelihood_ratio_sym1 = self.policy1.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist1_sym,
            self.dist2_sym,
            name='li_ratio_sym1')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym1)
        likelihood_ratio1 = likelihood_ratio_func(np.ones((2, 1, 1)),
                                                  [self.obs, self.obs])

        likelihood_ratio_sym2 = self.policy3.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist3_sym,
            self.dist4_sym,
            name='li_ratio_sym2')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym2)
        likelihood_ratio2 = likelihood_ratio_func(np.ones((2, 1, 1)),
                                                  [self.obs, self.obs])

        assert np.array_equal(likelihood_ratio1, likelihood_ratio2)