# NOTE: these fragments span several test modules. The stdlib/third-party
# imports below are reconstructed; the metarl and fixture imports (TfEnv,
# normalize, the policies, Q-functions, and the Simple*/Dummy* test doubles)
# follow the package paths visible in the mock.patch targets but are assumed.
import pickle
from unittest import mock

import gym
import numpy as np
import tensorflow as tf


class TestQfDerivedPolicy(TfGraphTestCase):

    def setup_method(self):
        super().setup_method()
        self.env = TfEnv(DummyDiscreteEnv())
        self.qf = SimpleQFunction(self.env.spec)
        self.policy = DiscreteQfDerivedPolicy(env_spec=self.env.spec,
                                              qf=self.qf)
        self.sess.run(tf.compat.v1.global_variables_initializer())
        self.env.reset()

    def test_discrete_qf_derived_policy(self):
        obs, _, _, _ = self.env.step(1)
        action, _ = self.policy.get_action(obs)
        assert self.env.action_space.contains(action)

        actions, _ = self.policy.get_actions([obs])
        for action in actions:
            assert self.env.action_space.contains(action)

    def test_is_pickleable(self):
        with tf.compat.v1.variable_scope('SimpleQFunction/SimpleMLPModel',
                                         reuse=True):
            return_var = tf.compat.v1.get_variable('return_var')
        # assign it to all ones
        return_var.load(tf.ones_like(return_var).eval())

        obs, _, _, _ = self.env.step(1)
        action1, _ = self.policy.get_action(obs)

        p = pickle.dumps(self.policy)
        with tf.compat.v1.Session(graph=tf.Graph()):
            policy_pickled = pickle.loads(p)
            action2, _ = policy_pickled.get_action(obs)
            assert action1 == action2
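
# For context: the tests in these fragments run under TfGraphTestCase, which
# owns the `self.sess` they use. A minimal sketch of what such a base class
# provides (an assumption for illustration; the real fixture ships with the
# test suite):
class _TfGraphTestCaseSketch:

    def setup_method(self):
        # fresh graph + session per test, installed as the defaults
        self._graph_ctx = tf.Graph().as_default()
        self._graph_ctx.__enter__()
        self.sess = tf.compat.v1.Session()
        self.sess.__enter__()

    def teardown_method(self):
        self.sess.__exit__(None, None, None)
        self._graph_ctx.__exit__(None, None, None)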

def test_get_qval_sym(self, obs_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.q_functions.'
                     'continuous_mlp_q_function.MLPMergeModel'),
                    new=SimpleMLPMergeModel):
        qf = ContinuousMLPQFunction(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)
    obs = obs.flatten()
    act = np.full(action_dim, 0.5).flatten()

    obs_ph, act_ph = qf.inputs
    output1 = qf.get_qval([obs], [act])

    input_var1 = tf.compat.v1.placeholder(tf.float32,
                                          shape=(None, obs.shape[0]))
    input_var2 = tf.compat.v1.placeholder(tf.float32,
                                          shape=(None, act.shape[0]))
    q_vals = qf.get_qval_sym(input_var1, input_var2, 'another')
    output2 = self.sess.run(q_vals,
                            feed_dict={
                                input_var1: [obs],
                                input_var2: [act]
                            })

    expected_output = np.full((1, ), 0.5)
    assert np.array_equal(output1, output2)
    assert np.array_equal(output2[0], expected_output)

def test_is_pickleable(self, obs_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.policies.'
                     'gaussian_mlp_policy.GaussianMLPModel'),
                    new=SimpleGaussianMLPModel):
        policy = GaussianMLPPolicy(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)
    obs_dim = env.spec.observation_space.flat_dim

    with tf.compat.v1.variable_scope('GaussianMLPPolicy/GaussianMLPModel',
                                     reuse=True):
        return_var = tf.compat.v1.get_variable('return_var')
    # assign it to all ones
    return_var.load(tf.ones_like(return_var).eval())

    output1 = self.sess.run(
        policy.model.outputs[:-1],
        feed_dict={policy.model.input: [obs.flatten()]})

    p = pickle.dumps(policy)
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        policy_pickled = pickle.loads(p)
        output2 = sess.run(
            policy_pickled.model.outputs[:-1],
            feed_dict={policy_pickled.model.input: [obs.flatten()]})
        assert np.array_equal(output1, output2)

def test_dist_info_sym(self, obs_dim, action_dim, filter_dims, filter_sizes,
                       strides, padding, hidden_sizes):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.policies.'
                     'categorical_cnn_policy.MLPModel'),
                    new=SimpleMLPModel):
        with mock.patch(('metarl.tf.policies.'
                         'categorical_cnn_policy.CNNModel'),
                        new=SimpleCNNModel):
            policy = CategoricalCNNPolicy(env_spec=env.spec,
                                          conv_filters=filter_dims,
                                          conv_filter_sizes=filter_sizes,
                                          conv_strides=strides,
                                          conv_pad=padding,
                                          hidden_sizes=hidden_sizes)
    env.reset()
    obs, _, _, _ = env.step(1)

    expected_prob = np.full(action_dim, 0.5)

    obs_dim = env.spec.observation_space.shape
    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, ) + obs_dim)
    dist1 = policy.dist_info_sym(state_input, name='policy2')
    prob = self.sess.run(dist1['prob'], feed_dict={state_input: [obs]})
    assert np.array_equal(prob[0], expected_prob)

def test_is_pickleable(self, obs_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.q_functions.'
                     'continuous_mlp_q_function.MLPMergeModel'),
                    new=SimpleMLPMergeModel):
        qf = ContinuousMLPQFunction(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)
    obs = obs.flatten()
    act = np.full(action_dim, 0.5).flatten()
    obs_ph, act_ph = qf.inputs

    with tf.compat.v1.variable_scope(
            'ContinuousMLPQFunction/SimpleMLPMergeModel', reuse=True):
        return_var = tf.compat.v1.get_variable('return_var')
    # assign it to all ones
    return_var.load(tf.ones_like(return_var).eval())

    output1 = qf.get_qval([obs], [act])

    h_data = pickle.dumps(qf)
    with tf.compat.v1.Session(graph=tf.Graph()):
        qf_pickled = pickle.loads(h_data)
        obs_ph_pickled, act_ph_pickled = qf_pickled.inputs
        output2 = qf_pickled.get_qval([obs], [act])

    assert np.array_equal(output1, output2)

def test_is_pickleable(self, mock_rand, obs_dim, action_dim):
    mock_rand.return_value = 0
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.policies.'
                     'categorical_cnn_policy.MLPModel'),
                    new=SimpleMLPModel):
        with mock.patch(('metarl.tf.policies.'
                         'categorical_cnn_policy.CNNModel'),
                        new=SimpleCNNModel):
            policy = CategoricalCNNPolicy(env_spec=env.spec,
                                          conv_filters=(32, ),
                                          conv_filter_sizes=(3, ),
                                          conv_strides=(1, ),
                                          conv_pad='SAME',
                                          hidden_sizes=(4, ))
    env.reset()
    obs, _, _, _ = env.step(1)

    with tf.compat.v1.variable_scope(
            'CategoricalCNNPolicy/Sequential/MLPModel', reuse=True):
        return_var = tf.compat.v1.get_variable('return_var')
    # assign it to all ones
    return_var.load(tf.ones_like(return_var).eval())

    output1 = self.sess.run(policy.model.outputs,
                            feed_dict={policy.model.input: [obs]})

    p = pickle.dumps(policy)
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        policy_pickled = pickle.loads(p)
        output2 = sess.run(policy_pickled.model.outputs,
                           feed_dict={policy_pickled.model.input: [obs]})
        assert np.array_equal(output1, output2)
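
# The `mock_rand` parameter above implies patch/parametrize decorators that
# sit outside this fragment. A plausible reconstruction (the patch target and
# the parameter values are guesses, not taken from the source):
#
# @pytest.mark.parametrize('obs_dim, action_dim', [((10, 10, 3), 4)])
# @mock.patch('numpy.random.choice')
# def test_is_pickleable(self, mock_rand, obs_dim, action_dim):
#     ...
#
# The same pattern applies to the other tests that receive mock_rand.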

def test_get_action(self, mock_rand, obs_dim, action_dim, filter_dims,
                    filter_sizes, strides, padding, hidden_sizes):
    mock_rand.return_value = 0
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.policies.'
                     'categorical_cnn_policy.MLPModel'),
                    new=SimpleMLPModel):
        with mock.patch(('metarl.tf.policies.'
                         'categorical_cnn_policy.CNNModel'),
                        new=SimpleCNNModel):
            policy = CategoricalCNNPolicy(env_spec=env.spec,
                                          conv_filters=filter_dims,
                                          conv_filter_sizes=filter_sizes,
                                          conv_strides=strides,
                                          conv_pad=padding,
                                          hidden_sizes=hidden_sizes)
    env.reset()
    obs, _, _, _ = env.step(1)

    action, prob = policy.get_action(obs)
    expected_prob = np.full(action_dim, 0.5)

    assert env.action_space.contains(action)
    assert action == 0
    assert np.array_equal(prob['prob'], expected_prob)

    actions, probs = policy.get_actions([obs, obs, obs])
    for action, prob in zip(actions, probs['prob']):
        assert env.action_space.contains(action)
        assert action == 0
        assert np.array_equal(prob, expected_prob)

def test_get_action(self, mock_normal, obs_dim, action_dim):
    mock_normal.return_value = 0.5
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.policies.'
                     'gaussian_mlp_policy.GaussianMLPModel'),
                    new=SimpleGaussianMLPModel):
        policy = GaussianMLPPolicy(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)

    action, prob = policy.get_action(obs)

    expected_action = np.full(action_dim, 0.75)
    expected_mean = np.full(action_dim, 0.5)
    expected_log_std = np.full(action_dim, np.log(0.5))

    assert env.action_space.contains(action)
    assert np.array_equal(action, expected_action)
    assert np.array_equal(prob['mean'], expected_mean)
    assert np.array_equal(prob['log_std'], expected_log_std)

    actions, probs = policy.get_actions([obs, obs, obs])
    for action, mean, log_std in zip(actions, probs['mean'],
                                     probs['log_std']):
        assert env.action_space.contains(action)
        assert np.array_equal(action, expected_action)
        # check each batch entry, not the single-action `prob` from above
        assert np.array_equal(mean, expected_mean)
        assert np.array_equal(log_std, expected_log_std)
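
# `mock_normal` above likewise suggests a decorator along the lines of
# `@mock.patch('numpy.random.normal')` (an assumption). With the sampled
# noise pinned to 0.5, the reparameterized action mean + std * noise works
# out to 0.5 + 0.5 * 0.5 = 0.75, which is exactly `expected_action`.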

def test_output_shape_dueling(self, obs_dim, action_dim):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.q_functions.'
                     'discrete_mlp_q_function.MLPDuelingModel'),
                    new=SimpleMLPModel):
        qf = DiscreteMLPQFunction(env_spec=env.spec, dueling=True)
    env.reset()
    obs, _, _, _ = env.step(1)

    outputs = self.sess.run(qf.q_vals, feed_dict={qf.input: [obs]})
    assert outputs.shape == (1, action_dim)

class TestNormalizedGym:

    def setup_method(self):
        self.env = TfEnv(
            normalize(gym.make('Pendulum-v0'),
                      normalize_reward=True,
                      normalize_obs=True,
                      flatten_obs=True))

    def teardown_method(self):
        self.env.close()

    def test_does_not_modify_action(self):
        a = self.env.action_space.sample()
        a_copy = a.copy()  # real copy, so the check is not trivially true
        self.env.reset()
        self.env.step(a)
        assert np.array_equal(a, a_copy)

    def test_flatten(self):
        for _ in range(10):
            self.env.reset()
            for _ in range(5):
                self.env.render()
                action = self.env.action_space.sample()
                next_obs, _, done, _ = self.env.step(action)
                assert next_obs.shape == self.env.observation_space.low.shape
                if done:
                    break

    def test_unflatten(self):
        for _ in range(10):
            self.env.reset()
            for _ in range(5):
                action = self.env.action_space.sample()
                next_obs, _, done, _ = self.env.step(action)
                # flat_dim is an int, so compare against a 1-tuple
                assert (self.env.observation_space.flatten(next_obs).shape ==
                        (self.env.observation_space.flat_dim, ))
                if done:
                    break
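
# For intuition, a self-contained sketch of the kind of running-statistics
# whitening that a `normalize(..., normalize_obs=True)` wrapper applies. This
# illustrates the technique only; it is not metarl's implementation:
class RunningObsNorm:
    """Incrementally track mean/variance and whiten observations."""

    def __init__(self, shape, eps=1e-8):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = 0
        self.eps = eps

    def normalize(self, obs):
        # online update of the running mean and (biased) variance
        self.count += 1
        delta = obs - self.mean
        self.mean = self.mean + delta / self.count
        self.var = self.var + (delta * (obs - self.mean) -
                               self.var) / self.count
        return (obs - self.mean) / np.sqrt(self.var + self.eps)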

def test_dist_info(self, obs_dim, action_dim):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.policies.'
                     'categorical_mlp_policy.MLPModel'),
                    new=SimpleMLPModel):
        policy = CategoricalMLPPolicy(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)

    expected_prob = np.full(action_dim, 0.5)

    policy_probs = policy.dist_info([obs.flatten()])
    assert np.array_equal(policy_probs['prob'][0], expected_prob)

def test_output_shape(self, obs_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.q_functions.'
                     'continuous_mlp_q_function.MLPMergeModel'),
                    new=SimpleMLPMergeModel):
        qf = ContinuousMLPQFunction(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)
    obs = obs.flatten()
    act = np.full(action_dim, 0.5).flatten()

    obs_ph, act_ph = qf.inputs
    outputs = qf.get_qval([obs], [act])
    assert outputs.shape == (1, 1)

def test_get_action(self, obs_dim, action_dim):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.q_functions.'
                     'discrete_mlp_q_function.MLPModel'),
                    new=SimpleMLPModel):
        qf = DiscreteMLPQFunction(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)

    expected_output = np.full(action_dim, 0.5)

    outputs = self.sess.run(qf.q_vals, feed_dict={qf.input: [obs]})
    assert np.array_equal(outputs[0], expected_output)

    outputs = self.sess.run(qf.q_vals,
                            feed_dict={qf.input: [obs, obs, obs]})
    for output in outputs:
        assert np.array_equal(output, expected_output)

def test_dist_info_sym(self, obs_dim, action_dim):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.policies.'
                     'categorical_mlp_policy.MLPModel'),
                    new=SimpleMLPModel):
        policy = CategoricalMLPPolicy(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)

    expected_prob = np.full(action_dim, 0.5)

    obs_dim = env.spec.observation_space.flat_dim
    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, obs_dim))
    dist1 = policy.dist_info_sym(state_input, name='policy2')
    prob = self.sess.run(dist1['prob'],
                         feed_dict={state_input: [obs.flatten()]})
    assert np.array_equal(prob[0], expected_prob)

def test_get_qval_sym(self, obs_dim, action_dim):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.q_functions.'
                     'discrete_mlp_q_function.MLPModel'),
                    new=SimpleMLPModel):
        qf = DiscreteMLPQFunction(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)

    output1 = self.sess.run(qf.q_vals, feed_dict={qf.input: [obs]})

    input_var = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, ) + obs_dim)
    q_vals = qf.get_qval_sym(input_var, 'another')
    output2 = self.sess.run(q_vals, feed_dict={input_var: [obs]})

    expected_output = np.full(action_dim, 0.5)
    assert np.array_equal(output1, output2)
    assert np.array_equal(output2[0], expected_output)

def test_q_vals(self, obs_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.q_functions.'
                     'continuous_mlp_q_function.MLPMergeModel'),
                    new=SimpleMLPMergeModel):
        qf = ContinuousMLPQFunction(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)
    obs = obs.flatten()
    act = np.full(action_dim, 0.5).flatten()

    expected_output = np.full((1, ), 0.5)

    obs_ph, act_ph = qf.inputs
    outputs = qf.get_qval([obs], [act])
    assert np.array_equal(outputs[0], expected_output)

    outputs = qf.get_qval([obs, obs, obs], [act, act, act])
    for output in outputs:
        assert np.array_equal(output, expected_output)

def test_q_vals_input_include_goal(self):
    env = TfEnv(DummyDictEnv())
    with mock.patch(('metarl.tf.q_functions.'
                     'continuous_mlp_q_function.MLPMergeModel'),
                    new=SimpleMLPMergeModel):
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    input_include_goal=True)
    env.reset()
    obs, _, _, _ = env.step(1)
    # with input_include_goal, the Q-function consumes observation and goal
    obs = np.concatenate((obs['observation'], obs['desired_goal']), axis=-1)
    act = np.full((1, ), 0.5).flatten()

    expected_output = np.full((1, ), 0.5)

    obs_ph, act_ph = qf.inputs
    outputs = qf.get_qval([obs], [act])
    assert np.array_equal(outputs[0], expected_output)

    outputs = qf.get_qval([obs, obs, obs], [act, act, act])
    for output in outputs:
        assert np.array_equal(output, expected_output)

def test_dist_info_sym(self, obs_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.policies.'
                     'gaussian_mlp_policy.GaussianMLPModel'),
                    new=SimpleGaussianMLPModel):
        policy = GaussianMLPPolicy(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)

    obs_dim = env.spec.observation_space.flat_dim
    obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, obs_dim))

    dist1_sym = policy.dist_info_sym(obs_ph, name='p1_sym')

    # flatten output
    expected_mean = [np.full(np.prod(action_dim), 0.5)]
    expected_log_std = [np.full(np.prod(action_dim), np.log(0.5))]

    prob = self.sess.run(dist1_sym, feed_dict={obs_ph: [obs.flatten()]})
    assert np.array_equal(prob['mean'], expected_mean)
    assert np.array_equal(prob['log_std'], expected_log_std)

def test_dist_info(self, obs_dim, action_dim, filter_dims, filter_sizes,
                   strides, padding, hidden_sizes):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.policies.'
                     'categorical_cnn_policy.MLPModel'),
                    new=SimpleMLPModel):
        with mock.patch(('metarl.tf.policies.'
                         'categorical_cnn_policy.CNNModel'),
                        new=SimpleCNNModel):
            policy = CategoricalCNNPolicy(env_spec=env.spec,
                                          conv_filters=filter_dims,
                                          conv_filter_sizes=filter_sizes,
                                          conv_strides=strides,
                                          conv_pad=padding,
                                          hidden_sizes=hidden_sizes)
    env.reset()
    obs, _, _, _ = env.step(1)

    expected_prob = np.full(action_dim, 0.5)

    policy_probs = policy.dist_info([obs])
    assert np.array_equal(policy_probs['prob'][0], expected_prob)

def test_get_action(self, mock_rand, obs_dim, action_dim):
    mock_rand.return_value = 0
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.policies.'
                     'categorical_mlp_policy.MLPModel'),
                    new=SimpleMLPModel):
        policy = CategoricalMLPPolicy(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)

    action, prob = policy.get_action(obs)
    expected_prob = np.full(action_dim, 0.5)

    assert env.action_space.contains(action)
    assert action == 0
    assert np.array_equal(prob['prob'], expected_prob)

    actions, probs = policy.get_actions([obs, obs, obs])
    for action, prob in zip(actions, probs['prob']):
        assert env.action_space.contains(action)
        assert action == 0
        assert np.array_equal(prob, expected_prob)

def test_get_action_sym(self, obs_dim, action_dim):
    """Test get_action_sym method."""
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.policies.'
                     'continuous_mlp_policy.MLPModel'),
                    new=SimpleMLPModel):
        policy = ContinuousMLPPolicy(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)

    obs_dim = env.spec.observation_space.flat_dim
    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, obs_dim))
    action_sym = policy.get_action_sym(state_input, name='action_sym')

    expected_action = np.full(action_dim, 0.5)

    action = self.sess.run(action_sym,
                           feed_dict={state_input: [obs.flatten()]})
    action = policy.action_space.unflatten(action)

    assert np.array_equal(action, expected_action)
    assert env.action_space.contains(action)

def test_get_action(self, obs_dim, action_dim):
    """Test get_action method."""
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.policies.'
                     'continuous_mlp_policy.MLPModel'),
                    new=SimpleMLPModel):
        policy = ContinuousMLPPolicy(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)

    action, _ = policy.get_action(obs.flatten())

    expected_action = np.full(action_dim, 0.5)

    assert env.action_space.contains(action)
    assert np.array_equal(action, expected_action)

    actions, _ = policy.get_actions(
        [obs.flatten(), obs.flatten(), obs.flatten()])
    for action in actions:
        assert env.action_space.contains(action)
        assert np.array_equal(action, expected_action)

def test_is_pickleable(self, obs_dim, action_dim):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('metarl.tf.q_functions.'
                     'discrete_mlp_q_function.MLPModel'),
                    new=SimpleMLPModel):
        qf = DiscreteMLPQFunction(env_spec=env.spec)
    env.reset()
    obs, _, _, _ = env.step(1)

    with tf.compat.v1.variable_scope(
            'DiscreteMLPQFunction/SimpleMLPModel', reuse=True):
        return_var = tf.compat.v1.get_variable('return_var')
    # assign it to all ones
    return_var.load(tf.ones_like(return_var).eval())

    output1 = self.sess.run(qf.q_vals, feed_dict={qf.input: [obs]})

    h_data = pickle.dumps(qf)
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        qf_pickled = pickle.loads(h_data)
        output2 = sess.run(qf_pickled.q_vals,
                           feed_dict={qf_pickled.input: [obs]})
        assert np.array_equal(output1, output2)