def test_serialize_deserialize(self):
    """Round-trip a policy through `policies.serialize`/`policies.deserialize`.

    The rebuilt policy must carry the same `_action_range`, `_input_shapes`,
    `_output_shape` and `_observation_keys` as the source policy, and must
    produce actions of the same shape for observations gathered in a rollout.
    """
    action_space = self.env.action_space
    original = ContinuousUniformPolicy(
        action_range=(action_space.low, action_space.high),
        input_shapes=self.env.observation_shape,
        output_shape=self.env.action_shape,
        observation_keys=self.env.observation_keys)

    self.assertFalse(original.trainable_weights)

    restored = policies.deserialize(policies.serialize(original))

    # Compare the private attributes one by one, in a fixed order.
    for attribute in (
            '_action_range',
            '_input_shapes',
            '_output_shape',
            '_observation_keys'):
        self.assertEqual(
            getattr(restored, attribute), getattr(original, attribute))

    # Collect observations by rolling out the restored policy.
    rollout_path = sampler_utils.rollout(
        self.env,
        restored,
        path_length=10,
        break_on_terminal=False)
    rollout_observations = rollout_path['observations']

    # Only the shapes are compared, not the action values themselves.
    original_shape = original.actions(rollout_observations).numpy().shape
    restored_shape = restored.actions(rollout_observations).numpy().shape
    np.testing.assert_equal(original_shape, restored_shape)
def setUp(self):
    """Build the fixtures shared by the tests: `self.env` and `self.policy`.

    The environment comes from `get_environment('gym', 'Swimmer', 'v3', {})`;
    the policy is configured from that environment's action bounds, shapes
    and observation keys.
    """
    environment = get_environment('gym', 'Swimmer', 'v3', {})
    self.env = environment

    bounds = (
        self.env.action_space.low,
        self.env.action_space.high,
    )
    self.policy = ContinuousUniformPolicy(
        action_range=bounds,
        input_shapes=self.env.observation_shape,
        output_shape=self.env.action_shape,
        observation_keys=self.env.observation_keys)
class ContinuousUniformPolicyTest(tf.test.TestCase):
    """Tests for `ContinuousUniformPolicy`.

    Every test runs against the environment returned by
    `get_environment('gym', 'Swimmer', 'v3', {})` and a policy configured
    from that environment's action bounds, shapes and observation keys.
    """

    def setUp(self):
        """Create a fresh environment and policy for each test."""
        # Run the base-class fixture setup first; overriding `setUp` without
        # this call skips whatever `tf.test.TestCase` prepares per test.
        super().setUp()
        self.env = get_environment('gym', 'Swimmer', 'v3', {})
        self.policy = self._make_policy()

    def _make_policy(self):
        """Return a new `ContinuousUniformPolicy` configured from `self.env`."""
        return ContinuousUniformPolicy(
            action_range=(
                self.env.action_space.low,
                self.env.action_space.high,
            ),
            input_shapes=self.env.observation_shape,
            output_shape=self.env.action_shape,
            observation_keys=self.env.observation_keys)

    def _stacked_observations(self):
        """Return two consecutive observations stacked along a new axis 0.

        The first observation comes from `env.reset()`, the second from one
        step with a randomly sampled action. The result has the same mapping
        type as the environment's observation, with float32 values.
        """
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        return type(observation1_np)(
            (key, np.stack(
                (observation1_np[key], observation2_np[key]), axis=0
            ).astype(np.float32))
            for key in observation1_np.keys())

    def test_actions_and_log_probs(self):
        """Actions and log-probs work for both numpy and tensor inputs."""
        observations_np = self._stacked_observations()
        observations_tf = tree.map_structure(
            lambda x: tf.constant(x, dtype=x.dtype), observations_np)

        for observations in (observations_np, observations_tf):
            actions = self.policy.actions(observations)
            log_pis = self.policy.log_probs(observations, actions)

            # Reference value: an independent uniform distribution over the
            # environment's action bounds, with a trailing axis appended.
            expected_log_pis = tfp.distributions.Independent(
                tfp.distributions.Uniform(
                    low=self.env.action_space.low,
                    high=self.env.action_space.high,
                ),
                reinterpreted_batch_ndims=1,
            ).log_prob(actions)[..., None]

            self.assertAllEqual(log_pis, expected_log_pis)
            # Two stacked observations -> leading dimension of 2.
            self.assertEqual(actions.shape, (2, *self.env.action_shape))

    def test_env_step_with_actions(self):
        """A single sampled action can be passed straight to `env.step`."""
        observation_np = self.env.reset()
        action = self.policy.action(observation_np).numpy()
        self.env.step(action)

    def test_trainable_variables(self):
        """The policy exposes no trainable variables."""
        self.assertEqual(len(self.policy.trainable_variables), 0)

    def test_get_diagnostics(self):
        """`get_diagnostics` returns an empty `OrderedDict`."""
        observations_np = self._stacked_observations()

        diagnostics = self.policy.get_diagnostics(observations_np)

        self.assertIsInstance(diagnostics, OrderedDict)
        self.assertFalse(diagnostics)

    def test_serialize_deserialize(self):
        """A policy survives `policies.serialize` / `policies.deserialize`.

        The restored policy must keep the same private configuration
        attributes and produce actions of the same shape.
        """
        policy_1 = self._make_policy()

        self.assertFalse(policy_1.trainable_weights)

        config = policies.serialize(policy_1)
        policy_2 = policies.deserialize(config)

        self.assertEqual(policy_2._action_range, policy_1._action_range)
        self.assertEqual(policy_2._input_shapes, policy_1._input_shapes)
        self.assertEqual(policy_2._output_shape, policy_1._output_shape)
        self.assertEqual(policy_2._observation_keys,
                         policy_1._observation_keys)

        # Gather observations by rolling out the restored policy.
        path = sampler_utils.rollout(
            self.env,
            policy_2,
            path_length=10,
            break_on_terminal=False)
        observations = path['observations']

        # Only shapes are compared, not the action values themselves.
        np.testing.assert_equal(
            policy_1.actions(observations).numpy().shape,
            policy_2.actions(observations).numpy().shape)
class ContinuousUniformPolicyTest(tf.test.TestCase):
    """Tests for `ContinuousUniformPolicy`.

    Every test runs against the environment returned by
    `get_environment('gym', 'Swimmer', 'v3', {})` and a policy configured
    from that environment's shapes and observation keys.
    """

    def setUp(self):
        """Create a fresh environment and policy for each test."""
        # Run the base-class fixture setup first; overriding `setUp` without
        # this call skips whatever `tf.test.TestCase` prepares per test.
        super().setUp()
        self.env = get_environment('gym', 'Swimmer', 'v3', {})
        self.policy = ContinuousUniformPolicy(
            input_shapes=self.env.observation_shape,
            output_shape=self.env.action_shape,
            observation_keys=self.env.observation_keys)

    def _stacked_observations(self):
        """Return a dict of two consecutive observations stacked per key.

        The first observation comes from `env.reset()`, the second from one
        step with a randomly sampled action. Values are cast to float32.
        """
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        return {
            key: np.stack(
                (observation1_np[key], observation2_np[key])
            ).astype(np.float32)
            for key in observation1_np.keys()
        }

    def test_actions_and_log_pis_symbolic(self):
        """Tensor inputs yield actions; `log_pis` is not implemented."""
        observations_np = self._stacked_observations()
        observations_tf = tree.map_structure(
            lambda x: tf.constant(x, dtype=tf.float32), observations_np)

        actions = self.policy.actions(observations_tf)

        with self.assertRaises(NotImplementedError):
            self.policy.log_pis(observations_tf, actions)

        # Two stacked observations -> leading dimension of 2.
        self.assertEqual(actions.shape, (2, *self.env.action_shape))

        self.evaluate(tf.compat.v1.global_variables_initializer())

        actions_np = self.evaluate(actions)

        self.assertEqual(actions_np.shape, (2, *self.env.action_shape))

    def test_actions_and_log_pis_numeric(self):
        """Numpy inputs yield actions; `log_pis_np` is not implemented."""
        observations_np = self._stacked_observations()

        actions_np = self.policy.actions_np(observations_np)

        with self.assertRaises(NotImplementedError):
            self.policy.log_pis_np(observations_np, actions_np)

        self.assertEqual(actions_np.shape, (2, *self.env.action_shape))

    def test_env_step_with_actions(self):
        """An action sampled for one observation can be fed to `env.step`."""
        observation1_np = self.env.reset()
        # Add a leading axis of size 1 so the single observation is a batch.
        observations_np = {
            key: value[None, :] for key, value in observation1_np.items()
        }
        action = self.policy.actions_np(observations_np)[0, ...]
        self.env.step(action)

    def test_trainable_variables(self):
        """The policy exposes no trainable variables."""
        self.assertEqual(len(self.policy.trainable_variables), 0)

    def test_get_diagnostics(self):
        """`get_diagnostics` returns an empty `OrderedDict`."""
        observations_np = self._stacked_observations()

        diagnostics = self.policy.get_diagnostics(observations_np)

        self.assertIsInstance(diagnostics, OrderedDict)
        self.assertFalse(diagnostics)

    def test_serialize_deserialize(self):
        """A pickled-and-restored policy yields actions of the same shape."""
        observations_np = self._stacked_observations()

        # The pickle payload is produced in this test, not read from outside.
        deserialized = pickle.loads(pickle.dumps(self.policy))

        # Only shapes are compared, not the action values themselves.
        np.testing.assert_equal(
            self.policy.actions_np(observations_np).shape,
            deserialized.actions_np(observations_np).shape)