def _setup_maddpg_independent(self, scope):
    """Perform the independent form of MADDPG setup."""
    self.all_obs_ph = {}
    self.all_obs1_ph = {}
    self.all_action_ph = {}
    self.replay_buffer = {}
    self.terminals1 = {}
    self.rew_ph = {}
    self.action_ph = {}
    self.obs_ph = {}
    self.obs1_ph = {}
    self.actor_tf = {}
    self.critic_tf = {}
    self.actor_target = {}
    actors = []
    actor_targets = []

    # The size of the full action space.
    all_ac_dim = sum(
        self.ac_space[key].shape[0] for key in self.ac_space.keys())

    # Iterate through the keys in sorted order so that the observations
    # and actions of the full state are concatenated consistently across
    # agents.
    for key in sorted(self.ob_space.keys()):
        # Compute the shape of the input observation space, which may
        # include the contextual term.
        ob_dim = self._get_ob_dim(
            self.ob_space[key],
            None if self.co_space is None else self.co_space[key])

        # Create a replay buffer object.
        self.replay_buffer[key] = MultiReplayBuffer(
            buffer_size=self.buffer_size,
            batch_size=self.batch_size,
            obs_dim=ob_dim[0],
            ac_dim=self.ac_space[key].shape[0],
            all_obs_dim=self.all_ob_space.shape[0],
            all_ac_dim=all_ac_dim,
        )

        with tf.compat.v1.variable_scope(key, reuse=False):
            # Create input placeholders for the full state observations.
            self.all_obs_ph[key] = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None,) + self.all_ob_space.shape,
                name='all_obs')
            self.all_obs1_ph[key] = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None,) + self.all_ob_space.shape,
                name='all_obs1')

            # Create an input placeholder for the full actions.
            self.all_action_ph[key] = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, all_ac_dim),
                name='all_actions')

            # Create input variables.
            self.terminals1[key] = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, 1),
                name='terminals1')
            self.rew_ph[key] = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, 1),
                name='rewards')
            self.action_ph[key] = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None,) + self.ac_space[key].shape,
                name='actions')
            self.obs_ph[key] = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None,) + ob_dim,
                name='obs0')
            self.obs1_ph[key] = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None,) + ob_dim,
                name='obs1')

            # Create actor and critic networks for the current agent.
            actor_tf, critic_tf, noisy_actor_target = self._setup_agent(
                obs_ph=self.obs_ph[key],
                obs1_ph=self.obs1_ph[key],
                ac_space=self.ac_space[key],
                all_obs_ph=self.all_obs_ph[key],
                all_action_ph=self.all_action_ph[key],
                target_policy_noise=self.target_policy_noise[key],
                target_noise_clip=self.target_noise_clip[key],
                reuse=False,
            )

        # Store the new objects in their respective attributes.
        self.actor_tf[key] = actor_tf
        self.critic_tf[key] = critic_tf
        self.actor_target[key] = noisy_actor_target
        actors.append(actor_tf)
        actor_targets.append(noisy_actor_target)

    # Combine all actors for use when creating a centralized,
    # differentiable critic.
    combined_actors = tf.concat(actors, axis=1)

    # Combine all actor targets to create a centralized target actor.
    noisy_actor_target = tf.concat(actor_targets, axis=1)

    # Now that we have all actor targets, we can start constructing the
    # centralized critic targets and all update procedures.
    self.critic_loss = {}
    self.critic_optimizer = {}
    self.target_init_updates = {}
    self.target_soft_updates = {}
    self.actor_loss = {}
    self.actor_optimizer = {}

    # Loop through all agents.
    for key in self.ob_space.keys():
        # Append the key to the outer scope term.
        scope_i = key if scope is None else "{}/{}".format(scope, key)

        # Create the policy update and logging operations of the agent.
        with tf.compat.v1.variable_scope(key, reuse=False):
            (self.critic_loss[key],
             self.critic_optimizer[key],
             self.target_init_updates[key],
             self.target_soft_updates[key],
             self.actor_loss[key],
             self.actor_optimizer[key]) = self._setup_agent_ops(
                scope=scope_i,
                actor_tf=self.actor_tf[key],
                critic_tf=self.critic_tf[key],
                noisy_actor_target=noisy_actor_target,
                all_obs_ph=self.all_obs_ph[key],
                all_obs1_ph=self.all_obs1_ph[key],
                rew_ph=self.rew_ph[key],
                terminals1=self.terminals1[key],
                combined_actors=combined_actors)
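
# The method above wires one replay buffer and one set of placeholders per
# agent. The sketch below is a hypothetical illustration (not part of the
# original class) of how a single critic update for one agent might consume
# these objects. The helper name `_update_critic_sketch` and the session
# attribute `self.sess` are assumptions made for the example, and numpy is
# assumed to be imported as `np` in the surrounding module.
def _update_critic_sketch(self, key):
    """Run one illustrative critic update for the agent named `key`."""
    # Sample local and full-state batches from this agent's buffer.
    obs0, actions, rewards, obs1, done, all_obs0, all_actions, \
        all_obs1 = self.replay_buffer[key].sample()

    # Feed the per-agent placeholders alongside the centralized
    # (full-state) placeholders created in `_setup_maddpg_independent`.
    # The 1-D reward/done samples are reshaped to match the (None, 1)
    # placeholder shapes.
    feed_dict = {
        self.obs_ph[key]: obs0,
        self.action_ph[key]: actions,
        self.rew_ph[key]: np.asarray(rewards).reshape(-1, 1),
        self.obs1_ph[key]: obs1,
        self.terminals1[key]:
            np.asarray(done, dtype=np.float32).reshape(-1, 1),
        self.all_obs_ph[key]: all_obs0,
        self.all_action_ph[key]: all_actions,
        self.all_obs1_ph[key]: all_obs1,
    }

    # Run the agent's critic loss and optimizer operations.
    critic_loss, _ = self.sess.run(
        [self.critic_loss[key], self.critic_optimizer[key]],
        feed_dict=feed_dict)

    return critic_loss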
import unittest
import numpy as np

# Note: the import path for MultiReplayBuffer is assumed here; adjust it to
# match the repository layout.
from hbaselines.multiagent.replay_buffer import MultiReplayBuffer


class TestMultiReplayBuffer(unittest.TestCase):
    """Tests for the MultiReplayBuffer object."""

    def setUp(self):
        self.replay_buffer = MultiReplayBuffer(
            buffer_size=2,
            batch_size=1,
            obs_dim=1,
            ac_dim=2,
            all_obs_dim=3,
            all_ac_dim=4,
        )

    def tearDown(self):
        del self.replay_buffer

    def test_init(self):
        """Validate that all the attributes were initialized properly."""
        self.assertTupleEqual(self.replay_buffer.obs_t.shape, (2, 1))
        self.assertTupleEqual(self.replay_buffer.action_t.shape, (2, 2))
        self.assertTupleEqual(self.replay_buffer.reward.shape, (2,))
        self.assertTupleEqual(self.replay_buffer.obs_tp1.shape, (2, 1))
        self.assertTupleEqual(self.replay_buffer.done.shape, (2,))
        self.assertTupleEqual(self.replay_buffer.all_obs_t.shape, (2, 3))
        self.assertTupleEqual(self.replay_buffer.all_action_t.shape, (2, 4))
        self.assertTupleEqual(self.replay_buffer.all_obs_tp1.shape, (2, 3))

    def test_buffer_size(self):
        """Validate the buffer_size output from the replay buffer."""
        self.assertEqual(self.replay_buffer.buffer_size, 2)

    def test_add_sample(self):
        """Test the `add` and `sample` methods of the replay buffer."""
        # Add an element.
        self.replay_buffer.add(
            obs_t=np.array([0]),
            action=np.array([1, 1]),
            reward=2,
            obs_tp1=np.array([3]),
            done=False,
            all_obs_t=np.array([4, 4, 4]),
            all_action_t=np.array([5, 5, 5, 5]),
            all_obs_tp1=np.array([6, 6, 6]),
        )

        # Check is_full in the False case.
        self.assertEqual(self.replay_buffer.is_full(), False)

        # Add another element, filling the buffer.
        self.replay_buffer.add(
            obs_t=np.array([0]),
            action=np.array([1, 1]),
            reward=2,
            obs_tp1=np.array([3]),
            done=False,
            all_obs_t=np.array([4, 4, 4]),
            all_action_t=np.array([5, 5, 5, 5]),
            all_obs_tp1=np.array([6, 6, 6]),
        )

        # Check is_full in the True case.
        self.assertEqual(self.replay_buffer.is_full(), True)

        # Check can_sample in the True case.
        self.assertEqual(self.replay_buffer.can_sample(), True)

        # Test the `sample` method.
        obs_t, actions_t, rewards, obs_tp1, done, all_obs_t, \
            all_actions_t, all_obs_tp1 = self.replay_buffer.sample()
        np.testing.assert_array_almost_equal(obs_t, [[0]])
        np.testing.assert_array_almost_equal(actions_t, [[1, 1]])
        np.testing.assert_array_almost_equal(rewards, [2])
        np.testing.assert_array_almost_equal(obs_tp1, [[3]])
        np.testing.assert_array_almost_equal(done, [False])
        np.testing.assert_array_almost_equal(all_obs_t, [[4, 4, 4]])
        np.testing.assert_array_almost_equal(all_actions_t, [[5, 5, 5, 5]])
        np.testing.assert_array_almost_equal(all_obs_tp1, [[6, 6, 6]])
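
# Standard unittest entry point so the test module can be executed directly
# (e.g. `python <this_file>.py`); added as a convenience, as it may already
# exist elsewhere in the original test module.
if __name__ == '__main__':
    unittest.main()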