Example #1
def setUp(self):
    self.replay_buffer = MultiReplayBuffer(buffer_size=2,
                                           batch_size=1,
                                           obs_dim=1,
                                           ac_dim=2,
                                           all_obs_dim=3,
                                           all_ac_dim=4)
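The constructor arguments here appear to separate per-agent sizes (obs_dim, ac_dim) from full-state sizes (all_obs_dim, all_ac_dim). As a rough sketch, inferred from the add/sample calls in Example #3 below rather than from any official documentation, the arrays passed to add would then be expected to match those dimensions:

# Sketch only: shapes inferred from the test in Example #3, not an
# authoritative description of the MultiReplayBuffer API.
buf = MultiReplayBuffer(buffer_size=2, batch_size=1,
                        obs_dim=1, ac_dim=2,
                        all_obs_dim=3, all_ac_dim=4)
buf.add(obs_t=np.array([0.]),                     # per-agent obs, length obs_dim
        action=np.array([1., 1.]),                # per-agent action, length ac_dim
        reward=2.,
        obs_tp1=np.array([3.]),
        done=False,
        all_obs_t=np.array([4., 4., 4.]),         # full state, length all_obs_dim
        all_action_t=np.array([5., 5., 5., 5.]),  # joint action, length all_ac_dim
        all_obs_tp1=np.array([6., 6., 6.]))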
Example #2
File: td3.py  Project: tamood/h-baselines
    def _setup_maddpg_independent(self, scope):
        """Perform independent form of MADDPG setup."""
        self.all_obs_ph = {}
        self.all_obs1_ph = {}
        self.all_action_ph = {}
        self.replay_buffer = {}
        self.terminals1 = {}
        self.rew_ph = {}
        self.action_ph = {}
        self.obs_ph = {}
        self.obs1_ph = {}
        self.actor_tf = {}
        self.critic_tf = {}
        self.actor_target = {}
        actors = []
        actor_targets = []

        # The size of the full action space.
        all_ac_dim = sum(self.ac_space[key].shape[0]
                         for key in self.ac_space.keys())

        # Iterate through the keys in sorted order so that the observations
        # and actions for the full state are collected in a consistent order.
        for key in sorted(self.ob_space.keys()):
            # Compute the shape of the input observation space, which may
            # include the contextual term.
            ob_dim = self._get_ob_dim(
                self.ob_space[key],
                None if self.co_space is None else self.co_space[key])

            # Create a replay buffer object.
            self.replay_buffer[key] = MultiReplayBuffer(
                buffer_size=self.buffer_size,
                batch_size=self.batch_size,
                obs_dim=ob_dim[0],
                ac_dim=self.ac_space[key].shape[0],
                all_obs_dim=self.all_ob_space.shape[0],
                all_ac_dim=all_ac_dim,
            )

            with tf.compat.v1.variable_scope(key, reuse=False):
                # Create an input placeholder for the full state observations.
                self.all_obs_ph[key] = tf.compat.v1.placeholder(
                    tf.float32,
                    shape=(None, ) + self.all_ob_space.shape,
                    name='all_obs')
                self.all_obs1_ph[key] = tf.compat.v1.placeholder(
                    tf.float32,
                    shape=(None, ) + self.all_ob_space.shape,
                    name='all_obs1')

                # Create an input placeholder for the full actions.
                self.all_action_ph[key] = tf.compat.v1.placeholder(
                    tf.float32, shape=(None, all_ac_dim), name='all_actions')

                # Create input variables.
                self.terminals1[key] = tf.compat.v1.placeholder(
                    tf.float32, shape=(None, 1), name='terminals1')
                self.rew_ph[key] = tf.compat.v1.placeholder(tf.float32,
                                                            shape=(None, 1),
                                                            name='rewards')
                self.action_ph[key] = tf.compat.v1.placeholder(
                    tf.float32,
                    shape=(None, ) + self.ac_space[key].shape,
                    name='actions')
                self.obs_ph[key] = tf.compat.v1.placeholder(tf.float32,
                                                            shape=(None, ) +
                                                            ob_dim,
                                                            name='obs0')
                self.obs1_ph[key] = tf.compat.v1.placeholder(tf.float32,
                                                             shape=(None, ) +
                                                             ob_dim,
                                                             name='obs1')

                # Create actor and critic networks for this agent's policy.
                actor_tf, critic_tf, noisy_actor_target = self._setup_agent(
                    obs_ph=self.obs_ph[key],
                    obs1_ph=self.obs1_ph[key],
                    ac_space=self.ac_space[key],
                    all_obs_ph=self.all_obs_ph[key],
                    all_action_ph=self.all_action_ph[key],
                    target_policy_noise=self.target_policy_noise[key],
                    target_noise_clip=self.target_noise_clip[key],
                    reuse=False,
                )

            # Store the new objects in their respective attributes.
            self.actor_tf[key] = actor_tf
            self.critic_tf[key] = critic_tf
            self.actor_target[key] = noisy_actor_target
            actors.append(actor_tf)
            actor_targets.append(noisy_actor_target)

        # Combine all actors for use when creating a centralized
        # differentiable critic.
        combined_actors = tf.concat(actors, axis=1)

        # Combine all actor targets to create a centralized target actor.
        noisy_actor_target = tf.concat(actor_targets, axis=1)

        # Now that we have all actor targets, we can start constructing
        # centralized critic targets and all update procedures.
        self.critic_loss = {}
        self.critic_optimizer = {}
        self.target_init_updates = {}
        self.target_soft_updates = {}
        self.actor_loss = {}
        self.actor_optimizer = {}

        # Loop through all agents.
        for key in self.ob_space.keys():
            # Append the key to the outer scope term.
            scope_i = key if scope is None else "{}/{}".format(scope, key)

            # Create the policy update and logging operations of the agent.
            with tf.compat.v1.variable_scope(key, reuse=False):
                (self.critic_loss[key], self.critic_optimizer[key],
                 self.target_init_updates[key], self.target_soft_updates[key],
                 self.actor_loss[key],
                 self.actor_optimizer[key]) = self._setup_agent_ops(
                     scope=scope_i,
                     actor_tf=self.actor_tf[key],
                     critic_tf=self.critic_tf[key],
                     noisy_actor_target=noisy_actor_target,
                     all_obs_ph=self.all_obs_ph[key],
                     all_obs1_ph=self.all_obs1_ph[key],
                     rew_ph=self.rew_ph[key],
                     terminals1=self.terminals1[key],
                     combined_actors=combined_actors)
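To see how the per-agent dictionaries built above fit together, here is a hypothetical training-step sketch (not part of td3.py): it samples one agent's buffer and feeds the result into that agent's placeholders. The tuple ordering follows the sample call in Example #3, key is an assumed agent identifier, sess is an assumed tf.compat.v1.Session handle, and the reshapes are only there to match the (None, 1) placeholder shapes.

# Hypothetical sketch (not from the project): one critic update for agent `key`.
obs0, actions, rewards, obs1, done, all_obs0, all_actions, all_obs1 = \
    self.replay_buffer[key].sample()
feed_dict = {
    self.obs_ph[key]: obs0,
    self.action_ph[key]: actions,
    self.rew_ph[key]: rewards.reshape(-1, 1),   # match the (None, 1) placeholder
    self.obs1_ph[key]: obs1,
    self.terminals1[key]: done.reshape(-1, 1),  # match the (None, 1) placeholder
    self.all_obs_ph[key]: all_obs0,
    self.all_action_ph[key]: all_actions,
    self.all_obs1_ph[key]: all_obs1,
}
sess.run([self.critic_loss[key], self.critic_optimizer[key]],
         feed_dict=feed_dict)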
Example #3
class TestMultiReplayBuffer(unittest.TestCase):
    """Tests for the MultiReplayBuffer object."""
    def setUp(self):
        self.replay_buffer = MultiReplayBuffer(buffer_size=2,
                                               batch_size=1,
                                               obs_dim=1,
                                               ac_dim=2,
                                               all_obs_dim=3,
                                               all_ac_dim=4)

    def tearDown(self):
        del self.replay_buffer

    def test_init(self):
        """Validate that all the attributes were initialize properly."""
        self.assertTupleEqual(self.replay_buffer.obs_t.shape, (2, 1))
        self.assertTupleEqual(self.replay_buffer.action_t.shape, (2, 2))
        self.assertTupleEqual(self.replay_buffer.reward.shape, (2, ))
        self.assertTupleEqual(self.replay_buffer.obs_tp1.shape, (2, 1))
        self.assertTupleEqual(self.replay_buffer.done.shape, (2, ))
        self.assertTupleEqual(self.replay_buffer.all_obs_t.shape, (2, 3))
        self.assertTupleEqual(self.replay_buffer.all_action_t.shape, (2, 4))
        self.assertTupleEqual(self.replay_buffer.all_obs_tp1.shape, (2, 3))

    def test_buffer_size(self):
        """Validate the buffer_size output from the replay buffer."""
        self.assertEqual(self.replay_buffer.buffer_size, 2)

    def test_add_sample(self):
        """Test the `add` and `sample` methods the replay buffer."""
        # Add an element.
        self.replay_buffer.add(obs_t=np.array([0]),
                               action=np.array([1, 1]),
                               reward=2,
                               obs_tp1=np.array([3]),
                               done=False,
                               all_obs_t=np.array([4, 4, 4]),
                               all_action_t=np.array([5, 5, 5, 5]),
                               all_obs_tp1=np.array([6, 6, 6]))

        # Check is_full in the False case.
        self.assertEqual(self.replay_buffer.is_full(), False)

        # Add an element.
        self.replay_buffer.add(obs_t=np.array([0]),
                               action=np.array([1, 1]),
                               reward=2,
                               obs_tp1=np.array([3]),
                               done=False,
                               all_obs_t=np.array([4, 4, 4]),
                               all_action_t=np.array([5, 5, 5, 5]),
                               all_obs_tp1=np.array([6, 6, 6]))

        # Check is_full in the True case.
        self.assertEqual(self.replay_buffer.is_full(), True)

        # Check can_sample in the True case.
        self.assertEqual(self.replay_buffer.can_sample(), True)

        # Test the `sample` method.
        obs_t, actions_t, rewards, obs_tp1, done, all_obs_t, all_actions_t, \
            all_obs_tp1 = self.replay_buffer.sample()
        np.testing.assert_array_almost_equal(obs_t, [[0]])
        np.testing.assert_array_almost_equal(actions_t, [[1, 1]])
        np.testing.assert_array_almost_equal(rewards, [2])
        np.testing.assert_array_almost_equal(obs_tp1, [[3]])
        np.testing.assert_array_almost_equal(done, [False])
        np.testing.assert_array_almost_equal(all_obs_t, [[4, 4, 4]])
        np.testing.assert_array_almost_equal(all_actions_t, [[5, 5, 5, 5]])
        np.testing.assert_array_almost_equal(all_obs_tp1, [[6, 6, 6]])
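For completeness, a test module like this would normally end with the standard unittest entry point. A minimal sketch, assuming unittest, numpy (as np), and MultiReplayBuffer are imported at the top of the file:

# Standard unittest entry point; the imports above the class (unittest,
# numpy as np, and the MultiReplayBuffer class) are assumed to be present.
if __name__ == '__main__':
    unittest.main()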