def set_state(self, state: Dict[str, Any]) -> None:
        """Restores all local state to the provided `state`.

        Args:
            state: The new state to set this buffer. Can be obtained by
                calling `self.get_state()`.
        """
        self.last_added_batches = state["last_added_batches"]
        MultiAgentPrioritizedReplayBuffer.set_state(state)
示例#2
0
    def test_policy_id_of_multi_agent_batches_independent(self):
        """Test if indepent sampling yields a MultiAgentBatch with the
        correct policy id."""
        self.batch_id = 0

        # Test lockstep mode with different policy ids using MultiAgentBatches
        buffer = MultiAgentPrioritizedReplayBuffer(
            capacity=10, replay_mode="independent", learning_starts=0, num_shards=1
        )

        self._add_multi_agent_batch_to_buffer(buffer, num_policies=1, num_batches=1)

        mabatch = buffer.sample(1)
        assert list(mabatch.policy_batches.keys())[0] == 0
示例#3
0
    def test_lockstep_mode(self):
        """Test the lockstep mode by adding batches from multiple policies."""
        self.batch_id = 0

        num_policies = 4
        num_batches = 13
        buffer_size = 15

        # Test lockstep mode with different policy ids using MultiAgentBatches
        buffer = MultiAgentPrioritizedReplayBuffer(
            capacity=buffer_size,
            replay_mode="lockstep",
            learning_starts=0,
            num_shards=1,
        )

        self._add_multi_agent_batch_to_buffer(
            buffer, num_policies=num_policies, num_batches=num_batches
        )

        _id, _buffer = next(buffer.replay_buffers.items().__iter__())
        assert _id == _ALL_POLICIES
        assert len(buffer) == num_batches

        # Add batches until the buffer is full
        self._add_multi_agent_batch_to_buffer(
            buffer, num_policies=num_policies, num_batches=num_batches
        )

        assert _id == _ALL_POLICIES
        assert len(buffer) == buffer_size
示例#4
0
    def test_independent_mode(self):
        """Test the lockstep mode by adding batches from multiple policies."""
        self.batch_id = 0

        num_batches = 3
        buffer_size = 15
        num_policies = 2

        # Test lockstep mode with different policy ids using MultiAgentBatches
        buffer = MultiAgentPrioritizedReplayBuffer(
            capacity=buffer_size,
            replay_mode="independent",
            learning_starts=0,
            num_shards=1,
        )

        self._add_multi_agent_batch_to_buffer(
            buffer, num_policies=num_policies, num_batches=num_batches
        )

        # Sample 4 SampleBatches from only one policy and put it into a
        # MultiAgentBatch
        for _id in range(num_policies):
            for __id in buffer.sample(4, policy_id=_id).policy_batches[_id][
                "policy_id"
            ]:
                assert __id == _id

        # Sample without specifying the policy should yield approx. the same
        # number of batches from each policy
        num_sampled_dict = {_id: 0 for _id in range(num_policies)}
        num_samples = 200
        for i in range(num_samples):
            num_items = np.random.randint(1, 5)
            for _id, batch in buffer.sample(num_items=num_items).policy_batches.items():
                num_sampled_dict[_id] += 1
                assert len(batch) == num_items
        assert np.allclose(
            np.array(list(num_sampled_dict.values())),
            len(num_sampled_dict) * [200],
            atol=0.1,
        )
    def get_state(self) -> Dict[str, Any]:
        """Returns all local state.

        Returns:
            The serializable local state.
        """
        data = {
            "last_added_batches": self.last_added_batches,
        }
        parent = MultiAgentPrioritizedReplayBuffer.get_state(self)
        parent.update(data)
        return parent
    def __init__(self,
                 capacity: int = 10000,
                 storage_unit: str = "timesteps",
                 num_shards: int = 1,
                 prioritized_replay_alpha: float = 0.6,
                 prioritized_replay_beta: float = 0.4,
                 prioritized_replay_eps: float = 1e-6,
                 learning_starts: int = 1000,
                 replay_batch_size: int = 1,
                 replay_sequence_length: int = 1,
                 replay_burn_in: int = 0,
                 replay_zero_init_states: bool = True,
                 replay_ratio: float = 0.66,
                 underlying_buffer_config: dict = None,
                 **kwargs):
        """Initializes MultiAgentMixInReplayBuffer instance.

        Args:
            capacity: Number of batches to store in total.
            storage_unit: Either 'timesteps', 'sequences' or
                'episodes'. Specifies how experiences are stored. If they
                are stored in episodes, replay_sequence_length is ignored.
            num_shards: The number of buffer shards that exist in total
                (including this one).
            learning_starts: Number of timesteps after which a call to
                `replay()` will yield samples (before that, `replay()` will
                return None).
            capacity: The capacity of the buffer. Note that when
                `replay_sequence_length` > 1, this is the number of sequences
                (not single timesteps) stored.
            replay_batch_size: The batch size to be sampled (in timesteps).
                Note that if `replay_sequence_length` > 1,
                `self.replay_batch_size` will be set to the number of
                sequences sampled (B).
            replay_sequence_length: The sequence length (T) of a single
                sample. If > 1, we will sample B x T from this buffer.
            replay_burn_in: The burn-in length in case
                `replay_sequence_length` > 0. This is the number of timesteps
                each sequence overlaps with the previous one to generate a
                better internal state (=state after the burn-in), instead of
                starting from 0.0 each RNN rollout.
            replay_zero_init_states: Whether the initial states in the
                buffer (if replay_sequence_length > 0) are alwayas 0.0 or
                should be updated with the previous train_batch state outputs.
            replay_ratio: Ratio of replayed samples in the returned
                batches. E.g. a ratio of 0.0 means only return new samples
                (no replay), a ratio of 0.5 means always return newest sample
                plus one old one (1:1), a ratio of 0.66 means always return
                the newest sample plus 2 old (replayed) ones (1:2), etc...
            underlying_buffer_config: A config that contains all necessary
                constructor arguments and arguments for methods to call on
                the underlying buffers. This replaces the standard behaviour
                of the underlying PrioritizedReplayBuffer. The config
                follows the conventions of the general
                replay_buffer_config. kwargs for subsequent calls of methods
                may also be included. Example:
                "replay_buffer_config": {"type": PrioritizedReplayBuffer,
                "capacity": 10, "storage_unit": "timesteps",
                prioritized_replay_alpha: 0.5, prioritized_replay_beta: 0.5,
                prioritized_replay_eps: 0.5}
            **kwargs: Forward compatibility kwargs.
        """
        if not 0 <= replay_ratio <= 1:
            raise ValueError("Replay ratio must be within [0, 1]")

        if "replay_mode" in kwargs and kwargs["replay_mode"] == "lockstep":
            if log_once("lockstep_mode_not_supported"):
                logger.error("Replay mode `lockstep` is not supported for "
                             "MultiAgentMixInReplayBuffer."
                             "This buffer will run in `independent` mode.")
            del kwargs["replay_mode"]

        MultiAgentPrioritizedReplayBuffer.__init__(
            self,
            capacity=capacity,
            storage_unit=storage_unit,
            prioritized_replay_alpha=prioritized_replay_alpha,
            prioritized_replay_beta=prioritized_replay_beta,
            prioritized_replay_eps=prioritized_replay_eps,
            num_shards=num_shards,
            replay_mode="independent",
            learning_starts=learning_starts,
            replay_batch_size=replay_batch_size,
            replay_sequence_length=replay_sequence_length,
            replay_burn_in=replay_burn_in,
            replay_zero_init_states=replay_zero_init_states,
            underlying_buffer_config=underlying_buffer_config,
            **kwargs)

        self.replay_ratio = replay_ratio

        self.last_added_batches = collections.defaultdict(list)
示例#7
0
    def __init__(self,
                 capacity: int = 10000,
                 storage_unit: str = "timesteps",
                 num_shards: int = 1,
                 learning_starts: int = 1000,
                 replay_mode: str = "independent",
                 replay_sequence_override: bool = True,
                 replay_sequence_length: int = 1,
                 replay_burn_in: int = 0,
                 replay_zero_init_states: bool = True,
                 replay_ratio: float = 0.66,
                 underlying_buffer_config: dict = None,
                 prioritized_replay_alpha: float = 0.6,
                 prioritized_replay_beta: float = 0.4,
                 prioritized_replay_eps: float = 1e-6,
                 **kwargs):
        """Initializes MultiAgentMixInReplayBuffer instance.

        Args:
            capacity: The capacity of the buffer, measured in `storage_unit`.
            storage_unit: Either 'timesteps', 'sequences' or
                'episodes'. Specifies how experiences are stored. If they
                are stored in episodes, replay_sequence_length is ignored.
            num_shards: The number of buffer shards that exist in total
                (including this one).
            learning_starts: Number of timesteps after which a call to
                `replay()` will yield samples (before that, `replay()` will
                return None).
            replay_mode: One of "independent" or "lockstep". Determines,
                whether batches are sampled independently or to an equal
                amount.
            replay_sequence_override: If True, ignore sequences found in incoming
                batches, slicing them into sequences as specified by
                `replay_sequence_length` and `replay_sequence_burn_in`. This only has
                an effect if storage_unit is `sequences`.
            replay_sequence_length: The sequence length (T) of a single
                sample. If > 1, we will sample B x T from this buffer. This
                only has an effect if storage_unit is 'timesteps'.
            replay_burn_in: The burn-in length in case
                `replay_sequence_length` > 0. This is the number of timesteps
                each sequence overlaps with the previous one to generate a
                better internal state (=state after the burn-in), instead of
                starting from 0.0 each RNN rollout.
            replay_zero_init_states: Whether the initial states in the
                buffer (if replay_sequence_length > 0) are alwayas 0.0 or
                should be updated with the previous train_batch state outputs.
            replay_ratio: Ratio of replayed samples in the returned
                batches. E.g. a ratio of 0.0 means only return new samples
                (no replay), a ratio of 0.5 means always return newest sample
                plus one old one (1:1), a ratio of 0.66 means always return
                the newest sample plus 2 old (replayed) ones (1:2), etc...
            underlying_buffer_config: A config that contains all necessary
                constructor arguments and arguments for methods to call on
                the underlying buffers. This replaces the standard behaviour
                of the underlying PrioritizedReplayBuffer. The config
                follows the conventions of the general
                replay_buffer_config. kwargs for subsequent calls of methods
                may also be included. Example:
                "replay_buffer_config": {"type": PrioritizedReplayBuffer,
                "capacity": 10, "storage_unit": "timesteps",
                prioritized_replay_alpha: 0.5, prioritized_replay_beta: 0.5,
                prioritized_replay_eps: 0.5}
            prioritized_replay_alpha: Alpha parameter for a prioritized
                replay buffer. Use 0.0 for no prioritization.
            prioritized_replay_beta: Beta parameter for a prioritized
                replay buffer.
            prioritized_replay_eps: Epsilon parameter for a prioritized
                replay buffer.
            **kwargs: Forward compatibility kwargs.
        """
        if not 0 <= replay_ratio <= 1:
            raise ValueError("Replay ratio must be within [0, 1]")

        MultiAgentPrioritizedReplayBuffer.__init__(
            self,
            capacity=capacity,
            storage_unit=storage_unit,
            num_shards=num_shards,
            learning_starts=learning_starts,
            replay_mode=replay_mode,
            replay_sequence_override=replay_sequence_override,
            replay_sequence_length=replay_sequence_length,
            replay_burn_in=replay_burn_in,
            replay_zero_init_states=replay_zero_init_states,
            underlying_buffer_config=underlying_buffer_config,
            prioritized_replay_alpha=prioritized_replay_alpha,
            prioritized_replay_beta=prioritized_replay_beta,
            prioritized_replay_eps=prioritized_replay_eps,
            **kwargs)

        self.replay_ratio = replay_ratio

        self.last_added_batches = collections.defaultdict(list)
示例#8
0
    def test_update_priorities(self):
        num_batches = 5
        buffer_size = 15

        # Buffer needs to be in independent mode, lockstep is not supported
        buffer = MultiAgentPrioritizedReplayBuffer(
            capacity=buffer_size,
            prioritized_replay_alpha=self.alpha,
            prioritized_replay_beta=self.beta,
            replay_mode="independent",
            replay_sequence_length=2,
            learning_starts=0,
            num_shards=1,
        )

        # Insert n samples
        for i in range(num_batches):
            data = self._generate_data()
            buffer.add(data, weight=1.0)
            assert len(buffer) == i + 1

        # Fetch records, their indices and weights.
        mabatch = buffer.sample(3)
        assert type(mabatch) == MultiAgentBatch
        samplebatch = mabatch.policy_batches[DEFAULT_POLICY_ID]

        weights = samplebatch["weights"]
        indices = samplebatch["batch_indexes"]
        check(weights, np.ones(shape=(6,)))
        assert 6 == len(indices)
        assert len(buffer) == num_batches
        policy_buffer = buffer.replay_buffers[DEFAULT_POLICY_ID]
        assert policy_buffer._next_idx == num_batches
        # Update weight of indices 0, 2, 3, 4, like in our
        # PrioritizedReplayBuffer tests
        priority_dict = {
            DEFAULT_POLICY_ID: (
                np.array([0, 2, 3, 4]),
                np.array([0.01, 0.01, 0.01, 0.01]),
            )
        }

        buffer.update_priorities(priority_dict)

        # Expect to sample almost only index 1
        # (which still has a weight of 1.0).
        for _ in range(10):
            mabatch = buffer.sample(1000)
            assert type(mabatch) == MultiAgentBatch
            samplebatch = mabatch.policy_batches[DEFAULT_POLICY_ID]
            assert type(mabatch) == MultiAgentBatch
            indices = samplebatch["batch_indexes"]
            self.assertTrue(1900 < np.sum(indices) < 2200)
        # Test get_state/set_state.
        state = buffer.get_state()
        new_buffer = MultiAgentPrioritizedReplayBuffer(
            capacity=buffer_size,
            prioritized_replay_alpha=self.alpha,
            prioritized_replay_beta=self.beta,
            replay_mode="independent",
            learning_starts=0,
            num_shards=1,
        )
        new_buffer.set_state(state)
        batch = new_buffer.sample(1000).policy_batches[DEFAULT_POLICY_ID]
        indices = batch["batch_indexes"]
        self.assertTrue(1900 < np.sum(indices) < 2200)