def test_lockstep_mode(self):
    """Test the lockstep mode by only adding SampleBatches.

    Such SampleBatches are converted to MultiAgentBatches as if there
    was only one policy.
    """
    self.batch_id = 0
    batch_size = 5
    buffer_size = 30

    buffer = MultiAgentReplayBuffer(
        capacity=buffer_size,
        replay_mode="lockstep",
        learning_starts=0,
        num_shards=1,
    )

    # Test add/sample.
    self._add_sample_batch_to_buffer(buffer, batch_size=batch_size, num_batches=1)

    # Sampling from the buffer now should yield the first batch.
    assert get_batch_id(buffer.sample(1)) == 0

    self._add_sample_batch_to_buffer(buffer, batch_size=batch_size, num_batches=2)

    # Sampling from the buffer now should yield our first batch 1/3 of the time.
    num_sampled_dict = {_id: 0 for _id in range(self.batch_id)}
    num_samples = 200
    for i in range(num_samples):
        _id = get_batch_id(buffer.sample(1))
        num_sampled_dict[_id] += 1
    assert np.allclose(
        np.array(list(num_sampled_dict.values())) / num_samples,
        len(num_sampled_dict) * [1 / 3],
        atol=0.1,
    )
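# The tests in this section rely on helpers defined elsewhere in the test
# class/module. Below is a minimal sketch of what `get_batch_id` and
# `_add_sample_batch_to_buffer` might look like (a hypothetical
# reconstruction, assuming each added batch carries a custom "batch_id"
# column; not necessarily the exact originals):
import numpy as np

from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch


def get_batch_id(batch, policy_id=DEFAULT_POLICY_ID):
    # Read back the "batch_id" column of one policy's sub-batch in a
    # sampled MultiAgentBatch.
    return batch.policy_batches[policy_id]["batch_id"][0]


def _add_sample_batch_to_buffer(self, buffer, batch_size, num_batches=1):
    # (Would live in the test class.) Add `num_batches` SampleBatches,
    # each tagged with a unique, increasing `self.batch_id`.
    for _ in range(num_batches):
        buffer.add(
            SampleBatch(
                {
                    "obs": np.arange(batch_size),
                    "batch_id": batch_size * [self.batch_id],
                }
            )
        )
        self.batch_id += 1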
def test_independent_with_underlying_prioritized_replay_buffer(self):
    """Test the buffer with an underlying PrioritizedReplayBuffer.

    Test if we can initialize a more complex underlying buffer with
    additional arguments and use independent sampling. This does not test
    updating priorities and using weights as implemented in
    MultiAgentPrioritizedReplayBuffer.
    """
    # Test with PrioritizedReplayBuffer: args for the c'tor, add and sample.
    prioritized_replay_buffer_config = {
        "type": PrioritizedReplayBuffer,
        "alpha": 0.6,
        "beta": 0.4,
    }

    num_policies = 2
    buffer_size = 15
    num_batches = 1

    buffer = MultiAgentReplayBuffer(
        capacity=buffer_size,
        replay_mode="independent",
        learning_starts=0,
        num_shards=1,
        underlying_buffer_config=prioritized_replay_buffer_config,
    )

    self._add_multi_agent_batch_to_buffer(
        buffer, num_policies=num_policies, num_batches=num_batches
    )

    # Only test if we can sample from multiple policies.
    sample = buffer.sample(2)
    assert len(sample) == 4
    assert len(sample.policy_batches) == 2
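# `_add_multi_agent_batch_to_buffer` is likewise defined elsewhere in the
# test class. A hypothetical sketch under the assumptions the tests imply:
# integer policy ids 0..num_policies-1, a unique "batch_id" per policy
# batch, a "policy_id" column, and optional sequence lengths of two:
from ray.rllib.policy.sample_batch import MultiAgentBatch


def _add_multi_agent_batch_to_buffer(
    self, buffer, num_policies, num_batches=1, seq_lens=False
):
    # (Would live in the test class; assumes `self.batch_id` was
    # initialized by the calling test.)
    for _ in range(num_batches):
        policy_batches = {}
        for policy_id in range(num_policies):
            columns = {
                "obs": np.zeros(2),
                "batch_id": 2 * [self.batch_id],
                "policy_id": 2 * [policy_id],
            }
            if seq_lens:
                # One sequence of length two per policy batch.
                columns[SampleBatch.SEQ_LENS] = [2]
            policy_batches[policy_id] = SampleBatch(columns)
            self.batch_id += 1
        buffer.add(MultiAgentBatch(policy_batches, 2))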
def test_independent_mode_sequences_storage_unit(self):
    """Test the independent mode with sequences as the storage unit.

    Batches are stored and replayed as sequences of length two per
    policy.
    """
    buffer_size = 15
    self.batch_id = 0

    buffer = MultiAgentReplayBuffer(
        capacity=buffer_size,
        replay_mode="independent",
        storage_unit="sequences",
        replay_sequence_length=2,
        learning_starts=0,
        num_shards=1,
    )

    # Test add/sample.
    self._add_multi_agent_batch_to_buffer(
        buffer, num_policies=2, num_batches=1, seq_lens=True
    )

    # Sampling from the buffer now should yield the first batch.
    assert get_batch_id(buffer.sample(1), 0) == 0

    self._add_multi_agent_batch_to_buffer(
        buffer, num_policies=2, num_batches=2, seq_lens=True
    )

    # Sampling from the buffer now should yield each batch that went into
    # a multi-agent batch 1/6th of the time.
    num_sampled_dict = {_id: 0 for _id in range(self.batch_id)}
    num_samples = 200
    for i in range(num_samples):
        sample = buffer.sample(1)
        # Count one of the two policy batches.
        _id = get_batch_id(sample, np.random.choice([0, 1]))
        num_sampled_dict[_id] += 1
        # See if a random batch has the desired sequence length of two.
        assert len(sample.policy_batches[np.random.choice([0, 1])]) == 2
    assert np.allclose(
        np.array(list(num_sampled_dict.values())) / num_samples,
        len(num_sampled_dict) * [1 / 6],
        atol=0.1,
    )
def test_independent_mode_multiple_policies(self):
    """Test the independent mode by adding batches from multiple policies."""
    num_batches = 3
    buffer_size = 15
    num_policies = 2

    # Test independent mode with different policy ids using MultiAgentBatches.
    self.batch_id = 0

    buffer = MultiAgentReplayBuffer(
        capacity=buffer_size,
        replay_mode="independent",
        learning_starts=0,
        num_shards=1,
    )

    self._add_multi_agent_batch_to_buffer(
        buffer, num_policies=num_policies, num_batches=num_batches
    )

    # Sample 4 SampleBatches from each policy individually.
    for _id in range(num_policies):
        for __id in buffer.sample(4, policy_id=_id).policy_batches[_id][
            "policy_id"
        ]:
            assert __id == _id

    # Sampling without specifying a policy should yield the same number
    # of batches from each policy.
    num_sampled_dict = {_id: 0 for _id in range(num_policies)}
    num_samples = 200
    for i in range(num_samples):
        num_items = np.random.randint(0, 5)
        for _id, batch in buffer.sample(num_items=num_items).policy_batches.items():
            num_sampled_dict[_id] += 1
            assert len(batch) == num_items
    assert np.allclose(
        np.array(list(num_sampled_dict.values())),
        len(num_sampled_dict) * [num_samples],
        atol=0.1,
    )
def test_lockstep_with_underlying_replay_buffer(self):
    """Test the buffer with an underlying ReplayBuffer.

    Test if we can initialize a simple underlying buffer without
    additional arguments and use lockstep sampling.
    """
    # Test with ReplayBuffer: no args for the c'tor, add and sample.
    replay_buffer_config = {"type": ReplayBuffer}

    num_policies = 2
    buffer_size = 200
    num_batches = 20

    buffer = MultiAgentReplayBuffer(
        capacity=buffer_size,
        replay_mode="lockstep",
        learning_starts=0,
        num_shards=1,
        underlying_buffer_config=replay_buffer_config,
    )

    self._add_multi_agent_batch_to_buffer(
        buffer, num_policies=num_policies - 1, num_batches=num_batches
    )

    # Only test if we can sample and if samples belong to a single policy.
    sample = buffer.sample(2)
    assert len(sample) == 2
    assert len(sample.policy_batches) == 1

    self._add_multi_agent_batch_to_buffer(
        buffer, num_policies=num_policies, num_batches=num_batches
    )

    # Only test if we can sample from multiple policies; out of 100
    # samples, some should come from each policy.
    sample = buffer.sample(100)
    assert len(sample) == 100
    assert len(sample.policy_batches) == 2
def test_store_to_replay_local(self):
    """Test storing samples to a local replay buffer via execution ops."""
    buf = MultiAgentReplayBuffer(
        num_shards=1,
        learning_starts=200,
        capacity=1000,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta=0.4,
        prioritized_replay_eps=0.0001,
    )
    assert len(buf.sample(100)) == 0

    workers = make_workers(0)
    a = ParallelRollouts(workers, mode="bulk_sync")
    b = a.for_each(StoreToReplayBuffer(local_buffer=buf))

    next(b)
    assert len(buf.sample(100)) == 0  # Learning hasn't started yet.
    next(b)
    assert buf.sample(100).count == 100

    replay_op = Replay(local_buffer=buf, num_items_to_replay=100)
    assert next(replay_op).count == 100
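# `make_workers` is another helper defined elsewhere in the test module.
# A hypothetical sketch of what it could look like, assuming it builds a
# WorkerSet with one local worker plus the given number of remote workers
# running PPO's default policy on CartPole (kwarg names and policy class
# are assumptions, not taken from the code above):
import gym

from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.evaluation.worker_set import WorkerSet


def make_workers(num_remote_workers):
    return WorkerSet(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_class=PPOTFPolicy,
        trainer_config=with_common_config({"rollout_fragment_length": 100}),
        num_workers=num_remote_workers,
    )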
def test_policy_id_of_multi_agent_batches_independent(self):
    """Test if independent sampling yields a MultiAgentBatch with the
    correct policy id."""
    self.batch_id = 0

    # Test independent mode with a single policy id using a MultiAgentBatch.
    buffer = MultiAgentReplayBuffer(
        capacity=10, replay_mode="independent", learning_starts=0, num_shards=1
    )

    self._add_multi_agent_batch_to_buffer(buffer, num_policies=1, num_batches=1)

    mabatch = buffer.sample(1)
    assert list(mabatch.policy_batches.keys())[0] == 0
class MyTrainer(Trainer):
    @classmethod
    @override(Trainer)
    def get_default_config(cls) -> TrainerConfigDict:
        # Run this Trainer with the new `training_iteration` API and set
        # some PPO-specific parameters.
        return with_common_config(
            {
                "num_sgd_iter": 10,
                "sgd_minibatch_size": 128,
            }
        )

    @override(Trainer)
    def setup(self, config):
        # Call super's `setup` to create rollout workers.
        super().setup(config)
        # Create a local replay buffer (for the DQN policy's samples).
        self.local_replay_buffer = MultiAgentReplayBuffer(
            num_shards=1, learning_starts=1000, capacity=50000
        )

    @override(Trainer)
    def training_iteration(self) -> ResultDict:
        # Generate common experiences: collect a batch for PPO and store
        # every (DQN) batch in the replay buffer.
        ppo_batches = []
        num_env_steps = 0
        # PPO batch size fixed at 200.
        while num_env_steps < 200:
            ma_batches = synchronous_parallel_sample(
                worker_set=self.workers, concat=False
            )
            # Loop through the (parallelly collected) multi-agent batches.
            for ma_batch in ma_batches:
                # Update sampled counters.
                self._counters[NUM_ENV_STEPS_SAMPLED] += ma_batch.count
                self._counters[NUM_AGENT_STEPS_SAMPLED] += ma_batch.agent_steps()
                ppo_batch = ma_batch.policy_batches.pop("ppo_policy")
                # Add the remaining (DQN-policy) batches to the replay buffer.
                self.local_replay_buffer.add(ma_batch)

                ppo_batches.append(ppo_batch)
                num_env_steps += ppo_batch.count

        # DQN sub-flow.
        dqn_train_results = {}
        dqn_train_batch = self.local_replay_buffer.sample(num_items=64)
        if dqn_train_batch is not None:
            dqn_train_results = train_one_step(self, dqn_train_batch, ["dqn_policy"])
            self._counters["agent_steps_trained_DQN"] += dqn_train_batch.agent_steps()
            print(
                "DQN policy learning on samples from",
                "agent steps trained",
                dqn_train_batch.agent_steps(),
            )
        # Update DQN's target network every 500 trained steps.
        if (
            self._counters["agent_steps_trained_DQN"]
            - self._counters[LAST_TARGET_UPDATE_TS]
            >= 500
        ):
            self.workers.local_worker().get_policy("dqn_policy").update_target()
            self._counters[NUM_TARGET_UPDATES] += 1
            self._counters[LAST_TARGET_UPDATE_TS] = self._counters[
                "agent_steps_trained_DQN"
            ]

        # PPO sub-flow.
        ppo_train_batch = SampleBatch.concat_samples(ppo_batches)
        self._counters["agent_steps_trained_PPO"] += ppo_train_batch.agent_steps()
        # Standardize advantages.
        ppo_train_batch[Postprocessing.ADVANTAGES] = standardized(
            ppo_train_batch[Postprocessing.ADVANTAGES]
        )
        print(
            "PPO policy learning on samples from",
            "agent steps trained",
            ppo_train_batch.agent_steps(),
        )
        ppo_train_batch = MultiAgentBatch(
            {"ppo_policy": ppo_train_batch}, ppo_train_batch.count
        )
        ppo_train_results = train_one_step(self, ppo_train_batch, ["ppo_policy"])

        # Combine results for PPO and DQN into one results dict.
        results = dict(ppo_train_results, **dqn_train_results)
        return results
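# MyTrainer assumes a multi-agent setup with exactly two policy ids,
# "ppo_policy" and "dqn_policy". A hypothetical usage sketch; the env,
# the torch policy classes, and the agent-to-policy mapping below are
# illustrative stand-ins, not prescribed by the code above:
if __name__ == "__main__":
    import ray
    from ray.rllib.agents.dqn.dqn_torch_policy import DQNTorchPolicy
    from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy
    from ray.rllib.examples.env.multi_agent import MultiAgentCartPole

    ray.init()
    config = {
        "env": MultiAgentCartPole,
        "env_config": {"num_agents": 4},
        "multiagent": {
            "policies": {
                "ppo_policy": (PPOTorchPolicy, None, None, {}),
                "dqn_policy": (DQNTorchPolicy, None, None, {}),
            },
            # Map even agent ids to PPO, odd ones to DQN.
            "policy_mapping_fn": lambda agent_id, **kwargs: (
                "ppo_policy" if agent_id % 2 == 0 else "dqn_policy"
            ),
        },
        "framework": "torch",
    }
    trainer = MyTrainer(config=config)
    for _ in range(3):
        print(trainer.train())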