def test_external_multi_agent_env_sample(self):
    agents = 2
    act_space = gym.spaces.Discrete(2)
    obs_space = gym.spaces.Discrete(2)
    ev = RolloutWorker(
        env_creator=lambda _: SimpleMultiServing(BasicMultiAgent(agents)),
        policy_spec={
            "p0": (MockPolicy, obs_space, act_space, {}),
            "p1": (MockPolicy, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
        rollout_fragment_length=50)
    batch = ev.sample()
    self.assertEqual(batch.count, 50)
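# Most tests in this file drive sampling through a ``MockPolicy`` imported
# from the shared test utilities. As a rough sketch of the interface such a
# policy has to satisfy (this stand-in is hypothetical, not the canonical
# MockPolicy definition), a minimal fixed-action policy looks like:
from ray.rllib.policy.policy import Policy


class _SketchFixedActionPolicy(Policy):
    """Hypothetical stand-in: always takes action 0 and never learns."""

    def compute_actions(self, obs_batch, state_batches=None, **kwargs):
        # One fixed action per observation; no RNN state, no extra fetches.
        return [0] * len(obs_batch), [], {}

    def learn_on_batch(self, samples):
        # No-op update; the tests here only exercise the sampling path.
        return {}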
def _make_envs(self):
    # Build one local and one remote CartPole PPO worker, each with a TF
    # session capped at 2 CPUs.
    def make_sess():
        return tf.Session(config=tf.ConfigProto(device_count={"CPU": 2}))

    local = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_spec=PPOTFPolicy,
        tf_session_creator=make_sess)
    remotes = [
        RolloutWorker.as_remote().remote(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_spec=PPOTFPolicy,
            tf_session_creator=make_sess)
    ]
    return local, remotes
def test_get_filters(self):
    ev = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_spec=MockPolicy,
        sample_async=True,
        observation_filter="ConcurrentMeanStdFilter")
    self.sample_and_flush(ev)
    filters = ev.get_filters(flush_after=False)
    time.sleep(2)
    filters2 = ev.get_filters(flush_after=False)
    obs_f = filters[DEFAULT_POLICY_ID]
    obs_f2 = filters2[DEFAULT_POLICY_ID]
    # The async sampler keeps running in the background, so the filter
    # stats can only have grown between the two snapshots.
    self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n)
    self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n)
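# ``sample_and_flush`` above is a helper defined on the test class
# elsewhere in this file. A plausible sketch of what it does (assumed, not
# the exact original): sample once, flush the worker's filters, and
# sanity-check that filter stats actually accumulated.
#
#     def sample_and_flush(self, ev):
#         time.sleep(2)
#         ev.sample()
#         filters = ev.get_filters(flush_after=True)
#         obs_f = filters[DEFAULT_POLICY_ID]
#         self.assertNotEqual(obs_f.rs.n, 0)
#         self.assertNotEqual(obs_f.buffer.n, 0)
#         return obs_f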
def test_batches_larger_when_vectorized(self):
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv(episode_length=8),
        policy_spec=MockPolicy,
        batch_mode="truncate_episodes",
        rollout_fragment_length=4,
        num_envs=4)
    # 4 sub-envs x 4-step fragments = 16 env steps per sample() call.
    batch = ev.sample()
    self.assertEqual(batch.count, 16)
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    # After the second sample, each sub-env reaches step 8 and all 4
    # episodes (length 8) complete.
    batch = ev.sample()
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 4)
def test_extra_python_envs(self):
    extra_envs = {"env_key_1": "env_value_1", "env_key_2": "env_value_2"}
    self.assertNotIn("env_key_1", os.environ)
    self.assertNotIn("env_key_2", os.environ)
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv(10),
        policy_spec=MockPolicy,
        extra_python_environs=extra_envs)
    self.assertIn("env_key_1", os.environ)
    self.assertIn("env_key_2", os.environ)
    ev.stop()
    # Reset to original environment.
    del os.environ["env_key_1"]
    del os.environ["env_key_2"]
def test_soft_horizon(self):
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_spec=MockPolicy,
        batch_mode="complete_episodes",
        rollout_fragment_length=10,
        episode_horizon=4,
        soft_horizon=True)
    samples = ev.sample()
    # Three logical episodes: the horizon of 4 splits the 10 steps into
    # 4 + 4 + 2.
    self.assertEqual(len(set(samples["eps_id"])), 3)
    # Only one hard done=True: the env's true termination at step 10; the
    # soft-horizon resets do not set done.
    self.assertEqual(sum(samples["dones"]), 1)
    ev.stop()
def test_multi_agent_sample_with_horizon(self):
    act_space = gym.spaces.Discrete(2)
    obs_space = gym.spaces.Discrete(2)
    ev = RolloutWorker(
        env_creator=lambda _: BasicMultiAgent(5),
        policy_spec={
            "p0": (MockPolicy, obs_space, act_space, {}),
            "p1": (MockPolicy, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
        episode_horizon=10,  # Test with an episode horizon set.
        rollout_fragment_length=50)
    batch = ev.sample()
    self.assertEqual(batch.count, 50)
def test_complete_episodes_packing(self):
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv(10),
        policy_spec=MockPolicy,
        rollout_fragment_length=15,
        batch_mode="complete_episodes",
    )
    # A 15-step fragment is padded up to two complete 10-step episodes.
    batch = ev.sample()
    self.assertEqual(batch.count, 20)
    self.assertEqual(
        batch["t"].tolist(),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    )
    ev.stop()
def test_returning_model_based_rollouts_data(self):
    class ModelBasedPolicy(DQNTFPolicy):
        def compute_actions(self,
                            obs_batch,
                            state_batches,
                            prev_action_batch=None,
                            prev_reward_batch=None,
                            episodes=None,
                            **kwargs):
            # In the policy-loss initialization phase, no episodes are
            # passed in.
            if episodes is not None:
                # Pretend we did a model-based rollout and want to return
                # the extra trajectory.
                builder = episodes[0].new_batch_builder()
                rollout_id = random.randint(0, 10000)
                for t in range(5):
                    builder.add_values(
                        agent_id="extra_0",
                        policy_id="p1",  # use p1 so we can easily check it
                        t=t,
                        eps_id=rollout_id,  # new id for each rollout
                        obs=obs_batch[0],
                        actions=0,
                        rewards=0,
                        dones=t == 4,
                        infos={},
                        new_obs=obs_batch[0])
                batch = builder.build_and_reset(episode=None)
                episodes[0].add_extra_batch(batch)

            # Just return zeros for actions.
            return [0] * len(obs_batch), [], {}

    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space
    ev = RolloutWorker(
        env_creator=lambda _: MultiAgentCartPole({"num_agents": 2}),
        policy_spec={
            "p0": (ModelBasedPolicy, obs_space, act_space, {}),
            "p1": (ModelBasedPolicy, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p0",
        rollout_fragment_length=5)
    batch = ev.sample()
    # 5 env steps total; p0 collects 2 agents x 5 steps = 10 transitions;
    # p1 collects the 5-step extra model-based batch added on each of the
    # 5 steps = 25 transitions.
    self.assertEqual(batch.count, 5)
    self.assertEqual(batch.policy_batches["p0"].count, 10)
    self.assertEqual(batch.policy_batches["p1"].count, 25)
def test_baseline_performance(self):
    for _ in range(20):
        ev = RolloutWorker(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_spec=MockPolicy,
            rollout_fragment_length=100,
        )
        start = time.time()
        count = 0
        while time.time() - start < 1:
            count += ev.sample().count
        print()
        print("Samples per second {}".format(
            count / (time.time() - start)))
        print()
def make_workers(n):
    local = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_spec=PPOTFPolicy,
        rollout_fragment_length=100,
    )
    remotes = [
        RolloutWorker.as_remote().remote(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_spec=PPOTFPolicy,
            rollout_fragment_length=100,
        )
        for _ in range(n)
    ]
    workers = WorkerSet._from_existing(local, remotes)
    return workers
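# Hypothetical usage sketch for ``make_workers`` (not exercised by the
# tests in this file): build a set with two remote workers, sample on the
# local one, then shut everything down.
#
#     workers = make_workers(2)
#     batch = workers.local_worker().sample()
#     print(batch.count)
#     workers.stop()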
def test_sample_from_early_done_env(self):
    act_space = gym.spaces.Discrete(2)
    obs_space = gym.spaces.Discrete(2)
    ev = RolloutWorker(
        env_creator=lambda _: EarlyDoneMultiAgent(),
        policy_spec={
            "p0": (MockPolicy, obs_space, act_space, {}),
            "p1": (MockPolicy, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
        batch_mode="complete_episodes",
        rollout_fragment_length=1)
    self.assertRaisesRegex(ValueError,
                           ".*don't have a last observation.*",
                           lambda: ev.sample())
def test_metrics(self):
    # Allow for Unittest run.
    ray.init(num_cpus=5, ignore_reinit_error=True)
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_spec=MockPolicy,
        batch_mode="complete_episodes")
    remote_ev = RolloutWorker.as_remote().remote(
        env_creator=lambda _: MockEnv(episode_length=10),
        policy_spec=MockPolicy,
        batch_mode="complete_episodes")
    ev.sample()
    ray.get(remote_ev.sample.remote())
    result = collect_metrics(ev, [remote_ev])
    # 10 episodes per worker (default 100-step fragment, 10-step
    # episodes) across the two workers.
    self.assertEqual(result["episodes_this_iter"], 20)
    self.assertEqual(result["episode_reward_mean"], 10)
def test_batch_ids(self):
    fragment_len = 100
    ev = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_spec=MockPolicy,
        rollout_fragment_length=fragment_len)
    batch1 = ev.sample()
    batch2 = ev.sample()
    unroll_ids_1 = set(batch1["unroll_id"])
    unroll_ids_2 = set(batch2["unroll_id"])
    # Assert no overlap of unroll IDs between sample() calls.
    self.assertTrue(not any(uid in unroll_ids_2 for uid in unroll_ids_1))
    # CartPole episodes should be short initially: Expect more than one
    # unroll ID in each batch.
    self.assertTrue(len(unroll_ids_1) > 1)
    self.assertTrue(len(unroll_ids_2) > 1)
    ev.stop()
def add_workers(self, num_workers: int) -> None:
    """Creates and adds a number of remote workers to this worker set.

    Can be called several times on the same WorkerSet to add more
    RolloutWorkers to the set.

    Args:
        num_workers: The number of remote Workers to add to this
            WorkerSet.
    """
    remote_args = {
        "num_cpus": self._remote_config["num_cpus_per_worker"],
        "num_gpus": self._remote_config["num_gpus_per_worker"],
        "resources": self._remote_config["custom_resources_per_worker"],
    }
    cls = RolloutWorker.as_remote(**remote_args).remote
    self._remote_workers.extend(
        [
            self._make_worker(
                cls=cls,
                env_creator=self._env_creator,
                validate_env=None,
                policy_cls=self._policy_class,
                worker_index=i + 1,
                num_workers=num_workers,
                config=self._remote_config,
            )
            for i in range(num_workers)
        ]
    )
def _sample_and_train_torch_distributed(worker: RolloutWorker):
    # This function is applied remotely on each rollout worker.
    config = worker.policy_config

    # Generate a sample.
    start = time.perf_counter()
    batch = worker.sample()
    sample_time = time.perf_counter() - start
    expected_batch_size = (
        config["rollout_fragment_length"] * config["num_envs_per_worker"])
    assert batch.count == expected_batch_size, (
        "Batch size possibly out of sync between workers, expected:",
        expected_batch_size,
        "got:",
        batch.count,
    )

    # Perform n minibatch SGD update(s) on the worker itself.
    start = time.perf_counter()
    info = do_minibatch_sgd(
        batch,
        worker.policy_map,
        worker,
        config["num_sgd_iter"],
        config["sgd_minibatch_size"],
        [Postprocessing.ADVANTAGES],
    )
    learn_on_batch_time = time.perf_counter() - start

    return {
        "info": info,
        "env_steps": batch.env_steps(),
        "agent_steps": batch.agent_steps(),
        "sample_time": sample_time,
        "learn_on_batch_time": learn_on_batch_time,
    }
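# One plausible way to fan ``_sample_and_train_torch_distributed`` out over
# the remote workers of a WorkerSet (``workers`` is assumed to be built
# elsewhere, e.g. via ``make_workers`` above):
#
#     results = ray.get([
#         w.apply.remote(_sample_and_train_torch_distributed)
#         for w in workers.remote_workers()
#     ])
#
# ``RolloutWorker.apply`` runs the given callable on the worker actor and
# returns its result, so ``results`` gathers one stats dict per worker.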
def test_train_external_multi_agent_cartpole_many_policies(self):
    n = 20
    single_env = gym.make("CartPole-v0")
    act_space = single_env.action_space
    obs_space = single_env.observation_space
    policies = {}
    for i in range(n):
        policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space,
                                       {})
    policy_ids = list(policies.keys())
    ev = RolloutWorker(
        env_creator=lambda _: MultiAgentCartPole({"num_agents": n}),
        policy_spec=policies,
        policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
        rollout_fragment_length=100)
    optimizer = SyncSamplesOptimizer(WorkerSet._from_existing(ev))
    for i in range(100):
        optimizer.step()
        result = collect_metrics(ev)
        print("Iteration {}, rew {}".format(
            i, result["policy_reward_mean"]))
        print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    raise Exception("failed to improve reward")
def add_workers(self, num_workers: int) -> None:
    """Creates and adds a number of remote workers to this worker set.

    Args:
        num_workers (int): The number of remote Workers to add to this
            WorkerSet.
    """
    remote_args = {
        "num_cpus": self._remote_config["num_cpus_per_worker"],
        "num_gpus": self._remote_config["num_gpus_per_worker"],
        # memory=0 is an error, but memory=None means no limits.
        "memory": self._remote_config["memory_per_worker"] or None,
        "object_store_memory": self._remote_config[
            "object_store_memory_per_worker"] or None,
        "resources": self._remote_config["custom_resources_per_worker"],
    }
    cls = RolloutWorker.as_remote(**remote_args).remote
    self._remote_workers.extend([
        self._make_worker(
            cls=cls,
            env_creator=self._env_creator,
            validate_env=None,
            policy_cls=self._policy_class,
            worker_index=i + 1,
            num_workers=num_workers,
            config=self._remote_config) for i in range(num_workers)
    ])
def test_train_multi_cartpole_many_policies(self):
    n = 20
    env = gym.make("CartPole-v0")
    act_space = env.action_space
    obs_space = env.observation_space
    policies = {}
    for i in range(n):
        policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space,
                                       {})
    policy_ids = list(policies.keys())
    worker = RolloutWorker(
        env_creator=lambda _: MultiAgentCartPole({"num_agents": n}),
        policy_spec=policies,
        policy_mapping_fn=lambda agent_id: random.choice(policy_ids),
        rollout_fragment_length=100)
    workers = WorkerSet._from_existing(worker, [])
    optimizer = SyncSamplesOptimizer(workers)
    for i in range(100):
        optimizer.step()
        result = collect_metrics(worker)
        print("Iteration {}, rew {}".format(
            i, result["policy_reward_mean"]))
        print("Total reward", result["episode_reward_mean"])
        if result["episode_reward_mean"] >= 25 * n:
            return
    raise Exception("failed to improve reward")
def test_vector_env_support(self):
    ev = RolloutWorker(
        env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
        policy_spec=MockPolicy,
        batch_mode="truncate_episodes",
        rollout_fragment_length=10)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    # 80 steps over 8 sub-envs = 10 steps each: no 20-step episode is done
    # yet.
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 0)
    for _ in range(8):
        batch = ev.sample()
        self.assertEqual(batch.count, 10)
    # Another 80 steps: all 8 sub-envs finish their first episode.
    result = collect_metrics(ev, [])
    self.assertEqual(result["episodes_this_iter"], 8)
def recreate_failed_workers(
    self, local_worker_for_synching: RolloutWorker
) -> Tuple[List[ActorHandle], List[ActorHandle]]:
    """Recreates any failed workers (after health check).

    Args:
        local_worker_for_synching: RolloutWorker to use to synchronize
            the weights after recreation.

    Returns:
        A tuple consisting of two items: The list of removed workers and
        the list of newly added ones.
    """
    faulty_indices = self._worker_health_check()
    removed_workers = []
    new_workers = []
    for worker_index in faulty_indices:
        worker = self.remote_workers()[worker_index - 1]
        removed_workers.append(worker)
        logger.info(f"Trying to recreate faulty worker {worker_index}")
        try:
            worker.__ray_terminate__.remote()
        except Exception:
            logger.exception("Error terminating faulty worker.")

        # Try to recreate the failed worker (start a new one).
        new_worker = self._make_worker(
            cls=self._cls,
            env_creator=self._env_creator,
            validate_env=None,
            policy_cls=self._policy_class,
            worker_index=worker_index,
            num_workers=len(self._remote_workers),
            recreated_worker=True,
            config=self._remote_config,
        )

        # Sync new worker from provided one (or local one).
        new_worker.set_weights.remote(
            weights=local_worker_for_synching.get_weights(),
            global_vars=local_worker_for_synching.get_global_vars(),
        )
        # Add new worker to list of remote workers.
        self._remote_workers[worker_index - 1] = new_worker
        new_workers.append(new_worker)
    return removed_workers, new_workers
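# ``_worker_health_check`` is defined elsewhere in this class; the method
# above only consumes its list of 1-based faulty worker indices. A rough
# sketch of such a check (assumed, not the actual implementation): ping
# every remote worker and collect the indices whose ping fails.
#
#     def _worker_health_check(self) -> List[int]:
#         checks = [
#             w.apply.remote(lambda w: "ok") for w in self.remote_workers()
#         ]
#         faulty = []
#         for i, ref in enumerate(checks):
#             try:
#                 ray.get(ref)
#             except RayError:
#                 faulty.append(i + 1)
#         return faulty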
def test_multi_agent_sample_sync_remote(self):
    ev = RolloutWorker(
        env_creator=lambda _: BasicMultiAgent(5),
        policy_spec={
            "p0": PolicySpec(policy_class=MockPolicy),
            "p1": PolicySpec(policy_class=MockPolicy),
        },
        # This signature will raise a soft-deprecation warning due to the
        # new signature we are using (agent_id, episode, **kwargs), but
        # should not break this test.
        policy_mapping_fn=(lambda agent_id: "p{}".format(agent_id % 2)),
        rollout_fragment_length=50,
        num_envs=4,
        remote_worker_envs=True,
        remote_env_batch_wait_ms=99999999)
    batch = ev.sample()
    self.assertEqual(batch.count, 200)
def test_multi_agent_sample(self):
    def policy_mapping_fn(agent_id, episode, worker, **kwargs):
        return "p{}".format(agent_id % 2)

    ev = RolloutWorker(
        env_creator=lambda _: BasicMultiAgent(5),
        policy_spec={
            "p0": PolicySpec(policy_class=MockPolicy),
            "p1": PolicySpec(policy_class=MockPolicy),
        },
        policy_mapping_fn=policy_mapping_fn,
        rollout_fragment_length=50)
    batch = ev.sample()
    # 50 env steps with 5 agents: p0 serves agents 0, 2, 4 (3 x 50 = 150
    # transitions), p1 serves agents 1, 3 (2 x 50 = 100).
    self.assertEqual(batch.count, 50)
    self.assertEqual(batch.policy_batches["p0"].count, 150)
    self.assertEqual(batch.policy_batches["p1"].count, 100)
    self.assertEqual(batch.policy_batches["p0"]["t"].tolist(),
                     list(range(25)) * 6)
def test_multiagent_env(self):
    temp_env = EpisodeEnv(NUM_STEPS, NUM_AGENTS)
    ev = RolloutWorker(
        env_creator=lambda _: EpisodeEnv(NUM_STEPS, NUM_AGENTS),
        policy_spec={
            str(agent_id): (
                EchoPolicy,
                temp_env.observation_space,
                temp_env.action_space,
                {},
            )
            for agent_id in range(NUM_AGENTS)
        },
        policy_mapping_fn=lambda aid, eps, **kwargs: str(aid),
        callbacks=LastInfoCallback,
    )
    ev.sample()
def test_multi_agent_sample_async_remote(self):
    # Allow to be run via Unittest.
    ray.init(num_cpus=4, ignore_reinit_error=True)
    act_space = gym.spaces.Discrete(2)
    obs_space = gym.spaces.Discrete(2)
    ev = RolloutWorker(
        env_creator=lambda _: BasicMultiAgent(5),
        policy_spec={
            "p0": (MockPolicy, obs_space, act_space, {}),
            "p1": (MockPolicy, obs_space, act_space, {}),
        },
        policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2),
        rollout_fragment_length=50,
        num_envs=4,
        remote_worker_envs=True)
    batch = ev.sample()
    # 4 remote sub-envs x 50-step fragments = 200 steps per sample().
    self.assertEqual(batch.count, 200)