def test_len_and_size_bytes(self):
    s1 = SampleBatch({
        "a": np.array([1, 2, 3]),
        "b": {"c": np.array([4, 5, 6])},
        "seq_lens": [1, 2],
    })
    check(len(s1), 3)
    check(
        s1.size_bytes(),
        s1["a"].nbytes + s1["b"]["c"].nbytes + s1["seq_lens"].nbytes,
    )
def on_train_result(self, *, trainer, result: dict, **kwargs):
    stats = result["info"][LEARNER_INFO][DEFAULT_POLICY_ID][
        LEARNER_STATS_KEY]
    # Learning rate should go to 0 after 1 iter.
    check(stats["cur_lr"], 5e-5 if trainer.iteration == 1 else 0.0)
    # Entropy coeff goes to 0.05, then 0.0 (per iter).
    check(stats["entropy_coeff"], 0.1 if trainer.iteration == 1 else 0.05)

    trainer.workers.foreach_policy(
        self._check_lr_torch
        if trainer.config["framework"] == "torch" else self._check_lr_tf)
def test_hard_horizon(self):
    ev = RolloutWorker(
        env_creator=lambda _: MockEnv2(episode_length=10),
        policy_spec=MockPolicy,
        batch_mode="complete_episodes",
        rollout_fragment_length=10,
        episode_horizon=4,
        soft_horizon=False,
    )
    samples = ev.sample()
    # Three logical episodes and correct episode resets (always after 4
    # steps).
    self.assertEqual(len(set(samples["eps_id"])), 3)
    for i in range(4):
        self.assertEqual(np.argmax(samples["obs"][i]), i)
    self.assertEqual(np.argmax(samples["obs"][4]), 0)
    # 3 done values.
    self.assertEqual(sum(samples["dones"]), 3)
    ev.stop()

    # A gym env's max_episode_steps is smaller than Trainer's horizon.
    ev = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_spec=MockPolicy,
        batch_mode="complete_episodes",
        rollout_fragment_length=10,
        episode_horizon=6,
        soft_horizon=False,
    )
    samples = ev.sample()
    # 12 steps due to `complete_episodes` batch_mode.
    self.assertEqual(len(samples["eps_id"]), 12)
    # Two logical episodes and correct episode resets (always after 6(!)
    # steps).
    self.assertEqual(len(set(samples["eps_id"])), 2)
    # 2 done values after 6 and 12 steps.
    check(
        samples["dones"],
        [
            False, False, False, False, False, True,
            False, False, False, False, False, True,
        ],
    )
    ev.stop()
def test_ppo_compilation_and_schedule_mixins(self):
    """Test whether a PPOTrainer can be built with all frameworks."""
    config = copy.deepcopy(ppo.DEFAULT_CONFIG)
    # For checking lr-schedule correctness.
    config["callbacks"] = MyCallbacks
    config["num_workers"] = 1
    config["num_sgd_iter"] = 2
    # Settings in case we use an LSTM.
    config["model"]["lstm_cell_size"] = 10
    config["model"]["max_seq_len"] = 20
    # Use default-native keras models whenever possible.
    config["model"]["_use_default_native_models"] = True
    # Set up lr- and entropy schedules for testing.
    config["lr_schedule"] = [[0, config["lr"]], [128, 0.0]]
    # Set entropy_coeff to a faulty value to prove that it gets
    # overridden by the schedule below (which is expected).
    config["entropy_coeff"] = 100.0
    config["entropy_coeff_schedule"] = [[0, 0.1], [256, 0.0]]
    config["train_batch_size"] = 128
    # Test with compression.
    config["compress_observations"] = True
    num_iterations = 2

    for fw in framework_iterator(config):
        for env in ["CartPole-v0", "MsPacmanNoFrameskip-v4"]:
            print("Env={}".format(env))
            for lstm in [True, False]:
                print("LSTM={}".format(lstm))
                config["model"]["use_lstm"] = lstm
                config["model"]["lstm_use_prev_action"] = lstm
                config["model"]["lstm_use_prev_reward"] = lstm

                trainer = ppo.PPOTrainer(config=config, env=env)
                policy = trainer.get_policy()
                entropy_coeff = trainer.get_policy().entropy_coeff
                lr = policy.cur_lr
                if fw == "tf":
                    entropy_coeff, lr = policy.get_session().run(
                        [entropy_coeff, lr])
                check(entropy_coeff, 0.1)
                check(lr, config["lr"])

                for i in range(num_iterations):
                    print(trainer.train())

                check_compute_single_action(
                    trainer,
                    include_prev_action_reward=True,
                    include_state=lstm)
                trainer.stop()
def test_multi_agent_complex_spaces(self):
    ModelCatalog.register_custom_model("dict_spy", DictSpyModel)
    ModelCatalog.register_custom_model("tuple_spy", TupleSpyModel)
    register_env("nested_ma", lambda _: NestedMultiAgentEnv())
    act_space = spaces.Discrete(2)
    pg = PGTrainer(
        env="nested_ma",
        config={
            "num_workers": 0,
            "rollout_fragment_length": 5,
            "train_batch_size": 5,
            "multiagent": {
                "policies": {
                    "tuple_policy": (
                        None, TUPLE_SPACE, act_space,
                        {"model": {"custom_model": "tuple_spy"}}),
                    "dict_policy": (
                        None, DICT_SPACE, act_space,
                        {"model": {"custom_model": "dict_spy"}}),
                },
                "policy_mapping_fn": lambda aid, **kwargs: {
                    "tuple_agent": "tuple_policy",
                    "dict_agent": "dict_policy"}[aid],
            },
            "framework": "tf",
        })

    # Skip first passes as they came from the TorchPolicy loss
    # initialization.
    TupleSpyModel.capture_index = DictSpyModel.capture_index = 0
    pg.train()

    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "d_spy_in_{}".format(i)))
        pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
        cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
        task_i = one_hot(
            DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        check(seen[2][0], task_i)

    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "t_spy_in_{}".format(i)))
        pos_i = TUPLE_SAMPLES[i][0].tolist()
        cam_i = TUPLE_SAMPLES[i][1][0].tolist()
        task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        check(seen[2][0], task_i)
def test_rows(self):
    s1 = SampleBatch({
        "a": np.array([[1, 1], [2, 2], [3, 3]]),
        "b": {"c": np.array([[4, 4], [5, 5], [6, 6]])},
        SampleBatch.SEQ_LENS: np.array([1, 2]),
    })
    check(
        next(s1.rows()),
        {"a": [1, 1], "b": {"c": [4, 4]}, SampleBatch.SEQ_LENS: 1},
    )
def test_nested_multidiscrete_one_hot_preprocessor(self):
    space = Tuple((MultiDiscrete([2, 3, 4]), ))
    pp = get_preprocessor(space)(space)
    self.assertTrue(pp.shape == (9, ))
    check(
        pp.transform((np.array([1, 2, 0]), )),
        [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0],
    )
    check(
        pp.transform((np.array([0, 1, 3]), )),
        [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0],
    )
def test_ddpg_exploration_and_with_random_prerun(self):
    """Tests DDPG's Exploration (w/ random actions for n timesteps)."""
    core_config = ddpg.DEFAULT_CONFIG.copy()
    core_config["num_workers"] = 0  # Run locally.

    obs = np.array([0.0, 0.1, -0.1])

    # Test against all frameworks.
    for _ in framework_iterator(core_config):
        config = core_config.copy()

        # Default OUNoise setup.
        trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
        # Setting explore=False should always return the same action.
        a_ = trainer.compute_action(obs, explore=False)
        for _ in range(50):
            a = trainer.compute_action(obs, explore=False)
            check(a, a_)
        # explore=None (default: explore) should return different actions.
        actions = []
        for _ in range(50):
            actions.append(trainer.compute_action(obs))
        check(np.std(actions), 0.0, false=True)
        trainer.stop()

        # Check randomness at beginning.
        config["exploration_config"] = {
            # Act randomly at beginning ...
            "random_timesteps": 50,
            # Then act very closely to deterministic actions thereafter.
            "ou_base_scale": 0.001,
            "initial_scale": 0.001,
            "final_scale": 0.001,
        }
        trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
        # ts=1 (get a deterministic action as per explore=False).
        deterministic_action = trainer.compute_action(obs, explore=False)
        # ts=2-50 (in random window).
        random_a = []
        for _ in range(49):
            random_a.append(trainer.compute_action(obs, explore=True))
            check(random_a[-1], deterministic_action, false=True)
        self.assertTrue(np.std(random_a) > 0.5)

        # ts > 50 (a=deterministic_action + scale * N[0,1]).
        for _ in range(50):
            a = trainer.compute_action(obs, explore=True)
            check(a, deterministic_action, rtol=0.1)

        # ts >> 50 (BUT: explore=False -> expect deterministic action).
        for _ in range(50):
            a = trainer.compute_action(obs, explore=False)
            check(a, deterministic_action)
        trainer.stop()
def test_dict_flattening_preprocessor(self):
    space = Dict({
        "a": Discrete(2),
        "b": Tuple([Discrete(3), Box(-1.0, 1.0, (4, ))]),
    })
    pp = get_preprocessor(space)(space)
    self.assertTrue(isinstance(pp, DictFlatteningPreprocessor))
    self.assertEqual(pp.shape, (9, ))
    check(
        pp.transform({
            "a": 1,
            "b": (1, np.array([0.0, -0.5, 0.1, 0.6]))
        }),
        [0.0, 1.0, 0.0, 1.0, 0.0, 0.0, -0.5, 0.1, 0.6])
def test_traj_view_lstm_functionality(self):
    action_space = Box(-float("inf"), float("inf"), shape=(3, ))
    obs_space = Box(float("-inf"), float("inf"), (4, ))
    max_seq_len = 50
    rollout_fragment_length = 200
    assert rollout_fragment_length % max_seq_len == 0
    policies = {
        "pol0": (EpisodeEnvAwareLSTMPolicy, obs_space, action_space, {}),
    }

    def policy_fn(agent_id):
        return "pol0"

    config = {
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_fn,
        },
        "model": {
            "use_lstm": True,
            "max_seq_len": max_seq_len,
        },
    }

    rollout_worker_w_api = RolloutWorker(
        env_creator=lambda _: MultiAgentDebugCounterEnv({"num_agents": 4}),
        policy_config=dict(config, **{"_use_trajectory_view_api": True}),
        rollout_fragment_length=rollout_fragment_length,
        policy_spec=policies,
        policy_mapping_fn=policy_fn,
        num_envs=1,
    )
    rollout_worker_wo_api = RolloutWorker(
        env_creator=lambda _: MultiAgentDebugCounterEnv({"num_agents": 4}),
        policy_config=dict(config, **{"_use_trajectory_view_api": False}),
        rollout_fragment_length=rollout_fragment_length,
        policy_spec=policies,
        policy_mapping_fn=policy_fn,
        num_envs=1,
    )

    for iteration in range(20):
        result = rollout_worker_w_api.sample()
        check(result.count, rollout_fragment_length)
        pol_batch_w = result.policy_batches["pol0"]
        assert pol_batch_w.count >= rollout_fragment_length
        analyze_rnn_batch(pol_batch_w, max_seq_len)

        result = rollout_worker_wo_api.sample()
        pol_batch_wo = result.policy_batches["pol0"]
        check(pol_batch_w.data, pol_batch_wo.data)
def test_multi_categorical(self):
    batch_size = 100
    num_categories = 3
    num_sub_distributions = 5
    # Create 5 categorical distributions of 3 categories each.
    inputs_space = Box(
        -1.0, 2.0,
        shape=(batch_size, num_sub_distributions * num_categories))
    values_space = Box(
        0, num_categories - 1,
        shape=(num_sub_distributions, batch_size),
        dtype=np.int32)

    inputs = inputs_space.sample()
    input_lengths = [num_categories] * num_sub_distributions
    inputs_split = np.split(inputs, num_sub_distributions, axis=1)

    for fw in framework_iterator():
        # Create the correct distribution object.
        cls = MultiCategorical if fw != "torch" else TorchMultiCategorical
        multi_categorical = cls(inputs, None, input_lengths)

        # Deterministic sample: expect always the max-likelihood value.
        expected = np.transpose(np.argmax(inputs_split, axis=-1))
        out = multi_categorical.deterministic_sample()
        check(out, expected)

        # Non-deterministic sample -> expect roughly the mean.
        out = multi_categorical.sample()
        check(
            tf.reduce_mean(out)
            if fw != "torch" else torch.mean(out.float()),
            1.0,
            decimals=0)

        # Test log-likelihood outputs.
        probs = softmax(inputs_split)
        values = values_space.sample()

        out = multi_categorical.logp(values if fw != "torch" else [
            torch.Tensor(values[i]) for i in range(num_sub_distributions)
        ])  # v in np.stack(values, 1)])
        expected = []
        for i in range(batch_size):
            expected.append(
                np.sum(
                    np.log(
                        np.array([
                            probs[j][i][values[j][i]]
                            for j in range(num_sub_distributions)
                        ]))))
        check(out, expected, decimals=4)

        # Test entropy outputs.
        out = multi_categorical.entropy()
        expected_entropy = -np.sum(np.sum(probs * np.log(probs), 0), -1)
        check(out, expected_entropy)
def test_categorical(self):
    batch_size = 10000
    num_categories = 4
    # Create categorical distribution with n categories.
    inputs_space = Box(
        -1.0, 2.0, shape=(batch_size, num_categories), dtype=np.float32)
    values_space = Box(
        0, num_categories - 1, shape=(batch_size, ), dtype=np.int32)

    inputs = inputs_space.sample()

    for fw, sess in framework_iterator(
            session=True, frameworks=("tf", "tf2", "torch")):
        # Create the correct distribution object.
        cls = JAXCategorical if fw == "jax" else Categorical \
            if fw != "torch" else TorchCategorical
        categorical = cls(inputs, {})

        # Do a stability test using extreme NN outputs to see whether
        # sampling and logp'ing result in NaN or +/-inf values.
        self._stability_test(
            cls,
            inputs_space.shape,
            fw=fw,
            sess=sess,
            bounds=(0, num_categories - 1))

        # Deterministic sample: expect always the max-likelihood value.
        expected = np.transpose(np.argmax(inputs, axis=-1))
        out = categorical.deterministic_sample()
        check(out, expected)

        # Non-deterministic sample -> expect roughly the mean.
        out = categorical.sample()
        check(
            np.mean(out) if fw == "jax" else tf.reduce_mean(out)
            if fw != "torch" else torch.mean(out.float()),
            1.0,
            decimals=0)

        # Test log-likelihood outputs.
        probs = softmax(inputs)
        values = values_space.sample()

        out = categorical.logp(
            values if fw != "torch" else torch.Tensor(values))
        expected = []
        for i in range(batch_size):
            expected.append(np.sum(np.log(np.array(probs[i][values[i]]))))
        check(out, expected, decimals=4)

        # Test entropy outputs.
        out = categorical.entropy()
        expected_entropy = -np.sum(probs * np.log(probs), -1)
        check(out, expected_entropy)
def test_minibatch_sequencing(self):
    ModelCatalog.register_custom_model("rnn", RNNSpyModel)
    register_env("counter", lambda _: DebugCounterEnv())
    ppo = PPOTrainer(
        env="counter",
        config={
            "shuffle_sequences": False,  # for deterministic testing
            "num_workers": 0,
            "rollout_fragment_length": 20,
            "train_batch_size": 20,
            "sgd_minibatch_size": 10,
            "num_sgd_iter": 1,
            "model": {
                "custom_model": "rnn",
                "max_seq_len": 4,
                "vf_share_layers": True,
            },
            "framework": "tf",
        })
    ppo.train()
    ppo.train()

    # First epoch: 20 observations get split into 2 minibatches of 8;
    # four observations are discarded.
    batch0 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
    batch1 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
    if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
        batch0, batch1 = batch1, batch0  # sort minibatches
    self.assertEqual(batch0["seq_lens"].tolist(), [4, 4])
    self.assertEqual(batch1["seq_lens"].tolist(), [4, 3])
    check(batch0["sequences"], [
        [[0], [1], [2], [3]],
        [[4], [5], [6], [7]],
    ])
    check(batch1["sequences"], [
        [[8], [9], [10], [11]],
        [[12], [13], [14], [0]],
    ])

    # Second epoch: 20 observations get split into 2 minibatches of 8;
    # four observations are discarded.
    batch2 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
    batch3 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
    if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
        batch2, batch3 = batch3, batch2
    self.assertEqual(batch2["seq_lens"].tolist(), [4, 4])
    self.assertEqual(batch3["seq_lens"].tolist(), [2, 4])
    check(batch2["sequences"], [
        [[5], [6], [7], [8]],
        [[9], [10], [11], [12]],
    ])
    check(batch3["sequences"], [
        [[13], [14], [0], [0]],
        [[0], [1], [2], [3]],
    ])
def test_traj_view_next_action(self):
    action_space = Discrete(2)
    rollout_worker_w_api = RolloutWorker(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_config=ppo.DEFAULT_CONFIG,
        rollout_fragment_length=200,
        policy_spec=ppo.PPOTorchPolicy,
        policy_mapping_fn=None,
        num_envs=1,
    )
    # Add the next action (a') and 2nd next action (a'') to the view
    # requirements of the policy.
    # These should then be visible in postprocessing and train batches.
    # Switch them off for action computations (they can't be there, as we
    # don't know the next actions yet at action computation time).
    rollout_worker_w_api.policy_map[DEFAULT_POLICY_ID].view_requirements[
        "next_actions"] = ViewRequirement(
            SampleBatch.ACTIONS,
            shift=1,
            space=action_space,
            used_for_compute_actions=False,
        )
    rollout_worker_w_api.policy_map[DEFAULT_POLICY_ID].view_requirements[
        "2nd_next_actions"] = ViewRequirement(
            SampleBatch.ACTIONS,
            shift=2,
            space=action_space,
            used_for_compute_actions=False,
        )
    # Make sure we have DONEs as well.
    rollout_worker_w_api.policy_map[DEFAULT_POLICY_ID].view_requirements[
        "dones"] = ViewRequirement()

    batch = rollout_worker_w_api.sample()
    self.assertTrue("next_actions" in batch)
    self.assertTrue("2nd_next_actions" in batch)

    expected_a_ = None  # expected next action
    expected_a__ = None  # expected 2nd next action
    for i in range(len(batch["actions"])):
        a, d, a_, a__ = (
            batch["actions"][i],
            batch["dones"][i],
            batch["next_actions"][i],
            batch["2nd_next_actions"][i],
        )
        # Episode done: next action and 2nd next action should be 0.
        if d:
            check(a_, 0)
            check(a__, 0)
            expected_a_ = None
            expected_a__ = None
            continue
        # Episode is not done and we have an expected next-a.
        if expected_a_ is not None:
            check(a, expected_a_)
        if expected_a__ is not None:
            check(a_, expected_a__)
        expected_a__ = a__
        expected_a_ = a_
def test_shuffle_with_interceptor(self):
    """Tests whether `shuffle()` clears the `intercepted_values` cache."""
    s = SampleBatch({
        "a": np.array([1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7]),
    })
    # Set a dummy get-interceptor (returning all values, but plus 1).
    s.set_get_interceptor(lambda v: v + 1)
    # Make sure the interceptor works.
    check(s["a"], [2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6, 7, 8])

    s.shuffle()

    # Make sure the intercepted values are NOT the original ones
    # (from before the shuffle), but have also been shuffled.
    check(
        s["a"],
        [2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6, 7, 8],
        false=True)
def test_microbenchmark_vs_old_version(self):
    """
    Results from March 2020 (capacity=1048576):

    New tree: 0.049599366000000256s
        results = timeit.timeit(
            "tree.sum(5, 60000)",
            setup="from ray.rllib.execution.segment_tree import "
            "SumSegmentTree; tree = SumSegmentTree({})".format(capacity),
            number=10000)

    Old tree: 0.13390400999999974s
        results = timeit.timeit(
            "tree.sum(5, 60000)",
            setup="from ray.rllib.execution.tests.old_segment_tree import "
            "OldSumSegmentTree; tree = OldSumSegmentTree({})".format(
                capacity),
            number=10000)
    """
    capacity = 2**20
    # Expect reductions to be much faster now.
    new = timeit.timeit(
        "tree.sum(5, 60000)",
        setup="from ray.rllib.execution.segment_tree import "
        "SumSegmentTree; tree = SumSegmentTree({})".format(capacity),
        number=10000)
    old = timeit.timeit(
        "tree.sum(5, 60000)",
        setup="from ray.rllib.execution.tests.old_segment_tree import "
        "OldSumSegmentTree; tree = OldSumSegmentTree({})".format(capacity),
        number=10000)
    print("Sum performance (time spent) old={} new={}".format(old, new))
    self.assertGreater(old, new)

    # Expect insertions to be roughly the same.
    new = timeit.timeit(
        "tree[50000] = 10; tree[50001] = 11",
        setup="from ray.rllib.execution.segment_tree import "
        "SumSegmentTree; tree = SumSegmentTree({})".format(capacity),
        number=100000)
    old = timeit.timeit(
        "tree[50000] = 10; tree[50001] = 11",
        setup="from ray.rllib.execution.tests.old_segment_tree import "
        "OldSumSegmentTree; tree = OldSumSegmentTree({})".format(capacity),
        number=100000)
    print("Insertion performance (time spent) "
          "old={} new={}".format(old, new))
    check(old, new, rtol=0.15)
def test_wrap_gym_env(self):
    record_env_dir = os.popen("mktemp -d").read()[:-1]
    print(f"tmp dir for videos={record_env_dir}")
    if not os.path.exists(record_env_dir):
        sys.exit(1)

    num_steps_per_episode = 10
    wrapped = record_env_wrapper(
        env=MockEnv2(num_steps_per_episode),
        record_env=record_env_dir,
        log_dir="",
        policy_config={
            "in_evaluation": False,
        },
    )
    # Non-MultiAgentEnv: Wrapper's type is gym.wrappers.Monitor.
    self.assertTrue(isinstance(wrapped, gym.wrappers.Monitor))
    self.assertFalse(isinstance(wrapped, VideoMonitor))

    wrapped.reset()
    # Expect one video file to have been produced in the tmp dir.
    os.chdir(record_env_dir)
    ls = glob.glob("*.mp4")
    self.assertTrue(len(ls) == 1)
    # 10 steps for a complete episode.
    for i in range(num_steps_per_episode):
        wrapped.step(0)
    # Another episode.
    wrapped.reset()
    for i in range(num_steps_per_episode):
        wrapped.step(0)
    # Expect another video file to have been produced (2nd episode).
    ls = glob.glob("*.mp4")
    self.assertTrue(len(ls) == 2)

    # MockEnv2 returns a reward of 100.0 every step.
    # So the total reward is 1000.0 per episode (10 steps).
    check(
        np.array([100.0, 100.0]) * num_steps_per_episode,
        wrapped.get_episode_rewards(),
    )
    # Erase all generated files and the temp path just in case,
    # so as not to disturb further CI tests.
    shutil.rmtree(record_env_dir)
def test_apex_dqn_compilation_and_per_worker_epsilon_values(self):
    """Test whether an APEX-DQNTrainer can be built on all frameworks."""
    config = (
        apex.ApexConfig()
        .rollouts(num_rollout_workers=3)
        .resources(num_gpus=0)
        .training(
            replay_buffer_config={
                "learning_starts": 1000,
            },
            optimizer={
                "num_replay_buffer_shards": 1,
            },
        )
        .reporting(
            min_sample_timesteps_per_reporting=100,
            min_time_s_per_reporting=1,
        )
    )

    for _ in framework_iterator(config, with_eager_tracing=True):
        trainer = config.build(env="CartPole-v0")

        # Test per-worker epsilon distribution.
        infos = trainer.workers.foreach_policy(
            lambda p, _: p.get_exploration_state()
        )
        expected = [0.4, 0.016190862, 0.00065536]
        check([i["cur_epsilon"] for i in infos], [0.0] + expected)

        check_compute_single_action(trainer)

        for i in range(2):
            results = trainer.train()
            check_train_results(results)
            print(results)

        # Test again per-worker epsilon distribution
        # (should not have changed).
        infos = trainer.workers.foreach_policy(
            lambda p, _: p.get_exploration_state()
        )
        check([i["cur_epsilon"] for i in infos], [0.0] + expected)

        trainer.stop()
def test_right_zero_padding(self):
    """Tests whether right-zero-padding works properly."""
    s1 = SampleBatch({
        "a": np.array([1, 2, 3]),
        "b": {"c": np.array([4, 5, 6])},
        SampleBatch.SEQ_LENS: [1, 2],
    })
    s1.right_zero_pad(max_seq_len=5)
    check(
        s1,
        {
            "a": [1, 0, 0, 0, 0, 2, 3, 0, 0, 0],
            "b": {"c": [4, 0, 0, 0, 0, 5, 6, 0, 0, 0]},
            SampleBatch.SEQ_LENS: [1, 2],
        })
def test_traj_view_lstm_functionality(self):
    action_space = Box(float("-inf"), float("inf"), shape=(3, ))
    obs_space = Box(float("-inf"), float("inf"), (4, ))
    max_seq_len = 50
    rollout_fragment_length = 200
    assert rollout_fragment_length % max_seq_len == 0
    policies = {
        "pol0": (EpisodeEnvAwareLSTMPolicy, obs_space, action_space, {}),
    }

    def policy_fn(agent_id, episode, **kwargs):
        return "pol0"

    config = {
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_fn,
        },
        "model": {
            "use_lstm": True,
            "max_seq_len": max_seq_len,
        },
    }

    rw = RolloutWorker(
        env_creator=lambda _: MultiAgentDebugCounterEnv({"num_agents": 4}),
        policy_config=config,
        rollout_fragment_length=rollout_fragment_length,
        policy_spec=policies,
        policy_mapping_fn=policy_fn,
        normalize_actions=False,
        num_envs=1,
    )

    for iteration in range(20):
        result = rw.sample()
        check(result.count, rollout_fragment_length)
        pol_batch_w = result.policy_batches["pol0"]
        assert pol_batch_w.count >= rollout_fragment_length
        analyze_rnn_batch(
            pol_batch_w,
            max_seq_len,
            view_requirements=rw.policy_map["pol0"].view_requirements,
        )
def test_ddppo_compilation(self):
    """Test whether a DDPPOTrainer can be built with both frameworks."""
    config = ppo.ddppo.DEFAULT_CONFIG.copy()
    config["num_gpus_per_worker"] = 0
    num_iterations = 2

    for _ in framework_iterator(config, frameworks="torch"):
        trainer = ppo.ddppo.DDPPOTrainer(config=config, env="CartPole-v0")
        for i in range(num_iterations):
            trainer.train()
        # Make sure weights on all workers are the same (including the
        # local one).
        weights = trainer.workers.foreach_worker(lambda w: w.get_weights())
        for w in weights[1:]:
            check(w, weights[0])

        check_compute_single_action(trainer)
        trainer.stop()
def test_impala_lr_schedule(self):
    # Test whether we correctly ignore the "lr" setting.
    # The first lr should be 0.05.
    config = (
        impala.ImpalaConfig()
        .resources(num_gpus=0)
        .training(
            lr=0.1,
            lr_schedule=[
                [0, 0.05],
                [100000, 0.000001],
            ],
            train_batch_size=100,
        )
        .rollouts(num_envs_per_worker=2)
        .environment(env="CartPole-v0")
    )

    def get_lr(result):
        return result["info"][LEARNER_INFO][DEFAULT_POLICY_ID][
            LEARNER_STATS_KEY]["cur_lr"]

    for fw in framework_iterator(config):
        trainer = config.build()
        policy = trainer.get_policy()

        try:
            if fw == "tf":
                check(policy.get_session().run(policy.cur_lr), 0.05)
            else:
                check(policy.cur_lr, 0.05)
            for _ in range(1):
                r1 = trainer.train()
            for _ in range(2):
                r2 = trainer.train()
            for _ in range(2):
                r3 = trainer.train()
            # Due to the asynch'ness of IMPALA, learner-stats metrics
            # could be delayed by one iteration. Do 3 train() calls here
            # and measure guaranteed decrease in lr between 1st and 3rd.
            lr1 = get_lr(r1)
            lr2 = get_lr(r2)
            lr3 = get_lr(r3)
            assert lr2 <= lr1, (lr1, lr2)
            assert lr3 <= lr2, (lr2, lr3)
            assert lr3 < lr1, (lr1, lr3)
        finally:
            trainer.stop()
def test_log_probs_from_logits_and_actions(self):
    """Tests log_probs_from_logits_and_actions."""
    seq_len = 7
    num_actions = 3
    batch_size = 4

    for fw, sess in framework_iterator(
            frameworks=("torch", "tf"), session=True):
        vtrace = vtrace_tf if fw == "tf" else vtrace_torch
        policy_logits = Box(
            -1.0, 1.0, (seq_len, batch_size, num_actions),
            np.float32).sample()
        actions = np.random.randint(
            0, num_actions - 1, size=(seq_len, batch_size), dtype=np.int32)

        if fw == "torch":
            action_log_probs_tensor = \
                vtrace.log_probs_from_logits_and_actions(
                    torch.from_numpy(policy_logits),
                    torch.from_numpy(actions))
        else:
            action_log_probs_tensor = \
                vtrace.log_probs_from_logits_and_actions(
                    policy_logits, actions)

        # Ground truth:
        # Using broadcasting to create a mask that indexes action logits.
        action_index_mask = actions[..., None] == np.arange(num_actions)

        def index_with_mask(array, mask):
            return array[mask].reshape(*array.shape[:-1])

        # Note: Normally log(softmax) is not a good idea because it's not
        # numerically stable. However, in this test we have well-behaved
        # values.
        ground_truth_v = index_with_mask(
            np.log(softmax(policy_logits)), action_index_mask)

        if sess:
            action_log_probs_tensor = sess.run(action_log_probs_tensor)

        check(action_log_probs_tensor, ground_truth_v)
def test_apex_ddpg_compilation_and_per_worker_epsilon_values(self):
    """Test whether an APEX-DDPGTrainer can be built on all frameworks."""
    config = (
        apex_ddpg.ApexDDPGConfig()
        .rollouts(num_rollout_workers=2)
        .reporting(min_sample_timesteps_per_reporting=100)
        .training(
            replay_buffer_config={"learning_starts": 0},
            optimizer={"num_replay_buffer_shards": 1},
        )
        .environment(env="Pendulum-v1")
    )

    num_iterations = 1

    for _ in framework_iterator(config, with_eager_tracing=True):
        trainer = config.build()

        # Test per-worker scale distribution.
        infos = trainer.workers.foreach_policy(
            lambda p, _: p.get_exploration_state())
        scale = [i["cur_scale"] for i in infos]
        expected = [
            0.4 ** (1 + (i + 1) / float(config.num_workers - 1) * 7)
            for i in range(config.num_workers)
        ]
        check(scale, [0.0] + expected)

        for _ in range(num_iterations):
            results = trainer.train()
            check_train_results(results)
            print(results)
        check_compute_single_action(trainer)

        # Test again per-worker scale distribution
        # (should not have changed).
        infos = trainer.workers.foreach_policy(
            lambda p, _: p.get_exploration_state())
        scale = [i["cur_scale"] for i in infos]
        check(scale, [0.0] + expected)

        trainer.stop()
def test_ddppo_compilation(self):
    """Test whether a DDPPOTrainer can be built with both frameworks."""
    config = ppo.DDPPOConfig().resources(num_gpus_per_worker=0)
    num_iterations = 2

    for _ in framework_iterator(config, frameworks="torch"):
        trainer = config.build(env="CartPole-v0")
        for i in range(num_iterations):
            results = trainer.train()
            check_train_results(results)
            print(results)

        # Make sure weights on all workers are the same.
        weights = trainer.workers.foreach_worker(lambda w: w.get_weights())
        for w in weights[1:]:
            check(w, weights[1])

        check_compute_single_action(trainer)
        trainer.stop()
def test_vtrace(self):
    """Tests V-trace against ground truth data calculated in python."""
    seq_len = 5
    batch_size = 10

    # Create log_rhos such that rho will span from near-zero to above the
    # clipping thresholds. In particular, calculate log_rhos in
    # [-2.5, 2.5), so that rho is in approx [0.08, 12.2).
    space_w_time = Box(-1.0, 1.0, (seq_len, batch_size), np.float32)
    space_only_batch = Box(-1.0, 1.0, (batch_size, ), np.float32)
    log_rhos = space_w_time.sample() / (batch_size * seq_len)
    log_rhos = 5 * (log_rhos - 0.5)  # [0.0, 1.0) -> [-2.5, 2.5).
    values = {
        "log_rhos": log_rhos,
        # T, B where B_i: [0.9 / (i+1)] * T
        "discounts": np.array(
            [[0.9 / (b + 1) for b in range(batch_size)]
             for _ in range(seq_len)]),
        "rewards": space_w_time.sample(),
        "values": space_w_time.sample() / batch_size,
        "bootstrap_value": space_only_batch.sample() + 1.0,
        "clip_rho_threshold": 3.7,
        "clip_pg_rho_threshold": 2.2,
    }

    for fw, sess in framework_iterator(
            frameworks=("torch", "tf"), session=True):
        vtrace = vtrace_tf if fw != "torch" else vtrace_torch
        output = vtrace.from_importance_weights(**values)
        if sess:
            output = sess.run(output)

        ground_truth_v = _ground_truth_calculation(vtrace, **values)
        check(output, ground_truth_v)
def test_impala_lr_schedule(self):
    config = impala.DEFAULT_CONFIG.copy()
    # Do not use any (fake) GPUs.
    config["num_gpus"] = 0
    # Test whether we correctly ignore the "lr" setting.
    # The first lr should be 0.05.
    config["lr"] = 0.1
    config["lr_schedule"] = [
        [0, 0.05],
        [10000, 0.000001],
    ]
    config["env"] = "CartPole-v0"

    def get_lr(result):
        return result["info"][LEARNER_INFO][DEFAULT_POLICY_ID][
            LEARNER_STATS_KEY]["cur_lr"]

    for fw in framework_iterator(config):
        trainer = impala.ImpalaTrainer(config=config)
        policy = trainer.get_policy()

        try:
            if fw == "tf":
                check(policy.get_session().run(policy.cur_lr), 0.05)
            else:
                check(policy.cur_lr, 0.05)
            r1 = trainer.train()
            r2 = trainer.train()
            r3 = trainer.train()
            # Due to the asynch'ness of IMPALA, learner-stats metrics
            # could be delayed by one iteration. Do 3 train() calls here
            # and measure guaranteed decrease in lr between 1st and 3rd.
            lr1 = get_lr(r1)
            lr2 = get_lr(r2)
            lr3 = get_lr(r3)
            assert lr2 <= lr1, (lr1, lr2)
            assert lr3 <= lr2, (lr2, lr3)
            assert lr3 < lr1, (lr1, lr3)
        finally:
            trainer.stop()
def test_n_step_from_same_obs_source_array(self):
    """Tests whether n-step also works on a shared obs/new-obs array."""
    gamma = 0.99

    # The underlying observation data. Both obs and next_obs will
    # be references into that same np.array.
    underlying_obs = np.arange(0, 8)
    obs = underlying_obs[:7]
    next_obs = underlying_obs[1:]

    actions = np.random.randint(-1, 3, size=(7,))
    check_actions = actions.copy()
    rewards = [10.0, 0.0, 100.0, 50.0, 60.0, 10.0, 100.0]
    dones = [False, False, False, False, False, False, True]

    batch = SampleBatch(
        {
            SampleBatch.OBS: obs,
            SampleBatch.ACTIONS: actions,
            SampleBatch.REWARDS: rewards,
            SampleBatch.DONES: dones,
            SampleBatch.NEXT_OBS: next_obs,
        }
    )
    adjust_nstep(4, gamma, batch)

    check(batch[SampleBatch.OBS], [0, 1, 2, 3, 4, 5, 6])
    check(batch[SampleBatch.ACTIONS], check_actions)
    check(batch[SampleBatch.NEXT_OBS], [4, 5, 6, 7, 7, 7, 7])
    check(
        batch[SampleBatch.DONES],
        [False, False, False, True, True, True, True])
    check(
        batch[SampleBatch.REWARDS],
        [
            discount_cumsum(np.array(rewards[0:4]), gamma)[0],
            discount_cumsum(np.array(rewards[1:5]), gamma)[0],
            discount_cumsum(np.array(rewards[2:6]), gamma)[0],
            discount_cumsum(np.array(rewards[3:7]), gamma)[0],
            discount_cumsum(np.array(rewards[4:]), gamma)[0],
            discount_cumsum(np.array(rewards[5:]), gamma)[0],
            discount_cumsum(np.array(rewards[6:]), gamma)[0],
        ],
    )
def test_fqe_model(self):
    # Test FQETorchModel for:
    # (1) Check that it does not modify the underlying batch during
    #     training.
    # (2) Check that the stopping criteria from FQE are working correctly.
    # (3) Check that using fqe._compute_action_probs equals brute-force
    #     iterating over all actions with policy.compute_log_likelihoods.
    fqe = FQETorchModel(
        policy=self.algo.get_policy(),
        gamma=self.gamma,
        **self.q_model_config,
    )
    tmp_batch = copy.deepcopy(self.batch)
    losses = fqe.train(self.batch)

    # Make sure FQETorchModel.train() does not modify self.batch.
    check(tmp_batch, self.batch)

    # Make sure the FQE stopping criteria are respected.
    assert len(losses) == fqe.n_iters or losses[-1] < fqe.delta, (
        f"FQE.train() terminated early in {len(losses)} steps with final "
        f"loss {losses[-1]} for n_iters: {fqe.n_iters} and delta: "
        f"{fqe.delta}"
    )

    # Test fqe._compute_action_probs against the "brute force" method
    # of computing the log-prob for each possible action individually,
    # using policy.compute_log_likelihoods.
    obs = torch.tensor(self.batch["obs"], device=fqe.device)
    action_probs = fqe._compute_action_probs(obs)
    action_probs = convert_to_numpy(action_probs)

    tmp_probs = []
    for act in range(fqe.policy.action_space.n):
        tmp_actions = np.zeros_like(self.batch["actions"]) + act
        log_probs = fqe.policy.compute_log_likelihoods(
            actions=tmp_actions,
            obs_batch=self.batch["obs"],
        )
        tmp_probs.append(torch.exp(log_probs))
    tmp_probs = torch.stack(tmp_probs).transpose(0, 1)
    tmp_probs = convert_to_numpy(tmp_probs)
    check(action_probs, tmp_probs, decimals=3)
def test_n_step_very_short_trajectory(self):
    """Tests whether n-step also works for very short trajectories."""
    gamma = 1.0
    obs = np.arange(0, 2)
    actions = np.random.randint(-100, 300, size=(2, ))
    check_actions = actions.copy()
    rewards = [10.0, 100.0]
    next_obs = np.arange(1, 3)
    batch = SampleBatch({
        SampleBatch.OBS: obs,
        SampleBatch.ACTIONS: actions,
        SampleBatch.REWARDS: rewards,
        SampleBatch.DONES: [False, False],
        SampleBatch.NEXT_OBS: next_obs,
    })
    adjust_nstep(3, gamma, batch)

    check(batch[SampleBatch.OBS], [0, 1])
    check(batch[SampleBatch.ACTIONS], check_actions)
    check(batch[SampleBatch.DONES], [False, False])
    check(batch[SampleBatch.REWARDS], [10.0 + gamma * 100.0, 100.0])
    check(batch[SampleBatch.NEXT_OBS], [2, 2])