Example #1
 def test_len_and_size_bytes(self):
     s1 = SampleBatch({
         "a": np.array([1, 2, 3]),
         "b": {
             "c": np.array([4, 5, 6])
         },
         "seq_lens": [1, 2],
     })
     check(len(s1), 3)
     check(s1.size_bytes(),
           s1["a"].nbytes + s1["b"]["c"].nbytes + s1["seq_lens"].nbytes)
Example #2
    def on_train_result(self, *, trainer, result: dict, **kwargs):
        stats = result["info"][LEARNER_INFO][DEFAULT_POLICY_ID][
            LEARNER_STATS_KEY]
        # Learning rate should go to 0 after 1 iter.
        check(stats["cur_lr"], 5e-5 if trainer.iteration == 1 else 0.0)
        # Entropy coeff goes to 0.05, then 0.0 (per iter).
        check(stats["entropy_coeff"], 0.1 if trainer.iteration == 1 else 0.05)

        trainer.workers.foreach_policy(
            self._check_lr_torch if trainer.config["framework"] ==
            "torch" else self._check_lr_tf)
Example #3
    def test_hard_horizon(self):
        ev = RolloutWorker(
            env_creator=lambda _: MockEnv2(episode_length=10),
            policy_spec=MockPolicy,
            batch_mode="complete_episodes",
            rollout_fragment_length=10,
            episode_horizon=4,
            soft_horizon=False,
        )
        samples = ev.sample()
        # Three logical episodes and correct episode resets (always after 4
        # steps).
        self.assertEqual(len(set(samples["eps_id"])), 3)
        for i in range(4):
            self.assertEqual(np.argmax(samples["obs"][i]), i)
        self.assertEqual(np.argmax(samples["obs"][4]), 0)
        # 3 done values.
        self.assertEqual(sum(samples["dones"]), 3)
        ev.stop()

        # The Trainer's hard horizon (6) is smaller than the gym env's own
        # max_episode_steps limit (200 for CartPole-v0).
        ev = RolloutWorker(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_spec=MockPolicy,
            batch_mode="complete_episodes",
            rollout_fragment_length=10,
            episode_horizon=6,
            soft_horizon=False,
        )
        samples = ev.sample()
        # 12 steps due to `complete_episodes` batch_mode.
        self.assertEqual(len(samples["eps_id"]), 12)
        # Two logical episodes and correct episode resets (always after 6(!)
        # steps).
        self.assertEqual(len(set(samples["eps_id"])), 2)
        # 2 done values after 6 and 12 steps.
        check(
            samples["dones"],
            [
                False,
                False,
                False,
                False,
                False,
                True,
                False,
                False,
                False,
                False,
                False,
                True,
            ],
        )
        ev.stop()
Example #4
    def test_ppo_compilation_and_schedule_mixins(self):
        """Test whether a PPOTrainer can be built with all frameworks."""
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        # For checking lr-schedule correctness.
        config["callbacks"] = MyCallbacks

        config["num_workers"] = 1
        config["num_sgd_iter"] = 2
        # Settings in case we use an LSTM.
        config["model"]["lstm_cell_size"] = 10
        config["model"]["max_seq_len"] = 20
        # Use default-native keras models whenever possible.
        config["model"]["_use_default_native_models"] = True

        # Setup lr- and entropy schedules for testing.
        config["lr_schedule"] = [[0, config["lr"]], [128, 0.0]]
        # Set entropy_coeff to a faulty value to prove that it'll get
        # overridden by the schedule below (which is expected).
        config["entropy_coeff"] = 100.0
        config["entropy_coeff_schedule"] = [[0, 0.1], [256, 0.0]]

        config["train_batch_size"] = 128
        # Test with compression.
        config["compress_observations"] = True
        num_iterations = 2

        for fw in framework_iterator(config):
            for env in ["CartPole-v0", "MsPacmanNoFrameskip-v4"]:
                print("Env={}".format(env))
                for lstm in [True, False]:
                    print("LSTM={}".format(lstm))
                    config["model"]["use_lstm"] = lstm
                    config["model"]["lstm_use_prev_action"] = lstm
                    config["model"]["lstm_use_prev_reward"] = lstm

                    trainer = ppo.PPOTrainer(config=config, env=env)
                    policy = trainer.get_policy()
                    entropy_coeff = policy.entropy_coeff
                    lr = policy.cur_lr
                    if fw == "tf":
                        entropy_coeff, lr = policy.get_session().run(
                            [entropy_coeff, lr])
                    check(entropy_coeff, 0.1)
                    check(lr, config["lr"])

                    for i in range(num_iterations):
                        print(trainer.train())

                    check_compute_single_action(
                        trainer,
                        include_prev_action_reward=True,
                        include_state=lstm)
                    trainer.stop()
Example #5
    def test_multi_agent_complex_spaces(self):
        ModelCatalog.register_custom_model("dict_spy", DictSpyModel)
        ModelCatalog.register_custom_model("tuple_spy", TupleSpyModel)
        register_env("nested_ma", lambda _: NestedMultiAgentEnv())
        act_space = spaces.Discrete(2)
        pg = PGTrainer(
            env="nested_ma",
            config={
                "num_workers": 0,
                "rollout_fragment_length": 5,
                "train_batch_size": 5,
                "multiagent": {
                    "policies": {
                        "tuple_policy": (
                            None, TUPLE_SPACE, act_space,
                            {"model": {"custom_model": "tuple_spy"}}),
                        "dict_policy": (
                            None, DICT_SPACE, act_space,
                            {"model": {"custom_model": "dict_spy"}}),
                    },
                    "policy_mapping_fn": lambda aid, **kwargs: {
                        "tuple_agent": "tuple_policy",
                        "dict_agent": "dict_policy"}[aid],
                },
                "framework": "tf",
            })
        # Skip first passes as they came from the TorchPolicy loss
        # initialization.
        TupleSpyModel.capture_index = DictSpyModel.capture_index = 0
        pg.train()

        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "d_spy_in_{}".format(i)))
            pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
            cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
            task_i = one_hot(
                DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            check(seen[2][0], task_i)

        for i in range(4):
            seen = pickle.loads(
                ray.experimental.internal_kv._internal_kv_get(
                    "t_spy_in_{}".format(i)))
            pos_i = TUPLE_SAMPLES[i][0].tolist()
            cam_i = TUPLE_SAMPLES[i][1][0].tolist()
            task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
            self.assertEqual(seen[0][0].tolist(), pos_i)
            self.assertEqual(seen[1][0].tolist(), cam_i)
            check(seen[2][0], task_i)
Example #6
 def test_rows(self):
     s1 = SampleBatch(
         {
             "a": np.array([[1, 1], [2, 2], [3, 3]]),
             "b": {"c": np.array([[4, 4], [5, 5], [6, 6]])},
             SampleBatch.SEQ_LENS: np.array([1, 2]),
         }
     )
     check(
         next(s1.rows()),
         {"a": [1, 1], "b": {"c": [4, 4]}, SampleBatch.SEQ_LENS: 1},
     )
Example #7
 def test_nested_multidiscrete_one_hot_preprocessor(self):
     space = Tuple((MultiDiscrete([2, 3, 4]), ))
     pp = get_preprocessor(space)(space)
     self.assertTrue(pp.shape == (9, ))
     check(
         pp.transform((np.array([1, 2, 0]), )),
         [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0],
     )
     check(
         pp.transform((np.array([0, 1, 3]), )),
         [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0],
     )
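
For reference, a plain-NumPy sketch of the encoding the preprocessor is expected to produce: each MultiDiscrete sub-space gets its own one-hot block and the blocks are concatenated, hence the (2 + 3 + 4 = 9,) shape (illustrative only, not the RLlib preprocessor code):

import numpy as np

def multidiscrete_one_hot(values, nvec):
    """One one-hot block per sub-space, concatenated into a single vector."""
    blocks = []
    for v, n in zip(values, nvec):
        block = np.zeros(n, dtype=np.float32)
        block[v] = 1.0
        blocks.append(block)
    return np.concatenate(blocks)

multidiscrete_one_hot([1, 2, 0], nvec=[2, 3, 4])
# -> [0., 1., 0., 0., 1., 1., 0., 0., 0.], matching the first check above.
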
Example #8
    def test_ddpg_exploration_and_with_random_prerun(self):
        """Tests DDPG's Exploration (w/ random actions for n timesteps)."""
        core_config = ddpg.DEFAULT_CONFIG.copy()
        core_config["num_workers"] = 0  # Run locally.
        obs = np.array([0.0, 0.1, -0.1])

        # Test against all frameworks.
        for _ in framework_iterator(core_config):
            config = core_config.copy()
            # Default OUNoise setup.
            trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
            # Setting explore=False should always return the same action.
            a_ = trainer.compute_action(obs, explore=False)
            for _ in range(50):
                a = trainer.compute_action(obs, explore=False)
                check(a, a_)
            # explore=None (default: explore) should return different actions.
            actions = []
            for _ in range(50):
                actions.append(trainer.compute_action(obs))
            check(np.std(actions), 0.0, false=True)
            trainer.stop()

            # Check randomness at beginning.
            config["exploration_config"] = {
                # Act randomly at beginning ...
                "random_timesteps": 50,
                # Then act very closely to deterministic actions thereafter.
                "ou_base_scale": 0.001,
                "initial_scale": 0.001,
                "final_scale": 0.001,
            }
            trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
            # ts=1 (get a deterministic action as per explore=False).
            deterministic_action = trainer.compute_action(obs, explore=False)
            # ts=2-50 (within the random-action window).
            random_a = []
            for _ in range(49):
                random_a.append(trainer.compute_action(obs, explore=True))
                check(random_a[-1], deterministic_action, false=True)
            self.assertTrue(np.std(random_a) > 0.5)

            # ts > 50 (a=deterministic_action + scale * N[0,1])
            for _ in range(50):
                a = trainer.compute_action(obs, explore=True)
                check(a, deterministic_action, rtol=0.1)

            # ts >> 50 (BUT: explore=False -> expect deterministic action).
            for _ in range(50):
                a = trainer.compute_action(obs, explore=False)
                check(a, deterministic_action)
            trainer.stop()
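
For context, the OU exploration used above adds temporally correlated (mean-reverting) noise on top of the deterministic action. A generic Ornstein-Uhlenbeck sketch, with illustrative parameter values that are not tied to RLlib's exact defaults:

import numpy as np

def ou_noise(n_steps, theta=0.15, sigma=0.2, mu=0.0, seed=0):
    """Generate a mean-reverting Ornstein-Uhlenbeck noise trajectory."""
    rng = np.random.default_rng(seed)
    x, out = 0.0, []
    for _ in range(n_steps):
        x += theta * (mu - x) + sigma * rng.standard_normal()
        out.append(x)
    return np.array(out)

# Scaled down by a small factor (cf. `ou_base_scale` above), this is the kind
# of perturbation added to the deterministic DDPG action when explore=True.
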
Example #9
 def test_dict_flattening_preprocessor(self):
     space = Dict({
         "a": Discrete(2),
         "b": Tuple([Discrete(3), Box(-1.0, 1.0, (4, ))]),
     })
     pp = get_preprocessor(space)(space)
     self.assertTrue(isinstance(pp, DictFlatteningPreprocessor))
     self.assertEqual(pp.shape, (9, ))
     check(
         pp.transform({
             "a": 1,
             "b": (1, np.array([0.0, -0.5, 0.1, 0.6]))
         }), [0.0, 1.0, 0.0, 1.0, 0.0, 0.0, -0.5, 0.1, 0.6])
Example #10
    def test_traj_view_lstm_functionality(self):
        action_space = Box(-float("inf"), float("inf"), shape=(3, ))
        obs_space = Box(float("-inf"), float("inf"), (4, ))
        max_seq_len = 50
        rollout_fragment_length = 200
        assert rollout_fragment_length % max_seq_len == 0
        policies = {
            "pol0": (EpisodeEnvAwareLSTMPolicy, obs_space, action_space, {}),
        }

        def policy_fn(agent_id):
            return "pol0"

        config = {
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_fn,
            },
            "model": {
                "use_lstm": True,
                "max_seq_len": max_seq_len,
            },
        }

        rollout_worker_w_api = RolloutWorker(
            env_creator=lambda _: MultiAgentDebugCounterEnv({"num_agents": 4}),
            policy_config=dict(config, **{"_use_trajectory_view_api": True}),
            rollout_fragment_length=rollout_fragment_length,
            policy_spec=policies,
            policy_mapping_fn=policy_fn,
            num_envs=1,
        )
        rollout_worker_wo_api = RolloutWorker(
            env_creator=lambda _: MultiAgentDebugCounterEnv({"num_agents": 4}),
            policy_config=dict(config, **{"_use_trajectory_view_api": False}),
            rollout_fragment_length=rollout_fragment_length,
            policy_spec=policies,
            policy_mapping_fn=policy_fn,
            num_envs=1,
        )
        for iteration in range(20):
            result = rollout_worker_w_api.sample()
            check(result.count, rollout_fragment_length)
            pol_batch_w = result.policy_batches["pol0"]
            assert pol_batch_w.count >= rollout_fragment_length
            analyze_rnn_batch(pol_batch_w, max_seq_len)

            result = rollout_worker_wo_api.sample()
            pol_batch_wo = result.policy_batches["pol0"]
            check(pol_batch_w.data, pol_batch_wo.data)
Example #11
    def test_multi_categorical(self):
        batch_size = 100
        num_categories = 3
        num_sub_distributions = 5
        # Create 5 categorical distributions of 3 categories each.
        inputs_space = Box(-1.0,
                           2.0,
                           shape=(batch_size,
                                  num_sub_distributions * num_categories))
        values_space = Box(0,
                           num_categories - 1,
                           shape=(num_sub_distributions, batch_size),
                           dtype=np.int32)

        inputs = inputs_space.sample()
        input_lengths = [num_categories] * num_sub_distributions
        inputs_split = np.split(inputs, num_sub_distributions, axis=1)

        for fw in framework_iterator():
            # Create the correct distribution object.
            cls = MultiCategorical if fw != "torch" else TorchMultiCategorical
            multi_categorical = cls(inputs, None, input_lengths)

            # Batch of size=100 and deterministic sampling.
            expected = np.transpose(np.argmax(inputs_split, axis=-1))
            # Sample, expect always max value
            # (max likelihood for deterministic draw).
            out = multi_categorical.deterministic_sample()
            check(out, expected)

            # Batch of size=100 and non-deterministic -> expect roughly the mean.
            out = multi_categorical.sample()
            check(tf.reduce_mean(out)
                  if fw != "torch" else torch.mean(out.float()),
                  1.0,
                  decimals=0)

            # Test log-likelihood outputs.
            probs = softmax(inputs_split)
            values = values_space.sample()

            out = multi_categorical.logp(values if fw != "torch" else [
                torch.Tensor(values[i]) for i in range(num_sub_distributions)
            ])  # v in np.stack(values, 1)])
            expected = []
            for i in range(batch_size):
                expected.append(
                    np.sum(
                        np.log(
                            np.array([
                                probs[j][i][values[j][i]]
                                for j in range(num_sub_distributions)
                            ]))))
            check(out, expected, decimals=4)

            # Test entropy outputs.
            out = multi_categorical.entropy()
            expected_entropy = -np.sum(np.sum(probs * np.log(probs), 0), -1)
            check(out, expected_entropy)
Example #12
    def test_categorical(self):
        batch_size = 10000
        num_categories = 4
        # Create categorical distribution with n categories.
        inputs_space = Box(-1.0,
                           2.0,
                           shape=(batch_size, num_categories),
                           dtype=np.float32)
        values_space = Box(0,
                           num_categories - 1,
                           shape=(batch_size, ),
                           dtype=np.int32)

        inputs = inputs_space.sample()

        for fw, sess in framework_iterator(session=True,
                                           frameworks=("tf", "tf2", "torch")):
            # Create the correct distribution object.
            cls = JAXCategorical if fw == "jax" else Categorical if \
                fw != "torch" else TorchCategorical
            categorical = cls(inputs, {})

            # Do a stability test using extreme NN outputs to see whether
            # sampling and logp'ing result in NaN or +/-inf values.
            self._stability_test(cls,
                                 inputs_space.shape,
                                 fw=fw,
                                 sess=sess,
                                 bounds=(0, num_categories - 1))

            # Batch of size=10000 and deterministic sampling.
            expected = np.transpose(np.argmax(inputs, axis=-1))
            # Sample, expect always max value
            # (max likelihood for deterministic draw).
            out = categorical.deterministic_sample()
            check(out, expected)

            # Batch of size=10000 and non-deterministic -> expect roughly the mean.
            out = categorical.sample()
            check(np.mean(out) if fw == "jax" else tf.reduce_mean(out)
                  if fw != "torch" else torch.mean(out.float()),
                  1.0,
                  decimals=0)

            # Test log-likelihood outputs.
            probs = softmax(inputs)
            values = values_space.sample()

            out = categorical.logp(
                values if fw != "torch" else torch.Tensor(values))
            expected = []
            for i in range(batch_size):
                expected.append(np.sum(np.log(np.array(probs[i][values[i]]))))
            check(out, expected, decimals=4)

            # Test entropy outputs.
            out = categorical.entropy()
            expected_entropy = -np.sum(probs * np.log(probs), -1)
            check(out, expected_entropy)
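
The `softmax` helper used for the ground-truth values is imported elsewhere in the test module; a numerically stable stand-in and the entropy formula the test checks against look roughly like this (a sketch, not the library code):

import numpy as np

def softmax_np(logits, axis=-1):
    """Numerically stable softmax: shift by the max before exponentiating."""
    z = logits - np.max(logits, axis=axis, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)

def categorical_entropy(logits):
    """H = -sum_i p_i * log(p_i), computed per row of logits."""
    p = softmax_np(logits)
    return -np.sum(p * np.log(p), axis=-1)
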
Example #13
    def test_minibatch_sequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOTrainer(
            env="counter",
            config={
                "shuffle_sequences": False,  # for deterministic testing
                "num_workers": 0,
                "rollout_fragment_length": 20,
                "train_batch_size": 20,
                "sgd_minibatch_size": 10,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                    "vf_share_layers": True,
                },
                "framework": "tf",
            })
        ppo.train()
        ppo.train()

        # first epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
            batch0, batch1 = batch1, batch0  # sort minibatches
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch1["seq_lens"].tolist(), [4, 3])
        check(batch0["sequences"], [
            [[0], [1], [2], [3]],
            [[4], [5], [6], [7]],
        ])
        check(batch1["sequences"], [
            [[8], [9], [10], [11]],
            [[12], [13], [14], [0]],
        ])

        # second epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch2 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
        batch3 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
        if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
            batch2, batch3 = batch3, batch2
        self.assertEqual(batch2["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch3["seq_lens"].tolist(), [2, 4])
        check(batch2["sequences"], [
            [[5], [6], [7], [8]],
            [[9], [10], [11], [12]],
        ])
        check(batch3["sequences"], [
            [[13], [14], [0], [0]],
            [[0], [1], [2], [3]],
        ])
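
A rough sketch of the sequencing behavior asserted above: the observed counter values are cut into chunks of max_seq_len=4 and the last, shorter chunk is right-zero-padded (illustrative only, not RLlib's chunking code):

def chunk_and_pad(timesteps, max_seq_len):
    """Cut a flat series into max_seq_len chunks, zero-padding the last one."""
    seqs, seq_lens = [], []
    for start in range(0, len(timesteps), max_seq_len):
        chunk = list(timesteps[start:start + max_seq_len])
        seq_lens.append(len(chunk))
        seqs.append(chunk + [0] * (max_seq_len - len(chunk)))
    return seqs, seq_lens

chunk_and_pad(range(15), max_seq_len=4)
# -> ([[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 0]],
#     [4, 4, 4, 3])
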
Example #14
    def test_traj_view_next_action(self):
        action_space = Discrete(2)
        rollout_worker_w_api = RolloutWorker(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_config=ppo.DEFAULT_CONFIG,
            rollout_fragment_length=200,
            policy_spec=ppo.PPOTorchPolicy,
            policy_mapping_fn=None,
            num_envs=1,
        )
        # Add the next action (a') and 2nd next action (a'') to the view
        # requirements of the policy.
        # This should be visible then in postprocessing and train batches.
        # Switch off for action computations (can't be there as we don't know
        # the next actions already at action computation time).
        rollout_worker_w_api.policy_map[DEFAULT_POLICY_ID].view_requirements[
            "next_actions"] = ViewRequirement(
                SampleBatch.ACTIONS,
                shift=1,
                space=action_space,
                used_for_compute_actions=False,
            )
        rollout_worker_w_api.policy_map[DEFAULT_POLICY_ID].view_requirements[
            "2nd_next_actions"] = ViewRequirement(
                SampleBatch.ACTIONS,
                shift=2,
                space=action_space,
                used_for_compute_actions=False,
            )

        # Make sure we have DONEs as well.
        rollout_worker_w_api.policy_map[DEFAULT_POLICY_ID].view_requirements[
            "dones"] = ViewRequirement()
        batch = rollout_worker_w_api.sample()
        self.assertTrue("next_actions" in batch)
        self.assertTrue("2nd_next_actions" in batch)
        expected_a_ = None  # expected next action
        expected_a__ = None  # expected 2nd next action
        for i in range(len(batch["actions"])):
            a, d, a_, a__ = (
                batch["actions"][i],
                batch["dones"][i],
                batch["next_actions"][i],
                batch["2nd_next_actions"][i],
            )
            # Episode done: next action and 2nd next action should be 0.
            if d:
                check(a_, 0)
                check(a__, 0)
                expected_a_ = None
                expected_a__ = None
                continue
            # Episode is not done and we have an expected next-a.
            if expected_a_ is not None:
                check(a, expected_a_)
            if expected_a__ is not None:
                check(a_, expected_a__)
            expected_a__ = a__
            expected_a_ = a_
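
Conceptually, a ViewRequirement on ACTIONS with shift=1 or shift=2 is just the action column shifted toward the future by that many steps, zero-filled at the tail; a tiny numpy sketch (ignoring episode boundaries, which the test handles via the dones check):

import numpy as np

def shifted_column(col, shift, fill=0):
    """Shift `col` by `shift` steps into the future, filling the tail."""
    col = np.asarray(col)
    out = np.full_like(col, fill)
    if shift < len(col):
        out[:len(col) - shift] = col[shift:]
    return out

shifted_column([1, 0, 0, 1], shift=1)  # -> [0, 0, 1, 0]  ("next_actions")
shifted_column([1, 0, 0, 1], shift=2)  # -> [0, 1, 0, 0]  ("2nd_next_actions")
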
Example #15
 def test_shuffle_with_interceptor(self):
     """Tests, whether `shuffle()` clears the `intercepted_values` cache."""
     s = SampleBatch({
         "a":
         np.array([1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7]),
     })
     # Set a dummy get-interceptor (returns all values incremented by 1).
     s.set_get_interceptor(lambda v: v + 1)
     # Make sure the interceptor works.
     check(s["a"], [2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6, 7, 8])
     s.shuffle()
     # Make sure the intercepted values are NOT the original (pre-shuffle)
     # ones, but have been shuffled as well.
     check(s["a"], [2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6, 7, 8],
           false=True)
Example #16
    def test_microbenchmark_vs_old_version(self):
        """
        Results from March 2020 (capacity=1048576):

        New tree:
        0.049599366000000256s
        results = timeit.timeit("tree.sum(5, 60000)",
            setup="from ray.rllib.execution.segment_tree import
            SumSegmentTree; tree = SumSegmentTree({})".format(capacity),
            number=10000)

        Old tree:
        0.13390400999999974s
        results = timeit.timeit("tree.sum(5, 60000)",
            setup="from ray.rllib.execution.tests.old_segment_tree import
            OldSumSegmentTree; tree = OldSumSegmentTree({})".format(capacity),
            number=10000)
        """
        capacity = 2**20
        # Expect reductions to be much faster now.
        new = timeit.timeit(
            "tree.sum(5, 60000)",
            setup="from ray.rllib.execution.segment_tree import "
            "SumSegmentTree; tree = SumSegmentTree({})".format(capacity),
            number=10000)
        old = timeit.timeit(
            "tree.sum(5, 60000)",
            setup="from ray.rllib.execution.tests.old_segment_tree import "
            "OldSumSegmentTree; tree = OldSumSegmentTree({})".format(capacity),
            number=10000)
        print("Sum performance (time spent) old={} new={}".format(old, new))
        self.assertGreater(old, new)

        # Expect insertions to be roughly the same.
        new = timeit.timeit(
            "tree[50000] = 10; tree[50001] = 11",
            setup="from ray.rllib.execution.segment_tree import "
            "SumSegmentTree; tree = SumSegmentTree({})".format(capacity),
            number=100000)
        old = timeit.timeit(
            "tree[50000] = 10; tree[50001] = 11",
            setup="from ray.rllib.execution.tests.old_segment_tree import "
            "OldSumSegmentTree; tree = OldSumSegmentTree({})".format(capacity),
            number=100000)
        print("Insertion performance (time spent) "
              "old={} new={}".format(old, new))
        check(old, new, rtol=0.15)
Example #17
    def test_wrap_gym_env(self):
        record_env_dir = os.popen("mktemp -d").read()[:-1]
        print(f"tmp dir for videos={record_env_dir}")

        if not os.path.exists(record_env_dir):
            sys.exit(1)

        num_steps_per_episode = 10
        wrapped = record_env_wrapper(
            env=MockEnv2(num_steps_per_episode),
            record_env=record_env_dir,
            log_dir="",
            policy_config={
                "in_evaluation": False,
            },
        )
        # Non MultiAgentEnv: Wrapper's type is wrappers.Monitor.
        self.assertTrue(isinstance(wrapped, gym.wrappers.Monitor))
        self.assertFalse(isinstance(wrapped, VideoMonitor))

        wrapped.reset()
        # Expect one video file to have been produced in the tmp dir.
        os.chdir(record_env_dir)
        ls = glob.glob("*.mp4")
        self.assertTrue(len(ls) == 1)
        # 10 steps for a complete episode.
        for i in range(num_steps_per_episode):
            wrapped.step(0)
        # Another episode.
        wrapped.reset()
        for i in range(num_steps_per_episode):
            wrapped.step(0)
        # Expect another video file to have been produced (2nd episode).
        ls = glob.glob("*.mp4")
        self.assertTrue(len(ls) == 2)

        # MockEnv2 returns a reward of 100.0 every step.
        # So total reward is 1000.0 per episode (10 steps).
        check(
            np.array([100.0, 100.0]) * num_steps_per_episode,
            wrapped.get_episode_rewards(),
        )
        # Erase all generated files and the temp path just in case,
        # as to not disturb further CI-tests.
        shutil.rmtree(record_env_dir)
Example #18
    def test_apex_dqn_compilation_and_per_worker_epsilon_values(self):
        """Test whether an APEX-DQNTrainer can be built on all frameworks."""
        config = (
            apex.ApexConfig()
            .rollouts(num_rollout_workers=3)
            .resources(num_gpus=0)
            .training(
                replay_buffer_config={
                    "learning_starts": 1000,
                },
                optimizer={
                    "num_replay_buffer_shards": 1,
                },
            )
            .reporting(
                min_sample_timesteps_per_reporting=100,
                min_time_s_per_reporting=1,
            )
        )

        for _ in framework_iterator(config, with_eager_tracing=True):
            trainer = config.build(env="CartPole-v0")

            # Test per-worker epsilon distribution.
            infos = trainer.workers.foreach_policy(
                lambda p, _: p.get_exploration_state()
            )
            expected = [0.4, 0.016190862, 0.00065536]
            check([i["cur_epsilon"] for i in infos], [0.0] + expected)

            check_compute_single_action(trainer)

            for i in range(2):
                results = trainer.train()
                check_train_results(results)
                print(results)

            # Test again per-worker epsilon distribution
            # (should not have changed).
            infos = trainer.workers.foreach_policy(
                lambda p, _: p.get_exploration_state()
            )
            check([i["cur_epsilon"] for i in infos], [0.0] + expected)

            trainer.stop()
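
The `expected` epsilons follow the Ape-X per-worker schedule epsilon_i = 0.4 ** (1 + i / (n - 1) * 7) over the n remote workers (the local worker, prepended as 0.0, does not explore); a quick sketch reproducing the numbers:

def per_worker_epsilons(num_workers, base=0.4, alpha=7):
    """Ape-X style per-worker exploration epsilons."""
    return [
        base ** (1 + i / (num_workers - 1) * alpha)
        for i in range(num_workers)
    ]

per_worker_epsilons(3)  # -> [0.4, 0.0161908..., 0.00065536]
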
Example #19
 def test_right_zero_padding(self):
     """Tests, whether right-zero-padding work properly."""
     s1 = SampleBatch({
         "a": np.array([1, 2, 3]),
         "b": {
             "c": np.array([4, 5, 6])
         },
         SampleBatch.SEQ_LENS: [1, 2],
     })
     s1.right_zero_pad(max_seq_len=5)
     check(
         s1, {
             "a": [1, 0, 0, 0, 0, 2, 3, 0, 0, 0],
             "b": {
                 "c": [4, 0, 0, 0, 0, 5, 6, 0, 0, 0]
             },
             SampleBatch.SEQ_LENS: [1, 2]
         })
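
A plain-Python sketch of what the expected padding means: each column is split according to SEQ_LENS and every sequence is padded up to max_seq_len (sketch only, not RLlib's implementation):

import numpy as np

def right_zero_pad(col, seq_lens, max_seq_len):
    """Pad each sequence of `col` (lengths from seq_lens) to max_seq_len."""
    out, start = [], 0
    for length in seq_lens:
        seq = list(col[start:start + length])
        out.extend(seq + [0] * (max_seq_len - length))
        start += length
    return np.array(out)

right_zero_pad([1, 2, 3], seq_lens=[1, 2], max_seq_len=5)
# -> [1, 0, 0, 0, 0, 2, 3, 0, 0, 0], matching the expectation for "a" above.
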
Example #20
    def test_traj_view_lstm_functionality(self):
        action_space = Box(float("-inf"), float("inf"), shape=(3, ))
        obs_space = Box(float("-inf"), float("inf"), (4, ))
        max_seq_len = 50
        rollout_fragment_length = 200
        assert rollout_fragment_length % max_seq_len == 0
        policies = {
            "pol0": (EpisodeEnvAwareLSTMPolicy, obs_space, action_space, {}),
        }

        def policy_fn(agent_id, episode, **kwargs):
            return "pol0"

        config = {
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_fn,
            },
            "model": {
                "use_lstm": True,
                "max_seq_len": max_seq_len,
            },
        }

        rw = RolloutWorker(
            env_creator=lambda _: MultiAgentDebugCounterEnv({"num_agents": 4}),
            policy_config=config,
            rollout_fragment_length=rollout_fragment_length,
            policy_spec=policies,
            policy_mapping_fn=policy_fn,
            normalize_actions=False,
            num_envs=1,
        )

        for iteration in range(20):
            result = rw.sample()
            check(result.count, rollout_fragment_length)
            pol_batch_w = result.policy_batches["pol0"]
            assert pol_batch_w.count >= rollout_fragment_length
            analyze_rnn_batch(
                pol_batch_w,
                max_seq_len,
                view_requirements=rw.policy_map["pol0"].view_requirements,
            )
Example #21
    def test_ddppo_compilation(self):
        """Test whether a DDPPOTrainer can be built with both frameworks."""
        config = ppo.ddppo.DEFAULT_CONFIG.copy()
        config["num_gpus_per_worker"] = 0
        num_iterations = 2

        for _ in framework_iterator(config, frameworks="torch"):
            trainer = ppo.ddppo.DDPPOTrainer(config=config, env="CartPole-v0")
            for i in range(num_iterations):
                trainer.train()
                # Make sure the weights on all workers are the same (including
                # the local one).
                weights = trainer.workers.foreach_worker(
                    lambda w: w.get_weights())
                for w in weights[1:]:
                    check(w, weights[0])

            check_compute_single_action(trainer)
            trainer.stop()
Example #22
    def test_impala_lr_schedule(self):
        # Test whether we correctly ignore the "lr" setting.
        # The first lr should be 0.05.
        config = (impala.ImpalaConfig().resources(num_gpus=0).training(
            lr=0.1,
            lr_schedule=[
                [0, 0.05],
                [100000, 0.000001],
            ],
            train_batch_size=100,
        ).rollouts(num_envs_per_worker=2).environment(env="CartPole-v0"))

        def get_lr(result):
            return result["info"][LEARNER_INFO][DEFAULT_POLICY_ID][
                LEARNER_STATS_KEY]["cur_lr"]

        for fw in framework_iterator(config):
            trainer = config.build()
            policy = trainer.get_policy()

            try:
                if fw == "tf":
                    check(policy.get_session().run(policy.cur_lr), 0.05)
                else:
                    check(policy.cur_lr, 0.05)
                for _ in range(1):
                    r1 = trainer.train()
                for _ in range(2):
                    r2 = trainer.train()
                for _ in range(2):
                    r3 = trainer.train()
                # Due to the asynch'ness of IMPALA, learner-stats metrics
                # could be delayed by one iteration. Do 3 train() calls here
                # and measure guaranteed decrease in lr between 1st and 3rd.
                lr1 = get_lr(r1)
                lr2 = get_lr(r2)
                lr3 = get_lr(r3)
                assert lr2 <= lr1, (lr1, lr2)
                assert lr3 <= lr2, (lr2, lr3)
                assert lr3 < lr1, (lr1, lr3)
            finally:
                trainer.stop()
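
The lr_schedule entries are treated as (timestep, value) anchor points of a piecewise-linear schedule over training timesteps; a small illustrative interpolation sketch (not RLlib's schedule code):

def interpolate_schedule(schedule, t):
    """Linearly interpolate between (timestep, value) anchors; hold the ends."""
    t0, v0 = schedule[0]
    if t <= t0:
        return v0
    for t1, v1 in schedule[1:]:
        if t <= t1:
            frac = (t - t0) / float(t1 - t0)
            return v0 + frac * (v1 - v0)
        t0, v0 = t1, v1
    return schedule[-1][1]

interpolate_schedule([[0, 0.05], [100000, 0.000001]], 0)      # -> 0.05
interpolate_schedule([[0, 0.05], [100000, 0.000001]], 50000)  # -> ~0.025
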
Example #23
    def test_log_probs_from_logits_and_actions(self):
        """Tests log_probs_from_logits_and_actions."""
        seq_len = 7
        num_actions = 3
        batch_size = 4

        for fw, sess in framework_iterator(frameworks=("torch", "tf"),
                                           session=True):
            vtrace = vtrace_tf if fw == "tf" else vtrace_torch
            policy_logits = Box(-1.0, 1.0, (seq_len, batch_size, num_actions),
                                np.float32).sample()
            actions = np.random.randint(0,
                                        num_actions - 1,
                                        size=(seq_len, batch_size),
                                        dtype=np.int32)

            if fw == "torch":
                action_log_probs_tensor = \
                    vtrace.log_probs_from_logits_and_actions(
                        torch.from_numpy(policy_logits),
                        torch.from_numpy(actions))
            else:
                action_log_probs_tensor = \
                    vtrace.log_probs_from_logits_and_actions(
                        policy_logits, actions)

            # Ground Truth
            # Using broadcasting to create a mask that indexes action logits
            action_index_mask = actions[..., None] == np.arange(num_actions)

            def index_with_mask(array, mask):
                return array[mask].reshape(*array.shape[:-1])

            # Note: Normally log(softmax) is not a good idea because it's not
            # numerically stable. However, in this test we have well-behaved
            # values.
            ground_truth_v = index_with_mask(np.log(softmax(policy_logits)),
                                             action_index_mask)

            if sess:
                action_log_probs_tensor = sess.run(action_log_probs_tensor)
            check(action_log_probs_tensor, ground_truth_v)
Example #24
    def test_apex_ddpg_compilation_and_per_worker_epsilon_values(self):
        """Test whether an APEX-DDPGTrainer can be built on all frameworks."""
        config = (apex_ddpg.ApexDDPGConfig().rollouts(
            num_rollout_workers=2).reporting(
                min_sample_timesteps_per_reporting=100).training(
                    replay_buffer_config={
                        "learning_starts": 0
                    },
                    optimizer={
                        "num_replay_buffer_shards": 1
                    },
                ).environment(env="Pendulum-v1"))

        num_iterations = 1

        for _ in framework_iterator(config, with_eager_tracing=True):
            trainer = config.build()

            # Test per-worker scale distribution.
            infos = trainer.workers.foreach_policy(
                lambda p, _: p.get_exploration_state())
            scale = [i["cur_scale"] for i in infos]
            expected = [
                0.4**(1 + (i + 1) / float(config.num_workers - 1) * 7)
                for i in range(config.num_workers)
            ]
            check(scale, [0.0] + expected)

            for _ in range(num_iterations):
                results = trainer.train()
                check_train_results(results)
                print(results)
            check_compute_single_action(trainer)

            # Test again per-worker scale distribution
            # (should not have changed).
            infos = trainer.workers.foreach_policy(
                lambda p, _: p.get_exploration_state())
            scale = [i["cur_scale"] for i in infos]
            check(scale, [0.0] + expected)

            trainer.stop()
Example #25
    def test_ddppo_compilation(self):
        """Test whether a DDPPOTrainer can be built with both frameworks."""
        config = ppo.DDPPOConfig().resources(num_gpus_per_worker=0)

        num_iterations = 2

        for _ in framework_iterator(config, frameworks="torch"):
            trainer = config.build(env="CartPole-v0")
            for i in range(num_iterations):
                results = trainer.train()
                check_train_results(results)
                print(results)
                # Make sure the weights on all remote workers are the same.
                weights = trainer.workers.foreach_worker(
                    lambda w: w.get_weights())
                for w in weights[1:]:
                    check(w, weights[1])

            check_compute_single_action(trainer)
            trainer.stop()
Example #26
    def test_vtrace(self):
        """Tests V-trace against ground truth data calculated in python."""
        seq_len = 5
        batch_size = 10

        # Create log_rhos such that rho will span from near-zero to above the
        # clipping thresholds. In particular, calculate log_rhos in
        # [-2.5, 2.5),
        # so that rho is in approx [0.08, 12.2).
        space_w_time = Box(-1.0, 1.0, (seq_len, batch_size), np.float32)
        space_only_batch = Box(-1.0, 1.0, (batch_size, ), np.float32)
        log_rhos = space_w_time.sample() / (batch_size * seq_len)
        log_rhos = 5 * (log_rhos - 0.5)  # [0.0, 1.0) -> [-2.5, 2.5).
        values = {
            "log_rhos":
            log_rhos,
            # T, B where B_i: [0.9 / (i+1)] * T
            "discounts":
            np.array([[0.9 / (b + 1) for b in range(batch_size)]
                      for _ in range(seq_len)]),
            "rewards":
            space_w_time.sample(),
            "values":
            space_w_time.sample() / batch_size,
            "bootstrap_value":
            space_only_batch.sample() + 1.0,
            "clip_rho_threshold":
            3.7,
            "clip_pg_rho_threshold":
            2.2,
        }

        for fw, sess in framework_iterator(frameworks=("torch", "tf"),
                                           session=True):
            vtrace = vtrace_tf if fw != "torch" else vtrace_torch
            output = vtrace.from_importance_weights(**values)
            if sess:
                output = sess.run(output)

            ground_truth_v = _ground_truth_calculation(vtrace, **values)
            check(output, ground_truth_v)
Example #27
    def test_impala_lr_schedule(self):
        config = impala.DEFAULT_CONFIG.copy()
        config["num_gpus"] = 0
        # Test whether we correctly ignore the "lr" setting.
        # The first lr should be 0.05.
        config["lr"] = 0.1
        config["lr_schedule"] = [
            [0, 0.05],
            [10000, 0.000001],
        ]
        config["num_gpus"] = 0  # Do not use any (fake) GPUs.
        config["env"] = "CartPole-v0"

        def get_lr(result):
            return result["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY][
                "cur_lr"
            ]

        for fw in framework_iterator(config):
            trainer = impala.ImpalaTrainer(config=config)
            policy = trainer.get_policy()

            try:
                if fw == "tf":
                    check(policy.get_session().run(policy.cur_lr), 0.05)
                else:
                    check(policy.cur_lr, 0.05)
                r1 = trainer.train()
                r2 = trainer.train()
                r3 = trainer.train()
                # Due to the asynch'ness of IMPALA, learner-stats metrics
                # could be delayed by one iteration. Do 3 train() calls here
                # and measure guaranteed decrease in lr between 1st and 3rd.
                lr1 = get_lr(r1)
                lr2 = get_lr(r2)
                lr3 = get_lr(r3)
                assert lr2 <= lr1, (lr1, lr2)
                assert lr3 <= lr2, (lr2, lr3)
                assert lr3 < lr1, (lr1, lr3)
            finally:
                trainer.stop()
Example #28
    def test_n_step_from_same_obs_source_array(self):
        """Tests, whether n-step also works on a shared obs/new-obs array."""
        gamma = 0.99
        # The underlying observation data. Both obs and next_obs will
        # be references into that same np.array.
        underlying_obs = np.arange(0, 8)
        obs = underlying_obs[:7]
        next_obs = underlying_obs[1:]

        actions = np.random.randint(-1, 3, size=(7,))
        check_actions = actions.copy()
        rewards = [10.0, 0.0, 100.0, 50.0, 60.0, 10.0, 100.0]
        dones = [False, False, False, False, False, False, True]

        batch = SampleBatch(
            {
                SampleBatch.OBS: obs,
                SampleBatch.ACTIONS: actions,
                SampleBatch.REWARDS: rewards,
                SampleBatch.DONES: dones,
                SampleBatch.NEXT_OBS: next_obs,
            }
        )
        adjust_nstep(4, gamma, batch)

        check(batch[SampleBatch.OBS], [0, 1, 2, 3, 4, 5, 6])
        check(batch[SampleBatch.ACTIONS], check_actions)
        check(batch[SampleBatch.NEXT_OBS], [4, 5, 6, 7, 7, 7, 7])
        check(batch[SampleBatch.DONES], [False, False, False, True, True, True, True])
        check(
            batch[SampleBatch.REWARDS],
            [
                discount_cumsum(np.array(rewards[0:4]), gamma)[0],
                discount_cumsum(np.array(rewards[1:5]), gamma)[0],
                discount_cumsum(np.array(rewards[2:6]), gamma)[0],
                discount_cumsum(np.array(rewards[3:7]), gamma)[0],
                discount_cumsum(np.array(rewards[4:]), gamma)[0],
                discount_cumsum(np.array(rewards[5:]), gamma)[0],
                discount_cumsum(np.array(rewards[6:]), gamma)[0],
            ],
        )
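
For orientation, a plain-numpy sketch of the reward part of the n-step adjustment: each reward becomes the discounted sum of the following up-to-n rewards, truncated at the episode end (conceptual re-implementation only, not RLlib's adjust_nstep):

import numpy as np

def n_step_rewards(rewards, gamma, n):
    """r_t <- sum_{k < n} gamma**k * r_{t+k}, truncated at the episode end."""
    rewards = np.asarray(rewards, dtype=np.float64)
    out = np.zeros_like(rewards)
    for t in range(len(rewards)):
        for k in range(min(n, len(rewards) - t)):
            out[t] += gamma ** k * rewards[t + k]
    return out

# With n=4 and the rewards above, this reproduces the
# discount_cumsum(...)[0] expectations checked in the test.
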
Example #29
    def test_fqe_model(self):
        # Test FQETorchModel for:
        # (1) Check that it does not modify the underlying batch during
        #     training.
        # (2) Check that the stopping criteria from FQE are working correctly.
        # (3) Check that using fqe._compute_action_probs equals brute-force
        #     iterating over all actions with policy.compute_log_likelihoods.
        fqe = FQETorchModel(
            policy=self.algo.get_policy(),
            gamma=self.gamma,
            **self.q_model_config,
        )
        tmp_batch = copy.deepcopy(self.batch)
        losses = fqe.train(self.batch)

        # Make sure FQETorchModel.train() does not modify self.batch
        check(tmp_batch, self.batch)

        # Make sure FQE stopping criteria are respected
        assert (
            len(losses) == fqe.n_iters or losses[-1] < fqe.delta
        ), (
            f"FQE.train() terminated early in {len(losses)} steps with final "
            f"loss {losses[-1]} for n_iters: {fqe.n_iters} and delta: "
            f"{fqe.delta}"
        )

        # Test fqe._compute_action_probs against "brute force" method
        # of computing log_prob for each possible action individually
        # using policy.compute_log_likelihoods
        obs = torch.tensor(self.batch["obs"], device=fqe.device)
        action_probs = fqe._compute_action_probs(obs)
        action_probs = convert_to_numpy(action_probs)

        tmp_probs = []
        for act in range(fqe.policy.action_space.n):
            tmp_actions = np.zeros_like(self.batch["actions"]) + act
            log_probs = fqe.policy.compute_log_likelihoods(
                actions=tmp_actions,
                obs_batch=self.batch["obs"],
            )
            tmp_probs.append(torch.exp(log_probs))
        tmp_probs = torch.stack(tmp_probs).transpose(0, 1)
        tmp_probs = convert_to_numpy(tmp_probs)
        check(action_probs, tmp_probs, decimals=3)
Example #30
 def test_n_step_very_short_trajectory(self):
     """Tests, whether n-step also works for very small trajectories."""
     gamma = 1.0
     obs = np.arange(0, 2)
     actions = np.random.randint(-100, 300, size=(2, ))
     check_actions = actions.copy()
     rewards = [10.0, 100.0]
     next_obs = np.arange(1, 3)
     batch = SampleBatch({
         SampleBatch.OBS: obs,
         SampleBatch.ACTIONS: actions,
         SampleBatch.REWARDS: rewards,
         SampleBatch.DONES: [False, False],
         SampleBatch.NEXT_OBS: next_obs,
     })
     adjust_nstep(3, gamma, batch)
     check(batch[SampleBatch.OBS], [0, 1])
     check(batch[SampleBatch.ACTIONS], check_actions)
     check(batch[SampleBatch.DONES], [False, False])
     check(batch[SampleBatch.REWARDS], [10.0 + gamma * 100.0, 100.0])
     check(batch[SampleBatch.NEXT_OBS], [2, 2])