Example #1
def check_support(alg, config, train=True, check_bounds=False, tfe=False):
    config["log_level"] = "ERROR"
    config["train_batch_size"] = 10
    config["rollout_fragment_length"] = 10

    def _do_check(alg, config, a_name, o_name):
        fw = config["framework"]
        action_space = ACTION_SPACES_TO_TEST[a_name]
        obs_space = OBSERVATION_SPACES_TO_TEST[o_name]
        print(
            "=== Testing {} (fw={}) A={} S={} ===".format(
                alg, fw, action_space, obs_space
            )
        )
        config.update(
            dict(
                env_config=dict(
                    action_space=action_space,
                    observation_space=obs_space,
                    reward_space=Box(1.0, 1.0, shape=(), dtype=np.float32),
                    p_done=1.0,
                    check_action_bounds=check_bounds,
                )
            )
        )
        stat = "ok"

        try:
            a = get_trainer_class(alg)(config=config, env=RandomEnv)
        except ray.exceptions.RayActorError as e:
            if len(e.args) > 2 and isinstance(e.args[2], UnsupportedSpaceException):
                stat = "unsupported"
            elif isinstance(e.args[0].args[2], UnsupportedSpaceException):
                stat = "unsupported"
            else:
                raise
        except UnsupportedSpaceException:
            stat = "unsupported"
        else:
            if alg not in ["DDPG", "ES", "ARS", "SAC"]:
                # 2D (image) input: Expect VisionNet.
                if o_name in ["atari", "image"]:
                    if fw == "torch":
                        assert isinstance(a.get_policy().model, TorchVisionNet)
                    else:
                        assert isinstance(a.get_policy().model, VisionNet)
                # 1D input: Expect FCNet.
                elif o_name == "vector1d":
                    if fw == "torch":
                        assert isinstance(a.get_policy().model, TorchFCNet)
                    else:
                        assert isinstance(a.get_policy().model, FCNet)
                # Could be either one: ComplexNet (if the Preprocessor is
                # disabled) or FCNet (w/ Preprocessor).
                elif o_name == "vector2d":
                    if fw == "torch":
                        assert isinstance(
                            a.get_policy().model, (TorchComplexNet, TorchFCNet)
                        )
                    else:
                        assert isinstance(a.get_policy().model, (ComplexNet, FCNet))
            if train:
                a.train()
            a.stop()
        print(stat)

    frameworks = ("tf", "torch")
    if tfe:
        frameworks += ("tf2", "tfe")
    for _ in framework_iterator(config, frameworks=frameworks):
        # Zip through action- and obs-spaces.
        for a_name, o_name in zip(
            ACTION_SPACES_TO_TEST.keys(), OBSERVATION_SPACES_TO_TEST.keys()
        ):
            _do_check(alg, config, a_name, o_name)
        # Do the remaining obs spaces.
        assert len(OBSERVATION_SPACES_TO_TEST) >= len(ACTION_SPACES_TO_TEST)
        fixed_action_key = next(iter(ACTION_SPACES_TO_TEST.keys()))
        for i, o_name in enumerate(OBSERVATION_SPACES_TO_TEST.keys()):
            if i < len(ACTION_SPACES_TO_TEST):
                continue
            _do_check(alg, config, fixed_action_key, o_name)
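All of the examples in this collection revolve around RLlib's framework_iterator() test utility, which rewrites config["framework"] to each requested backend in turn (skipping any that is not installed) and yields the framework string. A minimal, self-contained sketch of that shared pattern, assuming the ray 1.x API used by these snippets (illustration only, not taken from any of the files above):

import ray
from ray.rllib.agents.pg import DEFAULT_CONFIG, PGTrainer
from ray.rllib.utils.test_utils import framework_iterator

ray.init(ignore_reinit_error=True)
config = DEFAULT_CONFIG.copy()
config["num_workers"] = 0  # Keep everything in the local process.

# framework_iterator() sets config["framework"] to "tf", then "torch".
for fw in framework_iterator(config, frameworks=("tf", "torch")):
    trainer = PGTrainer(config=config, env="CartPole-v0")
    print(fw, trainer.train()["episode_reward_mean"])
    trainer.stop()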
Example #2
File: test_cql.py Project: rlan/ray
    def test_cql_compilation(self):
        """Test whether a CQLTrainer can be built with all frameworks."""

        # Learns from a historic-data file.
        # To generate this data, first run:
        # $ ./train.py --run=SAC --env=Pendulum-v0 \
        #   --stop='{"timesteps_total": 50000}' \
        #   --config='{"output": "/tmp/out"}'
        rllib_dir = Path(__file__).parent.parent.parent.parent
        print("rllib dir={}".format(rllib_dir))
        data_file = os.path.join(rllib_dir, "tests/data/pendulum/small.json")
        print("data_file={} exists={}".format(data_file,
                                              os.path.isfile(data_file)))

        config = cql.CQL_DEFAULT_CONFIG.copy()
        config["env"] = "Pendulum-v0"
        config["input"] = [data_file]

        # In the files we use here for testing, actions have already
        # been normalized.
        # This is usually the case when the file was generated by another
        # RLlib algorithm (e.g. PPO or SAC).
        config["actions_in_input_normalized"] = False
        config["clip_actions"] = True
        config["train_batch_size"] = 2000

        config["num_workers"] = 0  # Run locally.
        config["twin_q"] = True
        config["learning_starts"] = 0
        config["bc_iters"] = 2  # 2 BC iters, 2 CQL iters.
        config["rollout_fragment_length"] = 1

        # Switch on off-policy evaluation.
        config["input_evaluation"] = ["is"]

        config["evaluation_interval"] = 2
        config["evaluation_num_episodes"] = 10
        config["evaluation_config"]["input"] = "sampler"
        config["evaluation_parallel_to_training"] = False
        config["evaluation_num_workers"] = 2

        num_iterations = 4

        # Test for tf/torch frameworks.
        for fw in framework_iterator(config):
            trainer = cql.CQLTrainer(config=config)
            for i in range(num_iterations):
                results = trainer.train()
                check_train_results(results)
                print(results)
                eval_results = results.get("evaluation")
                if eval_results:
                    print(f"iter={trainer.iteration} "
                          f"R={eval_results['episode_reward_mean']}")

            check_compute_single_action(trainer)

            # Get policy and model.
            pol = trainer.get_policy()
            cql_model = pol.model
            if fw == "tf":
                pol.get_session().__enter__()

            # Example on how to do evaluation on the trained Trainer
            # using the data from CQL's global replay buffer.
            # Get a sample (MultiAgentBatch -> SampleBatch).
            from ray.rllib.agents.cql.cql import replay_buffer
            batch = replay_buffer.replay().policy_batches["default_policy"]

            if fw == "torch":
                obs = torch.from_numpy(batch["obs"])
            else:
                obs = batch["obs"]
                batch["actions"] = batch["actions"].astype(np.float32)

            # Pass the observations through our model to get the features,
            # which we then pass through the Q-head.
            model_out, _ = cql_model({"obs": obs})
            # The estimated Q-values from the (historic) actions in the batch.
            if fw == "torch":
                q_values_old = cql_model.get_q_values(
                    model_out, torch.from_numpy(batch["actions"]))
            else:
                q_values_old = cql_model.get_q_values(
                    tf.convert_to_tensor(model_out), batch["actions"])

            # The estimated Q-values for the new actions computed
            # by our trainer policy.
            actions_new = pol.compute_actions_from_input_dict({"obs": obs})[0]
            if fw == "torch":
                q_values_new = cql_model.get_q_values(
                    model_out, torch.from_numpy(actions_new))
            else:
                q_values_new = cql_model.get_q_values(model_out, actions_new)

            if fw == "tf":
                q_values_old, q_values_new = pol.get_session().run(
                    [q_values_old, q_values_new])

            print(f"Q-val batch={q_values_old}")
            print(f"Q-val policy={q_values_new}")

            if fw == "tf":
                pol.get_session().__exit__(None, None, None)

            trainer.stop()
Example #3
    def test_agent_output_logdir(self):
        """Test special value 'logdir' as Agent's output."""
        for fw in framework_iterator():
            agent = self.write_outputs("logdir", fw)
            self.assertEqual(
                len(glob.glob(agent.logdir + "/output-*.json")), 1)
Example #4
File: test_local.py Project: willfrey/ray
    def test_local(self):
        cf = DEFAULT_CONFIG.copy()
        for fw in framework_iterator(cf):
            agent = PPOTrainer(cf, "CartPole-v0")
            print(agent.train())
Example #5
def ckpt_restore_test(alg_name, tfe=False):
    config = CONFIGS[alg_name]
    frameworks = (["tfe"] if tfe else []) + ["torch", "tf"]
    for fw in framework_iterator(config, frameworks=frameworks):
        for use_object_store in [False, True]:
            print("use_object_store={}".format(use_object_store))
            cls = get_agent_class(alg_name)
            if "DDPG" in alg_name or "SAC" in alg_name:
                alg1 = cls(config=config, env="Pendulum-v0")
                alg2 = cls(config=config, env="Pendulum-v0")
            else:
                alg1 = cls(config=config, env="CartPole-v0")
                alg2 = cls(config=config, env="CartPole-v0")

            policy1 = alg1.get_policy()

            for _ in range(1):
                res = alg1.train()
                print("current status: " + str(res))

            # Check optimizer state as well.
            optim_state = policy1.get_state().get("_optimizer_variables")

            # Sync the models
            if use_object_store:
                alg2.restore_from_object(alg1.save_to_object())
            else:
                alg2.restore(alg1.save())

            # Compare optimizer state with re-loaded one.
            if optim_state:
                s2 = alg2.get_policy().get_state().get("_optimizer_variables")
                # Tf -> Compare states 1:1.
                if fw in ["tf", "tfe"]:
                    check(s2, optim_state)
                # For torch, optimizers have state_dicts with keys=params,
                # which are different for the two models (ignore these
                # different keys, but compare all values nevertheless).
                else:
                    for i, s2_ in enumerate(s2):
                        check(
                            list(s2_["state"].values()),
                            list(optim_state[i]["state"].values()))

            for _ in range(1):
                if "DDPG" in alg_name or "SAC" in alg_name:
                    obs = np.clip(
                        np.random.uniform(size=3),
                        policy1.observation_space.low,
                        policy1.observation_space.high)
                else:
                    obs = np.clip(
                        np.random.uniform(size=4),
                        policy1.observation_space.low,
                        policy1.observation_space.high)
                a1 = get_mean_action(alg1, obs)
                a2 = get_mean_action(alg2, obs)
                print("Checking computed actions", alg1, obs, a1, a2)
                if abs(a1 - a2) > .1:
                    raise AssertionError("algo={} [a1={} a2={}]".format(
                        alg_name, a1, a2))
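The save/restore round trip exercised above boils down to a short pattern. A minimal sketch, assuming the same ray 1.x Trainer API these examples use (illustration only; the test above additionally compares optimizer state and mean actions):

import ray
from ray.rllib.agents.ppo import DEFAULT_CONFIG, PPOTrainer

ray.init(ignore_reinit_error=True)
config = DEFAULT_CONFIG.copy()
config["num_workers"] = 0

trainer1 = PPOTrainer(config=config, env="CartPole-v0")
trainer1.train()
checkpoint = trainer1.save()   # Writes a checkpoint and returns its path.

trainer2 = PPOTrainer(config=config, env="CartPole-v0")
trainer2.restore(checkpoint)   # Restores weights and optimizer state.

# Deterministic actions of both trainers should now agree.
obs = trainer1.get_policy().observation_space.sample()
print(trainer1.compute_action(obs, explore=False),
      trainer2.compute_action(obs, explore=False))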
Example #6
File: test_pg.py Project: RuofanKong/ray
    def test_pg_loss_functions(self):
        """Tests the PG loss function math."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        # Fake CartPole episode of 3 time steps.
        train_batch = SampleBatch({
            SampleBatch.OBS:
            np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                      [0.9, 1.0, 1.1, 1.2]]),
            SampleBatch.ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.REWARDS:
            np.array([1.0, 1.0, 1.0]),
            SampleBatch.DONES:
            np.array([False, False, True]),
            SampleBatch.EPS_ID:
            np.array([1234, 1234, 1234]),
            SampleBatch.AGENT_INDEX:
            np.array([0, 0, 0]),
        })

        for fw, sess in framework_iterator(config, session=True):
            dist_cls = (Categorical if fw != "torch" else TorchCategorical)
            trainer = pg.PGTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()
            vars = policy.model.trainable_variables()
            if sess:
                vars = policy.get_session().run(vars)

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to train_batch dict.
            # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
            # [2.9701, 1.99, 1.0]
            train_batch_ = pg.post_process_advantages(policy,
                                                      train_batch.copy())
            if fw == "torch":
                train_batch_ = policy._lazy_tensor_dict(train_batch_)

            # Check Advantage values.
            check(train_batch_[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

            # Actual loss results.
            if sess:
                results = policy.get_session().run(
                    policy._loss,
                    feed_dict=policy._get_loss_inputs_dict(train_batch_,
                                                           shuffle=False))
            else:
                results = (pg.pg_tf_loss if fw in ["tf2", "tfe"] else
                           pg.pg_torch_loss)(policy,
                                             policy.model,
                                             dist_class=dist_cls,
                                             train_batch=train_batch_)

            # Calculate expected results.
            if fw != "torch":
                expected_logits = fc(fc(train_batch_[SampleBatch.OBS],
                                        vars[0],
                                        vars[1],
                                        framework=fw),
                                     vars[2],
                                     vars[3],
                                     framework=fw)
            else:
                expected_logits = fc(fc(train_batch_[SampleBatch.OBS],
                                        vars[2],
                                        vars[3],
                                        framework=fw),
                                     vars[0],
                                     vars[1],
                                     framework=fw)
            expected_logp = dist_cls(expected_logits, policy.model).logp(
                train_batch_[SampleBatch.ACTIONS])
            adv = train_batch_[Postprocessing.ADVANTAGES]
            if sess:
                expected_logp = sess.run(expected_logp)
            elif fw == "torch":
                expected_logp = expected_logp.detach().cpu().numpy()
                adv = adv.detach().cpu().numpy()
            else:
                expected_logp = expected_logp.numpy()
            expected_loss = -np.mean(expected_logp * adv)
            check(results, expected_loss, decimals=4)
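This example (and most of the ones that follow) iterates with session=True, in which case framework_iterator() yields a (framework, session) pair: the session is a tf1.Session only for static-graph "tf" and None otherwise, which is why results are fetched either through sess.run(...) or through .numpy(). A small sketch of just that fetch idiom, assuming the ray 1.x test utilities (illustration only):

import numpy as np
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.test_utils import framework_iterator

tf1, tf, tfv = try_import_tf()
torch, _ = try_import_torch()

for fw, sess in framework_iterator(frameworks=("tf", "torch"), session=True):
    if fw == "torch":
        value = torch.mean(torch.tensor([1.0, 2.0, 3.0]))
        value = value.detach().cpu().numpy()  # Torch tensor -> numpy.
    else:
        value = tf.reduce_mean(tf.constant([1.0, 2.0, 3.0]))
        # sess is non-None only for static-graph "tf"; eager "tf2"/"tfe"
        # would take the .numpy() branch instead.
        value = sess.run(value) if sess else value.numpy()
    assert np.isclose(value, 2.0)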
Example #7
    def test_sac_loss_function(self):
        """Tests SAC loss function results across all frameworks."""
        config = sac.DEFAULT_CONFIG.copy()
        # Run locally.
        config["num_workers"] = 0
        config["learning_starts"] = 0
        config["twin_q"] = False
        config["gamma"] = 0.99
        # Switch on deterministic loss so we can compare the loss values.
        config["_deterministic_loss"] = True
        # Use very simple nets.
        config["Q_model"]["fcnet_hiddens"] = [10]
        config["policy_model"]["fcnet_hiddens"] = [10]
        # Make sure timing differences do not affect trainer.train().
        config["min_iter_time_s"] = 0
        # Test SAC with Simplex action space.
        config["env_config"] = {"simplex_actions": True}

        map_ = {
            # Normal net.
            "default_policy/sequential/action_1/kernel": "action_model."
            "action_0._model.0.weight",
            "default_policy/sequential/action_1/bias": "action_model."
            "action_0._model.0.bias",
            "default_policy/sequential/action_out/kernel": "action_model."
            "action_out._model.0.weight",
            "default_policy/sequential/action_out/bias": "action_model."
            "action_out._model.0.bias",
            "default_policy/sequential_1/q_hidden_0/kernel": "q_net."
            "q_hidden_0._model.0.weight",
            "default_policy/sequential_1/q_hidden_0/bias": "q_net."
            "q_hidden_0._model.0.bias",
            "default_policy/sequential_1/q_out/kernel": "q_net."
            "q_out._model.0.weight",
            "default_policy/sequential_1/q_out/bias": "q_net."
            "q_out._model.0.bias",
            "default_policy/value_out/kernel": "_value_branch."
            "_model.0.weight",
            "default_policy/value_out/bias": "_value_branch."
            "_model.0.bias",
            "default_policy/log_alpha": "log_alpha",
            # Target net.
            "default_policy/sequential_2/action_1/kernel": "action_model."
            "action_0._model.0.weight",
            "default_policy/sequential_2/action_1/bias": "action_model."
            "action_0._model.0.bias",
            "default_policy/sequential_2/action_out/kernel": "action_model."
            "action_out._model.0.weight",
            "default_policy/sequential_2/action_out/bias": "action_model."
            "action_out._model.0.bias",
            "default_policy/sequential_3/q_hidden_0/kernel": "q_net."
            "q_hidden_0._model.0.weight",
            "default_policy/sequential_3/q_hidden_0/bias": "q_net."
            "q_hidden_0._model.0.bias",
            "default_policy/sequential_3/q_out/kernel": "q_net."
            "q_out._model.0.weight",
            "default_policy/sequential_3/q_out/bias": "q_net."
            "q_out._model.0.bias",
            "default_policy/value_out_1/kernel": "_value_branch."
            "_model.0.weight",
            "default_policy/value_out_1/bias": "_value_branch."
            "_model.0.bias",
            "default_policy/log_alpha_1": "log_alpha",
        }

        env = SimpleEnv
        batch_size = 100
        if env is SimpleEnv:
            obs_size = (batch_size, 1)
            actions = np.random.random(size=(batch_size, 2))
        elif env == "CartPole-v0":
            obs_size = (batch_size, 4)
            actions = np.random.randint(0, 2, size=(batch_size, ))
        else:
            obs_size = (batch_size, 3)
            actions = np.random.random(size=(batch_size, 1))

        # Batch of size=n.
        input_ = self._get_batch_helper(obs_size, actions, batch_size)

        # Simply compare loss values AND grads of all frameworks with each
        # other.
        prev_fw_loss = weights_dict = None
        expect_c, expect_a, expect_e, expect_t = None, None, None, None
        # History of tf-updated NN-weights over n training steps.
        tf_updated_weights = []
        # History of input batches used.
        tf_inputs = []
        for fw, sess in framework_iterator(config,
                                           frameworks=("tf", "torch"),
                                           session=True):
            # Generate Trainer and get its default Policy object.
            trainer = sac.SACTrainer(config=config, env=env)
            policy = trainer.get_policy()
            p_sess = None
            if sess:
                p_sess = policy.get_session()

            # Set all weights (of all nets) to fixed values.
            if weights_dict is None:
                # Start with the tf vars-dict.
                assert fw in ["tf2", "tf", "tfe"]
                weights_dict = policy.get_weights()
                if fw == "tfe":
                    log_alpha = weights_dict[10]
                    weights_dict = self._translate_tfe_weights(
                        weights_dict, map_)
            else:
                assert fw == "torch"  # Then transfer that to torch Model.
                model_dict = self._translate_weights_to_torch(
                    weights_dict, map_)
                policy.model.load_state_dict(model_dict)
                policy.target_model.load_state_dict(model_dict)

            if fw == "tf":
                log_alpha = weights_dict["default_policy/log_alpha"]
            elif fw == "torch":
                # Actually convert to torch tensors (by accessing everything).
                input_ = policy._lazy_tensor_dict(input_)
                input_ = {k: input_[k] for k in input_.keys()}
                log_alpha = policy.model.log_alpha.detach().cpu().numpy()[0]

            # Only compute the expectation once; it should be the same
            # for all frameworks anyway.
            if expect_c is None:
                expect_c, expect_a, expect_e, expect_t = \
                    self._sac_loss_helper(input_, weights_dict,
                                          sorted(weights_dict.keys()),
                                          log_alpha, fw,
                                          gamma=config["gamma"], sess=sess)

            # Get actual outs and compare to expectation AND previous
            # framework. c=critic, a=actor, e=entropy, t=td-error.
            if fw == "tf":
                c, a, e, t, tf_c_grads, tf_a_grads, tf_e_grads = \
                    p_sess.run([
                        policy.critic_loss,
                        policy.actor_loss,
                        policy.alpha_loss,
                        policy.td_error,
                        policy.optimizer().compute_gradients(
                            policy.critic_loss[0],
                            policy.model.q_variables()),
                        policy.optimizer().compute_gradients(
                            policy.actor_loss,
                            policy.model.policy_variables()),
                        policy.optimizer().compute_gradients(
                            policy.alpha_loss, policy.model.log_alpha)],
                        feed_dict=policy._get_loss_inputs_dict(
                            input_, shuffle=False))
                tf_c_grads = [g for g, v in tf_c_grads]
                tf_a_grads = [g for g, v in tf_a_grads]
                tf_e_grads = [g for g, v in tf_e_grads]

            elif fw == "tfe":
                with tf.GradientTape() as tape:
                    tf_loss(policy, policy.model, None, input_)
                c, a, e, t = policy.critic_loss, policy.actor_loss, \
                    policy.alpha_loss, policy.td_error
                vars = tape.watched_variables()
                tf_c_grads = tape.gradient(c[0], vars[6:10])
                tf_a_grads = tape.gradient(a, vars[2:6])
                tf_e_grads = tape.gradient(e, vars[10])

            elif fw == "torch":
                loss_torch(policy, policy.model, None, input_)
                c, a, e, t = policy.critic_loss, policy.actor_loss, \
                    policy.alpha_loss, policy.td_error

                # Test actor gradients.
                policy.actor_optim.zero_grad()
                assert all(v.grad is None for v in policy.model.q_variables())
                assert all(v.grad is None
                           for v in policy.model.policy_variables())
                assert policy.model.log_alpha.grad is None
                a.backward()
                # `actor_loss` depends on Q-net vars (but these grads must
                # be ignored and overridden in critic_loss.backward!).
                assert not any(v.grad is None
                               for v in policy.model.q_variables())
                assert not all(
                    torch.mean(v.grad) == 0
                    for v in policy.model.policy_variables())
                assert not all(
                    torch.min(v.grad) == 0
                    for v in policy.model.policy_variables())
                assert policy.model.log_alpha.grad is None
                # Compare with tf ones.
                torch_a_grads = [
                    v.grad for v in policy.model.policy_variables()
                ]
                for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g.detach().cpu()))
                    else:
                        check(tf_g, torch_g)

                # Test critic gradients.
                policy.critic_optims[0].zero_grad()
                assert all(
                    torch.mean(v.grad) == 0.0
                    for v in policy.model.q_variables())
                assert all(
                    torch.min(v.grad) == 0.0
                    for v in policy.model.q_variables())
                assert policy.model.log_alpha.grad is None
                c[0].backward()
                assert not all(
                    torch.mean(v.grad) == 0
                    for v in policy.model.q_variables())
                assert not all(
                    torch.min(v.grad) == 0 for v in policy.model.q_variables())
                assert policy.model.log_alpha.grad is None
                # Compare with tf ones.
                torch_c_grads = [v.grad for v in policy.model.q_variables()]
                for tf_g, torch_g in zip(tf_c_grads, torch_c_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g.detach().cpu()))
                    else:
                        check(tf_g, torch_g)
                # Compare (unchanged(!) actor grads) with tf ones.
                torch_a_grads = [
                    v.grad for v in policy.model.policy_variables()
                ]
                for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g.detach().cpu()))
                    else:
                        check(tf_g, torch_g)

                # Test alpha gradient.
                policy.alpha_optim.zero_grad()
                assert policy.model.log_alpha.grad is None
                e.backward()
                assert policy.model.log_alpha.grad is not None
                check(policy.model.log_alpha.grad, tf_e_grads)

            check(c, expect_c)
            check(a, expect_a)
            check(e, expect_e)
            check(t, expect_t)

            # Store this framework's losses in prev_fw_loss to compare with
            # next framework's outputs.
            if prev_fw_loss is not None:
                check(c, prev_fw_loss[0])
                check(a, prev_fw_loss[1])
                check(e, prev_fw_loss[2])
                check(t, prev_fw_loss[3])

            prev_fw_loss = (c, a, e, t)

            # Update weights from our batch (n times).
            for update_iteration in range(10):
                print("train iteration {}".format(update_iteration))
                if fw == "tf":
                    in_ = self._get_batch_helper(obs_size, actions, batch_size)
                    tf_inputs.append(in_)
                    # Set a fake-batch to use
                    # (instead of sampling from replay buffer).
                    buf = LocalReplayBuffer.get_instance_for_testing()
                    buf._fake_batch = in_
                    trainer.train()
                    updated_weights = policy.get_weights()
                    # Net must have changed.
                    if tf_updated_weights:
                        check(updated_weights[
                            "default_policy/sequential/action_1/kernel"],
                              tf_updated_weights[-1]
                              ["default_policy/sequential/action_1/kernel"],
                              false=True)
                    tf_updated_weights.append(updated_weights)

                # Compare with updated tf-weights. Must all be the same.
                else:
                    tf_weights = tf_updated_weights[update_iteration]
                    in_ = tf_inputs[update_iteration]
                    # Set a fake-batch to use
                    # (instead of sampling from replay buffer).
                    buf = LocalReplayBuffer.get_instance_for_testing()
                    buf._fake_batch = in_
                    trainer.train()
                    # Compare updated model.
                    for tf_key in sorted(tf_weights.keys())[2:10]:
                        tf_var = tf_weights[tf_key]
                        torch_var = policy.model.state_dict()[map_[tf_key]]
                        if tf_var.shape != torch_var.shape:
                            check(tf_var,
                                  np.transpose(torch_var.detach().cpu()),
                                  rtol=0.05)
                        else:
                            check(tf_var, torch_var, rtol=0.05)
                    # And alpha.
                    check(policy.model.log_alpha,
                          tf_weights["default_policy/log_alpha"])
                    # Compare target nets.
                    for tf_key in sorted(tf_weights.keys())[10:18]:
                        tf_var = tf_weights[tf_key]
                        torch_var = policy.target_model.state_dict()[
                            map_[tf_key]]
                        if tf_var.shape != torch_var.shape:
                            check(tf_var,
                                  np.transpose(torch_var.detach().cpu()),
                                  rtol=0.05)
                        else:
                            check(tf_var, torch_var, rtol=0.05)
Example #8
    def test_diag_gaussian(self):
        """Tests the DiagGaussian ActionDistribution for all frameworks."""
        input_space = Box(-2.0, 1.0, shape=(2000, 10))

        for fw, sess in framework_iterator(frameworks=("torch", "tf", "tfe"),
                                           session=True):
            cls = DiagGaussian if fw != "torch" else TorchDiagGaussian

            # Do a stability test using extreme NN outputs to see whether
            # sampling and logp'ing result in NaN or +/-inf values.
            self._stability_test(cls, input_space.shape, fw=fw, sess=sess)

            # Batch of size=n and deterministic.
            inputs = input_space.sample()
            means, _ = np.split(inputs, 2, axis=-1)
            diag_distribution = cls(inputs, {})
            expected = means
            # Sample n times, expect always mean value (deterministic draw).
            out = diag_distribution.deterministic_sample()
            check(out, expected)

            # Batch of size=n and non-deterministic -> expect roughly the mean.
            inputs = input_space.sample()
            means, log_stds = np.split(inputs, 2, axis=-1)
            diag_distribution = cls(inputs, {})
            expected = means
            values = diag_distribution.sample()
            if sess:
                values = sess.run(values)
            else:
                values = values.numpy()
            check(np.mean(values), expected.mean(), decimals=1)

            # Test log-likelihood outputs.
            sampled_action_logp = diag_distribution.logp(
                values if fw != "torch" else torch.Tensor(values))
            if sess:
                sampled_action_logp = sess.run(sampled_action_logp)
            else:
                sampled_action_logp = sampled_action_logp.numpy()

            # NN output.
            means = np.array(
                [[0.1, 0.2, 0.3, 0.4, 50.0], [-0.1, -0.2, -0.3, -0.4, -1.0]],
                dtype=np.float32)
            log_stds = np.array(
                [[0.8, -0.2, 0.3, -1.0, 2.0], [0.7, -0.3, 0.4, -0.9, 2.0]],
                dtype=np.float32)

            diag_distribution = cls(inputs=np.concatenate([means, log_stds],
                                                          axis=-1),
                                    model={})
            # Convert to parameters for distr.
            stds = np.exp(log_stds)
            # Values to get log-likelihoods for.
            values = np.array([[0.9, 0.2, 0.4, -0.1, -1.05],
                               [-0.9, -0.2, 0.4, -0.1, -1.05]])

            # Get the log-likelihood from a regular Gaussian.
            log_prob = np.sum(np.log(norm.pdf(values, means, stds)), -1)

            outs = diag_distribution.logp(
                values if fw != "torch" else torch.Tensor(values))
            if sess:
                outs = sess.run(outs)
            check(outs, log_prob, decimals=4)
Example #9
    def test_multi_action_distribution(self):
        """Tests the MultiActionDistribution (across all frameworks)."""
        batch_size = 1000
        input_space = Tuple([
            Box(-10.0, 10.0, shape=(batch_size, 4)),
            Box(-2.0, 2.0, shape=(
                batch_size,
                6,
            )),
            Dict({"a": Box(-1.0, 1.0, shape=(batch_size, 4))}),
        ])
        std_space = Box(-0.05, 0.05, shape=(
            batch_size,
            3,
        ))

        low, high = -1.0, 1.0
        value_space = Tuple([
            Box(0, 3, shape=(batch_size, ), dtype=np.int32),
            Box(-2.0, 2.0, shape=(batch_size, 3), dtype=np.float32),
            Dict({"a": Box(0.0, 1.0, shape=(batch_size, 2), dtype=np.float32)})
        ])

        for fw, sess in framework_iterator(session=True):
            if fw == "torch":
                cls = TorchMultiActionDistribution
                child_distr_cls = [
                    TorchCategorical, TorchDiagGaussian,
                    partial(TorchBeta, low=low, high=high)
                ]
            else:
                cls = MultiActionDistribution
                child_distr_cls = [
                    Categorical,
                    DiagGaussian,
                    partial(Beta, low=low, high=high),
                ]

            inputs = list(input_space.sample())
            distr = cls(np.concatenate([inputs[0], inputs[1], inputs[2]["a"]],
                                       axis=1),
                        model={},
                        action_space=value_space,
                        child_distributions=child_distr_cls,
                        input_lens=[4, 6, 4])

            # Adjust inputs for the Beta distr just as Beta itself does.
            inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER),
                                     -np.log(SMALL_NUMBER))
            inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0
            # Sample deterministically.
            expected_det = [
                np.argmax(inputs[0], axis=-1),
                inputs[1][:, :3],  # [:3]=Mean values.
                # Mean for a Beta distribution:
                # 1 / [1 + (beta/alpha)] * range + low
                (1.0 /
                 (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, 0:2])) *
                (high - low) + low,
            ]
            out = distr.deterministic_sample()
            if sess:
                out = sess.run(out)
            check(out[0], expected_det[0])
            check(out[1], expected_det[1])
            check(out[2]["a"], expected_det[2])

            # Stochastic sampling -> expect roughly the mean.
            inputs = list(input_space.sample())
            # Fix categorical inputs (not needed for distribution itself, but
            # for our expectation calculations).
            inputs[0] = softmax(inputs[0], -1)
            # Fix std inputs (shouldn't be too large for this test).
            inputs[1][:, 3:] = std_space.sample()
            # Adjust inputs for the Beta distr just as Beta itself does.
            inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER),
                                     -np.log(SMALL_NUMBER))
            inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0
            distr = cls(np.concatenate([inputs[0], inputs[1], inputs[2]["a"]],
                                       axis=1),
                        model={},
                        action_space=value_space,
                        child_distributions=child_distr_cls,
                        input_lens=[4, 6, 4])
            expected_mean = [
                np.mean(np.sum(inputs[0] * np.array([0, 1, 2, 3]), -1)),
                inputs[1][:, :3],  # [:3]=Mean values.
                # Mean for a Beta distribution:
                # 1 / [1 + (beta/alpha)] * range + low
                (1.0 / (1.0 + inputs[2]["a"][:, 2:] / inputs[2]["a"][:, :2])) *
                (high - low) + low,
            ]
            out = distr.sample()
            if sess:
                out = sess.run(out)
            out = list(out)
            if fw == "torch":
                out[0] = out[0].numpy()
                out[1] = out[1].numpy()
                out[2]["a"] = out[2]["a"].numpy()
            check(np.mean(out[0]), expected_mean[0], decimals=1)
            check(np.mean(out[1], 0), np.mean(expected_mean[1], 0), decimals=1)
            check(np.mean(out[2]["a"], 0),
                  np.mean(expected_mean[2], 0),
                  decimals=1)

            # Test log-likelihood outputs.
            # Make sure beta-values are within 0.0 and 1.0 for the numpy
            # calculation (which doesn't have scaling).
            inputs = list(input_space.sample())
            # Adjust inputs for the Beta distr just as Beta itself does.
            inputs[2]["a"] = np.clip(inputs[2]["a"], np.log(SMALL_NUMBER),
                                     -np.log(SMALL_NUMBER))
            inputs[2]["a"] = np.log(np.exp(inputs[2]["a"]) + 1.0) + 1.0
            distr = cls(np.concatenate([inputs[0], inputs[1], inputs[2]["a"]],
                                       axis=1),
                        model={},
                        action_space=value_space,
                        child_distributions=child_distr_cls,
                        input_lens=[4, 6, 4])
            inputs[0] = softmax(inputs[0], -1)
            values = list(value_space.sample())
            log_prob_beta = np.log(
                beta.pdf(values[2]["a"], inputs[2]["a"][:, :2],
                         inputs[2]["a"][:, 2:]))
            # Now do the up-scaling for [2] (beta values) to be between
            # low/high.
            values[2]["a"] = values[2]["a"] * (high - low) + low
            inputs[1][:, 3:] = np.exp(inputs[1][:, 3:])
            expected_log_llh = np.sum(
                np.concatenate([
                    np.expand_dims(
                        np.log(
                            [i[values[0][j]]
                             for j, i in enumerate(inputs[0])]), -1),
                    np.log(
                        norm.pdf(values[1], inputs[1][:, :3],
                                 inputs[1][:, 3:])), log_prob_beta
                ], -1), -1)

            values[0] = np.expand_dims(values[0], -1)
            if fw == "torch":
                values = tree.map_structure(lambda s: torch.Tensor(s), values)
            # Test all flattened input.
            concat = np.concatenate(tree.flatten(values),
                                    -1).astype(np.float32)
            out = distr.logp(concat)
            if sess:
                out = sess.run(out)
            check(out, expected_log_llh, atol=15)
            # Test structured input.
            out = distr.logp(values)
            if sess:
                out = sess.run(out)
            check(out, expected_log_llh, atol=15)
            # Test flattened input.
            out = distr.logp(tree.flatten(values))
            if sess:
                out = sess.run(out)
            check(out, expected_log_llh, atol=15)
Example #10
    def test_multi_categorical(self):
        batch_size = 100
        num_categories = 3
        num_sub_distributions = 5
        # Create 5 categorical distributions of 3 categories each.
        inputs_space = Box(-1.0,
                           2.0,
                           shape=(batch_size,
                                  num_sub_distributions * num_categories))
        values_space = Box(0,
                           num_categories - 1,
                           shape=(num_sub_distributions, batch_size),
                           dtype=np.int32)

        inputs = inputs_space.sample()
        input_lengths = [num_categories] * num_sub_distributions
        inputs_split = np.split(inputs, num_sub_distributions, axis=1)

        for fw, sess in framework_iterator(session=True):
            # Create the correct distribution object.
            cls = MultiCategorical if fw != "torch" else TorchMultiCategorical
            multi_categorical = cls(inputs, None, input_lengths)

            # Do a stability test using extreme NN outputs to see whether
            # sampling and logp'ing result in NaN or +/-inf values.
            self._stability_test(cls,
                                 inputs_space.shape,
                                 fw=fw,
                                 sess=sess,
                                 bounds=(0, num_categories - 1),
                                 extra_kwargs={"input_lens": input_lengths})

            # Batch of size=n and deterministic.
            expected = np.transpose(np.argmax(inputs_split, axis=-1))
            # Sample, expect always max value
            # (max likelihood for deterministic draw).
            out = multi_categorical.deterministic_sample()
            check(out, expected)

            # Batch of size=n and non-deterministic -> expect roughly the mean.
            out = multi_categorical.sample()
            check(tf.reduce_mean(out)
                  if fw != "torch" else torch.mean(out.float()),
                  1.0,
                  decimals=0)

            # Test log-likelihood outputs.
            probs = softmax(inputs_split)
            values = values_space.sample()

            out = multi_categorical.logp(values if fw != "torch" else [
                torch.Tensor(values[i]) for i in range(num_sub_distributions)
            ])
            expected = []
            for i in range(batch_size):
                expected.append(
                    np.sum(
                        np.log(
                            np.array([
                                probs[j][i][values[j][i]]
                                for j in range(num_sub_distributions)
                            ]))))
            check(out, expected, decimals=4)

            # Test entropy outputs.
            out = multi_categorical.entropy()
            expected_entropy = -np.sum(np.sum(probs * np.log(probs), 0), -1)
            check(out, expected_entropy)
Example #11
    def test_squashed_gaussian(self):
        """Tests the SquashedGaussian ActionDistribution for all frameworks."""
        input_space = Box(-2.0, 2.0, shape=(2000, 10))
        low, high = -2.0, 1.0

        for fw, sess in framework_iterator(frameworks=("torch", "tf", "tfe"),
                                           session=True):
            cls = SquashedGaussian if fw != "torch" else TorchSquashedGaussian

            # Do a stability test using extreme NN outputs to see whether
            # sampling and logp'ing result in NaN or +/-inf values.
            self._stability_test(cls,
                                 input_space.shape,
                                 fw=fw,
                                 sess=sess,
                                 bounds=(low, high))

            # Batch of size=n and deterministic.
            inputs = input_space.sample()
            means, _ = np.split(inputs, 2, axis=-1)
            squashed_distribution = cls(inputs, {}, low=low, high=high)
            expected = ((np.tanh(means) + 1.0) / 2.0) * (high - low) + low
            # Sample n times, expect always mean value (deterministic draw).
            out = squashed_distribution.deterministic_sample()
            check(out, expected)

            # Batch of size=n and non-deterministic -> expect roughly the mean.
            inputs = input_space.sample()
            means, log_stds = np.split(inputs, 2, axis=-1)
            squashed_distribution = cls(inputs, {}, low=low, high=high)
            expected = ((np.tanh(means) + 1.0) / 2.0) * (high - low) + low
            values = squashed_distribution.sample()
            if sess:
                values = sess.run(values)
            else:
                values = values.numpy()
            self.assertTrue(np.max(values) <= high)
            self.assertTrue(np.min(values) >= low)

            check(np.mean(values), expected.mean(), decimals=1)

            # Test log-likelihood outputs.
            sampled_action_logp = squashed_distribution.logp(
                values if fw != "torch" else torch.Tensor(values))
            if sess:
                sampled_action_logp = sess.run(sampled_action_logp)
            else:
                sampled_action_logp = sampled_action_logp.numpy()
            # Convert to parameters for distr.
            stds = np.exp(
                np.clip(log_stds, MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT))
            # Unsquash values, then get log-llh from regular gaussian.
            normed_values = (values - low) / (high - low) * 2.0 - 1.0
            safe_normed_values = np.clip(
                normed_values, -1.0 + SMALL_NUMBER, 1.0 - SMALL_NUMBER)
            unsquashed_values = np.arctanh(safe_normed_values)
            log_prob_unsquashed = np.sum(
                np.log(norm.pdf(unsquashed_values, means, stds)), -1)
            log_prob = log_prob_unsquashed - \
                np.sum(np.log(1 - np.tanh(unsquashed_values) ** 2),
                       axis=-1)
            check(np.sum(sampled_action_logp), np.sum(log_prob), rtol=0.05)

            # NN output.
            means = np.array([[0.1, 0.2, 0.3, 0.4, 50.0],
                              [-0.1, -0.2, -0.3, -0.4, -1.0]])
            log_stds = np.array([[0.8, -0.2, 0.3, -1.0, 2.0],
                                 [0.7, -0.3, 0.4, -0.9, 2.0]])
            squashed_distribution = cls(inputs=np.concatenate(
                [means, log_stds], axis=-1),
                                        model={},
                                        low=low,
                                        high=high)
            # Convert to parameters for distr.
            stds = np.exp(log_stds)
            # Values to get log-likelihoods for.
            values = np.array([[0.9, 0.2, 0.4, -0.1, -1.05],
                               [-0.9, -0.2, 0.4, -0.1, -1.05]])

            # Unsquash values, then get log-llh from regular gaussian.
            unsquashed_values = np.arctanh((values - low) /
                                           (high - low) * 2.0 - 1.0)
            log_prob_unsquashed = \
                np.sum(np.log(norm.pdf(unsquashed_values, means, stds)), -1)
            log_prob = log_prob_unsquashed - \
                np.sum(np.log(1 - np.tanh(unsquashed_values) ** 2),
                       axis=-1)

            outs = squashed_distribution.logp(
                values if fw != "torch" else torch.Tensor(values))
            if sess:
                outs = sess.run(outs)
            check(outs, log_prob, decimals=4)
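The correction term used above, subtracting sum(log(1 - tanh(u)^2)) from the Gaussian log-likelihood, is the change-of-variables formula for a = tanh(u). A tiny standalone numpy/scipy check of that identity (an illustrative sketch, independent of RLlib; the point value and distribution parameters are arbitrary):

import numpy as np
from scipy.stats import norm

mean, std = 0.3, 0.5
rng = np.random.default_rng(0)
u = rng.normal(mean, std, size=200000)
a = np.tanh(u)

# Density of a = tanh(u) via change of variables:
#   p_a(a) = p_u(arctanh(a)) / (1 - tanh(arctanh(a))^2)
point = 0.2
u_point = np.arctanh(point)
analytic = norm.pdf(u_point, mean, std) / (1.0 - np.tanh(u_point) ** 2)

# Crude empirical density estimate from the samples (bin width 0.02).
empirical = np.mean(np.abs(a - point) < 0.01) / 0.02
print(analytic, empirical)  # Should agree to roughly two decimals.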
Example #12
    def test_a2c_exec_impl(ray_start_regular):
        config = {"min_iter_time_s": 0}
        for _ in framework_iterator(config, ("tf", "torch")):
            trainer = a3c.A2CTrainer(env="CartPole-v0", config=config)
            assert isinstance(trainer.train(), dict)
            check_compute_single_action(trainer)
Example #13
    def test_sac_compilation(self):
        """Tests whether an SACTrainer can be built with all frameworks."""
        config = sac.DEFAULT_CONFIG.copy()
        config["Q_model"] = sac.DEFAULT_CONFIG["Q_model"].copy()
        config["num_workers"] = 0  # Run locally.
        config["n_step"] = 3
        config["twin_q"] = True
        config["learning_starts"] = 0
        config["prioritized_replay"] = True
        config["rollout_fragment_length"] = 10
        config["train_batch_size"] = 10
        # If we use default buffer size (1e6), the buffer will take up
        # 169.445 GB memory, which is beyond travis-ci's current (Mar 19, 2021)
        # available system memory (8.34816 GB).
        config["replay_buffer_config"]["capacity"] = 40000
        # Test with saved replay buffer.
        config["store_buffer_in_checkpoints"] = True
        num_iterations = 1

        ModelCatalog.register_custom_model("batch_norm", KerasBatchNormModel)
        ModelCatalog.register_custom_model("batch_norm_torch", TorchBatchNormModel)

        image_space = Box(-1.0, 1.0, shape=(84, 84, 3))
        simple_space = Box(-1.0, 1.0, shape=(3,))

        tune.register_env(
            "random_dict_env",
            lambda _: RandomEnv(
                {
                    "observation_space": Dict(
                        {
                            "a": simple_space,
                            "b": Discrete(2),
                            "c": image_space,
                        }
                    ),
                    "action_space": Box(-1.0, 1.0, shape=(1,)),
                }
            ),
        )
        tune.register_env(
            "random_tuple_env",
            lambda _: RandomEnv(
                {
                    "observation_space": Tuple(
                        [simple_space, Discrete(2), image_space]
                    ),
                    "action_space": Box(-1.0, 1.0, shape=(1,)),
                }
            ),
        )

        for fw in framework_iterator(config, with_eager_tracing=True):
            # Test for different env types (discrete w/ and w/o image, + cont).
            for env in [
                "random_dict_env",
                "random_tuple_env",
                # "MsPacmanNoFrameskip-v4",
                "CartPole-v0",
            ]:
                print("Env={}".format(env))
                # Test making the Q-model a custom one for CartPole, otherwise,
                # use the default model.
                config["Q_model"]["custom_model"] = (
                    "batch_norm{}".format("_torch" if fw == "torch" else "")
                    if env == "CartPole-v0"
                    else None
                )
                trainer = sac.SACTrainer(config=config, env=env)
                for i in range(num_iterations):
                    results = trainer.train()
                    check_train_results(results)
                    print(results)
                check_compute_single_action(trainer)

                # Test whether the replay buffer is saved along with
                # a checkpoint (no point in doing it for all frameworks since
                # this is framework agnostic).
                if fw == "tf" and env == "CartPole-v0":
                    checkpoint = trainer.save()
                    new_trainer = sac.SACTrainer(config, env=env)
                    new_trainer.restore(checkpoint)
                    # Get some data from the buffer and compare.
                    data = trainer.local_replay_buffer.replay_buffers[
                        "default_policy"
                    ]._storage[: 42 + 42]
                    new_data = new_trainer.local_replay_buffer.replay_buffers[
                        "default_policy"
                    ]._storage[: 42 + 42]
                    check(data, new_data)
                    new_trainer.stop()

                trainer.stop()
Example #14
    def test_marwil_loss_function(self):
        """
        To generate the historic data used in this test case, first run:
        $ ./train.py --run=PPO --env=CartPole-v0 \
          --stop='{"timesteps_total": 50000}' \
          --config='{"output": "/tmp/out", "batch_mode": "complete_episodes"}'
        """
        rllib_dir = Path(__file__).parent.parent.parent.parent
        print("rllib dir={}".format(rllib_dir))
        data_file = os.path.join(rllib_dir, "tests/data/cartpole/small.json")
        print("data_file={} exists={}".format(data_file, os.path.isfile(data_file)))
        config = marwil.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        # Learn from offline data.
        config["input"] = [data_file]

        for fw, sess in framework_iterator(config, session=True):
            reader = JsonReader(inputs=[data_file])
            batch = reader.next()

            trainer = marwil.MARWILTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()
            model = policy.model

            # Calculate our own expected values (to then compare against the
            # agent's loss output).
            cumulative_rewards = compute_advantages(
                batch, 0.0, config["gamma"], 1.0, False, False
            )["advantages"]
            if fw == "torch":
                cumulative_rewards = torch.tensor(cumulative_rewards)
            if fw != "tf":
                batch = policy._lazy_tensor_dict(batch)
            model_out, _ = model(batch)
            vf_estimates = model.value_function()
            if fw == "tf":
                model_out, vf_estimates = policy.get_session().run(
                    [model_out, vf_estimates]
                )
            adv = cumulative_rewards - vf_estimates
            if fw == "torch":
                adv = adv.detach().cpu().numpy()
            adv_squared = np.mean(np.square(adv))
            c_2 = 100.0 + 1e-8 * (adv_squared - 100.0)
            c = np.sqrt(c_2)
            exp_advs = np.exp(config["beta"] * (adv / c))
            dist = policy.dist_class(model_out, model)
            logp = dist.logp(batch["actions"])
            if fw == "torch":
                logp = logp.detach().cpu().numpy()
            elif fw == "tf":
                logp = sess.run(logp)
            # Calculate all expected loss components.
            expected_vf_loss = 0.5 * adv_squared
            expected_pol_loss = -1.0 * np.mean(exp_advs * logp)
            expected_loss = expected_pol_loss + config["vf_coeff"] * expected_vf_loss

            # Calculate the algorithm's loss (to check against our own
            # calculation above).
            batch.set_get_interceptor(None)
            postprocessed_batch = policy.postprocess_trajectory(batch)
            loss_func = (
                marwil.marwil_tf_policy.marwil_loss
                if fw != "torch"
                else marwil.marwil_torch_policy.marwil_loss
            )
            if fw != "tf":
                policy._lazy_tensor_dict(postprocessed_batch)
                loss_out = loss_func(
                    policy, model, policy.dist_class, postprocessed_batch
                )
            else:
                loss_out, v_loss, p_loss = policy.get_session().run(
                    [policy._loss, policy.loss.v_loss, policy.loss.p_loss],
                    feed_dict=policy._get_loss_inputs_dict(
                        postprocessed_batch, shuffle=False
                    ),
                )

            # Check all components.
            if fw == "torch":
                check(policy.v_loss, expected_vf_loss, decimals=4)
                check(policy.p_loss, expected_pol_loss, decimals=4)
            elif fw == "tf":
                check(v_loss, expected_vf_loss, decimals=4)
                check(p_loss, expected_pol_loss, decimals=4)
            else:
                check(policy.loss.v_loss, expected_vf_loss, decimals=4)
                check(policy.loss.p_loss, expected_pol_loss, decimals=4)
            check(loss_out, expected_loss, decimals=3)
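The expected-loss arithmetic in this test is small enough to redo by hand. A numpy-only illustration of the same formulas, using made-up advantage and log-prob values and beta = vf_coeff = 1.0 purely for illustration:

import numpy as np

beta, vf_coeff = 1.0, 1.0
adv = np.array([1.5, -0.5, 0.25])   # cumulative_rewards - vf_estimates
logp = np.array([-0.7, -1.2, -0.4])

adv_squared = np.mean(np.square(adv))
# Mirrors c_2 = 100.0 + 1e-8 * (adv_squared - 100.0) above.
c = np.sqrt(100.0 + 1e-8 * (adv_squared - 100.0))
exp_advs = np.exp(beta * (adv / c))

vf_loss = 0.5 * adv_squared
pol_loss = -np.mean(exp_advs * logp)
total_loss = pol_loss + vf_coeff * vf_loss
print(vf_loss, pol_loss, total_loss)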
Example #15
def evaluate_test(algo, env="CartPole-v0", test_episode_rollout=False):
    extra_config = ""
    if algo == "ARS":
        extra_config = ',"train_batch_size": 10, "noise_size": 250000'
    elif algo == "ES":
        extra_config = (
            ',"episodes_per_batch": 1,"train_batch_size": 10, ' '"noise_size": 250000'
        )

    for fw in framework_iterator(frameworks=("tf", "torch")):
        fw_ = ', "framework": "{}"'.format(fw)

        tmp_dir = os.popen("mktemp -d").read()[:-1]
        if not os.path.exists(tmp_dir):
            sys.exit(1)

        print("Saving results to {}".format(tmp_dir))

        rllib_dir = str(Path(__file__).parent.parent.absolute())
        print("RLlib dir = {}\nexists={}".format(rllib_dir, os.path.exists(rllib_dir)))
        os.system(
            "python {}/train.py --local-dir={} --run={} "
            "--checkpoint-freq=1 ".format(rllib_dir, tmp_dir, algo)
            + "--config='{"
            + '"num_workers": 1, "num_gpus": 0{}{}'.format(fw_, extra_config)
            + ', "min_sample_timesteps_per_reporting": 5,'
            '"min_time_s_per_reporting": 0.1, '
            '"model": {"fcnet_hiddens": [10]}'
            "}' --stop='{\"training_iteration\": 1}'" + " --env={}".format(env)
        )

        checkpoint_path = os.popen(
            "ls {}/default/*/checkpoint_000001/checkpoint-1".format(tmp_dir)
        ).read()[:-1]
        if not os.path.exists(checkpoint_path):
            sys.exit(1)
        print("Checkpoint path {} (exists)".format(checkpoint_path))

        # Test rolling out n steps.
        os.popen(
            'python {}/evaluate.py --run={} "{}" --steps=10 '
            '--out="{}/rollouts_10steps.pkl" --no-render'.format(
                rllib_dir, algo, checkpoint_path, tmp_dir
            )
        ).read()
        if not os.path.exists(tmp_dir + "/rollouts_10steps.pkl"):
            sys.exit(1)
        print("evaluate output (10 steps) exists!")

        # Test rolling out 1 episode.
        if test_episode_rollout:
            os.popen(
                'python {}/evaluate.py --run={} "{}" --episodes=1 '
                '--out="{}/rollouts_1episode.pkl" --no-render'.format(
                    rllib_dir, algo, checkpoint_path, tmp_dir
                )
            ).read()
            if not os.path.exists(tmp_dir + "/rollouts_1episode.pkl"):
                sys.exit(1)
            print("evaluate output (1 ep) exists!")

        # Cleanup.
        os.popen('rm -rf "{}"'.format(tmp_dir)).read()
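The test above creates its working directory by shelling out to `mktemp -d` and bails out via `sys.exit(1)` if that fails. As a side note, here is a short sketch of the same setup/teardown done with the standard library instead (an alternative pattern, not what the RLlib test itself does):

import os
import shutil
import tempfile

tmp_dir = tempfile.mkdtemp(prefix="rllib_eval_")
try:
    assert os.path.isdir(tmp_dir)
    # ... run train.py / evaluate.py and write results into tmp_dir ...
finally:
    shutil.rmtree(tmp_dir, ignore_errors=True)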
Example #16
0
    def test_dummy_components(self):

        # Bazel makes it hard to find files specified in `args`
        # (and `data`).
        # Use the true absolute path.
        script_dir = Path(__file__).parent
        abs_path = script_dir.absolute()

        for fw, sess in framework_iterator(session=True):
            fw_ = fw if fw != "tfe" else "tf"
            # Try to create from an abstract class w/o default constructor.
            # Expect None.
            test = from_config({
                "type": AbstractDummyComponent,
                "framework": fw_
            })
            check(test, None)

            # Create a Component via python API (config dict).
            component = from_config(
                dict(type=DummyComponent,
                     prop_a=1.0,
                     prop_d="non_default",
                     framework=fw_))
            check(component.prop_d, "non_default")

            # Create a tf Component from json file.
            config_file = str(abs_path.joinpath("dummy_config.json"))
            component = from_config(config_file, framework=fw_)
            check(component.prop_c, "default")
            check(component.prop_d, 4)  # default
            value = component.add(3.3)
            if sess:
                value = sess.run(value)
            check(value, 5.3)  # prop_b == 2.0

            # Create a torch Component from yaml file.
            config_file = str(abs_path.joinpath("dummy_config.yml"))
            component = from_config(config_file, framework=fw_)
            check(component.prop_a, "something else")
            check(component.prop_d, 3)
            value = component.add(1.2)
            if sess:
                value = sess.run(value)
            check(value, np.array([2.2]))  # prop_b == 1.0

            # Create tf Component from json-string (e.g. on command line).
            component = from_config(
                '{"type": "ray.rllib.utils.tests.'
                'test_framework_agnostic_components.DummyComponent", '
                '"prop_a": "A", "prop_b": -1.0, "prop_c": "non-default", '
                '"framework": "' + fw_ + '"}')
            check(component.prop_a, "A")
            check(component.prop_d, 4)  # default
            value = component.add(-1.1)
            if sess:
                value = sess.run(value)
            check(value, -2.1)  # prop_b == -1.0

            # Test recognizing default module path.
            component = from_config(
                DummyComponent, '{"type": "NonAbstractChildOfDummyComponent", '
                '"prop_a": "A", "prop_b": -1.0, "prop_c": "non-default",'
                '"framework": "' + fw_ + '"}')
            check(component.prop_a, "A")
            check(component.prop_d, 4)  # default
            value = component.add(-1.1)
            if sess:
                value = sess.run(value)
            check(value, -2.1)  # prop_b == -1.0

            # Test recognizing default package path.
            scope = None
            if sess:
                scope = tf.variable_scope("exploration_object")
                scope.__enter__()
            component = from_config(
                Exploration, {
                    "type": "EpsilonGreedy",
                    "action_space": Discrete(2),
                    "framework": fw_,
                    "num_workers": 0,
                    "worker_index": 0,
                    "policy_config": {},
                    "model": None
                })
            if scope:
                scope.__exit__(None, None, None)
            check(component.epsilon_schedule.outside_value, 0.05)  # default

            # Create torch Component from yaml-string.
            component = from_config(
                "type: ray.rllib.utils.tests."
                "test_framework_agnostic_components.DummyComponent\n"
                "prop_a: B\nprop_b: -1.5\nprop_c: non-default\nframework: "
                "{}".format(fw_))
            check(component.prop_a, "B")
            check(component.prop_d, 4)  # default
            value = component.add(-5.1)
            if sess:
                value = sess.run(value)
            check(value, np.array([-6.6]))  # prop_b == -1.5
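The test above exercises RLlib's `from_config` utility with dicts, JSON/YAML files and strings. For readers unfamiliar with the pattern, the toy sketch below shows the core idea of config-driven instantiation via a "type" key; it is a hypothetical helper, not RLlib's actual `from_config` implementation.

import importlib

def build_from_config(config):
    """Instantiate the class named (or referenced) by config["type"]."""
    cfg = dict(config)
    type_ = cfg.pop("type")
    if isinstance(type_, str):
        # Resolve "some.module.ClassName" strings to the class object.
        module_name, _, cls_name = type_.rpartition(".")
        type_ = getattr(importlib.import_module(module_name), cls_name)
    # Remaining keys become constructor kwargs.
    return type_(**cfg)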
Example #17
0
File: test_ddpg.py Project: krfricke/ray
    def test_ddpg_loss_function(self):
        """Tests DDPG loss function results across all frameworks."""
        config = ddpg.DEFAULT_CONFIG.copy()
        # Run locally.
        config["seed"] = 42
        config["num_workers"] = 0
        config["learning_starts"] = 0
        config["twin_q"] = True
        config["use_huber"] = True
        config["huber_threshold"] = 1.0
        config["gamma"] = 0.99
        # Make this small (seems to introduce errors).
        config["l2_reg"] = 1e-10
        config["prioritized_replay"] = False
        # Use very simple nets.
        config["actor_hiddens"] = [10]
        config["critic_hiddens"] = [10]
        # Make sure timing differences do not affect trainer.train().
        config["min_time_s_per_reporting"] = 0
        config["timesteps_per_iteration"] = 100

        map_ = {
            # Normal net.
            "default_policy/actor_hidden_0/kernel": "policy_model.action_0._model.0.weight",
            "default_policy/actor_hidden_0/bias": "policy_model.action_0._model.0.bias",
            "default_policy/actor_out/kernel": "policy_model.action_out._model.0.weight",
            "default_policy/actor_out/bias": "policy_model.action_out._model.0.bias",
            "default_policy/sequential/q_hidden_0/kernel": "q_model.q_hidden_0._model.0.weight",
            "default_policy/sequential/q_hidden_0/bias": "q_model.q_hidden_0._model.0.bias",
            "default_policy/sequential/q_out/kernel": "q_model.q_out._model.0.weight",
            "default_policy/sequential/q_out/bias": "q_model.q_out._model.0.bias",
            # -- twin.
            "default_policy/sequential_1/twin_q_hidden_0/kernel": "twin_q_model.twin_q_hidden_0._model.0.weight",
            "default_policy/sequential_1/twin_q_hidden_0/bias": "twin_q_model.twin_q_hidden_0._model.0.bias",
            "default_policy/sequential_1/twin_q_out/kernel": "twin_q_model.twin_q_out._model.0.weight",
            "default_policy/sequential_1/twin_q_out/bias": "twin_q_model.twin_q_out._model.0.bias",
            # Target net.
            "default_policy/actor_hidden_0_1/kernel": "policy_model.action_0._model.0.weight",
            "default_policy/actor_hidden_0_1/bias": "policy_model.action_0._model.0.bias",
            "default_policy/actor_out_1/kernel": "policy_model.action_out._model.0.weight",
            "default_policy/actor_out_1/bias": "policy_model.action_out._model.0.bias",
            "default_policy/sequential_2/q_hidden_0/kernel": "q_model.q_hidden_0._model.0.weight",
            "default_policy/sequential_2/q_hidden_0/bias": "q_model.q_hidden_0._model.0.bias",
            "default_policy/sequential_2/q_out/kernel": "q_model.q_out._model.0.weight",
            "default_policy/sequential_2/q_out/bias": "q_model.q_out._model.0.bias",
            # -- twin.
            "default_policy/sequential_3/twin_q_hidden_0/kernel": "twin_q_model.twin_q_hidden_0._model.0.weight",
            "default_policy/sequential_3/twin_q_hidden_0/bias": "twin_q_model.twin_q_hidden_0._model.0.bias",
            "default_policy/sequential_3/twin_q_out/kernel": "twin_q_model.twin_q_out._model.0.weight",
            "default_policy/sequential_3/twin_q_out/bias": "twin_q_model.twin_q_out._model.0.bias",
        }

        env = SimpleEnv
        batch_size = 100
        obs_size = (batch_size, 1)
        actions = np.random.random(size=(batch_size, 1))

        # Batch of size=n.
        input_ = self._get_batch_helper(obs_size, actions, batch_size)

        # Simply compare loss values AND grads of all frameworks with each
        # other.
        prev_fw_loss = weights_dict = None
        expect_c, expect_a, expect_t = None, None, None
        # History of tf-updated NN-weights over n training steps.
        tf_updated_weights = []
        # History of input batches used.
        tf_inputs = []
        for fw, sess in framework_iterator(config,
                                           frameworks=("tf", "torch"),
                                           session=True):
            # Generate Trainer and get its default Policy object.
            trainer = ddpg.DDPGTrainer(config=config, env=env)
            policy = trainer.get_policy()
            p_sess = None
            if sess:
                p_sess = policy.get_session()

            # Set all weights (of all nets) to fixed values.
            if weights_dict is None:
                assert fw == "tf"  # Start with the tf vars-dict.
                weights_dict = policy.get_weights()
            else:
                assert fw == "torch"  # Then transfer that to torch Model.
                model_dict = self._translate_weights_to_torch(
                    weights_dict, map_)
                policy.model.load_state_dict(model_dict)
                policy.target_model.load_state_dict(model_dict)

            if fw == "torch":
                # Actually convert to torch tensors.
                input_ = policy._lazy_tensor_dict(input_)
                input_ = {k: input_[k] for k in input_.keys()}

            # Only run the expectation once; it should be the same anyway
            # for all frameworks.
            if expect_c is None:
                expect_c, expect_a, expect_t = self._ddpg_loss_helper(
                    input_,
                    weights_dict,
                    sorted(weights_dict.keys()),
                    fw,
                    gamma=config["gamma"],
                    huber_threshold=config["huber_threshold"],
                    l2_reg=config["l2_reg"],
                    sess=sess,
                )

            # Get actual outs and compare to expectation AND previous
            # framework. c=critic, a=actor, t=td-error.
            if fw == "tf":
                c, a, t, tf_c_grads, tf_a_grads = p_sess.run(
                    [
                        policy.critic_loss,
                        policy.actor_loss,
                        policy.td_error,
                        policy._critic_optimizer.compute_gradients(
                            policy.critic_loss, policy.model.q_variables()),
                        policy._actor_optimizer.compute_gradients(
                            policy.actor_loss,
                            policy.model.policy_variables()),
                    ],
                    feed_dict=policy._get_loss_inputs_dict(input_,
                                                           shuffle=False),
                )
                # Check pure loss values.
                check(c, expect_c)
                check(a, expect_a)
                check(t, expect_t)

                tf_c_grads = [g for g, v in tf_c_grads]
                tf_a_grads = [g for g, v in tf_a_grads]

            elif fw == "torch":
                loss_torch(policy, policy.model, None, input_)
                c, a, t = (
                    policy.get_tower_stats("critic_loss")[0],
                    policy.get_tower_stats("actor_loss")[0],
                    policy.get_tower_stats("td_error")[0],
                )
                # Check pure loss values.
                check(c, expect_c)
                check(a, expect_a)
                check(t, expect_t)

                # Test actor gradients.
                policy._actor_optimizer.zero_grad()
                assert all(v.grad is None for v in policy.model.q_variables())
                assert all(v.grad is None
                           for v in policy.model.policy_variables())
                a.backward()
                # `actor_loss` depends on Q-net vars
                # (but not twin-Q-net vars!).
                assert not any(v.grad is None
                               for v in policy.model.q_variables()[:4])
                assert all(v.grad is None
                           for v in policy.model.q_variables()[4:])
                assert not all(
                    torch.mean(v.grad) == 0
                    for v in policy.model.policy_variables())
                assert not all(
                    torch.min(v.grad) == 0
                    for v in policy.model.policy_variables())
                # Compare with tf ones.
                torch_a_grads = [
                    v.grad for v in policy.model.policy_variables()
                ]
                for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g.cpu()))
                    else:
                        check(tf_g, torch_g)

                # Test critic gradients.
                policy._critic_optimizer.zero_grad()
                assert all(v.grad is None or torch.mean(v.grad) == 0.0
                           for v in policy.model.q_variables())
                assert all(v.grad is None or torch.min(v.grad) == 0.0
                           for v in policy.model.q_variables())
                c.backward()
                assert not all(
                    torch.mean(v.grad) == 0
                    for v in policy.model.q_variables())
                assert not all(
                    torch.min(v.grad) == 0 for v in policy.model.q_variables())
                # Compare with tf ones.
                torch_c_grads = [v.grad for v in policy.model.q_variables()]
                for tf_g, torch_g in zip(tf_c_grads, torch_c_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g.cpu()))
                    else:
                        check(tf_g, torch_g)
                # Compare (unchanged(!) actor grads) with tf ones.
                torch_a_grads = [
                    v.grad for v in policy.model.policy_variables()
                ]
                for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                    if tf_g.shape != torch_g.shape:
                        check(tf_g, np.transpose(torch_g.cpu()))
                    else:
                        check(tf_g, torch_g)

            # Store this framework's losses in prev_fw_loss to compare with
            # next framework's outputs.
            if prev_fw_loss is not None:
                check(c, prev_fw_loss[0])
                check(a, prev_fw_loss[1])
                check(t, prev_fw_loss[2])

            prev_fw_loss = (c, a, t)

            # Update weights from our batch (n times).
            for update_iteration in range(6):
                print("train iteration {}".format(update_iteration))
                if fw == "tf":
                    in_ = self._get_batch_helper(obs_size, actions, batch_size)
                    tf_inputs.append(in_)
                    # Set a fake-batch to use
                    # (instead of sampling from replay buffer).
                    buf = MultiAgentReplayBuffer.get_instance_for_testing()
                    buf._fake_batch = in_
                    trainer.train()
                    updated_weights = policy.get_weights()
                    # Net must have changed.
                    if tf_updated_weights:
                        check(
                            updated_weights[
                                "default_policy/actor_hidden_0/kernel"],
                            tf_updated_weights[-1]
                            ["default_policy/actor_hidden_0/kernel"],
                            false=True,
                        )
                    tf_updated_weights.append(updated_weights)

                # Compare with updated tf-weights. Must all be the same.
                else:
                    tf_weights = tf_updated_weights[update_iteration]
                    in_ = tf_inputs[update_iteration]
                    # Set a fake-batch to use
                    # (instead of sampling from replay buffer).
                    buf = MultiAgentReplayBuffer.get_instance_for_testing()
                    buf._fake_batch = in_
                    trainer.train()
                    # Compare updated model and target weights.
                    for tf_key in tf_weights.keys():
                        tf_var = tf_weights[tf_key]
                        # Target model.
                        if re.search(
                                "actor_out_1|actor_hidden_0_1|sequential_[23]",
                                tf_key):
                            torch_var = policy.target_model.state_dict()[
                                map_[tf_key]]
                        # Model.
                        else:
                            torch_var = policy.model.state_dict()[map_[tf_key]]
                        if tf_var.shape != torch_var.shape:
                            check(tf_var,
                                  np.transpose(torch_var.cpu()),
                                  atol=0.1)
                        else:
                            check(tf_var, torch_var, atol=0.1)

            trainer.stop()
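The test above relies on `map_` plus an elided helper `self._translate_weights_to_torch` to copy the tf weights into the torch model. A minimal sketch of what such a translation could look like is given below; it assumes dense layers only, so tf's `[in, out]` kernels are transposed into torch's `[out, in]` weight layout, and it is not the helper's actual implementation.

import numpy as np
import torch

def translate_weights_to_torch_sketch(tf_weights, name_map):
    """Build a torch state_dict from tf weights using a tf->torch name map."""
    torch_state = {}
    for tf_name, torch_name in name_map.items():
        w = np.asarray(tf_weights[tf_name])
        if tf_name.endswith("/kernel"):
            # tf dense kernels are [in, out]; torch Linear weights are [out, in].
            w = w.T
        torch_state[torch_name] = torch.from_numpy(w.copy())
    return torch_state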
Example #18
0
    def test_cql_compilation(self):
        """Test whether CQL can be built with all frameworks."""

        # Learns from a historic-data file.
        # To generate this data, first run:
        # $ ./train.py --run=SAC --env=Pendulum-v1 \
        #   --stop='{"timesteps_total": 50000}' \
        #   --config='{"output": "/tmp/out"}'
        rllib_dir = Path(__file__).parent.parent.parent.parent
        print("rllib dir={}".format(rllib_dir))
        data_file = os.path.join(rllib_dir, "tests/data/pendulum/small.json")
        print("data_file={} exists={}".format(data_file,
                                              os.path.isfile(data_file)))

        config = (
            cql.CQLConfig().environment(env="Pendulum-v1", ).offline_data(
                input_=[data_file],
                # In the files we use here for testing, actions have already
                # been normalized.
                # This is usually the case when the file was generated by another
                # RLlib algorithm (e.g. PPO or SAC).
                actions_in_input_normalized=False,
                # Switch on off-policy evaluation.
                off_policy_estimation_methods={
                    "is": {
                        "type": ImportanceSampling
                    }
                },
            ).training(
                clip_actions=False,
                train_batch_size=2000,
                twin_q=True,
                replay_buffer_config={
                    "learning_starts": 0
                },
                bc_iters=2,
            ).evaluation(
                always_attach_evaluation_results=True,
                evaluation_interval=2,
                evaluation_duration=10,
                evaluation_config={
                    "input": "sampler"
                },
                evaluation_parallel_to_training=False,
                evaluation_num_workers=2,
            ).rollouts(rollout_fragment_length=1))
        num_iterations = 4

        # Test for tf/torch frameworks.
        for fw in framework_iterator(config, with_eager_tracing=True):
            trainer = config.build()
            for i in range(num_iterations):
                results = trainer.train()
                check_train_results(results)
                print(results)
                eval_results = results["evaluation"]
                print(f"iter={trainer.iteration} "
                      f"R={eval_results['episode_reward_mean']}")

            check_compute_single_action(trainer)

            # Get policy and model.
            pol = trainer.get_policy()
            cql_model = pol.model
            if fw == "tf":
                pol.get_session().__enter__()

            # Example of how to evaluate the trained Trainer
            # using the data from CQL's global replay buffer.
            # Get a sample (MultiAgentBatch).
            multi_agent_batch = trainer.local_replay_buffer.sample(
                num_items=config.train_batch_size)
            # All experiences have been buffered for `default_policy`
            batch = multi_agent_batch.policy_batches["default_policy"]

            if fw == "torch":
                obs = torch.from_numpy(batch["obs"])
            else:
                obs = batch["obs"]
                batch["actions"] = batch["actions"].astype(np.float32)

            # Pass the observations through our model to get the
            # features, which we then pass through the Q-head.
            model_out, _ = cql_model({"obs": obs})
            # The estimated Q-values from the (historic) actions in the batch.
            if fw == "torch":
                q_values_old = cql_model.get_q_values(
                    model_out, torch.from_numpy(batch["actions"]))
            else:
                q_values_old = cql_model.get_q_values(
                    tf.convert_to_tensor(model_out), batch["actions"])

            # The estimated Q-values for the new actions computed
            # by our trainer policy.
            actions_new = pol.compute_actions_from_input_dict({"obs": obs})[0]
            if fw == "torch":
                q_values_new = cql_model.get_q_values(
                    model_out, torch.from_numpy(actions_new))
            else:
                q_values_new = cql_model.get_q_values(model_out, actions_new)

            if fw == "tf":
                q_values_old, q_values_new = pol.get_session().run(
                    [q_values_old, q_values_new])

            print(f"Q-val batch={q_values_old}")
            print(f"Q-val policy={q_values_new}")

            if fw == "tf":
                pol.get_session().__exit__(None, None, None)

            trainer.stop()
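As a small follow-up to the Q-value printout above: once `q_values_old` and `q_values_new` are plain NumPy arrays, one simple (and purely illustrative) way to summarize whether the trained policy's actions look better than the logged ones is:

import numpy as np

def policy_vs_batch_q_summary(q_values_old, q_values_new):
    old = np.asarray(q_values_old).reshape(-1)
    new = np.asarray(q_values_new).reshape(-1)
    return {
        "mean_q_batch_actions": float(old.mean()),
        "mean_q_policy_actions": float(new.mean()),
        "frac_policy_action_better": float(np.mean(new > old)),
    }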
Example #19
0
File: test_td3.py Project: wuisawesome/ray
    def test_td3_exploration_and_with_random_prerun(self):
        """Tests TD3's Exploration (w/ random actions for n timesteps)."""
        config = td3.TD3_DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        obs = np.array([0.0, 0.1, -0.1])

        # Test against all frameworks.
        for _ in framework_iterator(config, with_eager_tracing=True):
            lcl_config = config.copy()
            # Default GaussianNoise setup.
            trainer = td3.TD3Trainer(config=lcl_config, env="Pendulum-v1")
            # Setting explore=False should always return the same action.
            a_ = trainer.compute_single_action(obs, explore=False)
            self.assertEqual(trainer.get_policy().global_timestep, 1)
            for i in range(50):
                a = trainer.compute_single_action(obs, explore=False)
                self.assertEqual(trainer.get_policy().global_timestep, i + 2)
                check(a, a_)
            # explore=None (default: explore) should return different actions.
            actions = []
            for i in range(50):
                actions.append(trainer.compute_single_action(obs))
                self.assertEqual(trainer.get_policy().global_timestep, i + 52)
            check(np.std(actions), 0.0, false=True)
            trainer.stop()

            # Check randomness at beginning.
            lcl_config["exploration_config"] = {
                # Act randomly at beginning ...
                "random_timesteps": 30,
                # Then act very closely to deterministic actions thereafter.
                "stddev": 0.001,
                "initial_scale": 0.001,
                "final_scale": 0.001,
            }
            trainer = td3.TD3Trainer(config=lcl_config, env="Pendulum-v1")
            # ts=0 (get a deterministic action as per explore=False).
            deterministic_action = trainer.compute_single_action(obs,
                                                                 explore=False)
            self.assertEqual(trainer.get_policy().global_timestep, 1)
            # ts=1-29 (in random window).
            random_a = []
            for i in range(1, 30):
                random_a.append(
                    trainer.compute_single_action(obs, explore=True))
                self.assertEqual(trainer.get_policy().global_timestep, i + 1)
                check(random_a[-1], deterministic_action, false=True)
            self.assertTrue(np.std(random_a) > 0.3)

            # ts > 30 (a=deterministic_action + scale * N[0,1])
            for i in range(50):
                a = trainer.compute_single_action(obs, explore=True)
                self.assertEqual(trainer.get_policy().global_timestep, i + 31)
                check(a, deterministic_action, rtol=0.1)

            # ts >> 30 (BUT: explore=False -> expect deterministic action).
            for i in range(50):
                a = trainer.compute_single_action(obs, explore=False)
                self.assertEqual(trainer.get_policy().global_timestep, i + 81)
                check(a, deterministic_action)
            trainer.stop()
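The behavior checked above (pure random actions for the first `random_timesteps` steps, then near-deterministic actions plus scaled Gaussian noise) can be summarized by the toy sketch below; it is a simplified stand-in, not RLlib's GaussianNoise implementation.

import numpy as np

def noisy_action(deterministic_action, t, action_low, action_high,
                 random_timesteps=30, stddev=0.001, scale=0.001, rng=None):
    if rng is None:
        rng = np.random.default_rng()
    if t < random_timesteps:
        # Pure random phase at the beginning of training.
        return rng.uniform(action_low, action_high,
                           size=np.shape(deterministic_action))
    # Afterwards: deterministic action plus (scaled) Gaussian noise.
    noise = scale * rng.normal(0.0, stddev, size=np.shape(deterministic_action))
    return np.clip(deterministic_action + noise, action_low, action_high)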
Example #20
0
File: test_trainer.py Project: ijrsvt/ray
    def test_add_delete_policy(self):
        config = pg.DEFAULT_CONFIG.copy()
        config.update(
            {
                "env": MultiAgentCartPole,
                "env_config": {
                    "config": {
                        "num_agents": 4,
                    },
                },
                "num_workers": 2,  # Test on remote workers as well.
                "model": {
                    "fcnet_hiddens": [5],
                    "fcnet_activation": "linear",
                },
                "train_batch_size": 100,
                "rollout_fragment_length": 50,
                "multiagent": {
                    # Start with a single policy.
                    "policies": {"p0"},
                    "policy_mapping_fn": lambda aid, eps, worker, **kwargs: "p0",
                    # Allow only two policies to be held in memory at a
                    # time.
                    "policy_map_capacity": 2,
                },
            }
        )

        for _ in framework_iterator(config):
            trainer = pg.PGTrainer(config=config)
            pol0 = trainer.get_policy("p0")
            r = trainer.train()
            self.assertTrue("p0" in r["info"][LEARNER_INFO])
            for i in range(1, 3):

                def new_mapping_fn(agent_id, episode, worker, **kwargs):
                    return f"p{choice([i, i - 1])}"

                # Add a new policy.
                pid = f"p{i}"
                new_pol = trainer.add_policy(
                    pid,
                    trainer.get_default_policy_class(config),
                    # Test changing the mapping fn.
                    policy_mapping_fn=new_mapping_fn,
                    # Change the list of policies to train.
                    policies_to_train=[f"p{i}", f"p{i-1}"],
                )
                pol_map = trainer.workers.local_worker().policy_map
                self.assertTrue(new_pol is not pol0)
                for j in range(i + 1):
                    self.assertTrue(f"p{j}" in pol_map)
                self.assertTrue(len(pol_map) == i + 1)
                trainer.train()
                checkpoint = trainer.save()

                # Test restoring from the checkpoint (which has more policies
                # than what's defined in the config dict).
                test = pg.PGTrainer(config=config)
                test.restore(checkpoint)
                pol0 = test.get_policy("p0")
                test.train()
                # Test creating an action with the added (and restored) policy.
                a = test.compute_single_action(
                    np.zeros_like(pol0.observation_space.sample()), policy_id=pid
                )
                self.assertTrue(pol0.action_space.contains(a))
                test.stop()

            # Delete all added policies again from trainer.
            for i in range(2, 0, -1):
                trainer.remove_policy(
                    f"p{i}",
                    # Note that the complete signature of a policy_mapping_fn
                    # is: `agent_id, episode, worker, **kwargs`.
                    policy_mapping_fn=lambda aid, eps, **kwargs: f"p{i - 1}",
                    policies_to_train=[f"p{i - 1}"],
                )

            trainer.stop()
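One detail worth noting in the example above: both `new_mapping_fn` and the removal lambda close over the loop variable `i`, so they see whatever value `i` holds when they are eventually called. A common way to pin the value at definition time (a general Python pattern, shown here only as a sketch, not something the test needs) is a small factory:

def make_mapping_fn(i):
    # `i` is bound at factory-call time, not at lookup time.
    def mapping_fn(agent_id, episode=None, worker=None, **kwargs):
        return f"p{i}"
    return mapping_fn

# Usage: policy_mapping_fn=make_mapping_fn(2) always maps to "p2".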
Example #21
0
def learn_test_plus_evaluate(algo, env="CartPole-v0"):
    for fw in framework_iterator(frameworks=("tf", "torch")):
        fw_ = ', \\"framework\\": \\"{}\\"'.format(fw)

        tmp_dir = os.popen("mktemp -d").read()[:-1]
        if not os.path.exists(tmp_dir):
            # Last resort: Resolve via the underlying tempdir (and cut the leading "/tmp").
            tmp_dir = ray._private.utils.tempfile.gettempdir() + tmp_dir[4:]
            if not os.path.exists(tmp_dir):
                sys.exit(1)

        print("Saving results to {}".format(tmp_dir))

        rllib_dir = str(Path(__file__).parent.parent.absolute())
        print("RLlib dir = {}\nexists={}".format(rllib_dir,
                                                 os.path.exists(rllib_dir)))
        os.system("python {}/train.py --local-dir={} --run={} "
                  "--checkpoint-freq=1 --checkpoint-at-end ".format(
                      rllib_dir, tmp_dir, algo) +
                  '--config="{\\"num_gpus\\": 0, \\"num_workers\\": 1, '
                  '\\"evaluation_config\\": {\\"explore\\": false}' + fw_ +
                  '}" ' + '--stop="{\\"episode_reward_mean\\": 100.0}"' +
                  " --env={}".format(env))

        # Find last checkpoint and use that for the rollout.
        checkpoint_path = os.popen(
            "ls {}/default/*/checkpoint_*/checkpoint-*".format(
                tmp_dir)).read()[:-1]
        checkpoints = [
            cp for cp in checkpoint_path.split("\n")
            if re.match(r"^.+checkpoint-\d+$", cp)
        ]
        # Sort by number and pick last (which should be the best checkpoint).
        last_checkpoint = sorted(
            checkpoints,
            key=lambda x: int(re.match(r".+checkpoint-(\d+)", x).group(1)))[-1]
        assert re.match(r"^.+checkpoint_\d+/checkpoint-\d+$", last_checkpoint)
        if not os.path.exists(last_checkpoint):
            sys.exit(1)
        print("Best checkpoint={} (exists)".format(last_checkpoint))

        # Test rolling out n steps.
        result = os.popen("python {}/evaluate.py --run={} "
                          "--steps=400 "
                          '--out="{}/rollouts_n_steps.pkl" "{}"'.format(
                              rllib_dir, algo, tmp_dir,
                              last_checkpoint)).read()[:-1]
        if not os.path.exists(tmp_dir + "/rollouts_n_steps.pkl"):
            sys.exit(1)
        print("Rollout output exists -> Checking reward ...")
        episodes = result.split("\n")
        mean_reward = 0.0
        num_episodes = 0
        for ep in episodes:
            mo = re.match(r"Episode .+reward: ([\d\.\-]+)", ep)
            if mo:
                mean_reward += float(mo.group(1))
                num_episodes += 1
        mean_reward /= num_episodes
        print("Rollout's mean episode reward={}".format(mean_reward))
        assert mean_reward >= 100.0

        # Cleanup.
        os.popen('rm -rf "{}"'.format(tmp_dir)).read()
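The reward parsing above divides by `num_episodes`, which would fail if `evaluate.py` printed no matching "Episode ... reward:" lines. A small factored-out sketch with that edge case guarded (a hypothetical helper, not part of the test; Python 3.8+ for the walrus operator) could look like this:

import re

def mean_episode_reward(evaluate_stdout):
    """Parse evaluate.py output and return the mean episode reward (or None)."""
    rewards = [
        float(m.group(1))
        for line in evaluate_stdout.split("\n")
        if (m := re.match(r"Episode .+reward: ([\d\.\-]+)", line))
    ]
    return sum(rewards) / len(rewards) if rewards else None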
Example #22
0
    def test_ppo_loss_function(self):
        """Tests the PPO loss function math."""
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"
        config["model"]["vf_share_layers"] = True

        for fw, sess in framework_iterator(config, session=True):
            trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()

            # Check no free log std var by default.
            if fw == "torch":
                matching = [
                    v for (n, v) in policy.model.named_parameters()
                    if "log_std" in n
                ]
            else:
                matching = [
                    v for v in policy.model.trainable_variables()
                    if "log_std" in str(v)
                ]
            assert len(matching) == 0, matching

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to train_batch dict.
            # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
            # [0.50005, -0.505, 0.5]
            train_batch = compute_gae_for_sample_batch(policy,
                                                       FAKE_BATCH.copy())
            if fw == "torch":
                train_batch = policy._lazy_tensor_dict(train_batch)

            # Check Advantage values.
            check(train_batch[Postprocessing.VALUE_TARGETS],
                  [0.50005, -0.505, 0.5])

            # Calculate actual PPO loss.
            if fw in ["tf2", "tfe"]:
                ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                      train_batch)
            elif fw == "torch":
                ppo_surrogate_loss_torch(policy, policy.model,
                                         TorchCategorical, train_batch)

            vars = policy.model.variables() if fw != "torch" else \
                list(policy.model.parameters())
            if fw == "tf":
                vars = policy.get_session().run(vars)
            expected_shared_out = fc(train_batch[SampleBatch.CUR_OBS],
                                     vars[0 if fw != "torch" else 2],
                                     vars[1 if fw != "torch" else 3],
                                     framework=fw)
            expected_logits = fc(expected_shared_out,
                                 vars[2 if fw != "torch" else 0],
                                 vars[3 if fw != "torch" else 1],
                                 framework=fw)
            expected_value_outs = fc(expected_shared_out,
                                     vars[4],
                                     vars[5],
                                     framework=fw)

            kl, entropy, pg_loss, vf_loss, overall_loss = \
                self._ppo_loss_helper(
                    policy, policy.model,
                    Categorical if fw != "torch" else TorchCategorical,
                    train_batch,
                    expected_logits, expected_value_outs,
                    sess=sess
                )
            if sess:
                policy_sess = policy.get_session()
                k, e, pl, v, tl = policy_sess.run(
                    [
                        policy._mean_kl_loss,
                        policy._mean_entropy,
                        policy._mean_policy_loss,
                        policy._mean_vf_loss,
                        policy._total_loss,
                    ],
                    feed_dict=policy._get_loss_inputs_dict(train_batch,
                                                           shuffle=False))
                check(k, kl)
                check(e, entropy)
                check(pl, np.mean(-pg_loss))
                check(v, np.mean(vf_loss), decimals=4)
                check(tl, overall_loss, decimals=4)
            elif fw == "torch":
                check(policy.model.tower_stats["mean_kl_loss"], kl)
                check(policy.model.tower_stats["mean_entropy"], entropy)
                check(policy.model.tower_stats["mean_policy_loss"],
                      np.mean(-pg_loss))
                check(policy.model.tower_stats["mean_vf_loss"],
                      np.mean(vf_loss),
                      decimals=4)
                check(policy.model.tower_stats["total_loss"],
                      overall_loss,
                      decimals=4)
            else:
                check(policy._mean_kl_loss, kl)
                check(policy._mean_entropy, entropy)
                check(policy._mean_policy_loss, np.mean(-pg_loss))
                check(policy._mean_vf_loss, np.mean(vf_loss), decimals=4)
                check(policy._total_loss, overall_loss, decimals=4)
            trainer.stop()
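The value targets quoted in the comment above can be re-derived with a few lines of NumPy. The sketch below assumes the FAKE_BATCH rewards are [1.0, -1.0, 0.5] and that the episode terminates after the batch, which is what the quoted arithmetic implies:

import numpy as np

def discounted_returns(rewards, gamma=0.99):
    """Plain discounted return-to-go, assuming the episode ends after the batch."""
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

assert np.allclose(discounted_returns([1.0, -1.0, 0.5]), [0.50005, -0.505, 0.5])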
Example #23
0
    def do_test_parameter_noise_exploration(self, trainer_cls, config, env,
                                            env_config, obs, fws):
        """Tests whether an Agent works with ParameterNoise."""
        core_config = config.copy()
        core_config["num_workers"] = 0  # Run locally.
        core_config["env_config"] = env_config

        for fw in framework_iterator(core_config, fws):

            config = core_config.copy()

            # Algo with ParameterNoise exploration (config["explore"]=True).
            # ----
            config["exploration_config"] = {"type": "ParameterNoise"}
            config["explore"] = True

            trainer = trainer_cls(config=config, env=env)
            policy = trainer.get_policy()
            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            noise_before = self._get_current_noise(policy, fw)
            check(noise_before, 0.0)
            initial_weights = self._get_current_weight(policy, fw)

            # Pseudo-start an episode and compare the weights before and after.
            policy.exploration.on_episode_start(policy, tf_sess=policy._sess)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            noise_after_ep_start = self._get_current_noise(policy, fw)
            weights_after_ep_start = self._get_current_weight(policy, fw)
            # Should be the same, as we don't do anything at the beginning of
            # the episode, only one step later.
            check(noise_after_ep_start, noise_before)
            check(initial_weights, weights_after_ep_start)

            # Setting explore=False should always return the same action.
            a_ = trainer.compute_action(obs, explore=False)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            noise = self._get_current_noise(policy, fw)
            # We sampled the first noise (not zero anymore).
            check(noise, 0.0, false=True)
            # But still not applied b/c explore=False.
            check(self._get_current_weight(policy, fw), initial_weights)
            for _ in range(10):
                a = trainer.compute_action(obs, explore=False)
                check(a, a_)
                # Noise never gets applied.
                check(self._get_current_weight(policy, fw), initial_weights)
                self.assertFalse(
                    policy.exploration.weights_are_currently_noisy)

            # Explore=None (default: True) should return different actions.
            # However, this is only due to the underlying epsilon-greedy
            # exploration.
            actions = []
            current_weight = None
            for _ in range(10):
                actions.append(trainer.compute_action(obs))
                self.assertTrue(policy.exploration.weights_are_currently_noisy)
                # Now, noise actually got applied (explore=True).
                current_weight = self._get_current_weight(policy, fw)
                check(current_weight, initial_weights, false=True)
                check(current_weight, initial_weights + noise)
            check(np.std(actions), 0.0, false=True)

            # Pseudo-end the episode and compare weights again.
            # Make sure they are the original ones.
            policy.exploration.on_episode_end(policy, tf_sess=policy._sess)
            weights_after_ep_end = self._get_current_weight(policy, fw)
            check(current_weight - noise, weights_after_ep_end, decimals=5)

            # Algo with ParameterNoise exploration (config["explore"]=False).
            # ----
            config = core_config.copy()
            config["exploration_config"] = {"type": "ParameterNoise"}
            config["explore"] = False
            trainer = trainer_cls(config=config, env=env)
            policy = trainer.get_policy()
            self.assertFalse(policy.exploration.weights_are_currently_noisy)
            initial_weights = self._get_current_weight(policy, fw)

            # Noise before anything (should be 0.0, no episode started yet).
            noise = self._get_current_noise(policy, fw)
            check(noise, 0.0)

            # Pseudo-start an episode and compare the weights before and after
            # (they should be the same).
            policy.exploration.on_episode_start(policy, tf_sess=policy._sess)
            self.assertFalse(policy.exploration.weights_are_currently_noisy)

            # Should be the same, as we don't do anything at the beginning of
            # the episode, only one step later.
            noise = self._get_current_noise(policy, fw)
            check(noise, 0.0)
            noisy_weights = self._get_current_weight(policy, fw)
            check(initial_weights, noisy_weights)

            # Setting explore=False or None should always return the same
            # action.
            a_ = trainer.compute_action(obs, explore=False)
            # Now we have re-sampled.
            noise = self._get_current_noise(policy, fw)
            check(noise, 0.0, false=True)
            for _ in range(5):
                a = trainer.compute_action(obs, explore=None)
                check(a, a_)
                a = trainer.compute_action(obs, explore=False)
                check(a, a_)

            # Pseudo-end the episode and compare weights again.
            # Make sure they are the original ones (no noise permanently
            # applied throughout the episode).
            policy.exploration.on_episode_end(policy, tf_sess=policy._sess)
            weights_after_episode_end = self._get_current_weight(policy, fw)
            check(initial_weights, weights_after_episode_end)
            # Noise should still be the same (re-sampling only happens at
            # beginning of episode).
            noise_after = self._get_current_noise(policy, fw)
            check(noise, noise_after)

            # Switch off underlying exploration entirely.
            # ----
            config = core_config.copy()
            if trainer_cls is dqn.DQNTrainer:
                sub_config = {
                    "type": "EpsilonGreedy",
                    "initial_epsilon": 0.0,  # <- no randomness whatsoever
                    "final_epsilon": 0.0,
                }
            else:
                sub_config = {
                    "type": "OrnsteinUhlenbeckNoise",
                    "initial_scale": 0.0,  # <- no randomness whatsoever
                    "final_scale": 0.0,
                    "random_timesteps": 0,
                }
            config["exploration_config"] = {
                "type": "ParameterNoise",
                "sub_exploration": sub_config,
            }
            config["explore"] = True
            trainer = trainer_cls(config=config, env=env)
            # Now, when we act - even with explore=True - we would expect
            # the same action for the same input (parameter noise is
            # deterministic).
            policy = trainer.get_policy()
            policy.exploration.on_episode_start(policy, tf_sess=policy._sess)
            a_ = trainer.compute_action(obs)
            for _ in range(10):
                a = trainer.compute_action(obs, explore=True)
                check(a, a_)
Example #24
0
    def test_simple_q_loss_function(self):
        """Tests the Simple-Q loss function results on all frameworks."""
        config = simple_q.SimpleQConfig().rollouts(num_rollout_workers=0)
        # Use very simple net (layer0=10 nodes, q-layer=2 nodes (2 actions)).
        config.training(model={
            "fcnet_hiddens": [10],
            "fcnet_activation": "linear",
        })

        for fw in framework_iterator(config):
            # Generate Trainer and get its default Policy object.
            trainer = simple_q.SimpleQ(config=config, env="CartPole-v0")
            policy = trainer.get_policy()
            # Batch of size=2.
            input_ = SampleBatch({
                SampleBatch.CUR_OBS:
                np.random.random(size=(2, 4)),
                SampleBatch.ACTIONS:
                np.array([0, 1]),
                SampleBatch.REWARDS:
                np.array([0.4, -1.23]),
                SampleBatch.DONES:
                np.array([False, False]),
                SampleBatch.NEXT_OBS:
                np.random.random(size=(2, 4)),
                SampleBatch.EPS_ID:
                np.array([1234, 1234]),
                SampleBatch.AGENT_INDEX:
                np.array([0, 0]),
                SampleBatch.ACTION_LOGP:
                np.array([-0.1, -0.1]),
                SampleBatch.ACTION_DIST_INPUTS:
                np.array([[0.1, 0.2], [-0.1, -0.2]]),
                SampleBatch.ACTION_PROB:
                np.array([0.1, 0.2]),
                "q_values":
                np.array([[0.1, 0.2], [0.2, 0.1]]),
            })
            # Get model vars for computing expected model outs (q-vals).
            # 0=layer-kernel; 1=layer-bias; 2=q-val-kernel; 3=q-val-bias
            vars = policy.get_weights()
            if isinstance(vars, dict):
                vars = list(vars.values())

            vars_t = policy.target_model.variables()
            if fw == "tf":
                vars_t = policy.get_session().run(vars_t)

            # Q(s,a) outputs.
            q_t = np.sum(
                one_hot(input_[SampleBatch.ACTIONS], 2) * fc(
                    fc(
                        input_[SampleBatch.CUR_OBS],
                        vars[0 if fw != "torch" else 2],
                        vars[1 if fw != "torch" else 3],
                        framework=fw,
                    ),
                    vars[2 if fw != "torch" else 0],
                    vars[3 if fw != "torch" else 1],
                    framework=fw,
                ),
                1,
            )
            # max[a'](Qtarget(s',a')) outputs.
            q_target_tp1 = np.max(
                fc(
                    fc(
                        input_[SampleBatch.NEXT_OBS],
                        vars_t[0 if fw != "torch" else 2],
                        vars_t[1 if fw != "torch" else 3],
                        framework=fw,
                    ),
                    vars_t[2 if fw != "torch" else 0],
                    vars_t[3 if fw != "torch" else 1],
                    framework=fw,
                ),
                1,
            )
            # TD-errors (Bellman equation).
            td_error = q_t - (
                input_[SampleBatch.REWARDS] + config.gamma * q_target_tp1
            )
            # Huber/Square loss on TD-error.
            expected_loss = huber_loss(td_error).mean()

            if fw == "torch":
                input_ = policy._lazy_tensor_dict(input_)
            # Get actual out and compare.
            if fw == "tf":
                out = policy.get_session().run(
                    policy._loss,
                    feed_dict=policy._get_loss_inputs_dict(input_,
                                                           shuffle=False),
                )
            else:
                out = (loss_torch if fw == "torch" else loss_tf)(policy,
                                                                 policy.model,
                                                                 None, input_)
            check(out, expected_loss, decimals=1)
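For reference, the expected-loss computation above boils down to a Bellman TD-error against the target net's max-Q followed by a Huber loss. The compact NumPy sketch below restates it; the `delta=1.0` threshold and the `(1 - done)` masking are assumptions matching the usual defaults (the test's batch has all dones=False anyway):

import numpy as np

def simple_q_expected_loss(q_t, q_target_tp1, rewards, dones,
                           gamma=0.99, delta=1.0):
    """Mean Huber loss over Bellman TD-errors."""
    target = rewards + gamma * (1.0 - dones.astype(np.float32)) * q_target_tp1
    td_error = q_t - target
    huber = np.where(
        np.abs(td_error) < delta,
        0.5 * np.square(td_error),
        delta * (np.abs(td_error) - 0.5 * delta),
    )
    return huber.mean()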
Example #25
0
    def test_traj_view_simple_performance(self):
        """Test whether PPOTrainer runs faster w/ `_use_trajectory_view_api`.
        """
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        action_space = Discrete(2)
        obs_space = Box(-1.0, 1.0, shape=(700, ))

        from ray.rllib.examples.env.random_env import RandomMultiAgentEnv

        from ray.tune import register_env
        register_env(
            "ma_env",
            lambda c: RandomMultiAgentEnv({
                "num_agents": 2,
                "p_done": 0.0,
                "max_episode_len": 104,
                "action_space": action_space,
                "observation_space": obs_space
            }))

        config["num_workers"] = 3
        config["num_envs_per_worker"] = 8
        config["num_sgd_iter"] = 1  # Put less weight on training.

        policies = {
            "pol0": (None, obs_space, action_space, {}),
        }

        def policy_fn(agent_id):
            return "pol0"

        config["multiagent"] = {
            "policies": policies,
            "policy_mapping_fn": policy_fn,
        }
        num_iterations = 2
        # Only works in torch so far.
        for _ in framework_iterator(config, frameworks="torch"):
            print("w/ traj. view API")
            config["_use_trajectory_view_api"] = True
            trainer = ppo.PPOTrainer(config=config, env="ma_env")
            learn_time_w = 0.0
            sampler_perf_w = {}
            start = time.time()
            for i in range(num_iterations):
                out = trainer.train()
                ts = out["timesteps_total"]
                sampler_perf_ = out["sampler_perf"]
                sampler_perf_w = {
                    k:
                    sampler_perf_w.get(k, 0.0) + (sampler_perf_[k] * 1000 / ts)
                    for k, v in sampler_perf_.items()
                }
                delta = out["timers"]["learn_time_ms"] / ts
                learn_time_w += delta
                print("{}={}s".format(i, delta))
            sampler_perf_w = {
                k: sampler_perf_w[k] / (num_iterations if "mean_" in k else 1)
                for k, v in sampler_perf_w.items()
            }
            duration_w = time.time() - start
            print("Duration: {}s "
                  "sampler-perf.={} learn-time/iter={}s".format(
                      duration_w, sampler_perf_w,
                      learn_time_w / num_iterations))
            trainer.stop()

            print("w/o traj. view API")
            config["_use_trajectory_view_api"] = False
            trainer = ppo.PPOTrainer(config=config, env="ma_env")
            learn_time_wo = 0.0
            sampler_perf_wo = {}
            start = time.time()
            for i in range(num_iterations):
                out = trainer.train()
                ts = out["timesteps_total"]
                sampler_perf_ = out["sampler_perf"]
                sampler_perf_wo = {
                    k: sampler_perf_wo.get(k, 0.0) +
                    (sampler_perf_[k] * 1000 / ts)
                    for k, v in sampler_perf_.items()
                }
                delta = out["timers"]["learn_time_ms"] / ts
                learn_time_wo += delta
                print("{}={}s".format(i, delta))
            sampler_perf_wo = {
                k: sampler_perf_wo[k] / (num_iterations if "mean_" in k else 1)
                for k, v in sampler_perf_wo.items()
            }
            duration_wo = time.time() - start
            print("Duration: {}s "
                  "sampler-perf.={} learn-time/iter={}s".format(
                      duration_wo, sampler_perf_wo,
                      learn_time_wo / num_iterations))
            trainer.stop()

            # Assert `_use_trajectory_view_api` is faster.
            self.assertLess(sampler_perf_w["mean_raw_obs_processing_ms"],
                            sampler_perf_wo["mean_raw_obs_processing_ms"])
            self.assertLess(sampler_perf_w["mean_action_processing_ms"],
                            sampler_perf_wo["mean_action_processing_ms"])
            self.assertLess(duration_w, duration_wo)
Example #26
0
    def test_simple_q_loss_function(self):
        """Tests the Simple-Q loss function results on all frameworks."""
        config = dqn.SIMPLE_Q_DEFAULT_CONFIG.copy()
        # Run locally.
        config["num_workers"] = 0
        # Use very simple net (layer0=10 nodes, q-layer=2 nodes (2 actions)).
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        for fw in framework_iterator(config):
            # Generate Trainer and get its default Policy object.
            trainer = dqn.SimpleQTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()
            # Batch of size=2.
            input_ = {
                SampleBatch.CUR_OBS: np.random.random(size=(2, 4)),
                SampleBatch.ACTIONS: np.array([0, 1]),
                SampleBatch.REWARDS: np.array([0.4, -1.23]),
                SampleBatch.DONES: np.array([False, False]),
                SampleBatch.NEXT_OBS: np.random.random(size=(2, 4))
            }
            # Get model vars for computing expected model outs (q-vals).
            # 0=layer-kernel; 1=layer-bias; 2=q-val-kernel; 3=q-val-bias
            vars = policy.get_weights()
            if isinstance(vars, dict):
                vars = list(vars.values())
            vars_t = policy.target_q_func_vars
            if fw == "tf":
                vars_t = policy.get_session().run(vars_t)

            # Q(s,a) outputs.
            q_t = np.sum(
                one_hot(input_[SampleBatch.ACTIONS], 2) *
                fc(fc(input_[SampleBatch.CUR_OBS],
                      vars[0 if fw != "torch" else 2],
                      vars[1 if fw != "torch" else 3],
                      framework=fw),
                   vars[2 if fw != "torch" else 0],
                   vars[3 if fw != "torch" else 1],
                   framework=fw), 1)
            # max[a'](Qtarget(s',a')) outputs.
            q_target_tp1 = np.max(
                fc(fc(input_[SampleBatch.NEXT_OBS],
                      vars_t[0 if fw != "torch" else 2],
                      vars_t[1 if fw != "torch" else 3],
                      framework=fw),
                   vars_t[2 if fw != "torch" else 0],
                   vars_t[3 if fw != "torch" else 1],
                   framework=fw), 1)
            # TD-errors (Bellman equation).
            td_error = q_t - (input_[SampleBatch.REWARDS] +
                              config["gamma"] * q_target_tp1)
            # Huber/Square loss on TD-error.
            expected_loss = huber_loss(td_error).mean()

            if fw == "torch":
                input_ = policy._lazy_tensor_dict(input_)
            # Get actual out and compare.
            if fw == "tf":
                out = policy.get_session().run(
                    policy._loss,
                    feed_dict=policy._get_loss_inputs_dict(input_,
                                                           shuffle=False))
            else:
                out = (loss_torch if fw == "torch" else loss_tf)(policy,
                                                                 policy.model,
                                                                 None, input_)
            check(out, expected_loss, decimals=1)
Example #27
0
    def test_agent_output_ok(self):
        for fw in framework_iterator(frameworks=("torch", "tf")):
            self.write_outputs(self.test_dir, fw)
            self.assertEqual(len(os.listdir(self.test_dir + fw)), 1)
            reader = JsonReader(self.test_dir + fw + "/*.json")
            reader.next()
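Example #27 only asserts that write_outputs() produced a single JSON file per framework and that JsonReader can consume it. For orientation, below is a hedged, standalone sketch of the same offline-I/O round trip using RLlib's JsonWriter and JsonReader directly; the output directory is an arbitrary assumption.

# Hedged sketch of the JSON offline-I/O round trip exercised above.
# The output directory is an arbitrary choice for illustration.
import os

import numpy as np
from ray.rllib.offline import JsonReader, JsonWriter
from ray.rllib.policy.sample_batch import SampleBatch

out_dir = "/tmp/rllib-json-demo"  # hypothetical path
os.makedirs(out_dir, exist_ok=True)

# Write one small SampleBatch to disk ...
writer = JsonWriter(out_dir)
writer.write(SampleBatch({
    "obs": np.random.random((2, 4)),
    "actions": np.array([0, 1]),
    "rewards": np.array([1.0, 0.5]),
    "dones": np.array([False, True]),
}))

# ... then read it back via a glob pattern, as the test does.
reader = JsonReader(out_dir + "/*.json")
print(reader.next())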
Example #28
0
def do_test_log_likelihood(run,
                           config,
                           prev_a=None,
                           continuous=False,
                           layer_key=("fc", (0, 4), ("_hidden_layers.0.",
                                                     "_logits.")),
                           logp_func=None):
    config = config.copy()
    # Run locally.
    config["num_workers"] = 0
    # Env setup.
    if continuous:
        env = "Pendulum-v0"
        obs_batch = preprocessed_obs_batch = np.array([[0.0, 0.1, -0.1]])
    else:
        env = "FrozenLake-v0"
        config["env_config"] = {"is_slippery": False, "map_name": "4x4"}
        obs_batch = np.array([0])
        preprocessed_obs_batch = one_hot(obs_batch, depth=16)

    prev_r = None if prev_a is None else np.array(0.0)

    # Test against all frameworks.
    for fw in framework_iterator(config):
        trainer = run(config=config, env=env)

        policy = trainer.get_policy()
        vars = policy.get_weights()
        # Sample n actions, then roughly check their logp against their
        # counts.
        num_actions = 1000 if not continuous else 50
        actions = []
        for _ in range(num_actions):
            # Single action from single obs.
            actions.append(
                trainer.compute_action(obs_batch[0],
                                       prev_action=prev_a,
                                       prev_reward=prev_r,
                                       explore=True))

        # Test all taken actions for their log-likelihoods vs expected values.
        if continuous:
            for idx in range(num_actions):
                a = actions[idx]
                if fw != "torch":
                    if isinstance(vars, list):
                        expected_mean_logstd = fc(
                            fc(obs_batch, vars[layer_key[1][0]]),
                            vars[layer_key[1][1]])
                    else:
                        expected_mean_logstd = fc(
                            fc(
                                obs_batch,
                                vars["default_policy/{}_1/kernel".format(
                                    layer_key[0])]),
                            vars["default_policy/{}_out/kernel".format(
                                layer_key[0])])
                else:
                    expected_mean_logstd = fc(
                        fc(obs_batch,
                           vars["{}_model.0.weight".format(layer_key[2][0])],
                           framework=fw),
                        vars["{}_model.0.weight".format(layer_key[2][1])],
                        framework=fw)
                mean, log_std = np.split(expected_mean_logstd, 2, axis=-1)
                if logp_func is None:
                    expected_logp = np.log(norm.pdf(a, mean, np.exp(log_std)))
                else:
                    expected_logp = logp_func(mean, log_std, a)
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]) if prev_a else None,
                    prev_reward_batch=np.array([prev_r]) if prev_r else None)
                check(logp, expected_logp[0], rtol=0.2)
        # Test all available actions for their logp values.
        else:
            for a in [0, 1, 2, 3]:
                count = actions.count(a)
                expected_prob = count / num_actions
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]) if prev_a else None,
                    prev_reward_batch=np.array([prev_r]) if prev_r else None)
                check(np.exp(logp), expected_prob, atol=0.2)
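The discrete branch of do_test_log_likelihood compares each action's empirical sampling frequency against exp(logp). The standalone sketch below (made-up probabilities, no RLlib involved) shows why that comparison works.

# Standalone illustration of the discrete log-likelihood check above:
# the empirical frequency of an action should approximate exp(logp).
# The probabilities are made up for illustration.
import numpy as np

rng = np.random.default_rng(0)
probs = np.array([0.1, 0.2, 0.3, 0.4])    # hypothetical policy over 4 actions
samples = rng.choice(4, size=1000, p=probs)

for a in range(4):
    empirical_prob = np.mean(samples == a)
    logp = np.log(probs[a])
    # Same tolerance as check(np.exp(logp), expected_prob, atol=0.2).
    assert abs(np.exp(logp) - empirical_prob) < 0.2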
Example #29
0
def learn_test_multi_agent_plus_evaluate(algo):
    for fw in framework_iterator(frameworks=("tf", "torch")):
        tmp_dir = os.popen("mktemp -d").read()[:-1]
        if not os.path.exists(tmp_dir):
            # Last resort: Resolve via the underlying temp dir (and cut off
            # the leading "/tmp").
            tmp_dir = ray._private.utils.tempfile.gettempdir() + tmp_dir[4:]
            if not os.path.exists(tmp_dir):
                sys.exit(1)

        print("Saving results to {}".format(tmp_dir))

        rllib_dir = str(Path(__file__).parent.parent.absolute())
        print("RLlib dir = {}\nexists={}".format(rllib_dir,
                                                 os.path.exists(rllib_dir)))

        def policy_fn(agent_id, episode, **kwargs):
            return "pol{}".format(agent_id)

        config = {
            "num_gpus": 0,
            "num_workers": 1,
            "evaluation_config": {
                "explore": False
            },
            "framework": fw,
            "env": MultiAgentCartPole,
            "multiagent": {
                "policies": {"pol0", "pol1"},
                "policy_mapping_fn": policy_fn,
            },
        }
        stop = {"episode_reward_mean": 100.0}
        tune.run(algo,
                 config=config,
                 stop=stop,
                 checkpoint_freq=1,
                 checkpoint_at_end=True,
                 local_dir=tmp_dir,
                 verbose=1)

        # Find last checkpoint and use that for the rollout.
        checkpoint_path = os.popen("ls {}/{}/*/checkpoint_*/"
                                   "checkpoint-*".format(
                                       tmp_dir, algo)).read()[:-1]
        checkpoint_paths = checkpoint_path.split("\n")
        assert len(checkpoint_paths) > 0
        checkpoints = [
            cp for cp in checkpoint_paths
            if re.match(r"^.+checkpoint-\d+$", cp)
        ]
        # Sort by checkpoint number and pick the last (most recent) one.
        last_checkpoint = sorted(
            checkpoints,
            key=lambda x: int(re.match(r".+checkpoint-(\d+)", x).group(1)))[-1]
        assert re.match(r"^.+checkpoint_\d+/checkpoint-\d+$", last_checkpoint)
        if not os.path.exists(last_checkpoint):
            sys.exit(1)
        print("Best checkpoint={} (exists)".format(last_checkpoint))

        ray.shutdown()

        # Test rolling out n steps.
        result = os.popen(
            "python {}/evaluate.py --run={} "
            "--steps=400 "
            "--out=\"{}/rollouts_n_steps.pkl\" --no-render \"{}\"".format(
                rllib_dir, algo, tmp_dir, last_checkpoint)).read()[:-1]
        if not os.path.exists(tmp_dir + "/rollouts_n_steps.pkl"):
            sys.exit(1)
        print("Rollout output exists -> Checking reward ...")
        episodes = result.split("\n")
        mean_reward = 0.0
        num_episodes = 0
        for ep in episodes:
            mo = re.match(r"Episode .+reward: ([\d\.\-]+)", ep)
            if mo:
                mean_reward += float(mo.group(1))
                num_episodes += 1
        mean_reward /= num_episodes
        print("Rollout's mean episode reward={}".format(mean_reward))
        assert mean_reward >= 100.0

        # Cleanup.
        os.popen("rm -rf \"{}\"".format(tmp_dir)).read()
Example #30
0
    def test_dqn_exploration_and_soft_q_config(self):
        """Tests, whether a DQN Agent outputs exploration/softmaxed actions."""
        config = dqn.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["env_config"] = {"is_slippery": False, "map_name": "4x4"}
        obs = np.array(0)

        # Test against all frameworks.
        for _ in framework_iterator(config):
            # Default EpsilonGreedy setup.
            trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
            # Setting explore=False should always return the same action.
            a_ = trainer.compute_single_action(obs, explore=False)
            for _ in range(50):
                a = trainer.compute_single_action(obs, explore=False)
                check(a, a_)
            # explore=None (default: explore) should return different actions.
            actions = []
            for _ in range(50):
                actions.append(trainer.compute_single_action(obs))
            check(np.std(actions), 0.0, false=True)
            trainer.stop()

            # Low softmax temperature. Behaves like argmax
            # (but no epsilon exploration).
            config["exploration_config"] = {
                "type": "SoftQ",
                "temperature": 0.000001
            }
            trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
            # Due to the low temp, always expect the same action.
            actions = [trainer.compute_single_action(obs)]
            for _ in range(50):
                actions.append(trainer.compute_single_action(obs))
            check(np.std(actions), 0.0, decimals=3)
            trainer.stop()

            # Higher softmax temperature.
            config["exploration_config"]["temperature"] = 1.0
            trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")

            # Even with the higher temperature, if we set explore=False, we
            # should expect the same actions always.
            a_ = trainer.compute_single_action(obs, explore=False)
            for _ in range(50):
                a = trainer.compute_single_action(obs, explore=False)
                check(a, a_)

            # Due to the higher temp, expect different actions, avg'ing
            # around 1.5 (i.e. non-zero stddev across samples).
            actions = []
            for _ in range(300):
                actions.append(trainer.compute_single_action(obs))
            check(np.std(actions), 0.0, false=True)
            trainer.stop()

            # With Random exploration.
            config["exploration_config"] = {"type": "Random"}
            config["explore"] = True
            trainer = dqn.DQNTrainer(config=config, env="FrozenLake-v0")
            actions = []
            for _ in range(300):
                actions.append(trainer.compute_single_action(obs))
            check(np.std(actions), 0.0, false=True)
            trainer.stop()
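The SoftQ checks in Example #30 rely on how temperature shapes the softmax over Q-values: a near-zero temperature collapses to argmax (stddev of sampled actions ~0), while temperature 1.0 spreads probability across actions. A minimal sketch with made-up Q-values:

# Minimal sketch of softmax-with-temperature action probabilities,
# using made-up Q-values (not taken from the trainer above).
import numpy as np


def softq_probs(q_values, temperature):
    z = q_values / temperature
    z = z - z.max()          # subtract max for numerical stability
    e = np.exp(z)
    return e / e.sum()


q = np.array([0.1, 0.5, 0.2, 0.4])     # hypothetical Q-values, 4 actions

print(softq_probs(q, 1e-6))            # ~[0, 1, 0, 0]: behaves like argmax
print(softq_probs(q, 1.0))             # spread out: sampled actions vary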