Exemplo n.º 1
0
    def test_ppo_loss_function(self):
        """Checks PPO's loss terms against a manually computed reference.

        For every framework yielded by `framework_iterator`, a PPO trainer
        with one linear hidden layer of 10 units (value branch sharing that
        layer) is built. The policy's KL, entropy, policy-loss, value-loss
        and total-loss values are then compared with what
        `self._ppo_loss_helper` returns for logits/value outputs recomputed
        from the raw model weights.
        """
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        # Run locally.
        config["num_workers"] = 0
        config["gamma"] = 0.99
        model_config = config["model"]
        model_config["fcnet_hiddens"] = [10]
        model_config["fcnet_activation"] = "linear"
        model_config["vf_share_layers"] = True

        for fw, sess in framework_iterator(config, session=True):
            is_torch = fw == "torch"
            trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()

            # The default model must not carry a free log-std variable.
            if is_torch:
                log_std_vars = [
                    param
                    for name, param in policy.model.named_parameters()
                    if "log_std" in name
                ]
            else:
                log_std_vars = [
                    var for var in policy.model.trainable_variables()
                    if "log_std" in str(var)
                ]
            assert len(log_std_vars) == 0, log_std_vars

            # Post-process a fresh copy of the fake batch (simple, non-GAE
            # advantages get attached to it).
            # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
            # [0.50005, -0.505, 0.5]
            train_batch = compute_gae_for_sample_batch(policy,
                                                       FAKE_BATCH.copy())
            if is_torch:
                train_batch = policy._lazy_tensor_dict(train_batch)

            # The value targets must equal the discounted returns above.
            check(train_batch[Postprocessing.VALUE_TARGETS],
                  [0.50005, -0.505, 0.5])

            # Eager tf and torch: call the loss function directly.
            if fw in ["tf2", "tfe"]:
                ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                      train_batch)
            elif is_torch:
                ppo_surrogate_loss_torch(policy, policy.model,
                                         TorchCategorical, train_batch)

            # Pull the raw weights. Torch lists the logits layer (0, 1)
            # before the shared hidden layer (2, 3); tf the other way round.
            if is_torch:
                weights = list(policy.model.parameters())
                shared_idx, logits_idx = (2, 3), (0, 1)
            else:
                weights = policy.model.variables()
                shared_idx, logits_idx = (0, 1), (2, 3)
            if fw == "tf":
                weights = policy.get_session().run(weights)

            expected_shared_out = fc(train_batch[SampleBatch.CUR_OBS],
                                     weights[shared_idx[0]],
                                     weights[shared_idx[1]],
                                     framework=fw)
            expected_logits = fc(expected_shared_out,
                                 weights[logits_idx[0]],
                                 weights[logits_idx[1]],
                                 framework=fw)
            expected_value_outs = fc(expected_shared_out,
                                     weights[4],
                                     weights[5],
                                     framework=fw)

            dist_class = TorchCategorical if is_torch else Categorical
            kl, entropy, pg_loss, vf_loss, overall_loss = \
                self._ppo_loss_helper(
                    policy, policy.model, dist_class, train_batch,
                    expected_logits, expected_value_outs,
                    sess=sess
                )

            # Fetch the policy's own numbers: through the session when one
            # is given, otherwise straight from the policy attributes.
            if sess:
                actual = policy.get_session().run(
                    [
                        policy._mean_kl,
                        policy._mean_entropy,
                        policy._mean_policy_loss,
                        policy._mean_vf_loss,
                        policy._total_loss,
                    ],
                    feed_dict=policy._get_loss_inputs_dict(train_batch,
                                                           shuffle=False))
            else:
                actual = [
                    policy._mean_kl,
                    policy._mean_entropy,
                    policy._mean_policy_loss,
                    policy._mean_vf_loss,
                    policy._total_loss,
                ]
            mean_kl, mean_entropy, mean_pg_loss, mean_vf_loss, total = actual

            check(mean_kl, kl)
            check(mean_entropy, entropy)
            check(mean_pg_loss, np.mean(-pg_loss))
            check(mean_vf_loss, np.mean(vf_loss), decimals=4)
            check(total, overall_loss, decimals=4)
            trainer.stop()
Exemplo n.º 2
0
    def _sac_loss_helper(self, train_batch, weights, ks, log_alpha, fw, gamma,
                         sess):
        """Re-computes the SAC losses in NumPy for the tf and torch tests.

        `ks` is an ordered sequence of keys into `weights`. Positions used:
            2/3   = action (policy) hidden bias / kernel
            4/5   = action (policy) out bias / kernel
            6/7   = Q hidden bias / kernel
            8/9   = Q out bias / kernel
            14/15 = target Q hidden bias / kernel
            16/17 = target Q out bias / kernel
        (0 = log_alpha and 1 = target log-alpha are not read here.)

        If `sess` is truthy, the action distribution's sample and log-prob
        are evaluated via `sess.run` before being used as NumPy values.

        Returns:
            Tuple (critic_loss, actor_loss, alpha_loss, td_error), where
            critic_loss is a one-element list.
        """
        alpha = np.exp(log_alpha)
        dist_cls = TorchSquashedGaussian if fw == "torch" else SquashedGaussian
        obs_t = train_batch[SampleBatch.CUR_OBS]
        obs_tp1 = train_batch[SampleBatch.NEXT_OBS]

        def policy_output(obs):
            """Returns (deterministic action, logp with extra last axis)."""
            hidden = relu(
                fc(obs, weights[ks[3]], weights[ks[2]], framework=fw))
            dist = dist_cls(
                fc(hidden, weights[ks[5]], weights[ks[4]]), None)
            action = dist.deterministic_sample()
            logp = dist.logp(action)
            if sess:
                logp = sess.run(logp)
                action = sess.run(action)
            return action, np.expand_dims(logp, -1)

        def q_values(obs, actions, first):
            """Two-layer Q-net; ks[first] is the hidden bias, followed by
            hidden kernel, out bias and out kernel."""
            hidden = relu(
                fc(np.concatenate([obs, actions], -1),
                   weights[ks[first + 1]],
                   weights[ks[first]],
                   framework=fw))
            return fc(hidden,
                      weights[ks[first + 3]],
                      weights[ks[first + 2]],
                      framework=fw)

        # Policy outputs for t and t+1.
        policy_t, log_pis_t = policy_output(obs_t)
        policy_tp1, log_pis_tp1 = policy_output(obs_tp1)

        # Q-values for the actions actually stored in the batch.
        q_t = q_values(obs_t, train_batch[SampleBatch.ACTIONS], 6)
        # Q-values for the current policy's action in the current state.
        q_t_det_policy = q_values(obs_t, policy_t, 6)
        # Target Q-network on the next state / next policy action.
        q_tp1 = q_values(obs_tp1, policy_tp1, 14)

        q_t_selected = np.squeeze(q_t, axis=-1)
        # In-place on purpose: keeps q_tp1's dtype.
        q_tp1 -= alpha * log_pis_tp1
        q_tp1_best = np.squeeze(q_tp1, axis=-1)

        dones = train_batch[SampleBatch.DONES]
        rewards = train_batch[SampleBatch.REWARDS]
        if fw == "torch":
            dones = dones.float().numpy()
            rewards = rewards.numpy()

        q_t_selected_target = rewards + gamma * ((1.0 - dones) * q_tp1_best)
        td_error = np.abs(q_t_selected - q_t_selected_target)

        critic_loss = [
            0.5 * np.mean(np.power(q_t_selected_target - q_t_selected, 2.0))
        ]
        target_entropy = -np.prod((1, ))
        alpha_loss = -np.mean(log_alpha * (log_pis_t + target_entropy))
        actor_loss = np.mean(alpha * log_pis_t - q_t_det_policy)

        return critic_loss, actor_loss, alpha_loss, td_error
Exemplo n.º 3
0
    def test_simple_q_loss_function(self):
        """Tests the Simple-Q loss function results on all frameworks.

        For each framework, the expected loss is recomputed in NumPy from
        the policy's and the target model's raw weights on a random batch of
        two transitions, then compared with the loss the policy produces.
        """
        config = dqn.simple_q.SimpleQConfig().rollouts(num_rollout_workers=0)
        # Use very simple net (layer0=10 nodes, q-layer=2 nodes (2 actions)).
        config.training(model={
            "fcnet_hiddens": [10],
            "fcnet_activation": "linear",
        })

        for fw in framework_iterator(config):
            # Generate Trainer and get its default Policy object.
            trainer = dqn.SimpleQTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()
            # Batch of size=2.
            input_ = SampleBatch({
                SampleBatch.CUR_OBS:
                np.random.random(size=(2, 4)),
                SampleBatch.ACTIONS:
                np.array([0, 1]),
                SampleBatch.REWARDS:
                np.array([0.4, -1.23]),
                SampleBatch.DONES:
                np.array([False, False]),
                SampleBatch.NEXT_OBS:
                np.random.random(size=(2, 4)),
                SampleBatch.EPS_ID:
                np.array([1234, 1234]),
                SampleBatch.AGENT_INDEX:
                np.array([0, 0]),
                SampleBatch.ACTION_LOGP:
                np.array([-0.1, -0.1]),
                SampleBatch.ACTION_DIST_INPUTS:
                np.array([[0.1, 0.2], [-0.1, -0.2]]),
                SampleBatch.ACTION_PROB:
                np.array([0.1, 0.2]),
                "q_values":
                np.array([[0.1, 0.2], [0.2, 0.1]]),
            })
            # Get model vars for computing expected model outs (q-vals).
            weights = policy.get_weights()
            if isinstance(weights, dict):
                weights = list(weights.values())

            target_weights = policy.target_model.variables()
            if fw == "tf":
                target_weights = policy.get_session().run(target_weights)

            # Non-torch order: 0=layer-kernel; 1=layer-bias; 2=q-val-kernel;
            # 3=q-val-bias. For torch the two layers are listed in the
            # opposite order.
            if fw == "torch":
                hid_k, hid_b, out_k, out_b = 2, 3, 0, 1
            else:
                hid_k, hid_b, out_k, out_b = 0, 1, 2, 3

            def q_values(obs, params):
                """Runs `obs` through the two linear layers in `params`."""
                hidden = fc(obs, params[hid_k], params[hid_b], framework=fw)
                return fc(hidden, params[out_k], params[out_b], framework=fw)

            # Q(s,a) outputs.
            q_t = np.sum(
                one_hot(input_[SampleBatch.ACTIONS], 2) *
                q_values(input_[SampleBatch.CUR_OBS], weights),
                1,
            )
            # max[a'](Qtarget(s',a')) outputs.
            q_target_tp1 = np.max(
                q_values(input_[SampleBatch.NEXT_OBS], target_weights),
                1,
            )
            # TD-errors (Bellman equation): Q(s,a) - (r + gamma * max Q').
            # No (1 - done) mask is needed, as no transition in the batch is
            # terminal. (The previous `q_t - gamma * r + q_target_tp1` was
            # not the Bellman residual and only slipped through because of
            # the loose `decimals=1` tolerance below.)
            td_error = q_t - (input_[SampleBatch.REWARDS] +
                              config.gamma * q_target_tp1)
            # Huber/Square loss on TD-error.
            expected_loss = huber_loss(td_error).mean()

            if fw == "torch":
                input_ = policy._lazy_tensor_dict(input_)
            # Get actual out and compare.
            if fw == "tf":
                out = policy.get_session().run(
                    policy._loss,
                    feed_dict=policy._get_loss_inputs_dict(input_,
                                                           shuffle=False),
                )
            else:
                loss_fn = loss_torch if fw == "torch" else loss_tf
                out = loss_fn(policy, policy.model, None, input_)
            check(out, expected_loss, decimals=1)
            # Release the trainer's resources before the next framework.
            trainer.stop()
Exemplo n.º 4
0
    def test_ppo_loss_function(self):
        """Tests the PPO loss function math.

        For tf and torch in turn, builds a PPO trainer with one linear
        hidden layer of 10 units, post-processes a fixed three-step fake
        episode, runs the framework's surrogate loss and compares the
        values stored in `policy.loss_obj` with the reference values from
        `self._ppo_loss_helper`.
        """
        import copy

        # Deep-copy: the nested "model" dict is edited below, and a shallow
        # copy would write those edits into the shared ppo.DEFAULT_CONFIG.
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"
        config["vf_share_layers"] = True

        # Fake CartPole episode of n time steps. Kept pristine: each
        # framework below works on its own copy.
        fake_batch = {
            SampleBatch.CUR_OBS: np.array(
                [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                 [0.9, 1.0, 1.1, 1.2]],
                dtype=np.float32),
            SampleBatch.ACTIONS: np.array([0, 1, 1]),
            SampleBatch.REWARDS: np.array([1.0, -1.0, .5], dtype=np.float32),
            SampleBatch.DONES: np.array([False, False, True]),
            SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32),
            SampleBatch.ACTION_DIST_INPUTS: np.array(
                [[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
            SampleBatch.ACTION_LOGP: np.array(
                [-0.5, -0.1, -0.2], dtype=np.float32),
        }

        for fw in ["tf", "torch"]:
            print("framework={}".format(fw))
            config["use_pytorch"] = fw == "torch"
            config["eager"] = fw == "tf"

            trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to a fresh copy of the fake batch, so that the torch pass does
            # not start from the batch already post-processed for tf.
            # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
            # [0.50005, -0.505, 0.5]
            if fw == "tf":
                train_batch = postprocess_ppo_gae_tf(policy, dict(fake_batch))
            else:
                train_batch = postprocess_ppo_gae_torch(
                    policy, dict(fake_batch))
                train_batch = policy._lazy_tensor_dict(train_batch)

            # Check the value targets (discounted returns, see above).
            check(train_batch[Postprocessing.VALUE_TARGETS],
                  [0.50005, -0.505, 0.5])

            # Calculate actual PPO loss (results are stored in
            # policy.loss_obj).
            if fw == "tf":
                ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                      train_batch)
            else:
                ppo_surrogate_loss_torch(policy, policy.model,
                                         TorchCategorical, train_batch)

            # Recompute the model outputs from the raw weights:
            # 0/1 = shared layer, 2/3 = logits layer, 4/5 = value layer.
            weights = policy.model.variables() if fw == "tf" else \
                list(policy.model.parameters())
            expected_shared_out = fc(train_batch[SampleBatch.CUR_OBS],
                                     weights[0], weights[1])
            expected_logits = fc(expected_shared_out, weights[2], weights[3])
            expected_value_outs = fc(expected_shared_out, weights[4],
                                     weights[5])

            kl, entropy, pg_loss, vf_loss, overall_loss = \
                self._ppo_loss_helper(
                    policy, policy.model,
                    Categorical if fw == "tf" else TorchCategorical,
                    train_batch,
                    expected_logits, expected_value_outs
                )
            check(policy.loss_obj.mean_kl, kl)
            check(policy.loss_obj.mean_entropy, entropy)
            check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
            check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
            check(policy.loss_obj.loss, overall_loss, decimals=4)
def do_test_log_likelihood(run,
                           config,
                           prev_a=None,
                           continuous=False,
                           layer_key=("fc", (0, 4), ("_hidden_layers.0.",
                                                     "_logits.")),
                           logp_func=None):
    """Checks a policy's `compute_log_likelihoods` against reference values.

    Args:
        run: Trainer class (or callable) invoked as
            `run(config=..., env=...)`.
        config: Trainer config dict. A shallow copy is modified; only
            top-level keys are set on it here.
        prev_a: Optional previous action handed to `compute_action` and
            `compute_log_likelihoods`. If given, a previous reward of 0.0
            is passed along as well.
        continuous: If True, run on "Pendulum-v0" and compare each sampled
            action's logp with a value derived from the model weights.
            Otherwise run on "FrozenLake-v0" and compare exp(logp) of every
            action with its empirical sampling frequency.
        layer_key: Tuple of (layer-name stem for dict-style weights,
            (hidden, output) indices for list-style weights,
            (hidden, output) parameter-name prefixes for torch weights).
        logp_func: Optional callable `(mean, log_std, action) -> logp`.
            Defaults to the log of `norm.pdf(action, mean, exp(log_std))`.
    """
    config = config.copy()
    # Run locally.
    config["num_workers"] = 0
    # Env setup.
    if continuous:
        env = "Pendulum-v0"
        obs_batch = preprocessed_obs_batch = np.array([[0.0, 0.1, -0.1]])
    else:
        env = "FrozenLake-v0"
        config["env_config"] = {"is_slippery": False, "map_name": "4x4"}
        obs_batch = np.array([0])
        preprocessed_obs_batch = one_hot(obs_batch, depth=16)

    prev_r = None if prev_a is None else np.array(0.0)

    # Test against all frameworks.
    for fw in framework_iterator(config):
        if run in [sac.SACTrainer] and fw == "tfe":
            continue

        trainer = run(config=config, env=env)

        policy = trainer.get_policy()
        weights = policy.get_weights()
        # Sample n actions (each from the single obs), then roughly check
        # their logp against their counts.
        num_actions = 1000 if not continuous else 50
        actions = [
            trainer.compute_action(obs_batch[0],
                                   prev_action=prev_a,
                                   prev_reward=prev_r,
                                   explore=True)
            for _ in range(num_actions)
        ]

        # Test all taken actions for their log-likelihoods vs expected values.
        if continuous:
            # The expected mean/log-std depend only on the weights snapshot
            # and the fixed observation, so compute them once instead of
            # once per sampled action.
            if fw != "torch":
                if isinstance(weights, list):
                    hidden_kernel = weights[layer_key[1][0]]
                    out_kernel = weights[layer_key[1][1]]
                else:
                    hidden_kernel = weights[
                        "default_policy/{}_1/kernel".format(layer_key[0])]
                    out_kernel = weights[
                        "default_policy/{}_out/kernel".format(layer_key[0])]
                expected_mean_logstd = fc(
                    fc(obs_batch, hidden_kernel), out_kernel)
            else:
                expected_mean_logstd = fc(
                    fc(obs_batch,
                       weights["{}_model.0.weight".format(layer_key[2][0])],
                       framework=fw),
                    weights["{}_model.0.weight".format(layer_key[2][1])],
                    framework=fw)
            mean, log_std = np.split(expected_mean_logstd, 2, axis=-1)

            for a in actions:
                if logp_func is None:
                    expected_logp = np.log(norm.pdf(a, mean, np.exp(log_std)))
                else:
                    expected_logp = logp_func(mean, log_std, a)
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]),
                    prev_reward_batch=np.array([prev_r]))
                check(logp, expected_logp[0], rtol=0.2)
        # Test all available actions for their logp values.
        else:
            for a in [0, 1, 2, 3]:
                count = actions.count(a)
                expected_prob = count / num_actions
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=np.array([prev_a]),
                    prev_reward_batch=np.array([prev_r]))
                check(np.exp(logp), expected_prob, atol=0.2)
Exemplo n.º 6
0
    def test_pg_loss_functions(self):
        """Tests the PG loss function math.

        For each framework, the loss produced by the policy (through the
        session, or by calling the framework's PG loss function directly)
        is compared with `-mean(logp(actions) * advantages)`, where the
        logits are recomputed in NumPy from the raw model weights.
        """
        import copy

        # Deep-copy: the nested "model" dict is edited below, and a shallow
        # copy would write those edits into the shared pg.DEFAULT_CONFIG.
        config = copy.deepcopy(pg.DEFAULT_CONFIG)
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        # Fake CartPole episode of n time steps.
        train_batch = SampleBatch({
            SampleBatch.OBS:
            np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                      [0.9, 1.0, 1.1, 1.2]]),
            SampleBatch.ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.REWARDS:
            np.array([1.0, 1.0, 1.0]),
            SampleBatch.DONES:
            np.array([False, False, True]),
            SampleBatch.EPS_ID:
            np.array([1234, 1234, 1234]),
            SampleBatch.AGENT_INDEX:
            np.array([0, 0, 0]),
        })

        for fw, sess in framework_iterator(config, session=True):
            dist_cls = (Categorical if fw != "torch" else TorchCategorical)
            trainer = pg.PGTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()
            weights = policy.model.trainable_variables()
            if sess:
                weights = policy.get_session().run(weights)

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to a copy of the train_batch.
            # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
            # [2.9701, 1.99, 1.0]
            train_batch_ = pg.post_process_advantages(policy,
                                                      train_batch.copy())
            if fw == "torch":
                train_batch_ = policy._lazy_tensor_dict(train_batch_)

            # Check Advantage values.
            check(train_batch_[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

            # Actual loss results.
            if sess:
                results = policy.get_session().run(
                    policy._loss,
                    feed_dict=policy._get_loss_inputs_dict(train_batch_,
                                                           shuffle=False))
            else:
                loss_fn = (pg.pg_tf_loss
                           if fw in ["tf2", "tfe"] else pg.pg_torch_loss)
                results = loss_fn(policy,
                                  policy.model,
                                  dist_class=dist_cls,
                                  train_batch=train_batch_)

            # Calculate expected results. Torch lists the logits layer
            # (0, 1) before the hidden layer (2, 3); tf the other way round.
            if fw != "torch":
                hidden_idx, logits_idx = (0, 1), (2, 3)
            else:
                hidden_idx, logits_idx = (2, 3), (0, 1)
            expected_logits = fc(fc(train_batch_[SampleBatch.OBS],
                                    weights[hidden_idx[0]],
                                    weights[hidden_idx[1]],
                                    framework=fw),
                                 weights[logits_idx[0]],
                                 weights[logits_idx[1]],
                                 framework=fw)
            expected_logp = dist_cls(expected_logits, policy.model).logp(
                train_batch_[SampleBatch.ACTIONS])
            adv = train_batch_[Postprocessing.ADVANTAGES]
            if sess:
                expected_logp = sess.run(expected_logp)
            elif fw == "torch":
                expected_logp = expected_logp.detach().cpu().numpy()
                adv = adv.detach().cpu().numpy()
            else:
                expected_logp = expected_logp.numpy()
            expected_loss = -np.mean(expected_logp * adv)
            check(results, expected_loss, decimals=4)
            # Release the trainer's resources before the next framework.
            trainer.stop()
Exemplo n.º 7
0
    def _ddpg_loss_helper(self, train_batch, weights, ks, fw, gamma,
                          huber_threshold, l2_reg, sess):
        """Re-computes the DDPG (twin-Q) losses in NumPy for tf and torch.

        `ks` is an ordered sequence of keys into the `weights` dict. Each
        network occupies four consecutive positions read as (hidden bias,
        hidden kernel) and (out bias, out kernel):
            0/1, 4/5     = policy hidden / out
            2/3, 6/7     = policy used for t+1 hidden / out
            8..11        = Q net
            12..15       = twin Q net
            16..19       = target Q net
            20..23       = target twin Q net
        `sess` is accepted but not used by this helper.

        Returns:
            Tuple (critic_loss, actor_loss, td_error).
        """
        obs_t = train_batch[SampleBatch.CUR_OBS]
        obs_tp1 = train_batch[SampleBatch.NEXT_OBS]

        def policy_out(obs, hidden_bias, out_bias):
            """Policy net: fc -> relu -> fc, squashed by sigmoid(2 * x)."""
            hidden = relu(
                fc(obs,
                   weights[ks[hidden_bias + 1]],
                   weights[ks[hidden_bias]],
                   framework=fw))
            return sigmoid(2.0 * fc(hidden, weights[ks[out_bias + 1]],
                                    weights[ks[out_bias]]))

        def q_out(obs, actions, first):
            """Q net on concat(obs, actions); ks[first] is the hidden bias,
            followed by hidden kernel, out bias and out kernel."""
            hidden = relu(
                fc(np.concatenate([obs, actions], -1),
                   weights[ks[first + 1]],
                   weights[ks[first]],
                   framework=fw))
            return fc(hidden,
                      weights[ks[first + 3]],
                      weights[ks[first + 2]],
                      framework=fw)

        # Policy outputs for t and for t+1.
        policy_t = policy_out(obs_t, 0, 4)
        policy_tp1 = policy_out(obs_tp1, 2, 6)
        # Assume no smooth target policy.
        policy_tp1_smoothed = policy_tp1

        batch_actions = train_batch[SampleBatch.ACTIONS]
        # Q-values (and twin Q-values) for the actually selected actions.
        q_t = q_out(obs_t, batch_actions, 8)
        twin_q_t = q_out(obs_t, batch_actions, 12)
        # Q-values for the current policy's action in the current state.
        q_t_det_policy = q_out(obs_t, policy_t, 8)
        # Target Q-network (and its twin) on the next state.
        q_tp1 = q_out(obs_tp1, policy_tp1_smoothed, 16)
        twin_q_tp1 = q_out(obs_tp1, policy_tp1_smoothed, 20)

        q_t_selected = np.squeeze(q_t, axis=-1)
        twin_q_t_selected = np.squeeze(twin_q_t, axis=-1)
        # Use the smaller of the two target estimates.
        q_tp1_best = np.squeeze(np.minimum(q_tp1, twin_q_tp1), axis=-1)

        dones = train_batch[SampleBatch.DONES]
        rewards = train_batch[SampleBatch.REWARDS]
        if fw == "torch":
            dones = dones.float().numpy()
            rewards = rewards.numpy()

        q_t_selected_target = rewards + gamma * ((1.0 - dones) * q_tp1_best)

        twin_td_error = twin_q_t_selected - q_t_selected_target
        # The returned td_error is the sum of both nets' errors; that sum
        # also feeds the first Huber term below.
        td_error = (q_t_selected - q_t_selected_target) + twin_td_error
        errors = huber_loss(td_error, huber_threshold) + \
            huber_loss(twin_td_error, huber_threshold)

        critic_loss = np.mean(errors)
        actor_loss = -np.mean(q_t_det_policy)
        # Add l2-regularization if required.
        for name, var in weights.items():
            if re.match("default_policy/actor_(hidden_0|out)/kernel", name):
                actor_loss += (l2_reg * l2_loss(var))
            elif re.match("default_policy/sequential(_1)?/\\w+/kernel", name):
                critic_loss += (l2_reg * l2_loss(var))

        return critic_loss, actor_loss, td_error
Exemplo n.º 8
0
    def test_ppo_loss_function(self):
        """Tests the PPO loss function math.

        Runs the check first for tf (eager) and then for torch: build a PPO
        trainer with one linear hidden layer of 10 units, post-process a
        fixed three-step fake episode, run the framework's surrogate loss
        and compare the values stored in `policy.loss_obj` with the
        reference values from `self._ppo_loss_helper`.
        """
        import copy

        # Deep-copy: the nested "model" dict is edited below, and a shallow
        # copy would write those edits into the shared ppo.DEFAULT_CONFIG.
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        config["num_workers"] = 0  # Run locally.
        config["eager"] = True
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        # Fake CartPole episode of n time steps. Kept pristine: each
        # framework below works on its own copy.
        fake_batch = {
            SampleBatch.CUR_OBS:
            np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                      [0.9, 1.0, 1.1, 1.2]],
                     dtype=np.float32),
            SampleBatch.ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.REWARDS:
            np.array([1.0, -1.0, .5], dtype=np.float32),
            SampleBatch.DONES:
            np.array([False, False, True]),
            SampleBatch.VF_PREDS:
            np.array([0.5, 0.6, 0.7], dtype=np.float32),
            BEHAVIOUR_LOGITS:
            np.array([[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
            ACTION_LOGP:
            np.array([-0.5, -0.1, -0.2], dtype=np.float32)
        }

        # tf.
        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        # Post-process (calculate simple (non-GAE) advantages) and attach to
        # a copy of the fake batch.
        # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
        # [0.50005, -0.505, 0.5]
        train_batch = postprocess_ppo_gae_tf(policy, dict(fake_batch))
        # Check the value targets (discounted returns, see above).
        check(train_batch[Postprocessing.VALUE_TARGETS],
              [0.50005, -0.505, 0.5])

        # Calculate actual PPO loss (results are stored in policy.loss_obj) for
        # tf.
        ppo_surrogate_loss_tf(policy, policy.model, Categorical, train_batch)

        # Recompute logits and value outputs from the raw weights: 0/1 and
        # 4/5 form the logits path, 2/3 and 6/7 the value path.
        weights = policy.model.trainable_variables()
        expected_logits = fc(
            fc(train_batch[SampleBatch.CUR_OBS], weights[0].numpy(),
               weights[1].numpy()), weights[4].numpy(), weights[5].numpy())
        expected_value_outs = fc(
            fc(train_batch[SampleBatch.CUR_OBS], weights[2].numpy(),
               weights[3].numpy()), weights[6].numpy(), weights[7].numpy())

        kl, entropy, pg_loss, vf_loss, overall_loss = \
            self._ppo_loss_helper(
                policy, policy.model, Categorical, train_batch,
                expected_logits, expected_value_outs
            )
        check(policy.loss_obj.mean_kl, kl)
        check(policy.loss_obj.mean_entropy, entropy)
        check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
        check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
        check(policy.loss_obj.loss, overall_loss, decimals=4)

        # Torch.
        config["use_pytorch"] = True
        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        # Start again from the pristine fake batch, not from the batch that
        # was already post-processed for tf above.
        train_batch = postprocess_ppo_gae_torch(policy, dict(fake_batch))
        train_batch = policy._lazy_tensor_dict(train_batch)

        # Check the value targets (discounted returns, see above).
        check(train_batch[Postprocessing.VALUE_TARGETS],
              [0.50005, -0.505, 0.5])

        # Calculate actual PPO loss (results are stored in policy.loss_obj)
        # for torch.
        ppo_surrogate_loss_torch(policy, policy.model, TorchCategorical,
                                 train_batch)

        # For torch, the reference uses the model's own last outputs.
        kl, entropy, pg_loss, vf_loss, overall_loss = \
            self._ppo_loss_helper(
                policy, policy.model, TorchCategorical, train_batch,
                policy.model.last_output(),
                policy.model.value_function().detach().numpy()
            )
        check(policy.loss_obj.mean_kl, kl)
        check(policy.loss_obj.mean_entropy, entropy)
        check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
        check(policy.loss_obj.mean_vf_loss, np.mean(vf_loss), decimals=4)
        check(policy.loss_obj.loss, overall_loss, decimals=4)
Exemplo n.º 9
0
    def test_simple_q_loss_function(self):
        """Tests the Simple-Q loss function results on all frameworks.

        For every framework yielded by `framework_iterator`, a tiny linear
        Q-network is built, the expected loss for a random two-sample batch
        is recomputed from the policy's weights and compared with the value
        produced by the framework's own loss implementation.
        """
        # Local import so this block does not rely on a module-level one.
        import copy

        # Deep copy: a shallow `.copy()` shares the nested "model" dict with
        # the module-level defaults, so the overrides below would leak into
        # every other user of SIMPLE_Q_DEFAULT_CONFIG.
        config = copy.deepcopy(dqn.SIMPLE_Q_DEFAULT_CONFIG)
        # Run locally.
        config["num_workers"] = 0
        # Use very simple net (layer0=10 nodes, q-layer=2 nodes (2 actions)).
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        for fw in framework_iterator(config):
            # Generate Trainer and get its default Policy object.
            trainer = dqn.SimpleQTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()
            # Batch of size=2.
            input_ = {
                SampleBatch.CUR_OBS: np.random.random(size=(2, 4)),
                SampleBatch.ACTIONS: np.array([0, 1]),
                SampleBatch.REWARDS: np.array([0.4, -1.23]),
                SampleBatch.DONES: np.array([False, False]),
                SampleBatch.NEXT_OBS: np.random.random(size=(2, 4))
            }
            # Get model weights for computing expected model outs (q-vals).
            weights = policy.get_weights()
            if isinstance(weights, dict):
                weights = list(weights.values())
            target_weights = policy.target_q_func_vars
            if fw == "tf":
                # Static graph: fetch the target variables' current values.
                target_weights = policy.get_session().run(target_weights)

            # Positions inside the flat weight lists. Non-torch:
            # 0=layer-kernel; 1=layer-bias; 2=q-val-kernel; 3=q-val-bias.
            # For torch the two layers are indexed in the opposite order.
            if fw == "torch":
                layer_k, layer_b, q_k, q_b = 2, 3, 0, 1
            else:
                layer_k, layer_b, q_k, q_b = 0, 1, 2, 3

            # Q(s,a) outputs.
            q_values = fc(
                fc(input_[SampleBatch.CUR_OBS],
                   weights[layer_k],
                   weights[layer_b],
                   framework=fw),
                weights[q_k],
                weights[q_b],
                framework=fw)
            q_t = np.sum(
                one_hot(input_[SampleBatch.ACTIONS], 2) * q_values, 1)
            # max[a'](Qtarget(s',a')) outputs.
            q_target_tp1 = np.max(
                fc(fc(input_[SampleBatch.NEXT_OBS],
                      target_weights[layer_k],
                      target_weights[layer_b],
                      framework=fw),
                   target_weights[q_k],
                   target_weights[q_b],
                   framework=fw), 1)
            # TD-errors (Bellman equation): Q(s,a) - (r + gamma * max Q').
            # No terminal masking is needed because DONES is all-False in
            # this batch.
            td_error = q_t - (input_[SampleBatch.REWARDS] +
                              config["gamma"] * q_target_tp1)
            # Huber/Square loss on TD-error.
            expected_loss = huber_loss(td_error).mean()

            if fw == "torch":
                input_ = policy._lazy_tensor_dict(input_)
            # Get actual out and compare.
            if fw == "tf":
                out = policy.get_session().run(
                    policy._loss,
                    feed_dict=policy._get_loss_inputs_dict(input_,
                                                           shuffle=False))
            else:
                loss_fn = loss_torch if fw == "torch" else loss_tf
                out = loss_fn(policy, policy.model, None, input_)
            check(out, expected_loss, decimals=1)
Exemplo n.º 10
0
    def test_ppo_loss_function(self):
        """Tests the PPO loss function math.

        For every framework yielded by `framework_iterator`, a fake
        three-step episode is post-processed, the framework's PPO loss is
        evaluated on it and each loss term is compared with the values
        returned by `self._ppo_loss_helper` for outputs recomputed from the
        model's weights.
        """
        # Local import so this block does not rely on a module-level one.
        import copy

        # Deep copy: a shallow `.copy()` shares the nested "model" dict with
        # ppo.DEFAULT_CONFIG, so the overrides below would modify the
        # module-level defaults.
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"
        config["vf_share_layers"] = True

        # Fake CartPole episode of n time steps. Kept untouched; each
        # framework pass below works on its own copy.
        fake_batch = {
            SampleBatch.CUR_OBS:
            np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                      [0.9, 1.0, 1.1, 1.2]],
                     dtype=np.float32),
            SampleBatch.ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.PREV_ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.REWARDS:
            np.array([1.0, -1.0, .5], dtype=np.float32),
            SampleBatch.PREV_REWARDS:
            np.array([1.0, -1.0, .5], dtype=np.float32),
            SampleBatch.DONES:
            np.array([False, False, True]),
            SampleBatch.VF_PREDS:
            np.array([0.5, 0.6, 0.7], dtype=np.float32),
            SampleBatch.ACTION_DIST_INPUTS:
            np.array([[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
            SampleBatch.ACTION_LOGP:
            np.array([-0.5, -0.1, -0.2], dtype=np.float32),
        }

        for fw, sess in framework_iterator(config, session=True):
            trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()

            # Start every pass from the raw episode. Re-using the previous
            # pass's result would feed already post-processed data (or torch
            # tensors) into the next framework.
            raw_batch = {
                key: value.copy()
                for key, value in fake_batch.items()
            }

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to train_batch dict.
            # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
            # [0.50005, -0.505, 0.5]
            if fw == "tf" or fw == "eager":
                train_batch = postprocess_ppo_gae_tf(policy, raw_batch)
            else:
                train_batch = postprocess_ppo_gae_torch(policy, raw_batch)
                train_batch = policy._lazy_tensor_dict(train_batch)

            # Check Advantage values.
            check(train_batch[Postprocessing.VALUE_TARGETS],
                  [0.50005, -0.505, 0.5])

            # Calculate actual PPO loss. For static-graph tf nothing is
            # called here; the loss tensors are fetched via the session
            # further below.
            if fw == "eager":
                ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                      train_batch)
            elif fw == "torch":
                ppo_surrogate_loss_torch(policy, policy.model,
                                         TorchCategorical, train_batch)

            weights = policy.model.variables() if fw != "torch" else \
                list(policy.model.parameters())
            if fw == "tf":
                weights = policy.get_session().run(weights)
            # Torch indexes the shared layer and the logits layer in the
            # opposite order to tf; the value head sits at 4/5 for both.
            if fw == "torch":
                shared_k, shared_b, logits_k, logits_b = 2, 3, 0, 1
            else:
                shared_k, shared_b, logits_k, logits_b = 0, 1, 2, 3
            expected_shared_out = fc(train_batch[SampleBatch.CUR_OBS],
                                     weights[shared_k],
                                     weights[shared_b],
                                     framework=fw)
            expected_logits = fc(expected_shared_out,
                                 weights[logits_k],
                                 weights[logits_b],
                                 framework=fw)
            expected_value_outs = fc(expected_shared_out,
                                     weights[4],
                                     weights[5],
                                     framework=fw)

            kl, entropy, pg_loss, vf_loss, overall_loss = \
                self._ppo_loss_helper(
                    policy, policy.model,
                    Categorical if fw != "torch" else TorchCategorical,
                    train_batch,
                    expected_logits, expected_value_outs,
                    sess=sess
                )
            if sess:
                # Static graph: evaluate the loss tensors on the batch.
                policy_sess = policy.get_session()
                k, e, pl, v, tl = policy_sess.run(
                    [
                        policy.loss_obj.mean_kl, policy.loss_obj.mean_entropy,
                        policy.loss_obj.mean_policy_loss,
                        policy.loss_obj.mean_vf_loss, policy.loss_obj.loss
                    ],
                    feed_dict=policy._get_loss_inputs_dict(train_batch,
                                                           shuffle=False))
                check(k, kl)
                check(e, entropy)
                check(pl, np.mean(-pg_loss))
                check(v, np.mean(vf_loss), decimals=4)
                check(tl, overall_loss, decimals=4)
            else:
                check(policy.loss_obj.mean_kl, kl)
                check(policy.loss_obj.mean_entropy, entropy)
                check(policy.loss_obj.mean_policy_loss, np.mean(-pg_loss))
                check(policy.loss_obj.mean_vf_loss,
                      np.mean(vf_loss),
                      decimals=4)
                check(policy.loss_obj.loss, overall_loss, decimals=4)
Exemplo n.º 11
0
def test_log_likelihood(run,
                        config,
                        prev_a=None,
                        continuous=False,
                        layer_key=("fc", (0, 4)),
                        logp_func=None):
    """Checks a policy's log-likelihoods against expected values.

    A trainer is built per framework ("tf", "eager", "torch"), 500 actions
    are sampled for one fixed observation and the values returned by
    `policy.compute_log_likelihoods` are compared with expectations.

    Args:
        run: Trainer class, instantiated as `run(config=..., env=...)`.
        config: Trainer config dict. A shallow copy is taken; only top-level
            keys are overridden here.
        prev_a: Optional previous action handed to `compute_action` and
            (batched) to `compute_log_likelihoods`.
        continuous: If True, use "Pendulum-v0" and compare against a
            distribution built from the model weights; otherwise use
            "FrozenLake-v0" and compare against empirical action counts.
        layer_key: `(name, (idx_first, idx_second))`. The indices select the
            two kernels when the tf weights are a list; the name is used to
            build the weight keys when they are a dict.
        logp_func: Optional callable `(mean, log_std, action)` returning the
            expected log-likelihood. Defaults to the normal log-density.
    """
    config = config.copy()
    # Run locally.
    config["num_workers"] = 0
    # Env setup.
    if continuous:
        env = "Pendulum-v0"
        obs_batch = preprocessed_obs_batch = np.array([[0.0, 0.1, -0.1]])
    else:
        env = "FrozenLake-v0"
        config["env_config"] = {"is_slippery": False, "map_name": "4x4"}
        obs_batch = np.array([0])
        preprocessed_obs_batch = one_hot(obs_batch, depth=16)

    # Use Soft-Q for DQNs.
    if run is dqn.DQNTrainer:
        config["exploration_config"] = {"type": "SoftQ", "temperature": 0.5}

    prev_r = None if prev_a is None else np.array(0.0)
    # Batched previous action/reward. Stay None when there is no previous
    # step; wrapping None would create an object array holding None rather
    # than signalling "not provided".
    prev_a_batch = None if prev_a is None else np.array([prev_a])
    prev_r_batch = None if prev_r is None else np.array([prev_r])

    # Test against all frameworks.
    for fw in ["tf", "eager", "torch"]:
        if run in [dqn.DQNTrainer, sac.SACTrainer] and fw == "torch":
            continue
        print("Testing {} with framework={}".format(run, fw))
        config["eager"] = fw == "eager"
        config["use_pytorch"] = fw == "torch"

        trainer = run(config=config, env=env)
        policy = trainer.get_policy()
        weights = policy.get_weights()
        # Sample n actions, then roughly check their logp against their
        # counts. Each call computes a single action from a single obs.
        num_actions = 500
        actions = [
            trainer.compute_action(obs_batch[0],
                                   prev_action=prev_a,
                                   prev_reward=prev_r,
                                   explore=True)
            for _ in range(num_actions)
        ]

        # Test 50 actions for their log-likelihoods vs expected values.
        if continuous:
            # The expected distribution inputs depend only on the fixed
            # observation and the weights, so compute them once.
            if fw == "tf" or fw == "eager":
                if isinstance(weights, list):
                    expected_mean_logstd = fc(
                        fc(obs_batch, weights[layer_key[1][0]]),
                        weights[layer_key[1][1]])
                else:
                    expected_mean_logstd = fc(
                        fc(
                            obs_batch,
                            weights["default_policy/{}_1/kernel".format(
                                layer_key[0])]),
                        weights["default_policy/{}_out/kernel".format(
                            layer_key[0])])
            else:
                expected_mean_logstd = fc(
                    fc(obs_batch,
                       weights["_hidden_layers.0._model.0.weight"]),
                    weights["_logits._model.0.weight"])
            mean, log_std = np.split(expected_mean_logstd, 2, axis=-1)

            for a in actions[:50]:
                if logp_func is None:
                    expected_logp = np.log(norm.pdf(a, mean, np.exp(log_std)))
                else:
                    expected_logp = logp_func(mean, log_std, a)
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=prev_a_batch,
                    prev_reward_batch=prev_r_batch)
                check(logp, expected_logp[0], rtol=0.2)
        # Test all available actions for their logp values.
        else:
            for a in [0, 1, 2, 3]:
                # Empirical frequency of `a` among the sampled actions.
                count = actions.count(a)
                expected_logp = np.log(count / num_actions)
                logp = policy.compute_log_likelihoods(
                    np.array([a]),
                    preprocessed_obs_batch,
                    prev_action_batch=prev_a_batch,
                    prev_reward_batch=prev_r_batch)
                check(logp, expected_logp, rtol=0.3)