Example #1
    def test_polynomial_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 23, 1000]
        expected = [
            0.5 + (2.0 - 0.5) * (1.0 - min(t, 100) / 100)**2 for t in ts
        ]
        config = dict(
            type=
            "ray.rllib.utils.schedules.polynomial_schedule.PolynomialSchedule",
            schedule_timesteps=100,
            initial_p=2.0,
            final_p=0.5,
            power=2.0,
        )

        for fw in framework_iterator(
                frameworks=["tf2", "tf", "tfe", "torch", None]):
            polynomial = from_config(config, framework=fw)
            for t, e in zip(ts, expected):
                out = polynomial(t)
                check(out, e, decimals=4)

            ts_as_tensors = self._get_framework_tensors(ts, fw)
            for t, e in zip(ts_as_tensors, expected):
                out = polynomial(t)
                assert fw != "tf" or isinstance(out, tf.Tensor)
                check(out, e, decimals=4)
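The expected values above are just the closed-form polynomial decay evaluated by hand. A minimal stand-alone sketch of that arithmetic (the helper name is illustrative, not an RLlib API); note that timesteps past schedule_timesteps are clamped, which is why t=1000 still yields final_p:

def polynomial_value(t, schedule_timesteps=100, initial_p=2.0, final_p=0.5,
                     power=2.0):
    # Clamp t: past the horizon the schedule stays at final_p.
    t = min(t, schedule_timesteps)
    return final_p + (initial_p - final_p) * \
        (1.0 - t / schedule_timesteps)**power

assert polynomial_value(0) == 2.0  # starts at initial_p
assert polynomial_value(100) == 0.5  # reaches final_p at schedule_timesteps
assert polynomial_value(1000) == 0.5  # clamped: stays at final_p afterwards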
Example #2
    def test_constant_schedule(self):
        value = 2.3
        ts = [100, 0, 10, 2, 3, 4, 99, 56, 10000, 23, 234, 56]

        for fw in ["tf", "torch", None]:
            constant = from_config(ConstantSchedule,
                                   dict(value=value, framework=fw))
            for t in ts:
                out = constant(t)
                check(out, value)
Example #3
    def test_linear_schedule(self):
        ts = [0, 50, 10, 100, 90, 2, 1, 99, 23, 1000]
        config = {"schedule_timesteps": 100, "initial_p": 2.1, "final_p": 0.6}

        for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
            fw_ = fw if fw != "tfe" else "tf"
            linear = from_config(LinearSchedule, config, framework=fw_)
            for t in ts:
                out = linear(t)
                check(out, 2.1 - (min(t, 100) / 100) * (2.1 - 0.6), decimals=4)
Example #4
    def test_piecewise_schedule(self):
        piecewise = from_config(
            PiecewiseSchedule,
            dict(endpoints=[(0, 50.0), (25, 100.0), (30, 200.0)],
                 outside_value=14.5))
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 27]
        expected = [50.0, 60.0, 70.0, 14.5, 14.5, 54.0, 52.0, 14.5, 140.0]
        for t, e in zip(ts, expected):
            out = piecewise(t)
            check(out, e, decimals=4)
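The expected values in Example #4 follow from linearly interpolating between the given endpoints and returning outside_value past the last endpoint. A small hand-check of that arithmetic, assuming plain linear interpolation (the helper name is illustrative, not an RLlib API):

def piecewise_value(t, endpoints, outside_value):
    # Linearly interpolate between the two endpoints that bracket t.
    for (left, left_v), (right, right_v) in zip(endpoints[:-1], endpoints[1:]):
        if left <= t < right:
            frac = (t - left) / (right - left)
            return left_v + frac * (right_v - left_v)
    return outside_value  # t lies outside the endpoint range

endpoints = [(0, 50.0), (25, 100.0), (30, 200.0)]
assert piecewise_value(5, endpoints, 14.5) == 60.0  # 50 + 5/25 * (100 - 50)
assert piecewise_value(27, endpoints, 14.5) == 140.0  # 100 + 2/5 * (200 - 100)
assert piecewise_value(100, endpoints, 14.5) == 14.5  # past the last endpoint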
Example #5
    def test_exponential_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
        config = dict(initial_p=2.0, decay_rate=0.99, schedule_timesteps=100)

        for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
            fw_ = fw if fw != "tfe" else "tf"
            exponential = from_config(
                ExponentialSchedule, config, framework=fw_)
            for t in ts:
                out = exponential(t)
                check(out, 2.0 * 0.99**(t / 100), decimals=4)
Example #6
    def test_piecewise_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 27]
        expected = [50.0, 60.0, 70.0, 14.5, 14.5, 54.0, 52.0, 14.5, 140.0]
        config = dict(endpoints=[(0, 50.0), (25, 100.0), (30, 200.0)],
                      outside_value=14.5)

        for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
            fw_ = fw if fw != "tfe" else "tf"
            piecewise = from_config(PiecewiseSchedule, config, framework=fw_)
            for t, e in zip(ts, expected):
                out = piecewise(t)
                check(out, e, decimals=4)
Example #7
    def test_constant_schedule(self):
        value = 2.3
        ts = [100, 0, 10, 2, 3, 4, 99, 56, 10000, 23, 234, 56]

        config = {"value": value}

        for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
            fw_ = fw if fw != "tfe" else "tf"
            constant = from_config(ConstantSchedule, config, framework=fw_)
            for t in ts:
                out = constant(t)
                check(out, value)
Example #8
    def test_exponential_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
        for fw in ["tf", "torch", None]:
            exponential = from_config(
                ExponentialSchedule,
                dict(initial_p=2.0,
                     decay_rate=0.99,
                     schedule_timesteps=100,
                     framework=fw))
            for t in ts:
                out = exponential(t)
                check(out, 2.0 * 0.99**(t / 100), decimals=4)
Example #9
    def test_linear_schedule(self):
        ts = [0, 50, 10, 100, 90, 2, 1, 99, 23]
        for fw in ["tf", "torch", None]:
            linear = from_config(
                LinearSchedule, {
                    "schedule_timesteps": 100,
                    "initial_p": 2.1,
                    "final_p": 0.6,
                    "framework": fw
                })
            if fw == "tf":
                tf.enable_eager_execution()
            for t in ts:
                out = linear(t)
                check(out, 2.1 - (t / 100) * (2.1 - 0.6), decimals=4)
Example #10
File: test_schedules.py  Project: w0617/ray
    def test_linear_schedule(self):
        ts = [0, 50, 10, 100, 90, 2, 1, 99, 23]
        config = {"schedule_timesteps": 100, "initial_p": 2.1, "final_p": 0.6}
        for fw in ["tf", "torch", None]:
            linear = from_config(LinearSchedule, config, framework=fw)
            for t in ts:
                out = linear(t)
                check(out, 2.1 - (t / 100) * (2.1 - 0.6), decimals=4)

        # Test eager as well.
        with eager_mode():
            linear = from_config(LinearSchedule, config, framework="tf")
            for t in ts:
                out = linear(t)
                check(out, 2.1 - (t / 100) * (2.1 - 0.6), decimals=4)
Example #11
    def test_polynomial_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 23, 1000]
        config = dict(type="ray.rllib.utils.schedules.polynomial_schedule."
                      "PolynomialSchedule",
                      schedule_timesteps=100,
                      initial_p=2.0,
                      final_p=0.5,
                      power=2.0)

        for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
            fw_ = fw if fw != "tfe" else "tf"
            polynomial = from_config(config, framework=fw_)
            for t in ts:
                out = polynomial(t)
                t = min(t, 100)
                check(out, 0.5 + (2.0 - 0.5) * (1.0 - t / 100)**2, decimals=4)
Example #12
    def test_polynomial_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
        for fw in ["tf", "torch", None]:
            polynomial = from_config(
                dict(type="ray.rllib.utils.schedules.polynomial_schedule."
                     "PolynomialSchedule",
                     schedule_timesteps=100,
                     initial_p=2.0,
                     final_p=0.5,
                     power=2.0,
                     framework=fw))
            if fw == "tf":
                tf.enable_eager_execution()
            for t in ts:
                out = polynomial(t)
                check(out, 0.5 + (2.0 - 0.5) * (1.0 - t / 100)**2, decimals=4)
Example #13
    def test_linear_schedule(self):
        ts = [0, 50, 10, 100, 90, 2, 1, 99, 23, 1000]
        expected = [2.1 - (min(t, 100) / 100) * (2.1 - 0.6) for t in ts]
        config = {"schedule_timesteps": 100, "initial_p": 2.1, "final_p": 0.6}

        for fw in framework_iterator(
                frameworks=["tf2", "tf", "tfe", "torch", None]):
            linear = from_config(LinearSchedule, config, framework=fw)
            for t, e in zip(ts, expected):
                out = linear(t)
                check(out, e, decimals=4)

            ts_as_tensors = self._get_framework_tensors(ts, fw)
            for t, e in zip(ts_as_tensors, expected):
                out = linear(t)
                assert fw != "tf" or isinstance(out, tf.Tensor)
                check(out, e, decimals=4)
Example #14
File: test_schedules.py  Project: w0617/ray
    def test_exponential_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
        config = dict(initial_p=2.0, decay_rate=0.99, schedule_timesteps=100)
        for fw in ["tf", "torch", None]:
            config["framework"] = fw
            exponential = from_config(ExponentialSchedule, config)
            for t in ts:
                out = exponential(t)
                check(out, 2.0 * 0.99**(t / 100), decimals=4)

        # Test eager as well.
        with eager_mode():
            config["framework"] = "tf"
            exponential = from_config(ExponentialSchedule, config)
            for t in ts:
                out = exponential(t)
                check(out, 2.0 * 0.99**(t / 100), decimals=4)
Example #15
File: test_schedules.py  Project: w0617/ray
    def test_constant_schedule(self):
        value = 2.3
        ts = [100, 0, 10, 2, 3, 4, 99, 56, 10000, 23, 234, 56]

        config = {"value": value}

        for fw in ["tf", "torch", None]:
            constant = from_config(ConstantSchedule, config, framework=fw)
            for t in ts:
                out = constant(t)
                check(out, value)

        # Test eager as well.
        with eager_mode():
            constant = from_config(ConstantSchedule, config, framework="tf")
            for t in ts:
                out = constant(t)
                check(out, value)
Example #16
    def test_constant_schedule(self):
        value = 2.3
        ts = [100, 0, 10, 2, 3, 4, 99, 56, 10000, 23, 234, 56]

        config = {"value": value}

        for fw in framework_iterator(
                frameworks=["tf2", "tf", "tfe", "torch", None]):
            constant = from_config(ConstantSchedule, config, framework=fw)
            for t in ts:
                out = constant(t)
                check(out, value)

            ts_as_tensors = self._get_framework_tensors(ts, fw)
            for t in ts_as_tensors:
                out = constant(t)
                assert fw != "tf" or isinstance(out, tf.Tensor)
                check(out, value, decimals=4)
Example #17
    def test_piecewise_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 27]
        expected = [50.0, 60.0, 70.0, 14.5, 14.5, 54.0, 52.0, 14.5, 140.0]
        config = dict(endpoints=[(0, 50.0), (25, 100.0), (30, 200.0)],
                      outside_value=14.5)

        for fw in framework_iterator(
                frameworks=["tf2", "tf", "tfe", "torch", None]):
            piecewise = from_config(PiecewiseSchedule, config, framework=fw)
            for t, e in zip(ts, expected):
                out = piecewise(t)
                check(out, e, decimals=4)

            ts_as_tensors = self._get_framework_tensors(ts, fw)
            for t, e in zip(ts_as_tensors, expected):
                out = piecewise(t)
                assert fw != "tf" or isinstance(out, tf.Tensor)
                check(out, e, decimals=4)
Example #18
    def test_piecewise_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 27]
        expected = [50.0, 60.0, 70.0, 14.5, 14.5, 54.0, 52.0, 14.5, 140.0]
        config = dict(endpoints=[(0, 50.0), (25, 100.0), (30, 200.0)],
                      outside_value=14.5)
        for fw in ["tf", "torch", None]:
            config["framework"] = fw
            piecewise = from_config(PiecewiseSchedule, config)
            for t, e in zip(ts, expected):
                out = piecewise(t)
                check(out, e, decimals=4)

        # Test eager as well.
        with eager_mode():
            config["framework"] = "tf"
            piecewise = from_config(PiecewiseSchedule, config)
            for t, e in zip(ts, expected):
                out = piecewise(t)
                check(out, e, decimals=4)
Example #19
def do_test_explorations(
    run, env, config, dummy_obs, prev_a=None, expected_mean_action=None
):
    """Calls an Agent's `compute_actions` with different `explore` options."""

    core_config = config.copy()
    if run not in [a3c.A3CTrainer]:
        core_config["num_workers"] = 0

    # Test all frameworks.
    for _ in framework_iterator(core_config):
        print("Agent={}".format(run))

        # Test for both the default Agent's exploration AND the `Random`
        # exploration class.
        for exploration in [None, "Random"]:
            local_config = core_config.copy()
            if exploration == "Random":
                # TODO(sven): Random doesn't work for IMPALA yet.
                if run is impala.ImpalaTrainer:
                    continue
                local_config["exploration_config"] = {"type": "Random"}
            print("exploration={}".format(exploration or "default"))

            trainer = run(config=local_config, env=env)

            # Make sure all actions drawn are the same, given same
            # observations.
            actions = []
            for _ in range(25):
                actions.append(
                    trainer.compute_single_action(
                        observation=dummy_obs,
                        explore=False,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None,
                    )
                )
                check(actions[-1], actions[0])

            # Make sure actions drawn are different
            # (around some mean value), given constant observations.
            actions = []
            for _ in range(500):
                actions.append(
                    trainer.compute_single_action(
                        observation=dummy_obs,
                        explore=True,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None,
                    )
                )
            check(
                np.mean(actions),
                expected_mean_action if expected_mean_action is not None else 0.5,
                atol=0.4,
            )
            # Check that the stddev is not 0.0 (values differ).
            check(np.std(actions), 0.0, false=True)
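A hypothetical invocation of the helper in Example #19, for illustration only. The dummy observation and previous action mirror the CartPole values used in Example #23 below; the trainer, config, and expected mean are assumptions, not taken from the original test suite:

import numpy as np
import ray.rllib.agents.ppo as ppo

do_test_explorations(
    run=ppo.PPOTrainer,
    env="CartPole-v0",
    config=ppo.DEFAULT_CONFIG.copy(),
    dummy_obs=np.array([0.0, 0.1, 0.0, 0.0]),
    prev_a=np.array(0),
    expected_mean_action=0.5,
)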
Example #20
    def test_polynomial_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
        config = dict(type="ray.rllib.utils.schedules.polynomial_schedule."
                      "PolynomialSchedule",
                      schedule_timesteps=100,
                      initial_p=2.0,
                      final_p=0.5,
                      power=2.0)
        for fw in ["tf", "torch", None]:
            config["framework"] = fw
            polynomial = from_config(config)
            for t in ts:
                out = polynomial(t)
                check(out, 0.5 + (2.0 - 0.5) * (1.0 - t / 100)**2, decimals=4)

        # Test eager as well.
        with eager_mode():
            config["framework"] = "tf"
            polynomial = from_config(config)
            for t in ts:
                out = polynomial(t)
                check(out, 0.5 + (2.0 - 0.5) * (1.0 - t / 100)**2, decimals=4)
Example #21
def test_explorations(run,
                      env,
                      config,
                      dummy_obs,
                      prev_a=None,
                      expected_mean_action=None):
    """Calls an Agent's `compute_actions` with different `explore` options."""

    config = config.copy()
    if run not in [a3c.A3CTrainer]:
        config["num_workers"] = 0

    # Test all frameworks.
    for fw in ["torch", "eager", "tf"]:
        if fw == "torch" and \
                run in [dqn.DQNTrainer, dqn.SimpleQTrainer,
                        impala.ImpalaTrainer, sac.SACTrainer]:
            continue
        print("Testing {} in framework={}".format(run, fw))
        config["eager"] = True if fw == "eager" else False
        config["use_pytorch"] = True if fw == "torch" else False

        # Test for both the default Agent's exploration AND the `Random`
        # exploration class.
        for exploration in [None]:  # , "Random"]:
            if exploration == "Random":
                config["exploration_config"] = {"type": "Random"}

            trainer = run(config=config, env=env)

            # Make sure all actions drawn are the same, given same
            # observations.
            actions = []
            for _ in range(100):
                actions.append(
                    trainer.compute_action(
                        observation=dummy_obs,
                        explore=False,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None))
                check(actions[-1], actions[0])

            # Make sure actions drawn are different (around some mean value),
            # given constant observations.
            actions = []
            for _ in range(100):
                actions.append(
                    trainer.compute_action(
                        observation=dummy_obs,
                        explore=True,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None))
            check(
                np.mean(actions),
                expected_mean_action
                if expected_mean_action is not None else 0.5,
                atol=0.3)
            # Check that the stddev is not 0.0 (values differ).
            check(np.std(actions), 0.0, false=True)
Example #22
    def test_exponential_schedule(self):
        decay_rate = 0.2
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
        expected = [2.0 * decay_rate**(t / 100) for t in ts]
        config = dict(initial_p=2.0,
                      decay_rate=decay_rate,
                      schedule_timesteps=100)

        for fw in framework_iterator(
                frameworks=["tf2", "tf", "tfe", "torch", None]):
            exponential = from_config(ExponentialSchedule,
                                      config,
                                      framework=fw)
            for t, e in zip(ts, expected):
                out = exponential(t)
                check(out, e, decimals=4)

            ts_as_tensors = self._get_framework_tensors(ts, fw)
            for t, e in zip(ts_as_tensors, expected):
                out = exponential(t)
                assert fw != "tf" or isinstance(out, tf.Tensor)
                check(out, e, decimals=4)
Example #23
    def test_no_curiosity(self):
        # Copy to avoid mutating the shared default config.
        config = ppo.DEFAULT_CONFIG.copy()
        env = "CartPole-v0"
        dummy_obs = np.array([0.0, 0.1, 0.0, 0.0])
        prev_a = np.array(0)
        config["framework"] = "torch"
        config["exploration_config"] = {"type": "ParameterNoise"}

        trainer = ppo.PPOTrainer(config=config, env=env)
        trainer.train()

        # Make sure all actions drawn are the same, given same
        # observations. Tests the explorations API.

        actions = []
        for _ in range(5):
            actions.append(
                trainer.compute_action(
                    observation=dummy_obs,
                    explore=False,
                    prev_action=prev_a,
                    prev_reward=1.0 if prev_a is not None else None))
            check(actions[-1], actions[0])
        print(actions)
Example #24
File: test_pg.py  Project: yosagi/ray
    def test_pg_loss_functions(self):
        """Tests the PG loss function math."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["eager"] = True
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        # Fake CartPole episode of n time steps.
        train_batch = {
            SampleBatch.CUR_OBS:
            np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                      [0.9, 1.0, 1.1, 1.2]]),
            SampleBatch.ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.REWARDS:
            np.array([1.0, 1.0, 1.0]),
            SampleBatch.DONES:
            np.array([False, False, True])
        }

        # tf.
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        vars = policy.model.trainable_variables()

        # Post-process (calculate simple (non-GAE) advantages) and attach to
        # train_batch dict.
        # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
        # [2.9701, 1.99, 1.0]
        train_batch = pg.post_process_advantages(policy, train_batch)
        # Check Advantage values.
        check(train_batch[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

        # Actual loss results.
        results = pg.pg_tf_loss(policy,
                                policy.model,
                                dist_class=Categorical,
                                train_batch=train_batch)

        # Calculate expected results.
        expected_logits = fc(
            fc(train_batch[SampleBatch.CUR_OBS], vars[0].numpy(),
               vars[1].numpy()), vars[2].numpy(), vars[3].numpy())
        expected_logp = Categorical(expected_logits, policy.model).logp(
            train_batch[SampleBatch.ACTIONS])
        expected_loss = -np.mean(
            expected_logp * train_batch[Postprocessing.ADVANTAGES])
        check(results.numpy(), expected_loss, decimals=4)

        # Torch.
        config["use_pytorch"] = True
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        train_batch = policy._lazy_tensor_dict(train_batch)
        results = pg.pg_torch_loss(policy,
                                   policy.model,
                                   dist_class=TorchCategorical,
                                   train_batch=train_batch)
        expected_logits = policy.model.last_output()
        expected_logp = TorchCategorical(expected_logits, policy.model).logp(
            train_batch[SampleBatch.ACTIONS])
        expected_loss = -np.mean(
            expected_logp.detach().numpy() *
            train_batch[Postprocessing.ADVANTAGES].numpy())
        check(results.detach().numpy(), expected_loss, decimals=4)
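The advantage values [2.9701, 1.99, 1.0] checked in Example #24 are plain discounted returns-to-go with gamma = 0.99 (the test's own comment notes these are simple, non-GAE advantages). A short hand-check of that arithmetic:

gamma = 0.99
rewards = [1.0, 1.0, 1.0]
returns = []
running = 0.0
for r in reversed(rewards):
    running = r + gamma * running  # return-to-go, computed backwards
    returns.insert(0, running)
# [1.0 + 0.99 * (1.0 + 0.99), 1.0 + 0.99, 1.0] == [2.9701, 1.99, 1.0]
assert returns == [1.0 + gamma * (1.0 + gamma), 1.0 + gamma, 1.0]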
Example #25
def do_test_explorations(run,
                         env,
                         config,
                         dummy_obs,
                         prev_a=None,
                         expected_mean_action=None):
    """Calls an Agent's `compute_actions` with different `explore` options."""

    config = config.copy()
    if run not in [a3c.A3CTrainer]:
        config["num_workers"] = 0

    # Test all frameworks.
    for fw in ["tf", "eager", "torch"]:
        if fw == "torch" and \
                run in [ddpg.DDPGTrainer, dqn.DQNTrainer, dqn.SimpleQTrainer,
                        impala.ImpalaTrainer, sac.SACTrainer, td3.TD3Trainer]:
            continue
        elif fw == "eager" and run in [ddpg.DDPGTrainer, td3.TD3Trainer]:
            continue

        print("Testing {} in framework={}".format(run, fw))
        config["eager"] = fw == "eager"
        config["use_pytorch"] = fw == "torch"

        # Test for both the default Agent's exploration AND the `Random`
        # exploration class.
        for exploration in [None, "Random"]:
            if exploration == "Random":
                # TODO(sven): Random doesn't work for IMPALA yet.
                if run is impala.ImpalaTrainer:
                    continue
                config["exploration_config"] = {"type": "Random"}
            print("exploration={}".format(exploration or "default"))

            eager_ctx = None
            if fw == "eager":
                eager_ctx = eager_mode()
                eager_ctx.__enter__()
                assert tf.executing_eagerly()
            elif fw == "tf":
                assert not tf.executing_eagerly()

            trainer = run(config=config, env=env)

            # Make sure all actions drawn are the same, given same
            # observations.
            actions = []
            for _ in range(50):
                actions.append(
                    trainer.compute_action(
                        observation=dummy_obs,
                        explore=False,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None))
                check(actions[-1], actions[0])

            # Make sure actions drawn are different
            # (around some mean value), given constant observations.
            actions = []
            for _ in range(100):
                actions.append(
                    trainer.compute_action(
                        observation=dummy_obs,
                        explore=True,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None))
            check(
                np.mean(actions),
                expected_mean_action
                if expected_mean_action is not None else 0.5,
                atol=0.3)
            # Check that the stddev is not 0.0 (values differ).
            check(np.std(actions), 0.0, false=True)

            if eager_ctx:
                eager_ctx.__exit__(None, None, None)
Example #26
File: test_pg.py  Project: zivzone/ray
    def test_pg_loss_functions(self):
        """Tests the PG loss function math."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        # Fake CartPole episode of n time steps.
        train_batch = SampleBatch({
            SampleBatch.OBS:
            np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                      [0.9, 1.0, 1.1, 1.2]]),
            SampleBatch.ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.REWARDS:
            np.array([1.0, 1.0, 1.0]),
            SampleBatch.DONES:
            np.array([False, False, True]),
            SampleBatch.EPS_ID:
            np.array([1234, 1234, 1234]),
            SampleBatch.AGENT_INDEX:
            np.array([0, 0, 0]),
        })

        for fw, sess in framework_iterator(config, session=True):
            dist_cls = (Categorical if fw != "torch" else TorchCategorical)
            trainer = pg.PGTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()
            vars = policy.model.trainable_variables()
            if sess:
                vars = policy.get_session().run(vars)

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to train_batch dict.
            # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
            # [2.9701, 1.99, 1.0]
            train_batch_ = pg.post_process_advantages(policy,
                                                      train_batch.copy())
            if fw == "torch":
                train_batch_ = policy._lazy_tensor_dict(train_batch_)

            # Check Advantage values.
            check(train_batch_[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

            # Actual loss results.
            if sess:
                results = policy.get_session().run(
                    policy._loss,
                    feed_dict=policy._get_loss_inputs_dict(train_batch_,
                                                           shuffle=False))
            else:
                results = (pg.pg_tf_loss if fw in ["tf2", "tfe"] else
                           pg.pg_torch_loss)(policy,
                                             policy.model,
                                             dist_class=dist_cls,
                                             train_batch=train_batch_)

            # Calculate expected results.
            if fw != "torch":
                expected_logits = fc(fc(train_batch_[SampleBatch.OBS],
                                        vars[0],
                                        vars[1],
                                        framework=fw),
                                     vars[2],
                                     vars[3],
                                     framework=fw)
            else:
                expected_logits = fc(fc(train_batch_[SampleBatch.OBS],
                                        vars[2],
                                        vars[3],
                                        framework=fw),
                                     vars[0],
                                     vars[1],
                                     framework=fw)
            expected_logp = dist_cls(expected_logits, policy.model).logp(
                train_batch_[SampleBatch.ACTIONS])
            adv = train_batch_[Postprocessing.ADVANTAGES]
            if sess:
                expected_logp = sess.run(expected_logp)
            elif fw == "torch":
                expected_logp = expected_logp.detach().cpu().numpy()
                adv = adv.detach().cpu().numpy()
            else:
                expected_logp = expected_logp.numpy()
            expected_loss = -np.mean(expected_logp * adv)
            check(results, expected_loss, decimals=4)