Example #1
import numpy as np

# LinearFeatureBaseline and discount_cumsum come from the surrounding RL
# library (garage/rllab-style); they are not defined in this snippet.
def calculate_gae_advantages(paths, discount, gae_lambda):
    baseline = LinearFeatureBaseline()

    # Empirical discounted returns serve as regression targets for the baseline.
    for path in paths:
        path["returns"] = discount_cumsum(path["rewards"], discount)

    # Fit the value baseline on all paths, then predict V(s_t) for each path.
    baseline.fit(paths)
    all_path_baselines = [baseline.predict(path) for path in paths]

    for idx, path in enumerate(paths):
        # Append V = 0 for the terminal state so indexing t+1 works.
        path_baselines = np.append(all_path_baselines[idx], 0)
        # TD residuals: delta_t = r_t + discount * V(s_{t+1}) - V(s_t).
        deltas = (path["rewards"]
                  + discount * path_baselines[1:]
                  - path_baselines[:-1])
        # GAE: advantages are the (discount * lambda)-discounted cumsum of deltas.
        path["advantages"] = discount_cumsum(deltas, discount * gae_lambda)
    return paths
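
All of these examples lean on the same discount_cumsum helper, which the snippets never define. For reference, here is a minimal sketch of the implementation commonly found in rllab/RLlib-style codebases (the scipy.signal.lfilter trick); treat it as an illustration, not necessarily the exact source used by these examples:

import numpy as np
import scipy.signal


def discount_cumsum(x, gamma):
    """Discounted cumulative sum: y[t] = sum_{k >= t} gamma**(k - t) * x[k]."""
    # Run a first-order IIR filter over the reversed input, then reverse back:
    # y[n] = x[n] + gamma * y[n - 1].
    return scipy.signal.lfilter([1], [1, float(-gamma)], x[::-1], axis=0)[::-1]


# Quick check: discount_cumsum(np.array([1.0, 1.0, 1.0]), 0.5)
# -> [1.75, 1.5, 1.0]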
Example #2
    # SampleBatch, adjust_nstep, discount_cumsum, and check are RLlib helpers
    # (ray.rllib); np is numpy.
    def test_n_step_4(self):
        """Tests whether n-step adjustments of trajectories work."""
        # n-step = 4
        gamma = 0.99
        obs = np.arange(0, 7)
        actions = np.random.randint(-1, 3, size=(7,))
        check_actions = actions.copy()
        rewards = [10.0, 0.0, 100.0, 50.0, 60.0, 10.0, 100.0]
        dones = [False, False, False, False, False, False, True]
        next_obs = np.arange(1, 8)
        batch = SampleBatch({
            SampleBatch.OBS: obs,
            SampleBatch.ACTIONS: actions,
            SampleBatch.REWARDS: rewards,
            SampleBatch.DONES: dones,
            SampleBatch.NEXT_OBS: next_obs,
        })
        # adjust_nstep rewrites the batch in place.
        adjust_nstep(4, gamma, batch)
        check(batch[SampleBatch.OBS], [0, 1, 2, 3, 4, 5, 6])
        check(batch[SampleBatch.ACTIONS], check_actions)
        check(batch[SampleBatch.NEXT_OBS], [4, 5, 6, 7, 7, 7, 7])
        # Every step whose 4-step lookahead reaches the terminal step
        # is now marked done.
        check(batch[SampleBatch.DONES],
              [False, False, False, True, True, True, True])
        # Each reward becomes the discounted sum of (up to) the next 4 rewards.
        check(batch[SampleBatch.REWARDS], [
            discount_cumsum(np.array(rewards[0:4]), gamma)[0],
            discount_cumsum(np.array(rewards[1:5]), gamma)[0],
            discount_cumsum(np.array(rewards[2:6]), gamma)[0],
            discount_cumsum(np.array(rewards[3:7]), gamma)[0],
            discount_cumsum(np.array(rewards[4:]), gamma)[0],
            discount_cumsum(np.array(rewards[5:]), gamma)[0],
            discount_cumsum(np.array(rewards[6:]), gamma)[0],
        ])
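
As a sanity check on the expected rewards in this test, the first adjusted reward is just the 4-step discounted sum of the raw rewards, which is easy to verify by hand (plain Python, no RLlib needed):

gamma = 0.99
rewards = [10.0, 0.0, 100.0, 50.0, 60.0, 10.0, 100.0]

# 4-step return from t = 0: r0 + g*r1 + g^2*r2 + g^3*r3
r0 = sum(gamma ** k * r for k, r in enumerate(rewards[0:4]))
print(r0)  # 10 + 0 + 98.01 + 48.51495 = 156.52495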
Example #3
    def test_n_step_from_same_obs_source_array(self):
        """Tests whether n-step also works on a shared obs/new-obs array."""
        gamma = 0.99
        # The underlying observation data. Both obs and next_obs will
        # be references into that same np.array.
        underlying_obs = np.arange(0, 8)
        obs = underlying_obs[:7]
        next_obs = underlying_obs[1:]

        actions = np.random.randint(-1, 3, size=(7,))
        check_actions = actions.copy()
        rewards = [10.0, 0.0, 100.0, 50.0, 60.0, 10.0, 100.0]
        dones = [False, False, False, False, False, False, True]

        batch = SampleBatch(
            {
                SampleBatch.OBS: obs,
                SampleBatch.ACTIONS: actions,
                SampleBatch.REWARDS: rewards,
                SampleBatch.DONES: dones,
                SampleBatch.NEXT_OBS: next_obs,
            }
        )
        adjust_nstep(4, gamma, batch)

        check(batch[SampleBatch.OBS], [0, 1, 2, 3, 4, 5, 6])
        check(batch[SampleBatch.ACTIONS], check_actions)
        check(batch[SampleBatch.NEXT_OBS], [4, 5, 6, 7, 7, 7, 7])
        check(batch[SampleBatch.DONES], [False, False, False, True, True, True, True])
        check(
            batch[SampleBatch.REWARDS],
            [
                discount_cumsum(np.array(rewards[0:4]), gamma)[0],
                discount_cumsum(np.array(rewards[1:5]), gamma)[0],
                discount_cumsum(np.array(rewards[2:6]), gamma)[0],
                discount_cumsum(np.array(rewards[3:7]), gamma)[0],
                discount_cumsum(np.array(rewards[4:]), gamma)[0],
                discount_cumsum(np.array(rewards[5:]), gamma)[0],
                discount_cumsum(np.array(rewards[6:]), gamma)[0],
            ],
        )
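
The point of this variant is that obs and next_obs are overlapping views into one buffer, so adjust_nstep has to rewrite NEXT_OBS without corrupting OBS through the shared memory. The aliasing itself is plain numpy behavior, easy to demonstrate in isolation:

import numpy as np

underlying_obs = np.arange(0, 8)
obs = underlying_obs[:7]       # view: [0, 1, 2, 3, 4, 5, 6]
next_obs = underlying_obs[1:]  # view: [1, 2, 3, 4, 5, 6, 7]

# Both slices are views sharing underlying_obs's memory ...
assert obs.base is underlying_obs and next_obs.base is underlying_obs

# ... so a write through one view is visible through the other.
next_obs[0] = 99
print(obs[1])  # 99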
Example #4
# An RLlib-style postprocess_fn: appends empirical discounted returns to a
# completed trajectory. The discount factor is hardcoded to 0.99 here.
def calculate_advantages(policy,
                         sample_batch,
                         other_agent_batches=None,
                         episode=None):
    sample_batch["returns"] = discount_cumsum(sample_batch["rewards"], 0.99)
    return sample_batch
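
A minimal standalone check of what this postprocessing function produces, assuming discount_cumsum is in scope (e.g. the sketch after Example #1, or RLlib's own helper). The policy, other_agent_batches, and episode arguments are unused by the function, so None is fine for a quick test:

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch

batch = SampleBatch({"rewards": np.array([1.0, 1.0, 1.0])})
out = calculate_advantages(None, batch)
print(out["returns"])  # [2.9701, 1.99, 1.0] for gamma = 0.99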