def test_rollout_with_nones(self):
    """Check the advantages of a 3-step, 3-env rollout with zeroed state flags.

    Three batches of three states are stored with rewards 0, 1 and 2. The
    flag tensor handed to ``State`` is one everywhere except at indices 5, 7
    and 9. The states and advantages returned by the buffer are compared with
    hand-written expectations run through ``_compute_expected_advantages``.
    """
    n_steps, n_envs = 3, 3
    buffer = NStepAdvantageBuffer(
        self.v, self.features, n_steps, n_envs, discount_factor=0.5
    )

    # NOTE(review): second positional argument of State; presumably a
    # per-state done/mask flag -- confirm against the State constructor.
    flags = torch.ones(12)
    flags[[5, 7, 9]] = 0
    all_states = State(torch.arange(0, 12).unsqueeze(1), flags)

    # Step k stores states [3k, 3k + 3) with a constant reward of k.
    step_actions = torch.ones(n_envs)
    for step in range(n_steps):
        batch = all_states[step * n_envs:(step + 1) * n_envs]
        buffer.store(batch, step_actions, step * torch.ones(n_envs))

    rollout_states, _, advantages = buffer.advantages(all_states[9:12])

    expected_states = State(torch.arange(0, 9).unsqueeze(1), flags[0:9])
    next_flags = torch.zeros(9)
    next_flags[[5, 7, 8]] = 1
    expected_next_states = State(
        torch.tensor([9, 7, 5, 9, 7, 11, 9, 10, 11]).unsqueeze(1),
        next_flags,
    )
    expected_returns = torch.tensor([1, 0.5, 0, 2, 1, 2, 2, 2, 2]).float()
    expected_lengths = torch.tensor([3, 2, 1, 2, 1, 2, 1, 1, 1]).float()

    self.assert_states_equal(rollout_states, expected_states)
    expected_advantages = self._compute_expected_advantages(
        expected_states,
        expected_returns,
        expected_next_states,
        expected_lengths,
    )
    tt.assert_allclose(advantages, expected_advantages)
def _make_buffer(self):
    """Return a new NStepAdvantageBuffer built from this test case's attributes.

    Reads ``self.v``, ``self.features``, ``self.n_steps``, ``self.n_envs`` and
    ``self.discount_factor``; a fresh buffer is constructed on every call.
    """
    value_fn, feature_net = self.v, self.features
    step_count, env_count = self.n_steps, self.n_envs
    buffer = NStepAdvantageBuffer(
        value_fn,
        feature_net,
        step_count,
        env_count,
        discount_factor=self.discount_factor,
    )
    return buffer
def test_rollout(self):
    """Check the advantages of a single 2-step rollout over 3 environments.

    Two batches of three states are stored with rewards 0 and 1. The states
    and advantages returned by the buffer are compared with hand-written
    expectations run through ``_compute_expected_advantages``.
    """
    n_steps, n_envs = 2, 3
    buffer = NStepAdvantageBuffer(
        self.v, self.features, n_steps, n_envs, discount_factor=0.5
    )
    step_actions = torch.ones(n_envs)
    all_states = State(torch.arange(0, 12).unsqueeze(1))

    # Step k stores states [3k, 3k + 3) with a constant reward of k.
    for step in range(n_steps):
        batch = all_states[step * n_envs:(step + 1) * n_envs]
        buffer.store(batch, step_actions, step * torch.ones(n_envs))

    rollout_states, _, advantages = buffer.advantages(all_states[6:9])

    expected_states = State(torch.arange(0, 6).unsqueeze(1))
    # Both stored steps are expected to use states 6..8 as their next states.
    expected_next_states = State(
        torch.tensor([6, 7, 8, 6, 7, 8]).unsqueeze(1)
    )
    expected_returns = torch.tensor([0.5, 0.5, 0.5, 1, 1, 1]).float()
    expected_lengths = torch.tensor([2., 2, 2, 1, 1, 1])

    self.assert_states_equal(rollout_states, expected_states)
    expected_advantages = self._compute_expected_advantages(
        expected_states,
        expected_returns,
        expected_next_states,
        expected_lengths,
    )
    tt.assert_allclose(advantages, expected_advantages)
def test_multi_rollout(self):
    """Check that the buffer yields correct advantages for two rollouts in a row.

    A 2-step, 2-env buffer is filled and drained twice. In both rounds every
    stored reward is 1, so the expected returns and lengths are the same; only
    the expected states and next states move forward by four.

    The actions returned by ``buffer.advantages`` are deliberately discarded
    (``_``). Binding them to ``actions`` would feed the buffer's own output
    back into ``buffer.store`` in the second round, instead of the
    per-environment ``torch.ones(2)`` tensor used in the first round.
    """
    buffer = NStepAdvantageBuffer(self.v, self.features, 2, 2, discount_factor=0.5)
    raw_states = StateArray(torch.arange(0, 12).unsqueeze(1).float(), (12,))
    actions = torch.ones(2)

    # First rollout: states 0..3 stored, states 4..5 passed to advantages().
    buffer.store(raw_states[0:2], actions, torch.ones(2))
    buffer.store(raw_states[2:4], actions, torch.ones(2))
    states, _, advantages = buffer.advantages(raw_states[4:6])

    expected_states = StateArray(torch.arange(0, 4).unsqueeze(1).float(), (4,))
    expected_returns = torch.tensor([1.5, 1.5, 1, 1])
    expected_next_states = StateArray(
        torch.tensor([4., 5, 4, 5]).unsqueeze(1), (4,)
    )
    expected_lengths = torch.tensor([2., 2, 1, 1])
    self.assert_states_equal(states, expected_states)
    tt.assert_allclose(
        advantages,
        self._compute_expected_advantages(
            expected_states,
            expected_returns,
            expected_next_states,
            expected_lengths,
        ),
    )

    # Second rollout: states 4..7 stored, states 8..9 passed to advantages().
    buffer.store(raw_states[4:6], actions, torch.ones(2))
    buffer.store(raw_states[6:8], actions, torch.ones(2))
    states, _, advantages = buffer.advantages(raw_states[8:10])

    expected_states = StateArray(torch.arange(4, 8).unsqueeze(1).float(), (4,))
    expected_returns = torch.tensor([1.5, 1.5, 1, 1])
    expected_next_states = StateArray(
        torch.tensor([8, 9, 8, 9]).unsqueeze(1).float(), (4,)
    )
    expected_lengths = torch.tensor([2., 2, 1, 1])
    self.assert_states_equal(states, expected_states)
    tt.assert_allclose(
        advantages,
        self._compute_expected_advantages(
            expected_states,
            expected_returns,
            expected_next_states,
            expected_lengths,
        ),
    )