def test_final_state_from_episode(): env = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0) env_spec = EnvSpec(env) with pytest.raises(AssertionError): final_state_from_episode([1, 2, 3]) D = BatchEpisode(env_spec) D.obs[0] = [0.1, 0.2, 1.3] D.done[0] = [False, False, True] D.info[0] = [{}, {}, {'terminal_observation': 0.3}] D.obs[1] = [1, 2, 3, 4, 5, 6, 7, 8, 9] D.done[1] = [False] * 9 D.obs[2] = [10, 15] D.done[2] = [False, True] D.info[2] = [{}, {'terminal_observation': 20}] final_states = final_state_from_episode(D) assert final_states.shape == (3, ) + env_spec.observation_space.shape assert np.allclose(final_states[0], 0.3) assert np.allclose(final_states[1], 9) assert np.allclose(final_states[2], 20) with pytest.raises(AssertionError): final_state_from_segment(D)
def test_returns_from_episode(): env = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0) env_spec = EnvSpec(env) D = BatchEpisode(env_spec) D.r[0] = [1, 2, 3] D.done[0] = [False, False, True] D.r[1] = [1, 2, 3, 4, 5] D.done[1] = [False, False, False, False, False] D.r[2] = [1, 2, 3, 4, 5, 6, 7, 8] D.done[2] = [False, False, False, False, False, False, False, True] out = returns_from_episode(D, 1.0) assert out.shape == (3, D.maxT) assert np.allclose(out[0], [6, 5, 3, 0, 0, 0, 0, 0]) assert np.allclose(out[1], [15, 14, 12, 9, 5, 0, 0, 0]) assert np.allclose(out[2], [36, 35, 33, 30, 26, 21, 15, 8]) del out out = returns_from_episode(D, 0.1) assert out.shape == (3, D.maxT) assert np.allclose(out[0], [1.23, 2.3, 3, 0, 0, 0, 0, 0]) assert np.allclose(out[1], [1.2345, 2.345, 3.45, 4.5, 5, 0, 0, 0]) assert np.allclose( out[2], [1.2345678, 2.345678, 3.45678, 4.5678, 5.678, 6.78, 7.8, 8]) with pytest.raises(AssertionError): returns_from_segment(D, 0.1)
def test_gae_from_episode(): env = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0) env_spec = EnvSpec(env) D = BatchEpisode(env_spec) D.r[0] = [1, 2, 3] D.done[0] = [False, False, True] D.completed[0] = True D.r[1] = [1, 2, 3, 4, 5] D.done[1] = [False, False, False, False, False] D.completed[1] = False D.r[2] = [1, 2, 3, 4, 5, 6, 7, 8] D.done[2] = [False, False, False, False, False, False, False, True] D.completed[2] = True all_Vs = [ torch.tensor([[0.1], [0.5], [1.0]]), torch.tensor([[1.1], [1.5], [2.0]]), torch.tensor([[2.1], [2.5], [3.0]]), torch.tensor([[3.1], [3.5], [4.0]]), torch.tensor([[4.1], [4.5], [5.0]]), torch.tensor([[5.1], [5.5], [6.0]]), torch.tensor([[6.1], [6.5], [7.0]]), torch.tensor([[7.1], [7.5], [8.0]]) ] last_Vs = torch.tensor([10, 20, 30]).unsqueeze(1) all_Vs = torch.stack(all_Vs, 1) out = gae_from_episode(D, all_Vs, last_Vs, 1.0, 0.5) assert out.shape == (3, D.maxT) assert np.allclose(out[0], [3.725, 3.45, 0.9, 0, 0, 0, 0, 0]) assert np.allclose(out[1], [6.40625, 8.8125, 11.625, 15.25, 20.5, 0, 0, 0]) assert np.allclose(out[2], [5.84375, 7.6875, 9.375, 10.75, 11.5, 11., 8, 0.]) del out out = gae_from_episode(D, all_Vs, last_Vs, 0.1, 0.2) assert out.shape == (3, D.maxT) assert np.allclose(out[0], [1.03256, 1.128, 0.9, 0, 0, 0, 0, 0]) assert np.allclose(out[1], [0.665348, 0.7674, 0.87, 1, 2.5, 0, 0, 0]) assert np.allclose(out[2], [ 0.206164098, 0.308204915, 0.410245728, 0.5122864, 0.61432, 0.716, 0.8, 0 ]) with pytest.raises(AssertionError): gae_from_segment(D, all_Vs, last_Vs, 0.1, 0.2)
def test_td0_error_from_episode(): env = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0) env_spec = EnvSpec(env) D = BatchEpisode(env_spec) D.r[0] = [1, 2, 3] D.done[0] = [False, False, True] D.completed[0] = True D.r[1] = [1, 2, 3, 4, 5] D.done[1] = [False, False, False, False, False] D.completed[1] = False D.r[2] = [1, 2, 3, 4, 5, 6, 7, 8] D.done[2] = [False, False, False, False, False, False, False, True] D.completed[2] = True all_Vs = [ torch.tensor([[0.1], [0.5], [1.0]]), torch.tensor([[1.1], [1.5], [2.0]]), torch.tensor([[2.1], [2.5], [3.0]]), torch.tensor([[3.1], [3.5], [4.0]]), torch.tensor([[4.1], [4.5], [5.0]]), torch.tensor([[5.1], [5.5], [6.0]]), torch.tensor([[6.1], [6.5], [7.0]]), torch.tensor([[7.1], [7.5], [8.0]]) ] last_Vs = torch.tensor([10, 20, 30]).unsqueeze(1) all_Vs = torch.stack(all_Vs, 1) out = td0_error_from_episode(D, all_Vs, last_Vs, 1.0) assert out.shape == (3, D.maxT) assert np.allclose(out[0], [2.0, 3, 0.9, 0, 0, 0, 0, 0]) assert np.allclose(out[1], [2, 3, 4, 5, 20.5, 0, 0, 0]) assert np.allclose(out[2], [2, 3, 4, 5, 6, 7, 8, 0]) del out out = td0_error_from_episode(D, all_Vs, last_Vs, 0.1) assert out.shape == (3, D.maxT) assert np.allclose(out[0], [1.01, 1.11, 0.9, 0, 0, 0, 0, 0]) assert np.allclose(out[1], [0.65, 0.75, 0.85, 0.95, 2.5, 0, 0, 0]) assert np.allclose(out[2], [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0]) with pytest.raises(AssertionError): td0_error_from_segment(D, all_Vs, last_Vs, 0.1)
def test_bootstrapped_returns_from_episode(): env = make_vec_env(SerialVecEnv, make_gym_env, 'CartPole-v1', 3, 0) env_spec = EnvSpec(env) D = BatchEpisode(env_spec) D.r[0] = [1, 2, 3] D.done[0] = [False, False, True] D.completed[0] = True D.r[1] = [1, 2, 3, 4, 5] D.done[1] = [False, False, False, False, False] D.completed[1] = False D.r[2] = [1, 2, 3, 4, 5, 6, 7, 8] D.done[2] = [False, False, False, False, False, False, False, True] D.completed[2] = True last_Vs = torch.tensor([10, 20, 30]).unsqueeze(1) out = bootstrapped_returns_from_episode(D, last_Vs, 1.0) assert out.shape == (3, D.maxT) assert np.allclose(out[0], [6, 5, 3, 0, 0, 0, 0, 0]) assert np.allclose(out[1], [35, 34, 32, 29, 25, 0, 0, 0]) assert np.allclose(out[2], [36, 35, 33, 30, 26, 21, 15, 8]) del out out = bootstrapped_returns_from_episode(D, last_Vs, 0.1) assert out.shape == (3, D.maxT) assert np.allclose(out[0], [1.23, 2.3, 3, 0, 0, 0, 0, 0]) assert np.allclose(out[1], [1.2347, 2.347, 3.47, 4.7, 7, 0, 0, 0]) assert np.allclose( out[2], [1.2345678, 2.345678, 3.45678, 4.5678, 5.678, 6.78, 7.8, 8]) with pytest.raises(AssertionError): bootstrapped_returns_from_segment(D, last_Vs, 0.1)
def test_batch_episode(vec_env, env_id): env = make_vec_env(vec_env, make_gym_env, env_id, 3, 0) env_spec = EnvSpec(env) D = BatchEpisode(env_spec) if env_id == 'CartPole-v1': sticky_action = 1 action_shape = () action_dtype = np.int32 elif env_id == 'Pendulum-v0': sticky_action = [0.1] action_shape = env_spec.action_space.shape action_dtype = np.float32 obs = env.reset() D.add_observation(obs) for t in range(30): action = [sticky_action] * env.num_env obs, reward, done, info = env.step(action) D.add_observation(obs) D.add_action(action) D.add_reward(reward) D.add_done(done) D.add_info(info) D.add_batch_info({'V': [0.1 * (t + 1), (t + 1), 10 * (t + 1)]}) [D.set_completed(n) for n, d in enumerate(done) if d] assert D.N == 3 assert len(D.Ts) == 3 assert D.maxT == max(D.Ts) assert all([ isinstance(x, np.ndarray) for x in [ D.numpy_observations, D.numpy_actions, D.numpy_rewards, D.numpy_dones, D.numpy_masks ] ]) assert all([ x.dtype == np.float32 for x in [D.numpy_observations, D.numpy_rewards, D.numpy_masks] ]) assert all([ x.shape == (3, D.maxT) for x in [D.numpy_rewards, D.numpy_dones, D.numpy_masks] ]) assert D.numpy_actions.dtype == action_dtype assert D.numpy_dones.dtype == np.bool assert D.numpy_observations.shape == (3, D.maxT + 1) + env_spec.observation_space.shape assert D.numpy_actions.shape == (3, D.maxT) + action_shape assert isinstance(D.batch_infos, list) and len(D.batch_infos) == 30 assert np.allclose([0.1 * (x + 1) for x in range(30)], [info['V'][0] for info in D.batch_infos]) assert np.allclose([1 * (x + 1) for x in range(30)], [info['V'][1] for info in D.batch_infos]) assert np.allclose([10 * (x + 1) for x in range(30)], [info['V'][2] for info in D.batch_infos]) seeder = Seeder(0) seed1, seed2, seed3 = seeder(3) env1 = make_gym_env(env_id, seed1) env2 = make_gym_env(env_id, seed2) env3 = make_gym_env(env_id, seed3) for n, ev in enumerate([env1, env2, env3]): obs = ev.reset() assert np.allclose(obs, D.observations[n][0]) assert np.allclose(obs, D.numpy_observations[n, 0, ...]) for t in range(30): obs, reward, done, info = ev.step(sticky_action) assert np.allclose(reward, D.rewards[n][t]) assert np.allclose(reward, D.numpy_rewards[n, t]) assert np.allclose(done, D.dones[n][t]) assert done == D.numpy_dones[n, t] assert int(not done) == D.masks[n][t] assert int(not done) == D.numpy_masks[n, t] if done: assert np.allclose(obs, D.infos[n][t]['terminal_observation']) assert D.completes[n] assert np.allclose(0.0, D.numpy_observations[n, t + 1 + 1:, ...]) assert np.allclose(0.0, D.numpy_actions[n, t + 1:, ...]) assert np.allclose(0.0, D.numpy_rewards[n, t + 1:]) assert np.allclose(True, D.numpy_dones[n, t + 1:]) assert np.allclose(0.0, D.numpy_masks[n, t + 1:]) break else: assert np.allclose(obs, D.observations[n][t + 1])
def __call__(self, T): D = BatchEpisode(self.env_spec) obs = self.env.reset() D.add_observation(obs) self.agent.reset(self.config) # e.g. RNN initial states done = None # for RNN mask for t in range(T): if self.agent.recurrent and done is not None and any(done): kwargs = { 'mask': torch.from_numpy(np.logical_not(done).astype(np.float32)) } else: kwargs = {} out_agent = self.agent.choose_action(obs, **kwargs) action = out_agent.pop('action') if torch.is_tensor(action): raw_action = list(action.detach().cpu().numpy()) else: raw_action = action D.add_action(raw_action) obs, reward, done, info = self.env.step(raw_action) D.add_observation(obs) D.add_reward(reward) D.add_done(done) D.add_info(info) [D.set_completed(n) for n, d in enumerate(done) if d] # Record other information: e.g. log-probability of action, policy entropy D.add_batch_info(out_agent) if all(D.completes): break return D