import collections
import pickle

import numpy as np

# Import paths below assume garage's test-suite layout.
from garage.np.exploration_policies import EpsilonGreedyPolicy

from tests.fixtures.envs.dummy import DummyDiscreteEnv
from tests.fixtures.policies import SimplePolicy


class TestEpsilonGreedyPolicy:

    def setup_method(self):
        self.env = DummyDiscreteEnv()
        self.policy = SimplePolicy(env_spec=self.env)
        self.epsilon_greedy_policy = EpsilonGreedyPolicy(env_spec=self.env,
                                                         policy=self.policy,
                                                         total_timesteps=100,
                                                         max_epsilon=1.0,
                                                         min_epsilon=0.02,
                                                         decay_ratio=0.1)
        self.env.reset()

    def test_epsilon_greedy_policy(self):
        obs, _, _, _ = self.env.step(1)

        action, _ = self.epsilon_greedy_policy.get_action(obs)
        assert self.env.action_space.contains(action)

        # epsilon decays by 1 step, new epsilon = 1 - 0.098 = 0.902
        random_rate = np.random.random(
            100000) < self.epsilon_greedy_policy._epsilon()
        assert np.isclose([0.902], [sum(random_rate) / 100000], atol=0.01)

        actions, _ = self.epsilon_greedy_policy.get_actions([obs] * 5)

        # epsilon decays by 6 steps in total,
        # new epsilon = 1 - 6 * 0.098 = 0.412
        random_rate = np.random.random(
            100000) < self.epsilon_greedy_policy._epsilon()
        assert np.isclose([0.412], [sum(random_rate) / 100000], atol=0.01)

        for action in actions:
            assert self.env.action_space.contains(action)

    def test_set_param(self):
        params = self.epsilon_greedy_policy.get_param_values()
        params['total_env_steps'] = 6
        self.epsilon_greedy_policy.set_param_values(params)
        assert np.isclose(self.epsilon_greedy_policy._epsilon(), 0.412)

    def test_update(self):
        # Three episodes totaling 1 + 2 + 3 = 6 env steps.
        DummyBatch = collections.namedtuple('EpisodeBatch', ['lengths'])
        batch = DummyBatch(np.array([1, 2, 3]))
        self.epsilon_greedy_policy.update(batch)
        assert np.isclose(self.epsilon_greedy_policy._epsilon(), 0.412)

    def test_epsilon_greedy_policy_is_pickleable(self):
        obs, _, _, _ = self.env.step(1)
        for _ in range(5):
            self.epsilon_greedy_policy.get_action(obs)

        h_data = pickle.dumps(self.epsilon_greedy_policy)
        policy = pickle.loads(h_data)
        assert policy._epsilon() == self.epsilon_greedy_policy._epsilon()
import pickle
import unittest

import numpy as np

# Import path below assumes the older garage layout that shipped
# exploration *strategies* rather than exploration policies.
from garage.np.exploration_strategies import EpsilonGreedyStrategy

from tests.fixtures.envs.dummy import DummyDiscreteEnv
from tests.fixtures.policies import SimplePolicy


class TestEpsilonGreedyStrategy(unittest.TestCase):

    def setUp(self):
        super().setUp()
        self.env = DummyDiscreteEnv()
        self.policy = SimplePolicy(env_spec=self.env)
        self.epsilon_greedy_strategy = EpsilonGreedyStrategy(
            env_spec=self.env,
            total_timesteps=100,
            max_epsilon=1.0,
            min_epsilon=0.02,
            decay_ratio=0.1)
        self.env.reset()

    def test_epsilon_greedy_strategy(self):
        obs, _, _, _ = self.env.step(1)

        action, _ = self.epsilon_greedy_strategy.get_action(
            0, obs, self.policy)
        assert self.env.action_space.contains(action)

        # epsilon decays by 1 step, new epsilon = 1 - 0.098 = 0.902
        random_rate = np.random.random(
            100000) < self.epsilon_greedy_strategy._epsilon
        assert np.isclose([0.902], [sum(random_rate) / 100000], atol=0.01)

        actions, _ = self.epsilon_greedy_strategy.get_actions(
            0, [obs] * 5, self.policy)

        # epsilon decays by 6 steps in total,
        # new epsilon = 1 - 6 * 0.098 = 0.412
        random_rate = np.random.random(
            100000) < self.epsilon_greedy_strategy._epsilon
        assert np.isclose([0.412], [sum(random_rate) / 100000], atol=0.01)

        for action in actions:
            assert self.env.action_space.contains(action)

    def test_epsilon_greedy_strategy_is_pickleable(self):
        obs, _, _, _ = self.env.step(1)
        for _ in range(5):
            self.epsilon_greedy_strategy.get_action(0, obs, self.policy)

        h_data = pickle.dumps(self.epsilon_greedy_strategy)
        strategy = pickle.loads(h_data)
        assert strategy._epsilon == self.epsilon_greedy_strategy._epsilon
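# Both test classes above lean on the same arithmetic: with
# total_timesteps=100 and decay_ratio=0.1, epsilon anneals linearly from
# 1.0 to 0.02 over 10 steps, i.e. by (1.0 - 0.02) / 10 = 0.098 per step.
# A minimal sketch of that schedule (the linear_epsilon helper is
# hypothetical, not garage API) reproduces the expected values:
def linear_epsilon(steps_taken, total_timesteps=100, max_epsilon=1.0,
                   min_epsilon=0.02, decay_ratio=0.1):
    """Linearly anneal epsilon over decay_ratio * total_timesteps steps."""
    decay_period = total_timesteps * decay_ratio  # 10 steps here
    fraction = min(steps_taken / decay_period, 1.0)
    return max_epsilon - fraction * (max_epsilon - min_epsilon)


assert abs(linear_epsilon(1) - 0.902) < 1e-9  # 1 - 1 * 0.098
assert abs(linear_epsilon(6) - 0.412) < 1e-9  # 1 - 6 * 0.098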
def test_add_transition_dtype(self):
    env = DummyDiscreteEnv()
    obs = env.reset()
    replay_buffer = SimpleReplayBuffer(env_spec=env,
                                       size_in_transitions=3,
                                       time_horizon=1)
    replay_buffer.add_transition(observation=obs,
                                 action=env.action_space.sample())
    sample = replay_buffer.sample(1)
    sample_obs = sample['observation']
    sample_action = sample['action']

    assert sample_obs.dtype == env.observation_space.dtype
    assert sample_action.dtype == env.action_space.dtype
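# The dtype invariant above follows from pre-allocating the buffer's
# arrays from the env's space metadata. A standalone sketch of that idea
# (not garage's actual allocation code), assuming gym spaces:
import gym.spaces
import numpy as np

obs_space = gym.spaces.Box(low=0, high=1, shape=(1, ), dtype=np.float32)
act_space = gym.spaces.Discrete(2)

# Storage allocated with the spaces' dtypes guarantees samples come
# back with those same dtypes.
obs_store = np.zeros((3, ) + obs_space.shape, dtype=obs_space.dtype)
act_store = np.zeros((3, ), dtype=act_space.dtype)

assert obs_store.dtype == obs_space.dtype  # float32
assert act_store.dtype == act_space.dtype  # int64 for Discrete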
def test_eviction_policy(self):
    env = DummyDiscreteEnv()
    obs = env.reset()
    replay_buffer = SimpleReplayBuffer(env_spec=env,
                                       size_in_transitions=3,
                                       time_horizon=1)
    replay_buffer.add_transitions(observation=[obs, obs], action=[1, 2])
    assert not replay_buffer.full
    replay_buffer.add_transitions(observation=[obs, obs], action=[3, 4])
    assert replay_buffer.full
    replay_buffer.add_transitions(observation=[obs, obs], action=[5, 6])
    replay_buffer.add_transitions(observation=[obs, obs], action=[7, 8])

    # Writes wrap around circularly, so the oldest slots are overwritten
    # first; only actions 7, 8 and 6 survive.
    assert np.array_equal(replay_buffer._buffer['action'], [[7], [8], [6]])
    assert replay_buffer.n_transitions_stored == 3
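# The expected [[7], [8], [6]] comes from circular (FIFO) overwriting:
# with capacity 3, the write index wraps around and the oldest slot is
# replaced first. A minimal stand-in ring buffer (hypothetical, not
# garage's implementation) reproduces the same eviction order:
import numpy as np


class MiniRingBuffer:
    """Fixed-capacity buffer that overwrites the oldest slot first."""

    def __init__(self, capacity):
        self.data = np.zeros((capacity, 1))
        self.capacity = capacity
        self.index = 0  # next slot to (over)write

    def add(self, values):
        for value in values:
            self.data[self.index] = value
            self.index = (self.index + 1) % self.capacity


buf = MiniRingBuffer(3)
for batch in ([1, 2], [3, 4], [5, 6], [7, 8]):
    buf.add(batch)
# Slots 0 and 1 hold the newest writes (7, 8); slot 2 still holds 6.
assert np.array_equal(buf.data, [[7], [8], [6]])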
def test_pickleable(self):
    env = DummyDiscreteEnv()
    obs = env.reset()
    replay_buffer = SimpleReplayBuffer(env_spec=env,
                                       size_in_transitions=100,
                                       time_horizon=1)
    for _ in range(100):
        replay_buffer.add_transitions(observation=[obs], action=[1])
    replay_buffer_pickled = pickle.loads(pickle.dumps(replay_buffer))

    assert (replay_buffer_pickled._buffer.keys() ==
            replay_buffer._buffer.keys())
    for k in replay_buffer_pickled._buffer:
        assert (replay_buffer_pickled._buffer[k].shape ==
                replay_buffer._buffer[k].shape)
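# The pickle check relies on a general numpy property: pickling a dict of
# arrays preserves keys, shapes, and contents. A quick standalone
# illustration of the invariant the test asserts:
import pickle

import numpy as np

buffer = {
    'observation': np.zeros((100, 1, 4)),
    'action': np.ones((100, 1)),
}
restored = pickle.loads(pickle.dumps(buffer))

assert restored.keys() == buffer.keys()
assert all(restored[k].shape == buffer[k].shape for k in buffer)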
def test_add_path_dtype(self):
    env = DummyDiscreteEnv()
    obs = env.reset()
    replay_buffer = PathBuffer(capacity_in_transitions=3)
    replay_buffer.add_path({
        'observations': np.array([obs]),
        'actions': np.array([[env.action_space.sample()]])
    })
    sample = replay_buffer.sample_transitions(1)
    sample_obs = sample['observations']
    sample_action = sample['actions']

    assert sample_obs.dtype == env.observation_space.dtype
    assert sample_action.dtype == env.action_space.dtype