import collections
import pickle

import numpy as np

# NOTE: import paths are assumed from the garage test-suite layout.
from garage.np.exploration_policies import EpsilonGreedyPolicy
from tests.fixtures.envs.dummy import DummyDiscreteEnv
from tests.fixtures.policies import SimplePolicy


class TestEpsilonGreedyPolicy:
    def setup_method(self):
        self.env = DummyDiscreteEnv()
        self.policy = SimplePolicy(env_spec=self.env)
        self.epsilon_greedy_policy = EpsilonGreedyPolicy(env_spec=self.env,
                                                         policy=self.policy,
                                                         total_timesteps=100,
                                                         max_epsilon=1.0,
                                                         min_epsilon=0.02,
                                                         decay_ratio=0.1)
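        # Epsilon anneals linearly from max_epsilon (1.0) to min_epsilon
        # (0.02) over decay_ratio * total_timesteps = 10 steps, i.e. by
        # (1.0 - 0.02) / 10 = 0.098 per step.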

        self.env.reset()

    def test_epsilon_greedy_policy(self):
        obs, _, _, _ = self.env.step(1)

        action, _ = self.epsilon_greedy_policy.get_action(obs)
        assert self.env.action_space.contains(action)

        # epsilon decay by 1 step, new epsilon = 1 - 0.098 = 0.902
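        # The fraction of 100000 uniform draws that fall below epsilon is
        # a Monte Carlo estimate of epsilon itself.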
        random_rate = (np.random.random(100000) <
                       self.epsilon_greedy_policy._epsilon())
        assert np.isclose(0.902, random_rate.mean(), atol=0.01)

        actions, _ = self.epsilon_greedy_policy.get_actions([obs] * 5)

        # epsilon decay by 6 steps in total
        # new epsilon = 1 - 6 * 0.098 = 0.412
        random_rate = (np.random.random(100000) <
                       self.epsilon_greedy_policy._epsilon())
        assert np.isclose(0.412, random_rate.mean(), atol=0.01)

        for action in actions:
            assert self.env.action_space.contains(action)

    def test_set_param(self):
        params = self.epsilon_greedy_policy.get_param_values()
        params['total_env_steps'] = 6
        self.epsilon_greedy_policy.set_param_values(params)
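        # Restoring total_env_steps = 6 must reproduce the 6-step-decayed
        # epsilon: 1 - 6 * 0.098 = 0.412.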
        assert np.isclose(self.epsilon_greedy_policy._epsilon(), 0.412)

    def test_update(self):
        DummyBatch = collections.namedtuple('EpisodeBatch', ['lengths'])
        batch = DummyBatch(np.array([1, 2, 3]))
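        # update() advances the decay by the batch's total step count
        # (1 + 2 + 3 = 6), so epsilon becomes 1 - 6 * 0.098 = 0.412.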
        self.epsilon_greedy_policy.update(batch)
        assert np.isclose(self.epsilon_greedy_policy._epsilon(), 0.412)

    def test_epsilon_greedy_policy_is_pickleable(self):
        obs, _, _, _ = self.env.step(1)
        for _ in range(5):
            self.epsilon_greedy_policy.get_action(obs)

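        # Pickling mid-decay must preserve the current epsilon value.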
        h_data = pickle.dumps(self.epsilon_greedy_policy)
        policy = pickle.loads(h_data)
        assert policy._epsilon() == self.epsilon_greedy_policy._epsilon()
Example #2

import pickle
import unittest

import numpy as np

# NOTE: import paths are assumed from an older garage test-suite layout.
from garage.np.exploration_strategies import EpsilonGreedyStrategy
from tests.fixtures.envs.dummy import DummyDiscreteEnv
from tests.fixtures.policies import SimplePolicy


class TestEpsilonGreedyStrategy(unittest.TestCase):
    def setUp(self):
        super().setUp()
        self.env = DummyDiscreteEnv()
        self.policy = SimplePolicy(env_spec=self.env)
        self.epsilon_greedy_strategy = EpsilonGreedyStrategy(
            env_spec=self.env,
            total_timesteps=100,
            max_epsilon=1.0,
            min_epsilon=0.02,
            decay_ratio=0.1)
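        # Same linear decay schedule as above:
        # (1.0 - 0.02) / (0.1 * 100) = 0.098 per step.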

        self.env.reset()

    def test_epsilon_greedy_strategy(self):
        obs, _, _, _ = self.env.step(1)

        action, _ = self.epsilon_greedy_strategy.get_action(
            0, obs, self.policy)
        assert self.env.action_space.contains(action)

        # epsilon decay by 1 step, new epsilon = 1 - 0.098 = 0.902
        random_rate = (np.random.random(100000) <
                       self.epsilon_greedy_strategy._epsilon)
        assert np.isclose(0.902, random_rate.mean(), atol=0.01)

        actions, _ = self.epsilon_greedy_strategy.get_actions(
            0, [obs] * 5, self.policy)

        # epsilon decay by 6 steps in total, new epsilon = 1 - 6 * 0.098 = 0.412
        random_rate = (np.random.random(100000) <
                       self.epsilon_greedy_strategy._epsilon)
        assert np.isclose(0.412, random_rate.mean(), atol=0.01)

        for action in actions:
            assert self.env.action_space.contains(action)

    def test_epsilon_greedy_strategy_is_pickleable(self):
        obs, _, _, _ = self.env.step(1)
        for _ in range(5):
            self.epsilon_greedy_strategy.get_action(0, obs, self.policy)

        h_data = pickle.dumps(self.epsilon_greedy_strategy)
        strategy = pickle.loads(h_data)
        assert strategy._epsilon == self.epsilon_greedy_strategy._epsilon
Example #3
    def test_add_transition_dtype(self):
        env = DummyDiscreteEnv()
        obs = env.reset()
        replay_buffer = SimpleReplayBuffer(
            env_spec=env, size_in_transitions=3, time_horizon=1)
        replay_buffer.add_transition(
            observation=obs, action=env.action_space.sample())
        sample = replay_buffer.sample(1)
        sample_obs = sample['observation']
        sample_action = sample['action']

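        # The buffer must store samples with the env spaces' dtypes intact.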
        assert sample_obs.dtype == env.observation_space.dtype
        assert sample_action.dtype == env.action_space.dtype
Example #4
    def test_eviction_policy(self):
        env = DummyDiscreteEnv()
        obs = env.reset()

        replay_buffer = SimpleReplayBuffer(
            env_spec=env, size_in_transitions=3, time_horizon=1)
        replay_buffer.add_transitions(observation=[obs, obs], action=[1, 2])
        assert not replay_buffer.full
        replay_buffer.add_transitions(observation=[obs, obs], action=[3, 4])
        assert replay_buffer.full
        replay_buffer.add_transitions(observation=[obs, obs], action=[5, 6])
        replay_buffer.add_transitions(observation=[obs, obs], action=[7, 8])

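        # Capacity is 3 and writes wrap around in insertion order, so the
        # eight added actions cycle through the three slots and end as
        # [[7], [8], [6]].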
        assert np.array_equal(replay_buffer._buffer['action'], [[7], [8], [6]])
        assert replay_buffer.n_transitions_stored == 3
Example #5
    def test_pickleable(self):
        env = DummyDiscreteEnv()
        obs = env.reset()

        replay_buffer = SimpleReplayBuffer(env_spec=env,
                                           size_in_transitions=100,
                                           time_horizon=1)
        for _ in range(100):
            replay_buffer.add_transitions(observation=[obs], action=[1])
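        # A pickle round trip must preserve the buffer's keys and shapes.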
        replay_buffer_pickled = pickle.loads(pickle.dumps(replay_buffer))
        assert (replay_buffer_pickled._buffer.keys() ==
                replay_buffer._buffer.keys())
        for k in replay_buffer_pickled._buffer:
            assert (replay_buffer_pickled._buffer[k].shape ==
                    replay_buffer._buffer[k].shape)
Example #6
    def test_add_path_dtype(self):
        env = DummyDiscreteEnv()
        obs = env.reset()
        replay_buffer = PathBuffer(capacity_in_transitions=3)
        replay_buffer.add_path({
            'observations': np.array([obs]),
            'actions': np.array([[env.action_space.sample()]]),
        })
        sample = replay_buffer.sample_transitions(1)
        sample_obs = sample['observations']
        sample_action = sample['actions']

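        # PathBuffer should likewise preserve the env spaces' dtypes.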
        assert sample_obs.dtype == env.observation_space.dtype
        assert sample_action.dtype == env.action_space.dtype