예제 #1
0
def test_simple_get_frame():
    """ Check that get_frame on a partially filled buffer returns zeros for missing history """
    obs_space = gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=np.uint8)
    act_space = gym.spaces.Discrete(4)
    backend = DequeBufferBackend(20, obs_space, act_space)

    # Store three frames filled with 1s, 2s and 3s so each frame is identifiable by its value
    unit_frame = np.ones(4).reshape((2, 2, 1))
    for scale in (1, 2, 3):
        backend.store_transition(unit_frame * scale, 0, 0, False)

    # Expected per-channel maxima of the 4-frame stack ending at index 0, 1 and 2;
    # history slots that were never written are expected to read as 0
    expected_stacks = (
        [0, 0, 0, 1],
        [0, 0, 1, 2],
        [0, 1, 2, 3],
    )

    for frame_idx, expected in enumerate(expected_stacks):
        stacked = backend.get_frame(frame_idx, 4)
        channel_max = stacked.max(0).max(0)
        assert np.all(channel_max == np.array(expected))

    # Only indexes 0-2 were written; asking for anything later must raise
    for missing_idx in (3, 4):
        with t.assert_raises(VelException):
            backend.get_frame(missing_idx, 4)
예제 #2
0
class DequeReplayRollerEpsGreedy(ReplayEnvRollerBase):
    """
    Epsilon-greedy environment roller for action-value models trained with experience replay.
    Transitions are kept in a ``DequeBufferBackend`` holding at most ``buffer_capacity`` samples.

    Only single observations are written to the buffer; stacked frames are requested from the
    backend at read time (``get_frame`` / ``get_batch``), which lets samples take much less
    memory for very little additional cost.
    """
    def __init__(self, environment, device, epsilon_schedule: Schedule,
                 batch_size: int, buffer_capacity: int,
                 buffer_initial_size: int, frame_stack: int):
        self._environment = environment
        self.device = device

        # Exploration and sampling hyperparameters
        self.epsilon_schedule = epsilon_schedule
        self.batch_size = batch_size
        self.buffer_capacity = buffer_capacity
        self.buffer_initial_size = buffer_initial_size
        self.frame_stack = frame_stack

        self.backend = DequeBufferBackend(
            buffer_capacity=buffer_capacity,
            observation_space=environment.observation_space,
            action_space=environment.action_space,
        )

        # Observation the next rollout step will act on
        self.last_observation = self.environment.reset()

    @property
    def environment(self):
        """ Environment this roller is stepping through """
        return self._environment

    def is_ready_for_sampling(self) -> bool:
        """ Whether the buffer already holds at least ``buffer_initial_size`` transitions """
        stored = self.backend.current_size
        return stored >= self.buffer_initial_size

    def epsgreedy_action(self, policy_samples, epsilon):
        """ Sample e-greedy action using current policy and epsilon value """
        action_count = self.environment.action_space.n
        exploratory = torch.randint_like(policy_samples, action_count)
        coin = torch.rand_like(exploratory, dtype=torch.float32)

        # Keep the policy action where the coin exceeds epsilon, otherwise take the random one
        keep_policy = coin > epsilon
        return torch.where(keep_policy, policy_samples, exploratory)

    @torch.no_grad()
    def rollout(self, batch_info, model) -> Rollout:
        """ Take a single e-greedy step in the environment, store it in the buffer and return it """
        epsilon_value = self.epsilon_schedule.value(batch_info['progress'])
        batch_info['epsilon'] = epsilon_value

        # Model input is the previous frame_stack - 1 frames from the buffer with the
        # current (not yet stored) observation appended on the last axis
        history = self.backend.get_frame(self.backend.current_idx, self.frame_stack - 1)
        stacked = np.concatenate([history, self.last_observation], axis=-1)

        # [None] adds the leading batch axis
        model_input = torch.from_numpy(stacked[None]).to(self.device)
        step = model.step(model_input)

        chosen = self.epsgreedy_action(step['actions'], epsilon_value)
        action = chosen.item()

        next_observation, reward, done, info = self.environment.step(action)
        self.backend.store_transition(self.last_observation, action, reward, done)

        # When the episode has ended, start a new one and continue from its first observation
        self.last_observation = self.environment.reset() if done else next_observation

        tensors = {
            'actions': chosen.unsqueeze(0),
            'values': step['values'],
        }

        return Transitions(
            size=1,
            environment_information=[info],
            transition_tensors=tensors,
            extra_data={'epsilon': epsilon_value},
        )

    def metrics(self):
        """ Metrics tracked for this roller: the averaged epsilon value """
        return [AveragingNamedMetric("epsilon")]

    def sample(self, batch_info, model) -> Transitions:
        """ Draw a uniformly sampled batch of transitions from the replay buffer """
        indexes = self.backend.sample_batch_uniform(self.batch_size, self.frame_stack)
        batch = self.backend.get_batch(indexes, self.frame_stack)

        def as_tensor(array, dtype=None):
            """ Move a numpy array onto the roller device, optionally casting it first """
            if dtype is not None:
                array = array.astype(dtype)
            return torch.from_numpy(array).to(self.device)

        tensors = {
            'observations': as_tensor(batch['states']),
            'observations_next': as_tensor(batch['states+1']),
            'dones': as_tensor(batch['dones'], np.float32),
            'rewards': as_tensor(batch['rewards'], np.float32),
            'actions': as_tensor(batch['actions']),
        }
        # Uniform sampling: every transition carries the same weight of one
        tensors['weights'] = torch.ones_like(tensors['rewards'])

        return Transitions(
            size=self.batch_size,
            environment_information=None,
            transition_tensors=tensors,
        )