class DequeReplayRollerOuNoise(ReplayEnvRollerBase):
    """
    Enrionment roller with experience replay buffer rolling out a **single** environment
    with Ornstein–Uhlenbeck noise process
    """

    def __init__(self, environment, device, batch_size, buffer_capacity, buffer_initial_size, noise_std_dev,
                 normalize_observations=False):
        self.device = device
        self.batch_size = batch_size
        self.buffer_capacity = buffer_capacity
        self.buffer_initial_size = buffer_initial_size
        self.normalize_observations = normalize_observations

        self.device = device
        self._environment = environment

        self.backend = DequeBufferBackend(
            buffer_capacity=self.buffer_capacity,
            observation_space=environment.observation_space,
            action_space=environment.action_space
        )

        self.last_observation = self.environment.reset()

        len_action_space = self.environment.action_space.shape[-1]

        self.noise_process = OrnsteinUhlenbeckNoiseProcess(
            np.zeros(len_action_space), float(noise_std_dev) * np.ones(len_action_space)
        )

        self.ob_rms = RunningMeanStd(shape=self.environment.observation_space.shape) if normalize_observations else None
        self.clip_obs = 10.0

    @property
    def environment(self):
        """ Return environment of this env roller """
        return self._environment

    def is_ready_for_sampling(self) -> bool:
        """ If buffer is ready for drawing samples from it (usually checks if there is enough data) """
        return self.backend.current_size >= self.buffer_initial_size

    @torch.no_grad()
    def rollout(self, batch_info, model) -> Rollout:
        """ Roll-out the environment and return it """
        observation_tensor = torch.from_numpy(self.last_observation).to(self.device)

        step = model.step(observation_tensor[None])
        action = step['actions'].detach().cpu().numpy()[0]
        noise = self.noise_process()

        action_perturbed = np.clip(
            action + noise, self.environment.action_space.low, self.environment.action_space.high
        )

        observation, reward, done, info = self.environment.step(action_perturbed)

        if self.ob_rms is not None:
            self.ob_rms.update(observation)

        self.backend.store_transition(self.last_observation, action_perturbed, reward, done)

        # Usual, reset on done
        if done:
            observation = self.environment.reset()
            self.noise_process.reset()

        self.last_observation = observation

        return Transitions(
            size=1,
            environment_information=[info],
            transition_tensors={
                'actions': step['actions'],
                'values': step['values']
            },
        )

    def _filter_observation(self, obs):
        """ Potentially normalize observation """
        if self.ob_rms is not None:
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + 1e-8), -self.clip_obs, self.clip_obs)

            return obs.astype(np.float32)
        else:
            return obs

    def sample(self, batch_info, model) -> Transitions:
        """ Sample experience from replay buffer and return a batch """
        indexes = self.backend.sample_batch_uniform(self.batch_size, history_length=1)
        batch = self.backend.get_batch(indexes, history_length=1)

        observations = torch.from_numpy(self._filter_observation(batch['states'])).to(self.device)
        observations_plus1 = torch.from_numpy(self._filter_observation(batch['states+1'])).to(self.device)
        dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(self.device)
        rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(self.device)
        actions = torch.from_numpy(batch['actions']).to(self.device)

        return Transitions(
            size=self.batch_size,
            environment_information=[],
            transition_tensors={
                'observations': observations,
                'observations_next': observations_plus1,
                'dones': dones,
                'rewards': rewards,
                'actions': actions
            }
        )
예제 #2
0
class DequeReplayRollerEpsGreedy(ReplayEnvRollerBase):
    """
    Environment roller for action-value models using experience replay.
    Simplest buffer implementation just holding up to given number of samples.

    Because framestack is implemented directly in the buffer, we can use *much* less space to hold samples in
    memory for very little additional cost.
    """
    def __init__(self, environment, device, epsilon_schedule: Schedule,
                 batch_size: int, buffer_capacity: int,
                 buffer_initial_size: int, frame_stack: int):
        self.epsilon_schedule = epsilon_schedule
        self.batch_size = batch_size
        self.buffer_capacity = buffer_capacity
        self.buffer_initial_size = buffer_initial_size
        self.frame_stack = frame_stack

        self.device = device
        self._environment = environment
        self.backend = DequeBufferBackend(
            buffer_capacity=self.buffer_capacity,
            observation_space=environment.observation_space,
            action_space=environment.action_space)

        self.last_observation = self.environment.reset()

    @property
    def environment(self):
        """ Return environment of this env roller """
        return self._environment

    def is_ready_for_sampling(self) -> bool:
        """ If buffer is ready for drawing samples from it (usually checks if there is enough data) """
        return self.backend.current_size >= self.buffer_initial_size

    def epsgreedy_action(self, policy_samples, epsilon):
        """ Sample e-greedy action using curreny policy and epsilon value """
        random_samples = torch.randint_like(policy_samples,
                                            self.environment.action_space.n)
        selector = torch.rand_like(random_samples, dtype=torch.float32)
        return torch.where(selector > epsilon, policy_samples, random_samples)

    @torch.no_grad()
    def rollout(self, batch_info, model) -> Rollout:
        """ Roll-out the environment and return it """
        epsilon_value = self.epsilon_schedule.value(batch_info['progress'])
        batch_info['epsilon'] = epsilon_value

        last_observation = np.concatenate([
            self.backend.get_frame(self.backend.current_idx,
                                   self.frame_stack - 1), self.last_observation
        ],
                                          axis=-1)

        observation_tensor = torch.from_numpy(last_observation[None]).to(
            self.device)
        step = model.step(observation_tensor)

        epsgreedy_step = self.epsgreedy_action(step['actions'], epsilon_value)
        action = epsgreedy_step.item()

        observation, reward, done, info = self.environment.step(action)
        self.backend.store_transition(self.last_observation, action, reward,
                                      done)

        # Usual, reset on done
        if done:
            observation = self.environment.reset()

        self.last_observation = observation

        return Transitions(size=1,
                           environment_information=[info],
                           transition_tensors={
                               'actions': epsgreedy_step.unsqueeze(0),
                               'values': step['values']
                           },
                           extra_data={'epsilon': epsilon_value})

    def metrics(self):
        """ List of metrics to track for this learning process """
        return [
            AveragingNamedMetric("epsilon"),
        ]

    def sample(self, batch_info, model) -> Transitions:
        """ Sample experience from replay buffer and return a batch """
        indexes = self.backend.sample_batch_uniform(self.batch_size,
                                                    self.frame_stack)
        batch = self.backend.get_batch(indexes, self.frame_stack)

        observations = torch.from_numpy(batch['states']).to(self.device)
        observations_plus1 = torch.from_numpy(batch['states+1']).to(
            self.device)
        dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(
            self.device)
        rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(
            self.device)
        actions = torch.from_numpy(batch['actions']).to(self.device)

        return Transitions(size=self.batch_size,
                           environment_information=None,
                           transition_tensors={
                               'observations': observations,
                               'observations_next': observations_plus1,
                               'dones': dones,
                               'rewards': rewards,
                               'actions': actions,
                               'weights': torch.ones_like(rewards)
                           })