class DequeReplayRollerOuNoise(ReplayEnvRollerBase):
    """
    Environment roller with experience replay buffer, rolling out a **single** environment
    with an Ornstein-Uhlenbeck noise process
    """

    def __init__(self, environment, device, batch_size, buffer_capacity, buffer_initial_size, noise_std_dev,
                 normalize_observations=False):
        self.device = device
        self.batch_size = batch_size
        self.buffer_capacity = buffer_capacity
        self.buffer_initial_size = buffer_initial_size
        self.normalize_observations = normalize_observations

        self._environment = environment
        self.backend = DequeBufferBackend(
            buffer_capacity=self.buffer_capacity,
            observation_space=environment.observation_space,
            action_space=environment.action_space
        )

        self.last_observation = self.environment.reset()

        len_action_space = self.environment.action_space.shape[-1]

        self.noise_process = OrnsteinUhlenbeckNoiseProcess(
            np.zeros(len_action_space),
            float(noise_std_dev) * np.ones(len_action_space)
        )

        self.ob_rms = RunningMeanStd(shape=self.environment.observation_space.shape) if normalize_observations else None
        self.clip_obs = 10.0

    @property
    def environment(self):
        """ Return environment of this env roller """
        return self._environment

    def is_ready_for_sampling(self) -> bool:
        """ If buffer is ready for drawing samples from it (usually checks if there is enough data) """
        return self.backend.current_size >= self.buffer_initial_size

    @torch.no_grad()
    def rollout(self, batch_info, model) -> Rollout:
        """ Roll out the environment one step and return the resulting transition """
        observation_tensor = torch.from_numpy(self.last_observation).to(self.device)

        step = model.step(observation_tensor[None])
        action = step['actions'].detach().cpu().numpy()[0]
        noise = self.noise_process()

        action_perturbed = np.clip(
            action + noise, self.environment.action_space.low, self.environment.action_space.high
        )

        observation, reward, done, info = self.environment.step(action_perturbed)

        if self.ob_rms is not None:
            self.ob_rms.update(observation)

        self.backend.store_transition(self.last_observation, action_perturbed, reward, done)

        # As usual, reset on done
        if done:
            observation = self.environment.reset()
            self.noise_process.reset()

        self.last_observation = observation

        return Transitions(
            size=1,
            environment_information=[info],
            transition_tensors={
                'actions': step['actions'],
                'values': step['values']
            },
        )

    def _filter_observation(self, obs):
        """ Potentially normalize observation """
        if self.ob_rms is not None:
            obs = np.clip(
                (obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + 1e-8),
                -self.clip_obs, self.clip_obs
            )
            return obs.astype(np.float32)
        else:
            return obs

    def sample(self, batch_info, model) -> Transitions:
        """ Sample experience from replay buffer and return a batch """
        indexes = self.backend.sample_batch_uniform(self.batch_size, history_length=1)
        batch = self.backend.get_batch(indexes, history_length=1)

        observations = torch.from_numpy(self._filter_observation(batch['states'])).to(self.device)
        observations_plus1 = torch.from_numpy(self._filter_observation(batch['states+1'])).to(self.device)
        dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(self.device)
        rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(self.device)
        actions = torch.from_numpy(batch['actions']).to(self.device)

        return Transitions(
            size=self.batch_size,
            environment_information=[],
            transition_tensors={
                'observations': observations,
                'observations_next': observations_plus1,
                'dones': dones,
                'rewards': rewards,
                'actions': actions
            }
        )
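
# Illustration, not part of the roller above: the roller perturbs the policy's deterministic
# actions with temporally correlated Ornstein-Uhlenbeck noise before clipping them to the
# action space. The class below is a minimal, self-contained sketch of such a noise process
# (NOT the OrnsteinUhlenbeckNoiseProcess implementation used above), assuming the usual
# Euler discretization x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1).
import numpy as np  # repeated here only so the sketch is self-contained


class _OuNoiseProcessSketch:
    """ Minimal Ornstein-Uhlenbeck noise process, for illustration only """

    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu = np.asarray(mu, dtype=np.float64)
        self.sigma = np.asarray(sigma, dtype=np.float64)
        self.theta = theta
        self.dt = dt
        self.reset()

    def reset(self):
        """ Start a fresh noise trajectory (the roller above resets on episode end) """
        self.x = np.copy(self.mu)

    def __call__(self):
        """ Advance the process one step and return the current noise sample """
        self.x = (
            self.x
            + self.theta * (self.mu - self.x) * self.dt
            + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape)
        )
        return self.x


# Example: noise = _OuNoiseProcessSketch(np.zeros(2), 0.2 * np.ones(2)); sample = noise()
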
class DequeReplayRollerEpsGreedy(ReplayEnvRollerBase):
    """
    Environment roller for action-value models using experience replay.
    Simplest buffer implementation, just holding up to a given number of samples.

    Because framestack is implemented directly in the buffer, we can use *much* less space to hold
    samples in memory for very little additional cost.
    """

    def __init__(self, environment, device, epsilon_schedule: Schedule, batch_size: int, buffer_capacity: int,
                 buffer_initial_size: int, frame_stack: int):
        self.epsilon_schedule = epsilon_schedule
        self.batch_size = batch_size
        self.buffer_capacity = buffer_capacity
        self.buffer_initial_size = buffer_initial_size
        self.frame_stack = frame_stack
        self.device = device

        self._environment = environment
        self.backend = DequeBufferBackend(
            buffer_capacity=self.buffer_capacity,
            observation_space=environment.observation_space,
            action_space=environment.action_space
        )

        self.last_observation = self.environment.reset()

    @property
    def environment(self):
        """ Return environment of this env roller """
        return self._environment

    def is_ready_for_sampling(self) -> bool:
        """ If buffer is ready for drawing samples from it (usually checks if there is enough data) """
        return self.backend.current_size >= self.buffer_initial_size

    def epsgreedy_action(self, policy_samples, epsilon):
        """ Sample an epsilon-greedy action using the current policy and epsilon value """
        random_samples = torch.randint_like(policy_samples, self.environment.action_space.n)
        selector = torch.rand_like(random_samples, dtype=torch.float32)
        return torch.where(selector > epsilon, policy_samples, random_samples)

    @torch.no_grad()
    def rollout(self, batch_info, model) -> Rollout:
        """ Roll out the environment one step and return the resulting transition """
        epsilon_value = self.epsilon_schedule.value(batch_info['progress'])
        batch_info['epsilon'] = epsilon_value

        last_observation = np.concatenate([
            self.backend.get_frame(self.backend.current_idx, self.frame_stack - 1),
            self.last_observation
        ], axis=-1)

        observation_tensor = torch.from_numpy(last_observation[None]).to(self.device)

        step = model.step(observation_tensor)
        epsgreedy_step = self.epsgreedy_action(step['actions'], epsilon_value)
        action = epsgreedy_step.item()

        observation, reward, done, info = self.environment.step(action)

        self.backend.store_transition(self.last_observation, action, reward, done)

        # As usual, reset on done
        if done:
            observation = self.environment.reset()

        self.last_observation = observation

        return Transitions(
            size=1,
            environment_information=[info],
            transition_tensors={
                'actions': epsgreedy_step.unsqueeze(0),
                'values': step['values']
            },
            extra_data={'epsilon': epsilon_value}
        )

    def metrics(self):
        """ List of metrics to track for this learning process """
        return [
            AveragingNamedMetric("epsilon"),
        ]

    def sample(self, batch_info, model) -> Transitions:
        """ Sample experience from replay buffer and return a batch """
        indexes = self.backend.sample_batch_uniform(self.batch_size, self.frame_stack)
        batch = self.backend.get_batch(indexes, self.frame_stack)

        observations = torch.from_numpy(batch['states']).to(self.device)
        observations_plus1 = torch.from_numpy(batch['states+1']).to(self.device)
        dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(self.device)
        rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(self.device)
        actions = torch.from_numpy(batch['actions']).to(self.device)

        return Transitions(
            size=self.batch_size,
            environment_information=None,
            transition_tensors={
                'observations': observations,
                'observations_next': observations_plus1,
                'dones': dones,
                'rewards': rewards,
                'actions': actions,
                'weights': torch.ones_like(rewards)
            }
        )
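
# Illustration, not part of the rollers above: both rollers are driven the same way, by
# alternating single-step rollouts with replay sampling once the buffer holds enough data.
# The function below is a hypothetical outer loop sketching that flow; `learner` and its
# `train_on_batch` method are assumptions and not part of this module. The rollers only
# require a model exposing `step()` and a `batch_info` mapping (the epsilon-greedy roller
# reads 'progress' from it and writes 'epsilon' into it; the OU-noise roller ignores it).
def _replay_training_loop_sketch(env_roller, model, learner, total_frames):
    """ Hypothetical training loop around a replay env roller, for illustration only """
    for frame_idx in range(total_frames):
        # Progress in [0, 1) drives the epsilon (or other) schedule inside rollout()
        batch_info = {'progress': frame_idx / total_frames}

        # Step the single environment once and store the transition in the replay buffer
        env_roller.rollout(batch_info, model)

        # Once the buffer has reached its initial size, draw a uniform batch for the learner
        if env_roller.is_ready_for_sampling():
            batch = env_roller.sample(batch_info, model)
            learner.train_on_batch(batch)  # hypothetical learner API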