def rollout(self, batch_info, model) -> Rollout:
    """ Roll out the environment for a single step and return the transition """
    observation_tensor = torch.from_numpy(self.last_observation).to(self.device)

    step = model.step(observation_tensor[None])
    action = step['actions'].detach().cpu().numpy()[0]
    noise = self.noise_process()

    action_perturbed = np.clip(
        action + noise,
        self.environment.action_space.low,
        self.environment.action_space.high
    )

    observation, reward, done, info = self.environment.step(action_perturbed)

    if self.ob_rms is not None:
        self.ob_rms.update(observation)

    self.backend.store_transition(self.last_observation, action_perturbed, reward, done)

    # As usual, reset the environment on done
    if done:
        observation = self.environment.reset()
        self.noise_process.reset()

    self.last_observation = observation

    return Transitions(
        size=1,
        environment_information=[info],
        transition_tensors={
            'actions': step['actions'],
            'values': step['values']
        },
    )
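
# The exploration noise above comes from self.noise_process, whose implementation is not
# shown here. A minimal sketch of one possible process, assuming Ornstein-Uhlenbeck style
# temporally correlated noise (the class name and parameters below are illustrative, not
# taken from this code base):
import numpy as np


class OrnsteinUhlenbeckNoiseSketch:
    """ Temporally correlated noise for continuous-action exploration """

    def __init__(self, shape, mu=0.0, sigma=0.2, theta=0.15, dt=1e-2):
        self.shape = shape
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.state = np.full(shape, mu, dtype=np.float32)

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        self.state = self.state + self.theta * (self.mu - self.state) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.standard_normal(self.shape)
        return self.state

    def reset(self):
        # Restart the process from the mean, e.g. at episode boundaries
        self.state = np.full(self.shape, self.mu, dtype=np.float32)
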
def sample(self, batch_info, model) -> Transitions:
    """ Sample experience from replay buffer and return a batch """
    indexes = self.backend.sample_batch_uniform(self.batch_size, self.frame_stack)
    batch = self.backend.get_batch(indexes, self.frame_stack)

    observations = torch.from_numpy(batch['states']).to(self.device)
    observations_plus1 = torch.from_numpy(batch['states+1']).to(self.device)
    dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(self.device)
    rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(self.device)
    actions = torch.from_numpy(batch['actions']).to(self.device)

    return Transitions(
        size=self.batch_size,
        environment_information=None,
        transition_tensors={
            'observations': observations,
            'observations_next': observations_plus1,
            'dones': dones,
            'rewards': rewards,
            'actions': actions,
            'weights': torch.ones_like(rewards)
        }
    )
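
# sample_batch_uniform draws indices from the replay backend; its internals are not shown.
# A rough sketch of what uniform index sampling from a circular buffer could look like,
# assuming the buffer exposes its current size and write pointer (illustrative only,
# wrap-around handling omitted):
import numpy as np


def sample_batch_uniform_sketch(current_size, write_idx, batch_size, history_length):
    """ Draw batch_size indices, skipping ones too close to the write pointer
        so that a full frame stack and a valid next state exist """
    indices = []
    while len(indices) < batch_size:
        candidate = np.random.randint(history_length, current_size)
        # Reject indices whose frame stack would straddle the write pointer
        if not (write_idx <= candidate < write_idx + history_length):
            indices.append(candidate)
    return np.array(indices)
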
def rollout(self, batch_info, model) -> Rollout:
    """ Roll out the environment for a single step and return the transition """
    epsilon_value = self.epsilon_schedule.value(batch_info['progress'])
    batch_info['epsilon'] = epsilon_value

    last_observation = np.concatenate([
        self.backend.get_frame(self.backend.current_idx, self.frame_stack - 1),
        self.last_observation
    ], axis=-1)

    observation_tensor = torch.from_numpy(last_observation[None]).to(self.device)

    step = model.step(observation_tensor)
    epsgreedy_step = self.epsgreedy_action(step['actions'], epsilon_value)
    action = epsgreedy_step.item()

    observation, reward, done, info = self.environment.step(action)

    self.backend.store_transition(self.last_observation, action, reward, done)

    # As usual, reset the environment on done
    if done:
        observation = self.environment.reset()

    self.last_observation = observation

    return Transitions(
        size=1,
        environment_information=[info],
        transition_tensors={
            'actions': epsgreedy_step.unsqueeze(0),
            'values': step['values']
        },
        extra_data={'epsilon': epsilon_value}
    )
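
# epsgreedy_action is not shown above, and the exact model interface is an assumption.
# A minimal sketch of epsilon-greedy selection, assuming the model exposes per-action
# Q-values of shape (batch, num_actions) (function name below is illustrative):
import torch


def epsgreedy_action_sketch(q_values: torch.Tensor, epsilon: float) -> torch.Tensor:
    """ With probability epsilon pick a uniformly random action, otherwise the greedy one """
    greedy = q_values.argmax(dim=-1)
    random_actions = torch.randint(q_values.size(-1), greedy.shape, device=q_values.device)
    take_random = torch.rand(greedy.shape, device=q_values.device) < epsilon
    return torch.where(take_random, random_actions, greedy)
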
def sample(self, batch_info, model) -> Transitions:
    """ Sample experience from replay buffer and return a batch """
    probs, indexes, tree_idxs = self.backend.sample_batch_prioritized(self.batch_size, self.frame_stack)
    batch = self.backend.get_batch(indexes, self.frame_stack)

    # Normalize the importance-sampling weights
    priority_weight = self.priority_weight_schedule.value(batch_info['progress'])

    probs = np.stack(probs) / self.backend.segment_tree.total()
    capacity = self.backend.deque.current_size
    weights = (capacity * probs) ** (-priority_weight)
    weights = weights / weights.max()

    observations = torch.from_numpy(batch['states']).to(self.device)
    observations_plus1 = torch.from_numpy(batch['states+1']).to(self.device)
    dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(self.device)
    rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(self.device)
    actions = torch.from_numpy(batch['actions']).to(self.device)
    weights = torch.from_numpy(weights.astype(np.float32)).to(self.device)

    return Transitions(
        size=self.batch_size,
        environment_information=None,
        transition_tensors={
            'observations': observations,
            'observations_next': observations_plus1,
            'dones': dones,
            'rewards': rewards,
            'actions': actions,
            'weights': weights,
        },
        extra_data={
            'tree_idxs': tree_idxs
        }
    )
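
# The weight computation above follows the prioritized-replay correction
# w_i = (N * P(i)) ** (-beta), normalized by max_i w_i so weights stay in (0, 1].
# A tiny self-contained check of the same arithmetic (the numbers are made up):
import numpy as np

priorities = np.array([0.5, 0.1, 0.4])    # hypothetical sampled priorities
probs = priorities / priorities.sum()     # P(i), as done above via segment_tree.total()
capacity = 3                              # current number of stored transitions
beta = 0.4                                # priority_weight at this point of training

weights = (capacity * probs) ** (-beta)
weights = weights / weights.max()         # the rarest transition gets weight 1.0
print(weights)                            # approximately [0.52, 1.0, 0.57]
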
def sample(self, batch_info, model) -> Transitions:
    """ Sample experience from replay buffer and return a batch """
    indexes = self.backend.sample_batch_uniform(self.batch_size, history_length=1)
    batch = self.backend.get_batch(indexes, history_length=1)

    observations = self._observation_list_to_tensor(batch['states'])
    observations_plus1 = self._observation_list_to_tensor(batch['states+1'])

    rewards = batch['rewards'].astype(np.float32)

    if self.ret_rms is not None:
        rewards = np.clip(rewards / np.sqrt(self.ret_rms.var + 1e-8), -self.clip_obs, self.clip_obs)

    dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(self.device)
    rewards = torch.from_numpy(rewards).to(self.device)
    actions = torch.from_numpy(batch['actions']).to(self.device)

    return Transitions(
        size=self.batch_size,
        environment_information=[],
        transition_tensors={
            'observations': observations,
            'observations_next': observations_plus1,
            'dones': dones,
            'rewards': rewards,
            'actions': actions
        }
    )
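
# ret_rms above tracks running statistics of the discounted return; its class is not shown.
# A minimal sketch of such a tracker, assuming the usual running mean/variance update
# (the class below is illustrative, not the one used by this roller):
import numpy as np


class RunningMeanStdSketch:
    """ Track running mean and variance of a stream of values """

    def __init__(self, epsilon=1e-4):
        self.mean = 0.0
        self.var = 1.0
        self.count = epsilon

    def update(self, batch):
        batch = np.asarray(batch, dtype=np.float64)
        batch_mean = batch.mean()
        batch_var = batch.var()
        batch_count = batch.size

        delta = batch_mean - self.mean
        total = self.count + batch_count

        # Chan et al. parallel mean/variance combination
        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + delta ** 2 * self.count * batch_count / total

        self.mean = new_mean
        self.var = m2 / total
        self.count = total
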
def rollout(self, batch_info, model) -> Rollout:
    """ Roll out the environment for a single step and return the transition """
    observation_tensor = self._observation_to_tensor(self.last_observation)

    step = model.step(observation_tensor)
    action = step['actions'].detach().cpu().numpy()[0]
    noise = self.noise_process()

    action_perturbed = np.clip(
        action + noise,
        self.environment.action_space.low,
        self.environment.action_space.high
    )

    observation, reward, done, info = self.environment.step(action_perturbed)

    if self.ob_rms is not None:
        self.ob_rms.update(observation)

    if self.ret_rms is not None:
        self.accumulated_return = reward + self.discount_factor * self.accumulated_return
        self.ret_rms.update(np.array([self.accumulated_return]))

    self.backend.store_transition(self.last_observation, action_perturbed, reward, done)

    # As usual, reset the environment on done
    if done:
        observation = self.environment.reset()
        self.noise_process.reset()
        self.accumulated_return = 0.0

    self.last_observation = observation

    return Transitions(
        size=1,
        environment_information=[info],
        transition_tensors={
            'actions': step['actions'],
            'values': step['values']
        },
    )
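
# The accumulated_return update above keeps a per-environment discounted return estimate
# that feeds ret_rms. A tiny illustration of how that accumulator evolves over a few steps
# (rewards and discount are made-up numbers):
discount_factor = 0.99
accumulated_return = 0.0
for reward in [1.0, 0.0, 2.0]:
    accumulated_return = reward + discount_factor * accumulated_return
    # accumulator takes the values 1.0, 0.99, 2.9801
print(accumulated_return)  # 2.9801
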
def sample(self, batch_info, model) -> Transitions:
    """ Sample experience from replay buffer and return a batch """
    indexes = self.backend.sample_batch_uniform(self.batch_size, history_length=1)
    batch = self.backend.get_batch(indexes, history_length=1)

    observations = torch.from_numpy(self._filter_observation(batch['states'])).to(self.device)
    observations_plus1 = torch.from_numpy(self._filter_observation(batch['states+1'])).to(self.device)
    dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(self.device)
    rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(self.device)
    actions = torch.from_numpy(batch['actions']).to(self.device)

    return Transitions(
        size=self.batch_size,
        environment_information=[],
        transition_tensors={
            'observations': observations,
            'observations_next': observations_plus1,
            'dones': dones,
            'rewards': rewards,
            'actions': actions
        }
    )
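
# _filter_observation is not shown above. One plausible sketch, assuming the replay buffer
# stores dictionary-style observations and the model only consumes a single array from them
# (the function name and the key 'observation' are purely illustrative):
import numpy as np


def filter_observation_sketch(batched_obs, key='observation'):
    """ Pull one field out of a batch of dict observations and stack it into an array """
    return np.stack([obs[key] for obs in batched_obs])
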