def _get_samples_with_support_for_octree(
        self,
        batch_inds: np.ndarray,
        env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
    if not self.contains_octree_obs:
        return __old_get_samples__(self, batch_inds=batch_inds, env=env)

    # Current observations
    obs = self.observations[batch_inds, 0, :]
    obs = preprocess_stacked_octree_batch(obs, self.device)

    # Next observations
    if self.optimize_memory_usage:
        next_obs = self.observations[(batch_inds + 1) % self.buffer_size, 0, :]
    else:
        next_obs = self.next_observations[batch_inds, 0, :]
    next_obs = preprocess_stacked_octree_batch(next_obs, self.device)

    return ReplayBufferSamples(
        observations=obs,
        actions=self.to_torch(self.actions[batch_inds, 0, :]),
        next_observations=next_obs,
        dones=self.to_torch(self.dones[batch_inds]),
        rewards=self.to_torch(self._normalize_reward(self.rewards[batch_inds], env)),
    )
def _get_seq_samples(self,
                     seq,
                     batch_inds: np.ndarray,
                     env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
    # Build length-`seq` next-observation sequences, one per sampled start index
    if self.optimize_memory_usage:
        index = (batch_inds + 1) % self.buffer_size
        for i in range(len(batch_inds)):
            if i == 0:
                ndata = np.expand_dims(self.observations[index[i]:index[i] + seq, 0, :], 0)
            else:
                ndata = np.append(ndata, np.expand_dims(self.observations[index[i]:index[i] + seq, 0, :], 0), axis=0)
        next_obs = self._normalize_obs(ndata, env)
    else:
        for i in range(len(batch_inds)):
            if i == 0:
                ndata = np.expand_dims(self.next_observations[batch_inds[i]:batch_inds[i] + seq, 0, :], 0)
            else:
                ndata = np.append(ndata, np.expand_dims(self.next_observations[batch_inds[i]:batch_inds[i] + seq, 0, :], 0), axis=0)
        next_obs = self._normalize_obs(ndata, env)

    # Build the matching length-`seq` sequences of observations, actions, dones and rewards
    for i in range(len(batch_inds)):
        if i == 0:
            odata = np.expand_dims(self.observations[batch_inds[i]:batch_inds[i] + seq, 0, :], 0)
            adata = np.expand_dims(self.actions[batch_inds[i]:batch_inds[i] + seq, 0, :], 0)
            ddata = np.expand_dims(self.dones[batch_inds[i]:batch_inds[i] + seq], 0)
            rdata = np.expand_dims(self.rewards[batch_inds[i]:batch_inds[i] + seq], 0)
        else:
            odata = np.append(odata, np.expand_dims(self.observations[batch_inds[i]:batch_inds[i] + seq, 0, :], 0), axis=0)
            adata = np.append(adata, np.expand_dims(self.actions[batch_inds[i]:batch_inds[i] + seq, 0, :], 0), axis=0)
            ddata = np.append(ddata, np.expand_dims(self.dones[batch_inds[i]:batch_inds[i] + seq], 0), axis=0)
            rdata = np.append(rdata, np.expand_dims(self.rewards[batch_inds[i]:batch_inds[i] + seq], 0), axis=0)

    data = (
        self._normalize_obs(odata, env),
        adata,
        next_obs,
        ddata,
        self._normalize_reward(rdata, env),
    )
    return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
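# A minimal sketch (not from the original code) of an alternative to the repeated
# np.append calls in _get_seq_samples above: np.append copies the whole batch array on
# every iteration, whereas collecting the slices in a list and calling np.stack once
# builds the same (batch, seq, ...) array in a single allocation. The helper name and
# the commented usage are assumptions for illustration only.
import numpy as np

def stack_sequences(array: np.ndarray, start_inds: np.ndarray, seq: int) -> np.ndarray:
    """Stack `seq`-step slices of `array` (taken along axis 0) into a (batch, seq, ...) array."""
    # As in the loop above, this assumes every slice fits inside the buffer,
    # i.e. start + seq does not run past the end of `array`.
    return np.stack([array[start:start + seq] for start in start_inds], axis=0)

# Hypothetical usage mirroring the observation sequences built above:
# odata = stack_sequences(self.observations[:, 0, :], batch_inds, seq)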
def _get_samples(
        self,
        batch_inds: np.ndarray,
        env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
    # Sample randomly the env idx
    env_indices = np.random.randint(0, high=self.n_envs, size=(len(batch_inds),))

    if self.optimize_memory_usage:
        next_obs = self._normalize_obs(
            self.observations[(batch_inds + 1) % self.buffer_size, env_indices, :], env)
    else:
        next_obs = self._normalize_obs(
            self.next_observations[batch_inds, env_indices, :], env)

    data = (
        self._normalize_obs(self.observations[batch_inds, env_indices, :], env),
        self.actions[batch_inds, env_indices, :],
        next_obs,
        # Only use dones that are not due to timeouts
        # deactivated by default (timeouts is initialized as an array of False)
        (self.dones[batch_inds, env_indices] * (1 - self.timeouts[batch_inds, env_indices])).reshape(-1, 1),
        self._normalize_reward(self.rewards[batch_inds, env_indices].reshape(-1, 1), env),
    )
    return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
def _get_samples(
        self,
        batch_inds: np.ndarray,
        env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
    data = (
        self._normalize_obs(self.observations[batch_inds, 0, :], env),
        self.actions[batch_inds, 0, :],
        self._normalize_obs(self.next_observations[batch_inds, 0, :], env),
        self.dones[batch_inds],
        self._normalize_reward(self.rewards[batch_inds], env),
    )
    return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
def _get_samples(self,
                 batch_inds: np.ndarray,
                 env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
    if self.optimize_memory_usage:
        next_obs = self._normalize_obs(
            self.observations[(batch_inds + 1) % self.buffer_size, 0, :], env)
    else:
        next_obs = self._normalize_obs(self.next_observations[batch_inds, 0, :], env)

    data = (
        self._normalize_obs(self.observations[batch_inds, 0, :], env),
        self.actions[batch_inds, 0, :],
        next_obs,
        self.dones[batch_inds],
        self._normalize_reward(self.rewards[batch_inds], env),
    )
    return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
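# A minimal caller sketch, assuming the usual Stable-Baselines3-style buffer attributes
# (buffer_size, pos, full, optimize_memory_usage); it is not part of the snippets above.
# It illustrates why the memory-optimized variants read next_obs from
# observations[(idx + 1) % buffer_size]: the current write position must then be
# excluded from sampling, because its "next" slot has already been overwritten.
import numpy as np

def sample(self, batch_size: int, env=None):
    if not self.optimize_memory_usage:
        upper_bound = self.buffer_size if self.full else self.pos
        batch_inds = np.random.randint(0, upper_bound, size=batch_size)
    elif self.full:
        # Shift away from `self.pos` so that (idx + 1) % buffer_size stays valid
        batch_inds = (np.random.randint(1, self.buffer_size, size=batch_size) + self.pos) % self.buffer_size
    else:
        batch_inds = np.random.randint(0, self.pos, size=batch_size)
    return self._get_samples(batch_inds, env=env)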
def _get_samples(self,
                 batch_inds: np.ndarray,
                 env_idx: int,
                 env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
    if self.optimize_memory_usage:
        next_obs = self._normalize_obs(
            self.observations[(batch_inds + 1) % self.buffer_size, env_idx, :], env)
    else:
        next_obs = self._normalize_obs(self.next_observations[batch_inds, env_idx, :], env)

    data = (
        self._normalize_obs(self.observations[batch_inds, env_idx, :], env),
        self.actions[batch_inds, env_idx, :],
        next_obs,
        # keep the (batch_size, 1) dimension requirement
        self.dones[batch_inds, env_idx, None],
        # keep the (batch_size, 1) dimension requirement
        self._normalize_reward(self.rewards[batch_inds, env_idx, None], env),
    )
    return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
def _get_samples(self,
                 batch_inds: np.ndarray,
                 env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
    if self.optimize_memory_usage:
        next_obs = self._normalize_obs(
            self.observations[(batch_inds + 1) % self.buffer_size, 0, :], env)
    else:
        next_obs = self._normalize_obs(self.next_observations[batch_inds, 0, :], env)

    data = (
        self._normalize_obs(self.observations[batch_inds, 0, :], env),
        self.actions[batch_inds, 0, :],
        next_obs,
        # Only use dones that are not due to timeouts
        # deactivated by default (timeouts is initialized as an array of False)
        self.dones[batch_inds] * (1 - self.timeouts[batch_inds]),
        self._normalize_reward(self.rewards[batch_inds], env),
    )
    return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
def _get_samples(
        self,
        batch_inds: np.ndarray,
        env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
    if self.optimize_memory_usage:
        next_obs = self._normalize_obs(
            self.observations[(batch_inds + 1) % self.buffer_size, 0, :], env)
    else:
        next_obs = self._normalize_obs(self.next_observations[batch_inds, 0, :], env)

    # dones stays a np.ndarray here because torch tensors do not support
    # subtraction of boolean types, whereas numpy does
    data = (
        self._normalize_obs(self.observations[batch_inds, 0, :], env),
        self.actions[batch_inds, 0, :],
        next_obs,
        self.dones[batch_inds].astype(int),
        self._normalize_reward(self.rewards[batch_inds], env),
    )
    return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
def _get_samples(
        self,
        batch_inds: np.ndarray,
        env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
    # Balanced sampling: half of the batch from the learner buffer, half from the expert data
    num_samples = len(batch_inds)
    num_expert_samples = int(num_samples / 2)
    batch_inds = batch_inds[:num_expert_samples]
    expert_inds = np.random.randint(0, len(self.expert_states), size=num_expert_samples)

    if self.optimize_memory_usage:
        next_obs = self._normalize_obs(
            self.observations[(batch_inds + 1) % self.buffer_size, 0, :], env)
    else:
        next_obs = self._normalize_obs(self.next_observations[batch_inds, 0, :], env)
    next_obs = np.concatenate(
        (next_obs, self._normalize_obs(self.expert_next_states[expert_inds], env)), axis=0)

    obs = self._normalize_obs(self.observations[batch_inds, 0, :], env)
    obs = np.concatenate(
        (obs, self._normalize_obs(self.expert_states[expert_inds], env)), axis=0)

    actions = self.actions[batch_inds, 0, :]
    actions = np.concatenate(
        (actions, self.expert_actions[expert_inds].reshape(num_expert_samples, -1)), axis=0)

    dones = self.dones[batch_inds]
    dones = np.concatenate(
        (dones, self.expert_dones[expert_inds].reshape(num_expert_samples, -1)), axis=0)

    # SQIL rewards: 0 for learner transitions, 1 for expert transitions
    rewards = self.rewards[batch_inds] * 0.
    rewards = np.concatenate((rewards, np.ones_like(rewards)), axis=0)

    data = (obs, actions, next_obs, dones, rewards)
    return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
def _get_samples(self,
                 batch_inds: np.ndarray,
                 env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
    if not self.add_her_while_sampling:
        data = (
            self._normalize_obs(self.observations[batch_inds, 0, :], env),
            self.actions[batch_inds, 0, :],
            self._normalize_obs(self.next_observations[batch_inds, 0, :], env),
            self.dones[batch_inds],
            self._normalize_reward(self.rewards[batch_inds], env),
        )
        return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
    else:
        # Sampling inspired by
        # https://github.com/openai/baselines/blob/master/baselines/her/her_sampler.py
        # TODO: Implement modes other than future
        if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE:
            future_p = 1 - (1. / (1 + self.n_sampled_goal))
        elif self.goal_selection_strategy == GoalSelectionStrategy.FINAL:
            # future_t is always the last timestep; the rest of the code is the same
            raise NotImplementedError
        elif self.goal_selection_strategy == GoalSelectionStrategy.EPISODE:
            # future_t is a random value from 0 to the last timestep
            raise NotImplementedError
        elif self.goal_selection_strategy == GoalSelectionStrategy.RANDOM:
            # sample a second set of episode + timestep indices and use those
            # achieved goals as desired goals for the first set
            raise NotImplementedError
        else:
            raise ValueError("Invalid goal selection strategy, "
                             "please use one of {}".format(list(GoalSelectionStrategy)))

        episode_inds = batch_inds  # renaming for clarity
        max_timestep_inds = self.n_episode_steps[episode_inds]
        batch_size = len(episode_inds)
        # randint does not support an array of upper bounds, so use np.random.uniform with floor instead
        timestep_inds = np.floor(np.random.uniform(high=max_timestep_inds)).astype(int)

        # Select future time indices with probability future_p. These will be used
        # for HER replay by substituting in future goals.
        her_inds = np.where(np.random.uniform(size=batch_size) < future_p)
        future_offset = np.random.uniform(size=batch_size) * (max_timestep_inds - timestep_inds)
        future_offset = future_offset.astype(int)
        future_t = (timestep_inds + 1 + future_offset)[her_inds]

        # Replace the goal with the achieved goal, but only for the previously selected
        # HER transitions (as defined by her_inds). For the other transitions,
        # keep the original goal.
        observations_dict = self.env.convert_obs_to_dict(self.observations[episode_inds])
        future_ag = observations_dict['achieved_goal'][her_inds, future_t, np.newaxis][0]
        observations_dict['desired_goal'][her_inds, :] = future_ag
        # Skip the reward computed for initial states
        rewards = self.env.compute_reward(observations_dict['achieved_goal'],
                                          observations_dict['desired_goal'],
                                          None)[:, 1:].astype(np.float32)
        obs = self.env.convert_dict_to_obs(observations_dict)

        data = (
            self._normalize_obs(obs[np.arange(obs.shape[0]), timestep_inds][:, 0], env),
            self.actions[episode_inds][np.arange(batch_size), timestep_inds][:, 0],
            self._normalize_obs(obs[np.arange(obs.shape[0]), timestep_inds + 1][:, 0], env),
            self.dones[episode_inds][np.arange(batch_size), timestep_inds],
            self._normalize_reward(rewards[np.arange(batch_size), timestep_inds], env),
        )
        return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
def _sample_transitions(
    self,
    batch_size: Optional[int],
    maybe_vec_env: Optional[VecNormalize],
    online_sampling: bool,
    n_sampled_goal: Optional[int] = None,
) -> Union[ReplayBufferSamples, Tuple[np.ndarray, ...]]:
    """
    :param batch_size: Number of elements to sample (only used for online sampling)
    :param maybe_vec_env: Associated gym VecEnv to normalize the observations/rewards.
        Only valid when using online sampling
    :param online_sampling: Whether to use online sampling for HER or not.
    :param n_sampled_goal: Number of sampled goals for replay (offline sampling).
    :return: Samples.
    """
    # Select which episodes to use
    if online_sampling:
        assert batch_size is not None, "No batch_size specified for online sampling of HER transitions"
        # Do not sample the episode with index `self.pos` as the episode is invalid
        if self.full:
            episode_indices = (
                np.random.randint(1, self.n_episodes_stored, batch_size) + self.pos) % self.n_episodes_stored
        else:
            episode_indices = np.random.randint(0, self.n_episodes_stored, batch_size)
        # A subset of the transitions will be relabeled using the HER algorithm
        her_indices = np.arange(batch_size)[:int(self.her_ratio * batch_size)]
    else:
        assert maybe_vec_env is None, "Transitions must be stored unnormalized in the replay buffer"
        assert n_sampled_goal is not None, "No n_sampled_goal specified for offline sampling of HER transitions"
        # Offline sampling: there is only one episode stored
        episode_length = self.episode_lengths[0]
        # We sample n_sampled_goal per timestep in the episode (only one is stored).
        episode_indices = np.tile(0, (episode_length * n_sampled_goal))
        # We only sample virtual transitions,
        # as real transitions are already stored in the replay buffer
        her_indices = np.arange(len(episode_indices))

    ep_lengths = self.episode_lengths[episode_indices]

    # Special case when using the "future" goal sampling strategy:
    # we cannot sample all transitions, we have to remove the last timestep
    if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE:
        # restrict the sampling domain when ep_lengths > 1
        # otherwise filter out the indices
        her_indices = her_indices[ep_lengths[her_indices] > 1]
        ep_lengths[her_indices] -= 1

    if online_sampling:
        # Select which transitions to use
        transitions_indices = np.random.randint(ep_lengths)
    else:
        if her_indices.size == 0:
            # Episode of one timestep, not enough for using the "future" strategy,
            # no virtual transitions are created in that case
            return np.zeros(0), np.zeros(0), np.zeros(0), np.zeros(0)
        else:
            # Repeat every transition index n_sampled_goal times
            # to sample n_sampled_goal per timestep in the episode (only one is stored),
            # now with the corrected episode length when using the "future" strategy
            transitions_indices = np.tile(np.arange(ep_lengths[0]), n_sampled_goal)
            episode_indices = episode_indices[transitions_indices]
            her_indices = np.arange(len(episode_indices))

    # Get selected transitions
    transitions = {
        key: self.buffer[key][episode_indices, transitions_indices].copy()
        for key in self.buffer.keys()
    }

    # Sample new desired goals and relabel the transitions
    new_goals = self.sample_goals(episode_indices, her_indices, transitions_indices)
    transitions["desired_goal"][her_indices] = new_goals

    # Convert info buffer to numpy array
    transitions["info"] = np.array([
        self.info_buffer[episode_idx][transition_idx]
        for episode_idx, transition_idx in zip(episode_indices, transitions_indices)
    ])

    # Vectorized computation of the new reward
    transitions["reward"][her_indices, 0] = self.env.env_method(
        "compute_reward",
        # The new state depends on the previous state and action
        # s_{t+1} = f(s_t, a_t)
        # so the next_achieved_goal depends also on the previous state and action
        # because we are in a GoalEnv:
        # r_t = reward(s_t, a_t) = reward(next_achieved_goal, desired_goal)
        # therefore we have to use "next_achieved_goal" and not "achieved_goal"
        transitions["next_achieved_goal"][her_indices, 0],
        # here we use the new desired goal
        transitions["desired_goal"][her_indices, 0],
        transitions["info"][her_indices, 0],
    )

    # Concatenate observation with (desired) goal
    observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, maybe_vec_env))
    # HACK to make normalize obs work with the next observation
    transitions["observation"] = transitions["next_obs"]
    next_observations = ObsDictWrapper.convert_dict(self._normalize_obs(transitions, maybe_vec_env))

    if online_sampling:
        data = (
            observations[:, 0],
            transitions["action"],
            next_observations[:, 0],
            transitions["done"],
            self._normalize_reward(transitions["reward"], maybe_vec_env),
        )
        return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
    else:
        return observations, next_observations, transitions["action"], transitions["reward"]
def _get_samples(
        self,
        batch_inds: np.ndarray,
        env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
    num_samples = len(batch_inds)
    if self.balanced:
        # balanced sampling
        num_expert_samples = int(num_samples / 2)
        batch_inds = batch_inds[:num_expert_samples]
        expert_inds = np.random.randint(0, len(self.expert_states), size=num_expert_samples)

        if self.optimize_memory_usage:
            next_obs = self._normalize_obs(
                self.observations[(batch_inds + 1) % self.buffer_size, 0, :], env)
        else:
            next_obs = self._normalize_obs(self.next_observations[batch_inds, 0, :], env)
        next_obs = np.concatenate(
            (next_obs, self._normalize_obs(self.expert_next_states[expert_inds], env)), axis=0)

        obs = self._normalize_obs(self.observations[batch_inds, 0, :], env)
        obs = np.concatenate(
            (obs, self._normalize_obs(self.expert_states[expert_inds], env)), axis=0)

        actions = self.actions[batch_inds, 0, :]
        actions = np.concatenate(
            (actions, self.expert_actions[expert_inds].reshape(num_expert_samples, -1)), axis=0)

        dones = self.dones[batch_inds]
        dones = np.concatenate(
            (dones, self.expert_dones[expert_inds].reshape(num_expert_samples, -1)), axis=0)

        # AdRIL Rewards (indicator kernel)
        mask1 = (self.rewards[batch_inds] >= 0).astype(np.float32)
        mask2 = (self.rewards[batch_inds] < self.iter).astype(np.float32)
        r1 = -(1. ** (-self.rewards[batch_inds])) * mask1 * mask2  # Past iter
        r2 = np.zeros_like(self.rewards[batch_inds]) * mask1 * (1 - mask2)  # current iter
        r3 = -self.rewards[batch_inds] * (1 - mask1)  # Expert
        if self.iter > 0:
            rewards = (r1 / self.N_learner) + r2 + r3
        else:
            rewards = r1 + r2 + r3
        rewards = np.concatenate((rewards, np.ones_like(rewards) / self.N_expert), axis=0)
    else:
        if self.optimize_memory_usage:
            next_obs = self._normalize_obs(
                self.observations[(batch_inds + 1) % self.buffer_size, 0, :], env)
        else:
            next_obs = self._normalize_obs(self.next_observations[batch_inds, 0, :], env)
        obs = self._normalize_obs(self.observations[batch_inds, 0, :], env)
        actions = self.actions[batch_inds, 0, :]
        dones = self.dones[batch_inds]

        # AdRIL Rewards (indicator kernel)
        mask1 = (self.rewards[batch_inds] >= 0).astype(np.float32)
        mask2 = (self.rewards[batch_inds] < self.iter).astype(np.float32)
        r1 = -(1. ** (-self.rewards[batch_inds])) * mask1 * mask2  # Past iter
        r2 = np.zeros_like(self.rewards[batch_inds]) * mask1 * (1 - mask2)  # current iter
        r3 = -self.rewards[batch_inds] * (1 - mask1) / self.N_expert  # Expert
        if self.iter > 0:
            rewards = (r1 * 1. / self.N_learner) + r2 + r3
        else:
            rewards = r1 + r2 + r3

    data = (obs, actions, next_obs, dones, rewards)
    return ReplayBufferSamples(*tuple(map(self.to_torch, data)))