Example #1
def _get_samples_with_support_for_octree(
        self,
        batch_inds: np.ndarray,
        env: Optional[VecNormalize] = None) -> ReplayBufferSamples:

    if not self.contains_octree_obs:
        return __old_get_samples__(self, batch_inds=batch_inds, env=env)

    # Current observations
    obs = self.observations[batch_inds, 0, :]
    obs = preprocess_stacked_octree_batch(obs, self.device)

    # Next observations
    if self.optimize_memory_usage:
        next_obs = self.observations[(batch_inds + 1) % self.buffer_size, 0, :]
    else:
        next_obs = self.next_observations[batch_inds, 0, :]
    next_obs = preprocess_stacked_octree_batch(next_obs, self.device)

    return ReplayBufferSamples(
        observations=obs,
        actions=self.to_torch(self.actions[batch_inds, 0, :]),
        next_observations=next_obs,
        dones=self.to_torch(self.dones[batch_inds]),
        rewards=self.to_torch(
            self._normalize_reward(self.rewards[batch_inds], env)),
    )
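
Example #1 falls back to `__old_get_samples__`, which suggests the function is installed as a monkey patch over an existing buffer method. A minimal wiring sketch, assuming the target is Stable-Baselines3's `ReplayBuffer` (the actual class and attribute names in the source project may differ):

# Hypothetical wiring for the override above: keep a handle to the stock method,
# then patch the octree-aware replacement in so non-octree buffers keep the
# original behaviour via the fallback call.
from stable_baselines3.common.buffers import ReplayBuffer

__old_get_samples__ = ReplayBuffer._get_samples
ReplayBuffer._get_samples = _get_samples_with_support_for_octree
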
Example #2
    def _get_seq_samples(self, seq, batch_inds: np.ndarray, env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
        if self.optimize_memory_usage:
            index = (batch_inds + 1) % self.buffer_size
            for i in range(len(batch_inds)):
                if i == 0:
                    ndata = np.expand_dims(self.observations[index[i]:index[i]+seq, 0, :], 0)
                else:
                    ndata = np.append(ndata, np.expand_dims(self.observations[index[i]:index[i]+seq, 0, :], 0), axis=0)
            next_obs = self._normalize_obs(ndata, env)
        else:
            for i in range(len(batch_inds)):
                if i == 0:
                    ndata = np.expand_dims(self.next_observations[batch_inds[i]:batch_inds[i]+seq, 0, :], 0)
                else:
                    ndata = np.append(ndata, np.expand_dims(self.next_observations[batch_inds[i]:batch_inds[i]+seq, 0, :], 0), axis=0)
            next_obs = self._normalize_obs(ndata, env)
        for i in range(len(batch_inds)):
            if i == 0:
                odata = np.expand_dims(self.observations[batch_inds[i]:batch_inds[i]+seq, 0, :], 0)
                adata = np.expand_dims(self.actions[batch_inds[i]:batch_inds[i]+seq, 0, :], 0)
                ddata = np.expand_dims(self.dones[batch_inds[i]:batch_inds[i]+seq], 0)
                rdata = np.expand_dims(self.rewards[batch_inds[i]:batch_inds[i]+seq], 0)
            else:
                odata = np.append(odata, np.expand_dims(self.observations[batch_inds[i]:batch_inds[i]+seq, 0, :], 0), axis=0)
                adata = np.append(adata, np.expand_dims(self.actions[batch_inds[i]:batch_inds[i]+seq, 0, :], 0), axis=0)
                ddata = np.append(ddata, np.expand_dims(self.dones[batch_inds[i]:batch_inds[i]+seq], 0), axis=0)
                rdata = np.append(rdata, np.expand_dims(self.rewards[batch_inds[i]:batch_inds[i]+seq], 0), axis=0)
        data = (
            self._normalize_obs(odata, env),
            adata,
            next_obs,
            ddata,
            self._normalize_reward(rdata, env),
        )
        return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
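
The `np.expand_dims`/`np.append` loop in Example #2 grows a `(batch, seq, ...)` array one slice at a time. A standalone sketch of the same stacking on toy data (the array names and shapes below are made up for illustration):

import numpy as np

observations = np.arange(20 * 3, dtype=np.float32).reshape(20, 1, 3)  # (buffer_size, 1, obs_dim)
batch_inds = np.array([0, 5, 10])
seq = 4

# One (seq, obs_dim) window per sampled index, stacked into (batch, seq, obs_dim).
odata = np.stack([observations[i:i + seq, 0, :] for i in batch_inds], axis=0)
print(odata.shape)  # (3, 4, 3)
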
Example #3
    def _get_samples(
            self,
            batch_inds: np.ndarray,
            env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
        # Randomly sample the env index for each transition
        env_indices = np.random.randint(0,
                                        high=self.n_envs,
                                        size=(len(batch_inds), ))

        if self.optimize_memory_usage:
            next_obs = self._normalize_obs(
                self.observations[(batch_inds + 1) % self.buffer_size,
                                  env_indices, :], env)
        else:
            next_obs = self._normalize_obs(
                self.next_observations[batch_inds, env_indices, :], env)

        data = (
            self._normalize_obs(self.observations[batch_inds, env_indices, :],
                                env),
            self.actions[batch_inds, env_indices, :],
            next_obs,
            # Only use dones that are not due to timeouts
            # deactivated by default (timeouts is initialized as an array of False)
            (self.dones[batch_inds, env_indices] *
             (1 - self.timeouts[batch_inds, env_indices])).reshape(-1, 1),
            self._normalize_reward(
                self.rewards[batch_inds, env_indices].reshape(-1, 1), env),
        )
        return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
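
The `dones * (1 - timeouts)` product in Example #3 masks out episode ends that were caused only by a time limit, so the critic can still bootstrap through them. A tiny illustration with made-up flags:

import numpy as np

dones = np.array([1.0, 1.0, 0.0, 1.0])     # episode ended at these transitions
timeouts = np.array([0.0, 1.0, 0.0, 0.0])  # the second end was only a time limit

effective_dones = dones * (1 - timeouts)   # time-limit ends are treated as "not done"
print(effective_dones)                     # [1. 0. 0. 1.]
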
Example #4
 def _get_samples(
         self,
         batch_inds: np.ndarray,
         env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
     data = (self._normalize_obs(self.observations[batch_inds, 0, :],
                                 env), self.actions[batch_inds, 0, :],
             self._normalize_obs(self.next_observations[batch_inds, 0, :],
                                 env), self.dones[batch_inds],
             self._normalize_reward(self.rewards[batch_inds], env))
     return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
Example #5
    def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
        if self.optimize_memory_usage:
            next_obs = self._normalize_obs(self.observations[(batch_inds + 1) % self.buffer_size, 0, :], env)
        else:
            next_obs = self._normalize_obs(self.next_observations[batch_inds, 0, :], env)

        data = (
            self._normalize_obs(self.observations[batch_inds, 0, :], env),
            self.actions[batch_inds, 0, :],
            next_obs,
            self.dones[batch_inds],
            self._normalize_reward(self.rewards[batch_inds], env),
        )
        return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
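
With `optimize_memory_usage=True` (Examples #5, #7, #8 and others), no separate `next_observations` array is stored: the next observation of the transition in slot `i` is simply the observation written to slot `(i + 1) % buffer_size`. A toy illustration of the wrap-around indexing (array contents are made up):

import numpy as np

buffer_size = 5
observations = np.arange(buffer_size * 2).reshape(buffer_size, 2)  # pretend (buffer_size, obs_dim)
batch_inds = np.array([1, 3, 4])

next_obs = observations[(batch_inds + 1) % buffer_size]  # slot 4 wraps around to slot 0
print(next_obs)
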
Example #6
    def _get_samples(self, batch_inds: np.ndarray, env_idx: int, env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
        if self.optimize_memory_usage:
            next_obs = self._normalize_obs(self.observations[(batch_inds + 1) % self.buffer_size, env_idx, :], env)
        else:
            next_obs = self._normalize_obs(self.next_observations[batch_inds, env_idx, :], env)

        data = (
            self._normalize_obs(self.observations[batch_inds, env_idx, :], env),
            self.actions[batch_inds, env_idx, :],
            next_obs,
            self.dones[batch_inds, env_idx, None], # keep the (batch_size, 1) dimension requirement
            self._normalize_reward(self.rewards[batch_inds, env_idx, None], env), # keep the (batch_size, 1) dimension requirement
        )
        return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
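
Example #6 indexes with a trailing `None` to keep the `(batch_size, 1)` column shape that downstream losses expect. A shape check on a toy array:

import numpy as np

rewards = np.zeros((10, 4))          # (buffer_size, n_envs)
batch_inds = np.array([0, 2, 5])
env_idx = 1

print(rewards[batch_inds, env_idx].shape)        # (3,)
print(rewards[batch_inds, env_idx, None].shape)  # (3, 1) -- column dimension kept
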
Example #7
    def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None) -> ReplayBufferSamples:

        if self.optimize_memory_usage:
            next_obs = self._normalize_obs(self.observations[(batch_inds + 1) % self.buffer_size, 0, :], env)
        else:
            next_obs = self._normalize_obs(self.next_observations[batch_inds, 0, :], env)

        data = (
            self._normalize_obs(self.observations[batch_inds, 0, :], env),
            self.actions[batch_inds, 0, :],
            next_obs,
            # Only use dones that are not due to timeouts
            # deactivated by default (timeouts is initialized as an array of False)
            self.dones[batch_inds] * (1 - self.timeouts[batch_inds]),
            self._normalize_reward(self.rewards[batch_inds], env),
        )
        return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
Example #8
    def _get_samples(
            self,
            batch_inds: np.ndarray,
            env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
        if self.optimize_memory_usage:
            next_obs = self._normalize_obs(
                self.observations[(batch_inds + 1) % self.buffer_size, 0, :],
                env)
        else:
            next_obs = self._normalize_obs(
                self.next_observations[batch_inds, 0, :], env)

        # dones is kept as a np.ndarray and cast to int because
        # torch Tensors do not support subtraction of boolean types,
        # while NumPy does
        data = (self._normalize_obs(self.observations[batch_inds, 0, :],
                                    env), self.actions[batch_inds, 0, :],
                next_obs, self.dones[batch_inds].astype(int),
                self._normalize_reward(self.rewards[batch_inds], env))
        return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
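
The `astype(int)` in Example #8 exists because PyTorch rejects the `-` operator on boolean tensors, while NumPy promotes a boolean array when the other operand is an integer. A quick check (assuming `torch` is installed; the arrays are toy data):

import numpy as np
import torch

dones = np.array([True, False, True])
print(1 - dones)                               # NumPy promotes the bools: [0 1 0]

# 1 - torch.as_tensor(dones) raises "Subtraction, the `-` operator, with a bool tensor
# is not supported", hence the cast before the data reaches to_torch.
print(1 - torch.as_tensor(dones.astype(int)))  # tensor([0, 1, 0])
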
Example #9
    def _get_samples(
            self,
            batch_inds: np.ndarray,
            env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
        num_samples = len(batch_inds)
        num_expert_samples = int(num_samples / 2)
        batch_inds = batch_inds[:num_expert_samples]
        expert_inds = np.random.randint(0,
                                        len(self.expert_states),
                                        size=num_expert_samples)
        # Balanced sampling
        if self.optimize_memory_usage:
            next_obs = self._normalize_obs(
                self.observations[(batch_inds + 1) % self.buffer_size, 0, :],
                env)
        else:
            next_obs = self._normalize_obs(
                self.next_observations[batch_inds, 0, :], env)
        next_obs = np.concatenate(
            (next_obs,
             self._normalize_obs(self.expert_next_states[expert_inds], env)),
            axis=0)
        obs = self._normalize_obs(self.observations[batch_inds, 0, :], env)
        obs = np.concatenate(
            (obs, self._normalize_obs(self.expert_states[expert_inds], env)),
            axis=0)
        actions = self.actions[batch_inds, 0, :]
        actions = np.concatenate(
            (actions, self.expert_actions[expert_inds].reshape(
                num_expert_samples, -1)),
            axis=0)
        dones = self.dones[batch_inds]
        dones = np.concatenate((dones, self.expert_dones[expert_inds].reshape(
            num_expert_samples, -1)),
                               axis=0)
        # SQIL Rewards
        rewards = self.rewards[batch_inds] * 0.
        rewards = np.concatenate((rewards, np.ones_like(rewards)), axis=0)

        data = (obs, actions, next_obs, dones, rewards)
        return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
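
Example #9 applies SQIL-style relabeling: half the batch comes from the learner's buffer with its reward forced to 0, the other half from the expert data with reward 1. The reward construction in isolation, on toy values:

import numpy as np

learner_rewards = np.random.randn(4, 1)   # whatever the environment returned is discarded
rewards = learner_rewards * 0.0           # learner transitions -> 0
rewards = np.concatenate((rewards, np.ones_like(rewards)), axis=0)  # expert transitions -> 1
print(rewards.ravel())                    # [0. 0. 0. 0. 1. 1. 1. 1.]
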
Example #10
    def _get_samples(self,
                     batch_inds: np.ndarray,
                     env: Optional[VecNormalize] = None
                     ) -> ReplayBufferSamples:
        if not self.add_her_while_sampling:
            data = (self._normalize_obs(self.observations[batch_inds, 0, :], env),
                    self.actions[batch_inds, 0, :],
                    self._normalize_obs(self.next_observations[batch_inds, 0, :], env),
                    self.dones[batch_inds],
                    self._normalize_reward(self.rewards[batch_inds], env))
            return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
        else:
            '''
            Sampling inspired by https://github.com/openai/baselines/blob/master/baselines/her/her_sampler.py
            '''

            # TODO: Implement modes other than future
            if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE:
                future_p = 1 - (1. / (1 + self.n_sampled_goal))
            elif self.goal_selection_strategy == GoalSelectionStrategy.FINAL:
                raise NotImplementedError
            # future_t is always last timestep. Rest of code is same
            elif self.goal_selection_strategy == GoalSelectionStrategy.EPISODE:
                raise NotImplementedError
            # future_t is random value from 0 to last timestep
            elif self.goal_selection_strategy == GoalSelectionStrategy.RANDOM:
                raise NotImplementedError
            # sample second set of episode + timestep indices, use those ag's as dg's for the first set...
            else:
                raise ValueError("Invalid goal selection strategy, "
                                 "please use one of {}".format(list(GoalSelectionStrategy)))

            episode_inds = batch_inds  # renaming for better clarity
            max_timestep_inds = self.n_episode_steps[episode_inds]
            batch_size = len(episode_inds)
            timestep_inds = np.floor(np.random.uniform(max_timestep_inds)).astype(int)
            # randint does not support array, using np.uniform with floor instead

            # Select future time indexes proportional with probability future_p. These
            # will be used for HER replay by substituting in future goals.
            her_inds = np.where(np.random.uniform(size=batch_size) < future_p)
            future_offset = np.random.uniform(size=batch_size) * (max_timestep_inds - timestep_inds)
            future_offset = future_offset.astype(int)
            future_t = (timestep_inds + 1 + future_offset)[her_inds]

            # Replace goal with achieved goal but only for the previously-selected
            # HER transitions (as defined by her_indexes). For the other transitions,
            # keep the original goal.

            observations_dict = self.env.convert_obs_to_dict(self.observations[episode_inds])
            future_ag = observations_dict['achieved_goal'][her_inds, future_t, np.newaxis][0]
            observations_dict['desired_goal'][her_inds, :] = future_ag

            rewards = self.env.compute_reward(observations_dict['achieved_goal'],
                                              observations_dict['desired_goal'], None)[:, 1:].astype(np.float32)
            # Skip reward computed for initial states

            obs = self.env.convert_dict_to_obs(observations_dict)
            data = (self._normalize_obs(obs[np.arange(obs.shape[0]), timestep_inds][:, 0], env),
                    self.actions[episode_inds][np.arange(batch_size), timestep_inds][:, 0],
                    self._normalize_obs(obs[np.arange(obs.shape[0]), timestep_inds + 1][:, 0], env),
                    self.dones[episode_inds][np.arange(batch_size), timestep_inds],
                    self._normalize_reward(rewards[np.arange(batch_size), timestep_inds], env))

            return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
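
Example #10 relabels a transition with probability `future_p = 1 - 1 / (1 + n_sampled_goal)`, replacing its desired goal with an achieved goal from a random later timestep of the same episode. The probability and the index selection in isolation (toy batch size):

import numpy as np

for n_sampled_goal in (1, 4):
    future_p = 1 - 1.0 / (1 + n_sampled_goal)
    print(n_sampled_goal, future_p)            # 1 -> 0.5, 4 -> 0.8

batch_size = 8
her_inds = np.where(np.random.uniform(size=batch_size) < 0.8)
print(her_inds[0])                             # batch entries that receive a future goal
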
Example #11
    def _sample_transitions(
        self,
        batch_size: Optional[int],
        maybe_vec_env: Optional[VecNormalize],
        online_sampling: bool,
        n_sampled_goal: Optional[int] = None,
    ) -> Union[ReplayBufferSamples, Tuple[np.ndarray, ...]]:
        """
        :param batch_size: Number of elements to sample (only used for online sampling)
        :param maybe_vec_env: Associated gym VecEnv to normalize the observations/rewards.
            Only valid when using online sampling.
        :param online_sampling: Whether to use online sampling for HER or not.
        :param n_sampled_goal: Number of sampled goals for replay (offline sampling).
        :return: Samples.
        """
        # Select which episodes to use
        if online_sampling:
            assert batch_size is not None, "No batch_size specified for online sampling of HER transitions"
            # Do not sample the episode with index `self.pos` as the episode is invalid
            if self.full:
                episode_indices = (
                    np.random.randint(1, self.n_episodes_stored, batch_size) +
                    self.pos) % self.n_episodes_stored
            else:
                episode_indices = np.random.randint(0, self.n_episodes_stored,
                                                    batch_size)
            # A subset of the transitions will be relabeled using HER algorithm
            her_indices = np.arange(batch_size)[:int(self.her_ratio *
                                                     batch_size)]
        else:
            assert maybe_vec_env is None, "Transitions must be stored unnormalized in the replay buffer"
            assert n_sampled_goal is not None, "No n_sampled_goal specified for offline sampling of HER transitions"
            # Offline sampling: there is only one episode stored
            episode_length = self.episode_lengths[0]
            # we sample n_sampled_goal per timestep in the episode (only one is stored).
            episode_indices = np.tile(0, (episode_length * n_sampled_goal))
            # we only sample virtual transitions
            # as real transitions are already stored in the replay buffer
            her_indices = np.arange(len(episode_indices))

        ep_lengths = self.episode_lengths[episode_indices]

        # Special case when using the "future" goal sampling strategy
        # we cannot sample all transitions, we have to remove the last timestep
        if self.goal_selection_strategy == GoalSelectionStrategy.FUTURE:
            # restrict the sampling domain when ep_lengths > 1
            # otherwise filter out the indices
            her_indices = her_indices[ep_lengths[her_indices] > 1]
            ep_lengths[her_indices] -= 1

        if online_sampling:
            # Select which transitions to use
            transitions_indices = np.random.randint(ep_lengths)
        else:
            if her_indices.size == 0:
                # Episode of one timestep, not enough for using the "future" strategy
                # no virtual transitions are created in that case
                return np.zeros(0), np.zeros(0), np.zeros(0), np.zeros(0)
            else:
                # Repeat every transition index n_sampled_goals times
                # to sample n_sampled_goal per timestep in the episode (only one is stored).
                # Now with the corrected episode length when using "future" strategy
                transitions_indices = np.tile(np.arange(ep_lengths[0]),
                                              n_sampled_goal)
                episode_indices = episode_indices[transitions_indices]
                her_indices = np.arange(len(episode_indices))

        # get selected transitions
        transitions = {
            key: self.buffer[key][episode_indices, transitions_indices].copy()
            for key in self.buffer.keys()
        }

        # sample new desired goals and relabel the transitions
        new_goals = self.sample_goals(episode_indices, her_indices,
                                      transitions_indices)
        transitions["desired_goal"][her_indices] = new_goals

        # Convert info buffer to numpy array
        transitions["info"] = np.array([
            self.info_buffer[episode_idx][transition_idx] for episode_idx,
            transition_idx in zip(episode_indices, transitions_indices)
        ])

        # Vectorized computation of the new reward
        transitions["reward"][her_indices, 0] = self.env.env_method(
            "compute_reward",
            # the new state depends on the previous state and action
            # s_{t+1} = f(s_t, a_t)
            # so the next_achieved_goal depends also on the previous state and action
            # because we are in a GoalEnv:
            # r_t = reward(s_t, a_t) = reward(next_achieved_goal, desired_goal)
            # therefore we have to use "next_achieved_goal" and not "achieved_goal"
            transitions["next_achieved_goal"][her_indices, 0],
            # here we use the new desired goal
            transitions["desired_goal"][her_indices, 0],
            transitions["info"][her_indices, 0],
        )

        # concatenate observation with (desired) goal
        observations = ObsDictWrapper.convert_dict(
            self._normalize_obs(transitions, maybe_vec_env))
        # HACK to make normalize obs work with the next observation
        transitions["observation"] = transitions["next_obs"]
        next_observations = ObsDictWrapper.convert_dict(
            self._normalize_obs(transitions, maybe_vec_env))

        if online_sampling:
            data = (
                observations[:, 0],
                transitions["action"],
                next_observations[:, 0],
                transitions["done"],
                self._normalize_reward(transitions["reward"], maybe_vec_env),
            )

            return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
        else:
            return observations, next_observations, transitions[
                "action"], transitions["reward"]
Example #12
File: adril.py  Project: gkswamy98/pillbox
 def _get_samples(
         self,
         batch_inds: np.ndarray,
         env: Optional[VecNormalize] = None) -> ReplayBufferSamples:
     num_samples = len(batch_inds)
     if self.balanced:
         num_expert_samples = int(num_samples / 2)
         batch_inds = batch_inds[:num_expert_samples]
         expert_inds = np.random.randint(0,
                                         len(self.expert_states),
                                         size=num_expert_samples)
         # balanced sampling
         if self.optimize_memory_usage:
             next_obs = self._normalize_obs(
                 self.observations[(batch_inds + 1) % self.buffer_size,
                                   0, :], env)
         else:
             next_obs = self._normalize_obs(
                 self.next_observations[batch_inds, 0, :], env)
         next_obs = np.concatenate(
             (next_obs,
              self._normalize_obs(self.expert_next_states[expert_inds],
                                  env)),
             axis=0)
         obs = self._normalize_obs(self.observations[batch_inds, 0, :], env)
         obs = np.concatenate(
             (obs, self._normalize_obs(self.expert_states[expert_inds],
                                       env)),
             axis=0)
         actions = self.actions[batch_inds, 0, :]
         actions = np.concatenate(
             (actions, self.expert_actions[expert_inds].reshape(
                 num_expert_samples, -1)),
             axis=0)
         dones = self.dones[batch_inds]
         dones = np.concatenate(
             (dones, self.expert_dones[expert_inds].reshape(
                 num_expert_samples, -1)),
             axis=0)
         # AdRIL Rewards (indicator kernel)
         mask1 = (self.rewards[batch_inds] >= 0).astype(np.float32)
         mask2 = (self.rewards[batch_inds] < self.iter).astype(np.float32)
         r1 = -(1.**
                (-self.rewards[batch_inds])) * mask1 * mask2  # Past iter
         r2 = np.zeros_like(self.rewards[batch_inds]) * mask1 * (
             1 - mask2)  # current iter
         r3 = -self.rewards[batch_inds] * (1 - mask1)  # Expert
         if self.iter > 0:
             rewards = (r1 / self.N_learner) + r2 + r3
         else:
             rewards = r1 + r2 + r3
         rewards = np.concatenate(
             (rewards, np.ones_like(rewards) / self.N_expert), axis=0)
     else:
         if self.optimize_memory_usage:
             next_obs = self._normalize_obs(
                 self.observations[(batch_inds + 1) % self.buffer_size,
                                   0, :], env)
         else:
             next_obs = self._normalize_obs(
                 self.next_observations[batch_inds, 0, :], env)
         obs = self._normalize_obs(self.observations[batch_inds, 0, :], env)
         actions = self.actions[batch_inds, 0, :]
         dones = self.dones[batch_inds]
         # AdRIL Rewards (indicator kernel)
         mask1 = (self.rewards[batch_inds] >= 0).astype(np.float32)
         mask2 = (self.rewards[batch_inds] < self.iter).astype(np.float32)
         r1 = -(1.**
                (-self.rewards[batch_inds])) * mask1 * mask2  # Past iter
         r2 = np.zeros_like(self.rewards[batch_inds]) * mask1 * (
             1 - mask2)  # current iter
         r3 = -self.rewards[batch_inds] * (1 -
                                           mask1) / self.N_expert  # Expert
         if self.iter > 0:
             rewards = (r1 * 1. / self.N_learner) + r2 + r3
         else:
             rewards = r1 + r2 + r3
     data = (obs, actions, next_obs, dones, rewards)
     return ReplayBufferSamples(*tuple(map(self.to_torch, data)))
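
Example #12 (AdRIL) appears to store an iteration marker in the reward slot: non-negative values tag the learner iteration that produced the transition and negative values tag expert data. That reading is an assumption based on the masks above; here is how the three reward terms evaluate on a hypothetical batch:

import numpy as np

stored = np.array([[0.], [1.], [2.], [-1.]])       # hypothetical reward-slot contents
it, N_learner = 2, 100

mask1 = (stored >= 0).astype(np.float32)           # learner samples
mask2 = (stored < it).astype(np.float32)           # samples from past iterations
r1 = -(1.0 ** (-stored)) * mask1 * mask2           # past-iteration learner samples -> -1
r2 = np.zeros_like(stored) * mask1 * (1 - mask2)   # current-iteration samples -> 0
r3 = -stored * (1 - mask1)                         # expert marker -> positive reward
print(((r1 / N_learner) + r2 + r3).ravel())        # -0.01, -0.01, 0.0, 1.0
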