Example #1
    def optimize(self, obs: Arrayable, action: Arrayable, max_action: Tensor,
                 next_obs: Arrayable, max_next_action: Tensor,
                 reward: Arrayable, done: Arrayable, time_limit: Arrayable,
                 weights: Arrayable) -> Tensor:
        action = arr_to_th(action, self._device)
        reward = arr_to_th(reward, self._device)
        weights = arr_to_th(check_array(weights), self._device)
        done = arr_to_th(check_array(done).astype('float'), self._device)

        obs = check_array(obs)
        next_obs = check_array(next_obs)
        q = self.critic(obs, action)
        q_next = self.critic(next_obs, max_next_action,
                             target=True) * (1 - done)

        expected_q = (reward * self._dt +
                      self._gamma**self._dt * q_next).detach()
        critic_loss = (q - expected_q)**2

        self._q_optimizer.zero_grad()
        critic_loss.mean().backward(retain_graph=True)
        self._q_optimizer.step()

        soft_update(self._q_function, self._target_q_function, self._tau)

        return critic_loss
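
The critic above regresses Q(obs, action) onto a one-step target in which the reward
is scaled by the time step and the discount is raised to the power dt. A minimal
numeric sketch of that target (values are made up for illustration, not taken from
the source):

import torch

dt, gamma = 0.02, 0.99
reward = torch.tensor([1.0, 0.5])
q_next = torch.tensor([10.0, 8.0])      # target critic at (next_obs, max_next_action)
done = torch.tensor([0.0, 1.0])         # terminal transitions get no bootstrap term

q_next = q_next * (1 - done)
expected_q = reward * dt + gamma**dt * q_next
# expected_q ~= [10.018, 0.010]
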
Example #2
    def observe(self,
                next_obs: Arrayable,
                reward: Arrayable,
                done: Arrayable,
                time_limit: Optional[Arrayable] = None) -> None:

        if self._mode != "train":
            return None

        self._count += 1
        reward = check_array(reward)
        done = check_array(done)
        if time_limit is None:
            time_limit = np.zeros(done.shape)
        time_limit = check_array(time_limit)

        if not self._current_trajectories:
            self._nb_train_env = done.shape[0]
            self._current_trajectories = \
                [Trajectory(boundlength=self._T) for _ in range(self._nb_train_env)]

        for k, traj in enumerate(self._current_trajectories):
            traj.push(self._current_obs[k], self._current_action[k], reward[k],
                      float(done[k]), float(time_limit[k]))

        self.learn()
Example #3
    def step(self, obs: Arrayable) -> np.ndarray:
        if self._mode != "eval":
            action = th_to_arr(self._actor.act_noisy(obs))
        else:
            action = th_to_arr(self._actor.act(obs))

        self._current_obs = check_array(obs)
        self._current_action = check_array(action)
        if isinstance(self._actor, OnlineActorContinuous):
            action = np.clip(action, -1, 1)
        return action
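
Examples #2 and #3 are the two halves of the agent's interaction loop: step caches
the current observation and action and returns the action to execute, while observe
records the resulting transition and triggers learning. A rough sketch of how they
fit together, assuming a hypothetical vectorized environment envs whose step returns
(next_obs, reward, done, time_limit); that environment API is not part of the source:

obs = envs.reset()
for _ in range(nb_steps):
    action = agent.step(obs)                                  # Example #3
    next_obs, reward, done, time_limit = envs.step(action)    # hypothetical env API
    agent.observe(next_obs, reward, done, time_limit)         # Example #2
    obs = next_obs
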
Example #4
def compute_return(rewards: Arrayable, dones: Arrayable) -> float:
    """Compute return from rewards and termination signals.

    :args rewards: (seq_len, batch_size) reward array
    :args dones: (seq_len, batch_size) termination signal array

    :return: averaged undiscounted return
    """
    R = 0
    rewards = check_array(rewards)
    dones = check_array(dones)
    for r, d in zip(rewards[::-1], dones[::-1]):
        R = r + R * (1 - d)
    return np.mean(R)
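
A small usage sketch for compute_return: a done flag cuts the backward accumulation,
so rewards from a later episode are not merged into the return of the current one.
The arrays below are chosen purely for illustration.

import numpy as np

rewards = np.ones((3, 2))            # (seq_len=3, batch_size=2)
dones = np.array([[0., 0.],
                  [0., 1.],          # second env's episode ends at its second step
                  [0., 0.]])
compute_return(rewards, dones)       # env 0 accumulates 3.0, env 1 only 2.0 -> mean 2.5
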
Example #5
    def push(self, obs: Arrayable, action: Arrayable, reward: float,
             done: float, time_limit: float) -> None:
        """
        Push a single transition onto a trajectory
        (before seeing the next observation).
        """
        obs = check_array(obs)
        action = check_array(action)

        self._obs.append(obs)
        self._actions.append(action)
        self._rewards.append(reward)
        self._done.append(done)
        self._time_limit.append(time_limit)
        self.boundlength()
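
Example #2 above shows the intended call pattern for push: one Trajectory per training
environment, pushed once per step with that environment's slice of the batch. A thin
sketch of a single call (the boundlength value is illustrative):

traj = Trajectory(boundlength=1000)                 # constructed as in Example #2
traj.push(obs[k], action[k], float(reward[k]),
          float(done[k]), float(time_limit[k]))     # next_obs for this step arrives later
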
Example #6
    def sample(self, to_observe: bool = True) -> Tuple[Arrayable, ...]:
        if to_observe:
            assert self._idxs is None, "No observe after sample ..."
        idxs, priorities = zip(
            *[self._sum_tree.sample() for _ in range(self._batch_size)])
        idxs, priorities = check_array(idxs), check_array(priorities)
        obs, action, next_obs, reward, done, _, time_limit = self._memory.sample(
            idxs)
        weights = (self._sum_tree.total / self._memory.size /
                   priorities)**self._beta
        weights = weights / weights.max()

        if to_observe:
            self._idxs = idxs

        return obs, action, next_obs, reward, done, weights, time_limit
Example #7
    def push(self, obs: Arrayable, action: Arrayable, next_obs: Arrayable,
             reward: Arrayable, done: Arrayable,
             time_limit: Optional[Arrayable]) -> None:
        self._memory.push(obs, action, next_obs, reward, done, time_limit)
        assert self._sum_tree.size == self._memory.size
        for _ in check_array(obs):
            self._sum_tree.add(self._max_priority**self._alpha)
Example #8
    def observe(self, priorities: Arrayable):
        assert self._idxs is not None, "No sample before observe ..."
        priorities = check_array(priorities)
        self._max_priority = max(self._max_priority, priorities.max())
        for idx, prio in zip(self._idxs, priorities):
            self._sum_tree.modify(idx, prio**self._alpha)

        self._idxs = None
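
Examples #6 through #8 form the usual prioritized-replay cycle: push (Example #7)
inserts new transitions at the current maximum priority, sample (Example #6) draws
indices in proportion to priority and returns importance weights, and observe
(Example #8) writes updated priorities back into the sum tree. The source does not
show what the caller uses as priorities; absolute TD errors are the common choice
and are assumed in this sketch of the call order:

buffer.push(obs, action, next_obs, reward, done, time_limit)        # Example #7
obs_b, action_b, next_obs_b, reward_b, done_b, weights, tl_b = \
    buffer.sample()                                                 # Example #6
# ... one learner update on the sampled batch yields per-transition TD errors ...
buffer.observe(np.abs(td_errors))                                   # Example #8
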
Example #9
    def optimize(self, obs: Arrayable, action: Arrayable, max_action: Tensor,
                 next_obs: Arrayable, max_next_action: Tensor,
                 reward: Arrayable, done: Arrayable, time_limit: Arrayable,
                 weights: Arrayable) -> Tensor:
        """Optimizes using the DAU variant of advantage updating.

        Note that this variant uses max_action, and not max_next_action, as is
        more common with standard Q-Learning. It relies on the set of equations
        V^*(s) + dt A^*(s, a) = r(s, a) dt + gamma^dt V^*(s)
        A^*(s, a) = adv_function(s, a) - adv_function(s, max_action)
        """
        obs = check_array(obs)
        batch_size = obs.shape[0]
        action = arr_to_th(action, self._device).type_as(max_action)
        reward = arr_to_th(reward, self._device)
        weights = arr_to_th(check_array(weights), self._device)
        done = arr_to_th(check_array(done).astype('float'), self._device)

        v = self._val_function(obs).squeeze()
        next_v = (1 - done) * self._target_val_function(next_obs).squeeze()
        pre_advs = self.critic(np.concatenate([obs, obs], axis=0),
                               torch.cat([action, max_action], dim=0))
        pre_adv, pre_max_adv = pre_advs[:batch_size], pre_advs[batch_size:]
        adv = pre_adv - pre_max_adv
        q = v + self._dt * adv
        # next_adv = 0 by definition
        expected_q = (reward * self._dt +
                      self._gamma**self._dt * next_v).detach()

        critic_loss = (q - expected_q)**2

        self._val_optimizer.zero_grad()
        self._adv_optimizer.zero_grad()
        critic_loss.mean().backward(retain_graph=True)
        self._val_optimizer.step()
        self._adv_optimizer.step()

        soft_update(self._adv_function, self._target_adv_function, self._tau)
        soft_update(self._val_function, self._target_val_function, self._tau)

        return critic_loss
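
In this parametrization the advantage of the greedy action is zero by construction
(adv_function(s, max_action) minus itself), so the bootstrap target only goes through
the value network. A small numeric sketch with made-up values:

import torch

dt, gamma = 0.02, 0.99
v = torch.tensor([5.0])            # val_function(obs)
pre_adv = torch.tensor([1.3])      # adv_function(obs, action)
pre_max_adv = torch.tensor([2.0])  # adv_function(obs, max_action)
next_v = torch.tensor([5.1])       # target val_function(next_obs), already masked by (1 - done)
reward = torch.tensor([0.7])

adv = pre_adv - pre_max_adv        # -0.7; would be 0.0 if action == max_action
q = v + dt * adv                   # 5.0 - 0.014 = 4.986
expected_q = reward * dt + gamma**dt * next_v   # ~= 5.113
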
Example #10
    def push(self, obs: Arrayable, action: Arrayable, next_obs: Arrayable,
             reward: Arrayable, done: Arrayable,
             time_limit: Optional[Arrayable]) -> None:
        """Push a transition on the buffer."""
        # if empty, lazily initialize the buffer arrays below
        obs = check_array(obs)
        action = check_array(action)
        next_obs = check_array(next_obs)
        reward = check_array(reward)
        done = check_array(done)
        if time_limit is not None:
            time_limit = check_array(time_limit)

        nb_envs = obs.shape[0]
        if self._true_size == -1:
            self._true_size = (self._size // nb_envs) * nb_envs
            self._obs = np.zeros((self._true_size, *obs.shape[1:]))
            self._action = np.zeros((self._true_size, *action.shape[1:]))
            self._next_obs = np.zeros((self._true_size, *next_obs.shape[1:]))
            self._reward = np.zeros((self._true_size, *reward.shape[1:]))
            self._done = np.zeros((self._true_size, *done.shape[1:]))
            if time_limit is not None:
                self._time_limit = np.zeros(
                    (self._true_size, *time_limit.shape[1:]))

            # initialize reference point
            self._ref_obs = obs.copy()

        self._obs[self._cur:self._cur + nb_envs] = obs
        self._action[self._cur:self._cur + nb_envs] = action
        self._next_obs[self._cur:self._cur + nb_envs] = next_obs
        self._reward[self._cur:self._cur + nb_envs] = reward
        self._done[self._cur:self._cur + nb_envs] = done
        if self._time_limit is not None:
            self._time_limit[self._cur:self._cur + nb_envs] = time_limit
        if self._cur + nb_envs == self._true_size:
            self._full = True
        self._cur = (self._cur + nb_envs) % (self._true_size)
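
Because _true_size is rounded down to a multiple of nb_envs and every push writes a
contiguous block of nb_envs rows, the cursor lands exactly on _true_size when the
buffer fills, at which point _full is set and the cursor wraps so the oldest block is
overwritten next. A tiny sketch of that index arithmetic (values are illustrative):

true_size, nb_envs, cur = 8, 2, 6
full = (cur + nb_envs == true_size)   # True: this push fills the last two slots
cur = (cur + nb_envs) % true_size     # cursor wraps back to 0
assert full and cur == 0
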