def compute_nstep_return(
    batch: Batch,
    buffer: ReplayBuffer,
    indice: np.ndarray,
    target_q_fn: Callable[[ReplayBuffer, np.ndarray], torch.Tensor],
    gamma: float = 0.99,
    n_step: int = 1,
    rew_norm: bool = False,
    use_mixed: bool = False,
) -> Batch:
    r"""Compute n-step return for Q-learning targets.

    .. math::
        G_t = \sum_{i = t}^{t + n - 1} \gamma^{i - t}(1 - d_i)r_i +
        \gamma^n (1 - d_{t + n}) Q_{\mathrm{target}}(s_{t + n})

    where :math:`\gamma` is the discount factor, :math:`\gamma \in [0, 1]`,
    and :math:`d_t` is the done flag of step :math:`t`.

    :param Batch batch: a data batch, which is equal to buffer[indice].
    :param ReplayBuffer buffer: the data buffer.
    :param numpy.ndarray indice: tell batch's location in buffer, batch is
        equal to buffer[indice].
    :param function target_q_fn: a function which computes the target Q value
        of "obs_next" given the data buffer and the wanted indices.
    :param float gamma: the discount factor, should be in [0, 1]. Default to
        0.99.
    :param int n_step: the number of estimation steps, should be an int
        greater than 0. Default to 1.
    :param bool rew_norm: normalize the reward to Normal(0, 1). Default to
        False.
    :param bool use_mixed: compute the target Q value under autocast mixed
        precision. Default to False.

    :return: a Batch. The result will be stored in batch.returns as a
        torch.Tensor with the same shape as target_q_fn's return tensor.
    """
    assert not rew_norm, \
        "Reward normalization in computing n-step returns is unsupported now."
    rew = buffer.rew
    bsz = len(indice)
    indices = [indice]
    for _ in range(n_step - 1):
        indices.append(buffer.next(indices[-1]))
    indices = np.stack(indices)
    # terminal indicates buffer indexes n_step after 'indice',
    # truncated at the end of each episode
    terminal = indices[-1]
    with autocast(enabled=use_mixed):
        with torch.no_grad():
            target_q_torch = target_q_fn(buffer, terminal)  # (bsz, ?)
    target_q = to_numpy(target_q_torch.float().reshape(bsz, -1))
    target_q = target_q * BasePolicy.value_mask(buffer, terminal).reshape(-1, 1)
    end_flag = buffer.done.copy()
    end_flag[buffer.unfinished_index()] = True
    target_q = _nstep_return(rew, end_flag, target_q, indices, gamma, n_step)

    batch.returns = to_torch_as(target_q, target_q_torch)
    if hasattr(batch, "weight"):  # prio buffer update
        batch.weight = to_torch_as(batch.weight, target_q_torch)
    return batch
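# The backward accumulation above is delegated to the `_nstep_return` helper.
# A minimal numpy sketch of that backup follows; it assumes rew and end_flag
# are full-buffer arrays, indices has shape (n_step, bsz), and target_q holds
# the already-masked Q_target(s_{t+n}) with shape (bsz, ?). The real helper
# may be numba-jitted and differ in details.
def _nstep_return_sketch(
    rew: np.ndarray,
    end_flag: np.ndarray,
    target_q: np.ndarray,
    indices: np.ndarray,
    gamma: float,
    n_step: int,
) -> np.ndarray:
    bsz = target_q.shape[0]
    returns = np.zeros_like(target_q)
    # per-sample horizon: how many reward terms are kept before bootstrapping
    horizon = np.full(indices[0].shape, n_step)
    # walk the n-step window backwards: G = r + gamma * G, resetting at episode ends
    for n in range(n_step - 1, -1, -1):
        now = indices[n]
        done = end_flag[now] > 0
        horizon[done] = n + 1
        returns[done] = 0.0
        returns = rew[now].reshape(bsz, 1) + gamma * returns
    # bootstrap with gamma^horizon * Q_target (already zeroed at true terminals)
    return target_q * (gamma ** horizon).reshape(bsz, 1) + returns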
def compute_episodic_return(
    batch: Batch,
    buffer: ReplayBuffer,
    indice: np.ndarray,
    v_s_: Optional[Union[np.ndarray, torch.Tensor]] = None,
    gamma: float = 0.99,
    gae_lambda: float = 0.95,
    rew_norm: bool = False,
) -> Batch:
    """Compute returns over given batch.

    Use the implementation of Generalized Advantage Estimation
    (arXiv:1506.02438) to calculate the Q function / reward-to-go of the
    given batch.

    :param Batch batch: a data batch which contains several episodes of data
        in sequential order. Mind that the end of each finished episode of
        batch should be marked by the done flag; unfinished (or collecting)
        episodes will be recognized by buffer.unfinished_index().
    :param ReplayBuffer buffer: the data buffer.
    :param numpy.ndarray indice: tell batch's location in buffer, batch is
        equal to buffer[indice].
    :param np.ndarray v_s_: the value function of all next states
        :math:`V(s')`.
    :param float gamma: the discount factor, should be in [0, 1]. Default to
        0.99.
    :param float gae_lambda: the parameter for Generalized Advantage
        Estimation, should be in [0, 1]. Default to 0.95.
    :param bool rew_norm: normalize the reward to Normal(0, 1). Default to
        False.

    :return: a Batch. The result will be stored in batch.returns as a numpy
        array with shape (bsz, ).
    """
    rew = batch.rew
    if v_s_ is None:
        assert np.isclose(gae_lambda, 1.0)
        v_s_ = np.zeros_like(rew)
    else:
        v_s_ = to_numpy(v_s_.flatten()) * BasePolicy.value_mask(buffer, indice)
    end_flag = batch.done.copy()
    end_flag[np.isin(indice, buffer.unfinished_index())] = True
    returns = _episodic_return(v_s_, rew, end_flag, gamma, gae_lambda)
    if rew_norm and not np.isclose(returns.std(), 0.0, 1e-2):
        returns = (returns - returns.mean()) / returns.std()
    batch.returns = returns
    return batch
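# `_episodic_return` does the heavy lifting above. A minimal numpy sketch of
# what it is expected to compute (lambda-returns via the GAE recursion, with
# V(s_t) approximated by shifting the next-state values one step) follows;
# the real helper may be numba-jitted and differ in details.
def _episodic_return_sketch(
    v_s_: np.ndarray,
    rew: np.ndarray,
    end_flag: np.ndarray,
    gamma: float,
    gae_lambda: float,
) -> np.ndarray:
    v_s = np.roll(v_s_, 1)                     # V(s_t), approximated from V(s_{t+1})
    delta = rew + gamma * v_s_ - v_s           # TD residual; v_s_ is already masked
    discount = (1.0 - end_flag) * gamma * gae_lambda
    returns = np.zeros(rew.shape)
    gae = 0.0
    # backward recursion: A_t = delta_t + gamma * lambda * (1 - d_t) * A_{t+1}
    for i in range(len(rew) - 1, -1, -1):
        gae = delta[i] + discount[i] * gae
        returns[i] = gae + v_s[i]              # lambda-return = advantage + baseline
    return returns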
def compute_episodic_return(
    batch: Batch,
    buffer: ReplayBuffer,
    indice: np.ndarray,
    v_s_: Optional[Union[np.ndarray, torch.Tensor]] = None,
    v_s: Optional[Union[np.ndarray, torch.Tensor]] = None,
    gamma: float = 0.99,
    gae_lambda: float = 0.95,
) -> Tuple[np.ndarray, np.ndarray]:
    """Compute returns over given batch.

    Use the implementation of Generalized Advantage Estimation
    (arXiv:1506.02438) to calculate the Q/advantage value of the given batch.

    :param Batch batch: a data batch which contains several episodes of data
        in sequential order. Mind that the end of each finished episode of
        batch should be marked by the done flag; unfinished (or collecting)
        episodes will be recognized by buffer.unfinished_index().
    :param ReplayBuffer buffer: the data buffer.
    :param numpy.ndarray indice: tell batch's location in buffer, batch is
        equal to buffer[indice].
    :param np.ndarray v_s_: the value function of all next states
        :math:`V(s')`.
    :param np.ndarray v_s: the value function of all current states
        :math:`V(s)`. If None, it is computed by shifting v_s_ one step.
    :param float gamma: the discount factor, should be in [0, 1]. Default to
        0.99.
    :param float gae_lambda: the parameter for Generalized Advantage
        Estimation, should be in [0, 1]. Default to 0.95.

    :return: two numpy arrays (returns, advantage) with each shape (bsz, ).
    """
    rew = batch.rew
    if v_s_ is None:
        assert np.isclose(gae_lambda, 1.0)
        v_s_ = np.zeros_like(rew)
    else:
        v_s_ = to_numpy(v_s_.flatten())  # type: ignore
        v_s_ = v_s_ * BasePolicy.value_mask(buffer, indice)
    v_s = np.roll(v_s_, 1) if v_s is None else to_numpy(v_s.flatten())

    end_flag = batch.done.copy()
    end_flag[np.isin(indice, buffer.unfinished_index())] = True
    advantage = _gae_return(v_s, v_s_, rew, end_flag, gamma, gae_lambda)
    returns = advantage + v_s
    # normalization varies from each policy, so we don't do it here
    return returns, advantage
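# A hypothetical usage sketch (the `critic` argument and `batch.adv` field are
# illustrative, not part of this module, and compute_episodic_return is
# assumed to be callable from the current scope as defined above): compute GAE
# targets for an on-policy update and normalize the advantage at the call
# site, since normalization is deliberately left to each policy.
def _gae_targets_sketch(
    critic: torch.nn.Module,
    batch: Batch,
    buffer: ReplayBuffer,
    indice: np.ndarray,
) -> Batch:
    obs_next = torch.as_tensor(batch.obs_next, dtype=torch.float32)
    with torch.no_grad():
        v_s_ = critic(obs_next).flatten()      # V(s') for every sampled step
    returns, advantage = compute_episodic_return(
        batch, buffer, indice, v_s_=v_s_, gamma=0.99, gae_lambda=0.95
    )
    batch.returns = to_torch_as(returns, v_s_)
    advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
    batch.adv = to_torch_as(advantage, v_s_)
    return batch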
def _compute_return(
    self,
    batch: Batch,
    buffer: ReplayBuffer,
    indice: np.ndarray,
    gamma: float = 0.99,
) -> Batch:
    """Compute the 1-step return used as the branching Q-learning target."""
    rew = batch.rew
    with torch.no_grad():
        target_q_torch = self._target_q(buffer, indice)  # (bsz, ?)
    target_q = to_numpy(target_q_torch)
    end_flag = buffer.done.copy()
    end_flag[buffer.unfinished_index()] = True
    end_flag = end_flag[indice]
    # average the target Q value over branches before building the scalar target
    mean_target_q = np.mean(target_q, -1) if len(target_q.shape) > 1 else target_q
    _target_q = rew + gamma * mean_target_q * (1 - end_flag)
    # broadcast the scalar target to every branch and every action in a branch
    target_q = np.repeat(_target_q[..., None], self.num_branches, axis=-1)
    target_q = np.repeat(target_q[..., None], self.max_action_num, axis=-1)

    batch.returns = to_torch_as(target_q, target_q_torch)
    if hasattr(batch, "weight"):  # prio buffer update
        batch.weight = to_torch_as(batch.weight, target_q_torch)
    return batch
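# Shape sketch for the broadcast above (hypothetical sizes): with bsz = 5,
# num_branches = 3 and max_action_num = 4,
#   _target_q            -> (5,)        one scalar 1-step return per sample
#   after first repeat   -> (5, 3)      one copy per action branch
#   after second repeat  -> (5, 3, 4)   one copy per discrete action in a branch
# so batch.returns lines up element-wise with the branching Q-network output.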