import numpy as np
import torch

from rlpyt.utils.misc import zeros  # Buffer-aware allocator: np.zeros or torch.zeros by dtype.


def discount_return_n_step(reward, done, n_step, discount, return_dest=None,
        done_n_dest=None, do_truncated=False):
    """Time-major inputs, optional other dimension: [T], [T,B], etc.  Computes
    n-step discounted returns within the timeframe of the given rewards.  If
    `do_truncated==False`, then returns are computed only at time-steps for
    which the full n-step future rewards are provided (i.e. not at the last
    n - 1 steps--the output shape shortens!).  Returns the n-step returns as
    well as the n-step done signals, which are True at time t if `done=True`
    at any future time before the n-step target bootstrap would apply
    (bootstrap happens in the algo, not here)."""
    rlen = reward.shape[0]
    if not do_truncated:
        rlen -= (n_step - 1)
    return_ = return_dest if return_dest is not None else zeros(
        (rlen,) + reward.shape[1:], dtype=reward.dtype)
    done_n = done_n_dest if done_n_dest is not None else zeros(
        (rlen,) + reward.shape[1:], dtype=done.dtype)
    return_[:] = reward[:rlen]  # 1-step return is current reward.
    done_n[:] = done[:rlen]  # True at time t if done any time by t + n - 1.
    is_torch = isinstance(done, torch.Tensor)
    if is_torch:
        done_dtype = done.dtype
        done_n = done_n.type(reward.dtype)
        done = done.type(reward.dtype)
    if n_step > 1:
        if do_truncated:
            for n in range(1, n_step):
                return_[:-n] += (discount ** n) * reward[n:n + rlen] * \
                    (1 - done_n[:-n])
                done_n[:-n] = np.maximum(done_n[:-n], done[n:n + rlen])
        else:
            for n in range(1, n_step):
                return_ += (discount ** n) * reward[n:n + rlen] * (1 - done_n)
                done_n[:] = np.maximum(done_n, done[n:n + rlen])  # Supports tensors.
    if is_torch:
        done_n = done_n.type(done_dtype)
    return return_, done_n
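
# Illustration only (not part of the original module): a minimal sketch of
# calling discount_return_n_step() with numpy inputs shaped [T, B].  The
# `_example_` helper name is invented here.  With do_truncated=False the
# outputs are shortened to T - (n_step - 1) steps, since the last n - 1
# steps lack a full n-step future.
def _example_discount_return_n_step():
    T, B, n_step = 8, 2, 3
    rng = np.random.default_rng(0)
    reward = rng.standard_normal((T, B)).astype(np.float32)
    done = (rng.random((T, B)) < 0.1).astype(np.float32)
    return_, done_n = discount_return_n_step(reward, done, n_step, discount=0.99)
    assert return_.shape == done_n.shape == (T - (n_step - 1), B)
    return return_, done_n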
def generalized_advantage_estimation_tl(reward, value, done, bootstrap_value,
        discount, gae_lambda, timeout, advantage_dest=None, return_dest=None):
    """Like generalized_advantage_estimation(), below, except uses
    bootstrapping where 'done' is due to the env horizon time-limit
    (tl=Time-Limit).  (In the algo, should not train on samples where
    `timeout=True`.)"""
    advantage = advantage_dest if advantage_dest is not None else zeros(
        reward.shape, dtype=reward.dtype)
    return_ = return_dest if return_dest is not None else zeros(
        reward.shape, dtype=reward.dtype)
    assert all(done[timeout])  # Every timeout was also done (timeout is bool dtype).
    nd = 1 - done
    nd = nd.type(reward.dtype) if isinstance(nd, torch.Tensor) else nd
    advantage[-1] = reward[-1] + discount * bootstrap_value * nd[-1] - value[-1]
    for t in reversed(range(len(reward) - 1)):
        delta = reward[t] + discount * value[t + 1] * nd[t] - value[t]
        advantage[t] = delta + discount * gae_lambda * nd[t] * advantage[t + 1]
        # Replace with the bootstrapped 1-step advantage where 'done' was due
        # to timeout.  Should mask out those samples for training:
        # valid *= (1 - timeout), because there is no valid next_state for
        # bootstrap_value(next_state).
        tt = timeout[t + 1]
        advantage[t][tt] = (reward[t][tt] +  # Same formula as before the loop.
            discount * value[t + 1][tt] - value[t][tt])
    return_[:] = advantage + value
    return advantage, return_
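
# Illustration only: a sketch of calling generalized_advantage_estimation_tl()
# with [T, B] numpy inputs (the `_example_` helper is invented here).  Every
# timeout step must also be marked done, and training should mask those
# samples with valid *= (1 - timeout).
def _example_gae_tl():
    T, B = 6, 2
    rng = np.random.default_rng(1)
    reward = rng.standard_normal((T, B)).astype(np.float32)
    value = rng.standard_normal((T, B)).astype(np.float32)
    bootstrap_value = rng.standard_normal((B,)).astype(np.float32)
    timeout = np.zeros((T, B), dtype=bool)
    timeout[3, 0] = True  # Env 0 hit the horizon time-limit at t=3.
    done = timeout.astype(np.float32)  # Timeout implies done.
    advantage, return_ = generalized_advantage_estimation_tl(
        reward, value, done, bootstrap_value, discount=0.99,
        gae_lambda=0.95, timeout=timeout)
    return advantage, return_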
def generalized_advantage_estimation(reward, value, done, bootstrap_value,
        discount, gae_lambda, advantage_dest=None, return_dest=None):
    """Time-major inputs, optional other dimensions: [T], [T,B], etc.  Similar
    to `discount_return()` but using Generalized Advantage Estimation to
    compute advantages and returns."""
    advantage = advantage_dest if advantage_dest is not None else zeros(
        reward.shape, dtype=reward.dtype)
    return_ = return_dest if return_dest is not None else zeros(
        reward.shape, dtype=reward.dtype)
    nd = 1 - done  # "Not done": 1 where the episode continues, 0 where it ended.
    nd = nd.type(reward.dtype) if isinstance(nd, torch.Tensor) else nd
    advantage[-1] = reward[-1] + discount * bootstrap_value * nd[-1] - value[-1]
    for t in reversed(range(len(reward) - 1)):
        delta = reward[t] + discount * value[t + 1] * nd[t] - value[t]
        advantage[t] = delta + discount * gae_lambda * nd[t] * advantage[t + 1]
    return_[:] = advantage + value
    return advantage, return_
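
# Illustration only: a sketch of generalized_advantage_estimation() with torch
# tensors (the `_example_` helper is invented here).  `done` is passed as
# float 0/1; the identity return = advantage + value holds by construction.
def _example_gae():
    T, B = 6, 2
    torch.manual_seed(0)
    reward, value = torch.randn(T, B), torch.randn(T, B)
    done = (torch.rand(T, B) < 0.1).float()
    bootstrap_value = torch.randn(B)
    advantage, return_ = generalized_advantage_estimation(
        reward, value, done, bootstrap_value, discount=0.99, gae_lambda=0.95)
    assert torch.allclose(return_, advantage + value)
    return advantage, return_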
def discount_return(reward, done, bootstrap_value, discount, return_dest=None):
    """Time-major inputs, optional other dimensions: [T], [T,B], etc.  Computes
    the discounted sum of future rewards from each time-step to the end of the
    batch, including the bootstrap value.  The sum resets where `done` is 1.
    Optionally writes into the buffer `return_dest`, if provided.  Operations
    are vectorized across all trailing dimensions after the first [T,]."""
    return_ = return_dest if return_dest is not None else zeros(
        reward.shape, dtype=reward.dtype)
    nd = 1 - done
    nd = nd.type(reward.dtype) if isinstance(nd, torch.Tensor) else nd
    return_[-1] = reward[-1] + discount * bootstrap_value * nd[-1]
    for t in reversed(range(len(reward) - 1)):
        return_[t] = reward[t] + return_[t + 1] * discount * nd[t]
    return return_
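
# Illustration only: a worked [T] example of discount_return() (the
# `_example_` helper is invented here).  The sum resets after the done at
# t=1, and the bootstrap value enters only at the last step.
def _example_discount_return():
    reward = np.ones(4, dtype=np.float32)
    done = np.array([0., 1., 0., 0.], dtype=np.float32)
    return_ = discount_return(reward, done, bootstrap_value=np.float32(0.5),
        discount=0.9)
    # t=3: 1 + 0.9 * 0.5 = 1.45;  t=2: 1 + 0.9 * 1.45 = 2.305;
    # t=1: 1 (done, so nothing flows back);  t=0: 1 + 0.9 * 1 = 1.9.
    assert np.allclose(return_, [1.9, 1.0, 2.305, 1.45])
    return return_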
def discount_return_tl(reward, done, bootstrap_value, discount, timeout, value,
        return_dest=None):
    """Like discount_return(), above, except uses bootstrapping where 'done'
    is due to the env horizon time-limit (tl=Time-Limit).  (In the algo,
    should not train on samples where `timeout=True`.)"""
    return_ = return_dest if return_dest is not None else zeros(
        reward.shape, dtype=reward.dtype)
    assert all(done[timeout])  # Anywhere timeout, was done (timeout is bool dtype).
    nd = 1 - done
    nd = nd.type(reward.dtype) if isinstance(nd, torch.Tensor) else nd
    return_[-1] = reward[-1] + discount * bootstrap_value * nd[-1]
    for t in reversed(range(len(reward) - 1)):
        return_[t] = reward[t] + return_[t + 1] * discount * nd[t]
        # Replace with the bootstrap value where 'done' was due to timeout.
        # Should mask out those samples for training: valid *= (1 - timeout),
        # because there is no valid next_state for bootstrap_value(next_state).
        return_[t][timeout[t]] = value[t][timeout[t]]
    return return_
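
# Illustration only: a sketch of discount_return_tl() with [T, B=1] inputs
# (the `_example_` helper is invented here).  At the timeout step the return
# is replaced by the value estimate, which then bootstraps earlier steps.
# Note the per-step boolean-mask assignment requires a batch dimension.
def _example_discount_return_tl():
    reward = np.ones((3, 1), dtype=np.float32)
    done = np.array([[0.], [1.], [0.]], dtype=np.float32)
    timeout = np.array([[False], [True], [False]])
    value = np.array([[0.3], [0.7], [0.2]], dtype=np.float32)
    return_ = discount_return_tl(reward, done, np.zeros(1, np.float32),
        discount=0.9, timeout=timeout, value=value)
    assert np.isclose(return_[1, 0], value[1, 0])  # Replaced by bootstrap.
    assert np.isclose(return_[0, 0], 1 + 0.9 * value[1, 0])  # = 1.63
    return return_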