def _preprocess_BATCH(self, BATCH):  # [T, B, *]
    """Add one-hot options, discounted returns and GAE advantages to a batch.

    After the parent-class preprocessing, the rewards are shifted in place by
    ``BATCH.reward_offset``, ``last_options`` and ``options`` are replaced by
    their ``int2one_hot`` encodings, and two new fields are attached:
    ``discounted_reward`` and ``gae_adv``.

    Args:
        BATCH: batch object; the signature comment marks its fields as
            ``[T, B, *]`` (presumably time-major -- confirm with the caller).

    Returns:
        The batch returned by the parent preprocessing, with the fields
        described above added or overwritten.
    """
    BATCH = super()._preprocess_BATCH(BATCH)
    BATCH.reward += BATCH.reward_offset

    num_options = self.options_num
    BATCH.last_options = int2one_hot(BATCH.last_options, num_options)
    BATCH.options = int2one_hot(BATCH.options, num_options)

    # Value estimate for the last entry of `obs_` under the last option. It
    # seeds the discounted return and closes the shifted value sequence below.
    # NOTE(review): `obs_` is presumably the next observation -- confirm.
    bootstrap = self._get_value(
        BATCH.obs_[-1], BATCH.options[-1], rnncs=self.rnncs)

    BATCH.discounted_reward = discounted_sum(
        BATCH.reward,
        self.gamma,
        BATCH.done,
        BATCH.begin_mask,
        init_value=bootstrap,
    )

    # next_value[t] is value[t + 1]; the bootstrap fills the final slot.
    shifted_value = np.concatenate(
        (BATCH.value[1:], bootstrap[np.newaxis, :]), 0)
    td_error = calculate_td_error(
        BATCH.reward,
        self.gamma,
        BATCH.done,
        value=BATCH.value,
        next_value=shifted_value,
    )

    # GAE: accumulate TD errors with the combined factor lambda * gamma.
    gae_discount = self.lambda_ * self.gamma
    BATCH.gae_adv = discounted_sum(
        td_error,
        gae_discount,
        BATCH.done,
        BATCH.begin_mask,
        init_value=0.,
        normalize=True,
    )
    return BATCH
def _preprocess_BATCH(self, BATCH):  # [T, B, *]
    """Attach normalized discounted returns to a batch.

    Runs the parent-class preprocessing, then stores the result of
    ``discounted_sum`` over the rewards (zero initial value,
    ``normalize=True``) as ``discounted_reward``.

    Args:
        BATCH: batch object; the signature comment marks its fields as
            ``[T, B, *]`` (presumably time-major -- confirm with the caller).

    Returns:
        The batch returned by the parent preprocessing, with
        ``discounted_reward`` set.
    """
    processed = super()._preprocess_BATCH(BATCH)
    processed.discounted_reward = discounted_sum(
        processed.reward,
        self.gamma,
        processed.done,
        processed.begin_mask,
        init_value=0.,
        normalize=True,
    )
    return processed
def cal_gae_adv(self, lambda_, gamma):
    '''
    Compute the GAE advantage estimate and store it in the buffer.
        adv = td(s) + gamma * lambda * (1 - done) * td(s')

    The accumulated TD errors are always passed through `standardization`
    before being written to `self.buffer['gae_adv']` as a list.

    param lambda_: GAE lambda; multiplied with gamma to form the decay factor
    param gamma: discount factor
    '''
    assert 'td_error' in self.buffer.keys()
    decay = lambda_ * gamma
    accumulated = discounted_sum(
        self.buffer['td_error'], decay, 0, self.buffer['done'])
    standardized = standardization(np.asarray(accumulated))
    self.buffer['gae_adv'] = list(standardized)
def cal_dc_r(self, gamma, init_value, normalize=False):
    '''
    Compute the discounted rewards and store them in the buffer.

    The result is written to `self.buffer['discounted_reward']` as a list.

    param gamma: discount factor, gamma in [0, 1)
    param init_value: value of the last state of the sequence
    param normalize: when true, pass the result through `standardization`
    '''
    returns = discounted_sum(
        self.buffer['r'], gamma, init_value, self.buffer['done'])
    self.buffer['discounted_reward'] = list(
        standardization(np.asarray(returns)) if normalize else returns
    )
def cal_dc_r(self, gamma, init_value, normalize=False):
    '''
    Compute the discounted rewards and store them in the buffer.

    The result is written to `self.buffer['discounted_reward']` as a list.

    param gamma: discount factor, gamma in [0, 1)
    param init_value: value of the last state of the sequence
    param normalize: when true, shift the result to zero mean and scale it
        to unit standard deviation. If the standard deviation is zero (all
        values equal, or a single element) only the mean is removed, so the
        output never contains NaN/inf from a division by zero.
    '''
    dc_r = discounted_sum(self.buffer['r'], gamma, init_value, self.buffer['done'])
    if normalize:
        # Use out-of-place arithmetic on an array view of the result: in-place
        # `-=` / `/=` would mutate whatever `discounted_sum` handed back and
        # fails with a casting error on integer arrays.
        dc_r = np.asarray(dc_r)
        if dc_r.size:
            dc_r = dc_r - np.mean(dc_r)
            std = np.std(dc_r)
            # Skip the scaling step for constant sequences instead of
            # producing 0 / 0 = NaN.
            if std > 0:
                dc_r = dc_r / std
    self.buffer['discounted_reward'] = list(dc_r)
def cal_gae_adv(self, lambda_, gamma, normalize=False):
    '''
    Compute the GAE advantage estimate and store it in the data buffer.
        adv = td(s) + gamma * lambda * (1 - done) * td(s')

    The result is written to `self.data_buffer['gae_adv']` as a list.

    param lambda_: GAE lambda; multiplied with gamma to form the decay factor
    param gamma: discount factor
    param normalize: when true, pass the advantages through `standardization`
        (exactly once); when false, store the raw accumulated advantages
    '''
    assert 'td_error' in self.data_buffer.keys(), \
        "assert 'td_error' in self.data_buffer.keys()"
    # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
    # Eq (10): delta_t = Rt + gamma*V_{t+1} - V_t
    # Eq (16): adv_t = delta_t + (gamma*lambda)*delta_{t+1}
    #                  + (gamma*lambda)^2*delta_{t+2} + ...
    adv = np.asarray(
        discounted_sum(self.data_buffer['td_error'],
                       lambda_ * gamma,
                       0,
                       self.data_buffer['done']))
    if normalize:
        adv = standardization(adv)
    # Store `adv` as computed above. Previously `standardization` was applied
    # here unconditionally, which ignored `normalize=False` and standardized
    # twice when `normalize=True`.
    self.data_buffer['gae_adv'] = list(adv)