Example #1
File: aoc.py Project: StepNeverStop/RLs
def _preprocess_BATCH(self, BATCH):  # [T, B, *]
    BATCH = super()._preprocess_BATCH(BATCH)
    BATCH.reward += BATCH.reward_offset

    # Expand integer option indices into one-hot vectors.
    BATCH.last_options = int2one_hot(BATCH.last_options, self.options_num)
    BATCH.options = int2one_hot(BATCH.options, self.options_num)
    # Bootstrap value of the final next-observation under the final option.
    value = self._get_value(BATCH.obs_[-1],
                            BATCH.options[-1],
                            rnncs=self.rnncs)
    # Discounted return, bootstrapped with the value estimate above.
    BATCH.discounted_reward = discounted_sum(BATCH.reward,
                                             self.gamma,
                                             BATCH.done,
                                             BATCH.begin_mask,
                                             init_value=value)
    # One-step TD errors, then GAE advantages as their (gamma * lambda)-discounted sum.
    td_error = calculate_td_error(
        BATCH.reward,
        self.gamma,
        BATCH.done,
        value=BATCH.value,
        next_value=np.concatenate((BATCH.value[1:], value[np.newaxis, :]),
                                  0))
    BATCH.gae_adv = discounted_sum(td_error,
                                   self.lambda_ * self.gamma,
                                   BATCH.done,
                                   BATCH.begin_mask,
                                   init_value=0.,
                                   normalize=True)
    return BATCH
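Example #1 relies on int2one_hot to turn the integer option indices into one-hot vectors before they reach discounted_sum and the networks. The helper below is only an illustrative sketch consistent with that usage, not the RLs implementation:

import numpy as np

def int2one_hot(indices, depth):
    # Hypothetical sketch: map integer indices of shape [T, B] to
    # one-hot vectors of shape [T, B, depth].
    indices = np.asarray(indices, dtype=np.int64)
    one_hot = np.zeros(indices.shape + (depth,), dtype=np.float32)
    np.put_along_axis(one_hot, indices[..., np.newaxis], 1.0, axis=-1)
    return one_hot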
Example #2
File: pg.py Project: StepNeverStop/RLs
def _preprocess_BATCH(self, BATCH):  # [T, B, *]
    BATCH = super()._preprocess_BATCH(BATCH)
    BATCH.discounted_reward = discounted_sum(BATCH.reward,
                                             self.gamma,
                                             BATCH.done,
                                             BATCH.begin_mask,
                                             init_value=0.,
                                             normalize=True)
    return BATCH
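Examples #1 and #2 both funnel through discounted_sum with the call pattern (x, gamma, done, begin_mask, init_value=..., normalize=...). The sketch below shows one plausible backward-recursion implementation matching that pattern; it is an assumption for illustration (begin_mask handling is omitted), and note that Examples #3 through #6 use an older positional signature (x, gamma, init_value, done):

import numpy as np

def discounted_sum(x, gamma, done, begin_mask, init_value=0., normalize=False):
    # Sketch under assumptions: x and done are [T, B]; init_value bootstraps
    # the step after the last one; begin_mask (sequence starts) is ignored here.
    x = np.asarray(x, dtype=np.float32)
    done = np.asarray(done, dtype=np.float32)
    out = np.zeros_like(x)
    running = np.asarray(init_value, dtype=np.float32)
    for t in reversed(range(x.shape[0])):
        running = x[t] + gamma * (1. - done[t]) * running
        out[t] = running
    if normalize:
        out = (out - out.mean()) / (out.std() + 1e-8)
    return out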
Example #3
def cal_gae_adv(self, lambda_, gamma):
    '''
    Compute the GAE advantage estimate.
    adv = td(s) + gamma * lambda * (1 - done) * td(s')
    '''
    assert 'td_error' in self.buffer.keys()
    adv = np.asarray(
        discounted_sum(self.buffer['td_error'], lambda_ * gamma, 0,
                       self.buffer['done']))
    self.buffer['gae_adv'] = list(standardization(adv))
Example #4
def cal_dc_r(self, gamma, init_value, normalize=False):
    '''
    Compute the discounted return.
    param gamma: discount factor, gamma ∈ [0, 1)
    param init_value: value estimate of the final state in the sequence
    '''
    dc_r = discounted_sum(self.buffer['r'], gamma, init_value,
                          self.buffer['done'])
    if normalize:
        dc_r = standardization(np.asarray(dc_r))
    self.buffer['discounted_reward'] = list(dc_r)
Example #5
def cal_dc_r(self, gamma, init_value, normalize=False):
    '''
    Compute the discounted return.
    param gamma: discount factor, gamma ∈ [0, 1)
    param init_value: value estimate of the final state in the sequence
    '''
    dc_r = discounted_sum(self.buffer['r'], gamma, init_value,
                          self.buffer['done'])
    if normalize:
        dc_r -= np.mean(dc_r)
        dc_r /= np.std(dc_r)
    self.buffer['discounted_reward'] = list(dc_r)
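Example #5 inlines the mean/std normalization that Examples #3 and #4 delegate to a standardization helper. A matching helper could be as small as the following (a hypothetical sketch mirroring the inline version above; the epsilon guard is an added assumption):

import numpy as np

def standardization(x, eps=1e-8):
    # Zero-mean, unit-variance scaling of the whole array.
    x = np.asarray(x, dtype=np.float32)
    return (x - x.mean()) / (x.std() + eps)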
Example #6
def cal_gae_adv(self, lambda_, gamma, normalize=False):
    '''
    Compute the GAE advantage estimate.
    adv = td(s) + gamma * lambda * (1 - done) * td(s')
    '''
    assert 'td_error' in self.data_buffer.keys(), \
        "'td_error' must be present in self.data_buffer"
    # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
    # Eq (10): delta_t = r_t + gamma * V_{t+1} - V_t
    # Eq (16): adv_t = delta_t + (gamma * lambda) * delta_{t+1} + (gamma * lambda)^2 * delta_{t+2} + ...
    adv = np.asarray(
        discounted_sum(self.data_buffer['td_error'], lambda_ * gamma, 0,
                       self.data_buffer['done']))
    if normalize:
        adv = standardization(adv)
    self.data_buffer['gae_adv'] = list(adv)
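The comments in Example #6 cite Eq (10) and Eq (16) of the GAE paper: the one-step TD residual and its (gamma * lambda)-discounted sum. The snippet below ties those two steps together in one place for a single trajectory; it is an illustrative sketch, and gae_advantage is a hypothetical name rather than the project's calculate_td_error / discounted_sum pair:

import numpy as np

def gae_advantage(rewards, values, next_values, dones, gamma=0.99, lambda_=0.95):
    # Eq (10): delta_t = r_t + gamma * (1 - done_t) * V(s_{t+1}) - V(s_t)
    deltas = rewards + gamma * (1. - dones) * next_values - values
    # Eq (16): adv_t = sum_l (gamma * lambda)^l * delta_{t+l}, truncated at episode ends
    adv = np.zeros_like(deltas)
    running = 0.
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lambda_ * (1. - dones[t]) * running
        adv[t] = running
    return adv

For instance, with rewards = values = np.ones(3, dtype=np.float32), next_values = np.array([1., 1., 0.], dtype=np.float32) and dones = np.array([0., 0., 1.], dtype=np.float32), the call returns the per-step advantages of a three-step episode under gamma = 0.99 and lambda_ = 0.95.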