def process_rollout(self, rollout, gamma, lambda_=1.0):
    """Turn a finished rollout into a training batch.

    Computes discounted returns and Generalized Advantage Estimation
    (GAE) advantages, then packs everything into a util.Batch.

    Args:
        rollout: collected trajectory (states, actions, rewards, values,
            time, meta, features, terminal flag, bootstrap value r).
        gamma: discount factor.
        lambda_: GAE smoothing parameter (1.0 = plain discounted advantage).
    """
    states = np.asarray(rollout.states)
    actions = np.asarray(rollout.actions)
    raw_rewards = np.asarray(rollout.rewards)
    steps = np.asarray(rollout.time)
    meta_info = np.asarray(rollout.meta)

    # Append the bootstrap value estimate of the final state so the
    # discounted sums account for reward beyond the rollout horizon.
    values = np.asarray(rollout.values + [rollout.r])
    rewards_with_bootstrap = np.asarray(rollout.rewards + [rollout.r])
    returns = util.discount(rewards_with_bootstrap, gamma)[:-1]

    # One-step TD residuals; discounting them by gamma * lambda_ yields
    # the GAE advantage estimator ("Generalized Advantage Estimation",
    # https://arxiv.org/abs/1506.02438).
    td_residuals = raw_rewards + gamma * values[1:] - values[:-1]
    advantages = util.discount(td_residuals, gamma * lambda_)

    return util.Batch(si=states, a=actions, adv=advantages, r=returns,
                      terminal=rollout.terminal,
                      features=rollout.features[0],
                      reward=raw_rewards, step=steps, meta=meta_info)
def sample(self, length):
    """Sample a contiguous segment of up to `length` transitions from the
    replay buffer.

    Two modes, selected by self.sampling:
      - 'rand':  pick a random start index; retry (recursively) if the start
        lands on a terminal transition.
      - otherwise: walk sequentially through the buffer, resuming from
        self.sample_idx and advancing it (wrapping to 0 at the end).

    The segment is truncated early at a terminal transition or at the end
    of the buffer.

    Returns:
        (batch, is_initial_state) where batch is a util.Batch holding the
        sliced states/actions/rewards/timesteps/meta, and is_initial_state
        is True when the segment starts an episode (index 0, or the
        preceding transition was terminal).
    """
    size = len(self.s)
    is_initial_state = False
    if self.sampling == 'rand':
        # NOTE(review): randint's high bound is exclusive, so index size-1
        # can never be chosen as a start — presumably intentional so the
        # segment has at least one successor; confirm.
        idx = np.random.randint(0, size-1)
        if self.term[idx]:
            # Start landed on a terminal step: resample.
            # NOTE(review): unbounded recursion if the buffer is mostly
            # terminal states — verify callers guarantee otherwise.
            return self.sample(length)
        # Extend the segment until a terminal step or the buffer end.
        for end_idx in range(idx, idx + length):
            if self.term[end_idx] or end_idx == size-1:
                break
        is_initial_state = (idx > 0 and self.term[idx-1]) or idx == 0
    else:
        # Sequential mode: resume from the cursor left by the previous call.
        idx = self.sample_idx
        if self.term[idx]:
            # Skip past a terminal step so the segment starts inside an
            # episode.
            # NOTE(review): if idx == size-1 and terminal, idx becomes
            # size and the loop below would index out of range — confirm
            # the wrap-around below makes this unreachable.
            idx = idx + 1
        for end_idx in range(idx, idx + length):
            if self.term[end_idx] or end_idx == size-1:
                break
        # Advance the cursor past this segment, wrapping at the buffer end.
        self.sample_idx = end_idx + 1 if end_idx < size-1 else 0
        is_initial_state = (idx > 0 and self.term[idx-1]) or idx == 0
    # The segment either reached full length, hit a terminal step, or hit
    # the end of the buffer.
    assert end_idx == idx + length - 1 or self.term[end_idx] or end_idx == size-1
    # NOTE(review): terminal is the FULL self.term list, not the
    # [idx:end_idx+1] slice used for every other field — looks
    # inconsistent; confirm consumers expect the whole list.
    return util.Batch(si=np.asarray(self.s[idx:end_idx+1]),
                      a=np.asarray(self.a[idx:end_idx+1]),
                      adv=None,
                      r=None,
                      terminal=self.term,
                      features=[],
                      reward=np.asarray(self.r[idx:end_idx+1]),
                      step=np.asarray(self.t[idx:end_idx+1]),
                      meta=np.asarray(self.r_t[idx:end_idx+1])), is_initial_state
def process_rollout(self, rollout, gamma, lambda_=1.0):
    """Turn a finished rollout into a training batch of discounted returns.

    Unlike the GAE variant, no advantage is computed (adv=None) and the
    timestep array is handed to util.discount as a third argument.

    Args:
        rollout: collected trajectory (states, actions, rewards, time,
            meta, features, terminal flag, bootstrap value r).
        gamma: discount factor.
        lambda_: accepted for signature compatibility; unused here.
    """
    step_times = np.asarray(rollout.time)

    # Append the bootstrap value estimate of the final state, then
    # discount; this variant's util.discount takes the timesteps too.
    rewards_with_bootstrap = np.asarray(rollout.rewards + [rollout.r])
    returns = util.discount(rewards_with_bootstrap, gamma, step_times)

    return util.Batch(si=np.asarray(rollout.states),
                      a=np.asarray(rollout.actions),
                      adv=None,
                      r=returns,
                      terminal=rollout.terminal,
                      features=rollout.features[0],
                      reward=np.asarray(rollout.rewards),
                      step=step_times,
                      meta=np.asarray(rollout.meta))