def _fit_reward_baseline_compute_advantages(self, paths):
    """
    Only to be called if return_baseline is provided. Computes GAE advantage estimates.
    """
    assert self.return_baseline is not None

    # a) compute returns
    for idx, path in enumerate(paths):
        path["returns"] = utils.discount_cumsum(path["rewards"], self.discount)

    # b) fit return baseline estimator using the path returns and predict the return baselines
    self.return_baseline.fit(paths, target_key='returns')
    all_path_baselines = [self.return_baseline.predict(path) for path in paths]

    # c) generalized advantage estimation
    for idx, path in enumerate(paths):
        # append a terminal value of 0 so the last TD residual bootstraps from V(s_{T+1}) = 0
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
                 self.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = utils.discount_cumsum(
            deltas, self.discount * self.gae_lambda)

    # d) pad paths and stack them
    advantages = []
    for path in paths:
        path_length = path["observations"].shape[0]
        advantages.append(self._pad(path["advantages"], path_length))
    advantages = np.stack(advantages, axis=0)

    # e) if desired, normalize / shift advantages
    if self.normalize_adv:
        advantages = utils.normalize_advantages(advantages)
    if self.positive_adv:
        advantages = utils.shift_advantages_to_positive(advantages)

    return paths, advantages
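# `utils.discount_cumsum` is not shown in this snippet; it is assumed here to be the
# standard rllab-style discounted cumulative sum, out[t] = sum_{l >= 0} discount^l * x[t + l].
# A minimal sketch under that assumption:
import numpy as np
import scipy.signal

def discount_cumsum(x, discount):
    # filtering the reversed sequence implements the reverse-time recursion
    # y[t] = x[t] + discount * y[t + 1]
    return scipy.signal.lfilter([1], [1, -float(discount)], x[::-1], axis=0)[::-1]

# e.g. discount_cumsum(np.ones(3), 0.99) -> [2.9701, 1.99, 1.0]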
def _compute_advantages(self, paths, all_path_baselines):
    assert len(paths) == len(all_path_baselines)
    for idx, path in enumerate(paths):
        # append a terminal value of 0, then compute TD residuals and their
        # (discount * gae_lambda)-discounted cumulative sum (GAE)
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
                 self.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = utils.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
    return paths
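# A minimal, self-contained check (illustrative values, not part of the library)
# that the (gamma * lambda)-discounted cumulative sum of the TD residuals above
# equals the GAE definition A_t = sum_{l >= 0} (gamma * lambda)^l * delta_{t+l}:
import numpy as np

gamma, lam = 0.99, 0.95
rewards = np.array([1.0, 0.5, 2.0])
path_baselines = np.append(np.array([0.8, 0.6, 1.5]), 0)  # V(s_t) with terminal value 0
deltas = rewards + gamma * path_baselines[1:] - path_baselines[:-1]

# reverse recursion A_t = delta_t + gamma * lam * A_{t+1}, i.e. what the
# discounted cumulative sum computes
advantages = np.zeros_like(deltas)
running = 0.0
for t in reversed(range(len(deltas))):
    running = deltas[t] + gamma * lam * running
    advantages[t] = running

# explicit nested sum, directly from the definition
explicit = np.array([sum((gamma * lam) ** l * deltas[t + l]
                         for l in range(len(deltas) - t))
                     for t in range(len(deltas))])
assert np.allclose(advantages, explicit)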
def testFit(self):
    paths = self.sampler.obtain_samples()
    for task in paths.values():
        unfit_error = 0
        for path in task:
            path["returns"] = utils.discount_cumsum(path["rewards"], 0.99)
            unfit_pred = self.linear.predict(path)
            unfit_error += sum([np.square(pred - actual)
                                for pred, actual in zip(unfit_pred, path['returns'])])
        self.linear.fit(task)
        fit_error = 0
        for path in task:
            fit_pred = self.linear.predict(path)
            fit_error += sum([np.square(pred - actual)
                              for pred, actual in zip(fit_pred, path['returns'])])
        # fitting on the task's paths should reduce the squared prediction error
        self.assertLess(fit_error, unfit_error)
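# `self.linear` presumably refers to a LinearFeatureBaseline-style return estimator.
# A hypothetical minimal sketch of that idea (rllab-style observation/time features
# with regularized least squares); the class name and feature set are assumptions:
import numpy as np

class LinearFeatureBaselineSketch:
    def __init__(self, reg_coeff=1e-5):
        self._coeffs = None
        self._reg_coeff = reg_coeff

    def _features(self, path):
        obs = np.clip(path["observations"], -10, 10)
        t = np.arange(len(obs)).reshape(-1, 1) / 100.0
        return np.concatenate([obs, obs ** 2, t, t ** 2, t ** 3, np.ones_like(t)], axis=1)

    def fit(self, paths):
        feats = np.concatenate([self._features(p) for p in paths])
        returns = np.concatenate([p["returns"] for p in paths])
        # regularized least squares: (F^T F + reg * I) w = F^T y
        self._coeffs = np.linalg.lstsq(
            feats.T @ feats + self._reg_coeff * np.eye(feats.shape[1]),
            feats.T @ returns, rcond=None)[0]

    def predict(self, path):
        # an unfit baseline predicts zeros for every timestep
        if self._coeffs is None:
            return np.zeros(len(path["observations"]))
        return self._features(path) @ self._coeffs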
def _compute_samples_data(self, paths):
    assert isinstance(paths, list)

    # 1) compute discounted rewards (returns)
    for idx, path in enumerate(paths):
        path["returns"] = utils.discount_cumsum(path["rewards"], self.discount)

    # 2) fit baseline estimator using the path returns and predict the return baselines
    self.baseline.fit(paths, target_key="returns")
    all_path_baselines = [self.baseline.predict(path) for path in paths]

    # 3) compute advantages and adjusted rewards
    paths = self._compute_advantages(paths, all_path_baselines)

    # 4) stack path data
    observations, actions, rewards, dones, returns, advantages, env_infos, agent_infos = \
        self._concatenate_path_data(copy.deepcopy(paths))

    # 5) if desired, normalize / shift advantages
    if self.normalize_adv:
        advantages = utils.normalize_advantages(advantages)
    if self.positive_adv:
        advantages = utils.shift_advantages_to_positive(advantages)

    # 6) create samples_data object
    samples_data = dict(
        observations=observations,
        actions=actions,
        rewards=rewards,
        dones=dones,
        returns=returns,
        advantages=advantages,
        env_infos=env_infos,
        agent_infos=agent_infos,
    )

    return samples_data, paths
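# `utils.normalize_advantages` and `utils.shift_advantages_to_positive` are referenced
# above but not defined in this snippet; minimal assumed equivalents (standardize to
# zero mean / unit variance, and offset so every advantage is strictly positive):
import numpy as np

def normalize_advantages(advantages):
    # epsilon guards against division by a zero standard deviation
    return (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

def shift_advantages_to_positive(advantages):
    return advantages - np.min(advantages) + 1e-8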
def testSerialize(self):
    paths = self.sampler.obtain_samples()
    for task in paths.values():
        for path in task:
            path["returns"] = utils.discount_cumsum(path["rewards"], 0.99)
        self.linear.fit(task)
        fit_error_pre = 0
        for path in task:
            fit_pred = self.linear.predict(path)
            fit_error_pre += sum([np.square(pred - actual)
                                  for pred, actual in zip(fit_pred, path['returns'])])
        # round-trip the fitted baseline through pickle; predictions (and hence
        # the fit error) must be unchanged
        pkl = pickle.dumps(self.linear)
        self.linear = pickle.loads(pkl)
        fit_error_post = 0
        for path in task:
            fit_pred = self.linear.predict(path)
            fit_error_post += sum([np.square(pred - actual)
                                   for pred, actual in zip(fit_pred, path['returns'])])
        self.assertEqual(fit_error_pre, fit_error_post)
from meta_mb.samplers.base import SampleProcessor