def _fit_baseline_with_data(self, samples_data):
    """Update baselines from samples.

    Recomputes the (augmented) rewards and returns via the compiled
    tensor functions, writes the unpadded values back into each path,
    refreshes the padded entries in ``samples_data``, and refits the
    baseline on the updated paths.

    Args:
        samples_data (dict): Processed sample data.
            See garage.tf.paths_to_tensors() for details.

    """
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Augment reward from baselines
    rewards_tensor = self._f_rewards(*policy_opt_input_values)
    returns_tensor = self._f_returns(*policy_opt_input_values)
    # Drop the trailing singleton axis so returns align with rewards.
    returns_tensor = np.squeeze(returns_tensor, -1)

    paths = samples_data['paths']
    valids = samples_data['valids']

    # Recompute parts of samples_data, keeping only the valid
    # (non-padded) timesteps of each path.
    aug_rewards = []
    aug_returns = []
    for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                   paths):
        # `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin `bool` is the supported equivalent.
        path['rewards'] = rew[val.astype(bool)]
        path['returns'] = ret[val.astype(bool)]
        aug_rewards.append(path['rewards'])
        aug_returns.append(path['returns'])
    samples_data['rewards'] = np_tensor_utils.pad_tensor_n(
        aug_rewards, self.max_path_length)
    samples_data['returns'] = np_tensor_utils.pad_tensor_n(
        aug_returns, self.max_path_length)

    # Fit baseline
    logger.log('Fitting baseline...')
    self._baseline.fit(paths)
def _get_baseline_prediction(self, samples_data):
    """Get baseline prediction.

    Args:
        samples_data (dict): Processed sample data.
            See garage.tf.paths_to_tensors() for details.

    Returns:
        np.ndarray: Baseline prediction, with shape
            :math:`(N, max_path_length * episode_per_task)`.

    """
    # Predict one value sequence per path, then pad to a rectangular array.
    predictions = []
    for path in samples_data['paths']:
        predictions.append(self._baseline.predict(path))
    return np_tensor_utils.pad_tensor_n(predictions, self.max_path_length)
def traj_list_to_tensors(paths, max_path_length, baseline_predictions,
                         discount):
    """Return processed sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.
        max_path_length (int): Maximum length of a single rollout.
        baseline_predictions (numpy.ndarray): Predicted value of GAE
            (Generalized Advantage Estimation) baseline.
        discount (float): Environment reward discount.

    Returns:
        dict: Processed sample data, with keys
            * observations (numpy.ndarray): Padded array of the observations
                of the environment
            * actions (numpy.ndarray): Padded array of the actions fed to
                the environment
            * rewards (numpy.ndarray): Padded array of the acquired rewards
            * agent_infos (dict): a dictionary of {stacked tensors or
                dictionary of stacked tensors}
            * env_infos (dict): a dictionary of {stacked tensors or
                dictionary of stacked tensors}
            * valids (numpy.ndarray): Padded array of the validity
                information

    """
    baselines = []
    returns = []

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = baseline_predictions[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], discount)
        returns.append(path['returns'])

    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
    ])

    # A ones-mask per path, padded with zeros, marks the real timesteps.
    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        agent_infos=agent_infos,
        env_infos=env_infos,
        valids=valids,
    )

    return samples_data
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths

    Returns:
        dict: Processed sample data, with key
            * average_return: (float)

    """
    max_path_length = self.max_path_length
    baselines = []
    returns = []

    # Prefer the batched predictor when the baseline provides one.
    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.baseline.predict(p) for p in paths]

    for path, baseline_prediction in zip(paths, all_path_baselines):
        # baselines
        path['baselines'] = baseline_prediction
        baselines.append(path['baselines'])

        # returns
        path['returns'] = special.discount_cumsum(path['rewards'],
                                                  self.discount)
        returns.append(path['returns'])

    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(path['agent_infos'], max_path_length)
        for path in paths
    ])

    valids = tensor_utils.pad_tensor_n(
        [np.ones_like(path['returns']) for path in paths], max_path_length)

    average_discounted_return = np.mean(
        [path['returns'][0] for path in paths])
    undiscounted_returns = [sum(path['rewards']) for path in paths]
    self.episode_reward_mean.extend(undiscounted_returns)

    # Entropy averaged over valid (non-padded) timesteps only.
    ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                 valids) / np.sum(valids)

    samples_data = dict(average_return=np.mean(undiscounted_returns))

    tabular.record('Iteration', itr)
    tabular.record('AverageDiscountedReturn', average_discounted_return)
    tabular.record('AverageReturn', np.mean(undiscounted_returns))
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))
    tabular.record('NumTrajs', len(paths))
    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('StdReturn', np.std(undiscounted_returns))
    tabular.record('MaxReturn', np.max(undiscounted_returns))
    tabular.record('MinReturn', np.min(undiscounted_returns))

    return samples_data
def process_samples(self, itr, paths):
    """Process rollouts into training tensors, with GAE advantages.

    Computes per-path GAE advantages and discounted returns, then builds
    either flat concatenated tensors (feed-forward policy) or padded,
    stacked tensors plus a validity mask (recurrent policy). Fits the
    baseline afterwards, so advantages use the pre-update predictions.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data, including observations, actions,
            rewards, returns, advantages, infos, and the original paths.
    """
    baselines = []
    returns = []

    if hasattr(self.algo.baseline, 'predict_n'):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        # Append a terminal value of 0 so [1:] / [:-1] index V(s_{t+1})
        # and V(s_t) respectively.
        path_baselines = np.append(all_path_baselines[idx], 0)
        # TD residuals: r_t + discount * V(s_{t+1}) - V(s_t).
        deltas = path['rewards'] + \
            self.algo.discount * path_baselines[1:] - path_baselines[:-1]
        # GAE: discounted (by discount * gae_lambda) cumsum of residuals.
        path['advantages'] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path['returns'] = special.discount_cumsum(path['rewards'],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path['returns'])

    # How well the baseline predicts the empirical returns.
    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        # Feed-forward policy: concatenate all paths into flat tensors.
        observations = tensor_utils.concat_tensor_list(
            [path['observations'] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path['actions'] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path['rewards'] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path['returns'] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path['advantages'] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path['env_infos'] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path['agent_infos'] for path in paths])

        if self.algo.center_adv:
            advantages = utils.center_advantages(advantages)

        if self.algo.positive_adv:
            advantages = utils.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path['returns'][0] for path in paths])

        undiscounted_returns = [sum(path['rewards']) for path in paths]

        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        # Recurrent policy: pad every path to the longest one and keep a
        # validity mask instead of concatenating.
        max_path_length = max([len(path['advantages']) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            # Normalize with global mean/std over all timesteps; epsilon
            # guards against division by zero.
            raw_adv = np.concatenate(
                [path['advantages'] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path['advantages'] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path['advantages'] for path in paths]
        # NOTE(review): positive_adv is only applied in the non-recurrent
        # branch — confirm this asymmetry is intentional.

        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path['returns'] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])

        # Ones for real timesteps, zeros for padding.
        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path['returns'][0] for path in paths])

        undiscounted_returns = [sum(path['rewards']) for path in paths]

        # Entropy averaged over valid timesteps only.
        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) *
            valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    # Fit the baseline only after advantages were computed from its
    # previous predictions.
    logger.log('fitting baseline...')
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log('fitted')

    tabular.record('Iteration', itr)
    tabular.record('AverageDiscountedReturn', average_discounted_return)
    tabular.record('AverageReturn', np.mean(undiscounted_returns))
    tabular.record('ExplainedVariance', ev)
    tabular.record('NumTrajs', len(paths))
    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('StdReturn', np.std(undiscounted_returns))
    tabular.record('MaxReturn', np.max(undiscounted_returns))
    tabular.record('MinReturn', np.min(undiscounted_returns))

    return samples_data
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths

    Returns:
        dict: Processed sample data, with key
            * average_return: (float)

    """
    max_path_length = self.max_path_length
    baselines = []
    returns = []

    # Prefer the batched predictor when the baseline provides one.
    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.baseline.predict(p) for p in paths]

    for path, baseline_prediction in zip(paths, all_path_baselines):
        # baselines
        path['baselines'] = baseline_prediction
        baselines.append(path['baselines'])

        # returns
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    def _padded(key):
        # Pad one per-path array per key to a rectangular array.
        return tensor_utils.pad_tensor_n(
            [path[key] for path in paths], max_path_length)

    def _stacked(key):
        # Pad and stack one per-path info dict per key.
        return tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(path[key], max_path_length)
            for path in paths
        ])

    obs = _padded('observations')
    actions = _padded('actions')
    rewards = _padded('rewards')
    agent_infos = _stacked('agent_infos')
    env_infos = _stacked('env_infos')
    valids = tensor_utils.pad_tensor_n(
        [np.ones_like(path['returns']) for path in paths], max_path_length)

    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
        discount=self.discount)
    self.episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))

    return dict(average_return=np.mean(undiscounted_returns))
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths

    Returns:
        dict: Processed sample data, with key
            * average_return: (float)

    """
    max_path_length = self.max_path_length
    baselines = []
    returns = []

    # Prefer the batched predictor when the baseline provides one.
    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.baseline.predict(p) for p in paths]

    for path, baseline_prediction in zip(paths, all_path_baselines):
        # baselines
        path['baselines'] = baseline_prediction
        baselines.append(path['baselines'])
        # returns
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    obs = tensor_utils.pad_tensor_n(
        [path['observations'] for path in paths], max_path_length)
    actions = tensor_utils.pad_tensor_n(
        [path['actions'] for path in paths], max_path_length)
    rewards = tensor_utils.pad_tensor_n(
        [path['rewards'] for path in paths], max_path_length)

    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(path['agent_infos'], max_path_length)
        for path in paths
    ])
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(path['env_infos'], max_path_length)
        for path in paths
    ])

    terminals = [path['dones'] for path in paths]
    valids = tensor_utils.pad_tensor_n(
        [np.ones_like(path['returns']) for path in paths], max_path_length)
    # Per-path lengths recovered from the (zero-padded) validity masks.
    lengths = np.asarray([v.sum() for v in valids])

    # Entropy averaged over valid (non-padded) timesteps only.
    ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                 valids) / np.sum(valids)

    undiscounted_returns = self.evaluate_performance(
        itr,
        dict(env_spec=None,
             observations=obs,
             actions=actions,
             rewards=rewards,
             terminals=terminals,
             env_infos=env_infos,
             agent_infos=agent_infos,
             lengths=lengths,
             discount=self.discount))
    self.episode_reward_mean.extend(undiscounted_returns)

    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))

    return dict(average_return=np.mean(undiscounted_returns))