def process_samples(self, paths):
    r"""Process sample data based on the collected paths.

    Notes: P is the maximum path length (self.max_path_length)

    Args:
        paths (list[dict]): A list of collected paths.

    Returns:
        torch.Tensor: The observations of the environment
            with shape :math:`(N, P, O*)`.
        torch.Tensor: The actions fed to the environment
            with shape :math:`(N, P, A*)`.
        torch.Tensor: The acquired rewards with shape :math:`(N, P)`.
        torch.Tensor: The discounted returns with shape :math:`(N, P)`.
        torch.Tensor: Numbers of valid steps in each path
            with shape :math:`(N,)`.
        torch.Tensor: Value function estimation at each step
            with shape :math:`(N, P)`.

    """
    valids = torch.Tensor([len(path['actions']) for path in paths]).int()
    obs = torch.stack([
        pad_to_last(path['observations'],
                    total_length=self.max_path_length,
                    axis=0) for path in paths
    ])
    actions = torch.stack([
        pad_to_last(path['actions'],
                    total_length=self.max_path_length,
                    axis=0) for path in paths
    ])
    rewards = torch.stack([
        pad_to_last(path['rewards'], total_length=self.max_path_length)
        for path in paths
    ])
    returns = torch.stack([
        pad_to_last(tu.discount_cumsum(path['rewards'],
                                       self.discount).copy(),
                    total_length=self.max_path_length) for path in paths
    ])
    with torch.no_grad():
        baselines = self._value_function(obs)

    return obs, actions, rewards, returns, valids, baselines
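# For reference, a minimal sketch of the `pad_to_last` helper assumed by
# `process_samples` above. This is a hypothetical stand-in, not the
# library's implementation; the real helper may differ in signature,
# dtype handling, and pad value.
import numpy as np
import torch
import torch.nn.functional as F


def pad_to_last(nums, total_length, axis=0, val=0):
    """Zero-pad `nums` along `axis` so that dimension has `total_length`."""
    tensor = torch.as_tensor(np.asarray(nums), dtype=torch.float32)
    axis = axis % tensor.dim()
    if tensor.shape[axis] > total_length:
        raise ValueError('total_length is shorter than the input')
    # F.pad takes (before, after) pairs starting from the LAST dimension,
    # so the pair for `axis` sits at offset 2 * (ndim - 1 - axis).
    padding = [0] * (2 * tensor.dim())
    padding[2 * (tensor.dim() - 1 - axis) + 1] = \
        total_length - tensor.shape[axis]
    return F.pad(tensor, padding, value=val)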
def _process_samples(self, paths):
    """Process sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.

    Returns:
        MAMLTrajectoryBatch: Processed samples data.

    """
    for path in paths:
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], self._inner_algo.discount).copy()
    self._train_value_function(paths)
    obs, actions, rewards, _, valids, baselines = \
        self._inner_algo.process_samples(paths)
    return MAMLTrajectoryBatch(paths, obs, actions, rewards, valids,
                               baselines)
def process_samples(self, itr, paths):
    """Process sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        tuple:
            * obs (torch.Tensor): The observations of the environment.
            * actions (torch.Tensor): The actions fed to the environment.
            * rewards (torch.Tensor): The acquired rewards.
            * valids (numpy.ndarray): Numbers of valid steps in each path.
            * baselines (torch.Tensor): Value function estimation
                at each step.

    """
    for path in paths:
        if 'returns' not in path:
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
    obs = torch.stack([
        pad_to_last(path['observations'],
                    total_length=self.max_path_length,
                    axis=0) for path in paths
    ])
    actions = torch.stack([
        pad_to_last(path['actions'],
                    total_length=self.max_path_length,
                    axis=0) for path in paths
    ])
    rewards = torch.stack([
        pad_to_last(path['rewards'], total_length=self.max_path_length)
        for path in paths
    ])
    valids = np.array([len(path['actions']) for path in paths])
    baselines = torch.stack([
        pad_to_last(self._get_baselines(path),
                    total_length=self.max_path_length) for path in paths
    ])

    return Samples(obs, actions, rewards, valids, baselines)
def _process_samples(self, itr, paths):
    """Process sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        MAMLTrajectoryBatch: Processed samples data.

    """
    for path in paths:
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], self._inner_algo.discount)
    self._baseline.fit(paths)
    obs, actions, rewards, valids, baselines = \
        self._inner_algo.process_samples(itr, paths)
    return MAMLTrajectoryBatch(paths, obs, actions, rewards, valids,
                               baselines)
def log_performance(itr, batch, discount, prefix='Evaluation'):
    """Evaluate the performance of an algorithm on a batch of trajectories.

    Args:
        itr (int): Iteration number.
        batch (TrajectoryBatch): The trajectories to evaluate with.
        discount (float): Discount value, from algorithm's property.
        prefix (str): Prefix to add to all logged keys.

    Returns:
        list[float]: Undiscounted returns.
        float: Mean success rate, or NaN if no trajectory reports a
            'success' key in its env_infos.

    """
    returns = []
    undiscounted_returns = []
    completion = []
    success = []
    for trajectory in batch.split():
        returns.append(discount_cumsum(trajectory.rewards, discount))
        undiscounted_returns.append(sum(trajectory.rewards))
        completion.append(float(trajectory.terminals.any()))
        if 'success' in trajectory.env_infos:
            success.append(float(trajectory.env_infos['success'].any()))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumTrajs', len(returns))
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('CompletionRate', np.mean(completion))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_returns, np.mean(success) if success else np.nan
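# For reference, one common implementation of the `discount_cumsum` used
# throughout this file. This filter-based form is standard in
# rllab-descended codebases, but treat it as a sketch rather than the
# exact library code:
#     discount_cumsum(x, d)[t] = x[t] + d * x[t+1] + d**2 * x[t+2] + ...
import numpy as np
import scipy.signal


def discount_cumsum(x, discount):
    """Discounted cumulative sum over axis 0, computed back-to-front."""
    # Running the IIR filter y[n] = x[n] + discount * y[n-1] over the
    # reversed signal and reversing the output again realizes the
    # backward recursion y[t] = x[t] + discount * y[t+1].
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1],
                                axis=0)[::-1]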
def paths_to_tensors(paths, max_path_length, baseline_predictions, discount):
    """Return processed sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.
        max_path_length (int): Maximum length of a single rollout.
        baseline_predictions (numpy.ndarray): Predicted value of GAE
            (Generalized Advantage Estimation) baseline.
        discount (float): Environment reward discount.

    Returns:
        dict: Processed sample data, with keys
            * observations (numpy.ndarray): Padded array of the
                observations of the environment.
            * actions (numpy.ndarray): Padded array of the actions fed
                to the environment.
            * rewards (numpy.ndarray): Padded array of the acquired
                rewards.
            * agent_infos (dict): A dictionary of {stacked tensors or
                dictionary of stacked tensors}.
            * env_infos (dict): A dictionary of {stacked tensors or
                dictionary of stacked tensors}.
            * valids (numpy.ndarray): Padded array of the validity
                information.

    """
    baselines = []
    returns = []

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = baseline_predictions[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], discount)
        returns.append(path['returns'])

    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list(
        [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    samples_data = dict(observations=obs,
                        actions=actions,
                        rewards=rewards,
                        agent_infos=agent_infos,
                        env_infos=env_infos,
                        valids=valids)

    return samples_data
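# For reference, a minimal sketch of the numpy padding helpers assumed by
# `paths_to_tensors`. These are hypothetical stand-ins; the real
# `tensor_utils` versions may handle dtypes and nested dicts differently.
import numpy as np


def pad_tensor(x, max_len):
    """Zero-pad a single array along axis 0 up to max_len entries."""
    x = np.asarray(x)
    pad = np.zeros((max_len - len(x), ) + x.shape[1:], dtype=x.dtype)
    return np.concatenate([x, pad])


def pad_tensor_n(xs, max_len):
    """Pad every array in xs to max_len and stack into one batch array."""
    return np.stack([pad_tensor(x, max_len) for x in xs])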
def paths_to_tensors(paths, max_path_length, baseline_predictions, discount,
                     gae_lambda):
    """Return processed sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.
        max_path_length (int): Maximum length of a single rollout.
        baseline_predictions (numpy.ndarray): Predicted value of GAE
            (Generalized Advantage Estimation) baseline.
        discount (float): Environment reward discount.
        gae_lambda (float): Lambda used for generalized advantage
            estimation.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * lengths: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])

    """
    baselines = []
    returns = []
    total_steps = 0

    for idx, path in enumerate(paths):
        total_steps += len(path['rewards'])
        path_baselines = np.append(baseline_predictions[idx], 0)
        deltas = (path['rewards'] + discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = np_tensor_utils.discount_cumsum(
            deltas, discount * gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = baseline_predictions[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_path_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list(
        [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    lengths = np.asarray([v.sum() for v in valids])

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        lengths=lengths,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
    )

    return samples_data
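# The advantage computation in `paths_to_tensors` above is generalized
# advantage estimation (GAE, Schulman et al. 2016):
#     delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
#     A_t     = sum_{k>=0} (gamma * lambda)^k * delta_{t+k}
# which is exactly discount_cumsum(deltas, discount * gae_lambda); the 0
# appended to the baseline predictions plays the role of V(s_T) = 0 at
# the end of the rollout.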
def log_performance(self, indices, test, epoch):
    """Evaluate and log average returns for specific tasks.

    Args:
        indices (list): List of task indices to evaluate on.
        test (bool): Whether to evaluate on test tasks (True) or train
            tasks (False).
        epoch (int): Current epoch, used as the logged iteration number.

    """
    discounted_returns = []
    undiscounted_returns = []
    completion = []
    success = []
    traj = []
    for idx in indices:
        eval_paths = []
        for _ in range(self._num_evals):
            paths = self.collect_paths(idx, test)
            paths[-1]['terminals'] = paths[-1]['terminals'].squeeze()
            paths[-1]['dones'] = paths[-1]['terminals']
            # HalfCheetahVel env
            if 'task' in paths[-1]['env_infos'].keys():
                paths[-1]['env_infos']['task'] = paths[-1]['env_infos'][
                    'task']['velocity']
            eval_paths.append(paths[-1])
            discounted_returns.append(
                discount_cumsum(paths[-1]['rewards'], self._discount))
            undiscounted_returns.append(sum(paths[-1]['rewards']))
            completion.append(float(paths[-1]['terminals'].any()))
            # calculate success rate for metaworld tasks
            if 'success' in paths[-1]['env_infos']:
                success.append(paths[-1]['env_infos']['success'].any())

        if test:
            env = self.test_env[idx]()
        else:
            env = self.env[idx]()
        traj.append(TrajectoryBatch.from_trajectory_list(env, eval_paths))

    if test:
        with tabular.prefix('Test/'):
            if self._test_task_names:
                log_multitask_performance(
                    epoch,
                    TrajectoryBatch.concatenate(*traj),
                    self._discount,
                    task_names=self._test_task_names)
            log_performance(epoch,
                            TrajectoryBatch.concatenate(*traj),
                            self._discount,
                            prefix='Average')
    else:
        with tabular.prefix('Train/'):
            if self._train_task_names:
                log_multitask_performance(
                    epoch,
                    TrajectoryBatch.concatenate(*traj),
                    self._discount,
                    task_names=self._train_task_names)
            log_performance(epoch,
                            TrajectoryBatch.concatenate(*traj),
                            self._discount,
                            prefix='Average')
def _process_samples(self, itr, paths):
    # pylint: disable=too-many-statements
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (OrderedDict[dict]): A list of collected paths for each
            task. In RL^2, there are n environments/tasks and paths in
            each of them will be concatenated at some point and fed to
            the policy.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])
            * average_return: (numpy.float64)

    Raises:
        ValueError: If 'batch_idx' is not found.

    """
    concatenated_paths = []

    paths_by_task = collections.defaultdict(list)
    for path in paths:
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], self._discount)
        path['lengths'] = [len(path['rewards'])]
        if 'batch_idx' in path:
            paths_by_task[path['batch_idx']].append(path)
        elif 'batch_idx' in path['agent_infos']:
            paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
        else:
            raise ValueError(
                'Batch idx is required for RL2 but not found. '
                'Make sure to use metarl.tf.algos.rl2.RL2Worker '
                'for sampling.')

    # all paths in paths_by_task[i] are sampled from task[i]
    for _paths in paths_by_task.values():
        concatenated_path = self._concatenate_paths(_paths)
        concatenated_paths.append(concatenated_path)

    # stack and pad to max path length of the concatenated path,
    # which will be fed to the inner algo,
    # i.e. max_path_length * episode_per_task
    concatenated_paths_stacked = (
        np_tensor_utils.stack_and_pad_tensor_dict_list(
            concatenated_paths, self._inner_algo.max_path_length))

    name_map = None
    if hasattr(self._task_sampler, '_envs') and hasattr(
            self._task_sampler._envs[0].env, 'all_task_names'):
        names = [
            env.env.all_task_names[0] for env in self._task_sampler._envs
        ]
        name_map = dict(enumerate(names))

    undiscounted_returns = log_multitask_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
        self._inner_algo._discount,
        name_map=name_map)

    concatenated_paths_stacked['paths'] = concatenated_paths
    concatenated_paths_stacked['average_return'] = np.mean(
        undiscounted_returns)

    return concatenated_paths_stacked
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data, with key
            * average_return: (float)

    """
    baselines = []
    returns = []

    max_path_length = self.max_path_length

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                 valids) / np.sum(valids)

    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
        discount=self.discount)
    self.episode_reward_mean.extend(undiscounted_returns)

    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))

    samples_data = dict(average_return=np.mean(undiscounted_returns))

    return samples_data
def _process_samples(self, itr, paths):
    # pylint: disable=too-many-statements
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (OrderedDict[dict]): A list of collected paths for each
            task. In RL^2, there are n environments/tasks and paths in
            each of them will be concatenated at some point and fed to
            the policy.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])
            * average_return: (numpy.float64)

    Raises:
        ValueError: If 'batch_idx' is not found.

    """
    concatenated_path_in_meta_batch = []
    lengths = []

    paths_by_task = collections.defaultdict(list)
    for path in paths:
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], self._discount)
        path['lengths'] = [len(path['rewards'])]
        if 'batch_idx' in path:
            paths_by_task[path['batch_idx']].append(path)
        elif 'batch_idx' in path['agent_infos']:
            paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
        else:
            raise ValueError('Batch idx is required for RL2 but not found')

    # all paths in paths_by_task[i] are sampled from task[i]
    for task_paths in paths_by_task.values():
        concatenated_path = self._concatenate_paths(task_paths)
        concatenated_path_in_meta_batch.append(concatenated_path)

    # prepare paths for the inner algorithm:
    # pad the concatenated paths
    observations, actions, rewards, terminals, returns, valids, lengths, \
        env_infos, agent_infos = self._stack_paths(
            max_len=self._inner_algo.max_path_length,
            paths=concatenated_path_in_meta_batch)

    # prepare paths for performance evaluation:
    # performance is evaluated across all paths, so each path
    # is padded with self._max_path_length
    _observations, _actions, _rewards, _terminals, _, _valids, _lengths, \
        _env_infos, _agent_infos = self._stack_paths(
            max_len=self._max_path_length, paths=paths)

    ent = np.sum(self._policy.distribution.entropy(agent_infos) *
                 valids) / np.sum(valids)

    undiscounted_returns = log_multitask_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
        self._inner_algo.discount,
        task_names=self._task_names)

    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))

    # all paths in each meta batch are stacked together
    # shape: [meta_batch, max_path_length * episode_per_task, *dims]
    # per RL^2
    concatenated_path = dict(observations=observations,
                             actions=actions,
                             rewards=rewards,
                             valids=valids,
                             lengths=lengths,
                             baselines=np.zeros_like(rewards),
                             agent_infos=agent_infos,
                             env_infos=env_infos,
                             paths=concatenated_path_in_meta_batch,
                             average_return=np.mean(undiscounted_returns))

    return concatenated_path
def process_samples(self, itr, paths):
    # pylint: disable=too-many-statements
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * lengths: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])
            * average_return: (numpy.float64)

    """
    baselines = []
    returns = []
    total_steps = 0

    max_path_length = self.max_path_length

    # group paths by task so per-task performance can be logged
    paths_by_task = defaultdict(list)
    for path in paths:
        assert all(path['env_infos']['task_name'][0] == name
                   for name in path['env_infos']['task_name'])
        paths_by_task[path['env_infos']['task_name'][0]].append(path)

    avg_success = 0
    undiscounted_returns = []
    for task_name, sep_paths in paths_by_task.items():
        # collect undiscounted returns across all tasks
        task_returns, success = log_multitask_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self.env_spec, sep_paths),
            discount=self.discount,
            task_names=self.task_names)
        undiscounted_returns.extend(task_returns)
        avg_success += success
    avg_success /= len(paths_by_task)
    tabular.record('AverageSuccessRate', avg_success)

    if self.flatten_input:
        paths = [
            dict(observations=(self.env_spec.observation_space.flatten_n(
                path['observations'])),
                 actions=(self.env_spec.action_space.flatten_n(  # noqa: E126
                     path['actions'])),
                 rewards=path['rewards'],
                 env_infos=path['env_infos'],
                 agent_infos=path['agent_infos'],
                 dones=path['dones']) for path in paths
        ]
    else:
        paths = [
            dict(observations=path['observations'],
                 actions=(self.env_spec.action_space.flatten_n(  # noqa: E126
                     path['actions'])),
                 rewards=path['rewards'],
                 env_infos=path['env_infos'],
                 agent_infos=path['agent_infos'],
                 dones=path['dones']) for path in paths
        ]

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        total_steps += len(path['rewards'])
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = np_tensor_utils.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_path_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    lengths = np.asarray([v.sum() for v in valids])

    ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                 valids) / np.sum(valids)

    self.episode_reward_mean.extend(undiscounted_returns)

    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        lengths=lengths,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
        average_return=np.mean(undiscounted_returns),
    )

    return samples_data
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data.

    Note:
        The returned sample data is a dictionary with keys
            - observations: (numpy.ndarray), shape [B * (T), *obs_dims]
            - actions: (numpy.ndarray), shape [B * (T), *act_dims]
            - rewards: (numpy.ndarray), shape [B * (T), ]
            - baselines: (numpy.ndarray), shape [B * (T), ]
            - returns: (numpy.ndarray), shape [B * (T), ]
            - lengths: (numpy.ndarray), shape [P, ], the i-th entry
                represents the length of the i-th path.
            - valids: (numpy.ndarray), shape [P, ], the [i, j] entry is
                1 if the j-th sample in the i-th path is valid,
                otherwise 0.
            - agent_infos: (dict), see
                OnPolicyVectorizedSampler.obtain_samples()
            - env_infos: (dict), see
                OnPolicyVectorizedSampler.obtain_samples()
            - paths: (list[dict]), the original paths with observations
                or actions flattened
            - average_return: (numpy.float64)
        where B = batch size, (T) = variable length of each trajectory,
        and P = number of paths. Note that B * T equals the total number
        of environment steps in all trajectories.

    """
    baselines = []
    returns = []

    if self._flatten_input:
        paths = [
            dict(observations=(self._env_spec.observation_space.flatten_n(
                path['observations'])),
                 actions=(
                     self._env_spec.action_space.flatten_n(  # noqa: E126
                         path['actions'])),
                 rewards=path['rewards'],
                 env_infos=path['env_infos'],
                 agent_infos=path['agent_infos']) for path in paths
        ]
    else:
        paths = [
            dict(observations=path['observations'],
                 actions=(
                     self._env_spec.action_space.flatten_n(  # noqa: E126
                         path['actions'])),
                 rewards=path['rewards'],
                 env_infos=path['env_infos'],
                 agent_infos=path['agent_infos']) for path in paths
        ]

    if hasattr(self._baseline, 'predict_n'):
        all_path_baselines = self._baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self._baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = (path['rewards'] + self._discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = np_tensor_utils.discount_cumsum(
            deltas, self._discount * self._gae_lambda)
        path['deltas'] = deltas

        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], self._discount)
        returns.append(path['returns'])

    obs = np.concatenate([path['observations'] for path in paths])
    actions = np.concatenate([path['actions'] for path in paths])
    rewards = np.concatenate([path['rewards'] for path in paths])
    returns = np.concatenate(returns)
    baselines = np.concatenate(baselines)

    agent_infos_path = [path['agent_infos'] for path in paths]
    agent_infos = dict()
    for key in self._policy.state_info_keys:
        agent_infos[key] = np.concatenate(
            [infos[key] for infos in agent_infos_path])

    env_infos_path = [path['env_infos'] for path in paths]
    env_infos = dict()
    for key in paths[0]['env_infos'].keys():
        env_infos[key] = np.concatenate(
            [infos[key] for infos in env_infos_path])

    valids = np.asarray([np.ones_like(path['returns']) for path in paths])
    lengths = np.asarray([v.sum() for v in valids])

    average_discounted_return = np.mean(
        [path['returns'][0] for path in paths])
    undiscounted_returns = [sum(path['rewards']) for path in paths]
    self._episode_reward_mean.extend(undiscounted_returns)

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        lengths=lengths,
        valids=valids,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
        average_return=np.mean(undiscounted_returns),
    )

    tabular.record('Iteration', itr)
    tabular.record('AverageDiscountedReturn', average_discounted_return)
    tabular.record('AverageReturn', np.mean(undiscounted_returns))
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    tabular.record('NumTrajs', len(paths))
    tabular.record('StdReturn', np.std(undiscounted_returns))
    tabular.record('MaxReturn', np.max(undiscounted_returns))
    tabular.record('MinReturn', np.min(undiscounted_returns))

    return samples_data