    def paths_to_tensors(self, paths):
        # pylint: disable=too-many-statements
        """Return processed sample data based on the collected paths.

        Args:
            paths (list[dict]): A list of collected paths.

        Returns:
            dict: Processed sample data, with keys
                * observations: (numpy.ndarray)
                * tasks: (numpy.ndarray)
                * actions: (numpy.ndarray)
                * trajectories: (numpy.ndarray)
                * rewards: (numpy.ndarray)
                * baselines: (numpy.ndarray)
                * returns: (numpy.ndarray)
                * valids: (numpy.ndarray)
                * agent_infos: (dict)
                * latent_infos: (dict)
                * env_infos: (dict)
                * trajectory_infos: (dict)
                * paths: (list[dict])

        """
        max_path_length = self.max_path_length

        def _extract_latent_infos(infos):
            """Extract and pack latent infos from dict.

            Args:
                infos (dict): A dict that contains latent infos with keys
                    prefixed by 'latent_'.

            Returns:
                dict: A dict of latent infos.

            """
            latent_infos = dict()
            for k, v in infos.items():
                if k.startswith('latent_'):
                    latent_infos[k[7:]] = v
            return latent_infos

        for path in paths:
            path['actions'] = (self._env_spec.action_space.flatten_n(
                path['actions']))
            path['tasks'] = self.policy.task_space.flatten_n(
                path['env_infos']['task_onehot'])
            path['latents'] = path['agent_infos']['latent']
            path['latent_infos'] = _extract_latent_infos(path['agent_infos'])

            # - Calculate a forward-looking sliding window.
            # - If step_space has shape (n, d), then trajs will have shape
            #   (n, window, d).
            # - The length of the sliding window is determined by the
            #   trajectory inference spec. We smear the last few elements
            #   to preserve the time dimension.
            # - Only the observation is used for a single step.
            #   Alternatively, stacked [observation, action] can be used in
            #   harder tasks.
            obs = pad_tensor(path['observations'], max_path_length)
            obs_flat = self._env_spec.observation_space.flatten_n(obs)
            steps = obs_flat
            window = self._inference.spec.input_space.shape[0]
            traj = np_tensor_utils.sliding_window(steps, window, smear=True)
            traj_flat = self._inference.spec.input_space.flatten_n(traj)
            path['trajectories'] = traj_flat

            _, traj_info = self._inference.get_latents(traj_flat)
            path['trajectory_infos'] = traj_info

        all_path_baselines = [self._baseline.predict(path) for path in paths]

        tasks = [path['tasks'] for path in paths]
        tasks = pad_tensor_n(tasks, max_path_length)

        trajectories = np.stack([path['trajectories'] for path in paths])

        latents = [path['latents'] for path in paths]
        latents = pad_tensor_n(latents, max_path_length)

        latent_infos = [path['latent_infos'] for path in paths]
        latent_infos = stack_tensor_dict_list(
            [pad_tensor_dict(p, max_path_length) for p in latent_infos])

        trajectory_infos = [path['trajectory_infos'] for path in paths]
        trajectory_infos = stack_tensor_dict_list(
            [pad_tensor_dict(p, max_path_length) for p in trajectory_infos])

        samples_data = paths_to_tensors(paths, max_path_length,
                                        all_path_baselines, self._discount,
                                        self._gae_lambda)

        samples_data['tasks'] = tasks
        samples_data['latents'] = latents
        samples_data['latent_infos'] = latent_infos
        samples_data['trajectories'] = trajectories
        samples_data['trajectory_infos'] = trajectory_infos

        return samples_data
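
# A minimal, self-contained sketch of the forward-looking sliding window
# used above. Assumptions are labeled: `_sliding_window_sketch` is a
# hypothetical stand-in for `np_tensor_utils.sliding_window`, and we assume
# `smear=True` means window indices running past the end of the trace are
# clamped to the last step, as the comments above describe.

import numpy as np


def _sliding_window_sketch(steps, window):
    """Forward-looking sliding window: (n, d) steps -> (n, window, d).

    Indices past the end of the trace are clamped to the last step,
    "smearing" the final windows to preserve the time dimension.
    """
    n = steps.shape[0]
    idx = np.minimum(np.arange(n)[:, None] + np.arange(window), n - 1)
    return steps[idx]


steps = np.arange(8.).reshape(4, 2)  # n=4 steps, each of dimension d=2
trajs = _sliding_window_sketch(steps, window=3)
assert trajs.shape == (4, 3, 2)
# The final window repeats the last step three times.
assert (trajs[-1] == steps[-1]).all()
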
    def process_samples(self, itr, paths):
        baselines = []
        returns = []

        max_path_length = self.algo.max_path_length
        action_space = self.algo.env.action_space
        observation_space = self.algo.env.observation_space

        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = (path["rewards"] +
                      self.algo.discount * path_baselines[1:] -
                      path_baselines[:-1])
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["deltas"] = deltas

        # calculate trajectory tensors (TODO: probably can do this in TF)
        for idx, path in enumerate(paths):
            # baselines
            path['baselines'] = all_path_baselines[idx]
            baselines.append(path['baselines'])

            # returns
            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
            returns.append(path["returns"])

            # Calculate trajectory samples
            #
            # Pad and flatten action and observation traces
            act = tensor_utils.pad_tensor(path['actions'], max_path_length)
            obs = tensor_utils.pad_tensor(path['observations'],
                                          max_path_length)
            act_flat = action_space.flatten_n(act)
            obs_flat = observation_space.flatten_n(obs)

            # Create a time series of stacked [act, obs] vectors
            # XXX: now the inference network only looks at obs vectors
            # act_obs = np.concatenate([act_flat, obs_flat], axis=1)
            # TODO: reactivate for harder envs?
            act_obs = obs_flat
            # act_obs = act_flat

            # Calculate a forward-looking sliding window of the stacked
            # vectors.
            #
            # If act_obs has shape (n, d), then trajs will have shape
            # (n, window, d).
            #
            # The length of the sliding window is determined by the
            # trajectory inference spec. We smear the last few elements to
            # preserve the time dimension.
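            # Illustrative example (assuming smear repeats the last step):
            # if act_obs has shape (100, 8) and window is 20, then trajs
            # has shape (100, 20, 8), and the final window trajs[99]
            # contains act_obs[99] repeated 20 times.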
            window = self.algo.inference.input_space.shape[0]
            trajs = sliding_window(act_obs, window, 1, smear=True)
            trajs_flat = self.algo.inference.input_space.flatten_n(trajs)
            path['trajectories'] = trajs_flat

            # trajectory infos
            _, traj_infos = self.algo.inference.get_latents(trajs)
            path['trajectory_infos'] = traj_infos

        # Explained variance of the baseline (diagnostic; not used below)
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        # DEBUG: CPU vars ###################################################
        cpu_adv = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        cpu_deltas = tensor_utils.concat_tensor_list(
            [path["deltas"] for path in paths])
        cpu_act = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        cpu_obs = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        cpu_agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            cpu_adv = utils.center_advantages(cpu_adv)

        if self.algo.positive_adv:
            cpu_adv = utils.shift_advantages_to_positive(cpu_adv)
        #####################################################################

        # make all paths the same length
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        tasks = [path["tasks"] for path in paths]
        tasks = tensor_utils.pad_tensor_n(tasks, max_path_length)

        # Ground-truth tasks (padded here, but not part of samples_data)
        tasks_gt = [path['tasks_gt'] for path in paths]
        tasks_gt = tensor_utils.pad_tensor_n(tasks_gt, max_path_length)

        latents = [path['latents'] for path in paths]
        latents = tensor_utils.pad_tensor_n(latents, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

        trajectories = tensor_utils.stack_tensor_list(
            [path["trajectories"] for path in paths])

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        latent_infos = [path["latent_infos"] for path in paths]
        latent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in latent_infos
        ])

        trajectory_infos = [path["trajectory_infos"] for path in paths]
        trajectory_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in trajectory_infos
        ])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = np.mean(
            [path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) *
            valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            tasks=tasks,
            latents=latents,
            trajectories=trajectories,
            rewards=rewards,
            baselines=baselines,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            latent_infos=latent_infos,
            trajectory_infos=trajectory_infos,
            env_infos=env_infos,
            paths=paths,
            cpu_adv=cpu_adv,  # DEBUG
            cpu_deltas=cpu_deltas,  # DEBUG
            cpu_obs=cpu_obs,  # DEBUG
            cpu_act=cpu_act,  # DEBUG
            cpu_agent_infos=cpu_agent_infos,  # DEBUG
        )
        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
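
# A minimal, self-contained sketch of the advantage computation above
# (generalized advantage estimation). `_discount_cumsum_sketch` is a
# hypothetical stand-in for `special.discount_cumsum`; everything else is
# plain numpy. Deltas are one-step TD errors, and advantages are their
# discounted cumulative sum with factor discount * gae_lambda.

import numpy as np


def _discount_cumsum_sketch(x, discount):
    """Return y where y[t] = sum over k >= t of discount**(k - t) * x[k]."""
    y = np.zeros_like(x, dtype=float)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y


rewards = np.array([1.0, 0.0, 2.0])
baselines = np.array([0.5, 0.4, 0.3])  # predicted values V(s_t)
discount, gae_lambda = 0.99, 0.95

# Append V = 0 for the state after the final step, then form TD errors:
# delta_t = r_t + discount * V(s_{t+1}) - V(s_t)
path_baselines = np.append(baselines, 0)
deltas = (rewards +
          discount * path_baselines[1:] -
          path_baselines[:-1])
advantages = _discount_cumsum_sketch(deltas, discount * gae_lambda)
assert advantages.shape == rewards.shape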