def test_concat_tensor_dict_list(self):
    results = concat_tensor_dict_list(self.data)
    assert results['obs'].shape == (6, )
    assert results['act'].shape == (6, )
    assert results['info']['lala'].shape == (4, )
    assert results['info']['baba'].shape == (4, )

    results = concat_tensor_dict_list(self.data2)
    assert results['obs'].shape == (6, )
    assert results['act'].shape == (6, )
    assert results['info']['lala'].shape == (4, )
    assert results['info']['baba'].shape == (2, )
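# For context, a minimal sketch of the concat_tensor_dict_list semantics the
# test above relies on: recurse into nested dicts, concatenate leaf arrays
# along axis 0, and skip entries missing a key. Illustrative only -- not the
# garage.misc.tensor_utils source.
import numpy as np

def concat_tensor_dict_list_sketch(tensor_dict_list):
    keys = tensor_dict_list[0].keys()
    ret = {}
    for k in keys:
        example = tensor_dict_list[0][k]
        if isinstance(example, dict):
            # Recurse into nested dicts (e.g. the 'info' key above).
            ret[k] = concat_tensor_dict_list_sketch(
                [d[k] for d in tensor_dict_list if k in d])
        else:
            # Dicts missing a key are skipped, which is why 'baba' comes
            # out shorter than 'lala' for self.data2 in the test above.
            ret[k] = np.concatenate(
                [d[k] for d in tensor_dict_list if k in d])
    return ret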
@classmethod
def from_trajectory_list(cls, env_spec, num_skills, paths):
    lengths = np.asarray([len(p['self_rewards']) for p in paths])
    if all(
            len(path['states']) == length + 1
            for (path, length) in zip(paths, lengths)):
        last_states = np.asarray([p['states'][-1] for p in paths])
        states = np.concatenate([p['states'][:-1] for p in paths])
    else:
        # The number of observations and timesteps must match.
        states = np.concatenate([p['states'] for p in paths])
        if paths[0].get('next_states') is not None:
            last_states = np.asarray(
                [p['next_states'][-1] for p in paths])
        else:
            last_states = np.asarray([p['states'][-1] for p in paths])

    stacked_paths = tensor_utils.concat_tensor_dict_list(paths)
    return cls(env_spec=env_spec,
               num_skills=num_skills,
               skills=stacked_paths['skills'],
               # skills_onehot=np.eye(num_skills)[stacked_paths['skills']],
               states=states,
               last_states=last_states,
               actions=stacked_paths['actions'],
               env_rewards=stacked_paths['env_rewards'],
               self_rewards=stacked_paths['self_rewards'],
               terminals=stacked_paths['dones'],
               env_infos=stacked_paths['env_infos'],
               agent_infos=stacked_paths['agent_infos'],
               lengths=lengths)
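# The commented-out skills_onehot line above uses the standard NumPy one-hot
# indexing trick. A tiny self-contained illustration (hypothetical data):
import numpy as np

num_skills = 4
skills = np.array([0, 2, 1, 2])            # integer skill ids per timestep
skills_onehot = np.eye(num_skills)[skills]
# skills_onehot.shape == (4, 4); row i is all zeros except a 1 at skills[i]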
def step(self, action_n):
    results = singleton_pool.run_each(
        worker_run_step,
        [(action_n, self.scope) for _ in self._alloc_env_ids],
    )
    results = [x for x in results if x is not None]
    ids, obs, rewards, dones, env_infos = list(zip(*results))
    ids = np.concatenate(ids)
    obs = self.observation_space.unflatten_n(np.concatenate(obs))
    rewards = np.concatenate(rewards)
    dones = np.concatenate(dones)
    env_infos = tensor_utils.split_tensor_dict_list(
        tensor_utils.concat_tensor_dict_list(env_infos))
    if env_infos is None:
        env_infos = [dict() for _ in range(self.num_envs)]

    items = list(zip(ids, obs, rewards, dones, env_infos))
    items = sorted(items, key=lambda x: x[0])
    ids, obs, rewards, dones, env_infos = list(zip(*items))

    obs = list(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)

    self.ts += 1
    dones[self.ts >= self.max_path_length] = True

    reset_obs = self._run_reset(dones)
    for (i, done) in enumerate(dones):
        if done:
            obs[i] = reset_obs[i]
            self.ts[i] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
        list(env_infos))
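# split_tensor_dict_list is the rough inverse of concat_tensor_dict_list: it
# turns one dict of stacked arrays back into a per-row list of dicts. A
# minimal sketch with the semantics assumed by step() above (nested dicts
# handled recursively); not the library source. Note it yields None for an
# empty dict, which is why step() guards with `if env_infos is None`.
import numpy as np

def split_tensor_dict_list_sketch(tensor_dict):
    ret = None
    for k, vals in tensor_dict.items():
        if isinstance(vals, dict):
            vals = split_tensor_dict_list_sketch(vals)
        if ret is None:
            ret = [{k: v} for v in vals]
        else:
            for v, entry in zip(vals, ret):
                entry[k] = v
    return ret

# split_tensor_dict_list_sketch({'a': np.arange(3)})
# -> [{'a': 0}, {'a': 1}, {'a': 2}]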
def _concatenate_paths(self, paths):
    """Concatenate paths.

    The input paths are from different rollouts but the same
    task/environment. In RL^2, paths within each meta batch are all
    concatenated into a single path and fed to the policy.

    Args:
        paths (list[dict]): Input paths. All paths are from different
            rollouts, but the same task/environment.

    Returns:
        dict: Concatenated paths from the same task/environment. Shape
            of values: :math:`[max_path_length * episode_per_task, S^*]`

    """
    if self._flatten_input:
        observations = np.concatenate([
            self._env_spec.observation_space.flatten_n(
                path['observations']) for path in paths
        ])
    else:
        observations = np.concatenate(
            [path['observations'] for path in paths])
    actions = np.concatenate([
        self._env_spec.action_space.flatten_n(path['actions'])
        for path in paths
    ])
    valids = np.concatenate(
        [np.ones_like(path['rewards']) for path in paths])
    baselines = np.concatenate(
        [np.zeros_like(path['rewards']) for path in paths])

    concatenated_path = np_tensor_utils.concat_tensor_dict_list(paths)
    concatenated_path['observations'] = observations
    concatenated_path['actions'] = actions
    concatenated_path['valids'] = valids
    concatenated_path['baselines'] = baselines

    return concatenated_path
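# A toy illustration of the RL^2 concatenation above, assuming two rollouts
# of length 3 from the same task (hypothetical data):
import numpy as np

paths = [
    {'rewards': np.ones(3)},   # rollout 1, max_path_length == 3
    {'rewards': np.ones(3)},   # rollout 2
]
valids = np.concatenate([np.ones_like(p['rewards']) for p in paths])
# valids.shape == (6,) == (max_path_length * episode_per_task,)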
@classmethod
def from_trajectory_list(cls, env_spec, paths):
    """Create a TrajectoryBatch from a list of trajectories.

    Args:
        env_spec (garage.envs.EnvSpec): Specification for the
            environment from which this data was sampled.
        paths (list[dict[str, np.ndarray or dict[str, np.ndarray]]]):
            Keys:
            * observations (np.ndarray): Non-flattened array of
                observations. Typically has shape (T, S^*) (the
                unflattened state space of the current environment).
                observations[i] was used by the agent to choose
                actions[i]. observations may instead have shape
                (T + 1, S^*).
            * next_observations (np.ndarray): Non-flattened array of
                observations. Has shape (T, S^*). next_observations[i]
                was observed by the agent after taking actions[i].
                Optional. Note that to ensure all information from the
                environment was preserved, observations[i] should have
                shape (T + 1, S^*), or this key should be set. However,
                this method is lenient and will "duplicate" the last
                observation if the original last observation has been
                lost.
            * actions (np.ndarray): Non-flattened array of actions.
                Should have shape (T, S^*) (the unflattened action
                space of the current environment).
            * rewards (np.ndarray): Array of rewards of shape (T,) (1D
                array of length timesteps).
            * dones (np.ndarray): Array of termination signals of shape
                (T,) (1D array of length timesteps).
            * agent_infos (dict[str, np.ndarray]): Dictionary of
                stacked, non-flattened `agent_info` arrays.
            * env_infos (dict[str, np.ndarray]): Dictionary of stacked,
                non-flattened `env_info` arrays.

    """
    lengths = np.asarray([len(p["rewards"]) for p in paths])
    if all(
            len(path["observations"]) == length + 1
            for (path, length) in zip(paths, lengths)):
        last_observations = np.asarray(
            [p["observations"][-1] for p in paths])
        observations = np.concatenate(
            [p["observations"][:-1] for p in paths])
    else:
        # The number of observations and timesteps must match.
        observations = np.concatenate([p["observations"] for p in paths])
        if paths[0].get("next_observations") is not None:
            last_observations = np.asarray(
                [p["next_observations"][-1] for p in paths])
        else:
            last_observations = np.asarray(
                [p["observations"][-1] for p in paths])

    stacked_paths = tensor_utils.concat_tensor_dict_list(paths)
    return cls(
        env_spec=env_spec,
        observations=observations,
        last_observations=last_observations,
        actions=stacked_paths["actions"],
        rewards=stacked_paths["rewards"],
        terminals=stacked_paths["dones"],
        env_infos=stacked_paths["env_infos"],
        agent_infos=stacked_paths["agent_infos"],
        lengths=lengths,
    )
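# A toy call to from_trajectory_list, assuming a 2-D observation space, a
# 1-D action space, and a pre-built env_spec (all hypothetical here). It
# shows the (T + 1, S^*) observation layout described in the docstring.
import numpy as np

path_with_extra_obs = {
    "observations": np.zeros((5, 2)),   # T + 1 observations, T == 4
    "actions": np.zeros((4, 1)),
    "rewards": np.ones(4),
    "dones": np.array([False, False, False, True]),
    "agent_infos": {},
    "env_infos": {},
}
# batch = TrajectoryBatch.from_trajectory_list(env_spec,
#                                              [path_with_extra_obs])
# batch.observations has shape (4, 2); batch.last_observations, (1, 2).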
@classmethod
def from_list(cls, env_spec, paths):
    """Create an EpisodeBatch from a list of episodes.

    Args:
        env_spec (EnvSpec): Specification for the environment from
            which this data was sampled.
        paths (list[dict[str, np.ndarray or dict[str, np.ndarray]]]):
            Keys:
            * observations (np.ndarray): Non-flattened array of
                observations. Typically has shape (T, S^*) (the
                unflattened state space of the current environment).
                observations[i] was used by the agent to choose
                actions[i]. observations may instead have shape
                (T + 1, S^*).
            * next_observations (np.ndarray): Non-flattened array of
                observations. Has shape (T, S^*). next_observations[i]
                was observed by the agent after taking actions[i].
                Optional. Note that to ensure all information from the
                environment was preserved, observations[i] should have
                shape (T + 1, S^*), or this key should be set. However,
                this method is lenient and will "duplicate" the last
                observation if the original last observation has been
                lost.
            * actions (np.ndarray): Non-flattened array of actions.
                Should have shape (T, S^*) (the unflattened action
                space of the current environment).
            * rewards (np.ndarray): Array of rewards of shape (T,) (1D
                array of length timesteps).
            * agent_infos (dict[str, np.ndarray]): Dictionary of
                stacked, non-flattened `agent_info` arrays.
            * env_infos (dict[str, np.ndarray]): Dictionary of stacked,
                non-flattened `env_info` arrays.
            * step_types (numpy.ndarray): A numpy array of `StepType`
                with shape (T,) containing the time step types for all
                transitions in this batch.

    """
    lengths = np.asarray([len(p['rewards']) for p in paths])
    if all(
            len(path['observations']) == length + 1
            for (path, length) in zip(paths, lengths)):
        last_observations = np.asarray(
            [p['observations'][-1] for p in paths])
        observations = np.concatenate(
            [p['observations'][:-1] for p in paths])
    else:
        # The number of observations and timesteps must match.
        observations = np.concatenate([p['observations'] for p in paths])
        if paths[0].get('next_observations') is not None:
            last_observations = np.asarray(
                [p['next_observations'][-1] for p in paths])
        else:
            last_observations = np.asarray(
                [p['observations'][-1] for p in paths])

    stacked_paths = tensor_utils.concat_tensor_dict_list(paths)

    # Temporary solution. This logic is not needed if algorithms
    # process step_types instead of dones directly.
    if 'dones' in stacked_paths and 'step_types' not in stacked_paths:
        step_types = np.array([
            StepType.TERMINAL if done else StepType.MID
            for done in stacked_paths['dones']
        ],
                              dtype=StepType)
        stacked_paths['step_types'] = step_types
        del stacked_paths['dones']

    return cls(env_spec=env_spec,
               observations=observations,
               last_observations=last_observations,
               actions=stacked_paths['actions'],
               rewards=stacked_paths['rewards'],
               env_infos=stacked_paths['env_infos'],
               agent_infos=stacked_paths['agent_infos'],
               step_types=stacked_paths['step_types'],
               lengths=lengths)
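# Stand-alone illustration of the dones -> step_types conversion above,
# assuming garage's StepType enum (MID for intermediate steps, TERMINAL for
# the final one); the data is hypothetical.
import numpy as np
from garage import StepType

dones = np.array([False, False, True])
step_types = np.array(
    [StepType.TERMINAL if done else StepType.MID for done in dones],
    dtype=StepType)
# step_types == [StepType.MID, StepType.MID, StepType.TERMINAL]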
def process_samples(self, itr, paths):
    baselines = []
    returns = []

    if hasattr(self.algo.baseline, 'predict_n'):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        # Bootstrap with a terminal value of 0, then compute GAE
        # advantages and discounted returns.
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path['rewards'] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path['advantages'] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path['returns'] = special.discount_cumsum(path['rewards'],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path['returns'])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path['observations'] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path['actions'] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path['rewards'] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path['returns'] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path['advantages'] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path['env_infos'] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path['agent_infos'] for path in paths])

        if self.algo.center_adv:
            advantages = utils.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = utils.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path['returns'][0] for path in paths])
        undiscounted_returns = [sum(path['rewards']) for path in paths]

        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path['advantages']) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path['advantages'] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path['advantages'] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path['advantages'] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path['returns'] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])

        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path['returns'][0] for path in paths])
        undiscounted_returns = [sum(path['rewards']) for path in paths]

        # Masked mean entropy: padding timesteps are excluded via valids.
        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) *
            valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log('fitting baseline...')
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log('fitted')

    tabular.record('Iteration', itr)
    tabular.record('AverageDiscountedReturn', average_discounted_return)
    tabular.record('AverageReturn', np.mean(undiscounted_returns))
    tabular.record('ExplainedVariance', ev)
    tabular.record('NumTrajs', len(paths))
    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('StdReturn', np.std(undiscounted_returns))
    tabular.record('MaxReturn', np.max(undiscounted_returns))
    tabular.record('MinReturn', np.min(undiscounted_returns))

    return samples_data
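# For reference, discount_cumsum computes y[t] = sum_k discount**k * x[t+k].
# A minimal NumPy sketch of that recurrence (the rllab/garage version
# implements the same thing with scipy.signal.lfilter); illustrative only.
import numpy as np

def discount_cumsum_sketch(x, discount):
    out = np.zeros_like(x, dtype=float)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running   # y[t] = x[t] + d * y[t+1]
        out[t] = running
    return out

# discount_cumsum_sketch(np.array([1., 1., 1.]), 0.9) -> [2.71, 1.9, 1.]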
def process_samples_discount(self, itr, paths):
    baselines = []
    returns = []

    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        # Equivalent explicit-loop version, kept for reference:
        # advantages = []
        # path_returns = []
        # path_baselines = all_path_baselines[idx]
        # return_so_far = 0
        # for t in range(len(path["rewards"]) - 1, -1, -1):
        #     return_so_far = (path["rewards"][t] +
        #                      self.algo.discount * return_so_far)
        #     path_returns.append(return_so_far)
        #     advantage = return_so_far - path_baselines[t]
        #     advantages.append(advantage)
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        advantages = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)

        # correction: weight the advantage at time t by discount**t
        discount_array = self.algo.discount**np.arange(
            len(path["rewards"]))
        path['advantages'] = advantages * discount_array
        # path_returns = special.discount_cumsum(path["rewards"],
        #                                        self.algo.discount)
        # path['returns'] = path_returns * discount_array
        path['returns'] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    observations = tensor_utils.concat_tensor_list(
        [path["observations"] for path in paths])
    actions = tensor_utils.concat_tensor_list(
        [path["actions"] for path in paths])
    rewards = tensor_utils.concat_tensor_list(
        [path["rewards"] for path in paths])
    returns = tensor_utils.concat_tensor_list(
        [path["returns"] for path in paths])
    advantages = tensor_utils.concat_tensor_list(
        [path["advantages"] for path in paths])
    env_infos = tensor_utils.concat_tensor_dict_list(
        [path["env_infos"] for path in paths])
    agent_infos = tensor_utils.concat_tensor_dict_list(
        [path["agent_infos"] for path in paths])

    if self.algo.center_adv:
        advantages = utils.center_advantages(advantages)
    if self.algo.positive_adv:
        advantages = utils.shift_advantages_to_positive(advantages)

    average_discounted_return = \
        np.mean([path["returns"][0] for path in paths])
    undiscounted_returns = [sum(path["rewards"]) for path in paths]

    ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

    samples_data = dict(
        observations=observations,
        actions=actions,
        rewards=rewards,
        returns=returns,
        advantages=advantages,
        env_infos=env_infos,
        agent_infos=agent_infos,
        paths=paths,
    )

    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
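# Tiny numeric illustration of the correction above: each timestep's
# advantage is scaled by discount**t, so later steps contribute less.
# Values are hypothetical.
import numpy as np

discount = 0.9
advantages = np.array([1.0, 1.0, 1.0])
discount_array = discount**np.arange(len(advantages))   # [1.0, 0.9, 0.81]
corrected = advantages * discount_array                 # [1.0, 0.9, 0.81]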