# NOTE: import paths for the metarl-internal modules below are assumed to
# mirror the upstream garage package layout; adjust them to the actual
# location of these utilities in your tree.
import copy
import itertools
import pickle
import time
import warnings

import click
import cloudpickle
from dowel import logger, tabular
import numpy as np

from metarl.experiment import deterministic
from metarl.misc import tensor_utils
from metarl.misc.prog_bar_counter import ProgBarCounter
from metarl.sampler.base import BaseSampler
from metarl.sampler.batch_sampler import BatchSampler
from metarl.sampler.stateful_pool import singleton_pool
from metarl.sampler.utils import truncate_paths
from metarl.sampler.vec_env_executor import VecEnvExecutor


class RL2Sampler(BaseSampler):
    """Sampler which uses VecEnvExecutor to run multiple environments.

    This sampler is for RL^2. See https://arxiv.org/pdf/1611.02779.pdf.

    In RL^2, there are n environments/tasks and paths in each of them
    will be concatenated at some point and fed to the policy.

    This sampler uses an OrderedDict, instead of a List, to keep track
    of the paths for each environment/task.

    Args:
        algo (metarl.np.algos.RLAlgorithm): An algorithm instance.
        env (metarl.envs.MetaRLEnv): Environment to sample from.
        meta_batch_size (int): Meta batch size for sampling. If it is
            larger than n_envs, it must be a multiple of n_envs so it
            can be evenly distributed among environments.
        n_envs (int): Number of environment instances for sampling. If
            it is larger than meta_batch_size, it must be a multiple of
            meta_batch_size so the batch can be evenly distributed among
            environments.

    Raises:
        ValueError: If meta_batch_size > n_envs and meta_batch_size is not
            a multiple of n_envs, or if n_envs > meta_batch_size and
            n_envs is not a multiple of meta_batch_size.

    """

    def __init__(self, algo, env, meta_batch_size, n_envs=None):
        super().__init__(algo, env)
        if n_envs is None:
            n_envs = singleton_pool.n_parallel * 4

        self._n_envs = n_envs
        self._meta_batch_size = meta_batch_size
        self._vec_env = None
        self._envs_per_worker = None
        self._vec_envs_indices = None

        if self._meta_batch_size > self._n_envs:
            if self._meta_batch_size % self._n_envs != 0:
                raise ValueError(
                    'meta_batch_size must be a multiple of n_envs')
            self._envs_per_worker = 1
            self._vec_envs_indices = np.split(
                np.arange(self._meta_batch_size), self._n_envs)
        if self._n_envs >= self._meta_batch_size:
            if self._n_envs % self._meta_batch_size != 0:
                raise ValueError(
                    'n_envs must be a multiple of meta_batch_size')
            self._envs_per_worker = self._n_envs // self._meta_batch_size
            self._vec_envs_indices = [np.arange(self._meta_batch_size)]

    def start_worker(self):
        """This function is deprecated."""

    def shutdown_worker(self):
        """Shutdown workers."""
        self._vec_env.close()

    def _setup_worker(self, env_indices, tasks):
        """Setup workers.

        Args:
            env_indices (List[Int]): Indices of environments to be assigned
                to workers for sampling.
            tasks (List[dict]): List of tasks to assign.

        """
        if self._vec_env is not None:
            self._vec_env.close()

        vec_envs = []
        for env_ind in env_indices:
            for _ in range(self._envs_per_worker):
                vec_env = copy.deepcopy(self.env)
                vec_env.set_task(tasks[env_ind])
                vec_envs.append(vec_env)
        seed0 = deterministic.get_seed()
        if seed0 is not None:
            for (i, e) in enumerate(vec_envs):
                e.seed(seed0 + i)

        self._vec_env = VecEnvExecutor(
            envs=vec_envs, max_path_length=self.algo.max_path_length)

    # pylint: disable=too-many-statements
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Sample the policy for new trajectories.

        If batch size is not specified, episodes per task is 1 by default,
        so the batch size will be meta_batch_size * max_path_length.

        When the number of workers is less than the meta batch size,
        sampling is performed for each entry of self._vec_envs_indices in
        series. The i-th value of self._vec_envs_indices represents the
        indices of the environments/tasks to be sampled in the i-th
        iteration.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of samples to be collected. If None,
                it defaults to algo.max_path_length * meta_batch_size.
            whole_paths (bool): Whether to return all the paths or not.
                True by default. It's possible for the paths to have a
                total sample size larger than batch_size; they will be
                truncated if this flag is False.

        Returns:
            OrderedDict: Sample paths. The key represents the index of the
                environment/task and the value represents all the paths
                sampled from that particular environment/task.

        Note:
            Each path is a dictionary, with keys and values as following:
                * observations: numpy.ndarray with shape :math:`[N, S^*]`
                * actions: numpy.ndarray with shape :math:`[N, S^*]`
                * rewards: numpy.ndarray with shape :math:`[N, S^*]`
                * dones: numpy.ndarray with shape :math:`[N, S^*]`
                * env_infos: A dictionary with each key representing one
                  environment info, value being a numpy.ndarray with shape
                  :math:`[N, S^*]`. One example is "ale.lives" for atari
                  environments.
                * agent_infos: A dictionary with each key representing one
                  agent info, value being a numpy.ndarray with shape
                  :math:`[N, S^*]`. One example is "prev_action", which is
                  used for recurrent policy as previous action input,
                  merged with the observation input as the state input.

        """
        logger.log('Obtaining samples for iteration %d...' % itr)

        if batch_size is None:
            batch_size = self.algo.max_path_length * self._meta_batch_size

        paths = []
        tasks = self.env.sample_tasks(self._meta_batch_size)

        # Start main loop
        batch_size_per_loop = batch_size // len(self._vec_envs_indices)
        for vec_envs_indices in self._vec_envs_indices:
            self._setup_worker(vec_envs_indices, tasks)

            n_samples = 0
            obses = self._vec_env.reset()
            dones = np.asarray([True] * self._vec_env.num_envs)
            running_paths = [None] * self._vec_env.num_envs

            pbar = ProgBarCounter(batch_size)
            policy_time = 0
            env_time = 0
            process_time = 0

            policy = self.algo.policy
            # Only reset policies at the beginning of a meta batch
            policy.reset(dones)

            while n_samples < batch_size_per_loop:
                t = time.time()

                actions, agent_infos = policy.get_actions(obses)

                policy_time += time.time() - t
                t = time.time()
                next_obses, rewards, dones, env_infos = self._vec_env.step(
                    actions)
                env_time += time.time() - t
                t = time.time()

                agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
                env_infos = tensor_utils.split_tensor_dict_list(env_infos)
                if env_infos is None:
                    env_infos = [
                        dict() for _ in range(self._vec_env.num_envs)
                    ]
                if agent_infos is None:
                    agent_infos = [
                        dict() for _ in range(self._vec_env.num_envs)
                    ]
                for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                        itertools.count(), obses, actions, rewards, env_infos,
                        agent_infos, dones):
                    if running_paths[idx] is None:
                        running_paths[idx] = dict(
                            observations=[],
                            actions=[],
                            rewards=[],
                            dones=[],
                            env_infos=[],
                            agent_infos=[],
                        )
                    running_paths[idx]['observations'].append(observation)
                    running_paths[idx]['actions'].append(action)
                    running_paths[idx]['rewards'].append(reward)
                    running_paths[idx]['dones'].append(done)
                    running_paths[idx]['env_infos'].append(env_info)
                    running_paths[idx]['agent_infos'].append(agent_info)
                    if done:
                        obs = np.asarray(running_paths[idx]['observations'])
                        actions = np.asarray(running_paths[idx]['actions'])
                        paths.append(
                            dict(observations=obs,
                                 actions=actions,
                                 rewards=np.asarray(
                                     running_paths[idx]['rewards']),
                                 dones=np.asarray(
                                     running_paths[idx]['dones']),
                                 env_infos=tensor_utils.stack_tensor_dict_list(
                                     running_paths[idx]['env_infos']),
                                 agent_infos=tensor_utils.
                                 stack_tensor_dict_list(
                                     running_paths[idx]['agent_infos']),
                                 batch_idx=idx))
                        n_samples += len(running_paths[idx]['rewards'])
                        running_paths[idx] = None

                process_time += time.time() - t
                pbar.inc(len(obses))
                obses = next_obses
            pbar.stop()

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        return paths if whole_paths else truncate_paths(paths, batch_size)
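
# Illustrative usage sketch for RL2Sampler (not part of the original API).
# It assumes `algo` exposes `max_path_length` and `policy`, and that `env`
# is a task-sampling meta-RL environment providing `sample_tasks()` and
# `set_task()`, as RL2Sampler requires. The helper name, its arguments and
# the `train_once` hook are hypothetical.
def _example_rl2_sampling_loop(algo, env, meta_batch_size, n_itr=10):
    """Sketch of how RL2Sampler might be driven by a training loop."""
    sampler = RL2Sampler(algo, env, meta_batch_size=meta_batch_size)
    try:
        for itr in range(n_itr):
            # One call collects meta_batch_size * max_path_length samples by
            # default; each returned path carries a `batch_idx` identifying
            # the environment/task it was sampled from.
            paths = sampler.obtain_samples(itr)
            algo.train_once(itr, paths)  # hypothetical training hook
    finally:
        sampler.shutdown_worker()
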
class OnPolicyVectorizedSampler(BatchSampler):
    """BatchSampler which uses VecEnvExecutor to run multiple environments.

    Args:
        algo (metarl.np.algos.RLAlgorithm): An algorithm instance.
        env (metarl.envs.MetaRLEnv): An environment instance.
        n_envs (int): Number of environment instances to set up. This
            parameter affects sampling performance.

    """

    def __init__(self, algo, env, n_envs=None):
        if n_envs is None:
            n_envs = singleton_pool.n_parallel * 4
        super().__init__(algo, env)
        self._n_envs = n_envs

        self._vec_env = None
        self._env_spec = self.env.spec

        warnings.warn(
            DeprecationWarning(
                'OnPolicyVectorizedSampler is deprecated, and will be '
                'removed in the next release. Please use VecWorker and one '
                'of the new samplers which implement metarl.sampler.Sampler, '
                'such as RaySampler.'))

    def start_worker(self):
        """Start workers."""
        n_envs = self._n_envs
        envs = [
            cloudpickle.loads(cloudpickle.dumps(self.env))
            for _ in range(n_envs)
        ]

        # Deterministically set environment seeds based on the global seed.
        seed0 = deterministic.get_seed()
        if seed0 is not None:
            for (i, e) in enumerate(envs):
                e.seed(seed0 + i)

        self._vec_env = VecEnvExecutor(
            envs=envs, max_path_length=self.algo.max_path_length)

    def shutdown_worker(self):
        """Shutdown workers."""
        self._vec_env.close()

    # pylint: disable=too-many-statements
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Sample the policy for new trajectories.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of samples to be collected. If None,
                it defaults to algo.max_path_length * n_envs.
            whole_paths (bool): Whether to return all the paths or not.
                True by default. It's possible for the paths to have a
                total sample size larger than batch_size; they will be
                truncated if this flag is False.

        Returns:
            list[dict]: Sample paths.

        Note:
            Each path is a dictionary, with keys and values as following:
                * observations: numpy.ndarray with shape [Batch, *obs_dims]
                * actions: numpy.ndarray with shape [Batch, *act_dims]
                * rewards: numpy.ndarray with shape [Batch, ]
                * env_infos: A dictionary with each key representing one
                  environment info, value being a numpy.ndarray with shape
                  [Batch, ?]. One example is "ale.lives" for atari
                  environments.
                * agent_infos: A dictionary with each key representing one
                  agent info, value being a numpy.ndarray with shape
                  [Batch, ?]. One example is "prev_action", which is used
                  for recurrent policy as previous action input, merged
                  with the observation input as the state input.
                * dones: numpy.ndarray with shape [Batch, ]

        """
        logger.log('Obtaining samples for iteration %d...' % itr)

        if not batch_size:
            batch_size = self.algo.max_path_length * self._n_envs

        paths = []
        n_samples = 0
        obses = self._vec_env.reset()
        dones = np.asarray([True] * self._vec_env.num_envs)
        running_paths = [None] * self._vec_env.num_envs

        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        with click.progressbar(length=batch_size, label='Sampling') as pbar:
            while n_samples < batch_size:
                t = time.time()
                policy.reset(dones)

                actions, agent_infos = policy.get_actions(obses)

                policy_time += time.time() - t
                t = time.time()
                next_obses, rewards, dones, env_infos = \
                    self._vec_env.step(actions)
                env_time += time.time() - t
                t = time.time()

                agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
                env_infos = tensor_utils.split_tensor_dict_list(env_infos)
                if env_infos is None:
                    env_infos = [
                        dict() for _ in range(self._vec_env.num_envs)
                    ]
                if agent_infos is None:
                    agent_infos = [
                        dict() for _ in range(self._vec_env.num_envs)
                    ]
                for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                        itertools.count(), obses, actions, rewards, env_infos,
                        agent_infos, dones):
                    if running_paths[idx] is None:
                        running_paths[idx] = dict(observations=[],
                                                  actions=[],
                                                  rewards=[],
                                                  env_infos=[],
                                                  agent_infos=[],
                                                  dones=[])
                    running_paths[idx]['observations'].append(observation)
                    running_paths[idx]['actions'].append(action)
                    running_paths[idx]['rewards'].append(reward)
                    running_paths[idx]['env_infos'].append(env_info)
                    running_paths[idx]['agent_infos'].append(agent_info)
                    running_paths[idx]['dones'].append(done)

                    if done:
                        obs = np.asarray(running_paths[idx]['observations'])
                        actions = np.asarray(running_paths[idx]['actions'])
                        paths.append(
                            dict(observations=obs,
                                 actions=actions,
                                 rewards=np.asarray(
                                     running_paths[idx]['rewards']),
                                 env_infos=tensor_utils.stack_tensor_dict_list(
                                     running_paths[idx]['env_infos']),
                                 agent_infos=tensor_utils.
                                 stack_tensor_dict_list(
                                     running_paths[idx]['agent_infos']),
                                 dones=np.asarray(
                                     running_paths[idx]['dones'])))
                        n_samples += len(running_paths[idx]['rewards'])
                        running_paths[idx] = None
                process_time += time.time() - t
                pbar.update(len(obses))
                obses = next_obses

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        return paths if whole_paths else truncate_paths(paths, batch_size)
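
# Illustrative usage sketch for OnPolicyVectorizedSampler (not part of the
# original API). `algo` is assumed to expose `max_path_length` and `policy`;
# the helper name is hypothetical. Workers must be started before sampling
# and shut down afterwards, and passing whole_paths=False truncates the
# returned paths to at most `batch_size` total samples.
def _example_on_policy_sampling(algo, env, n_itr=10):
    """Sketch of the start_worker / obtain_samples / shutdown_worker cycle."""
    sampler = OnPolicyVectorizedSampler(algo, env)
    sampler.start_worker()
    try:
        for itr in range(n_itr):
            # `paths` is a list of dicts with observations, actions,
            # rewards, dones, env_infos and agent_infos per trajectory.
            paths = sampler.obtain_samples(itr, whole_paths=False)
            algo.train_once(itr, paths)  # hypothetical training hook
    finally:
        sampler.shutdown_worker()
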
class OffPolicyVectorizedSampler(BatchSampler):
    """This class implements OffPolicyVectorizedSampler.

    Args:
        algo (metarl.np.RLAlgorithm): Algorithm.
        env (metarl.envs.MetaRLEnv): Environment.
        n_envs (int): Number of parallel environments managed by sampler.
        no_reset (bool): Reset environment between samples or not.

    """

    def __init__(self, algo, env, n_envs=None, no_reset=True):
        if n_envs is None:
            n_envs = int(algo.rollout_batch_size)
        super().__init__(algo, env)
        self._n_envs = n_envs
        self._no_reset = no_reset

        self._vec_env = None
        self._env_spec = self.env.spec
        self._last_obses = None
        self._last_uncounted_discount = [0] * n_envs
        self._last_running_length = [0] * n_envs
        self._last_success_count = [0] * n_envs

        warnings.warn(
            DeprecationWarning(
                'OffPolicyVectorizedSampler is deprecated, and will be '
                'removed in the next release. Please use VecWorker and one '
                'of the new samplers which implement metarl.sampler.Sampler, '
                'such as RaySampler.'))

    def start_worker(self):
        """Initialize the sampler."""
        n_envs = self._n_envs
        envs = [
            cloudpickle.loads(cloudpickle.dumps(self.env))
            for _ in range(n_envs)
        ]

        # Deterministically set environment seeds based on the global seed.
        seed0 = deterministic.get_seed()
        if seed0 is not None:
            for (i, e) in enumerate(envs):
                e.seed(seed0 + i)

        self._vec_env = VecEnvExecutor(
            envs=envs, max_path_length=self.algo.max_path_length)

    def shutdown_worker(self):
        """Terminate workers if necessary."""
        self._vec_env.close()

    # pylint: disable=too-many-branches, too-many-statements
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Collect samples for the given iteration number.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of environment interactions in one
                batch.
            whole_paths (bool): Not effective. Only kept here to comply
                with the base class.

        Raises:
            ValueError: If the algorithm doesn't have an exploration_policy
                field.

        Returns:
            list: A list of paths.

        """
        assert batch_size is not None

        paths = []
        if not self._no_reset or self._last_obses is None:
            obses = self._vec_env.reset()
        else:
            obses = self._last_obses
        completes = np.asarray([True] * self._vec_env.num_envs)
        running_paths = [None] * self._vec_env.num_envs
        n_samples = 0

        policy = self.algo.exploration_policy
        if policy is None:
            raise ValueError('OffPolicyVectorizedSampler should only be used '
                             'with an exploration_policy.')
        while n_samples < batch_size:
            policy.reset(completes)

            obs_space = self.algo.env_spec.observation_space
            input_obses = obs_space.flatten_n(obses)

            actions, agent_infos = policy.get_actions(input_obses)

            next_obses, rewards, dones, env_infos = \
                self._vec_env.step(actions)
            completes = env_infos['vec_env_executor.complete']
            self._last_obses = next_obses
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            n_samples += len(next_obses)

            if agent_infos is None:
                agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
            if env_infos is None:
                env_infos = [dict() for _ in range(self._vec_env.num_envs)]

            for idx, reward, env_info, done, obs, next_obs, action in zip(
                    itertools.count(), rewards, env_infos, dones, obses,
                    next_obses, actions):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        observations=[],
                        next_observations=[],
                        actions=[],
                        env_infos=[],
                        dones=[],
                        undiscounted_return=self._last_uncounted_discount[idx],
                        # running_length: Length of path up to now
                        # Note that running_length is not len(rewards)
                        # because a path may not be complete in one batch
                        running_length=self._last_running_length[idx],
                        success_count=self._last_success_count[idx])

                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['observations'].append(obs)
                running_paths[idx]['next_observations'].append(next_obs)
                running_paths[idx]['actions'].append(action)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['dones'].append(done)
                running_paths[idx]['running_length'] += 1
                running_paths[idx]['undiscounted_return'] += reward
                running_paths[idx]['success_count'] += env_info.get(
                    'is_success') or 0

                self._last_uncounted_discount[idx] += reward
                self._last_success_count[idx] += env_info.get(
                    'is_success') or 0
                self._last_running_length[idx] += 1

                if done or n_samples >= batch_size:
                    paths.append(
                        dict(
                            rewards=np.asarray(running_paths[idx]['rewards']),
                            dones=np.asarray(running_paths[idx]['dones']),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]['env_infos']),
                            running_length=running_paths[idx]
                            ['running_length'],
                            undiscounted_return=running_paths[idx]
                            ['undiscounted_return'],
                            success_count=running_paths[idx]
                            ['success_count']))

                    act_space = self._env_spec.action_space
                    path_dict = {}

                    path_dict['observations'] = obs_space.flatten_n(
                        running_paths[idx]['observations'])
                    path_dict['next_observations'] = obs_space.flatten_n(
                        running_paths[idx]['next_observations'])
                    path_dict['rewards'] = np.asarray(
                        running_paths[idx]['rewards']).reshape(-1, 1)
                    path_dict['terminals'] = np.asarray(
                        running_paths[idx]['dones']).reshape(-1, 1)
                    path_dict['actions'] = act_space.flatten_n(
                        running_paths[idx]['actions'])

                    self.algo.replay_buffer.add_path(path_dict)
                    running_paths[idx] = None

                    if done:
                        self._last_running_length[idx] = 0
                        self._last_success_count[idx] = 0
                        self._last_uncounted_discount[idx] = 0

            obses = next_obses
        return paths
class OffPolicyVectorizedSampler(BatchSampler):
    """This class implements OffPolicyVectorizedSampler.

    Args:
        algo (metarl.np.RLAlgorithm): Algorithm.
        env (metarl.envs.MetaRLEnv): Environment.
        n_envs (int): Number of parallel environments managed by sampler.
        no_reset (bool): Reset environment between samples or not.

    """

    def __init__(self, algo, env, n_envs=1, no_reset=False):
        if n_envs is None:
            n_envs = int(algo.rollout_batch_size)
        super().__init__(algo, env)
        self._n_envs = n_envs
        self._no_reset = no_reset

        self._vec_env = None
        self._env_spec = self.env.spec
        self._last_obses = None
        self._last_uncounted_discount = [0] * n_envs
        self._last_running_length = [0] * n_envs
        self._last_success_count = [0] * n_envs

    def start_worker(self):
        """Initialize the sampler."""
        n_envs = self._n_envs
        envs = [pickle.loads(pickle.dumps(self.env)) for _ in range(n_envs)]

        # Deterministically set environment seeds based on the global seed.
        seed0 = deterministic.get_seed()
        if seed0 is not None:
            for (i, e) in enumerate(envs):
                e.seed(seed0 + i)

        self._vec_env = VecEnvExecutor(
            envs=envs, max_path_length=self.algo.max_path_length)

    def shutdown_worker(self):
        """Terminate workers if necessary."""
        self._vec_env.close()

    # pylint: disable=too-many-branches, too-many-statements
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        """Collect samples for the given iteration number.

        Args:
            itr (int): Iteration number.
            batch_size (int): Number of environment interactions in one
                batch.
            whole_paths (bool): Not effective. Only kept here to comply
                with the base class.

        Returns:
            list: A list of paths.

        """
        assert batch_size is not None

        paths = []
        if not self._no_reset or self._last_obses is None:
            obses = self._vec_env.reset()
        else:
            obses = self._last_obses
        dones = np.asarray([True] * self._vec_env.num_envs)
        running_paths = [None] * self._vec_env.num_envs
        n_samples = 0

        policy = self.algo.policy
        if self.algo.es:
            self.algo.es.reset()
        while n_samples < batch_size:
            policy.reset(dones)

            if self.algo.input_include_goal:
                obs = [obs['observation'] for obs in obses]
                d_g = [obs['desired_goal'] for obs in obses]
                a_g = [obs['achieved_goal'] for obs in obses]
                input_obses = np.concatenate((obs, d_g), axis=-1)
            else:
                input_obses = obses
            obs_normalized = tensor_utils.normalize_pixel_batch(
                self._env_spec, input_obses)

            if self.algo.es:
                actions, agent_infos = self.algo.es.get_actions(
                    itr, obs_normalized, self.algo.policy)
            else:
                actions, agent_infos = self.algo.policy.get_actions(
                    obs_normalized)

            next_obses, rewards, dones, env_infos = self._vec_env.step(
                actions)
            self._last_obses = next_obses
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            n_samples += len(next_obses)

            if agent_infos is None:
                agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
            if env_infos is None:
                env_infos = [dict() for _ in range(self._vec_env.num_envs)]

            if self.algo.input_include_goal:
                self.algo.replay_buffer.add_transitions(
                    observation=obs,
                    action=actions,
                    goal=d_g,
                    achieved_goal=a_g,
                    terminal=dones,
                    next_observation=[
                        next_obs['observation'] for next_obs in next_obses
                    ],
                    next_achieved_goal=[
                        next_obs['achieved_goal'] for next_obs in next_obses
                    ],
                )
            else:
                self.algo.replay_buffer.add_transitions(
                    observation=obses,
                    action=actions,
                    reward=rewards,
                    terminal=dones,
                    next_observation=next_obses,
                )

            for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                                   env_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        env_infos=[],
                        dones=[],
                        undiscounted_return=self._last_uncounted_discount[idx],
                        # running_length: Length of path up to now
                        # Note that running_length is not len(rewards)
                        # because a path may not be complete in one batch
                        running_length=self._last_running_length[idx],
                        success_count=self._last_success_count[idx])

                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['dones'].append(done)
                running_paths[idx]['running_length'] += 1
                running_paths[idx]['undiscounted_return'] += reward
                running_paths[idx]['success_count'] += env_info.get(
                    'is_success') or 0

                self._last_uncounted_discount[idx] += reward
                self._last_success_count[idx] += env_info.get(
                    'is_success') or 0
                self._last_running_length[idx] += 1

                if done or n_samples >= batch_size:
                    paths.append(
                        dict(
                            rewards=np.asarray(running_paths[idx]['rewards']),
                            dones=np.asarray(running_paths[idx]['dones']),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]['env_infos']),
                            running_length=running_paths[idx]
                            ['running_length'],
                            undiscounted_return=running_paths[idx]
                            ['undiscounted_return'],
                            success_count=running_paths[idx]
                            ['success_count']))
                    running_paths[idx] = None

                    if done:
                        self._last_running_length[idx] = 0
                        self._last_success_count[idx] = 0
                        self._last_uncounted_discount[idx] = 0
                        if self.algo.es:
                            self.algo.es.reset()
            obses = next_obses

        return paths
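
# Illustrative sketch of the `no_reset` behavior (not part of the original
# API): with no_reset=True the sampler keeps `_last_obses` between calls, so
# consecutive obtain_samples() calls continue the same rollouts instead of
# resetting every environment. The helper name is hypothetical.
def _example_continued_rollouts(algo, env, steps_per_call=500):
    """Sketch: two sampling calls that share rollout state via no_reset."""
    sampler = OffPolicyVectorizedSampler(algo, env, n_envs=4, no_reset=True)
    sampler.start_worker()
    try:
        first = sampler.obtain_samples(0, batch_size=steps_per_call)
        # The second call resumes from the observations left by the first
        # call rather than resetting the vectorized environments again.
        second = sampler.obtain_samples(1, batch_size=steps_per_call)
    finally:
        sampler.shutdown_worker()
    return first, second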