def __init__(self, env_fns, start_method=None):
    self.waiting = False
    self.closed = False
    n_envs = len(env_fns)

    if start_method is None:
        # Fork is not a thread safe method (see issue #217)
        # but is more user friendly (does not require to wrap the code in
        # a `if __name__ == "__main__":`)
        fork_available = 'fork' in multiprocessing.get_all_start_methods()
        start_method = 'fork' if fork_available else 'spawn'
    ctx = multiprocessing.get_context(start_method)

    self.remotes, self.work_remotes = zip(*[ctx.Pipe() for _ in range(n_envs)])
    self.processes = []
    for work_remote, remote, env_fn in zip(self.work_remotes, self.remotes, env_fns):
        args = (work_remote, remote, CloudpickleWrapper(env_fn))
        # daemon=True: if the main process crashes, we should not cause things to hang
        process = ctx.Process(target=_worker, args=args, daemon=True)
        process.start()
        self.processes.append(process)
        work_remote.close()

    self.remotes[0].send(('get_spaces', None))
    observation_space, action_space = self.remotes[0].recv()
    VecEnv.__init__(self, len(env_fns), observation_space, action_space)
def __init__(self, env_fns, start_method=None):
    self.waiting = False
    self.closed = False
    n_envs = len(env_fns)

    if start_method is None:
        # Use a thread safe start method, see issue #217.
        # 'forkserver' is faster than 'spawn' but not always available.
        forkserver_available = 'forkserver' in multiprocessing.get_all_start_methods()
        start_method = 'forkserver' if forkserver_available else 'spawn'
    ctx = multiprocessing.get_context(start_method)

    self.remotes, self.work_remotes = zip(*[ctx.Pipe() for _ in range(n_envs)])
    self.processes = []
    for work_remote, remote, env_fn in zip(self.work_remotes, self.remotes, env_fns):
        args = (work_remote, remote, CloudpickleWrapper(env_fn))
        # daemon=True: if the main process crashes, we should not cause things to hang
        process = ctx.Process(target=_worker, args=args, daemon=True)
        process.start()
        self.processes.append(process)
        work_remote.close()

    self.remotes[0].send(('get_spaces', None))
    observation_space, action_space = self.remotes[0].recv()
    VecEnv.__init__(self, len(env_fns), observation_space, action_space)
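A minimal usage sketch for the constructor above, assuming it belongs to a stable-baselines-style SubprocVecEnv class whose remaining methods (step, reset, close) are implemented as usual; the environment id, seeding, and class name are illustrative assumptions, not part of the snippet.

import gym

def make_env(env_id, seed):
    def _init():
        env = gym.make(env_id)
        env.seed(seed)
        return env
    return _init

if __name__ == "__main__":
    # The __main__ guard is required when 'forkserver' or 'spawn' is selected.
    venv = SubprocVecEnv([make_env("CartPole-v1", seed=i) for i in range(4)])
    obs = venv.reset()
    actions = [venv.action_space.sample() for _ in range(venv.num_envs)]
    obs, rewards, dones, infos = venv.step(actions)
    venv.close()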
def __init__(self, env_id, n_agents):
    env_path = UnityVecEnv.GetFilePath(env_id, n_agents=n_agents)
    print("**** ", env_path)
    env = UnityEnv(env_path, multiagent=True)
    self.env = env
    env.num_envs = env.number_agents
    VecEnv.__init__(self, env.num_envs, env.observation_space, env.action_space)
    obs_space = env.observation_space
    # self.keys, shapes, dtypes = obs_space_info(obs_space)
    # self.buf_obs = {k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys}
    # self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
    # self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
    self.buf_infos = [{} for _ in range(self.num_envs)]

    # Fake Monitor
    self.tstart = time.time()
    self.results_writer = ResultsWriter(
        "filename",
        header={"t_start": time.time(), 'env_id': env.spec and env.spec.id},
        extra_keys=() + ())
    self.reset_keywords = ()
    self.info_keywords = ()
    self.allow_early_resets = True
    self.rewards = None
    self.needs_reset = True
    self.episode_rewards = []
    self.episode_lengths = []
    self.episode_times = []
    self.total_steps = 0
    self.current_reset_info = {}  # extra info about the current episode, that was passed in during reset()
def evaluate_policy_rewards(
        model,
        env: VecEnv,
        n_eval_episodes: int = 10,
        deterministic: bool = True,
        render: bool = False) -> Tuple[List[float], List[int], List[List[float]]]:
    if isinstance(env, VecEnv):
        assert env.num_envs == 1, "You must pass only one environment when using this function"

    episode_rewards, episode_lengths, rewards_memory_episodes = [], [], []
    for i in range(n_eval_episodes):
        # Avoid double reset, as VecEnv are reset automatically at episode end.
        if not isinstance(env, VecEnv) or i == 0:
            obs = env.reset()
        rewards_memory = []
        done, state = False, None
        episode_reward = 0.0
        episode_length = 0
        while not done:
            action, state = model.predict(obs, state=state, deterministic=deterministic)
            obs, reward, done, _info = env.step(action)
            rewards_memory.append(reward[0])
            episode_reward += reward[0]
            episode_length += 1
            if render:
                env.render()
        rewards_memory_episodes.append(rewards_memory)
        episode_rewards.append(episode_reward)
        episode_lengths.append(episode_length)
    return episode_rewards, episode_lengths, rewards_memory_episodes
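A hedged usage sketch: `model` and `eval_env` are placeholders for a trained stable-baselines model and a single-environment VecEnv, not names defined in the snippet above.

# `model` and `eval_env` are assumed to exist already (hypothetical names).
episode_rewards, episode_lengths, rewards_per_episode = evaluate_policy_rewards(
    model, eval_env, n_eval_episodes=5, deterministic=True)
mean_reward = sum(episode_rewards) / len(episode_rewards)
print(f"mean reward {mean_reward:.2f} over {len(episode_rewards)} episodes, "
      f"mean length {sum(episode_lengths) / len(episode_lengths):.1f}")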
def __init__(self, env_fns):
    """
    :param env_fns: ([function]) list containing a single environment factory
    """
    assert len(env_fns) == 1, "This dummy class does not support multiprocessing"
    self.envs = [fn() for fn in env_fns]
    env = self.envs[0]
    VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
    self.env = self.envs[0]
    self.actions = None
    self.obs = None
    self.reward, self.done, self.infos = None, None, None
def __init__(self, env_fns, start_method=None):
    """
    @brief      Constructor

    @warning    Only 'forkserver' and 'spawn' start methods are thread-safe, which is
                important when TensorFlow sessions or other non thread-safe libraries
                are used in the parent. However, compared to 'fork' they incur a small
                start-up cost and have restrictions on global variables. With those
                methods, users must wrap the code in an ``if __name__ == "__main__":``.
                For more information, see the multiprocessing documentation.

    @param[in]  env_fns        List of Gym Environments to run in subprocesses
    @param[in]  start_method   Method used to start the subprocesses. Must be one of
                               the methods returned by
                               multiprocessing.get_all_start_methods().
                               Optional: Defaults to 'fork' on available platforms,
                               and 'spawn' otherwise.

    @return     Instance of SubprocVecEnvLock.
    """
    global lock

    self.waiting = False
    self.closed = False
    n_envs = len(env_fns)

    if start_method is None:
        # Fork is not a thread safe method (see issue #217)
        # but is more user friendly (does not require to wrap the code in
        # a `if __name__ == "__main__":`)
        fork_available = 'fork' in multiprocessing.get_all_start_methods()
        start_method = 'fork' if fork_available else 'spawn'
    ctx = multiprocessing.get_context(start_method)

    self.remotes, self.work_remotes = zip(*[ctx.Pipe() for _ in range(n_envs)])
    self.processes = []
    for work_remote, remote, env_fn in zip(self.work_remotes, self.remotes, env_fns):
        args = (work_remote, remote, CloudpickleWrapper(env_fn), lock)
        # daemon=True: if the main process crashes, we should not cause things to hang
        process = ctx.Process(target=_worker, args=args, daemon=True)
        process.start()
        self.processes.append(process)
        work_remote.close()

    self.remotes[0].send(('get_spaces', None))
    observation_space, action_space = self.remotes[0].recv()
    VecEnv.__init__(self, len(env_fns), observation_space, action_space)
def __init__(self, env_fns, **env_args):
    self.envs = [fn(**env_args) for fn in env_fns]
    env = self.envs[0]
    VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
    obs_space = env.observation_space
    self.keys, shapes, dtypes = obs_space_info(obs_space)

    self.buf_obs = OrderedDict([
        (k, np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]))
        for k in self.keys])
    self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
    self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
    self.buf_infos = [{} for _ in range(self.num_envs)]
    self.actions = None
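A sketch of how the **env_args forwarding above might be used. The class name DummyVecEnvKwargs is only an illustrative stand-in for whatever class this constructor belongs to, and the rest of the VecEnv interface (reset/step) is assumed to be implemented as in a standard DummyVecEnv.

import gym

def make_env(env_id="CartPole-v1", seed=0):
    # Every keyword passed to the vec env constructor is forwarded here.
    env = gym.make(env_id)
    env.seed(seed)
    return env

venv = DummyVecEnvKwargs([make_env, make_env], env_id="CartPole-v1", seed=7)
obs = venv.reset()  # assumes the usual DummyVecEnv reset/step implementations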
def __init__(self, env_name: str, env: VecEnv, model, n_steps=5, gamma=0.99):
    """
    A runner that collects experience from an environment for an A2C model

    :param env_name: (str) name of the environment
    :param env: (Gym environment) The environment to learn from
    :param model: (Model) The model to learn
    :param n_steps: (int) The number of steps to run for each environment
    :param gamma: (float) Discount factor
    """
    self.env = env
    self.model = model
    n_env = env.num_envs
    self.batch_ob_shape = (n_env * n_steps,) + env.observation_space.shape
    self.obs = np.zeros((n_env,) + env.observation_space.shape,
                        dtype=env.observation_space.dtype.name)
    self.obs[:] = env.reset()
    self.n_steps = n_steps
    self.dones = [False for _ in range(n_env)]
    self.gamma = gamma
    self.states = np.zeros((n_env, self.model.step_model.n_lstm * 2), dtype=np.float32)
    self.env_name = env_name
def __init__(self, env_fns):
    self.waiting = False
    self.closed = False
    n_envs = len(env_fns)
    self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(n_envs)])
    self.processes = [Process(target=_worker,
                              args=(work_remote, remote, CloudpickleWrapper(env_fn)))
                      for (work_remote, remote, env_fn) in
                      zip(self.work_remotes, self.remotes, env_fns)]
    for process in self.processes:
        process.daemon = True  # if the main process crashes, we should not cause things to hang
        process.start()
    for remote in self.work_remotes:
        remote.close()

    self.remotes[0].send(('get_spaces', None))
    observation_space, action_space = self.remotes[0].recv()
    VecEnv.__init__(self, len(env_fns), observation_space, action_space)
def from_venv(cls, venv: vec_env.VecEnv, *args, **kwargs):
    """Factory constructor, extracting spaces and target from environment."""
    target = venv.env_method("get_body_com", "target")
    assert np.all(target[0] == target)
    return PointMazeReward(venv.observation_space, venv.action_space,
                           target[0], *args, **kwargs)
def __init__(self, env_fns, create_method):
    self.waiting = False
    self.closed = False
    n_envs = len(env_fns)

    self.remotes, self.work_remotes = zip(*[create_pipe() for _ in range(n_envs)])
    self.processes = []
    for work_remote, env_fn in zip(self.work_remotes, env_fns):
        args = (work_remote, CloudpickleWrapper(env_fn))
        # daemon=True: if the main process crashes, we should not cause
        # things to hang
        process = create_method(target=_worker, args=args, daemon=True)
        process.start()
        self.processes.append(process)

    self.remotes[0].send(('get_spaces', None))
    observation_space, action_space = self.remotes[0].recv()
    VecEnv.__init__(self, len(env_fns), observation_space, action_space)
def __init__(self, num_envs, num_agents, observation_space, action_space):
    VecEnv.__init__(self, num_envs, observation_space, action_space)
    self.num_agents = num_agents
def generate_trajectories(policy,
                          venv: VecEnv,
                          sample_until: GenTrajTerminationFn,
                          *,
                          deterministic_policy: bool = False,
                          ) -> Sequence[Trajectory]:
    """Generate trajectory dictionaries from a policy and an environment.

    Args:
      policy (BasePolicy or BaseRLModel): A stable_baselines policy or RLModel,
          trained on the gym environment.
      venv: The vectorized environments to interact with.
      sample_until: A function determining the termination condition.
          It takes a sequence of trajectories, and returns a bool.
          Most users will want to use one of `min_episodes` or `min_timesteps`.
      deterministic_policy: If True, asks policy to deterministically return
          action. Note the trajectories might still be non-deterministic if the
          environment has non-determinism!

    Returns:
      Sequence of `Trajectory` named tuples.
    """
    if isinstance(policy, BaseRLModel):
        get_action = policy.predict
        policy.set_env(venv)
    else:
        get_action = functools.partial(get_action_policy, policy)

    # Collect rollout tuples.
    trajectories = []
    # accumulator for incomplete trajectories
    trajectories_accum = TrajectoryAccumulator()
    obs = venv.reset()
    for env_idx, ob in enumerate(obs):
        # Seed with first obs only. Inside loop, we'll only add second obs from
        # each (s,a,r,s') tuple, under the same "obs" key again. That way we still
        # get all observations, but they're not duplicated into "next obs" and
        # "previous obs" (this matters for, e.g., Atari, where observations are
        # really big).
        trajectories_accum.add_step(dict(obs=ob), env_idx)

    while not sample_until(trajectories):
        acts, _ = get_action(obs, deterministic=deterministic_policy)
        obs, rews, dones, infos = venv.step(acts)
        new_trajs = trajectories_accum.add_steps_and_auto_finish(
            acts, obs, rews, dones, infos)
        trajectories.extend(new_trajs)

    # Note that we just drop partial trajectories. This is not ideal for some
    # algos; e.g. BC can probably benefit from partial trajectories, too.

    # Sanity checks.
    for trajectory in trajectories:
        n_steps = len(trajectory.acts)
        # extra 1 for the end
        exp_obs = (n_steps + 1,) + venv.observation_space.shape
        real_obs = trajectory.obs.shape
        assert real_obs == exp_obs, f"expected shape {exp_obs}, got {real_obs}"
        exp_act = (n_steps,) + venv.action_space.shape
        real_act = trajectory.acts.shape
        assert real_act == exp_act, f"expected shape {exp_act}, got {real_act}"
        exp_rew = (n_steps,)
        real_rew = trajectory.rews.shape
        assert real_rew == exp_rew, f"expected shape {exp_rew}, got {real_rew}"

    return trajectories
def generate_trajectories(
    policy,
    venv: VecEnv,
    sample_until: GenTrajTerminationFn,
    *,
    deterministic_policy: bool = False,
    rng: np.random.RandomState = np.random,
) -> Sequence[types.TrajectoryWithRew]:
    """Generate trajectory dictionaries from a policy and an environment.

    Args:
      policy (BasePolicy or BaseRLModel): A stable_baselines policy or RLModel,
          trained on the gym environment.
      venv: The vectorized environments to interact with.
      sample_until: A function determining the termination condition.
          It takes a sequence of trajectories, and returns a bool.
          Most users will want to use one of `min_episodes` or `min_timesteps`.
      deterministic_policy: If True, asks policy to deterministically return
          action. Note the trajectories might still be non-deterministic if the
          environment has non-determinism!
      rng: used for shuffling trajectories.

    Returns:
      Sequence of trajectories, satisfying `sample_until`. Additional trajectories
      may be collected to avoid biasing the process towards short episodes; the
      user should truncate if required.
    """
    if isinstance(policy, BaseRLModel):
        get_action = policy.predict
        policy.set_env(venv)
    else:
        get_action = functools.partial(get_action_policy, policy)

    # Collect rollout tuples.
    trajectories = []
    # accumulator for incomplete trajectories
    trajectories_accum = TrajectoryAccumulator()
    obs = venv.reset()
    for env_idx, ob in enumerate(obs):
        # Seed with first obs only. Inside loop, we'll only add second obs from
        # each (s,a,r,s') tuple, under the same "obs" key again. That way we still
        # get all observations, but they're not duplicated into "next obs" and
        # "previous obs" (this matters for, e.g., Atari, where observations are
        # really big).
        trajectories_accum.add_step(dict(obs=ob), env_idx)

    # Now, we sample until `sample_until(trajectories)` is true.
    # If we just stopped then this would introduce a bias towards shorter episodes,
    # since longer episodes are more likely to still be active, i.e. in the process
    # of being sampled from. To avoid this, we continue sampling until all episodes
    # are complete.
    #
    # To start with, all environments are active.
    active = np.ones(venv.num_envs, dtype=np.bool)
    while np.any(active):
        acts, _ = get_action(obs, deterministic=deterministic_policy)
        obs, rews, dones, infos = venv.step(acts)

        # If an environment is inactive, i.e. the episode completed for that
        # environment after `sample_until(trajectories)` was true, then we do
        # *not* want to add any subsequent trajectories from it. We avoid this
        # by just making it never done.
        dones &= active

        new_trajs = trajectories_accum.add_steps_and_auto_finish(
            acts, obs, rews, dones, infos)
        trajectories.extend(new_trajs)

        if sample_until(trajectories):
            # Termination condition has been reached. Mark as inactive any
            # environments where a trajectory was completed this timestep.
            active &= ~dones

    # Note that we just drop partial trajectories. This is not ideal for some
    # algos; e.g. BC can probably benefit from partial trajectories, too.

    # Each trajectory is sampled i.i.d.; however, shorter episodes are added to
    # `trajectories` sooner. Shuffle to avoid bias in order. This is important
    # when callees end up truncating the number of trajectories or transitions.
    # It is also cheap, since we're just shuffling pointers.
    rng.shuffle(trajectories)

    # Sanity checks.
    for trajectory in trajectories:
        n_steps = len(trajectory.acts)
        # extra 1 for the end
        exp_obs = (n_steps + 1,) + venv.observation_space.shape
        real_obs = trajectory.obs.shape
        assert real_obs == exp_obs, f"expected shape {exp_obs}, got {real_obs}"
        exp_act = (n_steps,) + venv.action_space.shape
        real_act = trajectory.acts.shape
        assert real_act == exp_act, f"expected shape {exp_act}, got {real_act}"
        exp_rew = (n_steps,)
        real_rew = trajectory.rews.shape
        assert real_rew == exp_rew, f"expected shape {exp_rew}, got {real_rew}"

    return trajectories
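A usage sketch for the sampler above, assuming the `min_episodes` termination helper mentioned in the docstring is importable; `policy` and `venv` are placeholders for a trained stable-baselines model and a vectorized environment.

# `policy` and `venv` are placeholders; `min_episodes` is the termination
# helper referenced in the docstring above.
sample_until = min_episodes(10)
trajectories = generate_trajectories(
    policy, venv, sample_until,
    deterministic_policy=True,
    rng=np.random.RandomState(0))
total_timesteps = sum(len(traj.acts) for traj in trajectories)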
def generate_trajectories(
    policy,
    venv: VecEnv,
    sample_until: GenTrajTerminationFn,
    *,
    deterministic_policy: bool = False,
) -> Sequence[Trajectory]:
    """Generate trajectory dictionaries from a policy and an environment.

    Args:
      policy (BasePolicy or BaseRLModel): A stable_baselines policy or RLModel,
          trained on the gym environment.
      venv: The vectorized environments to interact with.
      sample_until: A function determining the termination condition.
          It takes a sequence of trajectories, and returns a bool.
          Most users will want to use one of `min_episodes` or `min_timesteps`.
      deterministic_policy: If True, asks policy to deterministically return
          action. Note the trajectories might still be non-deterministic if the
          environment has non-determinism!

    Returns:
      Sequence of `Trajectory` named tuples.
    """
    if isinstance(policy, BaseRLModel):
        get_action = policy.predict
        policy.set_env(venv)
    else:
        get_action = functools.partial(get_action_policy, policy)

    # Collect rollout tuples.
    trajectories = []
    # accumulator for incomplete trajectories
    trajectories_accum = _TrajectoryAccumulator()
    obs_batch = venv.reset()
    for env_idx, obs in enumerate(obs_batch):
        # Seed with first obs only. Inside loop, we'll only add second obs from
        # each (s,a,r,s') tuple, under the same "obs" key again. That way we still
        # get all observations, but they're not duplicated into "next obs" and
        # "previous obs" (this matters for, e.g., Atari, where observations are
        # really big).
        trajectories_accum.add_step(env_idx, dict(obs=obs))

    while not sample_until(trajectories):
        obs_old_batch = obs_batch
        act_batch, _ = get_action(obs_old_batch, deterministic=deterministic_policy)
        obs_batch, rew_batch, done_batch, info_batch = venv.step(act_batch)

        # Don't save tuples if there is a done. The next_obs for any environment
        # is incorrect for any timestep where there is an episode end, so we fix
        # it with returned state info.
        zip_iter = enumerate(
            zip(obs_old_batch, act_batch, obs_batch, rew_batch, done_batch, info_batch))
        for env_idx, (obs_old, act, obs, rew, done, info) in zip_iter:
            real_obs = obs
            if done:
                # actual obs is inaccurate, so we use the one inserted into step info
                # by stable baselines wrapper
                real_obs = info['terminal_observation']
            trajectories_accum.add_step(
                env_idx,
                dict(
                    acts=act,
                    rews=rew,
                    # this is not the obs corresponding to `act`, but rather the obs
                    # *after* `act` (see above)
                    obs=real_obs,
                    infos=info))
            if done:
                # finish env_idx-th trajectory
                new_traj = trajectories_accum.finish_trajectory(env_idx)
                trajectories.append(new_traj)
                trajectories_accum.add_step(env_idx, dict(obs=obs))
                continue

    # Note that we just drop partial trajectories. This is not ideal for some
    # algos; e.g. BC can probably benefit from partial trajectories, too.

    # Sanity checks.
    for trajectory in trajectories:
        n_steps = len(trajectory.acts)
        # extra 1 for the end
        exp_obs = (n_steps + 1,) + venv.observation_space.shape
        real_obs = trajectory.obs.shape
        assert real_obs == exp_obs, f"expected shape {exp_obs}, got {real_obs}"
        exp_act = (n_steps,) + venv.action_space.shape
        real_act = trajectory.acts.shape
        assert real_act == exp_act, f"expected shape {exp_act}, got {real_act}"
        exp_rew = (n_steps,)
        real_rew = trajectory.rews.shape
        assert real_rew == exp_rew, f"expected shape {exp_rew}, got {real_rew}"

    return trajectories