def __init__(self, env_fns, start_method=None):
        """
        Start one worker subprocess per environment factory.

        :param env_fns: ([callable]) environment factories; each one is wrapped
            in ``CloudpickleWrapper`` and handed to ``_worker`` in its own process
        :param start_method: (str) multiprocessing start method. When ``None``,
            'fork' is chosen if the platform lists it, otherwise 'spawn'.
        """
        self.waiting = False
        self.closed = False
        n_envs = len(env_fns)

        if start_method is None:
            # 'fork' is not thread safe (see issue #217), but it is the more
            # user friendly choice: callers do not need to guard their code
            # with `if __name__ == "__main__":`.
            if 'fork' in multiprocessing.get_all_start_methods():
                start_method = 'fork'
            else:
                start_method = 'spawn'
        ctx = multiprocessing.get_context(start_method)

        # One pipe per environment: first end kept here, second end given to the worker.
        pipes = [ctx.Pipe() for _ in range(n_envs)]
        self.remotes, self.work_remotes = zip(*pipes)

        self.processes = []
        for env_fn, remote, work_remote in zip(env_fns, self.remotes,
                                               self.work_remotes):
            # daemon=True: if the main process crashes, we should not cause things to hang
            process = ctx.Process(
                target=_worker,
                args=(work_remote, remote, CloudpickleWrapper(env_fn)),
                daemon=True)
            process.start()
            self.processes.append(process)
            # The parent closes its copy of the worker end once the child is started.
            work_remote.close()

        # The first worker is asked for the spaces used to initialise the base class.
        first_remote = self.remotes[0]
        first_remote.send(('get_spaces', None))
        observation_space, action_space = first_remote.recv()
        VecEnv.__init__(self, n_envs, observation_space, action_space)
예제 #2
0
    def __init__(self, env_fns, start_method=None):
        """
        Start one worker subprocess per environment factory.

        :param env_fns: ([callable]) environment factories; each one is wrapped
            in ``CloudpickleWrapper`` and handed to ``_worker`` in its own process
        :param start_method: (str) multiprocessing start method. When ``None``,
            'forkserver' is chosen if the platform lists it, otherwise 'spawn'.
        """
        self.waiting = False
        self.closed = False
        n_envs = len(env_fns)

        if start_method is None:
            # Prefer a thread safe start method, see issue #217.
            # forkserver is faster than spawn but is not always available.
            if 'forkserver' in multiprocessing.get_all_start_methods():
                start_method = 'forkserver'
            else:
                start_method = 'spawn'
        ctx = multiprocessing.get_context(start_method)

        # One pipe per environment: first end kept here, second end given to the worker.
        pipes = [ctx.Pipe() for _ in range(n_envs)]
        self.remotes, self.work_remotes = zip(*pipes)

        self.processes = []
        for env_fn, remote, work_remote in zip(env_fns, self.remotes, self.work_remotes):
            # daemon=True: if the main process crashes, we should not cause things to hang
            process = ctx.Process(
                target=_worker,
                args=(work_remote, remote, CloudpickleWrapper(env_fn)),
                daemon=True)
            process.start()
            self.processes.append(process)
            # The parent closes its copy of the worker end once the child is started.
            work_remote.close()

        # The first worker is asked for the spaces used to initialise the base class.
        first_remote = self.remotes[0]
        first_remote.send(('get_spaces', None))
        observation_space, action_space = first_remote.recv()
        VecEnv.__init__(self, n_envs, observation_space, action_space)
예제 #3
0
    def __init__(self, env_id, n_agents):
        """
        Wrap a multi-agent Unity environment as a VecEnv and set up the
        bookkeeping fields of a "fake" Monitor.

        :param env_id: identifier forwarded to ``UnityVecEnv.GetFilePath``
        :param n_agents: agent count forwarded to ``UnityVecEnv.GetFilePath``
        """
        env_path = UnityVecEnv.GetFilePath(env_id, n_agents=n_agents)
        print("**** ", env_path)
        env = UnityEnv(env_path, multiagent=True)
        self.env = env
        # Each agent of the Unity environment is exposed as one vectorised env.
        env.num_envs = env.number_agents
        VecEnv.__init__(self, env.num_envs, env.observation_space,
                        env.action_space)

        # Only the info buffer is allocated here; observation, done and
        # reward buffers are not pre-allocated by this constructor.
        self.buf_infos = [{} for _ in range(self.num_envs)]

        # Fake Monitor
        self.tstart = time.time()
        # The header reuses self.tstart so that the recorded start time and the
        # reference used by this instance are exactly the same value.
        self.results_writer = ResultsWriter(
            "filename",
            header={
                "t_start": self.tstart,
                # env.spec may be falsy, in which case it is stored as-is
                'env_id': env.spec and env.spec.id,
            },
            extra_keys=())
        self.reset_keywords = ()
        self.info_keywords = ()
        self.allow_early_resets = True
        self.rewards = None
        self.needs_reset = True
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_times = []
        self.total_steps = 0
        # extra info about the current episode, that was passed in during reset()
        self.current_reset_info = {}
예제 #4
0
def evaluate_policy_rewards(
        model,
        env: VecEnv,
        n_eval_episodes: int = 10,
        deterministic: bool = True,
        render: bool = False) -> Tuple[List[float], List[int], List[List[float]]]:
    """
    Run a policy for a number of episodes and record the rewards.

    :param model: object exposing ``predict(obs, state=..., deterministic=...)``
        that returns ``(action, state)``
    :param env: the environment to evaluate on. A ``VecEnv`` must contain
        exactly one environment. Other environments are stepped directly and
        their reward is used as returned.
    :param n_eval_episodes: number of episodes to run
    :param deterministic: forwarded to ``model.predict``
    :param render: when True, ``env.render()`` is called after every step
    :return: ``(episode_rewards, episode_lengths, rewards_memory_episodes)``:
        the summed reward per episode, the step count per episode, and the list
        of per-step rewards for every episode
    """
    is_vec_env = isinstance(env, VecEnv)
    if is_vec_env:
        assert env.num_envs == 1, "You must pass only one environment when using this function"
    episode_rewards, episode_lengths, rewards_memory_episodes = [], [], []
    for i in range(n_eval_episodes):
        # A VecEnv is reset only before the first episode; presumably it
        # resets itself when an episode ends -- TODO confirm against VecEnv.
        if not is_vec_env or i == 0:
            obs = env.reset()
        rewards_memory = []
        done, state = False, None
        episode_reward = 0.0
        episode_length = 0
        while not done:
            action, state = model.predict(obs,
                                          state=state,
                                          deterministic=deterministic)
            obs, reward, done, _info = env.step(action)
            # A VecEnv returns one reward per sub-environment; anything else
            # is taken as a single reward and must not be indexed.
            step_reward = reward[0] if is_vec_env else reward
            rewards_memory.append(step_reward)
            episode_reward += step_reward
            episode_length += 1
            if render:
                env.render()
        rewards_memory_episodes.append(rewards_memory)
        episode_rewards.append(episode_reward)
        episode_lengths.append(episode_length)

    return episode_rewards, episode_lengths, rewards_memory_episodes
예제 #5
0
 def __init__(self, env_fns):
     """
     Build the single wrapped environment and initialise the VecEnv base.

     :param env_fns: ([function]) exactly one environment factory
     """
     n_envs = len(env_fns)
     assert n_envs == 1, "This dummy class does not support multiprocessing"
     self.envs = [make_env() for make_env in env_fns]
     first_env = self.envs[0]
     VecEnv.__init__(self, n_envs, first_env.observation_space, first_env.action_space)
     # Shortcut to the only environment held by this wrapper.
     self.env = first_env
     # Per-step fields, left unset by the constructor.
     self.actions = None
     self.obs = None
     self.reward = None
     self.done = None
     self.infos = None
예제 #6
0
    def __init__(self, env_fns, start_method=None):
        """
        @brief      Start one worker subprocess per environment factory, passing
                    the module-level `lock` to every worker.

        @warning    Only 'forkserver' and 'spawn' start methods are thread-safe, which
                    matters when TensorFlow sessions or other non thread-safe libraries
                    are used in the parent.
                    Compared to 'fork' they incur a small start-up cost and have
                    restrictions on global variables. With those methods, users must
                    wrap the code in an ``if __name__ == "__main__":``
                    See the multiprocessing documentation for details.

        @param[in]  env_fns             List of callables creating the environments, one per
                                        subprocess.
        @param[in]  start_method        Method used to start the subprocesses. Must be one of the
                                        methods returned by multiprocessing.get_all_start_methods().
                                        Optional: Defaults to 'fork' on available platforms, and
                                        'spawn' otherwise.
        """
        self.waiting = False
        self.closed = False
        n_envs = len(env_fns)

        if start_method is None:
            # 'fork' is not thread safe (see issue #217), but it is the more
            # user friendly choice: callers do not need to guard their code
            # with `if __name__ == "__main__":`.
            if 'fork' in multiprocessing.get_all_start_methods():
                start_method = 'fork'
            else:
                start_method = 'spawn'
        ctx = multiprocessing.get_context(start_method)

        # One pipe per environment: first end kept here, second end given to the worker.
        pipes = [ctx.Pipe() for _ in range(n_envs)]
        self.remotes, self.work_remotes = zip(*pipes)

        self.processes = []
        for env_fn, remote, work_remote in zip(env_fns, self.remotes,
                                               self.work_remotes):
            # `lock` is a module-level name that is only read here.
            # NOTE(review): confirm it is compatible with the selected start method.
            worker_args = (work_remote, remote, CloudpickleWrapper(env_fn), lock)
            # daemon=True: if the main process crashes, we should not cause things to hang
            process = ctx.Process(target=_worker, args=worker_args, daemon=True)
            process.start()
            self.processes.append(process)
            # The parent closes its copy of the worker end once the child is started.
            work_remote.close()

        # The first worker is asked for the spaces used to initialise the base class.
        first_remote = self.remotes[0]
        first_remote.send(('get_spaces', None))
        observation_space, action_space = first_remote.recv()
        VecEnv.__init__(self, n_envs, observation_space, action_space)
예제 #7
0
    def __init__(self, env_fns, **env_args):
        """
        Create the environments in-process and allocate the step buffers.

        :param env_fns: ([callable]) environment factories
        :param env_args: keyword arguments forwarded to every factory
        """
        self.envs = [fn(**env_args) for fn in env_fns]
        env = self.envs[0]
        VecEnv.__init__(self, len(env_fns), env.observation_space,
                        env.action_space)
        self.keys, shapes, dtypes = obs_space_info(env.observation_space)

        # One zero-filled buffer per observation key, with a leading axis of
        # length self.num_envs (presumably set by VecEnv.__init__ -- TODO confirm).
        self.buf_obs = OrderedDict(
            (k, np.zeros((self.num_envs, ) + tuple(shapes[k]), dtype=dtypes[k]))
            for k in self.keys)
        # The `np.bool` alias was removed in NumPy 1.24; the builtin `bool`
        # selects the same boolean dtype on every NumPy version.
        self.buf_dones = np.zeros((self.num_envs, ), dtype=bool)
        self.buf_rews = np.zeros((self.num_envs, ), dtype=np.float32)
        self.buf_infos = [{} for _ in range(self.num_envs)]
        self.actions = None
예제 #8
0
    def __init__(self,
                 env_name: str,
                 env: VecEnv,
                 model,
                 n_steps=5,
                 gamma=0.99):
        """
        Set up the rollout state of a runner used to train an a2c model.

        :param env_name: (str) name stored on the runner as ``self.env_name``
        :param env: (VecEnv) The environment to learn from; it is reset here
        :param model: (Model) The model to learn
        :param n_steps: (int) The number of steps to run for each environment
        :param gamma: (float) Discount factor
        """
        self.env = env
        self.model = model
        n_env = env.num_envs
        obs_space = env.observation_space

        # Shape of a full rollout batch: n_steps observations for each environment.
        self.batch_ob_shape = (n_env * n_steps, ) + obs_space.shape
        # Current observations, filled from the initial reset of the environment.
        self.obs = np.zeros((n_env, ) + obs_space.shape,
                            dtype=obs_space.dtype.name)
        self.obs[:] = env.reset()
        self.n_steps = n_steps
        self.dones = [False] * n_env

        self.gamma = gamma
        # Zero-initialised state array, twice n_lstm wide per environment.
        state_width = self.model.step_model.n_lstm * 2
        self.states = np.zeros((n_env, state_width), dtype=np.float32)
        self.env_name = env_name
예제 #9
0
    def __init__(self, env_fns):
        """
        Start one daemon worker process per environment factory.

        :param env_fns: ([callable]) environment factories; each one is wrapped
            in ``CloudpickleWrapper`` and handed to ``_worker`` in its own process
        """
        self.waiting = False
        self.closed = False
        n_envs = len(env_fns)

        # One pipe per environment: first end kept here, second end given to the worker.
        pipes = [Pipe() for _ in range(n_envs)]
        self.remotes, self.work_remotes = zip(*pipes)

        # Phase 1: build every process object before any of them is started.
        workers = []
        for work_remote, remote, env_fn in zip(self.work_remotes, self.remotes, env_fns):
            worker_args = (work_remote, remote, CloudpickleWrapper(env_fn))
            workers.append(Process(target=_worker, args=worker_args))
        self.processes = workers

        # Phase 2: start them all.
        for process in self.processes:
            # if the main process crashes, we should not cause things to hang
            process.daemon = True
            process.start()

        # Phase 3: the parent closes its copies of the worker ends.
        for work_remote in self.work_remotes:
            work_remote.close()

        # The first worker is asked for the spaces used to initialise the base class.
        first_remote = self.remotes[0]
        first_remote.send(('get_spaces', None))
        observation_space, action_space = first_remote.recv()
        VecEnv.__init__(self, n_envs, observation_space, action_space)
예제 #10
0
 def from_venv(cls, venv: vec_env.VecEnv, *args, **kwargs):
     """Build a PointMazeReward from the spaces and target of `venv`.

     The "target" body position is queried from every environment via
     ``env_method("get_body_com", "target")`` and all results must agree.
     Extra positional and keyword arguments are forwarded to the
     ``PointMazeReward`` constructor (the result is always a
     ``PointMazeReward``, independent of `cls`).
     """
     targets = venv.env_method("get_body_com", "target")
     reference = targets[0]
     # Every environment must report the same target as the first one.
     assert np.all(reference == targets)
     return PointMazeReward(
         venv.observation_space,
         venv.action_space,
         reference,
         *args,
         **kwargs
     )
예제 #11
0
    def __init__(self, env_fns, create_method):
        """
        Start one worker per environment factory using a caller-supplied
        process constructor.

        :param env_fns: ([callable]) environment factories; each one is wrapped
            in ``CloudpickleWrapper`` and handed to ``_worker``
        :param create_method: callable invoked as
            ``create_method(target=..., args=..., daemon=True)``; the returned
            object must provide ``start()``
        """
        self.waiting = False
        self.closed = False
        n_envs = len(env_fns)

        # One pipe per environment, created by the external `create_pipe` helper.
        pipes = [create_pipe() for _ in range(n_envs)]
        self.remotes, self.work_remotes = zip(*pipes)

        self.processes = []
        for env_fn, work_remote in zip(env_fns, self.work_remotes):
            # daemon=True: if the main process crashes, we should not cause
            # things to hang
            process = create_method(
                target=_worker,
                args=(work_remote, CloudpickleWrapper(env_fn)),
                daemon=True)
            process.start()
            self.processes.append(process)

        # The first worker is asked for the spaces used to initialise the base class.
        first_remote = self.remotes[0]
        first_remote.send(('get_spaces', None))
        observation_space, action_space = first_remote.recv()
        VecEnv.__init__(self, n_envs, observation_space, action_space)
예제 #12
0
 def __init__(self, num_envs, num_agents, observation_space, action_space):
     """
     Initialise the VecEnv base and record the number of agents.

     :param num_envs: forwarded to ``VecEnv.__init__``
     :param num_agents: stored on the instance as ``self.num_agents``
     :param observation_space: forwarded to ``VecEnv.__init__``
     :param action_space: forwarded to ``VecEnv.__init__``
     """
     VecEnv.__init__(self, num_envs, observation_space, action_space)
     self.num_agents = num_agents
예제 #13
0
def generate_trajectories(policy,
                          venv: VecEnv,
                          sample_until: GenTrajTerminationFn,
                          *,
                          deterministic_policy: bool = False,
                          ) -> Sequence[Trajectory]:
  """Roll out a policy in a vectorized environment and collect trajectories.

  Args:
    policy (BasePolicy or BaseRLModel): A stable_baselines policy or RLModel,
        trained on the gym environment. A `BaseRLModel` has `venv` set as its
        environment and is queried through `predict`; anything else is
        queried through `get_action_policy`.
    venv: The vectorized environments to interact with.
    sample_until: Termination condition. Called with the trajectories
        completed so far before every step; sampling stops once it returns
        a true value.
    deterministic_policy: If True, asks policy to deterministically return
        action. The trajectories may still be non-deterministic if the
        environment is.

  Returns:
    The completed trajectories. Episodes still in progress when sampling
    stops are discarded.
  """
  if isinstance(policy, BaseRLModel):
    get_action = policy.predict
    policy.set_env(venv)
  else:
    get_action = functools.partial(get_action_policy, policy)

  completed = []
  # Holds the steps of episodes that have not finished yet.
  accumulator = TrajectoryAccumulator()
  obs = venv.reset()
  for env_idx, first_ob in enumerate(obs):
    # Only the initial observation is seeded here. Each later step adds the
    # observation *after* the action under the same "obs" key, so every
    # observation is stored exactly once (this matters for large
    # observations such as Atari frames).
    accumulator.add_step({"obs": first_ob}, env_idx)

  while not sample_until(completed):
    acts, _ = get_action(obs, deterministic=deterministic_policy)
    obs, rews, dones, infos = venv.step(acts)
    completed.extend(
      accumulator.add_steps_and_auto_finish(acts, obs, rews, dones, infos))

  # Partial trajectories are simply dropped. This is not ideal for some
  # algos; e.g. BC can probably benefit from partial trajectories, too.

  # Sanity checks on the shapes of every completed trajectory.
  for traj in completed:
    n_steps = len(traj.acts)
    # one more observation than actions: the final observation is included
    exp_obs = (n_steps + 1, ) + venv.observation_space.shape
    real_obs = traj.obs.shape
    assert real_obs == exp_obs, f"expected shape {exp_obs}, got {real_obs}"
    exp_act = (n_steps, ) + venv.action_space.shape
    real_act = traj.acts.shape
    assert real_act == exp_act, f"expected shape {exp_act}, got {real_act}"
    exp_rew = (n_steps,)
    real_rew = traj.rews.shape
    assert real_rew == exp_rew, f"expected shape {exp_rew}, got {real_rew}"

  return completed
예제 #14
0
def generate_trajectories(
    policy,
    venv: VecEnv,
    sample_until: GenTrajTerminationFn,
    *,
    deterministic_policy: bool = False,
    rng: np.random.RandomState = np.random,
) -> Sequence[types.TrajectoryWithRew]:
    """Generate trajectory dictionaries from a policy and an environment.

    Args:
      policy (BasePolicy or BaseRLModel): A stable_baselines policy or RLModel,
          trained on the gym environment.
      venv: The vectorized environments to interact with.
      sample_until: A function determining the termination condition.
          It takes a sequence of trajectories, and returns a bool.
          Most users will want to use one of `min_episodes` or `min_timesteps`.
      deterministic_policy: If True, asks policy to deterministically return
          action. Note the trajectories might still be non-deterministic if the
          environment has non-determinism!
      rng: used for shuffling trajectories; only its `shuffle` method is
          called. Defaults to the global `np.random` module.

    Returns:
      Sequence of trajectories, satisfying `sample_until`. Additional trajectories
      may be collected to avoid biasing process towards short episodes; the user
      should truncate if required.
    """
    if isinstance(policy, BaseRLModel):
        get_action = policy.predict
        policy.set_env(venv)
    else:
        get_action = functools.partial(get_action_policy, policy)

    # Collect rollout tuples.
    trajectories = []
    # accumulator for incomplete trajectories
    trajectories_accum = TrajectoryAccumulator()
    obs = venv.reset()
    for env_idx, ob in enumerate(obs):
        # Seed with first obs only. Inside loop, we'll only add second obs from
        # each (s,a,r,s') tuple, under the same "obs" key again. That way we still
        # get all observations, but they're not duplicated into "next obs" and
        # "previous obs" (this matters for, e.g., Atari, where observations are
        # really big).
        trajectories_accum.add_step(dict(obs=ob), env_idx)

    # Now, we sample until `sample_until(trajectories)` is true.
    # If we just stopped then this would introduce a bias towards shorter episodes,
    # since longer episodes are more likely to still be active, i.e. in the process
    # of being sampled from. To avoid this, we continue sampling until all episodes
    # are complete.
    #
    # To start with, all environments are active.
    # (The `np.bool` alias was removed in NumPy 1.24; the builtin `bool`
    # selects the same boolean dtype on every NumPy version.)
    active = np.ones(venv.num_envs, dtype=bool)
    while np.any(active):
        acts, _ = get_action(obs, deterministic=deterministic_policy)
        obs, rews, dones, infos = venv.step(acts)

        # If an environment is inactive, i.e. the episode completed for that
        # environment after `sample_until(trajectories)` was true, then we do
        # *not* want to add any subsequent trajectories from it. We avoid this
        # by just making it never done.
        dones &= active

        new_trajs = trajectories_accum.add_steps_and_auto_finish(
            acts, obs, rews, dones, infos)
        trajectories.extend(new_trajs)

        if sample_until(trajectories):
            # Termination condition has been reached. Mark as inactive any environments
            # where a trajectory was completed this timestep.
            active &= ~dones

    # Note that we just drop partial trajectories. This is not ideal for some
    # algos; e.g. BC can probably benefit from partial trajectories, too.

    # Each trajectory is sampled i.i.d.; however, shorter episodes are added to
    # `trajectories` sooner. Shuffle to avoid bias in order. This is important
    # when callers end up truncating the number of trajectories or transitions.
    # It is also cheap, since we're just shuffling pointers.
    rng.shuffle(trajectories)

    # Sanity checks.
    for trajectory in trajectories:
        n_steps = len(trajectory.acts)
        # extra 1 for the end
        exp_obs = (n_steps + 1, ) + venv.observation_space.shape
        real_obs = trajectory.obs.shape
        assert real_obs == exp_obs, f"expected shape {exp_obs}, got {real_obs}"
        exp_act = (n_steps, ) + venv.action_space.shape
        real_act = trajectory.acts.shape
        assert real_act == exp_act, f"expected shape {exp_act}, got {real_act}"
        exp_rew = (n_steps, )
        real_rew = trajectory.rews.shape
        assert real_rew == exp_rew, f"expected shape {exp_rew}, got {real_rew}"

    return trajectories
예제 #15
0
def generate_trajectories(
    policy,
    venv: VecEnv,
    sample_until: GenTrajTerminationFn,
    *,
    deterministic_policy: bool = False,
) -> Sequence[Trajectory]:
    """Roll out a policy in a vectorized environment and collect trajectories.

    Args:
      policy (BasePolicy or BaseRLModel): A stable_baselines policy or RLModel,
          trained on the gym environment. A `BaseRLModel` has `venv` set as
          its environment and is queried through `predict`; anything else is
          queried through `get_action_policy`.
      venv: The vectorized environments to interact with.
      sample_until: Termination condition. Called with the trajectories
          completed so far before every step; sampling stops once it returns
          a true value.
      deterministic_policy: If True, asks policy to deterministically return
          action. The trajectories may still be non-deterministic if the
          environment is.

    Returns:
      The completed trajectories. Episodes still in progress when sampling
      stops are discarded.
    """
    if isinstance(policy, BaseRLModel):
        get_action = policy.predict
        policy.set_env(venv)
    else:
        get_action = functools.partial(get_action_policy, policy)

    finished = []
    # Holds the steps of episodes that have not finished yet.
    accumulator = _TrajectoryAccumulator()
    obs_batch = venv.reset()
    for env_idx, first_obs in enumerate(obs_batch):
        # Only the initial observation is seeded here. Each later step adds
        # the observation *after* the action under the same "obs" key, so
        # every observation is stored exactly once (this matters for large
        # observations such as Atari frames).
        accumulator.add_step(env_idx, {"obs": first_obs})

    while not sample_until(finished):
        prev_obs_batch = obs_batch
        act_batch, _ = get_action(prev_obs_batch,
                                  deterministic=deterministic_policy)
        obs_batch, rew_batch, done_batch, info_batch = venv.step(act_batch)

        per_env = zip(prev_obs_batch, act_batch, obs_batch, rew_batch,
                      done_batch, info_batch)
        for env_idx, (_prev_obs, act, obs, rew, done, info) in enumerate(per_env):
            # On an episode end the returned observation is not the real
            # successor of `act`; the true final observation is taken from
            # the step info under 'terminal_observation' instead.
            step_obs = info['terminal_observation'] if done else obs
            accumulator.add_step(
                env_idx,
                {
                    "acts": act,
                    "rews": rew,
                    # the observation *after* `act`, not the one it was chosen from
                    "obs": step_obs,
                    "infos": info,
                })
            if done:
                # Close the env_idx-th trajectory and seed the next one with
                # the observation returned by this step.
                finished.append(accumulator.finish_trajectory(env_idx))
                accumulator.add_step(env_idx, {"obs": obs})

    # Partial trajectories are simply dropped. This is not ideal for some
    # algos; e.g. BC can probably benefit from partial trajectories, too.

    # Sanity checks on the shapes of every completed trajectory.
    for traj in finished:
        n_steps = len(traj.acts)
        # one more observation than actions: the final observation is included
        exp_obs = (n_steps + 1, ) + venv.observation_space.shape
        real_obs = traj.obs.shape
        assert real_obs == exp_obs, f"expected shape {exp_obs}, got {real_obs}"
        exp_act = (n_steps, ) + venv.action_space.shape
        real_act = traj.acts.shape
        assert real_act == exp_act, f"expected shape {exp_act}, got {real_act}"
        exp_rew = (n_steps, )
        real_rew = traj.rews.shape
        assert real_rew == exp_rew, f"expected shape {exp_rew}, got {real_rew}"

    return finished