Example #1
    def collect_sample(self, action):
        """Perform the sample collection operation over a single step.

        This method is responsible for executing a single step of the
        environment. This is performed a number of times in the _collect_samples
        method before training is executed. The data from the rollouts is
        stored in the policy's replay buffer(s).

        Parameters
        ----------
        action : array_like
            the action to be performed by the agent(s) within the environment

        Returns
        -------
        dict
            information from the most recent environment update step,
            consisting of the following terms:

            * obs : the most recent observation. This consists of a single
              observation if no reset occurred, and a tuple of (last
              observation from the previous rollout, first observation of the
              next rollout) if a reset occurred.
            * context : the contextual term from the environment
            * action : the action performed by the agent(s)
            * reward : the reward from the most recent step
            * done : the done mask
            * env_num : the environment number
            * all_obs : the most recent full-state observation. This consists
              of a single observation if no reset occurred, and a tuple of
              (last observation from the previous rollout, first observation
              of the next rollout) if a reset occurred.
            * info : the info dict returned by the environment after the most
              recent step
        """
        # Execute the next action.
        obs, reward, done, info = self.env.step(action)
        obs, all_obs = get_obs(obs)

        # Visualize the current step.
        if self._render and self._env_num == 0:
            self.env.render()  # pragma: no cover

        # Get the contextual term.
        context = getattr(self.env, "current_context", None)

        # Done mask for multi-agent policies is slightly different.
        if isinstance(done, dict):
            done = done["__all__"]

        if done:
            # Reset the environment.
            reset_obs = self.env.reset()
            reset_obs, reset_all_obs = get_obs(reset_obs)
        else:
            reset_obs = None
            reset_all_obs = None

        return {
            "obs": obs if not done else (obs, reset_obs),
            "context": context,
            "action": action,
            "reward": reward,
            "done": done,
            "env_num": self._env_num,
            "all_obs": all_obs if not done else (all_obs, reset_all_obs),
            "info": info,
        }
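
A minimal usage sketch for the method above, assuming a `sampler` instance of this class and a placeholder `select_action` helper (both names are hypothetical, as is the `sampler.obs` attribute); it illustrates the tuple-on-reset behavior documented in the docstring:

obs = sampler.obs  # hypothetical attribute holding the latest observation
for _ in range(nb_rollout_steps):
    ret = sampler.collect_sample(select_action(obs))
    # On a reset, ret["obs"] is a (last_obs, first_obs) tuple; otherwise it
    # is a single observation.
    obs = ret["obs"][1] if ret["done"] else ret["obs"]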
Example #2
    def _evaluate(self, total_steps, env):
        """Perform the evaluation operation.

        This method runs the evaluation environment for a number of episodes
        and returns the cumulative rewards and successes from each environment.

        Parameters
        ----------
        total_steps : int
            the total number of samples to train on
        env : gym.Env
            the evaluation environment that the policy is meant to be tested on

        Returns
        -------
        list of float
            the list of cumulative rewards from every episode in the evaluation
            phase
        list of bool
            a list of boolean terms representing if each episode ended in
            success or not. If the list is empty, then the environment did not
            output successes or failures, and the success rate will be set to
            zero.
        dict
            additional information that is meant to be logged
        """
        num_steps = deepcopy(self.total_steps)
        eval_episode_rewards = []
        eval_episode_successes = []
        ret_info = {'initial': [], 'final': [], 'average': []}

        if self.verbose >= 1:
            for _ in range(3):
                print("-------------------")
            print("Running evaluation for {} episodes:".format(
                self.nb_eval_episodes))

        # Clear replay buffer-related memory in the policy to allow the
        # meta-actions to be properly updated.
        if is_goal_conditioned_policy(self.policy):
            self.policy_tf.clear_memory()

        for i in range(self.nb_eval_episodes):
            # Reset the environment.
            eval_obs = env.reset()
            eval_obs, eval_all_obs = get_obs(eval_obs)

            # Add the fingerprint term, if needed.
            eval_obs = add_fingerprint(
                obs=eval_obs,
                steps=self.total_steps,
                total_steps=total_steps,
                use_fingerprints=self.policy_kwargs.get(
                    "use_fingerprints", False),
            )

            # Reset rollout-specific variables.
            eval_episode_reward = 0.
            eval_episode_step = 0

            rets = np.array([])
            while True:
                # Collect the contextual term. None if it is not passed.
                context = [env.current_context] \
                    if hasattr(env, "current_context") else None

                eval_action = self._policy(
                    obs=eval_obs,
                    context=context,
                    apply_noise=not self.eval_deterministic,
                    random_actions=False,
                )

                # Update the environment.
                obs, eval_r, done, info = env.step(eval_action)
                obs, all_obs = get_obs(obs)

                # Visualize the current step.
                if self.render_eval:
                    self.eval_env.render()  # pragma: no cover

                # Add the contextual reward (e.g. the goal distance in the
                # Ant* environments) to this list for logging purposes.
                if hasattr(env, "current_context"):
                    context = getattr(env, "current_context")
                    reward_fn = getattr(env, "contextual_reward")
                    rets = np.append(rets, reward_fn(eval_obs, context, obs))

                # Get the contextual term.
                context0 = context1 = getattr(env, "current_context", None)

                # Store a transition in the replay buffer. This is just for the
                # purposes of calling features in the store_transition method
                # of the policy.
                self._store_transition(
                    obs0=eval_obs,
                    context0=context0,
                    action=eval_action,
                    reward=eval_r,
                    obs1=obs,
                    context1=context1,
                    terminal1=False,
                    is_final_step=False,
                    all_obs0=eval_all_obs,
                    all_obs1=all_obs,
                    evaluate=True,
                )

                # Update the previous step observation.
                eval_obs = obs.copy()
                eval_all_obs = all_obs

                # Add the fingerprint term, if needed.
                eval_obs = add_fingerprint(
                    obs=eval_obs,
                    steps=self.total_steps,
                    total_steps=total_steps,
                    use_fingerprints=self.policy_kwargs.get(
                        "use_fingerprints", False),
                )

                # Increment the reward and step count.
                num_steps += 1
                eval_episode_reward += eval_r
                eval_episode_step += 1

                if done:
                    eval_episode_rewards.append(eval_episode_reward)
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        eval_episode_successes.append(float(maybe_is_success))

                    if self.verbose >= 1:
                        if rets.shape[0] > 0:
                            print("%d/%d: initial: %.3f, final: %.3f, average:"
                                  " %.3f, success: %d" %
                                  (i + 1, self.nb_eval_episodes, rets[0],
                                   rets[-1], float(rets.mean()),
                                   int(info.get('is_success'))))
                        else:
                            print("%d/%d" % (i + 1, self.nb_eval_episodes))

                    if hasattr(env, "current_context"):
                        ret_info['initial'].append(rets[0])
                        ret_info['final'].append(rets[-1])
                        ret_info['average'].append(float(rets.mean()))

                    # Exit the loop.
                    break

        if self.verbose >= 1:
            print("Done.")
            print("Average return: {}".format(np.mean(eval_episode_rewards)))
            if len(eval_episode_successes) > 0:
                print("Success rate: {}".format(
                    np.mean(eval_episode_successes)))
            for _ in range(3):
                print("-------------------")
            print("")

        # get the average of the reward information
        ret_info['initial'] = np.mean(ret_info['initial'])
        ret_info['final'] = np.mean(ret_info['final'])
        ret_info['average'] = np.mean(ret_info['average'])

        # Clear replay buffer-related memory in the policy once again so that
        # it does not affect the training procedure.
        if is_goal_conditioned_policy(self.policy):
            self.policy_tf.clear_memory()

        return eval_episode_rewards, eval_episode_successes, ret_info
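
A sketch of how the three returned values might be consumed for logging, assuming `alg` is an instance of the algorithm class, `total_steps` is already defined, and numpy is imported as np; the zero success rate fallback mirrors the behavior described in the docstring:

rewards, successes, ret_info = alg._evaluate(total_steps, alg.eval_env)
mean_reward = np.mean(rewards)
success_rate = np.mean(successes) if len(successes) > 0 else 0.0
print("eval reward: {:.3f}, success rate: {:.3f}".format(
    mean_reward, success_rate))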
Example #3
    def setup_sampler(self, env, render, shared, maddpg):
        """Create the environment and collect the initial observations.

        Parameters
        ----------
        env : str
            the name of the environment
        render : bool
            whether to render the environment
        shared : bool
            specifies whether agents in an environment are meant to share
            policies. This is solely used by multi-agent Flow environments.
        maddpg : bool
            whether to use an environment variant that is compatible with the
            MADDPG algorithm

        Returns
        -------
        list of Sampler or list of RaySampler
            the sampler objects
        list of array_like or list of dict < str, array_like >
            the initial observation. If the environment is multi-agent, this
            will be a dictionary of observations for each agent, indexed by the
            agent ID. One element for each environment.
        list of array_like or list of None
            additional information, used by MADDPG variants of the multi-agent
            policy to pass full-state information. One element for each
            environment.
        """
        if self.num_envs > 1:
            from hbaselines.utils.sampler import RaySampler
            sampler = [
                RaySampler.remote(
                    env_name=env,
                    render=render,
                    shared=shared,
                    maddpg=maddpg,
                    env_num=env_num,
                    evaluate=False,
                ) for env_num in range(self.num_envs)
            ]
            ob = ray.get([s.get_init_obs.remote() for s in sampler])
        else:
            from hbaselines.utils.sampler import Sampler
            sampler = [
                Sampler(
                    env_name=env,
                    render=render,
                    shared=shared,
                    maddpg=maddpg,
                    env_num=0,
                    evaluate=False,
                )
            ]
            ob = [s.get_init_obs() for s in sampler]

        # Separate the observation and full-state observation.
        obs = [get_obs(o)[0] for o in ob]
        all_obs = [get_obs(o)[1] for o in ob]

        return sampler, obs, all_obs
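
A hypothetical call site, written as if invoked from within the class (hence the use of self); the environment name is only illustrative, and the returned lists line up index-by-index, one element per environment:

samplers, obs_list, all_obs_list = self.setup_sampler(
    env="HalfCheetah-v2",  # illustrative environment name
    render=False,
    shared=False,
    maddpg=False,
)
assert len(samplers) == len(obs_list) == len(all_obs_list) == self.num_envs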
Example #4
    def __init__(self,
                 policy,
                 env,
                 eval_env=None,
                 nb_train_steps=1,
                 nb_rollout_steps=1,
                 nb_eval_episodes=50,
                 actor_update_freq=2,
                 meta_update_freq=10,
                 reward_scale=1.,
                 render=False,
                 render_eval=False,
                 eval_deterministic=True,
                 verbose=0,
                 policy_kwargs=None,
                 _init_setup_model=True):
        """Instantiate the algorithm object.

        Parameters
        ----------
        policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
            the policy model to use
        env : gym.Env or str
            the environment to learn from (if registered in Gym, can be str)
        eval_env : gym.Env or str
            the environment to evaluate from (if registered in Gym, can be str)
        nb_train_steps : int
            the number of training steps
        nb_rollout_steps : int
            the number of rollout steps
        nb_eval_episodes : int
            the number of evaluation episodes
        actor_update_freq : int
            number of training steps per actor policy update step. The critic
            policy is updated every training step.
        meta_update_freq : int
            number of training steps per meta policy update step. The actor
            policy of the meta-policy is further updated at the frequency
            provided by the actor_update_freq variable. Note that this value is
            only relevant when using the GoalConditionedPolicy policy.
        reward_scale : float
            the value the reward should be scaled by
        render : bool
            enable rendering of the training environment
        render_eval : bool
            enable rendering of the evaluation environment
        eval_deterministic : bool
            if set to True, the policy provides deterministic actions to the
            evaluation environment. Otherwise, stochastic or noisy actions are
            returned.
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        policy_kwargs : dict
            policy-specific hyperparameters
        _init_setup_model : bool
            Whether or not to build the network at the creation of the instance
        """
        shared = False if policy_kwargs is None else \
            policy_kwargs.get("shared", False)
        maddpg = False if policy_kwargs is None else \
            policy_kwargs.get("maddpg", False)

        self.policy = policy
        self.env_name = deepcopy(env) if isinstance(env, str) \
            else env.__str__()
        self.eval_env, _ = create_env(eval_env,
                                      render_eval,
                                      shared,
                                      maddpg,
                                      evaluate=True)
        self.nb_train_steps = nb_train_steps
        self.nb_rollout_steps = nb_rollout_steps
        self.nb_eval_episodes = nb_eval_episodes
        self.actor_update_freq = actor_update_freq
        self.meta_update_freq = meta_update_freq
        self.reward_scale = reward_scale
        self.render = render
        self.render_eval = render_eval
        self.eval_deterministic = eval_deterministic
        self.verbose = verbose
        self.policy_kwargs = {'verbose': verbose}

        # Create the training sampler and collect the initial observations.
        self.sampler = Sampler(
            env_name=env,
            render=render,
            shared=shared,
            maddpg=maddpg,
            evaluate=False,
        )
        self.obs, self.all_obs = get_obs(self.sampler.get_init_obs())

        # Collect the spaces of the environments.
        self.action_space = self.sampler.action_space()
        self.observation_space = self.sampler.observation_space()
        self.context_space = self.sampler.context_space()

        # Add the default policy kwargs to the policy_kwargs term.
        if is_feedforward_policy(policy):
            self.policy_kwargs.update(FEEDFORWARD_PARAMS.copy())
        elif is_goal_conditioned_policy(policy):
            self.policy_kwargs.update(GOAL_CONDITIONED_PARAMS.copy())
            self.policy_kwargs['env_name'] = self.env_name.__str__()
        elif is_multiagent_policy(policy):
            self.policy_kwargs.update(MULTI_FEEDFORWARD_PARAMS.copy())
            self.policy_kwargs["all_ob_space"] = \
                self.sampler.all_observation_space()

        if is_td3_policy(policy):
            self.policy_kwargs.update(TD3_PARAMS.copy())
        elif is_sac_policy(policy):
            self.policy_kwargs.update(SAC_PARAMS.copy())

        self.policy_kwargs.update(policy_kwargs or {})

        # Compute the time horizon, which is used to check if an environment
        # terminated early and used to compute the done mask for TD3.
        self.horizon = self.sampler.horizon()

        # Initialize bookkeeping and TensorFlow-related attributes.
        self.graph = None
        self.policy_tf = None
        self.sess = None
        self.summary = None
        self.episode_step = 0
        self.episodes = 0
        self.total_steps = 0
        self.epoch_episode_steps = []
        self.epoch_episode_rewards = []
        self.epoch_episodes = 0
        self.epoch = 0
        self.episode_rew_history = deque(maxlen=100)
        self.episode_reward = 0
        self.rew_ph = None
        self.rew_history_ph = None
        self.eval_rew_ph = None
        self.eval_success_ph = None
        self.saver = None

        if self.policy_kwargs.get("use_fingerprints", False):
            # Append the fingerprint dimension to the observation dimension.
            fingerprint_range = self.policy_kwargs["fingerprint_range"]
            low = np.concatenate(
                (self.observation_space.low, fingerprint_range[0]))
            high = np.concatenate(
                (self.observation_space.high, fingerprint_range[1]))
            self.observation_space = Box(low, high, dtype=np.float32)

            # Add the fingerprint term to the first observation.
            self.obs = add_fingerprint(self.obs, 0, 1, True)

        # Create the model variables and operations.
        if _init_setup_model:
            self.trainable_vars = self.setup_model()
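
A hedged instantiation sketch for the constructor above; the class name OffPolicyRLAlgorithm, the FeedForwardPolicy import path, and the environment name are assumptions and may differ in the actual codebase:

from hbaselines.fcnet.td3 import FeedForwardPolicy  # assumed import path

alg = OffPolicyRLAlgorithm(  # assumed name of the class defined above
    policy=FeedForwardPolicy,
    env="HalfCheetah-v2",  # illustrative environment name
    nb_rollout_steps=1,
    nb_train_steps=1,
    verbose=1,
)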
Example #5
    def collect_sample(self, action, multiagent, steps, total_steps,
                       use_fingerprints):
        """Perform the sample collection operation over a single step.

        This method is responsible for executing a single step of the
        environment. This is performed a number of times in the _collect_samples
        method before training is executed. The data from the rollouts is
        stored in the policy's replay buffer(s).

        Parameters
        ----------
        action : array_like
            the action to be performed by the agent(s) within the environment
        multiagent : bool
            whether the policy is multi-agent
        steps : int
            the total number of steps that have been executed since training
            began
        total_steps : int
            the total number of samples to train on. Used by the fingerprint
            element
        use_fingerprints : bool
            specifies whether to add a time-dependent fingerprint to the
            observations

        Returns
        -------
        dict
            information from the most recent environment update step,
            consisting of the following terms:

            * obs : the most recent observation. This consists of a single
              observation if no reset occurred, and a tuple of (last
              observation from the previous rollout, first observation of the
              next rollout) if a reset occurred.
            * context : the contextual term from the environment
            * action : the action performed by the agent(s)
            * reward : the reward from the most recent step
            * done : the done mask
            * all_obs : the most recent full-state observation. This consists
              of a single observation if no reset occurred, and a tuple of
              (last observation from the previous rollout, first observation
              of the next rollout) if a reset occurred.
        """
        # Execute the next action.
        obs, reward, done, info = self.env.step(action)
        obs, all_obs = get_obs(obs)

        # Visualize the current step.
        if self._render:
            self.env.render()  # pragma: no cover

        # Get the contextual term.
        context = getattr(self.env, "current_context", None)

        # Add the fingerprint term to this observation, if needed.
        obs = add_fingerprint(obs, steps, total_steps, use_fingerprints)

        # Done mask for multi-agent policies is slightly different.
        if multiagent:
            done = done["__all__"]

        if done:
            # Reset the environment.
            reset_obs = self.env.reset()
            reset_obs, reset_all_obs = get_obs(reset_obs)

            # Add the fingerprint term to the reset observation, if needed.
            reset_obs = add_fingerprint(
                reset_obs, steps, total_steps, use_fingerprints)
        else:
            reset_obs = None
            reset_all_obs = None

        return {
            "obs": obs if not done else (obs, reset_obs),
            "context": context,
            "action": action,
            "reward": reward,
            "done": done,
            "all_obs": all_obs if not done else (all_obs, reset_all_obs),
        }
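
A usage sketch for this variant, with the fingerprint-related arguments threaded through from a hypothetical training loop (`sampler`, `select_action`, `obs`, `steps`, and `total_steps` are placeholders, not names from the original code):

ret = sampler.collect_sample(
    action=select_action(obs),
    multiagent=False,
    steps=steps,
    total_steps=total_steps,
    use_fingerprints=False,
)
# As in the single-environment variant, ret["obs"] is a tuple after a reset.
obs = ret["obs"][1] if ret["done"] else ret["obs"]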