def collect_sample(self, action):
    """Perform the sample collection operation over a single step.

    This method is responsible for executing a single step of the
    environment. This is performed a number of times within the
    _collect_samples method before training is executed. The data from the
    rollouts is stored in the policy's replay buffer(s).

    Parameters
    ----------
    action : array_like
        the action to be performed by the agent(s) within the environment

    Returns
    -------
    dict
        information from the most recent environment update step,
        consisting of the following terms:

        * obs : the most recent observation. This consists of a single
          observation if no reset occurred, and a tuple of (last
          observation from the previous rollout, first observation of the
          next rollout) if a reset occurred.
        * context : the contextual term from the environment
        * action : the action performed by the agent(s)
        * reward : the reward from the most recent step
        * done : the done mask
        * env_num : the environment number
        * all_obs : the most recent full-state observation. This consists
          of a single observation if no reset occurred, and a tuple of
          (last observation from the previous rollout, first observation
          of the next rollout) if a reset occurred.
        * info : the information dictionary returned by the environment's
          step method
    """
    # Execute the next action.
    obs, reward, done, info = self.env.step(action)
    obs, all_obs = get_obs(obs)

    # Visualize the current step.
    if self._render and self._env_num == 0:
        self.env.render()  # pragma: no cover

    # Get the contextual term.
    context = getattr(self.env, "current_context", None)

    # The done mask for multi-agent policies is slightly different.
    if isinstance(done, dict):
        done = done["__all__"]

    if done:
        # Reset the environment.
        reset_obs = self.env.reset()
        reset_obs, reset_all_obs = get_obs(reset_obs)
    else:
        reset_obs = None
        reset_all_obs = None

    return {
        "obs": obs if not done else (obs, reset_obs),
        "context": context,
        "action": action,
        "reward": reward,
        "done": done,
        "env_num": self._env_num,
        "all_obs": all_obs if not done else (all_obs, reset_all_obs),
        "info": info,
    }
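# Illustrative usage sketch (not part of this class): the algorithm's
# _collect_samples loop is expected to call collect_sample once per step and
# unpack the returned dictionary. The names ``sampler`` and ``ret`` below are
# hypothetical placeholders.
#
#     ret = sampler.collect_sample(action)
#     if ret["done"]:
#         # On reset, "obs" holds (last obs of old rollout, first obs of new).
#         last_obs, first_obs = ret["obs"]
#     else:
#         next_obs = ret["obs"]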
def _evaluate(self, total_steps, env):
    """Perform the evaluation operation.

    This method runs the evaluation environment for a number of episodes
    and returns the cumulative rewards and successes from each episode.

    Parameters
    ----------
    total_steps : int
        the total number of samples to train on
    env : gym.Env
        the evaluation environment that the policy is meant to be tested
        on

    Returns
    -------
    list of float
        the list of cumulative rewards from every episode in the
        evaluation phase
    list of bool
        a list of boolean terms representing if each episode ended in
        success or not. If the list is empty, then the environment did
        not output successes or failures, and the success rate will be
        set to zero.
    dict
        additional information that is meant to be logged
    """
    num_steps = deepcopy(self.total_steps)
    eval_episode_rewards = []
    eval_episode_successes = []
    ret_info = {'initial': [], 'final': [], 'average': []}

    if self.verbose >= 1:
        for _ in range(3):
            print("-------------------")
        print("Running evaluation for {} episodes:".format(
            self.nb_eval_episodes))

    # Clear replay buffer-related memory in the policy to allow the
    # meta-actions to be properly updated.
    if is_goal_conditioned_policy(self.policy):
        self.policy_tf.clear_memory()

    for i in range(self.nb_eval_episodes):
        # Reset the environment.
        eval_obs = env.reset()
        eval_obs, eval_all_obs = get_obs(eval_obs)

        # Add the fingerprint term, if needed.
        eval_obs = add_fingerprint(
            obs=eval_obs,
            steps=self.total_steps,
            total_steps=total_steps,
            use_fingerprints=self.policy_kwargs.get(
                "use_fingerprints", False),
        )

        # Reset rollout-specific variables.
        eval_episode_reward = 0.
        eval_episode_step = 0
        rets = np.array([])

        while True:
            # Collect the contextual term. None if it is not passed.
            context = [env.current_context] \
                if hasattr(env, "current_context") else None

            eval_action = self._policy(
                obs=eval_obs,
                context=context,
                apply_noise=not self.eval_deterministic,
                random_actions=False,
            )

            # Update the environment.
            obs, eval_r, done, info = env.step(eval_action)
            obs, all_obs = get_obs(obs)

            # Visualize the current step.
            if self.render_eval:
                self.eval_env.render()  # pragma: no cover

            # Add the distance to this list for logging purposes (applies
            # only to the Ant* environments).
            if hasattr(env, "current_context"):
                context = getattr(env, "current_context")
                reward_fn = getattr(env, "contextual_reward")
                rets = np.append(rets, reward_fn(eval_obs, context, obs))

            # Get the contextual term.
            context0 = context1 = getattr(env, "current_context", None)

            # Store a transition in the replay buffer. This is just for
            # the purposes of calling features in the store_transition
            # method of the policy.
            self._store_transition(
                obs0=eval_obs,
                context0=context0,
                action=eval_action,
                reward=eval_r,
                obs1=obs,
                context1=context1,
                terminal1=False,
                is_final_step=False,
                all_obs0=eval_all_obs,
                all_obs1=all_obs,
                evaluate=True,
            )

            # Update the previous step observation.
            eval_obs = obs.copy()
            eval_all_obs = all_obs

            # Add the fingerprint term, if needed.
            eval_obs = add_fingerprint(
                obs=eval_obs,
                steps=self.total_steps,
                total_steps=total_steps,
                use_fingerprints=self.policy_kwargs.get(
                    "use_fingerprints", False),
            )

            # Increment the reward and step count.
            num_steps += 1
            eval_episode_reward += eval_r
            eval_episode_step += 1

            if done:
                eval_episode_rewards.append(eval_episode_reward)
                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    eval_episode_successes.append(float(maybe_is_success))

                if self.verbose >= 1:
                    if rets.shape[0] > 0:
                        print("%d/%d: initial: %.3f, final: %.3f, average:"
                              " %.3f, success: %d"
                              % (i + 1, self.nb_eval_episodes, rets[0],
                                 rets[-1], float(rets.mean()),
                                 int(info.get('is_success'))))
                    else:
                        print("%d/%d" % (i + 1, self.nb_eval_episodes))

                if hasattr(env, "current_context"):
                    ret_info['initial'].append(rets[0])
                    ret_info['final'].append(rets[-1])
                    ret_info['average'].append(float(rets.mean()))

                # Exit the loop.
                break

    if self.verbose >= 1:
        print("Done.")
        print("Average return: {}".format(np.mean(eval_episode_rewards)))
        if len(eval_episode_successes) > 0:
            print("Success rate: {}".format(
                np.mean(eval_episode_successes)))
        for _ in range(3):
            print("-------------------")
        print("")

    # Get the average of the reward information.
    ret_info['initial'] = np.mean(ret_info['initial'])
    ret_info['final'] = np.mean(ret_info['final'])
    ret_info['average'] = np.mean(ret_info['average'])

    # Clear replay buffer-related memory in the policy once again so that
    # it does not affect the training procedure.
    if is_goal_conditioned_policy(self.policy):
        self.policy_tf.clear_memory()

    return eval_episode_rewards, eval_episode_successes, ret_info
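# Illustrative usage sketch (hypothetical caller, not part of this class):
# the evaluation results are typically aggregated before logging. ``alg`` and
# ``eval_env`` are placeholder names.
#
#     rewards, successes, ret_info = alg._evaluate(total_steps, eval_env)
#     mean_return = np.mean(rewards)
#     # If the environment never reports "is_success", the list is empty and
#     # the success rate defaults to zero.
#     success_rate = np.mean(successes) if len(successes) > 0 else 0.0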
def setup_sampler(self, env, render, shared, maddpg):
    """Create the environment and collect the initial observations.

    Parameters
    ----------
    env : str
        the name of the environment
    render : bool
        whether to render the environment
    shared : bool
        specifies whether agents in an environment are meant to share
        policies. This is solely used by multi-agent Flow environments.
    maddpg : bool
        whether to use an environment variant that is compatible with the
        MADDPG algorithm

    Returns
    -------
    list of Sampler or list of RaySampler
        the sampler objects
    list of array_like or list of dict < str, array_like >
        the initial observation. If the environment is multi-agent, this
        will be a dictionary of observations for each agent, indexed by
        the agent ID. One element for each environment.
    list of array_like or list of None
        additional information, used by MADDPG variants of the multi-agent
        policy to pass full-state information. One element for each
        environment.
    """
    if self.num_envs > 1:
        from hbaselines.utils.sampler import RaySampler
        sampler = [
            RaySampler.remote(
                env_name=env,
                render=render,
                shared=shared,
                maddpg=maddpg,
                env_num=env_num,
                evaluate=False,
            )
            for env_num in range(self.num_envs)
        ]
        ob = ray.get([s.get_init_obs.remote() for s in sampler])
    else:
        from hbaselines.utils.sampler import Sampler
        sampler = [
            Sampler(
                env_name=env,
                render=render,
                shared=shared,
                maddpg=maddpg,
                env_num=0,
                evaluate=False,
            )
        ]
        ob = [s.get_init_obs() for s in sampler]

    # Separate the observation and full-state observation.
    obs = [get_obs(o)[0] for o in ob]
    all_obs = [get_obs(o)[1] for o in ob]

    return sampler, obs, all_obs
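# Illustrative usage sketch (hypothetical caller): the returned lists are
# indexed by environment number, one entry per parallel environment. The
# environment name below is a placeholder.
#
#     sampler, obs, all_obs = self.setup_sampler(
#         env="HalfCheetah-v2",
#         render=False,
#         shared=False,
#         maddpg=False,
#     )
#     assert len(sampler) == len(obs) == len(all_obs)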
def __init__(self,
             policy,
             env,
             eval_env=None,
             nb_train_steps=1,
             nb_rollout_steps=1,
             nb_eval_episodes=50,
             actor_update_freq=2,
             meta_update_freq=10,
             reward_scale=1.,
             render=False,
             render_eval=False,
             eval_deterministic=True,
             verbose=0,
             policy_kwargs=None,
             _init_setup_model=True):
    """Instantiate the algorithm object.

    Parameters
    ----------
    policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
        the policy model to use
    env : gym.Env or str
        the environment to learn from (if registered in Gym, can be str)
    eval_env : gym.Env or str
        the environment to evaluate from (if registered in Gym, can be
        str)
    nb_train_steps : int
        the number of training steps
    nb_rollout_steps : int
        the number of rollout steps
    nb_eval_episodes : int
        the number of evaluation episodes
    actor_update_freq : int
        number of training steps per actor policy update step. The critic
        policy is updated every training step.
    meta_update_freq : int
        number of training steps per meta policy update step. The actor
        policy of the meta-policy is further updated at the frequency
        provided by the actor_update_freq variable. Note that this value
        is only relevant when using the GoalConditionedPolicy policy.
    reward_scale : float
        the value the reward should be scaled by
    render : bool
        enable rendering of the training environment
    render_eval : bool
        enable rendering of the evaluation environment
    eval_deterministic : bool
        if set to True, the policy provides deterministic actions to the
        evaluation environment. Otherwise, stochastic or noisy actions
        are returned.
    verbose : int
        the verbosity level: 0 none, 1 training information, 2 tensorflow
        debug
    policy_kwargs : dict
        policy-specific hyperparameters
    _init_setup_model : bool
        whether or not to build the network at the creation of the
        instance
    """
    shared = False if policy_kwargs is None else \
        policy_kwargs.get("shared", False)
    maddpg = False if policy_kwargs is None else \
        policy_kwargs.get("maddpg", False)

    self.policy = policy
    self.env_name = deepcopy(env) if isinstance(env, str) \
        else env.__str__()
    self.eval_env, _ = create_env(
        eval_env, render_eval, shared, maddpg, evaluate=True)
    self.nb_train_steps = nb_train_steps
    self.nb_rollout_steps = nb_rollout_steps
    self.nb_eval_episodes = nb_eval_episodes
    self.actor_update_freq = actor_update_freq
    self.meta_update_freq = meta_update_freq
    self.reward_scale = reward_scale
    self.render = render
    self.render_eval = render_eval
    self.eval_deterministic = eval_deterministic
    self.verbose = verbose
    self.policy_kwargs = {'verbose': verbose}

    # Create the training sampler and collect the initial observations.
    self.sampler = Sampler(
        env_name=env,
        render=render,
        shared=shared,
        maddpg=maddpg,
        evaluate=False,
    )
    self.obs, self.all_obs = get_obs(self.sampler.get_init_obs())

    # Collect the spaces of the environments.
    self.action_space = self.sampler.action_space()
    self.observation_space = self.sampler.observation_space()
    self.context_space = self.sampler.context_space()

    # Add the default policy kwargs to the policy_kwargs term.
    if is_feedforward_policy(policy):
        self.policy_kwargs.update(FEEDFORWARD_PARAMS.copy())
    elif is_goal_conditioned_policy(policy):
        self.policy_kwargs.update(GOAL_CONDITIONED_PARAMS.copy())
        self.policy_kwargs['env_name'] = self.env_name.__str__()
    elif is_multiagent_policy(policy):
        self.policy_kwargs.update(MULTI_FEEDFORWARD_PARAMS.copy())
        self.policy_kwargs["all_ob_space"] = \
            self.sampler.all_observation_space()

    if is_td3_policy(policy):
        self.policy_kwargs.update(TD3_PARAMS.copy())
    elif is_sac_policy(policy):
        self.policy_kwargs.update(SAC_PARAMS.copy())

    self.policy_kwargs.update(policy_kwargs or {})

    # Compute the time horizon, which is used to check if an environment
    # terminated early and used to compute the done mask for TD3.
    self.horizon = self.sampler.horizon()

    # init
    self.graph = None
    self.policy_tf = None
    self.sess = None
    self.summary = None
    self.episode_step = 0
    self.episodes = 0
    self.total_steps = 0
    self.epoch_episode_steps = []
    self.epoch_episode_rewards = []
    self.epoch_episodes = 0
    self.epoch = 0
    self.episode_rew_history = deque(maxlen=100)
    self.episode_reward = 0
    self.rew_ph = None
    self.rew_history_ph = None
    self.eval_rew_ph = None
    self.eval_success_ph = None
    self.saver = None

    if self.policy_kwargs.get("use_fingerprints", False):
        # Append the fingerprint dimension to the observation dimension.
        fingerprint_range = self.policy_kwargs["fingerprint_range"]
        low = np.concatenate(
            (self.observation_space.low, fingerprint_range[0]))
        high = np.concatenate(
            (self.observation_space.high, fingerprint_range[1]))
        self.observation_space = Box(low, high, dtype=np.float32)

        # Add the fingerprint term to the first observation.
        self.obs = add_fingerprint(self.obs, 0, 1, True)

    # Create the model variables and operations.
    if _init_setup_model:
        self.trainable_vars = self.setup_model()
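# Illustrative instantiation sketch (the enclosing class name is not shown in
# this excerpt; ``OffPolicyRLAlgorithm`` and ``FeedForwardPolicy`` are used
# here only as placeholders, and the environment name is arbitrary):
#
#     alg = OffPolicyRLAlgorithm(
#         policy=FeedForwardPolicy,
#         env="HalfCheetah-v2",
#         nb_rollout_steps=1,
#         nb_train_steps=1,
#         verbose=1,
#     )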
def collect_sample(self,
                   action,
                   multiagent,
                   steps,
                   total_steps,
                   use_fingerprints):
    """Perform the sample collection operation over a single step.

    This method is responsible for executing a single step of the
    environment. This is performed a number of times within the
    _collect_samples method before training is executed. The data from the
    rollouts is stored in the policy's replay buffer(s).

    Parameters
    ----------
    action : array_like
        the action to be performed by the agent(s) within the environment
    multiagent : bool
        whether the policy is multi-agent
    steps : int
        the total number of steps that have been executed since training
        began
    total_steps : int
        the total number of samples to train on. Used by the fingerprint
        element.
    use_fingerprints : bool
        specifies whether to add a time-dependent fingerprint to the
        observations

    Returns
    -------
    dict
        information from the most recent environment update step,
        consisting of the following terms:

        * obs : the most recent observation. This consists of a single
          observation if no reset occurred, and a tuple of (last
          observation from the previous rollout, first observation of the
          next rollout) if a reset occurred.
        * context : the contextual term from the environment
        * action : the action performed by the agent(s)
        * reward : the reward from the most recent step
        * done : the done mask
        * all_obs : the most recent full-state observation. This consists
          of a single observation if no reset occurred, and a tuple of
          (last observation from the previous rollout, first observation
          of the next rollout) if a reset occurred.
    """
    # Execute the next action.
    obs, reward, done, info = self.env.step(action)
    obs, all_obs = get_obs(obs)

    # Visualize the current step.
    if self._render:
        self.env.render()  # pragma: no cover

    # Get the contextual term.
    context = getattr(self.env, "current_context", None)

    # Add the fingerprint term to this observation, if needed.
    obs = add_fingerprint(obs, steps, total_steps, use_fingerprints)

    # The done mask for multi-agent policies is slightly different.
    if multiagent:
        done = done["__all__"]

    if done:
        # Reset the environment.
        reset_obs = self.env.reset()
        reset_obs, reset_all_obs = get_obs(reset_obs)

        # Add the fingerprint term to the reset observation, if needed.
        reset_obs = add_fingerprint(
            reset_obs, steps, total_steps, use_fingerprints)
    else:
        reset_obs = None
        reset_all_obs = None

    return {
        "obs": obs if not done else (obs, reset_obs),
        "context": context,
        "action": action,
        "reward": reward,
        "done": done,
        "all_obs": all_obs if not done else (all_obs, reset_all_obs),
    }
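# Illustrative rollout-loop sketch (hypothetical caller; ``sampler``,
# ``policy_fn``, and the loop variables are placeholders). It mirrors the
# docstring's note that _collect_samples invokes this method once per rollout
# step before training.
#
#     for _ in range(nb_rollout_steps):
#         action = policy_fn(obs)
#         ret = sampler.collect_sample(
#             action, multiagent, steps, total_steps, use_fingerprints)
#         # After a reset, the second element of "obs" is the first
#         # observation of the next rollout.
#         obs = ret["obs"][1] if ret["done"] else ret["obs"]
#         steps += 1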