def _train(self):
    """Perform the training operation.

    Through this method, the actor and critic networks are updated within
    the policy, and the summary information is logged to tensorboard.
    """
    for t_train in range(self.nb_train_steps):
        if is_goal_conditioned_policy(self.policy):
            # Specifies whether to update the meta actor and critic
            # policies, based on the meta and actor update frequencies.
            kwargs = {
                "update_meta": (self.total_steps + t_train) %
                self.meta_update_freq == 0,
                "update_meta_actor": (self.total_steps + t_train) %
                (self.meta_update_freq * self.actor_update_freq) == 0,
            }
        else:
            kwargs = {}

        # Specifies whether to update the actor policy, based on the actor
        # update frequency.
        update = (self.total_steps + t_train) % self.actor_update_freq == 0

        # Run a step of training from batch.
        _ = self.policy_tf.update(update_actor=update, **kwargs)
def _train(self):
    """Perform the training operation.

    Through this method, the actor and critic networks are updated within
    the policy, and the summary information is logged to tensorboard.
    """
    # Added to adjust the actor update frequency based on the rate at
    # which training occurs.
    total_steps = int(self.total_steps / self.nb_rollout_steps)

    if is_goal_conditioned_policy(self.policy):
        # Specifies whether to update the meta actor and critic policies,
        # based on the meta and actor update frequencies.
        kwargs = {
            "update_meta": total_steps % self.meta_update_freq == 0,
            "update_meta_actor": total_steps %
            (self.meta_update_freq * self.actor_update_freq) == 0,
        }
    else:
        kwargs = {}

    # Specifies whether to update the actor policy, based on the actor
    # update frequency.
    update = total_steps % self.actor_update_freq == 0

    # Run a step of training from batch.
    for _ in range(self.nb_train_steps):
        _ = self.policy_tf.update(update_actor=update, **kwargs)
def _train(self):
    """Perform the training operation.

    Through this method, the actor and critic networks are updated within
    the policy, and the summary information is logged to tensorboard.
    """
    for t_train in range(self.nb_train_steps):
        if is_goal_conditioned_policy(self.policy):
            # Specifies whether to update the meta actor and critic
            # policies, based on the meta and actor update frequencies.
            kwargs = {
                "update_meta": (self.total_steps + t_train) %
                self.meta_update_freq == 0,
                "update_meta_actor": (self.total_steps + t_train) %
                (self.meta_update_freq * self.actor_update_freq) == 0,
            }
        else:
            kwargs = {}

        # Specifies whether to update the actor policy, based on the actor
        # update frequency.
        update = (self.total_steps + t_train) % self.actor_update_freq == 0

        # Run a step of training from batch.
        critic_loss, actor_loss = self.policy_tf.update(
            update_actor=update, **kwargs)

        # Add actor and critic loss information for logging purposes.
        if isinstance(critic_loss, tuple):
            # For hierarchical policies.
            # TODO: modify for Manager/Worker paradigm
            self.epoch_q1_losses.append(
                critic_loss[0][0] + critic_loss[0][1])
            self.epoch_q2_losses.append(
                critic_loss[1][0] + critic_loss[1][1])
        else:
            # For non-hierarchical policies.
            self.epoch_q1_losses.append(critic_loss[0])
            self.epoch_q2_losses.append(critic_loss[1])
        self.epoch_actor_losses.append(actor_loss)
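# The three _train variants above differ only in how the step counter driving
# the update schedule is derived; the schedule itself is plain modular
# arithmetic. Below is a minimal, self-contained sketch (not library code; the
# frequency values are hypothetical) of how the booleans behave over time.
actor_update_freq = 2
meta_update_freq = 10

for step in range(20):
    update_actor = step % actor_update_freq == 0
    update_meta = step % meta_update_freq == 0
    update_meta_actor = step % (meta_update_freq * actor_update_freq) == 0
    # The critic is updated every step, the actor every other step, the meta
    # critic every 10 steps, and the meta actor every 20 steps.
    print(step, update_actor, update_meta, update_meta_actor)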
def get_hyperparameters(args, policy):
    """Return the hyperparameters of a training algorithm from the parser."""
    algorithm_params = {
        "nb_train_steps": args.nb_train_steps,
        "nb_rollout_steps": args.nb_rollout_steps,
        "nb_eval_episodes": args.nb_eval_episodes,
        "actor_update_freq": args.actor_update_freq,
        "meta_update_freq": args.meta_update_freq,
        "reward_scale": args.reward_scale,
        "render": args.render,
        "render_eval": args.render_eval,
        "save_replay_buffer": args.save_replay_buffer,
        "verbose": args.verbose,
        "num_envs": args.num_envs,
        "_init_setup_model": True,
    }

    # add FeedForwardPolicy parameters
    policy_kwargs = {
        "l2_penalty": args.l2_penalty,
        "model_params": {
            "model_type": getattr(args, "model_params:model_type"),
            "layer_norm": getattr(args, "model_params:layer_norm"),
            "ignore_image": getattr(args, "model_params:ignore_image"),
            "image_height": getattr(args, "model_params:image_height"),
            "image_width": getattr(args, "model_params:image_width"),
            "image_channels": getattr(args, "model_params:image_channels"),
            "ignore_flat_channels":
                getattr(args, "model_params:ignore_flat_channels") or
                FEEDFORWARD_PARAMS["model_params"]["ignore_flat_channels"],
            "filters":
                getattr(args, "model_params:filters") or
                FEEDFORWARD_PARAMS["model_params"]["filters"],
            "kernel_sizes":
                getattr(args, "model_params:kernel_sizes") or
                FEEDFORWARD_PARAMS["model_params"]["kernel_sizes"],
            "strides":
                getattr(args, "model_params:strides") or
                FEEDFORWARD_PARAMS["model_params"]["strides"],
            "layers":
                getattr(args, "model_params:layers") or
                FEEDFORWARD_PARAMS["model_params"]["layers"],
        }
    }

    # add TD3 parameters
    if is_td3_policy(policy):
        policy_kwargs.update({
            "buffer_size": args.buffer_size,
            "batch_size": args.batch_size,
            "actor_lr": args.actor_lr,
            "critic_lr": args.critic_lr,
            "tau": args.tau,
            "gamma": args.gamma,
            "use_huber": args.use_huber,
            "noise": args.noise,
            "target_policy_noise": args.target_policy_noise,
            "target_noise_clip": args.target_noise_clip,
        })

    # add SAC parameters
    if is_sac_policy(policy):
        policy_kwargs.update({
            "buffer_size": args.buffer_size,
            "batch_size": args.batch_size,
            "actor_lr": args.actor_lr,
            "critic_lr": args.critic_lr,
            "tau": args.tau,
            "gamma": args.gamma,
            "use_huber": args.use_huber,
            "target_entropy": args.target_entropy,
        })

    # add PPO parameters
    if is_ppo_policy(policy):
        policy_kwargs.update({
            "learning_rate": args.learning_rate,
            "n_minibatches": args.n_minibatches,
            "n_opt_epochs": args.n_opt_epochs,
            "gamma": args.gamma,
            "lam": args.lam,
            "ent_coef": args.ent_coef,
            "vf_coef": args.vf_coef,
            "max_grad_norm": args.max_grad_norm,
            "cliprange": args.cliprange,
            "cliprange_vf": args.cliprange_vf,
        })

    # add GoalConditionedPolicy parameters
    if is_goal_conditioned_policy(policy):
        policy_kwargs.update({
            "num_levels": args.num_levels,
            "meta_period": args.meta_period,
            "intrinsic_reward_type": args.intrinsic_reward_type,
            "intrinsic_reward_scale": args.intrinsic_reward_scale,
            "relative_goals": args.relative_goals,
            "off_policy_corrections": args.off_policy_corrections,
            "hindsight": args.hindsight,
            "subgoal_testing_rate": args.subgoal_testing_rate,
            "cooperative_gradients": args.cooperative_gradients,
            "cg_weights": args.cg_weights,
            "cg_delta": args.cg_delta,
            "pretrain_worker": args.pretrain_worker,
            "pretrain_path": args.pretrain_path,
            "pretrain_ckpt": args.pretrain_ckpt,
        })

    # add MultiActorCriticPolicy parameters
    if is_multiagent_policy(policy):
        policy_kwargs.update({
            "shared": args.shared,
            "maddpg": args.maddpg,
            "n_agents": args.n_agents,
        })

    # add the policy_kwargs term to the algorithm parameters
    algorithm_params['policy_kwargs'] = policy_kwargs

    return algorithm_params
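# A self-contained sketch (assumed values, not library code) of the fallback
# pattern used for the "model_params:*" arguments above: each attribute is
# read with getattr, and a value that parses to None or an empty list falls
# back to the package defaults. "DEFAULTS" below stands in for
# FEEDFORWARD_PARAMS["model_params"].
from argparse import Namespace

DEFAULTS = {"layers": [256, 256], "filters": [16, 16, 16]}

args = Namespace()
setattr(args, "model_params:layers", None)       # flag left unset on the CLI
setattr(args, "model_params:filters", [32, 32])  # flag provided on the CLI

layers = getattr(args, "model_params:layers") or DEFAULTS["layers"]
filters = getattr(args, "model_params:filters") or DEFAULTS["filters"]

print(layers)   # [256, 256] -> fell back to the package default
print(filters)  # [32, 32]   -> taken from the parsed arguments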
def _evaluate(self, total_timesteps, env):
    """Perform the evaluation operation.

    This method runs the evaluation environment for a number of episodes
    and returns the cumulative rewards and successes from each
    environment.

    Parameters
    ----------
    total_timesteps : int
        the total number of samples to train on
    env : gym.Env
        the evaluation environment that the policy is meant to be tested
        on

    Returns
    -------
    list of float
        the list of cumulative rewards from every episode in the
        evaluation phase
    list of bool
        a list of boolean terms representing if each episode ended in
        success or not. If the list is empty, then the environment did not
        output successes or failures, and the success rate will be set to
        zero.
    dict
        additional information that is meant to be logged
    """
    num_steps = deepcopy(self.total_steps)
    eval_episode_rewards = []
    eval_episode_successes = []
    ret_info = {'initial': [], 'final': [], 'average': []}

    if self.verbose >= 1:
        for _ in range(3):
            print("-------------------")
        print("Running evaluation for {} episodes:".format(
            self.nb_eval_episodes))

    # Clear replay buffer-related memory in the policy to allow the
    # meta-actions to be properly updated.
    if is_goal_conditioned_policy(self.policy):
        self.policy_tf.clear_memory()

    for i in range(self.nb_eval_episodes):
        # Reset the environment.
        eval_obs = env.reset()
        eval_obs, eval_all_obs = self._get_obs(eval_obs)

        # Add the fingerprint term, if needed.
        eval_obs = self._add_fingerprint(
            eval_obs, self.total_steps, total_timesteps)

        # Reset rollout-specific variables.
        eval_episode_reward = 0.
        eval_episode_step = 0

        rets = np.array([])
        while True:
            # Collect the contextual term. None if it is not passed.
            context = [env.current_context] \
                if hasattr(env, "current_context") else None

            eval_action = self._policy(
                eval_obs,
                context,
                apply_noise=not self.eval_deterministic,
                random_actions=False,
            )

            obs, eval_r, done, info = env.step(eval_action)
            obs, all_obs = self._get_obs(obs)

            # Visualize the current step.
            if self.render_eval:
                self.eval_env.render()  # pragma: no cover

            # Add the distance to this list for logging purposes (applies
            # only to the Ant* environments).
            if hasattr(env, "current_context"):
                context = getattr(env, "current_context")
                reward_fn = getattr(env, "contextual_reward")
                rets = np.append(rets, reward_fn(eval_obs, context, obs))

            # Get the contextual term.
            context0 = context1 = getattr(env, "current_context", None)

            # Store a transition in the replay buffer. This is just for
            # the purpose of calling features in the store_transition
            # method of the policy.
            self._store_transition(
                obs0=eval_obs,
                context0=context0,
                action=eval_action,
                reward=eval_r,
                obs1=obs,
                context1=context1,
                terminal1=False,
                is_final_step=False,
                all_obs0=eval_all_obs,
                all_obs1=all_obs,
                evaluate=True,
            )

            # Update the previous step observation.
            eval_obs = obs.copy()
            eval_all_obs = all_obs

            # Add the fingerprint term, if needed.
            eval_obs = self._add_fingerprint(
                eval_obs, self.total_steps, total_timesteps)

            # Increment the reward and step count.
            num_steps += 1
            eval_episode_reward += eval_r
            eval_episode_step += 1

            if done:
                eval_episode_rewards.append(eval_episode_reward)
                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    eval_episode_successes.append(float(maybe_is_success))

                if self.verbose >= 1:
                    if rets.shape[0] > 0:
                        print("%d/%d: initial: %.3f, final: %.3f, average:"
                              " %.3f, success: %d"
                              % (i + 1, self.nb_eval_episodes, rets[0],
                                 rets[-1], float(rets.mean()),
                                 int(info.get('is_success'))))
                    else:
                        print("%d/%d" % (i + 1, self.nb_eval_episodes))

                if hasattr(env, "current_context"):
                    ret_info['initial'].append(rets[0])
                    ret_info['final'].append(rets[-1])
                    ret_info['average'].append(float(rets.mean()))

                # Exit the loop.
                break

    if self.verbose >= 1:
        print("Done.")
        print("Average return: {}".format(np.mean(eval_episode_rewards)))
        if len(eval_episode_successes) > 0:
            print("Success rate: {}".format(
                np.mean(eval_episode_successes)))
        for _ in range(3):
            print("-------------------")
        print("")

    # get the average of the reward information
    ret_info['initial'] = np.mean(ret_info['initial'])
    ret_info['final'] = np.mean(ret_info['final'])
    ret_info['average'] = np.mean(ret_info['average'])

    # Clear replay buffer-related memory in the policy once again so that
    # it does not affect the training procedure.
    if is_goal_conditioned_policy(self.policy):
        self.policy_tf.clear_memory()

    return eval_episode_rewards, eval_episode_successes, ret_info
def __init__(self,
             policy,
             env,
             eval_env=None,
             nb_train_steps=1,
             nb_rollout_steps=1,
             nb_eval_episodes=50,
             actor_update_freq=2,
             meta_update_freq=10,
             reward_scale=1.,
             render=False,
             render_eval=False,
             eval_deterministic=True,
             verbose=0,
             policy_kwargs=None,
             _init_setup_model=True):
    """Instantiate the algorithm object.

    Parameters
    ----------
    policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
        the policy model to use
    env : gym.Env or str
        the environment to learn from (if registered in Gym, can be str)
    eval_env : gym.Env or str
        the environment to evaluate from (if registered in Gym, can be
        str)
    nb_train_steps : int
        the number of training steps
    nb_rollout_steps : int
        the number of rollout steps
    nb_eval_episodes : int
        the number of evaluation episodes
    actor_update_freq : int
        number of training steps per actor policy update step. The critic
        policy is updated every training step.
    meta_update_freq : int
        number of training steps per meta policy update step. The actor
        policy of the meta-policy is further updated at the frequency
        provided by the actor_update_freq variable. Note that this value
        is only relevant when using the GoalConditionedPolicy policy.
    reward_scale : float
        the value the reward should be scaled by
    render : bool
        enable rendering of the training environment
    render_eval : bool
        enable rendering of the evaluation environment
    eval_deterministic : bool
        if set to True, the policy provides deterministic actions to the
        evaluation environment. Otherwise, stochastic or noisy actions are
        returned.
    verbose : int
        the verbosity level: 0 none, 1 training information, 2 tensorflow
        debug
    policy_kwargs : dict
        policy-specific hyperparameters
    _init_setup_model : bool
        Whether or not to build the network at the creation of the
        instance
    """
    shared = False if policy_kwargs is None else \
        policy_kwargs.get("shared", False)
    maddpg = False if policy_kwargs is None else \
        policy_kwargs.get("maddpg", False)

    self.policy = policy
    self.env_name = deepcopy(env) if isinstance(env, str) \
        else env.__str__()
    self.env = create_env(env, render, shared, maddpg, evaluate=False)
    self.eval_env = create_env(
        eval_env, render_eval, shared, maddpg, evaluate=True)
    self.nb_train_steps = nb_train_steps
    self.nb_rollout_steps = nb_rollout_steps
    self.nb_eval_episodes = nb_eval_episodes
    self.actor_update_freq = actor_update_freq
    self.meta_update_freq = meta_update_freq
    self.reward_scale = reward_scale
    self.render = render
    self.render_eval = render_eval
    self.eval_deterministic = eval_deterministic
    self.verbose = verbose
    self.action_space = self.env.action_space
    self.observation_space = self.env.observation_space
    self.context_space = getattr(self.env, "context_space", None)
    self.policy_kwargs = {'verbose': verbose}

    # add the default policy kwargs to the policy_kwargs term
    if is_feedforward_policy(policy):
        self.policy_kwargs.update(FEEDFORWARD_PARAMS.copy())
    elif is_goal_conditioned_policy(policy):
        self.policy_kwargs.update(GOAL_CONDITIONED_PARAMS.copy())
        self.policy_kwargs['env_name'] = self.env_name.__str__()
    elif is_multiagent_policy(policy):
        self.policy_kwargs.update(MULTI_FEEDFORWARD_PARAMS.copy())
        self.policy_kwargs["all_ob_space"] = getattr(
            self.env, "all_observation_space",
            Box(-1, 1, (1,), dtype=np.float32))

    if is_td3_policy(policy):
        self.policy_kwargs.update(TD3_PARAMS.copy())
    elif is_sac_policy(policy):
        self.policy_kwargs.update(SAC_PARAMS.copy())

    self.policy_kwargs.update(policy_kwargs or {})

    # Compute the time horizon, which is used to check if an environment
    # terminated early and used to compute the done mask as per
    # the TD3 implementation (see appendix A of their paper). If the
    # horizon attribute cannot be found, an error is raised.
    if hasattr(self.env, "horizon"):
        self.horizon = self.env.horizon
    elif hasattr(self.env, "_max_episode_steps"):
        self.horizon = self.env._max_episode_steps
    elif hasattr(self.env, "env_params"):
        # for Flow environments
        self.horizon = self.env.env_params.horizon
    else:
        raise ValueError("Horizon attribute not found.")

    # init
    self.graph = None
    self.policy_tf = None
    self.sess = None
    self.summary = None
    self.obs = None
    self.all_obs = None
    self.episode_step = 0
    self.episodes = 0
    self.total_steps = 0
    self.epoch_episode_steps = []
    self.epoch_episode_rewards = []
    self.epoch_episodes = 0
    self.epoch = 0
    self.episode_rew_history = deque(maxlen=100)
    self.episode_reward = 0
    self.rew_ph = None
    self.rew_history_ph = None
    self.eval_rew_ph = None
    self.eval_success_ph = None
    self.saver = None

    # Append the fingerprint dimension to the observation dimension, if
    # needed.
    if self.policy_kwargs.get("use_fingerprints", False):
        fingerprint_range = self.policy_kwargs["fingerprint_range"]
        low = np.concatenate(
            (self.observation_space.low, fingerprint_range[0]))
        high = np.concatenate(
            (self.observation_space.high, fingerprint_range[1]))
        self.observation_space = Box(low=low, high=high, dtype=np.float32)

    # Create the model variables and operations.
    if _init_setup_model:
        self.trainable_vars = self.setup_model()
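# A minimal, self-contained sketch (not library code; the variable names are
# hypothetical) of the bookkeeping the horizon is used for. Following the TD3
# paper (appendix A), a transition that ends an episode only because the time
# limit was reached should not be treated as a true terminal state when
# bootstrapping the critic target.
horizon = 500
episode_step = 499   # final step index within a hypothetical episode
env_done = True      # the environment reported done on this step

is_final_step = episode_step >= horizon - 1

# Done mask used in the critic target: 0 if the episode was cut off by the
# time limit (keep bootstrapping from the next state), 1 if it terminated for
# an environment-specific reason.
done_mask = float(env_done and not is_final_step)
print(done_mask)     # 0.0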
def get_hyperparameters(args, policy):
    """Return the hyperparameters of a training algorithm from the parser."""
    algorithm_params = {
        "nb_train_steps": args.nb_train_steps,
        "nb_rollout_steps": args.nb_rollout_steps,
        "nb_eval_episodes": args.nb_eval_episodes,
        "actor_update_freq": args.actor_update_freq,
        "meta_update_freq": args.meta_update_freq,
        "reward_scale": args.reward_scale,
        "render": args.render,
        "render_eval": args.render_eval,
        "verbose": args.verbose,
        "num_envs": args.num_envs,
        "_init_setup_model": True,
    }

    # add FeedForwardPolicy parameters
    policy_kwargs = {
        "buffer_size": args.buffer_size,
        "batch_size": args.batch_size,
        "actor_lr": args.actor_lr,
        "critic_lr": args.critic_lr,
        "tau": args.tau,
        "gamma": args.gamma,
        "layer_norm": args.layer_norm,
        "use_huber": args.use_huber,
    }

    # add TD3 parameters
    if is_td3_policy(policy):
        policy_kwargs.update({
            "noise": args.noise,
            "target_policy_noise": args.target_policy_noise,
            "target_noise_clip": args.target_noise_clip,
        })

    # add SAC parameters
    if is_sac_policy(policy):
        policy_kwargs.update({
            "target_entropy": args.target_entropy,
        })

    # add GoalConditionedPolicy parameters
    if is_goal_conditioned_policy(policy):
        policy_kwargs.update({
            "num_levels": args.num_levels,
            "meta_period": args.meta_period,
            "intrinsic_reward_type": args.intrinsic_reward_type,
            "intrinsic_reward_scale": args.intrinsic_reward_scale,
            "relative_goals": args.relative_goals,
            "off_policy_corrections": args.off_policy_corrections,
            "hindsight": args.hindsight,
            "subgoal_testing_rate": args.subgoal_testing_rate,
            "connected_gradients": args.connected_gradients,
            "cg_weights": args.cg_weights,
            "use_fingerprints": args.use_fingerprints,
            "centralized_value_functions":
                args.centralized_value_functions,
        })

    # add MultiFeedForwardPolicy parameters
    if is_multiagent_policy(policy):
        policy_kwargs.update({
            "shared": args.shared,
            "maddpg": args.maddpg,
        })

    # add the policy_kwargs term to the algorithm parameters
    algorithm_params['policy_kwargs'] = policy_kwargs

    return algorithm_params
def __init__(self,
             policy,
             env,
             eval_env=None,
             nb_train_steps=1,
             nb_rollout_steps=1,
             nb_eval_episodes=50,
             actor_update_freq=2,
             meta_update_freq=10,
             reward_scale=1.,
             render=False,
             render_eval=False,
             eval_deterministic=True,
             save_replay_buffer=False,
             num_envs=1,
             verbose=0,
             policy_kwargs=None,
             _init_setup_model=True):
    """Instantiate the algorithm object.

    Parameters
    ----------
    policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
        the policy model to use
    env : gym.Env or str
        the environment to learn from (if registered in Gym, can be str)
    eval_env : gym.Env or str
        the environment to evaluate from (if registered in Gym, can be
        str)
    nb_train_steps : int
        the number of training steps
    nb_rollout_steps : int
        the number of rollout steps
    nb_eval_episodes : int
        the number of evaluation episodes
    actor_update_freq : int
        number of training steps per actor policy update step. The critic
        policy is updated every training step.
    meta_update_freq : int
        number of training steps per meta policy update step. The actor
        policy of the meta-policy is further updated at the frequency
        provided by the actor_update_freq variable. Note that this value
        is only relevant when using the GoalConditionedPolicy policy.
    reward_scale : float
        the value the reward should be scaled by
    render : bool
        enable rendering of the training environment
    render_eval : bool
        enable rendering of the evaluation environment
    eval_deterministic : bool
        if set to True, the policy provides deterministic actions to the
        evaluation environment. Otherwise, stochastic or noisy actions are
        returned.
    save_replay_buffer : bool
        whether to save the data from the replay buffer, at the frequency
        that the model is saved. Only the most recent replay buffer is
        stored.
    num_envs : int
        number of environments used to run simulations in parallel. Each
        environment is run on a separate CPU and uses the same policy as
        the rest. Must be less than or equal to nb_rollout_steps.
    verbose : int
        the verbosity level: 0 none, 1 training information, 2 tensorflow
        debug
    policy_kwargs : dict
        policy-specific hyperparameters
    _init_setup_model : bool
        Whether or not to build the network at the creation of the
        instance

    Raises
    ------
    AssertionError
        if num_envs > nb_rollout_steps
    """
    shared = False if policy_kwargs is None else \
        policy_kwargs.get("shared", False)
    maddpg = False if policy_kwargs is None else \
        policy_kwargs.get("maddpg", False)

    # Run assertions.
    assert num_envs <= nb_rollout_steps, \
        "num_envs must be less than or equal to nb_rollout_steps"

    # Instantiate the ray instance.
    if num_envs > 1:
        ray.init(num_cpus=num_envs + 1, ignore_reinit_error=True)

    self.policy = policy
    self.env_name = deepcopy(env) if isinstance(env, str) \
        else env.__str__()
    self.eval_env, _ = create_env(
        eval_env, render_eval, shared, maddpg, evaluate=True)
    self.nb_train_steps = nb_train_steps
    self.nb_rollout_steps = nb_rollout_steps
    self.nb_eval_episodes = nb_eval_episodes
    self.actor_update_freq = actor_update_freq
    self.meta_update_freq = meta_update_freq
    self.reward_scale = reward_scale
    self.render = render
    self.render_eval = render_eval
    self.eval_deterministic = eval_deterministic
    self.save_replay_buffer = save_replay_buffer
    self.num_envs = num_envs
    self.verbose = verbose
    self.policy_kwargs = {'verbose': verbose}

    # Create the environment and collect the initial observations.
    self.sampler, self.obs, self.all_obs = self.setup_sampler(
        env, render, shared, maddpg)

    # Collect the spaces of the environments.
    self.ac_space, self.ob_space, self.co_space, all_ob_space = \
        self.get_spaces()

    # Add the default policy kwargs to the policy_kwargs term.
    if is_feedforward_policy(policy):
        self.policy_kwargs.update(FEEDFORWARD_PARAMS.copy())
    if is_goal_conditioned_policy(policy):
        self.policy_kwargs.update(GOAL_CONDITIONED_PARAMS.copy())
        self.policy_kwargs['env_name'] = self.env_name.__str__()
        self.policy_kwargs['num_envs'] = num_envs
    if is_multiagent_policy(policy):
        self.policy_kwargs.update(MULTIAGENT_PARAMS.copy())
        self.policy_kwargs["all_ob_space"] = all_ob_space

    if is_td3_policy(policy):
        self.policy_kwargs.update(TD3_PARAMS.copy())
    elif is_sac_policy(policy):
        self.policy_kwargs.update(SAC_PARAMS.copy())

    self.policy_kwargs = recursive_update(
        self.policy_kwargs, policy_kwargs or {})

    # Compute the time horizon, which is used to check if an environment
    # terminated early and used to compute the done mask for TD3.
    if self.num_envs > 1:
        self.horizon = ray.get(self.sampler[0].horizon.remote())
    else:
        self.horizon = self.sampler[0].horizon()

    # init
    self.graph = None
    self.policy_tf = None
    self.sess = None
    self.summary = None
    self.episode_step = [0 for _ in range(num_envs)]
    self.episodes = 0
    self.total_steps = 0
    self.epoch_episode_steps = []
    self.epoch_episode_rewards = []
    self.epoch_episodes = 0
    self.epoch = 0
    self.episode_rew_history = deque(maxlen=100)
    self.episode_reward = [0 for _ in range(num_envs)]
    self.rew_ph = None
    self.rew_history_ph = None
    self.eval_rew_ph = None
    self.eval_success_ph = None
    self.saver = None

    # Create the model variables and operations.
    if _init_setup_model:
        self.trainable_vars = self.setup_model()
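# A self-contained sketch (assumed behaviour, not the library's actual
# recursive_update implementation) of what recursively merging the policy
# kwargs accomplishes: user-supplied values override the defaults without
# discarding untouched nested keys such as the rest of "model_params".
def recursive_update_sketch(base, new):
    """Merge ``new`` into ``base``, descending into nested dictionaries."""
    for key, value in new.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            recursive_update_sketch(base[key], value)
        else:
            base[key] = value
    return base


defaults = {
    "batch_size": 128,
    "model_params": {"layers": [256, 256], "layer_norm": False},
}
user_kwargs = {"model_params": {"layer_norm": True}}

print(recursive_update_sketch(defaults, user_kwargs))
# {'batch_size': 128,
#  'model_params': {'layers': [256, 256], 'layer_norm': True}}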