class TestHierReplayBuffer(unittest.TestCase):
    """Tests for the HierReplayBuffer object."""

    def setUp(self):
        self.replay_buffer = HierReplayBuffer(
            buffer_size=2,
            batch_size=1,
            meta_period=3,
            obs_dim=1,
            ac_dim=1,
            co_dim=1,
            goal_dim=1,
            num_levels=3,
        )

    def tearDown(self):
        del self.replay_buffer

    def test_buffer_size(self):
        """Validate the buffer_size output from the replay buffer."""
        self.assertEqual(self.replay_buffer.buffer_size, 2)

    def test_add_sample(self):
        """Test the `add` and `sample` methods of the replay buffer."""
        # Set the random seed.
        random.seed(0)

        obs_t = [
            np.array([0]), np.array([1]), np.array([2]), np.array([3]),
            np.array([4]), np.array([5]), np.array([6]), np.array([7]),
            np.array([8]), np.array([9])
        ]
        action_t = [
            [np.array([0]), np.array([1]), np.array([2]), np.array([3])],
            [np.array([0]), np.array([1]), np.array([2]), np.array([3]),
             np.array([4]), np.array([5]), np.array([6]), np.array([7]),
             np.array([8]), np.array([9])],
            [np.array([0]), np.array([1]), np.array([2]), np.array([3]),
             np.array([4]), np.array([5]), np.array([6]), np.array([7]),
             np.array([8]), np.array([9])],
        ]
        context_t = [np.array([0]), np.array([1])]
        reward_t = [[0], [0, 1, 2], [0, 1, 2, 3, 4, 5, 6, 7, 8]]
        done_t = [
            False, False, False, False, False, False, False, False, False
        ]

        # Add an element.
        self.replay_buffer.add(
            obs_t=obs_t,
            action_t=action_t,
            context_t=context_t,
            reward_t=reward_t,
            done_t=done_t,
        )

        # Check is_full in the False case.
        self.assertEqual(self.replay_buffer.is_full(), False)

        # Add an element.
        self.replay_buffer.add(
            obs_t=obs_t,
            action_t=action_t,
            context_t=context_t,
            reward_t=reward_t,
            done_t=done_t,
        )

        # Check is_full in the True case.
        self.assertEqual(self.replay_buffer.is_full(), True)

        # Check can_sample in the True case.
        self.assertEqual(self.replay_buffer.can_sample(), True)

        # Test the `sample` method.
        obs0, obs1, act, rew, done, _ = self.replay_buffer.sample(False)
        np.testing.assert_array_almost_equal(obs0[0], [[0, 0]])
        np.testing.assert_array_almost_equal(obs0[1], [[6, 2]])
        np.testing.assert_array_almost_equal(obs0[2], [[6, 6]])
        np.testing.assert_array_almost_equal(obs1[0], [[9, 1]])
        np.testing.assert_array_almost_equal(obs1[1], [[9, 3]])
        np.testing.assert_array_almost_equal(obs1[2], [[7, 7]])
        np.testing.assert_array_almost_equal(act[0], [[0]])
        np.testing.assert_array_almost_equal(act[1], [[6]])
        np.testing.assert_array_almost_equal(act[2], [[6]])
        np.testing.assert_array_almost_equal(rew[0], [0])
        np.testing.assert_array_almost_equal(rew[1], [2])
        np.testing.assert_array_almost_equal(rew[2], [6])
        np.testing.assert_array_almost_equal(done[0], [0])
        np.testing.assert_array_almost_equal(done[1], [0])
        np.testing.assert_array_almost_equal(done[2], [0])
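# ----------------------------------------------------------------------- #
# Illustrative sketch (not part of the test suite above). The setUp values
# `num_levels=3` and `meta_period=3` imply that, over a single 9-step
# worker trajectory, level i of the hierarchy is re-queried every
# meta_period ** (num_levels - 1 - i) steps. This appears to mirror the
# lengths of the `reward_t` lists used in `test_add_sample` (1, 3, and 9
# entries), but the helper below is only an assumption-based reading aid,
# not an API of HierReplayBuffer.
# ----------------------------------------------------------------------- #
def _decisions_per_level(num_levels=3, meta_period=3, horizon=9):
    """Return how many times each level acts during one meta-episode."""
    return [
        horizon // meta_period ** (num_levels - 1 - level)
        for level in range(num_levels)
    ]


# >>> _decisions_per_level()
# [1, 3, 9]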
class GoalConditionedPolicy(ActorCriticPolicy): r"""Goal-conditioned hierarchical reinforcement learning model. This policy is an implementation of the two-level hierarchy presented in [1], which itself is similar to the feudal networks formulation [2, 3]. This network consists of a high-level, or Manager, pi_{\theta_H} that computes and outputs goals g_t ~ pi_{\theta_H}(s_t, h) every `meta_period` time steps, and a low-level policy pi_{\theta_L} that takes as inputs the current state and the assigned goals and attempts to perform an action a_t ~ pi_{\theta_L}(s_t,g_t) that satisfies these goals. The Manager is rewarded based on the original environment reward function: r_H = r(s,a;h). The Target term, h, parameterizes the reward assigned to the Manager in order to allow the policy to generalize to several goals within a task, a technique that was first proposed by [4]. Finally, the Worker is motivated to follow the goals set by the Manager via an intrinsic reward based on the distance between the current observation and the goal observation: r_L (s_t, g_t, s_{t+1}) = -||s_t + g_t - s_{t+1}||_2 Bibliography: [1] Nachum, Ofir, et al. "Data-efficient hierarchical reinforcement learning." Advances in Neural Information Processing Systems. 2018. [2] Dayan, Peter, and Geoffrey E. Hinton. "Feudal reinforcement learning." Advances in neural information processing systems. 1993. [3] Vezhnevets, Alexander Sasha, et al. "Feudal networks for hierarchical reinforcement learning." Proceedings of the 34th International Conference on Machine Learning-Volume 70. JMLR. org, 2017. [4] Schaul, Tom, et al. "Universal value function approximators." International Conference on Machine Learning. 2015. Attributes ---------- manager : hbaselines.fcnet.base.ActorCriticPolicy the manager policy meta_period : int manger action period worker_reward_scale : float the value the intrinsic (Worker) reward should be scaled by relative_goals : bool specifies whether the goal issued by the Manager is meant to be a relative or absolute goal, i.e. specific state or change in state off_policy_corrections : bool whether to use off-policy corrections during the update procedure. See: https://arxiv.org/abs/1805.08296. hindsight : bool whether to use hindsight action and goal transitions, as well as subgoal testing. See: https://arxiv.org/abs/1712.00948 subgoal_testing_rate : float rate at which the original (non-hindsight) sample is stored in the replay buffer as well. Used only if `hindsight` is set to True. connected_gradients : bool whether to connect the graph between the manager and worker cg_weights : float weights for the gradients of the loss of the worker with respect to the parameters of the manager. Only used if `connected_gradients` is set to True. 
use_fingerprints : bool specifies whether to add a time-dependent fingerprint to the observations fingerprint_range : (list of float, list of float) the low and high values for each fingerprint element, if they are being used fingerprint_dim : tuple of int the shape of the fingerprint elements, if they are being used centralized_value_functions : bool specifies whether to use centralized value functions for the Manager critic functions prev_meta_obs : array_like previous observation by the Manager meta_action : array_like current action by the Manager meta_reward : float current meta reward, counting as the cumulative environment reward during the meta period batch_size : int SGD batch size worker : hbaselines.fcnet.base.ActorCriticPolicy the worker policy worker_reward_fn : function reward function for the worker """ def __init__(self, sess, ob_space, ac_space, co_space, buffer_size, batch_size, actor_lr, critic_lr, verbose, tau, gamma, layer_norm, layers, act_fun, use_huber, meta_period, worker_reward_scale, relative_goals, off_policy_corrections, hindsight, subgoal_testing_rate, connected_gradients, cg_weights, use_fingerprints, fingerprint_range, centralized_value_functions, env_name="", meta_policy=None, worker_policy=None, additional_params=None): """Instantiate the goal-conditioned hierarchical policy. Parameters ---------- sess : tf.compat.v1.Session the current TensorFlow session ob_space : gym.spaces.* the observation space of the environment ac_space : gym.spaces.* the action space of the environment co_space : gym.spaces.* the context space of the environment buffer_size : int the max number of transitions to store batch_size : int SGD batch size actor_lr : float actor learning rate critic_lr : float critic learning rate verbose : int the verbosity level: 0 none, 1 training information, 2 tensorflow debug tau : float target update rate gamma : float discount factor layer_norm : bool enable layer normalisation layers : list of int or None the size of the neural network for the policy act_fun : tf.nn.* the activation function to use in the neural network use_huber : bool specifies whether to use the huber distance function as the loss for the critic. If set to False, the mean-squared error metric is used instead meta_period : int manger action period worker_reward_scale : float the value the intrinsic (Worker) reward should be scaled by relative_goals : bool specifies whether the goal issued by the Manager is meant to be a relative or absolute goal, i.e. specific state or change in state off_policy_corrections : bool whether to use off-policy corrections during the update procedure. See: https://arxiv.org/abs/1805.08296 hindsight : bool whether to include hindsight action and goal transitions in the replay buffer. See: https://arxiv.org/abs/1712.00948 subgoal_testing_rate : float rate at which the original (non-hindsight) sample is stored in the replay buffer as well. Used only if `hindsight` is set to True. connected_gradients : bool whether to connect the graph between the manager and worker cg_weights : float weights for the gradients of the loss of the worker with respect to the parameters of the manager. Only used if `connected_gradients` is set to True. 
use_fingerprints : bool specifies whether to add a time-dependent fingerprint to the observations fingerprint_range : (list of float, list of float) the low and high values for each fingerprint element, if they are being used centralized_value_functions : bool specifies whether to use centralized value functions for the Manager and Worker critic functions meta_policy : type [ hbaselines.fcnet.base.ActorCriticPolicy ] the policy model to use for the Manager worker_policy : type [ hbaselines.fcnet.base.ActorCriticPolicy ] the policy model to use for the Worker additional_params : dict additional algorithm-specific policy parameters. Used internally by the class when instantiating other (child) policies. """ super(GoalConditionedPolicy, self).__init__(sess=sess, ob_space=ob_space, ac_space=ac_space, co_space=co_space, buffer_size=buffer_size, batch_size=batch_size, actor_lr=actor_lr, critic_lr=critic_lr, verbose=verbose, tau=tau, gamma=gamma, layer_norm=layer_norm, layers=layers, act_fun=act_fun, use_huber=use_huber) self.meta_period = meta_period self.worker_reward_scale = worker_reward_scale self.relative_goals = relative_goals self.off_policy_corrections = off_policy_corrections self.hindsight = hindsight self.subgoal_testing_rate = subgoal_testing_rate self.connected_gradients = connected_gradients self.cg_weights = cg_weights self.use_fingerprints = use_fingerprints self.fingerprint_range = fingerprint_range self.fingerprint_dim = (len(self.fingerprint_range[0]), ) self.centralized_value_functions = centralized_value_functions # Get the Manager's action space. manager_ac_space = get_manager_ac_space(ob_space, relative_goals, env_name, use_fingerprints, self.fingerprint_dim) # Manager observation size meta_ob_dim = self._get_ob_dim(ob_space, co_space) # Create the replay buffer. self.replay_buffer = HierReplayBuffer( buffer_size=int(buffer_size / meta_period), batch_size=batch_size, meta_period=meta_period, meta_obs_dim=meta_ob_dim[0], meta_ac_dim=manager_ac_space.shape[0], worker_obs_dim=ob_space.shape[0] + manager_ac_space.shape[0], worker_ac_dim=ac_space.shape[0], ) # Collect the state indices for the worker rewards. self.goal_indices = get_state_indices(ob_space, env_name, use_fingerprints, self.fingerprint_dim) # Utility method for indexing the goal out of an observation variable. self.crop_to_goal = lambda g: tf.gather( g, tf.tile(tf.expand_dims(np.array(self.goal_indices), 0), [self.batch_size, 1]), batch_dims=1, axis=1) # =================================================================== # # Part 1. Setup the Manager # # =================================================================== # # Create the Manager policy. with tf.compat.v1.variable_scope("Manager"): self.manager = meta_policy( sess=sess, ob_space=ob_space, ac_space=manager_ac_space, co_space=co_space, buffer_size=buffer_size, batch_size=batch_size, actor_lr=actor_lr, critic_lr=critic_lr, verbose=verbose, tau=tau, gamma=gamma, layer_norm=layer_norm, layers=layers, act_fun=act_fun, use_huber=use_huber, scope="Manager", zero_fingerprint=False, fingerprint_dim=self.fingerprint_dim[0], **(additional_params or {}), ) # a fixed goal transition function for the meta-actions in between meta # periods. This is used when relative_goals is set to True in order to # maintain a fixed absolute position of the goal. 
if relative_goals: def goal_transition_fn(obs0, goal, obs1): return obs0 + goal - obs1 else: def goal_transition_fn(obs0, goal, obs1): return goal self.goal_transition_fn = goal_transition_fn # previous observation by the Manager self.prev_meta_obs = None # current action by the Manager self.meta_action = None # current meta reward, counting as the cumulative environment reward # during the meta period self.meta_reward = None # The following is redundant but necessary if the changes to the update # function are to be in the GoalConditionedPolicy policy and not # FeedForwardPolicy. self.batch_size = batch_size # Use this to store a list of observations that stretch as long as the # dilated horizon chosen for the Manager. These observations correspond # to the s(t) in the HIRO paper. self._observations = [] # Use this to store the list of environmental actions that the worker # takes. These actions correspond to the a(t) in the HIRO paper. self._worker_actions = [] # rewards provided by the policy to the worker self._worker_rewards = [] # done masks at every time step for the worker self._dones = [] # actions performed by the manager during a given meta period. Used by # the replay buffer. self._meta_actions = [] # =================================================================== # # Part 2. Setup the Worker # # =================================================================== # # Create the Worker policy. with tf.compat.v1.variable_scope("Worker"): self.worker = worker_policy( sess, ob_space=ob_space, ac_space=ac_space, co_space=manager_ac_space, buffer_size=buffer_size, batch_size=batch_size, actor_lr=actor_lr, critic_lr=critic_lr, verbose=verbose, tau=tau, gamma=gamma, layer_norm=layer_norm, layers=layers, act_fun=act_fun, use_huber=use_huber, scope="Worker", zero_fingerprint=self.use_fingerprints, fingerprint_dim=self.fingerprint_dim[0], **(additional_params or {}), ) # reward function for the worker def worker_reward_fn(states, goals, next_states): return negative_distance(states=states, state_indices=self.goal_indices, goals=goals, next_states=next_states, relative_context=relative_goals, offset=0.0) self.worker_reward_fn = worker_reward_fn if self.connected_gradients: self._setup_connected_gradients() def initialize(self): """See parent class. This method calls the initialization methods of the manager and worker. """ self.manager.initialize() self.worker.initialize() self.meta_reward = 0 def update(self, update_actor=True, **kwargs): """Perform a gradient update step. This is done both at the level of the Manager and Worker policies. The kwargs argument for this method contains two additional terms: * update_meta (bool): specifies whether to perform a gradient update step for the meta-policy (i.e. Manager) * update_meta_actor (bool): similar to the `update_policy` term, but for the meta-policy. Note that, if `update_meta` is set to False, this term is void. **Note**; The target update soft updates for both the manager and the worker policies occur at the same frequency as their respective actor update frequencies. Parameters ---------- update_actor : bool specifies whether to update the actor policy. The critic policy is still updated if this value is set to False. Returns ------- ([float, float], [float, float]) manager critic loss, worker critic loss (float, float) manager actor loss, worker actor loss """ # Not enough samples in the replay buffer. 
if not self.replay_buffer.can_sample(): return ([0, 0], [0, 0]), (0, 0) # Specifies whether to remove additional data from the replay buffer # sampling procedure. Since only a subset of algorithms use additional # data, removing it can speedup the other algorithms. with_additional = self.off_policy_corrections # Get a batch. meta_obs0, meta_obs1, meta_act, meta_rew, meta_done, worker_obs0, \ worker_obs1, worker_act, worker_rew, worker_done, additional = \ self.replay_buffer.sample(with_additional=with_additional) # Update the Manager policy. if kwargs['update_meta']: # Replace the goals with the most likely goals. if self.off_policy_corrections: meta_act = self._sample_best_meta_action( meta_obs0=meta_obs0, meta_obs1=meta_obs1, meta_action=meta_act, worker_obses=additional["worker_obses"], worker_actions=additional["worker_actions"], k=8) if self.connected_gradients: # Perform the connected gradients update procedure. m_critic_loss, m_actor_loss = self._connected_gradients_update( obs0=meta_obs0, actions=meta_act, rewards=meta_rew, obs1=meta_obs1, terminals1=meta_done, update_actor=kwargs['update_meta_actor'], worker_obs0=worker_obs0, worker_obs1=worker_obs1, worker_actions=worker_act, ) else: # Perform the regular manager update procedure. m_critic_loss, m_actor_loss = self.manager.update_from_batch( obs0=meta_obs0, actions=meta_act, rewards=meta_rew, obs1=meta_obs1, terminals1=meta_done, update_actor=kwargs['update_meta_actor'], ) else: m_critic_loss, m_actor_loss = [0, 0], 0 # Update the Worker policy. w_critic_loss, w_actor_loss = self.worker.update_from_batch( obs0=worker_obs0, actions=worker_act, rewards=worker_rew, obs1=worker_obs1, terminals1=worker_done, update_actor=update_actor, ) return (m_critic_loss, w_critic_loss), (m_actor_loss, w_actor_loss) def get_action(self, obs, context, apply_noise, random_actions): """See parent class.""" if self._update_meta: # Update the meta action based on the output from the policy if the # time period requires is. self.meta_action = self.manager.get_action(obs, context, apply_noise, random_actions) else: # Update the meta-action in accordance with the fixed transition # function. self.meta_action = self.goal_transition_fn( obs0=np.asarray([self._observations[-1][self.goal_indices]]), goal=self.meta_action, obs1=obs[:, self.goal_indices]) # Return the worker action. worker_action = self.worker.get_action(obs, self.meta_action, apply_noise, random_actions) return worker_action def value(self, obs, context, action): """See parent class.""" return 0, 0 # FIXME def store_transition(self, obs0, context0, action, reward, obs1, context1, done, is_final_step, evaluate=False): """See parent class.""" # Compute the worker reward and append it to the list of rewards. self._worker_rewards.append( self.worker_reward_scale * self.worker_reward_fn(obs0, self.meta_action.flatten(), obs1)) # Add the environmental observations and done masks, and the manager # and worker actions to their respective lists. self._worker_actions.append(action) self._meta_actions.append(self.meta_action.flatten()) self._observations.append(self._get_obs(obs0, self.meta_action, 0)) # Modify the done mask in accordance with the TD3 algorithm. Done # masks that correspond to the final step are set to False. self._dones.append(done and not is_final_step) # Increment the meta reward with the most recent reward. self.meta_reward += reward # Modify the previous meta observation whenever the action has changed. 
if len(self._observations) == 1: self.prev_meta_obs = self._get_obs(obs0, context0, 0) # Add a sample to the replay buffer. if len(self._observations) == self.meta_period or done: # Add the last observation. self._observations.append(self._get_obs(obs1, self.meta_action, 0)) # Add the contextual observation to the most recent environmental # observation, if applicable. meta_obs1 = self._get_obs(obs1, context1, 0) # Avoid storing samples when performing evaluations. if not evaluate: if not self.hindsight \ or random.random() < self.subgoal_testing_rate: # Store a sample in the replay buffer. self.replay_buffer.add( obs_t=self._observations, goal_t=self._meta_actions[0], action_t=self._worker_actions, reward_t=self._worker_rewards, done=self._dones, meta_obs_t=(self.prev_meta_obs, meta_obs1), meta_reward_t=self.meta_reward, ) if self.hindsight: # Implement hindsight action and goal transitions. goal, obs, rewards = self._hindsight_actions_goals( meta_action=self.meta_action, initial_observations=self._observations, initial_rewards=self._worker_rewards) # Store the hindsight sample in the replay buffer. self.replay_buffer.add( obs_t=obs, goal_t=goal, action_t=self._worker_actions, reward_t=rewards, done=self._dones, meta_obs_t=(self.prev_meta_obs, meta_obs1), meta_reward_t=self.meta_reward, ) # Clear the worker rewards and actions, and the environmental # observation and reward. self.clear_memory() @property def _update_meta(self): """Return True if the meta-action should be updated by the policy. This is done by checking the length of the observation lists that are passed to the replay buffer, which are cleared whenever the meta-period has been met or the environment has been reset. """ return len(self._observations) == 0 def clear_memory(self): """Clear internal memory that is used by the replay buffer. By clearing memory, the Manager policy is then informed during the `get_action` procedure to update the meta-action. """ self.meta_reward = 0 self._observations = [] self._worker_actions = [] self._worker_rewards = [] self._dones = [] self._meta_actions = [] def get_td_map(self): """See parent class.""" # Not enough samples in the replay buffer. if not self.replay_buffer.can_sample(): return {} # Get a batch. meta_obs0, meta_obs1, meta_act, meta_rew, meta_done, worker_obs0, \ worker_obs1, worker_act, worker_rew, worker_done, _ = \ self.replay_buffer.sample() td_map = {} td_map.update( self.manager.get_td_map_from_batch(meta_obs0, meta_act, meta_rew, meta_obs1, meta_done)) td_map.update( self.worker.get_td_map_from_batch(worker_obs0, worker_act, worker_rew, worker_obs1, worker_done)) return td_map # ======================================================================= # # Auxiliary methods for HIRO # # ======================================================================= # def _sample_best_meta_action(self, meta_obs0, meta_obs1, meta_action, worker_obses, worker_actions, k=10): """Return meta-actions that approximately maximize low-level log-probs. 
Parameters ---------- meta_obs0 : array_like (batch_size, m_obs_dim) matrix of Manager observations meta_obs1 : array_like (batch_size, m_obs_dim) matrix of next time step Manager observations meta_action : array_like (batch_size, m_ac_dim) matrix of Manager actions worker_obses : array_like (batch_size, w_obs_dim, meta_period+1) matrix of current Worker state observations worker_actions : array_like (batch_size, w_ac_dim, meta_period) matrix of current Worker environmental actions k : int, optional number of goals returned, excluding the initial goal and the mean value Returns ------- array_like (batch_size, m_ac_dim) matrix of most likely Manager actions """ batch_size, goal_dim = meta_action.shape # Collect several samples of potentially optimal goals. sampled_actions = self._sample(meta_obs0, meta_obs1, meta_action, k) assert sampled_actions.shape == (batch_size, goal_dim, k) # Compute the fitness of each candidate goal. The fitness is the sum of # the log-probabilities of each action for the given goal. fitness = self._log_probs(sampled_actions, worker_obses, worker_actions) assert fitness.shape == (batch_size, k) # For each sample, choose the meta action that maximizes the fitness. indx = np.argmax(fitness, 1) best_goals = np.asarray( [sampled_actions[i, :, indx[i]] for i in range(batch_size)]) return best_goals def _sample(self, meta_obs0, meta_obs1, meta_action, num_samples, sc=0.5): """Sample different goals. The goals are sampled as follows: * The first num_samples-2 goals are acquired from a random Gaussian distribution centered at s_{t+c} - s_t. * The second to last goal is s_{t+c} - s_t. * The last goal is the originally sampled goal g_t. Parameters ---------- meta_obs0 : array_like (batch_size, m_obs_dim) matrix of Manager observations meta_obs1 : array_like (batch_size, m_obs_dim) matrix of next time step Manager observations meta_action : array_like (batch_size, m_ac_dim) matrix of Manager actions num_samples : int number of samples sc : float scaling factor for the normal distribution. Returns ------- array_like (batch_size, goal_dim, num_samples) matrix of sampled goals Helps ----- * _sample_best_meta_action(self) """ batch_size, goal_dim = meta_action.shape goal_space = self.manager.ac_space spec_range = goal_space.high - goal_space.low random_samples = num_samples - 2 # Compute the mean and std for the Gaussian distribution to sample # from, and well as the maxima and minima. loc = meta_obs1[:, self.goal_indices] - meta_obs0[:, self.goal_indices] scale = [sc * spec_range / 2] minimum, maximum = [goal_space.low], [goal_space.high] new_loc = np.zeros((batch_size, goal_dim, random_samples)) new_scale = np.zeros((batch_size, goal_dim, random_samples)) for i in range(random_samples): new_loc[:, :, i] = loc new_scale[:, :, i] = scale new_minimum = np.zeros((batch_size, goal_dim, num_samples)) new_maximum = np.zeros((batch_size, goal_dim, num_samples)) for i in range(num_samples): new_minimum[:, :, i] = minimum new_maximum[:, :, i] = maximum # Generate random samples for the above distribution. normal_samples = np.random.normal(size=(random_samples * batch_size * goal_dim)) normal_samples = normal_samples.reshape( (batch_size, goal_dim, random_samples)) samples = np.zeros((batch_size, goal_dim, num_samples)) samples[:, :, :-2] = new_loc + normal_samples * new_scale samples[:, :, -2] = loc samples[:, :, -1] = meta_action # Clip the values based on the Manager action space range. 
        samples = np.minimum(np.maximum(samples, new_minimum), new_maximum)

        return samples

    def _log_probs(self, meta_actions, worker_obses, worker_actions):
        """Calculate the log probability of the next goal by the Manager.

        Parameters
        ----------
        meta_actions : array_like
            (batch_size, m_ac_dim, num_samples) matrix of candidate Manager
            actions
        worker_obses : array_like
            (batch_size, w_obs_dim, meta_period + 1) matrix of Worker
            observations
        worker_actions : array_like
            (batch_size, w_ac_dim, meta_period) list of Worker actions

        Returns
        -------
        array_like
            (batch_size, num_samples) fitness associated with every state /
            action / goal pair

        Helps
        -----
        * _sample_best_meta_action(self):
        """
        raise NotImplementedError

    # ======================================================================= #
    #                       Auxiliary methods for HAC                         #
    # ======================================================================= #

    def _hindsight_actions_goals(self,
                                 meta_action,
                                 initial_observations,
                                 initial_rewards):
        """Calculate hindsight goal and action transitions.

        These are then stored in the replay buffer along with the original
        (non-hindsight) sample. See the README at the front page of this
        repository for an in-depth description of this procedure.

        Parameters
        ----------
        meta_action : array_like
            the original Manager actions (goal)
        initial_observations : array_like
            the original worker observations with the non-hindsight goals
            appended to them
        initial_rewards : array_like
            the original worker rewards

        Returns
        -------
        array_like
            the Manager action (goal) in hindsight
        array_like
            the modified Worker observations with the hindsight goals appended
            to them
        array_like
            the modified Worker rewards taking into account the hindsight
            goals

        Helps
        -----
        * store_transition(self):
        """
        goal_dim = meta_action.shape[0]
        observations = deepcopy(initial_observations)
        rewards = deepcopy(initial_rewards)
        hindsight_goal = 0 if self.relative_goals \
            else observations[-1][self.goal_indices]
        obs_tp1 = observations[-1]

        for i in range(1, len(observations) + 1):
            obs_t = observations[-i]

            # Calculate the hindsight goal when using relative goals. If not,
            # the hindsight goal is simply a subset of the final state
            # observation.
            if self.relative_goals:
                hindsight_goal += \
                    obs_tp1[self.goal_indices] - obs_t[self.goal_indices]

            # Modify the Worker intrinsic rewards based on the new hindsight
            # goal.
            if i > 1:
                rewards[-(i - 1)] = self.worker_reward_scale \
                    * self.worker_reward_fn(obs_t, hindsight_goal, obs_tp1)

            obs_tp1 = deepcopy(obs_t)

            # Replace the goal with the goal that the worker actually
            # achieved.
            observations[-i][-goal_dim:] = hindsight_goal

        return hindsight_goal, observations, rewards

    # ======================================================================= #
    #                      Auxiliary methods for HRL-CG                       #
    # ======================================================================= #

    def _setup_connected_gradients(self):
        """Create the updated manager optimization with connected gradients."""
        raise NotImplementedError

    def _connected_gradients_update(self,
                                    obs0,
                                    actions,
                                    rewards,
                                    obs1,
                                    terminals1,
                                    worker_obs0,
                                    worker_obs1,
                                    worker_actions,
                                    update_actor=True):
        """Perform the gradient update procedure for the HRL-CG algorithm.

        This procedure is similar to self.manager.update_from_batch, except it
        runs the self.cg_optimizer operation instead of self.manager.optimizer,
        and utilizes some information from the worker samples as well.

        Parameters
        ----------
        obs0 : np.ndarray
            batch of manager observations
        actions : numpy float
            batch of manager actions executed given obs_batch
        rewards : numpy float
            manager rewards received as results of executing act_batch
        obs1 : np.ndarray
            set of next manager observations seen after executing act_batch
        terminals1 : numpy bool
            done_mask[i] = 1 if executing act_batch[i] resulted in the end of
            an episode and 0 otherwise.
        worker_obs0 : array_like
            batch of worker observations
        worker_obs1 : array_like
            batch of next worker observations
        worker_actions : array_like
            batch of worker actions
        update_actor : bool
            specifies whether to update the actor policy of the manager. The
            critic policy is still updated if this value is set to False.

        Returns
        -------
        [float, float]
            manager critic loss
        float
            manager actor loss
        """
        raise NotImplementedError
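# ----------------------------------------------------------------------- #
# Illustrative sketch (an assumption-based aid, not part of the class
# above). It restates two formulas from the class docstring with plain
# NumPy: the relative-goal intrinsic reward
#     r_L(s_t, g_t, s_{t+1}) = -||s_t + g_t - s_{t+1}||_2
# and the fixed goal-transition function used between meta-periods,
#     g_{t+1} = s_t + g_t - s_{t+1},
# which keeps the absolute goal position s_t + g_t constant. The function
# names are hypothetical and exist only for this sketch.
# ----------------------------------------------------------------------- #
import numpy as np


def example_intrinsic_reward(obs_t, goal_t, obs_tp1):
    """Negative L2 distance between the relabeled goal and the next state."""
    return -float(np.linalg.norm(obs_t + goal_t - obs_tp1))


def example_goal_transition(obs_t, goal_t, obs_tp1):
    """Shift a relative goal so that the absolute target stays fixed."""
    return obs_t + goal_t - obs_tp1


# A two-step check that the absolute target is preserved:
# >>> s0, g0, s1 = np.array([0., 0.]), np.array([4., 0.]), np.array([1., 1.])
# >>> g1 = example_goal_transition(s0, g0, s1)      # array([ 3., -1.])
# >>> np.allclose(s0 + g0, s1 + g1)                 # True
# >>> example_intrinsic_reward(s0, g0, s1)          # -sqrt(10) ~ -3.162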
class GoalConditionedPolicy(ActorCriticPolicy): r"""Goal-conditioned hierarchical reinforcement learning model. FIXME This policy is an implementation of the two-level hierarchy presented in [1], which itself is similar to the feudal networks formulation [2, 3]. This network consists of a high-level, or Manager, pi_{\theta_H} that computes and outputs goals g_t ~ pi_{\theta_H}(s_t, h) every `meta_period` time steps, and a low-level policy pi_{\theta_L} that takes as inputs the current state and the assigned goals and attempts to perform an action a_t ~ pi_{\theta_L}(s_t,g_t) that satisfies these goals. The highest level policy is rewarded based on the original environment reward function: r_H = r(s,a;h). The Target term, h, parametrizes the reward assigned to the highest level policy in order to allow the policy to generalize to several goals within a task, a technique that was first proposed by [4]. Finally, the Worker is motivated to follow the goals set by the Manager via an intrinsic reward based on the distance between the current observation and the goal observation: r_L (s_t, g_t, s_{t+1}) = -||s_t + g_t - s_{t+1}||_2 Bibliography: [1] Nachum, Ofir, et al. "Data-efficient hierarchical reinforcement learning." Advances in Neural Information Processing Systems. 2018. [2] Dayan, Peter, and Geoffrey E. Hinton. "Feudal reinforcement learning." Advances in neural information processing systems. 1993. [3] Vezhnevets, Alexander Sasha, et al. "Feudal networks for hierarchical reinforcement learning." Proceedings of the 34th International Conference on Machine Learning-Volume 70. JMLR. org, 2017. [4] Schaul, Tom, et al. "Universal value function approximators." International Conference on Machine Learning. 2015. Attributes ---------- meta_period : int meta-policy action period intrinsic_reward_type : str the reward function to be used by the worker. Must be one of: * "negative_distance": the negative two norm between the states and desired absolute or relative goals. * "scaled_negative_distance": similar to the negative distance reward where the states, goals, and next states are scaled by the inverse of the action space of the manager policy * "non_negative_distance": the negative two norm between the states and desired absolute or relative goals offset by the maximum goal space (to ensure non-negativity) * "scaled_non_negative_distance": similar to the non-negative distance reward where the states, goals, and next states are scaled by the inverse of the action space of the manager policy * "exp_negative_distance": equal to exp(-negative_distance^2). The result is a reward between 0 and 1. This is useful for policies that terminate early. * "scaled_exp_negative_distance": similar to the previous worker reward type but with states, actions, and next states that are scaled. intrinsic_reward_scale : float the value that the intrinsic reward should be scaled by relative_goals : bool specifies whether the goal issued by the higher-level policies is meant to be a relative or absolute goal, i.e. specific state or change in state off_policy_corrections : bool whether to use off-policy corrections during the update procedure. See: https://arxiv.org/abs/1805.08296. hindsight : bool whether to use hindsight action and goal transitions, as well as subgoal testing. See: https://arxiv.org/abs/1712.00948 subgoal_testing_rate : float rate at which the original (non-hindsight) sample is stored in the replay buffer as well. Used only if `hindsight` is set to True. 
connected_gradients : bool whether to use the connected gradient update actor update procedure to the higher-level policy. See: https://arxiv.org/abs/1912.02368v1 cg_weights : float weights for the gradients of the loss of the lower-level policies with respect to the parameters of the higher-level policies. Only used if `connected_gradients` is set to True. use_fingerprints : bool specifies whether to add a time-dependent fingerprint to the observations fingerprint_range : (list of float, list of float) the low and high values for each fingerprint element, if they are being used fingerprint_dim : tuple of int the shape of the fingerprint elements, if they are being used centralized_value_functions : bool specifies whether to use centralized value functions policy : list of hbaselines.base_policies.ActorCriticPolicy a list of policy object for each level in the hierarchy, order from highest to lowest level policy replay_buffer : hbaselines.goal_conditioned.replay_buffer.HierReplayBuffer the replay buffer object goal_indices : list of int the state indices for the intrinsic rewards intrinsic_reward_fn : function reward function for the lower-level policies """ def __init__(self, sess, ob_space, ac_space, co_space, buffer_size, batch_size, actor_lr, critic_lr, verbose, tau, gamma, layer_norm, layers, act_fun, use_huber, num_levels, meta_period, intrinsic_reward_type, intrinsic_reward_scale, relative_goals, off_policy_corrections, hindsight, subgoal_testing_rate, connected_gradients, cg_weights, use_fingerprints, fingerprint_range, centralized_value_functions, env_name="", meta_policy=None, worker_policy=None, additional_params=None): """Instantiate the goal-conditioned hierarchical policy. Parameters ---------- sess : tf.compat.v1.Session the current TensorFlow session ob_space : gym.spaces.* the observation space of the environment ac_space : gym.spaces.* the action space of the environment co_space : gym.spaces.* the context space of the environment buffer_size : int the max number of transitions to store batch_size : int SGD batch size actor_lr : float actor learning rate critic_lr : float critic learning rate verbose : int the verbosity level: 0 none, 1 training information, 2 tensorflow debug tau : float target update rate gamma : float discount factor layer_norm : bool enable layer normalisation layers : list of int or None the size of the neural network for the policy act_fun : tf.nn.* the activation function to use in the neural network use_huber : bool specifies whether to use the huber distance function as the loss for the critic. If set to False, the mean-squared error metric is used instead num_levels : int number of levels within the hierarchy. Must be greater than 1. Two levels correspond to a Manager/Worker paradigm. meta_period : int meta-policy action period intrinsic_reward_type : str the reward function to be used by the worker. Must be one of: * "negative_distance": the negative two norm between the states and desired absolute or relative goals. 
* "scaled_negative_distance": similar to the negative distance reward where the states, goals, and next states are scaled by the inverse of the action space of the manager policy * "non_negative_distance": the negative two norm between the states and desired absolute or relative goals offset by the maximum goal space (to ensure non-negativity) * "scaled_non_negative_distance": similar to the non-negative distance reward where the states, goals, and next states are scaled by the inverse of the action space of the manager policy * "exp_negative_distance": equal to exp(-negative_distance^2). The result is a reward between 0 and 1. This is useful for policies that terminate early. * "scaled_exp_negative_distance": similar to the previous worker reward type but with states, actions, and next states that are scaled. intrinsic_reward_scale : float the value that the intrinsic reward should be scaled by relative_goals : bool specifies whether the goal issued by the higher-level policies is meant to be a relative or absolute goal, i.e. specific state or change in state off_policy_corrections : bool whether to use off-policy corrections during the update procedure. See: https://arxiv.org/abs/1805.08296 hindsight : bool whether to include hindsight action and goal transitions in the replay buffer. See: https://arxiv.org/abs/1712.00948 subgoal_testing_rate : float rate at which the original (non-hindsight) sample is stored in the replay buffer as well. Used only if `hindsight` is set to True. connected_gradients : bool whether to use the connected gradient update actor update procedure to the higher-level policy. See: https://arxiv.org/abs/1912.02368v1 cg_weights : float weights for the gradients of the loss of the lower-level policies with respect to the parameters of the higher-level policies. Only used if `connected_gradients` is set to True. use_fingerprints : bool specifies whether to add a time-dependent fingerprint to the observations fingerprint_range : (list of float, list of float) the low and high values for each fingerprint element, if they are being used centralized_value_functions : bool specifies whether to use centralized value functions meta_policy : type [ hbaselines.base_policies.ActorCriticPolicy ] the policy model to use for the meta policies worker_policy : type [ hbaselines.base_policies.ActorCriticPolicy ] the policy model to use for the worker policy additional_params : dict additional algorithm-specific policy parameters. Used internally by the class when instantiating other (child) policies. 
""" super(GoalConditionedPolicy, self).__init__(sess=sess, ob_space=ob_space, ac_space=ac_space, co_space=co_space, buffer_size=buffer_size, batch_size=batch_size, actor_lr=actor_lr, critic_lr=critic_lr, verbose=verbose, tau=tau, gamma=gamma, layer_norm=layer_norm, layers=layers, act_fun=act_fun, use_huber=use_huber) assert num_levels >= 2, "num_levels must be greater than or equal to 2" self.num_levels = num_levels self.meta_period = meta_period self.intrinsic_reward_type = intrinsic_reward_type self.intrinsic_reward_scale = intrinsic_reward_scale self.relative_goals = relative_goals self.off_policy_corrections = off_policy_corrections self.hindsight = hindsight self.subgoal_testing_rate = subgoal_testing_rate self.connected_gradients = connected_gradients self.cg_weights = cg_weights self.use_fingerprints = use_fingerprints self.fingerprint_range = fingerprint_range self.fingerprint_dim = (len(self.fingerprint_range[0]), ) self.centralized_value_functions = centralized_value_functions # Get the observation and action space of the higher level policies. meta_ac_space = get_meta_ac_space(ob_space=ob_space, relative_goals=relative_goals, env_name=env_name, use_fingerprints=use_fingerprints, fingerprint_dim=self.fingerprint_dim) # =================================================================== # # Step 1: Create the policies for the individual levels. # # =================================================================== # self.policy = [] # The policies are ordered from the highest level to lowest level # policies in the hierarchy. for i in range(num_levels): # Determine the appropriate parameters to use for the policy in the # current level. policy_fn = meta_policy if i < (num_levels - 1) else worker_policy ac_space_i = meta_ac_space if i < (num_levels - 1) else ac_space co_space_i = co_space if i == 0 else meta_ac_space ob_space_i = ob_space zero_fingerprint_i = i == (num_levels - 1) # The policies are ordered from the highest level to lowest level # policies in the hierarchy. with tf.compat.v1.variable_scope("level_{}".format(i)): self.policy.append( policy_fn( sess=sess, ob_space=ob_space_i, ac_space=ac_space_i, co_space=co_space_i, buffer_size=buffer_size, batch_size=batch_size, actor_lr=actor_lr, critic_lr=critic_lr, verbose=verbose, tau=tau, gamma=gamma, layer_norm=layer_norm, layers=layers, act_fun=act_fun, use_huber=use_huber, scope="level_{}".format(i), zero_fingerprint=zero_fingerprint_i, fingerprint_dim=self.fingerprint_dim[0], **(additional_params or {}), )) # =================================================================== # # Step 2: Create attributes for the replay buffer. # # =================================================================== # # Create the replay buffer. 
self.replay_buffer = HierReplayBuffer( buffer_size=int(buffer_size / meta_period), batch_size=batch_size, meta_period=meta_period, obs_dim=ob_space.shape[0], ac_dim=ac_space.shape[0], co_dim=None if co_space is None else co_space.shape[0], goal_dim=meta_ac_space.shape[0], num_levels=num_levels) # current action by the meta-level policies self._meta_action = [None for _ in range(num_levels - 1)] # a list of all the actions performed by each level in the hierarchy, # ordered from highest to lowest level policy self._actions = None # a list of the rewards (intrinsic or other) experienced by every level # in the hierarchy, ordered from highest to lowest level policy self._rewards = None # a list of observations that stretch as long as the dilated horizon # chosen for the highest level policy self._observations = None # the first and last contextual term self._contexts = None # a list of done masks at every time step self._dones = None # Collect the state indices for the intrinsic rewards. self.goal_indices = get_state_indices( ob_space=ob_space, env_name=env_name, use_fingerprints=use_fingerprints, fingerprint_dim=self.fingerprint_dim) # Define the intrinsic reward function. if intrinsic_reward_type in [ "negative_distance", "scaled_negative_distance", "non_negative_distance", "scaled_non_negative_distance", "exp_negative_distance", "scaled_exp_negative_distance" ]: # Offset the distance measure by the maximum possible distance to # ensure non-negativity. if "non_negative" in intrinsic_reward_type: offset = np.sqrt( np.sum(np.square(meta_ac_space.high - meta_ac_space.low), -1)) else: offset = 0 # Scale the outputs from the state by the meta-action space if you # wish to scale the worker reward. if intrinsic_reward_type.startswith("scaled"): scale = 0.5 * (meta_ac_space.high - meta_ac_space.low) else: scale = 1 def intrinsic_reward_fn(states, goals, next_states): return negative_distance( states=states[self.goal_indices] / scale, goals=goals / scale, next_states=next_states[self.goal_indices] / scale, relative_context=relative_goals, offset=0.0) + offset # Perform the exponential and squashing operations to keep the # intrinsic reward between 0 and 1. if "exp" in intrinsic_reward_type: def exp_intrinsic_reward_fn(states, goals, next_states): return np.exp( -1 * intrinsic_reward_fn(states, goals, next_states)**2) self.intrinsic_reward_fn = exp_intrinsic_reward_fn else: self.intrinsic_reward_fn = intrinsic_reward_fn else: raise ValueError("Unknown intrinsic reward type: {}".format( intrinsic_reward_type)) # =================================================================== # # Step 3: Create algorithm-specific features. # # =================================================================== # # a fixed goal transition function for the meta-actions in between meta # periods. This is used when relative_goals is set to True in order to # maintain a fixed absolute position of the goal. if relative_goals: def goal_transition_fn(obs0, goal, obs1): return obs0 + goal - obs1 else: def goal_transition_fn(obs0, goal, obs1): return goal self.goal_transition_fn = goal_transition_fn # Utility method for indexing the goal out of an observation variable. self.crop_to_goal = lambda g: tf.gather( g, tf.tile(tf.expand_dims(np.array(self.goal_indices), 0), [self.batch_size, 1]), batch_dims=1, axis=1) if self.connected_gradients: self._setup_connected_gradients() def initialize(self): """See parent class. This method calls the initialization methods of the policies at every level of the hierarchy. 
""" for i in range(self.num_levels): self.policy[i].initialize() self.clear_memory() def update(self, update_actor=True, **kwargs): """Perform a gradient update step. This is done both at every level of the hierarchy. The kwargs argument for this method contains two additional terms: * update_meta (bool): specifies whether to perform a gradient update step for the meta-policies * update_meta_actor (bool): similar to the `update_policy` term, but for the meta-policy. Note that, if `update_meta` is set to False, this term is void. **Note**; The target update soft updates for all policies occur at the same frequency as their respective actor update frequencies. Parameters ---------- update_actor : bool specifies whether to update the actor policy. The critic policy is still updated if this value is set to False. Returns ------- ([float, float], [float, float]) the critic loss for every policy in the hierarchy (float, float) the actor loss for every policy in the hierarchy """ # Not enough samples in the replay buffer. if not self.replay_buffer.can_sample(): return tuple([[0, 0] for _ in range(self.num_levels)]), \ tuple([0 for _ in range(self.num_levels)]) # Specifies whether to remove additional data from the replay buffer # sampling procedure. Since only a subset of algorithms use additional # data, removing it can speedup the other algorithms. with_additional = self.off_policy_corrections # Get a batch. obs0, obs1, act, rew, done, additional = self.replay_buffer.sample( with_additional) # Update the higher-level policies. actor_loss = [] critic_loss = [] if kwargs['update_meta']: # Replace the goals with the most likely goals. if self.off_policy_corrections: meta_act = self._sample_best_meta_action( meta_obs0=obs0[0], meta_obs1=obs1[0], meta_action=act[0], worker_obses=additional["worker_obses"], worker_actions=additional["worker_actions"], k=8) act[0] = meta_act for i in range(self.num_levels - 1): if self.connected_gradients: # Perform the connected gradients update procedure. vf_loss, pi_loss = self._connected_gradients_update( obs0=obs0, actions=act, rewards=rew, obs1=obs1, terminals1=done, update_actor=kwargs['update_meta_actor'], ) else: # Perform the regular meta update procedure. vf_loss, pi_loss = self.policy[i].update_from_batch( obs0=obs0[i], actions=act[i], rewards=rew[i], obs1=obs1[i], terminals1=done[i], update_actor=kwargs['update_meta_actor'], ) actor_loss.append(pi_loss) critic_loss.append(vf_loss) else: for i in range(self.num_levels - 1): actor_loss.append(0) critic_loss.append([0, 0]) # Update the lowest level policy. w_critic_loss, w_actor_loss = self.policy[-1].update_from_batch( obs0=obs0[-1], actions=act[-1], rewards=rew[-1], obs1=obs1[-1], terminals1=done[-1], update_actor=update_actor, ) critic_loss.append(w_critic_loss) actor_loss.append(w_actor_loss) return tuple(critic_loss), tuple(actor_loss) def get_action(self, obs, context, apply_noise, random_actions): """See parent class.""" # Loop through the policies in the hierarchy. for i in range(self.num_levels - 1): if self._update_meta(i): context_i = context if i == 0 else self._meta_action[i - 1] # Update the meta action based on the output from the policy if # the time period requires is. self._meta_action[i] = self.policy[i].get_action( obs, context_i, apply_noise, random_actions) else: # Update the meta-action in accordance with a fixed transition # function. 
self._meta_action[i] = self.goal_transition_fn( obs0=np.array([self._observations[-1][self.goal_indices]]), goal=self._meta_action[i], obs1=obs[:, self.goal_indices]) # Return the action to be performed within the environment (i.e. the # action by the lowest level policy). action = self.policy[-1].get_action(obs, self._meta_action[-1], apply_noise, random_actions) return action def store_transition(self, obs0, context0, action, reward, obs1, context1, done, is_final_step, evaluate=False): """See parent class.""" # the time since the most recent sample began collecting step samples t_start = len(self._observations) for i in range(1, self.num_levels): # Actions and intrinsic rewards for the high-level policies are # only updated when the action is recomputed by the graph. if t_start % self.meta_period**(i - 1) == 0: self._rewards[-i].append(0) self._actions[-i - 1].append(self._meta_action[-i].flatten()) # Compute the intrinsic rewards and append them to the list of # rewards. self._rewards[-i][-1] += \ self.intrinsic_reward_scale / self.meta_period ** (i-1) * \ self.intrinsic_reward_fn( states=obs0, goals=self._meta_action[-i].flatten(), next_states=obs1 ) # The highest level policy receives the sum of environmental rewards. self._rewards[0][0] += reward # The lowest level policy's actions are received from the algorithm. self._actions[-1].append(action) # Add the environmental observations and contextual terms to their # respective lists. self._observations.append(obs0) if t_start == 0: self._contexts.append(context0) # Modify the done mask in accordance with the TD3 algorithm. Done masks # that correspond to the final step are set to False. self._dones.append(done and not is_final_step) # Add a sample to the replay buffer. if len(self._observations) == \ self.meta_period ** (self.num_levels - 1) or done: # Add the last observation and context. self._observations.append(obs1) self._contexts.append(context1) # Compute the current state goals to add to the final observation. for i in range(self.num_levels - 1): self._actions[i].append( self.goal_transition_fn( obs0=obs0[self.goal_indices], goal=self._meta_action[i], obs1=obs1[self.goal_indices]).flatten()) # Avoid storing samples when performing evaluations. if not evaluate: if not self.hindsight \ or random.random() < self.subgoal_testing_rate: # Store a sample in the replay buffer. self.replay_buffer.add( obs_t=self._observations, context_t=self._contexts, action_t=self._actions, reward_t=self._rewards, done_t=self._dones, ) if self.hindsight: # Some temporary attributes. worker_obses = [ self._get_obs(self._observations[i], self._actions[0][i], 0) for i in range(len(self._observations)) ] intrinsic_rewards = self._rewards[-1] # Implement hindsight action and goal transitions. goal, rewards = self._hindsight_actions_goals( initial_observations=worker_obses, initial_rewards=intrinsic_rewards) new_actions = deepcopy(self._actions) new_actions[0] = goal new_rewards = deepcopy(self._rewards) new_rewards[-1] = rewards # Store the hindsight sample in the replay buffer. self.replay_buffer.add( obs_t=self._observations, context_t=self._contexts, action_t=new_actions, reward_t=new_rewards, done_t=self._dones, ) # Clear the memory that has been stored in the replay buffer. self.clear_memory() def _update_meta(self, level): """Determine whether a meta-policy should update its action. 
This is done by checking the length of the observation lists that are passed to the replay buffer, which are cleared whenever the highest level meta-period has been met or the environment has been reset. Parameters ---------- level : int the level of the policy Returns ------- bool True if the action should be updated by the meta-policy at the given level """ return len(self._observations) % \ (self.meta_period ** (self.num_levels - level - 1)) == 0 def clear_memory(self): """Clear internal memory that is used by the replay buffer.""" self._actions = [[] for _ in range(self.num_levels)] self._rewards = [[0]] + [[] for _ in range(self.num_levels - 1)] self._observations = [] self._contexts = [] self._dones = [] def get_td_map(self): """See parent class.""" # Not enough samples in the replay buffer. if not self.replay_buffer.can_sample(): return {} # Get a batch. obs0, obs1, act, rew, done, _ = self.replay_buffer.sample(False) td_map = {} for i in range(self.num_levels): td_map.update(self.policy[i].get_td_map_from_batch( obs0=obs0[i], actions=act[i], rewards=rew[i], obs1=obs1[i], terminals1=done[i])) return td_map # ======================================================================= # # Auxiliary methods for HIRO # # ======================================================================= # def _sample_best_meta_action(self, meta_obs0, meta_obs1, meta_action, worker_obses, worker_actions, k=10): """Return meta-actions that approximately maximize low-level log-probs. Parameters ---------- meta_obs0 : array_like (batch_size, m_obs_dim) matrix of meta observations meta_obs1 : array_like (batch_size, m_obs_dim) matrix of next time step meta observations meta_action : array_like (batch_size, m_ac_dim) matrix of meta actions worker_obses : array_like (batch_size, w_obs_dim, meta_period+1) matrix of current Worker state observations worker_actions : array_like (batch_size, w_ac_dim, meta_period) matrix of current Worker environmental actions k : int, optional number of goals returned, excluding the initial goal and the mean value Returns ------- array_like (batch_size, m_ac_dim) matrix of most likely meta actions """ batch_size, goal_dim = meta_action.shape # Collect several samples of potentially optimal goals. sampled_actions = self._sample(meta_obs0, meta_obs1, meta_action, k) assert sampled_actions.shape == (batch_size, goal_dim, k) # Compute the fitness of each candidate goal. The fitness is the sum of # the log-probabilities of each action for the given goal. fitness = self._log_probs(sampled_actions, worker_obses, worker_actions) assert fitness.shape == (batch_size, k) # For each sample, choose the meta action that maximizes the fitness. indx = np.argmax(fitness, 1) best_goals = np.asarray( [sampled_actions[i, :, indx[i]] for i in range(batch_size)]) return best_goals def _sample(self, meta_obs0, meta_obs1, meta_action, num_samples, sc=0.5): """Sample different goals. The goals are sampled as follows: * The first num_samples-2 goals are acquired from a random Gaussian distribution centered at s_{t+c} - s_t. * The second to last goal is s_{t+c} - s_t. * The last goal is the originally sampled goal g_t. Parameters ---------- meta_obs0 : array_like (batch_size, m_obs_dim) matrix of meta observations meta_obs1 : array_like (batch_size, m_obs_dim) matrix of next time step meta observations meta_action : array_like (batch_size, m_ac_dim) matrix of meta actions num_samples : int number of samples sc : float scaling factor for the normal distribution. 
Returns ------- array_like (batch_size, goal_dim, num_samples) matrix of sampled goals Helps ----- * _sample_best_meta_action(self) """ batch_size, goal_dim = meta_action.shape goal_space = self.policy[0].ac_space spec_range = goal_space.high - goal_space.low random_samples = num_samples - 2 # Compute the mean and std for the Gaussian distribution to sample # from, and well as the maxima and minima. loc = meta_obs1[:, self.goal_indices] - meta_obs0[:, self.goal_indices] scale = [sc * spec_range / 2] minimum, maximum = [goal_space.low], [goal_space.high] new_loc = np.zeros((batch_size, goal_dim, random_samples)) new_scale = np.zeros((batch_size, goal_dim, random_samples)) for i in range(random_samples): new_loc[:, :, i] = loc new_scale[:, :, i] = scale new_minimum = np.zeros((batch_size, goal_dim, num_samples)) new_maximum = np.zeros((batch_size, goal_dim, num_samples)) for i in range(num_samples): new_minimum[:, :, i] = minimum new_maximum[:, :, i] = maximum # Generate random samples for the above distribution. normal_samples = np.random.normal(size=(random_samples * batch_size * goal_dim)) normal_samples = normal_samples.reshape( (batch_size, goal_dim, random_samples)) samples = np.zeros((batch_size, goal_dim, num_samples)) samples[:, :, :-2] = new_loc + normal_samples * new_scale samples[:, :, -2] = loc samples[:, :, -1] = meta_action # Clip the values based on the meta action space range. samples = np.minimum(np.maximum(samples, new_minimum), new_maximum) return samples def _log_probs(self, meta_actions, worker_obses, worker_actions): """Calculate the log probability of the next goal by the meta-policies. Parameters ---------- meta_actions : array_like (batch_size, m_ac_dim, num_samples) matrix of candidate higher- level policy actions worker_obses : array_like (batch_size, w_obs_dim, meta_period + 1) matrix of lower-level policy observations worker_actions : array_like (batch_size, w_ac_dim, meta_period) list of lower-level policy actions Returns ------- array_like (batch_size, num_samples) fitness associated with every state / action / goal pair Helps ----- * _sample_best_meta_action(self): """ raise NotImplementedError # ======================================================================= # # Auxiliary methods for HAC # # ======================================================================= # def _hindsight_actions_goals(self, initial_observations, initial_rewards): """Calculate hindsight goal and action transitions. These are then stored in the replay buffer along with the original (non-hindsight) sample. See the README at the front page of this repository for an in-depth description of this procedure. Parameters ---------- initial_observations : array_like the original worker observations with the non-hindsight goals appended to them initial_rewards : array_like the original intrinsic rewards Returns ------- array_like the goal at every step in hindsight array_like the modified intrinsic rewards taking into account the hindsight goals Helps ----- * store_transition(self): """ new_goals = [] observations = deepcopy(initial_observations) rewards = deepcopy(initial_rewards) hindsight_goal = 0 if self.relative_goals \ else observations[-1][self.goal_indices] obs_tp1 = observations[-1] for i in range(1, len(observations) + 1): obs_t = observations[-i] # Calculate the hindsight goal in using relative goals. # If not, the hindsight goal is simply a subset of the # final state observation. 
if self.relative_goals: hindsight_goal += \ obs_tp1[self.goal_indices] - obs_t[self.goal_indices] # Modify the Worker intrinsic rewards based on the new # hindsight goal. if i > 1: rewards[-(i - 1)] = self.intrinsic_reward_scale \ * self.intrinsic_reward_fn(obs_t, hindsight_goal, obs_tp1) obs_tp1 = deepcopy(obs_t) new_goals = [deepcopy(hindsight_goal)] + new_goals return new_goals, rewards # ======================================================================= # # Auxiliary methods for HRL-CG # # ======================================================================= # def _setup_connected_gradients(self): """Create the connected gradients meta-policy optimizer.""" raise NotImplementedError def _connected_gradients_update(self, obs0, actions, rewards, obs1, terminals1, update_actor=True): """Perform the gradient update procedure for the HRL-CG algorithm. This procedure is similar to update_from_batch, except that it runs the self.cg_optimizer operation instead of the policy object's optimizer, and utilizes some information from the worker samples as well. Parameters ---------- obs0 : list of array_like (batch_size, obs_dim) matrix of observations for every level in the hierarchy actions : list of array_like (batch_size, ac_dim) matrix of actions for every level in the hierarchy obs1 : list of array_like (batch_size, obs_dim) matrix of next step observations for every level in the hierarchy rewards : list of array_like (batch_size,) vector of rewards for every level in the hierarchy terminals1 : list of numpy bool (batch_size,) vector of done masks for every level in the hierarchy update_actor : bool specifies whether to update the actor policy of the meta policy. The critic policy is still updated if this value is set to False. Returns ------- [float, float] meta-policy critic loss float meta-policy actor loss """ raise NotImplementedError
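# ----------------------------------------------------------------------- #
# Illustrative sketch (not part of the library): a minimal NumPy version of
# the intrinsic reward and goal-transition function described in the
# docstring above, assuming relative goals over the full observation. The
# function names below are hypothetical and used for clarity only.
# ----------------------------------------------------------------------- #

import numpy as np


def example_intrinsic_reward(state, goal, next_state):
    """Return r_L(s_t, g_t, s_{t+1}) = -||s_t + g_t - s_{t+1}||_2."""
    return -np.linalg.norm(state + goal - next_state)


def example_goal_transition(obs0, goal, obs1):
    """Keep the absolute target fixed between meta-policy updates.

    h(s_t, g_t, s_{t+1}) = s_t + g_t - s_{t+1}
    """
    return obs0 + goal - obs1


# Example usage:
#   s0, g0, s1 = np.zeros(2), np.ones(2), np.array([0.4, 0.6])
#   example_intrinsic_reward(s0, g0, s1)  # ~ -0.721
#   example_goal_transition(s0, g0, s1)   # array([0.6, 0.4])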
class GoalConditionedPolicy(Policy): r"""Goal-conditioned hierarchical reinforcement learning model. TODO This policy is an implementation of the two-level hierarchy presented in [1], which itself is similar to the feudal networks formulation [2, 3]. This network consists of a high-level, or Manager, pi_{\theta_H} that computes and outputs goals g_t ~ pi_{\theta_H}(s_t, h) every `meta_period` time steps, and a low-level policy pi_{\theta_L} that takes as inputs the current state and the assigned goals and attempts to perform an action a_t ~ pi_{\theta_L}(s_t,g_t) that satisfies these goals. The highest level policy is rewarded based on the original environment reward function: r_H = r(s,a;h). The Target term, h, parametrizes the reward assigned to the highest level policy in order to allow the policy to generalize to several goals within a task, a technique that was first proposed by [4]. Finally, the Worker is motivated to follow the goals set by the Manager via an intrinsic reward based on the distance between the current observation and the goal observation: r_L (s_t, g_t, s_{t+1}) = -||s_t + g_t - s_{t+1}||_2 Bibliography: [1] Nachum, Ofir, et al. "Data-efficient hierarchical reinforcement learning." Advances in Neural Information Processing Systems. 2018. [2] Dayan, Peter, and Geoffrey E. Hinton. "Feudal reinforcement learning." Advances in neural information processing systems. 1993. [3] Vezhnevets, Alexander Sasha, et al. "Feudal networks for hierarchical reinforcement learning." Proceedings of the 34th International Conference on Machine Learning-Volume 70. JMLR. org, 2017. [4] Schaul, Tom, et al. "Universal value function approximators." International Conference on Machine Learning. 2015. Attributes ---------- num_levels : int number of levels within the hierarchy. Must be greater than 1. Two levels correspond to a Manager/Worker paradigm. meta_period : int meta-policy action period intrinsic_reward_type : str the reward function to be used by the worker. Must be one of: * "negative_distance": the negative two norm between the states and desired absolute or relative goals. * "scaled_negative_distance": similar to the negative distance reward where the states, goals, and next states are scaled by the inverse of the action space of the manager policy * "non_negative_distance": the negative two norm between the states and desired absolute or relative goals offset by the maximum goal space (to ensure non-negativity) * "scaled_non_negative_distance": similar to the non-negative distance reward where the states, goals, and next states are scaled by the inverse of the action space of the manager policy * "exp_negative_distance": equal to exp(-negative_distance^2). The result is a reward between 0 and 1. This is useful for policies that terminate early. * "scaled_exp_negative_distance": similar to the previous worker reward type but with states, actions, and next states that are scaled. intrinsic_reward_scale : float the value that the intrinsic reward should be scaled by relative_goals : bool specifies whether the goal issued by the higher-level policies is meant to be a relative or absolute goal, i.e. specific state or change in state off_policy_corrections : bool whether to use off-policy corrections during the update procedure. See: https://arxiv.org/abs/1805.08296. hindsight : bool whether to use hindsight action and goal transitions, as well as subgoal testing. 
See: https://arxiv.org/abs/1712.00948 subgoal_testing_rate : float rate at which the original (non-hindsight) sample is stored in the replay buffer as well. Used only if `hindsight` is set to True. cooperative_gradients : bool whether to use the cooperative gradient update procedure for the higher-level policy. See: https://arxiv.org/abs/1912.02368v1 cg_weights : float weights for the gradients of the loss of the lower-level policies with respect to the parameters of the higher-level policies. Only used if `cooperative_gradients` is set to True. pretrain_worker : bool specifies whether you are pre-training the lower-level policies. Actions by the high-level policy are randomly sampled from its action space. pretrain_path : str or None path to the pre-trained worker policy checkpoints pretrain_ckpt : int or None checkpoint number to use within the worker policy path. If set to None, the most recent checkpoint is used. total_steps : int Total number of timesteps used during training. Used by a subset of algorithms. policy : list of hbaselines.base_policies.Policy a list of policy object for each level in the hierarchy, order from highest to lowest level policy replay_buffer : hbaselines.goal_conditioned.replay_buffer.HierReplayBuffer the replay buffer object goal_indices : list of int the state indices for the intrinsic rewards intrinsic_reward_fn : function reward function for the lower-level policies """ def __init__(self, sess, ob_space, ac_space, co_space, buffer_size, batch_size, actor_lr, critic_lr, verbose, tau, gamma, use_huber, l2_penalty, model_params, num_levels, meta_period, intrinsic_reward_type, intrinsic_reward_scale, relative_goals, off_policy_corrections, hindsight, subgoal_testing_rate, cooperative_gradients, cg_weights, cg_delta, pretrain_worker, pretrain_path, pretrain_ckpt, total_steps, scope=None, env_name="", num_envs=1, meta_policy=None, worker_policy=None, additional_params=None): """Instantiate the goal-conditioned hierarchical policy. Parameters ---------- sess : tf.compat.v1.Session the current TensorFlow session ob_space : gym.spaces.* the observation space of the environment ac_space : gym.spaces.* the action space of the environment co_space : gym.spaces.* the context space of the environment buffer_size : int the max number of transitions to store batch_size : int SGD batch size actor_lr : float actor learning rate critic_lr : float critic learning rate verbose : int the verbosity level: 0 none, 1 training information, 2 tensorflow debug tau : float target update rate gamma : float discount factor use_huber : bool specifies whether to use the huber distance function as the loss for the critic. If set to False, the mean-squared error metric is used instead model_params : dict dictionary of model-specific parameters. See parent class. num_levels : int number of levels within the hierarchy. Must be greater than 1. Two levels correspond to a Manager/Worker paradigm. meta_period : int meta-policy action period intrinsic_reward_type : str the reward function to be used by the worker. Must be one of: * "negative_distance": the negative two norm between the states and desired absolute or relative goals. 
* "scaled_negative_distance": similar to the negative distance reward where the states, goals, and next states are scaled by the inverse of the action space of the manager policy * "non_negative_distance": the negative two norm between the states and desired absolute or relative goals offset by the maximum goal space (to ensure non-negativity) * "scaled_non_negative_distance": similar to the non-negative distance reward where the states, goals, and next states are scaled by the inverse of the action space of the manager policy * "exp_negative_distance": equal to exp(-negative_distance^2). The result is a reward between 0 and 1. This is useful for policies that terminate early. * "scaled_exp_negative_distance": similar to the previous worker reward type but with states, actions, and next states that are scaled. intrinsic_reward_scale : float the value that the intrinsic reward should be scaled by relative_goals : bool specifies whether the goal issued by the higher-level policies is meant to be a relative or absolute goal, i.e. specific state or change in state off_policy_corrections : bool whether to use off-policy corrections during the update procedure. See: https://arxiv.org/abs/1805.08296 hindsight : bool whether to include hindsight action and goal transitions in the replay buffer. See: https://arxiv.org/abs/1712.00948 subgoal_testing_rate : float rate at which the original (non-hindsight) sample is stored in the replay buffer as well. Used only if `hindsight` is set to True. cooperative_gradients : bool whether to use the cooperative gradient update procedure for the higher-level policy. See: https://arxiv.org/abs/1912.02368v1 cg_weights : float weights for the gradients of the loss of the lower-level policies with respect to the parameters of the higher-level policies. Only used if `cooperative_gradients` is set to True. cg_delta : float the desired lower-level expected returns. If set to None, a fixed Lagrangian specified by cg_weights is used instead. Only used if `cooperative_gradients` is set to True. pretrain_worker : bool specifies whether you are pre-training the lower-level policies. Actions by the high-level policy are randomly sampled from the action space. pretrain_path : str or None path to the pre-trained worker policy checkpoints pretrain_ckpt : int or None checkpoint number to use within the worker policy path. If set to None, the most recent checkpoint is used. total_steps : int Total number of timesteps used during training. Used by a subset of algorithms. meta_policy : type [ hbaselines.base_policies.Policy ] the policy model to use for the meta policies worker_policy : type [ hbaselines.base_policies.Policy ] the policy model to use for the worker policy additional_params : dict additional algorithm-specific policy parameters. Used internally by the class when instantiating other (child) policies. 
""" super(GoalConditionedPolicy, self).__init__( sess=sess, ob_space=ob_space, ac_space=ac_space, co_space=co_space, verbose=verbose, l2_penalty=l2_penalty, model_params=model_params, num_envs=num_envs, ) assert num_levels >= 2, "num_levels must be greater than or equal to 2" self.num_levels = num_levels self.meta_period = meta_period self.intrinsic_reward_type = intrinsic_reward_type self.intrinsic_reward_scale = intrinsic_reward_scale self.relative_goals = relative_goals self.off_policy_corrections = off_policy_corrections self.hindsight = hindsight self.subgoal_testing_rate = subgoal_testing_rate self.cooperative_gradients = cooperative_gradients self.cg_weights = cg_weights self.cg_delta = cg_delta self.pretrain_worker = pretrain_worker self.pretrain_path = pretrain_path self.pretrain_ckpt = pretrain_ckpt self.total_steps = total_steps # Get the observation and action space of the higher level policies. meta_ac_space = get_meta_ac_space( ob_space=ob_space, relative_goals=relative_goals, env_name=env_name, ) # =================================================================== # # Step 1: Create the policies for the individual levels. # # =================================================================== # self.policy = [] # The policies are ordered from the highest level to lowest level # policies in the hierarchy. for i in range(num_levels): # Determine the appropriate parameters to use for the policy in the # current level. policy_fn = meta_policy if i < (num_levels - 1) else worker_policy ac_space_i = meta_ac_space if i < (num_levels - 1) else ac_space co_space_i = co_space if i == 0 else meta_ac_space ob_space_i = ob_space # The policies are ordered from the highest level to lowest level # policies in the hierarchy. with tf.compat.v1.variable_scope("level_{}".format(i)): # Compute the scope name based on any outer scope term. scope_i = "level_{}".format(i) if scope is not None: scope_i = "{}/{}".format(scope, scope_i) # TODO: description. model_params_i = model_params.copy() model_params_i.update({ "ignore_flat_channels": model_params["ignore_flat_channels"] if i < 1 else [], "ignore_image": model_params["ignore_image"] if i < 1 else True, }) # Create the next policy. self.policy.append( policy_fn( sess=sess, ob_space=ob_space_i, ac_space=ac_space_i, co_space=co_space_i, buffer_size=buffer_size, batch_size=batch_size, actor_lr=actor_lr, critic_lr=critic_lr, verbose=verbose, tau=tau, gamma=gamma, use_huber=use_huber, l2_penalty=l2_penalty, model_params=model_params_i, scope=scope_i, **(additional_params or {}), )) # =================================================================== # # Step 2: Create attributes for the replay buffer. # # =================================================================== # # Create the replay buffer. self.replay_buffer = HierReplayBuffer( buffer_size=int(buffer_size / meta_period), batch_size=batch_size, meta_period=meta_period, obs_dim=ob_space.shape[0], ac_dim=ac_space.shape[0], co_dim=None if co_space is None else co_space.shape[0], goal_dim=meta_ac_space.shape[0], num_levels=num_levels) # current action by the meta-level policies self.meta_action = [[None for _ in range(num_levels - 1)] for _ in range(num_envs)] # a list of all the actions performed by each level in the hierarchy, # ordered from highest to lowest level policy. A separate element is # used for each environment. 
self._actions = [[[] for _ in range(self.num_levels)] for _ in range(num_envs)] # a list of the rewards (intrinsic or other) experienced by every level # in the hierarchy, ordered from highest to lowest level policy. A # separate element is used for each environment. self._rewards = [[[0]] + [[] for _ in range(self.num_levels - 1)] for _ in range(num_envs)] # a list of observations that stretch as long as the dilated horizon # chosen for the highest level policy. A separate element is used for # each environment. self._observations = [[] for _ in range(num_envs)] # the first and last contextual term. A separate element is used for # each environment. self._contexts = [[] for _ in range(num_envs)] # a list of done masks at every time step. A separate element is used # for each environment. self._dones = [[] for _ in range(num_envs)] # Collect the state indices for the intrinsic rewards. self.goal_indices = get_state_indices(ob_space, env_name) # Define the intrinsic reward function. if intrinsic_reward_type in [ "negative_distance", "scaled_negative_distance", "non_negative_distance", "scaled_non_negative_distance", "exp_negative_distance", "scaled_exp_negative_distance" ]: # Offset the distance measure by the maximum possible distance to # ensure non-negativity. if "non_negative" in intrinsic_reward_type: offset = np.sqrt( np.sum(np.square(meta_ac_space.high - meta_ac_space.low), -1)) else: offset = 0 # Scale the outputs from the state by the meta-action space if you # wish to scale the worker reward. if intrinsic_reward_type.startswith("scaled"): scale = 0.5 * (meta_ac_space.high - meta_ac_space.low) else: scale = 1 def intrinsic_reward_fn(states, goals, next_states): return negative_distance( states=states[self.goal_indices] / scale, goals=goals / scale, next_states=next_states[self.goal_indices] / scale, relative_context=relative_goals, offset=0.0, ) + offset # Perform the exponential and squashing operations to keep the # intrinsic reward between 0 and 1. if "exp" in intrinsic_reward_type: def exp_intrinsic_reward_fn(states, goals, next_states): # TODO: temporary span = sum( np.square(self.policy[0].ac_space.high - self.policy[0].ac_space.low)) rew = intrinsic_reward_fn(states, goals, next_states) return np.exp(-(rew / (span / 40))**2) self.intrinsic_reward_fn = exp_intrinsic_reward_fn else: self.intrinsic_reward_fn = intrinsic_reward_fn else: raise ValueError("Unknown intrinsic reward type: {}".format( intrinsic_reward_type)) # =================================================================== # # Step 3: Create algorithm-specific features. # # =================================================================== # # the number of get_action calls that have been performed. This is used # when pretraining the worker to incrementally train different levels # of the policy. self._steps = 0 # a fixed goal transition function for the meta-actions in between meta # periods. This is used when relative_goals is set to True in order to # maintain a fixed absolute position of the goal. if relative_goals: def goal_transition_fn(obs0, goal, obs1): return obs0 + goal - obs1 else: def goal_transition_fn(obs0, goal, obs1): return goal self.goal_transition_fn = goal_transition_fn if self.cooperative_gradients: if scope is None: self._setup_cooperative_gradients() else: with tf.compat.v1.variable_scope(scope): self._setup_cooperative_gradients() def initialize(self): """See parent class. 
This method performs the following operations: - It calls the initialization methods of the policies at every level of the hierarchy to match the target value function parameters with the current policy parameters. - It also imports the lower-level policies from a pretrained checkpoint if a path to one is specified. """ # Initialize the separate policies in the hierarchy. for i in range(self.num_levels): self.policy[i].initialize() if self.pretrain_path is not None: ckpt_path = os.path.join(self.pretrain_path, "checkpoints") # Get the checkpoint number. if self.pretrain_ckpt is None: filenames = os.listdir(ckpt_path) metafiles = [f[:-5] for f in filenames if f[-5:] == ".meta"] metanum = [int(f.split("-")[-1]) for f in metafiles] ckpt_num = max(metanum) else: ckpt_num = self.pretrain_ckpt # Extract the checkpoint path. ckpt_path = os.path.join(ckpt_path, "itr-{}".format(ckpt_num)) var_list = tf.train.list_variables(ckpt_path) ckpt_reader = tf.train.load_checkpoint(ckpt_path) # Check that the number of levels matches. assert var_list[-1][0].startswith( "level_{}".format(self.num_levels-1)), \ "Number of levels between the checkpoint and current policy " \ "do not match. Policy={}, Checkpoint={}".format( self.num_levels, int(var_list[-1][0].split("/")[0][6:]) + 1) # Check that the names and shapes of the lowest-level policy # parameters match the current policy. current_vars = { v.name: v.shape.as_list() for v in get_trainable_vars() } for var in var_list: var_name, var_shape = var var_name = "{}:0".format(var_name) # We only check the lower-level policies. if any( var_name.startswith("level_{}".format(level)) for level in range(1, self.num_levels)): assert var_name in current_vars.keys(), \ "{} not available in current policy.".format(var_name) current_shape = current_vars[var_name] assert current_shape == var_shape, \ "Shape mismatch for {}, {} != {}".format( var_name, var_shape, current_shape) # Import the lower-level policy parameters. current_vars = {v.name: v for v in get_trainable_vars()} for var in var_list: var_name, var_shape = var if any( var_name.startswith("level_{}".format(level)) for level in range(1, self.num_levels)): value = ckpt_reader.get_tensor(var_name) var_name = "{}:0".format(var_name) self.sess.run( tf.compat.v1.assign(current_vars[var_name], value)) def update(self, update_actor=True, **kwargs): """Perform a gradient update step. This is done at every level of the hierarchy. The kwargs argument for this method contains two additional terms: * update_meta (bool): specifies whether to perform a gradient update step for the meta-policies * update_meta_actor (bool): similar to the `update_actor` term, but for the meta-policy. Note that, if `update_meta` is set to False, this term is void. **Note**: The soft target updates for all policies occur at the same frequency as their respective actor update frequencies. Parameters ---------- update_actor : bool specifies whether to update the actor policy. The critic policy is still updated if this value is set to False. """ # Not enough samples in the replay buffer. if not self.replay_buffer.can_sample(): return # Specifies whether to remove additional data from the replay buffer # sampling procedure. Since only a subset of algorithms use additional # data, removing it can speed up the other algorithms. with_additional = self.off_policy_corrections # Specifies the levels to collect data from, corresponding to the # levels that will be trained. This also helps speed up the operation.
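# For example, with num_levels=3 and kwargs["update_meta"] == [True, False], the list below evaluates to [0, 2], i.e. the top level plus the lowest level, which is always trained.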
collect_levels = [ i for i in range(self.num_levels - 1) if kwargs["update_meta"][i] ] + [self.num_levels - 1] # Get a batch. obs0, obs1, act, rew, done, additional = self.replay_buffer.sample( with_additional, collect_levels) # Do not use done masks for lower-level policies with negative # intrinsic rewards (these encourage the policies to terminate early). if self._negative_reward_fn(): for i in range(self.num_levels - 1): done[i + 1] = np.array([False] * done[i + 1].shape[0]) # Loop through all meta-policies. for i in range(self.num_levels - 1): if kwargs['update_meta'][i] and not self._pretrain_level(i): # Replace the goals with the most likely goals. if self.off_policy_corrections and i == 0: # FIXME meta_act = self._sample_best_meta_action( meta_obs0=obs0[i], meta_obs1=obs1[i], meta_action=act[i], worker_obses=additional["worker_obses"], worker_actions=additional["worker_actions"], k=8) act[i] = meta_act if self.cooperative_gradients: # Perform the cooperative gradients update procedure. self._cooperative_gradients_update( obs0=obs0, actions=act, rewards=rew, obs1=obs1, terminals1=done, level_num=i, update_actor=kwargs['update_meta_actor'], ) else: # Perform the regular meta update procedure. self.policy[i].update_from_batch( obs0=obs0[i], actions=act[i], rewards=rew[i], obs1=obs1[i], terminals1=done[i], update_actor=kwargs['update_meta_actor'], ) # Update the lowest level policy. self.policy[-1].update_from_batch( obs0=obs0[-1], actions=act[-1], rewards=rew[-1], obs1=obs1[-1], terminals1=done[-1], update_actor=update_actor, ) def get_action(self, obs, context, apply_noise, random_actions, env_num=0): """See parent class.""" # Increment the internal number of get_action calls. self._steps += 1 # Loop through the policies in the hierarchy. for i in range(self.num_levels - 1): if self._update_meta(i, env_num): if self._pretrain_level(i): # Sample goals randomly when performing pre-training. self.meta_action[env_num][i] = np.array( [self.policy[i].ac_space.sample()]) else: context_i = context if i == 0 \ else self.meta_action[env_num][i - 1] # Update the meta action based on the output from the # policy if the time period requires it. self.meta_action[env_num][i] = self.policy[i].get_action( obs, context_i, apply_noise, random_actions) else: # Update the meta-action in accordance with a fixed transition # function. self.meta_action[env_num][i] = self.goal_transition_fn( obs0=np.array( [self._observations[env_num][-1][self.goal_indices]]), goal=self.meta_action[env_num][i], obs1=obs[:, self.goal_indices]) # Return the action to be performed within the environment (i.e. the # action by the lowest level policy). action = self.policy[-1].get_action( obs=obs, context=self.meta_action[env_num][-1], apply_noise=apply_noise, random_actions=random_actions and self.pretrain_path is None) return action def store_transition(self, obs0, context0, action, reward, obs1, context1, done, is_final_step, env_num=0, evaluate=False): """See parent class.""" # the number of steps since the most recent sample began being collected t_start = len(self._observations[env_num]) # Flatten the observations. obs0 = obs0.flatten() obs1 = obs1.flatten() for i in range(1, self.num_levels): # Actions and intrinsic rewards for the high-level policies are # only updated when the action is recomputed by the graph.
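# For example, with meta_period=3 and num_levels=3: i=1 opens a new intrinsic reward for the lowest level (and records the goal assigned to it) every step, while i=2 does so for the middle level every 3 steps. The highest level is handled separately below.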
if t_start % self.meta_period**(i - 1) == 0: self._rewards[env_num][-i].append(0) self._actions[env_num][-i - 1].append( self.meta_action[env_num][-i].flatten()) # Compute the intrinsic rewards and append them to the list of # rewards. self._rewards[env_num][-i][-1] += \ self.intrinsic_reward_scale / self.meta_period ** (i-1) * \ self.intrinsic_reward_fn( states=obs0, goals=self.meta_action[env_num][-i].flatten(), next_states=obs1 ) # The highest level policy receives the sum of environmental rewards. self._rewards[env_num][0][0] += reward # The lowest level policy's actions are received from the algorithm. self._actions[env_num][-1].append(action) # Add the environmental observations and contextual terms to their # respective lists. self._observations[env_num].append(obs0) if t_start == 0: self._contexts[env_num].append(context0) # Modify the done mask in accordance with the TD3 algorithm. Done masks # that correspond to the final step are set to False. self._dones[env_num].append(done and not is_final_step) # Add a sample to the replay buffer. if len(self._observations[env_num]) == \ self.meta_period ** (self.num_levels - 1) or done: # Add the last observation and context. self._observations[env_num].append(obs1) self._contexts[env_num].append(context1) # Compute the current state goals to add to the final observation. for i in range(self.num_levels - 1): self._actions[env_num][i].append( self.goal_transition_fn( obs0=obs0[self.goal_indices], goal=self.meta_action[env_num][i], obs1=obs1[self.goal_indices]).flatten()) # Avoid storing samples when performing evaluations. if not evaluate: if not self.hindsight \ or random.random() < self.subgoal_testing_rate: # Store a sample in the replay buffer. self.replay_buffer.add( obs_t=self._observations[env_num], context_t=self._contexts[env_num], action_t=self._actions[env_num], reward_t=self._rewards[env_num], done_t=self._dones[env_num], ) if self.hindsight: # Some temporary attributes. worker_obses = [ self._get_obs(self._observations[env_num][i], self._actions[env_num][0][i], 0) for i in range(len(self._observations[env_num])) ] intrinsic_rewards = self._rewards[env_num][-1] # Implement hindsight action and goal transitions. goal, rewards = self._hindsight_actions_goals( initial_observations=worker_obses, initial_rewards=intrinsic_rewards) new_actions = deepcopy(self._actions[env_num]) new_actions[0] = goal new_rewards = deepcopy(self._rewards[env_num]) new_rewards[-1] = rewards # Store the hindsight sample in the replay buffer. self.replay_buffer.add( obs_t=self._observations[env_num], context_t=self._contexts[env_num], action_t=new_actions, reward_t=new_rewards, done_t=self._dones[env_num], ) # Clear the memory that has been stored in the replay buffer. self.clear_memory(env_num) def _update_meta(self, level, env_num): """Determine whether a meta-policy should update its action. This is done by checking the length of the observation lists that are passed to the replay buffer, which are cleared whenever the highest level meta-period has been met or the environment has been reset. Parameters ---------- level : int the level of the policy env_num : int the environment number. Used to handle situations when multiple parallel environments are being used. 
Returns ------- bool True if the action should be updated by the meta-policy at the given level """ return len(self._observations[env_num]) % \ (self.meta_period ** (self.num_levels - level - 1)) == 0 def clear_memory(self, env_num): """Clear internal memory that is used by the replay buffer.""" self._actions[env_num] = [[] for _ in range(self.num_levels)] self._rewards[env_num] = \ [[0]] + [[] for _ in range(self.num_levels - 1)] self._observations[env_num] = [] self._contexts[env_num] = [] self._dones[env_num] = [] def get_td_map(self): """See parent class.""" # Not enough samples in the replay buffer. if not self.replay_buffer.can_sample(): return {} # Get a batch. obs0, obs1, act, rew, done, _ = self.replay_buffer.sample(False) td_map = {} for i in range(self.num_levels): td_map.update(self.policy[i].get_td_map_from_batch( obs0=obs0[i], actions=act[i], rewards=rew[i], obs1=obs1[i], terminals1=done[i])) return td_map def _negative_reward_fn(self): """Return True if the intrinsic reward returns negative values. Intrinsic reward functions with negative rewards incentivize early terminations, which we attempt to mitigate in the training operation by preventing early terminations from returning an expected return of 0. """ return "exp" not in self.intrinsic_reward_type \ and "non" not in self.intrinsic_reward_type def _pretrain_level(self, level): """Check whether the given level is still in its pre-training period. When using `pretrain_worker` the lowest level policy is trained every step, and higher level policies are incrementally unfrozen for a fraction of the training steps. The highest level policy is not trained in this case, but the checkpoints can later be used to continue training the entire hierarchy. Parameters ---------- level : int the level of the policy Returns ------- bool True if the level should not be trained and should perform random actions, False otherwise """ # number of steps to perform pretraining for a given level, assuming # pretrain_worker is set to True. pretrain_steps = self.total_steps * \ (self.num_levels - level - 1) / (self.num_levels - 1) if level == 0: # bug fix for the final step return self.pretrain_worker else: return self.pretrain_worker and (self._steps < pretrain_steps) # ======================================================================= # # Auxiliary methods for HIRO # # ======================================================================= # def _sample_best_meta_action(self, meta_obs0, meta_obs1, meta_action, worker_obses, worker_actions, k=10): """Return meta-actions that approximately maximize low-level log-probs. Parameters ---------- meta_obs0 : array_like (batch_size, m_obs_dim) matrix of meta observations meta_obs1 : array_like (batch_size, m_obs_dim) matrix of next time step meta observations meta_action : array_like (batch_size, m_ac_dim) matrix of meta actions worker_obses : array_like (batch_size, w_obs_dim, meta_period+1) matrix of current Worker state observations worker_actions : array_like (batch_size, w_ac_dim, meta_period) matrix of current Worker environmental actions k : int, optional number of goals returned, excluding the initial goal and the mean value Returns ------- array_like (batch_size, m_ac_dim) matrix of most likely meta actions """ batch_size, goal_dim = meta_action.shape # Collect several samples of potentially optimal goals. sampled_actions = self._sample(meta_obs0, meta_obs1, meta_action, k) assert sampled_actions.shape == (batch_size, goal_dim, k) # Compute the fitness of each candidate goal.
The fitness is the sum of # the log-probabilities of each action for the given goal. fitness = self._log_probs(sampled_actions, worker_obses, worker_actions) assert fitness.shape == (batch_size, k) # For each sample, choose the meta action that maximizes the fitness. indx = np.argmax(fitness, 1) best_goals = np.asarray( [sampled_actions[i, :, indx[i]] for i in range(batch_size)]) return best_goals def _sample(self, meta_obs0, meta_obs1, meta_action, num_samples, sc=0.5): """Sample different goals. The goals are sampled as follows: * The first num_samples-2 goals are acquired from a random Gaussian distribution centered at s_{t+c} - s_t. * The second to last goal is s_{t+c} - s_t. * The last goal is the originally sampled goal g_t. Parameters ---------- meta_obs0 : array_like (batch_size, m_obs_dim) matrix of meta observations meta_obs1 : array_like (batch_size, m_obs_dim) matrix of next time step meta observations meta_action : array_like (batch_size, m_ac_dim) matrix of meta actions num_samples : int number of samples sc : float scaling factor for the normal distribution. Returns ------- array_like (batch_size, goal_dim, num_samples) matrix of sampled goals Helps ----- * _sample_best_meta_action(self) """ batch_size, goal_dim = meta_action.shape goal_space = self.policy[0].ac_space spec_range = goal_space.high - goal_space.low random_samples = num_samples - 2 # Compute the mean and std for the Gaussian distribution to sample # from, and well as the maxima and minima. loc = meta_obs1[:, self.goal_indices] - meta_obs0[:, self.goal_indices] scale = [sc * spec_range / 2] minimum, maximum = [goal_space.low], [goal_space.high] new_loc = np.zeros((batch_size, goal_dim, random_samples)) new_scale = np.zeros((batch_size, goal_dim, random_samples)) for i in range(random_samples): new_loc[:, :, i] = loc new_scale[:, :, i] = scale new_minimum = np.zeros((batch_size, goal_dim, num_samples)) new_maximum = np.zeros((batch_size, goal_dim, num_samples)) for i in range(num_samples): new_minimum[:, :, i] = minimum new_maximum[:, :, i] = maximum # Generate random samples for the above distribution. normal_samples = np.random.normal(size=(random_samples * batch_size * goal_dim)) normal_samples = normal_samples.reshape( (batch_size, goal_dim, random_samples)) samples = np.zeros((batch_size, goal_dim, num_samples)) samples[:, :, :-2] = new_loc + normal_samples * new_scale samples[:, :, -2] = loc samples[:, :, -1] = meta_action # Clip the values based on the meta action space range. samples = np.minimum(np.maximum(samples, new_minimum), new_maximum) return samples def _log_probs(self, meta_actions, worker_obses, worker_actions): """Calculate the log probability of the next goal by the meta-policies. 
Parameters ---------- meta_actions : array_like (batch_size, m_ac_dim, num_samples) matrix of candidate higher- level policy actions worker_obses : array_like (batch_size, w_obs_dim, meta_period + 1) matrix of lower-level policy observations worker_actions : array_like (batch_size, w_ac_dim, meta_period) list of lower-level policy actions Returns ------- array_like (batch_size, num_samples) fitness associated with every state / action / goal pair Helps ----- * _sample_best_meta_action(self): """ raise NotImplementedError # ======================================================================= # # Auxiliary methods for HAC # # ======================================================================= # def _hindsight_actions_goals(self, initial_observations, initial_rewards): """Calculate hindsight goal and action transitions. These are then stored in the replay buffer along with the original (non-hindsight) sample. See the README at the front page of this repository for an in-depth description of this procedure. Parameters ---------- initial_observations : array_like the original worker observations with the non-hindsight goals appended to them initial_rewards : array_like the original intrinsic rewards Returns ------- array_like the goal at every step in hindsight array_like the modified intrinsic rewards taking into account the hindsight goals Helps ----- * store_transition(self): """ new_goals = [] observations = deepcopy(initial_observations) rewards = deepcopy(initial_rewards) hindsight_goal = 0 if self.relative_goals \ else observations[-1][self.goal_indices] obs_tp1 = observations[-1] for i in range(1, len(observations) + 1): obs_t = observations[-i] # Calculate the hindsight goal when using relative goals. # If not, the hindsight goal is simply a subset of the # final state observation. if self.relative_goals: hindsight_goal += \ obs_tp1[self.goal_indices] - obs_t[self.goal_indices] # Modify the Worker intrinsic rewards based on the new # hindsight goal. if i > 1: rewards[-(i - 1)] = self.intrinsic_reward_scale \ * self.intrinsic_reward_fn(obs_t, hindsight_goal, obs_tp1) obs_tp1 = deepcopy(obs_t) new_goals = [deepcopy(hindsight_goal)] + new_goals return new_goals, rewards # ======================================================================= # # Auxiliary methods for CHER # # ======================================================================= # def _setup_cooperative_gradients(self): """Create the cooperative gradients meta-policy optimizer.""" raise NotImplementedError def _cooperative_gradients_update(self, obs0, actions, rewards, obs1, terminals1, level_num, update_actor=True): """Perform the gradient update procedure for the CHER algorithm. This procedure is similar to update_from_batch, except that it runs the self.cg_optimizer operation instead of the policy object's optimizer, and utilizes some information from the worker samples as well.
Parameters ---------- obs0 : list of array_like (batch_size, obs_dim) matrix of observations for every level in the hierarchy actions : list of array_like (batch_size, ac_dim) matrix of actions for every level in the hierarchy obs1 : list of array_like (batch_size, obs_dim) matrix of next step observations for every level in the hierarchy rewards : list of array_like (batch_size,) vector of rewards for every level in the hierarchy terminals1 : list of numpy bool (batch_size,) vector of done masks for every level in the hierarchy level_num : int the hierarchy level number of the policy to optimize update_actor : bool specifies whether to update the actor policy of the meta policy. The critic policy is still updated if this value is set to False. Returns ------- [float, float] meta-policy critic loss float meta-policy actor loss """ raise NotImplementedError
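# ----------------------------------------------------------------------- #
# Illustrative sketch (not part of the library): a minimal, single-sample
# NumPy version of the off-policy correction performed by `_sample` and
# `_sample_best_meta_action` above. Batching, per-dimension scaling, and
# clipping to the goal space are omitted, and `log_prob_fn` stands in for
# the (subclass-specific) `_log_probs` fitness computation. All names here
# are hypothetical.
# ----------------------------------------------------------------------- #

import numpy as np


def example_relabel_goal(obs0, obs1, orig_goal, log_prob_fn, k=10, sc=0.5):
    """Pick the candidate goal that best explains the observed actions.

    Candidates: k-2 Gaussian samples centered at s_{t+c} - s_t, plus
    s_{t+c} - s_t itself, plus the originally issued goal.
    """
    delta = obs1 - obs0
    candidates = [delta + sc * np.random.normal(size=delta.shape)
                  for _ in range(k - 2)]
    candidates += [delta, orig_goal]
    # Score each candidate by the log-likelihood it assigns to the
    # lower-level actions that were actually taken, and keep the argmax.
    scores = [log_prob_fn(goal) for goal in candidates]
    return candidates[int(np.argmax(scores))]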
class TestHierReplayBuffer(unittest.TestCase): """Tests for the HierReplayBuffer object.""" def setUp(self): self.replay_buffer = HierReplayBuffer(buffer_size=2, batch_size=1, meta_period=1, meta_obs_dim=2, meta_ac_dim=3, worker_obs_dim=4, worker_ac_dim=5) def tearDown(self): del self.replay_buffer def test_init(self): """Validate that all the attributes were initialized properly.""" self.assertTupleEqual(self.replay_buffer.meta_obs0.shape, (1, 2)) self.assertTupleEqual(self.replay_buffer.meta_obs1.shape, (1, 2)) self.assertTupleEqual(self.replay_buffer.meta_act.shape, (1, 3)) self.assertTupleEqual(self.replay_buffer.meta_rew.shape, (1, )) self.assertTupleEqual(self.replay_buffer.meta_done.shape, (1, )) self.assertTupleEqual(self.replay_buffer.worker_obs0.shape, (1, 4)) self.assertTupleEqual(self.replay_buffer.worker_obs1.shape, (1, 4)) self.assertTupleEqual(self.replay_buffer.worker_act.shape, (1, 5)) self.assertTupleEqual(self.replay_buffer.worker_rew.shape, (1, )) self.assertTupleEqual(self.replay_buffer.worker_done.shape, (1, )) def test_buffer_size(self): """Validate the buffer_size output from the replay buffer.""" self.assertEqual(self.replay_buffer.buffer_size, 2) def test_add_sample(self): """Test the `add` and `sample` methods of the replay buffer.""" # Add an element. self.replay_buffer.add( obs_t=[np.array([0, 0, 0, 0]), np.array([1, 1, 1, 1])], goal_t=np.array([2, 2, 2]), action_t=[np.array([3, 3, 3, 3, 3])], reward_t=[4], done=[False], meta_obs_t=(np.array([5, 5]), np.array([6, 6])), meta_reward_t=7, ) # Check is_full in the False case. self.assertEqual(self.replay_buffer.is_full(), False) # Add an element. self.replay_buffer.add( obs_t=[np.array([0, 0, 0, 0]), np.array([1, 1, 1, 1])], goal_t=np.array([2, 2, 2]), action_t=[np.array([3, 3, 3, 3, 3])], reward_t=[4], done=[False], meta_obs_t=(np.array([5, 5]), np.array([6, 6])), meta_reward_t=7, ) # Check is_full in the True case. self.assertEqual(self.replay_buffer.is_full(), True) # Check can_sample in the True case. self.assertEqual(self.replay_buffer.can_sample(), True) # Test the `sample` method. meta_obs0, meta_obs1, meta_act, meta_rew, meta_done, worker_obs0, \ worker_obs1, worker_act, worker_rew, worker_done, _ = \ self.replay_buffer.sample() np.testing.assert_array_almost_equal(meta_obs0, [[5, 5]]) np.testing.assert_array_almost_equal(meta_obs1, [[6, 6]]) np.testing.assert_array_almost_equal(meta_act, [[2, 2, 2]]) np.testing.assert_array_almost_equal(meta_rew, [7]) np.testing.assert_array_almost_equal(meta_done, [0]) np.testing.assert_array_almost_equal(worker_obs0, [[0, 0, 0, 0]]) np.testing.assert_array_almost_equal(worker_obs1, [[1, 1, 1, 1]]) np.testing.assert_array_almost_equal(worker_act, [[3, 3, 3, 3, 3]]) np.testing.assert_array_almost_equal(worker_rew, [4]) np.testing.assert_array_almost_equal(worker_done, [0])
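# ----------------------------------------------------------------------- #
# Illustrative sketch (not part of the library): the backward recursion
# used by `_hindsight_actions_goals` for relative goals, written in plain
# NumPy. It assumes the goal indices span the whole observation and a unit
# intrinsic reward scale; `reward_fn` and the function name below are
# hypothetical placeholders.
# ----------------------------------------------------------------------- #

import numpy as np


def example_hindsight_relabel(observations, reward_fn):
    """Return per-step hindsight goals and the recomputed rewards."""
    goals, rewards = [], []
    hindsight_goal = np.zeros_like(observations[-1])  # g_T = 0
    obs_tp1 = observations[-1]
    # Walk backwards through the trajectory: g_t = g_{t+1} + s_{t+1} - s_t.
    for t in range(len(observations) - 1, -1, -1):
        obs_t = observations[t]
        hindsight_goal = hindsight_goal + obs_tp1 - obs_t
        if t < len(observations) - 1:
            # Reward each step for progress toward the achieved final state.
            rewards.insert(0, reward_fn(obs_t, hindsight_goal, obs_tp1))
        obs_tp1 = obs_t
        goals.insert(0, hindsight_goal.copy())
    return goals, rewards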