def test_state_indices(self):
    # non-relevant parameters for most tests
    params = dict(
        ob_space=Box(-1, 1, shape=(2,), dtype=np.float32),
        use_fingerprints=False,
        fingerprint_dim=1,
    )

    # test for AntMaze
    self.assertListEqual(
        get_state_indices(env_name="AntMaze", **params),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

    # test for AntGather
    self.assertListEqual(
        get_state_indices(env_name="AntGather", **params),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

    # test for AntPush
    self.assertListEqual(
        get_state_indices(env_name="AntPush", **params),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

    # test for AntFall
    self.assertListEqual(
        get_state_indices(env_name="AntFall", **params),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

    # test for UR5
    self.assertIsNone(get_state_indices(env_name="UR5", **params))

    # test for Pendulum
    self.assertListEqual(
        get_state_indices(env_name="Pendulum", **params),
        [0, 2])

    # test for ring
    self.assertListEqual(
        get_state_indices(env_name="ring", **params),
        [0, 5, 10, 15, 20])

    # test for ring_small
    self.assertListEqual(
        get_state_indices(env_name="ring_small", **params),
        [0])

    # test for merge0
    self.assertListEqual(
        get_state_indices(env_name="merge0", **params),
        [0, 5, 10, 15, 20])

    # test for merge1
    self.assertListEqual(
        get_state_indices(env_name="merge1", **params),
        [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60])

    # test for merge2
    self.assertListEqual(
        get_state_indices(env_name="merge2", **params),
        [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80])

    # test for figureeight0
    self.assertListEqual(
        get_state_indices(env_name="figureeight0", **params),
        [13])

    # test for figureeight1
    self.assertListEqual(
        get_state_indices(env_name="figureeight1", **params),
        [1, 3, 5, 7, 9, 11, 13])

    # test for figureeight2
    self.assertListEqual(
        get_state_indices(env_name="figureeight2", **params),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])

    # test for highway-single
    self.assertListEqual(
        get_state_indices(env_name="highway-single", **params),
        [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])

    # test for Point2DEnv
    self.assertListEqual(
        get_state_indices(env_name="Point2DEnv", **params),
        [0, 1])

    # test for Point2DImageEnv
    self.assertListEqual(
        get_state_indices(env_name="Point2DImageEnv", **params),
        [1024, 1025])
def test_state_indices(self):
    # non-relevant parameters for most tests
    params = dict(
        ob_space=Box(-1, 1, shape=(2,)),
        use_fingerprints=False,
        fingerprint_dim=1,
    )

    # test for AntMaze
    self.assertListEqual(
        get_state_indices(env_name="AntMaze", **params),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

    # test for AntGather
    self.assertListEqual(
        get_state_indices(env_name="AntGather", **params),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

    # test for AntPush
    self.assertListEqual(
        get_state_indices(env_name="AntPush", **params),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

    # test for AntFall
    self.assertListEqual(
        get_state_indices(env_name="AntFall", **params),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])

    # test for UR5
    self.assertIsNone(get_state_indices(env_name="UR5", **params))

    # test for Pendulum
    self.assertListEqual(
        get_state_indices(env_name="Pendulum", **params),
        [0, 2])

    # test for ring-v0
    self.assertListEqual(
        get_state_indices(env_name="ring-v0", **params),
        [0, 5, 10, 15, 20])

    # test for ring-v1
    self.assertListEqual(
        get_state_indices(env_name="ring-v1", **params),
        [0, 5, 10, 15, 20])

    # test for ring-v2
    self.assertListEqual(
        get_state_indices(env_name="ring-v2", **params),
        [0, 5, 10, 15, 20])

    # test for ring-imitation
    self.assertListEqual(
        get_state_indices(env_name="ring-imitation", **params),
        [0, 5, 10, 15, 20])

    # test for merge-v0
    self.assertListEqual(
        get_state_indices(env_name="merge-v0", **params),
        [0, 5, 10, 15, 20])

    # test for merge-v1
    self.assertListEqual(
        get_state_indices(env_name="merge-v1", **params),
        [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60])

    # test for merge-v2
    self.assertListEqual(
        get_state_indices(env_name="merge-v2", **params),
        [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80])

    # test for highway-v0
    self.assertListEqual(
        get_state_indices(env_name="highway-v0", **params),
        [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])

    # test for highway-v1
    self.assertListEqual(
        get_state_indices(env_name="highway-v1", **params),
        [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])

    # test for highway-v2
    self.assertListEqual(
        get_state_indices(env_name="highway-v2", **params),
        [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])

    # test for highway-imitation
    self.assertListEqual(
        get_state_indices(env_name="highway-imitation", **params),
        [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])

    # test for i210-v0
    self.assertListEqual(
        get_state_indices(env_name="i210-v0", **params),
        [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80,
         85, 90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150,
         155, 160, 165, 170, 175, 180, 185, 190, 195, 200, 205, 210, 215,
         220, 225, 230, 235, 240, 245])

    # test for i210-v1
    self.assertListEqual(
        get_state_indices(env_name="i210-v1", **params),
        [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80,
         85, 90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150,
         155, 160, 165, 170, 175, 180, 185, 190, 195, 200, 205, 210, 215,
         220, 225, 230, 235, 240, 245])

    # test for i210-v2
    self.assertListEqual(
        get_state_indices(env_name="i210-v2", **params),
        [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80,
         85, 90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150,
         155, 160, 165, 170, 175, 180, 185, 190, 195, 200, 205, 210, 215,
         220, 225, 230, 235, 240, 245])

    # test for Point2DEnv
    self.assertListEqual(
        get_state_indices(env_name="Point2DEnv", **params),
        [0, 1])

    # test for Point2DImageEnv
    self.assertListEqual(
        get_state_indices(env_name="Point2DImageEnv", **params),
        [1024, 1025])
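# --------------------------------------------------------------------------- #
# Standalone usage sketch (not part of the test suite above). It only
# illustrates how the indices returned by get_state_indices are typically
# consumed: they select the goal-relevant entries of a flat observation. The
# import path below is an assumption and may differ between library versions.
# --------------------------------------------------------------------------- #
import numpy as np
from gym.spaces import Box

from hbaselines.utils.env_util import get_state_indices  # assumed path

indices = get_state_indices(
    env_name="ring-v0",
    ob_space=Box(-1, 1, shape=(2,)),
    use_fingerprints=False,
    fingerprint_dim=1,
)
# For ring-v0 the test above expects [0, 5, 10, 15, 20]. Slicing a flat
# observation with these indices recovers the goal-relevant components.
obs = np.arange(25.0)
goal_relevant = obs[indices]  # -> array([ 0.,  5., 10., 15., 20.])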
def __init__(self,
             sess,
             ob_space,
             ac_space,
             co_space,
             buffer_size,
             batch_size,
             actor_lr,
             critic_lr,
             verbose,
             tau,
             gamma,
             use_huber,
             l2_penalty,
             model_params,
             num_levels,
             meta_period,
             intrinsic_reward_type,
             intrinsic_reward_scale,
             relative_goals,
             off_policy_corrections,
             hindsight,
             subgoal_testing_rate,
             cooperative_gradients,
             cg_weights,
             cg_delta,
             pretrain_worker,
             pretrain_path,
             pretrain_ckpt,
             total_steps,
             scope=None,
             env_name="",
             num_envs=1,
             meta_policy=None,
             worker_policy=None,
             additional_params=None):
    """Instantiate the goal-conditioned hierarchical policy.

    Parameters
    ----------
    sess : tf.compat.v1.Session
        the current TensorFlow session
    ob_space : gym.spaces.*
        the observation space of the environment
    ac_space : gym.spaces.*
        the action space of the environment
    co_space : gym.spaces.*
        the context space of the environment
    buffer_size : int
        the max number of transitions to store
    batch_size : int
        SGD batch size
    actor_lr : float
        actor learning rate
    critic_lr : float
        critic learning rate
    verbose : int
        the verbosity level: 0 none, 1 training information, 2 tensorflow
        debug
    tau : float
        target update rate
    gamma : float
        discount factor
    use_huber : bool
        specifies whether to use the huber distance function as the loss for
        the critic. If set to False, the mean-squared error metric is used
        instead
    l2_penalty : float
        L2 regularization penalty applied to the policy parameters
    model_params : dict
        dictionary of model-specific parameters. See parent class.
    num_levels : int
        number of levels within the hierarchy. Must be greater than 1. Two
        levels correspond to a Manager/Worker paradigm.
    meta_period : int
        meta-policy action period
    intrinsic_reward_type : str
        the reward function to be used by the worker. Must be one of:

        * "negative_distance": the negative two norm between the states and
          desired absolute or relative goals.
        * "scaled_negative_distance": similar to the negative distance reward
          where the states, goals, and next states are scaled by the inverse
          of the action space of the manager policy
        * "non_negative_distance": the negative two norm between the states
          and desired absolute or relative goals offset by the maximum goal
          space (to ensure non-negativity)
        * "scaled_non_negative_distance": similar to the non-negative
          distance reward where the states, goals, and next states are scaled
          by the inverse of the action space of the manager policy
        * "exp_negative_distance": equal to exp(-negative_distance^2). The
          result is a reward between 0 and 1. This is useful for policies
          that terminate early.
        * "scaled_exp_negative_distance": similar to the previous worker
          reward type but with states, actions, and next states that are
          scaled.
    intrinsic_reward_scale : float
        the value that the intrinsic reward should be scaled by
    relative_goals : bool
        specifies whether the goal issued by the higher-level policies is
        meant to be a relative or absolute goal, i.e. specific state or
        change in state
    off_policy_corrections : bool
        whether to use off-policy corrections during the update procedure.
        See: https://arxiv.org/abs/1805.08296
    hindsight : bool
        whether to include hindsight action and goal transitions in the
        replay buffer. See: https://arxiv.org/abs/1712.00948
    subgoal_testing_rate : float
        rate at which the original (non-hindsight) sample is stored in the
        replay buffer as well. Used only if `hindsight` is set to True.
    cooperative_gradients : bool
        whether to use the cooperative gradient update procedure for the
        higher-level policy.
        See: https://arxiv.org/abs/1912.02368v1
    cg_weights : float
        weights for the gradients of the loss of the lower-level policies
        with respect to the parameters of the higher-level policies. Only
        used if `cooperative_gradients` is set to True.
    cg_delta : float
        the desired lower-level expected returns. If set to None, a fixed
        Lagrangian specified by cg_weights is used instead. Only used if
        `cooperative_gradients` is set to True.
    pretrain_worker : bool
        specifies whether you are pre-training the lower-level policies.
        Actions by the high-level policy are randomly sampled from the action
        space.
    pretrain_path : str or None
        path to the pre-trained worker policy checkpoints
    pretrain_ckpt : int or None
        checkpoint number to use within the worker policy path. If set to
        None, the most recent checkpoint is used.
    total_steps : int
        Total number of timesteps used during training. Used by a subset of
        algorithms.
    scope : str or None
        an upper-level scope term. Used by policies that call this one.
    env_name : str
        the name of the environment. Used when computing the meta action
        space and the goal indices for the intrinsic reward.
    num_envs : int
        number of environments used to run simulations in parallel
    meta_policy : type [ hbaselines.base_policies.Policy ]
        the policy model to use for the meta policies
    worker_policy : type [ hbaselines.base_policies.Policy ]
        the policy model to use for the worker policy
    additional_params : dict
        additional algorithm-specific policy parameters. Used internally by
        the class when instantiating other (child) policies.
    """
    super(GoalConditionedPolicy, self).__init__(
        sess=sess,
        ob_space=ob_space,
        ac_space=ac_space,
        co_space=co_space,
        verbose=verbose,
        l2_penalty=l2_penalty,
        model_params=model_params,
        num_envs=num_envs,
    )

    assert num_levels >= 2, "num_levels must be greater than or equal to 2"

    self.num_levels = num_levels
    self.meta_period = meta_period
    self.intrinsic_reward_type = intrinsic_reward_type
    self.intrinsic_reward_scale = intrinsic_reward_scale
    self.relative_goals = relative_goals
    self.off_policy_corrections = off_policy_corrections
    self.hindsight = hindsight
    self.subgoal_testing_rate = subgoal_testing_rate
    self.cooperative_gradients = cooperative_gradients
    self.cg_weights = cg_weights
    self.cg_delta = cg_delta
    self.pretrain_worker = pretrain_worker
    self.pretrain_path = pretrain_path
    self.pretrain_ckpt = pretrain_ckpt
    self.total_steps = total_steps

    # Get the observation and action space of the higher level policies.
    meta_ac_space = get_meta_ac_space(
        ob_space=ob_space,
        relative_goals=relative_goals,
        env_name=env_name,
    )

    # =================================================================== #
    # Step 1: Create the policies for the individual levels.              #
    # =================================================================== #

    self.policy = []

    # The policies are ordered from the highest level to lowest level
    # policies in the hierarchy.
    for i in range(num_levels):
        # Determine the appropriate parameters to use for the policy in the
        # current level.
        policy_fn = meta_policy if i < (num_levels - 1) else worker_policy
        ac_space_i = meta_ac_space if i < (num_levels - 1) else ac_space
        co_space_i = co_space if i == 0 else meta_ac_space
        ob_space_i = ob_space

        with tf.compat.v1.variable_scope("level_{}".format(i)):
            # Compute the scope name based on any outer scope term.
            scope_i = "level_{}".format(i)
            if scope is not None:
                scope_i = "{}/{}".format(scope, scope_i)

            # Lower-level policies ignore the image component of the
            # observation and keep every flat channel; only the
            # highest-level policy uses the ignore_flat_channels and
            # ignore_image settings provided in model_params.
            model_params_i = model_params.copy()
            model_params_i.update({
                "ignore_flat_channels":
                    model_params["ignore_flat_channels"] if i < 1 else [],
                "ignore_image":
                    model_params["ignore_image"] if i < 1 else True,
            })

            # Create the next policy.
            self.policy.append(policy_fn(
                sess=sess,
                ob_space=ob_space_i,
                ac_space=ac_space_i,
                co_space=co_space_i,
                buffer_size=buffer_size,
                batch_size=batch_size,
                actor_lr=actor_lr,
                critic_lr=critic_lr,
                verbose=verbose,
                tau=tau,
                gamma=gamma,
                use_huber=use_huber,
                l2_penalty=l2_penalty,
                model_params=model_params_i,
                scope=scope_i,
                **(additional_params or {}),
            ))

    # =================================================================== #
    # Step 2: Create attributes for the replay buffer.                    #
    # =================================================================== #

    # Create the replay buffer.
    self.replay_buffer = HierReplayBuffer(
        buffer_size=int(buffer_size / meta_period),
        batch_size=batch_size,
        meta_period=meta_period,
        obs_dim=ob_space.shape[0],
        ac_dim=ac_space.shape[0],
        co_dim=None if co_space is None else co_space.shape[0],
        goal_dim=meta_ac_space.shape[0],
        num_levels=num_levels,
    )

    # current action by the meta-level policies
    self.meta_action = [
        [None for _ in range(num_levels - 1)] for _ in range(num_envs)]

    # a list of all the actions performed by each level in the hierarchy,
    # ordered from highest to lowest level policy. A separate element is
    # used for each environment.
    self._actions = [
        [[] for _ in range(self.num_levels)] for _ in range(num_envs)]

    # a list of the rewards (intrinsic or other) experienced by every level
    # in the hierarchy, ordered from highest to lowest level policy. A
    # separate element is used for each environment.
    self._rewards = [
        [[0]] + [[] for _ in range(self.num_levels - 1)]
        for _ in range(num_envs)]

    # a list of observations that stretch as long as the dilated horizon
    # chosen for the highest level policy. A separate element is used for
    # each environment.
    self._observations = [[] for _ in range(num_envs)]

    # the first and last contextual term. A separate element is used for
    # each environment.
    self._contexts = [[] for _ in range(num_envs)]

    # a list of done masks at every time step. A separate element is used
    # for each environment.
    self._dones = [[] for _ in range(num_envs)]

    # Collect the state indices for the intrinsic rewards.
    self.goal_indices = get_state_indices(ob_space, env_name)

    # Define the intrinsic reward function.
    if intrinsic_reward_type in [
            "negative_distance", "scaled_negative_distance",
            "non_negative_distance", "scaled_non_negative_distance",
            "exp_negative_distance", "scaled_exp_negative_distance"]:
        # Offset the distance measure by the maximum possible distance to
        # ensure non-negativity.
        if "non_negative" in intrinsic_reward_type:
            offset = np.sqrt(np.sum(np.square(
                meta_ac_space.high - meta_ac_space.low), -1))
        else:
            offset = 0

        # Scale the outputs from the state by the meta-action space if you
        # wish to scale the worker reward.
        if intrinsic_reward_type.startswith("scaled"):
            scale = 0.5 * (meta_ac_space.high - meta_ac_space.low)
        else:
            scale = 1

        def intrinsic_reward_fn(states, goals, next_states):
            return negative_distance(
                states=states[self.goal_indices] / scale,
                goals=goals / scale,
                next_states=next_states[self.goal_indices] / scale,
                relative_context=relative_goals,
                offset=0.0,
            ) + offset

        # Perform the exponential and squashing operations to keep the
        # intrinsic reward between 0 and 1.
if "exp" in intrinsic_reward_type: def exp_intrinsic_reward_fn(states, goals, next_states): # TODO: temporary span = sum( np.square(self.policy[0].ac_space.high - self.policy[0].ac_space.low)) rew = intrinsic_reward_fn(states, goals, next_states) return np.exp(-(rew / (span / 40))**2) self.intrinsic_reward_fn = exp_intrinsic_reward_fn else: self.intrinsic_reward_fn = intrinsic_reward_fn else: raise ValueError("Unknown intrinsic reward type: {}".format( intrinsic_reward_type)) # =================================================================== # # Step 3: Create algorithm-specific features. # # =================================================================== # # the number of get_action calls that have been performed. This is used # when pretraining the worker to incrementally train different levels # of the policy. self._steps = 0 # a fixed goal transition function for the meta-actions in between meta # periods. This is used when relative_goals is set to True in order to # maintain a fixed absolute position of the goal. if relative_goals: def goal_transition_fn(obs0, goal, obs1): return obs0 + goal - obs1 else: def goal_transition_fn(obs0, goal, obs1): return goal self.goal_transition_fn = goal_transition_fn if self.cooperative_gradients: if scope is None: self._setup_cooperative_gradients() else: with tf.compat.v1.variable_scope(scope): self._setup_cooperative_gradients()
def __init__(self,
             sess,
             ob_space,
             ac_space,
             co_space,
             buffer_size,
             batch_size,
             actor_lr,
             critic_lr,
             verbose,
             tau,
             gamma,
             layer_norm,
             layers,
             act_fun,
             use_huber,
             num_levels,
             meta_period,
             intrinsic_reward_type,
             intrinsic_reward_scale,
             relative_goals,
             off_policy_corrections,
             hindsight,
             subgoal_testing_rate,
             connected_gradients,
             cg_weights,
             use_fingerprints,
             fingerprint_range,
             centralized_value_functions,
             env_name="",
             meta_policy=None,
             worker_policy=None,
             additional_params=None):
    """Instantiate the goal-conditioned hierarchical policy.

    Parameters
    ----------
    sess : tf.compat.v1.Session
        the current TensorFlow session
    ob_space : gym.spaces.*
        the observation space of the environment
    ac_space : gym.spaces.*
        the action space of the environment
    co_space : gym.spaces.*
        the context space of the environment
    buffer_size : int
        the max number of transitions to store
    batch_size : int
        SGD batch size
    actor_lr : float
        actor learning rate
    critic_lr : float
        critic learning rate
    verbose : int
        the verbosity level: 0 none, 1 training information, 2 tensorflow
        debug
    tau : float
        target update rate
    gamma : float
        discount factor
    layer_norm : bool
        enable layer normalisation
    layers : list of int or None
        the size of the neural network for the policy
    act_fun : tf.nn.*
        the activation function to use in the neural network
    use_huber : bool
        specifies whether to use the huber distance function as the loss for
        the critic. If set to False, the mean-squared error metric is used
        instead
    num_levels : int
        number of levels within the hierarchy. Must be greater than 1. Two
        levels correspond to a Manager/Worker paradigm.
    meta_period : int
        meta-policy action period
    intrinsic_reward_type : str
        the reward function to be used by the worker. Must be one of:

        * "negative_distance": the negative two norm between the states and
          desired absolute or relative goals.
        * "scaled_negative_distance": similar to the negative distance reward
          where the states, goals, and next states are scaled by the inverse
          of the action space of the manager policy
        * "non_negative_distance": the negative two norm between the states
          and desired absolute or relative goals offset by the maximum goal
          space (to ensure non-negativity)
        * "scaled_non_negative_distance": similar to the non-negative
          distance reward where the states, goals, and next states are scaled
          by the inverse of the action space of the manager policy
        * "exp_negative_distance": equal to exp(-negative_distance^2). The
          result is a reward between 0 and 1. This is useful for policies
          that terminate early.
        * "scaled_exp_negative_distance": similar to the previous worker
          reward type but with states, actions, and next states that are
          scaled.
    intrinsic_reward_scale : float
        the value that the intrinsic reward should be scaled by
    relative_goals : bool
        specifies whether the goal issued by the higher-level policies is
        meant to be a relative or absolute goal, i.e. specific state or
        change in state
    off_policy_corrections : bool
        whether to use off-policy corrections during the update procedure.
        See: https://arxiv.org/abs/1805.08296
    hindsight : bool
        whether to include hindsight action and goal transitions in the
        replay buffer. See: https://arxiv.org/abs/1712.00948
    subgoal_testing_rate : float
        rate at which the original (non-hindsight) sample is stored in the
        replay buffer as well. Used only if `hindsight` is set to True.
    connected_gradients : bool
        whether to use the connected gradient actor update procedure for the
        higher-level policy.
        See: https://arxiv.org/abs/1912.02368v1
    cg_weights : float
        weights for the gradients of the loss of the lower-level policies
        with respect to the parameters of the higher-level policies. Only
        used if `connected_gradients` is set to True.
    use_fingerprints : bool
        specifies whether to add a time-dependent fingerprint to the
        observations
    fingerprint_range : (list of float, list of float)
        the low and high values for each fingerprint element, if they are
        being used
    centralized_value_functions : bool
        specifies whether to use centralized value functions
    meta_policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
        the policy model to use for the meta policies
    worker_policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
        the policy model to use for the worker policy
    additional_params : dict
        additional algorithm-specific policy parameters. Used internally by
        the class when instantiating other (child) policies.
    """
    super(GoalConditionedPolicy, self).__init__(
        sess=sess,
        ob_space=ob_space,
        ac_space=ac_space,
        co_space=co_space,
        buffer_size=buffer_size,
        batch_size=batch_size,
        actor_lr=actor_lr,
        critic_lr=critic_lr,
        verbose=verbose,
        tau=tau,
        gamma=gamma,
        layer_norm=layer_norm,
        layers=layers,
        act_fun=act_fun,
        use_huber=use_huber,
    )

    assert num_levels >= 2, "num_levels must be greater than or equal to 2"

    self.num_levels = num_levels
    self.meta_period = meta_period
    self.intrinsic_reward_type = intrinsic_reward_type
    self.intrinsic_reward_scale = intrinsic_reward_scale
    self.relative_goals = relative_goals
    self.off_policy_corrections = off_policy_corrections
    self.hindsight = hindsight
    self.subgoal_testing_rate = subgoal_testing_rate
    self.connected_gradients = connected_gradients
    self.cg_weights = cg_weights
    self.use_fingerprints = use_fingerprints
    self.fingerprint_range = fingerprint_range
    self.fingerprint_dim = (len(self.fingerprint_range[0]),)
    self.centralized_value_functions = centralized_value_functions

    # Get the observation and action space of the higher level policies.
    meta_ac_space = get_meta_ac_space(
        ob_space=ob_space,
        relative_goals=relative_goals,
        env_name=env_name,
        use_fingerprints=use_fingerprints,
        fingerprint_dim=self.fingerprint_dim,
    )

    # =================================================================== #
    # Step 1: Create the policies for the individual levels.              #
    # =================================================================== #

    self.policy = []

    # The policies are ordered from the highest level to lowest level
    # policies in the hierarchy.
    for i in range(num_levels):
        # Determine the appropriate parameters to use for the policy in the
        # current level.
        policy_fn = meta_policy if i < (num_levels - 1) else worker_policy
        ac_space_i = meta_ac_space if i < (num_levels - 1) else ac_space
        co_space_i = co_space if i == 0 else meta_ac_space
        ob_space_i = ob_space
        zero_fingerprint_i = i == (num_levels - 1)
        with tf.compat.v1.variable_scope("level_{}".format(i)):
            self.policy.append(policy_fn(
                sess=sess,
                ob_space=ob_space_i,
                ac_space=ac_space_i,
                co_space=co_space_i,
                buffer_size=buffer_size,
                batch_size=batch_size,
                actor_lr=actor_lr,
                critic_lr=critic_lr,
                verbose=verbose,
                tau=tau,
                gamma=gamma,
                layer_norm=layer_norm,
                layers=layers,
                act_fun=act_fun,
                use_huber=use_huber,
                scope="level_{}".format(i),
                zero_fingerprint=zero_fingerprint_i,
                fingerprint_dim=self.fingerprint_dim[0],
                **(additional_params or {}),
            ))

    # =================================================================== #
    # Step 2: Create attributes for the replay buffer.                    #
    # =================================================================== #

    # Create the replay buffer.
    self.replay_buffer = HierReplayBuffer(
        buffer_size=int(buffer_size / meta_period),
        batch_size=batch_size,
        meta_period=meta_period,
        obs_dim=ob_space.shape[0],
        ac_dim=ac_space.shape[0],
        co_dim=None if co_space is None else co_space.shape[0],
        goal_dim=meta_ac_space.shape[0],
        num_levels=num_levels,
    )

    # current action by the meta-level policies
    self._meta_action = [None for _ in range(num_levels - 1)]

    # a list of all the actions performed by each level in the hierarchy,
    # ordered from highest to lowest level policy
    self._actions = None

    # a list of the rewards (intrinsic or other) experienced by every level
    # in the hierarchy, ordered from highest to lowest level policy
    self._rewards = None

    # a list of observations that stretch as long as the dilated horizon
    # chosen for the highest level policy
    self._observations = None

    # the first and last contextual term
    self._contexts = None

    # a list of done masks at every time step
    self._dones = None

    # Collect the state indices for the intrinsic rewards.
    self.goal_indices = get_state_indices(
        ob_space=ob_space,
        env_name=env_name,
        use_fingerprints=use_fingerprints,
        fingerprint_dim=self.fingerprint_dim,
    )

    # Define the intrinsic reward function.
    if intrinsic_reward_type in [
            "negative_distance", "scaled_negative_distance",
            "non_negative_distance", "scaled_non_negative_distance",
            "exp_negative_distance", "scaled_exp_negative_distance"]:
        # Offset the distance measure by the maximum possible distance to
        # ensure non-negativity.
        if "non_negative" in intrinsic_reward_type:
            offset = np.sqrt(np.sum(np.square(
                meta_ac_space.high - meta_ac_space.low), -1))
        else:
            offset = 0

        # Scale the outputs from the state by the meta-action space if you
        # wish to scale the worker reward.
        if intrinsic_reward_type.startswith("scaled"):
            scale = 0.5 * (meta_ac_space.high - meta_ac_space.low)
        else:
            scale = 1

        def intrinsic_reward_fn(states, goals, next_states):
            return negative_distance(
                states=states[self.goal_indices] / scale,
                goals=goals / scale,
                next_states=next_states[self.goal_indices] / scale,
                relative_context=relative_goals,
                offset=0.0,
            ) + offset

        # Perform the exponential and squashing operations to keep the
        # intrinsic reward between 0 and 1.
        if "exp" in intrinsic_reward_type:
            def exp_intrinsic_reward_fn(states, goals, next_states):
                return np.exp(
                    -1 * intrinsic_reward_fn(states, goals, next_states) ** 2)

            self.intrinsic_reward_fn = exp_intrinsic_reward_fn
        else:
            self.intrinsic_reward_fn = intrinsic_reward_fn
    else:
        raise ValueError("Unknown intrinsic reward type: {}".format(
            intrinsic_reward_type))

    # =================================================================== #
    # Step 3: Create algorithm-specific features.                         #
    # =================================================================== #

    # a fixed goal transition function for the meta-actions in between meta
    # periods.
    # This is used when relative_goals is set to True in order to maintain a
    # fixed absolute position of the goal.
    if relative_goals:
        def goal_transition_fn(obs0, goal, obs1):
            return obs0 + goal - obs1
    else:
        def goal_transition_fn(obs0, goal, obs1):
            return goal

    self.goal_transition_fn = goal_transition_fn

    # Utility method for indexing the goal out of an observation variable.
    self.crop_to_goal = lambda g: tf.gather(
        g,
        tf.tile(tf.expand_dims(np.array(self.goal_indices), 0),
                [self.batch_size, 1]),
        batch_dims=1,
        axis=1)

    if self.connected_gradients:
        self._setup_connected_gradients()
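# --------------------------------------------------------------------------- #
# Standalone sketch (not part of the class above): what the crop_to_goal
# lambda computes. tf.gather with batch_dims=1 selects the goal_indices
# columns from every row of a batched observation tensor, i.e. the NumPy
# equivalent is obs[:, goal_indices]. The indices, shapes, and the use of
# eager execution (TensorFlow 2.x default) are illustrative assumptions.
# --------------------------------------------------------------------------- #
import numpy as np
import tensorflow as tf

goal_indices = [0, 5, 10, 15, 20]   # e.g. the ring-v0 indices tested above
batch_size = 4
obs = np.random.random((batch_size, 25)).astype(np.float32)

# Mirrors the tf.gather call used by crop_to_goal above.
gathered = tf.gather(
    obs,
    tf.tile(tf.expand_dims(np.array(goal_indices), 0), [batch_size, 1]),
    batch_dims=1,
    axis=1)

# Under eager execution this matches plain column indexing.
assert np.allclose(gathered, obs[:, goal_indices])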