def _action(self, time_step, policy_state, seed):
  seed_stream = tfp.util.SeedStream(seed=seed, salt='epsilon_greedy')
  greedy_action = self._greedy_policy.action(time_step, policy_state)
  random_action = self._random_policy.action(time_step, (), seed_stream())

  outer_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)
  rng = tf.random.uniform(
      outer_shape, maxval=1.0, seed=seed_stream(), name='epsilon_rng')
  cond = tf.greater(rng, self._get_epsilon())

  # Selects the action/info from the random policy with probability epsilon.
  # TODO(b/133175894): tf.compat.v1.where only supports a condition which is
  # either a scalar or a vector. Use tf.compat.v2 so that it can support any
  # condition whose leading dimensions are the same as the other operands of
  # tf.where.
  outer_ndims = int(outer_shape.shape[0])
  if outer_ndims >= 2:
    raise ValueError(
        'Only supports batched time steps with a single batch dimension')
  action = tf.nest.map_structure(
      lambda g, r: tf.compat.v1.where(cond, g, r),
      greedy_action.action, random_action.action)

  if greedy_action.info:
    if not random_action.info:
      raise ValueError('Incompatible info field')
    info = nest_utils.where(cond, greedy_action.info, random_action.info)
    # Overwrite bandit policy info type.
    if policy_utilities.has_bandit_policy_type(info, check_for_tensor=True):
      # Generate a mask of the same shape as bandit_policy_type
      # (batch_size, 1). This is the logical negation of `cond`, the 1-D bool
      # tensor (batch_size,) that is true where the greedy policy was used.
      random_policy_mask = tf.reshape(
          tf.logical_not(cond), tf.shape(info.bandit_policy_type))
      bandit_policy_type = policy_utilities.bandit_policy_uniform_mask(
          info.bandit_policy_type, mask=random_policy_mask)
      info = policy_utilities.set_bandit_policy_type(info, bandit_policy_type)
  else:
    if random_action.info:
      raise ValueError('Incompatible info field')
    info = ()

  # The state of the epsilon-greedy policy is the state of the underlying
  # greedy policy (the random policy carries no state). It is commonly
  # assumed that the new policy state depends only on the previous state and
  # `time_step`; the action (be it the greedy one or the random one) does not
  # influence the new policy state.
  state = greedy_action.state

  return policy_step.PolicyStep(action, state, info)
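A minimal, self-contained sketch of the per-batch-element mixing performed above, using plain tensors instead of real policies. The epsilon value, batch size, and action values are made up for illustration; only the `tf.compat.v1.where` selection mirrors the code.

import tensorflow as tf

# Hypothetical values for illustration; not taken from the policy above.
epsilon = 0.1
greedy_actions = tf.constant([[0], [1], [2]])   # shape (batch_size, 1)
random_actions = tf.constant([[5], [6], [7]])   # shape (batch_size, 1)

# One uniform draw per batch element; keep the greedy action where the draw
# exceeds epsilon, i.e. with probability 1 - epsilon.
rng = tf.random.uniform([3], maxval=1.0)
cond = tf.greater(rng, epsilon)

# tf.compat.v1.where accepts a rank-1 condition and selects whole rows of the
# higher-rank operands, which is why the single-batch-dimension check exists.
mixed_actions = tf.compat.v1.where(cond, greedy_actions, random_actions)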
def testWhereSameRankDifferentDimension(self):
  condition = tf.convert_to_tensor([True, False, True])
  true_output = (tf.convert_to_tensor([1]), tf.convert_to_tensor([2]))
  false_output = (tf.convert_to_tensor([3, 4, 5]),
                  tf.convert_to_tensor([6, 7, 8]))

  result = nest_utils.where(condition, true_output, false_output)
  result = self.evaluate(result)

  expected = (np.array([1, 4, 1]), np.array([2, 7, 2]))
  self.assertAllEqual(expected, result)
def testWhere(self):
  condition = tf.convert_to_tensor([True, False, False, True, False])
  true_output = tf.nest.map_structure(
      tf.convert_to_tensor, (np.array([0] * 5), np.arange(1, 6)))
  false_output = tf.nest.map_structure(
      tf.convert_to_tensor, (np.array([1] * 5), np.arange(6, 11)))

  result = nest_utils.where(condition, true_output, false_output)
  result = self.evaluate(result)

  expected = (np.array([0, 1, 1, 0, 1]), np.array([1, 7, 8, 4, 10]))
  self.assertAllEqual(expected, result)
def testWhereDifferentRanks(self):
  condition = tf.convert_to_tensor([True, False, False, True, False])
  true_output = tf.nest.map_structure(
      tf.convert_to_tensor,
      (np.reshape(np.array([0] * 10), (5, 2)),
       np.reshape(np.arange(1, 11), (5, 2))))
  false_output = tf.nest.map_structure(
      tf.convert_to_tensor,
      (np.reshape(np.array([1] * 10), (5, 2)),
       np.reshape(np.arange(12, 22), (5, 2))))

  result = nest_utils.where(condition, true_output, false_output)
  result = self.evaluate(result)

  expected = (np.array([[0, 0], [1, 1], [1, 1], [0, 0], [1, 1]]),
              np.array([[1, 2], [14, 15], [16, 17], [7, 8], [20, 21]]))
  self.assertAllEqual(expected, result)
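The tests above exercise `nest_utils.where` without showing its body. Below is a minimal sketch that satisfies these three tests; it is an assumption about how such a helper could be written, not the TF-Agents implementation. It maps `tf.where` over the nests and reshapes the rank-1 condition so it broadcasts against higher-rank operands.

import tensorflow as tf


def where(condition, true_outputs, false_outputs):
  """Sketch of a nested tf.where; assumes `condition` is a rank-1 bool tensor."""
  def _where(t, f):
    # Append singleton dimensions to the condition so it broadcasts against
    # operands whose rank exceeds 1 (e.g. the (5, 2) tensors in the tests).
    rank = max(t.shape.rank, f.shape.rank)
    cond = tf.reshape(condition, [-1] + [1] * (rank - 1))
    return tf.where(cond, t, f)

  return tf.nest.map_structure(_where, true_outputs, false_outputs)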
def _action(self, time_step, policy_state, seed):
  seed_stream = tfp.util.SeedStream(seed=seed, salt='epsilon_greedy')
  greedy_action = self._greedy_policy.action(time_step, policy_state)
  random_action = self._random_policy.action(time_step, (), seed_stream())

  outer_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)
  rng = tf.random.uniform(
      outer_shape, maxval=1.0, seed=seed_stream(), name='epsilon_rng')
  cond = tf.greater(rng, self._get_epsilon())

  # Selects the action/info from the random policy with probability epsilon.
  # TODO(b/133175894): tf.compat.v1.where only supports a condition which is
  # either a scalar or a vector. Use tf.compat.v2 so that it can support any
  # condition whose leading dimensions are the same as the other operands of
  # tf.where.
  outer_ndims = int(outer_shape.shape[0])
  if outer_ndims >= 2:
    raise ValueError(
        'Only supports batched time steps with a single batch dimension')
  action = tf.compat.v1.where(
      cond, greedy_action.action, random_action.action)

  if greedy_action.info:
    if not random_action.info:
      raise ValueError('Incompatible info field')
    info = nest_utils.where(cond, greedy_action.info, random_action.info)
  else:
    if random_action.info:
      raise ValueError('Incompatible info field')
    info = ()

  # The state of the epsilon-greedy policy is the state of the underlying
  # greedy policy (the random policy carries no state). It is commonly
  # assumed that the new policy state depends only on the previous state and
  # `time_step`; the action (be it the greedy one or the random one) does not
  # influence the new policy state.
  state = greedy_action.state

  return policy_step.PolicyStep(action, state, info)
def _maybe_reset_state(self, time_step, policy_state):
  if policy_state is ():  # pylint: disable=literal-comparison
    return policy_state

  batch_size = tf.compat.dimension_value(time_step.discount.shape[0])
  if batch_size is None:
    batch_size = tf.shape(time_step.discount)[0]

  # Make sure we call this with a kwarg as it may be wrapped in tf.function
  # which would expect a tensor if it was not a kwarg.
  zero_state = self.get_initial_state(batch_size=batch_size)
  condition = time_step.is_first()
  # When experience is a sequence we only reset automatically for the first
  # time_step in the sequence as we can't easily generalize how the policy is
  # unrolled over the sequence.
  if nest_utils.get_outer_rank(time_step, self._time_step_spec) > 1:
    condition = time_step.is_first()[:, 0, ...]
  return nest_utils.where(condition, zero_state, policy_state)
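A small usage sketch of the reset logic above, with plain tensors standing in for a real policy state; the state structure and values are hypothetical. Batch elements whose time step is the first of an episode get the freshly initialized state, while the rest keep their carried-over state.

import tensorflow as tf
from tf_agents.utils import nest_utils

# Hypothetical per-batch reset flags and states (batch_size = 3).
is_first = tf.constant([True, False, True])
zero_state = {'hidden': tf.zeros([3, 2])}        # what get_initial_state would return
prev_state = {'hidden': 5.0 * tf.ones([3, 2])}   # state carried over from the last step

# Rows 0 and 2 are reset to zeros; row 1 keeps its previous value.
new_state = nest_utils.where(is_first, zero_state, prev_state)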
def _action(self, time_step, policy_state, seed):
  seed_stream = tfp.util.SeedStream(seed=seed, salt='epsilon_greedy')
  greedy_action = self._greedy_policy.action(time_step, policy_state)
  random_action = self._random_policy.action(time_step, (), seed_stream())

  outer_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)
  rng = tf.random.uniform(
      outer_shape, maxval=1.0, seed=seed_stream(), name='epsilon_rng')
  cond = tf.greater_equal(rng, self._get_epsilon())

  # Selects the action/info from the random policy with probability epsilon.
  # TODO(b/133175894): tf.compat.v1.where only supports a condition which is
  # either a scalar or a vector. Use tf.compat.v2 so that it can support any
  # condition whose leading dimensions are the same as the other operands of
  # tf.where.
  outer_ndims = int(outer_shape.shape[0])
  if outer_ndims >= 2:
    raise ValueError(
        'Only supports batched time steps with a single batch dimension')
  action = tf.nest.map_structure(
      lambda g, r: tf.compat.v1.where(cond, g, r),
      greedy_action.action, random_action.action)

  if greedy_action.info:
    if not random_action.info:
      raise ValueError('Incompatible info field')
    # Note that the objects in PolicyInfo may have different shapes, so we
    # need to call nest_utils.where() on each type of object.
    info = tf.nest.map_structure(lambda x, y: nest_utils.where(cond, x, y),
                                 greedy_action.info, random_action.info)
    if self._emit_log_probability:
      # At this point, info.log_probability contains the log prob of the
      # action chosen, conditioned on the policy that was chosen. We want to
      # emit the full log probability of the action, so we'll add in the log
      # probability of choosing the policy.
      random_log_prob = tf.nest.map_structure(
          lambda t: tf.math.log(tf.zeros_like(t) + self._get_epsilon()),
          info.log_probability)
      greedy_log_prob = tf.nest.map_structure(
          lambda t: tf.math.log(tf.ones_like(t) - self._get_epsilon()),
          random_log_prob)
      log_prob_of_chosen_policy = nest_utils.where(
          cond, greedy_log_prob, random_log_prob)
      log_prob = tf.nest.map_structure(lambda a, b: a + b,
                                       log_prob_of_chosen_policy,
                                       info.log_probability)
      info = policy_step.set_log_probability(info, log_prob)
    # Overwrite bandit policy info type.
    if policy_utilities.has_bandit_policy_type(info, check_for_tensor=True):
      # Generate a mask of the same shape as bandit_policy_type
      # (batch_size, 1). This is the logical negation of `cond`, the 1-D bool
      # tensor (batch_size,) that is true where the greedy policy was used.
      random_policy_mask = tf.reshape(
          tf.logical_not(cond),
          tf.shape(info.bandit_policy_type))  # pytype: disable=attribute-error
      bandit_policy_type = policy_utilities.bandit_policy_uniform_mask(
          info.bandit_policy_type,  # pytype: disable=attribute-error
          mask=random_policy_mask)
      info = policy_utilities.set_bandit_policy_type(info, bandit_policy_type)
  else:
    if random_action.info:
      raise ValueError('Incompatible info field')
    info = ()

  # The state of the epsilon-greedy policy is the state of the underlying
  # greedy policy (the random policy carries no state). It is commonly
  # assumed that the new policy state depends only on the previous state and
  # `time_step`; the action (be it the greedy one or the random one) does not
  # influence the new policy state.
  state = greedy_action.state

  return policy_step.PolicyStep(action, state, info)
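A small numeric illustration of the log-probability correction in the `_emit_log_probability` branch above. The probabilities are made up; the point is that the emitted value is the log probability of picking the sub-policy plus the log probability of the action under that sub-policy.

import math

# Hypothetical values for illustration only.
epsilon = 0.1
log_p_action_given_random = math.log(0.25)  # info.log_probability when the random policy was used
log_p_action_given_greedy = math.log(1.0)   # info.log_probability when the greedy policy was used

# Random branch: log(epsilon) + log P(a | random policy).
log_p_random_branch = math.log(epsilon) + log_p_action_given_random
# Greedy branch: log(1 - epsilon) + log P(a | greedy policy).
log_p_greedy_branch = math.log(1.0 - epsilon) + log_p_action_given_greedy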