Example #1
    def _action(self, time_step, policy_state, seed):
        seed_stream = tfp.util.SeedStream(seed=seed, salt='epsilon_greedy')
        greedy_action = self._greedy_policy.action(time_step, policy_state)
        random_action = self._random_policy.action(time_step, (),
                                                   seed_stream())

        outer_shape = nest_utils.get_outer_shape(time_step,
                                                 self._time_step_spec)
        rng = tf.random.uniform(outer_shape,
                                maxval=1.0,
                                seed=seed_stream(),
                                name='epsilon_rng')
        cond = tf.greater(rng, self._get_epsilon())

        # Selects the action/info from the random policy with probability epsilon.
        # TODO(b/133175894): tf.compat.v1.where only supports a condition which is
        # either a scalar or a vector. Use tf.compat.v2 so that it can support any
        # condition whose leading dimensions are the same as the other operands of
        # tf.where.
        outer_ndims = int(outer_shape.shape[0])
        if outer_ndims >= 2:
            raise ValueError(
                'Only supports batched time steps with a single batch dimension'
            )
        action = tf.nest.map_structure(
            lambda g, r: tf.compat.v1.where(cond, g, r), greedy_action.action,
            random_action.action)

        if greedy_action.info:
            if not random_action.info:
                raise ValueError('Incompatible info field')
            info = nest_utils.where(cond, greedy_action.info,
                                    random_action.info)
            # Overwrite bandit policy info type.
            if policy_utilities.has_bandit_policy_type(info,
                                                       check_for_tensor=True):
                # Generate a mask of the same shape as bandit_policy_type
                # (batch_size, 1). This is the logical negation of `cond`, the
                # 1-D bool tensor (batch_size,) that is true when the greedy
                # policy was used.
                random_policy_mask = tf.reshape(
                    tf.logical_not(cond), tf.shape(info.bandit_policy_type))
                bandit_policy_type = policy_utilities.bandit_policy_uniform_mask(
                    info.bandit_policy_type, mask=random_policy_mask)
                info = policy_utilities.set_bandit_policy_type(
                    info, bandit_policy_type)
        else:
            if random_action.info:
                raise ValueError('Incompatible info field')
            info = ()

        # The state of the epsilon greedy policy is the state of the underlying
        # greedy policy (the random policy carries no state).
        # It is commonly assumed that the new policy state depends only on the
        # previous state and "time_step"; the action (be it the greedy one or
        # the random one) does not influence the new policy state.
        state = greedy_action.state

        return policy_step.PolicyStep(action, state, info)
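
The selection above is per batch element: cond is a 1-D boolean tensor with one uniform draw per example, and the greedy action is kept wherever the draw exceeds epsilon. A minimal standalone sketch of that mixing step (the shapes, values, and epsilon below are illustrative assumptions, not the TF-Agents API):

    import tensorflow as tf

    epsilon = 0.1
    # Assumed batch of 4 scalar actions from each policy (values are made up).
    greedy_actions = tf.constant([0, 1, 2, 3])
    random_actions = tf.constant([7, 7, 7, 7])

    # One uniform draw per batch element; keep the greedy action when the draw
    # exceeds epsilon, otherwise take the random action.
    rng = tf.random.uniform([4], maxval=1.0)
    cond = tf.greater(rng, epsilon)
    mixed_actions = tf.where(cond, greedy_actions, random_actions)
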
Example #2
    def testWhereSameRankDifferentDimension(self):
        condition = tf.convert_to_tensor([True, False, True])
        true_output = (tf.convert_to_tensor([1]), tf.convert_to_tensor([2]))
        false_output = (tf.convert_to_tensor([3, 4, 5]),
                        tf.convert_to_tensor([6, 7, 8]))

        result = nest_utils.where(condition, true_output, false_output)
        result = self.evaluate(result)

        expected = (np.array([1, 4, 1]), np.array([2, 7, 2]))
        self.assertAllEqual(expected, result)
Example #3
    def testWhere(self):
        condition = tf.convert_to_tensor([True, False, False, True, False])
        true_output = tf.nest.map_structure(
            tf.convert_to_tensor, (np.array([0] * 5), np.arange(1, 6)))
        false_output = tf.nest.map_structure(
            tf.convert_to_tensor, (np.array([1] * 5), np.arange(6, 11)))

        result = nest_utils.where(condition, true_output, false_output)
        result = self.evaluate(result)

        expected = (np.array([0, 1, 1, 0, 1]), np.array([1, 7, 8, 4, 10]))
        self.assertAllEqual(expected, result)
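
In testWhere the condition and every tensor in the nests share the same leading dimension, so the call behaves like mapping an elementwise tf.where over the structure. A rough equivalent with tf.nest.map_structure (an illustration, not the nest_utils implementation):

    import numpy as np
    import tensorflow as tf

    condition = tf.constant([True, False, False, True, False])
    true_output = (tf.constant([0] * 5), tf.constant(np.arange(1, 6)))
    false_output = (tf.constant([1] * 5), tf.constant(np.arange(6, 11)))

    # Apply the same per-element selection to every tensor in the tuple.
    result = tf.nest.map_structure(
        lambda t, f: tf.where(condition, t, f), true_output, false_output)
    # result -> ([0, 1, 1, 0, 1], [1, 7, 8, 4, 10])
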
Example #4
    def testWhereDifferentRanks(self):
        condition = tf.convert_to_tensor([True, False, False, True, False])
        true_output = tf.nest.map_structure(
            tf.convert_to_tensor,
            (np.reshape(np.array([0] * 10),
                        (5, 2)), np.reshape(np.arange(1, 11), (5, 2))))
        false_output = tf.nest.map_structure(
            tf.convert_to_tensor,
            (np.reshape(np.array([1] * 10),
                        (5, 2)), np.reshape(np.arange(12, 22), (5, 2))))

        result = nest_utils.where(condition, true_output, false_output)
        result = self.evaluate(result)

        expected = (np.array([[0, 0], [1, 1], [1, 1], [0, 0], [1, 1]]),
                    np.array([[1, 2], [14, 15], [16, 17], [7, 8], [20, 21]]))
        self.assertAllEqual(expected, result)
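
testWhereDifferentRanks relies on the 1-D condition being broadcast across the extra trailing dimension of the operands, so whole rows are selected per batch element. A standalone approximation using plain tf.where with an explicit reshape (again an illustration, not the nest_utils code):

    import tensorflow as tf

    condition = tf.constant([True, False, False, True, False])
    true_output = tf.zeros((5, 2), dtype=tf.int32)
    false_output = tf.ones((5, 2), dtype=tf.int32)

    # Reshape the condition to (5, 1) so it broadcasts over the trailing
    # dimension and selects entire rows.
    row_cond = tf.reshape(condition, (5, 1))
    result = tf.where(row_cond, true_output, false_output)
    # result -> [[0, 0], [1, 1], [1, 1], [0, 0], [1, 1]]
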
Example #5
    def _action(self, time_step, policy_state, seed):
        seed_stream = tfp.util.SeedStream(seed=seed, salt='epsilon_greedy')
        greedy_action = self._greedy_policy.action(time_step, policy_state)
        random_action = self._random_policy.action(time_step, (),
                                                   seed_stream())

        outer_shape = nest_utils.get_outer_shape(time_step,
                                                 self._time_step_spec)
        rng = tf.random.uniform(outer_shape,
                                maxval=1.0,
                                seed=seed_stream(),
                                name='epsilon_rng')
        cond = tf.greater(rng, self._get_epsilon())

        # Selects the action/info from the random policy with probability epsilon.
        # TODO(b/133175894): tf.compat.v1.where only supports a condition which is
        # either a scalar or a vector. Use tf.compat.v2 so that it can support any
        # condition whose leading dimensions are the same as the other operands of
        # tf.where.
        outer_ndims = int(outer_shape.shape[0])
        if outer_ndims >= 2:
            raise ValueError(
                'Only supports batched time steps with a single batch dimension'
            )
        action = tf.compat.v1.where(cond, greedy_action.action,
                                    random_action.action)

        if greedy_action.info:
            if not random_action.info:
                raise ValueError('Incompatible info field')
            info = nest_utils.where(cond, greedy_action.info,
                                    random_action.info)
        else:
            if random_action.info:
                raise ValueError('Incompatible info field')
            info = ()

        # The state of the epsilon greedy policy is the state of the underlying
        # greedy policy (the random policy carries no state).
        # It is commonly assumed that the new policy state depends only on the
        # previous state and "time_step"; the action (be it the greedy one or
        # the random one) does not influence the new policy state.
        state = greedy_action.state

        return policy_step.PolicyStep(action, state, info)
Example #6
    def _maybe_reset_state(self, time_step, policy_state):
        if policy_state is ():  # pylint: disable=literal-comparison
            return policy_state

        batch_size = tf.compat.dimension_value(time_step.discount.shape[0])
        if batch_size is None:
            batch_size = tf.shape(time_step.discount)[0]

        # Make sure we call this with a kwarg as it may be wrapped in tf.function
        # which would expect a tensor if it was not a kwarg.
        zero_state = self.get_initial_state(batch_size=batch_size)
        condition = time_step.is_first()
        # When experience is a sequence we only reset automatically for the first
        # time_step in the sequence as we can't easily generalize how the policy is
        # unrolled over the sequence.
        if nest_utils.get_outer_rank(time_step, self._time_step_spec) > 1:
            condition = time_step.is_first()[:, 0, ...]
        return nest_utils.where(condition, zero_state, policy_state)
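
In _maybe_reset_state the same per-element selection resets the recurrent state only for the batch entries whose time_step is the first of an episode. A simplified standalone sketch of that reset (the state shape and values are assumptions):

    import tensorflow as tf

    # Assumed carried recurrent state with shape (batch_size=3, state_size=2).
    policy_state = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    zero_state = tf.zeros_like(policy_state)

    # True for the batch entries that just started a new episode.
    is_first = tf.constant([True, False, True])

    # Reset only those rows; the condition broadcasts across the state dimension.
    state = tf.where(tf.reshape(is_first, (-1, 1)), zero_state, policy_state)
    # state -> [[0., 0.], [3., 4.], [0., 0.]]
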
  def _action(self, time_step, policy_state, seed):
    seed_stream = tfp.util.SeedStream(seed=seed, salt='epsilon_greedy')
    greedy_action = self._greedy_policy.action(time_step, policy_state)
    random_action = self._random_policy.action(time_step, (), seed_stream())

    outer_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)
    rng = tf.random.uniform(
        outer_shape, maxval=1.0, seed=seed_stream(), name='epsilon_rng')
    cond = tf.greater_equal(rng, self._get_epsilon())

    # Selects the action/info from the random policy with probability epsilon.
    # TODO(b/133175894): tf.compat.v1.where only supports a condition which is
    # either a scalar or a vector. Use tf.compat.v2 so that it can support any
    # condition whose leading dimensions are the same as the other operands of
    # tf.where.
    outer_ndims = int(outer_shape.shape[0])
    if outer_ndims >= 2:
      raise ValueError(
          'Only supports batched time steps with a single batch dimension')
    action = tf.nest.map_structure(lambda g, r: tf.compat.v1.where(cond, g, r),
                                   greedy_action.action, random_action.action)

    if greedy_action.info:
      if not random_action.info:
        raise ValueError('Incompatible info field')
      # Note that the objects in PolicyInfo may have different shapes, so we
      # need to call nest_utils.where() on each type of object.
      info = tf.nest.map_structure(lambda x, y: nest_utils.where(cond, x, y),
                                   greedy_action.info, random_action.info)
      if self._emit_log_probability:
        # At this point, info.log_probability contains the log prob of the
        # action chosen, conditioned on the policy that was chosen. We want to
        # emit the full log probability of the action, so we'll add in the log
        # probability of choosing the policy.
        random_log_prob = tf.nest.map_structure(
            lambda t: tf.math.log(tf.zeros_like(t) + self._get_epsilon()),
            info.log_probability)
        greedy_log_prob = tf.nest.map_structure(
            lambda t: tf.math.log(tf.ones_like(t) - self._get_epsilon()),
            random_log_prob)
        log_prob_of_chosen_policy = nest_utils.where(cond, greedy_log_prob,
                                                     random_log_prob)
        log_prob = tf.nest.map_structure(lambda a, b: a + b,
                                         log_prob_of_chosen_policy,
                                         info.log_probability)
        info = policy_step.set_log_probability(info, log_prob)
      # Overwrite bandit policy info type.
      if policy_utilities.has_bandit_policy_type(info, check_for_tensor=True):
        # Generate a mask of the same shape as bandit_policy_type
        # (batch_size, 1). This is the logical negation of `cond`, the 1-D
        # bool tensor (batch_size,) that is true when the greedy policy was
        # used.
        random_policy_mask = tf.reshape(tf.logical_not(cond),
                                        tf.shape(info.bandit_policy_type))  # pytype: disable=attribute-error
        bandit_policy_type = policy_utilities.bandit_policy_uniform_mask(
            info.bandit_policy_type, mask=random_policy_mask)  # pytype: disable=attribute-error
        info = policy_utilities.set_bandit_policy_type(
            info, bandit_policy_type)
    else:
      if random_action.info:
        raise ValueError('Incompatible info field')
      info = ()

    # The state of the epsilon greedy policy is the state of the underlying
    # greedy policy (the random policy carries no state).
    # It is commonly assumed that the new policy state depends only on the
    # previous state and "time_step"; the action (be it the greedy one or the
    # random one) does not influence the new policy state.
    state = greedy_action.state

    return policy_step.PolicyStep(action, state, info)
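
The log-probability branch in the last example adds two per-element terms: the log probability of having chosen the greedy or random policy (log(1 - epsilon) or log(epsilon)) and the log probability the chosen policy assigned to the action. A small numeric sketch of that combination with made-up values (an illustration under those assumptions, not the TF-Agents code):

    import tensorflow as tf

    epsilon = 0.1
    # Assumed per-example log prob of each action under the policy that chose it.
    log_prob_given_policy = tf.constant([-0.2, -1.5, -0.7])
    # True where the greedy policy was selected for that batch element.
    cond = tf.constant([True, False, True])

    # Log probability of having selected each policy.
    greedy_log_prob = tf.fill([3], tf.math.log(1.0 - epsilon))
    random_log_prob = tf.fill([3], tf.math.log(epsilon))
    log_prob_of_chosen_policy = tf.where(cond, greedy_log_prob, random_log_prob)

    # Emitted log probability: policy-choice term plus action-under-policy term.
    log_prob = log_prob_of_chosen_policy + log_prob_given_policy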