    def _get_actions_from_reward_layer(self, encoded_observation, mask):
        # Get the predicted expected reward.
        est_mean_reward = self._reward_layer(encoded_observation)
        if mask is None:
            greedy_actions = tf.argmax(est_mean_reward,
                                       axis=-1,
                                       output_type=tf.int32)
        else:
            greedy_actions = policy_utilities.masked_argmax(
                est_mean_reward, mask, output_type=tf.int32)

        # Add epsilon greedy on top, if needed.
        if self._epsilon_greedy:
            batch_size = (tf.compat.dimension_value(
                encoded_observation.shape[0])
                          or tf.shape(encoded_observation)[0])
            if mask is None:
                random_actions = tf.random.uniform([batch_size],
                                                   maxval=self._num_actions,
                                                   dtype=tf.int32)
            else:
                zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
                masked_categorical = masked.MaskedCategorical(zero_logits,
                                                              mask,
                                                              dtype=tf.int32)
                random_actions = masked_categorical.sample()

            rng = tf.random.uniform([batch_size], maxval=1.0)
            cond = tf.greater(rng, self._epsilon_greedy)
            chosen_actions = tf.compat.v1.where(cond, greedy_actions,
                                                random_actions)
        else:
            chosen_actions = greedy_actions

        return chosen_actions
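The epsilon-greedy blend above can be exercised on its own. A minimal standalone sketch, assuming TF-Agents' masked.MaskedCategorical and inlining the masked argmax with plain TF ops; the reward values, mask, and epsilon below are illustrative only:

import tensorflow as tf
from tf_agents.distributions import masked

est_mean_reward = tf.constant([[1.0, 3.0, 2.0]])  # (batch=1, num_actions=3)
mask = tf.constant([[1, 0, 1]], dtype=tf.int32)   # action 1 is disallowed
epsilon = 0.1

# Masked argmax: push disallowed actions to -inf before taking the argmax.
neg_inf = tf.fill(tf.shape(est_mean_reward), float('-inf'))
greedy_actions = tf.argmax(
    tf.where(tf.cast(mask, tf.bool), est_mean_reward, neg_inf),
    axis=-1, output_type=tf.int32)

# Uniform sampling over the allowed actions only, as in the snippet above.
random_actions = masked.MaskedCategorical(
    tf.zeros_like(est_mean_reward), mask, dtype=tf.int32).sample()

# With probability epsilon explore, otherwise act greedily; the masked
# action can never be chosen by either branch.
explore = tf.random.uniform([1]) < epsilon
chosen_actions = tf.where(explore, random_actions, greedy_actions)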
Example #2
  def _action(self, time_step, policy_state, seed: Optional[types.Seed] = None):
    del seed  # Unused; the seed was provided to the class at construction time.
    outer_dims = self._outer_dims
    if outer_dims is None:
      if self.time_step_spec.observation:
        outer_dims = nest_utils.get_outer_array_shape(
            time_step.observation, self.time_step_spec.observation)
      else:
        outer_dims = ()

    observation_and_action_constraint_splitter = (
        self.observation_and_action_constraint_splitter)

    if observation_and_action_constraint_splitter is not None:
      _, mask = observation_and_action_constraint_splitter(
          time_step.observation)

      zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
      masked_categorical = masked.MaskedCategorical(zero_logits, mask)
      random_action = tf.cast(
          masked_categorical.sample() + self.action_spec.minimum,
          self.action_spec.dtype)

      # If the action spec says each action should be shaped (1,), add another
      # dimension so the final shape is (B, 1) rather than (B,).
      if len(self.action_spec.shape) == 1:
        random_action = tf.expand_dims(random_action, axis=-1)
    else:
      random_action = array_spec.sample_spec_nest(
          self._action_spec, self._rng, outer_dims=outer_dims)

    info = array_spec.sample_spec_nest(
        self._info_spec, self._rng, outer_dims=outer_dims)

    return policy_step.PolicyStep(random_action, policy_state, info)
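For context, the observation_and_action_constraint_splitter assumed above is any callable that maps the raw observation to an (observation, mask) pair. A hedged sketch with hypothetical dict keys:

# Hypothetical observation layout: the network input lives under 'state',
# and a 0/1 action-validity vector lives under 'valid_actions'.
def observation_and_action_constraint_splitter(obs):
    return obs['state'], obs['valid_actions']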
Example #3
    def _action(self, time_step, policy_state, seed):
        observation_and_action_constraint_splitter = (
            self.observation_and_action_constraint_splitter)

        outer_dims = nest_utils.get_outer_shape(time_step,
                                                self._time_step_spec)
        if observation_and_action_constraint_splitter is not None:
            observation, mask = observation_and_action_constraint_splitter(
                time_step.observation)

            zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
            masked_categorical = masked.MaskedCategorical(zero_logits, mask)
            action_ = tf.cast(
                masked_categorical.sample() + self.action_spec.minimum,
                self.action_spec.dtype)

            # If the action spec says each action should be shaped (1,), add another
            # dimension so the final shape is (B, 1) rather than (B,).
            if self.action_spec.shape.rank == 1:
                action_ = tf.expand_dims(action_, axis=-1)
            policy_info = tensor_spec.sample_spec_nest(self._info_spec,
                                                       outer_dims=outer_dims)
        else:
            observation = time_step.observation

            action_ = tensor_spec.sample_spec_nest(self._action_spec,
                                                   seed=seed,
                                                   outer_dims=outer_dims)
            policy_info = tensor_spec.sample_spec_nest(self._info_spec,
                                                       outer_dims=outer_dims)
        if self._accepts_per_arm_features:

            def _gather_fn(t):
                return tf.gather(params=t, indices=action_, batch_dims=1)

            chosen_arm_features = tf.nest.map_structure(
                _gather_fn, observation['per_arm'])
            policy_info = policy_info._replace(
                chosen_arm_features=chosen_arm_features)

        # TODO(b/78181147): Investigate why this control dependency is required.
        if time_step is not None:
            with tf.control_dependencies(tf.nest.flatten(time_step)):
                action_ = tf.nest.map_structure(tf.identity, action_)

        if self.emit_log_probability:
            if observation_and_action_constraint_splitter is not None:
                log_probability = masked_categorical.log_prob(
                    action_ - self.action_spec.minimum)
            else:
                action_probability = tf.nest.map_structure(
                    _uniform_probability, self._action_spec)
                log_probability = tf.nest.map_structure(
                    tf.math.log, action_probability)
            policy_info = policy_step.set_log_probability(
                policy_info, log_probability)

        step = policy_step.PolicyStep(action_, policy_state, policy_info)
        return step
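The batch_dims=1 gather above picks, for each batch entry, the feature row of the arm that was actually chosen. A standalone illustration with made-up shapes:

import tensorflow as tf

per_arm = tf.reshape(tf.range(12.0), [2, 3, 2])  # (batch=2, num_arms=3, feature_dim=2)
action_ = tf.constant([2, 0])                    # chosen arm index per batch entry
chosen_arm_features = tf.gather(per_arm, action_, batch_dims=1)
# chosen_arm_features[0] == per_arm[0, 2]; chosen_arm_features[1] == per_arm[1, 0]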
Example #4
    def testCopy(self):
        """Confirm we can copy the distribution."""
        distribution = masked.MaskedCategorical([100.0, 100.0, 100.0],
                                                mask=[True, False, True])
        copy = distribution.copy()
        with self.cached_session() as s:
            probs_np = s.run(copy.probs_parameter())
            logits_np = s.run(copy.logits_parameter())
            ref_probs_np = s.run(distribution.probs_parameter())
            ref_logits_np = s.run(distribution.logits_parameter())
        self.assertAllEqual(ref_logits_np, logits_np)
        self.assertAllEqual(ref_probs_np, probs_np)
Example #5
    def _action(self, time_step, policy_state, seed):
        observation_and_action_constraint_splitter = (
            self.observation_and_action_constraint_splitter)

        if observation_and_action_constraint_splitter is not None:
            _, mask = observation_and_action_constraint_splitter(
                time_step.observation)

            zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
            masked_categorical = masked.MaskedCategorical(zero_logits, mask)
            # Modified to accommodate scalar action spaces:
            # action_ = tf.cast(masked_categorical.sample() + self.action_spec.minimum,
            #                   self.action_spec.dtype)
            action_ = tf.reshape(
                tf.cast(masked_categorical.sample() + self.action_spec.minimum,
                        self.action_spec.dtype), [1])

            # If the action spec says each action should be shaped (1,), add another
            # dimension so the final shape is (B, 1) rather than (B,).
            if self.action_spec.shape.rank == 1:
                action_ = tf.expand_dims(action_, axis=-1)
        else:
            outer_dims = nest_utils.get_outer_shape(time_step,
                                                    self._time_step_spec)

            action_ = tensor_spec.sample_spec_nest(self._action_spec,
                                                   seed=seed,
                                                   outer_dims=outer_dims)

        # TODO(b/78181147): Investigate why this control dependency is required.
        if time_step is not None:
            with tf.control_dependencies(tf.nest.flatten(time_step)):
                action_ = tf.nest.map_structure(tf.identity, action_)
        step = policy_step.PolicyStep(action_, policy_state)

        if self.emit_log_probability:
            if observation_and_action_constraint_splitter is not None:
                log_probability = masked_categorical.log_prob(
                    action_ - self.action_spec.minimum)
            else:
                action_probability = tf.nest.map_structure(
                    _uniform_probability, self._action_spec)
                log_probability = tf.nest.map_structure(
                    tf.math.log, action_probability)

            info = policy_step.PolicyInfo(log_probability=log_probability)
            return step._replace(info=info)

        return step
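PolicyStep is a namedtuple, so the _replace call above returns a new step with only the info field swapped. A small sketch of that pattern, with illustrative values:

import tensorflow as tf
from tf_agents.trajectories import policy_step

step = policy_step.PolicyStep(action=tf.constant([1]), state=())
info = policy_step.PolicyInfo(log_probability=tf.constant([-0.69]))
step = step._replace(info=info)  # the original step is left untouched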
Example #6
    def testMasking(self):
        distribution = masked.MaskedCategorical([100.0, 100.0, 100.0],
                                                mask=[True, False, True])
        sample = distribution.sample()
        results = []

        probs_tensor = distribution.probs
        logits_tensor = distribution.logits

        with self.cached_session() as s:
            probs_np = s.run(probs_tensor)
            logits_np = s.run(logits_tensor)

            # Draw samples & confirm we never draw a masked sample
            for _ in range(100):
                results.append(s.run(sample))

        self.assertAllEqual([0.5, 0, 0.5], probs_np)
        self.assertAllEqual([100, float('-inf'), 100], logits_np)
        self.assertNotIn(1, results)
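A companion check, under the same assumptions: a masked action has probability 0, so its log-probability should come out as -inf, which is what the emit_log_probability branches in the policies above rely on:

dist = masked.MaskedCategorical([0.0, 0.0, 0.0], mask=[True, False, True])
dist.log_prob(1)  # -inf: action 1 is masked out
dist.log_prob(0)  # log(0.5): the two allowed actions share the mass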
Example #7
    def _action(self, time_step, policy_state, seed):
        if time_step.observation['mask'] is not None:

            mask = time_step.observation['mask']

            zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
            masked_categorical = masked.MaskedCategorical(zero_logits, mask)
            action_ = tf.cast(masked_categorical.sample() + self.action_spec.minimum,
                                self.action_spec.dtype)

            # If the action spec says each action should be shaped (1,), add another
            # dimension so the final shape is (B, 1) rather than (B,).
            if self.action_spec.shape.rank == 1:
                action_ = tf.expand_dims(action_, axis=-1)
        else:
            outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)

            action_ = tensor_spec.sample_spec_nest(
                self._action_spec, seed=seed, outer_dims=outer_dims)

        if time_step is not None:
            with tf.control_dependencies(tf.nest.flatten(time_step)):
                action_ = tf.nest.map_structure(tf.identity, action_)

        policy_info = tensor_spec.sample_spec_nest(self._info_spec)

        if self.emit_log_probability:
            if time_step.observation['mask'] is not None:
                log_probability = masked_categorical.log_prob(
                    action_ - self.action_spec.minimum)
            else:
                # tf.nest.map_structure needs a callable per spec; under a
                # uniform policy each of the N valid actions has probability 1/N.
                def _uniform_probability(spec):
                    return tf.cast(
                        1.0 / (spec.maximum - spec.minimum + 1), tf.float32)

                action_probability = tf.nest.map_structure(
                    _uniform_probability, self._action_spec)
                log_probability = tf.nest.map_structure(
                    tf.math.log, action_probability)
            policy_info = policy_step.set_log_probability(policy_info, log_probability)

        step = policy_step.PolicyStep(action_, policy_state, policy_info)
        return step
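With the corrected _uniform_probability above, the uniform branch reduces to log(1/N) for a spec with N valid actions. A quick standalone sanity check; the spec bounds are illustrative:

import tensorflow as tf
from tf_agents.specs import tensor_spec

def uniform_probability(spec):
    # 1 / number of valid discrete actions in [minimum, maximum].
    return tf.cast(1.0 / (spec.maximum - spec.minimum + 1), tf.float32)

spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=3)
log_probability = tf.math.log(uniform_probability(spec))  # log(0.25)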
Example #8
    def _action(self, time_step, policy_state, seed):
        observation_and_action_constraint_splitter = (
            self.observation_and_action_constraint_splitter)

        outer_dims = nest_utils.get_outer_shape(time_step,
                                                self._time_step_spec)
        if observation_and_action_constraint_splitter is not None:
            observation, mask = observation_and_action_constraint_splitter(
                time_step.observation)

            action_spec = tensor_spec.from_spec(self.action_spec)
            action_spec = cast(tensor_spec.BoundedTensorSpec, action_spec)
            zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
            masked_categorical = masked.MaskedCategorical(zero_logits, mask)
            action_ = tf.cast(
                masked_categorical.sample() + action_spec.minimum,
                action_spec.dtype)

            # If the action spec says each action should be shaped (1,), add another
            # dimension so the final shape is (B, 1) rather than (B,).
            if action_spec.shape.rank == 1:
                action_ = tf.expand_dims(action_, axis=-1)
            policy_info = tensor_spec.sample_spec_nest(self._info_spec,
                                                       outer_dims=outer_dims)
        else:
            observation = time_step.observation
            action_spec = cast(tensor_spec.BoundedTensorSpec, self.action_spec)

            if self._accepts_per_arm_features:
                max_num_arms = action_spec.maximum - action_spec.minimum + 1
                batch_size = tf.shape(time_step.step_type)[0]
                num_actions = observation.get(
                    bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY,
                    tf.ones(shape=(batch_size, ), dtype=tf.int32) *
                    max_num_arms)
                mask = tf.sequence_mask(num_actions, max_num_arms)
                zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
                masked_categorical = masked.MaskedCategorical(
                    zero_logits, mask)
                action_ = tf.nest.map_structure(
                    lambda t: tf.cast(masked_categorical.sample() + t.minimum,
                                      t.dtype), action_spec)
            else:
                action_ = tensor_spec.sample_spec_nest(self._action_spec,
                                                       seed=seed,
                                                       outer_dims=outer_dims)

            policy_info = tensor_spec.sample_spec_nest(self._info_spec,
                                                       outer_dims=outer_dims)

        # Update policy info with chosen arm features.
        if self._accepts_per_arm_features:

            def _gather_fn(t):
                return tf.gather(params=t, indices=action_, batch_dims=1)

            chosen_arm_features = tf.nest.map_structure(
                _gather_fn, observation[bandit_spec_utils.PER_ARM_FEATURE_KEY])

            if policy_utilities.has_chosen_arm_features(self._info_spec):
                policy_info = policy_info._replace(
                    chosen_arm_features=chosen_arm_features)

        # TODO(b/78181147): Investigate why this control dependency is required.
        if time_step is not None:
            with tf.control_dependencies(tf.nest.flatten(time_step)):
                action_ = tf.nest.map_structure(tf.identity, action_)

        if self.emit_log_probability:
            if (self._accepts_per_arm_features
                    or observation_and_action_constraint_splitter is not None):
                action_spec = cast(tensor_spec.BoundedTensorSpec,
                                   self.action_spec)
                log_probability = masked_categorical.log_prob(
                    action_ - action_spec.minimum)
            else:
                log_probability = tf.nest.map_structure(
                    lambda s: _calculate_log_probability(outer_dims, s),
                    self._action_spec)
            policy_info = policy_step.set_log_probability(
                policy_info, log_probability)

        step = policy_step.PolicyStep(action_, policy_state, policy_info)
        return step
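The tf.sequence_mask call above turns per-row arm counts into a boolean validity mask. A minimal illustration with max_num_arms = 4:

import tensorflow as tf

num_actions = tf.constant([2, 4])        # arms actually available per row
mask = tf.sequence_mask(num_actions, 4)  # maxlen = max_num_arms
# [[ True,  True, False, False],
#  [ True,  True,  True,  True]]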