def _get_actions_from_reward_layer(self, encoded_observation, mask):
  # Get the predicted expected reward.
  est_mean_reward = self._reward_layer(encoded_observation)
  if mask is None:
    greedy_actions = tf.argmax(est_mean_reward, axis=-1, output_type=tf.int32)
  else:
    greedy_actions = policy_utilities.masked_argmax(
        est_mean_reward, mask, output_type=tf.int32)
  # Add epsilon greedy on top, if needed.
  if self._epsilon_greedy:
    batch_size = (tf.compat.dimension_value(encoded_observation.shape[0]) or
                  tf.shape(encoded_observation)[0])
    if mask is None:
      random_actions = tf.random.uniform(
          [batch_size], maxval=self._num_actions, dtype=tf.int32)
    else:
      zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
      masked_categorical = masked.MaskedCategorical(
          zero_logits, mask, dtype=tf.int32)
      random_actions = masked_categorical.sample()
    rng = tf.random.uniform([batch_size], maxval=1.0)
    cond = tf.greater(rng, self._epsilon_greedy)
    chosen_actions = tf.compat.v1.where(cond, greedy_actions, random_actions)
  else:
    chosen_actions = greedy_actions
  return chosen_actions
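For reference, a minimal self-contained sketch of the same epsilon-greedy mixing over a masked action set, assuming tf_agents is installed. The reward values, mask, and batch size are made up, and the explicit -inf trick stands in for what policy_utilities.masked_argmax does internally.

import tensorflow as tf
from tf_agents.distributions import masked

est_mean_reward = tf.constant([[1.0, 5.0, 3.0],
                               [2.0, 0.5, 4.0]])  # (B=2, num_actions=3)
mask = tf.constant([[1, 0, 1],
                    [1, 1, 0]], dtype=tf.int32)   # 0 marks disallowed actions
epsilon = 0.1

# Greedy actions over allowed entries only: push masked rewards to -inf.
neg_inf = tf.fill(tf.shape(est_mean_reward), -float('inf'))
masked_reward = tf.where(tf.cast(mask, tf.bool), est_mean_reward, neg_inf)
greedy_actions = tf.argmax(masked_reward, axis=-1, output_type=tf.int32)

# Uniform random actions over allowed entries only.
zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
random_actions = masked.MaskedCategorical(
    zero_logits, mask, dtype=tf.int32).sample()

# With probability epsilon take the random action, otherwise the greedy one.
take_greedy = tf.greater(tf.random.uniform([2]), epsilon)
chosen_actions = tf.where(take_greedy, greedy_actions, random_actions)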
def _action(self, time_step, policy_state, seed: Optional[types.Seed] = None):
  del seed  # Unused. Seed passed to the class.
  outer_dims = self._outer_dims
  if outer_dims is None:
    if self.time_step_spec.observation:
      outer_dims = nest_utils.get_outer_array_shape(
          time_step.observation, self.time_step_spec.observation)
    else:
      outer_dims = ()
  observation_and_action_constraint_splitter = (
      self.observation_and_action_constraint_splitter)
  if observation_and_action_constraint_splitter is not None:
    _, mask = observation_and_action_constraint_splitter(time_step.observation)
    zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
    masked_categorical = masked.MaskedCategorical(zero_logits, mask)
    random_action = tf.cast(
        masked_categorical.sample() + self.action_spec.minimum,
        self.action_spec.dtype)
    # If the action spec says each action should be shaped (1,), add another
    # dimension so the final shape is (B, 1) rather than (B,).
    if len(self.action_spec.shape) == 1:
      random_action = tf.expand_dims(random_action, axis=-1)
  else:
    random_action = array_spec.sample_spec_nest(
        self._action_spec, self._rng, outer_dims=outer_dims)
  info = array_spec.sample_spec_nest(
      self._info_spec, self._rng, outer_dims=outer_dims)
  return policy_step.PolicyStep(random_action, policy_state, info)
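Several of these policies rely on an observation_and_action_constraint_splitter. A minimal sketch of one, assuming dict observations with hypothetical 'state' and 'mask' keys; any callable that returns an (observation, mask) pair fits the contract.

import tensorflow as tf

def splitter(obs):
  # Return the network input and the 0/1 action-constraint mask separately.
  return obs['state'], obs['mask']

obs = {'state': tf.constant([[0.1, 0.2]]),
       'mask': tf.constant([[1, 0, 1]], dtype=tf.int32)}
state, mask = splitter(obs)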
def _action(self, time_step, policy_state, seed):
  observation_and_action_constraint_splitter = (
      self.observation_and_action_constraint_splitter)
  outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)
  if observation_and_action_constraint_splitter is not None:
    observation, mask = observation_and_action_constraint_splitter(
        time_step.observation)
    zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
    masked_categorical = masked.MaskedCategorical(zero_logits, mask)
    action_ = tf.cast(masked_categorical.sample() + self.action_spec.minimum,
                      self.action_spec.dtype)
    # If the action spec says each action should be shaped (1,), add another
    # dimension so the final shape is (B, 1) rather than (B,).
    if self.action_spec.shape.rank == 1:
      action_ = tf.expand_dims(action_, axis=-1)
    policy_info = tensor_spec.sample_spec_nest(
        self._info_spec, outer_dims=outer_dims)
  else:
    observation = time_step.observation
    action_ = tensor_spec.sample_spec_nest(
        self._action_spec, seed=seed, outer_dims=outer_dims)
    policy_info = tensor_spec.sample_spec_nest(
        self._info_spec, outer_dims=outer_dims)

  if self._accepts_per_arm_features:

    def _gather_fn(t):
      return tf.gather(params=t, indices=action_, batch_dims=1)

    chosen_arm_features = tf.nest.map_structure(_gather_fn,
                                                observation['per_arm'])
    policy_info = policy_info._replace(chosen_arm_features=chosen_arm_features)

  # TODO(b/78181147): Investigate why this control dependency is required.
  if time_step is not None:
    with tf.control_dependencies(tf.nest.flatten(time_step)):
      action_ = tf.nest.map_structure(tf.identity, action_)

  if self.emit_log_probability:
    if observation_and_action_constraint_splitter is not None:
      log_probability = masked_categorical.log_prob(
          action_ - self.action_spec.minimum)
    else:
      action_probability = tf.nest.map_structure(_uniform_probability,
                                                 self._action_spec)
      log_probability = tf.nest.map_structure(tf.math.log, action_probability)
    policy_info = policy_step.set_log_probability(policy_info, log_probability)

  step = policy_step.PolicyStep(action_, policy_state, policy_info)
  return step
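A minimal sketch of what _gather_fn above computes; the per-arm feature values are made up. With batch_dims=1, row b of the result is per_arm[b, action_[b]], i.e. the feature vector of the arm chosen for that batch row.

import tensorflow as tf

per_arm = tf.constant([[[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]],
                       [[4.0, 4.0], [5.0, 5.0], [6.0, 6.0]]])  # (B=2, arms=3, dim=2)
action_ = tf.constant([2, 0], dtype=tf.int32)  # one chosen arm per batch row
chosen_arm_features = tf.gather(params=per_arm, indices=action_, batch_dims=1)
# -> [[3., 3.], [4., 4.]]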
def testCopy(self):
  """Confirm we can copy the distribution."""
  distribution = masked.MaskedCategorical([100.0, 100.0, 100.0],
                                          mask=[True, False, True])
  copy = distribution.copy()
  with self.cached_session() as s:
    probs_np = s.run(copy.probs_parameter())
    logits_np = s.run(copy.logits_parameter())
    ref_probs_np = s.run(distribution.probs_parameter())
    ref_logits_np = s.run(distribution.logits_parameter())
  self.assertAllEqual(ref_logits_np, logits_np)
  self.assertAllEqual(ref_probs_np, probs_np)
def _action(self, time_step, policy_state, seed):
  observation_and_action_constraint_splitter = (
      self.observation_and_action_constraint_splitter)
  if observation_and_action_constraint_splitter is not None:
    _, mask = observation_and_action_constraint_splitter(time_step.observation)
    zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
    masked_categorical = masked.MaskedCategorical(zero_logits, mask)
    # Modified to accommodate scalar action spaces:
    # action_ = tf.cast(masked_categorical.sample() + self.action_spec.minimum,
    #                   self.action_spec.dtype)
    action_ = tf.reshape(
        tf.cast(masked_categorical.sample() + self.action_spec.minimum,
                self.action_spec.dtype), [1])
    # If the action spec says each action should be shaped (1,), add another
    # dimension so the final shape is (B, 1) rather than (B,).
    if self.action_spec.shape.rank == 1:
      action_ = tf.expand_dims(action_, axis=-1)
  else:
    outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)
    action_ = tensor_spec.sample_spec_nest(
        self._action_spec, seed=seed, outer_dims=outer_dims)
  # TODO(b/78181147): Investigate why this control dependency is required.
  if time_step is not None:
    with tf.control_dependencies(tf.nest.flatten(time_step)):
      action_ = tf.nest.map_structure(tf.identity, action_)
  step = policy_step.PolicyStep(action_, policy_state)
  if self.emit_log_probability:
    if observation_and_action_constraint_splitter is not None:
      log_probability = masked_categorical.log_prob(
          action_ - self.action_spec.minimum)
    else:
      action_probability = tf.nest.map_structure(_uniform_probability,
                                                 self._action_spec)
      log_probability = tf.nest.map_structure(tf.math.log, action_probability)
    info = policy_step.PolicyInfo(log_probability=log_probability)
    return step._replace(info=info)
  return step
def testMasking(self):
  distribution = masked.MaskedCategorical([100.0, 100.0, 100.0],
                                          mask=[True, False, True])
  sample = distribution.sample()
  results = []
  probs_tensor = distribution.probs
  logits_tensor = distribution.logits
  with self.cached_session() as s:
    probs_np = s.run(probs_tensor)
    logits_np = s.run(logits_tensor)
    # Draw samples & confirm we never draw a masked sample.
    for _ in range(100):
      results.append(s.run(sample))
  self.assertAllEqual([0.5, 0, 0.5], probs_np)
  self.assertAllEqual([100, float('-inf'), 100], logits_np)
  self.assertNotIn(1, results)
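The same check written against eager-mode TF2, for reference; this assumes tf_agents is installed and uses the probs_parameter() accessor, as testCopy above does. Masked entries get logit -inf and hence probability 0, so the masked index is never sampled.

import tensorflow as tf
from tf_agents.distributions import masked

distribution = masked.MaskedCategorical([100.0, 100.0, 100.0],
                                        mask=[True, False, True])
print(distribution.probs_parameter().numpy())  # [0.5 0.  0.5]
samples = distribution.sample(100).numpy()
assert 1 not in samples  # the masked index is never drawn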
def _action(self, time_step, policy_state, seed):
  if time_step.observation['mask'] is not None:
    mask = time_step.observation['mask']
    zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
    masked_categorical = masked.MaskedCategorical(zero_logits, mask)
    action_ = tf.cast(masked_categorical.sample() + self.action_spec.minimum,
                      self.action_spec.dtype)
    # If the action spec says each action should be shaped (1,), add another
    # dimension so the final shape is (B, 1) rather than (B,).
    if self.action_spec.shape.rank == 1:
      action_ = tf.expand_dims(action_, axis=-1)
  else:
    outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)
    action_ = tensor_spec.sample_spec_nest(
        self._action_spec, seed=seed, outer_dims=outer_dims)
  if time_step is not None:
    with tf.control_dependencies(tf.nest.flatten(time_step)):
      action_ = tf.nest.map_structure(tf.identity, action_)
  policy_info = tensor_spec.sample_spec_nest(self._info_spec)
  if self.emit_log_probability:
    if time_step.observation['mask'] is not None:
      log_probability = masked_categorical.log_prob(
          action_ - self.action_spec.minimum)
    else:
      # tf.nest.map_structure needs a callable, not a sampled float; under a
      # uniform random policy each action in [minimum, maximum] is equally
      # likely, so its probability is 1 / num_actions.
      def _uniform_probability(spec):
        num_actions = spec.maximum - spec.minimum + 1
        return tf.cast(1.0 / num_actions, tf.float32)

      action_probability = tf.nest.map_structure(_uniform_probability,
                                                 self._action_spec)
      log_probability = tf.nest.map_structure(tf.math.log, action_probability)
    policy_info = policy_step.set_log_probability(policy_info, log_probability)
  step = policy_step.PolicyStep(action_, policy_state, policy_info)
  return step
def _action(self, time_step, policy_state, seed):
  observation_and_action_constraint_splitter = (
      self.observation_and_action_constraint_splitter)
  outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)
  if observation_and_action_constraint_splitter is not None:
    observation, mask = observation_and_action_constraint_splitter(
        time_step.observation)
    action_spec = tensor_spec.from_spec(self.action_spec)
    action_spec = cast(tensor_spec.BoundedTensorSpec, action_spec)
    zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
    masked_categorical = masked.MaskedCategorical(zero_logits, mask)
    action_ = tf.cast(masked_categorical.sample() + action_spec.minimum,
                      action_spec.dtype)
    # If the action spec says each action should be shaped (1,), add another
    # dimension so the final shape is (B, 1) rather than (B,).
    if action_spec.shape.rank == 1:
      action_ = tf.expand_dims(action_, axis=-1)
    policy_info = tensor_spec.sample_spec_nest(
        self._info_spec, outer_dims=outer_dims)
  else:
    observation = time_step.observation
    action_spec = cast(tensor_spec.BoundedTensorSpec, self.action_spec)
    if self._accepts_per_arm_features:
      max_num_arms = action_spec.maximum - action_spec.minimum + 1
      batch_size = tf.shape(time_step.step_type)[0]
      num_actions = observation.get(
          bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY,
          tf.ones(shape=(batch_size,), dtype=tf.int32) * max_num_arms)
      mask = tf.sequence_mask(num_actions, max_num_arms)
      zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
      masked_categorical = masked.MaskedCategorical(zero_logits, mask)
      action_ = tf.nest.map_structure(
          lambda t: tf.cast(masked_categorical.sample() + t.minimum, t.dtype),
          action_spec)
    else:
      action_ = tensor_spec.sample_spec_nest(
          self._action_spec, seed=seed, outer_dims=outer_dims)
    policy_info = tensor_spec.sample_spec_nest(
        self._info_spec, outer_dims=outer_dims)

  # Update policy info with chosen arm features.
  if self._accepts_per_arm_features:

    def _gather_fn(t):
      return tf.gather(params=t, indices=action_, batch_dims=1)

    chosen_arm_features = tf.nest.map_structure(
        _gather_fn, observation[bandit_spec_utils.PER_ARM_FEATURE_KEY])
    if policy_utilities.has_chosen_arm_features(self._info_spec):
      policy_info = policy_info._replace(
          chosen_arm_features=chosen_arm_features)

  # TODO(b/78181147): Investigate why this control dependency is required.
  if time_step is not None:
    with tf.control_dependencies(tf.nest.flatten(time_step)):
      action_ = tf.nest.map_structure(tf.identity, action_)

  if self.emit_log_probability:
    if (self._accepts_per_arm_features or
        observation_and_action_constraint_splitter is not None):
      action_spec = cast(tensor_spec.BoundedTensorSpec, self.action_spec)
      log_probability = masked_categorical.log_prob(action_ -
                                                    action_spec.minimum)
    else:
      log_probability = tf.nest.map_structure(
          lambda s: _calculate_log_probability(outer_dims, s),
          self._action_spec)
    policy_info = policy_step.set_log_probability(policy_info, log_probability)

  step = policy_step.PolicyStep(action_, policy_state, policy_info)
  return step
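A minimal sketch of the per-arm mask construction above: tf.sequence_mask turns a per-row arm count into a boolean validity mask over max_num_arms slots. The counts here are made up.

import tensorflow as tf

max_num_arms = 5
num_actions = tf.constant([3, 5, 1])  # arms actually present per batch row
mask = tf.sequence_mask(num_actions, max_num_arms)
# -> [[ True  True  True False False]
#     [ True  True  True  True  True]
#     [ True False False False False]]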