def setUp(self):
  super(EpsilonGreedyPolicyTest, self).setUp()
  self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._num_actions = 3
  self._greedy_action = 1
  self._action_spec = tensor_spec.BoundedTensorSpec(
      (), tf.int32, 0, self._num_actions - 1)
  self._policy = fixed_policy.FixedPolicy(
      np.asarray(self._greedy_action, dtype=np.int32),
      self._time_step_spec, self._action_spec)
  self._bandit_policy_type = tf.constant([1, 1])
  self._bandit_policy_type_spec = (
      policy_util.create_bandit_policy_type_tensor_spec(shape=()))
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  self._time_step = ts.restart(observations, batch_size=2)
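# A minimal sketch of a test method that exercises this fixture, assuming
# `epsilon_greedy_policy` is imported from `tf_agents.policies`; the method
# name and the epsilon value are illustrative, not part of the original suite.
def testActionWithinBounds(self):
  # Wrap the fixed greedy policy from setUp in an epsilon-greedy policy.
  policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
      self._policy, epsilon=0.1)
  action_step = policy.action(self._time_step)
  # With probability 1 - epsilon the action equals self._greedy_action;
  # otherwise it is drawn uniformly from the action spec's range.
  actions = self.evaluate(action_step.action)
  self.assertAllInRange(actions, 0, self._num_actions - 1)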
def testSetBanditPolicyType(self):
  dims = (10, 1)
  bandit_policy_spec = (
      policy_utilities.create_bandit_policy_type_tensor_spec(dims))
  info = policy_utilities.set_bandit_policy_type(None, bandit_policy_spec)
  self.assertIsInstance(info, policy_utilities.PolicyInfo)
  self.assertIsInstance(info.bandit_policy_type,
                        tensor_spec.BoundedTensorSpec)
  self.assertEqual(info.bandit_policy_type.shape, dims)
  self.assertEqual(info.bandit_policy_type.dtype, tf.int32)
  # Overwrite the spec with a concrete tensor of policy-type values.
  input_tensor = tf.fill(dims, value=_GREEDY)
  info = policy_utilities.set_bandit_policy_type(info, input_tensor)
  self.assertIsInstance(info.bandit_policy_type, tf.Tensor)
  self.assertEqual(info.bandit_policy_type.shape, input_tensor.shape)
  expected = [[_GREEDY] for _ in range(dims[0])]
  self.assertAllEqual(info.bandit_policy_type, expected)
def __init__(self,
             time_step_spec=None,
             action_spec=None,
             reward_network=None,
             observation_and_action_constraint_splitter=None,
             emit_policy_info=(),
             name=None):
  """Builds a GreedyRewardPredictionPolicy given a reward tf_agents.Network.

  This policy takes a tf_agents.Network predicting rewards and generates the
  action corresponding to the largest predicted reward.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of `BoundedTensorSpec` representing the actions.
    reward_network: An instance of a `tf_agents.network.Network`, callable
      via `network(observation, step_type) -> (output, final_state)`.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the network and 2) the
      mask. The mask should be a 0-1 `Tensor` of shape
      `[batch_size, num_actions]`. This function should also work with a
      `TensorSpec` as input, and should output `TensorSpec` objects for the
      observation and mask.
    emit_policy_info: (tuple of strings) what side information we want to
      get as part of the policy info. Allowed values can be found in
      `policy_utilities.PolicyInfo`.
    name: The name of this policy. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    NotImplementedError: If `action_spec` contains more than one
      `BoundedTensorSpec` or the `BoundedTensorSpec` is not valid.
  """
  flat_action_spec = tf.nest.flatten(action_spec)
  if len(flat_action_spec) > 1:
    raise NotImplementedError(
        'action_spec can only contain a single BoundedTensorSpec.')

  action_spec = flat_action_spec[0]
  if (not tensor_spec.is_bounded(action_spec) or
      not tensor_spec.is_discrete(action_spec) or
      action_spec.shape.rank > 1 or
      action_spec.shape.num_elements() != 1):
    raise NotImplementedError(
        'action_spec must be a BoundedTensorSpec of type int32 and '
        'shape (). Found {}.'.format(action_spec))
  self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
  self._action_offset = action_spec.minimum
  reward_network.create_variables()
  self._reward_network = reward_network

  self._emit_policy_info = emit_policy_info
  predicted_rewards_mean = ()
  if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
    predicted_rewards_mean = tensor_spec.TensorSpec(
        [self._expected_num_actions])
  bandit_policy_type = ()
  if policy_utilities.InfoFields.BANDIT_POLICY_TYPE in emit_policy_info:
    bandit_policy_type = (
        policy_utilities.create_bandit_policy_type_tensor_spec(shape=[1]))
  info_spec = policy_utilities.PolicyInfo(
      predicted_rewards_mean=predicted_rewards_mean,
      bandit_policy_type=bandit_policy_type)

  super(GreedyRewardPredictionPolicy, self).__init__(
      time_step_spec,
      action_spec,
      policy_state_spec=reward_network.state_spec,
      clip=False,
      info_spec=info_spec,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter),
      name=name)
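# For illustration, a minimal usage sketch of the policy above. It assumes
# the TF-Agents imports shown here; the QNetwork merely stands in for any
# reward network mapping an observation to one predicted reward per action.
import tensorflow as tf

from tf_agents.bandits.policies import greedy_reward_prediction_policy
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

num_actions = 3
obs_spec = tensor_spec.TensorSpec([2], tf.float32)
time_step_spec = ts.time_step_spec(obs_spec)
action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, num_actions - 1)

# A QNetwork with `num_actions` outputs predicts one reward per action.
reward_net = q_network.QNetwork(
    input_tensor_spec=obs_spec,
    action_spec=action_spec,
    fc_layer_params=(16,))

policy = greedy_reward_prediction_policy.GreedyRewardPredictionPolicy(
    time_step_spec=time_step_spec,
    action_spec=action_spec,
    reward_network=reward_net)

# The policy picks the argmax of the predicted rewards per batch entry.
time_step = ts.restart(tf.constant([[1.0, 2.0], [3.0, 4.0]]), batch_size=2)
action_step = policy.action(time_step)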
def __init__(
    self,
    time_step_spec: Optional[ts.TimeStep],
    action_spec: Optional[NestedBoundedTensorSpec],
    scalarizer: multi_objective_scalarizer.Scalarizer,
    objective_networks: Sequence[Network],
    observation_and_action_constraint_splitter: types.Splitter = None,
    accepts_per_arm_features: bool = False,
    emit_policy_info: Tuple[Text, ...] = (),
    name: Optional[Text] = None):
  """Builds a GreedyMultiObjectiveNeuralPolicy based on multiple networks.

  This policy takes an iterable of `tf_agents.Network`, each responsible for
  predicting a specific objective, along with a `Scalarizer` object, and
  generates an action by maximizing the scalarized objective, i.e., the
  output of the `Scalarizer` applied to the multiple objectives predicted by
  the networks.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of `BoundedTensorSpec` representing the actions.
    scalarizer: A
      `tf_agents.bandits.multi_objective.multi_objective_scalarizer.Scalarizer`
      object that implements scalarization of multiple objectives into a
      single scalar reward.
    objective_networks: A sequence of `tf_agents.network.Network` objects to
      be used by the policy. Each network will be called with
      `call(observation, step_type)` and is expected to provide a prediction
      for a specific objective for all actions.
    observation_and_action_constraint_splitter: A function used for masking
      valid/invalid actions with each state of the environment. The function
      takes in a full observation and returns a tuple consisting of 1) the
      part of the observation intended as input to the network and 2) the
      mask. The mask should be a 0-1 `Tensor` of shape
      `[batch_size, num_actions]`. This function should also work with a
      `TensorSpec` as input, and should output `TensorSpec` objects for the
      observation and mask.
    accepts_per_arm_features: (bool) Whether the policy accepts per-arm
      features.
    emit_policy_info: (tuple of strings) what side information we want to
      get as part of the policy info. Allowed values can be found in
      `policy_utilities.PolicyInfo`.
    name: The name of this policy. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    NotImplementedError: If `action_spec` contains more than one
      `BoundedTensorSpec`, or if it is not a valid `BoundedTensorSpec` of
      type int32 and shape ().
    ValueError: If `objective_networks` has fewer than two networks.
    ValueError: If `accepts_per_arm_features` is true but `time_step_spec`
      is None.
  """
  flat_action_spec = tf.nest.flatten(action_spec)
  if len(flat_action_spec) > 1:
    raise NotImplementedError(
        'action_spec can only contain a single BoundedTensorSpec.')

  action_spec = flat_action_spec[0]
  if (not tensor_spec.is_bounded(action_spec) or
      not tensor_spec.is_discrete(action_spec) or
      action_spec.shape.rank > 1 or
      action_spec.shape.num_elements() != 1):
    raise NotImplementedError(
        'action_spec must be a BoundedTensorSpec of type int32 and '
        'shape (). Found {}.'.format(action_spec))
  self._expected_num_actions = action_spec.maximum - action_spec.minimum + 1
  self._action_offset = action_spec.minimum

  policy_state_spec = []
  for network in objective_networks:
    policy_state_spec.append(network.state_spec)
    network.create_variables()
  self._objective_networks = objective_networks
  self._scalarizer = scalarizer
  self._num_objectives = len(self._objective_networks)
  if self._num_objectives < 2:
    raise ValueError(
        'Number of objectives should be at least two, but found to be {}.'
        .format(self._num_objectives))

  self._emit_policy_info = emit_policy_info
  predicted_rewards_mean = ()
  if policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN in emit_policy_info:
    predicted_rewards_mean = tensor_spec.TensorSpec(
        [self._num_objectives, self._expected_num_actions])
  bandit_policy_type = ()
  if policy_utilities.InfoFields.BANDIT_POLICY_TYPE in emit_policy_info:
    bandit_policy_type = (
        policy_utilities.create_bandit_policy_type_tensor_spec(shape=[1]))
  if accepts_per_arm_features:
    if time_step_spec is None:
      raise ValueError(
          'time_step_spec should not be None for per-arm-features policies.')
    # The features of the chosen arm are saved to policy_info.
    chosen_arm_features_info = (
        policy_utilities.create_chosen_arm_features_info_spec(
            time_step_spec.observation,
            observation_and_action_constraint_splitter))
    info_spec = policy_utilities.PerArmPolicyInfo(
        predicted_rewards_mean=predicted_rewards_mean,
        bandit_policy_type=bandit_policy_type,
        chosen_arm_features=chosen_arm_features_info)
  else:
    info_spec = policy_utilities.PolicyInfo(
        predicted_rewards_mean=predicted_rewards_mean,
        bandit_policy_type=bandit_policy_type)
  self._accepts_per_arm_features = accepts_per_arm_features

  super(GreedyMultiObjectiveNeuralPolicy, self).__init__(
      time_step_spec,
      action_spec,
      policy_state_spec=policy_state_spec,
      clip=False,
      info_spec=info_spec,
      emit_log_probability='log_probability' in emit_policy_info,
      observation_and_action_constraint_splitter=(
          observation_and_action_constraint_splitter),
      name=name)
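# A minimal usage sketch of the multi-objective policy, assuming a
# `LinearScalarizer` from tf_agents.bandits.multi_objective and, again,
# QNetworks standing in for the per-objective networks; the weights and
# layer sizes here are illustrative.
import tensorflow as tf

from tf_agents.bandits.multi_objective import multi_objective_scalarizer
from tf_agents.bandits.policies import greedy_multi_objective_neural_policy
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

num_actions = 3
obs_spec = tensor_spec.TensorSpec([2], tf.float32)
time_step_spec = ts.time_step_spec(obs_spec)
action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, num_actions - 1)

# One network per objective; each predicts a per-action value for its
# objective (the policy requires at least two).
objective_networks = [
    q_network.QNetwork(obs_spec, action_spec, fc_layer_params=(16,))
    for _ in range(2)
]
# Combine the two predicted objectives into a single score per action.
scalarizer = multi_objective_scalarizer.LinearScalarizer(weights=[0.5, 0.5])

policy = greedy_multi_objective_neural_policy.GreedyMultiObjectiveNeuralPolicy(
    time_step_spec, action_spec, scalarizer, objective_networks)

# The chosen action maximizes the scalarized (here, weighted-sum) objectives.
time_step = ts.restart(tf.constant([[1.0, 2.0]]), batch_size=1)
action_step = policy.action(time_step)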