def testProcessExperienceGlobalFeatures(self):
  observation_spec = {
      'f1': tf.TensorSpec(shape=(5,), dtype=tf.string),
      'f2': tf.TensorSpec(shape=(5, 2), dtype=tf.int32)
  }
  time_step_spec = time_step.time_step_spec(observation_spec)
  training_data_spec = trajectory.Trajectory(
      step_type=time_step_spec.step_type,
      observation=time_step_spec.observation,
      action=tensor_spec.BoundedTensorSpec(
          shape=(), minimum=0, maximum=4, dtype=tf.int32),
      policy_info=(),
      next_step_type=time_step_spec.step_type,
      reward=tensor_spec.BoundedTensorSpec(
          shape=(), minimum=0, maximum=2, dtype=tf.float32),
      discount=time_step_spec.discount)

  experience = tensor_spec.sample_spec_nest(
      training_data_spec, outer_dims=(7, 2))
  observation, action, reward = utils.process_experience_for_neural_agents(
      experience, False, training_data_spec)

  self.assertAllEqual(observation['f1'][0],
                      experience.observation['f1'][0, 0])
  self.assertEqual(action[0], experience.action[0, 0])
  self.assertEqual(reward[0], experience.reward[0, 0])
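# A minimal sketch (not part of the test) of the batch/time flattening that
# `process_experience_for_neural_agents` performs on global features: the
# [batch, time, ...] trajectory dimensions are collapsed into one outer
# dimension, which is why element [0] of each output matches element [0, 0]
# of the input above. The explicit `tf.reshape` is an illustrative
# assumption about that flattening, not the library implementation.
import tensorflow as tf

nested = tf.reshape(tf.range(7 * 2 * 5), (7, 2, 5))  # [batch=7, time=2, f]
flattened = tf.reshape(nested, (7 * 2, 5))           # [batch * time, f]
assert bool(tf.reduce_all(flattened[0] == nested[0, 0]))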
def _train(self, experience, weights):
  (observations, actions,
   rewards) = bandit_utils.process_experience_for_neural_agents(
       experience, self._observation_and_action_constraint_splitter,
       self._accepts_per_arm_features, self.training_data_spec)
  with tf.GradientTape() as tape:
    loss_info = self.loss(
        observations, actions, rewards, weights=weights, training=True)

  variables_to_train = self._variables_to_train()
  if not variables_to_train:
    logging.info('No variable to train in the agent.')
    return loss_info

  grads = tape.gradient(loss_info.loss, variables_to_train)
  # Tuple is used for py3, where zip is a generator producing values once.
  grads_and_vars = tuple(zip(grads, variables_to_train))
  if self._gradient_clipping is not None:
    grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                     self._gradient_clipping)

  if self._summarize_grads_and_vars:
    eager_utils.add_variables_summaries(grads_and_vars,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)

  self._optimizer.apply_gradients(grads_and_vars)
  self.train_step_counter.assign_add(1)

  return loss_info
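# Why `tuple(zip(...))` above: in Python 3 `zip` returns a one-shot
# iterator, while `grads_and_vars` may be traversed several times (gradient
# clipping, summaries, `apply_gradients`). A minimal standalone sketch of
# the failure mode the tuple conversion avoids:
pairs = zip([1.0, 2.0], ['var_a', 'var_b'])
first_pass = list(pairs)   # [(1.0, 'var_a'), (2.0, 'var_b')]
second_pass = list(pairs)  # [] -- the iterator is already exhausted
assert first_pass and not second_pass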
def _loss(self,
          experience: types.NestedTensor,
          weights: Optional[types.Float] = None,
          training: bool = False) -> tf_agent.LossInfo:
  """Computes loss for training the reward and constraint networks.

  Args:
    experience: A batch of experience data in the form of a `Trajectory` or
      `Transition`.
    weights: Optional scalar or elementwise (per-batch-entry) importance
      weights. The output batch loss will be scaled by these weights, and
      the final scalar loss is the mean of these values.
    training: Whether the loss is being used for training.

  Returns:
    loss: A `LossInfo` containing the loss for the training step.

  Raises:
    ValueError: if the number of actions is greater than 1.
  """
  (observations, actions,
   rewards) = bandit_utils.process_experience_for_neural_agents(
       experience, self._accepts_per_arm_features, self.training_data_spec)
  if self._observation_and_action_constraint_splitter is not None:
    observations, _ = self._observation_and_action_constraint_splitter(
        observations)
  if self._constraints:
    rewards_tensor = rewards[bandit_spec_utils.REWARD_SPEC_KEY]
  else:
    rewards_tensor = rewards
  reward_loss = self.reward_loss(observations, actions, rewards_tensor,
                                 weights, training)

  constraint_loss = tf.constant(0.0)
  for i, c in enumerate(self._constraints):
    if self._time_step_spec.reward[
        bandit_spec_utils.CONSTRAINTS_SPEC_KEY].shape.rank > 1:
      constraint_targets = rewards[
          bandit_spec_utils.CONSTRAINTS_SPEC_KEY][:, i]
    else:
      constraint_targets = rewards[bandit_spec_utils.CONSTRAINTS_SPEC_KEY]
    constraint_loss += c.compute_loss(observations, actions,
                                      constraint_targets, weights, training)

  self.compute_summaries(
      reward_loss,
      constraint_loss=(constraint_loss if self._constraints else None))
  total_loss = reward_loss
  if self._constraints:
    total_loss += constraint_loss
  return tf_agent.LossInfo(total_loss, extra=())
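# A minimal sketch (with made-up values) of the constraint-target selection
# above: when the constraint part of the reward spec is rank-2
# ([batch_size, num_constraints]), constraint `i` trains on column `i`;
# when it is rank-1, all constraints share the same targets.
import tensorflow as tf

constraint_rewards = tf.constant([[0.1, 0.9],
                                  [0.4, 0.6]])  # [batch=2, num_constraints=2]
for i in range(2):
  constraint_targets = constraint_rewards[:, i]  # column i, shape [batch]
  assert constraint_targets.shape == (2,)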
def _train(self, experience, weights=None):
  """Updates the policy based on the data in `experience`.

  Note that `experience` should only contain data points that this agent has
  not previously seen. If `experience` comes from a replay buffer, this
  buffer should be cleared between each call to `train`.

  Args:
    experience: A batch of experience data in the form of a `Trajectory`.
    weights: (optional) sample weights.

  Returns:
    A `LossInfo` containing the loss *before* the training step is taken.
    In most cases, if `weights` is provided, the entries of this tuple will
    have been calculated with the weights. Note that each Agent chooses its
    own method of applying weights.
  """
  experience = self._as_trajectory(experience)

  (observation, action,
   reward) = bandit_utils.process_experience_for_neural_agents(
       experience, self._accepts_per_arm_features, self.training_data_spec)
  if self._observation_and_action_constraint_splitter is not None:
    observation, _ = self._observation_and_action_constraint_splitter(
        observation)
  reward = tf.cast(reward, self._dtype)

  if tf.distribute.has_strategy():
    if self._distributed_train_encoding_network:
      loss_info = self.compute_loss_using_reward_layer(
          observation, action, reward, weights, training=True)
    else:
      loss_info = self.compute_loss_using_linucb_distributed(
          observation, action, reward, weights, training=True)
    return loss_info

  tf.compat.v1.assign(
      self.actions_from_reward_layer,
      tf.less(self._train_step_counter,
              self._encoding_network_num_train_steps))

  def use_actions_from_reward_layer():
    return self.compute_loss_using_reward_layer(
        observation, action, reward, weights, training=True)

  def no_actions_from_reward_layer():
    return self.compute_loss_using_linucb(
        observation, action, reward, weights, training=True)

  loss_info = tf.cond(self.actions_from_reward_layer,
                      use_actions_from_reward_layer,
                      no_actions_from_reward_layer)
  return loss_info
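# A minimal sketch (hypothetical step counts) of the switching logic above:
# while the train step counter is below `_encoding_network_num_train_steps`
# the agent trains through the reward layer, and afterwards `tf.cond`
# routes training to the LinUCB update instead.
import tensorflow as tf

train_step_counter = tf.Variable(10, dtype=tf.int64)
encoding_network_num_train_steps = 100
actions_from_reward_layer = tf.less(train_step_counter,
                                    encoding_network_num_train_steps)
branch = tf.cond(actions_from_reward_layer,
                 lambda: tf.constant('reward_layer'),
                 lambda: tf.constant('linucb'))
assert branch.numpy() == b'reward_layer'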
def _loss(self,
          experience: types.NestedTensor,
          weights: Optional[types.Tensor] = None,
          training: bool = False) -> tf_agent.LossInfo:
  """Computes loss for training the objective networks.

  Args:
    experience: A batch of experience data in the form of a `Trajectory` or
      `Transition`.
    weights: Optional scalar or elementwise (per-batch-entry) importance
      weights. The output batch loss will be scaled by these weights, and
      the final scalar loss is the mean of these values.
    training: Whether the loss is being used for training.

  Returns:
    loss: A `LossInfo` containing the loss for the training step.

  Raises:
    ValueError:
      - If the number of actions is greater than 1.
      - If `objectives` is not rank-2.
      - If the number of columns in `objectives` does not equal
        `self._num_objectives`.
  """
  (observations, actions,
   objective_values) = bandit_utils.process_experience_for_neural_agents(
       experience, self._accepts_per_arm_features, self.training_data_spec)
  if self._observation_and_action_constraint_splitter is not None:
    observations, _ = self._observation_and_action_constraint_splitter(
        observations)
  if objective_values.shape.rank != 2:
    raise ValueError(
        'The objectives tensor should be rank-2 [batch_size, '
        'num_objectives], but found to be rank-{}'.format(
            objective_values.shape.rank))
  if objective_values.shape[1] != self._num_objectives:
    raise ValueError(
        'The number of objectives in the objective_values tensor: {} is '
        'different from the number of objective networks: {}.'.format(
            objective_values.shape[1], self._num_objectives))

  objective_losses = []
  for idx in range(self._num_objectives):
    single_objective_values = objective_values[:, idx]
    objective_losses.append(
        self._single_objective_loss(idx, observations, actions,
                                    single_objective_values, weights,
                                    training))

  self.compute_summaries(objective_losses)
  total_loss = tf.reduce_sum(objective_losses)
  return tf_agent.LossInfo(total_loss, extra=())
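# A minimal sketch (with made-up values) of the per-objective slicing and
# aggregation above: each objective network sees one column of the rank-2
# [batch_size, num_objectives] tensor, and the scalar per-objective losses
# are summed into the total loss. `tf.reduce_mean` stands in for the real
# `_single_objective_loss`.
import tensorflow as tf

objective_values = tf.constant([[1.0, 2.0, 3.0],
                                [4.0, 5.0, 6.0]])  # [batch=2, objectives=3]
assert objective_values.shape.rank == 2
objective_losses = [
    tf.reduce_mean(objective_values[:, idx]) for idx in range(3)]
total_loss = tf.reduce_sum(objective_losses)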
def _train(self, experience: types.NestedTensor,
           weights: types.Tensor) -> tf_agent.LossInfo:
  (observations, actions,
   objective_values) = bandit_utils.process_experience_for_neural_agents(
       experience, self._accepts_per_arm_features, self.training_data_spec)
  if self._observation_and_action_constraint_splitter is not None:
    observations, _ = self._observation_and_action_constraint_splitter(
        observations)
  if objective_values.shape.rank != 2:
    raise ValueError(
        'The objectives tensor should be rank-2 [batch_size, '
        'num_objectives], but found to be rank-{}'.format(
            objective_values.shape.rank))
  if objective_values.shape[1] != self._num_objectives:
    raise ValueError(
        'The number of objectives in the objective_values tensor: {} is '
        'different from the number of objective networks: {}.'.format(
            objective_values.shape[1], self._num_objectives))

  with tf.GradientTape() as tape:
    loss_info = self.loss(
        observations, actions, objective_values, weights=weights,
        training=True)

  variables_to_train = self._variables_to_train()
  if not variables_to_train:
    logging.info('No variable to train in the agent.')
    return loss_info

  grads = tape.gradient(loss_info.loss, variables_to_train)
  # Tuple is used for py3, where zip is a generator producing values once.
  grads_and_vars = tuple(zip(grads, variables_to_train))
  if self._gradient_clipping is not None:
    grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                     self._gradient_clipping)

  if self._summarize_grads_and_vars:
    eager_utils.add_variables_summaries(grads_and_vars,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)

  self._optimizer.apply_gradients(grads_and_vars)
  self.train_step_counter.assign_add(1)

  return loss_info
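# A minimal sketch of per-gradient norm clipping in the spirit of the
# `eager_utils.clip_gradient_norms` call above, approximated here with
# `tf.clip_by_norm` (an illustrative stand-in, not the library internals):
# each gradient whose norm exceeds the threshold is rescaled to that norm.
import tensorflow as tf

grads = [tf.constant([3.0, 4.0])]                  # norm 5.0
clipped = [tf.clip_by_norm(g, 2.0) for g in grads]
assert abs(float(tf.norm(clipped[0])) - 2.0) < 1e-5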
def testProcessExperiencePerArmFeaturesWithMask(self):
  mask_spec = tensor_spec.BoundedTensorSpec(
      shape=(5,), minimum=0, maximum=1, dtype=tf.int32)
  observation_spec = ({
      'global': tf.TensorSpec(shape=(4,), dtype=tf.float32),
      'per_arm': {
          'f1': tf.TensorSpec(shape=(5,), dtype=tf.string),
          'f2': tf.TensorSpec(shape=(5, 2), dtype=tf.int32)
      }
  }, mask_spec)
  time_step_spec = time_step.time_step_spec(observation_spec)
  policy_info_spec = policy_utilities.PerArmPolicyInfo(
      chosen_arm_features={
          'f1': tf.TensorSpec(shape=(), dtype=tf.string),
          'f2': tf.TensorSpec(shape=(2,), dtype=tf.int32)
      })
  training_data_spec = trajectory.Trajectory(
      step_type=time_step_spec.step_type,
      observation=time_step_spec.observation,
      action=tensor_spec.BoundedTensorSpec(
          shape=(), minimum=0, maximum=4, dtype=tf.int32),
      policy_info=policy_info_spec,
      next_step_type=time_step_spec.step_type,
      reward=tensor_spec.BoundedTensorSpec(
          shape=(), minimum=0, maximum=2, dtype=tf.float32),
      discount=time_step_spec.discount)

  experience = tensor_spec.sample_spec_nest(
      training_data_spec, outer_dims=(7, 2))
  observation, action, reward = utils.process_experience_for_neural_agents(
      experience, lambda x: (x[0], x[1]), True, training_data_spec)

  self.assertEqual(observation['per_arm']['f1'][0],
                   experience.policy_info.chosen_arm_features['f1'][0, 0])
  self.assertAllEqual(action, tf.zeros(14, dtype=tf.int32))
  self.assertEqual(reward[0], experience.reward[0, 0])
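# A minimal sketch of why the test expects all-zero actions: for per-arm
# agents the chosen arm's features already arrive via
# `policy_info.chosen_arm_features`, so the processed actions carry no
# extra information and come back as zeros, one per flattened batch entry
# (7 * 2 = 14 with outer_dims=(7, 2)). The zero-filling shown here is an
# inference from the assertion above, not the library implementation.
import tensorflow as tf

batch, time = 7, 2
processed_actions = tf.zeros(batch * time, dtype=tf.int32)
assert processed_actions.shape == (14,)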