def _train(self, experience, weights):
  with tf.GradientTape() as tape:
    loss_info = self._loss(
        experience,
        td_errors_loss_fn=self._td_errors_loss_fn,
        gamma=self._gamma,
        reward_scale_factor=self._reward_scale_factor,
        weights=weights)
  tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
  variables_to_train = self._q_network.trainable_weights
  non_trainable_weights = self._q_network.non_trainable_weights
  assert list(variables_to_train), "No variables in the agent's q_network."
  grads = tape.gradient(loss_info.loss, variables_to_train)
  # Materialize zip (a one-shot iterator in Python 3) so the pairs can be
  # reused by the clipping and summary helpers below.
  grads_and_vars = list(zip(grads, variables_to_train))
  if self._gradient_clipping is not None:
    grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                     self._gradient_clipping)

  if self._summarize_grads_and_vars:
    grads_and_vars_with_non_trainable = (
        grads_and_vars + [(None, v) for v in non_trainable_weights])
    eager_utils.add_variables_summaries(grads_and_vars_with_non_trainable,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)
  training.apply_gradients(
      self._optimizer, grads_and_vars, global_step=self.train_step_counter)

  self._update_target()

  return loss_info
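# A minimal usage sketch (not from the original source): a training step like
# `_train` above is normally driven through the public `agent.train(experience)`
# entry point, with experience sampled from a replay buffer.  `agent`,
# `replay_buffer`, `num_iterations`, and the batch/step sizes below are
# hypothetical stand-ins for objects created elsewhere.
dataset = replay_buffer.as_dataset(
    sample_batch_size=64, num_steps=2, num_parallel_calls=3).prefetch(3)
iterator = iter(dataset)
for _ in range(num_iterations):
  experience, _ = next(iterator)
  loss_info = agent.train(experience)  # Dispatches to `_train` internally.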
def train_single_net(self, net, individual_iql_time_step,
                     individual_iql_next_time_step, time_steps, actions,
                     next_time_steps, i, t):
  variables_to_train = net.agent._q_network.trainable_weights
  assert list(
      variables_to_train), "No variables in the agent's QMIX network."
  with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(variables_to_train)
    loss_info = self._loss(
        net,
        individual_iql_time_step,
        individual_iql_next_time_step,
        time_steps,
        actions,
        next_time_steps,
        i,
        t,
        td_errors_loss_fn=net.agent._td_errors_loss_fn,
        gamma=net.agent._gamma,
        training=True)
  tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')

  grads = tape.gradient(loss_info.loss, variables_to_train)
  grads_and_vars = list(zip(grads, variables_to_train))
  training_lib.apply_gradients(
      net.agent._optimizer, grads_and_vars,
      global_step=net.agent.train_step_counter)

  net.agent._update_target()
  return loss_info
def _train(self, experience, weights):
  rewards, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.reward, self._time_step_spec.reward)
  actions, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.action, self._action_spec)
  observations, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.observation, self.training_data_spec.observation)
  if self._observation_and_action_constraint_splitter is not None:
    observations, _ = self._observation_and_action_constraint_splitter(
        observations)
  if self._accepts_per_arm_features:
    # The arm observation we train on needs to be copied from the respective
    # policy info field to the per-arm observation field. Pretending there was
    # only one action, we fill the action field with zeros.
    chosen_action, _ = nest_utils.flatten_multi_batched_nested_tensors(
        experience.policy_info.chosen_arm_features,
        self.policy.info_spec.chosen_arm_features)
    observations[bandit_spec_utils.PER_ARM_FEATURE_KEY] = tf.expand_dims(
        chosen_action, axis=1)
    actions = tf.zeros_like(actions)

  with tf.GradientTape() as tape:
    loss_info = self.loss(
        observations, actions, rewards, weights=weights, training=True)

  self.compute_summaries(loss_info.loss)
  variables_to_train = self._reward_network.trainable_weights
  if not variables_to_train:
    logging.info('No variable to train in the agent.')
    return loss_info

  grads = tape.gradient(loss_info.loss, variables_to_train)
  # Tuple is used for py3, where zip is a generator producing values once.
  grads_and_vars = tuple(zip(grads, variables_to_train))
  if self._gradient_clipping is not None:
    grads_and_vars = eager_utils.clip_gradient_norms(
        grads_and_vars, self._gradient_clipping)

  if self._summarize_grads_and_vars:
    eager_utils.add_variables_summaries(grads_and_vars,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)

  training_lib.apply_gradients(
      self._optimizer, grads_and_vars, global_step=self.train_step_counter)

  return loss_info
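# Hypothetical illustration (not from the original source) of the flattening
# step used above: `nest_utils.flatten_multi_batched_nested_tensors` collapses
# the outer [batch, time] dimensions of a trajectory field into a single batch
# dimension matching the given spec, so the loss treats every (batch, time)
# entry as an independent training example.
import tensorflow as tf
from tf_agents.utils import nest_utils

reward_spec = tf.TensorSpec(shape=(), dtype=tf.float32, name='reward')
rewards = tf.ones([4, 2])  # [batch_size=4, num_time_steps=2]
flat_rewards, _ = nest_utils.flatten_multi_batched_nested_tensors(
    rewards, reward_spec)
# flat_rewards now has shape [8] == [4 * 2].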
def compute_loss_using_reward_layer(self,
                                    observation,
                                    action,
                                    reward,
                                    weights,
                                    training=False):
  """Computes loss using the reward layer.

  Args:
    observation: A batch of observations.
    action: A batch of actions.
    reward: A batch of rewards.
    weights: Optional scalar or elementwise (per-batch-entry) importance
      weights. The output batch loss will be scaled by these weights, and the
      final scalar loss is the mean of these values.
    training: Whether the loss is being used for training.

  Returns:
    loss: A `LossInfo` containing the loss for the training step.
  """
  # Update the neural network params.
  with tf.GradientTape() as tape:
    loss_info = self.loss(observation, action, reward, weights,
                          training=training)
  tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
  if self._summarize_grads_and_vars:
    self.compute_summaries(loss_info.loss)
  variables_to_train = (self._encoding_network.trainable_weights +
                        self._reward_layer.trainable_weights)
  if not variables_to_train:
    raise ValueError('No variable to train in the agent.')

  grads = tape.gradient(loss_info.loss, variables_to_train)
  grads_and_vars = tuple(zip(grads, variables_to_train))
  if self._gradient_clipping is not None:
    grads_and_vars = eager_utils.clip_gradient_norms(
        grads_and_vars, self._gradient_clipping)

  if self._summarize_grads_and_vars:
    with tf.name_scope('Reward_network/'):
      eager_utils.add_variables_summaries(grads_and_vars,
                                          self.train_step_counter)
      eager_utils.add_gradients_summaries(grads_and_vars,
                                          self.train_step_counter)

  training_lib.apply_gradients(
      self._optimizer, grads_and_vars, global_step=self.train_step_counter)
  return loss_info
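# Hypothetical illustration (not from the original source) of the `weights`
# semantics described in the docstring above: per-example losses are scaled by
# the importance weights and the final scalar loss is their mean.  How `loss`
# itself computes the per-example values is not shown here.
import tensorflow as tf

per_example_loss = tf.constant([1.0, 2.0, 4.0])
weights = tf.constant([1.0, 0.5, 0.0])
scalar_loss = tf.reduce_mean(per_example_loss * weights)  # (1.0 + 1.0 + 0.0) / 3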
def train(self, experience, agents, nameDict, networkDict):
  """QMIX: gets the Q values from the target and main networks of all agents."""
  time_steps, policy_steps, next_time_steps = (
      trajectory.experience_to_transitions(experience, squeeze_time_dim=True))

  # Collect the per-agent trainable variables plus the mixing network's
  # weights, then flatten the nested lists into a single list of variables.
  variables_to_train = getTrainableVariables(networkDict)
  variables_to_train.append(self.QMIXNet.trainable_weights)
  variables_to_train = tf.nest.flatten(variables_to_train)
  assert list(
      variables_to_train), "No variables in the agent's QMIX network."

  with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(variables_to_train)
    loss_info = self._loss(
        time_steps,
        policy_steps,
        next_time_steps,
        agents,
        nameDict,
        networkDict,
        td_errors_loss_fn=self._td_errors_loss_fn,
        gamma=self._gamma,
        training=True)
  tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')

  grads = tape.gradient(loss_info.loss, variables_to_train)
  grads_and_vars = list(zip(grads, variables_to_train))
  self.train_step_counter = training_lib.apply_gradients(
      self._optimizer, grads_and_vars, global_step=self.train_step_counter)

  self._update_target()
  return loss_info
def _train(self, experience, weights):
  rewards, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.reward, self._time_step_spec.reward)
  actions, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.action, self._action_spec)
  observations, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.observation, self._time_step_spec.observation)
  if self._observation_and_action_constraint_splitter is not None:
    observations, _ = self._observation_and_action_constraint_splitter(
        observations)

  with tf.GradientTape() as tape:
    loss_info = self.loss(
        observations, actions, rewards, weights=weights, training=True)

  tf.debugging.check_numerics(loss_info[0], 'Loss is inf or nan')
  self.compute_summaries(loss_info.loss)
  variables_to_train = self._reward_network.trainable_weights
  if not variables_to_train:
    logging.info('No variable to train in the agent.')
    return loss_info

  grads = tape.gradient(loss_info.loss, variables_to_train)
  # Tuple is used for py3, where zip is a generator producing values once.
  grads_and_vars = tuple(zip(grads, variables_to_train))
  if self._gradient_clipping is not None:
    grads_and_vars = eager_utils.clip_gradient_norms(
        grads_and_vars, self._gradient_clipping)

  if self._summarize_grads_and_vars:
    eager_utils.add_variables_summaries(grads_and_vars,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)

  training_lib.apply_gradients(
      self._optimizer, grads_and_vars, global_step=self.train_step_counter)

  return loss_info
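# A self-contained sketch (not from the original source) of the training-step
# pattern shared by all of the methods above, written with plain Keras objects:
# compute the loss under a GradientTape, check it for NaN/Inf, take gradients,
# optionally clip them, and apply them with the optimizer while advancing a
# step counter.  The per-tensor norm clipping roughly mirrors what
# `eager_utils.clip_gradient_norms` does for the (gradient, variable) pairs.
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
train_step_counter = tf.Variable(0, dtype=tf.int64)

x = tf.random.normal([32, 4])
y = tf.random.normal([32, 1])

with tf.GradientTape() as tape:
  loss = tf.reduce_mean(tf.square(model(x, training=True) - y))
tf.debugging.check_numerics(loss, 'Loss is inf or nan')

grads = tape.gradient(loss, model.trainable_weights)
grads = [tf.clip_by_norm(g, 1.0) for g in grads]  # Gradient clipping by norm.
optimizer.apply_gradients(zip(grads, model.trainable_weights))
train_step_counter.assign_add(1)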