def _step(self):
    # Update target network.
    online_variables = (
        *self._observation_network.variables,
        *self._critic_network.variables,
        *self._policy_network.variables,
    )
    target_variables = (
        *self._target_observation_network.variables,
        *self._target_critic_network.variables,
        *self._target_policy_network.variables,
    )

    # Make online -> target network update ops.
    if tf.math.mod(self._num_steps, self._target_update_period) == 0:
        for src, dest in zip(online_variables, target_variables):
            dest.assign(src)
    self._num_steps.assign_add(1)

    # Get data from replay (dropping extras if any). Note there is no
    # extra data here because we do not insert any into Reverb.
    inputs = next(self._iterator)
    transitions: types.Transition = inputs.data

    # Cast the additional discount to match the environment discount dtype.
    discount = tf.cast(self._discount, dtype=transitions.discount.dtype)

    with tf.GradientTape(persistent=True) as tape:
        # Maybe transform the observation before feeding into policy and critic.
        # Transforming the observations this way at the start of the learning
        # step effectively means that the policy and critic share observation
        # network weights.
        o_tm1 = self._observation_network(transitions.observation)
        o_t = self._target_observation_network(transitions.next_observation)

        # This stop_gradient prevents gradients from propagating into the target
        # observation network. In addition, since the online policy network is
        # evaluated at o_t, this also means the policy loss does not influence
        # the observation network training.
        o_t = tree.map_structure(tf.stop_gradient, o_t)

        # Critic learning.
        q_tm1 = self._critic_network(o_tm1, transitions.action)
        q_t = self._target_critic_network(o_t, self._target_policy_network(o_t))

        # Squeeze into the shape expected by the td_learning implementation.
        q_tm1 = tf.squeeze(q_tm1, axis=-1)  # [B]
        q_t = tf.squeeze(q_t, axis=-1)  # [B]

        # Critic loss.
        critic_loss = trfl.td_learning(q_tm1, transitions.reward,
                                       discount * transitions.discount, q_t).loss
        critic_loss = tf.reduce_mean(critic_loss, axis=0)

        # Actor learning.
        dpg_a_t = self._policy_network(o_t)
        dpg_q_t = self._critic_network(o_t, dpg_a_t)

        # Actor loss. If clipping is true use dqda clipping and clip the norm.
        dqda_clipping = 1.0 if self._clipping else None
        policy_loss = losses.dpg(
            dpg_q_t,
            dpg_a_t,
            tape=tape,
            dqda_clipping=dqda_clipping,
            clip_norm=self._clipping)
        policy_loss = tf.reduce_mean(policy_loss, axis=0)

    # Get trainable variables.
    policy_variables = self._policy_network.trainable_variables
    critic_variables = (
        # In this agent, the critic loss trains the observation network.
        self._observation_network.trainable_variables +
        self._critic_network.trainable_variables)

    # Compute gradients.
    policy_gradients = tape.gradient(policy_loss, policy_variables)
    critic_gradients = tape.gradient(critic_loss, critic_variables)

    # Delete the tape manually because of the persistent=True flag.
    del tape

    # Maybe clip gradients.
    if self._clipping:
        policy_gradients = tf.clip_by_global_norm(policy_gradients, 40.)[0]
        critic_gradients = tf.clip_by_global_norm(critic_gradients, 40.)[0]

    # Apply gradients.
    self._policy_optimizer.apply(policy_gradients, policy_variables)
    self._critic_optimizer.apply(critic_gradients, critic_variables)

    # Losses to track.
    return {
        'critic_loss': critic_loss,
        'policy_loss': policy_loss,
    }
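
# For reference, a minimal sketch of the one-step TD loss that
# `trfl.td_learning` computes in `_step` above. This helper is illustrative
# only and is not part of the learner; `_td_loss_sketch` is a hypothetical
# name, trfl additionally returns the TD error in an `extra` namedtuple, and
# `tf` is assumed to be imported as elsewhere in this file.
def _td_loss_sketch(q_tm1: tf.Tensor, r_t: tf.Tensor, pcont_t: tf.Tensor,
                    q_t: tf.Tensor) -> tf.Tensor:
    """Returns 0.5 * (r_t + pcont_t * q_t - q_tm1)^2 with a fixed target."""
    target = tf.stop_gradient(r_t + pcont_t * q_t)  # Bootstrapped target.
    return 0.5 * tf.square(target - q_tm1)  # Per-transition loss, shape [B].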
def _forward(self, inputs: Any) -> None:
    """Trainer forward pass.

    Args:
        inputs (Any): input data from the data table (transitions)
    """
    # TODO: Update this forward function to work like MAD4PG
    data = inputs.data

    # Note (dries): The unused variable is start_of_episodes.
    observations, actions, rewards, discounts, _, extras = (
        data.observations,
        data.actions,
        data.rewards,
        data.discounts,
        data.start_of_episode,
        data.extras,
    )

    # Get initial state for the LSTM from replay and
    # extract the first state in the sequence.
    core_state = tree.map_structure(lambda s: s[:, 0, :], extras["core_states"])
    target_core_state = tree.map_structure(tf.identity, core_state)

    # TODO (dries): Take out all the data points that do not need
    # to be processed here at the start. That way it does not have
    # to be done later on, which saves processing time.

    self.policy_losses: Dict[str, tf.Tensor] = {}
    self.critic_losses: Dict[str, tf.Tensor] = {}

    # Do forward passes through the networks and calculate the losses.
    with tf.GradientTape(persistent=True) as tape:
        # Note (dries): We are assuming that only the policy network
        # is recurrent and not the observation network.
        obs_trans, target_obs_trans = self._transform_observations(observations)

        target_actions = self._target_policy_actions(target_obs_trans,
                                                     target_core_state)

        for agent in self._agents:
            agent_key = self.agent_net_keys[agent]

            # Get critic feed
            (
                obs_trans_feed,
                target_obs_trans_feed,
                action_feed,
                target_actions_feed,
            ) = self._get_critic_feed(
                obs_trans=obs_trans,
                target_obs_trans=target_obs_trans,
                actions=actions,
                target_actions=target_actions,
                extras=extras,
                agent=agent,
            )

            # Critic learning.
            # Remove the last sequence step for the normal network
            obs_comb, dims = train_utils.combine_dim(obs_trans_feed)
            act_comb, _ = train_utils.combine_dim(action_feed)
            q_values = self._critic_networks[agent_key](obs_comb, act_comb)
            q_values.set_dimensions(dims)

            # Remove first sequence step for the target
            obs_comb, _ = train_utils.combine_dim(target_obs_trans_feed)
            act_comb, _ = train_utils.combine_dim(target_actions_feed)
            target_q_values = self._target_critic_networks[agent_key](
                obs_comb, act_comb)
            target_q_values.set_dimensions(dims)

            # Cast the additional discount to match
            # the environment discount dtype.
            agent_discount = discounts[agent]
            discount = tf.cast(self._discount, dtype=agent_discount.dtype)

            # Critic loss.
            critic_loss = recurrent_n_step_critic_loss(
                q_values,
                target_q_values,
                rewards[agent],
                discount * agent_discount,
                bootstrap_n=self._bootstrap_n,
                loss_fn=losses.categorical,
            )
            self.critic_losses[agent] = tf.reduce_mean(critic_loss, axis=0)

            # Actor learning.
            obs_agent_feed = target_obs_trans[agent]
            # TODO (dries): Why is there an extra tuple?
            agent_core_state = core_state[agent][0]
            transposed_obs = tf2_utils.batch_to_sequence(obs_agent_feed)
            outputs, updated_states = snt.static_unroll(
                self._policy_networks[agent_key],
                transposed_obs,
                agent_core_state,
            )

            dpg_actions = tf2_utils.batch_to_sequence(outputs)

            # Note (dries): This is done so that losses.dpg can verify,
            # using the gradient tape, that there is a gradient
            # relationship between dpg_q_values and dpg_actions_comb.
            dpg_actions_comb, dim = train_utils.combine_dim(dpg_actions)

            # Note (dries): This seemingly useless line is important!
            # Don't remove it. See above note.
            dpg_actions = train_utils.extract_dim(dpg_actions_comb, dim)

            # Get dpg actions
            dpg_actions_feed = self._get_dpg_feed(target_actions, dpg_actions,
                                                  agent)

            # Get dpg Q values.
            obs_comb, _ = train_utils.combine_dim(target_obs_trans_feed)
            act_comb, _ = train_utils.combine_dim(dpg_actions_feed)
            dpg_z_values = self._critic_networks[agent_key](obs_comb, act_comb)
            dpg_q_values = dpg_z_values.mean()

            # Actor loss. If clipping is true use dqda clipping and clip the norm.
            dqda_clipping = 1.0 if self._max_gradient_norm is not None else None
            clip_norm = True if self._max_gradient_norm is not None else False

            policy_loss = losses.dpg(
                dpg_q_values,
                dpg_actions_comb,
                tape=tape,
                dqda_clipping=dqda_clipping,
                clip_norm=clip_norm,
            )
            self.policy_losses[agent] = tf.reduce_mean(policy_loss, axis=0)
    self.tape = tape
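
# A minimal sketch of the behaviour assumed of `train_utils.combine_dim` and
# `train_utils.extract_dim` used in `_forward` above: merge and restore the
# leading [batch, sequence] dimensions so sequence data can be pushed through
# feedforward critic networks. The helper names below are hypothetical, the
# real Mava utilities may differ in detail, and static shapes plus the usual
# `tf` import are assumed.
def _combine_dim_sketch(tensor: tf.Tensor):
    """Flattens [B, T, ...] -> [B * T, ...] and returns the leading dims."""
    dims = tensor.shape[:2]
    return tf.reshape(tensor, [-1, *tensor.shape[2:]]), dims


def _extract_dim_sketch(tensor: tf.Tensor, dims) -> tf.Tensor:
    """Restores the leading [B, T] dims flattened by `_combine_dim_sketch`."""
    return tf.reshape(tensor, [dims[0], dims[1], *tensor.shape[1:]])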
def _step(self):
    # Update target network.
    online_variables = (
        *self._observation_network.variables,
        *self._critic_network.variables,
        *self._policy_network.variables,
    )
    target_variables = (
        *self._target_observation_network.variables,
        *self._target_critic_network.variables,
        *self._target_policy_network.variables,
    )

    # Make online -> target network update ops.
    if (self._target_update_period > 0 and
            tf.math.mod(self._num_steps, self._target_update_period) == 0):
        for src, dest in zip(online_variables, target_variables):
            dest.assign(src)
    self._num_steps.assign_add(1)

    # Get data from replay, including the extras (behaviour log-probabilities
    # and behaviour policy outputs) inserted into Reverb by the actor.
    inputs = next(self._iterator)
    o_tm1, a_tm1, r_t, d_t, o_t, extra = inputs.data
    behavior_logP_tm1 = extra['logP']
    behavior_tm1 = extra['policy']

    # Cast the additional discount to match the environment discount dtype.
    discount = tf.cast(self._discount, dtype=d_t.dtype)

    with tf.GradientTape(persistent=True) as tape:
        # Maybe transform the observation before feeding into policy and critic.
        # Transforming the observations this way at the start of the learning
        # step effectively means that the policy and critic share observation
        # network weights.
        o_tm1 = self._observation_network(o_tm1)
        o_t = self._target_observation_network(o_t)
        o_t = tree.map_structure(tf.stop_gradient, o_t)

        # Policy
        pol_tm1, v_tm1 = self._policy_network(o_tm1)
        pol_t, v_t = self._target_policy_network(o_t)
        pol_t = tree.map_structure(tf.stop_gradient, pol_t)
        v_t = tree.map_structure(tf.stop_gradient, v_t)

        # TODO: two critic nets, e.g. q1_tm1 and q2_tm1, pick the min as target

        # DPG loss. If clipping is true use dqda clipping and clip the norm.
        dqda_clipping = 1.0 if self._clipping else None

        onpol_a_tm1, onpol_logP_tm1 = self._sampling_head(pol_tm1)
        onpol_q_tm1 = self._critic_network(o_tm1, onpol_a_tm1)
        onpol_q_tm1 = tf.squeeze(onpol_q_tm1, axis=-1)  # [B]

        logP_tm1 = self._sampling_head.log_prob(a_tm1, pol_tm1)
        ReFER_params_loss = self._ReFER.loss(behavior_logP_tm1, logP_tm1)

        dpg_loss = losses.dpg(
            onpol_q_tm1,
            onpol_a_tm1,
            tape=tape,
            dqda_clipping=dqda_clipping,
            clip_norm=self._clipping)
        dpg_loss = tf.reduce_mean(dpg_loss, axis=0)

        entropy_loss = self._entropy_coeff * tf.reduce_mean(onpol_logP_tm1, axis=0)

        KL_coef = self._ReFER.DKL_coef()
        # behavior_P_tm1 = tf.math.exp(behavior_logP_tm1)
        # KL_loss = KL_coef * behavior_P_tm1 * (behavior_logP_tm1 - logP_tm1)
        KL_loss = tf.reduce_sum((behavior_tm1 - pol_tm1)**2, axis=-1)
        KL_loss = KL_coef * tf.reduce_mean(KL_loss, axis=0)

        # V(s) loss
        value_target = tf.stop_gradient(
            onpol_q_tm1 - self._entropy_coeff * onpol_logP_tm1)
        value_loss = losses.huber(value_target - v_tm1, 1.0)
        # value_loss = 0.5 * (value_target - v_tm1) ** 2
        value_loss = tf.reduce_mean(value_loss, axis=0)

        # Critic learning with TD loss
        q_tm1 = self._critic_network(o_tm1, a_tm1)
        q_tm1 = tf.squeeze(q_tm1, axis=-1)  # [B]

        onpol_a_t, logP_t = self._sampling_head(pol_t)
        onpol_q_t = self._target_critic_network(o_t, onpol_a_t)
        onpol_q_t = tf.squeeze(onpol_q_t, axis=-1)  # [B]
        onpol_q_t = tree.map_structure(tf.stop_gradient, onpol_q_t)

        R_t = self._observation_network.scale_rewards(r_t)
        critic_target = tf.stop_gradient(R_t + d_t * tf.minimum(v_t, onpol_q_t))
        # critic_target = tf.stop_gradient(R_t + d_t * 0.5 * (v_t + onpol_q_t))
        critic_loss = losses.huber(critic_target - q_tm1, 1.0)
        # critic_loss = 0.5 * (critic_target - q_tm1) ** 2
        critic_loss = tf.reduce_mean(critic_loss, axis=0)

        encoder_loss = self._observation_network.compute_loss(o_tm1, r_t)

        policy_loss = value_loss + entropy_loss + dpg_loss + encoder_loss + KL_loss

    # Compute gradients.
    policy_gradients = tape.gradient(policy_loss, self._policy_variables)
    critic_gradients = tape.gradient(critic_loss, self._critic_variables)
    ReFER_gradient = tape.gradient(ReFER_params_loss,
                                   self._ReFER.trainable_variables)

    # Delete the tape manually because of the persistent=True flag.
    del tape

    # Maybe clip gradients.
    if self._clipping:
        policy_gradients = tf.clip_by_global_norm(policy_gradients, 40.)[0]
        critic_gradients = tf.clip_by_global_norm(critic_gradients, 40.)[0]

    # Apply gradients.
    self._policy_optimizer.apply(policy_gradients, self._policy_variables)
    self._critic_optimizer.apply(critic_gradients, self._critic_variables)
    self._ReFER_optimizer.apply(ReFER_gradient, self._ReFER.trainable_variables)

    # Losses to track.
    return {
        'critic_loss': critic_loss,
        'svalue_loss': value_loss,
        'entropy_loss': entropy_loss,
        'dpg_loss': dpg_loss,
        'avg_q': tf.reduce_mean(onpol_q_t, axis=0),
        'KL_loss': KL_loss,
        # 'frac_off_pol': self._ReFER._last_frac_off_pol,
        'beta': self._ReFER._beta,
        'r_mean': self._observation_network._ret_mean,
        'r_scale': self._observation_network._ret_scale,
    }
def _forward(self, inputs: Any) -> None:
    """Trainer forward pass.

    Args:
        inputs (Any): input data from the data table (transitions)
    """
    # Unpack input data as follows:
    # o_tm1 = dictionary of observations, one for each agent
    # a_tm1 = dictionary of actions taken from obs in o_tm1
    # e_tm1 [Optional] = extra data for timestep t-1
    #   that the agents persist in replay.
    # r_t = dictionary of rewards or reward sequences
    #   (if using N step transitions) ensuing from actions a_tm1
    # d_t = environment discount ensuing from actions a_tm1.
    #   This discount is applied to future rewards after r_t.
    # o_t = dictionary of next observations or next observation sequences
    # e_t [Optional] = extra data for timestep t that the agents persist in replay.
    o_tm1, a_tm1, e_tm1, r_t, d_t, o_t, e_t = inputs.data

    # Do forward passes through the networks and calculate the losses.
    self.policy_losses = {}
    self.critic_losses = {}
    with tf.GradientTape(persistent=True) as tape:
        o_tm1_trans, o_t_trans = self._transform_observations(o_tm1, o_t)
        a_t = self._target_policy_actions(o_t_trans)

        for agent in self._agents:
            agent_key = self.agent_net_keys[agent]

            # Get critic feed
            o_tm1_feed, o_t_feed, a_tm1_feed, a_t_feed = self._get_critic_feed(
                o_tm1_trans=o_tm1_trans,
                o_t_trans=o_t_trans,
                a_tm1=a_tm1,
                a_t=a_t,
                e_tm1=e_tm1,
                e_t=e_t,
                agent=agent,
            )

            # Critic learning.
            q_tm1 = self._critic_networks[agent_key](o_tm1_feed, a_tm1_feed)
            q_t = self._target_critic_networks[agent_key](o_t_feed, a_t_feed)

            # Cast the additional discount to match the environment discount dtype.
            discount = tf.cast(self._discount, dtype=d_t[agent].dtype)

            # Critic loss.
            critic_loss = losses.categorical(q_tm1, r_t[agent],
                                             discount * d_t[agent], q_t)
            self.critic_losses[agent] = tf.reduce_mean(critic_loss, axis=0)

            # Actor learning.
            o_t_agent_feed = o_t_trans[agent]
            dpg_a_t = self._policy_networks[agent_key](o_t_agent_feed)

            # Get dpg actions
            dpg_a_t_feed = self._get_dpg_feed(a_t, dpg_a_t, agent)

            # Get dpg Q values.
            dpg_z_t = self._critic_networks[agent_key](o_t_feed, dpg_a_t_feed)
            dpg_q_t = dpg_z_t.mean()

            # Actor loss. If clipping is true use dqda clipping and clip the norm.
            dqda_clipping = 1.0 if self._max_gradient_norm is not None else None
            clip_norm = True if self._max_gradient_norm is not None else False

            policy_loss = losses.dpg(
                dpg_q_t,
                dpg_a_t,
                tape=tape,
                dqda_clipping=dqda_clipping,
                clip_norm=clip_norm,
            )
            self.policy_losses[agent] = tf.reduce_mean(policy_loss, axis=0)
    self.tape = tape
def _step(self, sample) -> Dict[str, tf.Tensor]:
    transitions: types.Transition = sample.data  # Assuming ReverbSample.

    # Cast the additional discount to match the environment discount dtype.
    discount = tf.cast(self._discount, dtype=transitions.discount.dtype)

    with tf.GradientTape(persistent=True) as tape:
        # Maybe transform the observation before feeding into policy and critic.
        # Transforming the observations this way at the start of the learning
        # step effectively means that the policy and critic share observation
        # network weights.
        o_tm1 = self._observation_network(transitions.observation)
        o_t = self._target_observation_network(transitions.next_observation)

        # This stop_gradient prevents gradients from propagating into the target
        # observation network. In addition, since the online policy network is
        # evaluated at o_t, this also means the policy loss does not influence
        # the observation network training.
        o_t = tree.map_structure(tf.stop_gradient, o_t)

        # Critic learning.
        q_tm1 = self._critic_network(o_tm1, transitions.action)
        q_t = self._target_critic_network(o_t, self._target_policy_network(o_t))

        # Critic loss.
        critic_loss = losses.categorical(q_tm1, transitions.reward,
                                         discount * transitions.discount, q_t)
        critic_loss = tf.reduce_mean(critic_loss, axis=[0])

        # Actor learning.
        dpg_a_t = self._policy_network(o_t)
        dpg_z_t = self._critic_network(o_t, dpg_a_t)
        dpg_q_t = dpg_z_t.mean()

        # Actor loss. If clipping is true use dqda clipping and clip the norm.
        dqda_clipping = 1.0 if self._clipping else None
        policy_loss = losses.dpg(
            dpg_q_t,
            dpg_a_t,
            tape=tape,
            dqda_clipping=dqda_clipping,
            clip_norm=self._clipping)
        policy_loss = tf.reduce_mean(policy_loss, axis=[0])

    # Get trainable variables.
    policy_variables = self._policy_network.trainable_variables
    critic_variables = (
        # In this agent, the critic loss trains the observation network.
        self._observation_network.trainable_variables +
        self._critic_network.trainable_variables)

    # Compute gradients.
    replica_context = tf.distribute.get_replica_context()
    policy_gradients = _average_gradients_across_replicas(
        replica_context,
        tape.gradient(policy_loss, policy_variables))
    critic_gradients = _average_gradients_across_replicas(
        replica_context,
        tape.gradient(critic_loss, critic_variables))

    # Delete the tape manually because of the persistent=True flag.
    del tape

    # Maybe clip gradients.
    if self._clipping:
        policy_gradients = tf.clip_by_global_norm(policy_gradients, 40.)[0]
        critic_gradients = tf.clip_by_global_norm(critic_gradients, 40.)[0]

    # Apply gradients.
    self._policy_optimizer.apply(policy_gradients, policy_variables)
    self._critic_optimizer.apply(critic_gradients, critic_variables)

    # Losses to track.
    return {
        'critic_loss': critic_loss,
        'policy_loss': policy_loss,
    }
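
# A minimal sketch of the cross-replica averaging assumed of
# `_average_gradients_across_replicas` used in `_step` above, for when the
# learner runs under a `tf.distribute` strategy. This is illustrative only
# (the helper's real implementation may differ) and assumes `tf` is imported
# as elsewhere in this file.
def _average_gradients_across_replicas_sketch(replica_context, gradients):
    """Averages a (possibly nested) structure of gradients over replicas."""
    if replica_context is None:
        # Not running under a distribution strategy; nothing to reduce.
        return gradients
    return replica_context.all_reduce(tf.distribute.ReduceOp.MEAN, gradients)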