def stats(policy, train_batch):
    values_batched = _make_time_major(
        policy,
        train_batch.get("seq_lens"),
        policy.model.value_function(),
        drop_last=policy.config["vtrace"])

    stats_dict = {
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "policy_loss": policy.loss.pi_loss,
        "entropy": policy.loss.entropy,
        "var_gnorm": tf.linalg.global_norm(policy.model.trainable_variables()),
        "vf_loss": policy.loss.vf_loss,
        "vf_explained_var": explained_variance(
            tf.reshape(policy.loss.value_targets, [-1]),
            tf.reshape(values_batched, [-1])),
    }

    if policy.config["vtrace"]:
        is_stat_mean, is_stat_var = tf.nn.moments(policy.loss.is_ratio, [0, 1])
        stats_dict.update({"mean_IS": is_stat_mean})
        stats_dict.update({"var_IS": is_stat_var})

    if policy.config["use_kl_loss"]:
        stats_dict.update({"kl": policy.loss.mean_kl})
        stats_dict.update({"KL_Coeff": policy.kl_coeff})

    return stats_dict
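# The `explained_variance` helper used by these stats functions is not part of
# this excerpt. As a reference, here is a minimal sketch of the usual
# definition, 1 - Var[y - y_pred] / Var[y]; the name
# `explained_variance_sketch` is a placeholder, and the real RLlib utility may
# guard against zero variance or clip differently.
def explained_variance_sketch(y, pred):
    _, y_var = tf.nn.moments(y, axes=[0])
    _, diff_var = tf.nn.moments(y - pred, axes=[0])
    # Clip at -1.0 so a value function that explains nothing does not yield -inf.
    return tf.maximum(-1.0, 1.0 - diff_var / tf.maximum(y_var, 1e-8))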
def central_vf_stats(policy, train_batch, grads):
    # Report the explained variance of the central value function.
    return {
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.central_value_out),
    }
def kl_and_loss_stats(policy: Policy,
                      train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Stats function for PPO. Returns a dict with important KL and loss stats.

    Args:
        policy (Policy): The Policy to generate stats for.
        train_batch (SampleBatch): The SampleBatch (already) used for training.

    Returns:
        Dict[str, TensorType]: The stats dict.
    """
    return {
        "cur_kl_coeff": tf.cast(policy.kl_coeff, tf.float64),
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "total_loss": policy._total_loss,
        "policy_loss": policy._mean_policy_loss,
        "vf_loss": policy._mean_vf_loss,
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.model.value_function()),
        "kl": policy._mean_kl,
        "entropy": policy._mean_entropy,
        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
    }
def grad_stats(policy, train_batch, grads):
    return {
        "grad_gnorm": tf.linalg.global_norm(grads),
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.model.value_function()),
    }
def central_vf_stats(policy, train_batch, grads):
    # Report the explained variance of the central value function.
    return {
        "grad_gnorm": tf.linalg.global_norm(grads),
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.model.value_function()),
    }
def grad_stats(policy: Policy, train_batch: SampleBatch,
               grads: ModelGradients) -> Dict[str, TensorType]:
    return {
        "grad_gnorm": tf.linalg.global_norm(grads),
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.model.value_function()),
    }
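# For context, this is roughly how a stats_fn / grad_stats_fn pair like the
# ones above gets attached to a policy. A minimal sketch assuming an RLlib
# version that still ships the `build_tf_policy` template; `dummy_loss_fn` and
# `MyStatsPolicy` are placeholders, not part of this excerpt, and a real
# loss_fn must set the attributes (e.g. policy._total_loss) that the stats
# function reads back.
from ray.rllib.policy.tf_policy_template import build_tf_policy


def dummy_loss_fn(policy, model, dist_class, train_batch):
    # Placeholder loss; a real policy builds its surrogate objective here.
    return tf.constant(0.0)


MyStatsPolicy = build_tf_policy(
    name="MyStatsPolicy",
    loss_fn=dummy_loss_fn,
    stats_fn=kl_and_loss_stats,  # reported after every training step
    grad_stats_fn=grad_stats,    # additionally sees the computed gradients
)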
def __init__(self, policy, state_values, action_dist, actions, advantages,
             vf_loss_coeff, beta):
    self.v_loss = self._build_value_loss(state_values, advantages)
    self.p_loss = self._build_policy_loss(policy, state_values, advantages,
                                          actions, action_dist, beta)

    self.total_loss = self.p_loss.loss + vf_loss_coeff * self.v_loss.loss
    explained_var = explained_variance(advantages, state_values)
    self.explained_variance = tf.reduce_mean(explained_var)
def __init__(self, policy: Policy, value_estimates: TensorType,
             action_dist: ActionDistribution, train_batch: SampleBatch,
             vf_loss_coeff: float, beta: float):

    # L = - A * log\pi_\theta(a|s)
    logprobs = action_dist.logp(train_batch[SampleBatch.ACTIONS])

    if beta != 0.0:
        cumulative_rewards = train_batch[Postprocessing.ADVANTAGES]
        # Advantage Estimation.
        adv = cumulative_rewards - value_estimates
        adv_squared = tf.reduce_mean(tf.math.square(adv))
        # Value function's loss term (MSE).
        self.v_loss = 0.5 * adv_squared
        # Perform moving averaging of advantage^2.
        rate = policy.config["moving_average_sqd_adv_norm_update_rate"]
        # Update averaged advantage norm.
        # Eager.
        if policy.config["framework"] in ["tf2", "tfe"]:
            update_term = adv_squared - policy._moving_average_sqd_adv_norm
            policy._moving_average_sqd_adv_norm.assign_add(rate * update_term)

            # Exponentially weighted advantages.
            c = tf.math.sqrt(policy._moving_average_sqd_adv_norm)
            exp_advs = tf.math.exp(beta * (adv / (1e-8 + c)))
        # Static graph.
        else:
            update_adv_norm = tf1.assign_add(
                ref=policy._moving_average_sqd_adv_norm,
                value=rate *
                (adv_squared - policy._moving_average_sqd_adv_norm))

            # Exponentially weighted advantages.
            with tf1.control_dependencies([update_adv_norm]):
                exp_advs = tf.math.exp(beta * tf.math.divide(
                    adv,
                    1e-8 + tf.math.sqrt(policy._moving_average_sqd_adv_norm)))
        exp_advs = tf.stop_gradient(exp_advs)

        self.explained_variance = tf.reduce_mean(
            explained_variance(cumulative_rewards, value_estimates))
    else:
        # Value function's loss term (MSE).
        self.v_loss = tf.constant(0.0)
        exp_advs = 1.0

    self.p_loss = -1.0 * tf.reduce_mean(exp_advs * logprobs)

    self.total_loss = self.p_loss + vf_loss_coeff * self.v_loss
def __init__(self, policy, value_estimates, action_dist, actions,
             cumulative_rewards, vf_loss_coeff, beta):

    # Advantage Estimation.
    adv = cumulative_rewards - value_estimates
    adv_squared = tf.reduce_mean(tf.math.square(adv))

    # Value function's loss term (MSE).
    self.v_loss = 0.5 * adv_squared

    if beta != 0.0:
        # Perform moving averaging of advantage^2.

        # Update averaged advantage norm.
        # Eager.
        if policy.config["framework"] in ["tf2", "tfe"]:
            update_term = adv_squared - policy._moving_average_sqd_adv_norm
            policy._moving_average_sqd_adv_norm.assign_add(1e-8 * update_term)

            # Exponentially weighted advantages.
            c = tf.math.sqrt(policy._moving_average_sqd_adv_norm)
            exp_advs = tf.math.exp(beta * (adv / (1e-8 + c)))
        # Static graph.
        else:
            update_adv_norm = tf1.assign_add(
                ref=policy._moving_average_sqd_adv_norm,
                value=1e-6 *
                (adv_squared - policy._moving_average_sqd_adv_norm))

            # Exponentially weighted advantages.
            with tf1.control_dependencies([update_adv_norm]):
                exp_advs = tf.math.exp(beta * tf.math.divide(
                    adv,
                    1e-8 + tf.math.sqrt(policy._moving_average_sqd_adv_norm)))
        exp_advs = tf.stop_gradient(exp_advs)
    else:
        exp_advs = 1.0

    # L = - A * log\pi_\theta(a|s)
    logprobs = action_dist.logp(actions)
    self.p_loss = -1.0 * tf.reduce_mean(exp_advs * logprobs)

    self.total_loss = self.p_loss + vf_loss_coeff * self.v_loss

    self.explained_variance = tf.reduce_mean(
        explained_variance(cumulative_rewards, value_estimates))
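# The branches above implement MARWIL's exponentially weighted advantages,
# exp_advs = exp(beta * adv / sqrt(E[adv^2])), where E[adv^2] is tracked as a
# slow moving average so the advantage scale is normalized over training.
# Below is a minimal eager-mode illustration with made-up numbers; the plain
# tf.Variable stands in for policy._moving_average_sqd_adv_norm and the update
# rate mirrors the small constants used above.
import tensorflow as tf

beta = 1.0
rate = 1e-6
moving_avg_sqd_adv = tf.Variable(100.0)  # running estimate of E[adv^2]

adv = tf.constant([2.0, -1.0, 0.5])
adv_squared = tf.reduce_mean(tf.math.square(adv))
moving_avg_sqd_adv.assign_add(rate * (adv_squared - moving_avg_sqd_adv))
exp_advs = tf.math.exp(beta * adv / (1e-8 + tf.math.sqrt(moving_avg_sqd_adv)))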
def stats(policy, train_batch):
    values_batched = _make_time_major(
        policy,
        train_batch.get("seq_lens"),
        policy.model.value_function(),
        drop_last=policy.config["vtrace"])

    return {
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "policy_loss": policy.loss.pi_loss,
        "entropy": policy.loss.entropy,
        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
        "var_gnorm": tf.linalg.global_norm(policy.model.trainable_variables()),
        "vf_loss": policy.loss.vf_loss,
        "vf_explained_var": explained_variance(
            tf.reshape(policy.loss.value_targets, [-1]),
            tf.reshape(values_batched, [-1])),
    }
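# `_make_time_major` is also not shown in this excerpt. It converts a
# batch-major tensor of shape [B * T, ...] (as stored in the train batch) into
# a time-major tensor of shape [T, B, ...] so it lines up with the V-trace
# value targets; with drop_last=True the final timestep is cut off. A minimal
# sketch for the fixed-rollout (non-RNN) case, assuming the rollout length T
# is known; the real helper also handles seq_lens from RNN batches.
def make_time_major_sketch(tensor, rollout_length, drop_last=False):
    # [B * T, ...] -> [B, T, ...]
    new_shape = tf.concat([[-1, rollout_length], tf.shape(tensor)[1:]], axis=0)
    folded = tf.reshape(tensor, new_shape)
    # [B, T, ...] -> [T, B, ...]
    perm = [1, 0] + list(range(2, 1 + len(tensor.shape)))
    time_major = tf.transpose(folded, perm)
    return time_major[:-1] if drop_last else time_major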
def stats(policy: Policy, train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Stats function for APPO. Returns a dict with important loss stats.

    Args:
        policy (Policy): The Policy to generate stats for.
        train_batch (SampleBatch): The SampleBatch (already) used for training.

    Returns:
        Dict[str, TensorType]: The stats dict.
    """
    values_batched = _make_time_major(
        policy,
        train_batch.get(SampleBatch.SEQ_LENS),
        policy.model.value_function(),
        drop_last=policy.config["vtrace"])

    stats_dict = {
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "total_loss": policy._total_loss,
        "policy_loss": policy._mean_policy_loss,
        "entropy": policy._mean_entropy,
        "var_gnorm": tf.linalg.global_norm(policy.model.trainable_variables()),
        "vf_loss": policy._mean_vf_loss,
        "vf_explained_var": explained_variance(
            tf.reshape(policy._value_targets, [-1]),
            tf.reshape(values_batched, [-1])),
    }

    if policy.config["vtrace"]:
        is_stat_mean, is_stat_var = tf.nn.moments(policy._is_ratio, [0, 1])
        stats_dict["mean_IS"] = is_stat_mean
        stats_dict["var_IS"] = is_stat_var

    if policy.config["use_kl_loss"]:
        stats_dict["kl"] = policy._mean_kl_loss
        stats_dict["KL_Coeff"] = policy.kl_coeff

    return stats_dict
def kl_and_loss_stats(policy, train_batch):
    return {
        "cur_kl_coeff": tf.cast(policy.kl_coeff, tf.float64),
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "total_loss": policy.loss_obj.loss,
        "policy_loss": policy.loss_obj.mean_policy_loss,
        "vf_loss": policy.loss_obj.mean_vf_loss,
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.model.value_function()),
        "kl": policy.loss_obj.mean_kl,
        "entropy": policy.loss_obj.mean_entropy,
        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
    }
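# Whichever stats_fn a policy uses, the returned dict surfaces in the training
# results under the learner info. A rough sketch of reading the values back,
# assuming the default policy ID and the pre-2.0 trainer API; the exact
# nesting (e.g. an extra "learner_stats" level) varies across RLlib versions.
from ray.rllib.agents.ppo import PPOTrainer

trainer = PPOTrainer(env="CartPole-v0", config={"framework": "tf"})
result = trainer.train()
learner_stats = result["info"]["learner"]["default_policy"]
print(learner_stats.get("cur_kl_coeff"), learner_stats.get("vf_explained_var"))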