# Imports assumed from the surrounding example script (pre-1.0 RLlib
# module layout); run_heuristic_vs_learned is defined earlier in the
# same script.
from ray.rllib.agents.pg.pg import PGTrainer
from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
from ray.rllib.utils import try_import_tf

tf = try_import_tf()


def run_with_custom_entropy_loss():
    """Example of customizing the loss function of an existing policy.

    This performs about the same as the default loss does."""

    def entropy_policy_gradient_loss(policy, model, dist_class, train_batch):
        # Recompute the action distribution from the model's logits.
        logits, _ = model.from_batch(train_batch)
        action_dist = dist_class(logits, model)
        # Vanilla policy gradient loss plus an entropy bonus (coefficient 0.1).
        return (-0.1 * action_dist.entropy() - tf.reduce_mean(
            action_dist.logp(train_batch["actions"]) *
            train_batch["advantages"]))

    # Clone the stock PG policy/trainer classes with the custom loss swapped in.
    EntropyPolicy = PGTFPolicy.with_updates(
        loss_fn=entropy_policy_gradient_loss)
    EntropyLossPG = PGTrainer.with_updates(
        name="EntropyPG", get_policy_class=lambda _: EntropyPolicy)
    run_heuristic_vs_learned(use_lstm=True, trainer=EntropyLossPG)
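
# For comparison, a minimal sketch of the same loss without the entropy
# term, i.e. the plain REINFORCE objective that PGTFPolicy uses by default.
# The function name below is ours for illustration; RLlib's actual
# implementation reads the batch keys via SampleBatch/Postprocessing
# constants rather than string literals.
def plain_policy_gradient_loss(policy, model, dist_class, train_batch):
    logits, _ = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)
    # Maximize advantage-weighted log-probabilities of the taken actions.
    return -tf.reduce_mean(
        action_dist.logp(train_batch["actions"]) *
        train_batch["advantages"])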