# Older RLlib API, in which the loss function receives (policy, batch_tensors).
# Import paths as of roughly Ray 0.7; later releases moved these modules.
import tensorflow as tf

from ray.rllib.agents.pg.pg import PGTrainer
from ray.rllib.agents.pg.pg_policy import PGTFPolicy


def run_with_custom_entropy_loss():
    """Example of customizing the loss function of an existing policy.

    This performs about the same as the default loss does."""
    def entropy_policy_gradient_loss(policy, batch_tensors):
        # Vanilla policy-gradient loss plus an entropy bonus (weight 0.1)
        # to encourage exploration.
        actions = batch_tensors["actions"]
        advantages = batch_tensors["advantages"]
        return (-0.1 * policy.action_dist.entropy() -
                tf.reduce_mean(policy.action_dist.logp(actions) * advantages))

    # Clone the PG policy and trainer classes with the custom loss swapped in.
    EntropyPolicy = PGTFPolicy.with_updates(
        loss_fn=entropy_policy_gradient_loss)
    EntropyLossPG = PGTrainer.with_updates(
        name="EntropyPG", get_policy_class=lambda _: EntropyPolicy)
    # run_heuristic_vs_learned() is defined elsewhere in the RLlib example
    # script this snippet comes from.
    run_heuristic_vs_learned(use_lstm=True, trainer=EntropyLossPG)
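
For reference, the loss above is the standard policy-gradient objective with an entropy bonus: the batch mean of -0.1 * H(pi(.|s)) - log pi(a|s) * A. Below is a standalone sketch of the same computation in plain TensorFlow; the logits, actions, and advantages are made-up toy values, not part of the original example.

import tensorflow as tf

# Hypothetical toy batch: logits over 3 actions, the actions taken,
# and their advantages.
logits = tf.constant([[1.0, 0.5, -0.5], [0.2, 0.1, 0.7]])
actions = tf.constant([0, 2])
advantages = tf.constant([1.5, -0.8])

probs = tf.nn.softmax(logits)
log_probs = tf.nn.log_softmax(logits)
entropy = -tf.reduce_sum(probs * log_probs, axis=-1)  # H(pi(.|s)) per sample
logp = tf.reduce_sum(
    tf.one_hot(actions, 3) * log_probs, axis=-1)      # log pi(a|s)

loss = tf.reduce_mean(-0.1 * entropy - logp * advantages)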
Example #2
# The same example against the newer RLlib API, in which the loss function
# receives (policy, model, dist_class, train_batch). Import paths as of
# roughly Ray 0.8; adjust for the installed version.
from ray.rllib.agents.pg import PGTrainer
from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
from ray.rllib.utils import try_import_tf

tf = try_import_tf()


def run_with_custom_entropy_loss():
    """Example of customizing the loss function of an existing policy.

    This performs about the same as the default loss does."""
    def entropy_policy_gradient_loss(policy, model, dist_class, train_batch):
        # Recompute the action distribution from the model's output for this
        # batch, then apply the same entropy-regularized PG loss as above.
        logits, _ = model.from_batch(train_batch)
        action_dist = dist_class(logits, model)
        return (-0.1 * action_dist.entropy() - tf.reduce_mean(
            action_dist.logp(train_batch["actions"]) *
            train_batch["advantages"]))

    EntropyPolicy = PGTFPolicy.with_updates(
        loss_fn=entropy_policy_gradient_loss)
    EntropyLossPG = PGTrainer.with_updates(
        name="EntropyPG", get_policy_class=lambda _: EntropyPolicy)
    run_heuristic_vs_learned(use_lstm=True, trainer=EntropyLossPG)
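
A minimal sketch of how either variant would be launched, assuming it lives in the RLlib example script that defines run_heuristic_vs_learned():

import ray

if __name__ == "__main__":
    # Start Ray locally, then train the entropy-regularized PG policy
    # against the heuristic opponent.
    ray.init()
    run_with_custom_entropy_loss()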