Example #1
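# Both examples below use the TensorFlow 1.x graph API (under TF2, use
# `import tensorflow.compat.v1 as tf`). The helpers `vtrace`, `clip_rewards`,
# `compute_neg_log_probs`, `compute_entropy_loss`, `compute_policy_cloning_loss`
# and `clip_gradients` are assumed to be defined elsewhere in the surrounding
# codebase; they are not shown here.
import sonnet as snt
import tensorflow as tf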
def build_learner(agent: snt.RNNCore,
                  agent_state,
                  env_outputs,
                  agent_outputs,
                  reward_clipping: str,
                  discounting: float,
                  baseline_cost: float,
                  entropy_cost: float,
                  policy_cloning_cost: float,
                  value_cloning_cost: float,
                  clip_grad_norm: float,
                  clip_advantage: bool,
                  learning_rate: float,
                  batch_size: int,
                  batch_size_from_replay: int,
                  unroll_length: int,
                  reward_scaling: float = 1.0,
                  adam_beta1: float = 0.9,
                  adam_beta2: float = 0.999,
                  adam_epsilon: float = 1e-8,
                  fixed_step_mul: bool = False,
                  step_mul: int = 8):
    """Builds the learner loop.

    Returns:
        A tuple of (done, infos, and environment frames) where
        the environment frames tensor causes an update.

    """
    learner_outputs, _ = agent.unroll(agent_outputs.action, env_outputs,
                                      agent_state)

    # Use last baseline value (from the value function) to bootstrap.
    bootstrap_value = learner_outputs.baseline[-1]

    # At this point, the environment outputs at time step `t` are the inputs that
    # lead to the learner_outputs at time step `t`. After the following shift,
    # the actions in agent_outputs and learner_outputs at time step `t` are what
    # lead to the environment outputs at time step `t`.
    agent_outputs = tf.nest.map_structure(lambda t: t[1:], agent_outputs)
    agent_outputs_from_buffer = tf.nest.map_structure(
        lambda t: t[:, :batch_size_from_replay], agent_outputs)
    learner_outputs_from_buffer = tf.nest.map_structure(
        lambda t: t[:-1, :batch_size_from_replay], learner_outputs)

    rewards, infos, done, _ = tf.nest.map_structure(lambda t: t[1:],
                                                    env_outputs)
    learner_outputs = tf.nest.map_structure(lambda t: t[:-1], learner_outputs)

    rewards = rewards * reward_scaling
    clipped_rewards = clip_rewards(rewards, reward_clipping)
    discounts = tf.to_float(~done) * discounting

    # We only need to learn a step_mul policy if the step multiplier is not fixed.
    if not fixed_step_mul:
        agent_outputs.action['step_mul'] = agent_outputs.step_mul
        agent_outputs.action_logits['step_mul'] = agent_outputs.step_mul_logits
        learner_outputs.action_logits[
            'step_mul'] = learner_outputs.step_mul_logits
        agent_outputs_from_buffer.action_logits[
            'step_mul'] = agent_outputs_from_buffer.step_mul_logits
        learner_outputs_from_buffer.action_logits[
            'step_mul'] = learner_outputs_from_buffer.step_mul_logits

    actions = tf.nest.flatten(
        tf.nest.map_structure(lambda x: tf.squeeze(x, axis=2),
                              agent_outputs.action))
    behaviour_logits = tf.nest.flatten(agent_outputs.action_logits)
    target_logits = tf.nest.flatten(learner_outputs.action_logits)
    behaviour_logits_from_buffer = tf.nest.flatten(
        agent_outputs_from_buffer.action_logits)
    target_logits_from_buffer = tf.nest.flatten(
        learner_outputs_from_buffer.action_logits)

    behaviour_neg_log_probs = sum(
        tf.nest.map_structure(compute_neg_log_probs, behaviour_logits,
                              actions))
    target_neg_log_probs = sum(
        tf.nest.map_structure(compute_neg_log_probs, target_logits, actions))
    entropy_loss = sum(
        tf.nest.map_structure(compute_entropy_loss, target_logits))

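    # `behaviour_neg_log_probs - target_neg_log_probs` equals
    # log pi_target(a) - log pi_behaviour(a), i.e. the log importance ratios
    # (log rhos) expected by v-trace.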
    with tf.device('/cpu'):
        vtrace_returns = vtrace.from_importance_weights(
            log_rhos=behaviour_neg_log_probs - target_neg_log_probs,
            discounts=discounts,
            rewards=clipped_rewards,
            values=learner_outputs.baseline,
            bootstrap_value=bootstrap_value)

    advantages = tf.stop_gradient(vtrace_returns.pg_advantages)
    if clip_advantage:
        # Keep only strictly positive advantages; zero out the rest.
        advantages *= tf.where(advantages > 0.0, tf.ones_like(advantages),
                               tf.zeros_like(advantages))
    policy_gradient_loss = tf.reduce_sum(target_neg_log_probs * advantages)
    baseline_loss = .5 * tf.reduce_sum(
        tf.square(vtrace_returns.vs - learner_outputs.baseline))
    entropy_loss = tf.reduce_sum(entropy_loss)

    # Compute the CLEAR policy cloning and value cloning losses, as described in
    # https://arxiv.org/abs/1811.11682:
    policy_cloning_loss = sum(
        tf.nest.map_structure(compute_policy_cloning_loss,
                              target_logits_from_buffer,
                              behaviour_logits_from_buffer))
    value_cloning_loss = tf.reduce_sum(
        tf.square(learner_outputs_from_buffer.baseline -
                  tf.stop_gradient(agent_outputs_from_buffer.baseline)))

    # Combine individual losses, weighted by cost factors, to build overall loss:
    total_loss = policy_gradient_loss \
                 + baseline_cost * baseline_loss \
                 + entropy_cost * entropy_loss \
                 + policy_cloning_cost * policy_cloning_loss \
                 + value_cloning_cost * value_cloning_loss

    optimizer = tf.train.AdamOptimizer(learning_rate, adam_beta1, adam_beta2,
                                       adam_epsilon)
    parameters = tf.trainable_variables()
    gradients = tf.gradients(total_loss, parameters)
    gradients, grad_norm = clip_gradients(gradients, clip_grad_norm)
    train_op = optimizer.apply_gradients(list(zip(gradients, parameters)))

    # Merge the network update and the environment-frame counter into a single tensor.
    with tf.control_dependencies([train_op]):
        if fixed_step_mul:
            step_env_frames = unroll_length * (
                batch_size - batch_size_from_replay) * step_mul
        else:
            # do not use replay samples to calculate num environment frames
            step_env_frames = tf.to_int64(
                tf.reduce_sum(
                    learner_outputs.step_mul[:, batch_size_from_replay:] + 1))
        num_env_frames_and_train = tf.train.get_global_step().assign_add(
            step_env_frames)

    # Adding a few summaries.
    tf.summary.scalar('learning_rate', learning_rate)
    tf.summary.scalar('entropy_cost', entropy_cost)
    tf.summary.scalar('loss/policy_gradient', policy_gradient_loss)
    tf.summary.scalar('loss/baseline', baseline_loss)
    tf.summary.scalar('loss/entropy', entropy_loss)
    tf.summary.scalar('loss/policy_cloning', policy_cloning_loss)
    tf.summary.scalar('loss/value_cloning', value_cloning_loss)
    tf.summary.scalar('loss/total_loss', total_loss)
    for action_name, action in agent_outputs.action.items():
        tf.summary.histogram(f'action/{action_name}', action)
    tf.summary.scalar('grad_norm', grad_norm)

    return done, infos, num_env_frames_and_train
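For reference, the policy cloning term above follows CLEAR (https://arxiv.org/abs/1811.11682), which penalizes divergence between the current policy and the behaviour policy stored in the replay buffer. The repository's `compute_policy_cloning_loss` helper is not shown here; the snippet below is a minimal, standalone sketch of one plausible implementation (a summed KL divergence with the replayed policy held fixed), not the actual code.

import tensorflow as tf


def compute_policy_cloning_loss_sketch(target_logits, behaviour_logits):
    """Sum of KL(behaviour || target) over all time steps and batch entries."""
    # The replayed behaviour policy acts as a fixed target: stop its gradient.
    behaviour_logits = tf.stop_gradient(behaviour_logits)
    behaviour_probs = tf.nn.softmax(behaviour_logits)
    kl = behaviour_probs * (tf.nn.log_softmax(behaviour_logits) -
                            tf.nn.log_softmax(target_logits))
    return tf.reduce_sum(kl)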
Example #2
def build_critic_learner(agent: snt.RNNCore,
                         agent_state,
                         env_outputs,
                         agent_outputs,
                         reward_clipping: str,
                         discounting: float,
                         clip_grad_norm: float,
                         learning_rate: float,
                         batch_size: int,
                         batch_size_from_replay: int,
                         unroll_length: int,
                         reward_scaling: float = 1.0,
                         adam_beta1: float = 0.9,
                         adam_beta2: float = 0.999,
                         adam_epsilon: float = 1e-8,
                         fixed_step_mul: bool = False,
                         step_mul: int = 8):
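    """Builds the critic pre-training loop.

    Regresses the baseline (value function) onto bootstrapped discounted
    returns and returns (num_env_frames_and_train, summary_op), where the
    environment-frames tensor also triggers the training update.
    """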
    learner_outputs, _ = agent.unroll(agent_outputs.action, env_outputs,
                                      agent_state)

    bootstrap_value = learner_outputs.baseline[-1]
    rewards, infos, done, _ = tf.nest.map_structure(lambda t: t[1:],
                                                    env_outputs)
    learner_outputs = tf.nest.map_structure(lambda t: t[:-1], learner_outputs)

    rewards = rewards * reward_scaling
    clipped_rewards = clip_rewards(rewards, reward_clipping)
    discounts = tf.to_float(~done) * discounting

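    # Reverse scan computes bootstrapped n-step discounted returns:
    # returns[t] = clipped_rewards[t] + discounts[t] * returns[t + 1],
    # seeded with bootstrap_value beyond the end of the unroll.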
    returns = tf.scan(lambda a, x: x[0] + x[1] * a,
                      elems=[clipped_rewards, discounts],
                      initializer=bootstrap_value,
                      parallel_iterations=1,
                      reverse=True,
                      back_prop=False)

    baseline_loss = .5 * tf.reduce_sum(
        tf.square(returns - learner_outputs.baseline))

    # Optimization
    optimizer = tf.train.AdamOptimizer(learning_rate, adam_beta1, adam_beta2,
                                       adam_epsilon)
    parameters = tf.trainable_variables()
    gradients = tf.gradients(baseline_loss, parameters)
    gradients, grad_norm = clip_gradients(gradients, clip_grad_norm)
    train_op = optimizer.apply_gradients(list(zip(gradients, parameters)))

    # Merge the network update and the environment-frame counter into a single tensor.
    with tf.control_dependencies([train_op]):
        if fixed_step_mul:
            step_env_frames = unroll_length * (
                batch_size - batch_size_from_replay) * step_mul
        else:
            # do not use replay samples to calculate num environment frames
            step_env_frames = tf.to_int64(
                tf.reduce_sum(
                    learner_outputs.step_mul[:, batch_size_from_replay:] + 1))
        num_env_frames_and_train = tf.train.get_global_step().assign_add(
            step_env_frames)

    # Adding a few summaries.
    tf.summary.scalar('critic_pretrain/learning_rate', learning_rate,
                      ['critic_pretrain_summaries'])
    tf.summary.scalar('critic_pretrain/baseline_loss', baseline_loss,
                      ['critic_pretrain_summaries'])
    tf.summary.scalar('critic_pretrain/grad_norm', grad_norm,
                      ['critic_pretrain_summaries'])

    summary_op = tf.summary.merge_all('critic_pretrain_summaries')

    return num_env_frames_and_train, summary_op
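For intuition, the reverse `tf.scan` above is equivalent to the NumPy recursion below. This is a standalone sketch for illustration only; the time-major `[T, B]` shapes match the code above, but the toy numbers are made up.

import numpy as np


def discounted_returns(rewards, discounts, bootstrap_value):
    """rewards, discounts: [T, B] time-major arrays; bootstrap_value: [B]."""
    returns = np.zeros_like(rewards)
    acc = bootstrap_value
    for t in reversed(range(rewards.shape[0])):
        acc = rewards[t] + discounts[t] * acc
        returns[t] = acc
    return returns


# Two time steps, one batch element, discount 0.9, bootstrap value 1.0:
# returns[1] = 2.0 + 0.9 * 1.0 = 2.9, returns[0] = 1.0 + 0.9 * 2.9 = 3.61.
print(discounted_returns(np.array([[1.0], [2.0]]),
                         np.array([[0.9], [0.9]]),
                         np.array([1.0])))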