Example #1
def build_actor_loss(q1_t, q2_t):
    assert_scalar(q1_t)
    assert_scalar(q2_t)

    # Clipped double-Q: use the smaller of the two critic estimates.
    q_t = tf.minimum(q1_t, q2_t)
    # The actor maximizes Q, so the loss is the negated mean Q-value.
    loss = -tf.reduce_mean(q_t)
    return loss
Example #2
def build_q_loss(q_t, rewards_tp1, v_tp1, dones_tp1, gamma):
    assert_scalar(q_t)
    assert_scalar(rewards_tp1)
    assert_scalar(v_tp1)
    assert_scalar(dones_tp1)

    # One-step TD target: r_{t+1} + gamma * V(s_{t+1}) * (1 - done),
    # wrapped in stop_gradient so it is treated as a constant.
    target = tf.stop_gradient(rewards_tp1 + gamma * v_tp1 * (1.0 - dones_tp1))
    loss = 0.5 * tf.reduce_mean((target - q_t)**2)
    return loss
Example #3
def build_target(rewards_tp1, q1_tp1, q2_tp1, dones_tp1, gamma):
    assert_scalar(rewards_tp1)
    assert_scalar(q1_tp1)
    assert_scalar(q2_tp1)
    assert_scalar(dones_tp1)

    # Clipped double-Q target: take the smaller of the two Q estimates,
    # mask out terminal transitions, and block gradients through the target.
    q_tp1 = tf.minimum(q1_tp1, q2_tp1)
    target = rewards_tp1 + gamma * q_tp1 * (1.0 - dones_tp1)
    return tf.stop_gradient(target)
Example #4
def build_critic_loss(q_t, rewards_tp1, q_tp1, dones_tp1, gamma):
    assert_scalar(q_t)
    assert_scalar(rewards_tp1)
    assert_scalar(q_tp1)
    assert_scalar(dones_tp1)

    # One-step TD target: r_{t+1} + gamma * Q(s_{t+1}, a_{t+1}) * (1 - done).
    # Unlike build_q_loss above, no stop_gradient is applied to the target here.
    target = rewards_tp1 + gamma * q_tp1 * (1.0 - dones_tp1)
    loss = tf.reduce_mean(tf.square(target - q_t))
    return loss
Example #5
def build_v_loss(v_t, q1_t, q2_t, log_prob_t):
    assert_scalar(v_t)
    assert_scalar(q1_t)
    assert_scalar(q2_t)
    assert_scalar(log_prob_t)

    # Value target: min(Q1, Q2) - log pi(a|s), treated as a constant.
    q_t = tf.minimum(q1_t, q2_t)
    target = tf.stop_gradient(q_t - log_prob_t)
    loss = 0.5 * tf.reduce_mean((v_t - target)**2)
    return loss
Example #6
def test_assert_scalar():
    # assert_scalar should reject flat (batch,) and (batch, n) arrays...
    with pytest.raises(AssertionError):
        assert_scalar(np.random.random((4, )))
    with pytest.raises(AssertionError):
        assert_scalar(np.random.random((4, 10)))

    # ...and accept per-sample scalars of shape (batch, 1).
    assert_scalar(np.random.random((4, 1)))
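For reference, assert_scalar itself does not appear in these examples. A minimal sketch consistent with the test above, assuming the helper simply checks for a (batch, 1) shape on NumPy arrays and TensorFlow tensors (the actual implementation may differ):

# Hypothetical assert_scalar, inferred from the test above.
def assert_scalar(tensor):
    if hasattr(tensor, 'get_shape'):
        shape = tensor.get_shape().as_list()
    else:
        shape = list(tensor.shape)
    assert len(shape) == 2 and shape[1] == 1, \
        'expected shape (batch, 1), but got {}'.format(shape)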
Example #7
def build_pi_loss(log_prob_t, q1_t, q2_t):
    assert_scalar(log_prob_t)
    assert_scalar(q1_t)
    assert_scalar(q2_t)

    # Policy loss: E[log pi(a|s) - min(Q1, Q2)]; minimizing this pushes the
    # policy toward actions with high Q-value and high entropy.
    q_t = tf.minimum(q1_t, q2_t)
    loss = tf.reduce_mean(log_prob_t - q_t)
    return loss
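As a usage note (not part of the original source), the soft actor-critic style losses above would typically be wired together along these lines; the placeholder names, shapes, and gamma value are illustrative assumptions, written for TF 1.x to match the examples.

# Illustrative wiring of the losses above; in practice the q/v/log_prob
# tensors would come from critic, value, and policy networks.
import tensorflow as tf

q1_t = tf.placeholder(tf.float32, [None, 1])        # Q1(s_t, a_t)
q2_t = tf.placeholder(tf.float32, [None, 1])        # Q2(s_t, a_t)
v_t = tf.placeholder(tf.float32, [None, 1])         # V(s_t)
v_tp1 = tf.placeholder(tf.float32, [None, 1])       # target V(s_{t+1})
log_prob_t = tf.placeholder(tf.float32, [None, 1])  # log pi(a_t|s_t)
rewards_tp1 = tf.placeholder(tf.float32, [None, 1])
dones_tp1 = tf.placeholder(tf.float32, [None, 1])

q1_loss = build_q_loss(q1_t, rewards_tp1, v_tp1, dones_tp1, gamma=0.99)
q2_loss = build_q_loss(q2_t, rewards_tp1, v_tp1, dones_tp1, gamma=0.99)
v_loss = build_v_loss(v_t, q1_t, q2_t, log_prob_t)
pi_loss = build_pi_loss(log_prob_t, q1_t, q2_t)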
Example #8
def build_value_loss(values, returns, old_values, epsilon, value_factor):
    assert_scalar(values)
    assert_scalar(returns)
    assert_scalar(old_values)

    with tf.variable_scope('value_loss'):
        # Clip the value update to stay within epsilon of the old prediction.
        clipped_diff = tf.clip_by_value(values - old_values, -epsilon, epsilon)
        loss_clipped = (old_values + clipped_diff - returns)**2
        loss_non_clipped = (returns - values)**2
        # Pessimistic (elementwise maximum) of the clipped and unclipped errors.
        loss = tf.reduce_mean(tf.maximum(loss_clipped, loss_non_clipped))
        return value_factor * loss
Example #9
def build_policy_loss(log_probs, old_log_probs, advantages, epsilon):
    assert_scalar(log_probs)
    assert_scalar(old_log_probs)
    assert_scalar(advantages)

    with tf.variable_scope('policy_loss'):
        # Probability ratio pi(a|s) / pi_old(a|s).
        ratio = tf.exp(log_probs - old_log_probs)
        # Clipped surrogate objective: take the elementwise minimum of the
        # unclipped and clipped terms, then negate so it can be minimized.
        surr1 = ratio * advantages
        surr2 = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
        surr = tf.minimum(surr1, surr2)
        loss = -tf.reduce_mean(surr)
    return loss
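Similarly, a rough sketch of how the PPO-style policy and value losses above might be combined into a single objective; the clipping epsilon and value coefficient below are illustrative choices, not values taken from the original source.

# Hypothetical combination of the PPO-style losses above (TF 1.x).
import tensorflow as tf

log_probs = tf.placeholder(tf.float32, [None, 1])
old_log_probs = tf.placeholder(tf.float32, [None, 1])
advantages = tf.placeholder(tf.float32, [None, 1])
values = tf.placeholder(tf.float32, [None, 1])
old_values = tf.placeholder(tf.float32, [None, 1])
returns = tf.placeholder(tf.float32, [None, 1])

policy_loss = build_policy_loss(log_probs, old_log_probs, advantages, epsilon=0.2)
value_loss = build_value_loss(values, returns, old_values, epsilon=0.2, value_factor=0.5)
total_loss = policy_loss + value_loss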