import tensorflow as tf


def build_actor_loss(q1_t, q2_t):
    assert_scalar(q1_t)
    assert_scalar(q2_t)
    # clipped double Q: evaluate the policy with the smaller of the two critics
    q_t = tf.minimum(q1_t, q2_t)
    # deterministic policy gradient: maximize Q by minimizing its negation
    loss = -tf.reduce_mean(q_t)
    return loss

def build_q_loss(q_t, rewards_tp1, v_tp1, dones_tp1, gamma):
    assert_scalar(q_t)
    assert_scalar(rewards_tp1)
    assert_scalar(v_tp1)
    assert_scalar(dones_tp1)
    # one-step target bootstrapped from the value function; stop_gradient keeps
    # the target fixed during the critic update, and the bootstrap term is
    # zeroed on terminal transitions
    target = tf.stop_gradient(rewards_tp1 + gamma * v_tp1 * (1.0 - dones_tp1))
    loss = 0.5 * tf.reduce_mean((target - q_t) ** 2)
    return loss

def build_target(rewards_tp1, q1_tp1, q2_tp1, dones_tp1, gamma):
    assert_scalar(rewards_tp1)
    assert_scalar(q1_tp1)
    assert_scalar(q2_tp1)
    assert_scalar(dones_tp1)
    # clipped double Q-learning: bootstrap from the smaller of the two target
    # critics, skipping the bootstrap term on terminal transitions
    q_tp1 = tf.minimum(q1_tp1, q2_tp1)
    target = rewards_tp1 + gamma * q_tp1 * (1.0 - dones_tp1)
    return tf.stop_gradient(target)

def build_critic_loss(q_t, rewards_tp1, q_tp1, dones_tp1, gamma):
    assert_scalar(q_t)
    assert_scalar(rewards_tp1)
    assert_scalar(q_tp1)
    assert_scalar(dones_tp1)
    # mean squared TD error; q_tp1 is expected to come from a target network,
    # so no stop_gradient is applied here
    target = rewards_tp1 + gamma * q_tp1 * (1.0 - dones_tp1)
    loss = tf.reduce_mean(tf.square(target - q_t))
    return loss
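
# A minimal wiring sketch, not part of the repo, showing one way build_critic_loss
# and build_actor_loss could be combined for a clipped double-Q actor-critic
# update; build_target computes the same bootstrapped target as a detached tensor
# when it is needed separately. The placeholders and the gamma default are
# assumptions for illustration; the real graph builds the q tensors from networks.
import tensorflow as tf


def _example_actor_critic_losses(q1_t, q2_t, q1_tp1, q2_tp1, gamma=0.99):
    rewards_tp1 = tf.placeholder(tf.float32, [None, 1], name='rewards_tp1')
    dones_tp1 = tf.placeholder(tf.float32, [None, 1], name='dones_tp1')
    # bootstrap from the smaller of the two target critics
    q_tp1 = tf.minimum(q1_tp1, q2_tp1)
    critic_loss = build_critic_loss(q1_t, rewards_tp1, q_tp1, dones_tp1, gamma) \
        + build_critic_loss(q2_t, rewards_tp1, q_tp1, dones_tp1, gamma)
    # the actor is updated to increase the minimum critic value
    actor_loss = build_actor_loss(q1_t, q2_t)
    return critic_loss, actor_loss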

def build_v_loss(v_t, q1_t, q2_t, log_prob_t):
    assert_scalar(v_t)
    assert_scalar(q1_t)
    assert_scalar(q2_t)
    assert_scalar(log_prob_t)
    # soft value target: min(Q1, Q2) - log pi(a|s), detached from the graph
    q_t = tf.minimum(q1_t, q2_t)
    target = tf.stop_gradient(q_t - log_prob_t)
    loss = 0.5 * tf.reduce_mean((v_t - target) ** 2)
    return loss

import numpy as np
import pytest


def test_assert_scalar():
    # a rank-1 tensor is rejected
    with pytest.raises(AssertionError):
        assert_scalar(np.random.random((4,)))
    # a batch of vectors is rejected
    with pytest.raises(AssertionError):
        assert_scalar(np.random.random((4, 10)))
    # a batch of per-sample scalars with shape (batch_size, 1) is accepted
    assert_scalar(np.random.random((4, 1)))
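
# The test above pins down the contract of assert_scalar: it accepts a batch of
# per-sample scalars shaped (batch_size, 1) and rejects rank-1 tensors and
# batches of vectors. A minimal sketch consistent with that contract follows;
# the actual helper is defined elsewhere in the repo and may differ in detail.
def assert_scalar(tensor):
    shape = tensor.shape
    assert len(shape) == 2 and shape[-1] == 1, \
        'expected shape (batch_size, 1), got {}'.format(shape)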

def build_pi_loss(log_prob_t, q1_t, q2_t):
    assert_scalar(log_prob_t)
    assert_scalar(q1_t)
    assert_scalar(q2_t)
    # soft actor-critic policy objective: minimize E[log pi(a|s) - min(Q1, Q2)]
    q_t = tf.minimum(q1_t, q2_t)
    loss = tf.reduce_mean(log_prob_t - q_t)
    return loss
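
# A minimal wiring sketch, not part of the repo, showing one way build_v_loss,
# build_q_loss and build_pi_loss could be combined into the three SAC updates.
# The placeholder names, shapes and the gamma default are assumptions; the real
# graph builds v_t, v_tp1, q1_t, q2_t and log_prob_t from the networks.
import tensorflow as tf


def _example_sac_losses(v_t, v_tp1, q1_t, q2_t, log_prob_t, gamma=0.99):
    rewards_tp1 = tf.placeholder(tf.float32, [None, 1], name='rewards_tp1')
    dones_tp1 = tf.placeholder(tf.float32, [None, 1], name='dones_tp1')
    # V regresses toward min(Q1, Q2) - log pi
    v_loss = build_v_loss(v_t, q1_t, q2_t, log_prob_t)
    # each Q regresses toward r + gamma * V(s') on non-terminal steps
    q1_loss = build_q_loss(q1_t, rewards_tp1, v_tp1, dones_tp1, gamma)
    q2_loss = build_q_loss(q2_t, rewards_tp1, v_tp1, dones_tp1, gamma)
    # the policy minimizes log pi - min(Q1, Q2)
    pi_loss = build_pi_loss(log_prob_t, q1_t, q2_t)
    return v_loss, q1_loss + q2_loss, pi_loss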

def build_value_loss(values, returns, old_values, epsilon, value_factor):
    assert_scalar(values)
    assert_scalar(returns)
    assert_scalar(old_values)
    with tf.variable_scope('value_loss'):
        # clip how far the new value prediction may move from the old one
        clipped_diff = tf.clip_by_value(values - old_values, -epsilon, epsilon)
        loss_clipped = (old_values + clipped_diff - returns) ** 2
        loss_non_clipped = (returns - values) ** 2
        # pessimistic bound: take the larger of the clipped and unclipped errors
        loss = tf.reduce_mean(tf.maximum(loss_clipped, loss_non_clipped))
    return value_factor * loss

def build_policy_loss(log_probs, old_log_probs, advantages, epsilon):
    assert_scalar(log_probs)
    assert_scalar(old_log_probs)
    assert_scalar(advantages)
    with tf.variable_scope('policy_loss'):
        # probability ratio pi_theta(a|s) / pi_theta_old(a|s)
        ratio = tf.exp(log_probs - old_log_probs)
        # PPO clipped surrogate objective
        surr1 = ratio * advantages
        surr2 = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
        surr = tf.minimum(surr1, surr2)
        loss = -tf.reduce_mean(surr)
    return loss
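
# A minimal usage sketch, not part of the repo, combining the clipped PPO policy
# and value losses above into a single objective. The placeholder names and the
# epsilon/value_factor defaults are assumptions for illustration.
import tensorflow as tf


def _example_ppo_loss(log_probs, old_log_probs, values, old_values,
                      epsilon=0.2, value_factor=0.5):
    returns = tf.placeholder(tf.float32, [None, 1], name='returns')
    advantages = tf.placeholder(tf.float32, [None, 1], name='advantages')
    policy_loss = build_policy_loss(log_probs, old_log_probs, advantages, epsilon)
    value_loss = build_value_loss(values, returns, old_values, epsilon, value_factor)
    # total loss minimized by a single optimizer step
    return policy_loss + value_loss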