def update_variable(self, var, grad_var):
  """Update the variable and its slots."""
  params = self.params
  global_step = tf.to_float(self.global_step) + 1

  # Compute the learning rate: "noam" decay (linear warmup followed by
  # inverse-sqrt decay) or plain linear warmup with no decay.
  lrate = params.learning_rate
  if params.learning_rate_decay_scheme == "noam":
    lrate *= tf.minimum(global_step * params.learning_rate_warmup_steps**-1.5,
                        global_step**-0.5)
  else:
    assert params.learning_rate_decay_scheme == "none"
    lrate *= tf.minimum(global_step / params.learning_rate_warmup_steps, 1.0)

  # Compute the adjustment due to the second moment.
  slots = params.slots[var.op.name]
  grad_squared = tf.square(grad_var)
  beta2_pow = tf.pow(params.beta2, global_step)
  if params.factored_second_moment_accumulator and len(var.shape) == 2:
    # For a 2-D variable, keep only per-row and per-column accumulators
    # instead of a full-size one, saving memory.
    vr_update = tf.assign(slots["adam_vr"],
                          slots["adam_vr"] * params.beta2 +
                          tf.reduce_mean(grad_squared, 1, keep_dims=True) *
                          (1.0 - params.beta2))
    vc_update = tf.assign(slots["adam_vc"],
                          slots["adam_vc"] * params.beta2 +
                          tf.reduce_mean(grad_squared, 0, keep_dims=True) *
                          (1.0 - params.beta2))
    with tf.control_dependencies([vr_update, vc_update]):
      # Divide by (1 - beta2^t) to correct the zero-initialization bias.
      vr = tf.sqrt(slots["adam_vr"] / (1.0 - beta2_pow)) + params.epsilon
      vc = tf.sqrt(slots["adam_vc"] / (1.0 - beta2_pow)) + params.epsilon
      vc /= tf.reduce_mean(vc)
      denom = vr * vc
  else:
    v_update = tf.assign(slots["adam_v"],
                         slots["adam_v"] * params.beta2 +
                         grad_squared * (1.0 - params.beta2))
    with tf.control_dependencies([v_update]):
      denom = tf.sqrt(slots["adam_v"] / (1.0 - beta2_pow)) + params.epsilon

  # Compute momentum if applicable.
  if params.beta1 != 0.0:
    m_update = tf.assign(slots["adam_m"],
                         slots["adam_m"] * params.beta1 +
                         grad_var * (1.0 - params.beta1))
    with tf.control_dependencies([m_update]):
      grad_var = slots["adam_m"]

  # Apply the update, requantizing the result.
  subtrahend = lrate * grad_var / denom
  new_val = _quantize(_dequantize(var, params) - subtrahend, params)
  return tf.assign(var, new_val)
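
# Illustrative sketch (an addition for exposition, not original library
# code): it demonstrates why the factored accumulator in update_variable()
# can stand in for the full second-moment matrix.
def _factored_second_moment_sketch():
  """Check the factored second-moment reconstruction on a rank-1 matrix.

  The factored branch above stores only row means ("adam_vr") and column
  means ("adam_vc") of the squared-gradient accumulator and rebuilds a
  full-size denominator as vr * (vc / mean(vc)). For a rank-1 accumulator
  that reconstruction is exact, as this NumPy check shows;
  update_variable() applies the same normalization after taking square
  roots, which makes it an approximation there. Assumes numpy is
  importable; the helper name is hypothetical.
  """
  import numpy as np
  a = np.random.rand(4, 1)
  b = np.random.rand(1, 6)
  v = a * b                           # rank-1 accumulator, shape (4, 6)
  vr = v.mean(axis=1, keepdims=True)  # row means, shape (4, 1)
  vc = v.mean(axis=0, keepdims=True)  # column means, shape (1, 6)
  # mean(vc) equals the overall mean of v, so vr * vc / mean(vc)
  # reconstructs v exactly when v is rank-1.
  np.testing.assert_allclose(vr * vc / vc.mean(), v)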