def gumbel_noise(shape, eps=None):
    """Draw a tensor of Gumbel noise with the requested shape.

    A uniform sample ``u`` in ``[0, 1)`` is pushed through
    ``-log(-log(u))``; ``eps`` is added inside both logarithms so that
    neither one is evaluated at exactly zero.

    Args:
        shape: shape of the noise tensor, forwarded to ``tf.random_uniform``.
        eps: small constant added before each ``log``. When ``None`` the
            value of ``dtype.epsilon()`` is used.

    Returns:
        A tensor of shape ``shape`` holding the transformed samples.
    """
    eps = dtype.epsilon() if eps is None else eps

    uniform = tf.random_uniform(shape, minval=0, maxval=1)
    neg_log_u = -tf.log(uniform + eps)
    return -tf.log(neg_log_u + eps)
def rms_norm(x, eps=None, scope=None):
    """Root-mean-square normalization over the last axis of ``x``.

    ``x`` is divided by ``sqrt(mean(x ** 2) + eps)``, with the mean taken
    over the last axis, and multiplied by a learned per-feature ``scale``
    (initialized to ones). No mean is subtracted and no offset is added.

    Args:
        x: input tensor; its last dimension determines the size of ``scale``.
        eps: constant added to the mean square before the reciprocal square
            root. When ``None`` the value of ``dtype.epsilon()`` is used.
        scope: variable scope name; defaults to ``"rms_norm"``.

    Returns:
        The normalized and rescaled tensor.
    """
    eps = dtype.epsilon() if eps is None else eps

    float_type = tf.as_dtype(dtype.floatx())
    with tf.variable_scope(scope or "rms_norm", dtype=float_type):
        hidden_size = util.shape_list(x)[-1]
        scale = tf.get_variable(
            "scale",
            [hidden_size],
            initializer=tf.ones_initializer(),
        )

        mean_square = tf.reduce_mean(x ** 2, -1, keep_dims=True)

        # Keep the (scale * x) * rsqrt(...) association so results match
        # the straightforward left-to-right product exactly.
        scaled = scale * x
        return scaled * tf.rsqrt(mean_square + eps)
def layer_norm(x, eps=None, scope=None):
    """Layer normalization over the last axis of ``x``.

    The input is centered by its mean and divided by
    ``sqrt(variance + eps)``, both statistics being taken over the last
    axis, then transformed by a learned per-feature ``scale`` (initialized
    to ones) and ``offset`` (initialized to zeros).

    Args:
        x: input tensor; its last dimension determines the size of the
            ``scale`` and ``offset`` variables.
        eps: constant added to the variance before the reciprocal square
            root. When ``None`` the value of ``dtype.epsilon()`` is used.
        scope: variable scope name; defaults to ``"layer_norm"``.

    Returns:
        The normalized, rescaled and shifted tensor.
    """
    eps = dtype.epsilon() if eps is None else eps

    float_type = tf.as_dtype(dtype.floatx())
    with tf.variable_scope(scope or "layer_norm", dtype=float_type):
        hidden_size = util.shape_list(x)[-1]
        scale = tf.get_variable(
            "scale",
            [hidden_size],
            initializer=tf.ones_initializer(),
        )
        offset = tf.get_variable(
            "offset",
            [hidden_size],
            initializer=tf.zeros_initializer(),
        )

        mean = tf.reduce_mean(x, -1, keep_dims=True)
        variance = tf.reduce_mean((x - mean) ** 2, -1, keep_dims=True)

        # x - mean is intentionally rebuilt here rather than shared with the
        # variance computation, so the set of graph ops stays the same.
        scaled = scale * (x - mean)
        return scaled * tf.rsqrt(variance + eps) + offset
def gated_rms_norm(x, eps=None, scope=None):
    """RMS normalization followed by an element-wise sigmoid gate.

    The input is normalized by ``sqrt(mean(x ** 2) + eps)`` over the last
    axis and multiplied by a learned per-feature ``scale`` (initialized to
    ones). The result is then multiplied by ``sigmoid(gate * x)``, where
    ``gate`` is a second learned per-feature vector.

    Args:
        x: input tensor; its last dimension determines the size of the
            ``scale`` and ``gate`` variables.
        eps: constant added to the mean square before the reciprocal square
            root. When ``None`` the value of ``dtype.epsilon()`` is used.
        scope: variable scope name. NOTE: the default is ``"rms_norm"``,
            not ``"gated_rms_norm"``.

    Returns:
        The normalized, rescaled and gated tensor.
    """
    eps = dtype.epsilon() if eps is None else eps

    float_type = tf.as_dtype(dtype.floatx())
    with tf.variable_scope(scope or "rms_norm", dtype=float_type):
        hidden_size = util.shape_list(x)[-1]
        scale = tf.get_variable(
            "scale",
            [hidden_size],
            initializer=tf.ones_initializer(),
        )
        # No explicit initializer: tf.get_variable falls back to the
        # enclosing variable scope's default initializer.
        gate = tf.get_variable("gate", [hidden_size], initializer=None)

        mean_square = tf.reduce_mean(x ** 2, -1, keep_dims=True)
        normalized = scale * x * tf.rsqrt(mean_square + eps)

        # adding gating here which slightly improves quality
        gating = tf.nn.sigmoid(gate * x)
        return normalized * gating