import tensorflow as tf
import blocksparse as bs


def conv1d(x, scope, nf, std=0.02, relu=False, fast_gelu=False):
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        ndims = x.shape.ndims

        # Note: param initializers are not particularly well tuned in this code
        # w = tf.get_variable("w", [nx, nf], initializer=tf.random_normal_initializer(stddev=std), dtype=tf.float32)
        w = tf.get_variable("w", [nx, nf], initializer=tf.random_uniform_initializer(minval=-std, maxval=std), dtype=tf.float32)
        b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0), dtype=tf.float32)

        # if hps.float16:
        #     # We delay weight casting till just before use to minimize memory footprint.
        #     # In recompute mode these casts are released just after use on forward pass,
        #     # then remade on the recompute pass.
        #     with tf.control_dependencies([x.op]):
        #         # By setting dx_dtype to float16 we prevent useless casting back to fp32 in the backwards pass.
        #         # Our all-reduce and fused optimizers can accept fp16 natively.
        #         w = bs.float_cast(w, dtype=tf.float16, dx_dtype=tf.float16)

        # merge context and batch dims for more efficient matmul
        if ndims > 2:
            y_shape = tf.concat([x.shape[:ndims - 1], [nf]], axis=0)
            x = tf.reshape(x, [-1, nx])

        y = tf.matmul(x, w)

        # avoid atomics in bias grad, but be careful as tf handles temp memory badly in the presence of async ops like all-reduce
        y = bs.bias_relu(y, b, relu=relu, fast_gelu=fast_gelu, atomics=False)

        if ndims > 2:
            y = tf.reshape(y, y_shape)

        return y
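
# `create_initializer` is referenced by the variants below but not defined here;
# a minimal sketch, assuming the usual truncated-normal helper (the exact
# definition in the original codebase may differ):
def create_initializer(initializer_range=0.02):
    # truncated normal keeps initial weights within two standard deviations of zero
    return tf.truncated_normal_initializer(stddev=initializer_range)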
def conv1d(self, x, scope, channel, std=0.02, relu=False, fast_gelu=False, bias=True):
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        ndims = x.shape.ndims

        # Note: param initializers are not particularly well tuned in this code
        w = tf.get_variable("w", [nx, channel],
                            initializer=create_initializer(initializer_range=std),
                            dtype=tf.float16 if self.config.float16 else tf.float32)
        if bias:
            b = tf.get_variable("bias", [channel], initializer=tf.constant_initializer(0.0))
        else:
            b = 0

        # merge context and batch dims for more efficient matmul
        if ndims > 2:
            y_shape = tf.concat([tf.shape(x)[:ndims - 1], [channel]], axis=0)
            x = tf.reshape(x, [-1, nx])

        y = tf.matmul(x, w)

        # avoid atomics in bias grad, but be careful as tf handles temp memory badly in the presence of async ops like all-reduce
        y = bs.bias_relu(y, b, relu=relu, fast_gelu=fast_gelu, atomics=False)

        if ndims > 2:
            y = tf.reshape(y, y_shape)

        return y
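
# The method variant above reads `self.config.float16`; a minimal sketch of the
# kind of config object it assumes (only the field name comes from the code,
# the class itself is hypothetical):
class Config(object):
    def __init__(self, float16=False):
        # when True, weights are created directly in fp16
        self.float16 = float16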
def dense(x, hidden_size, activation=None, name='dense', kernel_initializer=None, bias=True):
    if kernel_initializer is None:
        kernel_initializer = create_initializer(0.02)
    with tf.variable_scope(name):
        nx = x.shape[-1].value
        ndims = x.shape.ndims
        dtype = x.dtype

        # Note: param initializers are not particularly well tuned in this code
        w = tf.get_variable("kernel", [nx, hidden_size], initializer=kernel_initializer, dtype=dtype)
        # the input is expected to already be pinned to an explicit device
        assert x.op.device != ''
        if bias:
            b = tf.get_variable("bias", [hidden_size], initializer=tf.zeros_initializer)
        else:
            b = 0

        # merge context and batch dims for more efficient matmul
        if ndims > 2:
            y_shape = tf.concat([tf.shape(x)[:ndims - 1], [hidden_size]], axis=0)
            x = tf.reshape(x, [-1, nx])

        y = tf.matmul(x, w)

        # 'relu' and 'gelu'/'fast_gelu' are fused into the bias add; avoid atomics in bias grad
        fast_gelu = activation in ('fast_gelu', 'gelu')
        relu = activation == 'relu'
        y = bs.bias_relu(y, b, relu=relu, fast_gelu=fast_gelu, atomics=False)

        # remaining activations are applied as separate ops
        if activation == 'tanh':
            y = tf.tanh(y)
        elif activation == 'sigmoid':
            y = tf.sigmoid(y)

        if ndims > 2:
            y = tf.reshape(y, y_shape)

        return y
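
# Usage sketch for `dense` (names and shapes are illustrative, not from the
# original codebase): a BERT-style pooler that projects the first token's
# hidden state through a tanh-activated dense layer.
def pooler(sequence_output):
    # sequence_output: [batch, seq_len, hidden]; take the first token
    first_token = sequence_output[:, 0, :]
    return dense(first_token, sequence_output.shape[-1].value,
                 activation='tanh', name='pooler')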
def conv1d(x, scope, channel, std=0.02, relu=False, fast_gelu=False, bias=True, float16=False):
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        ndims = x.shape.ndims

        # Note: param initializers are not particularly well tuned in this code
        w = tf.get_variable("w", [nx, channel],
                            initializer=create_initializer(initializer_range=std))
        if bias:
            b = tf.get_variable("bias", [channel], initializer=tf.constant_initializer(0.0))
        else:
            b = tf.zeros([channel])

        if float16:
            # We delay weight casting till just before use to minimize memory footprint.
            # In recompute mode these casts are released just after use on forward pass,
            # then remade on the recompute pass.
            with tf.control_dependencies([x.op]):
                # By setting dx_dtype to float16 we prevent useless casting back to fp32 in the backwards pass.
                # Our all-reduce and fused optimizers can accept fp16 natively.
                w = bs.float_cast(w, dtype=tf.float16, dx_dtype=tf.float16)

        # merge context and batch dims for more efficient matmul
        if ndims > 2:
            y_shape = tf.concat([tf.shape(x)[:ndims - 1], [channel]], axis=0)
            x = tf.reshape(x, [-1, nx])

        y = tf.matmul(x, w)

        # avoid atomics in bias grad, but be careful as tf handles temp memory badly in the presence of async ops like all-reduce
        y = bs.bias_relu(y, b, relu=relu, fast_gelu=fast_gelu, atomics=False)

        if ndims > 2:
            y = tf.reshape(y, y_shape)

        return y
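
# A minimal sketch of how conv1d composes into a GPT-style transformer MLP
# block (the helper name, scopes, and 4x expansion are illustrative
# conventions, not taken from this codebase):
def mlp_block(x, scope, float16=False):
    n_state = x.shape[-1].value
    with tf.variable_scope(scope):
        # expand to 4x width with a fused bias + GELU, then project back
        h = conv1d(x, "c_fc", 4 * n_state, fast_gelu=True, float16=float16)
        return conv1d(h, "c_proj", n_state, float16=float16)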