def clip_by_global_norm(grads, clip_norm=1.0, grad_scale=1.0, saturate=0.0, zero_infs=False, zero_nans=False):
    # Split the grads by dtype; the fused op takes separate float32 / float16 / bfloat16 lists.
    grad_float = list()
    grad_ehalf = list()
    grad_bhalf = list()
    for grad in grads:
        if grad.dtype is tf.float32:
            grad_float.append(grad)
        elif grad.dtype is tf.float16:
            grad_ehalf.append(grad)
        elif grad.dtype is tf.bfloat16:
            grad_bhalf.append(grad)
        else:
            raise ValueError("unsupported grad dtype")

    with tf.device("/gpu:0"):
        global_norm, norm_scale, _ = clip_global_norm_op(
            scalar_constant(grad_scale, dtype=tf.float32),
            scalar_constant(clip_norm,  dtype=tf.float32),
            grad_float, grad_ehalf, grad_bhalf,
            saturate=saturate, zero_infs=zero_infs, zero_nans=zero_nans)

    return global_norm, norm_scale

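# Illustrative usage sketch (not part of the library): clip mixed-precision
# gradients once per step and feed the resulting norm_scale into one of the
# fused optimizers defined later in this file (their constructors accept a
# norm_scale argument).  `loss`, `params` and the 4.0 threshold below are
# hypothetical placeholders.
#
#   grads = tf.gradients(loss, params)
#   global_norm, norm_scale = clip_by_global_norm(grads, clip_norm=4.0)
#   # norm_scale is then passed as the optimizer's norm_scale argument.
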
def softmax(self, x, scale=1.0, dtype=None):
    nn_lut = get_constant(self.nn_lut, name="nn")
    if dtype is None:
        dtype = self.softmax_dtype
    return blocksparse_softmax(
        x, scalar_constant(scale, dtype=tf.float32), nn_lut,
        blocks=self.blocks, blk_size=self.blk_size, ctx_blks=self.ctx_blks_q,
        lut_max=self.nn_max, T=dtype)

def embedding_lookup(emb, idx, sort_grad=True, bench=0, use_tf=False):
    dev = emb.op.device.lower()
    if use_tf or not dev or "cpu" in dev:
        # fall back to the stock TF embedding lookup on CPU (or when explicitly requested)
        y = tf.nn.embedding_lookup(convert_gradient_to_tensor(emb), idx)
    else:
        y = embedding_lookup_op(
            emb, idx, scalar_constant(emb.shape[0].value, dtype=tf.int32),
            sorted=sort_grad, bench=bench)
    return y

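# Illustrative usage sketch (assumed names): look up token embeddings with the
# fused GPU op.  `tokens` and the vocab/width sizes below are hypothetical.
#
#   with tf.device("/gpu:0"):
#       emb = tf.get_variable("emb", shape=[50257, 1024], dtype=tf.float32)
#       h   = embedding_lookup(emb, tokens, sort_grad=True)
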
def masked_softmax(self, x, scale=1.0, autoregress_at_key=None, dtype=None):
    if self.softmax_mask is None:
        if autoregress_at_key is not None:
            raise ValueError("autoregress_at_key only applies to ops with mask_callback defined.")
        return self.softmax(x, scale)

    nn_lut  = get_constant(self.nn_lut,       name="nn")
    sm_mask = get_constant(self.softmax_mask, name="sm")

    if autoregress_at_key is not None:
        lut = get_constant(self.nt_lut, name="nt")
        key = scalar_constant(autoregress_at_key, dtype=tf.int32)
        with tf.control_dependencies([x.op]):
            sm_mask = bst_partial_autoregressive_mask(
                sm_mask, lut, key,
                blocks=self.blocks, blk_size=self.blk_size, ctx_blks_k=self.ctx_blks_k)

    if dtype is None:
        dtype = self.softmax_dtype

    return blocksparse_masked_softmax(
        x, scalar_constant(scale, dtype=tf.float32), nn_lut, sm_mask,
        blocks=self.blocks, blk_size=self.blk_size, ctx_blks=self.ctx_blks_q,
        lut_max=self.nn_max, T=dtype)

def filter_tensor(x, scale=1.0, saturate=0.0, zero_infs=False, zero_nans=False):
    return filter_tensor_op(
        x, scalar_constant(scale, dtype=tf.float32),
        saturate=float(saturate), zero_infs=zero_infs, zero_nans=zero_nans)

def blocksparse_l2_decay(param, gate=None, rate=0.05, epsilon=1e-12):
    _check_param_shape(param, gate)
    gate = [gate] if gate is not None else []
    return l2_decay_op(param, scalar_constant(rate, dtype=tf.float32), gate, epsilon=epsilon)

def block_reduced_full_dw(param_grad, scale=1.0, norm="max", group_size=8):

    # max(abs()) or l2_norm()
    norm = 0 if norm.lower() == "max" else 1

    # host side scalar, if zero will cause compute for this op to be skipped.
    scale = scalar_constant(scale, dtype=tf.float32)

    assert group_size <= 8

    # backward walk param grad to find BlocksparseMatmulDW ops
    # this should only hit BlocksparseMatmulDWs, BlocksparseMatmulDGs, AddNs or FloatCasts
    ops = get_parents(param_grad, "BlocksparseMatmulDW")

    if len(ops) < 1:
        raise ValueError("BlocksparseMatmulDW op not found")

    # this sorting is dependent on the op names being correctly ordered.
    ops.sort(key=lambda op: op.name.split('/')[-1], reverse=True)

    # use the parent scope for the new ops
    scope = ops[-1].name.split('/')
    scope = '/'.join(scope[0:-1])

    # we're going to be using absolute names, so clear name_scope
    with tf.name_scope(None):

        dw_full = None
        offset  = 0
        while offset < len(ops):

            xs = [op.inputs[0] for op in ops[offset:offset + group_size]]
            gs = [op.inputs[1] for op in ops[offset:offset + group_size]]

            # Get the corresponding activation grad op for the last param grad op in the group
            bprop = None
            for consumer in gs[-1].consumers():
                if consumer.type == "BlocksparseMatmulDX":
                    bprop = consumer
                    break
            assert bprop is not None

            # get attributes of first op in group
            up    = ops[offset]
            bsize = up.get_attr("bsize")
            axis  = up.get_attr("axis")

            name = "%s/block_reduced_full_dw_%03d" % (scope, offset)

            dw_full = [] if dw_full is None else [dw_full]

            dw_full, _, _ = blocksparse_reduced_dw(
                xs, gs, scale, dw_full,
                bsize=bsize, norm=norm, axis=axis, name=name)

            # force the dw op before any more time steps are processed
            bprop._add_control_input(dw_full.op)

            offset += group_size

    return dw_full

def blocksparse_prune(param, gate, step, sparsity=None, threshold=None, norm="max", frequency=1):
    _check_param_shape(param, gate)

    # exactly one of sparsity / threshold must be set
    assert (sparsity is None) ^ (threshold is None)

    if sparsity is not None:
        # apply pruning to the moving average
        norms  = blocksparse_norm(param, norm=norm)
        k      = scalar_constant(param.shape[0].value, dtype=tf.int32)
        _, idx = tf.nn.top_k(norms, k=k, sorted=True)
        return blocksparse_prune_op(
            gate, idx, scalar_constant(sparsity, dtype=tf.float32), step,
            frequency=frequency)

    elif threshold is not None:
        norm = 1 if norm.lower() == "l2" else 0
        return blocksparse_threshold_prune_op(
            gate, param, scalar_constant(threshold, dtype=tf.float32), step,
            frequency=frequency, norm_type=norm)

def masked_softmax(x, mask=None, scale=1.0, bench=0):
    if mask is not None:
        x_shape = x.shape.as_list()
        m_shape = mask.shape.as_list()
        assert len(x_shape) == len(m_shape)
        # each mask dim must be 1 (broadcast) or match the corresponding input dim
        for i in range(len(m_shape)):
            assert m_shape[i] in (1, x_shape[i])
        mask = [mask]
    else:
        mask = []
    return masked_softmax_op(x, scalar_constant(scale, dtype=tf.float32), mask, bench=bench)

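# Illustrative usage sketch: dense softmax over attention logits with a
# broadcastable mask.  Shapes and names are hypothetical; the only requirement
# enforced above is that each mask dim is 1 or matches the input dim.
#
#   logits = tf.random_normal([8, 16, 128, 128])   # hypothetical [batch, heads, query, key]
#   mask   = tf.ones([1, 1, 128, 128])             # broadcast over batch and heads
#   probs  = masked_softmax(logits, mask=mask, scale=0.125)
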
def concrete_gate(loga, tempurature=2.0 / 3.0, limit_a=-0.1, limit_b=1.1, epsilon=1e-6):
    gate, _ = concrete_gate_op(
        loga, get_entropy(), scalar_constant(tempurature, dtype=tf.float32),
        limit_a=limit_a, limit_b=limit_b, epsilon=epsilon)
    return gate

def __init__(self, learning_rate=5e-4, beta2=0.999, epsilon=1e-30, clip_thresh=1.0,
             norm_scale=None, grad_scale=1.0, saturate=0.0, zero_infs=False, zero_nans=False,
             name="Adafactor", zero_init_variables=False):

    super().__init__(False, name)
    self.epsilon    = epsilon
    self.saturate   = saturate
    self.zero_infs  = zero_infs
    self.zero_nans  = zero_nans
    self.name       = name
    self.norm_scale = [] if norm_scale is None else [norm_scale]

    beta2_init = 0.0 if zero_init_variables else beta2

    with tf.device("/cpu:0"), tf.variable_scope("adafactor_decay"):
        one = scalar_constant(1.0, dtype=tf.float32)

        self.decay1_power = tf.Variable(initial_value=beta2_init,              name="decay1_power", trainable=False)
        self.decay2_power = tf.Variable(initial_value=beta2_init * beta2_init, name="decay2_power", trainable=False)

        self.learn_rate  = scalar_constant(learning_rate, dtype=tf.float32)
        self.clip_thresh = scalar_constant(clip_thresh,   dtype=tf.float32)
        self.grad_scale  = scalar_constant(grad_scale,    dtype=tf.float32)
        self.decay_t     = scalar_constant(beta2,         dtype=tf.float32)
        self.decay       = self.decay_t * (one - self.decay1_power) / (one - self.decay2_power)

def masked_top_k_softmax(x, k, mask=None, scale=1.0):
    assert k <= x.shape[-1].value <= 1024
    if mask is not None:
        x_shape = x.shape.as_list()
        m_shape = mask.shape.as_list()
        assert len(x_shape) == len(m_shape)
        for i in range(len(m_shape)):
            assert m_shape[i] in (1, x_shape[i])
        mask = [mask]
    else:
        mask = []
    return masked_top_k_softmax_op(x, k, scalar_constant(scale, dtype=tf.float32), mask)

def __init__(self, learning_rate=3e-4, beta1=0.9, beta2=0.999, epsilon=1e-8,
             clip_sigmas=0.0, norm_scale=None, grad_scale=1.0, saturate=0.0,
             zero_infs=False, zero_nans=False, gated=False,
             param_qspec=None, mean_qspec=None, var_qspec=None,
             fp16=False, zero_init_variables=False, name="Adam"):

    super().__init__(False, name)
    self.beta1       = beta1
    self.beta2       = beta2
    self.epsilon     = epsilon
    self.saturate    = saturate
    self.zero_infs   = zero_infs
    self.zero_nans   = zero_nans
    self.gated       = gated
    self.param_qspec = param_qspec
    self.mean_qspec  = mean_qspec
    self.var_qspec   = var_qspec
    self.name        = name
    self.norm_scale  = [] if norm_scale is None else [norm_scale]
    self.fp16        = fp16

    beta1_init = 0.0 if zero_init_variables else beta1
    beta2_init = 0.0 if zero_init_variables else beta2

    with tf.device("/cpu:0"), tf.variable_scope("adam_beta"):
        one = scalar_constant(1.0, dtype=tf.float32)

        self.beta1_power = tf.Variable(initial_value=beta1_init, name="beta1_power", trainable=False)
        self.beta2_power = tf.Variable(initial_value=beta2_init, name="beta2_power", trainable=False)

        self.beta1_t    = scalar_constant(beta1,       dtype=tf.float32)
        self.beta2_t    = scalar_constant(beta2,       dtype=tf.float32)
        self.clip_sigma = scalar_constant(clip_sigmas, dtype=tf.float32)
        self.grad_scale = scalar_constant(grad_scale,  dtype=tf.float32)
        self.lr = scalar_constant(learning_rate, dtype=tf.float32) * tf.sqrt(one - self.beta2_power) / (one - self.beta1_power)

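# Illustrative usage sketch (the class name "Adam" and an apply_gradients-style
# method are assumptions based on this constructor, not confirmed here): wire
# the norm_scale produced by clip_by_global_norm into the fused update.
#
#   global_norm, norm_scale = clip_by_global_norm(grads, clip_norm=1.0)
#   opt      = Adam(learning_rate=3e-4, norm_scale=norm_scale)
#   train_op = opt.apply_gradients(zip(grads, params))
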
def dropout(x, keep_prob, mask=None, mask_shape=None):
    keep_prob = scalar_constant(keep_prob)

    if mask is None:
        if mask_shape is not None and len(mask_shape) > 0:
            size = 1
            for m_dim, x_dim in zip(mask_shape, x.shape.as_list()):
                # we don't currently support placeholder dims when broadcasting the dropout mask
                assert m_dim == 1 or m_dim == x_dim, "incompatible mask_shape: %s x.shape: %s" % (mask_shape, x.shape)
                size *= m_dim
        else:
            size = 0
        mask = gen_dropout_mask_op(x, get_entropy(), keep_prob, size=size)

    if mask_shape is None:
        mask_shape = []

    # returns (dropped-out x, mask) so the same mask can be reused on later calls
    return apply_dropout_mask_op(x, mask, keep_prob, mask_shape=mask_shape), mask

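# Illustrative usage sketch: generate a dropout mask once and reuse it across
# time steps by passing it back in via `mask`.  The `x_0`/`x_1` tensors and the
# [1, 1024] mask_shape are hypothetical; mask_shape broadcasts the mask over
# the leading dim.
#
#   y0, mask = dropout(x_0, keep_prob=0.9, mask_shape=[1, 1024])
#   y1, _    = dropout(x_1, keep_prob=0.9, mask=mask, mask_shape=[1, 1024])
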
def softmax(x, scale=1.0, bench=0):
    return masked_softmax_op(x, scalar_constant(scale, dtype=tf.float32), [], bench=bench)