import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.training import slot_creator

# adam_op, adam_gated_op, ema_op, ema_gated_op, quantize, QuantizeSpec, float_cast
# and is_param_casted are assumed to come from the blocksparse kernel/quantize modules.


def BlocksparseAdam(grads, params, lr=0.001, decay_mean=0.9, decay_var=0.999, epsilon=1e-8,
                    clip_sigma=0.0, global_step=None, gated=False, norm_scale=None, grad_scale=1.0,
                    saturate=0.0, zero_infs=False, zero_nans=False,
                    param_qspec=None, mean_qspec=None, var_qspec=None):

    with tf.device("/cpu:0"), tf.variable_scope("adam_lr"):
        # step counter: either a private float counter or the supplied global_step
        if global_step is None:
            t = tf.Variable(initial_value=0.0, name="t", trainable=False)
            t = t.assign_add(1.0)
        else:
            t = tf.cast(global_step.assign_add(1), tf.float32)

        one = tf.constant(1.0)
        # fold the Adam bias-correction terms into the learning rate
        lr = lr * tf.sqrt(one - tf.pow(decay_var, t)) / (one - tf.pow(decay_mean, t))

        if type(grad_scale) is float:
            grad_scale = tf.constant(grad_scale)
        if type(clip_sigma) is float:
            clip_sigma = tf.constant(clip_sigma)

    norm_scale = [] if norm_scale is None else [norm_scale]

    updates = list()
    for grad, param in zip(grads, params):

        mean = slot_creator.create_zeros_slot(param, "adam_mean")
        var  = slot_creator.create_zeros_slot(param, "adam_variance")
        gate = getattr(param, "gate", None)

        colon = param.name.find(":")
        name  = param.name if colon < 0 else param.name[0:colon]

        with tf.device("/gpu:0"), tf.variable_scope("adam/" + name):

            if gated and gate is not None:
                op = adam_gated_op(gate, grad, param, mean, var, lr, grad_scale, clip_sigma, norm_scale,
                                   decay_mean=decay_mean, decay_var=decay_var, epsilon=epsilon,
                                   saturate=saturate, zero_infs=zero_infs, zero_nans=zero_nans)
            else:
                op = adam_op(grad, param, mean, var, lr, grad_scale, clip_sigma, norm_scale,
                             decay_mean=decay_mean, decay_var=decay_var, epsilon=epsilon,
                             saturate=saturate, zero_infs=zero_infs, zero_nans=zero_nans)

            if param_qspec is not None:
                updates.append(param.assign(quantize(op.out_param, param_qspec, name="param")))
            else:
                updates.append(op.out_param)

            if mean_qspec is not None:
                updates.append(mean.assign(quantize(op.out_mean, mean_qspec, name="mean")))

            if var_qspec is not None:
                updates.append(var.assign(quantize(op.out_var, var_qspec, name="var")))

    return tf.group(*updates)
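# Usage sketch (illustrative, not part of the library): `loss` is a hypothetical
# scalar already built in the graph; this wires BlocksparseAdam into a train op.
def _example_blocksparse_adam(loss):
    params = tf.trainable_variables()
    grads  = tf.gradients(loss, params)
    step   = tf.Variable(0, dtype=tf.int64, trainable=False, name="global_step")
    # zero_infs/zero_nans scrub non-finite gradient values inside the fused kernel
    return BlocksparseAdam(grads, params, lr=1e-3, global_step=step,
                           zero_infs=True, zero_nans=True)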
def _apply_dense(self, grad, param):

    m = self.get_slot(param, "Mean")
    v = self.get_slot(param, "Var")

    # gate is passed to the kernel as an optional list input
    gate = getattr(param, "gate", None)
    gate = [gate] if self.gated and gate is not None else []

    op = adam_op(grad, param, m, v, self.lr, self.grad_scale, self.clip_sigma, self.norm_scale, gate,
                 decay_mean=self.beta1, decay_var=self.beta2, epsilon=self.epsilon,
                 saturate=self.saturate, zero_infs=self.zero_infs, zero_nans=self.zero_nans,
                 lazy_emb=hasattr(grad, "lazy"))

    updates = list()
    if self.param_qspec is not None:
        updates.append(param.assign(quantize(op.out_param, self.param_qspec, name="param_" + param.op.name)))
    else:
        updates.append(op.out_param)

    if self.mean_qspec is not None:
        updates.append(m.assign(quantize(op.out_mean, self.mean_qspec, name="mean_" + param.op.name)))

    if self.var_qspec is not None:
        updates.append(v.assign(quantize(op.out_var, self.var_qspec, name="var_" + param.op.name)))

    return tf.group(*updates) if len(updates) > 1 else updates[0]
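# Context sketch (the enclosing optimizer class is not shown here): _apply_dense is
# the per-variable hook that tf.train.Optimizer.apply_gradients calls for dense
# gradients, so callers stay on the standard optimizer API.
def _example_apply_gradients(opt, loss):
    params = tf.trainable_variables()
    grads  = tf.gradients(loss, params)
    return opt.apply_gradients(zip(grads, params))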
def apply(self, grad_params, gpu=0, qspec=None):

    # create the ema shadow variables outside of any graph control flow
    for grad, param in grad_params:
        with ops.init_scope():
            self.averages[param] = slot_creator.create_slot(param, param.initialized_value(), "ema")
        ops.add_to_collection(ops.GraphKeys.MOVING_AVERAGE_VARIABLES, param)

    ema_ops = []
    for grad, param in grad_params:

        colon = param.name.find(":")
        name  = param.name if colon < 0 else param.name[0:colon]

        with tf.device("/gpu:%d" % gpu), tf.variable_scope("ema/" + name):

            ema  = self.averages[param]
            gate = getattr(param, "gate", None)

            if self.gated and gate is not None:
                op = ema_gated_op(ema, param, gate, decay=self.decay)
            else:
                op = ema_op(ema, param, decay=self.decay)

            if qspec is not None:
                ema_ops.append(ema.assign(quantize(op, qspec, name="ema")))
            else:
                ema_ops.append(op)

    return tf.group(*ema_ops)
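# Usage sketch (illustrative): group the ema update with the optimizer update so
# the shadow weights track the params every step. `adam_update` is hypothetical.
def _example_ema_apply(ema, adam_update, grads, params):
    ema_update = ema.apply(list(zip(grads, params)), gpu=0)
    return tf.group(adam_update, ema_update)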
def quantize_post(x, name, tag):
    # mpi_rank is assumed to be set at module level; only rank 0 logs quantization stats
    if tag != "none":
        if mpi_rank == 0:
            qspec_f = QuantizeSpec(copy=qspec_e6f7, logfile="qspec_e6f07.f.%s.txt" % tag)
            qspec_b = QuantizeSpec(copy=qspec_e5f2, logfile="qspec_e5f02.b.%s.txt" % tag)
        else:
            qspec_f = qspec_e6f7
            qspec_b = qspec_e5f2
        return quantize(x, qspec_f, qspec_b, name=name)
    return x
def quantize_pre(x, name, tag):
    if tag != "none":
        if mpi_rank == 0:
            qspec_f = QuantizeSpec(copy=qspec_e4f3, logfile="qspec_e4f03.f.%s.txt" % tag)
            qspec_b = QuantizeSpec(copy=qspec_e6f7, logfile="qspec_e6f07.b.%s.txt" % tag)
        else:
            qspec_f = qspec_e4f3
            qspec_b = qspec_e6f7
        return quantize(x, qspec_f, qspec_b, name=name)
    return x
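# Sketch (hypothetical layer; `w` and the tag are stand-ins): bracket a matmul with
# quantize_pre on its input and quantize_post on its output, mirroring the narrow
# forward / wider backward spec pairing above.
def _example_quantized_matmul(x, w, tag="dense0"):
    x = quantize_pre(x, name="pre_" + tag, tag=tag)
    y = tf.matmul(x, w)
    return quantize_post(y, name="post_" + tag, tag=tag)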
def apply(self, params, qspec=None):

    with tf.device("/gpu:0"), tf.control_dependencies(None):
        for param in params:
            if self.fp16 == 2 or (self.fp16 and is_param_casted(param)):
                # only use fp16 for params that are explicitly cast to fp16 before use
                init  = float_cast(param.initialized_value(), dtype=tf.float16)
                dtype = tf.float16
            else:
                init  = param.initialized_value()
                dtype = tf.float32

            with tf.variable_scope(None, param.op.name + "/" + self.name):
                # use the Identity read op output as the key
                # this lets us look up ema vars by Cast op outputs
                self.averages[param.value()] = tf.get_variable("ema", dtype=dtype, initializer=init, trainable=False)

            ops.add_to_collection(ops.GraphKeys.MOVING_AVERAGE_VARIABLES, param)

    ema_ops = []
    for param in params:
        ema  = self.averages[param.value()]
        gate = getattr(param, "gate", None)
        gate = [gate] if self.gated and gate is not None else []
        op = ema_op(ema, param, gate, decay=self.decay)

        if qspec is not None:
            ema_ops.append(ema.assign(quantize(op, qspec, name="ema_" + param.op.name)))
        else:
            ema_ops.append(op)

    return tf.group(*ema_ops)
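# Usage sketch (illustrative): read the smoothed weights back at eval time through
# the `averages` dict, which is keyed by param.value() as noted above.
def _example_eval_with_ema(ema, params, sess):
    ema_vars = [ema.averages[p.value()] for p in params]
    return sess.run(ema_vars)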