Example #1
0
def BlocksparseAdam(grads, params,
        lr=0.001, decay_mean=0.9, decay_var=0.999, epsilon=1e-8, clip_sigma=0.0, global_step=None, gated=False,
        norm_scale=None, grad_scale=1.0, saturate=0.0, zero_infs=False, zero_nans=False,
        param_qspec=None, mean_qspec=None, var_qspec=None):
    """Build a fused blocksparse Adam update for each (grad, param) pair.

    Creates "adam_mean"/"adam_variance" slots per parameter and emits one
    fused `adam_op` (or `adam_gated_op` when ``gated`` and the param carries
    a ``gate`` attribute) per parameter on GPU 0.

    Args:
        grads, params: parallel sequences of gradients and variables.
        lr: base learning rate; bias correction is folded into it on CPU.
        decay_mean, decay_var: Adam beta1/beta2.
        epsilon: Adam denominator fuzz factor.
        clip_sigma: gradient clip threshold passed to the kernel.
        global_step: optional step variable; incremented here. When None a
            private float counter ``t`` is created instead.
        gated: enable per-parameter gating when a ``gate`` attr is present.
        norm_scale: optional tensor wrapped in a list for the kernel.
        grad_scale: scalar applied to gradients (float is wrapped in a const).
        saturate, zero_infs, zero_nans: non-finite handling flags.
        param_qspec, mean_qspec, var_qspec: optional quantization specs; when
            given, the corresponding kernel output is quantized and assigned
            back to its variable.

    Returns:
        A grouped op applying all updates.
    """
    with tf.device("/cpu:0"), tf.variable_scope("adam_lr"):

        # t counts optimizer steps; either a private counter or the caller's
        # global_step (incremented as a side effect).
        if global_step is None:
            t = tf.Variable(initial_value=0.0, name="t", trainable=False)
            t = t.assign_add(1.0)
        else:
            t = tf.cast(global_step.assign_add(1), tf.float32)
        one = tf.constant(1.0)

        # Standard Adam bias correction, folded into the learning rate:
        # lr_t = lr * sqrt(1 - beta2^t) / (1 - beta1^t)
        lr = lr * tf.sqrt(one - tf.pow(decay_var, t)) / (one - tf.pow(decay_mean, t))

        # Use isinstance rather than `type(x) is float` (idiomatic, and
        # tolerant of float subclasses); tensors pass through untouched.
        if isinstance(grad_scale, float):
            grad_scale = tf.constant(grad_scale)
        if isinstance(clip_sigma, float):
            clip_sigma = tf.constant(clip_sigma)

    # Kernel expects norm_scale as a (possibly empty) list.
    norm_scale = [] if norm_scale is None else [norm_scale]

    updates = list()
    for grad, param in zip(grads, params):

        mean = slot_creator.create_zeros_slot(param, "adam_mean")
        var  = slot_creator.create_zeros_slot(param, "adam_variance")
        gate = getattr(param, "gate", None)

        # Strip the ":0" device suffix for a readable scope name.
        colon = param.name.find(":")
        name  = param.name if colon < 0 else param.name[0:colon]

        with tf.device("/gpu:0"), tf.variable_scope("adam/" + name):
            if gated and gate is not None:
                op = adam_gated_op(gate, grad, param, mean, var, lr, grad_scale, clip_sigma, norm_scale,
                        decay_mean=decay_mean, decay_var=decay_var, epsilon=epsilon,
                        saturate=saturate, zero_infs=zero_infs, zero_nans=zero_nans)
            else:
                op = adam_op(grad, param, mean, var, lr, grad_scale, clip_sigma, norm_scale,
                        decay_mean=decay_mean, decay_var=decay_var, epsilon=epsilon,
                        saturate=saturate, zero_infs=zero_infs, zero_nans=zero_nans)

            # The fused op updates in place; an explicit assign is only
            # needed when the output must be re-quantized first.
            if param_qspec is not None:
                updates.append(param.assign(quantize(op.out_param, param_qspec, name="param")))
            else:
                updates.append(op.out_param)

            if mean_qspec is not None:
                updates.append(mean.assign(quantize(op.out_mean, mean_qspec, name="mean")))

            if var_qspec is not None:
                updates.append(var.assign(quantize(op.out_var, var_qspec, name="var")))

    return tf.group(*updates)
Example #2
0
    def _apply_dense(self, grad, param):
        """Apply one fused Adam step to a dense parameter.

        Reads the "Mean"/"Var" slots, runs the fused `adam_op` kernel, and
        returns either the in-place update tensor or a group of explicit
        assigns when any quantization spec is configured.
        """
        mean_slot = self.get_slot(param, "Mean")
        var_slot  = self.get_slot(param, "Var")

        # Gate is passed as a (possibly empty) list to the kernel.
        gate  = getattr(param, "gate", None)
        gates = [gate] if self.gated and gate is not None else []

        op = adam_op(grad, param, mean_slot, var_slot,
                     self.lr, self.grad_scale, self.clip_sigma,
                     self.norm_scale, gates,
                     decay_mean=self.beta1,
                     decay_var=self.beta2,
                     epsilon=self.epsilon,
                     saturate=self.saturate,
                     zero_infs=self.zero_infs,
                     zero_nans=self.zero_nans,
                     lazy_emb=hasattr(grad, "lazy"))

        # Explicit assigns are only needed when an output must be
        # re-quantized before being written back to its variable.
        if self.param_qspec is not None:
            updates = [param.assign(quantize(op.out_param,
                                             self.param_qspec,
                                             name="param_" + param.op.name))]
        else:
            updates = [op.out_param]

        if self.mean_qspec is not None:
            updates.append(mean_slot.assign(quantize(op.out_mean,
                                                     self.mean_qspec,
                                                     name="mean_" + param.op.name)))

        if self.var_qspec is not None:
            updates.append(var_slot.assign(quantize(op.out_var,
                                                    self.var_qspec,
                                                    name="var_" + param.op.name)))

        return updates[0] if len(updates) == 1 else tf.group(*updates)
Example #3
0
    def apply(self, grad_params, gpu=0, qspec=None):
        """Create and update an exponential moving average per parameter.

        First pass creates an "ema" slot for each parameter (under the init
        scope) and registers it in the MOVING_AVERAGE_VARIABLES collection;
        second pass emits the EMA update ops on the requested GPU, optionally
        quantizing the result. Returns a grouped update op.
        """
        for _, param in grad_params:
            with ops.init_scope():

                self.averages[param] = slot_creator.create_slot(
                    param, param.initialized_value(), "ema")

                ops.add_to_collection(
                    ops.GraphKeys.MOVING_AVERAGE_VARIABLES, param)

        ema_ops = []
        for _, param in grad_params:

            # Strip the ":0" device suffix for a readable scope name.
            colon = param.name.find(":")
            scope_name = param.name[0:colon] if colon >= 0 else param.name

            with tf.device("/gpu:%d" % gpu), tf.variable_scope("ema/" + scope_name):
                ema  = self.averages[param]
                gate = getattr(param, "gate", None)
                op = (ema_gated_op(ema, param, gate, decay=self.decay)
                      if self.gated and gate is not None
                      else ema_op(ema, param, decay=self.decay))

                if qspec is None:
                    ema_ops.append(op)
                else:
                    ema_ops.append(ema.assign(quantize(op, qspec, name="ema")))

        return tf.group(*ema_ops)
Example #4
0
def quantize_post(x, name, tag):
    """Quantize `x` with the post-op specs (e6f7 fwd / e5f2 bwd).

    A tag of "none" disables quantization. Only MPI rank 0 wraps the specs
    with logging copies; other ranks reuse the shared spec objects.
    """
    if tag == "none":
        return x
    if mpi_rank != 0:
        qspec_f = qspec_e6f7
        qspec_b = qspec_e5f2
    else:
        qspec_f = QuantizeSpec(copy=qspec_e6f7, logfile="qspec_e6f07.f.%s.txt" % tag)
        qspec_b = QuantizeSpec(copy=qspec_e5f2, logfile="qspec_e5f02.b.%s.txt" % tag)
    return quantize(x, qspec_f, qspec_b, name=name)
Example #5
0
def quantize_pre(x, name, tag):
    """Quantize `x` with the pre-op specs (e4f3 fwd / e6f7 bwd).

    A tag of "none" disables quantization. Only MPI rank 0 wraps the specs
    with logging copies; other ranks reuse the shared spec objects.
    """
    if tag == "none":
        return x
    if mpi_rank != 0:
        qspec_f = qspec_e4f3
        qspec_b = qspec_e6f7
    else:
        qspec_f = QuantizeSpec(copy=qspec_e4f3, logfile="qspec_e4f03.f.%s.txt" % tag)
        qspec_b = QuantizeSpec(copy=qspec_e6f7, logfile="qspec_e6f07.b.%s.txt" % tag)
    return quantize(x, qspec_f, qspec_b, name=name)
Example #6
0
    def apply(self, params, qspec=None):
        """Create and update EMA variables for `params`, keyed by value tensor.

        EMA variables are created on GPU 0 (fp16 when self.fp16 requests it
        and the parameter is explicitly cast before use, fp32 otherwise),
        registered under MOVING_AVERAGE_VARIABLES, then one `ema_op` update
        is emitted per parameter, optionally quantized. Returns a grouped op.
        """
        with tf.device("/gpu:0"), tf.control_dependencies(None):
            for param in params:
                # only use fp16 for params that are explicitly cast to fp16 before use
                use_fp16 = self.fp16 == 2 or (self.fp16 and is_param_casted(param))
                if use_fp16:
                    dtype = tf.float16
                    init = float_cast(param.initialized_value(),
                                      dtype=tf.float16)
                else:
                    dtype = tf.float32
                    init = param.initialized_value()

                with tf.variable_scope(None, param.op.name + "/" + self.name):
                    # use the Identity read op output as the key
                    # this lets us lookup ema vars by Cast op outputs
                    self.averages[param.value()] = tf.get_variable(
                        "ema", dtype=dtype, initializer=init, trainable=False)
                ops.add_to_collection(
                    ops.GraphKeys.MOVING_AVERAGE_VARIABLES, param)

        ema_ops = []
        for param in params:

            ema  = self.averages[param.value()]
            gate = getattr(param, "gate", None)
            # Gate is passed as a (possibly empty) list to the kernel.
            gates = [gate] if self.gated and gate is not None else []

            op = ema_op(ema, param, gates, decay=self.decay)

            if qspec is None:
                ema_ops.append(op)
            else:
                ema_ops.append(ema.assign(
                    quantize(op, qspec, name="ema_" + param.op.name)))

        return tf.group(*ema_ops)