Example #1
def group_allreduce(grads, params, search_strings=None, cast_all=None):
    if mpi_size() == 1:
        return grads
    return nccl.group_allreduce(grads,
                                params,
                                search_strings=search_strings,
                                cast_all=cast_all,
                                num_comms=num_comms(),
                                prereduce=prereduce_size())
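A minimal usage sketch of this wrapper (loss and hps are assumed to come from the surrounding training code, as in the examples below); the search strings are listed in reverse layer order so the reduction groups match the order in which gradients become available during backprop:

params = tf.trainable_variables()
grads = tf.gradients(loss, params)
if mpi_size() > 1:
    # gradients are grouped by variable-name substring and reduced in place across ranks
    group_allreduce(grads, params,
                    search_strings=["layer_%d" % l
                                    for l in range(hps.n_layer - 1, -1, -1)] + ["embed"],
                    cast_all=tf.float16)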
Example #2
def model(xs, ys):

    with tf.variable_scope("model"):

        with tf.device("/cpu:0"):
            global_step = tf.Variable(1.0, trainable=False)
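            # linear warmup: learning_rate = hps.lr * min(global_step / hps.warmup_iters, 1.0)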
            learning_rate = tf.minimum(
                global_step * tf.constant(1.0 / hps.warmup_iters),
                1.0) * tf.constant(hps.lr)

        with tf.device("/gpu:0"):

            # embed discrete inputs into continuous space and add learned position embeddings
            with tf.variable_scope('embed'):
                x_embed = fp16(
                    tf.get_variable(
                        "x", [hps.n_vocab, hps.n_state],
                        initializer=tf.random_normal_initializer(stddev=0.02)))
                pos_embed = fp16(
                    tf.get_variable(
                        'pos', [1, hps.n_timesteps, hps.n_state],
                        initializer=tf.random_normal_initializer(stddev=0.01)))
                h = embedding_lookup(x_embed, xs) + pos_embed

            for l in range(hps.n_layer):
                h = transformer_block(h, 'layer_%d' % l, hps.n_head,
                                      hps.n_timesteps)

            # project transformer features through the tied input embedding to produce logits
            with tf.variable_scope('logits'):
                h = tf.reshape(h, [-1, hps.n_state])
                logits = tf.matmul(h, x_embed, transpose_b=True)

            labels = tf.reshape(ys, [-1])
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=fp32(logits), labels=tf.cast(labels, tf.int32))
            loss = tf.reduce_mean(loss)

            params = tf.trainable_variables()
            grads = tf.gradients(loss * hps.cost_scale, params)

            mpi_scale = 1.0 / mpi_size

            if mpi_size > 1:
                loss = allreduce(loss) * mpi_scale

                group_allreduce(grads,
                                params,
                                search_strings=[
                                    "layer_%d" % l
                                    for l in range(hps.n_layer - 1, -1, -1)
                                ] + ["embed"],
                                cast_all=tf.float16)

            # for tuning fp16 cost scaling
            if hps.log_stats and mpi_rank == 0:
                for i, (grad, param) in enumerate(zip(grads, params)):
                    name = param.op.name + "_" + "_".join(
                        str(x) for x in param.shape.as_list())
                    grads[i] = log_stats(grad,
                                         tf.cast(global_step, tf.int32),
                                         logfile="scale_stats.txt",
                                         name=name)

            # use adafactor for most params and adam for embeddings
            fact_grads = list()
            adam_grads = list()
            for grad, param in zip(grads, params):
                if "embed" in param.op.name:
                    # for the input embedding, lazy updates would only touch params + running stats
                    # for rows selected by the input, giving more stable learning for rarely used entries
                    # if "x" in param.op.name:
                    #     grad.lazy = True
                    adam_grads.append((grad, param))
                else:
                    fact_grads.append((grad, param))

            fact = AdafactorOptimizer(learning_rate=learning_rate,
                                      grad_scale=mpi_scale / hps.cost_scale,
                                      sat_infs=True)
            adam = AdamOptimizer(learning_rate=learning_rate,
                                 grad_scale=mpi_scale / hps.cost_scale,
                                 sat_infs=True)
            train_op = tf.group(fact.apply_gradients(fact_grads),
                                adam.apply_gradients(adam_grads))

        # update global step after we're done using it for this update
        with tf.control_dependencies([train_op]), tf.device("/cpu:0"):
            update_op = tf.assign_add(global_step, 1.0)

        return loss, tf.group(train_op, update_op)
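The example above uses static fp16 loss scaling: the loss is multiplied by hps.cost_scale before tf.gradients so small gradients stay representable in fp16, and the optimizers undo it (together with the MPI averaging) through grad_scale = mpi_scale / hps.cost_scale. A small NumPy sketch of that arithmetic, with illustrative names that are not part of the example:

import numpy as np

def recover_mean_grad(summed_scaled_grad, cost_scale, mpi_size):
    # gradients were computed from loss * cost_scale and summed over mpi_size ranks,
    # so one multiply by (1 / mpi_size) / cost_scale recovers the mean true gradient
    return summed_scaled_grad.astype(np.float32) * ((1.0 / mpi_size) / cost_scale)

# toy check: a true gradient of 2e-5 survives the fp16 round trip once scaled by 2**12
scaled_sum = np.float16(2e-5 * 2**12) * np.float32(8)   # 8 ranks, summed
print(recover_mean_grad(scaled_sum, cost_scale=2**12, mpi_size=8))  # ~2e-5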
Example #3
def model(X, Y, hps):

    # tf Variable of random ints of size (GPU_SMs * 3 * 1024); 80 SMs assumed here
    # tf doesn't support int32 variables, so hack around it with a float32 view
    entropy_init = np.random.randint(-(1<<31), (1<<31), size=80*3*1024, dtype=np.int32).view(np.float32)

    if hps.tag != "none":
        qspec_e4f11 = QuantizeSpec(
            ebits      = 4,
            fbits      = 11,
            stochastic = 2,
            denorm     = True,
            frequency  = 512,
            bias_pad   = 1,
            logfile="qspec_e4f11.%s.b.txt" % hps.tag,
        )
        qspec_e5f10 = QuantizeSpec(
            ebits      = 5,
            fbits      = 10,
            stochastic = 2,
            denorm     = True,
            frequency  = 512,
            bias_pad   = 4,
            logfile="qspec_e5f10.%s.b.txt" % hps.tag,
        )
    else:
        qspec_e4f11 = None
        qspec_e5f10 = None
    xs = tf.split(X, mpi_size, 0)
    ys = tf.split(Y, mpi_size, 0)

    with tf.device("/gpu:0"), tf.variable_scope("model"):

        entropy = tf.get_variable("entropy", initializer=entropy_init, trainable=False)
        set_entropy(entropy)

        h = embed_input(xs[mpi_rank], hps)
        for l in range(hps.n_layer):
            h = transformer_block(h, 'layer_%d' % l, hps.n_head)
        logits = output(h, hps)

        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=ys[mpi_rank])
        loss = tf.reduce_mean(loss)

        params = tf.trainable_variables()
        grads  = tf.gradients(loss, params)

        # for p in params:
        #     print(p.op.name + "_" + "_".join(str(x) for x in p.shape.as_list()))

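        # count of correct predictions in this rank's shard; summed across ranks below to form the test metric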
        test = tf.reduce_sum(tf.cast(tf.equal(tf.cast(tf.argmax(logits, 1), tf.int32), ys[mpi_rank]), tf.float32))

        grad_scale = 1.0 / mpi_size

        # all reduce grads
        if mpi_size > 1:
            group_allreduce(grads, params, search_strings=["classifier"] + ["layer_%d" % l for l in range(hps.n_layer-1, -1, -1)])

            loss = allreduce(loss) * grad_scale
            test = allreduce(test)

        train = Adam(grads, params, grad_scale=grad_scale, param_qspec=qspec_e4f11, mean_qspec=qspec_e5f10, var_qspec=qspec_e5f10)

    return loss, train, test
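The entropy_init hack near the top of this example stores random int32 bits in a float32 variable because of the dtype limitation noted in the comment; the bit pattern survives the reinterpretation, as this standalone NumPy round trip shows:

import numpy as np

ints = np.random.randint(-(1 << 31), 1 << 31, size=8, dtype=np.int32)
as_float = ints.view(np.float32)     # reinterpret the same bytes as float32
recovered = as_float.view(np.int32)  # viewing back leaves the bits untouched
assert np.array_equal(ints, recovered)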
Example #4
def model(xs, ys, cost_scale, grad_scale):

    with tf.variable_scope("model"):

        with tf.device("/cpu:0"):
            global_step = tf.Variable(1.0, trainable=False)
            learning_rate = tf.minimum(
                global_step * tf.constant(1.0 / hps.warmup_iters),
                tf.constant(1.0)) * tf.constant(hps.lr)

        with tf.device("/gpu:0"):

            # Scope/var_name substrings used to group gradients for allreduce.
            # You'll want groupings that TensorFlow schedules uniquely, otherwise the allreduce could hang.
            # The groups should be listed in the order in which the all-reduces are called.
            # Any gradients not matching a substring are appended to the last group.
            grad_groups = []

            # embed discrete inputs into continuous space and add learned position embeddings
            with tf.variable_scope('embed'):
                x_embed = fp16(
                    tf.get_variable(
                        "x", [hps.n_vocab, hps.n_state],
                        initializer=tf.random_normal_initializer(stddev=0.02)))
                pos_embed = fp16(
                    tf.get_variable(
                        'pos', [1, hps.n_timesteps, hps.n_state],
                        initializer=tf.random_normal_initializer(stddev=0.01)))
                h = embedding_lookup(x_embed, xs) + pos_embed
                grad_groups.insert(0, 'embed')

            for l in range(hps.n_layer):
                layer_name = 'layer_%d' % l
                h = transformer_block(h, layer_name, hps.n_head,
                                      hps.n_timesteps)
                grad_groups.insert(0, layer_name)

            # project transformer features through the tied input embedding to produce logits
            with tf.variable_scope('logits'):
                h = tf.reshape(h, [-1, hps.n_state])
                logits = tf.matmul(h, x_embed, transpose_b=True)

            # labels = tf.reshape(ys, [-1])
            # loss   = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=fp32(logits), labels=tf.cast(labels, tf.int32))
            loss = softmax_cross_entropy(logits=logits, labels=ys)
            loss = tf.reduce_mean(loss)

            params = tf.trainable_variables()
            # use scale_tensor so we can keep cost_scale a host-side placeholder
            grads = tf.gradients(scale_tensor(loss, cost_scale), params)

            if mpi_size > 1:
                loss = allreduce(loss) * tf.constant(1.0 / mpi_size)

                group_allreduce(grads, params, search_strings=grad_groups)

            global_norm, norm_scale = ClipGlobalNorm(grads,
                                                     grad_scale=grad_scale,
                                                     clip_norm=hps.clip_norm)

            # for tuning fp16 cost scaling
            if hps.log_stats and mpi_rank == 0:
                for i, (grad, param) in enumerate(zip(grads, params)):
                    name = param.op.name + "_" + "_".join(
                        str(x) for x in param.shape.as_list())
                    grads[i] = log_stats(grad,
                                         tf.cast(global_step, tf.int32),
                                         logfile="scale_stats.txt",
                                         name=name)

            # use adafactor for most params and adam for embeddings
            fact_grads = list()
            adam_grads = list()
            for grad, param in zip(grads, params):
                if "embed" in param.op.name:
                    # for the input embedding, lazy updates would only touch params + running stats
                    # for rows selected by the input, giving more stable learning for rarely used entries
                    # Note that x_embed is also used as the output logits projection, so there's little value in lazy updates here.
                    # if "x" in param.op.name:
                    #     grad.lazy = True
                    adam_grads.append((grad, param))
                else:
                    fact_grads.append((grad, param))

            fact = AdafactorOptimizer(learning_rate=learning_rate,
                                      norm_scale=norm_scale,
                                      grad_scale=grad_scale)
            adam = AdamOptimizer(learning_rate=learning_rate,
                                 norm_scale=norm_scale,
                                 grad_scale=grad_scale)
            train_op = tf.group(fact.apply_gradients(fact_grads),
                                adam.apply_gradients(adam_grads))

        # update global step after we're done using it for this update
        with tf.control_dependencies([train_op]), tf.device("/cpu:0"):
            update_op = tf.assign_add(global_step, 1.0)

        return loss, tf.group(train_op, update_op), global_norm, norm_scale
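Example #4 layers global-norm gradient clipping on top of the loss scaling. The exact ClipGlobalNorm kernel is library-specific, but the standard arithmetic it corresponds to looks roughly like the NumPy sketch below (grad_scale undoes the cost scaling before the norm is measured; names are illustrative):

import numpy as np

def clip_global_norm(grads, grad_scale, clip_norm, epsilon=1e-6):
    # norm of the true (unscaled) gradients across all parameters
    global_norm = np.sqrt(sum(np.sum((g.astype(np.float32) * grad_scale) ** 2)
                              for g in grads))
    # a single factor <= 1.0 that shrinks every gradient when the norm is too large
    norm_scale = min(1.0, clip_norm / (global_norm + epsilon))
    return global_norm, norm_scale

The optimizers then fold norm_scale into their updates, which is why it is passed to both AdafactorOptimizer and AdamOptimizer above.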