Example #1
def embed_input(x, hps):
    """
    embed discrete inputs to continuous space and add learned position embeddings
    """
    x_embed   = tf.get_variable('x_embed',   [hps.n_bin, hps.n_state], initializer=tf.random_normal_initializer(stddev=0.02))
    pos_embed = tf.get_variable('pos_embed', [hps.n_x,   hps.n_state], initializer=tf.random_normal_initializer(stddev=0.01))
    h = tf.add(embedding_lookup(x_embed, x), pos_embed)
    return h
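
A minimal usage sketch for embed_input() above; the blocksparse import path, the HParams fields, and the batch size are illustrative assumptions, not from the original file.

# Usage sketch (assumed context: TF 1.x, blocksparse's embedding_lookup).
import tensorflow as tf
from collections import namedtuple
from blocksparse.embed import embedding_lookup  # assumed import path

HParams = namedtuple("HParams", ["n_bin", "n_x", "n_state"])
hps = HParams(n_bin=256, n_x=1024, n_state=512)        # illustrative sizes

x = tf.placeholder(tf.int32, [8, hps.n_x], name="x")   # [batch, n_x] integer bins
h = embed_input(x, hps)                                 # -> [batch, n_x, n_state]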
Example #2
    def testEmbeddingLookup(self):
        """Compare the GPU embedding_lookup kernel against the CPU reference.

        Covers fp32 and fp16 weights and sorted/unsorted gradient accumulation,
        checking both the forward output and the embedding-weight gradient."""

        config = tf.ConfigProto(intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1)

        with self.test_session(config=config) as sess:

            for shapeW, shapeI in shapes:

                C = shapeW[0]
                shapeY = shapeI + shapeW[1:]

                np.random.seed(int(time()))
                cpuI = np.random.randint(0, C, size=shapeI, dtype=np.int32)
                cpuW = np.random.uniform(-1.0, 1.0, shapeW).astype(np.float32)
                cpuE = np.random.uniform(-1.0, 1.0, shapeY).astype(np.float32)

                for dtype in (
                        tf.float32,
                        tf.float16,
                ):  #tf.float16, tf.float32
                    for sort in (True, False):

                        results = []
                        for device in ("gpu", "cpu"):

                            if bench and device == "cpu":
                                break

                            castW = device == "gpu" and dtype is not tf.float32
                            if castW:
                                if C <= 256:
                                    castI = tf.uint8
                                elif C <= 65536:
                                    castI = tf.uint16
                                else:
                                    castI = None
                            else:
                                castI = None

                            with tf.device("/%s:0" %
                                           device), tf.name_scope(device):

                                i = tf.placeholder(tf.int32,
                                                   cpuI.shape,
                                                   name="i")
                                w = tf.placeholder(tf.float32,
                                                   cpuW.shape,
                                                   name="w")
                                e = tf.placeholder(tf.float32,
                                                   cpuE.shape,
                                                   name="e")

                                feed_dict = {i: cpuI, w: cpuW, e: cpuE}

                                wf = ew.float_cast(w, dtype=dtype) if castW else w
                                i = tf.cast(i, dtype=castI) if castI is not None else i

                                y = embedding_lookup(wf,
                                                     i,
                                                     sort_grad=sort,
                                                     bench=bench)

                                if castW:
                                    y = ew.float_cast(y, dtype=tf.float32)

                                dw, = tf.gradients(y, [w], e)

                                results.append(sess.run([y, dw], feed_dict))

                        if not bench:

                            for op, dev, cpu in zip(["y", "dw"], results[0],
                                                    results[1]):

                                dif = np.abs(cpu - dev)
                                avgval = np.average(abs(cpu))
                                maxdif = dif.max()
                                max_err = maxdif if avgval == 0 else maxdif / avgval
                                l2_err = np.sqrt(
                                    np.square(dif).sum()) / np.sqrt(
                                        np.square(cpu).sum())

                                print("%s, shape:%22s, op:%3s, err:%17.12f, l2_err:%17.12f" %
                                      (dtype.name, str(cpu.shape), op, max_err, l2_err))
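
The test above relies on module-level shapes and bench fixtures (plus the ew and embedding_lookup imports) that sit outside the snippet. A plausible setup, purely illustrative rather than taken from the original test file:

# Hypothetical module-level fixtures assumed by testEmbeddingLookup above.
import numpy as np
import tensorflow as tf
from time import time

import blocksparse.ewops as ew                    # assumed import path
from blocksparse.embed import embedding_lookup    # assumed import path

# (weight_shape, index_shape) pairs: weight_shape[0] is the vocab size C,
# weight_shape[1:] the embedding dims, and index_shape the lookup indices.
shapes = [
    ([  256,  64], [4,  8]),   # small vocab: exercises the uint8 index cast
    ([65536, 128], [4, 16]),   # larger vocab: exercises the uint16 index cast
]
bench = 0  # 0: compare GPU against the CPU reference; >0: GPU timing only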
Example #3
def model(xs, ys):

    with tf.variable_scope("model"):

        with tf.device("/cpu:0"):
            global_step = tf.Variable(1.0, trainable=False)
            learning_rate = tf.minimum(
                global_step * tf.constant(1.0 / hps.warmup_iters),
                1.0) * tf.constant(hps.lr)

        with tf.device("/gpu:0"):

            # embed discrete inputs to continuous space and add learned position embeddings
            with tf.variable_scope('embed'):
                x_embed = fp16(
                    tf.get_variable(
                        "x", [hps.n_vocab, hps.n_state],
                        initializer=tf.random_normal_initializer(stddev=0.02)))
                pos_embed = fp16(
                    tf.get_variable(
                        'pos', [1, hps.n_timesteps, hps.n_state],
                        initializer=tf.random_normal_initializer(stddev=0.01)))
                h = embedding_lookup(x_embed, xs) + pos_embed

            for l in range(hps.n_layer):
                h = transformer_block(h, 'layer_%d' % l, hps.n_head,
                                      hps.n_timesteps)

            # flatten transformer features and apply the tied input embedding as the output projection
            with tf.variable_scope('logits'):
                h = tf.reshape(h, [-1, hps.n_state])
                logits = tf.matmul(h, x_embed, transpose_b=True)

            labels = tf.reshape(ys, [-1])
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=fp32(logits), labels=tf.cast(labels, tf.int32))
            loss = tf.reduce_mean(loss)

            params = tf.trainable_variables()
            grads = tf.gradients(loss * hps.cost_scale, params)

            mpi_scale = 1.0 / mpi_size

            if mpi_size > 1:
                loss = allreduce(loss) * mpi_scale

                group_allreduce(grads,
                                params,
                                search_strings=[
                                    "layer_%d" % l
                                    for l in range(hps.n_layer - 1, -1, -1)
                                ] + ["embed"],
                                cast_all=tf.float16)

            # for tuning fp16 cost scaling
            if hps.log_stats and mpi_rank == 0:
                for i, (grad, param) in enumerate(zip(grads, params)):
                    name = param.op.name + "_" + "_".join(
                        str(x) for x in param.shape.as_list())
                    grads[i] = log_stats(grad,
                                         tf.cast(global_step, tf.int32),
                                         logfile="scale_stats.txt",
                                         name=name)

            # use adafactor for most params and adam for embeddings
            fact_grads = list()
            adam_grads = list()
            for grad, param in zip(grads, params):
                if "embed" in param.op.name:
                    # for input embedding, only update param + running stats when embedding vector was selected by input
                    # more stable learning for rarely used embedding entries
                    # if "x" in param.op.name:
                    #     grad.lazy = True
                    adam_grads.append((grad, param))
                else:
                    fact_grads.append((grad, param))

            fact = AdafactorOptimizer(learning_rate=learning_rate,
                                      grad_scale=mpi_scale / hps.cost_scale,
                                      sat_infs=True)
            adam = AdamOptimizer(learning_rate=learning_rate,
                                 grad_scale=mpi_scale / hps.cost_scale,
                                 sat_infs=True)
            train_op = tf.group(fact.apply_gradients(fact_grads),
                                adam.apply_gradients(adam_grads))

        # update global step after we're done using it for this update
        with tf.control_dependencies([train_op]), tf.device("/cpu:0"):
            update_op = tf.assign_add(global_step, 1.0)

        return loss, tf.group(train_op, update_op)
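
A hypothetical driver for model(xs, ys) above; the batch size, placeholder names, and batch_iterator() are assumptions, while tf, hps, and mpi_rank come from the surrounding script.

# Illustrative training loop; not the original driver script.
batch_size = 32
xs = tf.placeholder(tf.int32, [batch_size, hps.n_timesteps], name="xs")
ys = tf.placeholder(tf.int32, [batch_size, hps.n_timesteps], name="ys")
loss, train_op = model(xs, ys)

config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.visible_device_list = str(mpi_rank)  # one GPU per MPI rank

with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    for step, (x_np, y_np) in enumerate(batch_iterator()):  # assumed data source
        cost, _ = sess.run([loss, train_op], feed_dict={xs: x_np, ys: y_np})
        if mpi_rank == 0 and step % 100 == 0:
            print("step %6d  loss %.4f" % (step, cost))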
Example #4
def model(xs, ys, cost_scale, grad_scale):

    with tf.variable_scope("model"):

        with tf.device("/cpu:0"):
            global_step = tf.Variable(1.0, trainable=False)
            learning_rate = tf.minimum(
                global_step * tf.constant(1.0 / hps.warmup_iters),
                tf.constant(1.0)) * tf.constant(hps.lr)

        with tf.device("/gpu:0"):

            # Contains scope/var_name substrings we use to group gradients for allreduce.
            # You'll want to find groupings that are scheduled uniquely by tensorflow, otherwise allreduce could hang.
            # The groups should be listed in the order in which the allreduces are called.
            # Any gradient not matching one of the substrings gets appended to the last group.
            grad_groups = []

            # embed discrete inputs to continuous space and add learned position embeddings
            with tf.variable_scope('embed'):
                x_embed = fp16(
                    tf.get_variable(
                        "x", [hps.n_vocab, hps.n_state],
                        initializer=tf.random_normal_initializer(stddev=0.02)))
                pos_embed = fp16(
                    tf.get_variable(
                        'pos', [1, hps.n_timesteps, hps.n_state],
                        initializer=tf.random_normal_initializer(stddev=0.01)))
                h = embedding_lookup(x_embed, xs) + pos_embed
                grad_groups.insert(0, 'embed')

            for l in range(hps.n_layer):
                layer_name = 'layer_%d' % l
                h = transformer_block(h, layer_name, hps.n_head,
                                      hps.n_timesteps)
                grad_groups.insert(0, layer_name)

            # flatten transformer features and apply the tied input embedding as the output projection
            with tf.variable_scope('logits'):
                h = tf.reshape(h, [-1, hps.n_state])
                logits = tf.matmul(h, x_embed, transpose_b=True)

            # labels = tf.reshape(ys, [-1])
            # loss   = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=fp32(logits), labels=tf.cast(labels, tf.int32))
            loss = softmax_cross_entropy(logits=logits, labels=ys)
            loss = tf.reduce_mean(loss)

            params = tf.trainable_variables()
            # use scale_tensor so we can keep the cost_scale a host side placeholder
            grads = tf.gradients(scale_tensor(loss, cost_scale), params)

            if mpi_size > 1:
                loss = allreduce(loss) * tf.constant(1.0 / mpi_size)

                group_allreduce(grads, params, search_strings=grad_groups)

            global_norm, norm_scale = ClipGlobalNorm(grads,
                                                     grad_scale=grad_scale,
                                                     clip_norm=hps.clip_norm)

            # for tuning fp16 cost scaling
            if hps.log_stats and mpi_rank == 0:
                for i, (grad, param) in enumerate(zip(grads, params)):
                    name = param.op.name + "_" + "_".join(
                        str(x) for x in param.shape.as_list())
                    grads[i] = log_stats(grad,
                                         tf.cast(global_step, tf.int32),
                                         logfile="scale_stats.txt",
                                         name=name)

            # use adafactor for most params and adam for embeddings
            fact_grads = list()
            adam_grads = list()
            for grad, param in zip(grads, params):
                if "embed" in param.op.name:
                    # for input embedding, only update param + running stats when embedding vector was selected by input
                    # more stable learning for rarely used embedding entries
                    # Note that we use the x_embed as the output logits projection, so there's little value to using lazy here.
                    # if "x" in param.op.name:
                    #     grad.lazy = True
                    adam_grads.append((grad, param))
                else:
                    fact_grads.append((grad, param))

            fact = AdafactorOptimizer(learning_rate=learning_rate,
                                      norm_scale=norm_scale,
                                      grad_scale=grad_scale)
            adam = AdamOptimizer(learning_rate=learning_rate,
                                 norm_scale=norm_scale,
                                 grad_scale=grad_scale)
            train_op = tf.group(fact.apply_gradients(fact_grads),
                                adam.apply_gradients(adam_grads))

        # update global step after we're done using it for this update
        with tf.control_dependencies([train_op]), tf.device("/cpu:0"):
            update_op = tf.assign_add(global_step, 1.0)

        return loss, tf.group(train_op, update_op), global_norm, norm_scale
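
Compared with Example #3, cost_scale and grad_scale here stay host-side placeholders and the graph also returns the global gradient norm, so the loss scale can be adjusted between steps without rebuilding the graph. A sketch of such a loop; the placeholders, batch_iterator(), and the halving/doubling policy are illustrative assumptions, not the original training script.

# Illustrative dynamic loss-scaling loop around model() from Example #4
# (assumes numpy as np, the hps / mpi_size globals, and a data source).
batch_size = 32
xs = tf.placeholder(tf.int32, [batch_size, hps.n_timesteps], name="xs")
ys = tf.placeholder(tf.int32, [batch_size, hps.n_timesteps], name="ys")
cost_scale = tf.placeholder(tf.float32, [], name="cost_scale")
grad_scale = tf.placeholder(tf.float32, [], name="grad_scale")

loss, train_op, global_norm, norm_scale = model(xs, ys, cost_scale, grad_scale)

scale = 2.0 ** 16  # initial fp16 loss scale
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step, (x_np, y_np) in enumerate(batch_iterator()):  # assumed data source
        feed = {xs: x_np, ys: y_np,
                cost_scale: scale,                     # scale the loss up ...
                grad_scale: 1.0 / (scale * mpi_size)}  # ... and the grads back down
        cost, norm, _ = sess.run([loss, global_norm, train_op], feed_dict=feed)
        if not np.isfinite(norm):
            # overflow: back off the scale (a real script would likely also
            # arrange to skip or undo this update)
            scale = max(scale / 2.0, 1.0)
        elif step > 0 and step % 1000 == 0:
            scale *= 2.0  # stable for a while: try a larger loss scale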