Example 1
def sg_optim(loss, **kwargs):
    r"""Applies gradients to variables.
    Args:
        loss: A 0-D `Tensor` containing the value to minimize, or a list of
          0-D `Tensor`s (one per tower) for the multi-GPU case.
        kwargs:
          optim: A name for the optimizer: 'MaxProp' (default), 'AdaMax', 'Adam',
            'RMSProp' or 'sgd'.
          lr: A Python Scalar (optional). Learning rate. Default is .001.
          beta1: A Python Scalar (optional). Default is .9.
          beta2: A Python Scalar (optional). Default is .99.
          momentum: A Python Scalar for the RMSProp optimizer (optional). Default is 0.
          clip_grad_norm: A Python Scalar (optional). Global norm used to clip
            gradients. If not given, gradients are not clipped.
          category: A string or a list of strings (optional). Specifies which
            variables are trained: only trainable variables whose names start with
            `category` are updated. Default is '', which means all trainable
            variables are updated.
    """
    opt = tf.sg_opt(kwargs)

    # default training options
    opt += tf.sg_opt(optim='MaxProp', lr=0.001, beta1=0.9, beta2=0.99, momentum=0., category='')

    # select optimizer
    if opt.optim == 'MaxProp':
        optim = tf.sg_optimize.MaxPropOptimizer(learning_rate=opt.lr, beta2=opt.beta2)
    elif opt.optim == 'AdaMax':
        optim = tf.sg_optimize.AdaMaxOptimizer(learning_rate=opt.lr, beta1=opt.beta1, beta2=opt.beta2)
    elif opt.optim == 'Adam':
        optim = tf.train.AdamOptimizer(learning_rate=opt.lr, beta1=opt.beta1, beta2=opt.beta2)
    elif opt.optim == 'RMSProp':
        optim = tf.train.RMSPropOptimizer(learning_rate=opt.lr, decay=opt.beta1, momentum=opt.momentum)
    else:
        optim = tf.train.GradientDescentOptimizer(learning_rate=opt.lr)

    # get trainable variables
    if isinstance(opt.category, (tuple, list)):
        var_list = []
        for cat in opt.category:
            var_list.extend([t for t in tf.trainable_variables() if t.name.startswith(cat)])
    else:
        var_list = [t for t in tf.trainable_variables() if t.name.startswith(opt.category)]

    #
    # calc gradient
    #

    # multiple GPUs case
    if isinstance(loss, (tuple, list)):
        gradients = []
        # loop for each GPU tower
        for i, loss_ in enumerate(loss):
            # specify device
            with tf.device('/gpu:%d' % i):
                # give a new name scope to the ops only (variables stay shared)
                with tf.name_scope('gpu_%d' % i):
                    # add gradient calculation operation for each GPU tower
                    gradients.append(tf.gradients(loss_, var_list))

        # average gradients across towers
        gradient = []
        for grad in zip(*gradients):
            gradient.append(tf.add_n(grad) / len(loss))
    # single GPU case
    else:
        gradient = tf.gradients(loss, var_list)

    # clip gradients by global norm only when a clip_grad_norm kwarg was given
    if opt.clip_grad_norm:
        gradient, _ = tf.clip_by_global_norm(gradient, opt.clip_grad_norm)

    # gradient update op
    with tf.device('/gpu:0'):
        grad_var = [(g, v) for g, v in zip(gradient, var_list)]
        grad_op = optim.apply_gradients(grad_var, global_step=tf.sg_global_step())

    # add summary using last tower value
    for g, v in grad_var:
        # exclude batch normalization statistics
        if 'mean' not in v.name and 'variance' not in v.name \
                and 'beta' not in v.name and 'gamma' not in v.name:
            tf.sg_summary_gradient(v, g)

    # extra update ops within the category (e.g., batch normalization running statistics updates)
    if isinstance(opt.category, (tuple, list)):
        update_op = []
        for cat in opt.category:
            update_op.extend([t for t in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if t.name.startswith(cat)])
    else:
        update_op = [t for t in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if t.name.startswith(opt.category)]

    return tf.group(*([grad_op] + update_op))
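
# ---------------------------------------------------------------------------
# A minimal usage sketch of sg_optim (not part of the original example). It
# assumes the sugartensor convention `import sugartensor as tf`, so that the
# `tf.sg_*` calls above resolve; the 'fit' variable scope, the toy loss and
# the training loop below are made-up placeholders.
import sugartensor as tf

with tf.variable_scope('fit'):
    w = tf.get_variable('w', shape=[], initializer=tf.zeros_initializer())
loss = tf.square(w - 1.)  # 0-D loss tensor to minimize

# grouped op: gradient update + any UPDATE_OPS whose names start with 'fit'
train_op = sg_optim(loss, optim='Adam', lr=0.001, category='fit')

with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(100):
        sess.run(train_op)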
Example 2
def tower_loss2_old(xx, scope, reuse_vars=False):

    # make embedding matrix for source and target
    with tf.variable_scope('embs', reuse=reuse_vars):
        emb_x = tf.sg_emb(name='emb_x',
                          voca_size=Hp.vs,
                          dim=Hp.hd,
                          dev=self._dev)
        emb_y = tf.sg_emb(name='emb_y',
                          voca_size=Hp.vs,
                          dim=Hp.hd,
                          dev=self._dev)

    x_sents = tf.unstack(xx, axis=1)  #each element is (batch, sentlen)

    # first generate an unconditioned sentence
    n_input = Hp.hd

    subrec1 = subrec_zero_state(Hp.bs, Hp.hd)
    subrec2 = subrec_zero_state(Hp.bs, Hp.hd)

    rnn_cell = LSTMCell(in_dim=n_input, dim=Hp.hd)
    (rnn_state, rnn_h) = rnn_cell.zero_state(Hp.bs)

    crnn_cell = ConvLSTMCell(in_dim=n_input, dim=Hp.hd)
    (crnn_state, crnn_h) = crnn_cell.zero_state(n_input)

    for sent in range(len(x_sents) - 1):
        y = x_sents[sent + 1]
        x = x_sents[sent]  #   (batch, sentlen) = (16, 200)
        # shift target by one step for training source
        y_src = tf.concat([tf.zeros((Hp.bs, 1), tf.sg_intx), y[:, :-1]], 1)

        # embed table lookup
        enc = x.sg_lookup(emb=emb_x)  #(batch, sentlen, dim1)
        # loop dilated conv block
        for i in range(num_blocks):
            enc = (enc.sg_res_block(
                size=5, rate=1, name="enc1_%d" % (i),
                reuse_vars=reuse_vars).sg_res_block(
                    size=5,
                    rate=2,
                    name="enc2_%d" % (i),
                    reuse_vars=reuse_vars).sg_res_block(
                        size=5,
                        rate=4,
                        name="enc4_%d" % (i),
                        reuse_vars=reuse_vars).sg_res_block(
                            size=5,
                            rate=8,
                            name="enc8_%d" % (i),
                            reuse_vars=reuse_vars).sg_res_block(
                                size=5,
                                rate=16,
                                name="enc16_%d" % (i),
                                reuse_vars=reuse_vars))

        # quasi cnn layer  [batch * 3, t, dim2 ]
        conv = enc.sg_quasi_conv1d(is_enc=True,
                                   size=2,
                                   name="conv1",
                                   reuse_vars=reuse_vars)
        # attention layer
        # recurrent layer # 1 + final encoder hidden state
        concat = subrec1.sg_concat(target=conv, dim=0)
        pool = concat.sg_quasi_rnn(is_enc=True, att=True)
        subrec1 = pool[:Hp.bs, -1, :]  # last step of the sequence

        conv = pool.sg_quasi_conv1d(is_enc=True,
                                    size=2,
                                    name="conv2",
                                    reuse_vars=reuse_vars)
        concat = subrec2.sg_concat(target=conv, dim=0)
        pool = concat.sg_quasi_rnn(is_enc=True, att=True)
        subrec2 = pool[:Hp.bs, -1, :]  # last step of the sequence

        # conv LSTM
        (crnn_state, crnn_h) = crnn_cell(subrec2, (crnn_state, crnn_h), 5)

        # recurrent block
        (rnn_state, rnn_h) = rnn_cell(crnn_h, (rnn_state, rnn_h))

        # CNN decoder
        dec = crnn_h.sg_concat(target=y_src.sg_lookup(emb=emb_y), name="dec")

        for i in range(num_blocks):
            dec = (dec.sg_res_block(
                size=3,
                rate=1,
                causal=True,
                name="dec1_%d" % (i),
                reuse_vars=reuse_vars).sg_res_block(
                    size=3,
                    rate=2,
                    causal=True,
                    name="dec2_%d" % (i),
                    reuse_vars=reuse_vars).sg_res_block(
                        size=3,
                        rate=4,
                        causal=True,
                        name="dec4_%d" % (i),
                        reuse_vars=reuse_vars).sg_res_block(
                            size=3,
                            rate=8,
                            causal=True,
                            name="dec8_%d" % (i),
                            reuse_vars=reuse_vars).sg_res_block(
                                size=3,
                                rate=16,
                                causal=True,
                                name="dec16_%d" % (i),
                                reuse_vars=reuse_vars))

        # final fully convolutional layer for softmax
        dec = dec.sg_conv1d_gpus(size=1,
                                 dim=Hp.vs,
                                 name="out",
                                 summary=False,
                                 dev=self._dev,
                                 reuse=reuse_vars)

        ce_array = dec.sg_ce(target=y, mask=True, name="cross_ent_example")
        cross_entropy_mean = tf.reduce_mean(ce_array, name='cross_entropy')
        tf.add_to_collection('losses', cross_entropy_mean)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)
    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    return total_loss
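
# ---------------------------------------------------------------------------
# Illustrative, self-contained sketch (not from the original code) of the
# 'losses' collection / scope filtering that tower_loss2_old relies on; the
# tower names and constant losses below are made up.
import tensorflow as tf

with tf.name_scope('tower_0') as scope_0:
    tf.add_to_collection('losses', tf.constant(1.0, name='cross_entropy'))
with tf.name_scope('tower_1') as scope_1:
    tf.add_to_collection('losses', tf.constant(3.0, name='cross_entropy'))

# tf.get_collection('losses', scope) keeps only entries whose names match the
# given scope, so each tower sums only its own losses.
tower_0_total = tf.add_n(tf.get_collection('losses', scope_0), name='total_loss')

with tf.Session() as sess:
    print(sess.run(tower_0_total))  # 1.0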
Example 3
    size=1, act='tanh', bn=True, dim=20, dout=0.25).sg_conv1d(
        size=1, act='tanh', bn=True, dim=2,
        dout=0.25).sg_reshape(shape=(batch_size, 200)).sg_dense(
            in_dim=200, dim=50,
            act='relu', dout=0.20).sg_dense(in_dim=50,
                                            dim=10,
                                            act='relu',
                                            dout=0.20).sg_dense(in_dim=10,
                                                                dim=2,
                                                                dout=0.20))

# CTC loss
#loss = logit.sg_ctc(target=y, seq_len=seq_len)
reg_lambda = 0.0002
trainable = tf.trainable_variables()
lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in trainable]) * reg_lambda
loss = logit.sg_ce(target=y, one_hot=True) + lossL2

# train
config = tf.ConfigProto(allow_soft_placement=True,
                        inter_op_parallelism_threads=6,
                        intra_op_parallelism_threads=6)
sess = tf.Session(config=config)
tf.sg_init(sess)

learning_rate = tf.train.exponential_decay(0.00001,
                                           tf.sg_global_step(),
                                           100,
                                           0.95,
                                           staircase=False)
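
# A hedged sketch of one way the decayed learning rate above could drive a
# training step; the Adam optimizer and the explicit minimize() call are
# assumptions, not taken from the original snippet.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss, global_step=tf.sg_global_step())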
Example 4
    def rnn_body(time, subrec1, subrec2, rnn_state, rnn_h, crnn_state, crnn_h,
                 losses):
        x = x_sent.read(time)
        y = x_sent.read(time + 1)  #   (batch, sentlen) = (16, 200)

        # shift target by one step for training source
        y_src = tf.concat([tf.zeros((Hp.batch_size, 1), tf.int32), y[:, :-1]],
                          1)
        reuse_vars = time == tf.constant(0) or reu_vars

        # --------------------------   BYTENET ENCODER   --------------------------

        # embed table lookup
        enc = x.sg_lookup(emb=emb_x)  #(batch, sentlen, latentdim)
        # loop dilated conv block
        for i in range(num_blocks):
            enc = (enc.sg_res_block(
                size=5, rate=1, name="enc1_%d" % (i),
                reuse_vars=reuse_vars).sg_res_block(
                    size=5,
                    rate=2,
                    name="enc2_%d" % (i),
                    reuse_vars=reuse_vars).sg_res_block(
                        size=5,
                        rate=4,
                        name="enc4_%d" % (i),
                        reuse_vars=reuse_vars).sg_res_block(
                            size=5,
                            rate=8,
                            name="enc8_%d" % (i),
                            reuse_vars=reuse_vars).sg_res_block(
                                size=5,
                                rate=16,
                                name="enc16_%d" % (i),
                                reuse_vars=reuse_vars))


        # --------------------------   QCNN + QPOOL ENCODER with attention #1  --------------------------

        # quasi cnn layer ZFO  [batch * 3, t, dim2 ]
        conv = enc.sg_quasi_conv1d(is_enc=True,
                                   size=3,
                                   name="qconv_1",
                                   reuse_vars=reuse_vars)
        #attention layer
        # recurrent layer # 1 + final encoder hidden state
        subrec1 = tf.tile((subrec1.sg_expand_dims(axis=1)), [1, Hp.maxlen, 1])
        concat = conv.sg_concat(target=subrec1,
                                axis=0)  # (batch*4, sentlen, latentdim)
        pool = concat.sg_quasi_rnn(is_enc=True,
                                   att=True,
                                   name="qrnn_1",
                                   reuse_vars=reuse_vars)
        subrec1 = pool[:Hp.batch_size, -1, :]  # last character in sequence

        # --------------------------   QCNN + QPOOL ENCODER with attention #2  --------------------------

        # quasi cnn ZFO (batch*3, sentlen, latentdim)
        conv = pool.sg_quasi_conv1d(is_enc=True,
                                    size=2,
                                    name="qconv_2",
                                    reuse_vars=reuse_vars)
        # (batch, sentlen-duplicated, latentdim)
        subrec2 = tf.tile((subrec2.sg_expand_dims(axis=1)), [1, Hp.maxlen, 1])
        # (batch*4, sentlen, latentdim)
        concat = conv.sg_concat(target=subrec2, axis=0)
        pool = concat.sg_quasi_rnn(is_enc=True,
                                   att=True,
                                   name="qrnn_2",
                                   reuse_vars=reuse_vars)
        subrec2 = pool[:Hp.batch_size, -1, :]  # last character in sequence

        # --------------------------   ConvLSTM with RESIDUAL connection and MULTIPLICATIVE block   --------------------------

        #residual block
        causal = False  # for encoder
        crnn_input = (pool[:Hp.batch_size, :, :]
                      .sg_bypass_gpus(name='relu_0',
                                      act='relu',
                                      bn=(not causal),
                                      ln=causal)
                      .sg_conv1d_gpus(name="dimred_0",
                                      size=1,
                                      dev="/cpu:0",
                                      reuse=reuse_vars,
                                      dim=Hp.hd // 2,
                                      act='relu',
                                      bn=(not causal),
                                      ln=causal))

        # conv LSTM
        with tf.variable_scope("mem/clstm") as scp:
            (crnn_state, crnn_h) = crnn_cell(crnn_input, (crnn_state, crnn_h),
                                             size=5,
                                             reuse_vars=reuse_vars)
        # recover dimension and add the residual connection
        rnn_input0 = pool[:Hp.batch_size, :, :] + crnn_h.sg_conv1d_gpus(
            name="diminc_0", size=1, dev="/cpu:0", dim=Hp.hd, reuse=reuse_vars,
            act='relu', bn=(not causal), ln=causal)

        # --------------------------   QCNN + QPOOL ENCODER with attention #3  --------------------------

        # pooling for lstm input
        # quasi cnn ZFO (batch*3, sentlen, latentdim)
        conv = rnn_input0.sg_quasi_conv1d(is_enc=True,
                                          size=2,
                                          name="qconv_3",
                                          reuse_vars=reuse_vars)
        pool = conv.sg_quasi_rnn(is_enc=True,
                                 att=False,
                                 name="qrnn_3",
                                 reuse_vars=reuse_vars)
        rnn_input = pool[:Hp.batch_size, -1, :]  # last character in sequence

        # --------------------------   LSTM with RESIDUAL connection and MULTIPLICATIVE block --------------------------

        # recurrent block
        with tf.variable_scope("mem/lstm") as scp:
            (rnn_state, rnn_h) = rnn_cell(rnn_input, (rnn_state, rnn_h))

        rnn_h2 = tf.tile(((rnn_h + rnn_input).sg_expand_dims(axis=1)),
                         [1, Hp.maxlen, 1])

        # --------------------------   BYTENET DECODER   --------------------------

        # CNN decoder
        dec = y_src.sg_lookup(emb=emb_y).sg_concat(target=rnn_h2, name="dec")

        for i in range(num_blocks):
            dec = (dec.sg_res_block(
                size=3,
                rate=1,
                causal=True,
                name="dec1_%d" % (i),
                reuse_vars=reuse_vars).sg_res_block(
                    size=3,
                    rate=2,
                    causal=True,
                    name="dec2_%d" % (i),
                    reuse_vars=reuse_vars).sg_res_block(
                        size=3,
                        rate=4,
                        causal=True,
                        name="dec4_%d" % (i),
                        reuse_vars=reuse_vars).sg_res_block(
                            size=3,
                            rate=8,
                            causal=True,
                            name="dec8_%d" % (i),
                            reuse_vars=reuse_vars).sg_res_block(
                                size=3,
                                rate=16,
                                causal=True,
                                name="dec16_%d" % (i),
                                reuse_vars=reuse_vars))

        # final fully convolutional layer for softmax
        dec = dec.sg_conv1d_gpus(size=1,
                                 dim=Hp.vs,
                                 name="out",
                                 summary=False,
                                 dev=self._dev,
                                 reuse=reuse_vars)

        ce_array = dec.sg_ce(target=y, mask=True, name="cross_ent_example")
        cross_entropy_mean = tf.reduce_mean(ce_array, name='cross_entropy')

        losses = tf.add_n([losses, cross_entropy_mean], name='total_loss')

        return (time + 1, subrec1, subrec2, rnn_state, rnn_h, crnn_state,
                crnn_h, losses)
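
# ---------------------------------------------------------------------------
# Illustrative, self-contained sketch (not from the original code) of the
# tf.while_loop pattern a body like rnn_body is written for: the body maps
# (time, ..., losses) to (time + 1, ..., losses) and the loop accumulates the
# per-step loss. The toy shapes and the squared-error "loss" are placeholders.
import tensorflow as tf

steps = 5
x = tf.random_normal([steps, 16, 8])  # (time, batch, dim)

def body(time, losses):
    step_loss = tf.reduce_mean(tf.square(x[time]))  # stand-in per-step loss
    return time + 1, losses + step_loss

def cond(time, losses):
    return time < steps

_, total_loss = tf.while_loop(cond, body,
                              loop_vars=[tf.constant(0), tf.constant(0.0)])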