def sg_reverse_seq(tensor, opt):
    r"""Reverses variable length slices.

    Before applying the pure tensorflow function tf.reverse_sequence,
    this function calculates sequence lengths by counting non-zeros.

    For example,

    ```
    tensor = [[1, 2, 3, 0, 0], [4, 5, 0, 0, 0]]
    tensor.sg_reverse_seq()
    => [[3 2 1 0 0]
        [5 4 0 0 0]]
    ```

    Args:
      tensor: A 2-D `Tensor` (automatically given by chain).
      opt:
        dim: Dimension to reverse. Default is 1.
        name: If provided, it replaces current tensor's name.

    Returns:
      A `Tensor` with the same shape and type as `tensor`.
    """
    # default sequence dimension
    opt += tf.sg_opt(dim=1)
    seq_len = tf.not_equal(tensor, tf.zeros_like(tensor)).sg_int().sg_sum(dims=opt.dim)
    return tf.reverse_sequence(tensor, seq_len, opt.dim, name=opt.name)
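# A minimal NumPy sketch (illustrative only, not part of sugartensor) of the same
# idea as sg_reverse_seq above: count non-zero entries per row to get each
# sequence length, then reverse only that valid prefix and leave the zero padding
# in place.
import numpy as np

def reverse_seq_np(x):
    x = np.asarray(x)
    out = x.copy()
    seq_len = np.count_nonzero(x, axis=1)       # per-row valid length
    for row, n in enumerate(seq_len):
        out[row, :n] = x[row, :n][::-1]         # reverse only the non-padded part
    return out

# reverse_seq_np([[1, 2, 3, 0, 0], [4, 5, 0, 0, 0]])
# => [[3 2 1 0 0]
#     [5 4 0 0 0]]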
def sg_quasi_rnn(tensor, opt):
    # Split
    if opt.att:
        H, Z, F, O = tf.split(tensor, 4, axis=0)  # (16, 150, 320) for all
    else:
        Z, F, O = tf.split(tensor, 3, axis=0)  # (16, 150, 320) for all

    # step func
    def step(z, f, o, c):
        '''
        Runs fo-pooling at each time step
        '''
        c = f * c + (1 - f) * z

        if opt.att:  # attention
            a = tf.nn.softmax(tf.einsum("ijk,ik->ij", H, c))  # alpha. (16, 150)
            k = (a.sg_expand_dims() * H).sg_sum(axis=1)  # attentional sum. (16, 320)
            h = o * (k.sg_dense(act="linear") + c.sg_dense(act="linear"))
        else:
            h = o * c

        return h, c  # hidden states, (new) cell memories

    # Do rnn loop
    c, hs = 0, []
    timesteps = tensor.get_shape().as_list()[1]
    for t in range(timesteps):
        z = Z[:, t, :]  # (16, 320)
        f = F[:, t, :]  # (16, 320)
        o = O[:, t, :]  # (16, 320)

        # apply step function
        h, c = step(z, f, o, c)  # (16, 320), (16, 320)

        # save result
        hs.append(h.sg_expand_dims(axis=1))

    # Concat to return
    H = tf.concat(hs, 1)  # (16, 150, 320)
    seqlen = tf.to_int32(
        tf.reduce_sum(tf.sign(tf.abs(tf.reduce_sum(H, axis=-1))), 1))  # (16,)
    h = tf.reverse_sequence(input=H, seq_lengths=seqlen, seq_dim=1)[:, 0, :]  # last hidden state vector

    if opt.is_enc:
        H_z = tf.tile((h.sg_dense(act="linear").sg_expand_dims(axis=1)), [1, timesteps, 1])
        H_f = tf.tile((h.sg_dense(act="linear").sg_expand_dims(axis=1)), [1, timesteps, 1])
        H_o = tf.tile((h.sg_dense(act="linear").sg_expand_dims(axis=1)), [1, timesteps, 1])
        concatenated = tf.concat([H, H_z, H_f, H_o], 0)  # (16*4, 150, 320)
        return concatenated
    else:
        return H  # (16, 150, 320)
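# The step() helper in sg_quasi_rnn above implements QRNN "fo-pooling":
#   c_t = f_t * c_{t-1} + (1 - f_t) * z_t
#   h_t = o_t * c_t
# A minimal NumPy sketch of that recurrence (illustrative helper, assumes Z, F, O
# are already gate activations of shape [batch, time, dim]):
import numpy as np

def fo_pool_np(Z, F, O):
    batch, time, dim = Z.shape
    c = np.zeros((batch, dim))
    hs = []
    for t in range(time):
        c = F[:, t] * c + (1.0 - F[:, t]) * Z[:, t]   # cell memory update
        hs.append(O[:, t] * c)                        # gated hidden state
    return np.stack(hs, axis=1)                       # [batch, time, dim]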
def tower_infer_dec(chars,
                    scope,
                    rnn_cell,
                    dec_cell,
                    word_emb,
                    rnn_state,
                    out_reuse_vars=False,
                    dev='/cpu:0'):
    with tf.device(dev):
        with tf.variable_scope('embatch_size', reuse=True):
            # (vocab_size, latent_dim)
            emb_char = tf.sg_emb(name='emb_char',
                                 voca_size=Hp.char_vs,
                                 dim=Hp.hd,
                                 dev=dev)
            emb_word = tf.sg_emb(name='emb_word',
                                 emb=word_emb,
                                 voca_size=Hp.word_vs,
                                 dim=300,
                                 dev=dev)

        print(chars)
        ch = chars
        ch = tf.reverse_sequence(input=ch,
                                 seq_lengths=[Hp.c_maxlen] * Hp.batch_size,
                                 seq_dim=1)
        reuse_vars = reuse_vars_enc = True

        # -------------------------- BYTENET ENCODER --------------------------
        with tf.variable_scope('encoder'):
            # embed table lookup
            enc = ch.sg_lookup(emb=emb_char)  # (batch, sentlen, latentdim)
            # loop dilated conv block
            for i in range(Hp.num_blocks):
                enc = (enc.sg_res_block(size=5, rate=1, name="enc1_%d" % (i), is_first=True, reuse_vars=reuse_vars, dev=dev)
                          .sg_res_block(size=5, rate=2, name="enc2_%d" % (i), reuse_vars=reuse_vars, dev=dev)
                          .sg_res_block(size=5, rate=4, name="enc4_%d" % (i), reuse_vars=reuse_vars, dev=dev)
                          .sg_res_block(size=5, rate=8, name="enc8_%d" % (i), reuse_vars=reuse_vars, dev=dev)
                          .sg_res_block(size=5, rate=16, name="enc16_%d" % (i), reuse_vars=reuse_vars, dev=dev))
            byte_enc = enc

        # -------------------------- QCNN + QPOOL ENCODER #1 --------------------------
        with tf.variable_scope('quazi'):
            # quasi cnn layer ZFO [batch * 3, seqlen, dim2]
            conv = byte_enc.sg_quasi_conv1d(is_enc=True,
                                            size=4,
                                            name="qconv_1",
                                            dev=dev,
                                            reuse_vars=reuse_vars)
            # c = f * c + (1 - f) * z, h = o * c  [batch * 4, seqlen, hd]
            pool0 = conv.sg_quasi_rnn(is_enc=False,
                                      att=False,
                                      name="qrnn_1",
                                      reuse_vars=reuse_vars,
                                      dev=dev)
            qpool_last = pool0[:, -1, :]

        # -------------------------- MAXPOOL along time dimension --------------------------
        inpt_maxpl = tf.expand_dims(byte_enc, 1)  # [batch, 1, seqlen, channels]
        maxpool = tf.nn.max_pool(inpt_maxpl, [1, 1, Hp.c_maxlen, 1], [1, 1, 1, 1], 'VALID')
        maxpool = tf.squeeze(maxpool, [1, 2])

        # -------------------------- HIGHWAY --------------------------
        concat = qpool_last + maxpool
        with tf.variable_scope('highway', reuse=reuse_vars):
            input_lstm = highway(concat, concat.get_shape()[-1], num_layers=1)

        # -------------------------- CONTEXT LSTM --------------------------
        input_lstm = tf.nn.dropout(input_lstm, Hp.keep_prob)
        with tf.variable_scope('contx_lstm', reuse=reuse_vars):
            output, rnn_state = rnn_cell(input_lstm, rnn_state)

        beam_size = 8
        reuse_vars = out_reuse_vars

        greedy = False
        if greedy:
            dec_state = rnn_state
            dec_out = []
            d_out = tf.constant([1] * Hp.batch_size)
            for idx in range(Hp.w_maxlen):
                w_input = d_out.sg_lookup(emb=emb_word)
                dec_state = tf.contrib.rnn.LSTMStateTuple(c=dec_state.c, h=dec_state.h)
                with tf.variable_scope('dec_lstm', reuse=idx > 0 or reuse_vars):
                    d_out, dec_state = dec_cell(w_input, dec_state)

                dec_out.append(d_out)
                d_out = tf.expand_dims(d_out, 1).sg_conv1d_gpus(size=1,
                                                                dim=Hp.word_vs,
                                                                name="out_conv",
                                                                act="linear",
                                                                dev=dev,
                                                                reuse=idx > 0 or reuse_vars)
                d_out = tf.squeeze(d_out).sg_argmax()

            dec_out = tf.stack(dec_out, 1)

            dec = dec_out.sg_conv1d_gpus(size=1,
                                         dim=Hp.word_vs,
                                         name="out_conv",
                                         act="linear",
                                         dev=dev,
                                         reuse=True)
            return dec.sg_argmax(), rnn_state

        else:
            # ------------------ BEAM SEARCH --------------------
            dec_state = tf.contrib.rnn.LSTMStateTuple(
                tf.tile(tf.expand_dims(rnn_state[0], 1), [1, beam_size, 1]),
                tf.tile(tf.expand_dims(rnn_state[1], 1), [1, beam_size, 1]))
            initial_ids = tf.constant([1] * Hp.batch_size)

            def symbols_to_logits_fn(ids, dec_state):
                dec = []
                dec_c, dec_h = [], []
                # (batch x beam_size x decoded_seq)
                ids = tf.reshape(ids, [Hp.batch_size, beam_size, -1])
                print("dec_state ", dec_state[0].get_shape().as_list())
                for ind in range(beam_size):
                    with tf.variable_scope('dec_lstm', reuse=ind > 0 or reuse_vars):
                        w_input = ids[:, ind, -1].sg_lookup(emb=emb_word)
                        dec_state0 = tf.contrib.rnn.LSTMStateTuple(
                            c=dec_state.c[:, ind, :], h=dec_state.h[:, ind, :])
                        dec_out, dec_state_i = dec_cell(w_input, dec_state0)
                        dec_out = tf.expand_dims(dec_out, 1)
                    dec_i = dec_out.sg_conv1d_gpus(size=1,
                                                   dim=Hp.word_vs,
                                                   name="out_conv",
                                                   act="linear",
                                                   dev=dev,
                                                   reuse=ind > 0 or reuse_vars)
                    dec.append(tf.squeeze(dec_i, 1))
                    dec_c.append(dec_state_i[0])
                    dec_h.append(dec_state_i[1])
                return tf.stack(dec, 1), tf.contrib.rnn.LSTMStateTuple(
                    tf.stack(dec_c, 1), tf.stack(dec_h, 1))

            final_ids, final_probs = beam_search.beam_search(symbols_to_logits_fn,
                                                             dec_state,
                                                             initial_ids,
                                                             beam_size,
                                                             Hp.w_maxlen - 1,
                                                             Hp.word_vs,
                                                             3.5,
                                                             eos_id=2)

            return final_ids[:, 0, :], rnn_state
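# A generic sketch of the scoring idea behind beam search (illustrative only, not
# the beam_search module called above): at every step, extend each of the k live
# hypotheses with every vocabulary item, rank by cumulative log-probability, and
# keep the top k beam/token pairs.
import numpy as np

def beam_step_np(log_probs, beam_scores, beam_size):
    # log_probs:   [beam_size, vocab] next-token log-probabilities
    # beam_scores: [beam_size] cumulative scores of current hypotheses
    total = beam_scores[:, None] + log_probs              # [beam, vocab]
    flat = total.reshape(-1)
    top = np.argsort(flat)[::-1][:beam_size]              # best beam/token pairs
    beam_ids, token_ids = np.divmod(top, log_probs.shape[1])
    return beam_ids, token_ids, flat[top]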
def rnn_body_stat(time, rnn_state):
    ch = chars_sent.read(time)
    ch = tf.reverse_sequence(input=ch,
                             seq_lengths=[Hp.c_maxlen] * Hp.batch_size,
                             seq_dim=1)
    reuse_vars = out_reuse_vars

    # -------------------------- BYTENET ENCODER --------------------------
    with tf.variable_scope('encoder'):
        # embed table lookup
        enc = ch.sg_lookup(emb=emb_char)  # (batch, sentlen, latentdim)
        # loop dilated conv block
        for i in range(Hp.num_blocks):
            enc = (enc.sg_res_block(size=5, rate=1, name="enc1_%d" % (i), is_first=True, reuse_vars=reuse_vars, dev=dev)
                      .sg_res_block(size=5, rate=2, name="enc2_%d" % (i), reuse_vars=reuse_vars, dev=dev)
                      .sg_res_block(size=5, rate=4, name="enc4_%d" % (i), reuse_vars=reuse_vars, dev=dev)
                      .sg_res_block(size=5, rate=8, name="enc8_%d" % (i), reuse_vars=reuse_vars, dev=dev)
                      .sg_res_block(size=5, rate=16, name="enc16_%d" % (i), reuse_vars=reuse_vars, dev=dev))
        byte_enc = enc

    # -------------------------- QCNN + QPOOL ENCODER #1 --------------------------
    with tf.variable_scope('quazi'):
        # quasi cnn layer ZFO [batch * 3, seqlen, dim2]
        conv = byte_enc.sg_quasi_conv1d(is_enc=True,
                                        size=4,
                                        name="qconv_1",
                                        dev=dev,
                                        reuse_vars=reuse_vars)
        # c = f * c + (1 - f) * z, h = o * c  [batch * 4, seqlen, hd]
        pool0 = conv.sg_quasi_rnn(is_enc=False,
                                  att=False,
                                  name="qrnn_1",
                                  reuse_vars=reuse_vars,
                                  dev=dev)
        qpool_last = pool0[:, -1, :]

    # -------------------------- MAXPOOL along time dimension --------------------------
    inpt_maxpl = tf.expand_dims(byte_enc, 1)  # [batch, 1, seqlen, channels]
    maxpool = tf.nn.max_pool(inpt_maxpl, [1, 1, Hp.c_maxlen, 1], [1, 1, 1, 1], 'VALID')
    maxpool = tf.squeeze(maxpool, [1, 2])

    # -------------------------- HIGHWAY --------------------------
    concat = qpool_last + maxpool
    with tf.variable_scope('highway', reuse=reuse_vars):
        input_lstm = highway(concat, concat.get_shape()[-1], num_layers=1)

    # -------------------------- CONTEXT LSTM --------------------------
    input_lstm = tf.nn.dropout(input_lstm, Hp.keep_prob)
    with tf.variable_scope('contx_lstm', reuse=reuse_vars):
        output, rnn_state = rnn_cell(input_lstm, rnn_state)

    return (time + 1, rnn_state)
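# rnn_body_stat returns (time + 1, rnn_state), i.e. it has the (counter, state)
# shape of a tf.while_loop body. A self-contained toy showing that driving
# pattern (the trivial body here merely accumulates `time` and stands in for the
# real, closure-dependent body above):
import tensorflow as tf

def _toy_body(time, acc):
    return time + 1, acc + tf.cast(time, tf.float32)

_, _total = tf.while_loop(cond=lambda time, acc: time < 5,
                          body=_toy_body,
                          loop_vars=(tf.constant(0), tf.constant(0.0)))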
def sg_lstm(tensor, opt):
    r"""Applies an LSTM.

    Args:
      tensor: A 3-D `Tensor` (automatically passed by decorator).
      opt:
        in_dim: A positive `integer`. The size of input dimension.
        dim: A positive `integer`. The size of output dimension.
        bias: Boolean. If True, biases are added.
        ln: Boolean. If True, layer normalization is applied.
        init_state: A 2-D `Tensor`. If None, the initial state is set to zeros.
        last_only: Boolean. If True, the outputs in the last time step are returned.
        mask: Boolean 2-D `Tensor` or None (default).
          For false elements values are excluded from the calculation.
          As a result, the outputs for the locations become 0.
        summary: If True, summaries are added. The default is True.

    Returns:
      A `Tensor`. If last_only is True, the output tensor has shape [batch size, dim].
      Otherwise, [batch size, time steps, dim].
    """
    # layer normalization
    # noinspection PyPep8
    ln = lambda v: _ln_rnn(v, gamma, beta) if opt.ln else v

    # step func
    def step(hh, cc, x):
        # forget gate
        f = tf.sigmoid(ln(tf.matmul(x, w_f) + tf.matmul(hh, u_f) + (b_f if opt.bias else 0)))
        # input gate
        ii = tf.sigmoid(ln(tf.matmul(x, w_i) + tf.matmul(hh, u_i) + (b_i if opt.bias else 0)))
        # new cell value
        c_new = tf.tanh(ln(tf.matmul(x, w_c) + tf.matmul(hh, u_c) + (b_c if opt.bias else 0)))
        # out gate
        o = tf.sigmoid(ln(tf.matmul(x, w_o) + tf.matmul(hh, u_o) + (b_o if opt.bias else 0)))
        # cell update
        cell = f * cc + ii * c_new
        # final output
        y = o * tf.tanh(cell)
        return y, cell

    # parameter initialize
    w_i = tf.sg_initializer.orthogonal('W_i', (opt.in_dim, opt.dim), summary=opt.summary)
    u_i = tf.sg_initializer.identity('U_i', opt.dim, summary=opt.summary)
    w_f = tf.sg_initializer.orthogonal('W_f', (opt.in_dim, opt.dim), summary=opt.summary)
    u_f = tf.sg_initializer.identity('U_f', opt.dim, summary=opt.summary)
    w_o = tf.sg_initializer.orthogonal('W_o', (opt.in_dim, opt.dim), summary=opt.summary)
    u_o = tf.sg_initializer.identity('U_o', opt.dim, summary=opt.summary)
    w_c = tf.sg_initializer.orthogonal('W_c', (opt.in_dim, opt.dim), summary=opt.summary)
    u_c = tf.sg_initializer.identity('U_c', opt.dim, summary=opt.summary)
    if opt.bias:
        b_i = tf.sg_initializer.constant('b_i', opt.dim, summary=opt.summary)
        b_f = tf.sg_initializer.constant('b_f', opt.dim, summary=opt.summary)
        b_o = tf.sg_initializer.constant('b_o', opt.dim, value=1, summary=opt.summary)
        b_c = tf.sg_initializer.constant('b_c', opt.dim, summary=opt.summary)

    # layer normalization parameters
    if opt.ln:
        # offset, scale parameter
        beta = tf.sg_initializer.constant('beta', opt.dim, summary=opt.summary)
        gamma = tf.sg_initializer.constant('gamma', opt.dim, value=1, summary=opt.summary)

    # initial state
    init_h = opt.init_state if opt.init_state is not None \
        else tf.zeros((tensor.get_shape().as_list()[0], opt.dim), dtype=tf.sg_floatx)

    # do rnn loop
    h, c, out = init_h, init_h, []
    for i in range(tensor.get_shape().as_list()[1]):
        # apply step function
        h, c = step(h, c, tensor[:, i, :])
        # save result
        out.append(h.sg_expand_dims(axis=1))

    # merge tensor
    out = tf.concat(out, 1)

    # apply mask
    if opt.mask is None:
        if opt.last_only:
            return out[:, -1, :]
        else:
            return out
    else:
        # apply mask
        out *= opt.mask.sg_expand_dims(axis=2).sg_float()
        if opt.last_only:
            # calc sequence length using given mask
            seq_len = opt.mask.sg_int().sg_sum(axis=1)
            # get last output
            rev = tf.reverse_sequence(out, seq_len, seq_axis=1)
            return rev[:, 0, :]
        else:
            return out
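# For reference, step() in sg_lstm above computes the standard LSTM update, with
# optional layer normalization applied to each gate pre-activation:
#   f_t  = sigmoid(x_t W_f + h_{t-1} U_f + b_f)     # forget gate
#   i_t  = sigmoid(x_t W_i + h_{t-1} U_i + b_i)     # input gate
#   c~_t = tanh(x_t W_c + h_{t-1} U_c + b_c)        # candidate cell value
#   o_t  = sigmoid(x_t W_o + h_{t-1} U_o + b_o)     # output gate
#   c_t  = f_t * c_{t-1} + i_t * c~_t               # cell update
#   h_t  = o_t * tanh(c_t)                          # hidden state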
def sg_gru(tensor, opt):
    r"""Applies a GRU.

    Args:
      tensor: A 3-D `Tensor` (automatically passed by decorator).
      opt:
        in_dim: A positive `integer`. The size of input dimension.
        dim: A positive `integer`. The size of output dimension.
        bias: Boolean. If True, biases are added.
        ln: Boolean. If True, layer normalization is applied.
        init_state: A 2-D `Tensor`. If None, the initial state is set to zeros.
        last_only: Boolean. If True, the outputs in the last time step are returned.
        mask: Boolean 2-D `Tensor` or None (default).
          For false elements values are excluded from the calculation.
          As a result, the outputs for the locations become 0.
        summary: If True, summaries are added. The default is True.

    Returns:
      A `Tensor`. If last_only is True, the output tensor has shape [batch size, dim].
      Otherwise, [batch size, time steps, dim].
    """
    # layer normalization
    # noinspection PyPep8
    ln = lambda v: _ln_rnn(v, gamma, beta) if opt.ln else v

    # step func
    def step(hh, x):
        # update gate
        z = tf.sigmoid(ln(tf.matmul(x, w_z) + tf.matmul(hh, u_z) + (b_z if opt.bias else 0)))
        # reset gate
        r = tf.sigmoid(ln(tf.matmul(x, w_r) + tf.matmul(hh, u_r) + (b_r if opt.bias else 0)))
        # h_hat
        h_hat = tf.tanh(ln(tf.matmul(x, w_h) + tf.matmul(r * hh, u_h) + (b_h if opt.bias else 0)))
        # final output
        y = (1. - z) * h_hat + z * hh
        return y

    # parameter initialize
    w_z = tf.sg_initializer.orthogonal('W_z', (opt.in_dim, opt.dim), summary=opt.summary)
    u_z = tf.sg_initializer.identity('U_z', opt.dim, summary=opt.summary)
    w_r = tf.sg_initializer.orthogonal('W_r', (opt.in_dim, opt.dim), summary=opt.summary)
    u_r = tf.sg_initializer.identity('U_r', opt.dim, summary=opt.summary)
    w_h = tf.sg_initializer.orthogonal('W_h', (opt.in_dim, opt.dim), summary=opt.summary)
    u_h = tf.sg_initializer.identity('U_h', opt.dim, summary=opt.summary)
    if opt.bias:
        b_z = tf.sg_initializer.constant('b_z', opt.dim, summary=opt.summary)
        b_r = tf.sg_initializer.constant('b_r', opt.dim, summary=opt.summary)
        b_h = tf.sg_initializer.constant('b_h', opt.dim, summary=opt.summary)

    # layer normalization parameters
    if opt.ln:
        # offset, scale parameter
        beta = tf.sg_initializer.constant('beta', opt.dim, summary=opt.summary)
        gamma = tf.sg_initializer.constant('gamma', opt.dim, value=1, summary=opt.summary)

    # initial state
    init_h = opt.init_state if opt.init_state is not None \
        else tf.zeros((tensor.get_shape().as_list()[0], opt.dim), dtype=tf.sg_floatx)

    # do rnn loop
    h, out = init_h, []
    for i in range(tensor.get_shape().as_list()[1]):
        # apply step function
        h = step(h, tensor[:, i, :])
        # save result
        # noinspection PyUnresolvedReferences
        out.append(h.sg_expand_dims(axis=1))

    # merge tensor
    out = tf.concat(out, 1)

    # apply mask
    if opt.mask is None:
        if opt.last_only:
            return out[:, -1, :]
        else:
            return out
    else:
        # apply mask
        out *= opt.mask.sg_expand_dims(axis=2).sg_float()
        if opt.last_only:
            # calc sequence length using given mask
            seq_len = opt.mask.sg_int().sg_sum(axis=1)
            # get last output
            rev = tf.reverse_sequence(out, seq_len, seq_axis=1)
            return rev[:, 0, :]
        else:
            return out
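# For reference, step() in sg_gru above computes the standard GRU update, with
# optional layer normalization applied to each pre-activation:
#   z_t  = sigmoid(x_t W_z + h_{t-1} U_z + b_z)          # update gate
#   r_t  = sigmoid(x_t W_r + h_{t-1} U_r + b_r)          # reset gate
#   h~_t = tanh(x_t W_h + (r_t * h_{t-1}) U_h + b_h)     # candidate state
#   h_t  = (1 - z_t) * h~_t + z_t * h_{t-1}              # hidden state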