def encoder(source, params):
    mask = tf.to_float(tf.cast(source, tf.bool))
    hidden_size = params.hidden_size

    source, mask = util.remove_invalid_seq(source, mask)

    # source word embedding plus bias, followed by dropout
    embed_name = "embedding" if params.shared_source_target_embedding \
        else "src_embedding"
    src_emb = tf.get_variable(embed_name,
                              [params.src_vocab.size(), params.embed_size])
    src_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(src_emb, source)
    inputs = tf.nn.bias_add(inputs, src_bias)

    if util.valid_dropout(params.dropout):
        inputs = tf.nn.dropout(inputs, 1. - params.dropout)

    with tf.variable_scope("encoder"):
        # forward rnn
        with tf.variable_scope('forward'):
            outputs = rnn.rnn(params.cell, inputs, hidden_size, mask=mask,
                              ln=params.layer_norm, sm=params.swap_memory)
            output_fw, state_fw = outputs[1]
        # backward rnn; with the context-aware encoder (caencoder), the
        # backward pass is additionally conditioned on the forward outputs
        with tf.variable_scope('backward'):
            if not params.caencoder:
                outputs = rnn.rnn(params.cell, tf.reverse(inputs, [1]),
                                  hidden_size, mask=tf.reverse(mask, [1]),
                                  ln=params.layer_norm, sm=params.swap_memory)
                output_bw, state_bw = outputs[1]
            else:
                outputs = rnn.cond_rnn(params.cell, tf.reverse(inputs, [1]),
                                       tf.reverse(output_fw, [1]), hidden_size,
                                       mask=tf.reverse(mask, [1]),
                                       ln=params.layer_norm,
                                       sm=params.swap_memory,
                                       one2one=True)
                output_bw, state_bw = outputs[1]
            output_bw = tf.reverse(output_bw, [1])

    # merge the two directions into per-position encodings and a summary vector
    if not params.caencoder:
        source_encodes = tf.concat([output_fw, output_bw], -1)
        source_feature = tf.concat([state_fw, state_bw], -1)
    else:
        source_encodes = output_bw
        source_feature = state_bw

    # initialize the decoder state from the source summary
    with tf.variable_scope("decoder_initializer"):
        decoder_init = rnn.get_cell(
            params.cell, hidden_size, ln=params.layer_norm
        ).get_init_state(x=source_feature)
    decoder_init = tf.tanh(decoder_init)

    return {
        "encodes": source_encodes,
        "decoder_initializer": decoder_init,
        "mask": mask
    }
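
# A minimal NumPy sketch (not part of the model code; the toy cell below is a
# stand-in for the project's rnn.rnn) illustrating two tricks used above:
# padded positions carry the previous hidden state through unchanged, and the
# backward pass reverses inputs and mask, then flips its outputs back in time.
import numpy as np

def toy_masked_rnn(x, mask):
    # x: [batch, time, dim], mask: [batch, time]; the "cell" is a running average
    h = np.zeros_like(x[:, 0])
    outputs = []
    for t in range(x.shape[1]):
        m = mask[:, t:t + 1]
        h_new = 0.5 * (h + x[:, t])
        h = m * h_new + (1. - m) * h      # padded steps keep the old state
        outputs.append(h)
    return np.stack(outputs, 1), h

x = np.random.rand(2, 5, 3)
mask = np.array([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]], dtype=np.float32)
fw_out, fw_state = toy_masked_rnn(x, mask)
bw_out, bw_state = toy_masked_rnn(x[:, ::-1], mask[:, ::-1])
bw_out = bw_out[:, ::-1]                               # re-align with time order
encodes = np.concatenate([fw_out, bw_out], -1)         # cf. tf.concat([output_fw, output_bw], -1)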
def __init__(self, cell, num_layers, num_units, batch_size, input_size,
             keep_prob=1.0, is_train=None, scope="native_rnn"):
    self.num_layers = num_layers
    self.cell_type = cell
    self.inits = []
    self.dropout_mask = []
    self.num_units = num_units
    self.scope = scope

    for layer in range(num_layers):
        # the first layer consumes the embeddings; deeper layers consume the
        # concatenated forward/backward outputs of the previous layer
        input_size_ = input_size if layer == 0 else 2 * num_units

        # learnable initial states for the forward and backward directions
        init_fw = rnn.get_cell(cell, num_units).get_init_state(
            shape=[batch_size], scope="fw_{}".format(layer))
        init_bw = rnn.get_cell(cell, num_units).get_init_state(
            shape=[batch_size], scope="bw_{}".format(layer))

        # variational dropout masks, shared across all time steps
        mask_fw = dropout(tf.ones([batch_size, 1, input_size_],
                                  dtype=tf.float32),
                          keep_prob=keep_prob, is_train=is_train, mode=None)
        mask_bw = dropout(tf.ones([batch_size, 1, input_size_],
                                  dtype=tf.float32),
                          keep_prob=keep_prob, is_train=is_train, mode=None)

        self.inits.append((init_fw, init_bw, ))
        self.dropout_mask.append((mask_fw, mask_bw, ))
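
# Hedged sketch (assumes the dropout() helper above behaves like standard
# inverted dropout gated by is_train). It shows why the masks are built with a
# time dimension of 1: the same Bernoulli mask is broadcast over every time
# step, i.e. variational dropout, instead of resampling the mask per step.
import numpy as np

def variational_dropout_mask(batch_size, input_size, keep_prob, rng=np.random):
    # one mask per sequence, broadcast over the time axis
    keep = (rng.rand(batch_size, 1, input_size) < keep_prob).astype(np.float32)
    return keep / keep_prob                      # inverted-dropout scaling

x = np.ones((4, 7, 16), dtype=np.float32)        # [batch, time, input_size]
mask = variational_dropout_mask(4, 16, keep_prob=0.8)
x_dropped = x * mask                             # same units zeroed at every step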
def match_layer(features, params):
    with tf.variable_scope("match", reuse=tf.AUTO_REUSE):
        p_emb = features["p_emb"]
        h_emb = features["h_emb"]
        p_mask = features["p_mask"]
        h_mask = features["h_mask"]

        # encode the premise with a single rnn layer
        p_seq_enc, p_vec_enc = wrap_rnn(
            p_emb,
            params.cell,
            1,
            params.hidden_size,
            mask=p_mask,
            use_ln=params.layer_norm,
            dropout=params.dropout,
            scope="enc_p"
        )

        # initialize the hypothesis reader from the premise summary vector
        with tf.variable_scope("h_init"):
            h_init = rnn.get_cell(
                params.cell, params.hidden_size, ln=params.layer_norm
            ).get_init_state(x=p_vec_enc)
            h_init = tf.tanh(h_init)

        # read the hypothesis conditioned on the premise encodings
        _, (h_seq_enc, h_vec_enc), _, _ = rnn.cond_rnn(
            params.cell,
            h_emb,
            p_seq_enc,
            params.hidden_size,
            init_state=h_init,
            mask=h_mask,
            mem_mask=p_mask,
            ln=params.layer_norm,
            num_heads=params.num_heads
        )

        p_encs = [p_seq_enc]
        h_encs = [h_seq_enc]

        # optionally fuse BERT-based contextual encodings
        if params.enable_bert:
            p_encs.append(features['bert_p_enc'])
            h_encs.append(features['bert_h_enc'])

        p_enc = tf.concat(p_encs, -1)
        h_enc = tf.concat(h_encs, -1)

        # re-encode the fused representations
        p_seq_enc, _ = wrap_rnn(
            p_enc,
            params.cell,
            1,
            params.hidden_size,
            mask=p_mask,
            use_ln=params.layer_norm,
            dropout=params.dropout,
            scope="post_enc_p"
        )
        h_seq_enc, _ = wrap_rnn(
            h_enc,
            params.cell,
            1,
            params.hidden_size,
            mask=h_mask,
            use_ln=params.layer_norm,
            dropout=params.dropout,
            scope="post_enc_h"
        )

        features.update({
            'p_enc': p_seq_enc,
            'h_enc': h_seq_enc
        })

    return features
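
# Minimal NumPy sketch (W_init is a hypothetical weight; get_init_state(x=...)
# is assumed here to amount to a learned projection) of how the hypothesis
# reader is seeded in the "h_init" block above: project the premise summary
# vector and squash it with tanh to form the conditional RNN's initial state.
import numpy as np

hidden_size = 8
p_vec_enc = np.random.rand(2, hidden_size)                 # [batch, hidden] premise summary
W_init = 0.1 * np.random.randn(hidden_size, hidden_size)   # hypothetical projection weights
h_init = np.tanh(p_vec_enc @ W_init)                       # initial state of the hypothesis RNN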
def encoder(source, params):
    mask = dtype.tf_to_float(tf.cast(source, tf.bool))
    hidden_size = params.hidden_size

    source, mask = util.remove_invalid_seq(source, mask)

    # extract source word embedding and apply dropout
    embed_name = "embedding" if params.shared_source_target_embedding \
        else "src_embedding"
    src_emb = tf.get_variable(embed_name,
                              [params.src_vocab.size(), params.embed_size])
    src_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(src_emb, source)
    inputs = tf.nn.bias_add(inputs, src_bias)

    inputs = util.valid_apply_dropout(inputs, params.dropout)

    # the encoder module used in the deep attention paper
    with tf.variable_scope("encoder"):
        # x: embedding input, h: the hidden state
        x = inputs
        h = 0
        z = 0

        for layer in range(params.num_encoder_layer + 1):
            with tf.variable_scope("layer_{}".format(layer)):
                if layer == 0:
                    # for the first layer, run a normal rnn to collect context information
                    outputs = rnn.rnn(params.cell, x, hidden_size, mask=mask,
                                      ln=params.layer_norm,
                                      sm=params.swap_memory)
                    h = outputs[1][0]
                else:
                    # for deeper encoder layers, feed in both the embedding input and the
                    # (possibly reversed) hidden state sequence from the previous layer:
                    # the embedding supplies the current token, while the hidden states
                    # carry context from the opposite direction
                    is_reverse = (layer % 2 == 1)
                    outputs = rnn.cond_rnn(
                        params.cell,
                        tf.reverse(x, [1]) if is_reverse else x,
                        tf.reverse(h, [1]) if is_reverse else h,
                        hidden_size,
                        mask=tf.reverse(mask, [1]) if is_reverse else mask,
                        ln=params.layer_norm,
                        sm=params.swap_memory,
                        num_heads=params.num_heads,
                        one2one=True)
                    h = outputs[1][0]
                    h = tf.reverse(h, [1]) if is_reverse else h

                # the final hidden state used for decoder state initialization
                z = outputs[1][1]

    with tf.variable_scope("decoder_initializer"):
        decoder_cell = rnn.get_cell(
            params.cell, hidden_size, ln=params.layer_norm
        )

    return {
        "encodes": h,
        "decoder_initializer": {
            'layer': decoder_cell.get_init_state(x=z, scope="dec_init_state")
        },
        "mask": mask
    }
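
# Toy NumPy sketch (not the project API; toy_layer stands in for rnn.cond_rnn
# with one2one=True) of the alternating-direction stacking above: odd layers
# read the sequence right-to-left by reversing both the embeddings and the
# previous layer's states, and their outputs are flipped back afterwards so
# every layer stays aligned with the original time order.
import numpy as np

def toy_layer(x, cond):
    # fuse per-position input and conditioning state, then accumulate prefix context
    fused = 0.5 * (x + cond)
    steps = np.arange(x.shape[1])[None, :, None] + 1
    return np.cumsum(fused, axis=1) / steps

x = np.random.rand(2, 6, 4)                                    # embeddings
h = np.cumsum(x, axis=1) / (np.arange(6)[None, :, None] + 1)   # layer 0: forward pass
for layer in range(1, 4):
    is_reverse = (layer % 2 == 1)
    x_in = x[:, ::-1] if is_reverse else x
    h_in = h[:, ::-1] if is_reverse else h
    h = toy_layer(x_in, h_in)
    h = h[:, ::-1] if is_reverse else h                        # re-align with time order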
def deep_att_dec_rnn(cell_name, x, memory, d, init_state=None,
                     mask=None, mem_mask=None, ln=False, sm=True,
                     depth=1, num_heads=1):
    """Self-implemented conditional-RNN procedure, supporting the mask trick"""
    # cell_name: gru, lstm or atr
    # x: input sequence embedding matrix, [batch, seq_len, dim]
    # memory: the conditional part
    # d: hidden dimension for rnn
    # mask: mask matrix, [batch, seq_len]
    # mem_mask: memory mask matrix, [batch, mem_seq_len]
    # ln: whether to use layer normalization
    # init_state: the initial hidden states, for cache purposes
    # sm: whether to apply swap_memory during the rnn scan
    # depth: depth for the decoder in deep attention
    # num_heads: number of attention heads for multi-head attention

    in_shape = util.shape_list(x)
    batch_size, time_steps = in_shape[:2]
    mem_shape = util.shape_list(memory)

    cell_lower = rnn.get_cell(cell_name, d, ln=ln,
                              scope="{}_lower".format(cell_name))
    cells_higher = []
    for layer in range(depth):
        cell_higher = rnn.get_cell(cell_name, d, ln=ln,
                                   scope="{}_higher_{}".format(cell_name, layer))
        cells_higher.append(cell_higher)

    if init_state is None:
        init_state = cell_lower.get_init_state(shape=[batch_size])
    if mask is None:
        mask = dtype.tf_to_float(tf.ones([batch_size, time_steps]))
    if mem_mask is None:
        mem_mask = dtype.tf_to_float(tf.ones([batch_size, mem_shape[1]]))

    # prepare projected encodes and inputs
    cache_inputs = cell_lower.fetch_states(x)
    cache_inputs = [tf.transpose(v, [1, 0, 2]) for v in list(cache_inputs)]
    proj_memories = func.linear(memory, mem_shape[-1], bias=False,
                                ln=ln, scope="context_att")

    mask_ta = tf.transpose(tf.expand_dims(mask, -1), [1, 0, 2])
    init_context = dtype.tf_to_float(
        tf.zeros([batch_size, depth, mem_shape[-1]]))
    init_weight = dtype.tf_to_float(
        tf.zeros([batch_size, depth, num_heads, mem_shape[1]]))
    mask_pos = len(cache_inputs)

    def _step_fn(prev, x):
        t, h_, c_, a_ = prev
        m, v = x[mask_pos], x[:mask_pos]

        # the first decoder rnn subcell, composing the previous hidden state
        # with the current word embedding
        s_ = cell_lower(h_, v)
        s_ = m * s_ + (1. - m) * h_

        atts, att_ctxs = [], []

        for layer in range(depth):
            # perform attention
            prev_cell = cell_lower if layer == 0 else cells_higher[layer - 1]
            vle = func.additive_attention(
                prev_cell.get_hidden(s_), memory, mem_mask,
                mem_shape[-1], ln=ln, num_heads=num_heads,
                proj_memory=proj_memories,
                scope="deep_attention_{}".format(layer))
            a, c = vle['weights'], vle['output']
            atts.append(tf.expand_dims(a, 1))
            att_ctxs.append(tf.expand_dims(c, 1))

            # perform next-level recurrence
            c_c = cells_higher[layer].fetch_states(c)
            ss_ = cells_higher[layer](s_, c_c)
            s_ = m * ss_ + (1. - m) * s_

        h = s_
        a = tf.concat(atts, axis=1)
        c = tf.concat(att_ctxs, axis=1)

        return t + 1, h, c, a

    time = tf.constant(0, dtype=tf.int32, name="time")
    step_states = (time, init_state, init_context, init_weight)
    step_vars = cache_inputs + [mask_ta]

    outputs = tf.scan(_step_fn, step_vars, initializer=step_states,
                      parallel_iterations=32, swap_memory=sm)

    output_ta = outputs[1]
    context_ta = outputs[2]
    attention_ta = outputs[3]

    outputs = tf.transpose(output_ta, [1, 0, 2])
    output_states = outputs[:, -1]
    # batch x target length x depth x mem-dimension
    contexts = tf.transpose(context_ta, [1, 0, 2, 3])
    # batch x num_heads x depth x target length x source length
    attentions = tf.transpose(attention_ta, [1, 3, 2, 0, 4])

    return (outputs, output_states), \
           (cells_higher[-1].get_hidden(outputs),
            cells_higher[-1].get_hidden(output_states)), \
           contexts, attentions
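
# Hedged NumPy sketch of single-head additive attention, the operation
# func.additive_attention is assumed to perform inside the step function:
# score each memory position against the current decoder state, mask padding,
# softmax, and return the attention weights plus the weighted context vector.
import numpy as np

def additive_attention_np(query, memory, mem_mask, W_q, W_m, v):
    # query: [batch, dim], memory: [batch, mem_len, dim], mem_mask: [batch, mem_len]
    scores = np.tanh(query[:, None] @ W_q + memory @ W_m) @ v   # [batch, mem_len]
    scores = np.where(mem_mask > 0., scores, -1e9)              # drop padded positions
    weights = np.exp(scores - scores.max(-1, keepdims=True))
    weights = weights * mem_mask
    weights = weights / weights.sum(-1, keepdims=True)
    context = (weights[..., None] * memory).sum(1)              # [batch, dim]
    return weights, context

rng = np.random.RandomState(0)
batch, mem_len, dim = 2, 5, 4
q, mem = rng.rand(batch, dim), rng.rand(batch, mem_len, dim)
m_mask = np.array([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]], dtype=np.float32)
W_q, W_m, v = rng.randn(dim, dim), rng.randn(dim, dim), rng.randn(dim)
att_weights, att_context = additive_attention_np(q, mem, m_mask, W_q, W_m, v)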
def encoder(source, params):
    mask = tf.to_float(tf.cast(source, tf.bool))
    hidden_size = params.hidden_size

    source, mask = util.remove_invalid_seq(source, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "src_embedding"
    src_emb = tf.get_variable(embed_name,
                              [params.src_vocab.size(), params.embed_size])
    src_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(src_emb, source)
    inputs = tf.nn.bias_add(inputs, src_bias)

    if util.valid_dropout(params.dropout):
        inputs = tf.nn.dropout(inputs, 1. - params.dropout)

    with tf.variable_scope("encoder"):
        x = inputs

        for layer in range(params.num_encoder_layer):
            with tf.variable_scope("layer_{}".format(layer)):
                # forward rnn
                with tf.variable_scope('forward'):
                    outputs = rnn.rnn(params.cell, x, hidden_size,
                                      mask=mask, ln=params.layer_norm,
                                      sm=params.swap_memory,
                                      dp=params.dropout)
                    output_fw, state_fw = outputs[1]
                if layer == 0:
                    # backward rnn
                    with tf.variable_scope('backward'):
                        if not params.caencoder:
                            outputs = rnn.rnn(params.cell, tf.reverse(x, [1]),
                                              hidden_size,
                                              mask=tf.reverse(mask, [1]),
                                              ln=params.layer_norm,
                                              sm=params.swap_memory,
                                              dp=params.dropout)
                            output_bw, state_bw = outputs[1]
                        else:
                            outputs = rnn.cond_rnn(params.cell,
                                                   tf.reverse(x, [1]),
                                                   tf.reverse(output_fw, [1]),
                                                   hidden_size,
                                                   mask=tf.reverse(mask, [1]),
                                                   ln=params.layer_norm,
                                                   sm=params.swap_memory,
                                                   num_heads=params.num_heads,
                                                   one2one=True)
                            output_bw, state_bw = outputs[1]
                        output_bw = tf.reverse(output_bw, [1])

                    if not params.caencoder:
                        y = tf.concat([output_fw, output_bw], -1)
                        z = tf.concat([state_fw, state_bw], -1)
                    else:
                        y = output_bw
                        z = state_bw
                else:
                    y = output_fw
                    z = state_fw

                y = func.linear(y, hidden_size, ln=False, scope="ff")

                # short cut via residual connection
                if x.get_shape()[-1].value == y.get_shape()[-1].value:
                    x = func.residual_fn(x, y, dropout=params.dropout)
                else:
                    x = y
                if params.layer_norm:
                    x = func.layer_norm(x, scope="ln")

    with tf.variable_scope("decoder_initializer"):
        decoder_cell = rnn.get_cell(
            params.cell, hidden_size, ln=params.layer_norm
        )

    return {
        "encodes": x,
        "decoder_initializer": {
            "layer_{}".format(l): decoder_cell.get_init_state(
                x=z, scope="layer_{}".format(l))
            for l in range(params.num_decoder_layer)
        },
        "mask": mask
    }
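
# Small NumPy sketch of the residual shortcut above (assumes func.residual_fn
# is essentially x + dropout(y) and func.layer_norm is standard layer norm;
# dropout is omitted here): the skip connection is applied only when input and
# output widths match, which is why the width-changing first layer falls back
# to x = y.
import numpy as np

def layer_norm_np(x, eps=1e-6):
    mean = x.mean(-1, keepdims=True)
    std = x.std(-1, keepdims=True)
    return (x - mean) / (std + eps)

def residual_block(x, y, use_ln=True):
    # x: layer input, y: projected layer output
    out = x + y if x.shape[-1] == y.shape[-1] else y
    return layer_norm_np(out) if use_ln else out

x = np.random.rand(2, 6, 8)
y = np.random.rand(2, 6, 8)
assert residual_block(x, y).shape == (2, 6, 8)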