def __call__(self, h_, x):
    # h_: the concatenation of previous hidden state
    #   and memory cell state
    # x_i/x: the current input state for input gate
    # x_f/x: the current input state for forget gate
    # x_o/x: the current input state for output gate
    # x_c/x: the current input state for candidate cell
    """
        f = sigmoid(h_, x)
        i = sigmoid(h_, x)
        o = sigmoid(h_, x)
        c' = tanh(h_, x)
        c = f * c_ + i * c'
        h = o * tanh(c)
    """
    with tf.variable_scope("cell_{}".format(self.scope or "lstm")):
        # x carries the fused gate input x_g (= x_i, x_f, x_o) and the
        # candidate-cell input x_c, as produced by fetch_states
        x_g, x_c = x
        h_, c_ = tf.split(h_, 2, -1)

        h_g = linear(h_, self.d * 3, ln=self.ln, scope="gate_h")
        i, f, o = tf.split(tf.sigmoid(x_g + h_g), 3, -1)

        h_c = linear(h_, self.d, ln=self.ln, scope="hide_h")
        h_c = tf.tanh(x_c + h_c)

        c = i * h_c + f * c_
        h = o * tf.tanh(c)

    return tf.concat([h, c], -1)
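# --- Illustrative sketch (not part of the model code) ---
# A minimal NumPy walk-through of the fused-gate LSTM step above, assuming the
# input projections x_g (three gates) and x_c (candidate) were precomputed as in
# fetch_states; W_g and W_c are hypothetical stand-ins for the "gate_h" and
# "hide_h" recurrent projections.
import numpy as np

def _sigmoid(z):
    return 1. / (1. + np.exp(-z))

def lstm_step_sketch(h_prev, c_prev, x_g, x_c, W_g, W_c):
    # h_prev, c_prev: [batch, d]; x_g: [batch, 3*d]; x_c: [batch, d]
    i, f, o = np.split(_sigmoid(x_g + h_prev @ W_g), 3, axis=-1)
    h_c = np.tanh(x_c + h_prev @ W_c)   # candidate cell state
    c = i * h_c + f * c_prev            # new memory cell
    h = o * np.tanh(c)                  # new hidden state
    return h, c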
def tensor2vector(tensor, hidden_size, mask=None, init=None,
                  use_ln=False, dropout=0.1, scope="vecatt"):
    with tf.variable_scope(scope):
        if util.valid_dropout(dropout):
            tensor = tf.nn.dropout(tensor, 1. - dropout)

        if init is None:
            m = tf.nn.tanh(
                func.linear(tensor, hidden_size, ln=use_ln, scope="m_tensor"))
        else:
            init = util.expand_tile_dims(init, tf.shape(tensor)[1], 1)
            if util.valid_dropout(dropout):
                init = tf.nn.dropout(init, 1. - dropout)

            m = tf.nn.tanh(
                func.linear(tensor, hidden_size, ln=use_ln, scope="m_tensor") +
                func.linear(init, hidden_size, scope="m_init"))

        s = func.linear(m, 1, bias=False, scope="score")

        if mask is None:
            mask = tf.ones(
                [tf.shape(tensor)[0], tf.shape(tensor)[1]], tf.float32)
        s = tf.squeeze(s, -1) + (1. - mask) * (-1e9)

        w = tf.nn.softmax(s)

        return tf.reduce_sum(tf.expand_dims(w, 2) * tensor, axis=1), s
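# --- Illustrative sketch (not part of the model code) ---
# tensor2vector is additive-attention pooling: score each timestep, push padded
# positions to a large negative value, softmax, and take the weighted sum.
# A self-contained NumPy sketch of that pooling step (names are illustrative):
import numpy as np

def attentive_pool_sketch(tensor, scores, mask):
    # tensor: [batch, time, dim]; scores, mask: [batch, time], mask in {0, 1}
    scores = scores + (1. - mask) * (-1e9)            # suppress padded positions
    scores = scores - scores.max(-1, keepdims=True)   # numerically stable softmax
    w = np.exp(scores)
    w = w / w.sum(-1, keepdims=True)
    return (w[..., None] * tensor).sum(axis=1)        # [batch, dim]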
def ffn_layer(x, d, d_o, dropout=None, scope=None, numblocks=None):
    """
        FFN layer in Transformer
    :param numblocks: size of 'L' in fixup paper
    :param scope:
    """
    with tf.variable_scope(scope or "ffn_layer",
                           dtype=tf.as_dtype(dtype.floatx())) as scope:
        assert numblocks is not None, 'Fixup requires the total model depth L'
        in_initializer = initializer.scale_initializer(
            math.pow(numblocks, -1. / 2.), scope.initializer)

        x = shift_layer(x)
        hidden = func.linear(x, d, scope="enlarge",
                             weight_initializer=in_initializer,
                             bias=False)
        hidden = shift_layer(hidden)
        hidden = tf.nn.relu(hidden)

        hidden = util.valid_apply_dropout(hidden, dropout)

        hidden = shift_layer(hidden)
        output = func.linear(hidden, d_o, scope="output",
                             bias=False,
                             weight_initializer=tf.zeros_initializer())
        output = scale_layer(output)

        return output
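# --- Illustrative sketch (not part of the model code) ---
# The Fixup-style FFN above drops layer normalization and instead relies on
# (a) scaling the input projection by L**-0.5, (b) zero-initializing the output
# projection, and (c) learnable scalar shift/scale parameters. A NumPy sketch of
# the same arithmetic, assuming three shift scalars and one scale scalar as in
# shift_layer/scale_layer:
import numpy as np

def fixup_ffn_sketch(x, W_in, W_out, shifts=(0., 0., 0.), scale=1.):
    h = (x + shifts[0]) @ W_in
    h = np.maximum(h + shifts[1], 0.)        # ReLU
    return ((h + shifts[2]) @ W_out) * scale

d, d_ff, L = 4, 8, 12
rng = np.random.RandomState(0)
W_in = rng.randn(d, d_ff) * (L ** -0.5)      # down-scaled input projection
W_out = np.zeros((d_ff, d))                  # zero-initialized output projection
y = fixup_ffn_sketch(rng.randn(2, d), W_in, W_out)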
def loss_layer(features, params):
    t_enc = features['t_enc']

    feature = [t_enc]
    if params.enable_bert:
        s_mask = tf.to_float(
            tf.cast(tf.reduce_sum(features['t_mask'], 1), tf.bool))
        batch_size = tf.shape(features['l'])[0]
        s_mask = tf.reshape(s_mask, [batch_size, -1])

        bert_feature = features['feature']
        bert_feature = tf.reshape(
            bert_feature,
            [batch_size, -1, bert_feature.get_shape().as_list()[-1]])

        bert_vec, _ = tensor2vector(bert_feature, params.hidden_size,
                                    mask=s_mask,
                                    use_ln=params.layer_norm,
                                    dropout=params.dropout,
                                    scope="bert_att")
        feature.append(bert_vec)
    feature = tf.concat(feature, -1)

    label_logits = func.linear(feature, params.label_size,
                               ln=params.layer_norm, scope="label")

    # multi-label classification-based objective
    def mlceloss(logits, labels):
        soft_label, normalizer = util.label_smooth(
            labels,
            util.shape_list(logits)[-1],
            factor=params.label_smooth)
        centropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits, labels=soft_label)
        centropy -= normalizer
        centropy = tf.reshape(centropy, tf.shape(labels))

        return tf.reduce_mean(centropy)

    loss = mlceloss(label_logits, features['l'])

    if params.weight_decay > 0:
        with tf.variable_scope('l2_loss'):
            l2_loss = tf.add_n([
                tf.nn.l2_loss(v) for v in tf.trainable_variables()
                if 'bias' not in v.name
            ])
        loss += params.weight_decay * l2_loss

    features.update({'loss': loss})

    prediction = tf.argmax(label_logits, -1)
    label_output = tf.nn.softmax(label_logits, -1)

    return features, prediction, label_output
def __call__(self, h_, x):
    # h_: the previous hidden state
    # x: the current input state
    """
        p = W x
        q = U h_
        i = sigmoid(p + q)
        f = sigmoid(p - q)
        h = i * p + f * h_
    """
    if isinstance(x, (list, tuple)):
        x = x[0]

    with tf.variable_scope("cell_{}".format(self.scope or "atr")):
        q = linear(h_, self.d, ln=self.ln, scope="hide_h")
        p = x

        f = tf.sigmoid(p - q)
        if self.twin:
            i = tf.sigmoid(p + q)
        else:
            # we empirically find that the following simple form is more stable.
            i = 1. - f

        h = i * p + f * h_

    return h
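# --- Illustrative sketch (not part of the model code) ---
# The ATR update builds both gates from the sum and difference of the same two
# projections. A NumPy sketch of one step, covering both the twin-gate form and
# the simplified i = 1 - f form (W and U are hypothetical weight names):
import numpy as np

def _sigmoid(z):
    return 1. / (1. + np.exp(-z))

def atr_step_sketch(h_prev, x, W, U, twin=False):
    p = x @ W                                 # input contribution
    q = h_prev @ U                            # recurrent contribution
    f = _sigmoid(p - q)                       # forget gate
    i = _sigmoid(p + q) if twin else 1. - f   # input gate (twin or tied form)
    return i * p + f * h_prev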
def __call__(self, h_, x):
    # h_: the previous hidden state
    # x_g/x: the current input state for gate
    # x_h/x: the current input state for hidden
    """
        z = sigmoid(h_, x)
        r = sigmoid(h_, x)
        h' = tanh(x, r * h_)
        h = z * h_ + (1. - z) * h'
    """
    with tf.variable_scope("cell_{}".format(self.scope or "gru")):
        x_g, x_h = x

        h_g = linear(h_, self.d * 2, ln=self.ln, scope="gate_h")
        z, r = tf.split(tf.sigmoid(x_g + h_g), 2, -1)

        h_h = linear(h_ * r, self.d, ln=self.ln, scope="hide_h")
        h = tf.tanh(x_h + h_h)

        h = z * h_ + (1. - z) * h

    return h
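# --- Illustrative sketch (not part of the model code) ---
# The GRU step above, with the input projections x_g (update/reset gates) and
# x_h (candidate) precomputed by fetch_states; U_g and U_h are hypothetical
# names for the recurrent "gate_h" and "hide_h" weights.
import numpy as np

def _sigmoid(z):
    return 1. / (1. + np.exp(-z))

def gru_step_sketch(h_prev, x_g, x_h, U_g, U_h):
    z, r = np.split(_sigmoid(x_g + h_prev @ U_g), 2, axis=-1)  # update / reset
    h_cand = np.tanh(x_h + (h_prev * r) @ U_h)                 # candidate state
    return z * h_prev + (1. - z) * h_cand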
def _get_init_state(self, d, shape=None, x=None, scope=None):
    # gen init state vector
    # if no evidence x is provided, use zero initialization
    if x is None:
        assert shape is not None, "you should provide shape"
        if not isinstance(shape, (tuple, list)):
            shape = [shape]
        shape = shape + [d]
        return dtype.tf_to_float(tf.zeros(shape))
    else:
        return linear(x, d, bias=True, ln=self.ln,
                      scope="{}_init".format(scope or self.scope))
def loss_layer(features, params):
    p_enc = features['p_enc']
    h_enc = features['h_enc']
    p_mask = features['p_mask']
    h_mask = features['h_mask']

    feature_list = [
        tensor2vector(p_enc, params.hidden_size,
                      mask=p_mask, scope="p_att")[0],
        tensor2vector(h_enc, params.hidden_size,
                      mask=h_mask, scope="h_att")[0],
    ]
    if params.enable_bert:
        feature_list.append(features['feature'])

    feature = tf.concat(feature_list, -1)

    label_logits = func.linear(feature, params.label_size,
                               ln=params.layer_norm, scope="label")

    def celoss(logits, labels):
        soft_label, normalizer = util.label_smooth(
            labels,
            util.shape_list(logits)[-1],
            factor=params.label_smooth)
        centropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits, labels=soft_label)
        centropy -= normalizer
        centropy = tf.reshape(centropy, tf.shape(labels))

        return tf.reduce_mean(centropy)

    loss = celoss(label_logits, features['l'])

    if params.weight_decay > 0:
        with tf.variable_scope('l2_loss'):
            l2_loss = tf.add_n([
                tf.nn.l2_loss(v) for v in tf.trainable_variables()
                if 'bias' not in v.name
            ])
        loss += params.weight_decay * l2_loss

    features.update({
        'loss': loss
    })

    return features, tf.argmax(label_logits, -1)
def __call__(self, h_, x):
    # h_: the previous hidden state
    # x: the current input state
    """
        p = W x
        q = U h_
        i = sigmoid(p + q)
        f = sigmoid(p - q)
        h = tanh(i * p + f * h_)
    """
    if isinstance(x, (list, tuple)):
        x = x[0]

    with tf.variable_scope("cell_{}".format(self.scope or "atr")):
        q = linear(h_, self.d, ln=self.ln, scope="hide_h")
        p = x

        f = tf.sigmoid(p - q)
        i = tf.sigmoid(p + q)

        h = tf.tanh(i * p + f * h_)

    return h
def fetch_states(self, x):
    with tf.variable_scope(
            "fetch_state_{}".format(self.scope or "lstm")):
        g = linear(x, self.d * 3,
                   bias=True, ln=self.ln, scope="gate_x")
        c = linear(x, self.d,
                   bias=True, ln=self.ln, scope="hide_x")
    return g, c
def encoder(source, params):
    mask = tf.to_float(tf.cast(source, tf.bool))
    hidden_size = params.hidden_size

    source, mask = util.remove_invalid_seq(source, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "src_embedding"
    src_emb = tf.get_variable(embed_name,
                              [params.src_vocab.size(), params.embed_size])
    src_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(src_emb, source)
    inputs = tf.nn.bias_add(inputs, src_bias)

    if util.valid_dropout(params.dropout):
        inputs = tf.nn.dropout(inputs, 1. - params.dropout)

    with tf.variable_scope("encoder"):
        x = inputs

        for layer in range(params.num_encoder_layer):
            with tf.variable_scope("layer_{}".format(layer)):
                # forward rnn
                with tf.variable_scope('forward'):
                    outputs = rnn.rnn(params.cell, x, hidden_size,
                                      mask=mask, ln=params.layer_norm,
                                      sm=params.swap_memory,
                                      dp=params.dropout)
                    output_fw, state_fw = outputs[1]
                if layer == 0:
                    # backward rnn
                    with tf.variable_scope('backward'):
                        if not params.caencoder:
                            outputs = rnn.rnn(params.cell,
                                              tf.reverse(x, [1]),
                                              hidden_size,
                                              mask=tf.reverse(mask, [1]),
                                              ln=params.layer_norm,
                                              sm=params.swap_memory,
                                              dp=params.dropout)
                            output_bw, state_bw = outputs[1]
                        else:
                            outputs = rnn.cond_rnn(params.cell,
                                                   tf.reverse(x, [1]),
                                                   tf.reverse(output_fw, [1]),
                                                   hidden_size,
                                                   mask=tf.reverse(mask, [1]),
                                                   ln=params.layer_norm,
                                                   sm=params.swap_memory,
                                                   num_heads=params.num_heads,
                                                   one2one=True)
                            output_bw, state_bw = outputs[1]

                        output_bw = tf.reverse(output_bw, [1])

                    if not params.caencoder:
                        y = tf.concat([output_fw, output_bw], -1)
                        z = tf.concat([state_fw, state_bw], -1)
                    else:
                        y = output_bw
                        z = state_bw
                else:
                    y = output_fw
                    z = state_fw

                y = func.linear(y, hidden_size, ln=False, scope="ff")

                # short cut via residual connection
                if x.get_shape()[-1].value == y.get_shape()[-1].value:
                    x = func.residual_fn(x, y, dropout=params.dropout)
                else:
                    x = y
                if params.layer_norm:
                    x = func.layer_norm(x, scope="ln")

    with tf.variable_scope("decoder_initializer"):
        decoder_cell = rnn.get_cell(params.cell, hidden_size,
                                    ln=params.layer_norm)

    return {
        "encodes": x,
        "decoder_initializer": {
            "layer_{}".format(l): decoder_cell.get_init_state(
                x=z, scope="layer_{}".format(l))
            for l in range(params.num_decoder_layer)
        },
        "mask": mask
    }
def deep_att_dec_rnn(cell_name, x, memory, d, init_state=None, mask=None,
                     mem_mask=None, ln=False, sm=True, depth=1, num_heads=1):
    """Self-implemented conditional RNN procedure, supporting mask trick"""
    # cell_name: gru, lstm or atr
    # x: input sequence embedding matrix, [batch, seq_len, dim]
    # memory: the conditional part
    # d: hidden dimension for rnn
    # mask: mask matrix, [batch, seq_len]
    # mem_mask: memory mask matrix, [batch, mem_seq_len]
    # ln: whether use layer normalization
    # init_state: the initial hidden states, for cache purpose
    # sm: whether apply swap memory during rnn scan
    # depth: depth for the decoder in deep attention
    # num_heads: number of attention heads, multi-head attention

    in_shape = util.shape_list(x)
    batch_size, time_steps = in_shape[:2]
    mem_shape = util.shape_list(memory)

    cell_lower = rnn.get_cell(cell_name, d, ln=ln,
                              scope="{}_lower".format(cell_name))
    cells_higher = []
    for layer in range(depth):
        cell_higher = rnn.get_cell(cell_name, d, ln=ln,
                                   scope="{}_higher_{}".format(cell_name, layer))
        cells_higher.append(cell_higher)

    if init_state is None:
        init_state = cell_lower.get_init_state(shape=[batch_size])
    if mask is None:
        mask = dtype.tf_to_float(tf.ones([batch_size, time_steps]))
    if mem_mask is None:
        mem_mask = dtype.tf_to_float(tf.ones([batch_size, mem_shape[1]]))

    # prepare projected encodes and inputs
    cache_inputs = cell_lower.fetch_states(x)
    cache_inputs = [tf.transpose(v, [1, 0, 2]) for v in list(cache_inputs)]
    proj_memories = func.linear(memory, mem_shape[-1], bias=False,
                                ln=ln, scope="context_att")

    mask_ta = tf.transpose(tf.expand_dims(mask, -1), [1, 0, 2])
    init_context = dtype.tf_to_float(
        tf.zeros([batch_size, depth, mem_shape[-1]]))
    init_weight = dtype.tf_to_float(
        tf.zeros([batch_size, depth, num_heads, mem_shape[1]]))
    mask_pos = len(cache_inputs)

    def _step_fn(prev, x):
        t, h_, c_, a_ = prev
        m, v = x[mask_pos], x[:mask_pos]

        # the first decoder rnn sub-cell, composing the previous hidden state
        # with the current word embedding
        s_ = cell_lower(h_, v)
        s_ = m * s_ + (1. - m) * h_

        atts, att_ctxs = [], []
        for layer in range(depth):
            # perform attention
            prev_cell = cell_lower if layer == 0 else cells_higher[layer - 1]
            vle = func.additive_attention(
                prev_cell.get_hidden(s_), memory, mem_mask,
                mem_shape[-1], ln=ln, num_heads=num_heads,
                proj_memory=proj_memories,
                scope="deep_attention_{}".format(layer))
            a, c = vle['weights'], vle['output']
            atts.append(tf.expand_dims(a, 1))
            att_ctxs.append(tf.expand_dims(c, 1))

            # perform next-level recurrence
            c_c = cells_higher[layer].fetch_states(c)
            ss_ = cells_higher[layer](s_, c_c)
            s_ = m * ss_ + (1. - m) * s_

        h = s_
        a = tf.concat(atts, axis=1)
        c = tf.concat(att_ctxs, axis=1)

        return t + 1, h, c, a

    time = tf.constant(0, dtype=tf.int32, name="time")
    step_states = (time, init_state, init_context, init_weight)
    step_vars = cache_inputs + [mask_ta]

    outputs = tf.scan(_step_fn, step_vars,
                      initializer=step_states,
                      parallel_iterations=32,
                      swap_memory=sm)

    output_ta = outputs[1]
    context_ta = outputs[2]
    attention_ta = outputs[3]

    outputs = tf.transpose(output_ta, [1, 0, 2])
    output_states = outputs[:, -1]
    # batch x target length x depth x mem-dimension
    contexts = tf.transpose(context_ta, [1, 0, 2, 3])
    # batch x num_heads x depth x target length x source length
    attentions = tf.transpose(attention_ta, [1, 3, 2, 0, 4])

    return (outputs, output_states), \
           (cells_higher[-1].get_hidden(outputs),
            cells_higher[-1].get_hidden(output_states)), \
           contexts, attentions
def embedding_layer(features, params):
    t = features['t']
    t_mask = tf.to_float(tf.cast(t, tf.bool))

    with tf.device('/cpu:0'):
        symbol_embeddings = tf.get_variable('special_symbol_embeddings',
                                            shape=(3, params.embed_size),
                                            trainable=True)
        embedding_initializer = tf.glorot_uniform_initializer()
        if params.word_vocab.pretrained_embedding is not None:
            pretrain_embedding = params.word_vocab.pretrained_embedding
            embedding_initializer = tf.constant_initializer(pretrain_embedding)
        general_embeddings = tf.get_variable(
            'general_symbol_embeddings',
            shape=(params.word_vocab.size() - 3, params.embed_size),
            initializer=embedding_initializer,
            trainable=params.word_vocab.pretrained_embedding is None)
        word_embeddings = tf.concat([symbol_embeddings, general_embeddings], 0)

    # apply word dropout
    wd_mask = util.valid_apply_dropout(t_mask, params.word_dropout)
    wd_mask = tf.to_float(tf.cast(wd_mask, tf.bool))

    t_emb = tf.nn.embedding_lookup(word_embeddings,
                                   t * tf.to_int32(wd_mask))
    t_emb = t_emb * tf.expand_dims(t_mask, -1)

    embed_features = [t_emb]
    if params.enable_bert:
        embed_features.append(features['bert_enc'])

    if params.use_char:
        c = features['c']
        c_mask = tf.to_float(tf.cast(c, tf.bool))

        c = tf.reshape(c, [-1, tf.shape(c)[-1]])
        c_mask = tf.reshape(c_mask, [-1, tf.shape(c_mask)[-1]])

        with tf.device('/cpu:0'):
            char_embeddings = tf.get_variable(
                'char_embeddings',
                shape=(params.char_vocab.size(), params.char_embed_size),
                initializer=tf.glorot_uniform_initializer(),
                trainable=True)

        with tf.variable_scope('char_embedding'):
            c_emb = tf.nn.embedding_lookup(char_embeddings, c)
            c_emb = util.valid_apply_dropout(c_emb, 0.5 * params.dropout)

        with tf.variable_scope("char_encoding", reuse=tf.AUTO_REUSE):
            c_emb = c_emb * tf.expand_dims(c_mask, -1)

            c_shp = util.shape_list(features['c'])
            c_emb = tf.reshape(
                c_emb,
                [c_shp[0], c_shp[1], c_shp[2], params.char_embed_size])

            c_state = func.linear(tf.reduce_max(c_emb, 2),
                                  params.char_embed_size,
                                  scope="cmap")

        embed_features.append(c_state)

    t_emb = tf.concat(embed_features, axis=2) * tf.expand_dims(t_mask, -1)

    features.update({
        't_emb': t_emb,
        't_mask': t_mask,
    })
    return features
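# --- Illustrative sketch (not part of the model code) ---
# The word-dropout trick above multiplies token ids by a dropped-out 0/1 mask,
# so dropped words are looked up as id 0 before padding positions are zeroed.
# A NumPy sketch of the same idea, assuming id 0 is the padding symbol:
import numpy as np

def word_dropout_sketch(token_ids, keep_prob, rng=None):
    rng = rng or np.random.RandomState(0)
    keep = (rng.rand(*token_ids.shape) < keep_prob) & (token_ids != 0)
    return np.where(keep, token_ids, 0)

print(word_dropout_sketch(np.array([[5, 8, 2, 0, 0], [7, 3, 9, 4, 0]]), 0.8))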
def fetch_states(self, x):
    with tf.variable_scope(
            "fetch_state_{}".format(self.scope or "lrn")):
        h = linear(x, self.d * 3,
                   bias=True, ln=self.ln, scope="hide_x")
    return (h, )
def decoder(target, state, params):
    mask = dtype.tf_to_float(tf.cast(target, tf.bool))
    hidden_size = params.hidden_size

    is_training = ('decoder' not in state)

    # handling target-side word embedding, including shift-padding for training
    embed_name = "embedding" if params.shared_source_target_embedding \
        else "tgt_embedding"
    tgt_emb = tf.get_variable(embed_name,
                              [params.tgt_vocab.size(), params.embed_size])
    tgt_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(tgt_emb, target)
    inputs = tf.nn.bias_add(inputs, tgt_bias)

    # shift
    if is_training:
        inputs = tf.pad(inputs, [[0, 0], [1, 0], [0, 0]])
        inputs = inputs[:, :-1, :]
    else:
        inputs = tf.cond(
            tf.reduce_all(tf.equal(target, params.tgt_vocab.pad())),
            lambda: tf.zeros_like(inputs),
            lambda: inputs)
        mask = tf.ones_like(mask)

    inputs = util.valid_apply_dropout(inputs, params.dropout)

    with tf.variable_scope("decoder"):
        x = inputs

        init_state = state["decoder_initializer"]["layer"]
        if not is_training:
            init_state = state["decoder"]["state"]["layer"]
        returns = deep_att_dec_rnn(params.cell, x, state["encodes"],
                                   hidden_size,
                                   init_state=init_state,
                                   mask=mask,
                                   num_heads=params.num_heads,
                                   mem_mask=state["mask"],
                                   ln=params.layer_norm,
                                   sm=params.swap_memory,
                                   depth=params.num_decoder_layer)
        (_, hidden_state), (outputs, _), contexts, attentions = returns

        if not is_training:
            state['decoder']['state']['layer'] = hidden_state

        x = outputs

        cshp = util.shape_list(contexts)
        c = tf.reshape(contexts, [cshp[0], cshp[1], cshp[2] * cshp[3]])

    feature = func.linear(tf.concat([x, c, inputs], -1),
                          params.embed_size,
                          ln=params.layer_norm,
                          scope="ff")
    feature = tf.nn.tanh(feature)
    feature = util.valid_apply_dropout(feature, params.dropout)

    if 'dev_decode' in state:
        feature = feature[:, -1, :]

    embed_name = "tgt_embedding" if params.shared_target_softmax_embedding \
        else "softmax_embedding"
    embed_name = "embedding" if params.shared_source_target_embedding \
        else embed_name
    softmax_emb = tf.get_variable(embed_name,
                                  [params.tgt_vocab.size(), params.embed_size])
    feature = tf.reshape(feature, [-1, params.embed_size])
    logits = tf.matmul(feature, softmax_emb, False, True)

    logits = tf.cast(logits, tf.float32)

    soft_label, normalizer = util.label_smooth(
        target,
        util.shape_list(logits)[-1],
        factor=params.label_smooth)
    centropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits,
        labels=soft_label)
    centropy -= normalizer
    centropy = tf.reshape(centropy, tf.shape(target))

    mask = tf.cast(mask, tf.float32)
    per_sample_loss = tf.reduce_sum(centropy * mask, -1) / tf.reduce_sum(mask, -1)
    loss = tf.reduce_mean(per_sample_loss)

    # these mask tricks mainly used to deal with zero shapes, such as [0, 1]
    loss = tf.cond(tf.equal(tf.shape(target)[0], 0),
                   lambda: tf.constant(0, dtype=tf.float32),
                   lambda: loss)

    return loss, logits, state, per_sample_loss
def fetch_states(self, x):
    with tf.variable_scope(
            "fetch_state_{}".format(self.scope or "gru")):
        g = linear(x, self.d * 2,
                   bias=False, ln=self.ln, scope="gate_x")
        h = linear(x, self.d,
                   bias=False, ln=self.ln, scope="hide_x")
    return g, h
def dot_attention(query, memory, mem_mask, hidden_size,
                  ln=False, num_heads=1, cache=None, dropout=None,
                  out_map=True, scope=None):
    """
    dotted attention model
    :param query: [batch_size, query_len, dim]
    :param memory: [batch_size, seq_len, mem_dim] or None
    :param mem_mask: [batch_size, seq_len]
    :param hidden_size: attention space dimension
    :param ln: whether use layer normalization
    :param num_heads: attention head number
    :param dropout: attention dropout, default disable
    :param out_map: output additional mapping
    :param cache: cache-based decoding
    :param scope:
    :return: a value matrix, [batch_size, query_len, mem_dim]
    """
    with tf.variable_scope(scope or "dot_attention", reuse=tf.AUTO_REUSE,
                           dtype=tf.as_dtype(dtype.floatx())):
        if memory is None:
            # suppose self-attention from queries alone
            h = func.linear(query, hidden_size * 3, ln=ln, scope="qkv_map")
            q, k, v = tf.split(h, 3, -1)

            if cache is not None:
                k = tf.concat([cache['k'], k], axis=1)
                v = tf.concat([cache['v'], v], axis=1)
                cache = {
                    'k': k,
                    'v': v,
                }
        else:
            q = func.linear(query, hidden_size, ln=ln, scope="q_map")
            if cache is not None and ('mk' in cache and 'mv' in cache):
                k, v = cache['mk'], cache['mv']
            else:
                k = func.linear(memory, hidden_size, ln=ln, scope="k_map")
                v = func.linear(memory, hidden_size, ln=ln, scope="v_map")

            if cache is not None:
                cache['mk'] = k
                cache['mv'] = v

        q = func.split_heads(q, num_heads)
        k = func.split_heads(k, num_heads)
        v = func.split_heads(v, num_heads)

        q *= (hidden_size // num_heads) ** (-0.5)

        # q * k => attention weights
        logits = tf.matmul(q, k, transpose_b=True)

        # convert the additive mask to 0-1 form and multiply it with the logits
        if mem_mask is not None:
            zero_one_mask = tf.to_float(tf.equal(mem_mask, 0.0))
            logits *= zero_one_mask

        # replace softmax with relu
        # weights = tf.nn.softmax(logits)
        weights = tf.nn.relu(logits)

        dweights = util.valid_apply_dropout(weights, dropout)

        # weights * v => attention vectors
        o = tf.matmul(dweights, v)
        o = func.combine_heads(o)

        # perform RMSNorm to stabilize running
        o = gated_rms_norm(o, scope="post")

        if out_map:
            o = func.linear(o, hidden_size, ln=ln, scope="o_map")

        results = {
            'weights': weights,
            'output': o,
            'cache': cache
        }

        return results
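# --- Illustrative sketch (not part of the model code) ---
# This variant swaps softmax for ReLU, uses a multiplicative 0/1 mask instead of
# an additive -inf bias, and re-normalizes the output with an RMS-style norm.
# A single-head NumPy sketch (the repository's gated_rms_norm additionally
# learns gain/gate parameters; a plain RMS norm stands in here):
import numpy as np

def relu_attention_sketch(q, k, v, mask):
    # q: [batch, q_len, d]; k, v: [batch, m_len, d]; mask: [batch, 1, m_len] in {0, 1}
    d = q.shape[-1]
    logits = (q * d ** -0.5) @ k.transpose(0, 2, 1)
    logits = logits * mask                    # multiplicative masking
    weights = np.maximum(logits, 0.)          # ReLU instead of softmax
    out = weights @ v
    rms = np.sqrt((out ** 2).mean(-1, keepdims=True) + 1e-8)
    return out / rms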
def decoder(target, state, params):
    mask = dtype.tf_to_float(tf.cast(target, tf.bool))
    hidden_size = params.hidden_size
    initializer = tf.random_normal_initializer(0.0, hidden_size ** -0.5)

    is_training = ('decoder' not in state)

    if is_training:
        target, mask = util.remove_invalid_seq(target, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "tgt_embedding"
    tgt_emb = tf.get_variable(embed_name,
                              [params.tgt_vocab.size(), params.embed_size],
                              initializer=initializer)
    tgt_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(tgt_emb, target) * (hidden_size ** 0.5)
    inputs = tf.nn.bias_add(inputs, tgt_bias)

    # shift
    if is_training:
        inputs = tf.pad(inputs, [[0, 0], [1, 0], [0, 0]])
        inputs = inputs[:, :-1, :]
        inputs = func.add_timing_signal(inputs)
    else:
        inputs = tf.cond(
            tf.reduce_all(tf.equal(target, params.tgt_vocab.pad())),
            lambda: tf.zeros_like(inputs),
            lambda: inputs)
        mask = tf.ones_like(mask)
        inputs = func.add_timing_signal(
            inputs, time=dtype.tf_to_float(state['time']))

    inputs = util.valid_apply_dropout(inputs, params.dropout)

    with tf.variable_scope("decoder"):
        x = inputs
        for layer in range(params.num_decoder_layer):
            if params.deep_transformer_init:
                layer_initializer = tf.variance_scaling_initializer(
                    params.initializer_gain * (layer + 1) ** -0.5,
                    mode="fan_avg",
                    distribution="uniform")
            else:
                layer_initializer = None
            with tf.variable_scope("layer_{}".format(layer),
                                   initializer=layer_initializer):
                with tf.variable_scope("average_attention"):
                    x_fwds = []
                    for strategy in params.strategies:
                        with tf.variable_scope(strategy):
                            x_fwd = average_attention_strategy(
                                strategy, x, mask, state, layer, params)
                            x_fwds.append(x_fwd)
                    x_fwd = tf.add_n(x_fwds) / len(x_fwds)

                    # FFN activation
                    if params.use_ffn:
                        y = func.ffn_layer(
                            x_fwd,
                            params.filter_size,
                            hidden_size,
                            dropout=params.relu_dropout,
                        )
                    else:
                        y = x_fwd

                    # Gating layer
                    z = func.linear(tf.concat([x, y], axis=-1),
                                    hidden_size * 2,
                                    scope="z_project")
                    i, f = tf.split(z, 2, axis=-1)
                    y = tf.sigmoid(i) * x + tf.sigmoid(f) * y

                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

                with tf.variable_scope("cross_attention"):
                    y = func.dot_attention(
                        x,
                        state['encodes'],
                        func.attention_bias(state['mask'], "masking"),
                        hidden_size,
                        num_heads=params.num_heads,
                        dropout=params.attention_dropout,
                        cache=None if is_training else
                        state['decoder']['state']['layer_{}'.format(layer)])
                    if not is_training:
                        # mk, mv
                        state['decoder']['state']['layer_{}'.format(layer)] \
                            .update(y['cache'])

                    y = y['output']
                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

                with tf.variable_scope("feed_forward"):
                    y = func.ffn_layer(
                        x,
                        params.filter_size,
                        hidden_size,
                        dropout=params.relu_dropout,
                    )

                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

    feature = x
    if 'dev_decode' in state:
        feature = x[:, -1, :]

    embed_name = "tgt_embedding" if params.shared_target_softmax_embedding \
        else "softmax_embedding"
    embed_name = "embedding" if params.shared_source_target_embedding \
        else embed_name
    softmax_emb = tf.get_variable(embed_name,
                                  [params.tgt_vocab.size(), params.embed_size],
                                  initializer=initializer)
    feature = tf.reshape(feature, [-1, params.embed_size])
    logits = tf.matmul(feature, softmax_emb, False, True)

    logits = tf.cast(logits, tf.float32)

    soft_label, normalizer = util.label_smooth(
        target,
        util.shape_list(logits)[-1],
        factor=params.label_smooth)
    centropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits,
        labels=soft_label)
    centropy -= normalizer
    centropy = tf.reshape(centropy, tf.shape(target))

    mask = tf.cast(mask, tf.float32)
    per_sample_loss = tf.reduce_sum(centropy * mask, -1) / tf.reduce_sum(mask, -1)
    loss = tf.reduce_mean(per_sample_loss)

    # these mask tricks mainly used to deal with zero shapes, such as [0, 1]
    loss = tf.cond(tf.equal(tf.shape(target)[0], 0),
                   lambda: tf.constant(0, dtype=tf.float32),
                   lambda: loss)

    return loss, logits, state, per_sample_loss
def embedding_layer(features, params):
    p = features['p']
    h = features['h']

    p_mask = tf.to_float(tf.cast(p, tf.bool))
    h_mask = tf.to_float(tf.cast(h, tf.bool))

    with tf.device('/cpu:0'):
        symbol_embeddings = tf.get_variable('special_symbol_embeddings',
                                            shape=(3, params.embed_size),
                                            trainable=True)
        embedding_initializer = tf.glorot_uniform_initializer()
        if tf.gfile.Exists(params.pretrain_word_embedding_file):
            pretrain_embedding = np.load(
                params.pretrain_word_embedding_file)['data']
            embedding_initializer = tf.constant_initializer(pretrain_embedding)
        general_embeddings = tf.get_variable(
            'general_symbol_embeddings',
            shape=(params.word_vocab.size() - 3, params.embed_size),
            initializer=embedding_initializer,
            trainable=False)
        word_embeddings = tf.concat([symbol_embeddings, general_embeddings], 0)

    p_emb = tf.nn.embedding_lookup(word_embeddings, p)
    h_emb = tf.nn.embedding_lookup(word_embeddings, h)

    p_features = [p_emb]
    h_features = [h_emb]

    if params.enable_bert:
        p_features.append(features['bert_p_enc'])
        h_features.append(features['bert_h_enc'])

    if params.use_char:
        pc = features['pc']
        hc = features['hc']

        pc_mask = tf.to_float(tf.cast(pc, tf.bool))
        hc_mask = tf.to_float(tf.cast(hc, tf.bool))

        pc = tf.reshape(pc, [-1, tf.shape(pc)[-1]])
        hc = tf.reshape(hc, [-1, tf.shape(hc)[-1]])
        pc_mask = tf.reshape(pc_mask, [-1, tf.shape(pc_mask)[-1]])
        hc_mask = tf.reshape(hc_mask, [-1, tf.shape(hc_mask)[-1]])

        with tf.device('/cpu:0'):
            char_embeddings = tf.get_variable(
                'char_embeddings',
                shape=(params.char_vocab.size(), params.char_embed_size),
                initializer=tf.glorot_uniform_initializer(),
                trainable=True)

        with tf.variable_scope('char_embedding'):
            pc_emb = tf.nn.embedding_lookup(char_embeddings, pc)
            hc_emb = tf.nn.embedding_lookup(char_embeddings, hc)
            if util.valid_dropout(params.dropout):
                pc_emb = tf.nn.dropout(pc_emb, 1. - 0.5 * params.dropout)
                hc_emb = tf.nn.dropout(hc_emb, 1. - 0.5 * params.dropout)

        with tf.variable_scope("char_encoding", reuse=tf.AUTO_REUSE):
            pc_emb = pc_emb * tf.expand_dims(pc_mask, -1)
            hc_emb = hc_emb * tf.expand_dims(hc_mask, -1)

            pc_shp = util.shape_list(features['pc'])
            pc_emb = tf.reshape(
                pc_emb,
                [pc_shp[0], pc_shp[1], pc_shp[2], params.char_embed_size])
            hc_shp = util.shape_list(features['hc'])
            hc_emb = tf.reshape(
                hc_emb,
                [hc_shp[0], hc_shp[1], hc_shp[2], params.char_embed_size])

            pc_state = func.linear(tf.reduce_max(pc_emb, 2),
                                   params.char_embed_size,
                                   scope="cmap")
            hc_state = func.linear(tf.reduce_max(hc_emb, 2),
                                   params.char_embed_size,
                                   scope="cmap")

        p_features.append(pc_state)
        h_features.append(hc_state)

    '''
    p_emb = func.highway(tf.concat(p_features, axis=2),
                         size=params.hidden_size,
                         dropout=params.dropout,
                         num_layers=2,
                         scope='highway')
    h_emb = func.highway(tf.concat(h_features, axis=2),
                         size=params.hidden_size,
                         dropout=params.dropout,
                         num_layers=2,
                         scope='highway')
    '''
    p_emb = tf.concat(p_features, axis=2)
    h_emb = tf.concat(h_features, axis=2)

    p_emb = p_emb * tf.expand_dims(p_mask, -1)
    h_emb = h_emb * tf.expand_dims(h_mask, -1)

    features.update({'p_emb': p_emb,
                     'h_emb': h_emb,
                     'p_mask': p_mask,
                     'h_mask': h_mask,
                     })

    return features
def decoder(target, state, params):
    mask = tf.to_float(tf.cast(target, tf.bool))
    hidden_size = params.hidden_size

    if 'decoder' not in state:
        target, mask = util.remove_invalid_seq(target, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "tgt_embedding"
    tgt_emb = tf.get_variable(embed_name,
                              [params.tgt_vocab.size(), params.embed_size])
    tgt_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(tgt_emb, target)
    inputs = tf.nn.bias_add(inputs, tgt_bias)

    # shift
    if 'decoder' not in state:
        inputs = tf.pad(inputs, [[0, 0], [1, 0], [0, 0]])
        inputs = inputs[:, :-1, :]
    else:
        inputs = tf.cond(
            tf.reduce_all(tf.equal(target, params.tgt_vocab.pad())),
            lambda: tf.zeros_like(inputs),
            lambda: inputs)
        mask = tf.ones_like(mask)

    if util.valid_dropout(params.dropout):
        inputs = tf.nn.dropout(inputs, 1. - params.dropout)

    with tf.variable_scope("decoder"):
        x = inputs
        for layer in range(params.num_decoder_layer):
            with tf.variable_scope("layer_{}".format(layer)):
                init_state = state["decoder_initializer"][
                    "layer_{}".format(layer)]
                if 'decoder' in state:
                    init_state = state["decoder"]["state"][
                        "layer_{}".format(layer)]
                if layer == 0 or params.use_deep_att:
                    returns = rnn.cond_rnn(params.cell, x, state["encodes"],
                                           hidden_size,
                                           init_state=init_state,
                                           mask=mask,
                                           num_heads=params.num_heads,
                                           mem_mask=state["mask"],
                                           ln=params.layer_norm,
                                           sm=params.swap_memory,
                                           one2one=False,
                                           dp=params.dropout)
                    (_, hidden_state), (outputs, _), contexts, attentions = returns
                    c = contexts
                else:
                    if params.caencoder:
                        returns = rnn.cond_rnn(params.cell, x, c,
                                               hidden_size,
                                               init_state=init_state,
                                               mask=mask,
                                               mem_mask=mask,
                                               ln=params.layer_norm,
                                               sm=params.swap_memory,
                                               num_heads=params.num_heads,
                                               one2one=True,
                                               dp=params.dropout)
                        (_, hidden_state), (outputs, _), contexts, attentions = returns
                    else:
                        outputs = rnn.rnn(params.cell,
                                          tf.concat([x, c], -1),
                                          hidden_size,
                                          mask=mask,
                                          init_state=init_state,
                                          ln=params.layer_norm,
                                          sm=params.swap_memory,
                                          dp=params.dropout)
                        outputs, hidden_state = outputs[1]
                if 'decoder' in state:
                    state['decoder']['state'][
                        'layer_{}'.format(layer)] = hidden_state

                y = func.linear(outputs, hidden_size, ln=False, scope="ff")

                # short cut via residual connection
                if x.get_shape()[-1].value == y.get_shape()[-1].value:
                    x = func.residual_fn(x, y, dropout=params.dropout)
                else:
                    x = y
                if params.layer_norm:
                    x = func.layer_norm(x, scope="ln")

        feature = func.linear(tf.concat([x, c], -1), params.embed_size,
                              ln=params.layer_norm, scope="ff")
        feature = tf.nn.tanh(feature)

        if util.valid_dropout(params.dropout):
            feature = tf.nn.dropout(feature, 1. - params.dropout)

        if 'dev_decode' in state:
            feature = x[:, -1, :]

        embed_name = "tgt_embedding" if params.shared_target_softmax_embedding \
            else "softmax_embedding"
        embed_name = "embedding" if params.shared_source_target_embedding \
            else embed_name
        softmax_emb = tf.get_variable(embed_name,
                                      [params.tgt_vocab.size(),
                                       params.embed_size])
        feature = tf.reshape(feature, [-1, params.embed_size])
        logits = tf.matmul(feature, softmax_emb, False, True)

        soft_label, normalizer = util.label_smooth(
            target,
            util.shape_list(logits)[-1],
            factor=params.label_smooth)
        centropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits,
            labels=soft_label)
        centropy -= normalizer
        centropy = tf.reshape(centropy, tf.shape(target))

        loss = tf.reduce_sum(centropy * mask, -1) / tf.reduce_sum(mask, -1)
        loss = tf.reduce_mean(loss)

        # these mask tricks mainly used to deal with zero shapes, such as [0, 1]
        loss = tf.cond(tf.equal(tf.shape(target)[0], 0),
                       lambda: tf.constant(0, dtype=tf.float32),
                       lambda: loss)

    return loss, logits, state
def dot_attention(query, memory, mem_mask, hidden_size,
                  ln=False, num_heads=1, cache=None, dropout=None,
                  use_relative_pos=False, max_relative_position=16,
                  out_map=True, scope=None, fuse_mask=None,
                  decode_step=None, numblocks=None):
    """
    dotted attention model
    :param query: [batch_size, query_len, dim]
    :param memory: [batch_size, seq_len, mem_dim] or None
    :param mem_mask: [batch_size, seq_len]
    :param hidden_size: attention space dimension
    :param ln: whether use layer normalization
    :param num_heads: attention head number
    :param dropout: attention dropout, default disable
    :param out_map: output additional mapping
    :param cache: cache-based decoding
    :param fuse_mask: aan mask during training, and timestep for testing
    :param max_relative_position: maximum position considered for relative embedding
    :param use_relative_pos: whether use relative position information
    :param decode_step: the time step of current decoding, 0-based
    :param numblocks: size of 'L' in fixup paper
    :param scope:
    :return: a value matrix, [batch_size, query_len, mem_dim]
    """
    with tf.variable_scope(scope or "dot_attention", reuse=tf.AUTO_REUSE,
                           dtype=tf.as_dtype(dtype.floatx())) as scope:
        if fuse_mask is not None:
            assert memory is not None, 'Fuse mechanism only applied with cross-attention'
        if cache and use_relative_pos:
            assert decode_step is not None, 'Decode step must be provided when using relative position encoding'

        assert numblocks is not None, 'Fixup requires the total model depth L'
        scale_base = 6. if fuse_mask is None else 8.
        in_initializer = initializer.scale_initializer(
            math.pow(numblocks, -1. / scale_base), scope.initializer)

        if memory is None:
            # suppose self-attention from queries alone
            h = func.linear(query, hidden_size * 3, ln=ln, scope="qkv_map",
                            weight_initializer=in_initializer, bias=False)
            q, k, v = tf.split(h, 3, -1)

            if cache is not None:
                k = tf.concat([cache['k'], k], axis=1)
                v = tf.concat([cache['v'], v], axis=1)
                cache = {
                    'k': k,
                    'v': v,
                }
        else:
            q = func.linear(query, hidden_size, ln=ln, scope="q_map",
                            weight_initializer=in_initializer, bias=False)
            if cache is not None and ('mk' in cache and 'mv' in cache):
                k, v = cache['mk'], cache['mv']
            else:
                k = func.linear(memory, hidden_size, ln=ln, scope="k_map",
                                weight_initializer=in_initializer, bias=False)
                v = func.linear(memory, hidden_size, ln=ln, scope="v_map",
                                weight_initializer=in_initializer, bias=False)

            if cache is not None:
                cache['mk'] = k
                cache['mv'] = v

        q = func.split_heads(q, num_heads)
        k = func.split_heads(k, num_heads)
        v = func.split_heads(v, num_heads)

        q *= (hidden_size // num_heads) ** (-0.5)

        q_shp = util.shape_list(q)
        k_shp = util.shape_list(k)
        v_shp = util.shape_list(v)

        q_len = q_shp[2] if decode_step is None else decode_step + 1
        r_lst = None if decode_step is None else 1

        # q * k => attention weights
        if use_relative_pos:
            r = rpr.get_relative_positions_embeddings(
                q_len, k_shp[2], k_shp[3],
                max_relative_position, name="rpr_keys", last=r_lst)
            logits = rpr.relative_attention_inner(q, k, r, transpose=True)
        else:
            logits = tf.matmul(q, k, transpose_b=True)

        if mem_mask is not None:
            logits += mem_mask

        weights = tf.nn.softmax(logits)

        dweights = util.valid_apply_dropout(weights, dropout)

        # weights * v => attention vectors
        if use_relative_pos:
            r = rpr.get_relative_positions_embeddings(
                q_len, k_shp[2], v_shp[3],
                max_relative_position, name="rpr_values", last=r_lst)
            o = rpr.relative_attention_inner(dweights, v, r, transpose=False)
        else:
            o = tf.matmul(dweights, v)

        o = func.combine_heads(o)

        if fuse_mask is not None:
            # This is for AAN, the important part is sharing v_map
            v_q = func.linear(query, hidden_size, ln=ln, scope="v_map",
                              weight_initializer=in_initializer, bias=False)

            if cache is not None and 'aan' in cache:
                aan_o = (v_q + cache['aan']) / dtype.tf_to_float(fuse_mask + 1)
            else:
                # Simplified Average Attention Network
                aan_o = tf.matmul(fuse_mask, v_q)

            if cache is not None:
                if 'aan' not in cache:
                    cache['aan'] = v_q
                else:
                    cache['aan'] = v_q + cache['aan']

            # Directly sum both self-attention and cross-attention
            o = o + aan_o

        if out_map:
            o = func.linear(o, hidden_size, ln=ln, scope="o_map",
                            weight_initializer=tf.zeros_initializer(),
                            bias=False)

        results = {
            'weights': weights,
            'output': o,
            'cache': cache
        }

        return results
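# --- Illustrative sketch (not part of the model code) ---
# In the AAN branch, training-time averaging is a single matmul with fuse_mask,
# a lower-triangular matrix whose row t averages positions 0..t; at decoding
# time the same average is maintained incrementally through cache['aan'] and
# divided by (fuse_mask + 1), i.e. the 1-based step count. A NumPy sketch of
# such a mask (the name aan_fuse_mask_sketch is illustrative):
import numpy as np

def aan_fuse_mask_sketch(length):
    tril = np.tril(np.ones((length, length)))
    return tril / tril.sum(-1, keepdims=True)   # row t: uniform weights over 0..t

v_q = np.arange(12, dtype=float).reshape(1, 4, 3)   # [batch, time, dim]
cum_avg = aan_fuse_mask_sketch(4)[None] @ v_q       # cumulative averages per step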
def dot_attention(query, memory, mem_mask, hidden_size,
                  ln=False, num_heads=1, cache=None, dropout=None,
                  out_map=True, scope=None, count_mask=None):
    """
    dotted attention model with l0drop
    :param query: [batch_size, query_len, dim]
    :param memory: [batch_size, seq_len, mem_dim] or None
    :param mem_mask: [batch_size, seq_len]
    :param hidden_size: attention space dimension
    :param ln: whether use layer normalization
    :param num_heads: attention head number
    :param dropout: attention dropout, default disable
    :param out_map: output additional mapping
    :param cache: cache-based decoding
    :param count_mask: counting vector for l0drop
    :param scope:
    :return: a value matrix, [batch_size, query_len, mem_dim]
    """
    with tf.variable_scope(scope or "dot_attention", reuse=tf.AUTO_REUSE,
                           dtype=tf.as_dtype(dtype.floatx())):
        if memory is None:
            # suppose self-attention from queries alone
            h = func.linear(query, hidden_size * 3, ln=ln, scope="qkv_map")
            q, k, v = tf.split(h, 3, -1)

            if cache is not None:
                k = tf.concat([cache['k'], k], axis=1)
                v = tf.concat([cache['v'], v], axis=1)
                cache = {
                    'k': k,
                    'v': v,
                }
        else:
            q = func.linear(query, hidden_size, ln=ln, scope="q_map")
            if cache is not None and ('mk' in cache and 'mv' in cache):
                k, v = cache['mk'], cache['mv']
            else:
                k = func.linear(memory, hidden_size, ln=ln, scope="k_map")
                v = func.linear(memory, hidden_size, ln=ln, scope="v_map")

            if cache is not None:
                cache['mk'] = k
                cache['mv'] = v

        q = func.split_heads(q, num_heads)
        k = func.split_heads(k, num_heads)
        v = func.split_heads(v, num_heads)

        q *= (hidden_size // num_heads) ** (-0.5)

        # q * k => attention weights
        logits = tf.matmul(q, k, transpose_b=True)

        if mem_mask is not None:
            logits += mem_mask

        # modifying 'weights = tf.nn.softmax(logits)' to include the counting information.
        # --------
        logits = logits - tf.reduce_max(logits, -1, keepdims=True)
        exp_logits = tf.exp(logits)

        # basically, the count considers how many states are dropped (i.e. gate value 0s)
        if count_mask is not None:
            exp_logits *= count_mask

        exp_sum_logits = tf.reduce_sum(exp_logits, -1, keepdims=True)
        weights = exp_logits / exp_sum_logits
        # --------

        dweights = util.valid_apply_dropout(weights, dropout)

        # weights * v => attention vectors
        o = tf.matmul(dweights, v)
        o = func.combine_heads(o)

        if out_map:
            o = func.linear(o, hidden_size, ln=ln, scope="o_map")

        results = {
            'weights': weights,
            'output': o,
            'cache': cache
        }

        return results
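# --- Illustrative sketch (not part of the model code) ---
# With L0Drop, pruned source states are removed, and each kept state carries a
# count reflecting the dropped states it absorbs (see the comment above); the
# softmax then scales the exponentiated scores by that count. A NumPy sketch of
# the re-weighted softmax, simplified to a single head:
import numpy as np

def counted_softmax_sketch(logits, count_mask):
    # logits: [batch, q_len, m_len]; count_mask: [batch, 1, m_len]
    logits = logits - logits.max(-1, keepdims=True)
    exp = np.exp(logits) * count_mask        # scale each key by its count
    return exp / exp.sum(-1, keepdims=True)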
def decoder(target, state, params):
    mask = dtype.tf_to_float(tf.cast(target, tf.bool))
    hidden_size = params.hidden_size
    initializer = tf.random_normal_initializer(0.0, hidden_size ** -0.5)

    is_training = ('decoder' not in state)

    if is_training:
        target, mask = util.remove_invalid_seq(target, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "tgt_embedding"
    tgt_emb = tf.get_variable(embed_name,
                              [params.tgt_vocab.size(), params.embed_size],
                              initializer=initializer)
    tgt_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(tgt_emb, target) * (hidden_size ** 0.5)
    inputs = tf.nn.bias_add(inputs, tgt_bias)

    # shift
    if is_training:
        inputs = tf.pad(inputs, [[0, 0], [1, 0], [0, 0]])
        inputs = inputs[:, :-1, :]
        inputs = func.add_timing_signal(inputs)
    else:
        inputs = tf.cond(
            tf.reduce_all(tf.equal(target, params.tgt_vocab.pad())),
            lambda: tf.zeros_like(inputs),
            lambda: inputs)
        mask = tf.ones_like(mask)
        inputs = func.add_timing_signal(
            inputs, time=dtype.tf_to_float(state['time']))

    inputs = util.valid_apply_dropout(inputs, params.dropout)

    # Applying L0Drop
    # --------
    source_memory = state["encodes"]
    source_mask = state["mask"]

    # source_pruning: log alpha_i = x_i w^T
    source_pruning = func.linear(source_memory, 1, scope="source_pruning")

    if is_training:  # training
        source_memory, l0_mask = l0norm.var_train(
            (source_memory, source_pruning))
        l0_norm_loss = tf.squeeze(l0norm.l0_norm(source_pruning), -1)
        l0_norm_loss = tf.reduce_sum(l0_norm_loss * source_mask, -1) / \
            tf.reduce_sum(source_mask, -1)
        l0_norm_loss = tf.reduce_mean(l0_norm_loss)
        l0_norm_loss = l0norm.l0_regularization_loss(
            l0_norm_loss,
            reg_scalar=params.l0_norm_reg_scalar,
            start_reg_ramp_up=params.l0_norm_start_reg_ramp_up,
            end_reg_ramp_up=params.l0_norm_end_reg_ramp_up,
            warm_up=params.l0_norm_warm_up,
        )

        # force the model to only attend to unmasked position
        source_mask = dtype.tf_to_float(
            tf.cast(tf.squeeze(l0_mask, -1), tf.bool)) * source_mask
    else:  # evaluation
        source_memory, l0_mask = l0norm.var_eval(
            (source_memory, source_pruning))
        l0_norm_loss = 0.0

        source_memory, source_mask, count_mask = extract_encodes(
            source_memory, source_mask, l0_mask)
        count_mask = tf.expand_dims(tf.expand_dims(count_mask, 1), 1)
    # --------

    with tf.variable_scope("decoder"):
        x = inputs
        for layer in range(params.num_decoder_layer):
            if params.deep_transformer_init:
                layer_initializer = tf.variance_scaling_initializer(
                    params.initializer_gain * (layer + 1) ** -0.5,
                    mode="fan_avg",
                    distribution="uniform")
            else:
                layer_initializer = None
            with tf.variable_scope("layer_{}".format(layer),
                                   initializer=layer_initializer):
                with tf.variable_scope("self_attention"):
                    y = func.dot_attention(
                        x,
                        None,
                        func.attention_bias(tf.shape(mask)[1], "causal"),
                        hidden_size,
                        num_heads=params.num_heads,
                        dropout=params.attention_dropout,
                        cache=None if is_training else
                        state['decoder']['state']['layer_{}'.format(layer)])
                    if not is_training:
                        # k, v
                        state['decoder']['state']['layer_{}'.format(layer)] \
                            .update(y['cache'])

                    y = y['output']
                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

                with tf.variable_scope("cross_attention"):
                    if is_training:
                        y = func.dot_attention(
                            x,
                            source_memory,
                            func.attention_bias(source_mask, "masking"),
                            hidden_size,
                            num_heads=params.num_heads,
                            dropout=params.attention_dropout,
                        )
                    else:
                        y = dot_attention(
                            x,
                            source_memory,
                            func.attention_bias(source_mask, "masking"),
                            hidden_size,
                            count_mask=count_mask,
                            num_heads=params.num_heads,
                            dropout=params.attention_dropout,
                            cache=state['decoder']['state'][
                                'layer_{}'.format(layer)])

                        # mk, mv
                        state['decoder']['state']['layer_{}'.format(layer)] \
                            .update(y['cache'])

                    y = y['output']
                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

                with tf.variable_scope("feed_forward"):
                    y = func.ffn_layer(
                        x,
                        params.filter_size,
                        hidden_size,
                        dropout=params.relu_dropout,
                    )

                    x = func.residual_fn(x, y, dropout=params.residual_dropout)
                    x = func.layer_norm(x)

    feature = x
    if 'dev_decode' in state:
        feature = x[:, -1, :]

    embed_name = "tgt_embedding" if params.shared_target_softmax_embedding \
        else "softmax_embedding"
    embed_name = "embedding" if params.shared_source_target_embedding \
        else embed_name
    softmax_emb = tf.get_variable(embed_name,
                                  [params.tgt_vocab.size(), params.embed_size],
                                  initializer=initializer)
    feature = tf.reshape(feature, [-1, params.embed_size])
    logits = tf.matmul(feature, softmax_emb, False, True)

    logits = tf.cast(logits, tf.float32)

    soft_label, normalizer = util.label_smooth(
        target,
        util.shape_list(logits)[-1],
        factor=params.label_smooth)
    centropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits,
        labels=soft_label)
    centropy -= normalizer
    centropy = tf.reshape(centropy, tf.shape(target))

    mask = tf.cast(mask, tf.float32)
    per_sample_loss = tf.reduce_sum(centropy * mask, -1) / tf.reduce_sum(mask, -1)
    loss = tf.reduce_mean(per_sample_loss)

    loss = loss + l0_norm_loss

    # these mask tricks mainly used to deal with zero shapes, such as [0, 1]
    loss = tf.cond(tf.equal(tf.shape(target)[0], 0),
                   lambda: tf.constant(0, tf.float32),
                   lambda: loss)

    return loss, logits, state, per_sample_loss
def cond_rnn(cell_name, x, memory, d, init_state=None, mask=None,
             mem_mask=None, ln=False, sm=True, one2one=False):
    """Self-implemented conditional RNN procedure, supporting mask trick"""
    # cell_name: gru, lstm or atr
    # x: input sequence embedding matrix, [batch, seq_len, dim]
    # memory: the conditional part
    # d: hidden dimension for rnn
    # mask: mask matrix, [batch, seq_len]
    # mem_mask: memory mask matrix, [batch, mem_seq_len]
    # ln: whether use layer normalization
    # init_state: the initial hidden states, for cache purpose
    # sm: whether apply swap memory during rnn scan
    # one2one: whether the memory is one-to-one mapping for x

    in_shape = util.shape_list(x)
    batch_size, time_steps = in_shape[:2]
    mem_shape = util.shape_list(memory)

    cell_lower = get_cell(cell_name, d, ln=ln,
                          scope="{}_lower".format(cell_name))
    cell_higher = get_cell(cell_name, d, ln=ln,
                           scope="{}_higher".format(cell_name))

    if init_state is None:
        init_state = cell_lower.get_init_state(shape=[batch_size])
    if mask is None:
        mask = tf.ones([batch_size, time_steps], tf.float32)
    if mem_mask is None:
        mem_mask = tf.ones([batch_size, mem_shape[1]], tf.float32)

    # prepare projected encodes and inputs
    cache_inputs = cell_lower.fetch_states(x)
    cache_inputs = [tf.transpose(v, [1, 0, 2]) for v in list(cache_inputs)]
    if not one2one:
        proj_memories = linear(memory, mem_shape[-1], bias=False,
                               ln=ln, scope="context_att")
    else:
        cache_memories = cell_higher.fetch_states(memory)
        cache_memories = [
            tf.transpose(v, [1, 0, 2]) for v in list(cache_memories)
        ]
    mask_ta = tf.transpose(tf.expand_dims(mask, -1), [1, 0, 2])
    init_context = tf.zeros([batch_size, mem_shape[-1]], tf.float32)
    init_weight = tf.zeros([batch_size, mem_shape[1]], tf.float32)
    mask_pos = len(cache_inputs)

    def _step_fn(prev, x):
        t, h_, c_, a_ = prev

        if not one2one:
            m, v = x[mask_pos], x[:mask_pos]
        else:
            c, c_c, m, v = x[-1], x[mask_pos + 1:-1], x[mask_pos], x[:mask_pos]
        s = cell_lower(h_, v)
        s = m * s + (1. - m) * h_

        if not one2one:
            a, c = additive_attention(cell_lower.get_hidden(s), memory,
                                      mem_mask, mem_shape[-1],
                                      ln=ln,
                                      proj_memory=proj_memories,
                                      scope="attention")
            c_c = cell_higher.fetch_states(c)
        else:
            a = tf.tile(tf.expand_dims(tf.range(time_steps), 0),
                        [batch_size, 1])
            a = tf.to_float(tf.equal(a, t))
            a = tf.reshape(a, tf.shape(init_weight))

        h = cell_higher(s, c_c)
        h = m * h + (1. - m) * s

        return t + 1, h, c, a

    time = tf.constant(0, dtype=tf.int32, name="time")
    step_states = (time, init_state, init_context, init_weight)
    step_vars = cache_inputs + [mask_ta]
    if one2one:
        step_vars += cache_memories + [memory]

    outputs = tf.scan(_step_fn, step_vars,
                      initializer=step_states,
                      parallel_iterations=32,
                      swap_memory=sm)

    output_ta = outputs[1]
    context_ta = outputs[2]
    attention_ta = outputs[3]

    outputs = tf.transpose(output_ta, [1, 0, 2])
    output_states = outputs[:, -1]
    contexts = tf.transpose(context_ta, [1, 0, 2])
    attentions = tf.transpose(attention_ta, [1, 0, 2])

    return (outputs, output_states), \
           (cell_higher.get_hidden(outputs),
            cell_higher.get_hidden(output_states)), \
           contexts, attentions
def decoder(target, state, params):
    mask = tf.to_float(tf.cast(target, tf.bool))
    hidden_size = params.hidden_size

    if 'decoder' not in state:
        target, mask = util.remove_invalid_seq(target, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "tgt_embedding"
    tgt_emb = tf.get_variable(embed_name,
                              [params.tgt_vocab.size(), params.embed_size])
    tgt_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(tgt_emb, target)
    inputs = tf.nn.bias_add(inputs, tgt_bias)

    # shift
    if 'decoder' not in state:
        inputs = tf.pad(inputs, [[0, 0], [1, 0], [0, 0]])
        inputs = inputs[:, :-1, :]
    else:
        inputs = tf.cond(
            tf.reduce_all(tf.equal(target, params.tgt_vocab.pad())),
            lambda: tf.zeros_like(inputs),
            lambda: inputs)
        mask = tf.ones_like(mask)

    if util.valid_dropout(params.dropout):
        inputs = tf.nn.dropout(inputs, 1. - params.dropout)

    with tf.variable_scope("decoder"):
        init_state = state["decoder_initializer"]
        if 'decoder' in state:
            init_state = state["decoder"]["state"]

        returns = rnn.cond_rnn(params.cell, inputs, state["encodes"],
                               hidden_size,
                               init_state=init_state,
                               mask=mask,
                               mem_mask=state["mask"],
                               ln=params.layer_norm,
                               sm=params.swap_memory,
                               one2one=False)
        (hidden_states, _), (outputs, _), contexts, attentions = returns

    feature = linear([outputs, contexts, inputs], params.embed_size,
                     ln=params.layer_norm, scope="pre_logits")
    feature = tf.tanh(feature)
    if util.valid_dropout(params.dropout):
        feature = tf.nn.dropout(feature, 1. - params.dropout)

    embed_name = "tgt_embedding" if params.shared_target_softmax_embedding \
        else "softmax_embedding"
    embed_name = "embedding" if params.shared_source_target_embedding \
        else embed_name
    softmax_emb = tf.get_variable(embed_name,
                                  [params.tgt_vocab.size(), params.embed_size])
    feature = tf.reshape(feature, [-1, params.embed_size])
    logits = tf.matmul(feature, softmax_emb, False, True)

    centropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits,
        labels=util.label_smooth(
            target,
            util.shape_list(logits)[-1],
            factor=params.label_smooth)
    )
    centropy = tf.reshape(centropy, tf.shape(target))

    loss = tf.reduce_sum(centropy * mask, -1) / tf.reduce_sum(mask, -1)
    loss = tf.reduce_mean(loss)

    # these mask tricks mainly used to deal with zero shapes, such as [0, 1]
    loss = tf.cond(tf.equal(tf.shape(target)[0], 0),
                   lambda: tf.constant(0, dtype=tf.float32),
                   lambda: loss)

    if 'decoder' in state:
        state['decoder']['state'] = hidden_states

    return loss, logits, state