def tensor2vector(tensor, hidden_size, mask=None, init=None, use_ln=False, dropout=0.1, scope="vecatt"):
    """Pool a sequence of vectors into a single vector with additive attention.

    Args:
        tensor: float tensor, assumed [batch, time, dim] — TODO confirm.
        hidden_size: width of the attention hidden layer.
        mask: optional [batch, time] float mask (1 = keep); defaults to all-ones.
        init: optional query vector; tiled over time and added to the score net.
        use_ln: apply layer normalization inside the tensor projection.
        dropout: dropout rate (applied as keep_prob = 1 - dropout).
        scope: variable scope name.

    Returns:
        (pooled [batch, dim], masked attention logits [batch, time]).
    """
    with tf.variable_scope(scope):
        if util.valid_dropout(dropout):
            tensor = tf.nn.dropout(tensor, 1. - dropout)

        if init is None:
            # score network conditioned on the sequence alone
            hidden = func.linear(tensor, hidden_size, ln=use_ln, scope="m_tensor")
        else:
            # additionally condition on `init`, broadcast along the time axis
            init = util.expand_tile_dims(init, tf.shape(tensor)[1], 1)
            if util.valid_dropout(dropout):
                init = tf.nn.dropout(init, 1. - dropout)
            hidden = func.linear(tensor, hidden_size, ln=use_ln, scope="m_tensor") \
                + func.linear(init, hidden_size, scope="m_init")
        hidden = tf.nn.tanh(hidden)

        # NOTE: scope name "sore" (sic, presumably "score") is kept verbatim
        # for checkpoint compatibility.
        logits = func.linear(hidden, 1, bias=False, scope="sore")
        logits = tf.squeeze(logits, -1)

        if mask is None:
            mask = tf.ones(
                [tf.shape(tensor)[0], tf.shape(tensor)[1]], tf.float32)
        # push padded positions to -inf before the softmax
        logits += (1. - mask) * (-1e9)

        weights = tf.nn.softmax(logits)
        pooled = tf.reduce_sum(tf.expand_dims(weights, 2) * tensor, axis=1)
        return pooled, logits
def encoder(source, params):
    """Bidirectional RNN encoder.

    Embeds `source` token ids, runs a forward RNN and either an independent
    backward RNN or a conditional one (`params.caencoder`), and derives the
    decoder's initial state from the final encoder feature.

    Returns a dict with "encodes", "decoder_initializer" and "mask".
    """
    hidden_size = params.hidden_size

    # non-zero ids are real tokens; drop sequences/positions that are all padding
    mask = tf.to_float(tf.cast(source, tf.bool))
    source, mask = util.remove_invalid_seq(source, mask)

    embed_name = ("embedding" if params.shared_source_target_embedding
                  else "src_embedding")
    src_emb = tf.get_variable(embed_name,
                              [params.src_vocab.size(), params.embed_size])
    src_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.nn.bias_add(tf.gather(src_emb, source), src_bias)
    if util.valid_dropout(params.dropout):
        inputs = tf.nn.dropout(inputs, 1. - params.dropout)

    with tf.variable_scope("encoder"):
        # forward rnn
        with tf.variable_scope('forward'):
            fwd = rnn.rnn(params.cell, inputs, hidden_size, mask=mask,
                          ln=params.layer_norm, sm=params.swap_memory)
            fwd_out, fwd_state = fwd[1]

        # backward rnn over the time-reversed sequence
        with tf.variable_scope('backward'):
            rev_inputs = tf.reverse(inputs, [1])
            rev_mask = tf.reverse(mask, [1])
            if params.caencoder:
                # condition the backward pass on the (reversed) forward outputs
                bwd = rnn.cond_rnn(params.cell, rev_inputs,
                                   tf.reverse(fwd_out, [1]), hidden_size,
                                   mask=rev_mask, ln=params.layer_norm,
                                   sm=params.swap_memory, one2one=True)
            else:
                bwd = rnn.rnn(params.cell, rev_inputs, hidden_size,
                              mask=rev_mask, ln=params.layer_norm,
                              sm=params.swap_memory)
            bwd_out, bwd_state = bwd[1]
            # restore original time order
            bwd_out = tf.reverse(bwd_out, [1])

        if params.caencoder:
            # the conditional backward pass already saw the forward direction
            source_encodes = bwd_out
            source_feature = bwd_state
        else:
            source_encodes = tf.concat([fwd_out, bwd_out], -1)
            source_feature = tf.concat([fwd_state, bwd_state], -1)

        with tf.variable_scope("decoder_initializer"):
            cell = rnn.get_cell(params.cell, hidden_size, ln=params.layer_norm)
            decoder_init = tf.tanh(cell.get_init_state(x=source_feature))

    return {
        "encodes": source_encodes,
        "decoder_initializer": decoder_init,
        "mask": mask
    }
def embedding_layer(features, params):
    """Build word/char/BERT embeddings for a premise-hypothesis pair.

    Reads token-id tensors features['p'] and features['h'] (presumably
    [batch, time] int ids — TODO confirm against the input pipeline) and
    writes 'p_emb', 'h_emb', 'p_mask', 'h_mask' back into `features`.
    """
    p = features['p']
    h = features['h']
    # id 0 is treated as padding: non-zero ids yield mask 1.0
    p_mask = tf.to_float(tf.cast(p, tf.bool))
    h_mask = tf.to_float(tf.cast(h, tf.bool))

    # embeddings pinned to CPU (large lookup tables)
    with tf.device('/cpu:0'):
        # ids 0..2 are special symbols with their own trainable embeddings;
        # they are concatenated in front of the general table below
        symbol_embeddings = tf.get_variable('special_symbol_embeddings',
                                            shape=(3, params.embed_size),
                                            trainable=True)
        embedding_initializer = tf.glorot_uniform_initializer()
        # load pretrained word vectors when the file exists; the general
        # table is frozen (trainable=False) either way
        if tf.gfile.Exists(params.pretrain_word_embedding_file):
            pretrain_embedding = np.load(params.pretrain_word_embedding_file)['data']
            embedding_initializer = tf.constant_initializer(pretrain_embedding)
        general_embeddings = tf.get_variable('general_symbol_embeddings',
                                             shape=(params.word_vocab.size() - 3, params.embed_size),
                                             initializer=embedding_initializer,
                                             trainable=False)
        word_embeddings = tf.concat([symbol_embeddings, general_embeddings], 0)
        p_emb = tf.nn.embedding_lookup(word_embeddings, p)
        h_emb = tf.nn.embedding_lookup(word_embeddings, h)

    # feature lists are concatenated along the channel axis at the end
    p_features = [p_emb]
    h_features = [h_emb]

    if params.enable_bert:
        # precomputed BERT encodings supplied by the input pipeline
        p_features.append(features['bert_p_enc'])
        h_features.append(features['bert_h_enc'])

    if params.use_char:
        pc = features['pc']
        hc = features['hc']
        pc_mask = tf.to_float(tf.cast(pc, tf.bool))
        hc_mask = tf.to_float(tf.cast(hc, tf.bool))

        # flatten [batch, time, char] -> [batch*time, char] for the lookup
        pc = tf.reshape(pc, [-1, tf.shape(pc)[-1]])
        hc = tf.reshape(hc, [-1, tf.shape(hc)[-1]])
        pc_mask = tf.reshape(pc_mask, [-1, tf.shape(pc_mask)[-1]])
        hc_mask = tf.reshape(hc_mask, [-1, tf.shape(hc_mask)[-1]])

        with tf.device('/cpu:0'):
            char_embeddings = tf.get_variable('char_embeddings',
                                              shape=(params.char_vocab.size(), params.char_embed_size),
                                              initializer=tf.glorot_uniform_initializer(),
                                              trainable=True)
            with tf.variable_scope('char_embedding'):
                pc_emb = tf.nn.embedding_lookup(char_embeddings, pc)
                hc_emb = tf.nn.embedding_lookup(char_embeddings, hc)
                # NOTE: char dropout is deliberately halved vs. params.dropout
                if util.valid_dropout(params.dropout):
                    pc_emb = tf.nn.dropout(pc_emb, 1. - 0.5 * params.dropout)
                    hc_emb = tf.nn.dropout(hc_emb, 1. - 0.5 * params.dropout)

        # AUTO_REUSE: premise and hypothesis share the "cmap" projection
        with tf.variable_scope("char_encoding", reuse=tf.AUTO_REUSE):
            # zero out embeddings of padded characters before max-pooling
            pc_emb = pc_emb * tf.expand_dims(pc_mask, -1)
            hc_emb = hc_emb * tf.expand_dims(hc_mask, -1)

            # restore [batch, time, char, dim] using the original (unflattened) shapes
            pc_shp = util.shape_list(features['pc'])
            pc_emb = tf.reshape(pc_emb, [pc_shp[0], pc_shp[1], pc_shp[2], params.char_embed_size])
            hc_shp = util.shape_list(features['hc'])
            hc_emb = tf.reshape(hc_emb, [hc_shp[0], hc_shp[1], hc_shp[2], params.char_embed_size])

            # max-pool over characters, then a shared linear map per token
            pc_state = func.linear(tf.reduce_max(pc_emb, 2),
                                   params.char_embed_size, scope="cmap")
            hc_state = func.linear(tf.reduce_max(hc_emb, 2),
                                   params.char_embed_size, scope="cmap")

        p_features.append(pc_state)
        h_features.append(hc_state)

    # disabled alternative: highway network over the concatenated features
    '''
    p_emb = func.highway(tf.concat(p_features, axis=2),
                         size=params.hidden_size,
                         dropout=params.dropout,
                         num_layers=2,
                         scope='highway')
    h_emb = func.highway(tf.concat(h_features, axis=2),
                         size=params.hidden_size,
                         dropout=params.dropout,
                         num_layers=2,
                         scope='highway')
    '''
    p_emb = tf.concat(p_features, axis=2)
    h_emb = tf.concat(h_features, axis=2)

    # zero out embeddings at padded token positions
    p_emb = p_emb * tf.expand_dims(p_mask, -1)
    h_emb = h_emb * tf.expand_dims(h_mask, -1)

    features.update({'p_emb': p_emb,
                     'h_emb': h_emb,
                     'p_mask': p_mask,
                     'h_mask': h_mask,
                     })
    return features
def encoder(source, params):
    """Deep (multi-layer) residual RNN encoder.

    Layer 0 is bidirectional (plain or conditional/"caencoder" backward pass);
    layers 1..N-1 are forward-only. Each layer's output goes through a linear
    "ff" projection, a residual connection when widths match, and optional
    layer norm. Returns encodings plus per-decoder-layer initial states.
    """
    # non-zero ids are real tokens; drop all-padding positions
    mask = tf.to_float(tf.cast(source, tf.bool))
    hidden_size = params.hidden_size
    source, mask = util.remove_invalid_seq(source, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "src_embedding"
    src_emb = tf.get_variable(embed_name,
                              [params.src_vocab.size(), params.embed_size])
    src_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(src_emb, source)
    inputs = tf.nn.bias_add(inputs, src_bias)

    if util.valid_dropout(params.dropout):
        inputs = tf.nn.dropout(inputs, 1. - params.dropout)

    with tf.variable_scope("encoder"):
        x = inputs
        for layer in range(params.num_encoder_layer):
            with tf.variable_scope("layer_{}".format(layer)):
                # forward rnn
                with tf.variable_scope('forward'):
                    outputs = rnn.rnn(params.cell, x, hidden_size,
                                      mask=mask, ln=params.layer_norm,
                                      sm=params.swap_memory,
                                      dp=params.dropout)
                    output_fw, state_fw = outputs[1]
                if layer == 0:
                    # only the first layer runs a backward rnn
                    with tf.variable_scope('backward'):
                        if not params.caencoder:
                            outputs = rnn.rnn(params.cell, tf.reverse(x, [1]),
                                              hidden_size,
                                              mask=tf.reverse(mask, [1]),
                                              ln=params.layer_norm,
                                              sm=params.swap_memory,
                                              dp=params.dropout)
                            output_bw, state_bw = outputs[1]
                        else:
                            # backward pass conditioned on (reversed) forward outputs
                            outputs = rnn.cond_rnn(params.cell,
                                                   tf.reverse(x, [1]),
                                                   tf.reverse(output_fw, [1]),
                                                   hidden_size,
                                                   mask=tf.reverse(mask, [1]),
                                                   ln=params.layer_norm,
                                                   sm=params.swap_memory,
                                                   num_heads=params.num_heads,
                                                   one2one=True)
                            output_bw, state_bw = outputs[1]
                        # restore original time order
                        output_bw = tf.reverse(output_bw, [1])

                    if not params.caencoder:
                        y = tf.concat([output_fw, output_bw], -1)
                        z = tf.concat([state_fw, state_bw], -1)
                    else:
                        y = output_bw
                        z = state_bw
                else:
                    y = output_fw
                    z = state_fw

                y = func.linear(y, hidden_size, ln=False, scope="ff")

                # short cut via residual connection (only when widths match)
                if x.get_shape()[-1].value == y.get_shape()[-1].value:
                    x = func.residual_fn(x, y, dropout=params.dropout)
                else:
                    x = y
                if params.layer_norm:
                    x = func.layer_norm(x, scope="ln")

    with tf.variable_scope("decoder_initializer"):
        decoder_cell = rnn.get_cell(params.cell, hidden_size,
                                    ln=params.layer_norm)

    # z is the final state of the LAST encoder layer; every decoder layer
    # derives its own init state from it under a distinct scope
    return {
        "encodes": x,
        "decoder_initializer": {
            "layer_{}".format(l): decoder_cell.get_init_state(
                x=z, scope="layer_{}".format(l))
            for l in range(params.num_decoder_layer)
        },
        "mask": mask
    }
def decoder(target, state, params):
    """Deep (multi-layer) attentional RNN decoder with label-smoothed loss.

    During training ('decoder' not in state) the whole target sequence is
    processed with a one-step right shift; during step-wise inference
    ('decoder' in state) per-layer recurrent states are read from and written
    back into `state`. Returns (loss, logits, state).
    """
    mask = tf.to_float(tf.cast(target, tf.bool))
    hidden_size = params.hidden_size

    if 'decoder' not in state:
        target, mask = util.remove_invalid_seq(target, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "tgt_embedding"
    tgt_emb = tf.get_variable(embed_name,
                              [params.tgt_vocab.size(), params.embed_size])
    tgt_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(tgt_emb, target)
    inputs = tf.nn.bias_add(inputs, tgt_bias)

    # shift: feed <pad>-like zero at t=0, drop the last position (teacher forcing)
    if 'decoder' not in state:
        inputs = tf.pad(inputs, [[0, 0], [1, 0], [0, 0]])
        inputs = inputs[:, :-1, :]
    else:
        # inference step: a batch of all-pad targets means "first step",
        # which is fed as a zero embedding
        inputs = tf.cond(
            tf.reduce_all(tf.equal(target, params.tgt_vocab.pad())),
            lambda: tf.zeros_like(inputs),
            lambda: inputs)
        mask = tf.ones_like(mask)

    if util.valid_dropout(params.dropout):
        inputs = tf.nn.dropout(inputs, 1. - params.dropout)

    with tf.variable_scope("decoder"):
        x = inputs
        for layer in range(params.num_decoder_layer):
            with tf.variable_scope("layer_{}".format(layer)):
                # per-layer initial state: encoder-derived at train start,
                # carried-over recurrent state during step-wise decoding
                init_state = state["decoder_initializer"]["layer_{}".format(
                    layer)]
                if 'decoder' in state:
                    init_state = state["decoder"]["state"]["layer_{}".format(
                        layer)]
                if layer == 0 or params.use_deep_att:
                    # attend over the source encodings
                    returns = rnn.cond_rnn(params.cell, x, state["encodes"],
                                           hidden_size,
                                           init_state=init_state, mask=mask,
                                           num_heads=params.num_heads,
                                           mem_mask=state["mask"],
                                           ln=params.layer_norm,
                                           sm=params.swap_memory,
                                           one2one=False, dp=params.dropout)
                    (_, hidden_state), (outputs, _), contexts, attentions = returns
                    # c caches the attention contexts for the upper layers
                    c = contexts
                else:
                    if params.caencoder:
                        # attend over the lower layer's contexts one-to-one
                        returns = rnn.cond_rnn(params.cell, x, c,
                                               hidden_size,
                                               init_state=init_state,
                                               mask=mask, mem_mask=mask,
                                               ln=params.layer_norm,
                                               sm=params.swap_memory,
                                               num_heads=params.num_heads,
                                               one2one=True, dp=params.dropout)
                        (_, hidden_state), (outputs, _), contexts, attentions = returns
                    else:
                        # plain rnn fed with [input; context]
                        outputs = rnn.rnn(params.cell,
                                          tf.concat([x, c], -1), hidden_size,
                                          mask=mask, init_state=init_state,
                                          ln=params.layer_norm,
                                          sm=params.swap_memory,
                                          dp=params.dropout)
                        outputs, hidden_state = outputs[1]

                # write the new recurrent state back for the next decode step
                if 'decoder' in state:
                    state['decoder']['state']['layer_{}'.format(
                        layer)] = hidden_state

                y = func.linear(outputs, hidden_size, ln=False, scope="ff")

                # short cut via residual connection (only when widths match)
                if x.get_shape()[-1].value == y.get_shape()[-1].value:
                    x = func.residual_fn(x, y, dropout=params.dropout)
                else:
                    x = y
                if params.layer_norm:
                    x = func.layer_norm(x, scope="ln")

    feature = func.linear(tf.concat([x, c], -1), params.embed_size,
                          ln=params.layer_norm, scope="ff")
    feature = tf.nn.tanh(feature)

    if util.valid_dropout(params.dropout):
        feature = tf.nn.dropout(feature, 1. - params.dropout)

    # NOTE(review): the dev_decode path discards the projected feature and
    # reshapes x (width hidden_size) to embed_size below — this only works
    # if hidden_size == embed_size; verify against the config.
    if 'dev_decode' in state:
        feature = x[:, -1, :]

    embed_name = "tgt_embedding" if params.shared_target_softmax_embedding \
        else "softmax_embedding"
    embed_name = "embedding" if params.shared_source_target_embedding \
        else embed_name
    softmax_emb = tf.get_variable(embed_name,
                                  [params.tgt_vocab.size(), params.embed_size])
    feature = tf.reshape(feature, [-1, params.embed_size])
    # logits = feature . softmax_emb^T
    logits = tf.matmul(feature, softmax_emb, False, True)

    # label smoothing; the normalizer keeps the minimum loss at zero
    soft_label, normalizer = util.label_smooth(
        target,
        util.shape_list(logits)[-1],
        factor=params.label_smooth)
    centropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits,
        labels=soft_label
    )
    centropy -= normalizer
    centropy = tf.reshape(centropy, tf.shape(target))

    # mean per-sentence loss, averaged over the batch
    loss = tf.reduce_sum(centropy * mask, -1) / tf.reduce_sum(mask, -1)
    loss = tf.reduce_mean(loss)

    # these mask tricks mainly used to deal with zero shapes, such as [0, 1]
    loss = tf.cond(tf.equal(tf.shape(target)[0], 0),
                   lambda: tf.constant(0, dtype=tf.float32),
                   lambda: loss)

    return loss, logits, state
def decoder(target, state, params):
    """Single-layer attentional RNN decoder with label-smoothed loss.

    During training ('decoder' not in state) the whole target sequence is
    processed with a one-step right shift; during step-wise inference
    ('decoder' in state) the recurrent state is read from and written back
    into `state`. Returns (loss, logits, state).

    Fixes vs. the previous revision:
      * `linear(...)` was undefined here — every other call site in this file
        uses `func.linear`, and the sibling deep decoder concatenates its
        inputs with tf.concat before the projection; do the same.
      * `util.label_smooth` returns a (soft_label, normalizer) pair (see the
        deep decoder), so it cannot be passed directly as `labels=`; unpack
        it and subtract the normalizer, using the non-deprecated
        softmax_cross_entropy_with_logits_v2, consistent with the sibling.
    """
    mask = tf.to_float(tf.cast(target, tf.bool))
    hidden_size = params.hidden_size

    if 'decoder' not in state:
        target, mask = util.remove_invalid_seq(target, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "tgt_embedding"
    tgt_emb = tf.get_variable(embed_name,
                              [params.tgt_vocab.size(), params.embed_size])
    tgt_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(tgt_emb, target)
    inputs = tf.nn.bias_add(inputs, tgt_bias)

    # shift: feed a zero vector at t=0, drop the last position (teacher forcing)
    if 'decoder' not in state:
        inputs = tf.pad(inputs, [[0, 0], [1, 0], [0, 0]])
        inputs = inputs[:, :-1, :]
    else:
        # inference step: a batch of all-pad targets means "first step",
        # which is fed as a zero embedding
        inputs = tf.cond(tf.reduce_all(tf.equal(target, params.tgt_vocab.pad())),
                         lambda: tf.zeros_like(inputs),
                         lambda: inputs)
        mask = tf.ones_like(mask)

    if util.valid_dropout(params.dropout):
        inputs = tf.nn.dropout(inputs, 1. - params.dropout)

    with tf.variable_scope("decoder"):
        init_state = state["decoder_initializer"]
        if 'decoder' in state:
            init_state = state["decoder"]["state"]
        returns = rnn.cond_rnn(params.cell, inputs, state["encodes"],
                               hidden_size, init_state=init_state, mask=mask,
                               mem_mask=state["mask"], ln=params.layer_norm,
                               sm=params.swap_memory, one2one=False)
        (hidden_states, _), (outputs, _), contexts, attentions = returns

    # FIX: was a bare `linear([...], ...)`; use the project helper with an
    # explicit concat, matching the deep decoder's "pre-logits" projection
    feature = func.linear(tf.concat([outputs, contexts, inputs], -1),
                          params.embed_size, ln=params.layer_norm,
                          scope="pre_logits")
    feature = tf.tanh(feature)
    if util.valid_dropout(params.dropout):
        feature = tf.nn.dropout(feature, 1. - params.dropout)

    embed_name = "tgt_embedding" if params.shared_target_softmax_embedding \
        else "softmax_embedding"
    embed_name = "embedding" if params.shared_source_target_embedding \
        else embed_name
    softmax_emb = tf.get_variable(embed_name,
                                  [params.tgt_vocab.size(), params.embed_size])
    feature = tf.reshape(feature, [-1, params.embed_size])
    # logits = feature . softmax_emb^T
    logits = tf.matmul(feature, softmax_emb, False, True)

    # FIX: label_smooth yields (soft_label, normalizer); the normalizer keeps
    # the minimum achievable loss at zero (same pattern as the deep decoder)
    soft_label, normalizer = util.label_smooth(
        target,
        util.shape_list(logits)[-1],
        factor=params.label_smooth)
    centropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits, labels=soft_label)
    centropy -= normalizer
    centropy = tf.reshape(centropy, tf.shape(target))

    # mean per-sentence loss, averaged over the batch
    loss = tf.reduce_sum(centropy * mask, -1) / tf.reduce_sum(mask, -1)
    loss = tf.reduce_mean(loss)

    # these mask tricks mainly used to deal with zero shapes, such as [0, 1]
    loss = tf.cond(tf.equal(tf.shape(target)[0], 0),
                   lambda: tf.constant(0, dtype=tf.float32),
                   lambda: loss)

    if 'decoder' in state:
        state['decoder']['state'] = hidden_states

    return loss, logits, state