Example #1
def encoder(source, params):
    mask = tf.to_float(tf.cast(source, tf.bool))
    hidden_size = params.hidden_size

    source, mask = util.remove_invalid_seq(source, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "src_embedding"
    src_emb = tf.get_variable(embed_name,
                              [params.src_vocab.size(), params.embed_size])
    src_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(src_emb, source)
    inputs = tf.nn.bias_add(inputs, src_bias)

    if util.valid_dropout(params.dropout):
        inputs = tf.nn.dropout(inputs, 1. - params.dropout)

    with tf.variable_scope("encoder"):
        # forward rnn
        with tf.variable_scope('forward'):
            outputs = rnn.rnn(params.cell, inputs, hidden_size, mask=mask,
                              ln=params.layer_norm, sm=params.swap_memory)
            output_fw, state_fw = outputs[1]
        # backward rnn
        with tf.variable_scope('backward'):
            if not params.caencoder:
                outputs = rnn.rnn(params.cell, tf.reverse(inputs, [1]),
                                  hidden_size, mask=tf.reverse(mask, [1]),
                                  ln=params.layer_norm, sm=params.swap_memory)
                output_bw, state_bw = outputs[1]
            else:
                outputs = rnn.cond_rnn(params.cell, tf.reverse(inputs, [1]),
                                       tf.reverse(output_fw, [1]), hidden_size,
                                       mask=tf.reverse(mask, [1]),
                                       ln=params.layer_norm,
                                       sm=params.swap_memory,
                                       one2one=True)
                output_bw, state_bw = outputs[1]

            output_bw = tf.reverse(output_bw, [1])

    if not params.caencoder:
        source_encodes = tf.concat([output_fw, output_bw], -1)
        source_feature = tf.concat([state_fw, state_bw], -1)
    else:
        source_encodes = output_bw
        source_feature = state_bw

    with tf.variable_scope("decoder_initializer"):
        decoder_init = rnn.get_cell(
            params.cell, hidden_size, ln=params.layer_norm
        ).get_init_state(x=source_feature)
    decoder_init = tf.tanh(decoder_init)

    return {
        "encodes": source_encodes,
        "decoder_initializer": decoder_init,
        "mask": mask
    }
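A minimal call sketch for the encoder above, assuming a TF1 graph. The Params container and vocabulary stub below are hypothetical stand-ins for the toolkit's real configuration object; only the attributes that encoder() actually reads are filled in.

import tensorflow as tf

class SrcVocab(object):
    # hypothetical vocabulary stub; encoder() only calls size()
    def size(self):
        return 32000

class Params(object):
    # hypothetical configuration mirroring the attributes read by encoder()
    cell = "gru"
    hidden_size = 512
    embed_size = 256
    dropout = 0.1
    layer_norm = False
    swap_memory = True
    caencoder = False
    shared_source_target_embedding = False
    src_vocab = SrcVocab()

source = tf.placeholder(tf.int32, [None, None], name="source")  # [batch, seq_len] token ids, 0 = pad
state = encoder(source, Params())
# state["encodes"]:             [batch, seq_len, 2 * hidden_size] (forward/backward concatenation)
# state["decoder_initializer"]: tanh-squashed initial decoder hidden state
# state["mask"]:                [batch, seq_len] float mask over valid positions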
Example #2
def __init__(self,
             cell,
             num_layers,
             num_units,
             batch_size,
             input_size,
             keep_prob=1.0,
             is_train=None,
             scope="native_rnn"):
    self.num_layers = num_layers
    self.cell_type = cell
    self.inits = []
    self.dropout_mask = []
    self.num_units = num_units
    self.scope = scope
    for layer in range(num_layers):
        # layers above the first consume the bidirectional output of the previous layer
        input_size_ = input_size if layer == 0 else 2 * num_units
        # per-layer initial states for the forward and backward directions
        init_fw = rnn.get_cell(cell, num_units).get_init_state(
            shape=[batch_size], scope="fw_{}".format(layer))
        init_bw = rnn.get_cell(cell, num_units).get_init_state(
            shape=[batch_size], scope="bw_{}".format(layer))
        # variational dropout masks, shared across time steps via the size-1 time axis
        mask_fw = dropout(tf.ones([batch_size, 1, input_size_],
                                  dtype=tf.float32),
                          keep_prob=keep_prob,
                          is_train=is_train,
                          mode=None)
        mask_bw = dropout(tf.ones([batch_size, 1, input_size_],
                                  dtype=tf.float32),
                          keep_prob=keep_prob,
                          is_train=is_train,
                          mode=None)
        self.inits.append((
            init_fw,
            init_bw,
        ))
        self.dropout_mask.append((
            mask_fw,
            mask_bw,
        ))
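The masks built above have shape [batch, 1, input_size], so the single time dimension broadcasts over every step: this is variational (time-shared) dropout. A small standalone illustration of that broadcast, using TF1-style tf.nn.dropout with keep_prob rather than the project's dropout() helper (the tensor names and sizes are made up):

import tensorflow as tf

inputs = tf.random_normal([8, 20, 300])                     # [batch, time, features]
mask = tf.nn.dropout(tf.ones([8, 1, 300]), keep_prob=0.8)   # one drop pattern per sequence
dropped = inputs * mask                                      # same feature positions zeroed at every time step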
Example #3
def match_layer(features, params):
    with tf.variable_scope("match", reuse=tf.AUTO_REUSE):
        p_emb = features["p_emb"]
        h_emb = features["h_emb"]
        p_mask = features["p_mask"]
        h_mask = features["h_mask"]

        p_seq_enc, p_vec_enc = wrap_rnn(
            p_emb,
            params.cell,
            1,
            params.hidden_size,
            mask=p_mask,
            use_ln=params.layer_norm,
            dropout=params.dropout,
            scope="enc_p"
        )

        with tf.variable_scope("h_init"):
            h_init = rnn.get_cell(
                params.cell,
                params.hidden_size,
                ln=params.layer_norm
            ).get_init_state(x=p_vec_enc)
            h_init = tf.tanh(h_init)

        _, (h_seq_enc, h_vec_enc), _, _ = rnn.cond_rnn(
            params.cell,
            h_emb,
            p_seq_enc,
            params.hidden_size,
            init_state=h_init,
            mask=h_mask,
            mem_mask=p_mask,
            ln=params.layer_norm,
            num_heads=params.num_heads
        )

        p_encs = [p_seq_enc]
        h_encs = [h_seq_enc]

        if params.enable_bert:
            p_encs.append(features['bert_p_enc'])
            h_encs.append(features['bert_h_enc'])

        p_enc = tf.concat(p_encs, -1)
        h_enc = tf.concat(h_encs, -1)

        p_seq_enc, _ = wrap_rnn(
            p_enc,
            params.cell,
            1,
            params.hidden_size,
            mask=p_mask,
            use_ln=params.layer_norm,
            dropout=params.dropout,
            scope="post_enc_p"
        )

        h_seq_enc, _ = wrap_rnn(
            h_enc,
            params.cell,
            1,
            params.hidden_size,
            mask=h_mask,
            use_ln=params.layer_norm,
            dropout=params.dropout,
            scope="post_enc_h"
        )

        features.update({
            'p_enc': p_seq_enc,
            'h_enc': h_seq_enc
        })

    return features
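A hedged sketch of how the features dictionary consumed by match_layer might be assembled. The embedding table and the placeholders p/h (premise and hypothesis token ids) are illustrative assumptions, not the surrounding project's actual input pipeline.

import tensorflow as tf

p = tf.placeholder(tf.int32, [None, None], name="p")   # premise token ids, 0 = pad
h = tf.placeholder(tf.int32, [None, None], name="h")   # hypothesis token ids, 0 = pad
emb = tf.get_variable("word_embedding", [30000, 300])

features = {
    "p_emb": tf.nn.embedding_lookup(emb, p),
    "h_emb": tf.nn.embedding_lookup(emb, h),
    "p_mask": tf.to_float(tf.cast(p, tf.bool)),
    "h_mask": tf.to_float(tf.cast(h, tf.bool)),
}
# `params` is assumed to be the same configuration namespace used by the other examples
# (cell, hidden_size, layer_norm, dropout, num_heads, enable_bert=False);
# with enable_bert=True, 'bert_p_enc' and 'bert_h_enc' would also have to be provided.
features = match_layer(features, params)  # adds 'p_enc' and 'h_enc' to the dict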
Example #4
def encoder(source, params):
    mask = dtype.tf_to_float(tf.cast(source, tf.bool))
    hidden_size = params.hidden_size

    source, mask = util.remove_invalid_seq(source, mask)

    # extract source word embedding and apply dropout
    embed_name = "embedding" if params.shared_source_target_embedding \
        else "src_embedding"
    src_emb = tf.get_variable(embed_name,
                              [params.src_vocab.size(), params.embed_size])
    src_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(src_emb, source)
    inputs = tf.nn.bias_add(inputs, src_bias)

    inputs = util.valid_apply_dropout(inputs, params.dropout)

    # the encoder module used in the deep attention paper
    with tf.variable_scope("encoder"):
        # x: embedding input, h: the hidden state
        x = inputs
        h = 0
        z = 0

        for layer in range(params.num_encoder_layer + 1):
            with tf.variable_scope("layer_{}".format(layer)):
                if layer == 0:
                    # for the first layer, we perform a normal rnn layer to collect context information
                    outputs = rnn.rnn(params.cell,
                                      x,
                                      hidden_size,
                                      mask=mask,
                                      ln=params.layer_norm,
                                      sm=params.swap_memory)
                    h = outputs[1][0]

                else:
                    # for deeper encoder layers, we feed both the embedding input and the reversed
                    # hidden-state sequence from the previous layer:
                    # the embedding carries the current token, while the hidden states carry future context
                    is_reverse = (layer % 2 == 1)
                    outputs = rnn.cond_rnn(
                        params.cell,
                        tf.reverse(x, [1]) if is_reverse else x,
                        tf.reverse(h, [1]) if is_reverse else h,
                        hidden_size,
                        mask=tf.reverse(mask, [1]) if is_reverse else mask,
                        ln=params.layer_norm,
                        sm=params.swap_memory,
                        num_heads=params.num_heads,
                        one2one=True)
                    h = outputs[1][0]
                    h = tf.reverse(h, [1]) if is_reverse else h

                # the final hidden state used for decoder state initialization
                z = outputs[1][1]

    with tf.variable_scope("decoder_initializer"):
        decoder_cell = rnn.get_cell(params.cell,
                                    hidden_size,
                                    ln=params.layer_norm)

    return {
        "encodes": h,
        "decoder_initializer": {
            'layer': decoder_cell.get_init_state(x=z, scope="dec_init_state")
        },
        "mask": mask
    }
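In the layer loop above, odd layers consume reversed inputs and their hidden sequence is re-reversed afterwards, so successive layers alternate between left-to-right and right-to-left passes. A tiny standalone sketch of that reverse-process-restore pattern (the values are made up):

import tensorflow as tf

x = tf.constant([[[1.], [2.], [3.]]])   # [batch=1, time=3, dim=1]
x_rev = tf.reverse(x, [1])              # time order becomes 3, 2, 1 for a right-to-left pass
# ... an RNN would scan x_rev here ...
x_back = tf.reverse(x_rev, [1])         # restore the original order before feeding the next layer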
Example #5
def deep_att_dec_rnn(cell_name,
                     x,
                     memory,
                     d,
                     init_state=None,
                     mask=None,
                     mem_mask=None,
                     ln=False,
                     sm=True,
                     depth=1,
                     num_heads=1):
    """Self implemented conditional-RNN procedure, supporting mask trick"""
    # cell_name: gru, lstm or atr
    # x: input sequence embedding matrix, [batch, seq_len, dim]
    # memory: the conditional part
    # d: hidden dimension for rnn
    # mask: mask matrix, [batch, seq_len]
    # mem_mask: memory mask matrix, [batch, mem_seq_len]
    # ln: whether use layer normalization
    # init_state: the initial hidden states, for cache purpose
    # sm: whether apply swap memory during rnn scan
    # depth: depth for the decoder in deep attention
    # num_heads: number of attention heads, multi-head attention

    in_shape = util.shape_list(x)
    batch_size, time_steps = in_shape[:2]
    mem_shape = util.shape_list(memory)

    cell_lower = rnn.get_cell(cell_name,
                              d,
                              ln=ln,
                              scope="{}_lower".format(cell_name))
    cells_higher = []
    for layer in range(depth):
        cell_higher = rnn.get_cell(cell_name,
                                   d,
                                   ln=ln,
                                   scope="{}_higher_{}".format(
                                       cell_name, layer))
        cells_higher.append(cell_higher)

    if init_state is None:
        init_state = cell_lower.get_init_state(shape=[batch_size])
    if mask is None:
        mask = dtype.tf_to_float(tf.ones([batch_size, time_steps]))
    if mem_mask is None:
        mem_mask = dtype.tf_to_float(tf.ones([batch_size, mem_shape[1]]))

    # prepare projected encodes and inputs
    cache_inputs = cell_lower.fetch_states(x)
    cache_inputs = [tf.transpose(v, [1, 0, 2]) for v in list(cache_inputs)]
    proj_memories = func.linear(memory,
                                mem_shape[-1],
                                bias=False,
                                ln=ln,
                                scope="context_att")

    mask_ta = tf.transpose(tf.expand_dims(mask, -1), [1, 0, 2])
    init_context = dtype.tf_to_float(
        tf.zeros([batch_size, depth, mem_shape[-1]]))
    init_weight = dtype.tf_to_float(
        tf.zeros([batch_size, depth, num_heads, mem_shape[1]]))
    mask_pos = len(cache_inputs)

    def _step_fn(prev, x):
        t, h_, c_, a_ = prev

        m, v = x[mask_pos], x[:mask_pos]

        # the first decoder rnn subcell, composing previous hidden state with the current word embedding
        s_ = cell_lower(h_, v)
        s_ = m * s_ + (1. - m) * h_

        atts, att_ctxs = [], []

        for layer in range(depth):
            # perform attention
            prev_cell = cell_lower if layer == 0 else cells_higher[layer - 1]
            vle = func.additive_attention(
                prev_cell.get_hidden(s_),
                memory,
                mem_mask,
                mem_shape[-1],
                ln=ln,
                num_heads=num_heads,
                proj_memory=proj_memories,
                scope="deep_attention_{}".format(layer))
            a, c = vle['weights'], vle['output']
            atts.append(tf.expand_dims(a, 1))
            att_ctxs.append(tf.expand_dims(c, 1))

            # perform next-level recurrence
            c_c = cells_higher[layer].fetch_states(c)
            ss_ = cells_higher[layer](s_, c_c)
            s_ = m * ss_ + (1. - m) * s_

        h = s_
        a = tf.concat(atts, axis=1)
        c = tf.concat(att_ctxs, axis=1)

        return t + 1, h, c, a

    time = tf.constant(0, dtype=tf.int32, name="time")
    step_states = (time, init_state, init_context, init_weight)
    step_vars = cache_inputs + [mask_ta]

    outputs = tf.scan(_step_fn,
                      step_vars,
                      initializer=step_states,
                      parallel_iterations=32,
                      swap_memory=sm)

    output_ta = outputs[1]
    context_ta = outputs[2]
    attention_ta = outputs[3]

    outputs = tf.transpose(output_ta, [1, 0, 2])
    output_states = outputs[:, -1]
    # batch x target length x depth x mem-dimension
    contexts = tf.transpose(context_ta, [1, 0, 2, 3])
    # batch x num_heads x depth x target length x source length
    attentions = tf.transpose(attention_ta, [1, 3, 2, 0, 4])

    return (outputs, output_states), \
           (cells_higher[-1].get_hidden(outputs), cells_higher[-1].get_hidden(output_states)), \
           contexts, attentions
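A hedged call sketch for deep_att_dec_rnn with made-up shapes; the target embeddings, encoder memory, and hidden size below are placeholders, and the exact last-axis widths of the outputs depend on the chosen cell:

import tensorflow as tf

trg_emb = tf.random_normal([4, 7, 512])   # [batch, trg_len, dim]  decoder input embeddings
memory = tf.random_normal([4, 9, 512])    # [batch, src_len, dim]  encoder outputs

(states, last_state), (hiddens, last_hidden), contexts, attentions = deep_att_dec_rnn(
    "gru", trg_emb, memory, 512, depth=2, num_heads=1)
# states:     [batch, trg_len, d]                         raw recurrent states per step
# hiddens:    [batch, trg_len, d]                         get_hidden() projections of those states
# contexts:   [batch, trg_len, depth, mem_dim]            per-layer attention contexts
# attentions: [batch, num_heads, depth, trg_len, src_len] attention weights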
Example #6
def encoder(source, params):
    mask = tf.to_float(tf.cast(source, tf.bool))
    hidden_size = params.hidden_size

    source, mask = util.remove_invalid_seq(source, mask)

    embed_name = "embedding" if params.shared_source_target_embedding \
        else "src_embedding"
    src_emb = tf.get_variable(embed_name,
                              [params.src_vocab.size(), params.embed_size])
    src_bias = tf.get_variable("bias", [params.embed_size])

    inputs = tf.gather(src_emb, source)
    inputs = tf.nn.bias_add(inputs, src_bias)

    if util.valid_dropout(params.dropout):
        inputs = tf.nn.dropout(inputs, 1. - params.dropout)

    with tf.variable_scope("encoder"):
        x = inputs

        for layer in range(params.num_encoder_layer):
            with tf.variable_scope("layer_{}".format(layer)):
                # forward rnn
                with tf.variable_scope('forward'):
                    outputs = rnn.rnn(params.cell,
                                      x,
                                      hidden_size,
                                      mask=mask,
                                      ln=params.layer_norm,
                                      sm=params.swap_memory,
                                      dp=params.dropout)
                    output_fw, state_fw = outputs[1]
                if layer == 0:
                    # backward rnn
                    with tf.variable_scope('backward'):
                        if not params.caencoder:
                            outputs = rnn.rnn(params.cell,
                                              tf.reverse(x, [1]),
                                              hidden_size,
                                              mask=tf.reverse(mask, [1]),
                                              ln=params.layer_norm,
                                              sm=params.swap_memory,
                                              dp=params.dropout)
                            output_bw, state_bw = outputs[1]
                        else:
                            outputs = rnn.cond_rnn(params.cell,
                                                   tf.reverse(x, [1]),
                                                   tf.reverse(output_fw, [1]),
                                                   hidden_size,
                                                   mask=tf.reverse(mask, [1]),
                                                   ln=params.layer_norm,
                                                   sm=params.swap_memory,
                                                   num_heads=params.num_heads,
                                                   one2one=True)
                            output_bw, state_bw = outputs[1]

                        output_bw = tf.reverse(output_bw, [1])

                    if not params.caencoder:
                        y = tf.concat([output_fw, output_bw], -1)
                        z = tf.concat([state_fw, state_bw], -1)
                    else:
                        y = output_bw
                        z = state_bw
                else:
                    y = output_fw
                    z = state_fw

                y = func.linear(y, hidden_size, ln=False, scope="ff")

                # short cut via residual connection
                if x.get_shape()[-1].value == y.get_shape()[-1].value:
                    x = func.residual_fn(x, y, dropout=params.dropout)
                else:
                    x = y
                if params.layer_norm:
                    x = func.layer_norm(x, scope="ln")

    with tf.variable_scope("decoder_initializer"):
        decoder_cell = rnn.get_cell(params.cell,
                                    hidden_size,
                                    ln=params.layer_norm)

    return {
        "encodes": x,
        "decoder_initializer": {
            "layer_{}".format(l):
            decoder_cell.get_init_state(x=z, scope="layer_{}".format(l))
            for l in range(params.num_decoder_layer)
        },
        "mask": mask
    }
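The residual shortcut above only fires when the layer input and output widths match. The helper func.residual_fn is not shown in this snippet; a plausible minimal definition, sketched here purely as an assumption about its behavior, would be:

import tensorflow as tf

def residual_fn(x, y, dropout=None):
    # add the new branch back onto its input, optionally applying dropout to the new branch first
    if dropout is not None and 0. < dropout < 1.:
        y = tf.nn.dropout(y, keep_prob=1. - dropout)
    return x + y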