Example #1
def attn(
    x,
    scope,
    n_state,
    n_head,
    resid_pdrop,
    attn_pdrop,
    train=False,
    scale=False,
    mask=True,
    explain=False,
    lengths=None,
):
    assert n_state % n_head == 0
    with tf.variable_scope(scope):
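        # Project the input into per-head query, key and value tensors.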
        q, k, v = multihead_qkv(x, n_state, n_head, train, explain)
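        # Compute the attention weights, optionally scaled and causally masked.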
        w = attn_weights(q,
                         k,
                         v,
                         scale=scale,
                         mask=mask,
                         explain=explain,
                         lengths=lengths)
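        # Apply attention dropout, then take the attention-weighted sum of the values.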
        w = dropout(w, attn_pdrop, train)
        a = tf.matmul(w, v)
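        # Merge the heads back together, project to n_state, and apply residual dropout.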
        a = merge_heads(a)
        a = conv1d(a, "c_proj", n_state, 1, train=train)
        a = dropout(a, resid_pdrop, train)
        return a
Example #2
def add_auxiliary(context, context_dim, clf_h, seq_feats, config, train):
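    # Learn a per-dimension weighting of the auxiliary context and a projection
    # into n_context_embed dimensions, then concatenate the result onto both the
    # per-token sequence features and the pooled classifier features.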
    context.set_shape([None, config.max_length, context_dim])
    context_embed_weights = tf.get_variable(
        name="ce",
        shape=[context_dim, config.n_context_embed],
        initializer=tf.random_normal_initializer(stddev=config.weight_stddev),
    )

    context_weighted_avg = tf.get_variable(
        name="cwa",
        shape=[context_dim],
        initializer=tf.random_normal_initializer(stddev=config.weight_stddev),
    )

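    # Either regularize the context parameters with embedding dropout or freeze
    # them entirely by stopping gradients.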
    if config.train_embeddings:
        context_embed_weights = dropout(context_embed_weights,
                                        config.embed_p_drop, train)
        context_weighted_avg = dropout(context_weighted_avg,
                                       config.embed_p_drop, train)
    else:
        context_embed_weights = tf.stop_gradient(context_embed_weights)

    with tf.variable_scope("context_embedding"):
        weighted_C = tf.multiply(
            context, context_weighted_avg
        )  # [batch_size, seq_length, context_dim] * [context_dim] = [batch_size, seq_length, context_dim], with weighted inputs
        c_embed = tf.tensordot(
            weighted_C, context_embed_weights, axes=[[2], [0]]
        )  # [batch_size, seq_length, context_dim] x [context_dim, n_context_embed] = [batch_size, seq_length, n_context_embed]
        c_embed = norm(c_embed, tf.get_variable_scope())
        seq_feats = tf.concat([seq_feats, c_embed], axis=2)
        c_embed = tf.reduce_mean(c_embed, axis=1)
        clf_h = tf.concat([clf_h, c_embed], axis=1)
        return clf_h, seq_feats
Example #3
def mlp(x, scope, n_state, act_fn, resid_pdrop, train=False):
    with tf.variable_scope(scope):
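        # Position-wise feed-forward block: expand to n_state, apply the
        # activation, project back to the input width, then residual dropout.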
        nx = shape_list(x)[-1]
        act = act_fns[act_fn]
        h = act(conv1d(x, "c_fc", n_state, 1, train=train))
        h2 = conv1d(h, "c_proj", nx, 1, train=train)
        h2 = dropout(h2, resid_pdrop, train)
        return h2
Example #4
def adapter(X, adapter_size, nx, train=False, hidden_dropout_prob=0.1):
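    # Bottleneck adapter: project down to adapter_size, apply dropout, project
    # back up to the original width nx, and add a residual connection to the
    # input (the near-zero kernel initialization keeps the block close to an
    # identity mapping at the start of training).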
    down_projection = tf.layers.dense(
        X,
        adapter_size,
        activation="sigmoid",
        kernel_initializer=create_initializer(0.001),
    )
    down_projection = dropout(down_projection, hidden_dropout_prob, train)
    up_projection = tf.layers.dense(
        down_projection, nx, kernel_initializer=create_initializer(0.001))
    return up_projection + X
Example #5
def gpt_featurizer(X,
                   encoder,
                   config,
                   train=False,
                   reuse=None,
                   explain=False,
                   **kwargs):
    """
    The transformer element of the finetuning model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Whether variables should be reused within this scope.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: The output of the featurizer's final state.
        sequence_features: The output of the featurizer at each timestep.
    """
    initial_shape = tf.shape(X)
    X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-2:]), 0))
    sequence_length = tf.shape(X)[1]

    with tf.variable_scope("model/featurizer", reuse=reuse):
        embed_weights = tf.get_variable(
            name="we",
            shape=[encoder.vocab_size + config.max_length, config.n_embed],
            initializer=tf.random_normal_initializer(
                stddev=config.weight_stddev),
        )
        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

#        X = tf.reshape(X, [-1, config.max_length, 2])

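        # Position of the classifier (end) token in each sequence, later used to
        # pool the final hidden states.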
        clf_token = encoder.end_token
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32)

        if explain:
            X = add_explain_tokens(X, sequence_length, pool_idx)

        h = embed(X, embed_weights)
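        # Run the transformer blocks; layers below num_layers_trained are frozen
        # by stopping gradients (unless adapters are in use).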
        for layer in range(config.n_layer):
            if ((config.n_layer - layer) == config.num_layers_trained
                    and config.num_layers_trained != config.n_layer
                    and config.adapter_size is None):
                h = tf.stop_gradient(h)
                train_layer = False
            else:
                train_layer = train

            with tf.variable_scope("h%d_" % layer):
                block_fn = functools.partial(
                    block,
                    n_head=config.n_heads,
                    act_fn=config.act_fn,
                    resid_pdrop=config.resid_p_drop,
                    attn_pdrop=config.attn_p_drop,
                    scope="h%d" % layer,
                    train=train_layer,
                    scale=True,
                    explain=explain,
                    adptr_size=config.adapter_size,
                )
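                # Optionally trade compute for memory by recomputing the block's
                # activations during the backward pass.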
                if config.low_memory_mode and train_layer:
                    block_fn = recompute_grad(block_fn, use_entire_scope=True)
                if layer < config.n_layer - 1:
                    h = block_fn(h)
                else:
                    h_out = block_fn(h)

            # get the attention weights from the last layer
            if layer == config.n_layer - 1:
                with tf.variable_scope("h%d_/h%d/attn" % (layer, layer),
                                       reuse=True):
                    q, k, v = multihead_qkv(h,
                                            n_state=shape_list(h)[-1],
                                            n_head=config.n_heads,
                                            train=train)
                    w = attn_weights(q, k, v, scale=True)

        if explain:
            explain_out = h_out[:, initial_shape[1]:]
            explain_out = tf.reshape(
                explain_out,
                shape=tf.concat((initial_shape[:-1], [config.n_embed]), 0))
            h_out = h_out[:, :initial_shape[1]]

        # Use hidden state at classifier token as input to final proj. + softmax
        clf_h = tf.reshape(h_out,
                           [-1, config.n_embed])  # [batch * seq_len, embed]
        clf_h = tf.gather(
            clf_h,
            tf.range(shape_list(X)[0], dtype=tf.int32) * sequence_length +
            pool_idx,
        )
        clf_h = tf.reshape(clf_h,
                           shape=tf.concat(
                               (initial_shape[:-2], [config.n_embed]), 0))
        seq_feats = tf.reshape(h_out,
                               shape=tf.concat(
                                   (initial_shape[:-1], [config.n_embed]), 0))

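        # Sequence lengths derived from the position of the end-of-sequence token.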
        lengths = lengths_from_eos_idx(eos_idx=pool_idx,
                                       max_length=sequence_length)

        out = {
            "embed_weights": embed_weights,
            "features": clf_h,
            "sequence_features": seq_feats,
            "eos_idx": pool_idx,
            "lengths": lengths,
            "attention_weights": w,  # [n_heads, seq_len, seq_len]
        }
        if explain:
            out["explain_out"] = explain_out
        return out