Example #1
def gpt2_featurizer(
    X,
    encoder,
    config,
    train=False,
    reuse=None,
    **kwargs
):
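    """
    The GPT-2 transformer element of the finetuning model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Whether to reuse variables within this scope.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: the output of the featurizer at the classify token.
        sequence_features: the output of the featurizer at each timestep.
        eos_idx: the position of the classify token in each sequence.
        lengths: the length of each sequence.
    """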
    initial_shape = tf.shape(X)
    X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-2:]), 0))
    X.set_shape([None, None, None])

    with tf.variable_scope("model/featurizer", reuse=reuse):
        embed_weights = tf.get_variable(
            name="we",
            shape=[encoder.vocab_size + config.max_length, config.n_embed],
            initializer=tf.random_normal_initializer(stddev=config.weight_stddev),
        )
        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        X = tf.reshape(X, [-1, config.max_length, 2])
        h = embed(X, embed_weights)

        # Transformer
        pasts = [None] * config.n_layer
        for layer, past in enumerate(pasts):
            if (
                (config.n_layer - layer) == config.num_layers_trained
                and config.num_layers_trained != config.n_layer
                and config.adapter_size is None
            ):
                h = tf.stop_gradient(h)
                train_layer = False
            else:
                train_layer = train

            with tf.variable_scope("h%d" % layer):
                block_fn = functools.partial(
                    block, past=past, hparams=config, train=train_layer
                )
                if config.low_memory_mode and train_layer:
                    block_fn = recompute_grad(block_fn, use_entire_scope=True)
                h = block_fn(h)

        h = norm(h, "ln_f")

        # Use hidden state at classifier token as input to final proj. + softmax
        clf_h = tf.reshape(h, [-1, config.n_embed])  # [batch * seq_len, embed]
        clf_token = encoder["_classify_"]
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1), tf.int32
        )
        clf_h = tf.gather(
            clf_h,
            tf.range(shape_list(X)[0], dtype=tf.int32) * config.max_length + pool_idx,
        )
        clf_h = tf.reshape(
            clf_h, shape=tf.concat((initial_shape[:-2], [config.n_embed]), 0)
        )
        seq_feats = tf.reshape(
            h, shape=tf.concat((initial_shape[:-1], [config.n_embed]), 0)
        )

        lengths = lengths_from_eos_idx(eos_idx=pool_idx, max_length=config.max_length)

        return {
            "embed_weights": embed_weights,
            "features": clf_h,
            "sequence_features": seq_feats,
            "eos_idx": pool_idx,
            "lengths": lengths
        }
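The flatten-and-gather indexing above is the heart of the classifier-token pooling: the hidden states are reshaped to [batch * seq_len, n_embed] and each example's row is picked out at offset row * max_length + pool_idx. A minimal, self-contained sketch of the same indexing on toy values (the names and numbers here are illustrative, not part of the library):

import tensorflow as tf

# Toy hidden states: batch=2, seq_len=3, n_embed=4.
h = tf.reshape(tf.range(2 * 3 * 4, dtype=tf.float32), [2, 3, 4])
pool_idx = tf.constant([2, 1], dtype=tf.int32)          # position of the _classify_ token per row

flat = tf.reshape(h, [-1, 4])                           # [batch * seq_len, n_embed]
offsets = tf.range(2, dtype=tf.int32) * 3 + pool_idx    # row offset + token position
clf_h = tf.gather(flat, offsets)                        # [batch, n_embed]
# clf_h[0] == h[0, 2] and clf_h[1] == h[1, 1]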
Example #2
def bert_featurizer(X, encoder, config, train=False, reuse=None, **kwargs):
    """
    The transformer element of the finetuning model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Whether to reuse variables within this scope.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: the output of the featurizer's final state.
        sequence_features: the output of the featurizer at each timestep.
        eos_idx: the position of the end-of-sequence token in each sequence.
        lengths: the length of each sequence.
    """

    is_roberta = RoBERTaEncoder == config.base_model.encoder

    bert_config = BertConfig(vocab_size=encoder.vocab_size,
                             hidden_size=config.n_embed,
                             num_hidden_layers=config.n_layer,
                             num_attention_heads=config.n_heads,
                             intermediate_size=config.bert_intermediate_size,
                             hidden_act=config.act_fn,
                             hidden_dropout_prob=config.resid_p_drop,
                             attention_probs_dropout_prob=config.attn_p_drop,
                             max_position_embeddings=config.max_length,
                             type_vocab_size=2,
                             initializer_range=config.weight_stddev,
                             adapter_size=config.adapter_size,
                             low_memory_mode=config.low_memory_mode)

    initial_shape = tf.shape(X)
    X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-2:]), 0))
    X.set_shape([None, None, None])
    # To fit the finetune interface we compute the attention mask and token type ids at runtime.
    input_ids = X[:, :, 0]  # slice off pos-embed ids.
    delimiters = tf.cast(tf.equal(input_ids, encoder.delimiter_token),
                         tf.int32)

    token_type_ids = tf.cumsum(delimiters, exclusive=True, axis=1)

    seq_length = tf.shape(delimiters)[1]

    eos_idx = tf.argmax(
        tf.cast(delimiters, tf.float32) * tf.expand_dims(
            tf.range(tf.cast(seq_length, tf.float32), dtype=tf.float32), 0),
        axis=1,
    )

    lengths = lengths_from_eos_idx(eos_idx=eos_idx, max_length=seq_length)

    if is_roberta:
        # RoBERTa embeddings include an unused <MASK> token, so our embedding
        # layer size needs to accommodate it.
        bert_config.vocab_size += 1
        # In our use case (the padding token has index 1), RoBERTa's position indexes begin at 2, so our
        # position embeddings come from indices 2:514.
        bert_config.max_position_embeddings += 2

    mask = tf.sequence_mask(lengths, maxlen=seq_length, dtype=tf.float32)

    if config.num_layers_trained not in [config.n_layer, 0]:
        raise ValueError(
            "Bert base model does not support num_layers_trained not equal to 0 or n_layer"
        )

    with tf.variable_scope("model/featurizer", reuse=reuse):
        bert = BertModel(config=bert_config,
                         is_training=train,
                         input_ids=input_ids,
                         input_mask=mask,
                         token_type_ids=token_type_ids,
                         use_one_hot_embeddings=False,
                         scope=None,
                         use_pooler=config.bert_use_pooler,
                         use_token_type=config.bert_use_type_embed,
                         roberta=is_roberta)

        embed_weights = bert.get_embedding_table()
        features = tf.reshape(
            bert.get_pooled_output(),
            shape=tf.concat((initial_shape[:-2], [config.n_embed]), 0),
        )
        sequence_features = tf.reshape(
            bert.get_sequence_output(),
            shape=tf.concat((initial_shape[:-1], [config.n_embed]), 0),
        )

        output_state = {
            "embed_weights": embed_weights,
            "features": features,
            "sequence_features": sequence_features,
            "lengths": lengths,
            "eos_idx": eos_idx,
        }
        if config.num_layers_trained == 0:
            output_state = {
                k: tf.stop_gradient(v)
                for k, v in output_state.items()
            }

        return output_state
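The runtime segment-id and eos computation above can be checked on toy inputs: an exclusive cumulative sum over the delimiter mask assigns 0 to the first segment and 1 to the second, and the argmax over position-weighted delimiters recovers the index of the final delimiter. A small illustrative sketch (the ids below are made up; 102 stands in for encoder.delimiter_token):

import tensorflow as tf

input_ids = tf.constant([[101, 7, 8, 102, 9, 10, 102]])          # 102 == delimiter
delimiters = tf.cast(tf.equal(input_ids, 102), tf.int32)          # [[0,0,0,1,0,0,1]]
token_type_ids = tf.cumsum(delimiters, exclusive=True, axis=1)    # [[0,0,0,0,1,1,1]]

seq_length = tf.shape(delimiters)[1]
eos_idx = tf.argmax(
    tf.cast(delimiters, tf.float32)
    * tf.expand_dims(tf.range(tf.cast(seq_length, tf.float32), dtype=tf.float32), 0),
    axis=1,
)  # -> [6], the position of the last delimiter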
Example #3
def featurizer(X, encoder, config, train=False, reuse=None, encoder_state=None, context=None, context_dim=None, **kwargs):
    """
    The main element of the OSCAR model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Whether to reuse variables within this scope.
    :param encoder_state: Optional state dict from an encoder featurizer; if provided, its embed_weights are reused.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: the output of the featurizer's final state.
        sequence_features: the output of the featurizer at each timestep.
        eos_idx: the position of the end-of-sequence token in each sequence.
        encoded_input: the token ids up to the earliest end-of-sequence token in the batch.
        lengths: the length of each sequence.
    """
    initial_shape = [a or -1 for a in X.get_shape().as_list()]
    if len(initial_shape) != 3:
        X = tf.reshape(X, shape=[-1] + initial_shape[-2:])

    x_shape = tf.shape(X)
    with tf.variable_scope('model/featurizer', reuse=reuse):
        encoder._lazy_init()
        clf_token = encoder.end_token
        pool_idx = tf.cast(tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1), tf.int32)
        if encoder_state is None:
            embed_weights = tf.get_variable("we", [encoder.vocab_size + config.max_length, config.n_embed],
                                            initializer=tf.random_normal_initializer(stddev=config.weight_stddev))
        else:
            embed_weights = encoder_state["embed_weights"]

        if config.oscar_use_fp16:
            embed_weights = tf.cast(embed_weights, tf.float16)

        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        X = tf.reshape(X, [-1, x_shape[1], 2])

        if config.oscar_use_timing:
            h = embed(X, embed_weights)
        else:
            h = embed_no_timing(X, embed_weights)

        for layer in range(config.n_layer):
            with tf.variable_scope('h%d_' % layer):
                if (
                        (config.n_layer - layer) == config.num_layers_trained and
                        config.num_layers_trained != config.n_layer
                ):
                    h = tf.stop_gradient(h)

                block_fn_fwd = functools.partial(
                    block, block_name='block%d_' % layer, use_fp16=config.oscar_use_fp16,
                    pool_idx=None, encoder_state=encoder_state, train=train,
                    pdrop=config.resid_p_drop, use_fused_kernel=config.oscar_use_fused_kernel,
                )

                if config.low_memory_mode and train:
                    block_fn_fwd = recompute_grad(block_fn_fwd, use_entire_scope=True)
                h = block_fn_fwd(h)

        h = normal_1d_conv_block(h, 1, "output", config.oscar_use_fp16, dilation=1)

        mask = tf.expand_dims(tf.sequence_mask(pool_idx, maxlen=tf.shape(h)[1], dtype=h.dtype), -1)

        if config.oscar_feat_mode == "clf_tok":
            clf_h = tf.gather_nd(h, tf.stack([tf.range(shape_list(h)[0]), pool_idx], 1))
        elif config.oscar_feat_mode == "mean_tok":
            clf_h = tf.reduce_sum(h * mask, 1) / tf.reduce_sum(mask, 1)
        elif config.oscar_feat_mode == "max_tok":
            clf_h = tf.reduce_max(h - (1e5 * (1.0 - mask)), 1)
        else:
            raise ValueError("config.feat_mode should be one of clf_tok, mean_tok or max_tok")

        if len(initial_shape) != 3:
            seq_feats = tf.reshape(h, shape=initial_shape[:-1] + [config.n_embed])
        else:
            seq_feats = h

        return {
            'embed_weights': embed_weights,
            'features': cast_maybe(clf_h, tf.float32),
            'sequence_features': seq_feats,
            'eos_idx': pool_idx,
            'encoded_input': X[:, :tf.reduce_min(pool_idx), 0],
            'lengths': lengths_from_eos_idx(eos_idx=pool_idx, max_length=shape_list(X)[1])
        }
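The three oscar_feat_mode options reduce the sequence output to a single vector in different ways: take the hidden state at the classify token, average the states before it, or take an element-wise max over them. A toy sketch of the three reductions (values are illustrative only; the mean divides by the number of unmasked positions, matching the corrected line above):

import tensorflow as tf

h = tf.constant([[[1.0], [3.0], [5.0], [7.0]]])    # [batch=1, seq_len=4, n_embed=1]
pool_idx = tf.constant([2])                        # classify/eos token position
mask = tf.expand_dims(tf.sequence_mask(pool_idx, maxlen=4, dtype=h.dtype), -1)

clf_tok = tf.gather_nd(h, tf.stack([tf.range(1), pool_idx], 1))   # h[:, 2]       -> [[5.0]]
mean_tok = tf.reduce_sum(h * mask, 1) / tf.reduce_sum(mask, 1)    # mean of 1, 3  -> [[2.0]]
max_tok = tf.reduce_max(h - (1e5 * (1.0 - mask)), 1)              # max of 1, 3   -> [[3.0]]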
Example #4
def tcn_featurizer(X, encoder, config, train=False, reuse=None, **kwargs):
    """
    The featurizer element of the finetuning model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Whether to reuse variables within this scope.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: the output of the featurizer's final state.
        sequence_features: the output of the featurizer at each timestep.
        eos_idx: the position of the classify token in each sequence.
        lengths: the length of each sequence.
    """
    initial_shape = tf.shape(X)
    X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-2:]), 0))

    with tf.variable_scope("model/featurizer", reuse=reuse):
        embed_weights = tf.get_variable(
            name="we",
            shape=[
                encoder.vocab_size + config.max_length,
                config.n_embed_featurizer
            ],
            initializer=tf.random_normal_initializer(
                stddev=config.weight_stddev),
        )

        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        X = tf.reshape(X, [-1, config.max_length, 2])

        # we remove positional embeddings from the model
        h = embed(X[:, :, :1], embed_weights)

        # keep track of the classify token
        clf_token = encoder["_classify_"]

        with tf.variable_scope("tcn_stack"):
            representation = h
            for layer_num in range(config.n_layer):
                representation = TemporalBlock(
                    n_filters=config.n_filter,
                    kernel_size=config.kernel_size,
                    rate=config.resid_p_drop if train else 0,
                    dilation_rate=2**layer_num,
                    scope="Temporal{}".format(layer_num),
                )(representation)

        seq_feats = tf.reshape(representation,
                               shape=[-1, config.max_length, config.n_filter])

        # mask out the values past the classify token before performing pooling
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32,
        )

        # mask out positions at and beyond the classify token (i.e. make those values extremely negative)
        mask = tf.expand_dims(
            1.0 - tf.sequence_mask(pool_idx,
                                   maxlen=tf.shape(representation)[1],
                                   dtype=tf.float32),
            -1,
        )
        pool = tf.reduce_max(representation + mask * -1e9, 1)
        clf_h = pool
        clf_h = tf.reshape(clf_h,
                           shape=tf.concat(
                               (initial_shape[:-2], [config.n_filter]), 0))

        # note that, due to convolution and pooling, the dimensionality of the features is much smaller than in the
        # transformer base models

        lengths = lengths_from_eos_idx(eos_idx=pool_idx,
                                       max_length=config.max_length)
        return {
            "embed_weights": embed_weights,
            "features":
            clf_h,  # [batch_size, n_embed] for classify, [batch_size, 1, n_embed] for comparison, etc.
            "sequence_features": seq_feats,  # [batch_size, seq_len, n_embed]
            "eos_idx": pool_idx,  # [batch_size]
            "lengths": lengths
        }
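The -1e9 additive mask above is a common way to restrict a max-pool to the tokens before the classify token: masked positions are pushed to a very large negative value so tf.reduce_max never selects them. A toy check of that trick (shapes and values are illustrative only):

import tensorflow as tf

representation = tf.constant([[[1.0], [5.0], [9.0]]])    # [batch=1, seq_len=3, n_filter=1]
pool_idx = tf.constant([2])                              # classify token at position 2
mask = tf.expand_dims(
    1.0 - tf.sequence_mask(pool_idx, maxlen=3, dtype=tf.float32), -1)
pool = tf.reduce_max(representation + mask * -1e9, 1)    # -> [[5.0]]; 9.0 is masked out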
Example #5
def gpt_featurizer(X,
                   encoder,
                   config,
                   train=False,
                   reuse=None,
                   explain=False,
                   **kwargs):
    """
    The transformer element of the finetuning model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Whether to reuse variables within this scope.
    :param explain: If this flag is true, explain tokens are added to the input and per-token explanation features are returned as explain_out.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: the output of the featurizer's final state.
        sequence_features: the output of the featurizer at each timestep.
        eos_idx: the position of the end-of-sequence token in each sequence.
        lengths: the length of each sequence.
        attention_weights: attention weights from the final transformer layer.
    """
    initial_shape = tf.shape(X)
    X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-2:]), 0))
    sequence_length = tf.shape(X)[1]

    with tf.variable_scope("model/featurizer", reuse=reuse):
        embed_weights = tf.get_variable(
            name="we",
            shape=[encoder.vocab_size + config.max_length, config.n_embed],
            initializer=tf.random_normal_initializer(
                stddev=config.weight_stddev),
        )
        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        # X = tf.reshape(X, [-1, config.max_length, 2])

        clf_token = encoder.end_token
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32)

        if explain:
            X = add_explain_tokens(X, sequence_length, pool_idx)

        h = embed(X, embed_weights)
        for layer in range(config.n_layer):
            if ((config.n_layer - layer) == config.num_layers_trained
                    and config.num_layers_trained != config.n_layer
                    and config.adapter_size is None):
                h = tf.stop_gradient(h)
                train_layer = False
            else:
                train_layer = train

            with tf.variable_scope("h%d_" % layer):
                block_fn = functools.partial(
                    block,
                    n_head=config.n_heads,
                    act_fn=config.act_fn,
                    resid_pdrop=config.resid_p_drop,
                    attn_pdrop=config.attn_p_drop,
                    scope="h%d" % layer,
                    train=train_layer,
                    scale=True,
                    explain=explain,
                    adptr_size=config.adapter_size,
                )
                if config.low_memory_mode and train_layer:
                    block_fn = recompute_grad(block_fn, use_entire_scope=True)
                if layer < config.n_layer - 1:
                    h = block_fn(h)
                else:
                    h_out = block_fn(h)

            # get the attention weights from the last layer
            if layer == config.n_layer - 1:
                with tf.variable_scope("h%d_/h%d/attn" % (layer, layer),
                                       reuse=True):
                    q, k, v = multihead_qkv(h,
                                            n_state=shape_list(h)[-1],
                                            n_head=config.n_heads,
                                            train=train)
                    w = attn_weights(q, k, v, scale=True)

        if explain:
            explain_out = h_out[:, initial_shape[1]:]
            explain_out = tf.reshape(
                explain_out,
                shape=tf.concat((initial_shape[:-1], [config.n_embed]), 0))
            h_out = h_out[:, :initial_shape[1]]

        # Use hidden state at classifier token as input to final proj. + softmax
        clf_h = tf.reshape(h_out,
                           [-1, config.n_embed])  # [batch * seq_len, embed]
        clf_h = tf.gather(
            clf_h,
            tf.range(shape_list(X)[0], dtype=tf.int32) * sequence_length +
            pool_idx,
        )
        clf_h = tf.reshape(clf_h,
                           shape=tf.concat(
                               (initial_shape[:-2], [config.n_embed]), 0))
        seq_feats = tf.reshape(h_out,
                               shape=tf.concat(
                                   (initial_shape[:-1], [config.n_embed]), 0))

        lengths = lengths_from_eos_idx(eos_idx=pool_idx,
                                       max_length=sequence_length)

        out = {
            "embed_weights": embed_weights,
            "features": clf_h,
            "sequence_features": seq_feats,
            "eos_idx": pool_idx,
            "lengths": lengths,
            "attention_weights": w,  # [n_heads, seq_len, seq_len]
        }
        if explain:
            out["explain_out"] = explain_out
        return out
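The last-layer attention weights are recovered by re-entering the layer's scope with reuse=True, which returns the already-created variables instead of defining new ones. A minimal sketch of that scope-reuse pattern in TF1 graph mode (the scope and variable names here are made up):

import tensorflow as tf

with tf.variable_scope("attn_demo"):
    w = tf.get_variable("w", shape=[4, 4])

with tf.variable_scope("attn_demo", reuse=True):
    w_again = tf.get_variable("w", shape=[4, 4])

assert w is w_again  # same underlying variable, no new parameters created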
Example #6
def textcnn_featurizer(X, encoder, config, train=False, reuse=None, **kwargs):
    """
    The text-CNN element of the finetuning model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Whether to reuse variables within this scope.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: the output of the featurizer's final state.
        sequence_features: the output of the featurizer at each timestep.
        eos_idx: the position of the classify token in each sequence.
        lengths: the length of each sequence.
    """
    initial_shape = tf.shape(X)
    X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-2:]), 0))
    sequence_length = tf.shape(X)[1]
    with tf.variable_scope("model/featurizer", reuse=reuse):
        embed_weights = tf.get_variable(
            name="we",
            shape=[
                encoder.vocab_size + config.max_length,
                config.n_embed_featurizer
            ],
            initializer=tf.random_normal_initializer(
                stddev=config.weight_stddev),
        )

        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        # X = tf.reshape(X, [-1, config.max_length, 2])

        # we remove positional embeddings from the model
        h = embed(X[:, :, :1], embed_weights)

        # keep track of the classify token
        clf_token = encoder["_classify_"]

        # mask out the values past the classify token before performing pooling
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32,
        )
        # mask out positions at and beyond the classify token (i.e. make those values extremely negative)
        mask = tf.expand_dims(
            1.0 - tf.sequence_mask(
                pool_idx, maxlen=tf.shape(h)[1], dtype=tf.float32),
            -1,
        )

        # Convolutional layers (several kernel sizes applied to the same input)
        pool_layers = []
        conv_layers = []
        for i, kernel_size in enumerate(config.kernel_sizes):
            conv = tf.layers.conv1d(
                inputs=h,
                filters=config.num_filters_per_size,
                kernel_size=kernel_size,
                padding="same",
                activation=tf.nn.relu,
                name="conv" + str(i),
                kernel_initializer=tf.initializers.glorot_normal,
            )
            conv_layers.append(conv)
            pool = tf.reduce_max(conv + mask * -1e9, 1)
            pool_layers.append(pool)

        # Concat the output of the convolutional layers for use in sequence embedding
        conv_seq = tf.concat(conv_layers, axis=2)
        seq_feats = tf.reshape(conv_seq,
                               shape=[-1, sequence_length, config.n_embed])

        # Concatenate the univariate vectors as features for classification
        clf_h = tf.concat(pool_layers, axis=1)
        clf_h = tf.reshape(clf_h,
                           shape=tf.concat(
                               (initial_shape[:-2], [config.n_embed]), 0))

        # note that, due to convolution and pooling, the dimensionality of the features is much smaller than in the
        # transformer base models
        lengths = lengths_from_eos_idx(eos_idx=pool_idx,
                                       max_length=sequence_length)

        return {
            "embed_weights": embed_weights,
            "features":
            clf_h,  # [batch_size, n_embed] for classify, [batch_size, 1, n_embed] for comparison, etc.
            "sequence_features": seq_feats,  # [batch_size, seq_len, n_embed]
            "eos_idx": pool_idx,  # [batch_size]
            "lengths": lengths
        }
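The text-CNN block above convolves the same embedded input with several kernel widths, max-pools each feature map, and concatenates the results into one classification vector. A condensed sketch of that pattern with made-up sizes (and without the classify-token mask):

import tensorflow as tf

h = tf.random_normal([2, 10, 8])                     # [batch, seq_len, embed]
pool_layers = []
for i, kernel_size in enumerate([2, 3, 4]):
    conv = tf.layers.conv1d(
        inputs=h,
        filters=6,
        kernel_size=kernel_size,
        padding="same",
        activation=tf.nn.relu,
        name="demo_conv" + str(i),
    )
    pool_layers.append(tf.reduce_max(conv, 1))       # [batch, 6] per kernel size
clf_h = tf.concat(pool_layers, axis=1)               # [batch, 18]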