def gpt2_featurizer(X, encoder, config, train=False, reuse=None, **kwargs):
    """
    The transformer element of the finetuning model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Whether to reuse variables within this scope.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: The output of the featurizer's final state.
        sequence_features: The output of the featurizer at each timestep.
    """
    initial_shape = tf.shape(X)
    X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-2:]), 0))
    X.set_shape([None, None, None])

    with tf.variable_scope("model/featurizer", reuse=reuse):
        embed_weights = tf.get_variable(
            name="we",
            shape=[encoder.vocab_size + config.max_length, config.n_embed],
            initializer=tf.random_normal_initializer(stddev=config.weight_stddev),
        )
        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        X = tf.reshape(X, [-1, config.max_length, 2])
        h = embed(X, embed_weights)

        # Transformer
        pasts = [None] * config.n_layer
        for layer, past in enumerate(pasts):
            if (
                (config.n_layer - layer) == config.num_layers_trained
                and config.num_layers_trained != config.n_layer
                and config.adapter_size is None
            ):
                h = tf.stop_gradient(h)
                train_layer = False
            else:
                train_layer = train

            with tf.variable_scope("h%d" % layer):
                block_fn = functools.partial(block, past=past, hparams=config, train=train)
                if config.low_memory_mode and train_layer:
                    block_fn = recompute_grad(block_fn, use_entire_scope=True)
                h = block_fn(h)

        h = norm(h, "ln_f")

        # Use hidden state at classifier token as input to final proj. + softmax
        clf_h = tf.reshape(h, [-1, config.n_embed])  # [batch * seq_len, embed]
        clf_token = encoder["_classify_"]
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1), tf.int32
        )
        clf_h = tf.gather(
            clf_h,
            tf.range(shape_list(X)[0], dtype=tf.int32) * config.max_length + pool_idx,
        )
        clf_h = tf.reshape(
            clf_h, shape=tf.concat((initial_shape[:-2], [config.n_embed]), 0)
        )
        seq_feats = tf.reshape(
            h, shape=tf.concat((initial_shape[:-1], [config.n_embed]), 0)
        )

        lengths = lengths_from_eos_idx(eos_idx=pool_idx, max_length=shape_list(X)[0])

        return {
            "embed_weights": embed_weights,
            "features": clf_h,
            "sequence_features": seq_feats,
            "eos_idx": pool_idx,
            "lengths": lengths,
        }
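
# Illustrative sketch (not part of the library): the tf.gather above selects the hidden
# state at the classify token by flattening h to [batch * max_length, n_embed] and
# indexing row b * max_length + pool_idx[b]. The NumPy toy below mirrors that indexing;
# all shapes and values are hypothetical.
def _sketch_classify_token_gather():
    import numpy as np

    batch, max_length, n_embed = 2, 4, 3
    h = np.arange(batch * max_length * n_embed, dtype=np.float32).reshape(batch, max_length, n_embed)
    pool_idx = np.array([2, 1])  # position of the classify token in each example

    flat = h.reshape(-1, n_embed)  # [batch * max_length, n_embed]
    clf_h = flat[np.arange(batch) * max_length + pool_idx]

    # Equivalent to advanced indexing on the unflattened tensor.
    assert np.allclose(clf_h, h[np.arange(batch), pool_idx])
    return clf_h
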

def bert_featurizer(X, encoder, config, train=False, reuse=None, **kwargs):
    """
    The transformer element of the finetuning model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Whether to reuse variables within this scope.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: The output of the featurizer's final state.
        sequence_features: The output of the featurizer at each timestep.
    """
    is_roberta = RoBERTaEncoder == config.base_model.encoder

    bert_config = BertConfig(
        vocab_size=encoder.vocab_size,
        hidden_size=config.n_embed,
        num_hidden_layers=config.n_layer,
        num_attention_heads=config.n_heads,
        intermediate_size=config.bert_intermediate_size,
        hidden_act=config.act_fn,
        hidden_dropout_prob=config.resid_p_drop,
        attention_probs_dropout_prob=config.attn_p_drop,
        max_position_embeddings=config.max_length,
        type_vocab_size=2,
        initializer_range=config.weight_stddev,
        adapter_size=config.adapter_size,
        low_memory_mode=config.low_memory_mode,
    )

    initial_shape = tf.shape(X)
    X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-2:]), 0))
    X.set_shape([None, None, None])

    # To fit the interface of finetune we compute the mask and type ids at runtime.
    input_ids = X[:, :, 0]  # slice off pos-embed ids.
    delimiters = tf.cast(tf.equal(input_ids, encoder.delimiter_token), tf.int32)
    token_type_ids = tf.cumsum(delimiters, exclusive=True, axis=1)

    seq_length = tf.shape(delimiters)[1]

    eos_idx = tf.argmax(
        tf.cast(delimiters, tf.float32)
        * tf.expand_dims(tf.range(tf.cast(seq_length, tf.float32), dtype=tf.float32), 0),
        axis=1,
    )

    lengths = lengths_from_eos_idx(eos_idx=eos_idx, max_length=seq_length)

    if is_roberta:
        # RoBERTa's embeddings include an unused <MASK> token, so the embedding
        # layer size needs to accommodate it.
        bert_config.vocab_size += 1
        # In our use case (padding token has index 1), RoBERTa's position indexes begin at 2,
        # so our position embeddings come from indices 2:514.
        bert_config.max_position_embeddings += 2

    mask = tf.sequence_mask(lengths, maxlen=seq_length, dtype=tf.float32)

    if config.num_layers_trained not in [config.n_layer, 0]:
        raise ValueError(
            "Bert base model does not support num_layers_trained not equal to 0 or n_layer"
        )

    with tf.variable_scope("model/featurizer", reuse=reuse):
        bert = BertModel(
            config=bert_config,
            is_training=train,
            input_ids=input_ids,
            input_mask=mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=False,
            scope=None,
            use_pooler=config.bert_use_pooler,
            use_token_type=config.bert_use_type_embed,
            roberta=is_roberta,
        )

        embed_weights = bert.get_embedding_table()
        features = tf.reshape(
            bert.get_pooled_output(),
            shape=tf.concat((initial_shape[:-2], [config.n_embed]), 0),
        )
        sequence_features = tf.reshape(
            bert.get_sequence_output(),
            shape=tf.concat((initial_shape[:-1], [config.n_embed]), 0),
        )

        output_state = {
            "embed_weights": embed_weights,
            "features": features,
            "sequence_features": sequence_features,
            "lengths": lengths,
            "eos_idx": eos_idx,
        }

        if config.num_layers_trained == 0:
            output_state = {k: tf.stop_gradient(v) for k, v in output_state.items()}

        return output_state
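
# Illustrative sketch (not part of the library): token_type_ids above come from an
# exclusive cumulative sum over delimiter positions, and eos_idx is the index of the
# last delimiter (argmax of delimiter * position). NumPy mirror with hypothetical ids;
# the delimiter id 3 is arbitrary.
def _sketch_bert_runtime_inputs():
    import numpy as np

    delimiter_token = 3
    input_ids = np.array([[5, 7, 3, 9, 3, 0],
                          [4, 3, 8, 3, 0, 0]])
    delimiters = (input_ids == delimiter_token).astype(np.int32)

    # Exclusive cumsum: a delimiter keeps the type id of the segment it closes,
    # and the tokens after it move to the next segment.
    token_type_ids = np.cumsum(delimiters, axis=1) - delimiters  # [[0 0 0 1 1 2], [0 0 1 1 2 2]]

    positions = np.arange(input_ids.shape[1])
    eos_idx = np.argmax(delimiters * positions, axis=1)          # [4, 3]
    return token_type_ids, eos_idx
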

def featurizer(X, encoder, config, train=False, reuse=None, encoder_state=None, context=None, context_dim=None, **kwargs):
    """
    The main element of the OSCAR model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Whether to reuse variables within this scope.
    :param encoder_state: Optional state from another featurizer; if provided, its embedding
        matrix is reused and the state is passed through to each block.
    :param context: Unused in this featurizer.
    :param context_dim: Unused in this featurizer.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: The output of the featurizer's final state.
        sequence_features: The output of the featurizer at each timestep.
    """
    initial_shape = [a or -1 for a in X.get_shape().as_list()]
    if len(initial_shape) != 3:
        X = tf.reshape(X, shape=[-1] + initial_shape[-2:])

    x_shape = tf.shape(X)

    with tf.variable_scope('model/featurizer', reuse=reuse):
        encoder._lazy_init()
        clf_token = encoder.end_token
        pool_idx = tf.cast(tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1), tf.int32)

        if encoder_state is None:
            embed_weights = tf.get_variable(
                "we",
                [encoder.vocab_size + config.max_length, config.n_embed],
                initializer=tf.random_normal_initializer(stddev=config.weight_stddev),
            )
        else:
            embed_weights = encoder_state["embed_weights"]

        if config.oscar_use_fp16:
            embed_weights = tf.cast(embed_weights, tf.float16)

        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        X = tf.reshape(X, [-1, x_shape[1], 2])

        if config.oscar_use_timing:
            h = embed(X, embed_weights)
        else:
            h = embed_no_timing(X, embed_weights)

        for layer in range(config.n_layer):
            with tf.variable_scope('h%d_' % layer):
                if (
                    (config.n_layer - layer) == config.num_layers_trained
                    and config.num_layers_trained != config.n_layer
                ):
                    h = tf.stop_gradient(h)

                block_fn_fwd = functools.partial(
                    block,
                    block_name='block%d_' % layer,
                    use_fp16=config.oscar_use_fp16,
                    pool_idx=None,
                    encoder_state=encoder_state,
                    train=train,
                    pdrop=config.resid_p_drop,
                    use_fused_kernel=config.oscar_use_fused_kernel,
                )

                if config.low_memory_mode and train:
                    block_fn_fwd = recompute_grad(block_fn_fwd, use_entire_scope=True)
                h = block_fn_fwd(h)

        h = normal_1d_conv_block(h, 1, "output", config.oscar_use_fp16, dilation=1)

        mask = tf.expand_dims(tf.sequence_mask(pool_idx, maxlen=tf.shape(h)[1], dtype=h.dtype), -1)

        if config.oscar_feat_mode == "clf_tok":
            clf_h = tf.gather_nd(h, tf.stack([tf.range(shape_list(h)[0]), pool_idx], 1))
        elif config.oscar_feat_mode == "mean_tok":
            # Mean over the unmasked positions (divide by the number of unmasked tokens).
            clf_h = tf.reduce_sum(h * mask, 1) / tf.reduce_sum(mask, 1)
        elif config.oscar_feat_mode == "max_tok":
            clf_h = tf.reduce_max(h - (1e5 * (1.0 - mask)), 1)
        else:
            raise ValueError("config.oscar_feat_mode should be one of clf_tok, mean_tok or max_tok")

        if len(initial_shape) != 3:
            seq_feats = tf.reshape(h, shape=initial_shape[:-1] + [config.n_embed])
        else:
            seq_feats = h

        return {
            'embed_weights': embed_weights,
            'features': cast_maybe(clf_h, tf.float32),
            'sequence_features': seq_feats,
            'eos_idx': pool_idx,
            'encoded_input': X[:, :tf.reduce_min(pool_idx), 0],
            'lengths': lengths_from_eos_idx(eos_idx=pool_idx, max_length=shape_list(X)[0]),
        }
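
# Illustrative sketch (not part of the library): the three oscar_feat_mode options reduce
# the sequence axis by taking the classify-token state, a masked mean, or a masked max.
# NumPy mirror with toy shapes; all values are hypothetical, and the mask matches
# tf.sequence_mask(pool_idx, ...).
def _sketch_oscar_feat_modes():
    import numpy as np

    batch, seq_len, n_embed = 2, 5, 3
    h = np.random.randn(batch, seq_len, n_embed).astype(np.float32)
    pool_idx = np.array([3, 2])  # position of the end/classify token per example

    # 1.0 for positions before the classify token, 0.0 from it onwards.
    mask = (np.arange(seq_len)[None, :] < pool_idx[:, None]).astype(np.float32)[..., None]

    clf_tok = h[np.arange(batch), pool_idx]        # "clf_tok"
    mean_tok = (h * mask).sum(1) / mask.sum(1)     # "mean_tok": mean over unmasked positions
    max_tok = (h - 1e5 * (1.0 - mask)).max(1)      # "max_tok": masked max
    return clf_tok, mean_tok, max_tok
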

def tcn_featurizer(X, encoder, config, train=False, reuse=None, **kwargs):
    """
    The featurizer element of the finetuning model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Whether to reuse variables within this scope.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: The output of the featurizer's final state.
        sequence_features: The output of the featurizer at each timestep.
    """
    initial_shape = tf.shape(X)
    X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-2:]), 0))

    with tf.variable_scope("model/featurizer", reuse=reuse):
        embed_weights = tf.get_variable(
            name="we",
            shape=[encoder.vocab_size + config.max_length, config.n_embed_featurizer],
            initializer=tf.random_normal_initializer(stddev=config.weight_stddev),
        )
        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        X = tf.reshape(X, [-1, config.max_length, 2])

        # We remove positional embeddings from the model.
        h = embed(X[:, :, :1], embed_weights)

        # Keep track of the classify token.
        clf_token = encoder["_classify_"]

        with tf.variable_scope("tcn_stack"):
            representation = h
            for layer_num in range(config.n_layer):
                representation = TemporalBlock(
                    n_filters=config.n_filter,
                    kernel_size=config.kernel_size,
                    rate=config.resid_p_drop if train else 0,
                    dilation_rate=2 ** layer_num,
                    scope="Temporal{}".format(layer_num),
                )(representation)

        seq_feats = tf.reshape(representation, shape=[-1, config.max_length, config.n_filter])

        # Mask out the values past the classify token before performing pooling.
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32,
        )

        # The mask covers positions past the classify token (i.e. makes those results extremely negative).
        mask = tf.expand_dims(
            1.0 - tf.sequence_mask(pool_idx, maxlen=tf.shape(representation)[1], dtype=tf.float32),
            -1,
        )
        pool = tf.reduce_max(representation + mask * -1e9, 1)
        clf_h = pool
        clf_h = tf.reshape(clf_h, shape=tf.concat((initial_shape[:-2], [config.n_filter]), 0))

        # Note that, due to convolution and pooling, the dimensionality of the features is much
        # smaller than in the transformer base models.
        lengths = lengths_from_eos_idx(eos_idx=pool_idx, max_length=config.max_length)

        return {
            "embed_weights": embed_weights,
            "features": clf_h,  # [batch_size, n_embed] for classify, [batch_size, 1, n_embed] for comparison, etc.
            "sequence_features": seq_feats,  # [batch_size, seq_len, n_embed]
            "eos_idx": pool_idx,  # [batch_size]
            "lengths": lengths,
        }
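
# Illustrative sketch (not part of the library): the dilation rate doubles each layer
# (1, 2, 4, ...), so the receptive field of the stack grows roughly geometrically with
# depth. TemporalBlock's internals are not shown here; convs_per_block is a hypothetical
# knob (standard TCN blocks apply two convolutions per block).
def _sketch_tcn_receptive_field(n_layer, kernel_size, convs_per_block=1):
    field = 1
    for layer_num in range(n_layer):
        field += convs_per_block * (kernel_size - 1) * (2 ** layer_num)
    return field

# e.g. 4 layers of kernel size 3 with one conv per block cover 1 + 2 * (1 + 2 + 4 + 8) = 31 tokens.
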

def gpt_featurizer(X, encoder, config, train=False, reuse=None, explain=False, **kwargs):
    """
    The transformer element of the finetuning model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Whether to reuse variables within this scope.
    :param explain: If this flag is true, explain tokens are appended to the input and an
        explain_out tensor is included in the returned dict.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: The output of the featurizer's final state.
        sequence_features: The output of the featurizer at each timestep.
    """
    initial_shape = tf.shape(X)
    X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-2:]), 0))
    sequence_length = tf.shape(X)[1]

    with tf.variable_scope("model/featurizer", reuse=reuse):
        embed_weights = tf.get_variable(
            name="we",
            shape=[encoder.vocab_size + config.max_length, config.n_embed],
            initializer=tf.random_normal_initializer(stddev=config.weight_stddev),
        )
        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        # X = tf.reshape(X, [-1, config.max_length, 2])

        clf_token = encoder.end_token
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1), tf.int32
        )
        if explain:
            X = add_explain_tokens(X, sequence_length, pool_idx)

        h = embed(X, embed_weights)
        for layer in range(config.n_layer):
            if (
                (config.n_layer - layer) == config.num_layers_trained
                and config.num_layers_trained != config.n_layer
                and config.adapter_size is None
            ):
                h = tf.stop_gradient(h)
                train_layer = False
            else:
                train_layer = train

            with tf.variable_scope("h%d_" % layer):
                block_fn = functools.partial(
                    block,
                    n_head=config.n_heads,
                    act_fn=config.act_fn,
                    resid_pdrop=config.resid_p_drop,
                    attn_pdrop=config.attn_p_drop,
                    scope="h%d" % layer,
                    train=train_layer,
                    scale=True,
                    explain=explain,
                    adptr_size=config.adapter_size,
                )
                if config.low_memory_mode and train_layer:
                    block_fn = recompute_grad(block_fn, use_entire_scope=True)
                if layer < config.n_layer - 1:
                    h = block_fn(h)
                else:
                    h_out = block_fn(h)

            # Get the attention weights from the last layer.
            if layer == config.n_layer - 1:
                with tf.variable_scope("h%d_/h%d/attn" % (layer, layer), reuse=True):
                    q, k, v = multihead_qkv(h, n_state=shape_list(h)[-1], n_head=config.n_heads, train=train)
                    w = attn_weights(q, k, v, scale=True)

        if explain:
            explain_out = h_out[:, initial_shape[1]:]
            explain_out = tf.reshape(
                explain_out, shape=tf.concat((initial_shape[:-1], [config.n_embed]), 0)
            )
            h_out = h_out[:, :initial_shape[1]]

        # Use hidden state at classifier token as input to final proj. + softmax
        clf_h = tf.reshape(h_out, [-1, config.n_embed])  # [batch * seq_len, embed]
        clf_h = tf.gather(
            clf_h,
            tf.range(shape_list(X)[0], dtype=tf.int32) * sequence_length + pool_idx,
        )
        clf_h = tf.reshape(clf_h, shape=tf.concat((initial_shape[:-2], [config.n_embed]), 0))
        seq_feats = tf.reshape(h_out, shape=tf.concat((initial_shape[:-1], [config.n_embed]), 0))

        lengths = lengths_from_eos_idx(eos_idx=pool_idx, max_length=sequence_length)

        out = {
            "embed_weights": embed_weights,
            "features": clf_h,
            "sequence_features": seq_feats,
            "eos_idx": pool_idx,
            "lengths": lengths,
            "attention_weights": w,  # [n_heads, seq_len, seq_len]
        }
        if explain:
            out["explain_out"] = explain_out
        return out
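
# Illustrative sketch (not part of the library): the freezing condition in the loop above
# inserts tf.stop_gradient on the input of the block at index n_layer - num_layers_trained,
# which cuts the gradient path to every block below it while the top num_layers_trained
# blocks still receive weight gradients. The helper below only models that stop_gradient
# rule, not which variables an optimizer is later told to update.
def _sketch_layers_receiving_gradients(n_layer, num_layers_trained, adapter_size=None):
    cut = None
    for layer in range(n_layer):
        if (
            (n_layer - layer) == num_layers_trained
            and num_layers_trained != n_layer
            and adapter_size is None
        ):
            cut = layer  # stop_gradient is applied to this block's input
    if cut is None:
        return list(range(n_layer))  # no cut: every block stays on the gradient path
    return list(range(cut, n_layer))

# _sketch_layers_receiving_gradients(n_layer=12, num_layers_trained=3) -> [9, 10, 11]
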

def textcnn_featurizer(X, encoder, config, train=False, reuse=None, **kwargs):
    """
    The convolutional element of the finetuning model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Whether to reuse variables within this scope.
    :return: A dict containing:
        embed_weights: the word embedding matrix.
        features: The output of the featurizer's final state.
        sequence_features: The output of the featurizer at each timestep.
    """
    initial_shape = tf.shape(X)
    X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-2:]), 0))
    sequence_length = tf.shape(X)[1]

    with tf.variable_scope("model/featurizer", reuse=reuse):
        embed_weights = tf.get_variable(
            name="we",
            shape=[encoder.vocab_size + config.max_length, config.n_embed_featurizer],
            initializer=tf.random_normal_initializer(stddev=config.weight_stddev),
        )
        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        # X = tf.reshape(X, [-1, config.max_length, 2])

        # We remove positional embeddings from the model.
        h = embed(X[:, :, :1], embed_weights)

        # Keep track of the classify token.
        clf_token = encoder["_classify_"]

        # Mask out the values past the classify token before performing pooling.
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32,
        )

        # The mask covers positions past the classify token (i.e. makes those results extremely negative).
        mask = tf.expand_dims(
            1.0 - tf.sequence_mask(pool_idx, maxlen=tf.shape(h)[1], dtype=tf.float32),
            -1,
        )

        # Parallel convolutional layers over the same input, one per kernel size.
        pool_layers = []
        conv_layers = []
        for i, kernel_size in enumerate(config.kernel_sizes):
            conv = tf.layers.conv1d(
                inputs=h,
                filters=config.num_filters_per_size,
                kernel_size=kernel_size,
                padding="same",
                activation=tf.nn.relu,
                name="conv" + str(i),
                kernel_initializer=tf.initializers.glorot_normal,
            )
            conv_layers.append(conv)
            pool = tf.reduce_max(conv + mask * -1e9, 1)
            pool_layers.append(pool)

        # Concat the output of the convolutional layers for use in sequence embedding.
        conv_seq = tf.concat(conv_layers, axis=2)
        seq_feats = tf.reshape(conv_seq, shape=[-1, sequence_length, config.n_embed])

        # Concatenate the pooled vectors as features for classification.
        clf_h = tf.concat(pool_layers, axis=1)
        clf_h = tf.reshape(clf_h, shape=tf.concat((initial_shape[:-2], [config.n_embed]), 0))

        # Note that, due to convolution and pooling, the dimensionality of the features is much
        # smaller than in the transformer base models.
        lengths = lengths_from_eos_idx(eos_idx=pool_idx, max_length=sequence_length)

        return {
            "embed_weights": embed_weights,
            "features": clf_h,  # [batch_size, n_embed] for classify, [batch_size, 1, n_embed] for comparison, etc.
            "sequence_features": seq_feats,  # [batch_size, seq_len, n_embed]
            "eos_idx": pool_idx,  # [batch_size]
            "lengths": lengths,
        }
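
# Illustrative sketch (not part of the library): each kernel size contributes
# num_filters_per_size channels, so the reshapes to config.n_embed above implicitly assume
# n_embed == len(kernel_sizes) * num_filters_per_size. Toy NumPy shape check with
# hypothetical config values.
def _sketch_textcnn_shapes():
    import numpy as np

    kernel_sizes = [2, 3, 4]          # hypothetical config.kernel_sizes
    num_filters_per_size = 8          # hypothetical config.num_filters_per_size
    n_embed = len(kernel_sizes) * num_filters_per_size

    batch, seq_len = 2, 5
    conv_layers = [np.zeros((batch, seq_len, num_filters_per_size)) for _ in kernel_sizes]
    pool_layers = [c.max(axis=1) for c in conv_layers]

    conv_seq = np.concatenate(conv_layers, axis=2)   # [batch, seq_len, n_embed]
    clf_h = np.concatenate(pool_layers, axis=1)      # [batch, n_embed]

    assert conv_seq.shape == (batch, seq_len, n_embed)
    assert clf_h.shape == (batch, n_embed)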