def attn(
    x,
    scope,
    n_state,
    n_head,
    resid_pdrop,
    attn_pdrop,
    train=False,
    scale=False,
    mask=True,
    explain=False,
    lengths=None,
):
    assert n_state % n_head == 0
    with tf.variable_scope(scope):
        q, k, v = multihead_qkv(x, n_state, n_head, train, explain)
        w = attn_weights(q, k, v, scale=scale, mask=mask, explain=explain, lengths=lengths)
        w = dropout(w, attn_pdrop, train)
        a = tf.matmul(w, v)  # weighted sum of the values
        a = merge_heads(a)  # [batch, n_head, seq_len, head_dim] -> [batch, seq_len, n_state]
        a = conv1d(a, "c_proj", n_state, 1, train=train)  # output projection
        a = dropout(a, resid_pdrop, train)
        return a

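
# What follows is a minimal, single-head NumPy sketch (illustrative only, not part of this
# module) of the scaled dot-product attention that `attn` assembles in-graph via
# `multihead_qkv`, `attn_weights`, and `tf.matmul(w, v)`. The function name is hypothetical.
def _reference_scaled_dot_product_attention(q, k, v, causal=True):
    """q, k, v: NumPy arrays of shape [seq_len, head_dim]; returns [seq_len, head_dim]."""
    import numpy as np

    # Scaled attention scores, as produced with scale=True.
    scores = q @ k.T / np.sqrt(q.shape[-1])  # [seq_len, seq_len]
    if causal:
        # Block attention to future positions, mirroring mask=True in `attn_weights`.
        scores = np.where(np.tril(np.ones_like(scores)) > 0, scores, -1e9)
    # Softmax over the key dimension.
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights = weights / weights.sum(axis=-1, keepdims=True)
    return weights @ v  # weighted sum of values
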
def add_auxiliary(context, context_dim, clf_h, seq_feats, config, train):
    """Embeds auxiliary context and concatenates it onto the pooled and per-timestep features."""
    context.set_shape([None, config.max_length, context_dim])
    context_embed_weights = tf.get_variable(
        name="ce",
        shape=[context_dim, config.n_context_embed],
        initializer=tf.random_normal_initializer(stddev=config.weight_stddev),
    )
    context_weighted_avg = tf.get_variable(
        name="cwa",
        shape=[context_dim],
        initializer=tf.random_normal_initializer(stddev=config.weight_stddev),
    )
    if config.train_embeddings:
        context_embed_weights = dropout(context_embed_weights, config.embed_p_drop, train)
        context_weighted_avg = dropout(context_weighted_avg, config.embed_p_drop, train)
    else:
        context_embed_weights = tf.stop_gradient(context_embed_weights)

    with tf.variable_scope("context_embedding"):
        # [batch_size, seq_length, context_dim] * [context_dim] -> [batch_size, seq_length, context_dim],
        # i.e. each context feature is rescaled by its learned weight.
        weighted_C = tf.multiply(context, context_weighted_avg)
        # [batch_size, seq_length, context_dim] x [context_dim, n_context_embed]
        # -> [batch_size, seq_length, n_context_embed]
        c_embed = tf.tensordot(weighted_C, context_embed_weights, axes=[[2], [0]])
        c_embed = norm(c_embed, tf.get_variable_scope())
        seq_feats = tf.concat([seq_feats, c_embed], axis=2)
        c_embed = tf.reduce_mean(c_embed, axis=1)
        clf_h = tf.concat([clf_h, c_embed], axis=1)
    return clf_h, seq_feats

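
# A tiny NumPy shape check (illustrative only, not part of this module): the tensordot in
# `add_auxiliary` contracts the context_dim axis, i.e. it acts as a per-timestep matmul.
# The function name and the concrete dimensions below are made up for the example.
def _reference_context_projection_shape():
    import numpy as np

    batch_size, seq_length, context_dim, n_context_embed = 2, 5, 3, 4
    weighted_C = np.zeros((batch_size, seq_length, context_dim))
    context_embed_weights = np.zeros((context_dim, n_context_embed))
    c_embed = np.tensordot(weighted_C, context_embed_weights, axes=[[2], [0]])
    assert c_embed.shape == (batch_size, seq_length, n_context_embed)
    return c_embed.shape
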
def mlp(x, scope, n_state, act_fn, resid_pdrop, train=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        act = act_fns[act_fn]
        h = act(conv1d(x, "c_fc", n_state, 1, train=train))
        h2 = conv1d(h, "c_proj", nx, 1, train=train)
        h2 = dropout(h2, resid_pdrop, train)
        return h2

def adapter(X, adapter_size, nx, train=False, hidden_dropout_prob=0.1):
    down_projection = tf.layers.dense(
        X,
        adapter_size,
        activation="sigmoid",
        kernel_initializer=create_initializer(0.001),
    )
    down_projection = dropout(down_projection, hidden_dropout_prob, train)
    up_projection = tf.layers.dense(
        down_projection,
        nx,
        kernel_initializer=create_initializer(0.001),
    )
    return up_projection + X

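
# A NumPy sketch (illustrative only, not part of this module) of the bottleneck-adapter
# pattern used by `adapter` above: project down to a small `adapter_size`, apply a
# nonlinearity, project back up to `nx`, and add a residual connection so the layer starts
# near the identity. The function name and explicit weight arguments are hypothetical.
def _reference_adapter(X, W_down, b_down, W_up, b_up):
    """X: [batch, seq, nx]; W_down: [nx, adapter_size]; W_up: [adapter_size, nx]."""
    import numpy as np

    down = 1.0 / (1.0 + np.exp(-(X @ W_down + b_down)))  # sigmoid, as in activation="sigmoid"
    up = down @ W_up + b_up
    return up + X  # residual connection back to the input
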
def gpt_featurizer(X, encoder, config, train=False, reuse=None, explain=False, **kwargs):
    """
    The transformer element of the finetuning model. Maps from token ids to a dense embedding of the sequence.

    :param X: A tensor of token indexes with shape [batch_size, sequence_length, token_idx]
    :param encoder: A TextEncoder object.
    :param config: A config object, containing all parameters for the featurizer.
    :param train: If this flag is true, dropout and losses are added to the graph.
    :param reuse: Should reuse be set within this scope.
    :return: A dict containing:
        embed_weights: The word embedding matrix.
        features: The output of the featurizer's final state.
        sequence_features: The output of the featurizer at each timestep.
    """
    initial_shape = tf.shape(X)
    X = tf.reshape(X, shape=tf.concat(([-1], initial_shape[-2:]), 0))
    sequence_length = tf.shape(X)[1]

    with tf.variable_scope("model/featurizer", reuse=reuse):
        embed_weights = tf.get_variable(
            name="we",
            shape=[encoder.vocab_size + config.max_length, config.n_embed],
            initializer=tf.random_normal_initializer(stddev=config.weight_stddev),
        )
        if config.train_embeddings:
            embed_weights = dropout(embed_weights, config.embed_p_drop, train)
        else:
            embed_weights = tf.stop_gradient(embed_weights)

        # X = tf.reshape(X, [-1, config.max_length, 2])

        clf_token = encoder.end_token
        pool_idx = tf.cast(
            tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1),
            tf.int32,
        )

        if explain:
            X = add_explain_tokens(X, sequence_length, pool_idx)

        h = embed(X, embed_weights)
        for layer in range(config.n_layer):
            if (
                (config.n_layer - layer) == config.num_layers_trained
                and config.num_layers_trained != config.n_layer
                and config.adapter_size is None
            ):
                h = tf.stop_gradient(h)
                train_layer = False
            else:
                train_layer = train

            with tf.variable_scope("h%d_" % layer):
                block_fn = functools.partial(
                    block,
                    n_head=config.n_heads,
                    act_fn=config.act_fn,
                    resid_pdrop=config.resid_p_drop,
                    attn_pdrop=config.attn_p_drop,
                    scope="h%d" % layer,
                    train=train_layer,
                    scale=True,
                    explain=explain,
                    adptr_size=config.adapter_size,
                )
                if config.low_memory_mode and train_layer:
                    block_fn = recompute_grad(block_fn, use_entire_scope=True)
                if layer < config.n_layer - 1:
                    h = block_fn(h)
                else:
                    h_out = block_fn(h)

            # Get the attention weights from the last layer.
            if layer == config.n_layer - 1:
                with tf.variable_scope("h%d_/h%d/attn" % (layer, layer), reuse=True):
                    q, k, v = multihead_qkv(
                        h, n_state=shape_list(h)[-1], n_head=config.n_heads, train=train
                    )
                    w = attn_weights(q, k, v, scale=True)

        if explain:
            explain_out = h_out[:, initial_shape[1]:]
            explain_out = tf.reshape(
                explain_out, shape=tf.concat((initial_shape[:-1], [config.n_embed]), 0)
            )
            h_out = h_out[:, :initial_shape[1]]

        # Use hidden state at classifier token as input to final proj. + softmax
        clf_h = tf.reshape(h_out, [-1, config.n_embed])  # [batch * seq_len, embed]
        clf_h = tf.gather(
            clf_h,
            tf.range(shape_list(X)[0], dtype=tf.int32) * sequence_length + pool_idx,
        )
        clf_h = tf.reshape(
            clf_h, shape=tf.concat((initial_shape[:-2], [config.n_embed]), 0)
        )
        seq_feats = tf.reshape(
            h_out, shape=tf.concat((initial_shape[:-1], [config.n_embed]), 0)
        )

        lengths = lengths_from_eos_idx(eos_idx=pool_idx, max_length=sequence_length)

        out = {
            "embed_weights": embed_weights,
            "features": clf_h,
            "sequence_features": seq_feats,
            "eos_idx": pool_idx,
            "lengths": lengths,
            "attention_weights": w,  # [n_heads, seq_len, seq_len]
        }
        if explain:
            out["explain_out"] = explain_out
        return out
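
# A hypothetical usage sketch (not part of this module): building the featurizer graph for a
# batch of token ids. It assumes `encoder` and `config` objects exposing the attributes
# referenced above (vocab_size, end_token, max_length, n_embed, ...); the placeholder shape
# follows the commented-out reshape to [-1, config.max_length, 2].
#
#     X = tf.placeholder(tf.int32, shape=[None, config.max_length, 2])
#     state = gpt_featurizer(X, encoder=encoder, config=config, train=False)
#     features = state["features"]                    # [batch_size, n_embed]
#     sequence_features = state["sequence_features"]  # [batch_size, max_length, n_embed]
#     attention_weights = state["attention_weights"]  # last layer, [n_heads, seq_len, seq_len]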