def create_positional_emb_2d(self, targets): """Learned 2d positional embedding for images.""" mesh = targets.mesh positional_emb_rows_var = mtf.get_variable( mesh, "positional_emb_rows", mtf.Shape([self.pos_dim, self.model_dim]), initializer=tf.random_normal_initializer(), activation_dtype=self.activation_type) positional_emb_cols_var = mtf.get_variable( mesh, "positional_emb_cols", mtf.Shape([self.pos_dim, self.model_dim]), initializer=tf.random_normal_initializer(), activation_dtype=self.activation_type) targets_position_x = mtf.range(mesh, self.rows_dim, dtype=tf.int32) targets_position_y = mtf.range(mesh, self.cols_dim, dtype=tf.int32) position_x = mtf.broadcast( mtf.gather(positional_emb_rows_var, targets_position_x, self.pos_dim), mtf.Shape([self.rows_dim, self.cols_dim, self.model_dim])) position_y = mtf.broadcast( mtf.gather(positional_emb_cols_var, targets_position_y, self.pos_dim), mtf.Shape([self.rows_dim, self.cols_dim, self.model_dim])) return position_x + position_y
def create_positional_emb_2d(self, targets): """Learned 2d positional embedding for images.""" mesh = targets.mesh positional_emb_rows_var = mtf.get_variable( mesh, "positional_emb_rows", mtf.Shape([self.pos_dim, self.model_dim]), initializer=tf.random_normal_initializer(), activation_dtype=self.activation_type) positional_emb_cols_var = mtf.get_variable( mesh, "positional_emb_cols", mtf.Shape([self.pos_dim, self.model_dim]), initializer=tf.random_normal_initializer(), activation_dtype=self.activation_type) targets_position_x = mtf.range(mesh, self.rows_dim, dtype=tf.int32) targets_position_y = mtf.range(mesh, self.cols_dim, dtype=tf.int32) position_x = mtf.broadcast( mtf.gather(positional_emb_rows_var, targets_position_x, self.pos_dim), mtf.Shape([self.rows_dim, self.cols_dim, self.model_dim])) position_y = mtf.broadcast( mtf.gather(positional_emb_cols_var, targets_position_y, self.pos_dim), mtf.Shape([self.rows_dim, self.cols_dim, self.model_dim])) return position_x + position_y
def get_attn_mask(self, mesh, nd, ns): if not exists(self.attn_mask): i = mtf.range(mesh, nd, tf.int32) + ns.size - nd.size j = mtf.range(mesh, ns, tf.int32) i, j = map(lambda t: mtf.broadcast(t, [nd, ns]), (i, j)) self.attn_mask = mtf.cast(mtf.less( i, j), self.variable_dtype.activation_dtype) * -1e10 return self.attn_mask
def biasmask_attn_weights(mesh, nd, ns, variable_dtype): # The old mask_attn_weights applied directly to the QK; # this returns a bias that the attention code from mtf adds to the attention matrix. # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. # n_src and n_dest are both the same, i.e equal to sequence length # We rename ns because we want bias to have shape [batch, heads, memory_length, sequence] to match up with QK^T # Information flows from k and v (memory_length) to q (sequence) i = mtf.range(mesh, nd, tf.int32) + ns.size - nd.size j = mtf.range(mesh, ns, tf.int32) i, j = map(lambda t: mtf.broadcast(t, [nd, ns]), (i, j)) dtype = variable_dtype.activation_dtype return mtf.cast(mtf.less(i, j), dtype) * -1e10
def forward(self, features, return_loss=True, return_logits=False): inputs = features["tokens"] tokens = self.positional_embedding(self.embedding(inputs, "embedding"), "positional_embedding") mask = self.get_attn_mask(tokens.mesh, tokens.shape[1], self.dimensions["memory_len_dim"]) out = self.transformer(tokens, mask=mask) logits = self.to_logits(out) if not return_loss: return logits labels = pad(inputs, [0, 1], dim_name="total_seq_dim", pad_value=self.eos_token_id) indices = mtf.range(labels.mesh, mtf.Dimension("range", labels.shape[1].size - 1), tf.int32, name="labels_indices") + 1 labels = mtf.gather(labels, indices, dim=labels.shape[1]) labels = mtf.rename_dimension(labels, "range", "total_seq_dim") loss, loss_batch = self._loss(logits, labels) if return_logits and return_loss: # Cast back to checkpoint dtype logits = mtf.cast(logits, self.variable_dtype.master_dtype) return loss, loss_batch, logits return loss, loss_batch
def call(self, context, x, losses=None): """Call the layer.""" memory_length = self.memory_length(context) q = self.compute_q(context, x) if context.mode == "incremental": m = x else: m = mtf.replace_dimensions(x, context.length_dim, memory_length) k = self.compute_k(context, m) v = self.compute_v(context, m) if context.mode == "incremental": one_hot = mtf.one_hot( context.position, memory_length, dtype=context.activation_dtype) inv_one_hot = 1.0 - one_hot old_k, old_v = context.get_states(2) k = old_k * inv_one_hot + k * one_hot v = old_v * inv_one_hot + v * one_hot memory_position = mtf.range(context.mesh, memory_length, tf.int32) else: memory_position = self.rename_length_to_memory_length( context.position, context) if context.mode == "incremental" or context.mode == "first_part": context.record_new_states([k, v]) bias = self.compute_bias(context, memory_position, x, self.softmax_heads_dims, q) return self.attention_internal(context, x, m, q, k, v, memory_length, bias)
def call(self, context, x, losses=None): """Call the layer.""" params = mtf.layers.multihead_attention_params(context.mesh, self.heads_dim, context.model_dim, self.kv_dim, context.variable_dtype) if context.mode == "incremental": prev_k, prev_v = context.get_states(2) y, new_k, new_v = mtf.layers.masked_local_attention_1d_incremental( x, prev_k, prev_v, context.position, params=params) context.record_new_states([new_k, new_v]) return y else: kv = [] y = mtf.layers.masked_local_attention_1d(x, self.kv_dim, self.heads_dim, self.window_size, params=params, return_kv=kv) if context.mode == "first_part": k = kv[0] v = kv[1] window_dim = mtf.Dimension("window", self.window_size) mesh = k.mesh window_pos = mtf.range(mesh, window_dim, tf.int32) pos = mtf.range(mesh, context.length_dim, tf.int32) select_recent = mtf.cast( mtf.equal(window_pos, mtf.mod(pos, self.window_size)), k.dtype) select_recent *= mtf.cast( mtf.less(pos, context.initial_position), k.dtype) select_recent *= mtf.cast( mtf.greater_equal( pos, context.initial_position - self.window_size), k.dtype) state_shape = k.shape.dims[:-2] + [window_dim, self.kv_dim] k_state = mtf.einsum([k, select_recent], output_shape=state_shape, reduced_dims=[context.length_dim]) v_state = mtf.einsum([v, select_recent], output_shape=state_shape, reduced_dims=[context.length_dim]) context.new_states.extend([k_state, v_state]) return y
def attention(x, dim_head, dim_features_head, scope='attn', causal=False): with tf.variable_scope(scope): mesh, batch, seq, dim = x.mesh, *x.shape dim_heads = mtf.Dimension('dim_heads', dim_head.size * dim_features_head.size) dim_intermediate = mtf.Dimension('qkv_dimension', dim_heads.size * 3) qkv = linear(x, dim_intermediate, bias=False, scope='to_qkv') q, k, v = mtf.split(qkv, dim_intermediate, 3) q, k, v = map( lambda t: mtf.reshape(t, [batch, seq, dim_head, dim_features_head] ), (q, k, v)) q, k, v = map( lambda t: mtf.transpose( t, [batch, dim_head, seq, dim_features_head]), (q, k, v)) k, v = map( lambda t: mtf.rename_dimension(t, seq.name, 'memory_length'), (k, v)) mem_len_dim = v.shape[-2] dots = mtf.layers.us_einsum([q, k], [batch, dim_head, seq, mem_len_dim]) if causal: i = mtf.range(mesh, seq, tf.int32) j = mtf.range(mesh, mem_len_dim, tf.int32) i, j = map(lambda t: mtf.broadcast(t, [seq, mem_len_dim]), (i, j)) mask = mtf.less(i + mem_len_dim.size - seq.size, j) mask = mtf.cast(mask, tf.float32) * -1e10 dots += mask attn = mtf.softmax(dots, mem_len_dim) out = mtf.einsum([attn, v], [batch, dim_head, seq, dim_features_head]) out = mtf.transpose(out, [batch, seq, dim_head, dim_features_head]) out = mtf.reshape(out, [batch, seq, dim_heads]) combined_out = linear(out, dim, scope='combine_output') return combined_out
def call(self, context, x, losses=None): """Call the layer.""" params = self.make_params(context) q = params.compute_q(x) memory_length = self.memory_length(context) if context.mode == "incremental": m = x else: m = mtf.replace_dimensions(x, context.length_dim, memory_length) if self.shared_kv: kv = params.compute_kv(m) else: k = params.compute_k(m) v = params.compute_v(m) if context.mode == "incremental": one_hot = mtf.one_hot( context.position, memory_length, dtype=context.activation_dtype) inv_one_hot = 1.0 - one_hot if self.shared_kv: old_kv = context.get_states(1) kv = old_kv * inv_one_hot + kv * one_hot else: old_k, old_v = context.get_states(2) k = old_k * inv_one_hot + k * one_hot v = old_v * inv_one_hot + v * one_hot memory_position = mtf.range(context.mesh, memory_length, tf.int32) else: memory_position = self.rename_length_to_memory_length( context.position, context) if context.mode == "incremental" or context.mode == "first_part": context.record_new_states([kv] if self.shared_kv else [k, v]) if self.shared_kv: k = kv v = kv if self.attention_func == "hybrid": o = attention.hybrid_attention( q, k, v, context, memory_length, self.kv_dim, self.kv_dim, self.compute_bias( context, memory_position, x, params.query_heads_dims), **self.attention_kwargs_from_context(context)) else: o = attention.attention( q, k, v, memory_length, self.kv_dim, self.kv_dim, self.compute_bias( context, memory_position, x, params.query_heads_dims), **self.attention_kwargs_from_context(context)) return params.compute_output(o, output_shape=x.shape)
def test_entmax(): graph = mtf.Graph() mesh = mtf.Mesh(graph, "my_mesh") length = mtf.Dimension("tensor_length", 8) tensor = mtf.range(mesh, length, tf.float32) output = entmax(tensor) grad = mtf.gradients([output], [tensor])[0] sample = sample_categorical(output, length) mesh_impl = placement_mesh_impl.PlacementMeshImpl(shape=[], layout={}, devices=[""]) lowering = mtf.Lowering(graph, {mesh: mesh_impl}) sample = lowering.export_to_tf_tensor(sample) grad = lowering.export_to_tf_tensor(grad)
def call(self, context, x, losses=None): """Call the layer.""" wq, wk, wv, wo = mtf.layers.multihead_attention_params( context.mesh, self.heads_dim, context.model_dim, self.kv_dim, context.variable_dtype) memory_length = mtf.Dimension("memory_length", context.length_dim.size) q = mtf.einsum([x, wq], reduced_dims=[context.model_dim]) if context.mode == "incremental": m = x else: m = mtf.rename_dimension(x, context.length_dim.name, "memory_length") k = mtf.einsum([m, wk], reduced_dims=[context.model_dim]) v = mtf.einsum([m, wv], reduced_dims=[context.model_dim]) if context.mode == "incremental": old_k, old_v = context.get_states(2) one_hot = mtf.one_hot(context.position, memory_length, dtype=context.activation_dtype) inv_one_hot = 1.0 - one_hot k = old_k * inv_one_hot + k * one_hot v = old_v * inv_one_hot + v * one_hot if context.mode == "incremental" or context.mode == "first_part": context.record_new_states([k, v]) masks = [] if context.autoregressive: masks.append( mtf.cast( mtf.less( context.position, mtf.range(context.mesh, memory_length, dtype=tf.int32)), context.activation_dtype) * -1e9) if (context.sequence_id is not None and isinstance(context.sequence_id, mtf.Tensor) and context.length_dim in context.sequence_id.shape): masks.append( mtf.cast( mtf.not_equal( context.sequence_id, mtf.layers.rename_length_to_memory_length( context.sequence_id)), context.activation_dtype) * -1e9) mask = mtf.add_n(masks) if masks else None o = mtf.layers.dot_product_attention_v2( q, k, v, memory_length, self.kv_dim, self.kv_dim, mask, self.dropout_rate if context.train else 0.0, [context.length_dim]) return mtf.einsum([o, wo], x.shape, reduced_dims=[self.heads_dim, self.kv_dim])
def _noisy_targets_from_spec(self, targets, noising_spec, losses=None): if noising_spec["type"] == "mask": # Replace a randomly-chosen noising_spec["prob"] of input tokens with 0. return targets * mtf.cast( mtf.greater(mtf.random_uniform(targets.mesh, targets.shape), noising_spec["prob"]), targets.dtype) elif noising_spec["type"] == "random_zipfian": # Replace a randomly-chosen noising_spec["prob"] of input tokens. # Rather than drawing the replacement tokens uniformly, we sample from # a distribution favoring lower token-ids, assuming that the ids have # been assigned in frequency order. The probability of choosing an # id is proportional to 1/(id+10) logits = mtf.log(1.0 / (mtf.range( targets.mesh, self.targets_vocab_dim, dtype=tf.float32) + 10.0)) logits = mtf.broadcast(logits, new_shape=targets.shape + logits.shape) r = mtf.sample_with_temperature(logits, self.targets_vocab_dim) use_noise = mtf.less( mtf.random_uniform(targets.mesh, targets.shape), noising_spec["prob"]) return mtf.where(use_noise, r, targets) elif noising_spec["type"] == "transformer": # Train a small transformer to fill in masked out values, then # sample from it. hparams = self._hparams if hparams.mode != tf.estimator.ModeKeys.TRAIN: raise NotImplementedError("Not implemented") noiser_hparams = copy.copy(self._hparams) noiser_hparams.del_hparam("mode") noiser_hparams.override_from_dict(noising_spec["overrides"]) with tf.variable_scope("noiser"): noiser = MtfTransformer(noiser_hparams, mode=hparams.mode, problem_hparams=self._problem_hparams) logits, loss = noiser._mtf_model_fn( # pylint: disable=protected-access self._original_features, targets.mesh) samples = mtf.sample_with_temperature(logits, self.targets_vocab_dim) losses.append(loss) return samples else: raise ValueError("unknown noising spec %s" % noising_spec)
def _noisy_targets_from_spec(self, targets, noising_spec, losses=None): if noising_spec["type"] == "mask": # Replace a randomly-chosen noising_spec["prob"] of input tokens with 0. return targets * mtf.cast( mtf.greater(mtf.random_uniform(targets.mesh, targets.shape), noising_spec["prob"]), targets.dtype) elif noising_spec["type"] == "random_zipfian": # Replace a randomly-chosen noising_spec["prob"] of input tokens. # Rather than drawing the replacement tokens uniformly, we sample from # a distribution favoring lower token-ids, assuming that the ids have # been assigned in frequency order. The probability of choosing an # id is proportional to 1/(id+10) logits = mtf.log(1.0 / (mtf.range( targets.mesh, self.targets_vocab_dim, dtype=tf.float32) + 10.0)) logits = mtf.broadcast(logits, new_shape=targets.shape + logits.shape) r = mtf.sample_with_temperature(logits, self.targets_vocab_dim) use_noise = mtf.less( mtf.random_uniform(targets.mesh, targets.shape), noising_spec["prob"]) return mtf.where(use_noise, r, targets) elif noising_spec["type"] == "transformer": # Train a small transformer to fill in masked out values, then # sample from it. hparams = self._hparams if hparams.mode != tf.estimator.ModeKeys.TRAIN: raise NotImplementedError("Not implemented") noiser_hparams = copy.copy(self._hparams) noiser_hparams.del_hparam("mode") noiser_hparams.override_from_dict(noising_spec["overrides"]) with tf.variable_scope("noiser"): noiser = MtfTransformer( noiser_hparams, mode=hparams.mode, problem_hparams=self._problem_hparams) logits, loss = noiser._mtf_model_fn( # pylint: disable=protected-access self._original_features, targets.mesh) samples = mtf.sample_with_temperature(logits, self.targets_vocab_dim) losses.append(loss) return samples else: raise ValueError("unknown noising spec %s" % noising_spec)
def transformer_lm(x, *, dim, num_tokens, depth, max_seq_len, dim_head, dim_features_head, causal=False): mesh, batch, seq_dim = x.mesh, *x.shape dim = mtf.Dimension('dim', dim) dim_head = mtf.Dimension('dim_head', dim_head) dim_features_head = mtf.Dimension('dim_features_head', dim_features_head) dim_num_tokens = mtf.Dimension('vocab_size', num_tokens) dim_max_seq_len = mtf.Dimension('max_seq_len', max_seq_len) wte = mtf.get_variable(mesh, name='wte', shape=mtf.Shape([dim_num_tokens, dim]), dtype=tf.float32) wpe = mtf.get_variable(mesh, name='wpe', shape=mtf.Shape([seq_dim, dim]), dtype=tf.float32) x = mtf.gather(wte, x, dim_num_tokens) p = mtf.gather(wpe, mtf.range(mesh, seq_dim, dtype=tf.int32), dim_max_seq_len) x = x + p x = transformer(x, depth=depth, dim_head=dim_head, dim_features_head=dim_features_head, causal=causal) logits = linear(x, dim_num_tokens, scope='to_logits') return logits
def call(self, context, x, losses=None): """Call the layer.""" memory_length = self.memory_length(context) q = self.compute_q(context, x) if context.mode == "incremental": m = x else: m = mtf.replace_dimensions(x, context.length_dim, memory_length) if context.mode == "incremental": one_hot = mtf.one_hot( context.position, memory_length, dtype=context.activation_dtype) inv_one_hot = 1.0 - one_hot old_m, = context.get_states(1) m = old_m * inv_one_hot + one_hot * m memory_position = mtf.range(context.mesh, memory_length, tf.int32) else: memory_position = self.rename_length_to_memory_length( context.position, context) if context.mode == "incremental" or context.mode == "first_part": context.record_new_states([m]) bias = self.compute_bias(context, memory_position, x, self.heads_dims, q) return self.attention_internal(context, q, m, memory_length, bias)
def positional_embedding(self, x, name): with tf.variable_scope(name): # Positional embedding wpe = mtf.get_variable( x.mesh, "wpe", mtf.Shape([ self.dimensions["embed_seq_dim"], self.dimensions["embed_dim"] ]), initializer=tf.random_normal_initializer(stddev=0.01), master_dtype=self.variable_dtype.master_dtype, slice_dtype=self.variable_dtype.slice_dtype, activation_dtype=self.variable_dtype.activation_dtype) position_indices = mtf.range(x.mesh, self.dimensions["total_seq_dim"], tf.int64) if not \ self.is_incremental_inference else (self.context.position - 1) pos_emb = mtf.gather(wpe, position_indices, wpe.shape[0]) embed_dropout = self.params.get("embed_dropout", 0) if embed_dropout > 0 and self.mode == "train": pos_emb = mtf.dropout(pos_emb, rate=embed_dropout, name="wte_dropout") x += pos_emb return x
def __init__(self, mesh, batch_dims, length_dim, model_dim, variable_dtype, beam_dim=None, mode=tf.estimator.ModeKeys.TRAIN, autoregressive=False, position=None, sequence_id=None, states=None, new_states=None, losses=None, initial_position=None, layer_outputs=None, encoder_output=None, encoder_sequence_id=None, constant_states=None, shared_params=None, layout=None, mesh_shape=None, encoder_layer_outputs=None): """Create a context. Args: mesh: a mtf.Mesh batch_dims: a list of mtf.Dimension length_dim: a mtf.Dimension model_dim: a mtf.Dimension variable_dtype: a mtf.VariableDType beam_dim: an optional mtf.Dimension (present in beam search) mode: either a tf.estimator.ModeKeys or one of the follwing: "first_part" "incremental" autoregressive: a boolean - controls whether attention layers shold mask out the future. position: an optional Tensor - represents position in the sequence sequence_id: an optional int32 Tensor aligned with position - used to separate out different sequences which have been concatenated to form a single training example. Also used to mark padding. Id 0 is used for padding, and different positive values are used for the different sequences. states: an optional list of Tensors representing loop variables (consumed in "incremental" mode) new_states: an optional list of Tensors onto which to append the new values of loop variables. (produced in "first_part" and "incremental" modes) losses: an optional list of Tensors onto which to append losses initial_position: an optional Tensor ("first_part" mode) layer_outputs: an optional list onto which to append layer outputs encoder_output: an optional Tensor (output of the encoder stack) encoder_sequence_id: an optional int32 Tensor (similar to sequence_id) but aligned with the encoder output. constant_states: an optional list of structures produced during "first_part" mode and consumed during "incremental" mode. shared_params: an optional dictionary which can be populated by parameters that are shared between Transformers - e.g. between the encoder and decoder Unitransformers in a Bitransformer. layout: optional - an input to mtf.convert_to_layout_rules Some layers (e.g. MoE layers) cheat by looking at layout and mesh_shape mesh_shape: optional - an input to mtf.convert_to_shape Some layers (e.g. MoE layers) cheat by looking at layout and mesh_shape encoder_layer_outputs: optional - readonly list of tensor activations when decoding, one per each input layer + the embedding layer """ self.mesh = mesh self.batch_dims = batch_dims self.length_dim = length_dim self.variable_dtype = variable_dtype self.beam_dim = beam_dim self.model_dim = model_dim self.mode = mode self.autoregressive = autoregressive if position is None: self.position_is_default = True self.position = mtf.range(mesh, length_dim, dtype=tf.int32) else: self.position_is_default = False self.position = position self.sequence_id = sequence_id self.states = states self.new_states = new_states self.losses = losses self.initial_position = initial_position self.layer_outputs = layer_outputs self.encoder_output = encoder_output self.encoder_sequence_id = encoder_sequence_id self.constant_states = constant_states self.next_constant_state = 0 self.shared_params = shared_params or {} self.layout = layout self.mesh_shape = mesh_shape self.layer_index = 0 self.encoder_layer_outputs = encoder_layer_outputs
def call(self, context, x, losses=None): """Call the layer.""" params = self.make_params(context) if self.share_qk_rep: q, k = params.mdha_shared_qk(x, context) else: q = params.mdha_q(x, context) memory_length = self.memory_length(context) if context.mode == "incremental": m = x else: if self.share_qk_rep: k = mtf.replace_dimensions(k, context.length_dim, memory_length) m = mtf.replace_dimensions(x, context.length_dim, memory_length) if self.shared_kv: kv = params.compute_kv(m) else: if not self.share_qk_rep: k = params.mdha_k(m, context) v = params.mdha_v(m, context) if context.mode == "incremental": one_hot = mtf.one_hot(context.position, memory_length, dtype=context.activation_dtype) inv_one_hot = 1.0 - one_hot if self.shared_kv: old_kv = context.get_states(1) kv = old_kv * inv_one_hot + kv * one_hot else: old_k, old_v = context.get_states(2) k = old_k * inv_one_hot + k * one_hot v = old_v * inv_one_hot + v * one_hot memory_position = mtf.range(context.mesh, memory_length, tf.int32) else: memory_position = self.rename_length_to_memory_length( context.position, context) if context.mode == "incremental" or context.mode == "first_part": context.record_new_states([kv] if self.shared_kv else [k, v]) if self.shared_kv: k = kv v = kv o = self.attention_fn(q, k, v, context=context, memory_length_dim=memory_length, key_dim=self.kv_dim, value_dim=self.kv_dim, bias=self.compute_bias(context, memory_position, x, params.query_heads_dims, q), **self.attention_kwargs_from_context(context)) attention_output_shape = self.expected_attention_output_shape( x, params) attention_output = params.compute_output( o, output_shape=attention_output_shape) return self.layer_output_from_attention_output(context, attention_output, losses)
def __init__(self, config, is_training, input_ids, input_mask=None, token_type_ids=None, scope=None, mesh_shape="", layout=""): self.config = copy.deepcopy(config) del config if not is_training: self.config.layer_output_dropout_prob = 0.0 self.config.attention_probs_dropout_prob = 0.0 self.config.feedforward_intermediate_dropout_prob = 0.0 input_shape = input_ids.shape assert input_shape.ndims == 2 self._seq_dim = input_shape.dims[1] self._memory_seq_dim = mtf.Dimension("memory_seq", self.seq_dim.size) self._extra_losses = [] mesh = input_ids.mesh if token_type_ids is None: token_type_ids = mtf.zeros(mesh, input_shape, dtype=tf.int32) with tf.variable_scope(scope, default_name="bert"): with tf.variable_scope("embeddings"): # Perform embedding lookup on the word ids. self.embedding_table = mtf.get_variable( mesh, "word_embeddings", mtf.Shape([self.vocab_dim, self.model_dim]), initializer=self.embedding_initializer) self.word_embedding_output = mtf.gather( self.embedding_table, input_ids, self.vocab_dim) # Add positional embeddings and token type embeddings, then layer # normalize and perform dropout. self.embedding_output = self.word_embedding_output token_type_table = mtf.get_variable( mesh, "token_type_embeddings", mtf.Shape([self.token_type_vocab_dim, self.model_dim]), initializer=self.embedding_initializer) if token_type_ids is not None: self.embedding_output += mtf.gather( token_type_table, token_type_ids, self.token_type_vocab_dim) if self.config.position_signal == "embedding": full_position_table = mtf.get_variable( mesh, "position_embeddings", mtf.Shape([self.max_position_embeddings_dim, self.model_dim]), initializer=self.embedding_initializer) short_position_table = mtf.rename_dimension( mtf.slice(full_position_table, 0, self.seq_dim.size, self.max_position_embeddings_dim.name), self.max_position_embeddings_dim.name, self.seq_dim.name) self.embedding_output += short_position_table self.embedding_output = self.normalize(self.embedding_output) self.embedding_output = mtf.dropout( self.embedding_output, is_training, keep_prob=1.0 - self.config.layer_output_dropout_prob) with tf.variable_scope("encoder"): attention_biases = [] if input_mask: # [batch_dim, memory_seq_dim] attention_biases.append( (1.0 - mtf.to_float(mtf.replace_dimensions( input_mask, self.seq_dim, self.memory_seq_dim))) * -10000.0) if self.config.position_signal == "relative_attention_bias": buckets_dim = mtf.Dimension("buckets", 32) rp_bucket = _relative_position_bucket( mtf.range(mesh, self.memory_seq_dim, tf.int32) - mtf.range(mesh, self.seq_dim, tf.int32), num_buckets=buckets_dim.size) bias_var = mtf.get_variable( mesh, "relative_attention_bias", [self.num_heads_dim, buckets_dim], initializer=tf.zeros_initializer()) attention_biases.append(mtf.gather(bias_var, rp_bucket, buckets_dim)) attention_bias = mtf.add_n(attention_biases) prev_layer_output = self.embedding_output self.all_encoder_layers = [] for block_num in range(self.config.num_blocks): with tf.variable_scope("block_%d" % block_num): for layer_idx, layer_type in enumerate(self.config.block_layers): layer_name = layer_type count = self.config.block_layers[:layer_idx].count(layer_type) if count: layer_name += "_%d" % count with tf.variable_scope(layer_name): x = prev_layer_output if self.config.residual_structure == "direct": x = self.normalize(x) if layer_type == "attention": x = self.self_attention(x, attention_bias) elif layer_type == "feedforward": x = self.feedforward(x) elif layer_type == "moe": x = self.moe(x, layout, mesh_shape, input_mask, is_training) else: raise ValueError("unknown layer type " + layer_type) x = mtf.dropout( x, is_training, keep_prob=1.0 - self.config.layer_output_dropout_prob) layer_output = prev_layer_output + x if self.config.residual_structure == "original": layer_output = self.normalize(layer_output) prev_layer_output = layer_output self.all_encoder_layers.append(layer_output) self.sequence_output = prev_layer_output if self.config.residual_structure == "direct": self.sequence_output = self.normalize(self.sequence_output) # The "pooler" converts the encoded sequence tensor of shape # [batch_dim, seq_dim, hidden_size] to a tensor of shape # [batch_dim, hidden_size]. This is necessary for segment-level # (or segment-pair-level) classification tasks where we need a fixed # dimensional representation of the segment. with tf.variable_scope("pooler"): # We "pool" the model by simply taking the hidden state corresponding # to the first token. We assume that this has been pre-trained first_token_tensor = mtf.gather(self.sequence_output, 0, self.seq_dim) self.pooled_output = mtf.layers.dense( first_token_tensor, reduced_dims=[self.model_dim], new_dims=[self.model_dim], activation=mtf.tanh, kernel_initializer=self.dense_initializer, use_bias=self.config.use_bias)
def _mtf_model_fn(self, features, mesh): features = copy.copy(features) hparams = self._hparams targets = tf.to_int32(features["targets"]) if len(targets.get_shape()) > 2: tf.logging.info("targets = %s" % targets) targets = tf.squeeze(targets, [2, 3]) # pad targets to max_length def pad_to_max_length(x): extra_length = hparams.max_length - tf.shape(x)[1] x = tf.pad(x, [[0, 0], [0, extra_length]]) x = tf.reshape(x, [hparams.batch_size, hparams.max_length]) return x targets = pad_to_max_length(targets) for key in [ "targets_segmentation", "targets_position", "inputs_segmentation", "inputs_position" ]: if key in features: features[key] = pad_to_max_length(features[key]) shifted_targets = common_layers.shift_right_2d(targets) targets = self._import_to_batch_by_length(targets, "targets", mesh, hparams) shifted_targets = self._import_to_batch_by_length( shifted_targets, "shifted_targets", mesh, hparams) if "targets_segmentation" in features: # "Packed" dataset - keep the examples from seeing each other. targets_segmentation = self._import_to_batch_by_length( features["targets_segmentation"], "targets_segmentation", mesh, hparams) targets_position = self._import_to_batch_by_length( features["targets_position"], "targets_position", mesh, hparams) decoder_self_attention_mask = ( mtf.layers.attention_mask_autoregressive( targets_position, dtype=self.activation_dtype) + mtf.layers.attention_mask_same_segment( targets_segmentation, dtype=self.activation_dtype)) else: targets_position = mtf.range(mesh, self.length_dim, dtype=tf.int32) decoder_self_attention_mask = mtf.layers.attention_mask_autoregressive( targets_position, dtype=self.activation_dtype) def layer_prepostprocess_dropout(x): return mtf.dropout( x, keep_prob=1.0 - hparams.layer_prepostprocess_dropout, noise_shape=mtf.Shape(self.batch_dims + [self.model_dim])) extra_losses = [] (inputs_embedding_var, targets_embedding_var, softmax_var, positional_embedding_var) = self._embedding_and_softmax_vars(mesh) if hparams.transformer_type == "decoder": encoder_output = None encoder_decoder_attention_mask = None else: inputs = tf.squeeze(tf.to_int32(features["inputs"]), [2, 3]) inputs = pad_to_max_length(inputs) inputs = self._import_to_batch_by_length(inputs, "inputs", mesh, hparams) if "inputs_segmentation" in features: # "Packed" dataset - keep the examples from seeing each other. inputs_segmentation = self._import_to_batch_by_length( features["inputs_segmentation"], "inputs_segmentation", mesh, hparams) inputs_position = self._import_to_batch_by_length( features["inputs_position"], "inputs_position", mesh, hparams) encoder_self_attention_mask = ( mtf.layers.attention_mask_same_segment( inputs_segmentation, dtype=self.activation_dtype)) else: inputs_position = mtf.range(mesh, self.length_dim, dtype=tf.int32) encoder_self_attention_mask = ( mtf.layers.attention_mask_ignore_padding( inputs, dtype=self.activation_dtype)) x = (mtf.gather(inputs_embedding_var, inputs, self.inputs_vocab_dim) + mtf.gather(positional_embedding_var, inputs_position, self.max_length_dim)) x = layer_prepostprocess_dropout(x) with tf.variable_scope("encoder"): x = self._layer_stack( x, hparams.encoder_layers, self_attention_mask=encoder_self_attention_mask, losses=extra_losses) if hparams.transformer_type == "encdec": if "inputs_segmentation" in features: encoder_decoder_attention_mask = ( mtf.layers.attention_mask_same_segment( targets_segmentation, inputs_segmentation, dtype=self.activation_dtype)) else: encoder_decoder_attention_mask = encoder_self_attention_mask encoder_output = mtf.rename_dimension(x, self.length_dim.name, self.memory_length_dim.name) if hparams.transformer_type != "encoder": # DECODER x = (mtf.gather(targets_embedding_var, shifted_targets, self.targets_vocab_dim) + mtf.gather(positional_embedding_var, targets_position, self.max_length_dim)) x = layer_prepostprocess_dropout(x) with tf.variable_scope("decoder"): x = self._layer_stack( x, hparams.decoder_layers, encoder_output=encoder_output, self_attention_mask=decoder_self_attention_mask, encdec_attention_mask=encoder_decoder_attention_mask, losses=extra_losses) logits = mtf.matmul(x, softmax_var) if hparams.mode == tf.estimator.ModeKeys.TRAIN: logits = mtf.layers.multiplicative_jitter(logits, epsilon=1e-2) off_value = hparams.label_smoothing / self._targets_vocab_size on_value = 1.0 - hparams.label_smoothing + off_value soft_targets = mtf.one_hot(targets, self.targets_vocab_dim, on_value=on_value, off_value=off_value, dtype=self.activation_dtype) loss = mtf.layers.softmax_cross_entropy_with_logits( logits, soft_targets, self.targets_vocab_dim) weights = mtf.layers.weights_nonzero(targets, dtype=self.activation_dtype) loss = mtf.reduce_mean(loss * weights) for l in extra_losses: loss += l logits = mtf.to_float(logits) # combine batch dims if len(self.batch_dims) > 1: combined_batch_dim = mtf.Dimension(self.batch_dims[0].name, mtf.Shape(self.batch_dims).size) logits = mtf.reshape(logits, [combined_batch_dim] + logits.shape.dims[-2:]) return logits, loss
def call(self, context, x, losses=None): """Call the layer.""" params = self.make_params(context) q = params.compute_q(x) if self.shared_kv: kv = params.compute_kv(x) k = kv v = kv else: k = params.compute_k(x) v = params.compute_v(x) if context.mode == "incremental": if self.shared_kv: prev_kv, = context.get_states(1) else: prev_k, prev_v = context.get_states(2) current_position = mtf.equal( mtf.range(context.mesh, self.window_dim, dtype=tf.int32), mtf.mod(context.position, self.radius)) if self.shared_kv: kv = mtf.where(current_position, kv, prev_kv, output_shape=prev_kv.shape) k = kv v = kv context.record_new_states([kv]) else: k = mtf.where(current_position, params.compute_k(x), prev_k, output_shape=prev_k.shape) v = mtf.where(current_position, params.compute_v(x), prev_v, output_shape=prev_v.shape) context.record_new_states([k, v]) window_pos = mtf.range(context.mesh, self.window_dim, tf.int32) visible = mtf.greater_equal(context.position, window_pos) bias = attention.visibility_mask_to_attention_bias( visible, context.activation_dtype) o = attention.attention( q, k, v, self.window_dim, self.kv_dim, self.kv_dim, bias, **self.attention_kwargs_from_context(context)) elif context.length_dim.size <= max(256, self.radius * 4): # nothing fancy - just do full attention and mask memory_length = self.rename_length_to_memory_length( context.position, context) o = attention.attention( q, self.rename_length_to_memory_length(k, context), self.rename_length_to_memory_length(v, context), self.memory_length(context), self.kv_dim, self.kv_dim, self.compute_bias(context, memory_length, x), **self.attention_kwargs_from_context(context)) else: # fancy local attention algorithm o = attention.local_attention_1d( q=q, k=k, v=None if self.shared_kv else v, length_dim=context.length_dim, key_dim=self.kv_dim, value_dim=self.kv_dim, length_dim_num_splits=1, # TODO(noam): look at the layout autoregressive=context.model.fully_autoregressive, radius=self.radius, sequence_id=context.sequence_id, write_priority=context.write_priority, read_priority=context.read_priority, attention_kwargs=self.attention_kwargs_from_context(context)) if context.mode == "first_part": window_pos = mtf.range(context.mesh, self.window_dim, tf.int32) pos = mtf.range(context.mesh, context.length_dim, tf.int32) select_recent = mtf.cast( mtf.equal(mtf.mod(pos, self.radius), window_pos), x.dtype) select_recent *= mtf.cast(mtf.less(pos, context.initial_position), x.dtype) select_recent *= mtf.cast( mtf.greater_equal(pos, context.initial_position - self.radius), x.dtype) state_shape = (k.shape - [context.length_dim, self.kv_dim] + [self.window_dim, self.kv_dim]) k_state = mtf.einsum([k, select_recent], output_shape=state_shape, reduced_dims=[context.length_dim]) context.new_states.append(k_state) if not self.shared_kv: v_state = mtf.einsum([v, select_recent], output_shape=state_shape, reduced_dims=[context.length_dim]) context.new_states.append(v_state) return params.compute_output(o, output_shape=x.shape)
def local_attention_1d(q, k, v, length_dim, key_dim, value_dim, fully_autoregressive=True, length_dim_num_splits=1, radius=128, sequence_id=1, write_priority=None, read_priority=None, attention_kwargs=None): """Attention to the a neighborood around the source. If fully_autoregressive, then query position p can only see memory positions in the range (p - radius, p]. If not fully_autoregressive, then query position p can only see memory positions in the range (p - window_size, p + radius]. In addition, if write_priority and read_priority are provided, then attention is limited to position pairs where read_priority[query position] >= write_priority[memory position] Args: q: a Tensor containing length_dim k: a Tensor containing length_dim v: an optional Tensor containing length_dim. If none then uses v=k. length_dim: a Dimension key_dim: a Dimension (the channels dimension of q and k) value_dim: a Dimension (the channels dimension of v) fully_autoregressive: a boolean length_dim_num_splits: an optional integer indicating how many ways the length dimension is split radius: an integer sequence_id: a Tensor or an integer write_priority: an optional Tensor containing length_dim read_priority: an optional Tensor containing length_dim attention_kwargs: optional keyword arguments for attention() Returns: a Tensor with the shape x.shape - key_dim + value_dim Raises: ValueError: if channels or depth don't match. """ # Choose a suitable block size. # We choose the greatest divisor of length_per_split less than or equal # to max(window_size, 128) length_per_split = length_dim.size // length_dim_num_splits block_length = max(radius, 128) while length_per_split % block_length != 0: block_length -= 1 query_block_length = mtf.Dimension("query_block_length", block_length) memory_block_length = mtf.Dimension("memory_block_length", block_length) # The num_blocks dimension gets the same name as the length dimension, # so it will be split in the same way. num_blocks = mtf.Dimension(length_dim.name, length_dim.size // block_length) def _reshape_query(x): return mtf.replace_dimensions( x, length_dim, [num_blocks, query_block_length]) def _reshape_memory(x): x = mtf.replace_dimensions( x, length_dim, [num_blocks, memory_block_length]) return (mtf.left_halo_exchange if fully_autoregressive else mtf.halo_exchange)( x, num_blocks, memory_block_length, radius) q = _reshape_query(q) k = _reshape_memory(k) if v: v = _reshape_memory(v) else: v = k if sequence_id is None: sequence_id = 1 if (not isinstance(sequence_id, mtf.Tensor) or length_dim not in sequence_id.shape.dims): sequence_id += mtf.zeros(q.mesh, [length_dim], tf.int32) q_sequence_id = _reshape_query(sequence_id) m_sequence_id = _reshape_memory(sequence_id) pos = mtf.range(q.mesh, length_dim, dtype=tf.int32) q_pos = _reshape_query(pos) m_pos = _reshape_memory(pos) padded_memory_block_length = mtf.Dimension( "memory_block_length", (1 if fully_autoregressive else 2) * radius + block_length) relative_position = m_pos - q_pos visible = mtf.equal(q_sequence_id, m_sequence_id) visible = mtf.logical_and(visible, mtf.greater(relative_position, -radius)) visible = mtf.logical_and(visible, mtf.less_equal( relative_position, 0 if fully_autoregressive else radius)) if read_priority is not None: write_priority = _reshape_memory(write_priority) read_priority = _reshape_query(read_priority) visible = mtf.logical_and( visible, mtf.greater_equal(read_priority, write_priority)) bias = visibility_mask_to_attention_bias(visible, q.dtype) o = attention(q, k, v, padded_memory_block_length, key_dim, value_dim, bias, **attention_kwargs) return mtf.replace_dimensions(o, [num_blocks, query_block_length], length_dim)
def _mtf_model_fn(self, features, mesh): self._original_features = features features = copy.copy(features) hparams = self._hparams extra_losses = [] targets = tf.to_int32(features["targets"]) mode = getattr(hparams, "mode", tf.estimator.ModeKeys.TRAIN) is_training = mode == tf.estimator.ModeKeys.TRAIN if len(targets.get_shape()) > 2: tf.logging.info("targets = %s" % targets) targets = tf.squeeze(targets, [2, 3]) # pad targets to max_length def pad_to_max_length(x): extra_length = hparams.max_length - tf.shape(x)[1] x = tf.pad(x, [[0, 0], [0, extra_length]]) x = tf.reshape(x, [hparams.batch_size, hparams.max_length]) return x targets = pad_to_max_length(targets) targets = self._import_to_batch_by_length(targets, "targets", mesh, hparams) for key in ["targets_segmentation", "targets_position", "inputs_segmentation", "inputs_position"]: if key in features: features[key] = pad_to_max_length(features[key]) if hparams.decoder_type == "autoregressive": shifted_targets = mtf.shift( targets, offset=1, dim=self.length_dim, wrap=False) elif hparams.decoder_type == "denoising": shifted_targets = self._noisy_targets(targets, extra_losses) else: raise ValueError( "unknown hparams.decoder_type = %s" % hparams.decoder_type) if "targets_segmentation" in features: # "Packed" dataset - keep the examples from seeing each other. targets_segmentation = self._import_to_batch_by_length( features["targets_segmentation"], "targets_segmentation", mesh, hparams) targets_position = self._import_to_batch_by_length( features["targets_position"], "targets_position", mesh, hparams) decoder_self_attention_mask = mtf.layers.attention_mask_same_segment( targets_segmentation, dtype=self.activation_dtype) if hparams.decoder_type == "autoregressive": decoder_self_attention_mask += mtf.layers.attention_mask_autoregressive( targets_position, dtype=self.activation_dtype) else: targets_position = mtf.range(mesh, self.length_dim, dtype=tf.int32) if hparams.decoder_type == "autoregressive": decoder_self_attention_mask = mtf.layers.attention_mask_autoregressive( targets_position, dtype=self.activation_dtype) else: decoder_self_attention_mask = None def layer_prepostprocess_dropout(x): return mtf.dropout( x, is_training, keep_prob=1.0 - hparams.layer_prepostprocess_dropout, noise_shape=mtf.Shape(self.batch_dims + [self.model_dim])) (inputs_embedding_var, targets_embedding_var, softmax_var, positional_embedding_var) = self._embedding_and_softmax_vars(mesh) if hparams.transformer_type == "decoder": encoder_output = None encoder_decoder_attention_mask = None else: inputs = tf.squeeze(tf.to_int32(features["inputs"]), [2, 3]) inputs = pad_to_max_length(inputs) inputs = self._import_to_batch_by_length(inputs, "inputs", mesh, hparams) if "inputs_segmentation" in features: # "Packed" dataset - keep the examples from seeing each other. inputs_segmentation = self._import_to_batch_by_length( features["inputs_segmentation"], "inputs_segmentation", mesh, hparams) inputs_position = self._import_to_batch_by_length( features["inputs_position"], "inputs_position", mesh, hparams) encoder_self_attention_mask = ( mtf.layers.attention_mask_same_segment( inputs_segmentation, dtype=self.activation_dtype)) else: inputs_position = mtf.range(mesh, self.length_dim, dtype=tf.int32) encoder_self_attention_mask = ( mtf.layers.attention_mask_ignore_padding( inputs, dtype=self.activation_dtype)) x = (mtf.gather(inputs_embedding_var, inputs, self.inputs_vocab_dim) + mtf.gather(positional_embedding_var, inputs_position, self.max_length_dim)) x = layer_prepostprocess_dropout(x) with tf.variable_scope("encoder"): x = self._layer_stack(x, hparams.encoder_layers, self_attention_mask=encoder_self_attention_mask, losses=extra_losses) if hparams.transformer_type == "encdec": if "inputs_segmentation" in features: encoder_decoder_attention_mask = ( mtf.layers.attention_mask_same_segment( targets_segmentation, inputs_segmentation, dtype=self.activation_dtype)) else: encoder_decoder_attention_mask = encoder_self_attention_mask encoder_output = mtf.rename_dimension( x, self.length_dim.name, self.memory_length_dim.name) if hparams.transformer_type != "encoder": # DECODER x = (mtf.gather( targets_embedding_var, shifted_targets, self.targets_vocab_dim) + mtf.gather( positional_embedding_var, targets_position, self.max_length_dim)) x = layer_prepostprocess_dropout(x) with tf.variable_scope("decoder"): x = self._layer_stack( x, hparams.decoder_layers, encoder_output=encoder_output, self_attention_mask=decoder_self_attention_mask, encdec_attention_mask=encoder_decoder_attention_mask, losses=extra_losses) if (hparams.reshape_logits_hack and hparams.mode == tf.estimator.ModeKeys.TRAIN): # For some reason, the logits computation is extremely slow on TPU # in some cases where the batch size per core is 1. Reshape the logits # and the targets to double the batch size and halve the length. # TODO(noam): file a bug. old_dims = self.batch_dims + [self.length_dim] new_dims = self.batch_dims[:-1] + [ mtf.Dimension(self.batch_dims[-1].name, self.batch_dims[-1].size * 2), mtf.Dimension(self.length_dim.name, self.length_dim.size // 2)] x = mtf.reshape(x, new_dims + [self.model_dim]) targets = mtf.reshape(targets, new_dims) logits = mtf.matmul(x, softmax_var) if hparams.mode == tf.estimator.ModeKeys.TRAIN: logits = mtf.layers.multiplicative_jitter(logits, epsilon=1e-2) off_value = hparams.label_smoothing / self._targets_vocab_size on_value = 1.0 - hparams.label_smoothing + off_value soft_targets = mtf.one_hot( targets, self.targets_vocab_dim, on_value=on_value, off_value=off_value, dtype=self.activation_dtype) loss = mtf.layers.softmax_cross_entropy_with_logits( logits, soft_targets, self.targets_vocab_dim) weights = mtf.layers.weights_nonzero(targets, dtype=self.activation_dtype) loss = mtf.reduce_mean(loss * weights) for l in extra_losses: loss += l if (hparams.reshape_logits_hack and hparams.mode == tf.estimator.ModeKeys.TRAIN): logits = mtf.reshape(logits, old_dims + [self.targets_vocab_dim]) logits = mtf.to_float(logits) return logits, loss
def model(mtf_features, other_features, params, mesh, variable_dtype, context=None): """A GPT style model implemented in mesh tensorflow.""" x, batch_dim, sequence_dim, embd_dim, vocab_dim, embed_sequence_dim = parse_inputs(mtf_features, other_features) if is_incremental_inference(context): # reshape inputs if in inference mode x = mtf.gather(x, context.position - 1, sequence_dim) x = mtf.reshape(x, [batch_dim]) use_axial_pos_emb = params["axial_pos_emb"] is not None if not use_axial_pos_emb: # Use standard position encoding wpe = mtf.get_variable(mesh, "wpe", mtf.Shape([embed_sequence_dim, embd_dim]), initializer=tf.random_normal_initializer(stddev=0.01), master_dtype=variable_dtype.master_dtype, slice_dtype=variable_dtype.slice_dtype, activation_dtype=variable_dtype.activation_dtype) else: wpe = axial_positional_emb(embd_dim, mesh, params, variable_dtype) # Text encoding wte = mtf.get_variable(mesh, "wte", mtf.Shape([vocab_dim, embd_dim]), initializer=tf.random_normal_initializer(stddev=0.02), master_dtype=variable_dtype.master_dtype, slice_dtype=variable_dtype.slice_dtype, activation_dtype=variable_dtype.activation_dtype) with tf.variable_scope("token_embd"): # Text embedding h = mtf.gather(wte, x, vocab_dim) if params["embed_dropout"] > 0 and params["mode"] == "train": h = mtf.dropout(h, rate=params["embed_dropout"], name="wte_dropout") with tf.variable_scope("pos_embd"): # Positional embedding position_indices = mtf.range(mesh, sequence_dim, tf.int64) if not is_incremental_inference(context) else ( context.position - 1) pos_emb = mtf.gather(wpe, position_indices, wpe.shape[0]) if params["embed_dropout"] > 0 and params["mode"] == "train": pos_emb = mtf.dropout(pos_emb, rate=params["embed_dropout"], name="wte_dropout") h += pos_emb aux_losses = 0 # instantiate auxiliary losses (for MOE models) for layer in range(params["n_layer"]): # attn blocks share_parameters = exists(params["share_parameters"]) and params["share_parameters"] == True block_scope = f"h{layer}" if not share_parameters else "" block_fn = block(params=params, scope=block_scope, layer_num=layer, bias=other_features["attn_bias"], sequence_dim=sequence_dim, memory_length_dim=other_features["memory_length_dim"], variable_dtype=variable_dtype, context=context) # If true and in train mode, enable gradient checkpointing recompute_grad = params["recompute_grad"] and (params["mode"] == "train") == True h, loss = block_fn(h) if not recompute_grad else mtf.recompute_grad(block_fn, [h]) aux_losses += loss no_weight_tie_emb = params["no_weight_tie"] == True if no_weight_tie_emb: with tf.variable_scope("wte_final_linear"): logits = linear(h, "linear_out", vocab_dim, variable_dtype=variable_dtype, params=params) else: # Layer normalize & affine transform h = layer_norm(h, "ln_f", variable_dtype=variable_dtype) seq_dim = sequence_dim if not is_incremental_inference(context) else mtf.Dimension("sequence", 1) with tf.variable_scope("wte_final_einsum"): # Equivalent to tf.matmul logits = mtf.einsum([h, wte], output_shape=[batch_dim, seq_dim, vocab_dim]) if params["mode"] in ["train", "eval"]: labels = mtf_features["labels"] z_loss = params.get("z_loss", 1e-4) # an auxiliary loss used to stabilize mtf xentropy # Go to full precision for the logits logits = mtf.cast(logits, tf.float32) use_entmax_loss = params.get("entmax_loss", False) loss_fn = mtf.layers.softmax_cross_entropy_with_logits if not use_entmax_loss else entmax_cross_entropy_with_logits with tf.variable_scope("xentropy_final"): loss_batch = loss_fn(logits=logits, targets=labels, vocab_dim=logits.shape[-1], z_loss=z_loss) # For non-autoregressive models (masked language modeling training) # Make sure labels with padding tokens are not counted in the loss if not params["causal"]: padding_id = params.get("padding_id", 0) loss_batch = mtf.where(mtf.not_equal(labels, padding_id), loss_batch, mtf.zeros_like(loss_batch)) with tf.variable_scope("reduce_mean_final"): loss = mtf.reduce_mean(loss_batch) loss += aux_losses # Add on auxiliary losses (currently only used for MoE) loss /= params["num_microbatches"] # Convert to train dtype loss = mtf.cast(loss, variable_dtype.slice_dtype) else: loss = None loss_batch = None # Cast back to checkpoint dtype logits = mtf.cast(logits, variable_dtype.master_dtype) return logits, loss, loss_batch
def beam_search(self, inputs, decode_length, dst_attributes=None, variable_dtype=mtf.VariableDType(tf.float32), encoder_output=None, encoder_sequence_id=None, encoder_inputs=None, alpha=0.6, shared_params=None, encoder_layer_outputs=None, z=None): """Beam search. Args: inputs: an int32 zero-Tensor with shape [<batch_dims>, beam_dim, length_dim].# decode_length: an int32 mtf scalar. Maximum decode length. attributes: an int32 zero-Tensor with shape [<batch_dims>, beam_dim, length_dim] ([<batch_dims>] [<batch_dims>, beam_dim]). variable_dtype: a mtf.VariableDType encoder_output: an optional Tensor encoder_sequence_id: an optional Tensor encoder_inputs: an optional Tensor alpha: a floating point value (length bonus) shared_params: an optional dictionary encoder_layer_outputs: optional - readonly list of tensor activations when decoding, one per each input layer + the embedding layer Returns: a Tensor with shape [<batch_dims>, beam_dim, length_dim] """ attributes = dst_attributes if not self.autoregressive: raise ValueError("must be autoregressive") batch_dims = inputs.shape.dims[:-2] if len(batch_dims) != 1: raise NotImplementedError( "beam search supports exactly one batch dimension.") beam_dim = inputs.shape.dims[-2] length_dim = inputs.shape.dims[-1] length_range = mtf.range(inputs.mesh, length_dim, tf.int32) initial_position = mtf.reduce_sum(mtf.to_int32(mtf.not_equal( inputs, 0)), reduced_dim=length_dim) sequence_id = 1 if encoder_sequence_id is not None else None if self.input_full_attention: # This only makes sense in the case of beam search with given partial # sequences, which is not yet implemented. # TODO(noam): implement raise NotImplementedError( "Beam search for language models not yet implemented") else: read_priority = write_priority = length_range context_first_part = Context( model=self, mesh=inputs.mesh, batch_dims=batch_dims + [beam_dim], length_dim=length_dim, variable_dtype=variable_dtype, mode="first_part", position=length_range, position_is_default=True, new_states=[], initial_position=initial_position, sequence_id=sequence_id, encoder_output=encoder_output, encoder_sequence_id=encoder_sequence_id, constant_states=[], shared_params=shared_params, encoder_layer_outputs=encoder_layer_outputs, write_priority=write_priority, read_priority=read_priority, inputs=inputs, encoder_inputs=encoder_inputs) shifted_inputs = mtf.shift(inputs, offset=1, dim=length_dim, wrap=False) with tf.variable_scope(self.name): logits = self._call_internal(context_first_part, shifted_inputs, attributes=attributes, z=z) del logits # There are no partial targets. # Replace initial states by zeros to avoid computing them. initial_states = [ mtf.zeros_like(t) for t in context_first_part.new_states ] constant_states = context_first_part.constant_states def logits_fn(step_num, ids, states): """logits_fn for mtf.beam_search.beam_search().""" inputs_this_step = mtf.gather(ids, step_num - 1, length_dim) if self.attribute_embedding: attributes_this_step = mtf.gather(attributes, step_num - 1, length_dim) else: attributes_this_step = None context_incremental = Context( model=self, mesh=inputs.mesh, batch_dims=batch_dims + [beam_dim], length_dim=length_dim, variable_dtype=variable_dtype, mode="incremental", position=step_num, states=states, new_states=[], sequence_id=sequence_id, encoder_output=encoder_output, encoder_sequence_id=encoder_sequence_id, constant_states=constant_states, shared_params=shared_params, encoder_layer_outputs=encoder_layer_outputs, write_priority=write_priority, read_priority=step_num, inputs=inputs_this_step, encoder_inputs=encoder_inputs) with tf.variable_scope(self.name, reuse=True): logits = self._call_internal(context_incremental, inputs_this_step, attributes=attributes_this_step, z=z) return mtf.to_float(logits), context_incremental.new_states beams, unused_scores = mtf.beam_search.beam_search( logits_fn, inputs, alpha, states=initial_states, decode_length=decode_length, use_tpu=True, dtype=tf.float32, mesh_shape=self.mesh_shape, layout=self.layout) return mtf.gather(beams, mtf.constant(inputs.mesh, 0, dtype=tf.int32), beam_dim)
def call_simple(self, inputs, targets, compute_loss, attributes=None, mode=tf.estimator.ModeKeys.TRAIN, variable_dtype=mtf.VariableDType(tf.float32), sequence_id=None, subsequence_id=None, position=None, encoder_output=None, encoder_sequence_id=None, encoder_inputs=None, shared_params=None, layer_outputs=None, encoder_layer_outputs=None, z=None): """Compute logits based on inputs (all positions in parallel). This is called during training and evaluation. Args: inputs: an int32 Tensor with shape [<batch_dims>, length_dim] For training autoregressive models this should be equal to mtf.shift(targets, offset=1, dim=length_dim, wrap=False) targets: an optional int32 Tensor with shape [<batch_dims>, length_dim] compute_loss: a boolean attributes: an (optional?) int32 Tensor with shape [<batch_dims>, length_dim] ([<batch_dims>]) mode: a tf.estimator.ModeKeys variable_dtype: a mtf.VariableDType sequence_id: an optional Tensor subsequence_id: an optional Tensor position: an optional Tensor encoder_output: an optional Tensor encoder_sequence_id: an optional Tensor encoder_inputs: an optional Tensor shared_params: an optional dictionary layer_outputs: an optional list to append Tensor layer activations to encoder_layer_outputs: optional - readonly list of tensor activations when decoding, one per each input layer + the embedding layer Returns: logits: a Tensor with shape [<batch_dims>, output_vocab_dim] loss: an optional Scalar (if compute_loss=True) """ batch_dims = inputs.shape.dims[:-1] length_dim = inputs.shape.dims[-1] length_range = mtf.range(inputs.mesh, length_dim, dtype=tf.int32) if not self.positional_embedding: # To make relative attention faster, we drop the information about the # position in the subsequence. The relative attention code then # assumes that the positions are given by index in the tensor, # which still leads to the correct computation of relative position. position = None if position is None: position_is_default = True position = length_range else: position_is_default = False if self.input_full_attention: # The inputs part of each sequence can fully attend within itself. full_attention_region = delimited_lm_inputs_mask(targets) # We can include one additional position to the right - the position # where the final EOS of the inputs is read and the first target token # is predicted. full_attention_region = mtf.logical_or( full_attention_region, mtf.shift(full_attention_region, offset=1, dim=length_dim, wrap=False)) # We set read_priority and write_priority to 0 in the full-attention # region and equal to the position elsewhere. read_priority = write_priority = length_range * mtf.cast( mtf.logical_not(full_attention_region), tf.int32) elif self.autoregressive: # Vanilla autoregressive model - each position can see previous positions. read_priority = write_priority = length_range else: read_priority = write_priority = None context = Context(model=self, mesh=inputs.mesh, batch_dims=batch_dims, length_dim=length_dim, variable_dtype=variable_dtype, mode=mode, losses=[] if compute_loss else None, sequence_id=sequence_id, subsequence_id=subsequence_id, position=position, position_is_default=position_is_default, encoder_output=encoder_output, encoder_sequence_id=encoder_sequence_id, shared_params=shared_params, layer_outputs=layer_outputs, encoder_layer_outputs=encoder_layer_outputs, write_priority=write_priority, read_priority=read_priority, inputs=inputs, encoder_inputs=encoder_inputs) with tf.variable_scope(self.name): logits = self._call_internal(context, inputs, targets, attributes, z=z) if compute_loss: loss = mtf.add_n(context.losses) else: loss = None return logits, loss
def create_dummy_model(mesh, shapes, n_blocks=2, block_param_size_str="2_2", block_repeat_size_str="1_1"): """Creates a dummy model and layer stack with 4-dimensional input.""" assert len(shapes) == 4 outer_batch_size, batch_size, length, d_model = shapes batch_dim = mtf.Dimension("batch", batch_size) outer_batch_dim = mtf.Dimension("outer_batch", outer_batch_size) length_dim = mtf.Dimension("length", length) block_param_size = list(map(int, block_param_size_str.split("_"))) block_repeat_size = list(map(int, block_repeat_size_str.split("_"))) sublayers_initial = [ transformer.sublayer_dropout, ] sublayers_per_layer = [ transformer.sublayer_rms_norm, transformer.sublayer_call_layer, transformer.sublayer_dropout, transformer.sublayer_residual, ] sublayers_final = [ transformer.sublayer_rms_norm, transformer.sublayer_dropout, ] submodules = [ transformer_layers.SelfAttention(), transformer_layers.DenseReluDense() ] n_sublayers = np.array(block_param_size).prod() layers = submodules * n_sublayers layer_stack = funnel_transformer.FunnelTransformerLayerStack( layers=layers, n_blocks=n_blocks, block_param_size=block_param_size, block_repeat_size=block_repeat_size, sublayers_initial=sublayers_initial, sublayers_per_layer=sublayers_per_layer, sublayers_final=sublayers_final) model = transformer.Unitransformer(input_vocab_size=10, output_vocab_size=10, autoregressive=False, max_length=8, d_model=d_model, layer_stack=layer_stack) context = transformer.Context(model=model, mesh=mesh, batch_dims=[batch_dim, outer_batch_dim], length_dim=length_dim, variable_dtype=mtf.VariableDType(tf.float32), sequence_id=mtf.ones(mesh, mtf.Shape([length_dim ])), position=mtf.range(mesh, length_dim, dtype=tf.int32)) return layer_stack, context
def sample_autoregressive(self, partial_sequences, dst_attributes=None, stop_at_token=1, max_steps=None, temperature=0.0, variable_dtype=mtf.VariableDType(tf.float32), encoder_output=None, encoder_sequence_id=None, encoder_inputs=None, shared_params=None, has_partial_sequences=True, encoder_layer_outputs=None, never_end=False, remove_partial_sequences=False, sampling_keep_top_k=-1, z=None): """Sample randomly one token at a time. The partial_sequences represent partial sequences to be continued. The first tokens of each sequence are nonzero representing the given partial sequences and the last tokens of each sequence are zeros, representing what needs to be filled in. If there are no partial sequences (you want to sample from the beginning), then pass partial_sequences=mtf.zeros(mesh, shape, dtype=tf.int32) and has_partial_sequences=False (so we can skip computation). The dst_attributes represents the destination attributes in which we want to generate sequences. Args: partial_sequences: an int32 Tensor with shape [<batch_dims>, length_dim] dst_attribute: an int32 Tensor with shape [<batch_dims>, length_dim] ([<batch_dims>]) stop_at_token: an optional integer eos id. Stop when we produce it. max_steps: an optional integer, the max number of steps to decode. temperature: an optional floating point value between 0.0 and 1.0 0.0 means argmax, 1.0 means sample according to predicted distribution. variable_dtype: a mtf.VariableDType encoder_output: an optional Tensor encoder_sequence_id: an optional Tensor encoder_inputs: an optional Tensor shared_params: an optional dictionary has_partial_sequences: a boolean encoder_layer_outputs: optional - readonly list of tensor activations when decoding, one per each input layer + the embedding layer never_end: a boolean - if set, then avoid generating stop_at_token remove_partial_sequences: a boolean - whether to remove the partial sequences from the output sampling_keep_top_k: an integer - if not -1, only sample from the top k logits. Returns: a Tensor with shape [<batch_dims>, length_dim] """ if not self.autoregressive: raise ValueError("must be autoregressive") inputs = partial_sequences attributes = dst_attributes batch_dims = inputs.shape.dims[:-1] length_dim = inputs.shape.dims[-1] initial_position = mtf.reduce_sum(mtf.to_int32(mtf.not_equal( inputs, 0)), reduced_dim=length_dim) sequence_id = 1 if encoder_sequence_id is not None else None length_range = mtf.range(inputs.mesh, length_dim, tf.int32) if self.input_full_attention: read_priority = write_priority = length_range * mtf.to_int32( mtf.greater(length_range, initial_position)) else: read_priority = write_priority = length_range context_first_part = Context( model=self, mesh=inputs.mesh, batch_dims=batch_dims, length_dim=length_dim, variable_dtype=variable_dtype, mode="first_part", position=length_range, position_is_default=True, new_states=[], initial_position=initial_position, sequence_id=sequence_id, encoder_output=encoder_output, encoder_sequence_id=encoder_sequence_id, constant_states=[], shared_params=shared_params, encoder_layer_outputs=encoder_layer_outputs, write_priority=write_priority, read_priority=read_priority, inputs=inputs, encoder_inputs=encoder_inputs) shifted_inputs = mtf.shift(inputs, offset=1, dim=length_dim, wrap=False) with tf.variable_scope(self.name): logits = self._call_internal(context_first_part, shifted_inputs, attributes=attributes, z=z) del logits constant_states = context_first_part.constant_states if not has_partial_sequences: initial_states = [ mtf.zeros_like(t) for t in context_first_part.new_states ] partial_sequences_eos_count = 0 else: initial_states = context_first_part.new_states partial_sequences_eos_count = mtf.reduce_sum( mtf.to_int32(mtf.equal(partial_sequences, stop_at_token)), reduced_dim=length_dim) def cond_fn(position, ids, *unused_states): """Should we run another loop iteration.""" past_end = mtf.greater_equal(position, length_dim.size) if max_steps: past_end = mtf.logical_or( past_end, mtf.greater_equal(position - initial_position, max_steps)) is_done = past_end if stop_at_token is not None: eos_count = mtf.reduce_sum(mtf.to_int32( mtf.equal(ids, stop_at_token)), reduced_dim=length_dim) has_additional_eos = mtf.greater(eos_count, partial_sequences_eos_count) is_done = mtf.logical_or(is_done, has_additional_eos) all_done = mtf.reduce_all(is_done) return mtf.logical_not(all_done) def body_fn(position, ids, *states): """One step in the decode loop.""" inputs_this_step = mtf.gather(ids, position - 1, length_dim) if self.attribute_embedding: attributes_this_step = mtf.gather(attributes, position - 1, length_dim) else: attributes_this_step = None # raise ValueError("inputs_this_step shape=%s , ids shape=%s, position - 1 shape=%s, length_dim=%s" % (inputs_this_step.shape, ids.shape, (position - 1).shape, length_dim)) context_incremental = Context( model=self, mesh=inputs.mesh, batch_dims=batch_dims, length_dim=length_dim, variable_dtype=variable_dtype, mode="incremental", position=position, states=states, new_states=[], sequence_id=sequence_id, encoder_output=encoder_output, encoder_sequence_id=encoder_sequence_id, constant_states=constant_states, shared_params=shared_params, encoder_layer_outputs=encoder_layer_outputs, write_priority=write_priority, read_priority=position, inputs=inputs_this_step, encoder_inputs=encoder_inputs) with tf.variable_scope(self.name, reuse=True): logits = self._call_internal(context_incremental, inputs_this_step, attributes=attributes_this_step, z=z) if never_end: logits += mtf.one_hot(mtf.constant(logits.mesh, stop_at_token, dtype=tf.int32), self.output_vocab_dim, on_value=-1e9, off_value=0.0, dtype=logits.dtype) # TBD whether this should be before or after never_end: # Note for adding top_p sampling in the future, in other code bases, the # option to apply temperature is done before the top-k truncation. This # implementation does this in the opposite order. For top-k this doesn't # matter, but for top_p it will. if sampling_keep_top_k != -1: if sampling_keep_top_k <= 0: raise ValueError( "sampling_keep_top_k must either be -1 or positive.") k_largest = mtf.nth_largest_element( logits, n=sampling_keep_top_k, reduced_dim=self.output_vocab_dim) logits = mtf.where(mtf.less_equal(logits, k_largest), mtf.ones_like(logits) * -1e6, logits) ids_this_step = mtf.sample_with_temperature( logits, self.output_vocab_dim, temperature) new_position = position + 1 new_ids = ids + ids_this_step * mtf.one_hot( position, length_dim, dtype=tf.int32) return [new_position, new_ids] + context_incremental.new_states while_loop_inputs = [initial_position, inputs] + initial_states final_position, outputs = mtf.while_loop(cond_fn, body_fn, while_loop_inputs)[:2] del final_position if has_partial_sequences and remove_partial_sequences: # remove partial sequences from outputs partial_length = mtf.reduce_sum(mtf.to_int32( mtf.not_equal(partial_sequences, 0)), reduced_dim=length_dim) outputs = mtf.dynamic_shift(outputs, -partial_length, length_dim, wrap=False) return outputs
def sample_autoregressive( partial_sequences, other_features, params, stop_at_token=50256, max_steps=None, temperature=0.9, variable_dtype=mtf.VariableDType(tf.float32), encoder_output=None, encoder_sequence_id=None, encoder_inputs=None, shared_params=None, has_partial_sequences=True, encoder_layer_outputs=None, never_end=False, remove_partial_sequences=False, sampling_keep_top_k=-1, bos_id=50256, ): """Sample randomly one token at a time. The partial_sequences represent partial sequences to be continued. The first tokens of each sequence are nonzero representing the given partial sequences and the last tokens of each sequence are zeros, representing what needs to be filled in. If there are no partial sequences (you want to sample from the beginning), then pass partial_sequences=mtf.zeros(mesh, shape, dtype=tf.int32) and has_partial_sequences=False (so we can skip computation). Args: partial_sequences: an int32 Tensor with shape [<batch_dims>, length_dim] stop_at_token: an optional integer eos id. Stop when we produce it. max_steps: an optional integer, the max number of steps to decode. temperature: an optional floating point value between 0.0 and 1.0 0.0 means argmax, 1.0 means sample according to predicted distribution. variable_dtype: a mtf.VariableDType encoder_output: an optional Tensor encoder_sequence_id: an optional Tensor encoder_inputs: an optional Tensor shared_params: an optional dictionary has_partial_sequences: a boolean encoder_layer_outputs: optional - readonly list of tensor activations when decoding, one per each input layer + the embedding layer never_end: a boolean - if set, then avoid generating stop_at_token remove_partial_sequences: a boolean - whether to remove the partial sequences from the output sampling_keep_top_k: an integer - if not -1, only sample from the top k logits. bos_id: beginning of sequence id Returns: a Tensor with shape [<batch_dims>, length_dim] """ inputs = partial_sequences # Partial sequences to fill in batch_dims = inputs.shape.dims[:-1] length_dim = inputs.shape.dims[-1] padding_id = params.get("padding_id", 0) slow_sampling = params.get("slow_sampling", False) initial_position = mtf.reduce_sum( mtf.to_int32(mtf.not_equal(inputs, padding_id)), reduced_dim=length_dim) # Gets position where zero padding starts length_range = mtf.range(inputs.mesh, length_dim, tf.int32) input_full_attention = True # for now hardcode this to true bc lazy if input_full_attention: # Vanilla autoregressive model - each position can see previous positions. # Think this feeds in to the loop fn and tells each position where it can attend to? read_priority = write_priority = length_range * mtf.to_int32( mtf.greater(length_range, initial_position)) else: read_priority = write_priority = length_range # Builds context to pass around internally # The 'first part' context records initial states of k / v / x if not slow_sampling: context_first_part = mtf_transformer.transformer.Context( model=None, mesh=inputs.mesh, batch_dims=batch_dims, length_dim=length_dim, variable_dtype=variable_dtype, mode="first_part", position=length_range, position_is_default=True, new_states=[], initial_position=initial_position, sequence_id=None, encoder_output=encoder_output, encoder_sequence_id=encoder_sequence_id, constant_states=[], shared_params=shared_params, encoder_layer_outputs=encoder_layer_outputs, write_priority=write_priority, read_priority=read_priority, inputs=inputs, encoder_inputs=encoder_inputs) with tf.variable_scope("gpt2"): logits, _, _ = gpt2.model({"inputs": inputs}, other_features, params, inputs.mesh, variable_dtype=variable_dtype, context=context_first_part) if not has_partial_sequences: initial_states = [ mtf.zeros_like(t) for t in context_first_part.new_states ] else: initial_states = context_first_part.new_states else: initial_states = [] if not has_partial_sequences: partial_sequences_eos_count = 0 if stop_at_token is not None: partial_sequences_eos_count = mtf.reduce_sum(mtf.to_int32( mtf.equal(partial_sequences, stop_at_token)), reduced_dim=length_dim) def cond_fn(position, ids, *unused_states): """Should we run another loop iteration?""" past_end = mtf.greater_equal(position, length_dim.size) if max_steps: past_end = mtf.logical_or( past_end, mtf.greater_equal(position - initial_position, max_steps)) is_done = past_end if stop_at_token is not None: eos_count = mtf.reduce_sum(mtf.to_int32( mtf.equal(ids, stop_at_token)), reduced_dim=length_dim) has_additional_eos = mtf.greater(eos_count, partial_sequences_eos_count) is_done = mtf.logical_or(is_done, has_additional_eos) all_done = mtf.reduce_all(is_done) return mtf.logical_not(all_done) def body_fn(position, ids, *states): """One step in the decode loop.""" nonlocal sampling_keep_top_k context = mtf_transformer.transformer.Context( model=None, mesh=inputs.mesh, batch_dims=batch_dims, length_dim=length_dim, variable_dtype=variable_dtype, mode="incremental", position=position, position_is_default=True, states=states, new_states=[], initial_position=position, sequence_id=None, encoder_output=encoder_output, encoder_sequence_id=encoder_sequence_id, shared_params=shared_params, encoder_layer_outputs=encoder_layer_outputs, write_priority=write_priority, read_priority=read_priority, inputs=ids, encoder_inputs=encoder_inputs) if not slow_sampling else None with tf.variable_scope("gpt2", reuse=tf.AUTO_REUSE): logits, _, _ = gpt2.model({"inputs": ids}, other_features, params, inputs.mesh, variable_dtype=variable_dtype, context=context) # By default, do top_k sampling of 0.9 if sampling_keep_top_k == -2: sampling_keep_top_k = int(logits.shape[-1].size * 0.1) if sampling_keep_top_k != -1: if sampling_keep_top_k <= 0: raise ValueError( "sampling_keep_top_k must either be -1 or positive.") k_largest = mtf.nth_largest_element( logits, n=sampling_keep_top_k, reduced_dim=other_features["vocab_dim"]) logits = mtf.where(mtf.less_equal(logits, k_largest), mtf.ones_like(logits) * -1e6, logits) ids_this_step = mtf.sample_with_temperature( logits, other_features["vocab_dim"], temperature) if slow_sampling: ids_this_step = mtf.shift(ids_this_step, offset=1, dim=length_dim, wrap=False) else: ids_this_step = mtf.reshape(ids_this_step, (batch_dims)) one_hot = mtf.one_hot(position, length_dim, dtype=tf.int32) one_new_id = ids_this_step * one_hot new_ids = (1 - one_hot) * ids + one_new_id new_position = position + 1 ret = [new_position, new_ids] if context is not None: ret += context.new_states return ret while_loop_inputs = [initial_position, inputs] + initial_states final_position, outputs = mtf.while_loop(cond_fn, body_fn, while_loop_inputs)[:2] del final_position if has_partial_sequences and remove_partial_sequences: # Remove partial sequences from outputs partial_length = mtf.reduce_sum(mtf.to_int32( mtf.not_equal(partial_sequences, padding_id)), reduced_dim=length_dim) outputs = mtf.dynamic_shift(outputs, -partial_length, length_dim, wrap=False) return outputs
def _mtf_model_fn(self, features, mesh): self._original_features = features features = copy.copy(features) hparams = self._hparams extra_losses = [] targets = tf.to_int32(features["targets"]) if len(targets.get_shape()) > 2: tf.logging.info("targets = %s" % targets) targets = tf.squeeze(targets, [2, 3]) # pad targets to max_length def pad_to_max_length(x): extra_length = hparams.max_length - tf.shape(x)[1] x = tf.pad(x, [[0, 0], [0, extra_length]]) x = tf.reshape(x, [hparams.batch_size, hparams.max_length]) return x targets = pad_to_max_length(targets) targets = self._import_to_batch_by_length(targets, "targets", mesh, hparams) for key in ["targets_segmentation", "targets_position", "inputs_segmentation", "inputs_position"]: if key in features: features[key] = pad_to_max_length(features[key]) if hparams.decoder_type == "autoregressive": shifted_targets = mtf.shift( targets, offset=1, dim=self.length_dim, wrap=False) elif hparams.decoder_type == "denoising": shifted_targets = self._noisy_targets(targets, extra_losses) else: raise ValueError( "unknown hparams.decoder_type = %s" % hparams.decoder_type) if "targets_segmentation" in features: # "Packed" dataset - keep the examples from seeing each other. targets_segmentation = self._import_to_batch_by_length( features["targets_segmentation"], "targets_segmentation", mesh, hparams) targets_position = self._import_to_batch_by_length( features["targets_position"], "targets_position", mesh, hparams) decoder_self_attention_mask = mtf.layers.attention_mask_same_segment( targets_segmentation, dtype=self.activation_dtype) if hparams.decoder_type == "autoregressive": decoder_self_attention_mask += mtf.layers.attention_mask_autoregressive( targets_position, dtype=self.activation_dtype) else: targets_position = mtf.range(mesh, self.length_dim, dtype=tf.int32) if hparams.decoder_type == "autoregressive": decoder_self_attention_mask = mtf.layers.attention_mask_autoregressive( targets_position, dtype=self.activation_dtype) else: decoder_self_attention_mask = None def layer_prepostprocess_dropout(x): return mtf.dropout( x, keep_prob=1.0 - hparams.layer_prepostprocess_dropout, noise_shape=mtf.Shape(self.batch_dims + [self.model_dim])) (inputs_embedding_var, targets_embedding_var, softmax_var, positional_embedding_var) = self._embedding_and_softmax_vars(mesh) if hparams.transformer_type == "decoder": encoder_output = None encoder_decoder_attention_mask = None else: inputs = tf.squeeze(tf.to_int32(features["inputs"]), [2, 3]) inputs = pad_to_max_length(inputs) inputs = self._import_to_batch_by_length(inputs, "inputs", mesh, hparams) if "inputs_segmentation" in features: # "Packed" dataset - keep the examples from seeing each other. inputs_segmentation = self._import_to_batch_by_length( features["inputs_segmentation"], "inputs_segmentation", mesh, hparams) inputs_position = self._import_to_batch_by_length( features["inputs_position"], "inputs_position", mesh, hparams) encoder_self_attention_mask = ( mtf.layers.attention_mask_same_segment( inputs_segmentation, dtype=self.activation_dtype)) else: inputs_position = mtf.range(mesh, self.length_dim, dtype=tf.int32) encoder_self_attention_mask = ( mtf.layers.attention_mask_ignore_padding( inputs, dtype=self.activation_dtype)) x = (mtf.gather(inputs_embedding_var, inputs, self.inputs_vocab_dim) + mtf.gather(positional_embedding_var, inputs_position, self.max_length_dim)) x = layer_prepostprocess_dropout(x) with tf.variable_scope("encoder"): x = self._layer_stack(x, hparams.encoder_layers, self_attention_mask=encoder_self_attention_mask, losses=extra_losses) if hparams.transformer_type == "encdec": if "inputs_segmentation" in features: encoder_decoder_attention_mask = ( mtf.layers.attention_mask_same_segment( targets_segmentation, inputs_segmentation, dtype=self.activation_dtype)) else: encoder_decoder_attention_mask = encoder_self_attention_mask encoder_output = mtf.rename_dimension( x, self.length_dim.name, self.memory_length_dim.name) if hparams.transformer_type != "encoder": # DECODER x = (mtf.gather( targets_embedding_var, shifted_targets, self.targets_vocab_dim) + mtf.gather( positional_embedding_var, targets_position, self.max_length_dim)) x = layer_prepostprocess_dropout(x) with tf.variable_scope("decoder"): x = self._layer_stack( x, hparams.decoder_layers, encoder_output=encoder_output, self_attention_mask=decoder_self_attention_mask, encdec_attention_mask=encoder_decoder_attention_mask, losses=extra_losses) if (hparams.reshape_logits_hack and hparams.mode == tf.estimator.ModeKeys.TRAIN): # For some reason, the logits computation is extremely slow on TPU # in some cases where the batch size per core is 1. Reshape the logits # and the targets to double the batch size and halve the length. # TODO(noam): file a bug. old_dims = self.batch_dims + [self.length_dim] new_dims = self.batch_dims[:-1] + [ mtf.Dimension(self.batch_dims[-1].name, self.batch_dims[-1].size * 2), mtf.Dimension(self.length_dim.name, self.length_dim.size // 2)] x = mtf.reshape(x, new_dims + [self.model_dim]) targets = mtf.reshape(targets, new_dims) logits = mtf.matmul(x, softmax_var) if hparams.mode == tf.estimator.ModeKeys.TRAIN: logits = mtf.layers.multiplicative_jitter(logits, epsilon=1e-2) off_value = hparams.label_smoothing / self._targets_vocab_size on_value = 1.0 - hparams.label_smoothing + off_value soft_targets = mtf.one_hot( targets, self.targets_vocab_dim, on_value=on_value, off_value=off_value, dtype=self.activation_dtype) loss = mtf.layers.softmax_cross_entropy_with_logits( logits, soft_targets, self.targets_vocab_dim) weights = mtf.layers.weights_nonzero(targets, dtype=self.activation_dtype) loss = mtf.reduce_mean(loss * weights) for l in extra_losses: loss += l if (hparams.reshape_logits_hack and hparams.mode == tf.estimator.ModeKeys.TRAIN): logits = mtf.reshape(logits, old_dims + [self.targets_vocab_dim]) logits = mtf.to_float(logits) return logits, loss