def _Proc(record): """Parses a serialized tf.Example record.""" outputs = [ ('inputs', tf.io.VarLenFeature(tf.int64)), ('targets', tf.io.VarLenFeature(tf.int64)), # Default eval weight to 1.0 ('eval_weight', tf.io.FixedLenFeature([], tf.float32, default_value=1.0)), ] features = tf.io.parse_single_example(record, dict(outputs)) for k, v in six.iteritems(features): if k != 'eval_weight': features[k] = v.values else: eval_weight = v src_ids = features['inputs'] tgt_labels = features['targets'] # Derive trivial segmentation for unpacked input. src_paddings, tgt_ids, tgt_paddings, tgt_weights, bucket_key = _DerivePaddingsAndIds( src_ids, tgt_labels) src_len = tf.shape(src_ids)[0] tgt_len = tf.shape(tgt_ids)[0] src_pos = tf.range(src_len, dtype=tf.int32) src_seg = tf.zeros_like(src_paddings) tgt_pos = tf.range(tgt_len, dtype=tf.int32) tgt_seg = tf.zeros_like(tgt_paddings) return [ src_ids, src_paddings, tgt_ids, tgt_paddings, tgt_labels, tgt_weights, src_pos, src_seg, tgt_pos, tgt_seg, eval_weight ], bucket_key
def FProp(self, theta, source_vecs, source_paddings, target_vecs, target_paddings, source_segment_id, target_segment_id, transparent_acc, transparent_acc_helper, source_task_id=None, target_task_id=None): del source_task_id del target_task_id p = self.params if p.inputs_from_decoder: transformer_output = target_vecs else: transformer_output = source_vecs dim1, dim2 = tf.shape(transformer_output)[0], tf.shape( transformer_output)[1] softmax_input = tf.reshape(transformer_output, [-1, p.input_dim]) output_shape = [dim1, dim2, p.num_classes] return tf.reshape( super(GPipeTransformerSoftmaxLayer, self).Logits(theta, [softmax_input]), output_shape)
def _InputBatch(self): p = self.params @tf.function def ReadData(): x, y = io_ops.restore_v2(p.ckpt, [p.data, p.label], [''] * 2, [p.data_dtype, p.label_dtype]) # Always convert to float32. return tf.cast(x, tf.float32), tf.cast(y, tf.float32) # Loads data and label into memory and keep it around. data, label = ops.cached_call(f=ReadData.get_concrete_function(), T=[tf.float32, tf.float32]) b, shape = self.InfeedBatchSize(), list(p.data_shape) data = tf.reshape(data, [-1] + shape) label = tf.reshape(label, [-1]) label = py_utils.HasShape(label, [tf.shape(data)[0]]) sample_ids = ops.random_permutation_sequence( num=p.num_samples, batch=b, repeat=p.repeat, seed=p.random_seed if p.random_seed else 0) n = tf.shape(sample_ids)[0] raw = py_utils.PadOrTrimTo(tf.gather(data, sample_ids), [b] + shape) ret = py_utils.NestedMap( raw=raw, data=self._Preprocess(raw), label=py_utils.PadOrTrimTo(tf.gather(label, sample_ids), [b]), weight=py_utils.PadOrTrimTo(tf.ones([n], dtype=tf.float32), [b])) if not py_utils.use_tpu(): ret['sample_ids'] = sample_ids return ret
def _Moments(inputs, mask, enable_cross_replica_sum_on_tpu=False): """Computes mean and variance over the valid data points in inputs.""" inputs = py_utils.with_dependencies([ py_utils.assert_equal(tf.rank(inputs), tf.rank(mask)), py_utils.assert_greater_equal(mask, tf.zeros_like(mask)), ], inputs) rank = tf.rank(mask) reduce_over_dims = tf.range(0, rank - 1) sum_v = tf.reduce_sum(inputs * tf.cast(mask, inputs.dtype), reduce_over_dims) count_v = tf.reduce_sum(mask, reduce_over_dims) # Input shape is guaranteed to be a multiple of mask shape because the # inputs * mask op above was successfully broadcasted. mask_multiplier = tf.shape(inputs)[:-1] // tf.shape(mask)[:-1] count_v *= tf.cast(tf.reduce_prod(mask_multiplier), count_v.dtype) if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu: sum_v = tf.tpu.cross_replica_sum(sum_v) count_v = tf.tpu.cross_replica_sum(count_v) count_v = tf.maximum(count_v, 1.0) mean = sum_v / count_v sum_vv = tf.reduce_sum((inputs - mean) * (inputs - mean) * mask, reduce_over_dims) if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu: sum_vv = tf.tpu.cross_replica_sum(sum_vv) variance = py_utils.with_dependencies([ py_utils.assert_greater_equal(sum_vv, tf.zeros_like(sum_vv)), ], sum_vv / count_v) return mean, variance
def ComputeSplits(batch_size, num_splits): """Creates a tensor of size num_splits of number of values per split. Assigns each split floor(batch_size/num_splits) and round-robins the remainder (if any) to each split. Example:: batch_size: [5] num_splits: 3 returns: [2, 2, 1] Args: batch_size: tensor of rank 0, size of tensor to be split num_splits: number of splits to split tensor into Returns: tensor of length num_splits containing sizes of each split """ values = tf.tile(tf.div([batch_size], num_splits), tf.constant([num_splits], dtype=tf.int32)) mods = tf.tile(tf.constant([1]), tf.math.floormod([batch_size], num_splits)) zeros = tf.tile(tf.constant([0]), tf.subtract(tf.shape(values), tf.shape(mods))) mods = tf.concat([mods, zeros], 0) ret = tf.add(values, mods) # for some reason TF erases shape information if num_splits is 1 if num_splits == 1: ret.set_shape([1]) return ret
def Step(recurrent_theta, state0, inputs): """Computes one decoder step.""" del inputs with tf.name_scope('single_sampler_step'): # Compute logits and states. bs_result, bs_state1 = pre_step_callback( recurrent_theta.theta, recurrent_theta.encoder_outputs, tf.expand_dims(state0.ids, 1), # [batch, 1]. state0.bs_state, num_hyps_per_beam=1) batch = tf.shape(bs_result.log_probs)[0] state1 = py_utils.NestedMap(timestep=state0.timestep + 1) state1.logits = bs_result.log_probs # Sample ids from logits. [batch]. state1.ids = tf.reshape( tf.random.stateless_categorical( state1.logits / p.temperature, num_samples=1, seed=tf.stack( [recurrent_theta.random_seed, state0.timestep]), dtype=state0.ids.dtype, name='sample_next_id'), [batch]) if 'is_last_chunk' in bs_result and p.target_eoc_id >= 0: state1.ids = tf.where( tf.math.logical_and( bs_result.is_last_chunk, tf.equal(state1.ids, p.target_eoc_id)), tf.fill(tf.shape(state1.ids), p.target_eos_id), state1.ids) state1.bs_state = post_step_callback( recurrent_theta.theta, recurrent_theta.encoder_outputs, state1.ids, bs_state1) return state1, py_utils.NestedMap()
def FProp(self, theta, input_batch): """Encodes source as represented by `inputs` and `paddings`. Args: theta: A `.NestedMap` object containing weights' values of this layer and its children layers. input_batch: A `.NestedMap` with fields: - ids: The inputs tensor. It is expected to be of shape [batch, time]. - paddings: The paddings tensor. Expected shape [batch, time]. Returns: A NestedMap containing: - encoded: The encoded features, a tensor of shape [time, batch, depth] - padding: of shape [time, batch] - segment_id: [time, batch] if packed inputs are supported by the model (and all layers), or None otherwise. """ p = self.params src_segment_id = None with tf.name_scope(p.name): # Now the rnn layers. inputs = py_utils.with_dependencies([ py_utils.assert_shape_match(tf.shape(input_batch.ids), [-1, -1]), py_utils.assert_shape_match(tf.shape(input_batch.ids), tf.shape(input_batch.paddings)) ], tf.transpose(input_batch.ids)) paddings = tf.expand_dims(tf.transpose(input_batch.paddings), 2) xs = self.emb.EmbLookup(theta.emb, inputs) xs = self.ApplyClipping(theta, xs) self._emb_out = xs ps = paddings # When cc_schedule is specified, make sure lstm_tpl is QuantizedLSTMCell # with the same cc_schedule so that the RNN layer output is within # clipping range. xs = self.rnn[0].FProp(theta.rnn[0], xs, ps) xs = self.dropout.FProp(theta.dropout, xs) for i in range(1, p.num_lstm_layers): layer = self.rnn[i] ys, _ = layer.FProp(theta.rnn[i], xs, ps) ys = self.dropout.FProp(theta.dropout, ys) if hasattr(layer.params, 'cell'): layer_params = layer.params.cell else: layer_params = layer.params if layer_params.num_input_nodes == layer_params.num_output_nodes: xs += ys # Residual skip xs = self.ApplyClipping(theta, xs) else: # When cc_schedule is specified, make sure lstm_tpl is # QuantizedLSTMCell with the same cc_schedule so that the RNN layer # output is within clipping range. xs = ys return py_utils.NestedMap(encoded=xs, padding=tf.squeeze(ps, [2]), segment_id=src_segment_id)
def _AugmentationNetwork(self, series_length, inputs, paddings, global_seed, domain_id_index=0): """Returns augmented features. Args: series_length: Total length of time series. inputs: Batch of input features of shape (batch_size, time_length, num_freq, channels). paddings: Batch of padding vectors of shape (batch_size, time_length). global_seed: an integer seed tensor for stateless random ops. domain_id_index: domain id index. Returns: Batch of output features of shape (batch_size, time_length, num_freq, channels) obtained by applying random augmentations to inputs. """ p = self.params dtype = p.dtype # Unstack the features. if p.unstack: inputs, paddings = self.UnstackFeatures(inputs, paddings) lengths = tf.reduce_sum(1 - paddings, 1) inputs = self._TimeWarp(inputs, lengths, global_seed=global_seed, dtype=dtype, domain_id_index=domain_id_index) inputs = self._TimeMask(inputs, lengths, global_seed=global_seed, noisify=p.use_noise, gaussian_noise=p.gaussian_noise, dtype=dtype, domain_id_index=domain_id_index) inputs = self._FrequencyMask(inputs, global_seed=global_seed, dtype=dtype, domain_id_index=domain_id_index) # Restack the features after applying specaugment. if p.unstack: inputs = tf.reshape( inputs, [tf.shape(inputs)[0], series_length, -1, tf.shape(inputs)[3]]) return inputs
def _DerivePaddingsAndIds(src_ids, tgt_labels): """tgt_ids is tgt_labels shifted right by one, with a SOS ID prepended.""" tgt_ids = tf.concat([[p.sos_id], tgt_labels[:-1]], axis=0) src_paddings = tf.zeros(tf.shape(src_ids), dtype=tf.float32) tgt_paddings = tf.zeros(tf.shape(tgt_ids), dtype=tf.float32) tgt_weights = tf.ones(tf.shape(tgt_ids), dtype=tf.float32) bucket_key = tf.cast( tf.maximum(tf.reduce_sum(1.0 - src_paddings), tf.reduce_sum(1.0 - tgt_paddings)), tf.int32) return src_paddings, tgt_ids, tgt_paddings, tgt_weights, bucket_key
def FProp(self, theta, input_batch): p = self.params with tf.name_scope(p.name): inputs = py_utils.with_dependencies([ py_utils.assert_shape_match(tf.shape(input_batch.ids), [-1, -1]), py_utils.assert_shape_match(tf.shape(input_batch.ids), tf.shape(input_batch.paddings)) ], tf.transpose(input_batch.ids)) paddings = tf.expand_dims(tf.transpose(input_batch.paddings), 2) if p.packed_input: src_segment_id = tf.expand_dims( tf.transpose(input_batch.segment_ids), 2) else: src_segment_id = None xs = self.emb.EmbLookup(theta.emb, inputs) xs = self.ApplyClipping(theta, xs) summary_utils.histogram('input_emb', xs) xs = self.dropout.FProp(theta.dropout, xs) ps = paddings # Now the rnn layers. outputs_list = [] for i in range(0, p.num_lstm_layers): layer = self.rnn[i] ys = layer.FProp(theta.rnn[i], xs, ps, segment_id=src_segment_id) ys = self.dropout.FProp(theta.dropout, ys) if i >= p.residual_start: xs += ys # Residual skip xs = self.ApplyClipping(theta, xs) else: xs = ys outputs_list.append(xs) summary_utils.histogram('layer_out_%s' % i, xs) if p.is_transparent: xs = self.transparent_merger.FProp(theta.transparent_merger, outputs_list) if p.lstm_cell_size * 2 != p.encoder_out_dim: # Project to the right depth. xs = self.final_proj.FProp(theta.final_proj, xs, ps) summary_utils.histogram('final_proj_out', xs) if src_segment_id is not None: src_segment_id = tf.squeeze(src_segment_id, [2]) return py_utils.NestedMap(encoded=xs, padding=tf.squeeze(ps, [2]), segment_id=src_segment_id)
def FProp(self, theta, inputs, paddings): """Apply convolution to inputs. Args: theta: A `.NestedMap` object containing weights' values of this layer and its children layers. inputs: The inputs tensor. It is expected to be of shape [batch, time, frequency, channel]. The time dimension corresponds to the height dimension as in images and the frequency dimension corresponds to the width dimension as in images. paddings: The paddings tensor, expected to be of shape [batch, time]. Returns: outputs, out_paddings pair. """ p = self.params with tf.name_scope(p.name): inputs = py_utils.with_dependencies([ py_utils.assert_shape_match(tf.shape(paddings), [-1, -1]), py_utils.assert_shape_match( tf.shape(inputs), tf.concat([ tf.shape(paddings), [-1, symbolic.ToStatic(self.input_channels)] ], 0)) ], inputs) def _ApplyPadding(tensor_in, padding_in): padding_expanded = tf.expand_dims(tf.expand_dims(padding_in, -1), -1) return tensor_in * (1.0 - padding_expanded) # Zeroing out padded inputs. inputs = _ApplyPadding(inputs, paddings) # Apply conv on 'inputs'. out = self._ApplyConv(theta, inputs) if p.partial_conv: out = self._RescaleBoundary(out, paddings) # NOTE: this may be slightly inaccurate when p.dilation_rate[0] > 1. # But there's likely no real problems. Trying to set it gives an error: # pooling with SAME padding is not implemented for dilation_rate > 1. # NOTE: we use window=p.filter_stride[0] to be compatible with legacy # implementation. Consider updating it to be the actual shape. conv_padding = ComputeConvOutputPadding( paddings, window=p.filter_stride[0], stride=p.filter_stride[0]) # Assuming padded nodes will be properly zero-ed out if necessary by # sub-sequent layers. # out = _ApplyPadding(out, conv_padding) out = py_utils.HasShape( out, symbolic.ToStatic(self.OutShape(tf.shape(inputs)))) return out, conv_padding
def FProp(self, theta, input_batch, state0=None): p = self.params src_segment_id = None with tf.name_scope(p.name): # Reshape to [t, b] inputs = py_utils.with_dependencies([ py_utils.assert_shape_match(tf.shape(input_batch.ids), [-1, -1]), py_utils.assert_shape_match(tf.shape(input_batch.ids), tf.shape(input_batch.paddings)) ], tf.transpose(input_batch.ids)) paddings = tf.expand_dims(tf.transpose(input_batch.paddings), 2) # Setup streaming states. if not state0: state0 = self.zero_state(theta, tf.shape(inputs)[1]) state1 = py_utils.NestedMap(rnn=[None] * p.num_lstm_layers) xs = self.emb.EmbLookup(theta.emb, inputs) xs = self.ApplyClipping(theta, xs) summary_utils.histogram('input_emb', xs) xs = self.dropout.FProp(theta.dropout, xs) ps = paddings # Now the rnn layers. outputs_list = [] for i in range(0, p.num_lstm_layers): layer = self.rnn[i] ys, state1.rnn[i] = layer.FProp(theta.rnn[i], xs, ps, state0=state0.rnn[i]) ys = self.dropout.FProp(theta.dropout, ys) if i >= p.residual_start: xs += ys # Residual skip xs = self.ApplyClipping(theta, xs) else: xs = ys outputs_list.append(xs) summary_utils.histogram('layer_out_%s' % i, xs) if p.is_transparent: xs = self.transparent_merger.FProp(theta.transparent_merger, outputs_list) return py_utils.NestedMap(encoded=xs, padding=tf.squeeze(ps, [2]), segment_id=src_segment_id, state=state1)
def InitBeamSearchStateCallback(theta, encoder_outputs, num_hyps_per_beam): """Wrapper for adding bias to _InitBeamSearchStateCallback. Exapnds state to track consistency of hypothesis with provided target. Args: theta: A NestedMap object containing weights' values of this layer and its children layers. encoder_outputs: A NestedMap computed by encoder. num_hyps_per_beam: An int, number hyps to keep for source sentence. Returns: initial_results: a `.NestedMap` of initial results. states: a `.NestedMap` of initial model states that the client would like to keep track of for each hyp. The states relevant here are: time_step: A scalar indicating current step (=0 for initial state) of decoder. Must be provided and maintained by super. consistent: A boolean tensor of shape [tgt_batch, 1] which tracks whether each hypothesis has exactly matched encoder_outputs.targets so far. """ initial_results, states = self._InitBeamSearchStateCallback( theta, encoder_outputs, num_hyps_per_beam) assert hasattr(states, 'time_step') num_hyps = tf.shape(encoder_outputs.padding)[1] * num_hyps_per_beam # states.consistent is initially all True states.consistent = tf.ones([ num_hyps, ], dtype=tf.bool) return initial_results, states
def TileForBeamAndFlatten(tensor): tensor = tf.reshape(tensor, [1, -1]) # [1, src_batch] tensor = tf.tile(tensor, [num_hyps_per_beam, 1 ]) # [num_hyps_per_beam, src_batch] tgt_batch = tf.shape(step_ids)[ 0] # num_hyps_per_beam*src_batch return tf.reshape(tensor, [tgt_batch])
def FProp(self, theta, inputs, paddings=None): """Apply batch normalization. Args: theta: A `.NestedMap` object containing weights' values of this layer and its children layers. inputs: The inputs tensor. Shaped [..., dim]. paddings: The paddings tensor. Shaped [..., 1], with the same rank as the input tensor. Returns: Output after applying batch normalization, with the same shape as 'inputs'. """ p = self.params if paddings is None: paddings = self._GetDefaultPaddings(inputs) with tf.name_scope(p.name): norm_mean, norm_variance, beta, gamma = self.ComputeAndUpdateMoments( theta, inputs, paddings) with tf.control_dependencies([ py_utils.assert_greater_equal( norm_variance, tf.zeros_like(norm_variance)), py_utils.assert_shape_match([tf.shape(inputs)[-1]], tf.shape(norm_mean)), py_utils.assert_shape_match([tf.shape(inputs)[-1]], tf.shape(norm_variance)), ]): if p.use_fused_batch_norm_for_eval and self.do_eval: bn_output, _, _ = nn.fused_batch_norm(inputs, gamma, beta, norm_mean, norm_variance, self._epsilon, is_training=False) else: bn_output = tf.nn.batch_normalization( inputs, norm_mean, norm_variance, beta, gamma, self._epsilon) if p.set_padded_output_to_zero: bn_output *= 1.0 - paddings return bn_output
def _GreedySearchStep(self, theta, encoder_outputs, cur_step, step_ids, hyp_ids, hyp_lens, done_hyps, other_states, pre_beam_search_step_callback, post_beam_search_step_callback): """Extend greedy search hyps for one step. Args: theta: A `.NestedMap` object containing weights' values of the decoder layer and its children layers. encoder_outputs: A `.NestedMap` containing encoder outputs to be passed to the callbacks. cur_step: A scalar int tensor, the current time step, 0-based. step_ids: An int tensor of shape [num_hyps, 1]. The input ids to the current search step. hyp_ids: An int tensor of shape [num_hyps, tgt_seq_len]. hyp_lens: Valid length of all the hyps. Tokens after eos ids are not counted. done_hyps: Whether or not a hyp has finished. other_states: A `.NestedMap` of other beam search states. This `.NestedMap` is managed and updated by the client. It is expected that each of its member tensors are of rank >= 1. t[i, ...] is the state of the i-th hyp at the beginning of this search step. pre_beam_search_step_callback: The `PreBeamSearchStepCallback` callback. See class header comments for more details. post_beam_search_step_callback: The `PostBeamSearchStepCallback` callback. See class header comments for more details. Returns: A tuple of following elements for the next greedy search step, (next step, new_step_ids, hyp_ids, hyp_lens, done_hyps, other_states) """ p = self.params # Increment hyp_lens by 1 if the hyp is not finished yet. hyp_lens = hyp_lens + (1 - tf.cast(done_hyps, tf.int32)) bs_results, new_other_states = pre_beam_search_step_callback( theta, encoder_outputs, step_ids, other_states, 1) # num_hyps_per_beam new_step_ids = tf.math.argmax(bs_results.log_probs, 1) new_step_ids = tf.cast(new_step_ids, tf.int32) new_step_ids = tf.reshape(new_step_ids, tf.shape(step_ids)) final_other_states = post_beam_search_step_callback( theta, encoder_outputs, new_step_ids, new_other_states) # Stash new_step_ids into the right slot. new_step_ids_1d = tf.reshape(new_step_ids, [-1]) hyp_ids = inplace_ops.alias_inplace_update(hyp_ids, cur_step, new_step_ids_1d) # Update done_hyps if the current step_ids is the end of sequence token. done_hyps = tf.math.logical_or( done_hyps, tf.equal(new_step_ids_1d, p.target_eos_id)) return (cur_step + 1, new_step_ids, hyp_ids, hyp_lens, done_hyps, final_other_states)
def FProp(self, theta, inputs): """Applies batch normalization. Using the implementation in github.com/ tensorflow/tpu/blob/master/models/official/amoeba_net/network_utils.py#L550 Args: theta: A nested map object containing weights' values of this layer and its children layers. inputs: The inputs tensor. Shaped [..., dim]. Returns: Output after applying batch normalization, with the same shape as 'inputs'. """ p = self.params inputs_dtype = inputs.dtype inputs = tf.cast(inputs, p.dtype) inputs = py_utils.with_dependencies([ py_utils.assert_shape_match([tf.shape(inputs)[-1]], tf.shape(theta.beta)) ], inputs) with tf.name_scope(p.name) as scope: if self.do_eval: outputs = tf.nn.batch_normalization(inputs, theta.moving_mean, theta.moving_variance, theta.beta, theta.gamma, p.epsilon) else: mean, variance = self._Moments(inputs, p.bn_group_size) mean = py_utils.CheckNumerics( mean, 'mean of {} failed numeric check'.format(scope)) variance = py_utils.CheckNumerics( variance, 'variance of {} failed numeric check'.format(scope)) outputs = tf.nn.batch_normalization(inputs, mean, variance, theta.beta, theta.gamma, p.epsilon) outputs.set_shape(inputs.get_shape()) return tf.cast(outputs, inputs_dtype)
def ApplyBias(): """Bias and update log_probs and consistent.""" def TileForBeamAndFlatten(tensor): tensor = tf.reshape(tensor, [1, -1]) # [1, src_batch] tensor = tf.tile(tensor, [num_hyps_per_beam, 1 ]) # [num_hyps_per_beam, src_batch] tgt_batch = tf.shape(step_ids)[ 0] # num_hyps_per_beam*src_batch return tf.reshape(tensor, [tgt_batch]) # Consistent if step_ids == labels from previous step # TODO(navari): Consider updating consistent only if weights > 0. Then # re-evaluate the need for bias_only_if_consistent=True. # Note that prev_label is incorrrect for step 0 but is overridden later prev_label = TileForBeamAndFlatten( tf.gather(labels, tf.maximum(time_step - 1, 0), axis=1)) is_step0 = tf.equal(time_step, 0) local_consistence = tf.math.logical_or( is_step0, tf.equal(prev_label, tf.squeeze(step_ids, 1))) consistent = tf.math.logical_and(states.consistent, local_consistence) # get label, weight slices corresponding to current time_step label = TileForBeamAndFlatten( tf.gather(labels, time_step, axis=1)) weight = TileForBeamAndFlatten( tf.gather(weights, time_step, axis=1)) if p.bias_only_if_consistent: weight = weight * tf.cast(consistent, p.dtype) # convert from dense label to sparse label probs vocab_size = tf.shape(bs_results.log_probs)[1] uncertainty = tf.constant( 1e-10, p.dtype) # avoid 0 probs which may cause issues with log label_probs = tf.one_hot( label, vocab_size, on_value=1 - uncertainty, off_value=uncertainty / tf.cast(vocab_size - 1, p.dtype), dtype=p.dtype) # [tgt_batch, vocab_size] pred_probs = tf.exp(bs_results.log_probs) # interpolate predicted probs and label probs weight = tf.expand_dims(weight, 1) probs = py_utils.with_dependencies([ py_utils.assert_less_equal(weight, 1.), py_utils.assert_greater_equal(weight, 0.) ], (1.0 - weight) * pred_probs + weight * label_probs) return tf.math.log(probs), consistent
def ReOrderHyps(x_in): """Reorders x_in based on prev hyp ids.""" if isinstance(x_in, tf.Tensor) and x_in.shape.ndims > 0: # For rank > 1 tensors we make use of an efficient matmul based gather # on tpu that takes in account the range of the values. For R1, we # rely on the tf.gather and xla to optimize it efficiently for R1 # layout. if x_in.shape.ndims > 1: if p.batch_major_state: num_hyps = tf.shape(old_hyp_ids)[0] x_out = beam_search_tpu_ops.fast_gather( x_in, old_hyp_ids, num_hyps, max_value=None, batch_major_state=p.batch_major_state) else: # Use corrected indices only here for batch major compute as # key/value caches are the states being affected. correct_old_hyp_ids = (old_hyp_ids_in_cache_order if p.batch_major_compute else old_hyp_ids) def _GatherStep(x_in, t): """Gather for one time step. Args: x_in: in the shape of [T, B, ...] we first get slice(t) from the tensors, then gather old_hyp_ids from the slice and write the interpolated slice inplace to update the original x_in. t: current time step Returns: Updated x_in and time step """ x = tf.gather(tf.gather(x_in, t), correct_old_hyp_ids) return inplace_ops.alias_inplace_update( x_in, t, x), t + 1 x_out, _ = tf.while_loop( lambda _, t: t <= cur_step, _GatherStep, (x_in, tf.zeros([], tf.int32))) else: x_out = tf.gather(x_in, old_hyp_ids) x_out.set_shape(x_in.get_shape()) return x_out else: return x_in
def SplitTensors(xs, num_splits): """Splits tensors in `xs` evenly into num_splits along the 1st dimenion. Args: xs: A tuple of tensors. Each tensor's 1st dimension is the same size. num_splits: A python integer. Returns: A tuple of lists of tensors, num elements in the tuple = len(xs). i-th element in each list corresponds to i-th split of each tensor in xs along the first dimension of each tensor. """ # assert first dim of all tensors in xs is equal batch_dims = [tf.shape(x)[0] for x in xs] all_batch_dims = tf.stack(batch_dims) all_batch_dims = py_utils.with_dependencies([ py_utils.assert_equal(all_batch_dims, tf.shape(xs[0])[0], message='first dim of tensors in xs must match'), py_utils.assert_greater_equal( tf.shape(xs[0])[0], num_splits, message='first dim of tensors in xs must be greater than num_splits' ) ], all_batch_dims) splits = ComputeSplits(tf.shape(xs[0])[0], num_splits) # add the above assertion into the compute graph splits = py_utils.with_dependencies([all_batch_dims], splits) split_xs = [ tf.split(axis=0, num_or_size_splits=splits, value=x) for x in xs ] return split_xs
def GetEncoderEmbeddingsDefaultTheta(self, input_ids, task_ids=None): p = self.params time_dim = 0 if p.batch_dim else 1 seq_len = tf.shape(input_ids)[time_dim] input_embs = self.src_token_emb.EmbLookup(self.theta.src_token_emb, input_ids) pos_embs = tf.expand_dims( self.src_pos_emb.FProp(self.theta.src_pos_emb, seq_len), p.batch_dim) input_embs += pos_embs if task_ids is not None and p.enc_task_emb: input_embs += self.src_task_emb.EmbLookup(self.theta.src_task_emb, task_ids) input_embs = self.src_dropout.FProp(self.theta.src_dropout, input_embs) return input_embs
def SequenceLength(padding): """Computes the length of a sequence based on binary padding. Args: padding: A tensor of binary paddings shaped [batch, seqlen]. Returns: seq_lens, A tensor of shape [batch] containing the non-padded length of each element of plot_tensor along the batch dimension. """ seq_lens = tf.cast(tf.round(tf.reduce_sum(1 - padding, axis=1)), tf.int32) # Get rid of any extra dimensions. batch_size = tf.shape(padding)[0] seq_lens = tf.reshape(seq_lens, [batch_size], name='seq_lens') return seq_lens
def GetEmbeddings(self, emb_theta, emb, pos_emb_theta, pos_emb, dropout_theta, dropout, input_ids, input_pos_ids, task_emb_theta, task_emb, task_ids): p = self.params time_dim = 0 if p.batch_dim else 1 seq_len = tf.shape(input_ids)[time_dim] input_embs = emb.EmbLookup(emb_theta, input_ids) if p.packed_input: # Packed inputs. pos_embs = pos_emb.FPropWithPosition(pos_emb_theta, input_pos_ids) else: pos_embs = tf.expand_dims(pos_emb.FProp(pos_emb_theta, seq_len), p.batch_dim) input_embs += pos_embs if task_emb: input_embs += task_emb.EmbLookup(task_emb_theta, task_ids) input_embs = dropout.FProp(dropout_theta, input_embs) return input_embs
def _FrequencyMask(self, inputs, global_seed, dtype=tf.float32, domain_id_index=0): """Applies frequency masking with given degree to inputs. Args: inputs: Batch of input features of shape (batch_size, time_length, num_freq, channels). global_seed: an integer seed tensor for stateless random ops. dtype: Data type. domain_id_index: domain id index. Returns: Inputs with random frequency masking applied. """ p = self.params # Mask parameters. freq_mask_max_bins = p.freq_mask_max_bins[domain_id_index] multiplicity = p.freq_mask_count[domain_id_index] # If masking length or count is zero, do nothing. if freq_mask_max_bins == 0 or multiplicity == 0: return inputs # Arguments to pass to mask generator. batch_size, _, num_freq, _ = py_utils.GetShape(inputs) choose_range = tf.cast(tf.broadcast_to(num_freq, (batch_size, )), dtype=tf.int32) # Create masks in frequency direction and apply. block_arrays = self._GetMask(tf.shape(inputs)[0], choose_range=choose_range, mask_size=num_freq, global_seed=global_seed, max_length=freq_mask_max_bins, masks_per_frame=0.0, multiplicity=multiplicity, dtype=dtype, max_ratio=1.0) return self.EinsumBxycByBxyc(inputs, block_arrays)
def FProp(self, theta, inputs): """Apply projection to inputs. Args: theta: A NestedMap object containing weights' values of this layer and its children layers. inputs: The inputs tensor. Shaped [..., input_dims]. Returns: Projected inputs. """ p = self.params with tf.name_scope(p.name): computation_cost.Add( self, 'flops', tf.reduce_prod(tf.cast(tf.shape(inputs)[:-1], tf.int64)) * tf.cast(symbolic.ToTensor(p.input_dims * p.output_dims), tf.int64) * 2) return py_utils.ProjectLastDim(inputs, theta.w, p.input_dims, p.output_dims)
def Finalize(self): """Finishes creation of the overall figure, returning the image summary.""" subplot_grid_shape = self._subplot_grid_shape if subplot_grid_shape is None: subplot_grid_shape = (len(self._subplots), 1) # AddMatplotlibFigureSummary (due to restrictions of py_func) only supports # flattened list of tensors so we must do some bookkeeping to maintain a # mapping from _SubplotMetadata object to flattened_tensors. subplot_slices = [] flattened_tensors = [] for subplot in self._subplots: start = len(flattened_tensors) subplot_slices.append((start, start + len(subplot.tensor_list))) flattened_tensors.extend(subplot.tensor_list) def PlotFunc(fig, *numpy_data_list): gs = gridspec.GridSpec(*subplot_grid_shape, **self._gridspec_kwargs) for n, subplot in enumerate(self._subplots): axes = fig.add_subplot(gs[n]) start, end = subplot_slices[n] subplot_data = numpy_data_list[start:end] subplot.plot_func(fig, axes, *subplot_data) func = functools.partial(_RenderMatplotlibFigures, self._figsize, self._max_outputs, PlotFunc) batch_sizes = [tf.shape(t)[0] for t in flattened_tensors] num_tensors = len(flattened_tensors) with tf.control_dependencies([ tf.assert_equal(batch_sizes, [batch_sizes[0]] * num_tensors, summarize=num_tensors) ]): rendered = tf.py_func(func, flattened_tensors, tf.uint8, name='RenderMatplotlibFigures') return tf.summary.image(self._name, rendered, max_outputs=self._max_outputs)
def GetDecoderEmbeddingsDefaultTheta(self, input_ids, task_ids=None, t=None): p = self.params input_embs = self.tgt_token_emb.EmbLookup(self.theta.tgt_token_emb, input_ids) if t is None: time_dim = 0 if p.batch_dim else 1 seq_len = tf.shape(input_ids)[time_dim] pos_embs = tf.expand_dims( self.tgt_pos_emb.FProp(self.theta.tgt_pos_emb, seq_len), p.batch_dim) else: # Support decoding. pos_embs = tf.slice( self.tgt_pos_emb.FProp(self.theta.tgt_pos_emb, p.max_seq_len), [t, 0], [1, p.token_emb.embedding_dim]) input_embs += pos_embs if task_ids is not None and p.dec_task_emb: input_embs += self.tgt_task_emb.EmbLookup(self.theta.tgt_task_emb, task_ids) input_embs = self.tgt_dropout.FProp(self.theta.tgt_dropout, input_embs) return input_embs
def FProp(self, theta, source_input, source_paddings, target_input=None, target_paddings=None, source_segment_id=None, target_segment_id=None, labels=None, label_weights=None, source_pos_id=None, target_pos_id=None, source_task_id=None, target_task_id=None): """Transforms source sequence of Tensors with Transformers layers. Args: theta: A `.NestedMap` object containing weights' values of this layer and its children layers. source_input: A sequence of ints indicating source input ids of [time, batch] shape or [batch, time] if batch_dim is 0. source_paddings: A sequence of 0s and 1s indicating input paddings of [time, batch] shape or [batch, time] if batch_dim is 0. target_input: A sequence of ints indicating target input ids of [time, batch] shape or [batch, time] if batch_dim is 0. target_paddings: [target_time, target_batch] or [target_batch, target_time] if batch_dim is 0. source_segment_id: A sequence of ints indicating source segment ids of [time, batch] shape or [batch, time] if batch_dim is 0. target_segment_id: A sequence of ints indicating target segment ids of [time, batch] shape or [batch, time] if batch_dim is 0. labels: A sequence of ints indicating label ids of [time, batch] shape, or [batch, time] if batch_dim is 0. label_weights: A sequence of floats indicates label weights of [time, batch] shape, or [batch, time] if batch_dim is 0. source_pos_id: A sequence of ints indicating source position ids of [time, batch] shape, or [batch, time] if batch_dim is 0. target_pos_id: A sequence of ints indicating target position ids of [time, batch] shape, or [batch, time] if batch_dim is 0. source_task_id: A sequence of ints indicating source task ids of [time, batch] shape, or [batch, time] if batch_dim is 0. target_task_id: A sequence of ints indicating target task ids of [time, batch] shape, or [batch, time] if batch_dim is 0. Returns: transformer_output with shape [time, batch, dim] or [batch, time, dim] if batch_dim is 0. """ p = self.params if p.num_decoder_layers > 0: assert target_input is not None assert target_paddings is not None if p.packed_input: assert source_segment_id is not None, ( 'Need to specify src_segment_id if packed input is supported.') assert source_pos_id is not None, ( 'Need to specify src_pos_id for packed input and embeddings.') logits = super(GPipeTransformerStack, self).FProp(theta, source_input, source_paddings, target_input, target_paddings, source_segment_id, target_segment_id, source_pos_id, target_pos_id, source_task_id, target_task_id) if not p.softmax_tpl: return logits label_weights = tf.reshape(label_weights, [-1]) target_probs = None if p.label_smoothing: if p.batch_dim: # Time-major target_probs = tf.transpose( self.smoother.FProp(theta.smoother, tf.transpose(target_paddings), tf.transpose(labels), target_ids=None), [1, 0, 2]) else: target_probs = self.smoother.FProp(theta.smoother, target_paddings, labels, target_ids=None) target_probs = tf.reshape(target_probs, [-1, p.softmax_tpl.num_classes]) reshaped_logits = tf.reshape(logits, [-1, p.softmax_tpl.num_classes]) tgt_labels = tf.reshape(labels, [-1]) num_splits = len(p.splits) softmax = self.children['cell_{}'.format(num_splits - 1)].softmax softmax_theta = theta['cell_{}'.format(num_splits - 1)].softmax per_example_xent, _ = softmax.XentLossFromLogits( softmax_theta, reshaped_logits, class_weights=tf.reshape(label_weights, [-1]), class_ids=tgt_labels, class_probabilities=target_probs) xent_shape = tf.shape(logits)[:2] per_example_xent = tf.reshape(per_example_xent, xent_shape) return per_example_xent, logits
def _EncodeToIds(self, word): # Below: # * a token is a wordpiece ID. # * the tokens array will be merged in-place. # * the candidates array is an array of size len(tokens) - 1. # It contains the token for the merged wordpiece, if it exists, # -1 otherwise. For instance, candidate[3] = id(token[3] + token[4]). # First, split into basic UTF-8 characters (letters). chars = tf.strings.unicode_split(word, 'UTF-8') tokens = self._StringToToken(chars) tokens = tf.where( tf.equal(tokens, NO_TOKEN), # Unseen character. tf.broadcast_to(self.unk_id, tf.shape(tokens)), tokens) # Create initial candidate list. candidates = tf.map_fn( self._MergeTokens, (tokens[:-1], tokens[1:]), dtype=tokens.dtype) def _ShouldMerge(unused_tokens, candidates): """Merge until not possible, or we abort early according to merge_prob.""" return tf.math.logical_and( tf.reduce_any(tf.not_equal(candidates, NO_TOKEN)), tf.random.uniform([]) < self._merge_prob) def _MergeOneToken(tokens, i): return tf.expand_dims( self._MergeTokens((tokens[i], tokens[i + 1])), axis=-1) def _MergeCandidates(tokens, candidates): """Merge in the reverse binary tree.""" best_id = tf.argmin(candidates, output_type=tf.int32) # Perform the merge at position best_id. tokens = tf.concat( [tokens[:best_id], [candidates[best_id]], tokens[best_id + 2:]], axis=0) # Recompute the merge candidates. # Only the neighbors of best_id need to be recomputed. empty = tf.zeros([0], dtype=candidates.dtype) def _MergeLeft(): return tf.concat( [candidates[:best_id - 1], _MergeOneToken(tokens, best_id - 1)], axis=0) left_candidates = tf.cond(tf.equal(best_id, 0), lambda: empty, _MergeLeft) def _MergeRight(): return tf.concat( [_MergeOneToken(tokens, best_id), candidates[best_id + 2:]], axis=0) right_candidates = tf.cond( tf.greater_equal(best_id, tf.size(tokens) - 1), lambda: empty, _MergeRight) candidates = tf.concat([left_candidates, right_candidates], axis=0) return tokens, candidates return tf.while_loop( _ShouldMerge, _MergeCandidates, (tokens, candidates), parallel_iterations=1, back_prop=False)[0]
def _StringToToken(self, tokstr): return tf.where( ops.token_in_vocab(tokstr, vocab=self._pieces), ops.vocab_token_to_id(tokstr, vocab=self._pieces), tf.broadcast_to(NO_TOKEN, tf.shape(tokstr)))