def FProp(self, theta, input_batch, state0=None):
  """Runs the embedding and the stacked RNN layers over a batch of ids.

  Args:
    theta: Weights for this layer and its children (`emb`, `dropout`, `rnn`
      and, when `p.is_transparent` is set, `transparent_merger`).
    input_batch: Object with `ids` and `paddings` fields. Both must be rank 2
      and have identical shapes; both are transposed before use (presumably
      [batch, time] -> [time, batch]; confirm against the caller).
    state0: Optional recurrent state whose `rnn` list has one entry per LSTM
      layer. When falsy, `self.zero_state` supplies the initial state.

  Returns:
    A NestedMap with:
      - encoded: output of the last RNN layer, or the transparent merge of
        all layer outputs when `p.is_transparent` is set.
      - padding: the transposed paddings with the trailing unit axis removed.
      - segment_id: always None.
      - state: NestedMap whose `rnn` list holds the updated per-layer states.
  """
  p = self.params
  with tf.name_scope(p.name):
    # Validate ranks/shapes, then transpose ids (reshape to [t, b]).
    shape_checks = [
        py_utils.assert_shape_match(tf.shape(input_batch.ids), [-1, -1]),
        py_utils.assert_shape_match(
            tf.shape(input_batch.ids), tf.shape(input_batch.paddings)),
    ]
    ids_tb = py_utils.with_dependencies(
        shape_checks, tf.transpose(input_batch.ids))
    ps = tf.expand_dims(tf.transpose(input_batch.paddings), 2)

    # Set up streaming states.
    if not state0:
      state0 = self.zero_state(theta, tf.shape(ids_tb)[1])
    state1 = py_utils.NestedMap(rnn=[None] * p.num_lstm_layers)

    xs = self.ApplyClipping(theta, self.emb.EmbLookup(theta.emb, ids_tb))
    summary_utils.histogram('input_emb', xs)
    xs = self.dropout.FProp(theta.dropout, xs)

    # The RNN stack; every layer's (possibly residual) output is recorded.
    per_layer_outputs = []
    for i in range(p.num_lstm_layers):
      ys, state1.rnn[i] = self.rnn[i].FProp(
          theta.rnn[i], xs, ps, state0=state0.rnn[i])
      ys = self.dropout.FProp(theta.dropout, ys)
      if i < p.residual_start:
        xs = ys
      else:
        # Residual skip followed by clipping.
        xs += ys
        xs = self.ApplyClipping(theta, xs)
      per_layer_outputs.append(xs)
      summary_utils.histogram('layer_out_%s' % i, xs)

    if p.is_transparent:
      xs = self.transparent_merger.FProp(theta.transparent_merger,
                                         per_layer_outputs)

    return py_utils.NestedMap(
        encoded=xs,
        padding=tf.squeeze(ps, [2]),
        segment_id=None,
        state=state1)
def ApplyBias():
  """Interpolates predicted probs with label probs; updates `consistent`.

  Reads `step_ids`, `labels`, `weights`, `time_step`, `states`, `bs_results`,
  `num_hyps_per_beam` and `p` from the enclosing scope.

  Returns:
    (log_probs, consistent): the log of the interpolated probabilities and
    the updated per-hypothesis consistency flags.
  """
  dtype = py_utils.FPropDtype(p)

  def _TilePerHyp(per_source):
    """Repeats a per-source vector for every hyp and flattens it."""
    row = tf.reshape(per_source, [1, -1])  # [1, src_batch]
    tiled = tf.tile(row, [num_hyps_per_beam, 1])  # [hyps, src_batch]
    # tgt_batch == num_hyps_per_beam * src_batch
    return tf.reshape(tiled, [tf.shape(step_ids)[0]])

  # A hyp stays consistent if step_ids equals the label of the previous step.
  # TODO(navari): Consider updating consistent only if weights > 0. Then
  # re-evaluate the need for bias_only_if_consistent=True.
  # Note that prev_label is incorrect for step 0 but is overridden below by
  # the is_step0 term.
  prev_label = _TilePerHyp(
      tf.gather(labels, tf.maximum(time_step - 1, 0), axis=1))
  is_step0 = tf.equal(time_step, 0)
  matches_prev = tf.equal(prev_label, tf.squeeze(step_ids, 1))
  consistent = tf.math.logical_and(
      states.consistent, tf.math.logical_or(is_step0, matches_prev))

  # Label and weight slices for the current time step.
  label = _TilePerHyp(tf.gather(labels, time_step, axis=1))
  weight = _TilePerHyp(tf.gather(weights, time_step, axis=1))
  if p.bias_only_if_consistent:
    weight = weight * tf.cast(consistent, dtype)

  # Turn the dense labels into (slightly smoothed) one-hot probabilities; the
  # small off-value avoids zero probabilities, which would break the log.
  vocab_size = tf.shape(bs_results.log_probs)[1]
  uncertainty = tf.constant(1e-10, dtype)
  label_probs = tf.one_hot(
      label,
      vocab_size,
      on_value=1 - uncertainty,
      off_value=uncertainty / tf.cast(vocab_size - 1, dtype),
      dtype=dtype)  # [tgt_batch, vocab_size]
  pred_probs = tf.exp(bs_results.log_probs)

  # Interpolate predicted and label probabilities; weight must be in [0, 1].
  weight = tf.expand_dims(weight, 1)
  range_checks = [
      py_utils.assert_less_equal(weight, 1.),
      py_utils.assert_greater_equal(weight, 0.),
  ]
  mixed = py_utils.with_dependencies(
      range_checks, (1.0 - weight) * pred_probs + weight * label_probs)
  return tf.math.log(mixed), consistent
def _ComputeConvOutputPaddingV2(paddings, window, stride, padding_algorithm='SAME'): """Computes paddings for convolution and pooling output. - If padding_algorithm='SAME': out_padding[i] == 0 if the in_padding corresponding to that output is 0. This prevents the output from shrinking unnecessarily when striding. - If padding algorithm='VALID': out_padding[i] == 1 iff any in_padding corresponding to that output is 1. Args: paddings: The paddings tensor. It is expected to be of shape [batch, time]. window: The size of the windows. stride: The time-stride between adjacent windows. padding_algorithm: 'SAME' or 'VALID'. Returns: out_padding, The new padding tensor of size [batch, ceil(time / stride)]. """ if stride == 1 and padding_algorithm == 'SAME': return paddings paddings, slice_len = _PadForLengthCompatibleStridesV2( paddings, stride, padding_algorithm, 1.0) expanded_paddings = tf.expand_dims(paddings, -1) if padding_algorithm == 'SAME': # Using a strided conv1d of size 1x1 we find all non-padded positions for # the specified stride. out_paddings = tf.nn.conv1d(expanded_paddings, filters=tf.ones([1, 1, 1], paddings.dtype), stride=stride, padding='SAME', name='padding_conv') elif padding_algorithm == 'VALID': out_paddings = tf.nn.pool(expanded_paddings, [window], 'MAX', padding=padding_algorithm, strides=[stride]) out_paddings = tf.squeeze(out_paddings, -1) if stride > 1: slice_end = py_utils.GetShape(out_paddings)[1] - slice_len out_paddings = out_paddings[:, :slice_end] return out_paddings
def _FPropLm(self, theta, state0, ids, paddings, misc=None):
  """Runs the LM for a single step or for an entire sequence.

  Args:
    theta: A NestedMap object containing weights for the layer and its
      children.
    state0: A NestedMap of states (specific to the layer).
    ids: Target ids, of shape [batch_size] for single step unrolling or
      [seq_len, batch_size] for the entire sequence.
    paddings: Target paddings, of the same shape as 'ids'.
    misc: NestedMap of miscellaneous items, which might be needed during
      training. Only forwarded to `_ModifyLmBeforeFProp`.

  Returns:
    (lm_output, state1):

    - lm_output: A NestedMap containing lm output. If 'ids' is 1-D, the
      leading time axis is squeezed away from every entry; otherwise the
      entries keep their leading [seq_len, batch_size] axes.
    - state1: A deep copy of state0 whose `lm_states` has been updated.
  """
  state1 = state0.DeepCopy()

  # ids.shape is a TensorShape for tensors; anything else is measured by len.
  if isinstance(ids.shape, tf.TensorShape):
    ids_rank = ids.shape.rank
  else:
    ids_rank = len(ids.shape)
  is_single_step = ids_rank == 1
  seq_len = 1 if is_single_step else tf.shape(ids)[0]

  self._ModifyLmBeforeFProp(theta, state0, ids, paddings, misc)

  with tf.name_scope('lm'):
    ids_2d = tf.reshape(ids, [seq_len, -1], name='reshape_ids')
    paddings_2d = tf.reshape(
        paddings, [seq_len, -1], name='reshape_paddings')

    lm_output, state1.lm_states = self.lm.FProp(theta.lm, ids_2d, paddings_2d,
                                                state0.lm_states)

    if is_single_step:
      # The LM emits [time, batch, dim]; with a single step the time axis has
      # size one, so drop it.
      lm_output = lm_output.Transform(lambda v: tf.squeeze(v, axis=0))

  return lm_output, state1
def factorized_pool(input_tensor,
                    window_shape,
                    pooling_type,
                    strides,
                    padding,
                    name=None):
  """Pools a matrix along each axis in turn instead of with one 2-D window.

  The first pass pools along the first axis with window_shape[0] and
  strides[0]; the second pass pools along the second axis with
  window_shape[1] and strides[1].

  Args:
    input_tensor: Input tensor. Must be rank 2.
    window_shape: Pooling window shape, one entry per axis.
    pooling_type: Either 'MAX' or 'AVG'.
    strides: The stride of the pooling window, one entry per axis.
    padding: 'SAME' or 'VALID'.
    name: Name of the op.

  Returns:
    A rank 2 tensor containing the pooled output.

  Raises:
    ValueError: if the input tensor is not rank 2.
  """
  static_shape = input_tensor.get_shape()
  if static_shape.ndims != 2:
    raise ValueError('factorized_pool() accepts tensors of rank 2 only')
  height, width = input_tensor.get_shape()

  def _PoolSecondSpatialAxis(tensor, axis_index):
    # tf.nn.pool treats the two middle axes as spatial; the unit window on
    # the first one restricts pooling to the second.
    return tf.nn.pool(
        tensor,
        window_shape=[1, window_shape[axis_index]],
        pooling_type=pooling_type,
        strides=[1, strides[axis_index]],
        padding=padding)

  with tf.name_scope(name, 'factorized_pool'):
    aligned = tf.reshape(
        input_tensor, [1, 1, height, width],
        name=input_tensor.op.name + '_aligned')
    pooled_first = _PoolSecondSpatialAxis(aligned, 0)
    # Swap the last two axes so the second pass pools the other axis.
    swapped = tf.transpose(pooled_first, perm=[0, 1, 3, 2])
    pooled_second = _PoolSecondSpatialAxis(swapped, 1)
    restored = tf.transpose(pooled_second, perm=[0, 1, 3, 2])
    return tf.squeeze(restored, axis=[0, 1])
def _Slice(tensor):
  """Extracts the entry at index state0.t along params.axis.

  Reads `self` and `state0` from the enclosing scope.

  Args:
    tensor: The tensor to slice.

  Returns:
    `tensor` restricted to index state0.t on the time axis, with that axis
    removed.
  """
  time_axis = self.params.axis
  dims = py_utils.GetShape(tensor)

  # Start offsets: zero everywhere except state0.t on the time axis,
  # e.g. for axis=1 this is [0, t, 0, 0, ...].
  start = tf.one_hot(time_axis, tf.rank(tensor), on_value=state0.t)

  # Extents: the full shape, but only one element on the time axis,
  # e.g. for axis=1 this is [dims[0], 1, dims[2], ...].
  leading = dims[0:time_axis]
  trailing = dims[time_axis + 1:]
  extent = tf.concat(
      [leading, tf.constant([1], dtype=tf.int32), trailing], axis=0)

  # Cut out the single step and drop the now-unit time axis.
  return tf.squeeze(tf.slice(tensor, start, extent), axis=time_axis)
def _ProcessLine(self, line):
  """Converts one line of text into source/target tensors.

  The original characters are used as the target labels; a lowercased,
  punctuation-free, whitespace-collapsed copy of the line is used as the
  source.

  Args:
    line: a string tensor holding one line of text read from the input file.

  Returns:
    A tuple (tensors, bucket_key) where `tensors` is the list
    [src_ids, src_paddings, tgt_ids, tgt_paddings, tgt_labels, tgt_weights],
    each with its leading unit axis squeezed away, and `bucket_key` is the
    larger of the unpadded source and target lengths as an int32 scalar.
  """

  def _Normalize(raw):
    """Lowercases, strips punctuation and collapses runs of whitespace."""
    cleaned = raw.lower().translate(None, string.punctuation.encode('utf-8'))
    return b' '.join(cleaned.split())

  # Target side: tokenize the raw line. Per StringsToIds, tgt_ids carries the
  # start-of-sentence token and tgt_labels the end-of-sentence token.
  tgt_ids, tgt_labels, tgt_paddings = self.StringsToIds(
      tf.convert_to_tensor([line]))

  # Source side: tokenize the normalized line. The model expects the source
  # without a start-of-sentence token, so the labels double as the ids.
  normalized_line = tf.py_func(_Normalize, [line], tf.string, stateful=False)
  _, src_ids, src_paddings = self.StringsToIds(
      tf.convert_to_tensor([normalized_line]), is_source=True)

  # Bucket by the longer of the two unpadded lengths.
  src_len = tf.reduce_sum(1.0 - src_paddings)
  tgt_len = tf.reduce_sum(1.0 - tgt_paddings)
  bucket_key = tf.cast(tf.round(tf.maximum(src_len, tgt_len)), tf.int32)

  tgt_weights = 1.0 - tgt_paddings

  # Keep this order consistent with what __init__ expects.
  ordered = [
      src_ids, src_paddings, tgt_ids, tgt_paddings, tgt_labels, tgt_weights
  ]
  squeezed = [tf.squeeze(tensor, axis=0) for tensor in ordered]
  return squeezed, bucket_key
def _GetPcm(self):
  """Loads the sample wav file used by the tests.

  Returns:
    (sample_rate, mono_audio) where mono_audio is of shape
    [batch (=1), samples].
  """
  wav_path = test_helper.test_src_dir_path('tools/testdata/gan_or_vae.wav')
  with open(wav_path, 'rb') as wav_file:
    decoded = tf.audio.decode_wav(wav_file.read())
  # The file is checked to be [75900, 1]; drop the single channel axis.
  samples = py_utils.HasShape(decoded.audio, [75900, 1])
  mono = tf.squeeze(samples, axis=1)
  # Batch-major output with a batch of one.
  return decoded.sample_rate, tf.expand_dims(mono, axis=0)
def _InferenceSubgraph_Default(self):
  """Builds the offline inference graph that is fed raw wav bytes.

  Returns:
    (fetches, feeds) where both fetches and feeds are dictionaries. Each
    dictionary consists of keys corresponding to tensor names, and values
    corresponding to a tensor in the graph which should be input/read from.
  """
  p = self.params
  with tf.name_scope('default'):
    # TODO(laurenzo): Once the migration to integrated frontends is complete,
    # this model should be upgraded to use the MelAsrFrontend in its
    # params vs relying on pre-computed feature generation and the inference
    # special casing.
    wav_bytes = tf.placeholder(dtype=tf.string, name='wav')

    # Use the model's own frontend when one is configured; otherwise fall
    # back to a default MelAsrFrontend.
    configured = self.frontend if p.frontend else None
    if configured:
      frontend = configured
    else:
      frontend = asr_frontend.MelAsrFrontend.Params().Instantiate()

    # Decode the wav bytes and rescale the samples by 32768.
    unused_sample_rate, audio = audio_lib.DecodeWav(wav_bytes)
    audio *= 32768
    # Single channel: drop the channel axis, then add a batch axis.
    audio = tf.expand_dims(tf.squeeze(audio, axis=1), axis=0)

    features = frontend.FPropDefaultTheta(
        py_utils.NestedMap(src_inputs=audio, paddings=tf.zeros_like(audio)))
    encoder_outputs = self.encoder.FPropDefaultTheta(features)
    decoder_outputs = self.decoder.BeamSearchDecode(encoder_outputs)
    topk = self._GetTopK(decoder_outputs)

    fetches = {
        'hypotheses': topk.decoded,
        'scores': topk.scores,
        'src_frames': features.src_inputs,
        'encoder_frames': encoder_outputs.encoded
    }
    return fetches, {'wav': wav_bytes}
def ApplyBias():
  """Interpolates predicted probs with one-hot labels; updates `consistent`.

  Reads `step_ids`, `labels`, `weights`, `time_step`, `states`, `bs_results`,
  `p` and `TileForBeamAndFlatten` from the enclosing scope.

  Returns:
    (log_probs, consistent): the log of the interpolated (and clamped)
    probabilities and the updated per-hypothesis consistency flags.
  """
  dtype = py_utils.FPropDtype(p)

  # A hyp stays consistent if step_ids equals the label of the previous step.
  # TODO(navari): Consider updating consistent only if weights > 0. Then
  # re-evaluate the need for bias_only_if_consistent=True.
  # Note that prev_label is incorrect for step 0 but is overridden below by
  # the is_step0 term.
  prev_label = TileForBeamAndFlatten(
      tf.gather(labels, tf.maximum(time_step - 1, 0), axis=1))
  is_step0 = tf.equal(time_step, 0)
  matches_prev = tf.equal(prev_label, tf.squeeze(step_ids, 1))
  consistent = tf.math.logical_and(
      states.consistent, tf.math.logical_or(is_step0, matches_prev))

  # Label and weight slices for the current time step.
  label = TileForBeamAndFlatten(tf.gather(labels, time_step, axis=1))
  weight = TileForBeamAndFlatten(tf.gather(weights, time_step, axis=1))
  if p.bias_only_if_consistent:
    weight = weight * tf.cast(consistent, dtype)

  # Dense labels -> one-hot label probabilities, [tgt_batch, vocab_size].
  vocab_size = tf.shape(bs_results.log_probs)[1]
  label_probs = tf.one_hot(label, vocab_size, dtype=dtype)
  pred_probs = tf.exp(bs_results.log_probs)

  # Interpolate predicted and label probabilities; weight must be in [0, 1].
  weight = tf.expand_dims(weight, 1)
  range_checks = [
      py_utils.assert_less_equal(weight, 1.),
      py_utils.assert_greater_equal(weight, 0.),
  ]
  mixed = py_utils.with_dependencies(
      range_checks, (1.0 - weight) * pred_probs + weight * label_probs)

  # Clamp away from zero so the log below only sees positive values.
  floor = tf.constant(1e-12, dtype=mixed.dtype)
  return tf.math.log(tf.maximum(mixed, floor)), consistent
def ExtractLogMelFeatures(wav_bytes_t):
  """Computes log-Mel filterbank features from the raw bytes of a wav file.

  Args:
    wav_bytes_t: Tensor representing raw wav file as a string of bytes. The
      decoded sample rate is asserted to be 16000.

  Returns:
    The `src_inputs` produced by a MelAsrFrontend configured with the fixed
    parameters below, applied to the decoded audio as a batch of one.
  """

  def _BuildFrontend():
    """Instantiates a MelAsrFrontend with exactly these parameters."""
    params = asr_frontend.MelAsrFrontend.Params()
    params.sample_rate = 16000.
    params.frame_size_ms = 25.
    params.frame_step_ms = 10.
    params.num_bins = 80
    params.lower_edge_hertz = 125.
    params.upper_edge_hertz = 7600.
    params.preemph = 0.97
    params.noise_scale = 0.
    params.pad_end = False
    return params.Instantiate()

  sample_rate, pcm = DecodeWav(wav_bytes_t)
  pcm *= 32768
  # Single channel: drop the channel axis.
  pcm = tf.squeeze(pcm, axis=1)
  # TODO(drpng): make batches.
  pcm = tf.expand_dims(pcm, axis=0)

  expected_rate = 16000
  frontend = _BuildFrontend()
  rate_check = tf.assert_equal(sample_rate, expected_rate)
  with tf.control_dependencies([rate_check]):
    frontend_out = frontend.FPropDefaultTheta(
        py_utils.NestedMap(src_inputs=pcm, paddings=tf.zeros_like(pcm)))
    return frontend_out.src_inputs
def ComputeConvOutputPadding(paddings,
                             window,
                             stride,
                             padding_algorithm='SAME',
                             v2_padding=False):
  """Computes paddings for convolution and pooling output.

  WARNING: The default (v2_padding=False) implementation is buggy for strided
  convolutions; prefer v2_padding=True, which delegates to
  _ComputeConvOutputPaddingV2.

  With the default implementation, out_padding[i] == 1 iff any in_padding
  corresponding to that output is 1.

  Args:
    paddings: The paddings tensor. It is expected to be of shape [batch, time].
    window: The size of the windows.
    stride: The time-stride between adjacent windows.
    padding_algorithm: 'SAME' or 'VALID'.
    v2_padding: Prefer setting to True. The default implementation is buggy
      for strided convolutions.

  Returns:
    out_padding, The new padding tensor of size [batch, ceil(time / stride)].
  """
  if v2_padding:
    return _ComputeConvOutputPaddingV2(paddings, window, stride,
                                       padding_algorithm)

  # Without striding the paddings pass through untouched.
  if stride == 1:
    return paddings

  # Extend the time axis with padded frames up to the next multiple of stride.
  time_len = py_utils.GetShape(paddings)[1]
  num_windows = (time_len + stride - 1) // stride
  extra = num_windows * stride - time_len
  padded = tf.pad(paddings, [[0, 0], [0, extra]], constant_values=1.0)

  # Max-pool so that one padded input frame marks the whole output padded.
  pooled = tf.nn.pool(
      tf.expand_dims(padded, -1),
      [window],
      'MAX',
      padding=padding_algorithm,
      strides=[stride],
  )
  return tf.squeeze(pooled, -1)
def FProp(self, theta, inputs, paddings):
  """Builds the FProp graph of the convolution block.

  Args:
    theta: A NestedMap of Tensors, see base class.
    inputs: A Tensor of shape [batch, seqlen, dim0].
    paddings: A Tensor of shape [batch, seqlen].

  Returns:
    output: A Tensor of shape [batch, seqlen, dim0]; the block output added
      to the original (un-normalized) inputs.
    out_paddings: The paddings returned by the depthwise convolution.
  """
  p = self.params
  with tf.name_scope(p.name):
    residual = inputs

    x = self.ln.FProp(theta.ln, inputs)
    x = self.linear_start.FProp(theta.linear_start, x)
    x = self._GLU(x)

    # The depthwise conv works on 4-D input: [b, t, d] --> [b, t, 1, d].
    x = tf.expand_dims(x, 2)
    x, out_paddings = self.depthwise_conv1d.FProp(theta.depthwise_conv1d, x,
                                                  paddings)

    # Normalize while still 4-D. The normalization layer sometimes reshapes
    # its inputs anyway, so squeezing first would only add overhead on TPU.
    # TODO(jamesqin): add paddings in the call, for causal case.
    x = self.norm.FProp(theta.norm, x)
    x = tf.squeeze(x, 2)

    x = self._ApplyActivation(x, p.conv_activation)
    x = self.linear_end.FProp(theta.linear_end, x)
    x = self.dropout.FProp(theta.dropout, x)

    return x + residual, out_paddings
def _ReshapeToMono2D(self, pcm_audio_data, paddings):
  """Collapses 2-D, 3-D or 4-D audio input to 2-D mono audio.

  Args:
    pcm_audio_data: Audio input with a known rank: [batch, time],
      [batch, time, channel] or [batch, time, packet, channel]. The channel
      axis, when present, is asserted to have size 1.
    paddings: Original paddings shaped to the first two dims of
      pcm_audio_data.

  Returns:
    Tuple of 2D [batch_size, timestep] mono audio data, new paddings.

  Raises:
    ValueError: if the rank of pcm_audio_data is not 2, 3 or 4.
  """
  dims = py_utils.GetShape(pcm_audio_data)
  num_dims = len(dims)

  if num_dims == 2:
    # Already [batch, time].
    return pcm_audio_data, paddings

  if num_dims == 3:
    # [batch, time, channel]: drop the single channel.
    with tf.control_dependencies([tf.assert_equal(dims[2], 1)]):
      return tf.squeeze(pcm_audio_data, axis=2), paddings

  if num_dims != 4:
    raise ValueError('Illegal pcm_audio_data shape')

  # [batch, time, packet, channel]: fold packets into the time axis.
  batch_size, num_packets, packet_size, num_channels = dims
  flat_time = num_packets * packet_size
  with tf.control_dependencies([tf.assert_equal(num_channels, 1)]):
    mono = tf.reshape(pcm_audio_data, (batch_size, flat_time))
    # One padding per packet becomes one padding per sample by repeating each
    # packet's padding packet_size times.
    repeated = tf.tile(
        tf.expand_dims(paddings, axis=2), [1, 1, packet_size])
    return mono, tf.reshape(repeated, (batch_size, flat_time))
def assign(self, value, use_locking=False, name=None, read_value=True):
  """Implements the interface of tf.Variable.assign.

  Args:
    value: A manually sharded tensor that has the shape of the individual
      elements of the stacked variable (shard shape with the stacking
      dimension collapsed).
    use_locking: See tf.Variable.assign.
    name: See tf.Variable.assign.
    read_value: See tf.Variable.assign. If True, the returned value will be
      manually sharded.

  Returns:
    See tf.Variable.assign. If read_value is True, returns the updated value
    in the shard shape of the individual elements of the stacked variable
    (shard shape with the stacking dimension collapsed); otherwise returns
    whatever the underlying variable's assign returns.
  """
  # Re-introduce the stacking dimension before converting for the variable.
  stacked = self._to_auto(tf.expand_dims(value, 0))
  result = self._var.assign(stacked, use_locking, name, read_value)
  if not read_value:
    return result
  # Convert back and collapse the stacking dimension again.
  return tf.squeeze(self._to_manual(result), 0)
def FProp(self, theta, inputs, paddings):
  """Builds the FProp graph of the convolution block.

  Args:
    theta: A NestedMap of Tensors, see base class.
    inputs: A Tensor of shape [batch, seqlen, dim0].
    paddings: A Tensor of shape [batch, seqlen].

  Returns:
    output: A Tensor of shape [batch, seqlen, dim0]; the block output added
      to the original (un-normalized) inputs.
    out_paddings: The paddings returned by the depthwise convolution.
  """
  p = self.params
  with tf.name_scope(p.name):
    residual = inputs

    x = self.ln.FProp(theta.ln, inputs)
    x = self.linear_start.FProp(theta.linear_start, x)
    x = self._GLU(x)

    # The depthwise conv works on 4-D input: [b, t, d] --> [b, t, 1, d].
    x = tf.expand_dims(x, 2)
    x, out_paddings = self.depthwise_conv1d.FProp(theta.depthwise_conv1d, x,
                                                  paddings)
    # Back to [b, t, d] before normalizing.
    x = tf.squeeze(x, 2)

    x = self.norm.FProp(theta.norm, x)
    x = self._ApplyActivation(x, p.conv_activation)
    x = self.linear_end.FProp(theta.linear_end, x)
    x = self.dropout.FProp(theta.dropout, x)

    return x + residual, out_paddings
def FProp(self, theta, batch, state0=None):
  """Encodes source as represented by 'inputs' and 'paddings'.

  The input runs through three stages in order: the conv layers, the
  conv-lstm (rnn + cnn) pairs, and the stacked rnn layers with optional
  residual/highway skips, projections and a stacking layer.

  Args:
    theta: A NestedMap object containing weights' values of this layer and its
      children layers.
    batch: A NestedMap with fields:

      - src_inputs - The inputs tensor. It is expected to be of shape [batch,
        time, feature_dim, channels].
      - paddings - The paddings tensor. It is expected to be of shape [batch,
        time].
    state0: Recurrent input state. Not supported/ignored by this encoder.

  Returns:
    A NestedMap containing

    - 'encoded': a feature tensor of shape [time, batch, depth]
    - 'padding': a 0/1 tensor of shape [time, batch]
    - 'state': the updated recurrent state (always an empty NestedMap here)
    - '${layer_type}_${layer_index}': The per-layer encoder output, only
      present when p.extra_per_layer_outputs is set. Each one is a NestedMap
      containing 'encoded' and 'padding' similar to regular final outputs,
      except that 'encoded' from conv or conv_lstm layers are of shape
      [time, batch, depth, channels].
  """
  p = self.params
  inputs, paddings = batch.src_inputs, batch.paddings
  outputs = py_utils.NestedMap()
  with tf.name_scope(p.name):
    # Adding specAugmentation (skipped in eval mode).
    if p.use_specaugment and not self.do_eval:
      inputs, paddings = self.specaugment.FProp(theta.specaugment, inputs,
                                                paddings)
    # Add a few extra padded timesteps at the end. This is for ensuring the
    # correctness of the conv-layers at the edges.
    if p.pad_steps > 0:
      # inplace_update() is not supported by TPU for now. Since we have done
      # padding on the input_generator, we may avoid this additional padding.
      assert not py_utils.use_tpu()
      # Same shapes as inputs/paddings, but with dim 1 (time) replaced by
      # p.pad_steps; inputs are extended with zeros, paddings with ones.
      inputs_pad = tf.zeros(
          inplace_ops.inplace_update(tf.shape(inputs), 1, p.pad_steps),
          inputs.dtype)
      paddings_pad = tf.ones(
          inplace_ops.inplace_update(tf.shape(paddings), 1, p.pad_steps),
          paddings.dtype)
      inputs = tf.concat([inputs, inputs_pad], 1, name='inputs')
      paddings = tf.concat([paddings, paddings_pad], 1)

    # Sequences collected here are plotted together at the end.
    plots = [
        summary_utils.PrepareSequenceForPlot(
            tf.transpose(inputs, [0, 1, 3, 2]), paddings, 'inputs')
    ]

    # The conv layers.
    conv_out = inputs
    out_padding = paddings
    for i, conv_layer in enumerate(self.conv):
      conv_out, out_padding = conv_layer.FProp(theta.conv[i], conv_out,
                                               out_padding)
      if p.extra_per_layer_outputs:
        # Zero out padded frames before exposing the layer output.
        conv_out *= (1.0 - out_padding[:, :, tf.newaxis, tf.newaxis])
        outputs['conv_%d' % i] = py_utils.NestedMap(
            encoded=tf.transpose(conv_out, [1, 0, 2, 3]),  # to [t, b, d, c]
            padding=tf.transpose(out_padding))
      plots.append(
          summary_utils.PrepareSequenceForPlot(
              tf.transpose(conv_out, [0, 1, 3, 2]), out_padding,
              'conv_%d_out' % i))

    def TransposeFirstTwoDims(t):
      # Swaps the first two dims of `t` regardless of its rank by flattening
      # the trailing dims, transposing, and restoring the trailing dims.
      first_dim = tf.shape(t)[0]
      second_dim = tf.shape(t)[1]
      t_new = tf.transpose(
          tf.reshape(t, [first_dim, second_dim, -1]), [1, 0, 2])
      t_shape_new = tf.concat([[second_dim], [first_dim], tf.shape(t)[2:]], 0)
      return tf.reshape(t_new, t_shape_new)

    # Now the conv-lstm part.
    conv_lstm_out = conv_out
    conv_lstm_out_padding = out_padding
    for i, (rnn, cnn) in enumerate(
        zip(self.conv_lstm_rnn, self.conv_lstm_cnn)):
      conv_lstm_in = conv_lstm_out
      # Move time dimension to be the first.
      conv_lstm_in = TransposeFirstTwoDims(conv_lstm_in)
      conv_lstm_in = tf.expand_dims(conv_lstm_in, 2)
      conv_lstm_in_padding = tf.expand_dims(
          tf.transpose(conv_lstm_out_padding), 2)
      lstm_out = rnn.FProp(theta.conv_lstm_rnn[i], conv_lstm_in,
                           conv_lstm_in_padding)
      # Move time dimension to be the second.
      cnn_in = TransposeFirstTwoDims(lstm_out)
      cnn_in = tf.squeeze(cnn_in, 2)
      cnn_in_padding = conv_lstm_out_padding
      cnn_out, cnn_out_padding = cnn.FProp(theta.conv_lstm_cnn[i], cnn_in,
                                           cnn_in_padding)
      conv_lstm_out, conv_lstm_out_padding = cnn_out, cnn_out_padding
      if p.extra_per_layer_outputs:
        # Zero out padded frames before exposing the layer output.
        conv_lstm_out *= (
            1.0 - conv_lstm_out_padding[:, :, tf.newaxis, tf.newaxis])
        outputs['conv_lstm_%d' % i] = py_utils.NestedMap(
            encoded=tf.transpose(conv_lstm_out,
                                 [1, 0, 2, 3]),  # to [t, b, d, c]
            padding=tf.transpose(conv_lstm_out_padding))
      # NOTE(review): unlike the conv plots above, this tensor is not
      # transposed with [0, 1, 3, 2] before plotting -- confirm intended.
      plots.append(
          summary_utils.PrepareSequenceForPlot(conv_lstm_out,
                                               conv_lstm_out_padding,
                                               'conv_lstm_%d_out' % i))

    # Need to do a reshape before starting the rnn layers: the last two dims
    # of the rank-4 tensor are merged into a single feature dim.
    conv_lstm_out = py_utils.HasRank(conv_lstm_out, 4)
    conv_lstm_out_shape = tf.shape(conv_lstm_out)
    new_shape = tf.concat([conv_lstm_out_shape[:2], [-1]], 0)
    conv_lstm_out = tf.reshape(conv_lstm_out, new_shape)
    if self._first_lstm_input_dim_pad:
      # Zero-pad the feature dim by self._first_lstm_input_dim_pad entries.
      conv_lstm_out = tf.pad(
          conv_lstm_out,
          [[0, 0], [0, 0], [0, self._first_lstm_input_dim_pad]])

    conv_lstm_out = py_utils.HasShape(conv_lstm_out,
                                      [-1, -1, self._first_lstm_input_dim])

    # Transpose to move the time dimension to be the first.
    rnn_in = tf.transpose(conv_lstm_out, [1, 0, 2])
    rnn_padding = tf.expand_dims(tf.transpose(conv_lstm_out_padding), 2)
    # rnn_in is of shape [time, batch, depth]
    # rnn_padding is of shape [time, batch, 1]

    # Now the rnn layers.
    # num_skips counts the highway skip layers used so far.
    num_skips = 0
    for i in range(p.num_lstm_layers):
      rnn_out = self.rnn[i].FProp(theta.rnn[i], rnn_in, rnn_padding)
      # Position of this layer relative to the first residual block; skips
      # are only considered when p.residual_start > 0.
      residual_index = i - p.residual_start + 1
      if p.residual_start > 0 and residual_index >= 0:
        # The first layer of each block of p.residual_stride layers records
        # its input; the last layer of the block consumes it.
        if residual_index % p.residual_stride == 0:
          residual_in = rnn_in
        if residual_index % p.residual_stride == p.residual_stride - 1:
          # Highway skip connection.
          if p.highway_skip:
            rnn_out = self.highway_skip[num_skips].FProp(
                theta.highway_skip[num_skips], residual_in, rnn_out)
            num_skips += 1
          else:
            # Residual skip connection.
            rnn_out += py_utils.HasShape(residual_in, tf.shape(rnn_out))
      if p.project_lstm_output and (i < p.num_lstm_layers - 1):
        # Projection layers (applied to every layer except the last).
        rnn_out = self.proj[i].FProp(theta.proj[i], rnn_out, rnn_padding)
      if i == p.num_lstm_layers - 1:
        # Zero out padded frames of the last layer's output.
        rnn_out *= (1.0 - rnn_padding)
      if p.extra_per_layer_outputs:
        rnn_out *= (1.0 - rnn_padding)
        # Recorded before the stacking layer below is applied.
        outputs['rnn_%d' % i] = py_utils.NestedMap(
            encoded=rnn_out, padding=tf.squeeze(rnn_padding, [2]))
      # Stacking layer connection.
      if p.layer_index_before_stacking == i:
        # Stacking layer expects input tensor shape as [batch, time, feature].
        # So transpose the tensors before and after the layer.
        rnn_out, rnn_padding = self.stacking.FProp(
            tf.transpose(rnn_out, [1, 0, 2]),
            tf.transpose(rnn_padding, [1, 0, 2]))
        rnn_out = tf.transpose(rnn_out, [1, 0, 2])
        rnn_padding = tf.transpose(rnn_padding, [1, 0, 2])

      plots.append(
          summary_utils.PrepareSequenceForPlot(
              tf.transpose(rnn_out, [1, 0, 2]),
              tf.transpose(rnn_padding, [1, 0, 2]), 'rnn_%d_out' % i))
      rnn_in = rnn_out
    final_out = rnn_in

    # Plots are passed in reverse order of collection.
    summary_utils.PlotSequenceFeatures(
        list(reversed(plots)), 'encoder_example', xlabel='Time')

    outputs['encoded'] = final_out
    outputs['padding'] = tf.squeeze(rnn_padding, [2])
    outputs['state'] = py_utils.NestedMap()
    return outputs
def ComputePredictions(self,
                       encoder_outputs,
                       pronunciations,
                       is_inference=False):
  """Computes the predictions from the encoder_outputs, updating losses.

  Despite the name, this function does the bulk of the decoding and loss
  computation, incrementing the loss at each time step.

  Args:
    encoder_outputs: a NestedMap consisting of outputs of the
      FeatureNeighborhoodEncoder with

        encoded - encoding of the input spelling
        neighbor_pronunciations_encoded - encodings of the neighbor prons
        neighbor_pronunciations_encoded - encodings of the neighbor spellings
          (NOTE(review): this key duplicates the previous one; presumably a
          separate neighbor-spellings key is meant -- confirm)
        state - encoder state to which has been added
          dec_input - seed output for the decoder [*, 1] tensor consisting
            of sentence start indices (corresponding to "<s>")

      Only `state` and `batch` are read directly here; the whole map is
      forwarded to self.Decode. The initial decoder input is built from
      p.start and p.input.batch_size.
    pronunciations: NestedMap with pronunciations - [*, max_pronunciation_len]
      tensor of pronunciations. The second dimension must be statically
      known.
    is_inference: If False then uses teacher forcing else does autoregression.

  Returns:
    NestedMap with loss, per_sequence_losses,labels, a
    [*, max_pronunciation_len] tensor of predictions, and attention
    ([*, max_pronunciation_len, max_spelling_len]), and
    neighbor_attention ([*, max_pronunciation_len, max_neighbors])
    tensors, along with the raw batch passed through from the encoder.
  """
  p = self.params
  targets = pronunciations.pronunciations
  # Static number of decoding steps; also sizes the TensorArrays below.
  t_len = int(targets.get_shape().as_list()[1])
  t_idx = tf.constant(0)
  attention = tf.TensorArray(dtype=tf.float32, size=t_len)
  neighbor_attention = tf.TensorArray(dtype=tf.float32, size=t_len)

  outputs = tf.TensorArray(dtype=tf.float32, size=t_len)

  # Loop until every target position has been decoded.
  loop_cond = lambda t_idx, ts, *_: tf.less(t_idx, t_len)

  # Every sequence starts from the p.start symbol.
  dec_input = tf.convert_to_tensor([p.start] * p.input.batch_size)
  state = encoder_outputs.state

  # pylint: disable=missing-docstring
  def loop_body(t_idx, dec_input, attention, neighbor_attention, state,
                outputs):
    # One decoding step: record predictions and attention weights, then pick
    # the next decoder input.
    decoder_result = self.Decode(encoder_outputs, dec_input, state)

    outputs = outputs.write(t_idx, decoder_result.predictions)
    attention = attention.write(t_idx, decoder_result.attention_weights)
    neighbor_attention = neighbor_attention.write(
        t_idx,
        tf.cast(decoder_result.neighbor_attention_weights, dtype=tf.float32))

    if is_inference:
      # Autoregression: feed back the most likely symbol.
      dec_input = tf.cast(tf.argmax(decoder_result.predictions, 1), tf.int32)
    else:
      # Teacher forcing: feed the target at this position.
      dec_input = targets[:, t_idx]
    t_idx = t_idx + 1
    state = decoder_result.state
    return t_idx, dec_input, attention, neighbor_attention, state, outputs

  _, _, attention, neighbor_attention, state, outputs = tf.while_loop(
      loop_cond,
      loop_body,
      loop_vars=[
          t_idx, dec_input, attention, neighbor_attention, state, outputs
      ])

  # Stacked outputs are step-major; swap the first two axes.
  outputs = tf.transpose(outputs.stack(), [1, 0, 2])
  labels = tf.argmax(outputs, axis=-1)
  # Positions whose target id is 0 do not contribute to the loss.
  mask = tf.cast(
      tf.math.logical_not(tf.math.equal(targets, 0)), dtype=tf.float32)
  loss = self._loss_object(targets, outputs, sample_weight=mask)
  loss = tf.reduce_sum(loss, axis=1)
  # Normalized by the full static length, not by the unmasked length.
  per_sequence_losses = (loss / t_len)
  loss = tf.reduce_mean(per_sequence_losses)
  predictions = py_utils.NestedMap()
  predictions.loss = loss
  predictions.per_sequence_losses = per_sequence_losses
  predictions.labels = labels
  # NOTE(review): tf.squeeze without an axis removes every size-1 dimension,
  # so a batch size (or any other dimension) of 1 would also be dropped here
  # and the transposes below would then fail -- confirm this cannot occur.
  predictions.attention = tf.transpose(
      tf.squeeze(attention.stack()), perm=[1, 0, 2])
  if p.use_neighbors:
    predictions.neighbor_attention = tf.transpose(
        tf.squeeze(neighbor_attention.stack()), perm=[1, 0, 2])
  else:
    predictions.neighbor_attention = tf.squeeze(neighbor_attention.stack())
  # Expose this for subsequent data analysis
  predictions.batch = encoder_outputs.batch
  return predictions
def _RemoveChannelDim(self, pcm_audio_data): if pcm_audio_data.shape.rank == 3: pcm_audio_data = tf.squeeze(pcm_audio_data, 2) assert pcm_audio_data.shape.rank == 2, ( 'MelAsrFrontend only supports one channel') return pcm_audio_data
def BuildInputBatch(self, batch_size, features_list, bucket_keys=None):
  """Builds an input batch.

  Args:
    batch_size: Currently unused by this implementation; static shapes are
      derived from self.InfeedBatchSize() instead.
    features_list: Use this list to build the batch. Must unpack into
      (utt_ids, tgt_ids, tgt_labels, tgt_paddings, src_frames, src_paddings).
    bucket_keys: If not None, bucket_keys[i] is the bucketing key of the i-th
      sample. Stored on the batch as-is.

  Returns:
    py_utils.NestedMap with feature names as keys and tensors as values:
    `bucket_keys`, `sample_ids` (only when not running on TPU), `src`
    (src_inputs, paddings) and `tgt` (ids, labels, paddings, weights).
  """
  p = self.params

  batch = py_utils.NestedMap()
  batch.bucket_keys = bucket_keys

  (utt_ids, tgt_ids, tgt_labels, tgt_paddings, src_frames,
   src_paddings) = features_list

  if not py_utils.use_tpu():
    batch.sample_ids = utt_ids

  src_frames, src_paddings = self._MaybePadSourceInputs(
      src_frames, src_paddings)

  # We expect src_inputs to be of shape
  # [batch_size, num_frames, feature_dim, channels].
  src_frames = tf.expand_dims(src_frames, axis=-1)

  # Convert target ids, labels, paddings, and weights from shape [batch_size,
  # 1, num_frames] to [batch_size, num_frames]
  tgt_ids = tf.squeeze(tgt_ids, axis=1)
  tgt_labels = tf.squeeze(tgt_labels, axis=1)
  tgt_paddings = tf.squeeze(tgt_paddings, axis=1)

  if p.pad_to_max_seq_length:
    assert p.source_max_length
    assert p.target_max_length

    if all(x == p.bucket_batch_limit[0] for x in p.bucket_batch_limit):
      # Set the input batch size as an int rather than a tensor.
      src_frames_shape = (self.InfeedBatchSize(), p.source_max_length,
                          p.frame_size, 1)
      src_paddings_shape = (self.InfeedBatchSize(), p.source_max_length)
      tgt_shape = (self.InfeedBatchSize(), p.target_max_length)
    else:
      # The message needs a %s placeholder for its argument; without one the
      # logging call fails to format and the warning is lost.
      tf.logging.warning(
          'Could not set static input shape since not all bucket batch sizes '
          'are the same: %s', p.bucket_batch_limit)
      src_frames_shape = None
      src_paddings_shape = None
      tgt_shape = None

    # Pad the sequence dimension up to the configured maximum lengths; the
    # paddings tensors are extended with 1s, everything else with 0s.
    src_frames = py_utils.PadSequenceDimension(
        src_frames, p.source_max_length, 0, shape=src_frames_shape)
    src_paddings = py_utils.PadSequenceDimension(
        src_paddings, p.source_max_length, 1, shape=src_paddings_shape)
    tgt_ids = py_utils.PadSequenceDimension(
        tgt_ids, p.target_max_length, 0, shape=tgt_shape)
    tgt_labels = py_utils.PadSequenceDimension(
        tgt_labels, p.target_max_length, 0, shape=tgt_shape)
    tgt_paddings = py_utils.PadSequenceDimension(
        tgt_paddings, p.target_max_length, 1, shape=tgt_shape)

  batch.src = py_utils.NestedMap(src_inputs=src_frames, paddings=src_paddings)
  batch.tgt = py_utils.NestedMap(
      ids=tgt_ids,
      labels=tgt_labels,
      paddings=tgt_paddings,
      weights=1.0 - tgt_paddings)

  return batch
def _Squeeze(self, name):
  """Returns a `self._Fn` layer called `name` that squeezes axis 2.

  The shape function drops entry 2 of the input shape, and the flops function
  always reports 1.
  """

  def _SqueezeAxis2(x):
    return tf.squeeze(x, 2)

  def _ShapeWithoutAxis2(x):
    return tshape.Shape(x[0:2] + x[3:])

  def _ConstantFlops(x):
    del x  # The flops estimate does not depend on the input.
    return 1

  return self._Fn(
      name,
      fn=_SqueezeAxis2,
      fn_out=_ShapeWithoutAxis2,
      fn_flops=_ConstantFlops)
def __init__(self, params):
  """Builds the source/target tensors and per-utterance id tensors.

  Reads one batch from `self._BuildDataSource()`, reshapes sources and
  targets, and, when `p.pad_to_max_seq_length` is set, pads both the batch and
  the sequence dimensions. Results are stored on `self._src`, `self._tgt`,
  `self._sample_ids`, `self._audio_document_ids`,
  `self._num_utterances_in_audio_document` and `self._bucket_keys`.

  Args:
    params: Parameters for this input generator, forwarded to the base class.
  """
  super().__init__(params)
  p = self.params

  (utt_ids, audio_document_ids, num_utterances_in_audio_document, tgt_ids,
   tgt_labels, tgt_paddings, src_frames,
   src_paddings), self._bucket_keys = self._BuildDataSource()

  self._sample_ids = utt_ids

  src_frames, src_paddings = self._MaybePadSourceInputs(
      src_frames, src_paddings)

  # We expect src_inputs to be of shape
  # [batch_size, num_frames, feature_dim, channels].
  src_frames = tf.expand_dims(src_frames, axis=-1)

  # Convert target ids, labels, paddings, and weights from shape [batch_size,
  # 1, num_frames] to [batch_size, num_frames]
  tgt_ids = tf.squeeze(tgt_ids, axis=1)
  tgt_labels = tf.squeeze(tgt_labels, axis=1)
  tgt_paddings = tf.squeeze(tgt_paddings, axis=1)

  if p.pad_to_max_seq_length:
    assert p.source_max_length
    assert p.target_max_length

    if all(x == p.bucket_batch_limit[0] for x in p.bucket_batch_limit):
      # Set the input batch size as an int rather than a tensor.
      src_frames_shape = (self.InfeedBatchSize(), p.source_max_length,
                          p.frame_size, 1)
      src_paddings_shape = (self.InfeedBatchSize(), p.source_max_length)
      tgt_shape = (self.InfeedBatchSize(), p.target_max_length)
    else:
      # The message needs a '%s' placeholder: logging applies `msg % args`
      # lazily, and without it the bucket limits are never rendered.
      tf.logging.warning(
          'Could not set static input shape since not all bucket batch sizes '
          'are the same: %s', p.bucket_batch_limit)
      src_frames_shape = None
      src_paddings_shape = None
      tgt_shape = None

    def _PadIdsToInfeedBatch(ids):
      """Pads `ids` along the batch dimension and returns a [BatchSize] tensor."""
      ids = py_utils.PadBatchDimension(ids, self.InfeedBatchSize(),
                                       type(self).PAD_INDEX)
      # The original author observed that the padded tensor comes back as
      # [BatchSize, 1] rather than [BatchSize] (cause not established here),
      # so drop the trailing axis and pin the static shape.
      ids = tf.squeeze(ids, axis=1)
      return tf.ensure_shape(ids, self.InfeedBatchSize())

    # Pad the batch dimension first: sources/ids/labels with 0, paddings
    # with 1 so that the added rows count as padding.
    src_frames = py_utils.PadBatchDimension(src_frames,
                                            self.InfeedBatchSize(), 0)
    src_paddings = py_utils.PadBatchDimension(src_paddings,
                                              self.InfeedBatchSize(), 1)
    tgt_ids = py_utils.PadBatchDimension(tgt_ids, self.InfeedBatchSize(), 0)
    tgt_labels = py_utils.PadBatchDimension(tgt_labels,
                                            self.InfeedBatchSize(), 0)
    tgt_paddings = py_utils.PadBatchDimension(tgt_paddings,
                                              self.InfeedBatchSize(), 1)

    # Id-like tensors are padded with PAD_INDEX.
    self._sample_ids = _PadIdsToInfeedBatch(self._sample_ids)
    audio_document_ids = _PadIdsToInfeedBatch(audio_document_ids)
    num_utterances_in_audio_document = _PadIdsToInfeedBatch(
        num_utterances_in_audio_document)

    # Then pad the sequence dimension up to the configured maximum lengths.
    src_frames = py_utils.PadSequenceDimension(
        src_frames, p.source_max_length, 0, shape=src_frames_shape)
    src_paddings = py_utils.PadSequenceDimension(
        src_paddings, p.source_max_length, 1, shape=src_paddings_shape)
    tgt_ids = py_utils.PadSequenceDimension(
        tgt_ids, p.target_max_length, 0, shape=tgt_shape)
    tgt_labels = py_utils.PadSequenceDimension(
        tgt_labels, p.target_max_length, 0, shape=tgt_shape)
    tgt_paddings = py_utils.PadSequenceDimension(
        tgt_paddings, p.target_max_length, 1, shape=tgt_shape)

  tgt = py_utils.NestedMap(
      ids=tgt_ids,
      labels=tgt_labels,
      paddings=tgt_paddings,
      weights=1.0 - tgt_paddings)
  src = py_utils.NestedMap(src_inputs=src_frames, paddings=src_paddings)

  self._tgt = tgt
  self._src = src
  self._audio_document_ids = audio_document_ids
  self._num_utterances_in_audio_document = num_utterances_in_audio_document
def ComputeLoss(self, theta, predictions, input_batch):
  """Computes loss and other metrics for the given predictions.

  Args:
    theta: A `.NestedMap` object containing variable values of this task. Not
      read by this method.
    predictions: The output of `ComputePredictions`. The fields read here are:
      residuals - [b, nx, ny, nz, na, 7], where na is the number of anchor
        boxes per cell and the last axis is (dx, dy, dz, dw, dl, dh, dt).
      classification_logits - [b, nx, ny, nz, na, num_classes].
      predicted_dir - direction classifier logits; only read when
        `p.direction_classifier_weight > 0`.
    input_batch: The input batch from which we access the groundtruth. The
      fields read here are `assigned_cls_mask`, `assigned_reg_mask`,
      `assigned_gt_labels`, `anchor_localization_residuals`, `anchor_bboxes`
      and, for the direction loss, `assigned_gt_bbox`.

  Returns:
    Two dicts defined as BaseTask.ComputeLoss: a metrics dict mapping names to
    (value, weight) pairs, and a per-example dict with the predicted residuals
    and classification logits.
  """
  p = self.params
  predicted_residuals = py_utils.HasShape(
      predictions.residuals, [-1, -1, -1, -1, p.num_anchors, 7])
  predicted_class_logits = py_utils.HasShape(
      predictions.classification_logits,
      [-1, -1, -1, -1, p.num_anchors, p.num_classes])
  bs, nx, ny, nz, na, _ = py_utils.GetShape(predicted_class_logits, 6)

  # Compute class and regression weights. class_weights stays rank 5 while
  # reg_weights gets a trailing axis (rank 6) to broadcast over residuals.
  class_weights = input_batch.assigned_cls_mask
  class_weights = py_utils.HasShape(class_weights, [bs, nx, ny, nz, na])
  reg_weights = input_batch.assigned_reg_mask
  reg_weights = py_utils.HasShape(reg_weights, [bs, nx, ny, nz, na])
  reg_weights = tf.expand_dims(reg_weights, -1)

  if p.loss_norm_type == LossNormType.NORM_BY_NUM_POSITIVES:
    # Compute number of positive anchors per example.
    foreground_mask = py_utils.HasShape(input_batch.assigned_reg_mask,
                                        [bs, nx, ny, nz, na])
    # Sum to get the number of foreground anchors for each example.
    loss_normalization = tf.reduce_sum(foreground_mask, axis=[1, 2, 3, 4])
    # Clamp to at least 1 so examples without foreground anchors do not
    # divide by zero.
    loss_normalization = tf.maximum(loss_normalization,
                                    tf.ones_like(loss_normalization))

    # Reshape for broadcasting, matching the rank of each weight tensor.
    # Dividing the rank-5 class_weights by a rank-6 [bs, 1, 1, 1, 1, 1] tensor
    # would align on trailing axes and yield [bs, bs, nx, ny, nz, na], i.e.
    # every example would be divided by every example's normalizer.
    class_weights /= tf.reshape(loss_normalization, [bs, 1, 1, 1, 1])
    reg_weights /= tf.reshape(loss_normalization, [bs, 1, 1, 1, 1, 1])

  # Classification loss.
  assigned_gt_labels = py_utils.HasShape(input_batch.assigned_gt_labels,
                                         [bs, nx, ny, nz, na])
  class_loss = py_utils.SigmoidCrossEntropyFocalLoss(
      logits=predicted_class_logits,
      labels=tf.one_hot(assigned_gt_labels, p.num_classes),
      alpha=p.focal_loss_alpha,
      gamma=p.focal_loss_gamma)
  class_loss *= class_weights[..., tf.newaxis]
  class_loss_sum = tf.reduce_sum(class_loss)

  # Regression loss.
  anchor_localization_residuals = py_utils.HasShape(
      input_batch.anchor_localization_residuals, [bs, nx, ny, nz, na, 7])

  # Location and dimensions loss.
  reg_loc_and_dims_loss = self._utils.ScaledHuberLoss(
      predictions=py_utils.HasShape(predicted_residuals[..., :6],
                                    [bs, nx, ny, nz, na, 6]),
      labels=anchor_localization_residuals[..., :6],
      delta=1 / (3.**2))

  # Rotation residual difference. Uses the shape-checked groundtruth tensor
  # so the rotation term is covered by the same assertion as the other terms.
  rot_delta = (
      predicted_residuals[..., 6:] - anchor_localization_residuals[..., 6:])

  if p.use_atan2_heading_loss:
    # Rotation loss on the delta wrapped through atan2(sin, cos).
    atan2_of_delta = tf.atan2(tf.sin(rot_delta), tf.cos(rot_delta))
    reg_rot_loss = self._utils.ScaledHuberLoss(
        predictions=atan2_of_delta,
        labels=tf.zeros_like(atan2_of_delta),
        delta=1 / (3.**2))
  else:
    # Rotation loss with SmoothL1(sin(delta)).
    reg_rot_loss = self._utils.ScaledHuberLoss(
        predictions=tf.sin(rot_delta),
        labels=tf.zeros_like(rot_delta),
        delta=1 / (3.**2))

  # Direction loss
  if p.direction_classifier_weight > 0.0:
    # The target rotations are in the assigned_gt_bbox tensor,
    # which already has assigned a gt bounding box to every anchor.
    rot_target = input_batch.assigned_gt_bbox[..., 6]
    # If rotation is > 0, the class is 1, else it is 0.
    rot_dir = tf.cast(rot_target > 0., tf.int32)

    # Compute one-hot labels as a target.
    rot_dir_onehot = tf.one_hot(rot_dir, 2)

    # Manually handle loss reduction.
    dir_loss = tf.losses.softmax_cross_entropy(
        onehot_labels=rot_dir_onehot,
        logits=predictions.predicted_dir,
        weights=tf.squeeze(reg_weights, axis=-1),
        reduction=tf.losses.Reduction.NONE)
    # Reduce across all dimensions (we'll divide by the batch size below).
    dir_loss_sum = tf.reduce_sum(dir_loss)
  else:
    dir_loss_sum = 0.0

  # Compute loss contribution from location and dimension separately.
  reg_loc_loss = reg_loc_and_dims_loss[..., :3] * reg_weights
  reg_loc_loss_sum = tf.reduce_sum(reg_loc_loss)

  reg_dim_loss = reg_loc_and_dims_loss[..., 3:6] * reg_weights
  reg_dim_loss_sum = tf.reduce_sum(reg_dim_loss)

  # Compute rotation loss contribution.
  reg_rot_loss *= reg_weights
  reg_rot_loss_sum = tf.reduce_sum(reg_rot_loss)

  # Num. predictions.
  # TODO(zhifengc): Consider other normalization factors. E.g., # of bboxes.
  preds = tf.cast(bs, class_loss_sum.dtype)

  # Normalize all of the components by batch size.
  reg_loc_loss = reg_loc_loss_sum / preds
  reg_dim_loss = reg_dim_loss_sum / preds
  reg_rot_loss = reg_rot_loss_sum / preds
  class_loss = class_loss_sum / preds
  dir_loss = dir_loss_sum / preds

  # Compute total localization regression loss.
  reg_loss = (
      p.location_loss_weight * reg_loc_loss +
      p.dimension_loss_weight * reg_dim_loss +
      p.rotation_loss_weight * reg_rot_loss)

  # Apply weights to normalized class losses.
  loss = (
      class_loss * p.classification_loss_weight +
      reg_loss * p.localization_loss_weight +
      dir_loss * p.direction_classifier_weight)

  metrics_dict = {
      'loss': (loss, preds),
      'loss/class': (class_loss, preds),
      'loss/reg': (reg_loss, preds),
      'loss/reg/rot': (reg_rot_loss, preds),
      'loss/reg/loc': (reg_loc_loss, preds),
      'loss/reg/dim': (reg_dim_loss, preds),
      'loss/dir': (dir_loss, preds),
  }

  # Calculate dimension errors
  min_angle_rad = -np.pi if p.use_atan2_heading_loss else 0
  gt_bboxes = self._utils_3d.ResidualsToBBoxes(
      input_batch.anchor_bboxes,
      anchor_localization_residuals,
      min_angle_rad=min_angle_rad,
      max_angle_rad=np.pi)
  predicted_bboxes = self._utils_3d.ResidualsToBBoxes(
      input_batch.anchor_bboxes,
      predicted_residuals,
      min_angle_rad=min_angle_rad,
      max_angle_rad=np.pi)
  dimension_errors_dict = self._BBoxDimensionErrors(gt_bboxes,
                                                    predicted_bboxes,
                                                    reg_weights)
  metrics_dict.update(dimension_errors_dict)

  per_example_dict = {
      'residuals': predicted_residuals,
      'classification_logits': predicted_class_logits,
  }

  return metrics_dict, per_example_dict
def FProp(self, theta, in_nmap, state0=None):
  """Computes running frame-weighted mean (and std-dev) features.

  Args:
    theta: A NestedMap of layer weights. `theta.frame_weight_ffn` is read only
      when `p.use_weighted_frames` is True and `p.stats_type` is not
      'PASS_THRU'.
    in_nmap: A NestedMap. Members read here:
      - in_nmap[p.features_name]: Features tensor of shape
        [len, batch, input_dim], tf.float32.
      - in_nmap[p.paddings_name]: Paddings tensor of shape [len, batch],
        tf.float32.
    state0: A NestedMap with the sufficient statistics accumulated over all
      previous packets (excluding the current one), or a null state when not
      streaming. When it is not a null state, `state0.accumulated_stats` must
      provide:
      - count: [batch_size], tf.float32
      - sum_x: [batch_size, input_dim], tf.float32
      - sum_xx: [batch_size, input_dim], tf.float32 (read only for 'MEAN_STD')

  Returns:
    A NestedMap (out_nmap).

    For 'PASS_THRU', a copy of in_nmap whose `state` is `self.NullState()`.

    For 'MEAN' and 'MEAN_STD', a new NestedMap holding exactly three entries:
    - out_nmap[p.features_name]: [len, batch, output_dim] running statistics;
      output_dim is input_dim for 'MEAN' and 2 * input_dim for 'MEAN_STD'
      (means concatenated with standard deviations).
    - out_nmap[p.paddings_name]: the input paddings, unchanged.
    - out_nmap.state: `self.NullState()` when state0 is a null state.
      Otherwise it additionally carries `accumulated_stats` with `count`,
      `sum_x` and (for 'MEAN_STD') `sum_xx` taken at the last frame, e.g.
      `out_nmap.state.accumulated_stats.count`.
  """
  p = self.params

  # PASS_THRU: hand back a copy of the input (so the caller's NestedMap is not
  # mutated) with a state attached for consistency with the other modes.
  if p.stats_type == 'PASS_THRU':
    passthru_nmap = in_nmap.copy()
    passthru_nmap.state = self.NullState()
    return passthru_nmap

  feats = in_nmap[p.features_name]
  pads = in_nmap[p.paddings_name]
  with_std = p.stats_type == 'MEAN_STD'

  # Per-frame weight: 1 on real frames, 0 on padded frames.
  frame_w = tf.cast(1.0 - pads, dtype=p.dtype)

  if p.use_weighted_frames:
    # The feed-forward net emits one weight per frame; squeezing axis 2 gives
    # a [len, batch] tensor that scales the speech/non-speech weight.
    learned_w = tf.squeeze(
        self.frame_weight_ffn.FProp(theta.frame_weight_ffn, feats, None),
        axis=2)
    frame_w = frame_w * learned_w

  # Small floor so the running count is never exactly zero.
  frame_w = frame_w + p.epsilon

  # Running sums over the time axis for the current packet.
  run_count = tf.math.cumsum(frame_w, axis=0)
  run_sum_x = tf.math.cumsum(feats * frame_w[:, :, tf.newaxis], axis=0)
  if with_std:
    run_sum_xx = tf.math.cumsum(
        feats * feats * frame_w[:, :, tf.newaxis], axis=0)

  state1 = self.NullState()

  # Streaming mode: fold in the totals from earlier packets and remember the
  # totals at the last frame for the next call.
  if not self.IsNullState(state0):
    previous = state0.accumulated_stats
    run_count = run_count + previous.count[tf.newaxis, :]
    run_sum_x = run_sum_x + previous.sum_x[tf.newaxis, :, :]

    state1.accumulated_stats = py_utils.NestedMap()
    state1.accumulated_stats.count = run_count[-1, :]
    state1.accumulated_stats.sum_x = run_sum_x[-1, :, :]

    if with_std:
      run_sum_xx = run_sum_xx + previous.sum_xx[tf.newaxis, :, :]
      state1.accumulated_stats.sum_xx = run_sum_xx[-1, :, :]

  # Running mean.
  stats = tf.math.divide(run_sum_x, run_count[:, :, tf.newaxis])

  # Optionally append the running standard deviation, sqrt(E[x^2] - E[x]^2),
  # with epsilon added under the square root.
  if with_std:
    run_mean_xx = tf.math.divide(run_sum_xx, run_count[:, :, tf.newaxis])
    run_std = tf.math.sqrt(run_mean_xx - stats * stats + p.epsilon)
    stats = tf.concat((stats, run_std), axis=2)

  out_nmap = py_utils.NestedMap()
  out_nmap[p.features_name] = stats
  out_nmap[p.paddings_name] = pads
  out_nmap.state = state1
  return out_nmap
def MergeBeamSearchOutputs(max_hyps_per_beam, beam_search_outputs):
  """Merges beam search hyps from multiple decoders.

  Every non-None field of each decoder output is reshaped to
  [source_batch, hyps_per_beam, -1] and concatenated along the hyps axis. The
  `max_hyps_per_beam` best-scoring hyps per beam are then gathered and each
  field is reshaped back to its own layout.

  Args:
    max_hyps_per_beam: the number of top hyps in the merged results. Must be
      less than or equal to total number of input hyps.
    beam_search_outputs: a list of BeamSearchDecodeOutput objects. Must share
      the same source_batch and max sequence length.

  Returns:
    A BeamSearchDecodeOutput object containing max_hyps_per_beam hypotheses per
    beam.

  Raises:
    ValueError: if a field is populated in only some of the outputs, or if a
      populated field is not one this function knows how to reshape.
  """
  source_batch = tf.shape(beam_search_outputs[0].topk_hyps)[0]

  # Field name -> list of [source_batch, hyps_per_beam, -1] tensors, one per
  # decoder output that populates the field.
  per_field = {}
  for decoder_out in beam_search_outputs:
    hyps_per_beam = py_utils.with_dependencies([
        py_utils.assert_equal(source_batch,
                              tf.shape(decoder_out.topk_hyps)[0]),
    ], tf.shape(decoder_out.topk_hyps)[1])
    for field, tensor in six.iteritems(decoder_out._asdict()):
      if tensor is None:
        continue
      if field == 'done_hyps':
        # done_hyps is transposed before being laid out like the other fields.
        tensor = tf.transpose(tensor)
      per_field.setdefault(field, []).append(
          tf.reshape(tensor, [source_batch, hyps_per_beam, -1]))

  # Concatenate the tensors along the 'num_hyps_per_beam' dimension.
  concatenated = {}
  num_outputs = len(beam_search_outputs)
  for field, tensors in six.iteritems(per_field):
    if len(tensors) != num_outputs:
      raise ValueError('Incomplete values for %s: %s' %
                       (field, beam_search_outputs))
    concatenated[field] = tf.concat(tensors, axis=1)

  # Zero-length hyps get a very low score so they are not selected.
  raw_scores = concatenated['topk_scores']
  is_empty = tf.equal(concatenated['topk_lens'], 0)
  floor_scores = tf.fill(tf.shape(raw_scores), -1e6)
  scores = tf.squeeze(tf.where(is_empty, floor_scores, raw_scores), -1)

  # Select top max_hyps_per_beam indices per beam.
  _, top_indices = tf.nn.top_k(scores, max_hyps_per_beam)
  batch_ids = tf.tile(
      tf.expand_dims(tf.range(source_batch), -1), [1, max_hyps_per_beam])
  # [source_batch, max_hyps_per_beam, 2]
  gather_indices = tf.stack([batch_ids, top_indices], axis=-1)

  # Gather the merged top hyps according to 'gather_indices'. Fields that are
  # None everywhere keep the value from the first output.
  merged = beam_search_outputs[0]._asdict()
  total_hyps = source_batch * max_hyps_per_beam
  target_shapes = {
      'topk_hyps': [source_batch, max_hyps_per_beam],
      'topk_ids': [total_hyps, -1],
      'topk_lens': [total_hyps],
      'topk_scores': [total_hyps],
      'topk_decoded': [total_hyps],
  }
  for field, tensor in six.iteritems(concatenated):
    picked = tf.gather_nd(tensor, gather_indices)
    if field == 'done_hyps':
      # Undo the transpose applied before concatenation.
      merged[field] = tf.transpose(tf.reshape(picked, [total_hyps, -1]))
    elif field in target_shapes:
      merged[field] = tf.reshape(picked, target_shapes[field])
    else:
      raise ValueError('Unexpected field: %s' % field)
  return BeamSearchDecodeOutput(**merged)
def FProp(self, theta, input_data):
  """Samples query points and groups neighboring points around each of them.

  Args:
    theta: A NestedMap object containing weights' values of this layer and its
      children layers. Not read by this method.
    input_data: A NestedMap object containing 'points', 'features', 'padding'
      Tensors, all of type tf.float32.
      'points': Shape [N, P1, 3]
      'features': Shape [N, P1, F]
      'padding': Shape [N, P1] where 0 indicates real, 1 indicates padded.

  Returns:
    A NestedMap consisting of the following two NestedMaps,
      grouped_points: consists of the grouped points (expressed relative to
        their query point), features and padding.
      query_points: consists of the sampled points and padding.
  """
  p = self.params

  feats = input_data.features
  n, p1, c = py_utils.GetShape(feats)
  pts = py_utils.HasShape(input_data.points, [n, p1, 3])
  pts_padding = py_utils.HasShape(input_data.padding, [n, p1])

  # Sampling: pick p.num_samples query points with farthest point sampling.
  sampled_idx, _ = car_lib.FarthestPointSampler(
      pts, pts_padding, num_sampled_points=p.num_samples)
  gathered_queries = car_lib.MatmulGather(pts,
                                          tf.expand_dims(sampled_idx, -1))
  query_pts = tf.squeeze(gathered_queries, -2)

  # Grouping: indices (and padding) of the points around each query point.
  grouped_idx, grouped_padding = car_lib.NeighborhoodIndices(
      pts,
      query_pts,
      p.group_size,
      points_padding=pts_padding,
      max_distance=p.ball_radius,
      sample_neighbors_uniformly=p.sample_neighbors_uniformly)
  grouped_pts = car_lib.MatmulGather(pts, grouped_idx)
  # Normalize the grouped points based on the location of the query point.
  grouped_pts -= tf.expand_dims(query_pts, -2)

  grouped_feats = car_lib.MatmulGather(feats, grouped_idx)

  # Get the padding for the query points.
  query_padding = tf.batch_gather(pts_padding, sampled_idx)

  # Verify the shapes of output tensors.
  query_pts = py_utils.HasShape(query_pts, [n, p.num_samples, 3])
  query_padding = py_utils.HasShape(query_padding, [n, p.num_samples])
  grouped_feats = py_utils.HasShape(grouped_feats,
                                    [n, p.num_samples, p.group_size, c])
  grouped_padding = py_utils.HasShape(grouped_padding,
                                      [n, p.num_samples, p.group_size])

  return py_utils.NestedMap(
      grouped_points=py_utils.NestedMap(
          points=grouped_pts, features=grouped_feats,
          padding=grouped_padding),
      query_points=py_utils.NestedMap(
          points=query_pts, padding=query_padding))
def _maybe_update_block_mask(self, weights, threshold):
  """Performs block-granular masking of the weights.

  Block pruning occurs only if the block_height or block_width is > 1 and
  if the weight tensor, when squeezed, has ndims = 2. Otherwise, elementwise
  pruning occurs.

  Args:
    weights: The weight tensor that needs to be masked.
    threshold: The current threshold value. The function will compute a new
      threshold and return the exponential moving average using the current
      value of threshold

  Returns:
    new_threshold: The new value of the threshold based on weights, and
      sparsity at the current global_step
    new_mask: A tensor with the same shape as weights, as produced by
      `self._update_mask` (expanded back from block granularity when block
      pruning applies), indicating which values in weights fall below the
      threshold.

  Raises:
    ValueError: if block pooling function is not AVG or MAX
  """
  # Work on a copy: resolving -1 entries below must not write into the list
  # returned by `_get_block_dims`, which this method does not own and which
  # may be reused for other weight tensors.
  block_dims = list(self._get_block_dims(weights.op.name))
  squeezed_weights = tf.squeeze(weights)
  if squeezed_weights.get_shape().ndims != 2 or block_dims == [1, 1]:
    # Not a 2-D weight or trivial blocks: fall back to elementwise pruning.
    return self._update_mask(weights, threshold)

  # A block dimension of -1 means "span the whole axis".
  for i in range(2):
    if block_dims[i] == -1:
      block_dims[i] = squeezed_weights.get_shape()[i]

  if self._block_pooling_function not in ['AVG', 'MAX']:
    raise ValueError('Unknown pooling function for block sparsity: %s' %
                     self._block_pooling_function)

  with tf.name_scope(weights.op.name + '_pruning_ops'):
    abs_weights = tf.abs(squeezed_weights)

    pool_window = block_dims
    pool_fn = pruning_utils.factorized_pool
    squeeze_axis = None
    if not self._spec.use_tpu:
      # tf.nn.pool is given a 4-D input here, so wrap the 2-D weights with
      # leading and trailing singleton axes and squeeze them away afterwards.
      pool_fn = tf.nn.pool
      abs_weights = tf.reshape(
          abs_weights,
          [1, abs_weights.get_shape()[0],
           abs_weights.get_shape()[1], 1])
      squeeze_axis = [0, 3]

    # Non-overlapping pooling: one pooled value per block.
    pooled_weights = pool_fn(
        abs_weights,
        window_shape=pool_window,
        pooling_type=self._block_pooling_function,
        strides=pool_window,
        padding='SAME',
        name=weights.op.name + '_pooled')

    if pooled_weights.get_shape().ndims != 2:
      pooled_weights = tf.squeeze(pooled_weights, axis=squeeze_axis)

    smoothed_threshold, new_mask = self._update_mask(pooled_weights,
                                                     threshold)

    # Expand the per-block mask back to element granularity, then trim the
    # overhang that 'SAME' padding introduced.
    updated_mask = pruning_utils.expand_tensor(new_mask, block_dims)
    sliced_mask = tf.slice(
        updated_mask, [0, 0],
        [squeezed_weights.get_shape()[0],
         squeezed_weights.get_shape()[1]])

  return smoothed_threshold, tf.reshape(sliced_mask, tf.shape(weights))
def __init__(self, params):
  """Builds the source and target tensors for this ASR input generator.

  Reads one batch from `self._BuildDataSource()`, reshapes sources and
  targets, and, when `p.pad_to_max_seq_length` is set, pads the sequence
  dimension to the configured maximum lengths. Results are stored on
  `self._src`, `self._tgt`, `self._sample_ids` and `self._bucket_keys`.

  Args:
    params: Parameters for this input generator, forwarded to the base class.
  """
  super(AsrInput, self).__init__(params)
  p = self.params

  (utt_ids, tgt_ids, tgt_labels, tgt_paddings, src_frames,
   src_paddings), self._bucket_keys = self._BuildDataSource()

  self._sample_ids = utt_ids

  src_frames, src_paddings = self._MaybePadSourceInputs(
      src_frames, src_paddings)

  # We expect src_inputs to be of shape
  # [batch_size, num_frames, feature_dim, channels].
  src_frames = tf.expand_dims(src_frames, axis=-1)

  # Convert target ids, labels, paddings, and weights from shape [batch_size,
  # 1, num_frames] to [batch_size, num_frames]
  tgt_ids = tf.squeeze(tgt_ids, axis=1)
  tgt_labels = tf.squeeze(tgt_labels, axis=1)
  tgt_paddings = tf.squeeze(tgt_paddings, axis=1)

  if p.pad_to_max_seq_length:
    assert p.source_max_length
    assert p.target_max_length

    if all(x == p.bucket_batch_limit[0] for x in p.bucket_batch_limit):
      # Set the input batch size as an int rather than a tensor.
      src_frames_shape = (self.InfeedBatchSize(), p.source_max_length,
                          p.frame_size, 1)
      src_paddings_shape = (self.InfeedBatchSize(), p.source_max_length)
      tgt_shape = (self.InfeedBatchSize(), p.target_max_length)
    else:
      # The message needs a '%s' placeholder: logging applies `msg % args`
      # lazily, and without it the bucket limits are never rendered.
      tf.logging.warning(
          'Could not set static input shape since not all bucket batch sizes '
          'are the same: %s', p.bucket_batch_limit)
      src_frames_shape = None
      src_paddings_shape = None
      tgt_shape = None

    # Sources and target ids/labels are padded with 0; paddings are padded
    # with 1 so that the added positions count as padding.
    src_frames = py_utils.PadSequenceDimension(
        src_frames, p.source_max_length, 0, shape=src_frames_shape)
    src_paddings = py_utils.PadSequenceDimension(
        src_paddings, p.source_max_length, 1, shape=src_paddings_shape)
    tgt_ids = py_utils.PadSequenceDimension(
        tgt_ids, p.target_max_length, 0, shape=tgt_shape)
    tgt_labels = py_utils.PadSequenceDimension(
        tgt_labels, p.target_max_length, 0, shape=tgt_shape)
    tgt_paddings = py_utils.PadSequenceDimension(
        tgt_paddings, p.target_max_length, 1, shape=tgt_shape)

  tgt = py_utils.NestedMap(
      ids=tgt_ids,
      labels=tgt_labels,
      paddings=tgt_paddings,
      weights=1.0 - tgt_paddings)
  src = py_utils.NestedMap(src_inputs=src_frames, paddings=src_paddings)

  self._tgt = tgt
  self._src = src
def _SqueezeFn(x):
  """Squeezes `x` along `axis`, a name bound outside this function."""
  squeezed = tf.squeeze(x, axis=axis)
  return squeezed
def FProp(self, theta, inputs, paddings, state0, labels=None):
  """Forward compute: runs the expert RNNs, mixes them and computes the loss.

  `self.rnns[0]` acts as the expert predictor whose activations are turned
  into a softmax gate over the remaining `p.number_of_experts` RNNs.

  Args:
    theta: A NestedMap of weights for this layer and its children.
    inputs: Rank-2 ids tensor; unpacked below as [seqlen, batch].
    paddings: Tensor with the same shape as `inputs`.
    state0: Previous state; must be truthy. `state0.rnns[i]` feeds the i-th
      RNN and `state0.merge` feeds the post-gating layer when it is enabled.
    labels: Passed through to `self.merge` when `p.add_postgating_rnn` is set;
      otherwise its `class_weights` and `class_ids` are read, so it must not
      be None in that case.

  Returns:
    A (xent_loss, state1) tuple, where `state1.rnns` holds the new state of
    every RNN and `state1.merge` is set when `p.add_postgating_rnn` is True.
  """
  p = self.params

  ids = py_utils.HasRank(inputs, 2)
  paddings = py_utils.HasShape(paddings, tf.shape(ids))
  seqlen, batch = tf.unstack(tf.shape(inputs), num=2)
  assert state0

  paddings_3d = tf.expand_dims(paddings, axis=2)

  # One embedding activation per RNN: the predictor plus every expert.
  if p.shared_emb:
    emb_act = [self.emb.EmbLookup(theta.emb, inputs)
              ] * (1 + p.number_of_experts)
  else:
    emb_act = [
        self.emb[i].EmbLookup(theta.emb[i], inputs)
        for i in range(1 + p.number_of_experts)
    ]

  # RNNs
  state1 = py_utils.NestedMap(rnns=[])
  rnns_act = []
  for i, emb in enumerate(emb_act):
    rnn_out, rnn_state = self.rnns[i].FProp(theta.rnns[i], emb, paddings_3d,
                                            state0.rnns[i])
    rnns_act.append(py_utils.HasRank(rnn_out, 3))
    state1.rnns.append(rnn_state)

  predictor_act = rnns_act[0]
  # [time, batch, experts, dims].
  expert_stacked = tf.stack(rnns_act[1:], axis=2)

  # Compute gating softmax. The 0-th rnns is used as the expert
  # predictor. Because SoftmaxLayer.Logits takes a matrix as input,
  # we reshape the predictor activation to a matrix here.
  predictor_matrix = tf.reshape(predictor_act, [seqlen * batch, -1])
  logits = self.domain_predictor_softmax.Logits(
      theta.domain_predictor_softmax, predictor_matrix)
  # [time, batch, experts]
  gating = tf.reshape(tf.nn.softmax(logits), [seqlen, batch, -1])

  # Mix the experts with a batched [1, experts] x [experts, dims] matmul.
  # [time, batch, 1, experts]
  gating_row = tf.expand_dims(gating, axis=2)
  # [time, batch, 1, dims]
  mixed = tf.matmul(gating_row, expert_stacked)
  # [time, batch, dims]
  combined = tf.squeeze(mixed, axis=2)

  if p.add_postgating_rnn:
    # Note that this layer includes 1 or more RNN layers followed
    # by a softmax.
    xent_loss, state1.merge = self.merge.FProp(theta.merge, combined,
                                               paddings, state0.merge, labels)
  else:
    xent_loss = self.output_softmax.FProp(
        theta=theta.output_softmax,
        inputs=combined,
        class_weights=labels.class_weights,
        class_ids=labels.class_ids)

  return xent_loss, state1