def FProp(self, theta, batch, state0=None):
  """Encodes source as represented by 'inputs' and 'paddings'.

  The inputs go through, in order: optional SpecAugment (only when not in
  eval mode), optional extra padding at the end of the time axis, the conv
  layers, the conv-LSTM layers (an rnn followed by a cnn per layer), a
  reshape that merges the trailing two dimensions, and finally the LSTM
  layers with optional residual/highway skips, projections and a stacking
  layer. Every stage also contributes a plot to the 'encoder_example'
  summary.

  Args:
    theta: A NestedMap object containing weights' values of this layer and its
      children layers.
    batch: A NestedMap with fields:

      - src_inputs - The inputs tensor. It is expected to be of shape [batch,
        time, feature_dim, channels].
      - paddings - The paddings tensor. It is expected to be of shape [batch,
        time].
    state0: Recurrent input state. Not supported/ignored by this encoder.

  Returns:
    A NestedMap containing

    - 'encoded': a feature tensor of shape [time, batch, depth]
    - 'padding': a 0/1 tensor of shape [time, batch]
    - 'state': the updated recurrent state. This encoder always sets it to an
      empty NestedMap.
    - '${layer_type}_${layer_index}': The per-layer encoder output, only
      present when p.extra_per_layer_outputs is set. Each one is a NestedMap
      containing 'encoded' and 'padding' similar to regular final outputs,
      except that 'encoded' from conv or conv_lstm layers are of shape
      [time, batch, depth, channels].

  Raises:
    AssertionError: If p.pad_steps > 0 while running on TPU.
  """
  p = self.params
  inputs, paddings = batch.src_inputs, batch.paddings
  outputs = py_utils.NestedMap()
  with tf.name_scope(p.name):
    # Adding specAugmentation (skipped in eval mode).
    if p.use_specaugment and not self.do_eval:
      inputs, paddings = self.specaugment.FProp(
          theta.specaugment, inputs, paddings)
    # Add a few extra padded timesteps at the end. This is for ensuring the
    # correctness of the conv-layers at the edges.
    if p.pad_steps > 0:
      # inplace_update() is not supported by TPU for now. Since we have done
      # padding on the input_generator, we may avoid this additional padding.
      assert not py_utils.use_tpu()
      # Both pads reuse the shape of the tensor they extend, with dimension 1
      # (time) overwritten by p.pad_steps. The input pad is all zeros and the
      # padding pad is all ones, i.e. the new frames are marked as padded.
      inputs_pad = tf.zeros(
          inplace_ops.inplace_update(tf.shape(inputs), 1, p.pad_steps),
          inputs.dtype)
      paddings_pad = tf.ones(
          inplace_ops.inplace_update(tf.shape(paddings), 1, p.pad_steps),
          paddings.dtype)
      inputs = tf.concat([inputs, inputs_pad], 1, name='inputs')
      paddings = tf.concat([paddings, paddings_pad], 1)

    # One plot entry per stage; they are reversed before being rendered below.
    plots = [
        summary_utils.PrepareSequenceForPlot(
            tf.transpose(inputs, [0, 1, 3, 2]), paddings, 'inputs')
    ]

    # The conv layers.
    conv_out = inputs
    out_padding = paddings
    for i, conv_layer in enumerate(self.conv):
      conv_out, out_padding = conv_layer.FProp(
          theta.conv[i], conv_out, out_padding)
      if p.extra_per_layer_outputs:
        # Zero out padded frames before exposing the per-layer output. Note
        # that the masked tensor is also what feeds the next layer.
        conv_out *= (1.0 - out_padding[:, :, tf.newaxis, tf.newaxis])
        outputs['conv_%d' % i] = py_utils.NestedMap(
            encoded=tf.transpose(conv_out, [1, 0, 2, 3]),  # to [t, b, d, c]
            padding=tf.transpose(out_padding))
      plots.append(
          summary_utils.PrepareSequenceForPlot(
              tf.transpose(conv_out, [0, 1, 3, 2]), out_padding,
              'conv_%d_out' % i))

    def TransposeFirstTwoDims(t):
      """Swaps the first two dimensions of `t`, keeping the remaining ones."""
      first_dim = tf.shape(t)[0]
      second_dim = tf.shape(t)[1]
      # Flatten everything after the first two dims so a fixed rank-3
      # permutation can be used, then restore the trailing dims.
      t_new = tf.transpose(
          tf.reshape(t, [first_dim, second_dim, -1]), [1, 0, 2])
      t_shape_new = tf.concat([[second_dim], [first_dim], tf.shape(t)[2:]], 0)
      return tf.reshape(t_new, t_shape_new)

    # Now the conv-lstm part.
    conv_lstm_out = conv_out
    conv_lstm_out_padding = out_padding
    for i, (rnn, cnn) in enumerate(
        zip(self.conv_lstm_rnn, self.conv_lstm_cnn)):
      conv_lstm_in = conv_lstm_out
      # Move time dimension to be the first.
      conv_lstm_in = TransposeFirstTwoDims(conv_lstm_in)
      conv_lstm_in = tf.expand_dims(conv_lstm_in, 2)
      conv_lstm_in_padding = tf.expand_dims(
          tf.transpose(conv_lstm_out_padding), 2)
      lstm_out = rnn.FProp(theta.conv_lstm_rnn[i], conv_lstm_in,
                           conv_lstm_in_padding)
      # Move time dimension to be the second.
      cnn_in = TransposeFirstTwoDims(lstm_out)
      # Drop the singleton dimension that was inserted for the rnn above.
      cnn_in = tf.squeeze(cnn_in, 2)
      cnn_in_padding = conv_lstm_out_padding
      cnn_out, cnn_out_padding = cnn.FProp(theta.conv_lstm_cnn[i], cnn_in,
                                           cnn_in_padding)
      conv_lstm_out, conv_lstm_out_padding = cnn_out, cnn_out_padding
      if p.extra_per_layer_outputs:
        # Zero out padded frames before exposing the per-layer output.
        conv_lstm_out *= (
            1.0 - conv_lstm_out_padding[:, :, tf.newaxis, tf.newaxis])
        outputs['conv_lstm_%d' % i] = py_utils.NestedMap(
            encoded=tf.transpose(conv_lstm_out, [1, 0, 2, 3]),  # to [t, b, d, c]
            padding=tf.transpose(conv_lstm_out_padding))
      plots.append(
          summary_utils.PrepareSequenceForPlot(
              conv_lstm_out, conv_lstm_out_padding, 'conv_lstm_%d_out' % i))

    # Need to do a reshape before starting the rnn layers: the last two
    # dimensions of the rank-4 tensor are merged into one.
    conv_lstm_out = py_utils.HasRank(conv_lstm_out, 4)
    conv_lstm_out_shape = tf.shape(conv_lstm_out)
    new_shape = tf.concat([conv_lstm_out_shape[:2], [-1]], 0)
    conv_lstm_out = tf.reshape(conv_lstm_out, new_shape)
    if self._first_lstm_input_dim_pad:
      # Pad only the last (feature) dimension, at its end.
      conv_lstm_out = tf.pad(
          conv_lstm_out,
          [[0, 0], [0, 0], [0, self._first_lstm_input_dim_pad]])
    conv_lstm_out = py_utils.HasShape(
        conv_lstm_out, [-1, -1, self._first_lstm_input_dim])

    # Transpose to move the time dimension to be the first.
    rnn_in = tf.transpose(conv_lstm_out, [1, 0, 2])
    rnn_padding = tf.expand_dims(tf.transpose(conv_lstm_out_padding), 2)
    # rnn_in is of shape [time, batch, depth]
    # rnn_padding is of shape [time, batch, 1]

    # Now the rnn layers.
    num_skips = 0  # Index of the next highway_skip layer to use.
    for i in range(p.num_lstm_layers):
      rnn_out = self.rnn[i].FProp(theta.rnn[i], rnn_in, rnn_padding)
      # residual_index is 0 at layer (p.residual_start - 1) and negative
      # before it. Skip connections are disabled when p.residual_start <= 0.
      residual_index = i - p.residual_start + 1
      if p.residual_start > 0 and residual_index >= 0:
        # First layer of a residual block: remember the block's input.
        if residual_index % p.residual_stride == 0:
          residual_in = rnn_in
        # Last layer of a residual block: combine the remembered input with
        # this layer's output.
        if residual_index % p.residual_stride == p.residual_stride - 1:
          # Highway skip connection.
          if p.highway_skip:
            rnn_out = self.highway_skip[num_skips].FProp(
                theta.highway_skip[num_skips], residual_in, rnn_out)
            num_skips += 1
          else:
            # Residual skip connection.
            rnn_out += py_utils.HasShape(
                residual_in, tf.shape(rnn_out))
      if p.project_lstm_output and (i < p.num_lstm_layers - 1):
        # Projection layers (not applied after the last LSTM layer).
        rnn_out = self.proj[i].FProp(theta.proj[i], rnn_out, rnn_padding)
      if i == p.num_lstm_layers - 1:
        # Zero out padded frames of the final layer's output.
        rnn_out *= (1.0 - rnn_padding)
      if p.extra_per_layer_outputs:
        # Zero out padded frames before exposing the per-layer output.
        rnn_out *= (1.0 - rnn_padding)
        outputs['rnn_%d' % i] = py_utils.NestedMap(
            encoded=rnn_out, padding=tf.squeeze(rnn_padding, [2]))
      # Stacking layer connection.
      if p.layer_index_before_stacking == i:
        # Stacking layer expects input tensor shape as [batch, time, feature].
        # So transpose the tensors before and after the layer.
        rnn_out, rnn_padding = self.stacking.FProp(
            tf.transpose(rnn_out, [1, 0, 2]),
            tf.transpose(rnn_padding, [1, 0, 2]))
        rnn_out = tf.transpose(rnn_out, [1, 0, 2])
        rnn_padding = tf.transpose(rnn_padding, [1, 0, 2])

      plots.append(
          summary_utils.PrepareSequenceForPlot(
              tf.transpose(rnn_out, [1, 0, 2]),
              tf.transpose(rnn_padding, [1, 0, 2]), 'rnn_%d_out' % i))
      rnn_in = rnn_out
    # With zero LSTM layers this is simply the reshaped conv/conv-lstm output.
    final_out = rnn_in

    summary_utils.PlotSequenceFeatures(list(reversed(plots)),
                                       'encoder_example', xlabel='Time')

    outputs['encoded'] = final_out
    outputs['padding'] = tf.squeeze(rnn_padding, [2])
    # No recurrent state is carried by this encoder; return an empty map.
    outputs['state'] = py_utils.NestedMap()
    return outputs
def FProp(self, theta, batch):
  """Encodes the input spelling and, when provided, its neighbors.

  The spelling is embedded and run through the GRU. If the batch carries
  both `neighbor_spellings` and `neighbor_pronunciations`, each is embedded,
  flattened over its leading two dimensions and run through its own GRU.
  Otherwise, or if an AttributeError is raised while doing so, both neighbor
  outputs fall back to `tf.constant(0)`.

  Args:
    theta: A nested map object containing weights' values of this layer and
      its children layers. It is not referenced directly by this method.
    batch: A NestedMap with fields:
      spelling - The input spelling tensor.
      Optional fields:
      neighbor_spellings - [*, max_neighbors, max_spelling_len] int32 tensor
        of neighbor spellings;
      neighbor_pronunciations - [*, max_neighbors, max_pronunciation_len]
        int32 tensor of neighbor pronunciations.

  Returns:
    A NestedMap with:
      encoded: the first output of the GRU for the spelling
        (presumably [*, max_spelling, enc_units] -- TODO confirm).
      state: the second output of the GRU (presumably [*, enc_units] --
        TODO confirm).
      neighbor_spellings_encoded: the first output of the neighbor-spellings
        GRU, or tf.constant(0) when neighbors are unavailable.
      neighbor_pronunciations_encoded: the first output of the
        neighbor-pronunciations GRU, or tf.constant(0) when neighbors are
        unavailable.
      batch: the original data batch.
  """
  p = self.params

  def _FlattenNeighbors(neighbor_embeddings):
    """Merges the leading dims, keeping the static sizes of dims 2 and 3."""
    static_shape = neighbor_embeddings.shape
    return tf.reshape(neighbor_embeddings,
                      [-1, static_shape[2], static_shape[3]])

  def _NoNeighbors():
    """Placeholder pair used when neighbor data cannot be encoded."""
    return tf.constant(0), tf.constant(0)

  with tf.name_scope(p.name):
    # --> [batch_size, max_spelling_len, embedding_dim]
    spelling_embeddings = self._embedding(batch.spelling)
    # encoded [batch_size, max_spelling_len, embedding_dim]
    # state [batch_size, embedding_dim]
    encoded, state = self._gru(spelling_embeddings)

    plots = []
    self.__AppendPlotData(plots, encoded, "encoded")
    summary_utils.PlotSequenceFeatures(
        plots[::-1], "encoder", xlabel="Input Position")

    try:
      neighbors_missing = (
          batch.Get("neighbor_spellings") is None or
          batch.Get("neighbor_pronunciations") is None)
      if neighbors_missing:
        spellings_out, pronunciations_out = _NoNeighbors()
      else:
        # [batch_size, max_neighbors, len] -->
        # [batch_size, max_neighbors, len, embedding_dim]
        spellings_emb = self._neighbor_spellings_embeddings(
            batch.neighbor_spellings)
        pronunciations_emb = self._neighbor_pronunciations_embeddings(
            batch.neighbor_pronunciations)
        # Each neighbor is encoded as its own sequence, so the leading two
        # dimensions are merged before the GRUs; the second GRU output is
        # discarded.
        spellings_out, _ = self._neighbor_spellings_gru(
            _FlattenNeighbors(spellings_emb))
        pronunciations_out, _ = self._neighbor_pronunciations_gru(
            _FlattenNeighbors(pronunciations_emb))
    except AttributeError:
      # Best-effort: any missing attribute along the neighbor path disables
      # both neighbor outputs.
      spellings_out, pronunciations_out = _NoNeighbors()

  return py_utils.NestedMap(
      encoded=encoded,
      state=state,
      neighbor_spellings_encoded=spellings_out,
      neighbor_pronunciations_encoded=pronunciations_out,
      batch=batch)