def test_convert_padding_direction(self):
    t1 = (
        torch.tensor(
            [
                [4.5, 2.3, 1.2, 0.0],
                [6.7, 9.8, 0.0, 0.0],
                [7.7, 5.4, 6.2, 8.0],
                [1.5, 0.0, 0.0, 0.0],
            ]
        )
        .unsqueeze(-1)
        .expand(-1, -1, 10)
    )
    t2 = (
        torch.tensor(
            [
                [0.0, 4.5, 2.3, 1.2],
                [0.0, 0.0, 6.7, 9.8],
                [7.7, 5.4, 6.2, 8.0],
                [0.0, 0.0, 0.0, 1.5],
            ]
        )
        .unsqueeze(-1)
        .expand(-1, -1, 10)
    )
    seq_len = torch.tensor([3, 2, 4, 1]).int()
    t1_to_t2 = utils.convert_padding_direction(
        t1,
        seq_len,
        right_to_left=True,
    )
    self.assertTensorEqual(t1_to_t2, t2)
    t2_to_t1 = utils.convert_padding_direction(
        t2,
        seq_len,
        left_to_right=True,
    )
    self.assertTensorEqual(t2_to_t1, t1)
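
# Illustrative sketch only (an assumption, not the library implementation) of the
# semantics exercised by the test above: convert_padding_direction() moves each
# sequence's padding from one side of the time axis to the other, which a per-row
# circular shift by that row's pad amount achieves for a B x T x C tensor.
import torch


def convert_padding_direction_sketch(
    x, lengths, left_to_right=False, right_to_left=False
):
    assert left_to_right ^ right_to_left
    bsz, seqlen = x.size(0), x.size(1)
    pad_amounts = (seqlen - lengths.to(x.device)).long()  # padded frames per row
    index = torch.arange(seqlen, device=x.device).unsqueeze(0).expand(bsz, -1)
    if right_to_left:
        # input is right-padded: shift data right so padding ends up at the front
        index = (index - pad_amounts.unsqueeze(1)) % seqlen
    else:
        # input is left-padded: shift data left so padding ends up at the end
        index = (index + pad_amounts.unsqueeze(1)) % seqlen
    return x.gather(1, index.unsqueeze(-1).expand_as(x))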
def forward(self, src_tokens, src_lengths: Tensor, **unused):
    if self.left_pad:
        # nn.utils.rnn.pack_padded_sequence requires right-padding;
        # convert left-padding to right-padding
        src_tokens = speech_utils.convert_padding_direction(
            src_tokens,
            src_lengths,
            left_to_right=True,
        )
    if self.conv_layers_before is not None:
        x, src_lengths, padding_mask = self.conv_layers_before(src_tokens, src_lengths)
    else:
        x, padding_mask = (
            src_tokens,
            ~speech_utils.sequence_mask(src_lengths, src_tokens.size(1)),
        )

    bsz, seqlen = x.size(0), x.size(1)

    x = F.dropout(x, p=self.dropout_in, training=self.training)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    state_size = 2 if self.bidirectional else 1, bsz, self.hidden_size
    h0, c0 = x.new_zeros(*state_size), x.new_zeros(*state_size)
    for i in range(len(self.lstm)):
        if self.residual and i > 0:  # residual connection starts from the 2nd layer
            prev_x = x
        # pack embedded source tokens into a PackedSequence
        packed_x = nn.utils.rnn.pack_padded_sequence(x, src_lengths.data)
        # apply LSTM
        packed_outs, (_, _) = self.lstm[i](packed_x, (h0, c0))
        # unpack outputs and apply dropout
        x, _ = nn.utils.rnn.pad_packed_sequence(
            packed_outs, padding_value=self.padding_value * 1.0
        )
        if i < len(self.lstm) - 1:  # not applying dropout for the last layer
            x = F.dropout(x, p=self.dropout_out, training=self.training)
        x = x + prev_x if self.residual and i > 0 else x
    assert list(x.size()) == [seqlen, bsz, self.output_units]

    encoder_padding_mask = padding_mask.t()

    return EncoderOut(
        encoder_out=x,  # T x B x C
        encoder_padding_mask=encoder_padding_mask
        if encoder_padding_mask.any()
        else None,  # T x B
        encoder_embedding=None,
        encoder_states=None,
        src_tokens=None,
        src_lengths=src_lengths,  # B
    )
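
# Illustrative sketch (an assumption, not the library code) of the sequence_mask()
# helper used above: it returns True at valid time steps, so its negation with ~
# yields the B x T padding mask that later becomes encoder_padding_mask.
import torch


def sequence_mask_sketch(lengths, max_len=None):
    if max_len is None:
        max_len = int(lengths.max())
    # (batch, max_len): position t of row b is True iff t < lengths[b]
    return torch.arange(max_len, device=lengths.device).unsqueeze(0) < lengths.unsqueeze(1)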
def forward(
    self,
    src_tokens: Tensor,
    src_lengths: Tensor,
    enforce_sorted: bool = True,
    **unused,
):
    """
    Args:
        src_tokens (LongTensor): tokens in the source language of
            shape `(batch, src_len)`
        src_lengths (LongTensor): lengths of each source sentence of
            shape `(batch)`
        enforce_sorted (bool, optional): if True, `src_tokens` is
            expected to contain sequences sorted by length in a
            decreasing order. If False, this condition is not required.
            Default: True.
    """
    if self.left_pad:
        # nn.utils.rnn.pack_padded_sequence requires right-padding;
        # convert left-padding to right-padding
        src_tokens = speech_utils.convert_padding_direction(
            src_tokens,
            src_lengths,
            left_to_right=True,
        )
    if self.pre_encoder is not None:
        x, src_lengths, padding_mask = self.pre_encoder(src_tokens, src_lengths)
    else:
        x, padding_mask = (
            src_tokens,
            ~speech_utils.sequence_mask(src_lengths, src_tokens.size(1)),
        )

    bsz, seqlen = x.size(0), x.size(1)

    x = self.dropout_in_module(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    if self.multilayer_rnn_as_single_module:
        state_size = (
            (2 if self.bidirectional else 1) * self.num_layers,
            bsz,
            self.hidden_size,
        )
        h0, c0 = x.new_zeros(*state_size), x.new_zeros(*state_size)
        # pack embedded source tokens into a PackedSequence
        packed_x = nn.utils.rnn.pack_padded_sequence(
            x,
            (
                src_lengths.cpu()
                if not self.src_bucketed
                else src_lengths.new_full(
                    src_lengths.size(), x.size(0), device="cpu"
                )
            ),
            enforce_sorted=enforce_sorted,
        )
        # apply LSTM
        packed_outs, (_, _) = self.lstm(packed_x, (h0, c0))
        # unpack outputs
        x, _ = nn.utils.rnn.pad_packed_sequence(
            packed_outs, padding_value=self.padding_value * 1.0
        )
    else:  # for backward compatibility
        state_size = 2 if self.bidirectional else 1, bsz, self.hidden_size
        h0, c0 = x.new_zeros(*state_size), x.new_zeros(*state_size)
        for i in range(len(self.lstm)):
            if (
                self.residual and i > 0
            ):  # residual connection starts from the 2nd layer
                prev_x = x
            # pack embedded source tokens into a PackedSequence
            packed_x = nn.utils.rnn.pack_padded_sequence(
                x,
                (
                    src_lengths.cpu()
                    if not self.src_bucketed
                    else src_lengths.new_full(
                        src_lengths.size(), x.size(0), device="cpu"
                    )
                ),
                enforce_sorted=enforce_sorted,
            )
            # apply LSTM
            packed_outs, (_, _) = self.lstm[i](packed_x, (h0, c0))
            # unpack outputs and apply dropout
            x, _ = nn.utils.rnn.pad_packed_sequence(
                packed_outs, padding_value=self.padding_value * 1.0
            )
            if i < len(self.lstm) - 1:  # not applying dropout for the last layer
                x = self.dropout_out_module(x)
            x = x + prev_x if self.residual and i > 0 else x
    assert list(x.size()) == [seqlen, bsz, self.output_units]

    encoder_padding_mask = padding_mask.t()

    # The PyTorch Mobile lite interpreter does not support returning NamedTuple in
    # `forward`, so we use a dictionary instead.
    # TorchScript does not support mixed values, so the values are all lists.
    # The empty list is equivalent to None.
    return {
        "encoder_out": [x],  # T x B x C
        "encoder_padding_mask": [encoder_padding_mask]
        if encoder_padding_mask.any()
        else [],  # T x B
        "encoder_embedding": [],
        "encoder_states": [],
        "src_tokens": [],
        "src_lengths": [src_lengths],  # B
    }
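
# Standalone demo (not encoder code) of the pack/pad round trip used above:
# enforce_sorted=False allows a batch that is not sorted by length, the lengths
# tensor must live on CPU, and pad_packed_sequence restores a T x B x C tensor
# filled with padding_value beyond each sequence's length.
import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=8, hidden_size=16)
x = torch.randn(10, 3, 8)  # T x B x C, right-padded
lengths = torch.tensor([7, 10, 4])  # not sorted in decreasing order
packed = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), enforce_sorted=False)
packed_out, _ = lstm(packed)
out, out_lengths = nn.utils.rnn.pad_packed_sequence(packed_out, padding_value=0.0)
print(out.shape)  # torch.Size([10, 3, 16])
print(out_lengths)  # tensor([ 7, 10,  4])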
def forward(
    self,
    src_tokens: Tensor,
    src_lengths: Tensor,
    enforce_sorted: bool = True,
    **unused,
):
    """
    Args:
        src_tokens (LongTensor): tokens in the source language of
            shape `(batch, src_len)`
        src_lengths (LongTensor): lengths of each source sentence of
            shape `(batch)`
        enforce_sorted (bool, optional): if True, `src_tokens` is
            expected to contain sequences sorted by length in a
            decreasing order. If False, this condition is not required.
            Default: True.
    """
    if self.left_pad:
        # nn.utils.rnn.pack_padded_sequence requires right-padding;
        # convert left-padding to right-padding
        src_tokens = speech_utils.convert_padding_direction(
            src_tokens,
            src_lengths,
            left_to_right=True,
        )
    if self.conv_layers_before is not None:
        x, src_lengths, padding_mask = self.conv_layers_before(src_tokens, src_lengths)
    else:
        x, padding_mask = (
            src_tokens,
            ~speech_utils.sequence_mask(src_lengths, src_tokens.size(1)),
        )

    bsz, seqlen = x.size(0), x.size(1)

    x = self.dropout_in_module(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    state_size = 2 if self.bidirectional else 1, bsz, self.hidden_size
    h0, c0 = x.new_zeros(*state_size), x.new_zeros(*state_size)
    for i in range(len(self.lstm)):
        if self.residual and i > 0:  # residual connection starts from the 2nd layer
            prev_x = x
        # pack embedded source tokens into a PackedSequence
        packed_x = nn.utils.rnn.pack_padded_sequence(
            x,
            (
                src_lengths.cpu()
                if not self.src_bucketed
                else src_lengths.new_full(src_lengths.size(), x.size(0), device="cpu")
            ),
            enforce_sorted=enforce_sorted,
        )
        # apply LSTM
        packed_outs, (_, _) = self.lstm[i](packed_x, (h0, c0))
        # unpack outputs and apply dropout
        x, _ = nn.utils.rnn.pad_packed_sequence(
            packed_outs, padding_value=self.padding_value * 1.0
        )
        if i < len(self.lstm) - 1:  # not applying dropout for the last layer
            x = self.dropout_out_module(x)
        x = x + prev_x if self.residual and i > 0 else x
    assert list(x.size()) == [seqlen, bsz, self.output_units]

    encoder_padding_mask = padding_mask.t()

    return EncoderOut(
        encoder_out=x,  # T x B x C
        encoder_padding_mask=encoder_padding_mask
        if encoder_padding_mask.any()
        else None,  # T x B
        encoder_embedding=None,
        encoder_states=None,
        src_tokens=None,
        src_lengths=src_lengths,  # B
    )
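
# Quick standalone check (illustrative only) of the per-layer initial-state shape
# built above: (num_directions, batch, hidden_size), with the LSTM output carrying
# 2 * hidden_size features when the layer is bidirectional.
import torch
import torch.nn as nn

bsz, hidden_size = 3, 16
lstm = nn.LSTM(input_size=8, hidden_size=hidden_size, bidirectional=True)
state_size = 2, bsz, hidden_size  # 2 directions for a bidirectional layer
h0, c0 = torch.zeros(*state_size), torch.zeros(*state_size)
out, (hn, cn) = lstm(torch.randn(10, bsz, 8), (h0, c0))
print(out.shape)  # torch.Size([10, 3, 32])
print(hn.shape)  # torch.Size([2, 3, 16])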