def forward(self, inputs, input_length):
    """
    Args:
        inputs: padded batch of input frames, [batch, max_seq_len, feat_dim]
        input_length: valid length of each sequence in the batch, [batch]
    """
    enc_mask = get_enc_padding_mask(inputs, input_length)
    enc_output, enc_mask = self.embed(inputs, enc_mask)
    # enc_output: [batch, max_seq_len, emb_dim]; later used as K and V of the
    #   decoder source-attention, so its last dim matches the decoder embedding
    # enc_mask: [batch, 1, max_seq_len]; the [1, max_seq_len] part masks the
    #   softmax scores inside attention
    # zero out the padded positions of the embedding output using enc_mask
    enc_output.masked_fill_(~enc_mask.transpose(1, 2), 0.0)
    for _, block in enumerate(self.blocks):
        enc_output, enc_mask = block(enc_output, enc_mask)
        # zero out the padded positions again after each encoder block
        enc_output.masked_fill_(~enc_mask.transpose(1, 2), 0.0)
    if self.normalize_before:
        enc_output = self.after_norm(enc_output)
    return enc_output, enc_mask
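# The forward pass above relies on get_enc_padding_mask to turn the per-utterance
# lengths into the boolean [batch, 1, max_seq_len] mask described in the comments.
# Its implementation is not shown in this section; the following is a minimal
# sketch, assuming True marks valid frames, False marks padding, and the extra
# size-1 axis exists so the mask broadcasts over attention scores.
import torch

def get_enc_padding_mask(inputs, input_length):
    """Hypothetical sketch: build a [batch, 1, max_seq_len] boolean padding mask.

    inputs: [batch, max_seq_len, feat_dim]; input_length: LongTensor [batch].
    """
    max_seq_len = inputs.size(1)
    # compare every frame index against the utterance's valid length
    positions = torch.arange(max_seq_len, device=inputs.device)
    mask = positions.unsqueeze(0) < input_length.unsqueeze(1)  # [batch, max_seq_len]
    return mask.unsqueeze(1)                                   # [batch, 1, max_seq_len]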
def forward(self, inputs, input_length, streaming=False):
    enc_mask = get_enc_padding_mask(inputs, input_length)
    enc_output, enc_mask = self.embed(inputs, enc_mask)
    # zero out the padded positions of the embedding output
    enc_output.masked_fill_(~enc_mask.transpose(1, 2), 0.0)
    # keep the [batch, 1, max_seq_len] padding mask for zeroing; in streaming
    # mode enc_mask is replaced by a limited-context attention mask
    pad_mask = enc_mask
    if streaming:
        length = torch.sum(enc_mask.squeeze(1), dim=-1)
        enc_mask = get_streaming_mask(enc_output, length,
                                      left_context=20, right_context=0)
    for _, block in enumerate(self.blocks):
        enc_output, enc_mask = block(enc_output, enc_mask)
        # zero out the padded positions again after each encoder block
        enc_output.masked_fill_(~pad_mask.transpose(1, 2), 0.0)
    if self.normalize_before:
        enc_output = self.after_norm(enc_output)
    return enc_output, enc_mask
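# The streaming branch above swaps the padding mask for a limited-context
# attention mask built by get_streaming_mask, whose implementation is not shown
# here. The following is a minimal sketch under the assumption that it returns a
# [batch, max_seq_len, max_seq_len] boolean mask letting each frame attend to at
# most left_context past frames, right_context future frames, and only to
# non-padded positions.
import torch

def get_streaming_mask(inputs, lengths, left_context=20, right_context=0):
    """Hypothetical sketch: per-query attention mask for streaming encoding.

    True means "may attend"; shape [batch, max_seq_len, max_seq_len].
    inputs: [batch, max_seq_len, dim]; lengths: LongTensor [batch].
    """
    max_seq_len = inputs.size(1)
    pos = torch.arange(max_seq_len, device=inputs.device)
    # offset[i, j] = j - i: how far key j lies ahead of query i
    offset = pos.unsqueeze(0) - pos.unsqueeze(1)                 # [T, T]
    window = (offset >= -left_context) & (offset <= right_context)
    # forbid attending to padded keys
    valid = pos.unsqueeze(0) < lengths.unsqueeze(1)              # [batch, T]
    return window.unsqueeze(0) & valid.unsqueeze(1)              # [batch, T, T]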