def forward(
    self,
    memory: torch.Tensor,
    memory_mask: torch.Tensor,
    ys_in_pad: torch.Tensor,
    ys_in_lens: torch.Tensor,
    r_ys_in_pad: torch.Tensor = torch.empty(0),
    reverse_weight: float = 0.0,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Forward decoder.

    Args:
        memory: encoded memory, float32 (batch, maxlen_in, feat)
        memory_mask: encoder memory mask, (batch, 1, maxlen_in)
        ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
        ys_in_lens: input lengths of this batch (batch)
        r_ys_in_pad: not used in the transformer decoder, kept to unify the
            API with the bidirectional decoder
        reverse_weight: not used in the transformer decoder, kept to unify
            the API with the bidirectional decoder
    Returns:
        (tuple): tuple containing:
            x: decoded token scores before softmax
                (batch, maxlen_out, vocab_size) if use_output_layer is True
            torch.tensor(0.0): placeholder to unify the API with the
                bidirectional decoder
            olens: (batch, )
    """
    tgt = ys_in_pad
    maxlen = tgt.size(1)
    # tgt_mask: (B, 1, L)
    tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1)
    tgt_mask = tgt_mask.to(tgt.device)
    # m: (1, L, L)
    m = subsequent_mask(tgt_mask.size(-1),
                        device=tgt_mask.device).unsqueeze(0)
    # tgt_mask: (B, L, L)
    tgt_mask = tgt_mask & m
    x, _ = self.embed(tgt)
    for layer in self.decoders:
        x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory,
                                                 memory_mask)
    if self.normalize_before:
        x = self.after_norm(x)
    if self.use_output_layer:
        x = self.output_layer(x)
    olens = tgt_mask.sum(1)
    return x, torch.tensor(0.0), olens
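# A minimal, self-contained sketch (not the toolkit's own code) of what the
# (B, L, L) target mask built above represents: a padding mask marking real
# tokens is AND-ed with a lower-triangular "subsequent" mask so that position
# t can only attend to positions <= t. The lengths below are made up; the
# real make_pad_mask / subsequent_mask helpers may differ in details.
import torch

lengths = torch.tensor([4, 2])          # ys_in_lens for a batch of two
L = int(lengths.max())                  # maxlen_out

# (B, 1, L): True where the token is real, False at padded positions
pad_mask = (torch.arange(L).unsqueeze(0) < lengths.unsqueeze(1)).unsqueeze(1)

# (1, L, L): True on and below the diagonal (causal mask)
causal = torch.tril(torch.ones(L, L)).bool().unsqueeze(0)

# (B, L, L): broadcasted AND, same role as `tgt_mask & m` in forward()
tgt_mask = pad_mask & causal
print(tgt_mask[1])   # columns 2 and 3 are masked out for the length-2 entry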
def forward(
    self,
    memory: torch.Tensor,
    memory_mask: torch.Tensor,
    ys_in_pad: torch.Tensor,
    ys_in_lens: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Forward decoder.

    Args:
        memory: encoded memory, float32 (batch, maxlen_in, feat)
        memory_mask: encoder memory mask, (batch, 1, maxlen_in)
        ys_in_pad: input token ids, int64 (batch, maxlen_out)
            if input_layer == "embed"; input tensor
            (batch, maxlen_out, #mels) in the other cases
        ys_in_lens: (batch)
    Returns:
        (tuple): tuple containing:
            x: decoded token scores before softmax
                (batch, maxlen_out, token) if use_output_layer is True
            olens: (batch, )
    """
    tgt = ys_in_pad
    # tgt_mask: (B, 1, L)
    tgt_mask = (~make_pad_mask(ys_in_lens).unsqueeze(1)).to(tgt.device)
    # m: (1, L, L)
    m = subsequent_mask(tgt_mask.size(-1),
                        device=tgt_mask.device).unsqueeze(0)
    # tgt_mask: (B, L, L)
    tgt_mask = tgt_mask & m
    x, _ = self.embed(tgt)
    for layer in self.decoders:
        x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory,
                                                 memory_mask)
    if self.normalize_before:
        x = self.after_norm(x)
    if self.use_output_layer:
        x = self.output_layer(x)
    olens = tgt_mask.sum(1)
    return x, olens
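# Hedged sketch of the teacher-forcing pair this forward() consumes: the
# decoder input starts with <sos>, while the training target (used by the
# caller's cross-entropy loss, not by this function) is the same sequence
# shifted to end with <eos>. The token ids and the <sos>/<eos>/pad values
# below are invented for illustration only.
import torch

sos, eos, pad = 1, 2, 0
ys_in_pad = torch.tensor([[sos, 7, 8, 9],        # utterance 1: tokens 7 8 9
                          [sos, 5, 6, pad]])     # utterance 2: tokens 5 6
ys_out_pad = torch.tensor([[7, 8, 9, eos],
                           [5, 6, eos, pad]])    # cross-entropy target
ys_in_lens = torch.tensor([4, 3])                # lengths including <sos>
# x, olens = decoder(memory, memory_mask, ys_in_pad, ys_in_lens)
# x: (2, 4, vocab_size) scores before softmax, compared against ys_out_pad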
def recognize(
    self,
    speech: torch.Tensor,
    speech_lengths: torch.Tensor,
    beam_size: int = 10,
    decoding_chunk_size: int = -1,
    num_decoding_left_chunks: int = -1,
    simulate_streaming: bool = False,
) -> torch.Tensor:
    """Apply beam search on attention decoder.

    Args:
        speech (torch.Tensor): (batch, max_len, feat_dim)
        speech_lengths (torch.Tensor): (batch, )
        beam_size (int): beam size for beam search
        decoding_chunk_size (int): decoding chunk for dynamic chunk
            trained model.
            <0: for decoding, use full chunk.
            >0: for decoding, use fixed chunk size as set.
            0: used for training, it's prohibited here
        num_decoding_left_chunks (int): number of left chunks the encoder
            may attend to; <0 means use all left chunks
        simulate_streaming (bool): whether to do the encoder forward in a
            streaming fashion
    Returns:
        torch.Tensor: decoding result, (batch, max_result_len)
    """
    assert speech.shape[0] == speech_lengths.shape[0]
    assert decoding_chunk_size != 0
    device = speech.device
    batch_size = speech.shape[0]
    # Let's assume B = batch_size and N = beam_size

    # 1. Encoder
    encoder_out, encoder_mask = self._forward_encoder(
        speech, speech_lengths, decoding_chunk_size,
        num_decoding_left_chunks,
        simulate_streaming)  # (B, maxlen, encoder_dim)
    maxlen = encoder_out.size(1)
    encoder_dim = encoder_out.size(2)
    running_size = batch_size * beam_size
    # Replicate encoder output and mask beam_size times for batched search
    encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
        running_size, maxlen, encoder_dim)  # (B*N, maxlen, encoder_dim)
    encoder_mask = encoder_mask.unsqueeze(1).repeat(
        1, beam_size, 1, 1).view(running_size, 1, maxlen)  # (B*N, 1, maxlen)

    # Every hypothesis starts with <sos>; only the first beam of each
    # utterance starts with score 0.0, the rest start at -inf
    hyps = torch.ones([running_size, 1], dtype=torch.long,
                      device=device).fill_(self.sos)  # (B*N, 1)
    scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1),
                          dtype=torch.float)
    scores = scores.to(device).repeat([batch_size]).unsqueeze(1)  # (B*N, 1)
    end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device)
    cache: Optional[List[torch.Tensor]] = None

    # 2. Decoder forward step by step
    for i in range(1, maxlen + 1):
        # Stop if all batch and all beam produce eos
        if end_flag.sum() == running_size:
            break
        # 2.1 Forward decoder step
        hyps_mask = subsequent_mask(i).unsqueeze(0).repeat(
            running_size, 1, 1).to(device)  # (B*N, i, i)
        # logp: (B*N, vocab)
        logp, cache = self.decoder.forward_one_step(
            encoder_out, encoder_mask, hyps, hyps_mask, cache)
        # 2.2 First beam prune: select topk best prob at current time
        top_k_logp, top_k_index = logp.topk(beam_size)  # (B*N, N)
        top_k_logp = mask_finished_scores(top_k_logp, end_flag)
        top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos)
        # 2.3 Second beam prune: select topk score with history
        scores = scores + top_k_logp  # (B*N, N), broadcast add
        scores = scores.view(batch_size, beam_size * beam_size)  # (B, N*N)
        scores, offset_k_index = scores.topk(k=beam_size)  # (B, N)
        scores = scores.view(-1, 1)  # (B*N, 1)
        # 2.4 Compute base index in top_k_index: regard top_k_index as
        # (B*N*N) and offset_k_index as (B*N), then locate offset_k_index
        # inside top_k_index
        base_k_index = torch.arange(batch_size, device=device).view(
            -1, 1).repeat([1, beam_size])  # (B, N)
        base_k_index = base_k_index * beam_size * beam_size
        best_k_index = base_k_index.view(-1) + offset_k_index.view(-1)  # (B*N)
        # 2.5 Update best hyps
        best_k_pred = torch.index_select(top_k_index.view(-1),
                                         dim=-1,
                                         index=best_k_index)  # (B*N)
        best_hyps_index = best_k_index // beam_size
        last_best_k_hyps = torch.index_select(
            hyps, dim=0, index=best_hyps_index)  # (B*N, i)
        hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)),
                         dim=1)  # (B*N, i+1)
        # 2.6 Update end flag
        end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1)
    # 3. Select best of best
    scores = scores.view(batch_size, beam_size)
    # TODO: length normalization
    best_index = torch.argmax(scores, dim=-1).long()
    best_hyps_index = best_index + torch.arange(
        batch_size, dtype=torch.long, device=device) * beam_size
    best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index)
    # Drop the leading <sos> token from each best hypothesis
    best_hyps = best_hyps[:, 1:]
    return best_hyps
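# Hedged usage sketch for recognize(): batched attention beam search over
# acoustic features. Only the recognize() call itself mirrors the method
# above; the feature shapes, eos stripping, and char_dict id-to-symbol
# lookup are assumptions about the surrounding toolkit.
from typing import Dict, List

import torch


def decode_batch(model, speech: torch.Tensor, speech_lengths: torch.Tensor,
                 char_dict: Dict[int, str], eos: int,
                 beam_size: int = 10) -> List[str]:
    """Run attention beam search and map the best token ids back to text."""
    model.eval()
    with torch.no_grad():
        hyps = model.recognize(speech,
                               speech_lengths,
                               beam_size=beam_size,
                               decoding_chunk_size=-1)  # full-chunk decoding
    results = []
    for hyp in hyps:                # hyp: (max_result_len,) token ids, no sos
        ids = [int(t) for t in hyp if int(t) != eos]    # drop eos padding
        results.append(''.join(char_dict[i] for i in ids))
    return results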