def reorder_incremental_state(self, incremental_state, new_order): super().reorder_incremental_state(incremental_state, new_order) encoder_out = utils.get_incremental_state(self, incremental_state, "encoder_out") if encoder_out is not None: encoder_out = tuple( eo.index_select(0, new_order) for eo in encoder_out) utils.set_incremental_state(self, incremental_state, "encoder_out", encoder_out)
def reorder_incremental_state(self, incremental_state, new_order): super().reorder_incremental_state(incremental_state, new_order) cached_state = utils.get_incremental_state(self, incremental_state, "cached_state") if cached_state is None: return def reorder_state(state): if isinstance(state, list): return [reorder_state(state_i) for state_i in state] return state.index_select(0, new_order) new_state = tuple(map(reorder_state, cached_state)) utils.set_incremental_state(self, incremental_state, "cached_state", new_state)
def _set_input_buffer( self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], new_buffer): return utils.set_incremental_state(self, incremental_state, "input_buffer", new_buffer)
def _split_encoder_out(self, encoder_out, incremental_state): """Split and transpose encoder outputs. This is cached when doing incremental inference. """ cached_result = utils.get_incremental_state(self, incremental_state, "encoder_out") if cached_result is not None: return cached_result # transpose only once to speed up attention layers encoder_a, encoder_b = encoder_out encoder_a = encoder_a.transpose(1, 2).contiguous() result = (encoder_a, encoder_b) if incremental_state is not None: utils.set_incremental_state(self, incremental_state, "encoder_out", result) return result
def _set_input_buffer(self, incremental_state, new_buffer): return utils.set_incremental_state(self, incremental_state, "input_buffer", new_buffer)
def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs): encoder_padding_mask = encoder_out["encoder_padding_mask"] encoder_outs = encoder_out["encoder_out"] if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] bsz, seqlen = prev_output_tokens.size() srclen = encoder_outs.size(0) # embed tokens embeddings = self.embed_tokens(prev_output_tokens) x = embeddings if self.dropout is not None: x = self.dropout(x) # B x T x C -> T x B x C x = x.transpose(0, 1) # initialize previous states (or get from cache during incremental # generation) cached_state = utils.get_incremental_state(self, incremental_state, "cached_state") if cached_state is not None: prev_hiddens, prev_cells = cached_state else: prev_hiddens = [encoder_out["encoder_out"].mean(dim=0) ] * self.num_layers prev_cells = [x.new_zeros(bsz, self.hidden_size)] * self.num_layers attn_scores = x.new_zeros(bsz, srclen) attention_outs = [] outs = [] for j in range(seqlen): input = x[j, :, :] attention_out = None for i, layer in enumerate(self.layers): # the previous state is one layer below except for the bottom # layer where the previous state is the state emitted by the # top layer hidden, cell = layer( input, ( prev_hiddens[(i - 1) % self.num_layers], prev_cells[(i - 1) % self.num_layers], ), ) if self.dropout is not None: hidden = self.dropout(hidden) prev_hiddens[i] = hidden prev_cells[i] = cell if attention_out is None: attention_out, attn_scores = self.attention( hidden, encoder_outs, encoder_padding_mask) if self.dropout is not None: attention_out = self.dropout(attention_out) attention_outs.append(attention_out) input = attention_out # collect the output of the top layer outs.append(hidden) # cache previous states (no-op except during incremental generation) utils.set_incremental_state(self, incremental_state, "cached_state", (prev_hiddens, prev_cells)) # collect outputs across time steps x = torch.cat(outs, dim=0).view(seqlen, bsz, self.hidden_size) attention_outs_concat = torch.cat(attention_outs, dim=0).view(seqlen, bsz, self.context_dim) # T x B x C -> B x T x C x = x.transpose(0, 1) attention_outs_concat = attention_outs_concat.transpose(0, 1) # concat LSTM output, attention output and embedding # before output projection x = torch.cat((x, attention_outs_concat, embeddings), dim=2) x = self.deep_output_layer(x) x = torch.tanh(x) if self.dropout is not None: x = self.dropout(x) # project back to size of vocabulary x = self.output_projection(x) # to return the full attn_scores tensor, we need to fix the decoder # to account for subsampling input frames # return x, attn_scores return x, None