def test_cat(self):
    """Check that maybe_cat joins tensors along dim=1 and tolerates None entries."""
    left = torch.IntTensor([[1, 2, 3], [4, 5, 6]])
    right = torch.IntTensor([[11, 12, 13], [14, 15, 16]])
    joined = torch.IntTensor([[1, 2, 3, 11, 12, 13], [4, 5, 6, 14, 15, 16]])

    # (inputs, expected result) pairs; None entries must not affect the result,
    # and a single surviving tensor must come back unchanged in value.
    cases = (
        ([left, right], joined),
        ([left, None, right, None, None], joined),
        ([None, None, left, None], left),
    )
    for tensors, expected in cases:
        actual = pytorch_utils.maybe_cat(tensors, dim=1)
        npt.assert_array_equal(actual, expected)
def forward_unprojected(self, input_tokens, encoder_out, incremental_state=None):
    """Run the history-convolution decoder up to (not including) the output projection.

    Args:
        input_tokens: 2-D tensor of target token ids (unpacked below as
            batch x length).
        encoder_out: 5-tuple from the encoder; only the first element
            (encoder outputs) and the fourth (source lengths) are used here.
        incremental_state: when not None, only the trailing ``history_len``
            padded tokens are processed and an "incremental_marker" flag is
            stored through ``utils.set_incremental_state``.

    Returns:
        Tuple ``(x, attn_scores)``: the decoder features (after the optional
        ``additional_fc`` bottleneck) and the reshaped attention scores.
    """
    # Left-pad the last (time) dimension with EOS so that every position has
    # history_len - 1 tokens of context available.
    context_width = self.history_len - 1
    padded_tokens = F.pad(
        input_tokens, (context_width, 0, 0, 0), "constant", self.dst_dict.eos()
    )

    # incremental_state is used only to tell whether we are decoding;
    # self.training is also False during the validation forward pass.
    decoding = incremental_state is not None
    if decoding:
        padded_tokens = padded_tokens[:, -self.history_len:]
        utils.set_incremental_state(
            self, incremental_state, "incremental_marker", True
        )

    bsz, padded_len = padded_tokens.size()
    # Number of real (unpadded) positions.
    seqlen = padded_len - context_width

    # Only encoder outputs and source lengths are needed from the encoder tuple.
    encoder_outs, _, _, src_lengths, _ = encoder_out

    embedded = self.embed_tokens(padded_tokens)
    embedded = F.dropout(embedded, p=self.dropout_in, training=self.training)

    # The convolution wants [batch_size, channels, seq_len]; transpose in and
    # back out so x ends up as [batch_size, seq_len, channels].
    x = self.history_conv(embedded.transpose(1, 2)).transpose(1, 2)
    x = F.dropout(x, p=self.dropout_out, training=self.training)

    for depth, layer in enumerate(self.layers):
        layer_out = F.dropout(layer(x), p=self.dropout_out, training=self.training)
        # Residual connections are applied from residual_level upward only.
        use_residual = (
            self.residual_level is not None and depth >= self.residual_level
        )
        x = layer_out + x if use_residual else layer_out

    # Attention: flatten all positions (time-major) into one batch of queries
    # and tile the encoder side to match.
    flat_queries = x.transpose(0, 1).contiguous().view(-1, self.hidden_dim)
    tiled_encoder_outs = encoder_outs.repeat(1, seqlen, 1)
    tiled_src_lengths = src_lengths.repeat(seqlen)
    attn_out, attn_scores = self.attention(
        flat_queries, tiled_encoder_outs, tiled_src_lengths
    )
    if attn_out is not None:
        attn_out = attn_out.view(seqlen, bsz, -1).transpose(1, 0)
    attn_scores = attn_scores.view(-1, seqlen, bsz).transpose(0, 2)
    x = maybe_cat((x, attn_out), dim=2)

    # Optional bottleneck layer.
    if not hasattr(self, "additional_fc"):
        return x, attn_scores
    x = self.additional_fc(x)
    x = F.dropout(x, p=self.dropout_out, training=self.training)
    return x, attn_scores
def forward_unprojected(self, input_tokens, encoder_out, incremental_state=None):
    """Run the recurrent decoder up to (not including) the output projection.

    Args:
        input_tokens: 2-D tensor of target token ids (unpacked below as
            batch x length). During incremental generation only the last
            column is used.
        encoder_out: 5-tuple from the encoder; the first element (encoder
            outputs) and the fourth (source lengths) are used here, and the
            whole tuple is passed to ``self._init_prev_states``.
        incremental_state: when not None, recurrent state and the input-feed
            vector are read from / written to it under the key
            "cached_state".

    Returns:
        Tuple ``(x, attn_scores)``: ``x`` is B x T x C (after the optional
        ``additional_fc`` bottleneck); ``attn_scores`` is
        bsz x tgtlen x srclen.
    """
    if incremental_state is not None:
        input_tokens = input_tokens[:, -1:]
    bsz, seqlen = input_tokens.size()

    # Only encoder outputs and source lengths are needed from the encoder tuple.
    encoder_outs, _, _, src_lengths, _ = encoder_out

    # embed tokens
    x = self.embed_tokens(input_tokens)
    x = F.dropout(x, p=self.dropout_in, training=self.training)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # initialize previous states (or get from cache during incremental generation)
    cached_state = utils.get_incremental_state(
        self, incremental_state, "cached_state"
    )
    input_feed = None
    if cached_state is not None:
        prev_hiddens, prev_cells, input_feed = cached_state
    else:
        # first time step, initialize previous states
        prev_hiddens, prev_cells = self._init_prev_states(encoder_out)
        if self.attention.context_dim:
            input_feed = self.initial_attn_context.expand(
                bsz, self.attention.context_dim
            )

    attn_scores_per_step = []
    outs = []
    for j in range(seqlen):
        # input feeding: concatenate context vector from previous time step
        step_input = maybe_cat((x[j], input_feed), dim=1)
        previous_layer_input = step_input
        for i, rnn in enumerate(self.layers):
            # recurrent cell
            hidden, cell = rnn(step_input, (prev_hiddens[i], prev_cells[i]))

            # hidden state becomes the input to the next layer
            layer_output = F.dropout(
                hidden, p=self.dropout_out, training=self.training
            )

            if self.residual_level is not None and i >= self.residual_level:
                # TODO add an assert related to sizes here
                step_input = layer_output + previous_layer_input
            else:
                step_input = layer_output
            previous_layer_input = step_input

            # save state for next time step
            prev_hiddens[i] = hidden
            prev_cells[i] = cell

        out, step_attn_scores = self.attention(hidden, encoder_outs, src_lengths)
        input_feed = out
        # Only collect the per-step scores here; they are concatenated once
        # after the loop instead of being rebuilt on every time step.
        attn_scores_per_step.append(step_attn_scores.unsqueeze(1))

        # save final output
        outs.append(maybe_cat((hidden, out), dim=1))

    # cache previous states (no-op except during incremental generation)
    utils.set_incremental_state(
        self,
        incremental_state,
        "cached_state",
        (prev_hiddens, prev_cells, input_feed),
    )

    # collect outputs across time steps
    x = torch.cat(outs, dim=0).view(
        seqlen, bsz, self.combined_output_and_context_dim
    )

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # srclen x tgtlen x bsz -> bsz x tgtlen x srclen
    attn_scores = torch.cat(attn_scores_per_step, dim=1).transpose(0, 2)

    # bottleneck layer
    if hasattr(self, "additional_fc"):
        x = self.additional_fc(x)
        x = F.dropout(x, p=self.dropout_out, training=self.training)
    return x, attn_scores
def test_nullable(self):
    """Check which None/``nullable`` combinations maybe_cat accepts or rejects."""
    tensor = torch.IntTensor([[1, 2, 3], [4, 5, 6]])

    # A None entry is accepted by default and whenever its slot is marked
    # nullable; these calls only need to complete without raising.
    accepted = (
        {},
        {"nullable": [True, True]},
        {"nullable": [False, True]},
    )
    for extra in accepted:
        pytorch_utils.maybe_cat([tensor, None], 1, **extra)

    # A None in a non-nullable slot, an all-None list, and an empty list must
    # each raise RuntimeError.
    rejected = (
        ([tensor, None], {"nullable": [False, False]}),
        ([None, None], {}),
        ([], {}),
    )
    for tensors, extra in rejected:
        with self.assertRaises(RuntimeError):
            pytorch_utils.maybe_cat(tensors, 1, **extra)