def forward(self, x: torch.autograd.Variable):
    """
    Turn a batch of flat vectors into 1x1 feature maps and pass them
    through ``self.deconv``.

    :param x: Variable, (b, self.in_size) batch of flat input vectors.
    :return: whatever ``self.deconv`` produces for the reshaped input; the
        original author notes the expected shape as
        (b, out_channels, 128, 128).
    """
    # x's shape must be [b, self.in_size]
    assert x.size(1) == self.in_size

    # (b, in_size) -> (b, in_size, 1, 1): every feature becomes a channel
    # holding a single spatial location.
    batch = x.size(0)
    feature_maps = x.view(batch, self.in_size, 1, 1)

    return self.deconv(feature_maps)
def forward(self,
            encoder_outputs: torch.autograd.Variable,
            hidden_state: torch.autograd.Variable,
            targets: torch.autograd.Variable = None,
            max_length: int = None) -> tuple:
    """
    Forward step of the attentional decoder unit.

    When ``targets`` is given, decoding is delegated to ``_forced_decode``
    (teacher forcing: the target word of the previous step is the input at
    time step t). When ``targets`` is None, decoding is delegated to
    ``_predictive_decode`` (the input at time step t is the word produced
    at time step t-1).

    :param encoder_outputs:
        Variable, with size of (batch_size, sequence_length, hidden_size).
        Batch size and input sequence length are read from its first two
        dimensions.
    :param hidden_state:
        Variable, (num_layers * directions, batch_size, hidden_size)
        initial hidden state.
    :param targets:
        Variable, (batch_size, sequence_length) a batch of word ids, or
        None to decode without teacher forcing.
    :param max_length:
        int, maximum length for the decoded sequence. Only forwarded to
        ``_predictive_decode``.
    :return:
        tuple of (``self._outputs``, predictions), where predictions is
        whatever the selected decoding method returned.
    """
    # Arguments common to both decoding strategies.
    shared_arguments = {
        'batch_size': encoder_outputs.size(0),
        'hidden_state': hidden_state,
        'encoder_outputs': encoder_outputs,
        'input_sequence_length': encoder_outputs.size(1),
    }

    if targets is None:
        predictions = self._predictive_decode(
            max_length=max_length, **shared_arguments)
    else:
        predictions = self._forced_decode(
            targets=targets, **shared_arguments)

    # self._outputs is read after decoding, once the helper has run.
    return self._outputs, predictions
def get_dropout_mask(dropout_probability: float, tensor_for_masking: torch.autograd.Variable):
    """
    Computes and returns an element-wise dropout mask for a given tensor, where
    each element in the mask is dropped out with probability dropout_probability.
    Note that the mask is NOT applied to the tensor - the tensor is passed to retain
    the correct CUDA tensor type for the mask.

    The mask is detached from the autograd graph of ``tensor_for_masking``, so
    multiplying by it never contributes a gradient through the mask itself.

    Parameters
    ----------
    dropout_probability : float, required.
        Probability of dropping a dimension of the input. A value of 1.0 (or
        more) yields an all-zero mask instead of dividing by zero.
    tensor_for_masking : torch.Variable, required.
        Only its size and tensor type/device are used; its values are ignored.

    Returns
    -------
    A float tensor with the same size as ``tensor_for_masking`` consisting of
    the binary mask scaled by 1 / (1 - dropout_probability).
    This scaling ensures expected values and variances of the output of applying
    this mask and the original tensor are the same.
    """
    # Detach before cloning: a plain clone() of a tensor that requires grad is
    # still a differentiable function of it, so overwriting its data below would
    # leave autograd believing the mask equals the input and add a spurious
    # gradient term wherever the mask is used.
    binary_mask = tensor_for_masking.detach().clone()
    binary_mask.data.copy_(torch.rand(tensor_for_masking.size()) > dropout_probability)

    keep_probability = 1.0 - dropout_probability
    if keep_probability <= 0.0:
        # torch.rand samples from [0, 1), so nothing survives the comparison and
        # the mask is already all zeros; dividing by keep_probability would turn
        # it into NaN (0 / 0) rather than "drop everything".
        return binary_mask.float()

    # Scale mask by 1/keep_prob to preserve output statistics.
    dropout_mask = binary_mask.float().div(keep_probability)
    return dropout_mask
def _forced_decode(self, targets: torch.autograd.Variable, batch_size: int, hidden_state: torch.autograd.Variable, encoder_outputs: torch.autograd.Variable, input_sequence_length: int) -> list: """ This method is primarily used during training, when target outputs are provided to the decoder. These target sequences start with an <SOS> token, which will serve as the first input to the _decode function. During the decoding iterations the decoder's predictions will only be used as final outputs to measure the loss, so the input for the (t)-th time step will be the (t-1)-th element of the provided targets. :param targets: Variable, (batch_size, sequence_length) a batch of word ids. :param batch_size: int, size of the currently processed batch. :param hidden_state: Variable, (num_layers * directions, batch_size, hidden_size) initial hidden state. :param encoder_outputs: Variable, with size of (batch_size, sequence_length, hidden_size). :param input_sequence_length: int, length of the input (for the encoder) sequence. """ output_sequence_length = targets.size(1) - 1 inputs = targets[:, :-1].contiguous() embedded_inputs = self.embedding(inputs) predictions = [] self._outputs['symbols'] = numpy.zeros( (batch_size, output_sequence_length), dtype='int') self._outputs['alignment_weights'] = numpy.zeros( (batch_size, output_sequence_length, input_sequence_length)) for step in range(output_sequence_length): step_input = embedded_inputs[:, step, :] step_input = step_input.unsqueeze(1) step_output, hidden_state, attn_weights = self._decode( inputs=step_input, hidden_state=hidden_state, encoder_outputs=encoder_outputs, batch_size=batch_size, sequence_length=input_sequence_length) predictions.append(step_output.squeeze(1)) self._outputs[ 'alignment_weights'][:, step, :] = attn_weights.data.squeeze( 1).cpu().numpy() self._outputs['symbols'][:, step] = step_output.topk( 1)[1].data.squeeze(-1).squeeze(-1).cpu().numpy() return predictions
def forward(self,
            encoder_outputs: torch.autograd.Variable,
            hidden_state: torch.autograd.Variable,
            targets: torch.autograd.Variable = None,
            max_length: int = None) -> tuple:
    """
    Forward step of the decoder unit.

    When ``targets`` is given, decoding is delegated to ``_forced_decode``
    (teacher forcing: the target word of the previous step is the input at
    time step t). When ``targets`` is None, decoding is delegated to
    ``_predictive_decode`` (the input at time step t is the word produced
    at time step t-1).

    :param encoder_outputs:
        Variable, with size of (batch_size, sequence_length, hidden_size).
        The batch size is read from its first dimension and the tensor is
        forwarded to the decoding method.
    :param hidden_state:
        Variable, (num_layers * directions, batch_size, hidden_size)
        initial hidden state.
    :param targets:
        Variable, (batch_size, sequence_length) a batch of word ids, or
        None to decode without teacher forcing.
    :param max_length:
        int, maximum length of the decoded sequence. It is only forwarded
        when ``targets`` is None, so it has no effect under teacher
        forcing.
    :return:
        tuple of (``self._outputs``, predictions), where predictions is
        whatever the selected decoding method returned.
    """
    # Arguments common to both decoding strategies; this decoder always
    # passes None as the input sequence length.
    shared_arguments = {
        'batch_size': encoder_outputs.size(0),
        'hidden_state': hidden_state,
        'encoder_outputs': encoder_outputs,
        'input_sequence_length': None,
    }

    if targets is None:
        predictions = self._predictive_decode(
            max_length=max_length, **shared_arguments)
    else:
        predictions = self._forced_decode(
            targets=targets, **shared_arguments)

    # self._outputs is read after decoding, once the helper has run.
    return self._outputs, predictions
def _forced_decode(self, targets: torch.autograd.Variable, batch_size: int, hidden_state: torch.autograd.Variable, encoder_outputs: torch.autograd.Variable, input_sequence_length: int = None) -> list: """ This method is primarily used during training, when target outputs are provided to the decoder. These target sequences start with an <SOS> token, which will serve as the first input to the _decode function. During the decoding iterations the decoder's predictions will only be used as final outputs to measure the loss, so the input for the (t)-th time step will be the (t-1)-th element of the provided targets. :param targets: Variable, (batch_size, sequence_length) a batch of word ids. :param batch_size: int, size of the currently processed batch. :param hidden_state: Variable, (num_layers * directions, batch_size, hidden_size) initial hidden state. :param encoder_outputs: Variable, with size of (batch_size, sequence_length, hidden_size). :param input_sequence_length: This parameter is required only by the attentional version of this method. """ output_sequence_length = targets.size(1) - 1 self._outputs['symbols'] = numpy.zeros( (batch_size, output_sequence_length), dtype=numpy.int32) predictions = [] inputs = targets[:, :-1].contiguous() embedded_inputs = self.embedding(inputs) outputs, hidden_state, _ = self._decode(inputs=embedded_inputs, hidden_state=hidden_state, encoder_outputs=None, batch_size=batch_size, sequence_length=None) for step in range(output_sequence_length): self._outputs['symbols'][:, step] = outputs[:, step, :].topk( 1)[1].squeeze(-1).data.cpu().numpy() predictions.append(outputs[:, step, :]) return predictions
def probabilities(self, states: torch.autograd.Variable, training: bool = True) -> np.ndarray:
    """
    Compute epsilon-greedy action probabilities from the model's Q-values.

    Every action receives ``epsilon / num_actions`` and the action with the
    highest Q-value additionally receives ``1 - epsilon``. ``epsilon`` is
    ``self._epsilon`` while training and 0 otherwise, which makes the
    distribution purely greedy.

    :param states: batch of states handed to ``self._model.q_values``; its
        first dimension is taken as the batch size.
    :param training: whether to explore with ``self._epsilon``.
    :return: numpy array with the action probabilities of the FIRST state
        in the batch only.
    """
    model = self._model
    on_gpu = model.is_cuda
    epsilon = self._epsilon if training else 0

    # noinspection PyArgumentList
    greedy_actions = torch.max(model.q_values(states), dim=1)[1]

    num_states = states.size(0)
    # Uniform exploration mass shared by every action.
    action_probs: torch.FloatTensor = \
        torch.ones((num_states, model.num_actions)) * epsilon / model.num_actions
    rows = torch.arange(0, num_states).type(torch.LongTensor)

    if on_gpu:
        action_probs = action_probs.cuda()
        rows = rows.cuda()

    # The remaining mass goes to the greedy action of each state.
    action_probs[rows, greedy_actions.data] += (1 - epsilon)

    host_probs = action_probs.cpu() if on_gpu else action_probs
    return host_probs.numpy()[0]