def forward(self, input_tensor: torch.Tensor, hidden: torch.Tensor,
            encoder_outputs: torch.Tensor, batch_size: int):

    embedded = self.embedding(input_tensor)
    output, hidden = self.rnn(embedded, hidden)

    verify_shape(tensor=output, expected=[1, batch_size, self.decoder_hidden_size])
    verify_shape(tensor=hidden,
                 expected=[self.rnn.num_layers, batch_size, self.decoder_hidden_size])

    # Drop the seq_len=1 dimension before the output projection.
    output = output.squeeze(dim=0)
    verify_shape(tensor=output, expected=[batch_size, self.decoder_hidden_size])

    output = log_softmax(self.out(output), dim=1)

    verify_shape(tensor=output, expected=[batch_size, self.output_size])
    verify_shape(tensor=hidden,
                 expected=[self.rnn.num_layers, batch_size, self.decoder_hidden_size])

    # output.shape:  [batch_size, decoder_output_size]
    # hidden.shape:  [num_layers, batch_size, decoder_hidden_size]
    return output, hidden
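# The verify_shape helper called throughout this file is not defined in this
# excerpt. A minimal sketch consistent with how it is called (keyword-only
# `tensor` and `expected` arguments, raising on any mismatch); the exact
# exception type and message are assumptions.
from typing import List

import torch


def verify_shape(*, tensor: torch.Tensor, expected: List[int]) -> None:
    """Raise a ValueError if tensor.shape does not match the expected dimensions."""
    if list(tensor.shape) != list(expected):
        raise ValueError(
            f"Tensor has shape {list(tensor.shape)} but expected {list(expected)}")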
def encoder_decoder(encoder: EncoderRNN, input_sequence: torch.LongTensor) -> torch.Tensor:

    sequence_length: int = input_sequence.shape[0]
    batch_size: int = input_sequence.shape[1]
    device: torch.device = input_sequence.device

    encoder_hidden = encoder.init_hidden(batch_size=batch_size, device=device)
    encoder_outputs = torch.zeros(sequence_length, batch_size, encoder.hidden_size,
                                  device=device)  # shape: [max_src_len, batch_size, hidden_size]

    verify_shape(tensor=input_sequence, expected=[sequence_length, batch_size])
    verify_shape(tensor=encoder_hidden,
                 expected=[encoder.num_hidden_layers, batch_size, encoder.hidden_size])
    verify_shape(tensor=encoder_outputs,
                 expected=[sequence_length, batch_size, encoder.hidden_size])

    # Feed the source sequence to the encoder one time step at a time,
    # collecting each step's output for later use by the attention mechanism.
    for src_index in range(sequence_length):

        input_token_tensor: torch.Tensor = input_sequence[src_index]

        verify_shape(tensor=input_token_tensor, expected=[batch_size])
        verify_shape(tensor=encoder_hidden,
                     expected=[encoder.num_hidden_layers, batch_size, encoder.hidden_size])

        encoder_output, encoder_hidden = encoder(input_token_tensor, encoder_hidden)

        verify_shape(tensor=encoder_hidden,
                     expected=[encoder.num_hidden_layers, batch_size, encoder.hidden_size])
        verify_shape(tensor=encoder_output, expected=[1, batch_size, encoder.hidden_size])
        verify_shape(tensor=encoder_output[0], expected=[batch_size, encoder.hidden_size])
        verify_shape(tensor=encoder_outputs[src_index],
                     expected=[batch_size, encoder.hidden_size])

        encoder_outputs[src_index] = encoder_output[0]

    verify_shape(tensor=encoder_outputs,
                 expected=[sequence_length, batch_size, encoder.hidden_size])

    return encoder_outputs
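# EncoderRNN itself is not shown in this excerpt. Below is a minimal sketch that
# is consistent with how the encoder is used above (init_hidden, hidden_size,
# num_hidden_layers, and a forward pass over one token per time step); the
# constructor argument names are assumptions, not the project's actual API.
import torch
import torch.nn as nn


class EncoderRNN(nn.Module):
    def __init__(self, *, vocab_size: int, embedding_size: int,
                 hidden_size: int, num_hidden_layers: int = 1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, num_hidden_layers)

    def forward(self, input_token_tensor: torch.Tensor, hidden: torch.Tensor):
        # input_token_tensor: [batch_size] -> embedded: [1, batch_size, embedding_size]
        embedded = self.embedding(input_token_tensor).unsqueeze(dim=0)
        output, hidden = self.gru(embedded, hidden)
        # output: [1, batch_size, hidden_size]
        # hidden: [num_hidden_layers, batch_size, hidden_size]
        return output, hidden

    def init_hidden(self, *, batch_size: int, device: torch.device) -> torch.Tensor:
        return torch.zeros(self.num_hidden_layers, batch_size,
                           self.hidden_size, device=device)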
def decode_sequence(self, encoder_outputs: torch.Tensor, start_of_sequence_symbol: int,
                    max_length: int, target_tensor: torch.Tensor = None):

    encoded_sequence_length: int = encoder_outputs.shape[0]
    batch_size: int = encoder_outputs.shape[1]
    encoder_hidden_size: int = encoder_outputs.shape[2]
    device = encoder_outputs.device

    # Start every sequence in the batch with the start-of-sequence symbol.
    decoder_input = torch.tensor(data=[[start_of_sequence_symbol] * batch_size],
                                 dtype=torch.long, device=device)
    decoder_hidden = self.init_hidden(batch_size=batch_size, device=device)

    verify_shape(tensor=decoder_input, expected=[1, batch_size])
    verify_shape(tensor=decoder_hidden,
                 expected=[self.gru.num_layers, batch_size, self.gru.hidden_size])

    results: List[torch.Tensor] = list()

    for index in range(max_length):

        verify_shape(tensor=decoder_input, expected=[1, batch_size])
        verify_shape(tensor=decoder_hidden,
                     expected=[self.gru.num_layers, batch_size, self.gru.hidden_size])

        decoder_output, decoder_hidden, decoder_attention = self(
            decoder_input, decoder_hidden, encoder_outputs, batch_size)

        verify_shape(tensor=decoder_output, expected=[batch_size, self.output_size])
        verify_shape(tensor=decoder_hidden,
                     expected=[self.gru.num_layers, batch_size, self.gru.hidden_size])
        verify_shape(tensor=decoder_attention,
                     expected=[batch_size, encoded_sequence_length])

        results.append(decoder_output)

        if target_tensor is None:
            # Greedy decoding: feed the most probable symbol back in as the next input.
            _, top_i = decoder_output.topk(1)
            decoder_input = top_i.detach().permute(1, 0)
        else:
            # Teacher forcing: feed the gold symbol in as the next input.
            decoder_input = target_tensor[index].unsqueeze(dim=0)

    return torch.stack(tensors=results, dim=0)
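# decode_sequence (and train, below) call the decoder's init_hidden method, which
# is not defined in this excerpt. A minimal sketch of such a method on the
# decoder class, assuming a zero-initialised hidden state sized from the
# decoder's own GRU:
def init_hidden(self, *, batch_size: int, device: torch.device) -> torch.Tensor:
    """Return an all-zero hidden state of shape [num_layers, batch_size, hidden_size]."""
    return torch.zeros(self.gru.num_layers, batch_size,
                       self.gru.hidden_size, device=device)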
def forward(self, input_tensor: torch.Tensor, hidden: torch.Tensor,
            encoder_outputs: torch.Tensor, batch_size: int):

    if encoder_outputs.shape[0] != self.max_src_length:
        raise ValueError(
            "Encoder outputs provided to this method must have the same length as self.max_src_length:"
            + f"\t{encoder_outputs.shape[0]} != {self.max_src_length}")

    verify_shape(tensor=input_tensor, expected=[1, batch_size])
    verify_shape(tensor=hidden,
                 expected=[self.gru.num_layers, batch_size, self.gru.hidden_size])
    verify_shape(tensor=encoder_outputs,
                 expected=[self.max_src_length, batch_size, self.encoder_hidden_size])

    # input_tensor.shape:     [1, batch_size]
    # hidden.shape:           [num_hidden_layers, batch_size, decoder_hidden_size]
    # encoder_outputs.shape:  [max_src_length, batch_size, encoder_hidden_size]

    embedded = self.embedding(input_tensor)
    verify_shape(tensor=embedded, expected=[1, batch_size, self.embedding_size])

    embedded = self.dropout(embedded)
    verify_shape(tensor=embedded, expected=[1, batch_size, self.embedding_size])
    verify_shape(tensor=embedded[0], expected=[batch_size, self.embedding_size])
    verify_shape(tensor=hidden[-1], expected=[batch_size, self.gru.hidden_size])

    # Concatenate the current embedded input with the hidden state, then use the
    # result to compute one attention weight per source position.
    #
    # embedded[0].shape:  [batch_size, decoder_embedding_size]
    # hidden[0].shape:    [batch_size, decoder_hidden_size]
    # attn_input.shape:   [batch_size, decoder_embedding_size + decoder_hidden_size]
    attn_input: torch.Tensor = torch.cat(tensors=(embedded[0], hidden[0]), dim=1)
    verify_shape(tensor=attn_input,
                 expected=[batch_size, self.embedding_size + self.gru.hidden_size])

    # self.attn(attn_input).shape:            [batch_size, max_src_length]
    # softmax(self.attn(attn_input), dim=1):  [batch_size, max_src_length]
    attn_weights = softmax(self.attn(attn_input), dim=1)

    verify_shape(tensor=attn_weights, expected=[batch_size, self.max_src_length])
    verify_shape(tensor=encoder_outputs,
                 expected=[self.max_src_length, batch_size, self.encoder_hidden_size])

    # Permute dimensions to prepare for batched matrix-matrix multiply.
    encoder_outputs = encoder_outputs.permute(1, 2, 0)
    attn_weights = attn_weights.unsqueeze(2)

    verify_shape(tensor=encoder_outputs,
                 expected=[batch_size, self.encoder_hidden_size, self.max_src_length])
    verify_shape(tensor=attn_weights, expected=[batch_size, self.max_src_length, 1])

    # Apply the attention weights to the encoder outputs:
    #   [batch_size, encoder_hidden_size, max_src_length] x [batch_size, max_src_length, 1]
    #     -> [batch_size, encoder_hidden_size, 1]
    #
    # Original (unbatched) version:
    # attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
    attn_applied = torch.bmm(encoder_outputs, attn_weights)  # batched

    verify_shape(tensor=attn_applied, expected=[batch_size, self.encoder_hidden_size, 1])
    verify_shape(tensor=embedded, expected=[1, batch_size, self.embedding_size])

    # Move the seq_len=1 dimension of attn_applied to the front so that it lines
    # up with embedded.
    attn_applied = attn_applied.permute(2, 0, 1)

    verify_shape(tensor=attn_applied, expected=[1, batch_size, self.encoder_hidden_size])
    verify_shape(tensor=embedded, expected=[1, batch_size, self.embedding_size])

    output = torch.cat(tensors=(embedded, attn_applied), dim=2)
    verify_shape(tensor=output,
                 expected=[1, batch_size, self.embedding_size + self.encoder_hidden_size])

    # attn_combine projects the concatenation back down to the size expected by
    # the GRU. (These checks assume encoder_hidden_size == decoder_hidden_size.)
    output = self.attn_combine(output)
    verify_shape(tensor=output, expected=[1, batch_size, self.encoder_hidden_size])

    output = relu(output)
    verify_shape(tensor=output, expected=[1, batch_size, self.encoder_hidden_size])

    output, hidden = self.gru(output, hidden)
    verify_shape(tensor=output, expected=[1, batch_size, self.decoder_hidden_size])
    verify_shape(tensor=hidden,
                 expected=[self.gru.num_layers, batch_size, self.decoder_hidden_size])

    # Drop the seq_len=1 dimension before the output projection.
    output = output.squeeze(dim=0)
    verify_shape(tensor=output, expected=[batch_size, self.decoder_hidden_size])

    output = log_softmax(self.out(output), dim=1)

    # Drop the trailing singleton dimension from the attention weights.
    verify_shape(tensor=attn_weights, expected=[batch_size, self.max_src_length, 1])
    attn_weights = attn_weights.squeeze(dim=2)

    verify_shape(tensor=output, expected=[batch_size, self.output_size])
    verify_shape(tensor=hidden,
                 expected=[self.gru.num_layers, batch_size, self.decoder_hidden_size])
    verify_shape(tensor=attn_weights, expected=[batch_size, self.max_src_length])

    # output.shape:        [batch_size, decoder_output_size]
    # hidden.shape:        [num_layers, batch_size, decoder_hidden_size]
    # attn_weights.shape:  [batch_size, max_src_length]
    return output, hidden, attn_weights
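# The layers used in forward above (embedding, dropout, attn, attn_combine, gru,
# out) are not defined in this excerpt. Below is a minimal constructor sketch
# that is consistent with the shape checks in forward; the argument names are
# assumptions, and the checks in forward implicitly assume
# encoder_hidden_size == decoder_hidden_size.
import torch.nn as nn


class AttnDecoderRNN(nn.Module):
    def __init__(self, *, embedding_size: int, decoder_hidden_size: int,
                 encoder_hidden_size: int, output_size: int,
                 max_src_length: int, num_layers: int = 1, dropout_p: float = 0.1):
        super().__init__()
        self.embedding_size = embedding_size
        self.decoder_hidden_size = decoder_hidden_size
        self.encoder_hidden_size = encoder_hidden_size
        self.output_size = output_size
        self.max_src_length = max_src_length

        self.embedding = nn.Embedding(output_size, embedding_size)
        self.dropout = nn.Dropout(dropout_p)
        # Maps [embedded ; hidden] to one attention weight per source position.
        self.attn = nn.Linear(embedding_size + decoder_hidden_size, max_src_length)
        # Maps [embedded ; attended encoder output] back to the GRU's input size.
        self.attn_combine = nn.Linear(embedding_size + encoder_hidden_size,
                                      decoder_hidden_size)
        self.gru = nn.GRU(decoder_hidden_size, decoder_hidden_size, num_layers)
        self.out = nn.Linear(decoder_hidden_size, output_size)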
def train_iters(*,
                corpus: Corpus,
                encoder: EncoderRNN,
                decoder: AttnDecoderRNN,
                device: torch.device,
                n_iters: int,
                batch_size: int,
                teacher_forcing_ratio: float,
                print_every: int = 1000,
                learning_rate: float = 0.01) -> None:

    data = torch.utils.data.DataLoader(dataset=corpus, batch_size=batch_size)

    start: float = time.time()
    plot_losses: List[float] = []
    print_loss_total: float = 0  # Reset every print_every
    plot_loss_total: float = 0   # Reset every plot_every

    encoder_optimizer: Optimizer = SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer: Optimizer = SGD(decoder.parameters(), lr=learning_rate)

    # Alternatively: nn.NLLLoss(reduction='mean', ignore_index=corpus.characters.pad_int)
    criterion: nn.NLLLoss = nn.NLLLoss(reduction='mean')

    for iteration in range(1, n_iters + 1):  # type: int

        for batch in data:
            # The DataLoader yields batch-major tensors; the model expects
            # sequence-major tensors, so permute to [seq_len, batch_size].
            input_tensor: torch.Tensor = batch["data"].permute(1, 0)
            target_tensor: torch.Tensor = batch["labels"].permute(1, 0)

            # The final batch of an epoch may be smaller than batch_size.
            actual_batch_size: int = min(batch_size, input_tensor.shape[1])

            verify_shape(tensor=input_tensor,
                         expected=[corpus.word_tensor_length, actual_batch_size])
            verify_shape(tensor=target_tensor,
                         expected=[corpus.label_tensor_length, actual_batch_size])

            loss: float = train(input_tensor=input_tensor,
                                target_tensor=target_tensor,
                                encoder=encoder,
                                decoder=decoder,
                                encoder_optimizer=encoder_optimizer,
                                decoder_optimizer=decoder_optimizer,
                                criterion=criterion,
                                device=device,
                                max_src_length=corpus.word_tensor_length,
                                max_tgt_length=corpus.label_tensor_length,
                                batch_size=actual_batch_size,
                                start_of_sequence_symbol=corpus.characters.start_of_sequence_int,
                                teacher_forcing_ratio=teacher_forcing_ratio)

            print_loss_total += loss
            plot_loss_total += loss

        if iteration % print_every == 0:
            print_loss_avg: float = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' %
                  (time_since(since=start, percent=iteration / n_iters),
                   iteration, iteration / n_iters * 100, print_loss_avg))
            sys.stdout.flush()
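# time_since is referenced in train_iters but not defined in this excerpt. A
# minimal sketch (an assumption, not the project's actual helper) that reports
# elapsed time and a rough estimate of the time remaining:
import time


def time_since(*, since: float, percent: float) -> str:
    elapsed = time.time() - since
    estimated_total = elapsed / percent if percent > 0 else float('inf')
    remaining = estimated_total - elapsed
    return f"{elapsed:.0f}s (- {remaining:.0f}s)"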
def train(*,
          input_tensor: torch.Tensor,   # shape: [src_seq_len, batch_size]
          target_tensor: torch.Tensor,  # shape: [tgt_seq_len, batch_size]
          encoder: EncoderRNN,
          decoder: AttnDecoderRNN,
          encoder_optimizer: Optimizer,
          decoder_optimizer: Optimizer,
          criterion: nn.Module,
          device: torch.device,
          max_src_length: int,
          max_tgt_length: int,
          batch_size: int,
          start_of_sequence_symbol: int,
          teacher_forcing_ratio: float) -> float:

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    loss: torch.Tensor = torch.tensor(0, dtype=torch.float,
                                      device=device)  # shape: [], meaning this is a scalar

    encoder_outputs = encoder.encode_sequence(input_tensor)

    decoder_input = target_tensor[0].unsqueeze(dim=0)
    decoder_hidden = decoder.init_hidden(batch_size=batch_size, device=device)

    verify_shape(tensor=decoder_input, expected=[1, batch_size])
    verify_shape(tensor=target_tensor, expected=[max_tgt_length, batch_size])
    verify_shape(tensor=decoder_hidden,
                 expected=[decoder.gru.num_layers, batch_size, decoder.gru.hidden_size])

    # Use teacher forcing for this example with probability teacher_forcing_ratio.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    decoder_output = decoder.decode_sequence(
        encoder_outputs=encoder_outputs,
        start_of_sequence_symbol=start_of_sequence_symbol,
        max_length=max_tgt_length,
        target_tensor=target_tensor if use_teacher_forcing else None)

    # Our loss function requires predictions to be of shape [N, C], where N is the
    # number of predictions and C is the number of possible predicted categories.
    # Reshape from [seq_len, batch_size, decoder.output_size] to [seq_len*batch_size, decoder.output_size].
    predictions = decoder_output.reshape(-1, decoder.output_size)
    # Reshape from [seq_len, batch_size] to [seq_len*batch_size].
    labels = target_tensor.reshape(-1)

    loss += criterion(predictions, labels)

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item()
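# A standalone illustration (toy sizes only) of the reshape that train performs
# before calling the loss: nn.NLLLoss expects log-probability predictions of
# shape [N, C] and integer targets of shape [N], so the sequence and batch
# dimensions are flattened together.
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax

seq_len, batch_size, num_classes = 4, 3, 10
log_probs = log_softmax(torch.randn(seq_len, batch_size, num_classes), dim=-1)
targets = torch.randint(low=0, high=num_classes, size=(seq_len, batch_size))

criterion = nn.NLLLoss(reduction='mean')
loss = criterion(log_probs.reshape(-1, num_classes), targets.reshape(-1))
print(loss.item())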