def run(self, data_loader, batch_size, beam_size=3):
    # data_loader is either a list of lists or a dataset loader
    self.encoder.eval()
    self.decoder.eval()
    self.vae.eval()
    pbar = ProgressBar()
    pbar.set(total_steps=len(data_loader))
    total_loss = 0.
    with torch.no_grad():
        for counter, (x, y) in enumerate(data_loader):
            pbar.update(progress=counter, text="Epoch {:d}, progress {}/{}, eval average loss \033[93m{:.6f}\033[0m ... ".format(self.epoch, counter, len(data_loader), total_loss/(counter+1)))
            if x.size(0) != batch_size:
                print("\t Incomplete batch, skipping.")
                continue
            if self.train_on_gpu:
                x, y = x.cuda(), y.cuda()
            # run beam search on the first instance of the batch only
            x = x[0:1, :]
            y = y[0:1, :]
            results, scores, loss = self._run_instance(x, y, beam_size)
            total_loss += float(loss)  # accumulate, otherwise the reported average stays at zero
    pbar.update(text="Epoch {:d}, eval done, average loss \033[93m{:.6f}\033[0m".format(self.epoch, total_loss/len(data_loader)))
    return total_loss/len(data_loader)
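# The beam-search entry point `self._run_instance(x, y, beam_size)` is not shown in this file.
# Below is a minimal, self-contained sketch of a single beam-expansion step, for illustration only:
# the tensor shapes and the cumulative log-probability scoring are assumptions, not the project's
# actual implementation.
import torch

def beam_step(log_probs, beam_scores, beam_size):
    """Expand every live hypothesis with every vocabulary item and keep the top `beam_size`.
    log_probs:   (beam, vocab) log-softmax output of the decoder for the current step
    beam_scores: (beam,) cumulative log-probability of each hypothesis
    Returns (new_scores, beam_index, token_index), each of shape (beam_size,)."""
    vocab_size = log_probs.size(1)
    # cumulative score of every (hypothesis, next-token) pair
    candidate_scores = beam_scores.unsqueeze(1) + log_probs              # (beam, vocab)
    new_scores, flat_index = candidate_scores.view(-1).topk(beam_size)
    beam_index = torch.div(flat_index, vocab_size, rounding_mode='floor')  # which hypothesis each survivor extends
    token_index = flat_index % vocab_size                                  # which token was appended
    return new_scores, beam_index, token_index

# usage sketch: 3 hypotheses over a toy 5-word vocabulary
# scores, beams, tokens = beam_step(torch.randn(3, 5).log_softmax(dim=1), torch.zeros(3), beam_size=3)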
def _eval(self, valid_loader, batch_size):
    self.encoder.eval()
    self.decoder.eval()
    self.vae.eval()
    pbar = ProgressBar()
    pbar.set(total_steps=len(valid_loader))
    counter = 0
    total_loss = 0.
    with torch.no_grad():
        for counter, (x, y) in enumerate(valid_loader):
            pbar.update(progress=counter, text="Epoch {:d}, progress {}/{}, eval average loss \033[93m{:.6f}\033[0m ... ".format(self.epoch, counter, len(valid_loader), total_loss/(counter+1)))
            if x.size(0) != batch_size:  # compare against the configured batch size before overwriting it
                print("\t Incomplete batch, skipping.")
                continue
            batch_size = x.size(0)
            max_seq_len_x = x.size(1)
            max_seq_len_y = y.size(1)
            if self.train_on_gpu:
                x, y = x.cuda(), y.cuda()
            encoder_hidden = self.encoder.init_hidden(batch_size)
            decoder_hidden = self.decoder.init_hidden(batch_size)
            encoder_output, encoder_hidden = self.encoder(x, encoder_hidden)
            # take the last state of the (bidirectional) encoder as encoder_last_output
            encoder_last_output = torch.zeros(batch_size, self.encoder_hidden_dim*2, device=self.device)
            for j in range(batch_size):
                encoder_last_output[j] = encoder_output[j][-1]
            # VAE: compress the last encoder state into the latent code z
            z, mu, logvar = self.vae(encoder_last_output)
            # fake first projection: the beginning-of-sentence index is 2, so force it as the first "prediction"
            word_softmax_projection = torch.zeros(batch_size, 5, dtype=torch.float, device=self.device)
            word_softmax_projection[:, 2] = 1.
            # create the first decoder output for the initial step, extracted from decoder_hidden
            decoder_output = decoder_hidden[0].view(self.decoder_n_layers, 1, batch_size, self.decoder_hidden_dim)  # torch.Size([2, 1, 64, 512])
            decoder_output = decoder_output[-1].permute(1, 0, 2)  # batch_size x 1 x decoder_hidden_dim
            loss = 0
            print_example = True
            example_array = [2]
            for i in range(max_seq_len_y):
                _, decoder_input = word_softmax_projection.max(1)  # no need for values, just indexes
                decoder_input = decoder_input.unsqueeze(1)
                decoder_output, decoder_hidden, word_softmax_projection = self.decoder.forward_step(decoder_input, decoder_hidden, z)
                word_softmax_projection = word_softmax_projection.squeeze(1)  # eliminate dim 1
                if print_example:
                    _, mi = word_softmax_projection[0].max(0)
                    example_array.append(mi.item())
                target_y = y[:, i]  # select the ith column of y, shape (batch_size,)
                loss += self.criterion(word_softmax_projection, target_y)
            loss /= batch_size
            KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())  # closed-form KL(q(z|x) || N(0, I))
            loss += KLD
            total_loss += loss.data.item()
            if print_example:
                print_example = False
                print("\n\n----- X:")
                print(" ".join([self.src_i2w[str(wi.data.item())] for wi in x[0]]))
                print("----- Y:")
                print(" ".join([self.tgt_i2w[str(wi.data.item())] for wi in y[0]]))
                print("----- OUR PREDICTION:")
                print(" ".join([self.tgt_i2w[str(wi)] for wi in example_array]))
                print()
                print(" ".join([str(wi.data.item()) for wi in y[0]]))
                print(" ".join([str(wi) for wi in example_array]))
                print()
    pbar.update(text="Epoch {:d}, eval done, average loss \033[93m{:.6f}\033[0m".format(self.epoch, total_loss/len(valid_loader)))
    return total_loss/len(valid_loader)
def _eval(self, valid_loader):
    self.encoder.eval()
    self.decoder.eval()
    self.attention.eval()
    pbar = ProgressBar()
    pbar.set(total_steps=len(valid_loader))
    counter = 0
    total_loss = 0.
    with torch.no_grad():
        for counter, (x, y) in enumerate(valid_loader):
            pbar.update(progress=counter, text="Epoch {:d}, progress {}/{}, eval average loss \033[93m{:.6f}\033[0m ... ".format(self.epoch, counter, len(valid_loader), total_loss/(counter+1)))
            batch_size = x.size(0)
            max_seq_len_x = x.size(1)
            max_seq_len_y = y.size(1)
            if self.train_on_gpu:
                x, y = x.cuda(), y.cuda()
            encoder_hidden = self.encoder.init_hidden(batch_size)
            decoder_hidden = self.decoder.init_hidden(batch_size)
            encoder_output, encoder_hidden = self.encoder(x, encoder_hidden)
            # fake first projection: the beginning-of-sentence index is 2, so force it as the first "prediction"
            word_softmax_projection = torch.zeros(batch_size, 5, dtype=torch.float, device=self.device)
            word_softmax_projection[:, 2] = 1.
            # create the first decoder output for the initial attention call, extracted from decoder_hidden
            decoder_output = decoder_hidden[0].view(self.decoder_n_layers, 1, batch_size, self.decoder_hidden_dim)  # torch.Size([2, 1, 64, 512])
            decoder_output = decoder_output[-1].permute(1, 0, 2)  # batch_size x 1 x decoder_hidden_dim
            loss = 0
            print_example = True
            example_array = []
            for i in range(max_seq_len_y):
                _, decoder_input = word_softmax_projection.max(1)  # no need for values, just indexes
                decoder_input = decoder_input.unsqueeze(1)
                context = self.attention(encoder_output, decoder_output)
                decoder_output, decoder_hidden, word_softmax_projection = self.decoder.forward_step(decoder_input, decoder_hidden, context)
                word_softmax_projection = word_softmax_projection.squeeze(1)  # eliminate dim 1
                if print_example:
                    _, mi = word_softmax_projection[0].max(0)
                    example_array.append(mi.item())
                target_y = y[:, i]  # select the ith column of y, shape (batch_size,)
                loss += self.criterion(word_softmax_projection, target_y)
            total_loss += loss.data.item() / batch_size
            if print_example:
                print_example = False
                print("\n\n----- X:")
                print(" ".join([self.src_i2w[str(wi.data.item())] for wi in x[0]]))
                print("----- Y:")
                print(" ".join([self.tgt_i2w[str(wi.data.item())] for wi in y[0]]))
                print("----- OUR PREDICTION:")
                print(" ".join([self.tgt_i2w[str(wi)] for wi in example_array]))
                print()
                print(" ".join([str(wi.data.item()) for wi in y[0]]))
                print(" ".join([str(wi) for wi in example_array]))
                print()
    self.log.var("Loss|Train loss|Validation loss", self.epoch, total_loss, y_index=1)
    self.log.draw()
    pbar.update(text="Epoch {:d}, eval done, average loss \033[93m{:.6f}\033[0m".format(self.epoch, total_loss/len(valid_loader)))
    return total_loss/len(valid_loader)
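# The Attention module is only used above as `self.attention(encoder_output, decoder_output)` and
# must return a context of shape (batch_size, encoder_hidden_dim*2). A minimal additive
# (Bahdanau-style) sketch under that assumption is shown below; the layer names (W_enc, W_dec, v)
# and the scoring function are illustrative, not necessarily the project's actual Attention class.
import torch
import torch.nn as nn

class AdditiveAttention(nn.Module):
    def __init__(self, enc_dim, dec_dim, att_dim):
        super().__init__()
        self.W_enc = nn.Linear(enc_dim, att_dim, bias=False)  # project encoder states
        self.W_dec = nn.Linear(dec_dim, att_dim, bias=False)  # project the current decoder state
        self.v = nn.Linear(att_dim, 1, bias=False)            # score each source position

    def forward(self, encoder_output, decoder_output):
        # encoder_output: (batch, src_len, enc_dim); decoder_output: (batch, 1, dec_dim)
        scores = self.v(torch.tanh(self.W_enc(encoder_output) + self.W_dec(decoder_output)))  # (batch, src_len, 1)
        weights = torch.softmax(scores, dim=1)             # normalize over source positions
        context = (weights * encoder_output).sum(dim=1)    # weighted sum of encoder states, (batch, enc_dim)
        return context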
def _train_epoch(self, train_loader):
    self.epoch += 1
    self.encoder.train()
    self.decoder.train()
    self.vae.train()
    total_loss = 0.
    pbar = ProgressBar()
    pbar.set(total_steps=len(train_loader))
    for counter, (x, y) in enumerate(train_loader):
        batch_size = x.size(0)
        max_seq_len_x = x.size(1)  # x is batch_size x max_seq_len_x (e.g. 64 x 399, variable)
        max_seq_len_y = y.size(1)  # y is batch_size x max_seq_len_y (variable)
        pbar.update(progress=counter, text="Epoch {:d}, progress {}/{}, train average loss \033[93m{:.6f}\033[0m (bs/mx/my = {}/{}/{}) ... ".format(self.epoch, counter, len(train_loader), total_loss/(counter+1), batch_size, max_seq_len_x, max_seq_len_y))
        if counter % 500 == 0 and counter > 0:
            self.save_checkpoint("last")
        if self.train_on_gpu:
            x, y = x.cuda(), y.cuda()
        encoder_hidden = self.encoder.init_hidden(batch_size)
        decoder_hidden = self.decoder.init_hidden(batch_size)
        # zero grads in the optimizer
        self.optimizer.zero_grad()
        # encoder: x is batch_size x max_seq_len_x
        encoder_output, encoder_hidden = self.encoder(x, encoder_hidden)
        # encoder_output is batch_size x max_seq_len_x x encoder_hidden_dim*2 (doubled because the encoder is bidirectional)
        # take the last state of the encoder as encoder_last_output (not necessary when using attention)
        encoder_last_output = torch.zeros(batch_size, self.encoder_hidden_dim*2, device=self.device)
        for j in range(batch_size):
            encoder_last_output[j] = encoder_output[j][-1]
        # encoder_last_output is the last state of the encoder, batch_size x encoder_hidden_dim*2
        # VAE: all returned tensors are (batch_size, encoder_hidden_dim)
        z, mu, logvar = self.vae(encoder_last_output)
        # create the first decoder output for the initial step, extracted from decoder_hidden
        decoder_output = decoder_hidden[0].view(self.decoder_n_layers, 1, batch_size, self.decoder_hidden_dim)  # torch.Size([2, 1, 64, 512])
        # it should look like batch_size x 1 x decoder_hidden_dim, so transform it
        decoder_output = decoder_output[-1].permute(1, 0, 2)
        recon_loss = 0
        for i in range(max_seq_len_y):
            # teacher forcing (or it is the first word, which is always start-of-sentence)
            if random.random() <= self.teacher_forcing_ratio or i == 0:
                # the middle 1 is there because the lstm expects (batch, seq_len, input_size)
                decoder_input = torch.zeros(batch_size, 1, dtype=torch.long, device=self.device)
                for j in range(batch_size):
                    decoder_input[j] = y[j][i]
            else:
                # feed the model's own previous prediction extracted from word_softmax_projection
                _, decoder_input = word_softmax_projection.max(1)  # no need for values, just indexes
                decoder_input = decoder_input.unsqueeze(1)  # from (batch_size,) to batch_size x 1
            # the latent code z acts as the context, batch_size x encoder_hidden_dim
            decoder_output, decoder_hidden, word_softmax_projection = self.decoder.forward_step(decoder_input, decoder_hidden, z)
            # reduce word_softmax_projection from torch.Size([64, 1, 50004]) to 64 x 50004
            word_softmax_projection = word_softmax_projection.squeeze(1)  # eliminate dim 1
            # select the target: y looks like batch_size x max_seq_len_y, e.g. tensor([[2, 10890, 48108, ..., 0, 0, 0], ...])
            target_y = y[:, i]  # the ith column of y, e.g. [10, 2323, 5739, 24, 9785, ...] of size batch_size
            recon_loss += self.criterion(word_softmax_projection, target_y)
        # end of the per-token decoder steps
        global_minibatch_step = (self.epoch-1)*len(train_loader) + counter
        self.log.var("train_loss|Total loss|Recon loss|Weighted KLD loss", global_minibatch_step, recon_loss.data.item(), y_index=1)
        # anneal the KLD weight so the latent code is not ignored early in training
        KL_weight = self.vae.kl_anneal_function(step=global_minibatch_step, k=self.vae_kld_anneal_k, x0=self.vae_kld_anneal_x0, anneal_function=self.vae_kld_anneal_function)
        self.log.var("KLD weight", global_minibatch_step, KL_weight, y_index=0)
        KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())  # closed-form KL(q(z|x) || N(0, I))
        self.log.var("KLD", global_minibatch_step, KLD.data.item(), y_index=0)
        KLD *= KL_weight
        self.log.var("train_loss|Total loss|Recon loss|Weighted KLD loss", global_minibatch_step, KLD.data.item(), y_index=2)
        loss = recon_loss + KLD
        self.log.var("train_loss|Total loss|Recon loss|Weighted KLD loss", global_minibatch_step, loss.data.item(), y_index=0)
        total_loss += loss.data.item() / batch_size
        loss.backward()  # compute gradients
        # clip_grad_norm_ helps prevent the exploding gradient problem in RNNs / LSTMs
        nn.utils.clip_grad_norm_(self.encoder.parameters(), self.gradient_clip)
        nn.utils.clip_grad_norm_(self.decoder.parameters(), self.gradient_clip)
        nn.utils.clip_grad_norm_(self.vae.parameters(), self.gradient_clip)
        self.optimizer.step()
        self.log.draw()
        self.log.draw(last_quarter=True)
        # end of batch
    # end of epoch
    pbar.update(text="Epoch {:d}, train done, average loss \033[93m{:.6f}\033[0m".format(self.epoch, total_loss))
    return total_loss
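# `self.vae(encoder_last_output)` returns (z, mu, logvar) and also exposes
# `kl_anneal_function(step, k, x0, anneal_function)`. The sketch below shows one plausible
# implementation consistent with those call sites: a linear bottleneck with the standard
# reparameterization trick, plus the logistic/linear annealing schedule suggested by the k/x0
# arguments. Layer names and the exact schedule are assumptions, not the project's actual VAE class.
import math
import torch
import torch.nn as nn

class VAEBottleneck(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super().__init__()
        self.to_mu = nn.Linear(input_dim, latent_dim)      # mean of q(z|x)
        self.to_logvar = nn.Linear(input_dim, latent_dim)  # log-variance of q(z|x)

    def forward(self, h):
        mu, logvar = self.to_mu(h), self.to_logvar(h)
        std = torch.exp(0.5 * logvar)
        z = mu + std * torch.randn_like(std)               # reparameterization trick: z = mu + sigma * eps
        return z, mu, logvar

    @staticmethod
    def kl_anneal_function(step, k, x0, anneal_function="logistic"):
        # weight in [0, 1] that gradually turns the KLD term on during training
        if anneal_function == "logistic":
            return 1.0 / (1.0 + math.exp(-k * (step - x0)))
        return min(1.0, step / x0)                          # "linear" schedule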
def _train_epoch(self, train_loader):
    self.epoch += 1
    self.encoder.train()
    self.decoder.train()
    self.attention.train()
    total_loss = 0.
    pbar = ProgressBar()
    pbar.set(total_steps=len(train_loader))
    for counter, (x, y) in enumerate(train_loader):
        batch_size = x.size(0)
        max_seq_len_x = x.size(1)  # x is batch_size x max_seq_len_x (e.g. 64 x 399, variable)
        max_seq_len_y = y.size(1)  # y is batch_size x max_seq_len_y (variable)
        pbar.update(progress=counter, text="Epoch {:d}, progress {}/{}, train average loss \033[93m{:.6f}\033[0m (mx/my = {}/{}) ... ".format(self.epoch, counter, len(train_loader), total_loss/(counter+1), max_seq_len_x, max_seq_len_y))
        if counter % 1000 == 0 and counter > 0:
            self.save_checkpoint("last")
        if self.train_on_gpu:
            x, y = x.cuda(), y.cuda()
        encoder_hidden = self.encoder.init_hidden(batch_size)
        decoder_hidden = self.decoder.init_hidden(batch_size)
        # zero grads in the optimizer
        self.optimizer.zero_grad()
        # encoder: x is batch_size x max_seq_len_x
        encoder_output, encoder_hidden = self.encoder(x, encoder_hidden)
        # encoder_output is batch_size x max_seq_len_x x encoder_hidden_dim
        # create the first decoder output for the initial attention call, extracted from decoder_hidden
        decoder_output = decoder_hidden[0].view(self.decoder_n_layers, 1, batch_size, self.decoder_hidden_dim)  # torch.Size([2, 1, 64, 512])
        # it should look like batch_size x 1 x decoder_hidden_dim, so transform it
        decoder_output = decoder_output[-1].permute(1, 0, 2)
        loss = 0
        for i in range(max_seq_len_y):
            # teacher forcing (or it is the first word, which is always start-of-sentence)
            if random.random() <= self.teacher_forcing_ratio or i == 0:
                # the middle 1 is there because the lstm expects (batch, seq_len, input_size)
                decoder_input = torch.zeros(batch_size, 1, dtype=torch.long, device=self.device)
                for j in range(batch_size):
                    decoder_input[j] = y[j][i]
            else:
                # feed the model's own previous prediction extracted from word_softmax_projection
                _, decoder_input = word_softmax_projection.max(1)  # no need for values, just indexes
                decoder_input = decoder_input.unsqueeze(1)  # from (batch_size,) to batch_size x 1
            # debugging hook for collecting attention weights; disabled for now
            if counter == 1:
                self.attention.should_print = False  # set to True to collect attention matrices
            else:
                self.attention.should_print = False
                self.attention.att_mat = []
            # context is batch_size x encoder_hidden_dim
            context = self.attention(encoder_output, decoder_output)
            decoder_output, decoder_hidden, word_softmax_projection = self.decoder.forward_step(decoder_input, decoder_hidden, context)
            # reduce word_softmax_projection from torch.Size([64, 1, 50004]) to 64 x 50004
            word_softmax_projection = word_softmax_projection.squeeze(1)  # eliminate dim 1
            # select the target: y looks like batch_size x max_seq_len_y, e.g. tensor([[2, 10890, 48108, ..., 0, 0, 0], ...])
            target_y = y[:, i]  # the ith column of y, e.g. [10, 2323, 5739, 24, 9785, ...] of size batch_size
            # ignore_index is not set on the criterion: we want padding (0) to count towards the error as well
            loss += self.criterion(word_softmax_projection, target_y)
        # debugging hook for saving the attention heatmap of the first batch; disabled for now
        """
        if counter == 1:
            fig = plt.figure(figsize=(12, 10))
            sns.heatmap(self.attention.att_mat, cmap="gist_heat")
            plt.tight_layout()
            fig.savefig('img/__' + str(self.epoch) + '.png')
            plt.clf()
        """
        total_loss += loss.data.item() / batch_size
        loss.backward()  # compute gradients
        # clip_grad_norm_ helps prevent the exploding gradient problem in RNNs / LSTMs
        nn.utils.clip_grad_norm_(self.encoder.parameters(), self.gradient_clip)
        nn.utils.clip_grad_norm_(self.decoder.parameters(), self.gradient_clip)
        nn.utils.clip_grad_norm_(self.attention.parameters(), self.gradient_clip)
        self.optimizer.step()
        # end of batch
    # end of epoch
    pbar.update(text="Epoch {:d}, train done, average loss \033[93m{:.6f}\033[0m".format(self.epoch, total_loss))
    self.log.var("Loss|Train loss|Validation loss", self.epoch, total_loss, y_index=0)
    self.log.draw()
    return total_loss
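# For reference, a minimal driver-loop sketch showing how _train_epoch and _eval could be combined.
# The method name `train`, the `n_epochs` argument, and the "best" checkpoint label are assumptions
# for illustration; only `save_checkpoint("last")` appears in the code above.
def train(self, train_loader, valid_loader, n_epochs):
    best_loss = float("inf")
    for _ in range(n_epochs):
        self._train_epoch(train_loader)
        valid_loss = self._eval(valid_loader)
        if valid_loss < best_loss:  # keep the checkpoint with the lowest validation loss
            best_loss = valid_loss
            self.save_checkpoint("best")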