def embed_input_sentence(input_pair, encoder, max_length=MAX_LENGTH):
    """Embeds the input sentence using a trained encoder model"""
    with torch.no_grad():
        if encoder.trainable_model:
            input_tensor, target_tensor = utils.tensorsFromPair(input_pair)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length + 1, encoder.hidden_size, device=DEVICE)
            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_hidden = encoder_hidden
            return decoder_hidden, target_tensor, encoder_outputs
        else:
            target_tensor = utils.tensorFromSentence(vocab_index, input_pair[1])
            decoder_hidden = encoder.sentence_embedding(input_pair[0])
            decoder_hidden = layer_normalize(decoder_hidden)
            return decoder_hidden, target_tensor, None
def train(self, pairs, n_iters, max_length=1000, teacher_forcing_ratio=0.5,
          print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(self.encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(self.decoder.parameters(), lr=learning_rate)
    training_pairs = [
        tensorsFromPair(self.input_lang, self.output_lang, random.choice(pairs), self.device)
        for i in range(n_iters)
    ]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = self.step(input_tensor, target_tensor, encoder_optimizer,
                         decoder_optimizer, criterion, max_length, teacher_forcing_ratio)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100,
               learning_rate=0.01, lang_pack=None):
    assert lang_pack is not None, "None shall pass"
    input_lang, output_lang, pairs = lang_pack

    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [
        tensorsFromPair(random.choice(pairs), langs=[input_lang, output_lang])
        for i in range(n_iters)
    ]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)
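# Hypothetical usage sketch for the trainIters variant above (not from the original source):
# EncoderRNN, AttnDecoderRNN, prepareData, HIDDEN_SIZE, and DEVICE are assumed project
# helpers/constants and may need to be renamed to match the actual repository.
if __name__ == "__main__":
    input_lang, output_lang, pairs = prepareData("eng", "sparql")          # assumed data-loading helper
    encoder = EncoderRNN(input_lang.n_words, HIDDEN_SIZE).to(DEVICE)       # assumed model class
    decoder = AttnDecoderRNN(HIDDEN_SIZE, output_lang.n_words).to(DEVICE)  # assumed model class
    trainIters(encoder, decoder, n_iters=75000, print_every=5000, plot_every=500,
               lang_pack=(input_lang, output_lang, pairs))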
def mask_batch(input_batch_pairs):
    """Convert batch of sentence pairs to tensors and masks for ESIM model"""
    input_tensor = torch.zeros((MAX_LENGTH, len(input_batch_pairs)), dtype=torch.long, device=DEVICE)
    target_tensor = torch.zeros((MAX_LENGTH, len(input_batch_pairs)), dtype=torch.long, device=DEVICE)

    for idx, pair in enumerate(input_batch_pairs):
        encoded_input, encoded_target = tensorsFromPair(pair)
        input_tensor[:len(encoded_input), idx], target_tensor[:len(encoded_target), idx] = \
            encoded_input.view(-1), encoded_target.view(-1)

    input_tensor_mask, target_tensor_mask = input_tensor != 0, target_tensor != 0
    input_tensor_mask, target_tensor_mask = input_tensor_mask.float(), target_tensor_mask.float()

    return input_tensor, input_tensor_mask, target_tensor, target_tensor_mask
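# Illustrative sketch (not from the original source) of how the 0/1 masks returned by
# mask_batch can be used: padding positions are zeroed out of a per-token loss before
# averaging, so padded time steps do not contribute to the gradient.
def masked_nll(per_token_loss, target_tensor_mask):
    # per_token_loss and target_tensor_mask both have shape [MAX_LENGTH, batch_size]
    return (per_token_loss * target_tensor_mask).sum() / target_tensor_mask.sum().clamp(min=1.0)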
def trainItersBert(encoder, decoder, n_iters, training_pairs, eval_pairs, input_lang, output_lang,
                   print_every=1000, plot_every=100, learning_rate=0.01, mom=0, model_name="QALD-dev"):
    plot_losses = []

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate, momentum=mom)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate, momentum=mom)
    # Alternative: Adam with amsgrad plus CosineAnnealingLR schedulers for both models.

    teacher_forcing_ratio = 1.0
    criterion = nn.NLLLoss()

    # Convert every training pair to index tensors of shape [seq_len, 1].
    input_tensors, target_tensors = [], []
    for pair in training_pairs:
        tensors = utils.tensorsFromPair(pair, input_lang, output_lang, device)
        input_tensors.append(tensors[0].view(-1, 1).long())
        target_tensors.append(tensors[1].view(-1, 1).long())

    print("\n Dataset preparing... ")
    # Pad all sequences to the length of the longest one so they can be batched.
    input_tensors = rnn_utils.pad_sequence(input_tensors, batch_first=True, padding_value=0)
    target_tensors = rnn_utils.pad_sequence(target_tensors, batch_first=True, padding_value=0)

    torch_dataset = utils.TxtDataset(input_tensors, target_tensors)
    # Put the dataset into a DataLoader.
    loader = Data.DataLoader(
        dataset=torch_dataset,
        batch_size=6,  # MINIBATCH_SIZE
        shuffle=True,
    )
    print(" Dataset loader ready, begin training. \n")

    for epoch in range(1, n_iters + 1):  # one epoch goes through the whole dataset
        for step, (batch_input, batch_target) in enumerate(loader):
            print('\n\n - epoch: ', epoch, ' | step: ', step,
                  '\n | batch_input: ', batch_input.size(),
                  '\n | batch_target: ', batch_target.size())

            batch_input = batch_input.reshape([6, -1, 1])
            batch_target = batch_target.reshape([6, -1, 1])
            print("\n input_batch : ", batch_input.size())
            print("\n target_batch : ", batch_target.size())

            # Switch to the RL objective once the mean cross-entropy loss drops below 1.0.
            rl = (epoch > 1) and (np.mean(plot_losses) < 1.0)

            # Train on each (input, target) item of the mini-batch in turn.
            loss = 0
            for batch_input_, batch_target_ in zip(batch_input, batch_target):
                loss += trainBert(batch_input_, batch_target_, encoder, decoder, eval_pairs,
                                  input_lang, output_lang, encoder_optimizer, decoder_optimizer,
                                  criterion, teacher_forcing_ratio=teacher_forcing_ratio, rl=rl)
            plot_losses.append(loss / 6)

            print("\t - %s step xentropy loss: " % str(epoch), loss, " \n")
            # Anneal the teacher forcing ratio based on the current loss.
            teacher_forcing_ratio = utils.teacher_force(float(loss))
def trainItersBert(model, n_iters, training_pairs, eval_pairs, input_lang, output_lang,
                   batch_size, learning_rate=0.01, mom=0, model_name="qald-test"):
    plot_losses = []
    losses_trend = []

    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=mom)
    # Alternative: Adam with amsgrad plus a CosineAnnealingLR scheduler.

    teacher_forcing_ratio = 1.0
    criterion = nn.NLLLoss()

    # Build padded index tensors from the eval pairs and load them into the DataLoader.
    eval_tensors = [utils.tensorsFromPair(pair, input_lang, output_lang, device) for pair in eval_pairs]
    eval_inputs = [tensors[0] for tensors in eval_tensors]
    eval_targets = [tensors[1] for tensors in eval_tensors]
    eval_inputs = rnn_utils.pad_sequence(eval_inputs, batch_first=True, padding_value=0)
    eval_targets = rnn_utils.pad_sequence(eval_targets, batch_first=True, padding_value=0)

    torch_dataset = utils.TxtDataset(eval_inputs, eval_targets)
    loader = Data.DataLoader(
        dataset=torch_dataset,
        batch_size=batch_size,  # MINIBATCH_SIZE = 6
        shuffle=True,
        drop_last=False,
        num_workers=2 if utils.getOSystPlateform() else 0,  # multi-worker loading depends on the OS
    )
    print(" Dataset loader ready, begin training. \n")
    dataset_len = len(loader)
    print("\n Dataset loader length is ", dataset_len, "; the model is saved at the end of each epoch. ")

    for epoch in range(1, n_iters + 1):  # an epoch goes through the whole dataset
        for batch, (batch_input, batch_target) in enumerate(loader):
            print('\n\n - Epoch ', epoch, ' | batch ', batch,
                  '\n | input length: ', batch_input.size(),
                  '\n | target length: ', batch_target.size(), " \n")

            # RL fine-tuning is disabled for this run.
            # Alternatively: rl = (epoch > 1) and np.mean(losses_trend) < 1.0 and len(losses_trend) > 1
            rl = False

            loss = trainBert(batch_input, batch_target, model, eval_pairs,
                             input_lang, output_lang, optimizer, criterion,
                             teacher_forcing_ratio=teacher_forcing_ratio, rl=rl)
            plot_losses.append(loss)
            print("\t- the %s batch xentropy loss: " % str(str(epoch) + "." + str(batch)), loss, " ")

        losses_trend.append(np.mean(plot_losses))
        plot_losses.clear()

        if epoch > 1:
            save_model(model, losses_trend, model_name)
        print("\n Finish Epoch %d -- model saved. \n " % epoch)
def trainIters(input_lang, output_lang, pairs, encoder, decoder, n_iters, max_length,
               print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    # Create a parameter optimization object for both models
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # Pick random sentence pairs and convert them to tensors of word indices
    training_pairs = [
        tensorsFromPair(input_lang, output_lang, random.choice(pairs))
        for i in range(n_iters)
    ]

    # Use negative log likelihood as the loss
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]

        # Enforce max sentence length
        input_tensor = training_pair[0]
        if len(input_tensor) > max_length:
            continue
        target_tensor = training_pair[1]

        # Train the model on one sentence pair; returns the negative log likelihood
        loss = train(input_tensor, target_tensor, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion, max_length)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
            pickle.dump(plot_losses,
                        open("losses/{}.p".format(encoder.__class__.__name__), "wb"))

        if iter % 10000 == 0:
            torch.save(encoder.state_dict(),
                       'trained_models/{}/encoder_it{}'.format(encoder.__class__.__name__, iter))
            torch.save(decoder.state_dict(),
                       'trained_models/{}/decoder_it{}'.format(encoder.__class__.__name__, iter))

    showPlot(plot_losses)
def trainIters(encoder, decoder, dictionary, pairs, epochs, print_every=1000,
               print_sentences=5, learning_rate=0.01, batch_size=16):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    training_pairs = [
        tensorsFromPair(pairs[i], dictionary) for i in range(len(pairs))
    ]
    criterion = nn.NLLLoss()

    for e in range(epochs):
        num_batch = ceil(len(pairs) / batch_size)
        print_loss_total = 0
        for b in range(num_batch):
            # The last batch may hold fewer than batch_size examples
            if b == num_batch - 1:
                num_data = len(pairs) - batch_size * b
            else:
                num_data = batch_size

            input_tensors = [
                training_pairs[m][0]
                for m in range(batch_size * b, batch_size * b + num_data)
            ]
            target_tensors = [
                training_pairs[m][1]
                for m in range(batch_size * b, batch_size * b + num_data)
            ]

            loss = train(input_tensors, target_tensors, encoder, decoder,
                         encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            print('%s epochs, %s step, %.4f' % (e, batch_size * b + num_data, loss))

        print_loss_avg = print_loss_total / num_batch
        print('%s epochs, %.4f' % (e, print_loss_avg))

        # Print a few sample translations at the end of every epoch
        evaluateRandomly(encoder, decoder, pairs, dictionary, n=print_sentences)
def trainItersBert(encoder, decoder, n_iters, training_pairs, eval_pairs, input_lang, output_lang,
                   print_every=1000, plot_every=100, learning_rate=0.01, mom=0, model_name="qald-dev"):
    plot_losses = []

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate, momentum=mom)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate, momentum=mom)
    # Alternative: Adam with amsgrad plus CosineAnnealingLR schedulers for both models.

    teacher_forcing_ratio = 1.0
    criterion = nn.NLLLoss()

    # Build padded index tensors from the eval pairs and load them into the DataLoader.
    eval_tensors = [utils.tensorsFromPair(pair, input_lang, output_lang, device) for pair in eval_pairs]
    eval_inputs = [tensors[0] for tensors in eval_tensors]
    eval_targets = [tensors[1] for tensors in eval_tensors]
    eval_inputs = rnn_utils.pad_sequence(eval_inputs, batch_first=True, padding_value=0)
    eval_targets = rnn_utils.pad_sequence(eval_targets, batch_first=True, padding_value=0)

    torch_dataset = utils.TxtDataset(eval_inputs, eval_targets)
    loader = Data.DataLoader(
        dataset=torch_dataset,
        batch_size=6,  # MINIBATCH_SIZE
        shuffle=True,
        num_workers=2 if utils.getOSystPlateform() else 0,  # multi-worker loading depends on the OS
    )
    print(" Dataset loader ready, begin training. \n")
    dataset_len = len(loader)
    savepoint = dataset_len // 4
    print("\n Dataset loader length is ", dataset_len, ", save model every %d batches. " % savepoint)

    for epoch in range(1, n_iters + 1):  # an epoch goes through the whole dataset
        for batch, (batch_input, batch_target) in enumerate(loader):
            print('\n\n - Epoch ', epoch, ' | batch ', batch,
                  '\n | batch_input: ', batch_input.size(),
                  '\n | batch_target: ', batch_target.size(), " \n")

            # Reshape to [batch, seq_len, 1]; the last batch may be smaller than 6, so skip it on failure.
            try:
                batch_input = batch_input.reshape([6, -1, 1])
                batch_target = batch_target.reshape([6, -1, 1])
            except RuntimeError:
                pass

            # Switch to the RL objective once the mean cross-entropy loss drops below 1.0.
            rl = (epoch > 1) and (np.mean(plot_losses) < 1.0)

            # Train on each (input, target) item of the mini-batch in turn.
            loss = 0
            for batch_input_item, batch_target_item in zip(batch_input, batch_target):
                loss += trainBert(batch_input_item, batch_target_item, encoder, decoder, eval_pairs,
                                  input_lang, output_lang, encoder_optimizer, decoder_optimizer,
                                  criterion, teacher_forcing_ratio=teacher_forcing_ratio, rl=rl)
            loss = loss / 6
            plot_losses.append(loss)
            print("\t - the %s batch xentropy loss: " % str(str(epoch) + "." + str(batch)), loss, " ")

            # Anneal the teacher forcing ratio based on the current loss.
            teacher_forcing_ratio = utils.teacher_force(float(loss))

            if 0 == batch % savepoint and batch > 1:
                print("\n Batch %d savepoint, save the trained model...\n" % batch)
                save_model(encoder, decoder, plot_losses, model_name)

        if epoch > 1:
            save_model(encoder, decoder, plot_losses, model_name)
        print("\n Finish Epoch %d -- model saved. " % epoch)
def train(input_pair, encoder, decoder, encoder_optimizer, decoder_optimizer,
          criterion, teacher_forcing_ratio, max_length=MAX_LENGTH):
    """Model training logic: initializes the graph, creates the encoder outputs matrix
    for the attention model, applies teacher forcing (randomly), calculates the loss,
    and trains the models"""
    if encoder.trainable_model:
        # Encode sentences using the encoder model
        input_tensor, target_tensor = utils.tensorsFromPair(input_pair)
        decoder_hidden, encoder_outputs, encoder_optimizer = train_encoder(
            input_tensor, encoder, encoder_optimizer, max_length)
    else:
        # Encode sentences using a pretrained encoder model
        target_tensor = utils.tensorFromSentence(vocab_index, input_pair[1])
        decoder_hidden = encoder.sentence_embedding(input_pair[0])
        decoder_hidden = layer_normalize(decoder_hidden)

    # Clear the gradients from the decoder optimizer
    decoder_optimizer.zero_grad()

    target_length = target_tensor.size(0)
    decoder_input = torch.tensor([[SOS_token]], device=DEVICE)
    loss = 0

    # Randomly apply teacher forcing subject to the teacher forcing ratio
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: feed the target as the next input
        for di in range(target_length):
            if decoder.uses_attention:
                decoder_output, decoder_hidden, _ = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
            else:
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing: set next input to the correct target
    else:
        # Without teacher forcing: use the decoder's own predictions as the next input
        for di in range(target_length):
            if decoder.uses_attention:
                decoder_output, decoder_hidden, _ = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
            else:
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    # Calculate the error and backpropagate through the network
    loss.backward()
    if encoder.trainable_model:
        encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length
def beam_decode(input_pair, encoder, decoder, beam_width=5, n_output_sentences=1, encoder_outputs=None):
    """Implements beam search decoding using the specified encoder, decoder, and beam width.

    Adapted from: https://github.com/budzianowski/PyTorch-Beam-Search-Decoding/blob/master/decode_beam.py

    :param input_pair: (source sentence, target sentence) pair to decode
    :param encoder_outputs: if you are using an attention mechanism you can pass the
        encoder outputs, [T, B, H], where T is the maximum length of the input sentence
    :return: list of decoded output sentences
    """
    assert beam_width > 1, 'Beam width must be greater than 1'

    if encoder.trainable_model:
        input_tensor, _ = utils.tensorsFromPair(input_pair)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(MAX_LENGTH + 1, encoder.hidden_size, device=DEVICE)
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_hidden = encoder_hidden
    else:
        decoder_hidden = encoder.sentence_embedding(input_pair[0])
        decoder_hidden = layer_normalize(decoder_hidden)

    topk = n_output_sentences  # how many sentences to generate

    # Start with the start-of-sentence token
    decoder_input = torch.tensor([[SOS_token]], device=DEVICE)

    # Number of sentences to generate
    endnodes = []
    number_required = min((topk + 1), topk - len(endnodes))

    # starting node - hidden vector, previous node, word id, logp, length
    node = BeamSearchNode(decoder_hidden, None, decoder_input, 0, 1)
    nodes = PriorityQueue()

    # start the queue
    nodes.put((-node.eval(), node))
    qsize = 1

    # start beam search
    for _ in range(2000):
        # give up when decoding takes too long
        if qsize > 1000:
            break

        # fetch the best node
        score, n = nodes.get()
        decoder_input = n.wordid
        decoder_hidden = n.h

        if n.wordid.item() == EOS_token and n.prevNode != None:
            endnodes.append((score, n))
            # if we reached the maximum number of sentences required
            if len(endnodes) >= number_required:
                break
            else:
                continue

        # decode for one step using the decoder
        if decoder.uses_attention:
            decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
        else:
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

        # do the actual beam search
        log_prob, indexes = torch.topk(decoder_output, beam_width)
        nextnodes = []

        for new_k in range(beam_width):
            decoded_t = indexes[0][new_k].view(1, -1)
            log_p = log_prob[0][new_k].item()

            node = BeamSearchNode(decoder_hidden, n, decoded_t, n.logp + log_p, n.leng + 1)
            score = -node.eval()
            nextnodes.append((score, node))

        # put them into the queue
        for i in range(len(nextnodes)):
            score, next_node = nextnodes[i]
            nodes.put((score, next_node))

        # increase qsize
        qsize += len(nextnodes) - 1

    # choose the n best paths and back-trace them
    if len(endnodes) == 0:
        endnodes = [nodes.get() for _ in range(topk)]

    utterances = []
    for score, n in sorted(endnodes, key=operator.itemgetter(0)):
        utterance = []
        utterance.append(n.wordid)
        # back trace
        while n.prevNode != None:
            n = n.prevNode
            utterance.append(n.wordid)

        utterance = utterance[::-1]
        utterances.append(utterance)

    output_sentences = []
    for sentence in utterances:
        output_words = [vocab_index.index2word[word_idx.item()] for word_idx in sentence]
        output_sentences.append(' '.join(output_words[1:-1]))

    return output_sentences
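# Hypothetical usage sketch for beam_decode above (not from the original source): generate
# the top-3 candidates for one evaluation pair, assuming `eval_pairs` holds (source, target)
# sentence pairs and `encoder`/`decoder` are trained models matching the interfaces in this file.
candidates = beam_decode(eval_pairs[0], encoder, decoder, beam_width=5, n_output_sentences=3)
for rank, sentence in enumerate(candidates, start=1):
    print(rank, ":", sentence)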