def trainIters(encoder, decoder, n_epochs, validation_pairs, lang1, lang2, search, title,
               max_length_generation, print_every, val_every, learning_rate):
    start = time.time()
    count, print_loss_total = 0, 0
    encoder_optimizer = torch.optim.Adadelta(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adadelta(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss(ignore_index=PAD_token)  # this ignores the padded token.
    for epoch in range(n_epochs):
        for step, (sent1s, sent1_lengths, sent2s, sent2_lengths) in enumerate(train_loader):
            encoder.train()
            decoder.train()
            sent1_batch, sent2_batch = sent1s.to(device), sent2s.to(device)
            sent1_length_batch, sent2_length_batch = sent1_lengths.to(device), sent2_lengths.to(device)
            loss = train(sent1_batch, sent1_length_batch, sent2_batch, sent2_length_batch,
                         encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            count += 1
            if (step + 1) % print_every == 0:
                # lets train and plot at the same time.
                print_loss_avg = print_loss_total / count
                count = 0
                print_loss_total = 0
                print('TRAIN SCORE %s (%d %d%%) %.4f' % (timeSince(start, step / n_epochs),
                                                         step, step / n_epochs * 100, print_loss_avg))
                print("Memory allocated (MB): ", torch.cuda.memory_allocated(device) / 1e6)
            if (step + 1) % val_every == 0:
                with torch.no_grad():
                    bleu_score = test_model(encoder, decoder, search, validation_pairs, lang2,
                                            max_length=max_length_generation)  # returns the BLEU score
                print("VALIDATION BLEU SCORE: " + str(bleu_score))
                torch.save(encoder.state_dict(), "Attention_Vish_encoder_latest")
                torch.save(decoder.state_dict(), "Attention_Vish_decoder_latest")
            del sent1s, sent1_lengths, sent2s, sent2_lengths, sent1_batch, sent2_batch, \
                sent1_length_batch, sent2_length_batch
            gc.collect()
def trainIters(encoder, decoder, n_epochs, pairs, validation_pairs, lang1, lang2, search, title,
               max_length_generation, val_every=1000, print_every=1000, plot_every=1000, learning_rate=0.0001):
    """
    lang1 is the Lang object for language 1
    lang2 is the Lang object for language 2
    max_length_generation is the maximum length of the generated translations
    """
    start = time.time()
    #plot_losses, val_losses = [], []
    count, print_loss_total, plot_loss_total, val_loss_total, plot_val_loss = 0, 0, 0, 0, 0
    encoder_optimizer = torch.optim.Adadelta(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adadelta(decoder.parameters(), lr=learning_rate)
    #encoder_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, mode="min")
    #decoder_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(decoder_optimizer, mode="min")
    criterion = nn.NLLLoss(ignore_index=PAD_token)  # this ignores the padded token.
    for epoch in range(n_epochs):
        for step, (sent1s, sent1_lengths, sent2s, sent2_lengths) in enumerate(train_loader):
            encoder.train()
            decoder.train()
            sent1_batch, sent2_batch = sent1s.to(device), sent2s.to(device)
            sent1_length_batch, sent2_length_batch = sent1_lengths.to(device), sent2_lengths.to(device)
            loss = train(sent1_batch, sent1_length_batch, sent2_batch, sent2_length_batch,
                         encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            count += 1
            if (step + 1) % print_every == 0:
                # report the running average training loss
                print_loss_avg = print_loss_total / count
                count = 0
                print_loss_total = 0
                print('TRAIN SCORE %s (%d %d%%) %.4f' % (timeSince(start, step / n_epochs),
                                                         step, step / n_epochs * 100, print_loss_avg))
                print("Memory allocated (MB): ", torch.cuda.memory_allocated(device) / 1e6)
            if (step + 1) % val_every == 0:
                with torch.no_grad():
                    v_loss = test_model(encoder, decoder, search, validation_pairs, lang2,
                                        max_length=max_length_generation)  # returns the BLEU score
                print("VALIDATION BLEU SCORE: " + str(v_loss))
                #val_losses.append(v_loss.item())
                current_time = time.strftime("%Y-%m-%d-%H-%M-%S")
                torch.save(encoder.state_dict(), "Attention_Vish_encoder_" + current_time)
                torch.save(decoder.state_dict(), "Attention_Vish_decoder_" + current_time)
                #pickle.dump(val_losses, open("val_losses_1.2_2nd_train", "wb"))
            del sent1s, sent1_lengths, sent2s, sent2_lengths, sent1_batch, sent2_batch, \
                sent1_length_batch, sent2_length_batch
            gc.collect()
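# NOTE (assumption): none of the setup this variant relies on (the Lang objects, train_loader,
# device, PAD_token, or the train/test_model helpers) is defined in this file, so the call below
# is only an illustration of how the variant above might be driven. The model constructors and
# every hyperparameter value here are placeholders, not the ones actually used.
def example_run():
    encoder = EncoderRNN(input_lang.n_words, hidden_size=256).to(device)                   # hypothetical constructor
    decoder = AttnDecoderRNN(hidden_size=256, output_size=output_lang.n_words).to(device)  # hypothetical constructor
    trainIters(encoder, decoder,
               n_epochs=10,
               pairs=train_pairs,                # (source, target) training pairs
               validation_pairs=val_pairs,       # held-out pairs scored with BLEU by test_model
               lang1=input_lang, lang2=output_lang,
               search="beam",                    # decoding strategy passed through to test_model
               title="Attention_Vish",
               max_length_generation=50,
               val_every=1000, print_every=1000,
               learning_rate=0.0001)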
def trainIters(encoder, decoder, n_epochs, validation_pairs, lang1, lang2, search, title,
               max_length_generation, print_every, val_every, learning_rate):
    start = time.time()
    count, print_loss_total = 0, 0
    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=5 * learning_rate)
    criterion = nn.NLLLoss(ignore_index=PAD_token)  # this ignores the padded token.
    for epoch in range(n_epochs):
        for step, (sent1s, sent1_lengths, sent2s, sent2_lengths) in enumerate(train_loader):
            encoder.train()
            decoder.train()
            sent1_batch, sent2_batch = sent1s.to(device), sent2s.to(device)
            sent1_length_batch, sent2_length_batch = sent1_lengths.to(device), sent2_lengths.to(device)
            loss = train(sent1_batch, sent1_length_batch, sent2_batch, sent2_length_batch,
                         encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            count += 1
            if (step + 1) % print_every == 0:
                # lets train and plot at the same time.
                print_loss_avg = print_loss_total / count
                count = 0
                print_loss_total = 0
                print('TRAIN SCORE %s (%d %d%%) %.4f' % (timeSince(start, step / n_epochs),
                                                         step, step / n_epochs * 100, print_loss_avg))
                print("Memory allocated (MB): ", torch.cuda.memory_allocated(device) / 1e6)
            if (step + 1) % val_every == 0:
                with torch.no_grad():
                    bleu_score = test_model(encoder, decoder, search, validation_pairs, lang2,
                                            max_length=max_length_generation)  # returns the BLEU score
                print("VALIDATION BLEU SCORE: " + str(bleu_score))
                torch.save(encoder.state_dict(), "Attention_Vish_encoder_latest")
                torch.save(decoder.state_dict(), "Attention_Vish_decoder_latest")
            del sent1s, sent1_lengths, sent2s, sent2_lengths, sent1_batch, sent2_batch, \
                sent1_length_batch, sent2_length_batch
            gc.collect()
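# NOTE (assumption): the batched loops above call a train(...) helper that is not shown in this
# file. The sketch below reconstructs what such a step plausibly does, based on the variant further
# down that inlines the same logic (zero_grad -> encode -> teacher-forced decode -> backward ->
# optimizer step); it is given a distinct name and the real helper may differ in details such as
# gradient clipping.
def train_step_sketch(sent1_batch, sent1_length_batch, sent2_batch, sent2_length_batch,
                      encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    # Encode the padded source batch.
    encoder_outputs, encoder_hidden = encoder(sent1_batch, sent1_length_batch)
    # Start every target sequence with SOS and reuse the encoder state as the initial decoder state.
    decoder_input = torch.LongTensor([SOS_token] * sent1_batch.size(0)).view(-1, 1).to(device)
    decoder_hidden = encoder_hidden
    max_trg_len = int(max(sent2_length_batch))
    loss = 0
    # Teacher forcing: feed the ground-truth token as the next decoder input at every step.
    for t in range(max_trg_len):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, sent2_batch[:, t])
        decoder_input = sent2_batch[:, t]
    loss = loss / max_trg_len  # length-normalised NLL; padding is ignored via ignore_index
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item()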
def trainIters(encoder, decoder, n_epochs, validation_pairs, pairs, lang1, lang2, max_length,
               max_length_generation, title, print_every=5000, plot_every=5000, learning_rate=3e-4, search="beam"):
    """
    lang1 is the Lang object for language 1
    lang2 is the Lang object for language 2
    n_iters is the number of training pairs per epoch you want to train on
    """
    start = time.time()
    training_pairs = pairs
    n_iters = len(pairs)
    plot_losses, val_losses = [], []
    count, print_loss_total, plot_loss_total, val_loss_total, plot_val_loss = 0, 0, 0, 0, 0
    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss(ignore_index=PAD_token)  # framing it as a categorical loss function.
    for i in range(n_epochs):
        plot_loss = []
        val_loss = []
        for iter in range(1, n_iters + 1):
            training_pair = training_pairs[iter - 1]
            input_tensor = training_pair[0]
            target_tensor = training_pair[1]
            input_length = input_tensor.size(0)
            if target_tensor.size(0) < 3:
                continue
            loss_value, count = train(input_tensor, target_tensor, encoder, decoder,
                                      encoder_optimizer, decoder_optimizer, criterion, max_length, count)
            print_loss_total += loss_value
            plot_loss_total += loss_value
            if iter % print_every == 0:
                print_loss_avg = print_loss_total / count
                count = 0
                print_loss_total = 0
                print('TRAIN SCORE %s (%d %d%%) %.4f' % (timeSince(start, iter / n_epochs),
                                                         iter, iter / n_epochs * 100, print_loss_avg))
                plot_loss.append(print_loss_avg)
                plot_loss_total = 0
                with torch.no_grad():
                    v_loss = test_model(encoder, decoder, search, validation_pairs, lang2,
                                        max_length=None)  # returns the BLEU score
                print("VALIDATION BLEU SCORE: " + str(v_loss))
                val_loss.append(v_loss)
                save_model(encoder, decoder, title)
        plot_losses.append(plot_loss)
        val_losses.append(val_loss)
    save_model(encoder, decoder, title)
    make_graph(encoder, decoder, val_losses, plot_losses, title)
def trainIters(encoder, decoder, n_epochs, pairs, validation_pairs, lang1, lang2, search, title,
               max_length_generation, print_every=1000, plot_every=1000, learning_rate=0.0001):
    """
    lang1 is the Lang object for language 1
    lang2 is the Lang object for language 2
    max_length_generation is the maximum length of the generated translations
    """
    start = time.time()
    plot_losses = []
    val_losses = []
    count = 0
    print_loss_total = 0  # reset every print_every
    plot_loss_total = 0   # reset every plot_every
    val_loss_total = 0
    plot_val_loss = 0
    encoder_optimizer = torch.optim.Adadelta(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adadelta(decoder.parameters(), lr=learning_rate)
    #encoder_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, mode="min")
    #decoder_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(decoder_optimizer, mode="min")
    criterion = nn.NLLLoss(ignore_index=PAD_token)  # this ignores the padded token.
    for epoch in range(n_epochs):
        plot_loss = []
        val_loss = []
        for step, (sent1s, sent1_lengths, sent2s, sent2_lengths) in enumerate(train_loader):
            encoder.train()  # put the modules into training mode (enables dropout etc.)
            decoder.train()
            sent1_batch, sent2_batch = sent1s.to(device), sent2s.to(device)
            sent1_length_batch, sent2_length_batch = sent1_lengths.to(device), sent2_lengths.to(device)
            loss, output_translations, count = train(sent1_batch, sent1_length_batch, encoder, decoder,
                                                     encoder_optimizer, decoder_optimizer,
                                                     sent2_batch, sent2_length_batch, criterion, count)
            # Debug: look at the first output translation in the batch and compare it to the
            # reference, to see what the decoder is actually producing.
            # (Check whether an SOS token shows up here as well; we also still have to mask
            # everything after the EOS tag.)
            i = 0
            output = output_translations[i]
            translated = []
            answer = []
            for j in range(len(output)):
                token = torch.argmax(output[j][0])  # index of the predicted word
                translated.append(lang2.index2word[token.squeeze().item()])
                answer.append(lang2.index2word[sent2_batch[i][j].squeeze().item()])
            print(answer)
            print("translated prediction")
            print(translated)
            print_loss_total += loss
            plot_loss_total += loss
            if (step + 1) % print_every == 0:
                # report the training loss and run validation at the same time
                print_loss_avg = print_loss_total / count
                count = 0
                print_loss_total = 0
                print('TRAIN SCORE %s (%d %d%%) %.4f' % (timeSince(start, step / n_epochs),
                                                         step, step / n_epochs * 100, print_loss_avg))
                with torch.no_grad():
                    v_loss = test_model(encoder, decoder, search, validation_pairs, lang2,
                                        max_length=max_length_generation)  # returns the BLEU score
                print("VALIDATION BLEU SCORE: " + str(v_loss))
                val_loss.append(v_loss)
                plot_loss.append(print_loss_avg)
                # save a checkpoint every time we hit a reporting step
                save_model(encoder, decoder, title)
                sys.stdout.flush()
                plot_loss_total = 0
        plot_losses.append(plot_loss)
        val_losses.append(val_loss)
        print("AVERAGE PLOT LOSS")
        print(np.mean(plot_loss))
        sys.stdout.flush()
        #encoder_scheduler.step(np.mean(plot_loss))  # this isn't really doing anything.
        #decoder_scheduler.step(np.mean(plot_loss))
        save_model(encoder, decoder, title)
        make_graph(encoder, decoder, val_losses, plot_losses, title)
    assert len(val_losses) == len(plot_losses)
    save_model(encoder, decoder, title)
    make_graph(encoder, decoder, val_losses, plot_losses, title)
def trainIters(encoder, decoder, n_epochs, pairs, validation_pairs, lang1, lang2, search, title,
               max_length_generation, print_every=1000, plot_every=1000, learning_rate=0.0001):
    """
    lang1 is the Lang object for language 1
    lang2 is the Lang object for language 2
    max_length_generation is the maximum length of the generated translations
    """
    start = time.time()
    plot_losses, val_losses = [], []
    count, print_loss_total, plot_loss_total, val_loss_total, plot_val_loss = 0, 0, 0, 0, 0
    encoder_optimizer = torch.optim.Adadelta(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adadelta(decoder.parameters(), lr=learning_rate)
    #encoder_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, mode="min")
    #decoder_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(decoder_optimizer, mode="min")
    criterion = nn.NLLLoss(ignore_index=PAD_token)  # this ignores the padded token.
    for epoch in range(n_epochs):
        plot_loss = []
        val_loss = []
        for step, (sent1s, sent1_lengths, sent2s, sent2_lengths) in enumerate(train_loader):
            encoder.train()
            decoder.train()
            sent1_batch, sent2_batch = sent1s.to(device), sent2s.to(device)
            sent1_length_batch, sent2_length_batch = sent1_lengths.to(device), sent2_lengths.to(device)
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            encoder_outputs, encoder_hidden = encoder(sent1_batch, sent1_length_batch)
            # encoder_outputs is (batch, src_len, hidden), e.g. 32 x 72 x 256
            # encoder_hidden is (1, batch, hidden), e.g. 1 x 32 x 256
            decoder_input = torch.LongTensor([SOS_token] * BATCH_SIZE).view(-1, 1).to(device)
            decoder_hidden = encoder_hidden
            # decoder_input is (batch, 1); decoder_hidden is (1, batch, hidden)
            max_trg_len = max(sent2_lengths)
            loss = 0
            # Run through the decoder one time step at a time with teacher forcing (ratio = 1.0).
            for t in range(max_trg_len):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
                # decoder_output is (batch, vocab_size); sent2_batch is (batch, trg_len)
                loss += criterion(decoder_output, sent2_batch[:, t])
                decoder_input = sent2_batch[:, t]  # feed the ground-truth token as the next input
            loss = loss / max_trg_len.float()
            print_loss_total += loss.item()  # .item() so the running total does not hold on to the graph
            count += 1
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()
            if (step + 1) % print_every == 0:
                # lets train and plot at the same time.
                print_loss_avg = print_loss_total / count
                count = 0
                print_loss_total = 0
                print('TRAIN SCORE %s (%d %d%%) %.4f' % (timeSince(start, step / n_epochs),
                                                         step, step / n_epochs * 100, print_loss_avg))
                # 42s
                # v_loss = test_model(encoder, decoder, search, validation_pairs, lang2,
                #                     max_length=max_length_generation)  # returns the BLEU score
                # print("VALIDATION BLEU SCORE: " + str(v_loss))
                # val_loss.append(v_loss)
                plot_loss.append(print_loss_avg)
                plot_loss_total = 0
    save_model(encoder, decoder, val_losses, plot_losses, title)
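# NOTE (assumption): the inner loop above always feeds the ground-truth token back into the decoder
# (teacher forcing ratio 1.0). The sketch below shows a common variant that mixes in the model's own
# greedy predictions; this helper and `teacher_forcing_ratio` are hypothetical and not part of the
# original code, but the decoder signature and the length-normalised loss are the ones used above.
import random

def decode_with_teacher_forcing_ratio(decoder, decoder_input, decoder_hidden, encoder_outputs,
                                      sent2_batch, max_trg_len, criterion,
                                      teacher_forcing_ratio=0.5):
    loss = 0
    for t in range(max_trg_len):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, sent2_batch[:, t])
        if random.random() < teacher_forcing_ratio:
            decoder_input = sent2_batch[:, t]             # feed the reference token (teacher forcing)
        else:
            decoder_input = decoder_output.argmax(dim=1)  # feed the model's own prediction
    return loss / max_trg_len  # length-normalised loss, as in the loop above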