def translate():
    """Translate the French test set with a trained model and save the output.

    Steps:
      1. Rebuild the training vocabularies and restore the weights stored in
         ``"seq2seq_" + attention_type + ".pth"``.
      2. Load the test corpora, reusing the training vocabularies.
      3. Plot the attention map for the first three test sentences.
      4. Translate every test pair, writing source / reference / hypothesis
         triples to ``translated.txt`` and one hypothesis per line to
         ``pred.en.txt`` (presumably consumed by a BLEU scorer -- confirm).

    Reads the module-level ``attention_type``, ``embedding_dim``,
    ``hidden_dim`` and ``device`` settings.
    """
    sos_idx = Language.SOS_TOKEN_IDX
    eos_idx = Language.EOS_TOKEN_IDX

    french_train = Language(path='data/train.fr.txt')
    english_train = Language(path='data/train.en.txt')
    french_train.build_vocab()
    english_train.build_vocab()

    model = Seq2Seq(french_train,
                    english_train,
                    attention_type=attention_type,
                    embedding_dim=embedding_dim,
                    hidden_dim=hidden_dim).to(device)
    model.load_state_dict(
        torch.load("seq2seq_" + attention_type + ".pth", map_location=device))

    # The test corpora must share the training vocabularies so that token
    # indices line up with the trained embeddings.
    french_test = Language(path='data/test.fr.txt')
    english_test = Language(path='data/test.en.txt')
    french_test.set_vocab(french_train.word2idx, french_train.idx2word)
    english_test.set_vocab(english_train.word2idx, english_train.idx2word)
    dataset = NmtDataset(src=french_test, trg=english_test)

    # Pure inference from here on: no gradients are needed, so skip building
    # the autograd graph.
    with torch.no_grad():
        samples = [dataset[0][0], dataset[1][0], dataset[2][0]]  # You may choose your own samples to plot
        for i, french in enumerate(samples):
            translated, attention = model.translate(
                torch.Tensor(french).to(dtype=torch.long, device=device))
            source_text = [french_train.idx2word[idx] for idx in french]
            translated_text = [
                english_train.idx2word[idx] for idx in translated
            ]
            plot_attention(attention.cpu().detach(),
                           translated_text,
                           source_text,
                           name=attention_type + '_' + str(i))

        # Context managers guarantee both files are flushed and closed even
        # if translation fails part-way through the test set.
        with open('translated.txt', mode='w', encoding='utf-8') as f, \
                open('pred.en.txt', mode='w', encoding='utf-8') as f_bleu:
            for french, english in tqdm(dataset, desc='Translated'):
                translated, attention = model.translate(
                    torch.Tensor(french).to(dtype=torch.long, device=device))
                source_text = [french_train.idx2word[idx] for idx in french]
                # Strip the sentence delimiters from the reference ...
                target_text = [
                    english_train.idx2word[idx] for idx in english
                    if idx != sos_idx and idx != eos_idx
                ]
                # ... and only the end marker from the hypothesis.
                translated_text = [
                    english_train.idx2word[idx] for idx in translated
                    if idx != eos_idx
                ]

                f.write('French    : ' + ' '.join(source_text) + '\n')
                f.write('English   : ' + ' '.join(target_text) + '\n')
                f.write('Translated: ' + ' '.join(translated_text) + '\n\n')
                f_bleu.write(' '.join(translated_text) + '\n')
def test_initializer_and_forward():
    """Sanity-check the Seq2Seq initializer, forward loss and gradients.

    Three checks are run in order against stored reference values:
      1. ``sanity_check.pth`` loads into a freshly built dot-attention model.
      2. The loss on the batch made of training pairs 0-7 matches the
         reference loss.
      3. After ``backward()``, the top-left 3x3 slice of
         ``model.encoder.weight_ih_l0.grad`` matches the reference gradient.

    Raises:
        Exception: whatever loading the reference state dict raised; it is
            re-raised after a hint has been printed.
        AssertionError: if the loss or the gradient differs from the
            reference by more than ``1e-7``.
    """
    print("=====Model Initializer & Forward Test Case======")
    french = Language(path='data/train.fr.txt')
    english = Language(path='data/train.en.txt')
    french.build_vocab()
    english.build_vocab()
    dataset = NmtDataset(src=french, trg=english)

    model = Seq2Seq(french, english, attention_type='dot')

    # the first test
    try:
        model.load_state_dict(
            torch.load("sanity_check.pth", map_location='cpu'))
    except Exception:
        print(
            "Your model initializer is wrong. Check the handout and comments in details and implement the model precisely."
        )
        # Bare raise keeps the original traceback intact.
        raise
    print("The first test passed!")

    # A single fixed batch (the first eight pairs) keeps the reference loss
    # reproducible; no length statistics are needed for that.
    batch_indices = [[0, 1, 2, 3, 4, 5, 6, 7]]
    dataloader = torch.utils.data.dataloader.DataLoader(
        dataset,
        collate_fn=collate_fn,
        num_workers=0,
        batch_sampler=batch_indices)
    batch = next(iter(dataloader))
    loss = model(batch[0], batch[1])

    # the second test
    assert loss.detach().allclose(torch.tensor(3.03703070), atol=1e-7), \
        "Loss of the model does not match expected result."
    print("The second test passed!")

    loss.backward()

    # the third test
    expected_grad = torch.Tensor(
        [[-8.29117271e-05, -4.44278521e-05, -2.64967621e-05],
         [-3.89243884e-04, -1.29778590e-03, -4.56827343e-04],
         [-2.76966626e-03, -1.00148167e-03, -6.68873254e-05]])
    assert model.encoder.weight_ih_l0.grad[:3, :3].allclose(expected_grad, atol=1e-7), \
        "Gradient of the model does not match expected result."
    print("The third test passed!")

    print("All 3 tests passed!")
def train():
    """Train a Seq2Seq model on the French-English training set and save it.

    Builds both vocabularies from the training corpora, trains for 200 epochs
    with Adam and a teacher-forcing ratio of 0.5, and finally writes the
    weights to ``"seq2seq_" + attention_type + ".pth"``.

    Reads the module-level ``bucketing``, ``attention_type``,
    ``embedding_dim``, ``hidden_dim`` and ``device`` settings. When
    ``bucketing`` is truthy, batches come from ``bucketed_batch_indices``;
    otherwise the data loader shuffles and batches on its own.
    """
    max_epoch = 200
    batch_size = 256
    max_pad_len = 5

    french = Language(path='data/train.fr.txt')
    english = Language(path='data/train.en.txt')
    french.build_vocab()
    english.build_vocab()
    dataset = NmtDataset(src=french, trg=english)

    # The length scan walks the whole dataset, so only pay for it when the
    # bucketed sampler actually needs it.
    if bucketing:
        sentence_length = [(len(src), len(trg)) for src, trg in dataset]
        batch_sampler = bucketed_batch_indices(sentence_length,
                                               batch_size=batch_size,
                                               max_pad_len=max_pad_len)
    else:
        batch_sampler = None

    model = Seq2Seq(french,
                    english,
                    attention_type=attention_type,
                    embedding_dim=embedding_dim,
                    hidden_dim=hidden_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters())

    # With an explicit batch_sampler the loader's own batch_size is set to 1
    # and shuffling is turned off here.
    dataloader = torch.utils.data.dataloader.DataLoader(
        dataset,
        collate_fn=collate_fn,
        num_workers=2,
        batch_size=1 if bucketing else batch_size,
        batch_sampler=batch_sampler,
        shuffle=not bucketing)

    # Description-only bar used as a live loss read-out; the context manager
    # closes it once training is over.
    with tqdm(total=0, bar_format='{desc}', position=2) as loss_log:
        for epoch in trange(max_epoch, desc="Epoch", position=0):
            for src_sentence, trg_sentence in tqdm(dataloader,
                                                   desc="Iteration",
                                                   position=1):
                optimizer.zero_grad()
                src_sentence = src_sentence.to(device)
                trg_sentence = trg_sentence.to(device)
                loss = model(src_sentence, trg_sentence, teacher_force=0.5)
                loss.backward()
                optimizer.step()

                # Format a plain Python float rather than the tensor itself.
                des = 'Loss per a non-<PAD> Word: {:06.4f}'.format(loss.item())
                loss_log.set_description_str(des)

    torch.save(model.state_dict(), "seq2seq_" + attention_type + ".pth")
english_train = Language(path='data/train.en.txt') french_train.build_vocab() english_train.build_vocab() model = Seq2Seq(french_train, english_train, attention_type=attention_type, embedding_dim=embedding_dim, hidden_dim=hidden_dim).to(device) model.load_state_dict( torch.load("seq2seq_" + attention_type + ".pth", map_location=device)) french_test = Language(path='data/test.fr.txt') english_test = Language(path='data/test.en.txt') french_test.set_vocab(french_train.word2idx, french_train.idx2word) english_test.set_vocab(english_train.word2idx, english_train.idx2word) dataset = NmtDataset(src=french_test, trg=english_test) samples = [dataset[0][0], dataset[1][0], dataset[2][0]] # You may choose your own samples to plot for i, french in enumerate(samples): translated, attention = model.translate( torch.Tensor(french).to(dtype=torch.long, device=device)) source_text = [french_train.idx2word[idx] for idx in french] translated_text = [english_train.idx2word[idx] for idx in translated] plot_attention(attention.cpu().detach(), translated_text, source_text, name=attention_type + '_' + str(i)) f = open('translated.txt', mode='w', encoding='utf-8')