Example #1
def test_translation():
    print("=====Translation Test Case======")
    french = Language(path='data/train.fr.txt')
    english = Language(path='data/train.en.txt')
    french.build_vocab()
    english.build_vocab()

    model = Seq2Seq(french, english, attention_type='dot')
    model.load_state_dict(torch.load("sanity_check.pth", map_location='cpu'))

    sentence = torch.Tensor([4, 6, 40, 41, 42, 43, 44, 13]).to(torch.long)
    translated, distributions = model.translate(sentence)

    # the first test
    assert translated.tolist() == [4, 16, 9, 56, 114, 51, 1, 14, 3], \
        "Your translation does not math expected result."
    print("The first test passed!")

    # the second test
    expected_dist = torch.Tensor(
        [[9.98170257e-01, 1.74237683e-03, 7.48323873e-05],
         [1.94309454e-03, 9.82858062e-01, 4.87918453e-03],
         [2.26807110e-02, 7.29433298e-02, 3.17393959e-01]])
    assert distributions[:3, :3].allclose(expected_dist, atol=1e-7), \
        "Your attetion distribution does not math expected result."
    print("The second test passed!")

    # the third test
    sentence = torch.Tensor([4, 6, 40, 41, 42, 43, 44, 13]).to(torch.long)
    translated, _ = model.translate(sentence, max_len=4)
    assert translated.tolist() == [4, 16, 9, 56], \
        "max_len parameter dose not work properly."
    print("The third test passed!")

    print("All 3 tests passed!")
Example #2
def translate():
    SOS = Language.SOS_TOKEN_IDX
    EOS = Language.EOS_TOKEN_IDX

    french_train = Language(path='data/train.fr.txt')
    english_train = Language(path='data/train.en.txt')
    french_train.build_vocab()
    english_train.build_vocab()
    model = Seq2Seq(french_train,
                    english_train,
                    attention_type=attention_type,
                    embedding_dim=embedding_dim,
                    hidden_dim=hidden_dim).to(device)
    model.load_state_dict(
        torch.load("seq2seq_" + attention_type + ".pth", map_location=device))

    french_test = Language(path='data/test.fr.txt')
    english_test = Language(path='data/test.en.txt')
    french_test.set_vocab(french_train.word2idx, french_train.idx2word)
    english_test.set_vocab(english_train.word2idx, english_train.idx2word)
    dataset = NmtDataset(src=french_test, trg=english_test)

    samples = [dataset[0][0], dataset[1][0],
               dataset[2][0]]  # You may choose your own samples to plot

    for i, french in enumerate(samples):
        translated, attention = model.translate(
            torch.Tensor(french).to(dtype=torch.long, device=device))
        source_text = [french_train.idx2word[idx] for idx in french]
        translated_text = [english_train.idx2word[idx] for idx in translated]
        plot_attention(attention.cpu().detach(),
                       translated_text,
                       source_text,
                       name=attention_type + '_' + str(i))

    with open('translated.txt', mode='w', encoding='utf-8') as f, \
            open('pred.en.txt', mode='w', encoding='utf-8') as f_bleu:
        for french, english in tqdm(dataset, desc='Translated'):
            translated, _ = model.translate(
                torch.Tensor(french).to(dtype=torch.long, device=device))
            source_text = [french_train.idx2word[idx] for idx in french]
            target_text = [
                english_train.idx2word[idx] for idx in english
                if idx != SOS and idx != EOS
            ]
            translated_text = [
                english_train.idx2word[idx] for idx in translated if idx != EOS
            ]

            f.write('French    : ' + ' '.join(source_text) + '\n')
            f.write('English   : ' + ' '.join(target_text) + '\n')
            f.write('Translated: ' + ' '.join(translated_text) + '\n\n')
            f_bleu.write(' '.join(translated_text) + '\n')
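
Since translate() writes one hypothesis per line to pred.en.txt, corpus-level BLEU can be computed against the reference file afterwards. A small scoring sketch, assuming sacrebleu is the scorer (the course may instead use a script such as multi-bleu.perl):

import sacrebleu

with open('pred.en.txt', encoding='utf-8') as f:
    hypotheses = [line.strip() for line in f]
with open('data/test.en.txt', encoding='utf-8') as f:
    references = [line.strip() for line in f]

# corpus_bleu expects a list of hypothesis strings and a list of
# reference streams (one stream per reference set).
bleu = sacrebleu.corpus_bleu(hypotheses, [references])
print(bleu.score)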
Example #3
def test_initializer_and_forward():
    print("=====Model Initializer & Forward Test Case======")
    french = Language(path='data/train.fr.txt')
    english = Language(path='data/train.en.txt')
    french.build_vocab()
    english.build_vocab()
    dataset = NmtDataset(src=french, trg=english)

    model = Seq2Seq(french, english, attention_type='dot')

    # the first test
    try:
        model.load_state_dict(
            torch.load("sanity_check.pth", map_location='cpu'))
    except Exception as e:
        print(
            "Your model initializer is wrong. Check the handout and comments in details and implement the model precisely."
        )
        raise e
    print("The first test passed!")

    batch_size = 8
    max_pad_len = 5
    sentence_length = list(
        map(lambda pair: (len(pair[0]), len(pair[1])), dataset))

    batch_indices = [[0, 1, 2, 3, 4, 5, 6, 7]]
    dataloader = torch.utils.data.DataLoader(
        dataset,
        collate_fn=collate_fn,
        num_workers=0,
        batch_sampler=batch_indices)
    batch = next(iter(dataloader))
    loss = model(batch[0], batch[1])

    # the second test
    assert loss.detach().allclose(torch.tensor(3.03703070), atol=1e-7), \
        "Loss of the model does not match expected result."
    print("The second test passed!")

    loss.backward()

    # the third test
    expected_grad = torch.Tensor(
        [[-8.29117271e-05, -4.44278521e-05, -2.64967621e-05],
         [-3.89243884e-04, -1.29778590e-03, -4.56827343e-04],
         [-2.76966626e-03, -1.00148167e-03, -6.68873254e-05]])
    assert model.encoder.weight_ih_l0.grad[:3, :3].allclose(expected_grad, atol=1e-7), \
        "Gradient of the model does not match expected result."
    print("The third test passed!")

    print("All 3 tests passed!")
Example #4
def train():
    max_epoch = 200
    batch_size = 256

    french = Language(path='data/train.fr.txt')
    english = Language(path='data/train.en.txt')
    french.build_vocab()
    english.build_vocab()
    dataset = NmtDataset(src=french, trg=english)

    max_pad_len = 5
    sentence_length = list(
        map(lambda pair: (len(pair[0]), len(pair[1])), dataset))
    batch_sampler = bucketed_batch_indices(
        sentence_length, batch_size=batch_size,
        max_pad_len=max_pad_len) if bucketing else None

    model = Seq2Seq(french,
                    english,
                    attention_type=attention_type,
                    embedding_dim=embedding_dim,
                    hidden_dim=hidden_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters())
    dataloader = torch.utils.data.DataLoader(
        dataset,
        collate_fn=collate_fn,
        num_workers=2,
        # batch_size and shuffle must keep their defaults (1, False) when a
        # batch_sampler is given, so they are only set without bucketing.
        batch_size=1 if bucketing else batch_size,
        batch_sampler=batch_sampler,
        shuffle=not bucketing)

    loss_log = tqdm(total=0, bar_format='{desc}', position=2)
    for epoch in trange(max_epoch, desc="Epoch", position=0):
        for src_sentence, trg_sentence in tqdm(dataloader,
                                               desc="Iteration",
                                               position=1):
            optimizer.zero_grad()
            src_sentence, trg_sentence = src_sentence.to(
                device), trg_sentence.to(device)
            loss = model(src_sentence, trg_sentence, teacher_force=0.5)
            loss.backward()
            optimizer.step()

            des = 'Loss per non-<PAD> word: {:06.4f}'.format(loss.item())
            loss_log.set_description_str(des)

    torch.save(model.state_dict(), "seq2seq_" + attention_type + ".pth")
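
bucketed_batch_indices, used above, groups examples of similar length so that each batch wastes at most max_pad_len positions on padding per side. One common implementation sketch, assuming random shuffling within and across buckets (the real helper may differ in details):

import random
from collections import defaultdict

def bucketed_batch_indices_sketch(sentence_length, batch_size, max_pad_len):
    # Bucket example indices by coarse (src, trg) length, so lengths
    # inside a bucket differ by less than max_pad_len.
    buckets = defaultdict(list)
    for idx, (src_len, trg_len) in enumerate(sentence_length):
        buckets[(src_len // max_pad_len, trg_len // max_pad_len)].append(idx)
    batches = []
    for indices in buckets.values():
        random.shuffle(indices)
        batches.extend(indices[i:i + batch_size]
                       for i in range(0, len(indices), batch_size))
    random.shuffle(batches)  # shuffle batch order across buckets
    return batches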
Example #5
import torch
from dataset import Language, NmtDataset
from model import Seq2Seq
from run import plot_attention

attention_type = 'concat'  # 'dot' or 'concat'
embedding_dim = 128
hidden_dim = 64
bucketing = True
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

if __name__ == "__main__":
    french_train = Language(path='data/train.fr.txt')
    english_train = Language(path='data/train.en.txt')
    french_train.build_vocab()
    english_train.build_vocab()
    model = Seq2Seq(french_train,
                    english_train,
                    attention_type=attention_type,
                    embedding_dim=embedding_dim,
                    hidden_dim=hidden_dim).to(device)
    model.load_state_dict(
        torch.load("seq2seq_" + attention_type + ".pth", map_location=device))

    french_test = Language(path='data/test.fr.txt')
    english_test = Language(path='data/test.en.txt')
    french_test.set_vocab(french_train.word2idx, french_train.idx2word)
    english_test.set_vocab(english_train.word2idx, english_train.idx2word)
    dataset = NmtDataset(src=french_test, trg=english_test)

    samples = [dataset[0][0], dataset[1][0],
               dataset[2][0]]  # You may choose your own samples to plot
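
plot_attention, imported from run above, is presumably a small matplotlib heatmap helper. A sketch of what such a function typically looks like, assuming it saves a figure named after the attention type and sample index (the course version may differ):

import matplotlib.pyplot as plt

def plot_attention_sketch(attention, translated_text, source_text, name):
    # attention: (trg_len, src_len) matrix of attention weights.
    fig, ax = plt.subplots()
    ax.imshow(attention.numpy(), cmap='viridis')
    ax.set_xticks(range(len(source_text)))
    ax.set_xticklabels(source_text, rotation=90)
    ax.set_yticks(range(len(translated_text)))
    ax.set_yticklabels(translated_text)
    fig.savefig(name + '.png', bbox_inches='tight')
    plt.close(fig)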