'''
# Summed NLL loss that ignores padding tokens, plus separate Adam optimizers
# for the encoder and the decoder.
loss_function = nn.NLLLoss(reduction = 'sum', ignore_index = de_vocab.item2index['_PAD_'])
en_optimizer = optim.Adam(encoder.parameters(), lr = 1e-3, weight_decay = 0)
de_optimizer = optim.Adam(decoder.parameters(), lr = 1e-3, weight_decay = 0)

# Move the models, the mask helper and the loss to the GPU when available.
if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    ones_matrix = ones_matrix.cuda()
    loss_function = loss_function.cuda()

# Training loop: iterate over mini-batches of (source, target) pairs.
for epoch in range(20):
    pl.reset()
    encoder.train()
    decoder.train()
    total_loss = torch.Tensor([0])
    total_token = 0
    for batch_idx, (en_seq, en_seq_len, de_seq, de_seq_len) in enumerate(pl.gen_pairs(batch_size)):
        en_optimizer.zero_grad()
        de_optimizer.zero_grad()
        # Wrap the padded index batches as LongTensors (and move them to the GPU).
        en_seq = torch.LongTensor(en_seq)
        de_seq = torch.LongTensor(de_seq)
        if use_cuda:
            en_seq = en_seq.cuda()
            de_seq = de_seq.cuda()
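# ---------------------------------------------------------------------------
# Hedged sketch (not from the original listing): the training step above is cut
# off before the forward/backward pass, but the accumulators it sets up
# (total_loss, total_token) together with NLLLoss(reduction='sum',
# ignore_index=PAD) suggest the usual per-token loss accounting. All names
# below (PAD_IDX, VOCAB, log_probs, targets) are placeholders, not identifiers
# from the original code.
import torch
import torch.nn as nn

PAD_IDX, VOCAB = 0, 100                       # stand-ins for the real pad id / vocab size
criterion = nn.NLLLoss(reduction='sum', ignore_index=PAD_IDX)

# Fake batch of decoder log-probabilities and padded targets: (batch, time, vocab).
log_probs = torch.log_softmax(torch.randn(4, 7, VOCAB), dim=-1)
targets = torch.randint(1, VOCAB, (4, 7))
targets[:, 5:] = PAD_IDX                      # pretend the tail positions are padding

# NLLLoss expects (N, C) inputs and (N,) targets, so flatten batch and time.
loss = criterion(log_probs.reshape(-1, VOCAB), targets.reshape(-1))
n_tokens = (targets != PAD_IDX).sum().item()  # ignore_index already excludes pads from the sum

total_loss = loss.detach().cpu()
total_token = n_tokens
print('per-token NLL:', total_loss.item() / total_token)
# exp() of the per-token NLL gives the per-token perplexity.
# ---------------------------------------------------------------------------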
# Wrap the padded index sequences as LongTensors.
tst_tgt_t = torch.LongTensor(tst_tgt_p)
trn_src_t = torch.LongTensor(trn_src_p)
trn_tgt_t = torch.LongTensor(trn_tgt_p)

# Build the encoder and decoder and move them to the GPU.
enc = Encoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad])
dec = Decoder(len(vocab), 100, 100, 2, 'cuda', vocab[pad], vocab[sos], vocab[eos], vocab[unk])
enc.to('cuda')
dec.to('cuda')

# One Adam optimizer per module.
opt_enc = torch.optim.Adam(enc.parameters())
opt_dec = torch.optim.Adam(dec.parameters())

n_batch = len(trn_src_p) // batch_size
for e in range(epochs):
    enc.train()
    dec.train()
    epoch_loss = 0
    for i in range(n_batch):
        opt_enc.zero_grad()
        opt_dec.zero_grad()
        # Encode the source batch, then run the decoder with teacher forcing
        # conditioned on the final encoder hidden state.
        lengths = torch.LongTensor(l_trn_src[batch_size * i:batch_size * (i + 1)])
        out, h_n = enc(trn_src_t[batch_size * i:batch_size * (i + 1)], lengths)
        output = dec.teacher_force(
            trn_tgt_t[batch_size * i:batch_size * (i + 1)].reshape(
                [batch_size, tgt_max, 1]),
            h_n,
            torch.LongTensor(l_trn_tgt[batch_size * i:batch_size * (i + 1)]))
        # Accumulate the loss per sequence in the batch.
        loss = 0
        for o, l, t in zip(output, l_trn_tgt[batch_size * i:batch_size * (i + 1)],