import time

import numpy as np
import torch
from torch.nn.utils.rnn import pack_padded_sequence

# Train only the encoder's final linear layer and the full decoder;
# the pretrained CNN backbone stays frozen.
params = list(cnn.linear.parameters()) + list(rnn.parameters())
optimizer = torch.optim.Adam(params, lr=1e-3)

for epoch in range(num_epochs):
    tic = time.time()
    for i, (image, captions, lengths) in enumerate(dataset_loader):
        image = image.to(device)
        captions = captions.to(device)
        # Flatten the padded captions into the packed data tensor so the
        # loss is computed only over real (non-padding) tokens.
        targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

        cnn.zero_grad()
        rnn.zero_grad()
        cnn_out = cnn(image)                        # image feature vectors
        lstm_out = rnn(cnn_out, captions, lengths)  # packed per-token logits
        loss = criterion(lstm_out, targets)
        loss.backward()
        optimizer.step()

        if i % 1000 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                  .format(epoch, num_epochs, i, len(dataset_loader),
                          loss.item(), np.exp(loss.item())))
    toc = time.time()
    print('epoch %d time %.2f mins' % (epoch, (toc - tic) / 60))
    # Checkpoint both networks; the evaluation code below loads both files.
    torch.save(cnn.state_dict(), 'cnn.pkl')
    torch.save(rnn.state_dict(), 'rnn.pkl')
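The `[0]` at the end of the `pack_padded_sequence` call extracts the flat `data` tensor of the returned `PackedSequence`: the non-padding tokens of the whole batch, concatenated time step by time step. This matches the layout of the decoder's packed output logits, which is why the cross-entropy targets line up. A small standalone example of that layout:

import torch
from torch.nn.utils.rnn import pack_padded_sequence

# Two captions padded to length 4; true lengths are 4 and 2, sorted in
# decreasing order as pack_padded_sequence requires by default.
captions = torch.tensor([[1, 2, 3, 4],
                         [5, 6, 0, 0]])
packed = pack_padded_sequence(captions, lengths=[4, 2], batch_first=True)
print(packed.data)         # tensor([1, 5, 2, 6, 3, 4]) -- padding removed
print(packed.batch_sizes)  # tensor([2, 2, 1, 1])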
# Evaluation: reload the trained weights and score generated captions with BLEU.
# `eval_dataset` stands in for the evaluation split used here.
dataset_loader = torch.utils.data.DataLoader(eval_dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=4,
                                             collate_fn=collate_func)
vocab_size = vocab.index  # next free index == number of words in the vocabulary

cnn = EncoderCNN(512).to(device)
rnn = DecoderRNN(512, 512, vocab_size).to(device)
cnn.load_state_dict(torch.load('cnn.pkl'))
rnn.load_state_dict(torch.load('rnn.pkl'))
cnn.eval()  # disable dropout / batch-norm updates for inference
rnn.eval()

hyp = []
references = []
with torch.no_grad():
    for i, (image, captions, lengths, image_id) in enumerate(dataset_loader):
        image = image.to(device)
        # One reference caption per image, dropping the leading <start> token.
        for img_id in image_id:
            references.append([caption_dict[img_id].split(' ')[1:]])
        features = cnn(image)
        ids_list = rnn.sample(features)  # greedy decoding of word ids
        ids_list = ids_list.cpu().numpy()
        for ids in ids_list:
            snt = vocab.get_sentence(ids).split()
            hyp.append(snt[1:])          # drop the <start> token

# hyp and references are ragged lists of token lists, so they cannot be turned
# into rectangular NumPy arrays; compute_bleu consumes the lists directly.
print(len(hyp), len(references))
print(hyp)  # inspect the generated captions
print(compute_bleu(references, hyp))
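`collate_func` is referenced but not defined in this excerpt. For this pipeline it has to stack the images, pad the variable-length captions into one tensor, and sort the batch by caption length in decreasing order, since `pack_padded_sequence` expects sorted lengths by default. A minimal sketch, assuming each dataset item is an `(image, caption)` pair with the caption already a 1-D tensor of word indices (the evaluation variant would additionally carry the `image_id` through):

import torch

def collate_func(batch):
    """Pad variable-length captions and sort the batch by length, descending."""
    # Sorting lets pack_padded_sequence run with its default enforce_sorted=True.
    batch.sort(key=lambda pair: len(pair[1]), reverse=True)
    images, captions = zip(*batch)

    images = torch.stack(images, dim=0)               # (B, 3, H, W)
    lengths = [len(cap) for cap in captions]
    padded = torch.zeros(len(captions), max(lengths), dtype=torch.long)
    for i, cap in enumerate(captions):
        padded[i, :lengths[i]] = cap                  # left-aligned, zero padding
    return images, padded, lengths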