def evaluate(dataloader, cnn_model, rnn_model, batch_size, labels):
    # NOTE: `labels` (the match labels consumed by words_loss/sent_loss) is
    # now an explicit parameter; the original body relied on it leaking in
    # as a global.
    cnn_model.eval()
    rnn_model.eval()
    s_total_loss = 0
    w_total_loss = 0
    for step, data in enumerate(dataloader, 0):
        real_imgs, captions, cap_lens, class_ids, keys = prepare_data(data)

        words_features, sent_code = cnn_model(real_imgs[-1])
        # nef = words_features.size(1)
        # words_features = words_features.view(batch_size, nef, -1)

        hidden = rnn_model.init_hidden(batch_size)
        words_emb, sent_emb = rnn_model(captions, cap_lens, hidden)

        w_loss0, w_loss1, attn = words_loss(words_features, words_emb, labels,
                                            cap_lens, class_ids, batch_size)
        w_total_loss += (w_loss0 + w_loss1).item()

        s_loss0, s_loss1 = \
            sent_loss(sent_code, sent_emb, labels, class_ids, batch_size)
        s_total_loss += (s_loss0 + s_loss1).item()

        if step == 50:
            break

    # `step` is 0-based, so (step + 1) batches were accumulated
    s_cur_loss = s_total_loss / (step + 1)
    w_cur_loss = w_total_loss / (step + 1)

    return s_cur_loss, w_cur_loss
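# Hedged usage sketch (not part of the original file): how `evaluate` might
# be driven at the end of a DAMSM pre-training epoch. `match_labels` is an
# assumption -- the diagonal match labels (0..batch_size-1) that
# words_loss/sent_loss conventionally expect; adapt to however `labels` is
# built in your training script.
def _example_run_evaluation(val_loader, image_encoder, text_encoder,
                            batch_size):
    match_labels = torch.arange(batch_size, dtype=torch.long).cuda()
    with torch.no_grad():
        s_loss, w_loss = evaluate(val_loader, image_encoder, text_encoder,
                                  batch_size, match_labels)
    return s_loss, w_loss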
def train(dataloader, cnn_model, rnn_model, d_model, batch_size, labels,
          optimizer, d_optimizer, epoch, ixtoword, image_dir):
    cnn_model.train()
    rnn_model.train()
    d_model.train()
    s_total_loss0 = 0
    s_total_loss1 = 0
    w_total_loss0 = 0
    w_total_loss1 = 0
    g_total_loss = 0
    d_total_loss = 0
    count = (epoch + 1) * len(dataloader)
    start_time = time.time()
    for step, data in enumerate(dataloader, 0):
        rnn_model.zero_grad()
        cnn_model.zero_grad()

        imgs, captions, cap_lens, class_ids, keys = prepare_data(data)
        class_ids = [c_id - 1 for c_id in class_ids]

        # words_features: batch_size x nef x 17 x 17
        # sent_code: batch_size x nef
        words_features, sent_code = cnn_model(imgs[-1])
        # --> batch_size x nef x 17*17
        att_sze = words_features.size(2)
        # words_features = words_features.view(batch_size, nef, -1)

        hidden = rnn_model.init_hidden(batch_size)
        # words_emb: batch_size x nef x seq_len
        # sent_emb: batch_size x nef
        words_emb, sent_emb = rnn_model(captions, cap_lens, hidden)

        w_loss0, w_loss1, attn_maps = words_loss(words_features, words_emb,
                                                 labels, cap_lens, class_ids,
                                                 batch_size)
        w_total_loss0 += w_loss0.item()
        w_total_loss1 += w_loss1.item()
        loss = w_loss0 + w_loss1

        s_loss0, s_loss1 = sent_loss(sent_code, sent_emb, labels, class_ids,
                                     batch_size)
        loss += s_loss0 + s_loss1
        s_total_loss0 += s_loss0.item()
        s_total_loss1 += s_loss1.item()

        # NOTE: the discriminator parameter is `d_model`; the original body
        # referred to an undefined global `discriminator`.
        pred_by_sent, pred_class_by_sent = d_model(sent_code)
        pred_by_emb, pred_class_by_emb = d_model(sent_emb)
        targets = torch.LongTensor(class_ids).to(pred_by_emb.device)

        # ZSL G losses: push both image codes and text embeddings toward the
        # "image" side (label 0) of the domain discriminator, plus the
        # auxiliary class losses.
        g_loss_adv_sent = F.binary_cross_entropy_with_logits(
            pred_by_sent, torch.zeros_like(pred_by_sent))
        g_loss_adv_emb = F.binary_cross_entropy_with_logits(
            pred_by_emb, torch.zeros_like(pred_by_emb))
        g_loss_cls_sent = F.cross_entropy(pred_class_by_sent, targets)
        g_loss_cls_emb = F.cross_entropy(pred_class_by_emb, targets)
        g_loss = (g_loss_adv_sent + g_loss_adv_emb +
                  g_loss_cls_sent + g_loss_cls_emb)
        loss += g_loss * cfg.ZSL.LAMBDA
        g_total_loss += g_loss.item()

        loss.backward()
        # TODO: SummaryWriter
        # `clip_grad_norm_` helps prevent the exploding gradient problem
        # in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(rnn_model.parameters(),
                                       cfg.TRAIN.RNN_GRAD_CLIP)
        optimizer.step()

        # Discriminator step: the encoder outputs are detached so that
        # d_loss only updates the discriminator (the original re-ran both
        # encoders, which was wasted computation).
        d_model.zero_grad()
        pred_by_sent, pred_class_by_sent = d_model(sent_code.detach())
        pred_by_emb, pred_class_by_emb = d_model(sent_emb.detach())

        # ZSL D losses: image codes are labeled 0, text embeddings 1.
        d_loss_adv_sent = F.binary_cross_entropy_with_logits(
            pred_by_sent, torch.zeros_like(pred_by_sent))
        d_loss_adv_emb = F.binary_cross_entropy_with_logits(
            pred_by_emb, torch.ones_like(pred_by_emb))
        d_loss_cls_sent = F.cross_entropy(pred_class_by_sent, targets)
        d_loss_cls_emb = F.cross_entropy(pred_class_by_emb, targets)
        d_loss = (d_loss_adv_sent + d_loss_adv_emb +
                  d_loss_cls_sent + d_loss_cls_emb)
        d_loss.backward()
        d_optimizer.step()
        d_total_loss += d_loss.item()

        if step > 0 and step % UPDATE_INTERVAL == 0:
            count = epoch * len(dataloader) + step
            s_cur_loss0 = s_total_loss0 / UPDATE_INTERVAL
            s_cur_loss1 = s_total_loss1 / UPDATE_INTERVAL
            w_cur_loss0 = w_total_loss0 / UPDATE_INTERVAL
            w_cur_loss1 = w_total_loss1 / UPDATE_INTERVAL
            g_cur_loss = g_total_loss / UPDATE_INTERVAL
            d_cur_loss = d_total_loss / UPDATE_INTERVAL
            elapsed = time.time() - start_time
            info = ('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                    's_loss {:5.2f} {:5.2f} | '
                    'w_loss {:5.2f} {:5.2f} | g_loss {:5.2f} d_loss {:5.2f}'
                    .format(epoch, step, len(dataloader),
                            elapsed * 1000. / UPDATE_INTERVAL,
                            s_cur_loss0, s_cur_loss1,
                            w_cur_loss0, w_cur_loss1,
                            g_cur_loss, d_cur_loss))
            LOG.info(info)
            s_total_loss0 = 0
            s_total_loss1 = 0
            w_total_loss0 = 0
            w_total_loss1 = 0
            g_total_loss = 0
            d_total_loss = 0
            start_time = time.time()
            # attention maps
            img_set, _ = build_super_images(imgs[-1].cpu(), captions,
                                            ixtoword, attn_maps, att_sze)
            if img_set is not None:
                im = Image.fromarray(img_set)
                fullpath = '%s/attention_maps%d.png' % (image_dir, step)
                im.save(fullpath)
    return count
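# Hedged sketch (not in the original file): how the ZSL DAMSM pre-training
# loop above might be wired together per epoch. `train_loader`, `val_loader`,
# and `max_epoch` are illustrative assumptions.
def _example_pretrain_loop(train_loader, val_loader, image_encoder,
                           text_encoder, zsl_discriminator, labels,
                           optimizer, d_optimizer, ixtoword, image_dir,
                           max_epoch):
    for epoch in range(max_epoch):
        count = train(train_loader, image_encoder, text_encoder,
                      zsl_discriminator, train_loader.batch_size, labels,
                      optimizer, d_optimizer, epoch, ixtoword, image_dir)
        with torch.no_grad():
            s_loss, w_loss = evaluate(val_loader, image_encoder,
                                      text_encoder, val_loader.batch_size,
                                      labels)
        LOG.info('epoch %d | val s_loss %.4f | val w_loss %.4f',
                 epoch, s_loss, w_loss)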
def sampling(self, split_dir):
    if cfg.TRAIN.NET_G == '' or cfg.TRAIN.NET_C == '':
        print('Error: the path for the main module or the DCM is not found!')
    else:
        if split_dir == 'test':
            split_dir = 'valid'

        if cfg.GAN.B_DCGAN:
            netG = G_DCGAN()
        else:
            netG = G_NET()
        netG.apply(weights_init)
        netG.cuda()
        netG.eval()

        # The text encoder
        text_encoder = RNN_ENCODER(self.n_words,
                                   nhidden=cfg.TEXT.EMBEDDING_DIM)
        state_dict = torch.load(
            cfg.TRAIN.NET_E, map_location=lambda storage, loc: storage)
        text_encoder.load_state_dict(state_dict)
        print('Load text encoder from:', cfg.TRAIN.NET_E)
        text_encoder = text_encoder.cuda()
        text_encoder.eval()

        # The image encoder
        image_encoder = CNN_ENCODER(cfg.TEXT.EMBEDDING_DIM)
        img_encoder_path = cfg.TRAIN.NET_E.replace('text_encoder',
                                                   'image_encoder')
        state_dict = torch.load(
            img_encoder_path, map_location=lambda storage, loc: storage)
        image_encoder.load_state_dict(state_dict)
        print('Load image encoder from:', img_encoder_path)
        image_encoder = image_encoder.cuda()
        image_encoder.eval()

        # The VGG network
        VGG = VGGNet()
        print("Load the VGG model")
        VGG.cuda()
        VGG.eval()

        batch_size = self.batch_size
        nz = cfg.GAN.Z_DIM
        # `volatile=True` is gone in modern PyTorch; create a plain tensor
        # and run inference under torch.no_grad() instead.
        noise = torch.FloatTensor(batch_size, nz).cuda()

        # The DCM
        netDCM = DCM_Net()
        if cfg.TRAIN.NET_C != '':
            state_dict = torch.load(
                cfg.TRAIN.NET_C, map_location=lambda storage, loc: storage)
            netDCM.load_state_dict(state_dict)
            print('Load DCM from: ', cfg.TRAIN.NET_C)
        netDCM.cuda()
        netDCM.eval()

        model_dir = cfg.TRAIN.NET_G
        state_dict = torch.load(
            model_dir, map_location=lambda storage, loc: storage)
        netG.load_state_dict(state_dict)
        print('Load G from: ', model_dir)

        # the path to save generated images
        s_tmp = model_dir[:model_dir.rfind('.pth')]
        save_dir = '%s/%s' % (s_tmp, split_dir)
        mkdir_p(save_dir)

        cnt = 0
        idx = 0
        with torch.no_grad():
            for _ in range(5):  # (cfg.TEXT.CAPTIONS_PER_IMAGE):
                for step, data in enumerate(self.data_loader, 0):
                    cnt += batch_size
                    if step % 100 == 0:
                        print('step: ', step)

                    imgs, captions, cap_lens, class_ids, keys, wrong_caps, \
                        wrong_caps_len, wrong_cls_id = prepare_data(data)

                    #######################################################
                    # (1) Extract text and image embeddings
                    #######################################################
                    hidden = text_encoder.init_hidden(batch_size)
                    words_embs, sent_emb = text_encoder(
                        wrong_caps, wrong_caps_len, hidden)
                    words_embs, sent_emb = (words_embs.detach(),
                                            sent_emb.detach())
                    mask = (wrong_caps == 0)
                    num_words = words_embs.size(2)
                    if mask.size(1) > num_words:
                        mask = mask[:, :num_words]

                    region_features, cnn_code = \
                        image_encoder(imgs[cfg.TREE.BRANCH_NUM - 1])

                    #######################################################
                    # (2) Modify real images
                    #######################################################
                    noise.data.normal_(0, 1)
                    fake_imgs, attention_maps, mu, logvar, h_code, c_code = \
                        netG(noise, sent_emb, words_embs, mask, cnn_code,
                             region_features)

                    real_img = imgs[cfg.TREE.BRANCH_NUM - 1]
                    real_features = VGG(real_img)[0]
                    fake_img = netDCM(h_code, real_features, sent_emb,
                                      words_embs, mask, c_code)

                    for j in range(batch_size):
                        s_tmp = '%s/single' % save_dir
                        folder = s_tmp[:s_tmp.rfind('/')]
                        if not os.path.isdir(folder):
                            print('Make a new folder: ', folder)
                            mkdir_p(folder)
                        im = fake_img[j].data.cpu().numpy()
                        # [-1, 1] --> [0, 255]
                        im = (im + 1.0) * 127.5
                        im = im.astype(np.uint8)
                        im = np.transpose(im, (1, 2, 0))
                        im = Image.fromarray(im)
                        fullpath = '%s_s%d.png' % (s_tmp, idx)
                        idx += 1
                        im.save(fullpath)
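# Hedged sketch (an assumption, not part of the original file): the
# tensor-to-PNG conversion above is duplicated across both sampling
# routines, so it could be factored into a small helper like this one.
def _example_save_image(img_tensor, fullpath):
    """Save a CHW image tensor in [-1, 1] as an 8-bit PNG."""
    im = img_tensor.data.cpu().numpy()
    im = (im + 1.0) * 127.5           # [-1, 1] --> [0, 255]
    im = im.astype(np.uint8)
    im = np.transpose(im, (1, 2, 0))  # CHW --> HWC
    Image.fromarray(im).save(fullpath)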
def train(self):
    text_encoder, image_encoder, netG, netD, start_epoch, VGG, netDCM = \
        self.build_models()
    avg_param_C = copy_G_params(netDCM)
    optimizerC, optimizerD = self.define_optimizers(netDCM, netD)
    real_labels, fake_labels, match_labels = self.prepare_labels()

    batch_size = self.batch_size
    nz = cfg.GAN.Z_DIM
    noise = Variable(torch.FloatTensor(batch_size, nz))
    fixed_noise = Variable(torch.FloatTensor(batch_size, nz).normal_(0, 1))
    if cfg.CUDA:
        noise, fixed_noise = noise.cuda(), fixed_noise.cuda()

    gen_iterations = 0
    for epoch in range(start_epoch, self.max_epoch):
        start_t = time.time()

        data_iter = iter(self.data_loader)
        step = 0
        while step < self.num_batches:
            ######################################################
            # (1) Prepare training data and compute text embeddings
            ######################################################
            data = next(data_iter)  # `data_iter.next()` is Python 2 only
            imgs, captions, cap_lens, class_ids, keys, wrong_caps, \
                wrong_caps_len, wrong_cls_id = prepare_data(data)

            hidden = text_encoder.init_hidden(batch_size)
            # words_embs: batch_size x nef x seq_len
            # sent_emb: batch_size x nef
            # matched text embeddings
            words_embs, sent_emb = text_encoder(captions, cap_lens, hidden)
            words_embs, sent_emb = words_embs.detach(), sent_emb.detach()
            # mismatched text embeddings
            w_words_embs, w_sent_emb = text_encoder(
                wrong_caps, wrong_caps_len, hidden)
            w_words_embs, w_sent_emb = (w_words_embs.detach(),
                                        w_sent_emb.detach())
            # image embeddings: regional and global
            region_features, cnn_code = image_encoder(
                imgs[cfg.TREE.BRANCH_NUM - 1])

            mask = (captions == 0)
            num_words = words_embs.size(2)
            if mask.size(1) > num_words:
                mask = mask[:, :num_words]

            #######################################################
            # (2) Modify real images
            #######################################################
            noise.data.normal_(0, 1)
            fake_imgs, _, mu, logvar, h_code, c_code = netG(
                noise, sent_emb, words_embs, mask, cnn_code, region_features)

            real_img = imgs[cfg.TREE.BRANCH_NUM - 1]
            real_features = VGG(real_img)[0]
            fake_img = netDCM(h_code, real_features, sent_emb, words_embs,
                              mask, c_code)

            #######################################################
            # (3) Update D network
            #######################################################
            netD.zero_grad()
            errD = discriminator_loss(netD, imgs[cfg.TREE.BRANCH_NUM - 1],
                                      fake_img, sent_emb, real_labels,
                                      fake_labels, words_embs, cap_lens,
                                      image_encoder, class_ids, w_words_embs,
                                      wrong_caps_len, wrong_cls_id)
            errD.backward()
            optimizerD.step()
            D_logs = 'errD: %.2f ' % errD

            #######################################################
            # (4) Update the DCM network
            #######################################################
            # compute total loss for training the DCM
            step += 1
            gen_iterations += 1

            netDCM.zero_grad()
            errC_total, C_logs = DCM_generator_loss(
                netD, image_encoder, fake_img, real_labels, words_embs,
                sent_emb, match_labels, cap_lens, class_ids, VGG, real_img)
            errC_total.backward()
            optimizerC.step()
            # exponential moving average of the DCM parameters
            for p, avg_p in zip(netDCM.parameters(), avg_param_C):
                avg_p.mul_(0.999).add_(p.data, alpha=0.001)

            if gen_iterations % 100 == 0:
                print(D_logs + '\n' + C_logs)
            # save images
            if gen_iterations % 1000 == 0:
                backup_para = copy_G_params(netDCM)
                load_params(netDCM, avg_param_C)
                self.save_img_results(netG, fixed_noise, sent_emb,
                                      words_embs, mask, image_encoder,
                                      captions, cap_lens, epoch, cnn_code,
                                      region_features, imgs, netDCM,
                                      real_features, name='average')
                load_params(netDCM, backup_para)

        end_t = time.time()
        print('[%d/%d][%d] Loss_D: %.2f Loss_C: %.2f Time: %.2fs'
              % (epoch, self.max_epoch, self.num_batches,
                 errD, errC_total, end_t - start_t))

        if epoch % cfg.TRAIN.SNAPSHOT_INTERVAL == 0:
            self.save_model(netDCM, avg_param_C, netD, epoch)

    self.save_model(netDCM, avg_param_C, netD, self.max_epoch)
def train(self):
    text_encoder, image_encoder, netG, netsD, start_epoch, VGG = \
        self.build_models()
    avg_param_G = copy_G_params(netG)
    optimizerG, optimizersD = self.define_optimizers(netG, netsD)
    real_labels, fake_labels, match_labels = self.prepare_labels()

    batch_size = self.batch_size
    nz = cfg.GAN.Z_DIM
    noise = Variable(torch.FloatTensor(batch_size, nz))
    fixed_noise = Variable(torch.FloatTensor(batch_size, nz).normal_(0, 1))
    if cfg.CUDA:
        noise, fixed_noise = noise.cuda(), fixed_noise.cuda()

    gen_iterations = 0
    for epoch in range(start_epoch, self.max_epoch):
        start_t = time.time()

        data_iter = iter(self.data_loader)
        step = 0
        while step < self.num_batches:
            ######################################################
            # (1) Prepare training data and compute text embeddings
            ######################################################
            data = next(data_iter)  # `data_iter.next()` is Python 2 only
            imgs, captions, cap_lens, class_ids, keys, wrong_caps, \
                wrong_caps_len, wrong_cls_id = prepare_data(data)

            hidden = text_encoder.init_hidden(batch_size)
            # words_embs: batch_size x nef x seq_len
            # sent_emb: batch_size x nef
            # matched text embeddings
            words_embs, sent_emb = text_encoder(captions, cap_lens, hidden)
            words_embs, sent_emb = words_embs.detach(), sent_emb.detach()
            # mismatched text embeddings
            w_words_embs, w_sent_emb = text_encoder(
                wrong_caps, wrong_caps_len, hidden)
            w_words_embs, w_sent_emb = (w_words_embs.detach(),
                                        w_sent_emb.detach())
            # image features: regional and global
            region_features, cnn_code = image_encoder(imgs[len(netsD) - 1])

            mask = (captions == 0)
            num_words = words_embs.size(2)
            if mask.size(1) > num_words:
                mask = mask[:, :num_words]

            #######################################################
            # (2) Modify real images
            #######################################################
            noise.data.normal_(0, 1)
            fake_imgs, _, mu, logvar, _, _ = netG(noise, sent_emb, words_embs,
                                                  mask, cnn_code,
                                                  region_features)

            #######################################################
            # (3) Update D networks
            #######################################################
            errD_total = 0
            D_logs = ''
            for i in range(len(netsD)):
                netsD[i].zero_grad()
                errD = discriminator_loss(netsD[i], imgs[i], fake_imgs[i],
                                          sent_emb, real_labels, fake_labels,
                                          words_embs, cap_lens, image_encoder,
                                          class_ids, w_words_embs,
                                          wrong_caps_len, wrong_cls_id)
                # backward and update parameters
                errD.backward(retain_graph=True)
                optimizersD[i].step()
                errD_total += errD
                D_logs += 'errD%d: %.2f ' % (i, errD)

            #######################################################
            # (4) Update G network: maximize log(D(G(z)))
            #######################################################
            # compute total loss for training G
            step += 1
            gen_iterations += 1

            netG.zero_grad()
            errG_total, G_logs = generator_loss(netsD, image_encoder,
                                                fake_imgs, real_labels,
                                                words_embs, sent_emb,
                                                match_labels, cap_lens,
                                                class_ids, VGG, imgs)
            kl_loss = KL_loss(mu, logvar)
            errG_total += kl_loss
            G_logs += 'kl_loss: %.2f ' % kl_loss
            # backward and update parameters
            errG_total.backward()
            optimizerG.step()
            # exponential moving average of the generator parameters
            for p, avg_p in zip(netG.parameters(), avg_param_G):
                avg_p.mul_(0.999).add_(p.data, alpha=0.001)

            if gen_iterations % 100 == 0:
                print(D_logs + '\n' + G_logs)
            # save images
            if gen_iterations % 1000 == 0:
                backup_para = copy_G_params(netG)
                load_params(netG, avg_param_G)
                self.save_img_results(netG, fixed_noise, sent_emb, words_embs,
                                      mask, image_encoder, captions, cap_lens,
                                      epoch, cnn_code, region_features, imgs,
                                      name='average')
                load_params(netG, backup_para)

        end_t = time.time()
        print('[%d/%d][%d] Loss_D: %.2f Loss_G: %.2f Time: %.2fs'
              % (epoch, self.max_epoch, self.num_batches,
                 errD_total, errG_total, end_t - start_t))

        if epoch % cfg.TRAIN.SNAPSHOT_INTERVAL == 0:
            self.save_model(netG, avg_param_G, netsD, epoch)

    self.save_model(netG, avg_param_G, netsD, self.max_epoch)
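# Hedged sketch (an assumption about the helpers used above, not part of the
# original file): the `avg_p.mul_(0.999).add_(p.data, alpha=0.001)` lines
# maintain an exponential moving average (EMA) of the generator weights,
# avg <- 0.999 * avg + 0.001 * p, and `copy_G_params` / `load_params` would
# then look roughly like this:
def _example_copy_params(model):
    return [p.data.clone() for p in model.parameters()]

def _example_load_params(model, new_params):
    for p, new_p in zip(model.parameters(), new_params):
        p.data.copy_(new_p)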
def train(dataloader, cnn_model, rnn_model, batch_size, labels, optimizer,
          epoch, ixtoword, image_dir):
    cnn_model.train()
    rnn_model.train()
    s_total_loss0 = 0
    s_total_loss1 = 0
    w_total_loss0 = 0
    w_total_loss1 = 0
    count = (epoch + 1) * len(dataloader)
    start_time = time.time()
    for step, data in enumerate(dataloader, 0):
        rnn_model.zero_grad()
        cnn_model.zero_grad()

        imgs, captions, cap_lens, class_ids, keys, wrong_caps, \
            wrong_caps_len, wrong_cls_id = prepare_data(data)

        # words_features: batch_size x nef x 17 x 17
        # sent_code: batch_size x nef
        words_features, sent_code = cnn_model(imgs[-1])
        # --> batch_size x nef x 17*17
        nef, att_sze = words_features.size(1), words_features.size(2)
        # words_features = words_features.view(batch_size, nef, -1)

        hidden = rnn_model.init_hidden(batch_size)
        # words_emb: batch_size x nef x seq_len
        # sent_emb: batch_size x nef
        words_emb, sent_emb = rnn_model(captions, cap_lens, hidden)

        w_loss0, w_loss1, attn_maps = words_loss(words_features, words_emb,
                                                 labels, cap_lens, class_ids,
                                                 batch_size)
        # accumulate as Python floats; accumulating tensors would keep the
        # whole autograd graph alive across the epoch
        w_total_loss0 += w_loss0.item()
        w_total_loss1 += w_loss1.item()
        loss = w_loss0 + w_loss1

        s_loss0, s_loss1 = \
            sent_loss(sent_code, sent_emb, labels, class_ids, batch_size)
        loss += s_loss0 + s_loss1
        s_total_loss0 += s_loss0.item()
        s_total_loss1 += s_loss1.item()

        # the backward pass was commented out in the original, which left
        # optimizer.step() with no gradients to apply
        loss.backward()
        # `clip_grad_norm_` helps prevent the exploding gradient problem
        # in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(rnn_model.parameters(),
                                       cfg.TRAIN.RNN_GRAD_CLIP)
        optimizer.step()

        if step > 0 and step % UPDATE_INTERVAL == 0:
            count = epoch * len(dataloader) + step
            s_cur_loss0 = s_total_loss0 / UPDATE_INTERVAL
            s_cur_loss1 = s_total_loss1 / UPDATE_INTERVAL
            w_cur_loss0 = w_total_loss0 / UPDATE_INTERVAL
            w_cur_loss1 = w_total_loss1 / UPDATE_INTERVAL
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                  's_loss {:5.2f} {:5.2f} | '
                  'w_loss {:5.2f} {:5.2f}'.format(
                      epoch, step, len(dataloader),
                      elapsed * 1000. / UPDATE_INTERVAL,
                      s_cur_loss0, s_cur_loss1,
                      w_cur_loss0, w_cur_loss1))
            s_total_loss0 = 0
            s_total_loss1 = 0
            w_total_loss0 = 0
            w_total_loss1 = 0
            start_time = time.time()
            # attention maps
            img_set, _ = build_super_images(imgs[-1].cpu(), captions,
                                            ixtoword, attn_maps, att_sze)
            if img_set is not None:
                im = Image.fromarray(img_set)
                fullpath = '%s/attention_maps%d.png' % (image_dir, step)
                im.save(fullpath)
    return count
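# Hedged sketch (an assumption, not this repository's implementation):
# DAMSM-style `sent_loss` roughly computes scaled cosine similarities
# between every image code and every sentence embedding in the batch and
# applies cross entropy in both directions against the diagonal match
# labels. The real implementation also masks pairs that share a class id,
# which is omitted here for brevity.
def _example_sent_loss(cnn_code, sent_emb, labels, gamma3=10.0):
    # cnn_code, sent_emb: batch_size x nef; labels: batch_size (LongTensor)
    cnn_code = cnn_code / cnn_code.norm(dim=1, keepdim=True)
    sent_emb = sent_emb / sent_emb.norm(dim=1, keepdim=True)
    scores0 = gamma3 * cnn_code @ sent_emb.t()  # image -> text similarities
    scores1 = scores0.t()                       # text -> image similarities
    loss0 = F.cross_entropy(scores0, labels)
    loss1 = F.cross_entropy(scores1, labels)
    return loss0, loss1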
def sampling(self, split_dir):
    if cfg.TRAIN.NET_G == '':
        print('Error: the path for models is not found!')
    else:
        if split_dir == 'test':
            split_dir = 'valid'
        # Build and load the generator
        if cfg.GAN.B_DCGAN:
            netG = G_DCGAN()
        else:
            netG = G_NET()
        netG.apply(weights_init)
        netG.cuda()
        netG.eval()

        # the text encoder
        text_encoder = RNN_ENCODER(self.n_words,
                                   nhidden=cfg.TEXT.EMBEDDING_DIM)
        state_dict = torch.load(
            cfg.TRAIN.NET_E, map_location=lambda storage, loc: storage)
        text_encoder.load_state_dict(state_dict)
        print('Load text encoder from:', cfg.TRAIN.NET_E)
        text_encoder = text_encoder.cuda()
        text_encoder.eval()

        batch_size = self.batch_size

        model_dir = cfg.TRAIN.NET_G
        state_dict = torch.load(
            model_dir, map_location=lambda storage, loc: storage)
        netG.load_state_dict(state_dict)
        print('Load G from: ', model_dir)

        # the path to save generated images
        s_tmp = model_dir[:model_dir.rfind('.pth')]
        save_dir = '%s/%s' % (s_tmp, split_dir)
        mkdir_p(save_dir)

        cnt = 0
        with torch.no_grad():
            for _ in range(1):  # (cfg.TEXT.CAPTIONS_PER_IMAGE):
                for step, data in enumerate(self.data_loader, 0):
                    cnt += batch_size
                    if step % 100 == 0:
                        print('step: ', step)

                    imgs, captions, cap_lens, class_ids, keys = \
                        prepare_data(data)

                    hidden = text_encoder.init_hidden(batch_size)
                    # words_embs: batch_size x nef x seq_len
                    # sent_emb: batch_size x nef
                    words_embs, sent_emb = text_encoder(captions, cap_lens,
                                                        hidden)
                    words_embs, sent_emb = (words_embs.detach(),
                                            sent_emb.detach())
                    mask = (captions == 0)
                    num_words = words_embs.size(2)
                    if mask.size(1) > num_words:
                        mask = mask[:, :num_words]

                    #######################################################
                    # (2) Generate fake images
                    #######################################################
                    # NOTE: this generator takes no noise input, so the
                    # unused `noise` tensor from the original was dropped
                    # (it also referenced an undefined `nz`).
                    fake_imgs, *_ = netG(sent_emb, words_embs, mask)
                    for j in range(batch_size):
                        s_tmp = '%s/single/%s' % (save_dir, keys[j])
                        folder = s_tmp[:s_tmp.rfind('/')]
                        if not os.path.isdir(folder):
                            print('Make a new folder: ', folder)
                            mkdir_p(folder)
                        # save only the highest-resolution output
                        k = -1
                        im = fake_imgs[k][j].data.cpu().numpy()
                        # [-1, 1] --> [0, 255]
                        im = (im + 1.0) * 127.5
                        im = im.astype(np.uint8)
                        im = np.transpose(im, (1, 2, 0))
                        im = Image.fromarray(im)
                        fullpath = '%s_s%d.png' % (s_tmp, k)
                        im.save(fullpath)
def train(self):
    text_encoder, image_encoder, netG, netsD, zsl_discriminator, \
        start_epoch = self.build_models()
    avg_param_G = copy_G_params(netG)
    optimizerG, optimizersD = self.define_optimizers(netG, netsD)
    real_labels, fake_labels, match_labels = self.prepare_labels()
    batch_size = self.batch_size

    gen_iterations = 0
    # gen_iterations = start_epoch * self.num_batches
    for epoch in range(start_epoch, self.max_epoch):
        start_t = time.time()

        data_iter = iter(self.data_loader)
        it = (tqdm.tqdm(range(self.num_batches))
              if tqdm is not None else range(self.num_batches))
        for step in it:
            # reset requires_grad to be trainable for all Ds
            # self.set_requires_grad_value(netsD, True)
            ######################################################
            # (1) Prepare training data and compute text embeddings
            ######################################################
            data = next(data_iter)  # `data_iter.next()` is Python 2 only
            imgs, captions, cap_lens, class_ids, keys = prepare_data(data)

            hidden = text_encoder.init_hidden(batch_size)
            # words_embs: batch_size x nef x seq_len
            # sent_emb: batch_size x nef
            words_embs, sent_emb = text_encoder(captions, cap_lens, hidden)
            words_embs, sent_emb = words_embs.detach(), sent_emb.detach()
            mask = (captions == 0)
            num_words = words_embs.size(2)
            if mask.size(1) > num_words:
                mask = mask[:, :num_words]

            #######################################################
            # (2) Generate fake images
            #######################################################
            fake_imgs, _ = netG(sent_emb, words_embs, mask)

            #######################################################
            # (3) Update D networks
            #######################################################
            errD_total = 0
            D_logs = ''
            for i in range(len(netsD)):
                netsD[i].zero_grad()
                errD = discriminator_loss(netsD[i], imgs[i], fake_imgs[i],
                                          sent_emb, real_labels, fake_labels)
                # backward and update parameters
                errD.backward()
                optimizersD[i].step()
                errD_total += errD
                D_logs += 'errD%d: %.2f ' % (i, errD.item())
                writer.add_scalar(f'd/errD/{i}', errD.item(), gen_iterations)

            #######################################################
            # (4) Update G network: maximize log(D(G(z)))
            #######################################################
            # compute total loss for training G;
            # no need to compute gradients for the Ds here
            # self.set_requires_grad_value(netsD, False)
            netG.zero_grad()
            errG_total, G_logs = generator_loss(
                netsD,
                image_encoder,
                zsl_discriminator,
                fake_imgs,
                real_labels,
                words_embs,
                sent_emb,
                match_labels,
                cap_lens,
                class_ids,
                writer=writer,
                global_step=gen_iterations,
            )
            # backward and update parameters
            errG_total.backward()
            optimizerG.step()
            # exponential moving average of the generator parameters
            for p, avg_p in zip(netG.parameters(), avg_param_G):
                avg_p.mul_(0.999).add_(p.data, alpha=0.001)

            if gen_iterations % 100 == 0:
                LOGGER.info(f'{D_logs}\n{G_logs}')
            # save images
            if gen_iterations % 1000 == 0:
                backup_para = copy_G_params(netG)
                load_params(netG, avg_param_G)
                self.save_img_results(netG, sent_emb, words_embs, mask,
                                      image_encoder, captions, cap_lens,
                                      epoch, name='average')
                load_params(netG, backup_para)
            gen_iterations += 1

        end_t = time.time()
        info = (f'[{epoch}/{self.max_epoch}][{self.num_batches}] '
                f'Loss_D: {errD_total.item():.2f} '
                f'Loss_G: {errG_total.item():.2f} '
                f'Time: {end_t - start_t:.2f}')
        LOGGER.info(info)

        if epoch % cfg.TRAIN.SNAPSHOT_INTERVAL == 0:  # and epoch != 0:
            self.save_model(netG, avg_param_G, netsD, epoch)

    self.save_model(netG, avg_param_G, netsD, self.max_epoch)
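# Hedged usage sketch, not part of the original file: how this ZSL trainer
# might be launched once the config is populated. `condGANTrainer`,
# `cfg_from_file`, and the config path are illustrative assumptions about
# the surrounding repository.
def _example_launch(output_dir, dataloader, dataset):
    cfg_from_file('cfg/bird_zsl.yml')  # hypothetical config file
    trainer = condGANTrainer(output_dir, dataloader, dataset.n_words,
                             dataset.ixtoword)
    trainer.train()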