def train():
    train_gpu_id = DC.train_gpu_id
    device = t.device('cuda', train_gpu_id) if DC.use_gpu else t.device('cpu')

    transforms = T.Compose([
        T.Resize(DC.input_size),
        T.CenterCrop(DC.input_size),
        T.ToTensor(),
        T.Lambda(lambda x: x * 255)
    ])
    train_dir = DC.train_content_dir
    batch_size = DC.train_batch_size
    train_data = ImageFolder(train_dir, transform=transforms)
    num_train_data = len(train_data)
    train_dataloader = t.utils.data.DataLoader(train_data,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=DC.num_workers,
                                               drop_last=True)

    # Transformer net
    transformer = TransformerNet()
    if DC.load_model:
        transformer.load_state_dict(
            t.load(DC.load_model, map_location=lambda storage, loc: storage))
    transformer.to(device)

    # Loss net (vgg16), with frozen parameters
    vgg = Vgg16().eval()
    vgg.to(device)
    for param in vgg.parameters():
        param.requires_grad = False

    optimizer = t.optim.Adam(transformer.parameters(), DC.base_lr)

    # Get the data from the style image
    ys = utils.get_style_data(DC.style_img)
    ys = ys.to(device)

    # The Gram matrices of the style image's feature maps
    with t.no_grad():
        features_ys = vgg(ys)
        gram_ys = [utils.gram_matrix(ft) for ft in features_ys]

    # Start training
    train_imgs = 0
    iteration = 0
    for epoch in range(DC.max_epoch):
        for i, (data, label) in tqdm.tqdm(enumerate(train_dataloader)):
            train_imgs += batch_size
            iteration += 1
            optimizer.zero_grad()

            # Transformer net
            x = data.to(device)
            y = transformer(x)
            x = utils.normalize_batch(x)
            yc = x
            y = utils.normalize_batch(y)
            features_y = vgg(y)
            features_yc = vgg(yc)

            # Content loss
            content_loss = DC.content_weight * \
                nn.functional.mse_loss(features_y.relu2_2, features_yc.relu2_2)
            # content_loss = DC.content_weight * \
            #     nn.functional.mse_loss(features_y.relu3_3, features_yc.relu3_3)

            # Style loss
            style_loss = 0.0
            for ft_y, gm_ys in zip(features_y, gram_ys):
                gm_y = utils.gram_matrix(ft_y)
                style_loss += nn.functional.mse_loss(gm_y, gm_ys.expand_as(gm_y))
            style_loss *= DC.style_weight

            # Total loss
            total_loss = content_loss + style_loss
            total_loss.backward()
            optimizer.step()

            if iteration % DC.show_iter == 0:
                print('\ncontent loss: ', content_loss.data)
                print('style loss: ', style_loss.data)
                print('total loss: ', total_loss.data)
                print()

        t.save(transformer.state_dict(), '{}_style.pth'.format(epoch))
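
# NOTE: every variant in this file relies on a Vgg16 loss network whose forward
# pass exposes the intermediate activations accessed above as .relu1_2/.relu2_2/
# .relu3_3/.relu4_3. That wrapper is not shown here; the sketch below is a
# hypothetical reconstruction assuming torchvision's pretrained vgg16 and the
# layer split commonly used by fast-neural-style implementations.
from collections import namedtuple

import torch.nn as nn
from torchvision.models import vgg16


class Vgg16(nn.Module):
    def __init__(self):
        super(Vgg16, self).__init__()
        # Keep layers up to relu4_3; in torchvision's vgg16.features the ReLUs
        # relu1_2, relu2_2, relu3_3, relu4_3 sit at indices 3, 8, 15 and 22.
        # (Newer torchvision prefers the weights= argument over pretrained=True.)
        features = list(vgg16(pretrained=True).features)[:23]
        self.features = nn.ModuleList(features).eval()

    def forward(self, x):
        results = []
        for ii, layer in enumerate(self.features):
            x = layer(x)
            if ii in {3, 8, 15, 22}:
                results.append(x)
        outputs = namedtuple("VggOutputs",
                             ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3'])
        return outputs(*results)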

def train(**kwargs):
    opt = Config()
    for k_, v_ in kwargs.items():
        setattr(opt, k_, v_)

    # Visualisation
    vis = utils.Visualizer(opt.env)

    # Data loading
    transforms = tv.transforms.Compose([
        # Resize the input PIL.Image so that its shorter side equals `size`
        tv.transforms.Scale(opt.image_size),
        tv.transforms.CenterCrop(opt.image_size),
        # Convert to a tensor in [0, 1]
        tv.transforms.ToTensor(),
        # Scale back to [0, 255]
        tv.transforms.Lambda(lambda x: x * 255)
    ])
    # Wrap the dataset and apply the transform
    dataset = tv.datasets.ImageFolder(opt.data_root, transforms)
    dataloader = data.DataLoader(dataset, opt.batch_size)

    # Transformer network
    transformer = TransformerNet()
    if opt.model_path:
        transformer.load_state_dict(
            t.load(opt.model_path, map_location=lambda _s, _: _s))

    # Loss network: Vgg16 in eval mode
    vgg = Vgg16().eval()

    # Optimizer (only the transformer network's parameters are trained)
    optimizer = t.optim.Adam(transformer.parameters(), opt.lr)

    # Style image data: shape 1*c*h*w, values roughly in [-2, 2] (ImageNet normalisation)
    style = utils.get_style_data(opt.style_path)
    # Visualise the style image: map [-2, 2] back to [0, 1]
    vis.img('style', (style[0] * 0.225 + 0.45).clamp(min=0, max=1))

    if opt.use_gpu:
        transformer.cuda()
        style = style.cuda()
        vgg.cuda()

    # Gram matrices of the style image
    style_v = Variable(style, volatile=True)
    # vgg returns the four intermediate feature maps used for the loss
    features_style = vgg(style_v)
    # gram_matrix: input b*c*h*w, output b*c*c (one Gram matrix per layer)
    gram_style = [Variable(utils.gram_matrix(y.data)) for y in features_style]

    # Loss meters for visualisation (average loss over the batches of each epoch)
    style_meter = tnt.meter.AverageValueMeter()
    content_meter = tnt.meter.AverageValueMeter()

    for epoch in range(opt.epoches):
        content_meter.reset()
        style_meter.reset()

        for ii, (x, _) in tqdm.tqdm(enumerate(dataloader)):
            # Training step
            optimizer.zero_grad()
            if opt.use_gpu:
                x = x.cuda()
            # x is the real input image
            x = Variable(x)
            # y is the stylised prediction
            y = transformer(x)
            # Input:  b, ch, h, w in [0, 255]
            # Output: b, ch, h, w in roughly [-2, 2]
            y = utils.normalize_batch(y)
            x = utils.normalize_batch(x)

            # The four intermediate feature maps from the loss network
            features_y = vgg(y)
            features_x = vgg(x)

            # Content loss: compare prediction and original only at relu2_2,
            # scaled by content_weight; mse_loss is the mean squared error
            content_loss = opt.content_weight * F.mse_loss(
                features_y.relu2_2, features_x.relu2_2)

            # Style loss: sum of the MSE between Gram matrices over all four layers
            # features_y: the prediction's four feature maps
            # gram_style: the style image's four Gram matrices
            style_loss = 0.
            for ft_y, gm_s in zip(features_y, gram_style):
                # Gram matrix of the prediction's feature map
                gram_y = utils.gram_matrix(ft_y)
                style_loss += F.mse_loss(gram_y, gm_s.expand_as(gram_y))
            style_loss *= opt.style_weight

            # Total loss = content loss + style loss
            total_loss = content_loss + style_loss
            total_loss.backward()
            optimizer.step()

            # Track the smoothed losses for visualisation
            content_meter.add(content_loss.data[0])
            style_meter.add(style_loss.data[0])

            # Visualise every plot_every iterations
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                vis.plot('content_loss', content_meter.value()[0])
                vis.plot('style_loss', style_meter.value()[0])
                # x and y were normalised (utils.normalize_batch) to [-2, 2],
                # so map them back to [0, 1] for display
                vis.img('output', (y.data.cpu()[0] * 0.225 + 0.45).clamp(min=0, max=1))
                vis.img('input', (x.data.cpu()[0] * 0.225 + 0.45).clamp(min=0, max=1))

        # Save the visdom state and the model after every epoch
        vis.save([opt.env])
        t.save(transformer.state_dict(), 'checkpoints/%s_style.pth' % epoch)
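
# NOTE: the comments above describe utils.gram_matrix as mapping a (b, c, h, w)
# feature map to a (b, c, c) Gram matrix. A minimal sketch consistent with that
# description; the exact normalisation used by the original utils module is an
# assumption.
def gram_matrix(y):
    """Channel-wise Gram matrix of a (b, c, h, w) tensor, normalised by c*h*w."""
    (b, ch, h, w) = y.size()
    features = y.view(b, ch, h * w)           # (b, c, h*w)
    features_t = features.transpose(1, 2)     # (b, h*w, c)
    gram = features.bmm(features_t) / (ch * h * w)
    return gram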

def train(**kwargs):
    opt = Config()
    for k_, v_ in kwargs.items():
        setattr(opt, k_, v_)

    vis = utils.Visualizer(opt.env)

    # Data loading
    transforms = tv.transforms.Compose([
        tv.transforms.Scale(opt.image_size),
        tv.transforms.CenterCrop(opt.image_size),
        tv.transforms.ToTensor(),
        tv.transforms.Lambda(lambda x: x * 255)
    ])
    dataset = tv.datasets.ImageFolder(opt.data_root, transforms)
    dataloader = data.DataLoader(dataset, opt.batch_size)

    # Transformer network
    transformer = TransformerNet()
    if opt.model_path:
        transformer.load_state_dict(
            t.load(opt.model_path, map_location=lambda _s, _: _s))

    # Loss network: Vgg16
    vgg = Vgg16().eval()

    # Optimizer
    optimizer = t.optim.Adam(transformer.parameters(), opt.lr)

    # Style image data
    style = utils.get_style_data(opt.style_path)
    vis.img('style', (style[0] * 0.225 + 0.45).clamp(min=0, max=1))

    if opt.use_gpu:
        transformer.cuda()
        style = style.cuda()
        vgg.cuda()

    # Gram matrices of the style image
    style_v = Variable(style, volatile=True)
    features_style = vgg(style_v)
    gram_style = [Variable(utils.gram_matrix(y.data)) for y in features_style]

    # Loss meters
    style_meter = tnt.meter.AverageValueMeter()
    content_meter = tnt.meter.AverageValueMeter()

    for epoch in range(opt.epoches):
        content_meter.reset()
        style_meter.reset()

        for ii, (x, _) in tqdm.tqdm(enumerate(dataloader)):
            # Training step
            optimizer.zero_grad()
            if opt.use_gpu:
                x = x.cuda()
            x = Variable(x)
            y = transformer(x)
            y = utils.normalize_batch(y)
            x = utils.normalize_batch(x)

            features_y = vgg(y)
            features_x = vgg(x)

            # Content loss
            content_loss = opt.content_weight * F.mse_loss(
                features_y.relu2_2, features_x.relu2_2)

            # Style loss
            style_loss = 0.
            for ft_y, gm_s in zip(features_y, gram_style):
                gram_y = utils.gram_matrix(ft_y)
                style_loss += F.mse_loss(gram_y, gm_s.expand_as(gram_y))
            style_loss *= opt.style_weight

            total_loss = content_loss + style_loss
            total_loss.backward()
            optimizer.step()

            # Smoothed losses
            content_meter.add(content_loss.data[0])
            style_meter.add(style_loss.data[0])

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # Visualisation
                vis.plot('content_loss', content_meter.value()[0])
                vis.plot('style_loss', style_meter.value()[0])
                # x and y were normalised (utils.normalize_batch), so undo it for display
                vis.img('output', (y.data.cpu()[0] * 0.225 + 0.45).clamp(min=0, max=1))
                vis.img('input', (x.data.cpu()[0] * 0.225 + 0.45).clamp(min=0, max=1))

        # Save the visdom state and the model
        vis.save([opt.env])
        t.save(transformer.state_dict(), 'checkpoints/%s_style.pth' % epoch)
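
# NOTE: utils.Visualizer is a thin wrapper around visdom used by several of these
# variants (vis.plot / vis.img / vis.save). Its implementation is not included
# here; the sketch below is an assumed minimal version that matches how it is
# called above (callers pass C*H*W image tensors already clamped to [0, 1]).
import numpy as np
import visdom


class Visualizer(object):
    def __init__(self, env='default', **kwargs):
        self.vis = visdom.Visdom(env=env, **kwargs)
        self.index = {}  # per-curve step counter

    def plot(self, name, y):
        x = self.index.get(name, 0)
        self.vis.line(Y=np.array([y]), X=np.array([x]),
                      win=name, opts=dict(title=name),
                      update=None if x == 0 else 'append')
        self.index[name] = x + 1

    def img(self, name, img_):
        # img_ is a single C*H*W tensor, as passed by the training loops above
        self.vis.image(img_.cpu().numpy(), win=name, opts=dict(title=name))

    def save(self, envs=None):
        self.vis.save(envs or [self.vis.env])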

def train(**kwargs):
    opt = Config()
    for k_, v_ in kwargs.items():
        setattr(opt, k_, v_)

    device = t.device('cuda') if opt.use_gpu else t.device('cpu')
    vis = utils.Visualizer(opt.env)

    # Data loading
    transforms = tv.transforms.Compose([
        tv.transforms.Resize(opt.image_size),
        tv.transforms.CenterCrop(opt.image_size),
        tv.transforms.ToTensor(),
        tv.transforms.Lambda(lambda x: x * 255)
    ])
    dataset = tv.datasets.ImageFolder(opt.data_root, transforms)
    dataloader = data.DataLoader(dataset, opt.batch_size)

    # Transformer network
    transformer = TransformerNet()
    if opt.model_path:
        transformer.load_state_dict(
            t.load(opt.model_path, map_location=lambda _s, _: _s))
    transformer.to(device)

    # Loss network: Vgg16 with frozen parameters
    vgg = Vgg16().eval()
    vgg.to(device)
    for param in vgg.parameters():
        param.requires_grad = False

    # Optimizer
    optimizer = t.optim.Adam(transformer.parameters(), opt.lr)

    # Style image data
    style = utils.get_style_data(opt.style_path)
    vis.img('style', (style.data[0] * 0.225 + 0.45).clamp(min=0, max=1))
    style = style.to(device)

    # Gram matrices of the style image
    with t.no_grad():
        features_style = vgg(style)
        gram_style = [utils.gram_matrix(y) for y in features_style]

    # Loss meters
    style_meter = tnt.meter.AverageValueMeter()
    content_meter = tnt.meter.AverageValueMeter()

    for epoch in range(opt.epoches):
        content_meter.reset()
        style_meter.reset()

        for ii, (x, _) in tqdm.tqdm(enumerate(dataloader)):
            # Training step
            optimizer.zero_grad()
            x = x.to(device)
            y = transformer(x)
            y = utils.normalize_batch(y)
            x = utils.normalize_batch(x)

            features_y = vgg(y)
            features_x = vgg(x)

            # Content loss
            content_loss = opt.content_weight * F.mse_loss(
                features_y.relu2_2, features_x.relu2_2)

            # Style loss
            style_loss = 0.
            for ft_y, gm_s in zip(features_y, gram_style):
                gram_y = utils.gram_matrix(ft_y)
                style_loss += F.mse_loss(gram_y, gm_s.expand_as(gram_y))
            style_loss *= opt.style_weight

            total_loss = content_loss + style_loss
            total_loss.backward()
            optimizer.step()

            # Smoothed losses
            content_meter.add(content_loss.item())
            style_meter.add(style_loss.item())

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                # Visualisation
                vis.plot('content_loss', content_meter.value()[0])
                vis.plot('style_loss', style_meter.value()[0])
                # x and y were normalised (utils.normalize_batch), so undo it for display
                vis.img('output', (y.data.cpu()[0] * 0.225 + 0.45).clamp(min=0, max=1))
                vis.img('input', (x.data.cpu()[0] * 0.225 + 0.45).clamp(min=0, max=1))

        # Save the visdom state and the model
        vis.save([opt.env])
        t.save(transformer.state_dict(), 'checkpoints/%s_style.pth' % epoch)
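
# NOTE: utils.get_style_data(path) is assumed to return a 1*C*H*W tensor that is
# already ImageNet-normalised (values roughly in [-2, 2]); that is why the style
# image is fed to vgg without normalize_batch and displayed with the inverse
# transform (x * 0.225 + 0.45). A plausible sketch under that assumption:
import torchvision as tv

IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]


def get_style_data(path):
    """Load the style image as a normalised 1*C*H*W tensor."""
    style_transform = tv.transforms.Compose([
        tv.transforms.ToTensor(),
        tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])
    style_image = tv.datasets.folder.default_loader(path)
    return style_transform(style_image).unsqueeze(0)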

def train(**kwargs):
    # step1: config
    opt.parse(**kwargs)
    vis = Visualizer(opt.env)
    device = t.device('cuda') if opt.use_gpu else t.device('cpu')

    # step2: data (dataloader, style_img)
    # The preprocessing differs from earlier chapters: instead of a Normalize
    # transform, a lambda multiplies by 255; this choice is explained below.
    # There are two kinds of images: many content images used for training, and
    # a single style image used only in the loss function.
    transforms = T.Compose([
        T.Resize(opt.image_size),
        T.CenterCrop(opt.image_size),
        T.ToTensor(),
        T.Lambda(lambda x: x * 255)
    ])
    # Images are loaded with ImageFolder, as in Chapter 7, rather than a custom Dataset.
    dataset = tv.datasets.ImageFolder(opt.data_root, transform=transforms)
    dataloader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=True,
                            num_workers=opt.num_workers, drop_last=True)

    style_img = get_style_data(opt.style_path)  # 1*c*H*W
    style_img = style_img.to(device)
    # Arguably unnecessary; worth experimenting without it.
    vis.img('style_image', (style_img.data[0] * 0.225 + 0.45).clamp(min=0, max=1))

    # step3: model (TransformerNet and the loss network Vgg16)
    # The model has two parts: TransformerNet transforms the input image, and
    # Vgg16 only evaluates the loss. Vgg16's parameters therefore do not take part
    # in back-propagation; only TransformerNet is trained and saved, while Vgg16's
    # weights are loaded when the network is built.
    # Vgg16 runs in eval() mode, so layers such as dropout and batch norm behave
    # as at inference time. Previously eval() was only used for validation and
    # testing, so it is worth asking why Vgg16 is set to eval() here.
    # When loading the checkpoint the author uses a simple map_location lambda,
    # which is lighter than the earlier approach.
    # On CUDA usage: the models are moved to the GPU up front, while the data is
    # only moved to the GPU inside the training loop.
    # In Chapter 7 the networks were separated differently: for the generator netg,
    # fake_img = netg(noises).detach() turns a non-leaf node into something like a
    # leaf that does not require gradients. Chapter 4 is worth revisiting.
    transformer_net = TransformerNet()
    if opt.model_path:
        transformer_net.load_state_dict(
            t.load(opt.model_path, map_location=lambda _s, _: _s))
    transformer_net.to(device)

    # step3: criterion and optimizer
    optimizer = t.optim.Adam(transformer_net.parameters(), opt.lr)
    # The loss is computed through vgg16: it involves Gram matrices and mean
    # squared errors, so we also need gram_matrix and an MSE criterion.
    vgg16 = Vgg16().eval()  # to be verified
    vgg16.to(device)
    # vgg's parameters do not need gradients, but gradients still flow back through it.
    # Revisit the difference between detach() and requires_grad.
    for param in vgg16.parameters():
        param.requires_grad = False
    criterion = t.nn.MSELoss(reduce=True, size_average=True)

    # step4: meters for loss statistics
    style_meter = meter.AverageValueMeter()
    content_meter = meter.AverageValueMeter()
    total_meter = meter.AverageValueMeter()

    # step5.2 (precomputation): Gram matrices of the style image
    # gram_style: list over [relu1_2, relu2_2, relu3_3, relu4_3]; each entry is a b*c*c tensor
    with t.no_grad():
        features = vgg16(style_img)
        gram_style = [gram_matrix(feature) for feature in features]

    # step5: train
    for epoch in range(opt.epoches):
        style_meter.reset()
        content_meter.reset()

        # step5.1: train
        for ii, (data, _) in tqdm(enumerate(dataloader)):
            optimizer.zero_grad()
            # Unlike earlier chapters there is no Variable() wrapper: since
            # PyTorch 0.4, Tensor and Variable are merged, so a plain tensor suffices.
            # https://mp.weixin.qq.com/s?__biz=MzI0ODcxODk5OA==&mid=2247494701&idx=2&sn=ea8411d66038f172a2f553770adccbec&chksm=e99edfd4dee956c23c47c7bb97a31ee816eb3a0404466c1a57c12948d807c975053e38b18097&scene=21#wechat_redirect
            data = data.to(device)
            y = transformer_net(data)
            # vgg expects normalised inputs
            data = normalize_batch(data)
            y = normalize_batch(y)

            feature_data = vgg16(data)
            feature_y = vgg16(y)
            # feature_data / feature_y are namedtuple-like outputs holding the four
            # intermediate feature maps (relu1_2 ... relu4_3).

            # step5.2: loss (content loss and style loss)
            # content_loss
            # Unlike the book, which discusses relu3_3, the code uses relu2_2.
            # https://blog.csdn.net/zhangxb35/article/details/72464152?utm_source=itdadao&utm_medium=referral
            # The MSE is a per-element loss: sum over all b*c*h*w elements, then divide by b*c*h*w.
            # SGD itself averages the loss over the batch before back-propagating.
            content_loss = opt.content_weight * criterion(feature_y.relu2_2,
                                                          feature_data.relu2_2)

            # style loss, over relu1_2, relu2_2, relu3_3, relu4_3
            # This needs the Gram matrix of every generated image.
            style_loss = 0
            # Iterating the vgg output unpacks only its outermost level, i.e. the four layers.
            # ft_y: b*c*h*w, gm_s: 1*c*c
            for ft_y, gm_s in zip(feature_y, gram_style):
                gram_y = gram_matrix(ft_y)
                style_loss += criterion(gram_y, gm_s.expand_as(gram_y))
            style_loss *= opt.style_weight

            total_loss = content_loss + style_loss
            total_loss.backward()
            optimizer.step()
            # import ipdb
            # ipdb.set_trace()

            # Get a tensor's value with tensor.item() / tensor.tolist()
            content_meter.add(content_loss.item())
            style_meter.add(style_loss.item())
            total_meter.add(total_loss.item())

            # step5.3: visualize
            if (ii + 1) % opt.print_freq == 0 and opt.vis:
                # Drop into ipdb when the debug file exists.
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()
                vis.plot('content_loss', content_meter.value()[0])
                vis.plot('style_loss', style_meter.value()[0])
                vis.plot('total_loss', total_meter.value()[0])
                # data and y have been normalised to roughly [-2, 2],
                # so map them back to [0, 1] for display
                vis.img('input', (data.data * 0.225 + 0.45)[0].clamp(min=0, max=1))
                vis.img('output', (y.data * 0.225 + 0.45)[0].clamp(min=0, max=1))

        # step5.4: save, validate and visualize
        if (epoch + 1) % opt.save_every == 0:
            t.save(transformer_net.state_dict(), 'checkpoints/%s_style.pth' % epoch)
            # Another way to save images (Chapter 7):
            # tv.utils.save_image(fix_fake_imgs, '%s/%s.png' % (opt.img_save_path, epoch),
            #                     normalize=True, range=(-1, 1))
            # (vis.save does not seem to be documented anywhere.)
            vis.save([opt.env])
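
# NOTE: normalize_batch is described above as taking a batch in [0, 255] and
# returning values roughly in [-2, 2], which matches dividing by 255 and applying
# the ImageNet mean and std. The exact implementation is not shown in this file;
# a minimal sketch under that assumption:
import torch as t

IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]


def normalize_batch(batch):
    """Map a (b, 3, h, w) batch from [0, 255] to ImageNet-normalised values."""
    mean = t.tensor(IMAGENET_MEAN, device=batch.device).view(1, 3, 1, 1)
    std = t.tensor(IMAGENET_STD, device=batch.device).view(1, 3, 1, 1)
    return (batch / 255.0 - mean) / std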

def train(**kwargs):
    opt = Config()
    for _k, _v in kwargs.items():
        setattr(opt, _k, _v)

    device = t.device("cuda" if t.cuda.is_available() else "cpu")
    vis = utils.Visualizer(opt.env)

    # Data loading
    transforms = tv.transforms.Compose([
        tv.transforms.Resize(opt.image_size),
        tv.transforms.CenterCrop(opt.image_size),
        tv.transforms.ToTensor(),
        tv.transforms.Lambda(lambda x: x * 255)
    ])
    dataset = tv.datasets.ImageFolder(opt.data_root, transforms)
    dataloader = data.DataLoader(dataset, opt.batch_size)

    # Style transfer network
    transformer = TransformerNet()
    if opt.model_path:
        transformer.load_state_dict(
            t.load(opt.model_path, map_location=t.device('cpu')))
    transformer.to(device)

    # Loss network: Vgg16 with frozen parameters
    vgg = Vgg16().eval()
    vgg.to(device)
    for param in vgg.parameters():
        param.requires_grad = False

    # Optimizer
    optimizer = t.optim.Adam(transformer.parameters(), opt.lr)

    # Style image data
    style = utils.get_style_data(opt.style_path)
    vis.img('style', (style.data[0] * 0.225 + 0.45).clamp(min=0, max=1))
    style = style.to(device)

    # Gram matrices of the style image
    with t.no_grad():
        features_style = vgg(style)
        gram_style = [utils.gram_matrix(y) for y in features_style]

    # Loss statistics
    style_loss_avg = 0
    content_loss_avg = 0

    for epoch in range(opt.epoches):
        for ii, (x, _) in tqdm(enumerate(dataloader)):
            # Training step
            optimizer.zero_grad()
            x = x.to(device)
            y = transformer(x)
            # print(y.size())

            y = utils.normalize_batch(y)
            x = utils.normalize_batch(x)
            features_x = vgg(x)
            features_y = vgg(y)

            # Content loss (this variant compares relu3_3 instead of relu2_2)
            content_loss = opt.content_weight * F.mse_loss(
                features_y.relu3_3, features_x.relu3_3)

            # Style loss
            style_loss = 0
            for ft_y, gm_s in zip(features_y, gram_style):
                # Gram matrix of the generated features; gradients must flow
                # through this, so it is not computed under no_grad
                gram_y = utils.gram_matrix(ft_y)
                style_loss += F.mse_loss(gram_y, gm_s.expand_as(gram_y))
            style_loss *= opt.style_weight

            total_loss = content_loss + style_loss
            total_loss.backward()
            optimizer.step()

            content_loss_avg += content_loss.item()
            style_loss_avg += style_loss.item()

            if (ii + 1) % opt.plot_every == 0:
                vis.plot('content_loss', content_loss_avg / opt.plot_every)
                vis.plot('style_loss', style_loss_avg / opt.plot_every)
                content_loss_avg = 0
                style_loss_avg = 0
                vis.img('output', (y.data.cpu()[0] * 0.225 + 0.45).clamp(min=0, max=1))
                vis.img('input', (x.data.cpu()[0] * 0.225 + 0.45).clamp(min=0, max=1))

            if (ii + 1) % opt.save_every == 0:
                vis.save([opt.env])
                t.save(transformer.state_dict(),
                       'checkpoints/%s_style.pth' % (ii + 1))
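
# NOTE: the variants above read their hyper-parameters from a Config object (or a
# module-level opt) whose definition is not part of this file. The sketch below
# simply gathers the attributes referenced above; the default values are
# illustrative assumptions, not the authors' settings.
class Config(object):
    # data
    image_size = 256            # resize/crop size for the content images
    data_root = 'data/'         # ImageFolder root holding the content images
    batch_size = 8
    num_workers = 4
    use_gpu = True

    # optimisation
    lr = 1e-3
    epoches = 2
    content_weight = 1e5        # weight of the content term
    style_weight = 1e10         # weight of the style term

    # model / style
    model_path = None           # optional checkpoint to resume from
    style_path = 'style.jpg'    # the single style image

    # logging / debugging
    env = 'neural-style'        # visdom environment
    vis = True
    plot_every = 10             # iterations between visualisations
    save_every = 500            # iterations (or epochs, depending on the variant)
    debug_file = '/tmp/debug_style'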

def train(**kwargs):
    for k_, v_ in kwargs.items():
        setattr(opt, k_, v_)

    if opt.vis is True:
        from visualize import Visualizer
        vis = Visualizer(opt.env)

    transforms = tv.transforms.Compose([
        tv.transforms.Resize(opt.image_size),
        tv.transforms.CenterCrop(opt.image_size),
        tv.transforms.ToTensor(),                 # scale values to [0, 1]
        tv.transforms.Lambda(lambda x: x * 255)   # scale values back to [0, 255]
    ])
    dataset = tv.datasets.ImageFolder(opt.data_root, transforms)
    dataloader = data.DataLoader(dataset, opt.batch_size)  # values in [0, 255]

    transformer = TransformerNet()
    if opt.model_path:
        transformer.load_state_dict(
            t.load(opt.model_path, map_location=lambda _s, _: _s))

    vgg = VGG16().eval()
    for param in vgg.parameters():
        param.requires_grad = False

    optimizer = t.optim.Adam(transformer.parameters(), opt.lr)

    style = utils.get_style_data(opt.style_path)
    vis.img('style', (style[0] * 0.225 + 0.45).clamp(min=0, max=1))

    if opt.use_gpu:
        transformer.cuda()
        style = style.cuda()
        vgg.cuda()

    # Gram matrices of the style image (style is already 1*c*h*w)
    style_v = Variable(style, volatile=True)
    features_style = vgg(style_v)
    gram_style = [Variable(utils.gram_matrix(y.data)) for y in features_style]

    style_meter = tnt.meter.AverageValueMeter()
    content_meter = tnt.meter.AverageValueMeter()

    for epoch in range(opt.epoches):
        content_meter.reset()
        style_meter.reset()

        for ii, (x, _) in tqdm.tqdm(enumerate(dataloader)):
            optimizer.zero_grad()
            if opt.use_gpu:
                x = x.cuda()                   # values in [0, 255]
            x = Variable(x)
            y = transformer(x)                 # values in [0, 255]
            y = utils.normalize_batch(y)       # values roughly in [-2, 2]
            x = utils.normalize_batch(x)       # values roughly in [-2, 2]

            features_y = vgg(y)
            features_x = vgg(x)

            # Content loss: only relu2_2 is used here. One could also combine
            # several layers, e.g. w1*relu2_2 + w2*relu3_3 + w3*relu4_3.
            content_loss = opt.content_weight * F.mse_loss(
                features_y.relu2_2, features_x.relu2_2)
            content_meter.add(content_loss.data[0])

            # Style loss
            style_loss = 0
            for ft_y, gm_s in zip(features_y, gram_style):
                gram_y = utils.gram_matrix(ft_y)
                style_loss += F.mse_loss(gram_y, gm_s.expand_as(gram_y))
            style_meter.add(style_loss.data[0])
            style_loss *= opt.style_weight

            total_loss = content_loss + style_loss
            total_loss.backward()
            optimizer.step()

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                vis.plot('content_loss', content_meter.value()[0])
                vis.plot('style_loss', style_meter.value()[0])
                vis.img('output', (y.data.cpu()[0] * 0.225 + 0.45).clamp(min=0, max=1))
                vis.img('input', (x.data.cpu()[0] * 0.225 + 0.45).clamp(min=0, max=1))

        vis.save([opt.env])
        t.save(transformer.state_dict(), 'checkpoints/%s_style.pth' % epoch)
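
# NOTE: a hypothetical usage sketch showing how a checkpoint saved by any of the
# train() variants above could be applied to a single content image. It assumes
# the transformer outputs values roughly in [0, 255], matching the training
# convention above; the file names and paths are purely illustrative.
import torch as t
import torchvision as tv


def stylize(content_path='content.jpg',
            model_path='checkpoints/0_style.pth',
            output_path='output.png'):
    device = t.device('cuda' if t.cuda.is_available() else 'cpu')

    # Same preprocessing as training: a 0-255 float tensor of shape 1*C*H*W.
    img = tv.datasets.folder.default_loader(content_path)
    x = tv.transforms.functional.to_tensor(img).unsqueeze(0) * 255
    x = x.to(device)

    transformer = TransformerNet().to(device).eval()
    transformer.load_state_dict(t.load(model_path, map_location=device))

    with t.no_grad():
        y = transformer(x)

    # Rescale the [0, 255] output to [0, 1] before saving.
    tv.utils.save_image((y.cpu()[0] / 255).clamp(0, 1), output_path)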