# CLIP-guided FFT image synthesis, base variant: SSIM-based image sync, fixed CLIP normalisation.
def main():
    a = get_args()
    prev_enc = 0

    def train(i):
        loss = 0
        noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4], 1).cuda() if a.noise > 0 else None
        img_out = image_f(noise)

        micro = None if a.in_txt2 is None else False
        imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, norm_in, a.overscan, micro=micro)
        out_enc = model_clip.encode_image(imgs_sliced[-1])

        if a.diverse > 0:  # push two noisy renders of the same params apart
            imgs_sliced = slice_imgs([image_f(noise)], a.samples, a.modsize, norm_in, a.overscan, micro=micro)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += a.diverse * torch.cosine_similarity(out_enc, out_enc2, dim=-1).mean()
            del out_enc2; torch.cuda.empty_cache()
        if a.in_img is not None and os.path.isfile(a.in_img):  # input image
            loss += sign * torch.cosine_similarity(img_enc, out_enc, dim=-1).mean()
        if a.in_txt is not None:  # input text
            loss += sign * torch.cosine_similarity(txt_enc, out_enc, dim=-1).mean()
        if a.in_txt0 is not None:  # subtract text
            loss += -sign * torch.cosine_similarity(txt_enc0, out_enc, dim=-1).mean()
        if a.sync > 0 and a.in_img is not None and os.path.isfile(a.in_img):  # image composition
            loss *= 1. + a.sync * (a.steps / (i + 1) * ssim_loss(img_out, img_in) - 1)
        if a.in_txt2 is not None:  # input text for micro details
            imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, norm_in, a.overscan, micro=True)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += sign * torch.cosine_similarity(txt_enc2, out_enc2, dim=-1).mean()
            del out_enc2; torch.cuda.empty_cache()
        if a.expand > 0:  # encourage drift away from the previous step's encoding
            nonlocal prev_enc  # prev_enc lives in main()
            if i > 0:
                loss += a.expand * torch.cosine_similarity(out_enc, prev_enc, dim=-1).mean()
            prev_enc = out_enc.detach()

        del img_out, imgs_sliced, out_enc; torch.cuda.empty_cache()
        assert not isinstance(loss, int), ' Loss not defined, check the inputs'

        if a.prog is True:  # linear learning-rate ramp-up
            lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
            for g in optimizer.param_groups:
                g['lr'] = lr_cur

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % a.fstep == 0:
            with torch.no_grad():
                img = image_f(contrast=a.contrast).cpu().numpy()[0]
            checkout(img, os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)), verbose=a.verbose)
            pbar.upd()

    # Load CLIP model
    model_clip, _ = clip.load(a.model)
    if a.verbose is True: print(' using model', a.model)
    xmem = {'RN50': 0.5, 'RN50x4': 0.16, 'RN101': 0.33}
    if 'RN' in a.model:
        a.samples = int(a.samples * xmem[a.model])
    if a.diverse > 0:
        a.samples = int(a.samples * 0.5)

    norm_in = torchvision.transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))

    out_name = []
    if a.in_img is not None and os.path.isfile(a.in_img):
        if a.verbose is True: print(' ref image:', basename(a.in_img))
        img_in = torch.from_numpy(img_read(a.in_img) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
        img_in = img_in[:, :3, :, :]  # fix rgb channels
        in_sliced = slice_imgs([img_in], a.samples, a.modsize, transform=norm_in, overscan=a.overscan)[0]
        img_enc = model_clip.encode_image(in_sliced).detach().clone()
        if a.sync > 0:
            ssim_loss = ssim.SSIM(window_size=11)
            img_in = F.interpolate(img_in, a.size).float()
        else:
            del img_in
        del in_sliced; torch.cuda.empty_cache()
        out_name.append(basename(a.in_img).replace(' ', '_'))

    if a.in_txt is not None:
        if a.verbose is True: print(' ref text: ', basename(a.in_txt))
        if a.translate:
            translator = Translator()
            a.in_txt = translator.translate(a.in_txt, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt)
        tx = clip.tokenize(a.in_txt).cuda()
        txt_enc = model_clip.encode_text(tx).detach().clone()
        out_name.append(txt_clean(a.in_txt))

    if a.in_txt2 is not None:
        if a.verbose is True: print(' micro text:', basename(a.in_txt2))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt2 = translator.translate(a.in_txt2, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt2)
        tx2 = clip.tokenize(a.in_txt2).cuda()
        txt_enc2 = model_clip.encode_text(tx2).detach().clone()
        out_name.append(txt_clean(a.in_txt2))

    if a.in_txt0 is not None:
        if a.verbose is True: print(' subtract text:', basename(a.in_txt0))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt0 = translator.translate(a.in_txt0, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt0)
        tx0 = clip.tokenize(a.in_txt0).cuda()
        txt_enc0 = model_clip.encode_text(tx0).detach().clone()
        out_name.append('off-' + txt_clean(a.in_txt0))

    params, image_f = fft_image([1, 3, *a.size], resume=a.resume)
    image_f = to_valid_rgb(image_f)

    if a.prog is True:
        lr1 = a.lrate * 2
        lr0 = lr1 * 0.01
    else:
        lr0 = a.lrate
    optimizer = torch.optim.Adam(params, lr0)
    sign = 1. if a.invert is True else -1.

    if a.verbose is True: print(' samples:', a.samples)
    out_name = '-'.join(out_name)
    out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
    tempdir = os.path.join(a.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    pbar = ProgressBar(a.steps // a.fstep)
    for i in range(a.steps):
        train(i)

    # assemble the saved frames into a video (portable frame-pattern path)
    os.system('ffmpeg -v warning -y -i "%s" "%s.mp4"' % (os.path.join(tempdir, '%04d.jpg'), os.path.join(a.out_dir, out_name)))
    shutil.copy(img_list(tempdir)[-1], os.path.join(a.out_dir, '%s-%d.jpg' % (out_name, a.steps)))
    if a.save_pt is True:
        torch.save(params, '%s.pt' % os.path.join(a.out_dir, out_name))
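# A minimal, self-contained sketch of the core step the routines above rely on: cut random
# crops out of the synthesised image, encode them with CLIP, and score them against a text
# embedding via cosine similarity. This is an illustrative assumption, not the project's
# slice_imgs() pipeline; clip_text_loss, n_crops and crop_size are hypothetical names.
import torch
import torch.nn.functional as F

def clip_text_loss(model_clip, image, txt_enc, n_crops=8, crop_size=224):
    _, _, h, w = image.shape
    crops = []
    for _ in range(n_crops):
        # random square crop covering 50-100% of the shorter side
        c = int(torch.empty(1).uniform_(0.5, 1.0).item() * min(h, w))
        y = int(torch.randint(0, h - c + 1, (1,)))
        x = int(torch.randint(0, w - c + 1, (1,)))
        crop = image[:, :, y:y + c, x:x + c]
        crops.append(F.interpolate(crop, (crop_size, crop_size), mode='bilinear', align_corners=False))
    img_enc = model_clip.encode_image(torch.cat(crops, 0))  # CLIP normalisation omitted for brevity
    # negative mean similarity: minimising it pulls the image towards the prompt
    return -torch.cosine_similarity(img_enc, txt_enc, dim=-1).mean()

# usage sketch:
#   model_clip, _ = clip.load('ViT-B/32')
#   txt_enc = model_clip.encode_text(clip.tokenize('a prompt').cuda()).detach()
#   loss = clip_text_loss(model_clip, image_f(), txt_enc)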
# Extended variant of the same routine: adds a sharpness (total-variation) term, a penalty
# against rendered prompt text (notext), LPIPS-based image sync, a multilingual text encoder
# and selectable slicing transforms.
def main():
    a = get_args()
    prev_enc = 0

    def train(i):
        loss = 0
        noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4], 1).cuda() if a.noise > 0 else None
        img_out = image_f(noise)

        if a.sharp != 0:  # reward (or penalise) high-frequency detail via image gradients
            lx = torch.mean(torch.abs(img_out[0, :, :, 1:] - img_out[0, :, :, :-1]))
            ly = torch.mean(torch.abs(img_out[0, :, 1:, :] - img_out[0, :, :-1, :]))
            loss -= a.sharp * (ly + lx)

        micro = (1 - a.macro) if a.in_txt2 is None else False
        imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, trform_f, a.align, micro=micro)
        out_enc = model_clip.encode_image(imgs_sliced[-1])

        if a.diverse != 0:  # decorrelate two noisy renders of the same parameters
            imgs_sliced = slice_imgs([image_f(noise)], a.samples, a.modsize, trform_f, a.align, micro=micro)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += a.diverse * torch.cosine_similarity(out_enc, out_enc2, dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.in_img is not None and os.path.isfile(a.in_img):  # input image
            loss += sign * 0.5 * torch.cosine_similarity(img_enc, out_enc, dim=-1).mean()
        if a.in_txt is not None:  # input text
            loss += sign * torch.cosine_similarity(txt_enc, out_enc, dim=-1).mean()
        if a.notext > 0:  # penalise similarity to a rendered image of the prompt text
            loss -= sign * a.notext * torch.cosine_similarity(txt_plot_enc, out_enc, dim=-1).mean()
        if a.in_txt0 is not None:  # subtract text
            loss += -sign * torch.cosine_similarity(txt_enc0, out_enc, dim=-1).mean()
        if a.sync > 0 and a.in_img is not None and os.path.isfile(a.in_img):  # image composition
            prog_sync = (a.steps // a.fstep - i) / (a.steps // a.fstep)
            loss += prog_sync * a.sync * sim_loss(F.interpolate(img_out, sim_size).float(), img_in, normalize=True).squeeze()
        if a.in_txt2 is not None:  # input text for micro details
            imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, trform_f, a.align, micro=True)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += sign * torch.cosine_similarity(txt_enc2, out_enc2, dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.expand > 0:  # encourage drift away from the previous step's encoding
            nonlocal prev_enc  # prev_enc lives in main()
            if i > 0:
                loss += a.expand * torch.cosine_similarity(out_enc, prev_enc, dim=-1).mean()
            prev_enc = out_enc.detach()

        del img_out, imgs_sliced, out_enc
        torch.cuda.empty_cache()
        assert not isinstance(loss, int), ' Loss not defined, check the inputs'

        if a.prog is True:  # linear learning-rate ramp-up
            lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
            for g in optimizer.param_groups:
                g['lr'] = lr_cur

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % a.fstep == 0:
            with torch.no_grad():
                img = image_f(contrast=a.contrast).cpu().numpy()[0]
            if (a.sync > 0 and a.in_img is not None) or a.sharp != 0:
                img = img ** 1.3  # empirical tone mapping
            checkout(img, os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)), verbose=a.verbose)
            pbar.upd()

    # Load CLIP model
    model_clip, _ = clip.load(a.model)
    if a.verbose is True: print(' using model', a.model)
    xmem = {'RN50': 0.5, 'RN50x4': 0.16, 'RN101': 0.33}
    if 'RN' in a.model:
        a.samples = int(a.samples * xmem[a.model])

    if a.multilang is True:
        model_lang = SentenceTransformer('clip-ViT-B-32-multilingual-v1').cuda()

    def enc_text(txt):
        if a.multilang is True:
            emb = model_lang.encode([txt], convert_to_tensor=True, show_progress_bar=False)
        else:
            emb = model_clip.encode_text(clip.tokenize(txt).cuda())
        return emb.detach().clone()

    if a.diverse != 0:
        a.samples = int(a.samples * 0.5)
    if a.sync > 0:
        a.samples = int(a.samples * 0.5)

    if a.transform is True:
        trform_f = transforms.transforms_custom
        a.samples = int(a.samples * 0.95)
    else:
        trform_f = transforms.normalize()

    out_name = []
    if a.in_txt is not None:
        if a.verbose is True: print(' ref text: ', basename(a.in_txt))
        if a.translate:
            translator = Translator()
            a.in_txt = translator.translate(a.in_txt, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt)
        txt_enc = enc_text(a.in_txt)
        out_name.append(txt_clean(a.in_txt))
        if a.notext > 0:  # encode a rendered image of the prompt, to penalise literal lettering
            txt_plot = torch.from_numpy(plot_text(a.in_txt, a.modsize) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
            txt_plot_enc = model_clip.encode_image(txt_plot).detach().clone()

    if a.in_txt2 is not None:
        if a.verbose is True: print(' micro text:', basename(a.in_txt2))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt2 = translator.translate(a.in_txt2, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt2)
        txt_enc2 = enc_text(a.in_txt2)
        out_name.append(txt_clean(a.in_txt2))

    if a.in_txt0 is not None:
        if a.verbose is True: print(' subtract text:', basename(a.in_txt0))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt0 = translator.translate(a.in_txt0, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt0)
        txt_enc0 = enc_text(a.in_txt0)
        out_name.append('off-' + txt_clean(a.in_txt0))

    if a.multilang is True:
        del model_lang

    if a.in_img is not None and os.path.isfile(a.in_img):
        if a.verbose is True: print(' ref image:', basename(a.in_img))
        img_in = torch.from_numpy(img_read(a.in_img) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
        img_in = img_in[:, :3, :, :]  # fix rgb channels
        in_sliced = slice_imgs([img_in], a.samples, a.modsize, transforms.normalize(), a.align, micro=False)[0]
        img_enc = model_clip.encode_image(in_sliced).detach().clone()
        if a.sync > 0:
            sim_loss = lpips.LPIPS(net='vgg', verbose=False).cuda()
            sim_size = [s // 2 for s in a.size]
            img_in = F.interpolate(img_in, sim_size).float()
        else:
            del img_in
        del in_sliced
        torch.cuda.empty_cache()
        out_name.append(basename(a.in_img).replace(' ', '_'))

    params, image_f = fft_image([1, 3, *a.size], resume=a.resume, decay_power=a.decay)
    image_f = to_valid_rgb(image_f, colors=a.colors)

    if a.prog is True:
        lr1 = a.lrate * 2
        lr0 = lr1 * 0.01
    else:
        lr0 = a.lrate
    optimizer = torch.optim.Adam(params, lr0)
    sign = 1. if a.invert is True else -1.

    if a.verbose is True: print(' samples:', a.samples)
    out_name = '-'.join(out_name)
    out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
    tempdir = os.path.join(a.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    pbar = ProgressBar(a.steps // a.fstep)
    for i in range(a.steps):
        train(i)

    # assemble the saved frames into a video (portable frame-pattern path)
    os.system('ffmpeg -v warning -y -i "%s" "%s.mp4"' % (os.path.join(tempdir, '%04d.jpg'), os.path.join(a.out_dir, out_name)))
    shutil.copy(img_list(tempdir)[-1], os.path.join(a.out_dir, '%s-%d.jpg' % (out_name, a.steps)))
    if a.save_pt is True:
        torch.save(params, '%s.pt' % os.path.join(a.out_dir, out_name))
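# A minimal sketch of the FFT image parameterisation that fft_image()/to_valid_rgb() above
# provide: optimisable spectral coefficients scaled by ~1/frequency^decay, inverse-FFT'd to
# pixel space and squashed into a valid RGB range with a sigmoid. This is an assumption about
# the general (Lucid-style) technique; simple_fft_image is a hypothetical helper, not the
# repo's own function.
import numpy as np
import torch

def simple_fft_image(size, decay_power=1.0, device='cuda'):
    h, w = size
    fy = np.fft.fftfreq(h)[:, None]
    fx = np.fft.rfftfreq(w)[None, :]
    freqs = np.sqrt(fx ** 2 + fy ** 2)
    scale = 1.0 / np.maximum(freqs, 1.0 / max(h, w)) ** decay_power  # boost low frequencies
    scale = torch.tensor(scale, dtype=torch.float32, device=device)[None, None, :, :, None]
    spectrum = (0.01 * torch.randn(1, 3, h, w // 2 + 1, 2, device=device)).requires_grad_(True)

    def image_f():
        spec = torch.view_as_complex(spectrum * scale)
        img = torch.fft.irfftn(spec, s=(h, w), dim=(-2, -1), norm='ortho')
        return torch.sigmoid(img)  # keep pixel values in [0, 1]

    return [spectrum], image_f

# usage sketch (mirrors the optimisation loop above):
#   params, image_f = simple_fft_image(a.size, decay_power=a.decay)
#   optimizer = torch.optim.Adam(params, lr=a.lrate)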
# SRGAN helpers (TensorFlow 1.x style): generator inference tail and 91-image dataset preparation.
# (tail of an inference helper; its def line precedes this fragment -- it runs the generator
# forward pass with training disabled and returns its output)
    prob = sess.run(srGan.prob, feed_dict={x: x_train, train_mode: False})
    return prob

# Build HR/LR pairs: 160x160 crops for high resolution, 4x-downsized 40x40 crops for low resolution.
train_hr = np.zeros([65, 160, 160, 3])
train_lr = np.zeros([65, 40, 40, 3])
test_hr = np.zeros([26, 160, 160, 3])
test_lr = np.zeros([26, 40, 40, 3])

k = 0
for i in range(65):
    path = '../dataset/91-image/t' + str(i + 1) + '.bmp'
    img = utils.img_read(path)
    img = utils.img_crop(img, 160, 160)
    if img.shape != (160, 160, 3):
        continue
    train_hr[k, :, :, :] = img
    img = utils.img_downsize(img, 25)
    train_lr[k, :, :, :] = img
    k += 1

k = 0
for i in range(26):
    path = '../dataset/91-image/tt' + str(i + 1) + '.bmp'
    img = utils.img_read(path)
    img = utils.img_crop(img, 160, 160)
    if img.shape != (160, 160, 3):
        continue