예제 #1
0
def main():
    """Generate an image, and a video of its evolution, guided by CLIP.

    Reads the run configuration from get_args(), encodes whichever references
    were supplied (image, text, micro-detail text, text to subtract) with the
    CLIP model, then optimises the parameters returned by fft_image() for
    a.steps iterations. Every a.fstep-th iteration a frame is written to a
    temporary folder under a.out_dir. Afterwards the frames are joined into an
    mp4 with ffmpeg, the last frame is copied next to it, and the raw
    parameters are saved as well when a.save_pt is set.
    """
    import subprocess  # function-scope import: only the final ffmpeg call needs it

    a = get_args()

    # Whether a usable reference image was supplied. Resolved once here so the
    # training step does not query the filesystem on every iteration.
    use_img = a.in_img is not None and os.path.isfile(a.in_img)

    prev_enc = 0
    def train(i):
        """Run optimisation step `i`; save a frame on every a.fstep-th step."""
        # prev_enc belongs to main(); nonlocal keeps the state out of module globals
        nonlocal prev_enc
        loss = 0

        noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4], 1).cuda() if a.noise > 0 else None
        img_out = image_f(noise)

        micro = None if a.in_txt2 is None else False
        imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, norm_in, a.overscan, micro=micro)
        out_enc = model_clip.encode_image(imgs_sliced[-1])
        if a.diverse > 0: # penalise similarity between two separately sliced renders
            imgs_sliced = slice_imgs([image_f(noise)], a.samples, a.modsize, norm_in, a.overscan, micro=micro)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += a.diverse * torch.cosine_similarity(out_enc, out_enc2, dim=-1).mean()
            del out_enc2; torch.cuda.empty_cache()
        if use_img: # input image
            loss +=  sign * torch.cosine_similarity(img_enc, out_enc, dim=-1).mean()
        if a.in_txt is not None: # input text
            loss +=  sign * torch.cosine_similarity(txt_enc, out_enc, dim=-1).mean()
        if a.in_txt0 is not None: # subtract text
            loss += -sign * torch.cosine_similarity(txt_enc0, out_enc, dim=-1).mean()
        if a.sync > 0 and use_img: # image composition
            loss *= 1. + a.sync * (a.steps/(i+1) * ssim_loss(img_out, img_in) - 1)
        if a.in_txt2 is not None: # input text for micro details
            imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, norm_in, a.overscan, micro=True)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss +=  sign * torch.cosine_similarity(txt_enc2, out_enc2, dim=-1).mean()
            del out_enc2; torch.cuda.empty_cache()
        if a.expand > 0: # penalise similarity to the encoding of the previous step
            if i > 0:
                loss += a.expand * torch.cosine_similarity(out_enc, prev_enc, dim=-1).mean()
            prev_enc = out_enc.detach()

        del img_out, imgs_sliced, out_enc; torch.cuda.empty_cache()
        # loss is still the int 0 only if none of the branches above contributed
        assert not isinstance(loss, int), ' Loss not defined, check the inputs'

        if a.prog is True: # linear learning-rate ramp from lr0 to lr1 over the run
            lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
            for g in optimizer.param_groups:
                g['lr'] = lr_cur

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % a.fstep == 0:
            with torch.no_grad():
                img = image_f(contrast=a.contrast).cpu().numpy()[0]
            checkout(img, os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)), verbose=a.verbose)
            pbar.upd()

    # Load CLIP models
    model_clip, _ = clip.load(a.model)
    if a.verbose is True: print(' using model', a.model)
    # per-model multipliers that cut the sample count for the ResNet variants
    xmem = {'RN50':0.5, 'RN50x4':0.16, 'RN101':0.33}
    if 'RN' in a.model:
        a.samples = int(a.samples * xmem[a.model])

    if a.diverse > 0:
        a.samples = int(a.samples * 0.5)

    norm_in = torchvision.transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))

    # pieces of the output name, one per reference that was supplied
    out_name = []
    if use_img:
        if a.verbose is True: print(' ref image:', basename(a.in_img))
        img_in = torch.from_numpy(img_read(a.in_img)/255.).unsqueeze(0).permute(0,3,1,2).cuda()
        img_in = img_in[:,:3,:,:] # fix rgb channels
        in_sliced = slice_imgs([img_in], a.samples, a.modsize, transform=norm_in, overscan=a.overscan)[0]
        img_enc = model_clip.encode_image(in_sliced).detach().clone()
        if a.sync > 0:
            ssim_loss = ssim.SSIM(window_size = 11)
            img_in = F.interpolate(img_in, a.size).float()
        else:
            del img_in # the pixels are only needed for the composition (sync) loss
        del in_sliced; torch.cuda.empty_cache()
        out_name.append(basename(a.in_img).replace(' ', '_'))

    if a.in_txt is not None:
        if a.verbose is True: print(' ref text: ', basename(a.in_txt))
        if a.translate:
            translator = Translator()
            a.in_txt = translator.translate(a.in_txt, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt)
        tx = clip.tokenize(a.in_txt).cuda()
        txt_enc = model_clip.encode_text(tx).detach().clone()
        out_name.append(txt_clean(a.in_txt))

    if a.in_txt2 is not None:
        if a.verbose is True: print(' micro text:', basename(a.in_txt2))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt2 = translator.translate(a.in_txt2, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt2)
        tx2 = clip.tokenize(a.in_txt2).cuda()
        txt_enc2 = model_clip.encode_text(tx2).detach().clone()
        out_name.append(txt_clean(a.in_txt2))

    if a.in_txt0 is not None:
        if a.verbose is True: print(' subtract text:', basename(a.in_txt0))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt0 = translator.translate(a.in_txt0, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt0)
        tx0 = clip.tokenize(a.in_txt0).cuda()
        txt_enc0 = model_clip.encode_text(tx0).detach().clone()
        out_name.append('off-' + txt_clean(a.in_txt0))

    params, image_f = fft_image([1, 3, *a.size], resume=a.resume)
    image_f = to_valid_rgb(image_f)

    if a.prog is True:
        lr1 = a.lrate * 2
        lr0 = lr1 * 0.01
    else:
        lr0 = a.lrate
    optimizer = torch.optim.Adam(params, lr0)
    # the loss is minimised: -1 rewards similarity to the references, +1 (invert) penalises it
    sign = 1. if a.invert is True else -1.

    if a.verbose is True: print(' samples:', a.samples)
    out_name = '-'.join(out_name)
    out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
    tempdir = os.path.join(a.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    pbar = ProgressBar(a.steps // a.fstep)
    for i in range(a.steps):
        train(i)

    # Argument list instead of a shell string: no quoting problems with the
    # text-derived names, and os.path.join gives the right separator everywhere.
    video_cmd = ['ffmpeg', '-v', 'warning', '-y',
                 '-i', os.path.join(tempdir, '%04d.jpg'),
                 os.path.join(a.out_dir, out_name) + '.mp4']
    try:
        subprocess.run(video_cmd)
    except OSError as err:
        # a missing ffmpeg must not stop the remaining outputs from being saved
        print(' ffmpeg could not be started, video not saved:', err)
    shutil.copy(img_list(tempdir)[-1], os.path.join(a.out_dir, '%s-%d.jpg' % (out_name, a.steps)))
    if a.save_pt is True:
        torch.save(params, '%s.pt' % os.path.join(a.out_dir, out_name))
예제 #2
0
def main():
    """Generate an image, and a video of its evolution, guided by CLIP.

    Reads the run configuration from get_args(), encodes whichever references
    were supplied (text, micro-detail text, text to subtract, image), then
    optimises the parameters returned by fft_image() for a.steps iterations.
    Text is encoded either with CLIP or, when a.multilang is set, with the
    multilingual SentenceTransformer model. Every a.fstep-th iteration a frame
    is written to a temporary folder under a.out_dir. Afterwards the frames
    are joined into an mp4 with ffmpeg, the last frame is copied next to it,
    and the raw parameters are saved as well when a.save_pt is set.
    """
    import subprocess  # function-scope import: only the final ffmpeg call needs it

    a = get_args()

    # Whether a usable reference image was supplied. Resolved once here so the
    # training step does not query the filesystem on every iteration.
    use_img = a.in_img is not None and os.path.isfile(a.in_img)

    prev_enc = 0

    def train(i):
        """Run optimisation step `i`; save a frame on every a.fstep-th step."""
        # prev_enc belongs to main(); nonlocal keeps the state out of module globals
        nonlocal prev_enc
        loss = 0

        noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4],
                                      1).cuda() if a.noise > 0 else None
        img_out = image_f(noise)

        if a.sharp != 0:
            # mean absolute difference between neighbouring pixels along the
            # last (lx) and second-to-last (ly) axes of the output
            lx = torch.mean(
                torch.abs(img_out[0, :, :, 1:] - img_out[0, :, :, :-1]))
            ly = torch.mean(
                torch.abs(img_out[0, :, 1:, :] - img_out[0, :, :-1, :]))
            loss -= a.sharp * (ly + lx)

        micro = 1 - a.macro if a.in_txt2 is None else False
        imgs_sliced = slice_imgs([img_out],
                                 a.samples,
                                 a.modsize,
                                 trform_f,
                                 a.align,
                                 micro=micro)
        out_enc = model_clip.encode_image(imgs_sliced[-1])
        if a.diverse != 0:  # similarity between two separately sliced renders
            imgs_sliced = slice_imgs([image_f(noise)],
                                     a.samples,
                                     a.modsize,
                                     trform_f,
                                     a.align,
                                     micro=micro)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += a.diverse * torch.cosine_similarity(
                out_enc, out_enc2, dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if use_img:  # input image
            loss += sign * 0.5 * torch.cosine_similarity(
                img_enc, out_enc, dim=-1).mean()
        if a.in_txt is not None:  # input text
            loss += sign * torch.cosine_similarity(txt_enc, out_enc,
                                                   dim=-1).mean()
            if a.notext > 0:
                # opposite sign: counteracts similarity to the rendered text image
                loss -= sign * a.notext * torch.cosine_similarity(
                    txt_plot_enc, out_enc, dim=-1).mean()
        if a.in_txt0 is not None:  # subtract text
            loss += -sign * torch.cosine_similarity(txt_enc0, out_enc,
                                                    dim=-1).mean()
        if a.sync > 0 and use_img:  # image composition
            # NOTE(review): i runs up to a.steps while the scale here is
            # a.steps // a.fstep, so prog_sync turns negative once i exceeds
            # that value whenever a.fstep > 1 -- confirm this is intended.
            prog_sync = (a.steps // a.fstep - i) / (a.steps // a.fstep)
            loss += prog_sync * a.sync * sim_loss(F.interpolate(
                img_out, sim_size).float(),
                                                  img_in,
                                                  normalize=True).squeeze()
        if a.in_txt2 is not None:  # input text for micro details
            imgs_sliced = slice_imgs([img_out],
                                     a.samples,
                                     a.modsize,
                                     trform_f,
                                     a.align,
                                     micro=True)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += sign * torch.cosine_similarity(txt_enc2, out_enc2,
                                                   dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.expand > 0:  # penalise similarity to the previous step's encoding
            if i > 0:
                loss += a.expand * torch.cosine_similarity(
                    out_enc, prev_enc, dim=-1).mean()
            prev_enc = out_enc.detach()

        del img_out, imgs_sliced, out_enc
        torch.cuda.empty_cache()
        # loss is still the int 0 only if none of the branches above contributed
        assert not isinstance(loss, int), ' Loss not defined, check the inputs'

        if a.prog is True:  # linear learning-rate ramp from lr0 to lr1
            lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
            for g in optimizer.param_groups:
                g['lr'] = lr_cur

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % a.fstep == 0:
            with torch.no_grad():
                img = image_f(contrast=a.contrast).cpu().numpy()[0]
            if (a.sync > 0 and a.in_img is not None) or a.sharp != 0:
                img = img**1.3  # empirical tone mapping
            checkout(img,
                     os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)),
                     verbose=a.verbose)
            pbar.upd()

    # Load CLIP models
    model_clip, _ = clip.load(a.model)
    if a.verbose is True: print(' using model', a.model)
    # per-model multipliers that cut the sample count for the ResNet variants
    xmem = {'RN50': 0.5, 'RN50x4': 0.16, 'RN101': 0.33}
    if 'RN' in a.model:
        a.samples = int(a.samples * xmem[a.model])

    if a.multilang is True:
        model_lang = SentenceTransformer(
            'clip-ViT-B-32-multilingual-v1').cuda()

    def enc_text(txt):
        """Encode `txt` with the multilingual model or with CLIP; return a detached copy."""
        if a.multilang is True:
            emb = model_lang.encode([txt],
                                    convert_to_tensor=True,
                                    show_progress_bar=False)
        else:
            emb = model_clip.encode_text(clip.tokenize(txt).cuda())
        return emb.detach().clone()

    # each extra loss term scales the sample count down
    if a.diverse != 0:
        a.samples = int(a.samples * 0.5)
    if a.sync > 0:
        a.samples = int(a.samples * 0.5)

    if a.transform is True:
        trform_f = transforms.transforms_custom
        a.samples = int(a.samples * 0.95)
    else:
        trform_f = transforms.normalize()

    # pieces of the output name, one per reference that was supplied
    out_name = []
    if a.in_txt is not None:
        if a.verbose is True: print(' ref text: ', basename(a.in_txt))
        if a.translate:
            translator = Translator()
            a.in_txt = translator.translate(a.in_txt, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt)
        txt_enc = enc_text(a.in_txt)
        out_name.append(txt_clean(a.in_txt))

        if a.notext > 0:
            # encode an image of the prompt text itself (made by plot_text)
            txt_plot = torch.from_numpy(plot_text(a.in_txt, a.modsize) /
                                        255.).unsqueeze(0).permute(0, 3, 1,
                                                                   2).cuda()
            txt_plot_enc = model_clip.encode_image(txt_plot).detach().clone()

    if a.in_txt2 is not None:
        if a.verbose is True: print(' micro text:', basename(a.in_txt2))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt2 = translator.translate(a.in_txt2, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt2)
        txt_enc2 = enc_text(a.in_txt2)
        out_name.append(txt_clean(a.in_txt2))

    if a.in_txt0 is not None:
        if a.verbose is True: print(' subtract text:', basename(a.in_txt0))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt0 = translator.translate(a.in_txt0, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt0)
        txt_enc0 = enc_text(a.in_txt0)
        out_name.append('off-' + txt_clean(a.in_txt0))

    # all texts are encoded by now, the language model is not used again
    if a.multilang is True: del model_lang

    if use_img:
        if a.verbose is True: print(' ref image:', basename(a.in_img))
        img_in = torch.from_numpy(
            img_read(a.in_img) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
        img_in = img_in[:, :3, :, :]  # fix rgb channels
        in_sliced = slice_imgs([img_in],
                               a.samples,
                               a.modsize,
                               transforms.normalize(),
                               a.align,
                               micro=False)[0]
        img_enc = model_clip.encode_image(in_sliced).detach().clone()
        if a.sync > 0:
            sim_loss = lpips.LPIPS(net='vgg', verbose=False).cuda()
            # the composition loss is computed at half of the output size
            sim_size = [s // 2 for s in a.size]
            img_in = F.interpolate(img_in, sim_size).float()
        else:
            del img_in  # the pixels are only needed for the composition loss
        del in_sliced
        torch.cuda.empty_cache()
        out_name.append(basename(a.in_img).replace(' ', '_'))

    params, image_f = fft_image([1, 3, *a.size],
                                resume=a.resume,
                                decay_power=a.decay)
    image_f = to_valid_rgb(image_f, colors=a.colors)

    if a.prog is True:
        lr1 = a.lrate * 2
        lr0 = lr1 * 0.01
    else:
        lr0 = a.lrate
    optimizer = torch.optim.Adam(params, lr0)
    # the loss is minimised: -1 rewards similarity to the references,
    # +1 (invert) penalises it
    sign = 1. if a.invert is True else -1.

    if a.verbose is True: print(' samples:', a.samples)
    out_name = '-'.join(out_name)
    out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
    tempdir = os.path.join(a.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    pbar = ProgressBar(a.steps // a.fstep)
    for i in range(a.steps):
        train(i)

    # Argument list instead of a shell string: no quoting problems with the
    # text-derived names, and os.path.join gives the right separator everywhere.
    video_cmd = [
        'ffmpeg', '-v', 'warning', '-y',
        '-i', os.path.join(tempdir, '%04d.jpg'),
        os.path.join(a.out_dir, out_name) + '.mp4',
    ]
    try:
        subprocess.run(video_cmd)
    except OSError as err:
        # a missing ffmpeg must not stop the remaining outputs from being saved
        print(' ffmpeg could not be started, video not saved:', err)
    shutil.copy(
        img_list(tempdir)[-1],
        os.path.join(a.out_dir, '%s-%d.jpg' % (out_name, a.steps)))
    if a.save_pt is True:
        torch.save(params, '%s.pt' % os.path.join(a.out_dir, out_name))
예제 #3
0
        prob = sess.run(srGan.prob, feed_dict={x: x_train, train_mode: False})

        return prob


# Pre-allocated buffers for the high-resolution (160x160) and low-resolution
# (40x40) versions of the training and test images.
train_hr = np.zeros([65, 160, 160, 3])
train_lr = np.zeros([65, 40, 40, 3])
test_hr = np.zeros([26, 160, 160, 3])
test_lr = np.zeros([26, 40, 40, 3])

# Fill the training buffers. k counts the images actually stored, so images
# whose crop does not come out as 160x160x3 leave no gap in the arrays.
k = 0
for i in range(65):
    path = '../dataset/91-image/t%d.bmp' % (i + 1)
    img = utils.img_crop(utils.img_read(path), 160, 160)
    if img.shape == (160, 160, 3):
        train_hr[k] = img
        img = utils.img_downsize(img, 25)
        train_lr[k] = img
        k += 1

k = 0
for i in range(26):
    path = '../dataset/91-image/tt' + str(i + 1) + '.bmp'
    img = utils.img_read(path)
    img = utils.img_crop(img, 160, 160)
    if img.shape != (160, 160, 3):
        continue