# Example no. 1 (scraped snippet; score: 0)
    # NOTE(review): headless fragment of a larger training step —
    # `read_padded`, `song_padded`, `m`, `criterion`, `dis_high`, the
    # optimizers, `k`, `recorder`, `logger`, `iteration` and `args` are all
    # defined outside this view.
    fake_song_padded = read_padded
    
    # Run the generator blocks: the first block maps speech ("read") toward
    # singing ("song"); later blocks map the fake singing back toward speech
    # for the cycle loss. NOTE(review): with more than two blocks, every
    # later block re-reads the same `fake_song_padded`, so only the last
    # assignment to `fake_read_padded` survives — confirm this is intended.
    for (i,block) in enumerate(m):
        if i == 0:
            fake_song_padded = block(fake_song_padded)
        else:
            fake_read_padded = block(fake_song_padded)

    #song_padded_auto = auto(song_padded)
    # Supervised speech-to-singing reconstruction loss (weight 0.1).
    loss_sp2sing = 0.1 * criterion(fake_song_padded, song_padded)
    loss_sp2sing = loss_sp2sing.mean()
    
    # Cycle-consistency loss for the speech -> singing -> speech round trip.
    loss_cycle = 0.1 * criterion(fake_read_padded, read_padded).mean()
      
    
    # BEGAN adversarial losses; `k` is the BEGAN equilibrium control
    # variable, updated by `recorder` below.
    loss_gan, loss_dis, real_dloss, fake_dloss = BEGANLoss(dis_high, song_padded, fake_song_padded, k)#song_len_padded[...,:fake_song_padded.size(2)]
    # Step generator then discriminator. NOTE(review): the trailing `3` is
    # presumably a gradient-clip value — confirm against OptimStep.
    OptimStep([(m, opt, loss_gan +loss_sp2sing + loss_cycle, True),
        (dis_high, opt_dis, loss_dis, False)], 3) #(auto, opt_auto, loss_auto + 0.01*latent_loss, False),
    k, convergence = recorder(real_dloss, fake_dloss, update_k=True)
    
    # Periodic scalar logging.
    if iteration % 5 == 0:
        logger.log_training(iteration = iteration, loss_gan = loss_gan,loss_cycle = loss_cycle,
        loss_dis = loss_dis, loss_sing2sp = loss_sp2sing,
        k = k, convergence = convergence)
    
    # Periodic checkpointing of generator and discriminator.
    if (iteration % 50 == 0):

        save_checkpoint(m, opt, iteration, f'checkpoint/{args.checkpoint_path}/gen')
        save_checkpoint(dis_high, opt_dis, iteration, f'checkpoint/{args.checkpoint_path}/dis')
        
        idx = 0
# Example no. 2 (scraped snippet; score: 0)
    #mel = (mel-mean)/std
    # NOTE(review): headless fragment — `singing`, `speech_2x`, `m`, `hp`,
    # the optimizers, `k`, `recorder`, `logger`, `iteration` and `args` come
    # from the enclosing (unseen) scope.
    # Trim both spectrogram streams to their common time length.
    singing = singing[:,:,:min(speech_2x.size(2), singing.size(2))]
    speech_2x = speech_2x[:,:,:min(speech_2x.size(2), singing.size(2))]
    

    # Reflect-pad the time axis up to the next multiple of 8 (presumably
    # the generator's total stride — TODO confirm).
    singing = F.pad(singing,(0,(singing.size(2)//8+1)*8 - singing.size(2)), 'reflect')
    speech_2x = F.pad(speech_2x,(0,(speech_2x.size(2)//8+1)*8 - speech_2x.size(2)), 'reflect')
    
    # First generator block maps speech to fake singing; later blocks map
    # the fake singing back toward speech for the cycle loss.
    fake_singing = speech_2x
    for (i,block) in enumerate(m):
        if i == 0:
            fake_singing = block(fake_singing)
        else:
            fake_speech = block(fake_singing)
    if hp.loss == 'BEGAN':
        # Adversarial loss on the singing domain plus cycle consistency.
        loss_gan, loss_dis, real_dloss, fake_dloss = BEGANLoss(dis_high, singing, fake_singing, k)
        loss_cycle = criterion(speech_2x, fake_speech).mean()
        OptimStep([(m, opt, loss_gan + 0.2 * loss_cycle, True),
            (dis_high, opt_dis, loss_dis, False)], 3)
        
        # Update BEGAN's k and convergence measure.
        k, convergence = recorder(real_dloss, fake_dloss, update_k=True)
    
    # Periodic scalar logging.
    if iteration % 5 == 0:
        if hp.loss == "BEGAN":
            logger.log_training(iteration = iteration, loss_gan = loss_gan, 
            loss_dis = loss_dis, loss_cycle = loss_cycle, k = k, convergence = convergence)

    # Periodic checkpointing of generator and discriminator.
    if (iteration % 50 == 0):

        save_checkpoint(m, opt, iteration, f'checkpoint/{args.checkpoint_path}/gen')
        save_checkpoint(dis_high, opt_dis, iteration, f'checkpoint/{args.checkpoint_path}/dis')
# Example no. 3 (scraped snippet; score: 0)
def train_(args, model, opt, latent_loss_weight, criterion, loader, epochs,
           inf_iterator_test, logger, iteration):
    """BEGAN-adversarial training loop for a multi-scale VQ autoencoder.

    Each mel-spectrogram batch is block-shuffled and randomly time-stretched
    as augmentation, reconstructed at three resolutions, and additionally
    decoded from batch-permuted codebook indices to produce "fake" samples
    for the BEGAN discriminator.

    Args:
        args: namespace providing model/n_embed/channel/trainer fields used
            in the checkpoint paths.
        model: VQ autoencoder; must expose ``dec`` and ``index_to_decode``.
        opt: optimizer covering the full model.
        latent_loss_weight: scale applied to the VQ latent loss.
        criterion: per-scale reconstruction loss.
        loader: iterable yielding mel-spectrogram batches (B, mel, T).
        epochs: number of passes over ``loader``.
        inf_iterator_test: unused here; kept for interface compatibility.
        logger: object with ``log_training`` / ``log_validation`` / ``close``.
        iteration: starting global step, incremented once per batch.
    """
    # Fresh BEGAN discriminator over 80 mel bins, with its own optimizer.
    dis = NetD(80).cuda()
    opt_dis = optim.Adam(dis.parameters())
    gamma = 1.0  # BEGAN diversity ratio
    lambda_k = 0.01  # learning rate for the k control variable
    init_k = 0.0
    recorder = BEGANRecorder(lambda_k, init_k, gamma)
    k = recorder.k.item()
    # Separate optimizer that steps only the decoder on the GAN loss.
    opt_dec = optim.Adam(model.dec.parameters())
    for epoch in range(epochs):
        mse_sum = 0
        mse_n = 0

        for i, audio in enumerate(loader):

            audio = audio.cuda()
            # Normalize mels; the inverse ((a * 50 - 50) / 25) is applied
            # before vocoding in the validation branch below.
            audio = (audio * 25 + 50) / 50
            factor = 32

            time_step = audio.size(2)

            # Augmentation: split the time axis into blocks of `factor`
            # frames, shuffle their order, and time-stretch each block by a
            # random factor in [0.5, 2) using nearest-neighbour resampling.
            audio_shuffle = [[] for i in range(time_step // factor)]
            nums = [x for x in range(time_step // factor)]
            random.shuffle(nums)

            for i_n, n in enumerate(nums):
                sf = random.uniform(0.5, 2)
                audio_shuffle[n] = F.interpolate(audio[..., factor * n:factor *
                                                       (n + 1)],
                                                 scale_factor=sf,
                                                 mode='nearest')

            audio_shuffle = torch.cat(audio_shuffle, dim=2)

            audio = audio_shuffle  #F.interpolate(audio, scale_factor= audio_shuffle.size(2)/time_step)
            # Crop so the time length is a multiple of 16 (presumably the
            # model's total stride — TODO confirm).
            audio = audio[..., :audio.size(2) // 16 * 16]

            # Build a 3-level pyramid: each level halves the time axis and
            # keeps only the lower half of the mel channels.
            audio_middile = F.interpolate(audio, scale_factor=1 / 2)
            audio_middile = audio_middile[:, :audio_middile.size(1) // 2, :]

            audio_low = F.interpolate(audio_middile, scale_factor=1 / 2)
            audio_low = audio_low[:, :audio_low.size(1) // 2, :]

            audio_list = [audio_low, audio_middile, audio]

            out, latent_loss, index_list = model(audio)

            # Multi-scale reconstruction loss (low, middle, full).
            recon_loss = 0
            for num in range(3):
                recon_loss += criterion(out[num], audio_list[num])

            latent_loss = latent_loss.mean()

            #OptimStep([(model, opt,  recon_loss + latent_loss_weight*latent_loss , True)], 3)# True),

            #################################
            # BEGAN TRAINING PHASE          #
            #################################
            model.zero_grad()
            # Permute each level's code indices across the batch dimension
            # and decode them — the result serves as the "fake" sample for
            # the discriminator.
            index_list_ = []
            for l in index_list:
                idx = torch.randperm(l.size(0))
                index_list_ += [l[idx]]
            out_code = model.index_to_decode(index_list_)
            loss_gan, loss_dis, real_dloss, fake_dloss = BEGANLoss(
                dis, audio, out_code[-1], k)
            # Three coupled steps: full model on recon+latent, decoder only
            # on the (scaled) GAN loss, discriminator on its own loss.
            # NOTE(review): the trailing `3` is presumably a gradient-clip
            # value — confirm against OptimStep's signature.
            OptimStep([(model, opt,
                        recon_loss + latent_loss_weight * latent_loss, True),
                       (model.dec, opt_dec, 0.2 * (loss_gan), True),
                       (dis, opt_dis, loss_dis, False)], 3)

            # Update BEGAN's k and convergence measure from the two
            # discriminator loss components.
            k, convergence = recorder(real_dloss, fake_dloss, update_k=True)
            iteration += 1
            print(iteration)  # NOTE(review): debug leftover — consider removing or logging.
            model.zero_grad()

            if i % 5 == 0:
                logger.log_training(iteration=iteration,
                                    loss_gan=loss_gan,
                                    loss_dis=loss_dis,
                                    loss_recon=recon_loss,
                                    latent_loss=latent_loss,
                                    k=k,
                                    convergence=convergence)

            if i % 50 == 0:
                model.eval()
                # Stack original / reconstruction / code-shuffled sample,
                # de-normalize, and vocode to waveforms for logging.
                a = torch.stack([audio[0], out[-1][0], out_code[-1][0]], dim=0)
                a = (a * 50 - 50) / 25
                a = vocoder.inverse(a)
                a = a.detach().cpu().numpy()
                logger.log_validation(
                    iteration=iteration,
                    mel_ori=("image", plot_spectrogram_to_numpy(), audio[0]),
                    mel_recon=("image", plot_spectrogram_to_numpy(),
                               out[-1][0]),
                    mel_code=("image", plot_spectrogram_to_numpy(),
                              out_code[-1][0]),
                    audio_ori=("audio", 22050, a[0]),
                    audio_recon=("audio", 22050, a[1]),
                    audio_code=("audio", 22050, a[2]),
                )

                save_checkpoint(
                    model, opt, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/gen'
                )
                save_checkpoint(
                    dis, opt_dis, iteration,
                    f'checkpoint/{args.model}_n{args.n_embed}_ch{args.channel}_{args.trainer}/dis'
                )

                model.train()
                logger.close()  # NOTE(review): closing the logger every 50 batches looks unintended — verify.
# Example no. 4 (scraped snippet; score: 0)
    # NOTE(review): headless fragment — `voc`, `linear`, `accom`, `m`,
    # `hp`, the discriminators/optimizers, `k`, `k2`, `recorder`, `logger`
    # and `iteration` are defined outside this view.
    # Crop all three streams to a multiple of 16 frames.
    voc = voc[..., :voc.size(2) // 16 * 16]
    linear = linear[..., :linear.size(2) // 16 * 16]
    accom = accom[..., :accom.size(2) // 16 * 16]

    # Generator chain: predict accompaniment from the vocal spectrogram.
    fake_accom = voc

    print("fake_accom size:", fake_accom.size())
    for block in m:
        fake_accom = block(fake_accom)

    # Mix predicted accompaniment with vocals in the linear-magnitude
    # domain (inputs appear to be log10 spectrograms), then go back to the
    # log domain. NOTE(review): the /6 scaling constant is unexplained —
    # confirm its origin.
    fake_linear = (10**fake_accom + 10**voc) / 6
    fake_linear = torch.log10(torch.clamp(fake_linear, min=1e-5))

    if hp.loss == 'BEGAN':
        # Two adversarial games: mixture realism (dis_high) and
        # accompaniment realism (dis_accom), with separate k values.
        loss_gan, loss_dis, real_dloss, fake_dloss = BEGANLoss(
            dis_high, linear, fake_linear, k)
        loss_accom, loss_dis_accom, real_acloss, fake_acloss = BEGANLoss(
            dis_accom, accom, fake_accom, k2)

        OptimStep([(m, opt, loss_gan + loss_accom, True),
                   (dis_accom, opt_accom, loss_dis_accom, True),
                   (dis_high, opt_dis, loss_dis, False)], 3)

        # NOTE(review): both updates reuse the same `recorder`, so k and k2
        # share one control state — verify this is intended.
        k, convergence = recorder(real_dloss, fake_dloss, update_k=True)
        k2, convergence2 = recorder(real_acloss, fake_acloss, update_k=True)
    if iteration % 5 == 0:
        if hp.loss == "BEGAN":
            logger.log_training(iteration=iteration,
                                loss_gan=loss_gan,
                                loss_dis=loss_dis,
                                k=k,
    # NOTE(review): the call above is truncated by the scrape and the text
    # jumps into a different example here (a separator line is missing).
    # Trim real and fake speech to their common time length.
    speech = speech[:, :, :min(fake_speech.size(2), speech.size(2))]
    fake_speech = fake_speech[:, :, :min(fake_speech.size(2), speech.size(2))]

    # Reflect-pad the time axis up to the next multiple of 64.
    speech = F.pad(speech,
                   (0, (speech.size(2) // 64 + 1) * 64 - speech.size(2)),
                   'reflect')
    fake_speech = F.pad(
        fake_speech,
        (0, (fake_speech.size(2) // 64 + 1) * 64 - fake_speech.size(2)),
        'reflect')

    # Run the generator blocks over the fake speech.
    for block in m:
        fake_speech = block(fake_speech)

    if hp.loss == 'BEGAN':
        loss_gan, loss_dis, real_dloss, fake_dloss = BEGANLoss(
            dis_high, speech, fake_speech, k)

        OptimStep([(m, opt, loss_gan, True),
                   (dis_high, opt_dis, loss_dis, False)], 3)

        k, convergence = recorder(real_dloss, fake_dloss, update_k=True)

    # Periodic scalar logging.
    if iteration % 5 == 0:
        if hp.loss == "BEGAN":
            logger.log_training(iteration=iteration,
                                loss_gan=loss_gan,
                                loss_dis=loss_dis,
                                k=k,
                                convergence=convergence)

    # NOTE(review): fragment ends here — the checkpoint body is cut off.
    if (iteration % 50 == 0):
# Example no. 6 (scraped snippet; score: 0)
        f'checkpoint/{args.checkpoint_path}/dis', dis, opt_dis)
# NOTE(review): the line above is the tail of a truncated call (presumably
# a checkpoint load for the discriminator); `model`, `dis`, `criterion`,
# the optimizers, `k`, `recorder`, `latent_loss_weight`, `loader`,
# `logger` and `iteration` are defined outside this view.

# Main training loop: VQ reconstruction plus BEGAN adversarial loss on
# voice-converted outputs.
for epoch in range(800):
    mse_sum = 0
    mse_n = 0

    for i, audio in enumerate(loader):
        cluster_size = audio.size(1)
        audio = audio.cuda()

        # Forward pass yields reconstruction, a speaker-converted output,
        # content/speaker codes, the VQ latent loss, and indices.
        out, out_conversion, enc_content, spk, latent_loss, idx = model(audio)
        recon_loss = criterion(out, audio)
        latent_loss = latent_loss.mean()

        # BEGAN: the reconstruction plays "real", the conversion "fake".
        loss_gan, loss_dis, real_dloss, fake_dloss = BEGANLoss(
            dis, out, out_conversion,
            k)  #song_len_padded[...,:fake_song_padded.size(2)]

        OptimStep([(model, opt, 0.5 * (loss_gan) + recon_loss +
                    latent_loss_weight * latent_loss, True),
                   (dis, opt_dis, loss_dis, False)], 3)
        k, convergence = recorder(real_dloss, fake_dloss, update_k=True)

        # Running reconstruction-error statistics.
        mse_sum += recon_loss.item() * audio.shape[0]
        mse_n += audio.shape[0]

        if i % 5 == 0:
            logger.log_training(iteration=iteration,
                                loss_gan=loss_gan,
                                loss_dis=loss_dis,
                                loss_recon=recon_loss,
# Example no. 7 (scraped snippet; score: 0)
    # NOTE(review): headless fragment — `inf_iterator_tr_speech`, `m`,
    # `hp`, the discriminator/optimizers, `k`, `recorder`, `logger` and
    # `iteration` come from the enclosing (unseen) scope.
    mel = next(inf_iterator_tr_speech).cuda()

    #mel = (mel-mean)/std
    # Reflect-pad the time axis up to the next multiple of 64.
    mel = F.pad(mel, (0, (mel.size(2) // 64 + 1) * 64 - mel.size(2)),
                'reflect')

    # Sample standard-normal noise at 1/64 of the mel time resolution;
    # the generator blocks presumably upsample it back to mel size —
    # TODO confirm against the block definitions.
    bs, n_mel_channels, time = mel.size()
    z = torch.zeros((bs, hp.z_dim, int(time / 64))).normal_(0,
                                                            1).float().cuda()

    for block in m:
        z = block(z)

    if hp.loss == 'BEGAN':
        # Unconditional GAN: real mels vs. mels generated from noise.
        loss_gan, loss_dis, real_dloss, fake_dloss = BEGANLoss(
            dis_high, mel, z, k)

        OptimStep([(m, opt, loss_gan, True),
                   (dis_high, opt_dis, loss_dis, False)], 3)

        k, convergence = recorder(real_dloss, fake_dloss, update_k=True)

    # Periodic scalar logging.
    if iteration % 5 == 0:
        if hp.loss == "BEGAN":
            logger.log_training(iteration=iteration,
                                loss_gan=loss_gan,
                                loss_dis=loss_dis,
                                k=k,
                                convergence=convergence)

    # NOTE(review): fragment ends here — the checkpoint body is cut off.
    if (iteration % 50 == 0):