Example #1
    def build_model(self):

        self.G = Generator(self.dim_neck, self.dim_emb, self.dim_pre,
                           self.freq)

        self.g_optimizer = torch.optim.Adam(self.G.parameters(), 0.0001)

        ## TODO: load checkpoint here

        # this is from make_metadata.py
        #c_checkpoint = torch.load('3000000-BL.ckpt')
        #new_state_dict = OrderedDict()
        #for key, val in c_checkpoint['model_b'].items():
        #    new_key = key[7:]
        #    new_state_dict[new_key] = val
        #C.load_state_dict(new_state_dict)

        # this is from waveglow
        # model.load_state_dict( checkpoint_dict['model'].state_dict() )
        # self.start_iter = checkpoint_dict['iteration']
        # self.g_optimizer.load_state_dict(checkpoint_dict['optimizer'])

        # this is from conversion.ipynb
        #g_checkpoint = torch.load('autovc.ckpt')
        #G.load_state_dict(g_checkpoint['model'])

        self.G.to(self.device)

        if self.resume:
            g_checkpoint = torch.load(self.resume)  # ,map_location='cuda:0'
            self.G.load_state_dict(g_checkpoint['model'])
            self.g_optimizer.load_state_dict(g_checkpoint['optimizer'])
            self.start_iter = g_checkpoint['iteration']
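# Hedged sketch (editorial addition, not from the original repo): how a training loop might
# write a checkpoint so the resume branch above can restore it. Only the key names
# ('model', 'optimizer', 'iteration') come from the loading code; `solver` and `i` are
# assumed names for the Solver instance and the current iteration counter.
def save_checkpoint(solver, i, path):
    torch.save({'model': solver.G.state_dict(),
                'optimizer': solver.g_optimizer.state_dict(),
                'iteration': i}, path)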
Example #2
    def build_model(self):

        self.G = Generator(self.dim_neck, self.dim_emb, self.dim_pre, self.freq)

        self.g_optimizer = torch.optim.Adam(self.G.parameters(), 0.0001)

        self.G.to(self.device)
Example #3
def load_ckpt_arch():
    G = Generator(dim_neck=dim_neck,
                  dim_emb=dim_emb,
                  dim_pre=dim_pre,
                  freq=freq,
                  speaker_num=speaker_num).eval().to(device)
    return G
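# Hedged usage sketch (editorial addition): restoring weights into the architecture returned
# by load_ckpt_arch(), following the loading pattern of Example #7 below; the checkpoint path
# is a placeholder.
def load_generator(ckpt_path='autovc.ckpt'):
    G = load_ckpt_arch()
    g_checkpoint = torch.load(ckpt_path, map_location=device)
    G.load_state_dict(g_checkpoint['model'])
    return G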
Example #4
    def build_model(self):

        self.G = Generator(self.dim_neck, self.dim_emb, self.dim_pre,
                           self.freq)
        self.g_optimizer = torch.optim.Adam(self.G.parameters(), 0.0001)
        """self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.g_optimizer, mode='min', 
                factor=0.5, patience=1000, threshold=0.0001, verbose=True)"""
        self.G.eval().to(self.device)
Example #5
    def build_model(self):

        self.G = Generator(self.dim_neck, self.dim_emb, self.dim_pre,
                           self.freq)
        #self.G = torch.load('checkpoints_wided_addnoise/autovc_450000.pt' ,map_location=torch.device('cuda'))

        self.g_optimizer = torch.optim.Adam(self.G.parameters(), 0.0001)

        self.G.to(self.device)
Example #6
def generateAudioGroup(original_audio, ref_audios, autovc_checkpoint='checkpoints_fully/autovc_700000.pt', vocoder_checkpoint="../checkpoint_step001000000_ema.pth"):

    mel_org = makeSpect(original_audio, None)

    def pad_seq(x, base=32):
        len_out = int(base * ceil(float(x.shape[0])/base))
        len_pad = len_out - x.shape[0]
        assert len_pad >= 0
        return np.pad(x, ((0,len_pad),(0,0)), 'constant'), len_pad

    device = 'cuda:0'
    G = Generator(32,256,512,32).eval().to(device)

    g_checkpoint = torch.load(autovc_checkpoint, map_location=torch.device('cuda'))
    
    G = g_checkpoint.eval()  # assumes the checkpoint file stores the whole Generator object rather than a state_dict

    x_org = mel_org
    x_org, len_pad = pad_seq(x_org)
    uttr_org = torch.FloatTensor(x_org[np.newaxis, :, :]).to(device)

    emb_org = get_verification_pytorch_1000(original_audio)
    emb_refs = []
    i = 0
    
    for file in os.listdir(ref_audios):
        i += 1
        print("{}/{}".format(i, len(os.listdir(ref_audios))))
    
        emb_ref = get_verification_pytorch_1000(ref_audios + file, 1)
        if emb_ref is not None: emb_refs.append(emb_ref)
        
   
    emb_refs = np.mean(emb_refs, axis=0)
    
    emb_org = torch.FloatTensor(emb_org).unsqueeze(0).cuda()
    emb_refs = torch.FloatTensor(emb_refs).unsqueeze(0).cuda()
    
    with torch.no_grad():
        _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_refs)

    if len_pad == 0:
        uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
    else:
        uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()


    device = torch.device("cuda")
    model = build_model().to(device)
    checkpoint = torch.load(vocoder_checkpoint, map_location=torch.device('cuda'))
    model.load_state_dict(checkpoint["state_dict"])

    waveform = wavegen(model, c=uttr_trg)   
    return waveform
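# Editorial worked example (not in the original file): pad_seq pads the time axis up to the
# next multiple of `base`, which is expected to match the Generator's freq argument.
import numpy as np
from math import ceil

def _pad_seq_demo(x, base=32):
    len_out = int(base * ceil(float(x.shape[0]) / base))
    len_pad = len_out - x.shape[0]
    return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad

_padded, _len_pad = _pad_seq_demo(np.zeros((90, 80)))  # 90 frames of an 80-bin mel
assert _padded.shape == (96, 80) and _len_pad == 6     # 90 is padded up to 96 (= 3 * 32)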
Example #7
def main():
    # init model
    device = 'cuda:0'
    G = Generator(dim_neck=dim_neck,
                  dim_emb=dim_emb,
                  dim_pre=dim_pre,
                  freq=freq,
                  speaker_num=speaker_num).eval().to(device)
    g_checkpoint = torch.load(ckpt_path)
    G.load_state_dict(g_checkpoint['model'])

    # init speaker name -> id
    speaker_id_dict = text2dict(speaker_id_dict_path)

    # p228/p228_077.npy|p228|p227
    with open(conversion_list_path, 'r') as f:
        tasks = [i.strip() for i in f.readlines()]

    spect_vc = []
    for task in tasks:
        task = task.split('|')
        assert len(task) == 3
        mel_path = task[0]
        s_name = task[1]
        t_name = task[2]

        # process from string -> data: mel, s, t
        mel = np.load(os.path.join(data_dir, mel_path))
        mel, len_pad = pad_seq(mel)
        s_id = speaker_id_dict[s_name]
        t_id = speaker_id_dict[t_name]

        # process from data -> batch tensor: mel, s, t
        mel = torch.from_numpy(mel[np.newaxis, :, :]).to(device)
        s_id = torch.from_numpy(np.asarray([s_id])).to(device)
        t_id = torch.from_numpy(np.asarray([t_id])).to(device)
        print('speaker model out----------', s_id.size())

        with torch.no_grad():
            _, x_identic_psnt, _ = G(mel, s_id, t_id)
            print('mel size:', x_identic_psnt.size())

        if len_pad == 0:
            # uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
            x_identic_psnt = x_identic_psnt[0, :, :].cpu().numpy()
        else:
            # uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()
            x_identic_psnt = x_identic_psnt[0, :-len_pad, :].cpu().numpy()

        spect_vc.append(('{}x{}'.format(s_name, t_name), x_identic_psnt))

    with open('results.pkl', 'wb') as handle:
        pickle.dump(spect_vc, handle)
Example #8
def generateAudio(original_audio, ref_audio, autovc_checkpoint, vocoder_checkpoint, english=False):

    mel_org = makeSpect(original_audio, None)

    def pad_seq(x, base=32):
        len_out = int(base * ceil(float(x.shape[0])/base))
        len_pad = len_out - x.shape[0]
        assert len_pad >= 0
        return np.pad(x, ((0,len_pad),(0,0)), 'constant'), len_pad

    device = 'cuda:0'
    G = Generator(32,256,512,32).eval().to(device)

    g_checkpoint = torch.load(autovc_checkpoint, map_location=torch.device('cuda'))
    
    G = g_checkpoint.eval()  # assumes the checkpoint file stores the whole Generator object rather than a state_dict

    x_org = mel_org
    x_org, len_pad = pad_seq(x_org)
    uttr_org = torch.FloatTensor(x_org[np.newaxis, :, :]).to(device)

    emb_org = get_verification_pytorch_1000(original_audio)
    
    if not english:
        emb_ref = get_verification_pytorch_1000(ref_audio)
    else:
        emb_ref = get_verification_eng(ref_audio)
        
    if emb_org is None or emb_ref is None: return None
   
    emb_org = torch.FloatTensor(emb_org).unsqueeze(0).cuda()
    if not english:
        emb_ref = torch.FloatTensor(emb_ref).unsqueeze(0).cuda()
    else:
        emb_ref = emb_ref.type(torch.cuda.FloatTensor)
    
    with torch.no_grad():
        _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_ref)

    if len_pad == 0:
        uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
    else:
        uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()


    device = torch.device("cuda")
    model = build_model().to(device)
    checkpoint = torch.load(vocoder_checkpoint, map_location=torch.device('cuda'))
    model.load_state_dict(checkpoint["state_dict"])

    waveform = wavegen(model, c=uttr_trg)   
    return waveform
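# Hypothetical usage sketch (editorial addition); the audio paths are placeholders and the
# checkpoint paths are copied from the defaults of generateAudioGroup in Example #6.
waveform = generateAudio('source.wav', 'reference.wav',
                         autovc_checkpoint='checkpoints_fully/autovc_700000.pt',
                         vocoder_checkpoint='../checkpoint_step001000000_ema.pth',
                         english=False)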
Example #9
    def __encode__(self, source, target):
        '''

        Produces result.pkl
        :param source: string of source filename
        :param target: string of target filename
        :return: None
        '''

        source = "audio/upload/p225.pkl"
        target = "audio/upload/p256.pkl"
        
        device = 'cuda:0'
        G = Generator(32, 256, 512, 32).eval() #.to(device)

        g_checkpoint = torch.load('autovc.ckpt', map_location=torch.device('cpu'))#, map_location='cuda:0')
        G.load_state_dict(g_checkpoint['model'])

        # load data
        source = pickle.load(open(source, "rb"))
        target = pickle.load(open(target, "rb"))

        metadata = [source, target]

        # do work
        spect_vc = []

        x_org = source[2]
        x_org, len_pad = self.__pad_seq__(x_org)
        uttr_org = torch.from_numpy(x_org[np.newaxis, :, :])#.to(device)
        emb_org = torch.from_numpy(source[1][np.newaxis, :])#.to(device)

        emb_trg = torch.from_numpy(target[1][np.newaxis, :])#.to(device)

        with torch.no_grad():
            _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)

        if len_pad == 0:
            uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
        else:
            uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

        spect_vc.append(('{}x{}'.format(source[0], target[0]), uttr_trg))

        # save the result
        with open('results.pkl', 'wb') as handle:
            pickle.dump(spect_vc, handle)

        return None
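    # Hedged sketch (editorial addition) of the __pad_seq__ helper used above; assumed to
    # mirror the module-level pad_seq of the other examples: pad the time axis up to a
    # multiple of 32.
    def __pad_seq__(self, x, base=32):
        len_pad = (-x.shape[0]) % base
        return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad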
Example #10
    def build_model(self):
        
        self.G = Generator(self.dim_neck, self.dim_emb, self.dim_pre, self.freq, self.speaker_num)

        if os.path.exists(self.logs_dir) is False:
            os.makedirs(self.logs_dir, exist_ok=True)

        if os.path.exists(self.logs_tensorboard_dir) is False:
            os.makedirs(self.logs_tensorboard_dir, exist_ok=True)


        self.writer = SummaryWriter(log_dir=self.logs_tensorboard_dir)
        
        self.g_optimizer = torch.optim.Adam(self.G.parameters(), 0.0001)
        
        self.G.to(self.device)
Example #11
def trace():
    generator = Generator(8, 256, 512, 4)
    if args.model:
        ckpt = torch.load(args.model)
        if ckpt:
            logging.info(f'loading generator ckpt {args.model}')
            generator.load_state_dict(ckpt['model_state_dict'])

    else:
        ckpts = sorted(list(Path(hp.generator_save_dir).glob('*.pt')))
        if len(ckpts) > 0:
            latest_ckpt_path = ckpts[-1]
            ckpt = torch.load(latest_ckpt_path)
            if ckpt:
                logging.info(f'loading generator ckpt {latest_ckpt_path}')
                generator.load_state_dict(ckpt['model_state_dict'])

    device = torch.device("cpu")
    generator.to(device=device)
    generator.eval()

    x1 = torch.ones(1, 298, 80)
    x2 = torch.ones(1, 256)
    x3 = torch.ones(1, 10, 256)

    # out = generator(x1, x2, x3)

    enc_x_1 = torch.ones(1, 320, 80)
    enc_x_2 = torch.ones(1, 256)
    # dec_x = torch.ones(1, 256, 32*2+256)
    post_x = torch.ones(1, 80, 298)
    # out = generator(x1, x2, x3)
    traced_postnet = torch.jit.trace(generator.postnet, (post_x))
    generator.postnet = traced_postnet
    sm = torch.jit.script(generator)  # scripting analyzes the source, so example inputs are not needed here
    print(sm.code)
    out = sm(x1, x2, x3)
    print(out.shape)
    print(out)
    sm.save('autovc_script_model.pt')
Example #12
def train():
    stcmds_ds = dataset.new_stcmds_dataset(
        root=hp.stcmds_data_root, mel_feature_root=hp.mel_feature_root)
    # aishell_ds = dataset.new_aishell_dataset(root=hp.aishell_data_root, mel_feature_root=hp.mel_feature_root)
    # aidatatang_ds = dataset.new_aidatatang_dataset(root=hp.aidatatang_data_root, mel_feature_root=hp.mel_feature_root)
    # primewords_ds = dataset.new_primewords_dataset(root=hp.primewords_data_root, mel_feature_root=hp.mel_feature_root)
    # toy_ds = dataset.new_toy_dataset(root=hp.toy_data_root, mel_feature_root=hp.mel_feature_root)

    # datasets = [stcmds_ds, aishell_ds, aidatatang_ds, primewords_ds]
    datasets = [stcmds_ds]
    # datasets = [toy_ds]
    mds = dataset.MultiAudioDataset(datasets)
    random.shuffle(mds.speakers)
    train_speakers = mds.speakers[:-40]
    eval_speakers = mds.speakers[-40:]

    ds = dataset.SpeakerDataset(
        train_speakers,
        utterances_per_speaker=hp.generator_utterances_per_speaker,
        seq_len=hp.generator_seq_len)
    loader = torch.utils.data.DataLoader(
        ds,
        batch_size=hp.generator_speakers_per_batch,
        shuffle=True,
        num_workers=6)

    eval_ds = dataset.SpeakerDataset(
        eval_speakers,
        utterances_per_speaker=hp.generator_utterances_per_speaker,
        seq_len=hp.generator_seq_len)
    eval_loader = torch.utils.data.DataLoader(
        eval_ds,
        batch_size=hp.generator_speakers_per_batch,
        shuffle=True,
        num_workers=6)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loss_device = torch.device("cpu")

    speaker_encoder = SpeakerEncoder(device, loss_device, 3)
    ckpts = sorted(list(Path(hp.save_dir).glob('*.pt')))
    if len(ckpts) > 0:
        latest_ckpt_path = ckpts[-1]
        ckpt = torch.load(latest_ckpt_path)
        if ckpt:
            logging.info(f'loading speaker encoder ckpt {latest_ckpt_path}')
            speaker_encoder.load_state_dict(ckpt['model_state_dict'])
        else:
            raise Exception('ckpt', 'no ckpts found')
    else:
        raise Exception('ckpt', 'no ckpts found')
    speaker_encoder.eval()

    # generator = Generator(32, 256, 512, 16) # train_speakers[:120] g5_ckpts_bak
    # generator = Generator(32, 256, 512, 16) # train_speakers[:800] g6_ckpts_bak
    # generator = Generator(32, 256, 512, 16) # train_speakers[:800] g12_ckpts_bak 3layers-speaker_encoder
    # generator = Generator(16, 256, 512, 16) # train_speakers[:800] g13_ckpts_bak 3layers-speaker_encoder
    # generator = Generator(24, 256, 512, 16) # train_speakers[:800] g14_ckpts_bak 3layers-speaker_encoder
    # generator = Generator(24, 256, 512, 16) # [stcmds_ds, aishell_ds, aidatatang_ds, primewords_ds] g15_ckpts_bak 3layers-speaker_encoder
    # use src emb from a different utterance
    # use variate seq_len (128, 256, ...)
    # generator = Generator(24, 256, 512, 16) # train_speakers[:800] g16_ckpts_bak 3layers-speaker_encoder var-seqlen (128train->256finetune) diff-emb
    # generator = Generator(8, 256, 512, 4) # train_speakers[:800] g17_ckpts_bak 3layers-speaker_encoder
    generator = Generator(
        8, 256, 512,
        4)  # train_speakers[:800] g18_ckpts_bak 3layers-speaker_encoder bs-16
    # large batch size
    # speaker code reconstruct
    # generator = Generator(32, 256, 512, 8) train_speakers[:120] g7
    # generator = Generator(32, 256, 512, 8) # train_speakers[:800] g11
    # generator = Generator(32, 256, 512, 2) [:120] g8
    # generator = Generator(32, 256, 512, 2) [:800] g9
    # generator = Generator(16, 256, 512, 2) [:800] # g10
    # generator = Generator(16, 256, 512, 2)
    generator.to(device=device)

    opt = torch.optim.Adam(generator.parameters(), lr=hp.generator_lr)
    total_steps = 0

    ckpts = sorted(list(Path(hp.generator_save_dir).glob('*.pt')))
    if len(ckpts) > 0:
        latest_ckpt_path = ckpts[-1]
        ckpt = torch.load(latest_ckpt_path)
        if ckpt:
            logging.info(f'loading generator ckpt {latest_ckpt_path}')
            generator.load_state_dict(ckpt['model_state_dict'])
            opt.load_state_dict(ckpt['optimizer_state_dict'])
            total_steps = ckpt['total_steps']

    if args.pretrained:
        ckpt = torch.load(args.pretrained)
        generator.load_state_dict(ckpt['model_state_dict'])
        logging.info(f'loaded pretrained model {args.pretrained}')

    while True:
        if total_steps >= hp.generator_train_steps:
            break

        for batch in loader:
            if total_steps >= hp.generator_train_steps:
                break

            for param_group in opt.param_groups:
                param_group['lr'] = hp.generator_get_lr(total_steps + 1)

            generator.train()

            batch = batch.cuda()
            n_speakers, n_utterances, freq_len, tempo_len = batch.shape
            data = batch.view(-1, freq_len, tempo_len)
            embeds = speaker_encoder(data.transpose(1, 2)).detach()
            embeds = embeds.view(n_speakers, n_utterances, -1)

            # assert batch.size(1) == 2
            src_mels = batch[:, 0, :, :]
            src_mels = src_mels.transpose(1, 2)
            # logging.info(f'src_mels.shape {src_mels.shape}')

            # assert embeds.size(1) == 2
            # src_embeds = embeds.mean(dim=1) # average the embeddings
            # Target embed from the same speaker as source embed in training phase,
            # and should be a different speaker in inference phase. Here the target
            # utterance is also different from the source utterance.
            src_embeds = embeds[:, 0, :]
            # logging.info(f'embeds.shape {src_embeds.shape} {tgt_embeds.shape}')

            init_out, final_out, content_out, code_exp = generator(
                src_mels, src_embeds, src_embeds.unsqueeze(1))
            # content_out2 = generator(batch[:, 1, :, :].transpose(1, 2), tgt_embeds, None)
            # logging.info(f'out shapes {init_out.shape} {final_out.shape} {content_out.shape}')

            # content_diff_loss = F.cosine_similarity(content_out.view(1, -1), content_out2.view(1, -1)).mean()

            loss, recon_loss, recon0_loss, content_recon_loss = generator.loss(
                src_mels, src_embeds, init_out, final_out, content_out)

            opt.zero_grad()
            # (loss + 0.3 * content_diff_loss).backward()
            loss.backward()
            opt.step()
            total_steps += 1

            if (total_steps + 1) % hp.generator_train_print_interval == 0:
                logging.info(
                    f'generator step {total_steps+1} loss {loss:.3f} ==> recon_loss {recon_loss:.3f} recon0_loss {recon0_loss:.3f} content_recon_loss {content_recon_loss:.5f}'
                )
            if (total_steps + 1) % hp.generator_evaluate_interval == 0:
                evaluate(generator, speaker_encoder, eval_loader)
            if (total_steps + 1) % hp.generator_save_interval == 0:
                if not Path(hp.generator_save_dir).exists():
                    Path(hp.generator_save_dir).mkdir()
                save_path = Path(
                    hp.generator_save_dir) / f'{total_steps+1:012d}.pt'
                logging.info(f'saving generator ckpt {save_path}')
                torch.save(
                    {
                        'model_state_dict': generator.state_dict(),
                        'optimizer_state_dict': opt.state_dict(),
                        'total_steps': total_steps
                    }, save_path)

                # remove old ckpts
                ckpts = sorted(list(Path(hp.generator_save_dir).glob('*.pt')))
                if len(ckpts) > hp.generator_max_ckpts:
                    for ckpt in ckpts[:-hp.generator_max_ckpts]:
                        Path(ckpt).unlink()
                        logging.info(f'ckpt {ckpt} removed')
            # if (total_steps+1) % hp.generator_bak_interval == 0:
            #     if not Path(hp.generator_bak_dir).exists():
            #         Path(hp.generator_bak_dir).mkdir()
            #     ckpts = sorted(list(Path(hp.generator_save_dir).glob('*.pt')))
            #     shutil.copy(ckpts[-1], hp.generator_bak_dir)
            #     logging.info(f'ckpt {ckpts[-1]} backuped')
            if (total_steps + 1) % hp.generator_sample_interval == 0:
                results = [
                    src_mels.detach().cpu().numpy(),
                    final_out.detach().cpu().numpy(),
                    content_out.detach().cpu().numpy(),
                    code_exp.detach().cpu().numpy(),
                ]
                with open('generator_samples.pkl', 'wb') as f:
                    pickle.dump(results, f)
                pass
Example #13
def inference():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loss_device = torch.device("cpu")

    speaker_encoder = SpeakerEncoder(device, loss_device, 3)
    ckpts = sorted(list(Path(hp.save_dir).glob('*.pt')))
    if len(ckpts) > 0:
        latest_ckpt_path = ckpts[-1]
        ckpt = torch.load(latest_ckpt_path)
        if ckpt:
            logging.info(f'loading speaker encoder ckpt {latest_ckpt_path}')
            speaker_encoder.load_state_dict(ckpt['model_state_dict'])
        else:
            raise Exception('ckpt', 'no ckpts found')
    else:
        raise Exception('ckpt', 'no ckpts found')

    generator = Generator(8, 256, 512, 4)
    ckpts = sorted(list(Path(hp.generator_save_dir).glob('*.pt')))
    if len(ckpts) > 0:
        latest_ckpt_path = ckpts[-1]
        ckpt = torch.load(latest_ckpt_path)
        if ckpt:
            logging.info(f'loading generator ckpt {latest_ckpt_path}')
            generator.load_state_dict(ckpt['model_state_dict'])

    generator.to(device=device)
    speaker_encoder.eval()
    generator.eval()

    # pad with zeros to the end of the time axis
    def pad_zeros(x):
        mul = math.ceil(float(x.shape[1]) / 32)
        pad_len = mul * 32 - x.shape[1]
        return np.pad(x, pad_width=((0, 0), (0, pad_len)), mode='constant')

    def pad_zeros_multi(xs):
        max_len = 0
        for x in xs:
            if x.shape[1] > max_len:
                max_len = x.shape[1]

        newxs = []
        for x in xs:
            mul = math.ceil(float(max_len) / 32)
            pad_len = mul * 32 - x.shape[1]
            newxs.append(
                np.pad(x, pad_width=((0, 0), (0, pad_len)), mode='constant'))

        return newxs

    stcmds_ds = dataset.new_stcmds_dataset(
        root=hp.stcmds_data_root, mel_feature_root=hp.mel_feature_root)
    datasets = [stcmds_ds]
    mds = dataset.MultiAudioDataset(datasets)
    random.shuffle(mds.speakers)
    speakers = mds.speakers

    # src_uttrn = speakers[1].random_utterances(1)[0]
    src_uttrn = dataset.Utterance(
        id=None,
        raw_file='/tmp/v1.wav'
        # raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00254I0026.wav',
    )

    src_mel = src_uttrn.melspectrogram()

    src_embed = speaker_encoder(
        torch.unsqueeze(torch.from_numpy(src_mel), 0).transpose(1, 2).cuda())
    # src_mel = pad_zeros(src_mel)
    src_mels = torch.unsqueeze(torch.from_numpy(src_mel),
                               0).transpose(1, 2).cuda()

    # 804 female sharp
    # 1 female soft
    # tgt_uttrns = speakers[1].random_utterances(10)
    # print(f'tgt raw file {tgt_uttrns[0].raw_file}')
    # tgt_uttrns = [dataset.Utterance(id=None, raw_file=f'/tmp/a{i}.wav') for i in range(1, 5)]

    tgt_uttrns = [
        dataset.Utterance(
            id=None,
            raw_file=
            '/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00254I0026.wav'
        ),
        dataset.Utterance(
            id=None,
            raw_file=
            '/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00254I0027.wav'
        ),
        dataset.Utterance(
            id=None,
            raw_file=
            '/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00254I0028.wav'
        ),
        dataset.Utterance(
            id=None,
            raw_file=
            '/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00254I0029.wav'
        ),
        dataset.Utterance(
            id=None,
            raw_file=
            '/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00254I0030.wav'
        ),

        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0030.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0031.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0032.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0033.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0034.wav'),

        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0025.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0026.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0027.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0028.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0029.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST_CMDS_holdout/20170001P00211I0107.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST_CMDS_holdout/20170001P00211I0060.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST_CMDS_holdout/20170001P00211I0061.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST_CMDS_holdout/20170001P00211I0062.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST_CMDS_holdout/20170001P00211I0063.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST_CMDS_holdout/20170001P00211I0064.wav'),
    ]
    tgt_mels = [tgt_uttrn.melspectrogram() for tgt_uttrn in tgt_uttrns]

    tgt_embeds = []
    for m in tgt_mels:
        tgt_embeds.append(
            speaker_encoder(
                torch.from_numpy(m).unsqueeze(0).transpose(1, 2).cuda()))
    tgt_embed = torch.cat(tgt_embeds, dim=0).unsqueeze(0)
    # tgt_embed = speaker_encoder(torch.from_numpy(np.array(tgt_mels)).transpose(1, 2).cuda()).mean(dim=0, keepdim=True) # S2

    print(f'src_mels {src_mels.shape}')
    print(f'src_embed {src_embed.shape}')
    print(f'tgt_embed {tgt_embed.shape}')

    init_out, out_mels, content_out, _ = generator(src_mels, src_embed,
                                                   tgt_embed)
    init_out2, out_mels2, content_out2, _ = generator(src_mels, src_embed,
                                                      src_embed.unsqueeze(1))

    # loss, recon_loss, recon0_loss, content_recon_loss = generator.loss(src_mels,
    #                 src_embed,
    #                 init_out,
    #                 out_mels,
    #                 content_out)

    # logging.info(f'inference loss {loss:.3f} recon_loss {recon_loss:.3f} recon0_loss {recon0_loss:.3f} content_recon_loss {content_recon_loss:.3f}')

    netG = model_vocoder.Generator(hp.num_mels, hp.vocoder_ngf,
                                   hp.vocoder_n_residual_layers).cuda()
    ckpts = sorted(list(Path(hp.vocoder_save_dir).glob('*.pt')))
    if len(ckpts) > 0:
        latest_ckpt_path = ckpts[-1]
        logging.info(f'loading vocoder ckpt {latest_ckpt_path}')
        ckpt = torch.load(latest_ckpt_path)
        netG.load_state_dict(ckpt['netG_state_dict'])
    S = out_mels.squeeze(1).transpose(1, 2)
    y_recon = netG(src_mels.transpose(1, 2))
    y_pred = netG(S)
    y_recon2 = netG(out_mels2.squeeze(1).transpose(1, 2))
    print(
        f'shapes out_mels {out_mels.shape}, S {S.shape}, y_pred {y_pred.shape}'
    )

    results = [
        src_mels.detach().cpu().numpy(),
        tgt_mels,
        out_mels.detach().cpu().numpy(),
        y_pred.detach().cpu().numpy(),
        y_recon.detach().cpu().numpy(),
        src_uttrn.raw(sr=hp.sample_rate),
        tgt_uttrns[0].raw(sr=hp.sample_rate),
        out_mels2.detach().cpu().numpy(),
        y_recon2.detach().cpu().numpy(),
    ]

    with open('generator_results.pkl', 'wb') as f:
        pickle.dump(results, f)
Example #14
    def build_model(self):

        if self.config.which_embs == 'vt-live' or self.config.which_embs == 'vt-avg':
            self.vte = Vt_Embedder(self.config, self.spmel_params)
            for param in self.vte.parameters():
                param.requires_grad = False
            self.vte_optimizer = torch.optim.Adam(self.vte.parameters(),
                                                  0.0001)
            self.vte_checkpoint = torch.load(self.config.emb_ckpt)
            new_state_dict = OrderedDict()
            for i, (key, val) in enumerate(
                    self.vte_checkpoint['model_state_dict'].items()):
                #            if key.startswith('class_layer'):
                #                continue
                new_state_dict[key] = val
            self.vte.load_state_dict(new_state_dict)
            for state in self.vte_optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.cuda(self.device)
            self.vte.to(self.device)
            self.vte.eval()
            self.avg_vt_embs = np.load(
                os.path.dirname(self.config.emb_ckpt) + '/averaged_embs.npy')

        elif self.config.which_embs == 'spkrid-live':
            # C is the speaker encoder; the config values match those in the paper
            self.C = D_VECTOR(dim_input=80, dim_cell=768,
                              dim_emb=256).eval().cuda()
            # load the pretrained speaker encoder checkpoint
            c_checkpoint = torch.load(
                '/homes/bdoc3/my_data/autovc_data/3000000-BL.ckpt')
            new_state_dict = OrderedDict()
            for key, val in c_checkpoint['model_b'].items():
                new_key = key[7:]
                new_state_dict[new_key] = val
            self.C.load_state_dict(new_state_dict)
            # freezes weights so they are unaffected by backprop
            for param in self.C.parameters():
                param.requires_grad = False
            self.C.to(self.device)

        self.G = Generator(self.config.dim_neck, self.config.dim_emb,
                           self.config.dim_pre, self.config.freq)
        self.g_optimizer = torch.optim.Adam(self.G.parameters(),
                                            self.config.adam_init)
        if self.config.ckpt_model != '':
            ckpt_path = os.path.join(
                '/homes/bdoc3/my_data/autovc_data/autoStc',
                self.config.ckpt_model)
            g_checkpoint = torch.load(ckpt_path)
            self.G.load_state_dict(g_checkpoint['model_state_dict'])
            self.g_optimizer.load_state_dict(
                g_checkpoint['optimizer_state_dict'])
            # fixes tensors on different devices error
            # https://github.com/pytorch/pytorch/issues/2830
            for state in self.g_optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.to(self.device)

            self.current_iter = g_checkpoint['iteration']
            tester = 2
        else:
            self.current_iter = 0
        self.G.to(self.device)
Example #15
    if args.multigpu:
        device = 'cuda:0'
    else:
        device = args.device

    experimentName = args.experiment_name
    save_dir = os.path.join(args.save_dir, experimentName)
    mkdir("logs/" + experimentName)
    mkdir(save_dir)
    G = Generator(hparams.dim_neck,
                  hparams.speaker_embedding_size,
                  512,
                  hparams.freq,
                  lr=1e-3,
                  is_train=True,
                  loss_content=args.loss_content,
                  discriminator=args.dis,
                  lambda_gan=args.lambda_gan,
                  multigpu=args.multigpu,
                  lambda_wavenet=args.lambda_wavenet,
                  test_path_source=args.test_path_A,
                  test_path_target=args.test_path_B,
                  args=args).to(device)

    G.optimize_parameters(dataloader,
                          args.epochs,
                          device,
                          experimentName=experimentName,
                          save_dir=save_dir,
                          save_freq=args.save_freq,
                          display_freq=args.display_freq,
                          load_model=args.load_model,
Example #16
import torch


def pad_seq(x, base=32):
    len_out = int(base * ceil(float(x.shape[0]) / base))
    len_pad = len_out - x.shape[0]
    assert len_pad >= 0
    return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad


device = 'cuda:0'

speaker_emb_dim = 19
base = 16  # padding base; this is set to match the last Generator argument (freq), I think

G = Generator(32, speaker_emb_dim, 512,
              16).eval().to(device)  # 2nd argument is the one-hot speaker embedding size

#g_checkpoint = torch.load('autovc.ckpt' ,map_location='cuda:0')

print('loading model')

g_checkpoint = torch.load('checkpoint/v4/chkpt_400000', map_location='cuda:0')
G.load_state_dict(g_checkpoint['model'])

# generate the metadata
#

print('gen metadata')

metadata = []
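# Hedged sketch (editorial addition): one plausible shape for the metadata entries, matching
# how metadata.pkl is indexed in Example #19 ([0]=speaker name, [1]=speaker embedding,
# [2]=mel-spectrogram or its filename). With the one-hot setup above, the embedding would be
# a length-19 one-hot vector; the speaker list here is a placeholder.
import numpy as np

for speaker_idx, (name, mel_path) in enumerate([('speaker_00', 'spmel/speaker_00_001.npy')]):
    onehot = np.zeros(speaker_emb_dim, dtype=np.float32)
    onehot[speaker_idx] = 1.0
    metadata.append((name, onehot, mel_path))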
Example #17
    parser.add_argument('--model')
    parser.add_argument('--parallel',
                        dest='parallel',
                        default=False,
                        action='store_true')
    args = parser.parse_args()

    device = "cuda:0"

    model_path = "../saved_models/"  # please change it to the trained models' path

    G = Generator(hparams.dim_neck,
                  hparams.speaker_embedding_size,
                  512,
                  hparams.freq,
                  is_train=False,
                  encoder_type="single",
                  discriminator=True,
                  use_lsgan=True,
                  train_wavenet=True).to(device)

    model_list = glob.glob(model_path + "*.pkl")
    name_list = [x.split('/')[-1].split('.')[0] for x in model_list]
    print(name_list)
    if args.model in name_list:
        print("Loading autovc model...", end='\t')
        load_model = "../saved_models/%s.pkl" % args.model
        d = torch.load(load_model)
        newdict = d.copy()
        for key, value in d.items():
            newkey = key
Example #18
    wavnet.load_state_dict(checkpoint["state_dict"])

    wav = load_wav(src_wav_path)
    emb = np.load(src_emb_path)
    emb_tgt = np.load(tgt_emb_path)

    mel = melspectrogram(wav)

    pad_len = math.ceil(mel.shape[1] / 32) * 32 - mel.shape[1]
    mel = np.pad(mel, ((0,0), (0, pad_len)), mode='constant')

    mel = torch.FloatTensor(mel)
    emb = torch.FloatTensor(emb)
    emb_tgt = torch.FloatTensor(emb_tgt)

    model = Generator(dim_neck, dim_emb, dim_pre, freq)

    checkpoint = torch.load(autovc_checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['model'])
    model.eval()

    x = mel.unsqueeze(0).transpose(2,1) 
    e = emb.unsqueeze(0)
    et = emb_tgt.unsqueeze(0)

    mel_outputs, mel_outputs_postnet, codes = model(x, e, et)
    mel_rec = mel_outputs_postnet.transpose(2,1).cpu().detach().numpy()[0]

    if pad_len > 0:
        mel_rec = mel_rec[:, :-pad_len]

    c = np.transpose(mel_rec, (1, 0))
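    # Hedged continuation sketch (editorial addition): synthesize audio from the converted
    # mel with the WaveNet vocoder loaded above, following the wavegen(...) call used in
    # Examples #6 and #8; whether this codebase exposes the same wavegen helper is assumed.
    # waveform = wavegen(wavnet, c=c)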
Example #19
import os
import pickle 
import torch
import numpy as np
from math import ceil
from model_vc import Generator


def pad_seq(x, base=32):
    len_out = int(base * ceil(float(x.shape[0])/base))
    len_pad = len_out - x.shape[0]
    assert len_pad >= 0
    return np.pad(x, ((0,len_pad),(0,0)), 'constant'), len_pad

device = 'cuda:0'
G = Generator(32,256,512,32).eval().to(device)

g_checkpoint = torch.load('autovc.ckpt', map_location=device)
G.load_state_dict(g_checkpoint['model'])
metadata = pickle.load(open('metadata.pkl', "rb"))

spmelDir = './spmel'
spect_vc = []

for sbmt_i in metadata:
    x_org = sbmt_i[2]
    if isinstance(x_org, str):
        x_org = np.load(os.path.join(spmelDir, x_org))
    
    x_org, len_pad = pad_seq(x_org)
    uttr_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
    emb_org = torch.from_numpy(sbmt_i[1][np.newaxis, :]).to(device)
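    # Hedged sketch (editorial addition) of the rest of the loop body, which is truncated
    # here; it follows the conversion pattern of Example #9: pair the source utterance with
    # each target speaker's embedding, run the generator, and crop the padding.
    for sbmt_j in metadata:
        emb_trg = torch.from_numpy(sbmt_j[1][np.newaxis, :]).to(device)
        with torch.no_grad():
            _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)
        if len_pad == 0:
            uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
        else:
            uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()
        spect_vc.append(('{}x{}'.format(sbmt_i[0], sbmt_j[0]), uttr_trg))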
Example #20
    librosa.output.write_wav(args.output_file, s2t_wav.astype(np.float32), hparams.sample_rate)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--wav_path')
    parser.add_argument('--model')
    parser.add_argument('--parallel', dest='parallel', default=False, action='store_true')
    parser.add_argument('--output_file')
    args = parser.parse_args()

    device = "cuda:0"

    model_path = args.model

    G = Generator(hparams.dim_neck, hparams.speaker_embedding_size, 512, hparams.freq, is_train=False,
                  discriminator=False).to(device)

    print("Loading autovc model...", end='\t')
    load_model = model_path
    d = torch.load(load_model)
    newdict = d.copy()
    for key, value in d.items():
        newkey = key
        if 'wavenet' in key:
            newdict[key.replace('wavenet', 'vocoder')] = newdict.pop(key)
            newkey = key.replace('wavenet', 'vocoder')
        if 'module' in key:
            newdict[newkey.replace('module.','',1)] = newdict.pop(newkey)
            newkey = newkey.replace('module.', '', 1)
        if newkey not in G.state_dict():
            #print(newkey)
Example #21
        train_data = json.load(f)

    with open(os.path.join(data_path, 'test_data.json'), 'r') as f:
        test_data = json.load(f)

    train_loader = torch.utils.data.DataLoader(
        AudiobookDataset(train_data),
        collate_fn=train_collate,
        batch_size=args.batch_size, shuffle=True, **kwargs)

    test_loader = torch.utils.data.DataLoader(
        AudiobookDataset(test_data),
        collate_fn=test_collate,
        batch_size=1, shuffle=False, **kwargs)

    model = Generator(hp.dim_neck, hp.dim_emb, hp.dim_pre, hp.freq).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    current_epoch = 0
    if args.checkpoint:
        current_epoch = load_checkpoint(args.checkpoint, model, device, optimizer)
    
    checkpoint_dir = 'checkpoints'
    os.makedirs(checkpoint_dir, exist_ok=True)

    for epoch in range(current_epoch + 1, args.epochs + 1):
        print(f'epoch {epoch}')
        train(args, model, device, train_loader, optimizer, epoch)

        if epoch % 10 == 0:
            test(model, device, test_loader, checkpoint_dir, epoch)
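# Hedged sketch (editorial addition) of the load_checkpoint helper referenced above; the key
# names are assumptions modeled on the checkpoint formats used in the other examples.
def load_checkpoint(path, model, device, optimizer):
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint.get('epoch', 0)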