def build_model(self):
    self.G = Generator(self.dim_neck, self.dim_emb, self.dim_pre, self.freq)
    self.g_optimizer = torch.optim.Adam(self.G.parameters(), 0.0001)

    ## TODO: load checkpoint here
    # this is from make_metadata.py
    #c_checkpoint = torch.load('3000000-BL.ckpt')
    #new_state_dict = OrderedDict()
    #for key, val in c_checkpoint['model_b'].items():
    #    new_key = key[7:]
    #    new_state_dict[new_key] = val
    #C.load_state_dict(new_state_dict)

    # this is from waveglow
    # model.load_state_dict(checkpoint_dict['model'].state_dict())
    # self.start_iter = checkpoint_dict['iteration']
    # self.g_optimizer.load_state_dict(checkpoint_dict['optimizer'])

    # this is from conversion.ipynb
    #g_checkpoint = torch.load('autovc.ckpt')
    #G.load_state_dict(g_checkpoint['model'])

    self.G.to(self.device)

    if self.resume:
        g_checkpoint = torch.load(self.resume)  # , map_location='cuda:0'
        self.G.load_state_dict(g_checkpoint['model'])
        self.g_optimizer.load_state_dict(g_checkpoint['optimizer'])
        self.start_iter = g_checkpoint['iteration']
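# Hedged sketch (not part of the original file): the resume branch above expects a
# checkpoint dict with 'model', 'optimizer' and 'iteration' keys, so the matching save
# call in the training loop would look roughly like this; `checkpoint_path` and the
# step counter `i` are assumed names.
def save_checkpoint(self, checkpoint_path, i):
    torch.save({'model': self.G.state_dict(),
                'optimizer': self.g_optimizer.state_dict(),
                'iteration': i},
               checkpoint_path)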
def build_model(self):
    self.G = Generator(self.dim_neck, self.dim_emb, self.dim_pre, self.freq)
    self.g_optimizer = torch.optim.Adam(self.G.parameters(), 0.0001)
    self.G.to(self.device)
def load_ckpt_arch():
    G = Generator(dim_neck=dim_neck, dim_emb=dim_emb, dim_pre=dim_pre,
                  freq=freq, speaker_num=speaker_num).eval().to(device)
    return G
def build_model(self):
    self.G = Generator(self.dim_neck, self.dim_emb, self.dim_pre, self.freq)
    self.g_optimizer = torch.optim.Adam(self.G.parameters(), 0.0001)
    """self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        self.g_optimizer, mode='min', factor=0.5, patience=1000,
        threshold=0.0001, verbose=True)"""
    self.G.eval().to(self.device)
def build_model(self):
    self.G = Generator(self.dim_neck, self.dim_emb, self.dim_pre, self.freq)
    #self.G = torch.load('checkpoints_wided_addnoise/autovc_450000.pt', map_location=torch.device('cuda'))
    self.g_optimizer = torch.optim.Adam(self.G.parameters(), 0.0001)
    self.G.to(self.device)
def generateAudioGroup(original_audio, ref_audios,
                       autovc_checkpoint='checkpoints_fully/autovc_700000.pt',
                       vocoder_checkpoint="../checkpoint_step001000000_ema.pth"):
    mel_org = makeSpect(original_audio, None)

    def pad_seq(x, base=32):
        len_out = int(base * ceil(float(x.shape[0]) / base))
        len_pad = len_out - x.shape[0]
        assert len_pad >= 0
        return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad

    device = 'cuda:0'
    G = Generator(32, 256, 512, 32).eval().to(device)
    g_checkpoint = torch.load(autovc_checkpoint, map_location=torch.device('cuda'))
    # note: this checkpoint stores the whole pickled Generator, so it replaces the G built above
    G = g_checkpoint.eval()

    x_org = mel_org
    x_org, len_pad = pad_seq(x_org)
    uttr_org = torch.FloatTensor(x_org[np.newaxis, :, :]).to(device)

    emb_org = get_verification_pytorch_1000(original_audio)
    emb_refs = []
    i = 0
    for file in os.listdir(ref_audios):
        i += 1
        print("{}/{}".format(i, len(os.listdir(ref_audios))))
        emb_ref = get_verification_pytorch_1000(ref_audios + file, 1)
        if emb_ref is not None:
            emb_refs.append(emb_ref)
    emb_refs = np.mean(emb_refs, axis=0)

    emb_org = torch.FloatTensor(emb_org).unsqueeze(0).cuda()
    emb_refs = torch.FloatTensor(emb_refs).unsqueeze(0).cuda()

    with torch.no_grad():
        _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_refs)

    if len_pad == 0:
        uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
    else:
        uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

    device = torch.device("cuda")
    model = build_model().to(device)
    checkpoint = torch.load(vocoder_checkpoint, map_location=torch.device('cuda'))
    model.load_state_dict(checkpoint["state_dict"])

    waveform = wavegen(model, c=uttr_trg)
    return waveform
def main():
    # init model
    device = 'cuda:0'
    G = Generator(dim_neck=dim_neck, dim_emb=dim_emb, dim_pre=dim_pre,
                  freq=freq, speaker_num=speaker_num).eval().to(device)
    g_checkpoint = torch.load(ckpt_path)
    G.load_state_dict(g_checkpoint['model'])

    # init speaker name -> id
    speaker_id_dict = text2dict(speaker_id_dict_path)

    # p228/p228_077.npy|p228|p227
    f = open(conversion_list_path, 'r').readlines()
    tasks = [i.strip() for i in f]

    spect_vc = []
    for task in tasks:
        task = task.split('|')
        assert len(task) == 3
        mel_path = task[0]
        s_name = task[1]
        t_name = task[2]

        # process from string -> data: mel, s, t
        mel = np.load(os.path.join(data_dir, mel_path))
        mel, len_pad = pad_seq(mel)
        s_id = speaker_id_dict[s_name]
        t_id = speaker_id_dict[t_name]

        # process from data -> batch tensor: mel, s, t
        mel = torch.from_numpy(mel[np.newaxis, :, :]).to(device)
        s_id = torch.from_numpy(np.asarray([s_id])).to(device)
        t_id = torch.from_numpy(np.asarray([t_id])).to(device)
        print('speaker model out----------', s_id.size())

        with torch.no_grad():
            _, x_identic_psnt, _ = G(mel, s_id, t_id)
        print('mel size:', x_identic_psnt.size())

        if len_pad == 0:
            # uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
            x_identic_psnt = x_identic_psnt[0, :, :].cpu().numpy()
        else:
            # uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()
            x_identic_psnt = x_identic_psnt[0, :-len_pad, :].cpu().numpy()

        spect_vc.append(('{}x{}'.format(s_name, t_name), x_identic_psnt))

    with open('results.pkl', 'wb') as handle:
        pickle.dump(spect_vc, handle)
def generateAudio(original_audio, ref_audio, autovc_checkpoint, vocoder_checkpoint, english=False):
    mel_org = makeSpect(original_audio, None)

    def pad_seq(x, base=32):
        len_out = int(base * ceil(float(x.shape[0]) / base))
        len_pad = len_out - x.shape[0]
        assert len_pad >= 0
        return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad

    device = 'cuda:0'
    G = Generator(32, 256, 512, 32).eval().to(device)
    g_checkpoint = torch.load(autovc_checkpoint, map_location=torch.device('cuda'))
    # note: this checkpoint stores the whole pickled Generator, so it replaces the G built above
    G = g_checkpoint.eval()

    x_org = mel_org
    x_org, len_pad = pad_seq(x_org)
    uttr_org = torch.FloatTensor(x_org[np.newaxis, :, :]).to(device)

    emb_org = get_verification_pytorch_1000(original_audio)
    if not english:
        emb_ref = get_verification_pytorch_1000(ref_audio)
    else:
        emb_ref = get_verification_eng(ref_audio)
    if emb_org is None or emb_ref is None:
        return None

    emb_org = torch.FloatTensor(emb_org).unsqueeze(0).cuda()
    if not english:
        emb_ref = torch.FloatTensor(emb_ref).unsqueeze(0).cuda()
    else:
        emb_ref = emb_ref.type(torch.cuda.FloatTensor)

    with torch.no_grad():
        _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_ref)

    if len_pad == 0:
        uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
    else:
        uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

    device = torch.device("cuda")
    model = build_model().to(device)
    checkpoint = torch.load(vocoder_checkpoint, map_location=torch.device('cuda'))
    model.load_state_dict(checkpoint["state_dict"])

    waveform = wavegen(model, c=uttr_trg)
    return waveform
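# Hedged usage sketch (not in the original source): write the waveform returned by
# generateAudio to disk. The soundfile dependency, the file names and the 16 kHz rate
# are assumptions; check the vocoder's hparams for the actual sample rate.
if __name__ == '__main__':
    import soundfile as sf  # assumed dependency, not imported by the original code
    wav = generateAudio('source.wav', 'reference.wav',
                        'autovc.ckpt', '../checkpoint_step001000000_ema.pth')
    if wav is not None:
        sf.write('converted.wav', np.asarray(wav), 16000)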
def __encode__(self, source, target):
    '''
    Produces result.pkl
    :param source: string of source filename
    :param target: string of target filename
    :return: None
    '''
    source = "audio/upload/p225.pkl"
    target = "audio/upload/p256.pkl"

    device = 'cuda:0'
    G = Generator(32, 256, 512, 32).eval()  # .to(device)
    g_checkpoint = torch.load('autovc.ckpt', map_location=torch.device('cpu'))  # , map_location='cuda:0'
    G.load_state_dict(g_checkpoint['model'])

    # load data
    source = pickle.load(open(source, "rb"))
    target = pickle.load(open(target, "rb"))
    metadata = [source, target]

    # do work
    spect_vc = []
    x_org = source[2]
    x_org, len_pad = self.__pad_seq__(x_org)
    uttr_org = torch.from_numpy(x_org[np.newaxis, :, :])  # .to(device)
    emb_org = torch.from_numpy(source[1][np.newaxis, :])  # .to(device)
    emb_trg = torch.from_numpy(target[1][np.newaxis, :])  # .to(device)

    with torch.no_grad():
        _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_trg)

    if len_pad == 0:
        uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
    else:
        uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

    spect_vc.append(('{}x{}'.format(source[0], target[0]), uttr_trg))

    # save the result
    with open('results.pkl', 'wb') as handle:
        pickle.dump(spect_vc, handle)
    return None
def build_model(self):
    self.G = Generator(self.dim_neck, self.dim_emb, self.dim_pre, self.freq, self.speaker_num)

    if not os.path.exists(self.logs_dir):
        os.makedirs(self.logs_dir, exist_ok=True)
    if not os.path.exists(self.logs_tensorboard_dir):
        os.makedirs(self.logs_tensorboard_dir, exist_ok=True)
    self.writer = SummaryWriter(log_dir=self.logs_tensorboard_dir)

    self.g_optimizer = torch.optim.Adam(self.G.parameters(), 0.0001)
    self.G.to(self.device)
def trace():
    generator = Generator(8, 256, 512, 4)

    if args.model:
        ckpt = torch.load(args.model)
        if ckpt:
            logging.info(f'loading generator ckpt {args.model}')
            generator.load_state_dict(ckpt['model_state_dict'])
    else:
        ckpts = sorted(list(Path(hp.generator_save_dir).glob('*.pt')))
        if len(ckpts) > 0:
            latest_ckpt_path = ckpts[-1]
            ckpt = torch.load(latest_ckpt_path)
            if ckpt:
                logging.info(f'loading generator ckpt {latest_ckpt_path}')
                generator.load_state_dict(ckpt['model_state_dict'])

    device = torch.device("cpu")
    generator.to(device=device)
    generator.eval()

    x1 = torch.ones(1, 298, 80)
    x2 = torch.ones(1, 256)
    x3 = torch.ones(1, 10, 256)
    # out = generator(x1, x2, x3)

    enc_x_1 = torch.ones(1, 320, 80)
    enc_x_2 = torch.ones(1, 256)
    # dec_x = torch.ones(1, 256, 32*2+256)
    post_x = torch.ones(1, 80, 298)

    # out = generator(x1, x2, x3)
    traced_postnet = torch.jit.trace(generator.postnet, (post_x))
    generator.postnet = traced_postnet

    sm = torch.jit.script(generator, (x1, x2, x3))
    print(sm.code)
    out = sm(x1, x2, x3)
    print(out.shape)
    print(out)
    sm.save('autovc_script_model.pt')
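# Hedged usage sketch (not from the original file): the saved TorchScript module can be
# reloaded without the Python model definition; the dummy shapes mirror the trace inputs
# above and are assumptions about this particular Generator configuration.
def load_traced_model(path='autovc_script_model.pt'):
    sm = torch.jit.load(path, map_location='cpu')
    mel = torch.ones(1, 298, 80)      # (batch, frames, mel bins)
    src_emb = torch.ones(1, 256)      # source speaker embedding
    tgt_emb = torch.ones(1, 10, 256)  # target embeddings, per the trace inputs
    return sm(mel, src_emb, tgt_emb)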
def train():
    stcmds_ds = dataset.new_stcmds_dataset(root=hp.stcmds_data_root, mel_feature_root=hp.mel_feature_root)
    # aishell_ds = dataset.new_aishell_dataset(root=hp.aishell_data_root, mel_feature_root=hp.mel_feature_root)
    # aidatatang_ds = dataset.new_aidatatang_dataset(root=hp.aidatatang_data_root, mel_feature_root=hp.mel_feature_root)
    # primewords_ds = dataset.new_primewords_dataset(root=hp.primewords_data_root, mel_feature_root=hp.mel_feature_root)
    # toy_ds = dataset.new_toy_dataset(root=hp.toy_data_root, mel_feature_root=hp.mel_feature_root)
    # datasets = [stcmds_ds, aishell_ds, aidatatang_ds, primewords_ds]
    datasets = [stcmds_ds]
    # datasets = [toy_ds]
    mds = dataset.MultiAudioDataset(datasets)
    random.shuffle(mds.speakers)
    train_speakers = mds.speakers[:-40]
    eval_speakers = mds.speakers[-40:]

    ds = dataset.SpeakerDataset(
        train_speakers,
        utterances_per_speaker=hp.generator_utterances_per_speaker,
        seq_len=hp.generator_seq_len)
    loader = torch.utils.data.DataLoader(
        ds,
        batch_size=hp.generator_speakers_per_batch,
        shuffle=True,
        num_workers=6)

    eval_ds = dataset.SpeakerDataset(
        eval_speakers,
        utterances_per_speaker=hp.generator_utterances_per_speaker,
        seq_len=hp.generator_seq_len)
    eval_loader = torch.utils.data.DataLoader(
        eval_ds,
        batch_size=hp.generator_speakers_per_batch,
        shuffle=True,
        num_workers=6)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loss_device = torch.device("cpu")

    speaker_encoder = SpeakerEncoder(device, loss_device, 3)
    ckpts = sorted(list(Path(hp.save_dir).glob('*.pt')))
    if len(ckpts) > 0:
        latest_ckpt_path = ckpts[-1]
        ckpt = torch.load(latest_ckpt_path)
        if ckpt:
            logging.info(f'loading speaker encoder ckpt {latest_ckpt_path}')
            speaker_encoder.load_state_dict(ckpt['model_state_dict'])
        else:
            raise Exception('ckpt', 'no ckpts found')
    else:
        raise Exception('ckpt', 'no ckpts found')
    speaker_encoder.eval()

    # generator = Generator(32, 256, 512, 16)  # train_speakers[:120] g5_ckpts_bak
    # generator = Generator(32, 256, 512, 16)  # train_speakers[:800] g6_ckpts_bak
    # generator = Generator(32, 256, 512, 16)  # train_speakers[:800] g12_ckpts_bak 3layers-speaker_encoder
    # generator = Generator(16, 256, 512, 16)  # train_speakers[:800] g13_ckpts_bak 3layers-speaker_encoder
    # generator = Generator(24, 256, 512, 16)  # train_speakers[:800] g14_ckpts_bak 3layers-speaker_encoder
    # generator = Generator(24, 256, 512, 16)  # [stcmds_ds, aishell_ds, aidatatang_ds, primewords_ds] g15_ckpts_bak 3layers-speaker_encoder
    #                                          # use src emb from a different utterance
    #                                          # use variate seq_len (128, 256, ...)
    # generator = Generator(24, 256, 512, 16)  # train_speakers[:800] g16_ckpts_bak 3layers-speaker_encoder var-seqlen (128train->256finetune) diff-emb
    # generator = Generator(8, 256, 512, 4)    # train_speakers[:800] g17_ckpts_bak 3layers-speaker_encoder
    generator = Generator(8, 256, 512, 4)      # train_speakers[:800] g18_ckpts_bak 3layers-speaker_encoder bs-16
                                               # large batch size
                                               # speaker code reconstruct
    # generator = Generator(32, 256, 512, 8)   # train_speakers[:120] g7
    # generator = Generator(32, 256, 512, 8)   # train_speakers[:800] g11
    # generator = Generator(32, 256, 512, 2)   # [:120] g8
    # generator = Generator(32, 256, 512, 2)   # [:800] g9
    # generator = Generator(16, 256, 512, 2)   # [:800] g10
    # generator = Generator(16, 256, 512, 2)
    generator.to(device=device)

    opt = torch.optim.Adam(generator.parameters(), lr=hp.generator_lr)

    total_steps = 0
    ckpts = sorted(list(Path(hp.generator_save_dir).glob('*.pt')))
    if len(ckpts) > 0:
        latest_ckpt_path = ckpts[-1]
        ckpt = torch.load(latest_ckpt_path)
        if ckpt:
            logging.info(f'loading generator ckpt {latest_ckpt_path}')
            generator.load_state_dict(ckpt['model_state_dict'])
            opt.load_state_dict(ckpt['optimizer_state_dict'])
            total_steps = ckpt['total_steps']

    if args.pretrained:
        ckpt = torch.load(args.pretrained)
        generator.load_state_dict(ckpt['model_state_dict'])
        logging.info(f'loaded pretrained model {args.pretrained}')

    while True:
        if total_steps >= hp.generator_train_steps:
            break
        for batch in loader:
            if total_steps >= hp.generator_train_steps:
                break
            for param_group in opt.param_groups:
                param_group['lr'] = hp.generator_get_lr(total_steps + 1)

            generator.train()
            batch = batch.cuda()
            n_speakers, n_utterances, freq_len, tempo_len = batch.shape
            data = batch.view(-1, freq_len, tempo_len)
            embeds = speaker_encoder(data.transpose(1, 2)).detach()
            embeds = embeds.view(n_speakers, n_utterances, -1)

            # assert batch.size(1) == 2
            src_mels = batch[:, 0, :, :]
            src_mels = src_mels.transpose(1, 2)
            # logging.info(f'src_mels.shape {src_mels.shape}')
            # assert embeds.size(1) == 2
            # src_embeds = embeds.mean(dim=1)  # average the embeddings

            # Target embed from the same speaker as source embed in training phase,
            # and should be a different speaker in inference phase. Here the target
            # utterance is also different from the source utterance.
            src_embeds = embeds[:, 0, :]
            # logging.info(f'embeds.shape {src_embeds.shape} {tgt_embeds.shape}')

            init_out, final_out, content_out, code_exp = generator(
                src_mels, src_embeds, src_embeds.unsqueeze(1))
            # content_out2 = generator(batch[:, 1, :, :].transpose(1, 2), tgt_embeds, None)
            # logging.info(f'out shapes {init_out.shape} {final_out.shape} {content_out.shape}')
            # content_diff_loss = F.cosine_similarity(content_out.view(1, -1), content_out2.view(1, -1)).mean()

            loss, recon_loss, recon0_loss, content_recon_loss = generator.loss(
                src_mels, src_embeds, init_out, final_out, content_out)
            opt.zero_grad()
            # (loss + 0.3 * content_diff_loss).backward()
            loss.backward()
            opt.step()
            total_steps += 1

            if (total_steps + 1) % hp.generator_train_print_interval == 0:
                logging.info(
                    f'generator step {total_steps+1} loss {loss:.3f} ==> recon_loss {recon_loss:.3f} recon0_loss {recon0_loss:.3f} content_recon_loss {content_recon_loss:.5f}'
                )

            if (total_steps + 1) % hp.generator_evaluate_interval == 0:
                evaluate(generator, speaker_encoder, eval_loader)

            if (total_steps + 1) % hp.generator_save_interval == 0:
                if not Path(hp.generator_save_dir).exists():
                    Path(hp.generator_save_dir).mkdir()
                save_path = Path(hp.generator_save_dir) / f'{total_steps+1:012d}.pt'
                logging.info(f'saving generator ckpt {save_path}')
                torch.save(
                    {
                        'model_state_dict': generator.state_dict(),
                        'optimizer_state_dict': opt.state_dict(),
                        'total_steps': total_steps
                    }, save_path)

                # remove old ckpts
                ckpts = sorted(list(Path(hp.generator_save_dir).glob('*.pt')))
                if len(ckpts) > hp.generator_max_ckpts:
                    for ckpt in ckpts[:-hp.generator_max_ckpts]:
                        Path(ckpt).unlink()
                        logging.info(f'ckpt {ckpt} removed')

            # if (total_steps+1) % hp.generator_bak_interval == 0:
            #     if not Path(hp.generator_bak_dir).exists():
            #         Path(hp.generator_bak_dir).mkdir()
            #     ckpts = sorted(list(Path(hp.generator_save_dir).glob('*.pt')))
            #     shutil.copy(ckpts[-1], hp.generator_bak_dir)
            #     logging.info(f'ckpt {ckpts[-1]} backuped')

            if (total_steps + 1) % hp.generator_sample_interval == 0:
                results = [
                    src_mels.detach().cpu().numpy(),
                    final_out.detach().cpu().numpy(),
                    content_out.detach().cpu().numpy(),
                    code_exp.detach().cpu().numpy(),
                ]
                with open('generator_samples.pkl', 'wb') as f:
                    pickle.dump(results, f)
            pass
def inference():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loss_device = torch.device("cpu")

    speaker_encoder = SpeakerEncoder(device, loss_device, 3)
    ckpts = sorted(list(Path(hp.save_dir).glob('*.pt')))
    if len(ckpts) > 0:
        latest_ckpt_path = ckpts[-1]
        ckpt = torch.load(latest_ckpt_path)
        if ckpt:
            logging.info(f'loading speaker encoder ckpt {latest_ckpt_path}')
            speaker_encoder.load_state_dict(ckpt['model_state_dict'])
        else:
            raise Exception('ckpt', 'no ckpts found')
    else:
        raise Exception('ckpt', 'no ckpts found')

    generator = Generator(8, 256, 512, 4)
    ckpts = sorted(list(Path(hp.generator_save_dir).glob('*.pt')))
    if len(ckpts) > 0:
        latest_ckpt_path = ckpts[-1]
        ckpt = torch.load(latest_ckpt_path)
        if ckpt:
            logging.info(f'loading generator ckpt {latest_ckpt_path}')
            generator.load_state_dict(ckpt['model_state_dict'])
    generator.to(device=device)

    speaker_encoder.eval()
    generator.eval()

    # pad with zeros to the end of the time axis
    def pad_zeros(x):
        mul = math.ceil(float(x.shape[1]) / 32)
        pad_len = mul * 32 - x.shape[1]
        return np.pad(x, pad_width=((0, 0), (0, pad_len)), mode='constant')

    def pad_zeros_multi(xs):
        max_len = 0
        for x in xs:
            if x.shape[1] > max_len:
                max_len = x.shape[1]
        newxs = []
        for x in xs:
            mul = math.ceil(float(max_len) / 32)
            pad_len = mul * 32 - x.shape[1]
            newxs.append(np.pad(x, pad_width=((0, 0), (0, pad_len)), mode='constant'))
        return newxs

    stcmds_ds = dataset.new_stcmds_dataset(root=hp.stcmds_data_root, mel_feature_root=hp.mel_feature_root)
    datasets = [stcmds_ds]
    mds = dataset.MultiAudioDataset(datasets)
    random.shuffle(mds.speakers)
    speakers = mds.speakers

    # src_uttrn = speakers[1].random_utterances(1)[0]
    src_uttrn = dataset.Utterance(
        id=None,
        raw_file='/tmp/v1.wav'
        # raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00254I0026.wav',
    )
    src_mel = src_uttrn.melspectrogram()
    src_embed = speaker_encoder(
        torch.unsqueeze(torch.from_numpy(src_mel), 0).transpose(1, 2).cuda())
    # src_mel = pad_zeros(src_mel)
    src_mels = torch.unsqueeze(torch.from_numpy(src_mel), 0).transpose(1, 2).cuda()

    # 804 female sharp
    # 1 female soft
    # tgt_uttrns = speakers[1].random_utterances(10)
    # print(f'tgt raw file {tgt_uttrns[0].raw_file}')
    # tgt_uttrns = [dataset.Utterance(id=None, raw_file=f'/tmp/a{i}.wav') for i in range(1, 5)]
    tgt_uttrns = [
        dataset.Utterance(
            id=None,
            raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00254I0026.wav'),
        dataset.Utterance(
            id=None,
            raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00254I0027.wav'),
        dataset.Utterance(
            id=None,
            raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00254I0028.wav'),
        dataset.Utterance(
            id=None,
            raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00254I0029.wav'),
        dataset.Utterance(
            id=None,
            raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00254I0030.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0030.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0031.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0032.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0033.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0034.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0025.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0026.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0027.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0028.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST-CMDS-20170001_1-OS/20170001P00047I0029.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST_CMDS_holdout/20170001P00211I0107.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST_CMDS_holdout/20170001P00211I0060.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST_CMDS_holdout/20170001P00211I0061.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST_CMDS_holdout/20170001P00211I0062.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST_CMDS_holdout/20170001P00211I0063.wav'),
        # dataset.Utterance(id=None, raw_file='/mnt/ssd500/dataset/speech/ST_CMDS_holdout/20170001P00211I0064.wav'),
    ]
    tgt_mels = [tgt_uttrn.melspectrogram() for tgt_uttrn in tgt_uttrns]
    tgt_embeds = []
    for m in tgt_mels:
        tgt_embeds.append(
            speaker_encoder(torch.from_numpy(m).unsqueeze(0).transpose(1, 2).cuda()))
    tgt_embed = torch.cat(tgt_embeds, dim=0).unsqueeze(0)
    # tgt_embed = speaker_encoder(torch.from_numpy(np.array(tgt_mels)).transpose(1, 2).cuda()).mean(dim=0, keepdim=True)  # S2

    print(f'src_mels {src_mels.shape}')
    print(f'src_embed {src_embed.shape}')
    print(f'tgt_embed {tgt_embed.shape}')

    init_out, out_mels, content_out, _ = generator(src_mels, src_embed, tgt_embed)
    init_out2, out_mels2, content_out2, _ = generator(src_mels, src_embed, src_embed.unsqueeze(1))
    # loss, recon_loss, recon0_loss, content_recon_loss = generator.loss(src_mels,
    #                                                                    src_embed,
    #                                                                    init_out,
    #                                                                    out_mels,
    #                                                                    content_out)
    # logging.info(f'inference loss {loss:.3f} recon_loss {recon_loss:.3f} recon0_loss {recon0_loss:.3f} content_recon_loss {content_recon_loss:.3f}')

    netG = model_vocoder.Generator(hp.num_mels, hp.vocoder_ngf, hp.vocoder_n_residual_layers).cuda()
    ckpts = sorted(list(Path(hp.vocoder_save_dir).glob('*.pt')))
    if len(ckpts) > 0:
        latest_ckpt_path = ckpts[-1]
        logging.info(f'loading vocoder ckpt {latest_ckpt_path}')
        ckpt = torch.load(latest_ckpt_path)
        netG.load_state_dict(ckpt['netG_state_dict'])

    S = out_mels.squeeze(1).transpose(1, 2)
    y_recon = netG(src_mels.transpose(1, 2))
    y_pred = netG(S)
    y_recon2 = netG(out_mels2.squeeze(1).transpose(1, 2))
    print(f'shapes out_mels {out_mels.shape}, S {S.shape}, y_pred {y_pred.shape}')

    results = [
        src_mels.detach().cpu().numpy(),
        tgt_mels,
        out_mels.detach().cpu().numpy(),
        y_pred.detach().cpu().numpy(),
        y_recon.detach().cpu().numpy(),
        src_uttrn.raw(sr=hp.sample_rate),
        tgt_uttrns[0].raw(sr=hp.sample_rate),
        out_mels2.detach().cpu().numpy(),
        y_recon2.detach().cpu().numpy(),
    ]
    with open('generator_results.pkl', 'wb') as f:
        pickle.dump(results, f)
def build_model(self):
    if self.config.which_embs == 'vt-live' or self.config.which_embs == 'vt-avg':
        self.vte = Vt_Embedder(self.config, self.spmel_params)
        for param in self.vte.parameters():
            param.requires_grad = False
        self.vte_optimizer = torch.optim.Adam(self.vte.parameters(), 0.0001)

        self.vte_checkpoint = torch.load(self.config.emb_ckpt)
        new_state_dict = OrderedDict()
        for i, (key, val) in enumerate(self.vte_checkpoint['model_state_dict'].items()):
            # if key.startswith('class_layer'):
            #     continue
            new_state_dict[key] = val
        self.vte.load_state_dict(new_state_dict)
        for state in self.vte_optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.cuda(self.device)
        self.vte.to(self.device)
        self.vte.eval()
        self.avg_vt_embs = np.load(os.path.dirname(self.config.emb_ckpt) + '/averaged_embs.npy')

    elif self.config.which_embs == 'spkrid-live':
        # C is the speaker encoder. The config values match with the paper
        self.C = D_VECTOR(dim_input=80, dim_cell=768, dim_emb=256).eval().cuda()
        # Speaker encoder checkpoint things. Load up the pretrained checkpoint info
        c_checkpoint = torch.load('/homes/bdoc3/my_data/autovc_data/3000000-BL.ckpt')
        new_state_dict = OrderedDict()
        for key, val in c_checkpoint['model_b'].items():
            new_key = key[7:]
            new_state_dict[new_key] = val
        self.C.load_state_dict(new_state_dict)
        # freezes weights so they are unaffected by backprop
        for param in self.C.parameters():
            param.requires_grad = False
        self.C.to(self.device)

    self.G = Generator(self.config.dim_neck, self.config.dim_emb, self.config.dim_pre, self.config.freq)
    self.g_optimizer = torch.optim.Adam(self.G.parameters(), self.config.adam_init)

    if self.config.ckpt_model != '':
        ckpt_path = os.path.join('/homes/bdoc3/my_data/autovc_data/autoStc', self.config.ckpt_model)
        g_checkpoint = torch.load(ckpt_path)
        self.G.load_state_dict(g_checkpoint['model_state_dict'])
        self.g_optimizer.load_state_dict(g_checkpoint['optimizer_state_dict'])
        # fixes tensors on different devices error
        # https://github.com/pytorch/pytorch/issues/2830
        for state in self.g_optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(self.device)
        self.current_iter = g_checkpoint['iteration']
        tester = 2
    else:
        self.current_iter = 0

    self.G.to(self.device)
if args.multigpu:
    device = 'cuda:0'
else:
    device = args.device
experimentName = args.experiment_name
save_dir = os.path.join(args.save_dir, experimentName)
mkdir("logs/" + experimentName)
mkdir(save_dir)

G = Generator(hparams.dim_neck,
              hparams.speaker_embedding_size,
              512,
              hparams.freq,
              lr=1e-3,
              is_train=True,
              loss_content=args.loss_content,
              discriminator=args.dis,
              lambda_gan=args.lambda_gan,
              multigpu=args.multigpu,
              lambda_wavenet=args.lambda_wavenet,
              test_path_source=args.test_path_A,
              test_path_target=args.test_path_B,
              args=args).to(device)

G.optimize_parameters(dataloader, args.epochs, device,
                      experimentName=experimentName,
                      save_dir=save_dir,
                      save_freq=args.save_freq,
                      display_freq=args.display_freq,
                      load_model=args.load_model,
import torch
import numpy as np           # needed by pad_seq below
from math import ceil        # needed by pad_seq below
from model_vc import Generator  # assumed module path, matching the other snippets


def pad_seq(x, base=32):
    len_out = int(base * ceil(float(x.shape[0]) / base))
    len_pad = len_out - x.shape[0]
    assert len_pad >= 0
    return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad


device = 'cuda:0'
speaker_emb_dim = 19
base = 16  # this is set to the last number i think

G = Generator(32, speaker_emb_dim, 512, 16).eval().to(device)  # 2nd number is onehot
#g_checkpoint = torch.load('autovc.ckpt', map_location='cuda:0')
print('loading model')
g_checkpoint = torch.load('checkpoint/v4/chkpt_400000', map_location='cuda:0')
G.load_state_dict(g_checkpoint['model'])

# generate the metadata
# print('gen metadata')
metadata = []
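# Hedged sketch (not from the original script): with speaker_emb_dim = 19 the speaker
# "embedding" is a one-hot vector over the 19 training speakers; the helper name and
# index argument are illustrative only.
def onehot_speaker_emb(speaker_idx, num_speakers=19):
    emb = np.zeros(num_speakers, dtype=np.float32)
    emb[speaker_idx] = 1.0
    return emb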
parser.add_argument('--model')
parser.add_argument('--parallel', dest='parallel', default=False, action='store_true')
args = parser.parse_args()

device = "cuda:0"
model_path = "../saved_models/"  # please change it to the trained models' path
G = Generator(hparams.dim_neck,
              hparams.speaker_embedding_size,
              512,
              hparams.freq,
              is_train=False,
              encoder_type="single",
              discriminator=True,
              use_lsgan=True,
              train_wavenet=True).to(device)

model_list = glob.glob(model_path + "*.pkl")
name_list = [x.split('/')[-1].split('.')[0] for x in model_list]
print(name_list)
if args.model in name_list:
    print("Loading autovc model...", end='\t')
    load_model = "../saved_models/%s.pkl" % args.model
    d = torch.load(load_model)
    newdict = d.copy()
    for key, value in d.items():
        newkey = key
wavnet.load_state_dict(checkpoint["state_dict"])

wav = load_wav(src_wav_path)
emb = np.load(src_emb_path)
emb_tgt = np.load(tgt_emb_path)

mel = melspectrogram(wav)
pad_len = math.ceil(mel.shape[1] / 32) * 32 - mel.shape[1]
mel = np.pad(mel, ((0, 0), (0, pad_len)), mode='constant')

mel = torch.FloatTensor(mel)
emb = torch.FloatTensor(emb)
emb_tgt = torch.FloatTensor(emb_tgt)

model = Generator(dim_neck, dim_emb, dim_pre, freq)
checkpoint = torch.load(autovc_checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['model'])
model.eval()

x = mel.unsqueeze(0).transpose(2, 1)
e = emb.unsqueeze(0)
et = emb_tgt.unsqueeze(0)
mel_outputs, mel_outputs_postnet, codes = model(x, e, et)
mel_rec = mel_outputs_postnet.transpose(2, 1).cpu().detach().numpy()[0]
if pad_len > 0:
    mel_rec = mel_rec[:, :-pad_len]  # guard: slicing with :-0 would drop every frame
c = np.transpose(mel_rec, (1, 0))
import os
import pickle
import torch
import numpy as np
from math import ceil
from model_vc import Generator


def pad_seq(x, base=32):
    len_out = int(base * ceil(float(x.shape[0]) / base))
    len_pad = len_out - x.shape[0]
    assert len_pad >= 0
    return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad


device = 'cuda:0'
G = Generator(32, 256, 512, 32).eval().to(device)

g_checkpoint = torch.load('autovc.ckpt', map_location=device)
G.load_state_dict(g_checkpoint['model'])

metadata = pickle.load(open('metadata.pkl', "rb"))

spmelDir = './spmel'

spect_vc = []

for sbmt_i in metadata:
    x_org = sbmt_i[2]
    if isinstance(x_org, str):
        x_org = np.load(os.path.join(spmelDir, x_org))
    x_org, len_pad = pad_seq(x_org)
    uttr_org = torch.from_numpy(x_org[np.newaxis, :, :]).to(device)
    emb_org = torch.from_numpy(sbmt_i[1][np.newaxis, :]).to(device)
    librosa.output.write_wav(args.output_file, s2t_wav.astype(np.float32), hparams.sample_rate)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--wav_path')
    parser.add_argument('--model')
    parser.add_argument('--parallel', dest='parallel', default=False, action='store_true')
    parser.add_argument('--output_file')
    args = parser.parse_args()

    device = "cuda:0"
    model_path = args.model
    G = Generator(hparams.dim_neck,
                  hparams.speaker_embedding_size,
                  512,
                  hparams.freq,
                  is_train=False,
                  discriminator=False).to(device)

    print("Loading autovc model...", end='\t')
    load_model = model_path
    d = torch.load(load_model)
    newdict = d.copy()
    for key, value in d.items():
        newkey = key
        if 'wavenet' in key:
            newdict[key.replace('wavenet', 'vocoder')] = newdict.pop(key)
            newkey = key.replace('wavenet', 'vocoder')
        if 'module' in key:
            newdict[newkey.replace('module.', '', 1)] = newdict.pop(newkey)
            newkey = newkey.replace('module.', '', 1)
        if newkey not in G.state_dict():
            #print(newkey)
    train_data = json.load(f)
with open(os.path.join(data_path, 'test_data.json'), 'r') as f:
    test_data = json.load(f)

train_loader = torch.utils.data.DataLoader(
    AudiobookDataset(train_data),
    collate_fn=train_collate,
    batch_size=args.batch_size,
    shuffle=True,
    **kwargs)
test_loader = torch.utils.data.DataLoader(
    AudiobookDataset(test_data),
    collate_fn=test_collate,
    batch_size=1,
    shuffle=False,
    **kwargs)

model = Generator(hp.dim_neck, hp.dim_emb, hp.dim_pre, hp.freq).to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr)

current_epoch = 0
if args.checkpoint:
    current_epoch = load_checkpoint(args.checkpoint, model, device, optimizer)

checkpoint_dir = 'checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

for epoch in range(current_epoch + 1, args.epochs + 1):
    print(f'epoch {epoch}')
    train(args, model, device, train_loader, optimizer, epoch)
    if epoch % 10 == 0:
        test(model, device, test_loader, checkpoint_dir, epoch)
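# Hedged sketch (the helper itself is not shown in this snippet): a load_checkpoint
# compatible with the call above would restore model/optimizer state and return the
# epoch to resume from; the 'model'/'optimizer'/'epoch' key names are assumptions
# about the checkpoint layout.
def load_checkpoint(path, model, device, optimizer):
    ckpt = torch.load(path, map_location=device)
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optimizer'])
    return ckpt['epoch']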