def do_test_generate(self, paths, step, data_path, test_index, deterministic=False, use_half=False, verbose=False):
    k = step // 1000
    # Keep at most two test utterances per speaker.
    test_index = [x[:2] if len(x) > 0 else [] for x in test_index]
    dataset = env.MultispeakerDataset(test_index, data_path)
    loader = DataLoader(dataset, shuffle=False)
    data = [x for x in loader]
    n_points = len(data)
    # Scale signed 16-bit samples to floats in (-1, 1).
    gt = [(x[0].float() + 0.5) / (2**15 - 0.5) for speaker, x in data]
    # Zero-pad each utterance for the encoder's left context and the right context.
    extended = [np.concatenate([
        np.zeros(self.pad_left_encoder(), dtype=np.float32),
        x,
        np.zeros(self.pad_right(), dtype=np.float32)
    ]) for x in gt]
    speakers = [torch.FloatTensor(speaker[0].float()) for speaker, x in data]
    # Pad all utterances to the length of the longest so they can be batched.
    maxlen = max(len(x) for x in extended)
    aligned = [torch.cat([torch.FloatTensor(x), torch.zeros(maxlen - len(x))]) for x in extended]
    os.makedirs(paths.gen_path(), exist_ok=True)
    # Double the batch with the speaker order reversed: the first half reconstructs each
    # utterance with its own speaker, the second half transfers it to another speaker.
    out, _, _, _ = self.forward_generate(
        torch.stack(speakers + list(reversed(speakers)), dim=0).cuda(),
        torch.stack(aligned + aligned, dim=0).cuda(),
        verbose=verbose, use_half=use_half)
    logger.log(f'out: {out.size()}')
    for i, x in enumerate(gt):
        # Note: librosa.output.write_wav was removed in librosa 0.8; this code assumes librosa < 0.8.
        librosa.output.write_wav(f'{paths.gen_path()}/{k}k_steps_{i}_target.wav',
                                 x.cpu().numpy(), sr=sample_rate)
        audio = out[i][:len(x)].cpu().numpy()
        librosa.output.write_wav(f'{paths.gen_path()}/{k}k_steps_{i}_generated.wav',
                                 audio, sr=sample_rate)
        audio_tr = out[n_points + i][:len(x)].cpu().numpy()
        librosa.output.write_wav(f'{paths.gen_path()}/{k}k_steps_{i}_transferred.wav',
                                 audio_tr, sr=sample_rate)
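# Illustrative note on the speaker-swap batching in do_test_generate above: stacking
# speakers + list(reversed(speakers)) against aligned + aligned pairs each utterance
# first with its own speaker one-hot and then with the one-hot of utterance
# n_points - 1 - i, so the second half of `out` is a voice-transfer rendering.
# With three utterances:
#
#   speakers       = [s0, s1, s2]              # one one-hot per utterance
#   batch_speakers = [s0, s1, s2, s2, s1, s0]
#   batch_audio    = [a0, a1, a2, a0, a1, a2]
#   # out[0..2] -> reconstructions; out[3] -> a0 in s2's voice, etc.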
def do_generate(self, paths, step, data_path, test_index, deterministic=False, use_half=False, verbose=False):
    """Speech generation from the command line (not during testing)."""
    k = step // 1000
    # Keep at most ten test utterances per speaker.
    test_index = [x[:10] if len(x) > 0 else [] for x in test_index]
    # Drop the first three speakers' utterances so only the remaining test speaker(s) are generated.
    test_index[0] = []
    test_index[1] = []
    test_index[2] = []
    # test_index[3] = []
    dataset = env.MultispeakerDataset(test_index, data_path)
    loader = DataLoader(dataset, shuffle=False)
    data = [x for x in loader]
    n_points = len(data)
    gt = [(x[0].float() + 0.5) / (2**15 - 0.5) for speaker, x in data]
    extended = [np.concatenate([
        np.zeros(self.pad_left_encoder(), dtype=np.float32),
        x,
        np.zeros(self.pad_right(), dtype=np.float32)
    ]) for x in gt]
    speakers = [torch.FloatTensor(speaker[0].float()) for speaker, x in data]
    # One-hot conditioning vectors selecting speaker index 1 for voice conversion.
    # (np.float was removed in NumPy 1.24; use the builtin float instead.)
    vc_speakers = [torch.FloatTensor((np.arange(30) == 1).astype(float)) for _ in range(10)]
    # vc_speakers = [torch.FloatTensor((np.arange(30) == 14).astype(float)) for _ in range(20)]
    # vc_speakers = [torch.FloatTensor((np.arange(30) == 23).astype(float)) for _ in range(20)]
    # vc_speakers = [torch.FloatTensor((np.arange(30) == 4).astype(float)) for _ in range(20)]
    maxlen = max(len(x) for x in extended)
    aligned = [torch.cat([torch.FloatTensor(x), torch.zeros(maxlen - len(x))]) for x in extended]
    os.makedirs(paths.gen_dir(), exist_ok=True)
    # out = self.forward_generate(torch.stack(speakers + list(reversed(speakers)), dim=0).cuda(),
    #                             torch.stack(aligned + aligned, dim=0).cuda(),
    #                             verbose=verbose, use_half=use_half)
    out = self.forward_generate(
        torch.stack(vc_speakers, dim=0).cuda(),
        torch.stack(aligned, dim=0).cuda(),
        verbose=verbose, use_half=use_half)
    # for i, x in enumerate(gt):
    #     librosa.output.write_wav(f'{paths.gen_dir()}/{k}k_steps_{i}_target.wav', x.cpu().numpy(), sr=sample_rate)
    #     audio = out[i][:len(x)].cpu().numpy()
    #     librosa.output.write_wav(f'{paths.gen_dir()}/{k}k_steps_{i}_generated.wav', audio, sr=sample_rate)
    #     audio_tr = out[n_points + i][:len(x)].cpu().numpy()
    #     librosa.output.write_wav(f'{paths.gen_dir()}/{k}k_steps_{i}_transferred.wav', audio_tr, sr=sample_rate)
    for i, x in enumerate(gt):
        # librosa.output.write_wav(f'{paths.gen_dir()}/gsb_{i + 1:04d}.wav', x.cpu().numpy(), sr=sample_rate)
        # librosa.output.write_wav(f'{paths.gen_dir()}/gt_gsb_{i + 1:03d}.wav', x.cpu().numpy(), sr=sample_rate)
        # audio = out[i][:len(x)].cpu().numpy()
        # librosa.output.write_wav(f'{paths.gen_dir()}/{k}k_steps_{i}_generated.wav', audio, sr=sample_rate)
        # audio_tr = out[n_points + i][:len(x)].cpu().numpy()
        # Keep the encoder's left padding plus the utterance's length worth of samples.
        audio_tr = out[i][:self.pad_left_encoder() + len(x)].cpu().numpy()
        # librosa.output.write_wav(f'{paths.gen_dir()}/{k}k_steps_{i}_transferred.wav', audio_tr, sr=sample_rate)
        librosa.output.write_wav(f'{paths.gen_dir()}/gsb_{i + 1:04d}.wav', audio_tr, sr=sample_rate)
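# Illustrative note: (np.arange(30) == idx) builds a length-30 one-hot speaker vector
# with a single True at position idx. For idx = 1 (the "speaker 2" case above):
#
#   (np.arange(30) == 1).astype(float)
#   # -> array([0., 1., 0., 0., ..., 0.])   (1.0 at index 1, zeros elsewhere)
#
# Swapping in another index (see the commented vc_speakers alternatives above)
# converts the same source utterances to a different target voice.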
def do_test(self, writer, epoch, step, data_path, test_index):
    dataset = env.MultispeakerDataset(test_index, data_path)
    criterion = nn.NLLLoss().cuda()
    # k = 0
    # saved_k = 0
    pad_left = self.pad_left()
    pad_left_encoder = self.pad_left_encoder()
    pad_left_decoder = self.pad_left_decoder()
    extra_pad_right = 0
    pad_right = self.pad_right() + extra_pad_right
    window = 16 * self.total_scale()
    test_loader = DataLoader(
        dataset,
        collate_fn=lambda batch: env.collate_multispeaker_samples(pad_left, window, pad_right, batch),
        batch_size=16, num_workers=2, shuffle=False, pin_memory=True)
    running_loss_c = 0.
    running_loss_f = 0.
    running_loss_vq = 0.
    running_loss_vqc = 0.
    running_entropy = 0.
    running_max_grad = 0.
    running_max_grad_name = ""
    for i, (speaker, wave16) in enumerate(test_loader):
        speaker = speaker.cuda()
        wave16 = wave16.cuda()
        # Split each signed 16-bit sample into a coarse (high) and fine (low) 8-bit part.
        coarse = (wave16 + 2**15) // 256
        fine = (wave16 + 2**15) % 256
        # Scale both parts to floats in [-1, 1].
        coarse_f = coarse.float() / 127.5 - 1.
        fine_f = fine.float() / 127.5 - 1.
        total_f = (wave16.float() + 0.5) / 32767.5
        noisy_f = total_f
        # Decoder input: coarse and fine at time t plus coarse at time t + 1.
        x = torch.cat([
            coarse_f[:, pad_left - pad_left_decoder:-pad_right].unsqueeze(-1),
            fine_f[:, pad_left - pad_left_decoder:-pad_right].unsqueeze(-1),
            coarse_f[:, pad_left - pad_left_decoder + 1:1 - pad_right].unsqueeze(-1),
        ], dim=2)
        y_coarse = coarse[:, pad_left + 1:1 - pad_right]
        y_fine = fine[:, pad_left + 1:1 - pad_right]
        translated = noisy_f[:, pad_left - pad_left_encoder:]
        p_cf, vq_pen, encoder_pen, entropy = self(speaker, x, translated)
        p_c, p_f = p_cf
        loss_c = criterion(p_c.transpose(1, 2).float(), y_coarse)
        loss_f = criterion(p_f.transpose(1, 2).float(), y_fine)
        # encoder_weight = 0.01 * min(1, max(0.1, step / 1000 - 1))
        # loss = loss_c + loss_f + vq_pen + encoder_weight * encoder_pen
        running_loss_c += loss_c.item()
        running_loss_f += loss_f.item()
        running_loss_vq += vq_pen.item()
        running_loss_vqc += encoder_pen.item()
        running_entropy += entropy
    avg_loss_c = running_loss_c / (i + 1)
    avg_loss_f = running_loss_f / (i + 1)
    avg_loss_vq = running_loss_vq / (i + 1)
    avg_loss_vqc = running_loss_vqc / (i + 1)
    avg_entropy = running_entropy / (i + 1)
    k = step // 1000
    # TensorBoard writer
    writer.add_scalars('Test/loss_group', {
        'loss_c': avg_loss_c,
        'loss_f': avg_loss_f,
        'vq': avg_loss_vq,
        'vqc': avg_loss_vqc,
        'entropy': avg_entropy,
    }, step - 1)
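# Worked example (comments only) of the coarse/fine split in do_test above: a signed
# 16-bit sample is shifted to unsigned [0, 65535]; the high byte becomes the coarse
# class and the low byte the fine class, and the sample round-trips exactly:
#
#   wave16 = -12345
#   u = wave16 + 2**15                 # 20423
#   coarse, fine = u // 256, u % 256   # 79, 199
#   assert coarse * 256 + fine - 2**15 == wave16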
        index = pickle.load(f)
    # len(index) should equal the total number of speakers in the dataset.
    logger.log(f"len of vctk index pkl object is {len(index)}")
    # logger.log(f"index.pkl file --- index[:5] {index[:5]}")
    # logger.log(f"index.pkl file --- index[0][:5] {index[0][:5]}")
    # Take the first args.test_utts_per_speaker utts from each of the first
    # args.test_speakers speakers as test data.
    test_index = [
        x[:args.test_utts_per_speaker] if i < args.test_speakers else []
        for i, x in enumerate(index)
    ]
    # The rest of each speaker's utts are training data.
    train_index = [
        x[args.test_utts_per_speaker:] if i < args.test_speakers else x
        for i, x in enumerate(index)
    ]
    dataset = env.MultispeakerDataset(train_index, data_path)
elif dataset_type == 'single':
    data_path = config.single_speaker_data_path
    with open(f'{data_path}/dataset_ids.pkl', 'rb') as f:
        index = pickle.load(f)
    test_index = index[-args.test_speakers:] + index[:args.test_speakers]
    train_index = index[:-args.test_speakers]
    dataset = env.AudiobookDataset(train_index, data_path)
else:
    raise RuntimeError('bad dataset type')

print(f'dataset size: {len(dataset)}')

model = model_fn(dataset)

if use_half:
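# Worked example of the multi-speaker split above (values illustrative): with
# args.test_speakers = 2 and args.test_utts_per_speaker = 2,
#
#   index       = [['a0', 'a1', 'a2'], ['b0', 'b1', 'b2'], ['c0', 'c1']]
#   test_index  = [['a0', 'a1'],       ['b0', 'b1'],       []]
#   train_index = [['a2'],             ['b2'],             ['c0', 'c1']]
#
# Speakers beyond the first test_speakers contribute all utts to training and keep an
# empty test slot, so the speaker one-hot dimension is the same in both splits.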
def do_generate(self, paths, data_path, index, test_speakers, test_utts_per_speaker,
                use_half=False, verbose=False, only_discrete=False):
    # Set the speaker to generate for each utterance.
    # speaker_id = 1  # the speaker id to condition the model on for generation. TODO: make this a CLA?

    # Select the utts to generate from 'index' ('index' contains ALL utts in the dataset).
    test_index = []
    for i, x in enumerate(index):
        if test_speakers == 0 or i < test_speakers:
            if test_utts_per_speaker == 0:
                # If test_utts_per_speaker is 0, use ALL utts for the speaker.
                test_index.append(x)
            else:
                test_index.append(x[:test_utts_per_speaker])
        else:
            # Append an empty list so the speaker one-hots are created with the correct dimension.
            test_index.append([])
    # test_index = [x[:test_utts_per_speaker] if len(x) > 0 else [] for i, x in enumerate(test_index)]
    # logger.log('second:')
    # logger.log(test_index)

    # Make the containing directories.
    os.makedirs(f'{paths.gen_path()}embeddings', exist_ok=True)
    os.makedirs(f'{paths.gen_path()}vqvae_tokens', exist_ok=True)
    os.makedirs(f'{paths.gen_path()}decoder_input_vectors', exist_ok=True)

    # Save the embedding matrix (codebook) to disk for plotting and analysis.
    torch.save(self.vq.embedding0.clone().detach(),
               f'{paths.gen_path()}embeddings/vqvae_codebook.pt')

    dataset = env.MultispeakerDataset(test_index, data_path, return_filename=True)
    loader = DataLoader(dataset, batch_size=1, shuffle=False)

    for speaker, x, filename in loader:
        # NB: the body of this loop is only designed for batch_size == 1 for now.
        print("speaker.size()", speaker.size())
        print("x.size()", x.size())
        print("filename", filename)
        # data = [x for x in loader]
        # logger.log("data:")
        # logger.log(f"len(data) = {len(data)}")
        # logger.log(f"data[0]: {data[0]}")
        # n_points = len(data)
        # gt = [(x[0].float() + 0.5) / (2 ** 15 - 0.5) for speaker, x, filename in data]
        # extended = [np.concatenate([np.zeros(self.pad_left_encoder(), dtype=np.float32), x,
        #                             np.zeros(self.pad_right(), dtype=np.float32)]) for x in gt]

        # Scale signed 16-bit samples to floats in (-1, 1).
        gt = (x[0].float() + 0.5) / (2**15 - 0.5)
        # Zero-pad for the encoder's left context and the right context.
        extended = np.concatenate([
            np.zeros(self.pad_left_encoder(), dtype=np.float32),
            gt,
            np.zeros(self.pad_right(), dtype=np.float32)
        ])
        # TODO use speaker id from dataset
        speakers = [torch.FloatTensor(speaker[0].float())]
        # TODO seems to only have 3 speakers? As per the CLA. Look at dataset...
        total_test_utts = test_speakers * test_utts_per_speaker
        print("test_speakers", test_speakers)
        print("test_utts_per_speaker", test_utts_per_speaker)
        # (np.arange(30) == 1) is a one-hot conditioning vector indicating speaker 2.
        # vc_speakers = [torch.FloatTensor((np.arange(30) == speaker_id).astype(float)) for _ in range(total_test_utts)]
        # speakers = vc_speakers
        print("speakers:")
        print("speakers", speakers)
        print("len(speakers)", len(speakers))
        print("speakers[0].size()", speakers[0].size())
        print("torch.stack(speakers, dim=0).size()", torch.stack(speakers, dim=0).size())
        # maxlen = max([len(x) for x in extended])
        print("extended.shape", extended.shape)
        maxlen = len(extended)
        # aligned = [torch.cat([torch.FloatTensor(x), torch.zeros(maxlen - len(x))]) for x in extended]
        aligned = [torch.FloatTensor(extended)]
        print("torch.stack(aligned, dim=0).size()", torch.stack(aligned, dim=0).size())

        # out = self.forward_generate(torch.stack(speakers + list(reversed(speakers)), dim=0).cuda(),
        #                             torch.stack(aligned + aligned, dim=0).cuda(),
        #                             verbose=verbose, use_half=use_half, only_discrete=only_discrete)
        out, discrete, index_atom, index_group = self.forward_generate(
            torch.stack(speakers, dim=0).cuda(),
            torch.stack(aligned, dim=0).cuda(),
            verbose=verbose, use_half=use_half, only_discrete=only_discrete)

        if out is not None:
            logger.log(f'out[0]: {out[0]}')
            logger.log(f'out: {out.size()}')
        logger.log(f'index_atom.size(): {index_atom.size()}')
        # logger.log(f'index_atom[0]: {index_atom[0]}')
        logger.log(f'index_atom[0].size(): {index_atom[0].size()}')
        logger.log(f'index_group.size(): {index_group.size()}')
        # logger.log(f'index_group[0]: {index_group[0]}')
        logger.log(f'index_group[0].size(): {index_group[0].size()}')

        # for i, x in enumerate(gt):
        #     librosa.output.write_wav(f'{paths.gen_path()}/{k}k_steps_{i}_target.wav', x.cpu().numpy(), sr=sample_rate)
        #     audio = out[i][:len(x)].cpu().numpy()
        #     librosa.output.write_wav(f'{paths.gen_path()}/{k}k_steps_{i}_generated.wav', audio, sr=sample_rate)
        #     audio_tr = out[n_points + i][:len(x)].cpu().numpy()
        #     librosa.output.write_wav(f'{paths.gen_path()}/{k}k_steps_{i}_transferred.wav', audio_tr, sr=sample_rate)

        ######################################
        # Build the atom and group token data to save to disk.
        index_atom = index_atom.squeeze()
        index_group = index_group.squeeze()
        assert index_atom.size() == index_group.size()
        vqvae_tokens = []
        for i in range(len(index_atom)):
            atom_id = int(index_atom[i])
            group_id = int(index_group[i])
            vqvae_tokens.append(f"{group_id}_{atom_id}")
        vqvae_tokens = '\n'.join(vqvae_tokens)

        ######################################
        # Save files to disk.

        # Discrete vqvae symbols.
        # for i, x in enumerate(gt):
        # os.makedirs(f'{paths.gen_path()}groups', exist_ok=True)
        filename_noext = filename[0]
        with open(f'{paths.gen_path()}vqvae_tokens/{filename_noext}.txt', 'w') as f:
            f.write(vqvae_tokens)

        # The ACTUAL embeddings fed into the decoder (average of atoms in the group,
        # weighted according to their distance from the encoder output).
        torch.save(discrete, f'{paths.gen_path()}decoder_input_vectors/{filename_noext}.pt')

        # Discrete vqvae tokens for analysis and modification/pronunciation correction.
        # torch.save(index_atom, f'{paths.gen_path()}atoms/{filename_noext}_atom.pt')
        # torch.save(index_group, f'{paths.gen_path()}groups/{filename_noext}_group.pt')
        # TODO currently we are saving the entire matrix of discrete tokens for all utts multiple times;
        # change this so that we save a single vector of discrete tokens for each input test utt.
        # TODO create more informative filenames for test generated utts: use the original VCTK filename
        # and include the speaker used to condition the model (create a mapping from one-hot speaker
        # id [0-30] to VCTK speaker names [pxxx-pzzz] to do this).
        # print(len(index_atom.tolist()))
        # print(len(index_group.tolist()))
        # print(index_atom.tolist())
        # print(index_group.tolist())

        # Save a wav file for listening.
        if out is not None:
            audio_tr = out[0][:self.pad_left_encoder() + len(gt)].cpu().numpy()
            wav_path = f'{paths.gen_path()}{filename_noext}.wav'
            librosa.output.write_wav(wav_path, audio_tr, sr=sample_rate)
            print(f"Saved audio to {wav_path}")
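# A minimal sketch of the speaker-name mapping the TODO above asks for. Everything
# here is an assumption (directory layout, name order), not taken from this codebase:
# it presumes the one-hot index order matches the sorted VCTK speaker directories.
#
#   import os
#   vctk_speakers = sorted(os.listdir(f'{data_path}/wav48'))  # hypothetical: e.g. ['p225', 'p226', ...]
#   onehot_to_name = dict(enumerate(vctk_speakers))
#   # onehot_to_name[1] -> 'p226' under this assumption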